From 731d2931bec86bdeede2d293e144bc2dbf6d2f46 Mon Sep 17 00:00:00 2001 From: Earl Warren Date: Sun, 20 Oct 2024 11:24:52 +0200 Subject: [PATCH] split the README into separate files for clarity --- README.md | 685 ++-------------------------------------------- drbd-nginx-lxc.md | 177 ++++++++++++ k8s.md | 176 ++++++++++++ lxc.md | 160 +++++++++++ octopuce.md | 35 +++ runner-lxc.md | 44 +++ 6 files changed, 613 insertions(+), 664 deletions(-) create mode 100644 drbd-nginx-lxc.md create mode 100644 k8s.md create mode 100644 lxc.md create mode 100644 octopuce.md create mode 100644 runner-lxc.md diff --git a/README.md b/README.md index 62d98a6..85d3374 100644 --- a/README.md +++ b/README.md @@ -1,493 +1,25 @@ The resources used by the infrastructure are in the https://code.forgejo.org/infrastructure/ organization. There is a [dedicated chatroom](https://matrix.to/#/#forgejo-ci:matrix.org). A mirror of this repository is available at https://git.pub.solar/forgejo/infrastructure-documentation. -## LXC Hosts +## Table of content -All LXC hosts are setup with [lxc-helpers](https://code.forgejo.org/forgejo/lxc-helpers/). +- Setting up a new [K8S/DRBD/NFS k8s node](k8s.md) +- Setting up a new [LXC/DRBD Host](lxc.md) +- Managing services with a [LXC/DRBD/nginx stack](drbd-nginx-lxc.md) +- Installing a [Forgejo runner in an LXC container](runner-lxc.md) +- Managing the [Octopuce host](octopuce.md) -```sh -name=forgejo-host -lxc-helpers.sh lxc_container_run $name -- sudo --user debian bash -``` - -See https://github.com/mikesart/inotify-info. Running multiple LXC -containers will quickly use the default limit (128 on bookworm). 
- -```sh -echo fs.inotify.max_user_instances=8192 | sudo tee -a /etc/sysctl.conf -sudo sysctl -p -``` - -### Unprivileged - -```sh -name=forgejo-host -lxc-helpers.sh lxc_container_create --config "unprivileged" $name -echo "lxc.start.auto = 1" | sudo tee -a /var/lib/lxc/$name/config -lxc-helpers.sh lxc_container_start $name -lxc-helpers.sh lxc_container_user_install $name $(id -u) $USER -``` - -### Docker enabled - -```sh -name=forgejo-host -lxc-helpers.sh lxc_container_create --config "docker" $name -echo "lxc.start.auto = 1" | sudo tee -a /var/lib/lxc/$name/config -lxc-helpers.sh lxc_container_start $name -lxc-helpers.sh lxc_install_docker $name -lxc-helpers.sh lxc_container_user_install $name $(id -u) $USER -``` - -### K8S enabled - -```sh -name=forgejo-host -lxc-helpers.sh lxc_container_create --config "k8s" $name -echo "lxc.start.auto = 1" | sudo tee -a /var/lib/lxc/$name/config -lxc-helpers.sh lxc_container_start $name -lxc-helpers.sh lxc_container_user_install $name $(id -u) $USER -``` - -### Docker and LXC enabled - -```sh -name=forgejo-host -ipv4=10.85.12 -ipv6=fc33 -lxc-helpers.sh lxc_container_create --config "docker lxc" $name -echo "lxc.start.auto = 1" | sudo tee -a /var/lib/lxc/$name/config -lxc-helpers.sh lxc_container_start $name -lxc-helpers.sh lxc_install_docker $name -lxc-helpers.sh lxc_install_lxc $name $ipv4 $ipv6 -lxc-helpers.sh lxc_container_user_install $name $(id -u) $USER -``` - -## firewall - -```sh -sudo apt-get install ufw -``` - -```sh -sudo ufw default allow incoming -sudo ufw default allow outgoing -sudo ufw default allow routed - -interface=enp5s0 - -function internode() { - for from in $@ ; do - for to in $@ ; do - if test $from != $to ; then - sudo ufw allow in on $interface from $from to $to - fi - done - done -} - -ipv4="65.108.204.171 88.198.58.177" -internode $ipv4 - -ipv6="2a01:4f9:1a:a082::2 2a01:4f8:222:507::2" -internode $ipv6 - -for host_ip in $ipv4 $ipv6 ; do - sudo ufw allow in on $interface to $host_ip port 22 proto 
tcp - sudo ufw deny in on $interface log-all to $host_ip -done - -failover="188.40.16.47 2a01:4f8:fff2:48::2" - -for public_ip in $failover ; do - sudo ufw allow in on $interface to $public_ip port 22,80,443,2000:3000 proto tcp - sudo ufw deny in on $interface log-all to $public_ip -done -``` - -```sh -sudo systemctl enable ufw -sudo ufw enable -``` - -```sh -sudo ufw status verbose -``` - -## nftables - -```sh -sudo nft list ruleset -``` - -## Host reverse proxy - -The reverse proxy on a host forwards to the designated LXC container with -something like the following examples in -`/etc/nginx/sites-available/example.com`, where A.B.C.D is the -IP allocated to the LXC container running the web service. - -And symlink: - -```sh -ln -s /etc/nginx/sites-available/example.com /etc/nginx/sites-enabled/example.com -``` - -The certificate is obtained once and automatically renewed with: - -``` -sudo apt-get install certbot python3-certbot-nginx -sudo certbot -n --agree-tos --email contact@forgejo.org -d example.com --nginx -``` - -When removing a configuration, the certificate can also be removed with: - -``` -sudo certbot delete --cert-name example.com -``` - -Forwarding TCP streams (useful for ssh) requires installing the module: - -```sh -sudo apt-get install libnginx-mod-stream -``` - -Rate limiting crawlers is done by adding the following to `/etc/nginx/conf.d/limit.conf`: - -``` -# http://nginx.org/en/docs/http/ngx_http_limit_req_module.html -# https://blog.nginx.org/blog/rate-limiting-nginx -map $http_user_agent $isbot_ua { - default 0; - ~*(GoogleBot|GoogleOther|bingbot|YandexBot) 1; -} -map $isbot_ua $limit_bot { - 0 ""; - 1 $binary_remote_addr; -} -limit_req_zone $limit_bot zone=bots:10m rate=1r/m; -limit_req_status 429; -``` - -and the following in the location to be rate limited: - -``` - location / { - limit_req zone=bots burst=2 nodelay; - ... 
-``` - -## Host wakeup-on-logs - -https://code.forgejo.org/infrastructure/wakeup-on-logs - -### K8S wakeup-on-logs script - -``` -$ cat /etc/wakeup-on-logs/forgejo-v8 -#!/bin/bash - -set -x - -self="${BASH_SOURCE[0]}" -name=$(basename $self) -# keep it lower than https://code.forgejo.org/infrastructure/wakeup-on-logs -# otherwise it will get killed by it -timeout=4m - -function lxc_run() { - lxc-attach $name -- sudo --user debian KUBECONFIG=/etc/rancher/k3s/k3s.yaml "$@" |& tee -a /var/log/$name.log -} - -image=codeberg.org/forgejo-experimental/forgejo -major=${name##*v} -digest=$(skopeo inspect --format "{{.Digest}}" docker://$image:$major-rootless) -values=https://code.forgejo.org/infrastructure/k8s/raw/branch/main/forgejo-v$major/values.yml -lxc_run helm upgrade forgejo -f $values -f /home/debian/secrets.yml oci://code.forgejo.org/forgejo-helm/forgejo --atomic --wait --timeout $timeout --install --set image.digest=$digest -``` - -### Forgejo example - -``` -server { - listen 80; - listen [::]:80; - - server_name example.com; - - location / { - proxy_pass http://A.B.C.D:8080; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto https; - client_max_body_size 2G; - } -} -``` - -### GitLab example - -```nginx -server { - listen 80; - listen [::]:80; - - server_name example.com; - - location / { - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "upgrade"; - proxy_set_header Host $http_host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_set_header X-Frame-Options SAMEORIGIN; - - client_body_timeout 60; - client_max_body_size 200M; - send_timeout 1200; - lingering_timeout 5; - - proxy_buffering off; - proxy_connect_timeout 90; - proxy_send_timeout 300; - proxy_read_timeout 600s; - - proxy_pass http://example.com; - 
proxy_http_version 1.1; - } -} -``` - -### Vanila example - -```nginx -server { - listen 80; - listen [::]:80; - - server_name example.com; - - location / { - proxy_pass http://A.B.C.D; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto https; - } -} -``` - -### 302 redirection - -```nginx -server { - listen 80; - listen [::]:80; - - server_name example.com; - return 302 https://other.example.com$request_uri; -} -``` - -## Forgejo runners - -The LXC container in which the runner is installed must have capabilities that support the backend. - -- docker:// needs a Docker enabled container -- lxc:// needs a Docker and LXC enabled container - -The runners it contains are not started at boot, it must be done manually. The bash history has the command line to do so. - -### Installation - -```shell -version=3.5.0 -sudo wget -O /usr/local/bin/forgejo-runner-$version https://code.forgejo.org/forgejo/runner/releases/download/v$version/forgejo-runner-$version-linux-amd64 -sudo chmod +x /usr/local/bin/forgejo-runner-$version -echo 'export TERM=xterm-256color' >> .bashrc -``` - -### Creating a runner - -Multiple runners can co-exist on the same machine. To keep things -organized they are located in a directory that is the same as the URL -from which the token is obtained. For instance -DIR=codeberg.org/forgejo-integration means that the token was obtained from the -https://codeberg.org/forgejo-integration organization. - -If a runner only provides unprivileged docker containers, the labels -in `config.yml` should be: -`labels: ['docker:docker://node:20-bookworm']`. - -If a runner provides LXC containers and unprivileged docker -containers, the labels in `config.yml` should be -`labels: ['self-hosted:lxc://debian:bookworm', 'docker:docker://node:20-bookworm']`. 
- -```shell -name=myrunner -mkdir -p $DIR ; cd $DIR -forgejo-runner generate-config > config-$name.yml -## edit config-$name.yml and adjust the `labels:` -## Obtain a $TOKEN from https://$DIR -forgejo-runner-$version register --no-interactive --token $TOKEN --name runner --instance https://codeberg.org -forgejo-runner-$version --config config-$name.yml daemon |& cat -v > runner.log & -``` - -## Octopuce - -[Octopuce provides hardware](https://codeberg.org/forgejo/sustainability) managed by [the devops team](https://codeberg.org/forgejo/governance/src/branch/main/TEAMS.md#devops). It can only be accessed via SSH. - -To access the services hosted on the LXC containers, ssh port forwarding to the private IPs can be used. For instance: - -```sh -echo 127.0.0.1 private.forgejo.org >> /etc/hosts -sudo ssh -i ~/.ssh/id_rsa -L 80:10.77.0.128:80 debian@forgejo01.octopuce.fr -firefox http://private.forgejo.org -``` - -### Containers - -- `forgejo-host` - - Dedicated to http://private.forgejo.org - - - Docker enabled - - upgrades checklist: - ```sh - emacs /home/debian/run-forgejo.sh # change the `image=` - docker stop forgejo - sudo rsync -av --numeric-ids --delete --progress /srv/forgejo/ /root/forgejo-backup/ - docker rm forgejo - bash -x /home/debian/run-forgejo.sh - docker logs -n 200 -f forgejo - ``` - -- `forgejo-runner-host` - - Has runners installed to run against private.forgejo.org - - - Docker and LXC enabled 10.85.12 fc33 - -## Hetzner - -All hardware machines are running Debian GNU/linux bookworm. They are LXC hosts -setup with [lxc-helpers](https://code.forgejo.org/forgejo/lxc-helpers/). - -> **NOTE:** only use [EX101 with a ASRockRack W680D4U-1L motherboard](https://forum.hetzner.com/index.php?thread/31135-all-ex101-with-asustek-w680-crash-on-sequential-read/). 
- -### vSwitch - -A vSwitch is assigned via the Robot console on all servers for backend communications -and [configured](https://docs.hetzner.com/robot/dedicated-server/network/vswitch#example-debian-configuration) -in /etc/network/interfaces for each of them with something like: - -``` -auto enp5s0.4000 -iface enp5s0.4000 inet static - address 10.53.100.2 - netmask 255.255.255.0 - vlan-raw-device enp5s0 - mtu 1400 -``` - -The IP address ends with the same number as the hardware (hetzner02 => .2). - -#### vSwitch DRBD - -The vSwitch on VLAN 4000 is for DRBD exclusively - -#### vSwitch NFS - -The vSwitch on VLAN 4001 is for NFS - -#### vSwitch k8s - -The vSwitch on VLAN 4002 is for the k8s control plane - -### DRBD - -DRBD is [configured](https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#p-work) like in the following example with hetzner02 as the primary and hetzner03 as the secondary: - -```sh -$ apt-get install drbd-utils -$ cat /etc/drbd.d/r0.res -resource r0 { - net { - # A : write completion is determined when data is written to the local disk and the local TCP transmission buffer - # B : write completion is determined when data is written to the local disk and remote buffer cache - # C : write completion is determined when data is written to both the local disk and the remote disk - protocol C; - cram-hmac-alg sha1; - # any secret key for authentication among nodes - shared-secret "***"; - } - disk { - resync-rate 1000M; - } - on hetzner02 { - address 10.53.100.2:7788; - volume 0 { - # device name - device /dev/drbd0; - # specify disk to be used for device above - disk /dev/nvme0n1p5; - # where to create metadata - # specify the block device name when using a different disk - meta-disk internal; - } - } - on hetzner03 { - address 10.53.100.3:7788; - volume 0 { - device /dev/drbd0; - disk /dev/nvme1n1p5; - meta-disk internal; - } - } -} -$ sudo drbdadm create-md r0 -$ sudo systemctl enable drbd -$ sudo systemctl start drbd -``` - -On hetzner02 (the 
primary), [pretend all is in sync](https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#s-skip-initial-resync) to save the initial bitmap sync since -there is actually no data at all. - -```sh -sudo drbdadm new-current-uuid --clear-bitmap r0/0 -``` - -The DRBD device is mounted on `/var/lib/lxc` in `/etc/fstab` there is a noauto line: - -``` -/dev/drbd0 /var/lib/lxc ext4 noauto,defaults 0 0 -``` - -To prevent split brain situations a manual step is required at boot -time, on the machine that is going to be the primary. - -```sh -sudo drbdadm primary r0 -sudo drbdsetup status -sudo mount /var/lib/lxc -sudo lxc-autostart start -sudo lxc-ls -f -sudo drbdsetup status -``` - -### hetzner{01,04} +## hetzner{01,04} https://hetzner{01,04}.forgejo.org run on [EX101](https://www.hetzner.com/dedicated-rootserver/ex101) Hetzner hardware. -#### LXC +### LXC ```sh lxc-helpers.sh lxc_install_lxc_inside 10.41.13 fc29 ``` -#### Disk partitioning +### Disk partitioning - First disk - OS @@ -495,14 +27,14 @@ lxc-helpers.sh lxc_install_lxc_inside 10.41.13 fc29 - Second disk - configured with DRBD for precious data. 
-#### Root filesystem backups +### Root filesystem backups - `hetzner01:/etc/cron.daily/backup-hetzner04` `rsync -aHS --delete-excluded --delete --numeric-ids --exclude /proc --exclude /dev --exclude /sys --exclude /precious --exclude /srv --exclude /var/lib/lxc 10.53.100.4:/ /srv/backups/hetzner04/ >& /var/log/$(basename $0).log` - `hetzner04:/etc/cron.daily/backup-hetzner01` `rsync -aHS --delete-excluded --delete --numeric-ids --exclude /proc --exclude /dev --exclude /sys --exclude /precious --exclude /srv --exclude /var/lib/lxc 10.53.100.1:/ /srv/backups/hetzner01/ >& /var/log/$(basename $0).log` -#### LXC containers +### LXC containers - `runner-lxc-helpers` (hetzner01) @@ -588,17 +120,17 @@ lxc-helpers.sh lxc_install_lxc_inside 10.41.13 fc29 - Docker enabled -### hetzner{02,03} +## hetzner{02,03} https://hetzner02.forgejo.org & https://hetzner03.forgejo.org run on [EX44](https://www.hetzner.com/dedicated-rootserver/ex44) Hetzner hardware. -#### LXC +### LXC ```sh lxc-helpers.sh lxc_install_lxc_inside 10.6.83 fc16 ``` -#### Disk partitioning +### Disk partitioning - First disk - OS @@ -606,14 +138,14 @@ lxc-helpers.sh lxc_install_lxc_inside 10.6.83 fc16 - Second disk - non precious data such as the LXC containers with runners. -#### Root filesystem backups +### Root filesystem backups - `hetzner03:/etc/cron.daily/backup-hetzner02` `rsync -aHS --delete-excluded --delete --numeric-ids --exclude /proc --exclude /dev --exclude /sys --exclude /srv --exclude /var/lib/lxc 10.53.100.2:/ /srv/backups/hetzner02/` - `hetzner02:/etc/cron.daily/backup-hetzner03` `rsync -aHS --delete-excluded --delete --numeric-ids --exclude /proc --exclude /dev --exclude /sys --exclude /srv --exclude /var/lib/lxc 10.53.100.3:/ /srv/backups/hetzner03/` -#### Public IP addresses +### Public IP addresses The public IP addresses attached to the hosts are not failover IPs that can be moved from one host to the next. The DNS entry needs to be updated if the primary hosts changes. 
@@ -653,7 +185,7 @@ ListenAddress 65.21.67.73 #ListenAddress :: ``` -#### Port forwarding +### Port forwarding Forwarding a port to an LXC container can be done with [nginx streeam](https://nginx.org/en/docs/stream/ngx_stream_core_module.html) for the public IP of code.forgejo.org (65.21.67.71 & 2a01:4f9:3081:51ec::102) to the private IP (10.6.83.195) of the `code` LXC container in `/etc/nginx/modules-enabled/ssh.conf`: @@ -677,13 +209,13 @@ stream { } ``` -#### 302 redirects +### 302 redirects - On hetzner02 - try.next.forgejo.org redirects to v(latest stable).next.forgejo.org - dev.next.forgejo.org redirects to v(latest dev).next.forgejo.org -#### Containers +### Containers - `forgejo-code` on hetzner02 @@ -785,186 +317,11 @@ stream { Dedicated to https://codeberg.org/forgejo-contrib/forgejo-helm and running from an ephemeral disk -### hetzner{05,06} +## hetzner{05,06} https://hetzner05.forgejo.org & https://hetzner06.forgejo.org run on [EX44](https://www.hetzner.com/dedicated-rootserver/ex44) Hetzner hardware. - -#### Imaging - -Using installimage from the rescue instance. - -- `wipefs -fa /dev/nvme*n1` -- `installimage -r no -n hetzner0?` -- Debian bookworm -- `PART / ext4 100G` -- `PART /srv ext4 all` -- ESC 0 + yes -- reboot - -Partitioning. - -- First disk - - OS - - non precious data such as the LXC containers with runners. -- Second disk - - a partition configured with DRBD - -Debian user. - -- `ssh root@hetzner0?.forgejo.org` -- `useradd --shell /bin/bash --create-home --groups sudo debian` -- `mkdir -p /home/debian/.ssh ; cp -a .ssh/authorized_keys /home/debian/.ssh ; chown -R debian /home/debian/.ssh` -- in `/etc/sudoers` edit `%sudo ALL=(ALL:ALL) NOPASSWD:ALL` - -#### Install helpers - -Each node is identifed by the last digit of the hostname. 
- -```sh -sudo apt-get install git etckeeper -git clone https://code.forgejo.org/infrastructure/documentation -cd documentation/k3s-host -cp variables.sh.example variables.sh -cp secrets.sh.example secrets.sh -``` - -Variables that must be set depending on the role of the node. - -- first server node - - secrets.sh: node_drbd_shared_secret -- other server node - - secrets.sh: node_drbd_shared_secret - - secrets.sh: node_k8s_token: content of /var/lib/rancher/k3s/server/token on the first node - - variables.sh: node_k8s_existing: identifier of the first node (e.g. 5) -- etcd node - - secrets.sh: node_k8s_token: content of /var/lib/rancher/k3s/server/token on the first node - - variables.sh: node_k8s_existing: identifier of the first node (e.g. 5) - - variables.sh: node_k8s_etcd: identifier of the node whose role is just etcd (e.g. 3) - -The other variables depend on the setup. - -#### Firewall - -`./setup.sh setup_ufw` - -#### DRBD - -DRBD is [configured](https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#p-work) with: - -`./setup.sh setup_drbd` - -Once two nodes have DRBD setup for the first time, it can be initialized by [pretending all is in sync](https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#s-skip-initial-resync) to save the initial bitmap sync since there is actually no data at all. - - -```sh -sudo drbdadm primary r1 -sudo drbdadm new-current-uuid --clear-bitmap r1/0 -sudo mount /precious -``` - -#### NFS - -`./setup.sh setup_nfs` - -On the node that has the DRBD volume `/precious` mounted, set the IP of the NFS server to be used by k8s: - -```sh -sudo ip addr add 10.53.101.100/24 dev enp5s0.4001 -``` - -#### K8S - -For the first node `./setup.sh setup_k8s`. For nodes joining the cluster `./setup.sh setup_k8s 6` where `hetzner06` is an existing node. - -- [metallb](https://metallb.universe.tf) instead of the default load balancer because it does not allow for a public IP different from the `k8s` node IP. 
- `./setup.sh setup_k8s_metallb` -- [traefik](https://traefik.io/) requests with [annotations](https://github.com/traefik/traefik-helm-chart/blob/7a13fc8a61a6ad30fcec32eec497dab9d8aea686/traefik/values.yaml#L736) specific IPs from `metalldb`. - `./setup.sh setup_k8s_traefik` -- [cert-manager](https://cert-manager.io/). - `./setup.sh setup_k8s_certmanager` -- NFS storage class - `./setup.sh setup_k8s_nfs` - -#### Forgejo - -[forgejo](https://code.forgejo.org/forgejo-helm/forgejo-helm) configuration in [ingress](https://code.forgejo.org/forgejo-helm/forgejo-helm#ingress) for the reverse proxy (`traefik`) to route the domain and for the ACME issuer (`cert-manager`) to obtain a certificate. And in [service](https://code.forgejo.org/forgejo-helm/forgejo-helm#service) for the `ssh` port to be bound to the desired IPs of the load balancer (`metallb`). - -``` -ingress: - enabled: true - annotations: - # https://cert-manager.io/docs/usage/ingress/#supported-annotations - # https://github.com/cert-manager/cert-manager/issues/2239 - cert-manager.io/cluster-issuer: letsencrypt-http - cert-manager.io/private-key-algorithm: ECDSA - cert-manager.io/private-key-size: 384 - kubernetes.io/ingress.class: traefik - traefik.ingress.kubernetes.io/router.entrypoints: websecure - tls: - - hosts: - - t1.forgejo.org - secretName: tls-forgejo-t1-ingress-http - hosts: - - host: t1.forgejo.org - paths: - - path: / - pathType: Prefix - -service: - http: - type: ClusterIP - ipFamilyPolicy: PreferDualStack - port: 3000 - ssh: - type: LoadBalancer - annotations: - metallb.universe.tf/loadBalancerIPs: 188.40.16.47,2a01:4f8:fff2:48::2 - metallb.universe.tf/allow-shared-ip: "key-to-share-failover" - ipFamilyPolicy: PreferDualStack - port: 2222 -``` - -### K8S NFS storage creation - -Define the 20GB `forgejo-data` pvc owned by user id 1000. 
- -```sh -./setup.sh setup_k8s_pvc forgejo-data 20Gi 1000 -``` - -[Instruct the forgejo pod](https://code.forgejo.org/forgejo-helm/forgejo-helm#persistence) to use the `forgejo-data` pvc. - -```yaml -persistence: - enabled: true - create: false - claimName: forgejo-data -``` - -## Disaster recovery and maintenance - -### When a machine or disk is scheduled for replacement. - -* `kubectl drain hetzner05` # evacuate all the pods out of the node to be shutdown -* `kubectl taint nodes hetzner05 key1=value1:NoSchedule` # prevent any pod from being created there (metallb speaker won't be drained, for instance) -* `kubectl delete node hetzner05` # let the cluster know it no longer exists so a new one by the same name can replace it - -### Routing the failover IP - -When the machine to which the failover IP (failover.forgejo.org) is routed is unavailable or to be shutdown, to the [Hetzner server panel](https://robot.hetzner.com/server), to the IPs tab and change the route of the failover IP to another node. All nodes are configured with the failover IP, there is nothing else to do. - -### Manual boot operations - -#### On the machine that runs the NFS server - -* `sudo drbdadm primary r1` # Switch the DRBD to primary -* `sudo mount /precious` # DRBD volume shared via NFS -* `sudo ip addr add 10.53.101.100/24 dev enp5s0.4001` # add NFS server IP - -#### On the other machines - -* `sudo ip addr del 10.53.101.100/24 dev enp5s0.4001` # remove NFS server IP +Nodes of [a k8s cluster](k8s.md). ## Uberspace diff --git a/drbd-nginx-lxc.md b/drbd-nginx-lxc.md new file mode 100644 index 0000000..9194592 --- /dev/null +++ b/drbd-nginx-lxc.md @@ -0,0 +1,177 @@ +## nftables + +```sh +sudo nft list ruleset +``` + +## Host reverse proxy + +The reverse proxy on a host forwards to the designated LXC container with +something like the following examples in +`/etc/nginx/sites-available/example.com`, where A.B.C.D is the +IP allocated to the LXC container running the web service. 
+ +And symlink: + +```sh +ln -s /etc/nginx/sites-available/example.com /etc/nginx/sites-enabled/example.com +``` + +The certificate is obtained once and automatically renewed with: + +``` +sudo apt-get install certbot python3-certbot-nginx +sudo certbot -n --agree-tos --email contact@forgejo.org -d example.com --nginx +``` + +When removing a configuration, the certificate can also be removed with: + +``` +sudo certbot delete --cert-name example.com +``` + +Forwarding TCP streams (useful for ssh) requires installing the module: + +```sh +sudo apt-get install libnginx-mod-stream +``` + +Rate limiting crawlers is done by adding the following to `/etc/nginx/conf.d/limit.conf`: + +``` +# http://nginx.org/en/docs/http/ngx_http_limit_req_module.html +# https://blog.nginx.org/blog/rate-limiting-nginx +map $http_user_agent $isbot_ua { + default 0; + ~*(GoogleBot|GoogleOther|bingbot|YandexBot) 1; +} +map $isbot_ua $limit_bot { + 0 ""; + 1 $binary_remote_addr; +} +limit_req_zone $limit_bot zone=bots:10m rate=1r/m; +limit_req_status 429; +``` + +and the following in the location to be rate limited: + +``` + location / { + limit_req zone=bots burst=2 nodelay; + ... 
+``` + +## Host wakeup-on-logs + +https://code.forgejo.org/infrastructure/wakeup-on-logs + +### K8S wakeup-on-logs script + +``` +$ cat /etc/wakeup-on-logs/forgejo-v8 +#!/bin/bash + +set -x + +self="${BASH_SOURCE[0]}" +name=$(basename $self) +# keep it lower than https://code.forgejo.org/infrastructure/wakeup-on-logs +# otherwise it will get killed by it +timeout=4m + +function lxc_run() { + lxc-attach $name -- sudo --user debian KUBECONFIG=/etc/rancher/k3s/k3s.yaml "$@" |& tee -a /var/log/$name.log +} + +image=codeberg.org/forgejo-experimental/forgejo +major=${name##*v} +digest=$(skopeo inspect --format "{{.Digest}}" docker://$image:$major-rootless) +values=https://code.forgejo.org/infrastructure/k8s/raw/branch/main/forgejo-v$major/values.yml +lxc_run helm upgrade forgejo -f $values -f /home/debian/secrets.yml oci://code.forgejo.org/forgejo-helm/forgejo --atomic --wait --timeout $timeout --install --set image.digest=$digest +``` + +### Forgejo example + +``` +server { + listen 80; + listen [::]:80; + + server_name example.com; + + location / { + proxy_pass http://A.B.C.D:8080; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto https; + client_max_body_size 2G; + } +} +``` + +### GitLab example + +```nginx +server { + listen 80; + listen [::]:80; + + server_name example.com; + + location / { + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Frame-Options SAMEORIGIN; + + client_body_timeout 60; + client_max_body_size 200M; + send_timeout 1200; + lingering_timeout 5; + + proxy_buffering off; + proxy_connect_timeout 90; + proxy_send_timeout 300; + proxy_read_timeout 600s; + + proxy_pass http://example.com; + 
proxy_http_version 1.1; + } +} +``` + +### Vanilla example + +```nginx +server { + listen 80; + listen [::]:80; + + server_name example.com; + + location / { + proxy_pass http://A.B.C.D; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto https; + } +} +``` + +### 302 redirection + +```nginx +server { + listen 80; + listen [::]:80; + + server_name example.com; + return 302 https://other.example.com$request_uri; +} +``` + diff --git a/k8s.md b/k8s.md new file mode 100644 index 0000000..7f69d49 --- /dev/null +++ b/k8s.md @@ -0,0 +1,176 @@ +#### Imaging + +Using installimage from the rescue instance. + +- `wipefs -fa /dev/nvme*n1` +- `installimage -r no -n hetzner0?` +- Debian bookworm +- `PART / ext4 100G` +- `PART /srv ext4 all` +- ESC 0 + yes +- reboot + +Partitioning. + +- First disk + - OS + - non precious data such as the LXC containers with runners. +- Second disk + - a partition configured with DRBD + +Debian user. + +- `ssh root@hetzner0?.forgejo.org` +- `useradd --shell /bin/bash --create-home --groups sudo debian` +- `mkdir -p /home/debian/.ssh ; cp -a .ssh/authorized_keys /home/debian/.ssh ; chown -R debian /home/debian/.ssh` +- in `/etc/sudoers` edit `%sudo ALL=(ALL:ALL) NOPASSWD:ALL` + +#### Install helpers + +Each node is identified by the last digit of the hostname. + +```sh +sudo apt-get install git etckeeper +git clone https://code.forgejo.org/infrastructure/documentation +cd documentation/k3s-host +cp variables.sh.example variables.sh +cp secrets.sh.example secrets.sh +``` + +Variables that must be set depending on the role of the node. + +- first server node + - secrets.sh: node_drbd_shared_secret +- other server node + - secrets.sh: node_drbd_shared_secret + - secrets.sh: node_k8s_token: content of /var/lib/rancher/k3s/server/token on the first node + - variables.sh: node_k8s_existing: identifier of the first node (e.g. 
5) +- etcd node + - secrets.sh: node_k8s_token: content of /var/lib/rancher/k3s/server/token on the first node + - variables.sh: node_k8s_existing: identifier of the first node (e.g. 5) + - variables.sh: node_k8s_etcd: identifier of the node whose role is just etcd (e.g. 3) + +The other variables depend on the setup. + +#### Firewall + +`./setup.sh setup_ufw` + +#### DRBD + +DRBD is [configured](https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#p-work) with: + +`./setup.sh setup_drbd` + +Once two nodes have DRBD setup for the first time, it can be initialized by [pretending all is in sync](https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#s-skip-initial-resync) to save the initial bitmap sync since there is actually no data at all. + + +```sh +sudo drbdadm primary r1 +sudo drbdadm new-current-uuid --clear-bitmap r1/0 +sudo mount /precious +``` + +#### NFS + +`./setup.sh setup_nfs` + +On the node that has the DRBD volume `/precious` mounted, set the IP of the NFS server to be used by k8s: + +```sh +sudo ip addr add 10.53.101.100/24 dev enp5s0.4001 +``` + +#### K8S + +For the first node `./setup.sh setup_k8s`. For nodes joining the cluster `./setup.sh setup_k8s 6` where `hetzner06` is an existing node. + +- [metallb](https://metallb.universe.tf) instead of the default load balancer because the default load balancer does not allow for a public IP different from the `k8s` node IP. + `./setup.sh setup_k8s_metallb` +- [traefik](https://traefik.io/) requests with [annotations](https://github.com/traefik/traefik-helm-chart/blob/7a13fc8a61a6ad30fcec32eec497dab9d8aea686/traefik/values.yaml#L736) specific IPs from `metallb`. + `./setup.sh setup_k8s_traefik` +- [cert-manager](https://cert-manager.io/). 
+ `./setup.sh setup_k8s_certmanager` +- NFS storage class + `./setup.sh setup_k8s_nfs` + +#### Forgejo + +[forgejo](https://code.forgejo.org/forgejo-helm/forgejo-helm) configuration in [ingress](https://code.forgejo.org/forgejo-helm/forgejo-helm#ingress) for the reverse proxy (`traefik`) to route the domain and for the ACME issuer (`cert-manager`) to obtain a certificate. And in [service](https://code.forgejo.org/forgejo-helm/forgejo-helm#service) for the `ssh` port to be bound to the desired IPs of the load balancer (`metallb`). + +``` +ingress: + enabled: true + annotations: + # https://cert-manager.io/docs/usage/ingress/#supported-annotations + # https://github.com/cert-manager/cert-manager/issues/2239 + cert-manager.io/cluster-issuer: letsencrypt-http + cert-manager.io/private-key-algorithm: ECDSA + cert-manager.io/private-key-size: 384 + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + tls: + - hosts: + - t1.forgejo.org + secretName: tls-forgejo-t1-ingress-http + hosts: + - host: t1.forgejo.org + paths: + - path: / + pathType: Prefix + +service: + http: + type: ClusterIP + ipFamilyPolicy: PreferDualStack + port: 3000 + ssh: + type: LoadBalancer + annotations: + metallb.universe.tf/loadBalancerIPs: 188.40.16.47,2a01:4f8:fff2:48::2 + metallb.universe.tf/allow-shared-ip: "key-to-share-failover" + ipFamilyPolicy: PreferDualStack + port: 2222 +``` + +### K8S NFS storage creation + +Define the 20GB `forgejo-data` pvc owned by user id 1000. + +```sh +./setup.sh setup_k8s_pvc forgejo-data 20Gi 1000 +``` + +[Instruct the forgejo pod](https://code.forgejo.org/forgejo-helm/forgejo-helm#persistence) to use the `forgejo-data` pvc. + +```yaml +persistence: + enabled: true + create: false + claimName: forgejo-data +``` + +## Disaster recovery and maintenance + +### When a machine or disk is scheduled for replacement. 
+ +* `kubectl drain hetzner05` # evacuate all the pods out of the node to be shut down +* `kubectl taint nodes hetzner05 key1=value1:NoSchedule` # prevent any pod from being created there (metallb speaker won't be drained, for instance) +* `kubectl delete node hetzner05` # let the cluster know it no longer exists so a new one by the same name can replace it + +### Routing the failover IP + +When the machine to which the failover IP (failover.forgejo.org) is routed is unavailable or to be shut down, go to the [Hetzner server panel](https://robot.hetzner.com/server), open the IPs tab and change the route of the failover IP to another node. All nodes are configured with the failover IP, so there is nothing else to do. + +### Manual boot operations + +#### On the machine that runs the NFS server + +* `sudo drbdadm primary r1` # Switch the DRBD to primary +* `sudo mount /precious` # DRBD volume shared via NFS +* `sudo ip addr add 10.53.101.100/24 dev enp5s0.4001` # add NFS server IP + +#### On the other machines + +* `sudo ip addr del 10.53.101.100/24 dev enp5s0.4001` # remove NFS server IP + diff --git a/lxc.md b/lxc.md new file mode 100644 index 0000000..a8abe24 --- /dev/null +++ b/lxc.md @@ -0,0 +1,160 @@ +## LXC Hosts + +All LXC hosts are set up with [lxc-helpers](https://code.forgejo.org/forgejo/lxc-helpers/). + +```sh +name=forgejo-host +lxc-helpers.sh lxc_container_run $name -- sudo --user debian bash +``` + +See https://github.com/mikesart/inotify-info. Running multiple LXC +containers will quickly use up the default inotify limit (128 on bookworm).
+ +```sh +echo fs.inotify.max_user_instances=8192 | sudo tee -a /etc/sysctl.conf +sudo sysctl -p +``` + +### Unprivileged + +```sh +name=forgejo-host +lxc-helpers.sh lxc_container_create --config "unprivileged" $name +echo "lxc.start.auto = 1" | sudo tee -a /var/lib/lxc/$name/config +lxc-helpers.sh lxc_container_start $name +lxc-helpers.sh lxc_container_user_install $name $(id -u) $USER +``` + +### Docker enabled + +```sh +name=forgejo-host +lxc-helpers.sh lxc_container_create --config "docker" $name +echo "lxc.start.auto = 1" | sudo tee -a /var/lib/lxc/$name/config +lxc-helpers.sh lxc_container_start $name +lxc-helpers.sh lxc_install_docker $name +lxc-helpers.sh lxc_container_user_install $name $(id -u) $USER +``` + +### K8S enabled + +```sh +name=forgejo-host +lxc-helpers.sh lxc_container_create --config "k8s" $name +echo "lxc.start.auto = 1" | sudo tee -a /var/lib/lxc/$name/config +lxc-helpers.sh lxc_container_start $name +lxc-helpers.sh lxc_container_user_install $name $(id -u) $USER +``` + +### Docker and LXC enabled + +```sh +name=forgejo-host +ipv4=10.85.12 +ipv6=fc33 +lxc-helpers.sh lxc_container_create --config "docker lxc" $name +echo "lxc.start.auto = 1" | sudo tee -a /var/lib/lxc/$name/config +lxc-helpers.sh lxc_container_start $name +lxc-helpers.sh lxc_install_docker $name +lxc-helpers.sh lxc_install_lxc $name $ipv4 $ipv6 +lxc-helpers.sh lxc_container_user_install $name $(id -u) $USER +``` + +## Hetzner + +All hardware machines are running Debian GNU/Linux bookworm. They are LXC hosts +set up with [lxc-helpers](https://code.forgejo.org/forgejo/lxc-helpers/). + +> **NOTE:** only use [EX101 with an ASRockRack W680D4U-1L motherboard](https://forum.hetzner.com/index.php?thread/31135-all-ex101-with-asustek-w680-crash-on-sequential-read/).
+ +### vSwitch + +A vSwitch is assigned via the Robot console on all servers for backend communications +and [configured](https://docs.hetzner.com/robot/dedicated-server/network/vswitch#example-debian-configuration) +in /etc/network/interfaces for each of them with something like: + +``` +auto enp5s0.4000 +iface enp5s0.4000 inet static + address 10.53.100.2 + netmask 255.255.255.0 + vlan-raw-device enp5s0 + mtu 1400 +``` + +The IP address ends with the same number as the hardware (hetzner02 => .2). + +#### vSwitch DRBD + +The vSwitch on VLAN 4000 is for DRBD exclusively + +### DRBD + +DRBD is [configured](https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#p-work) like in the following example with hetzner02 as the primary and hetzner03 as the secondary: + +```sh +$ apt-get install drbd-utils +$ cat /etc/drbd.d/r0.res +resource r0 { + net { + # A : write completion is determined when data is written to the local disk and the local TCP transmission buffer + # B : write completion is determined when data is written to the local disk and remote buffer cache + # C : write completion is determined when data is written to both the local disk and the remote disk + protocol C; + cram-hmac-alg sha1; + # any secret key for authentication among nodes + shared-secret "***"; + } + disk { + resync-rate 1000M; + } + on hetzner02 { + address 10.53.100.2:7788; + volume 0 { + # device name + device /dev/drbd0; + # specify disk to be used for device above + disk /dev/nvme0n1p5; + # where to create metadata + # specify the block device name when using a different disk + meta-disk internal; + } + } + on hetzner03 { + address 10.53.100.3:7788; + volume 0 { + device /dev/drbd0; + disk /dev/nvme1n1p5; + meta-disk internal; + } + } +} +$ sudo drbdadm create-md r0 +$ sudo systemctl enable drbd +$ sudo systemctl start drbd +``` + +On hetzner02 (the primary), [pretend all is in sync](https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#s-skip-initial-resync) to save the initial bitmap sync 
since +there is actually no data at all. + +```sh +sudo drbdadm new-current-uuid --clear-bitmap r0/0 +``` + +The DRBD device is mounted on `/var/lib/lxc`. In `/etc/fstab` there is a `noauto` line for it: + +``` +/dev/drbd0 /var/lib/lxc ext4 noauto,defaults 0 0 +``` + +To prevent split-brain situations a manual step is required at boot +time, on the machine that is going to be the primary. + +```sh +sudo drbdadm primary r0 +sudo drbdsetup status +sudo mount /var/lib/lxc +sudo lxc-autostart start +sudo lxc-ls -f +sudo drbdsetup status +``` diff --git a/octopuce.md b/octopuce.md new file mode 100644 index 0000000..daac9c3 --- /dev/null +++ b/octopuce.md @@ -0,0 +1,35 @@ +## Octopuce + +[Octopuce provides hardware](https://codeberg.org/forgejo/sustainability) managed by [the devops team](https://codeberg.org/forgejo/governance/src/branch/main/TEAMS.md#devops). It can only be accessed via SSH. + +To access the services hosted on the LXC containers, SSH port forwarding to the private IPs can be used. For instance: + +```sh +echo 127.0.0.1 private.forgejo.org >> /etc/hosts +sudo ssh -i ~/.ssh/id_rsa -L 80:10.77.0.128:80 debian@forgejo01.octopuce.fr +firefox http://private.forgejo.org +``` + +### Containers + +- `forgejo-host` + + Dedicated to http://private.forgejo.org + + - Docker enabled + - upgrades checklist: + ```sh + emacs /home/debian/run-forgejo.sh # change the `image=` + docker stop forgejo + sudo rsync -av --numeric-ids --delete --progress /srv/forgejo/ /root/forgejo-backup/ + docker rm forgejo + bash -x /home/debian/run-forgejo.sh + docker logs -n 200 -f forgejo + ``` + +- `forgejo-runner-host` + + Has runners installed to run against private.forgejo.org + + - Docker and LXC enabled 10.85.12 fc33 + diff --git a/runner-lxc.md b/runner-lxc.md new file mode 100644 index 0000000..6b328f7 --- /dev/null +++ b/runner-lxc.md @@ -0,0 +1,44 @@ +## Forgejo runners + +The LXC container in which the runner is installed must have capabilities that support the backend.
+ +- docker:// needs a Docker enabled container +- lxc:// needs a Docker and LXC enabled container + +The runners it contains are not started at boot; they must be started manually. The bash history has the command line to do so. + +### Installation + +```shell +version=3.5.0 +sudo wget -O /usr/local/bin/forgejo-runner-$version https://code.forgejo.org/forgejo/runner/releases/download/v$version/forgejo-runner-$version-linux-amd64 +sudo chmod +x /usr/local/bin/forgejo-runner-$version +echo 'export TERM=xterm-256color' >> .bashrc +``` + +### Creating a runner + +Multiple runners can co-exist on the same machine. To keep things +organized they are located in a directory that is the same as the URL +from which the token is obtained. For instance +DIR=codeberg.org/forgejo-integration means that the token was obtained from the +https://codeberg.org/forgejo-integration organization. + +If a runner only provides unprivileged docker containers, the labels +in `config.yml` should be: +`labels: ['docker:docker://node:20-bookworm']`. + +If a runner provides LXC containers and unprivileged docker +containers, the labels in `config.yml` should be +`labels: ['self-hosted:lxc://debian:bookworm', 'docker:docker://node:20-bookworm']`. + +```shell +name=myrunner +mkdir -p $DIR ; cd $DIR +forgejo-runner generate-config > config-$name.yml +## edit config-$name.yml and adjust the `labels:` +## Obtain a $TOKEN from https://$DIR +forgejo-runner-$version register --no-interactive --token $TOKEN --name runner --instance https://codeberg.org +forgejo-runner-$version --config config-$name.yml daemon |& cat -v > runner.log & +``` +