Portal:Toolforge/Admin/Kubernetes/Upgrading Kubernetes/1.27 to 1.28 notes
Working etherpad: https://etherpad.wikimedia.org/p/k8s-1.27-to-1.28-upgrade
Prepare packages
- [] send and merge a patch similar to https://gerrit.wikimedia.org/r/c/operations/puppet/+/1058560 but for the destination version
- [] check that the packages show up in https://apt.wikimedia.org/wikimedia/pool/thirdparty/
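A quick command-line alternative to clicking through the listing (a sketch; it assumes the packages are published under a kubeadm-k8s-* component in the thirdparty pool):
curl -s https://apt.wikimedia.org/wikimedia/pool/thirdparty/ | grep -o 'kubeadm[^/"]*' | sort -u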
Toolsbeta
- get list of nodes
root@toolsbeta-test-k8s-control-10:~# for node in $(kubectl get nodes -o json | jq '.items[].metadata.name' -r); do echo "* [] $node"; done
prep
- [] run prepare upgrade cookbook
~ $ sudo cookbook wmcs.toolforge.k8s.prepare_upgrade --cluster-name toolsbeta --src-version 1.27.16 --dst-version 1.28.14 --task-id <id_of_task>
- [] downtime project via https://prometheus-alerts.wmcloud.org/?q=team%3Dwmcs
- [] update topic on -cloud
control nodes
- run upgrade node cookbook
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.27.16 --dst-version 1.28.14 --cluster-name toolsbeta --hostname <control_node_name>
- check that services start healthy
- depool control-<x> and <y> via haproxy, check that control-<z> is still doing ok (example config edit below, after the revert commands)
ssh tools-test-k8s-haproxy-6.toolsbeta.eqiad1.wikimedia.cloud
sudo puppet agent --disable "<user> k8s upgrade"
sudo vim /etc/haproxy/conf.d/k8s-api-servers.cfg
sudo systemctl reload haproxy
check:
echo "show stat" | sudo socat stdio /run/haproxy/haproxy.sock | grep k8s-api
revert:
sudo puppet agent --enable
sudo run-puppet-agent
sudo systemctl reload haproxy
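The depool edit in /etc/haproxy/conf.d/k8s-api-servers.cfg is just commenting out the control nodes you want out of rotation, then reloading haproxy. A sketch of what that might look like when leaving only control-10 pooled (the backend name, server lines and addresses are assumptions; the real file will differ):
backend k8s-api-servers
    # depooled during the 1.28 upgrade
    #server toolsbeta-test-k8s-control-11 <ip>:6443 check
    #server toolsbeta-test-k8s-control-12 <ip>:6443 check
    server toolsbeta-test-k8s-control-10 <ip>:6443 check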
Issues:
- We might want to upgrade the pause image:
W0905 14:29:47.959951 961216 checks.go:835] detected that the sandbox image "docker-registry.tools.wmflabs.org/pause:3.1" of the container runtime is inconsistent with that used by kubeadm. It is recommended that using "registry.k8s.io/pause:3.9" as the CRI sandbox image.
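If we do bump it, the sandbox image is set in containerd's CRI config on each node. A sketch of how to check and apply a change (assumes the default /etc/containerd/config.toml layout and that a pause:3.9 image is mirrored in the tools registry):
# show which sandbox image containerd is currently configured to use
grep sandbox_image /etc/containerd/config.toml
# after pointing it at the mirrored pause:3.9 image, restart containerd to pick it up
sudo systemctl restart containerd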
toolsbeta-test-k8s-control-10
- [] run upgrade node cookbook
- [] check that services start healthy
- [] depool control-11 and -12 via haproxy, check that control-10 is still doing ok
toolsbeta-test-k8s-control-11
- [] run upgrade node cookbook
- [] check that services start healthy
- [] depool control-12 and -10 via haproxy, check that control-11 is still doing ok
toolsbeta-test-k8s-control-12
- [] run upgrade node cookbook
- [] check that services start healthy
- [] depool control-10 and -11 via haproxy, check that control-12 is still doing ok
worker nodes
- run upgrade node cookbook for each
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.27.16 --dst-version 1.28.14 --cluster-name toolsbeta --hostname <worker_node_name>
- [] toolsbeta-test-k8s-worker-nfs-5
- [] toolsbeta-test-k8s-worker-nfs-7
- [] toolsbeta-test-k8s-worker-nfs-8
- [] toolsbeta-test-k8s-worker-nfs-9
- [] toolsbeta-test-k8s-worker-12
- [] toolsbeta-test-k8s-worker-13
ingress nodes
- run upgrade node cookbook for each
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.27.16 --dst-version 1.28.14 --cluster-name toolsbeta --hostname <ingress_node_name>
- [] toolsbeta-test-k8s-ingress-10
- [] toolsbeta-test-k8s-ingress-11
- [] toolsbeta-test-k8s-ingress-9
cleanup
- [] remove downtime
- [] revert topic change
Tools
- get list of nodes
root@tools-k8s-control-7:~# for node in $(kubectl get nodes -o json | jq '.items[].metadata.name' -r); do echo "* [] $node"; done
prep
- [] run prepare upgrade cookbook
~ $ sudo cookbook wmcs.toolforge.k8s.prepare_upgrade --cluster-name tools --src-version 1.27.16 --dst-version 1.28.14 --task-id <id_of_task>
- [] downtime project via https://prometheus-alerts.wmcloud.org/?q=team%3Dwmcs
- [] update topic on -cloud
control nodes
- run upgrade node cookbook
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.27.16 --dst-version 1.28.14 --cluster-name tools --hostname <control_node_name>
- check that services start healthy (a quick health-check sketch follows, after the revert commands)
- depool control-<x> and <y> via haproxy, check that control-<z> is still doing ok
ssh tools-k8s-haproxy-5.tools.eqiad1.wikimedia.cloud
sudo puppet agent --disable "<user> k8s upgrade"
sudo nano /etc/haproxy/conf.d/k8s-api-servers.cfg
sudo systemctl reload haproxy
check:
echo "show stat" | sudo socat stdio /run/haproxy/haproxy.sock | grep k8s-api
revert:
sudo puppet agent --enable
sudo run-puppet-agent
sudo systemctl reload haproxy
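A quick sanity check that an upgraded control node came back healthy (a sketch; run as root on the upgraded node, where kubectl is already configured as in the node-listing command above):
systemctl status kubelet
kubectl get nodes
kubectl get pods -n kube-system -o wide | grep $(hostname)
kubelet should be active, the upgraded node should be Ready and reporting v1.28.14, and its kube-system pods (apiserver, etcd, scheduler, controller-manager) should all be Running.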
tools-k8s-control-7
- [] run upgrade node cookbook
- [] check that services start healthy
- [] depool control-8 and -9 via haproxy, check that control-7 is still doing ok
tools-k8s-control-8
- [] run upgrade node cookbook
- [] check that services start healthy
- [] depool control-7 and -9 via haproxy, check that control-8 is still doing ok
tools-k8s-control-9
- [] run upgrade node cookbook
- [] check that services start healthy
- [] depool control-7 and -8 via haproxy, check that control-9 is still doing ok
worker nodes
- run upgrade node cookbook for each; it's ok to do a couple in parallel (see the batch sketch after the command below)
sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.27.16 --dst-version 1.28.14 --cluster-name tools --hostname <worker_node_name>
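A minimal sketch for working through a batch of workers sequentially (run a couple of these loops in separate terminals for some parallelism; the two node names are just examples from the list below):
for node in tools-k8s-worker-102 tools-k8s-worker-103; do sudo cookbook wmcs.toolforge.k8s.worker.upgrade --task-id <id_of_task> --src-version 1.27.16 --dst-version 1.28.14 --cluster-name tools --hostname "$node"; done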
- [] tools-k8s-worker-102
- [] tools-k8s-worker-103
- [] tools-k8s-worker-105
- [] tools-k8s-worker-106
- [] tools-k8s-worker-107
- [] tools-k8s-worker-108
- [] tools-k8s-worker-nfs-1
- [] tools-k8s-worker-nfs-10
- [] tools-k8s-worker-nfs-11
- [] tools-k8s-worker-nfs-12
- [] tools-k8s-worker-nfs-13
- [] tools-k8s-worker-nfs-14
- [] tools-k8s-worker-nfs-16
- [] tools-k8s-worker-nfs-17
- [] tools-k8s-worker-nfs-19
- [] tools-k8s-worker-nfs-2
- [] tools-k8s-worker-nfs-21
- [] tools-k8s-worker-nfs-22
- [] tools-k8s-worker-nfs-23
- [] tools-k8s-worker-nfs-24
- [] tools-k8s-worker-nfs-26
- [] tools-k8s-worker-nfs-27
- [] tools-k8s-worker-nfs-3
- [] tools-k8s-worker-nfs-32
- [] tools-k8s-worker-nfs-33
- [] tools-k8s-worker-nfs-34
- [] tools-k8s-worker-nfs-35
- [] tools-k8s-worker-nfs-36
- [] tools-k8s-worker-nfs-37
- [] tools-k8s-worker-nfs-38
- [] tools-k8s-worker-nfs-39
- [] tools-k8s-worker-nfs-40
- [] tools-k8s-worker-nfs-41
- [] tools-k8s-worker-nfs-42
- [] tools-k8s-worker-nfs-43
- [] tools-k8s-worker-nfs-44
- [] tools-k8s-worker-nfs-45
- [] tools-k8s-worker-nfs-46
- [] tools-k8s-worker-nfs-47
- [] tools-k8s-worker-nfs-48
- [] tools-k8s-worker-nfs-5
- [] tools-k8s-worker-nfs-50
- [] tools-k8s-worker-nfs-53
- [] tools-k8s-worker-nfs-54
- [] tools-k8s-worker-nfs-55
- [] tools-k8s-worker-nfs-57
- [] tools-k8s-worker-nfs-58
- [] tools-k8s-worker-nfs-61
- [] tools-k8s-worker-nfs-65
- [] tools-k8s-worker-nfs-66
- [] tools-k8s-worker-nfs-67
- [] tools-k8s-worker-nfs-68
- [] tools-k8s-worker-nfs-69
- [] tools-k8s-worker-nfs-7
- [] tools-k8s-worker-nfs-70
- [] tools-k8s-worker-nfs-71
- [] tools-k8s-worker-nfs-72
- [] tools-k8s-worker-nfs-73
- [] tools-k8s-worker-nfs-74
- [] tools-k8s-worker-nfs-75
- [] tools-k8s-worker-nfs-76
- [] tools-k8s-worker-nfs-8
- [] tools-k8s-worker-nfs-9
ingress nodes
- [] kubectl -n ingress-nginx-gen2 scale deployment ingress-nginx-gen2-controller --replicas=2
run upgrade node cookbook for each:
- [] tools-k8s-ingress-7
- [] tools-k8s-ingress-8
- [] tools-k8s-ingress-9
- [] revert afterwards: kubectl -n ingress-nginx-gen2 scale deployment ingress-nginx-gen2-controller --replicas=3
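To confirm the scale change took effect and see where the controller replicas are running (run from a control node, same as the scale commands above):
kubectl -n ingress-nginx-gen2 get deployment ingress-nginx-gen2-controller
kubectl -n ingress-nginx-gen2 get pods -o wide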
cleanup
- [] remove downtime
- [] revert topic change