From e2603b41413756294535643385603fed128e23a3 Mon Sep 17 00:00:00 2001 From: Andrew Durbin Date: Thu, 2 Jan 2025 17:21:25 -0700 Subject: [PATCH] Kubevirt: Run Descheduler on node boot to rebalance cluster apps Update_RunDeschedulerOnBoot will run the descheduler to evict pods from the edge node on boot. This is to allow rebalancing apps via re-scheduling them with an aim to meet affinity as specified in the pod config. This path includes a series of gates to ensure the destination node is available as a scheduling destination. - Wait for the kubernetes api to be available. - Wait until node is online and uncordoned. - Wait until infrastructure is ready (kubevirt/longhorn). Signed-off-by: Andrew Durbin --- pkg/kube/cluster-init.sh | 1 + pkg/kube/cluster-update.sh | 50 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/pkg/kube/cluster-init.sh b/pkg/kube/cluster-init.sh index e73141b784..324183ffea 100755 --- a/pkg/kube/cluster-init.sh +++ b/pkg/kube/cluster-init.sh @@ -934,6 +934,7 @@ fi check_and_remove_excessive_k3s_logs check_and_run_vnc Update_CheckClusterComponents + Update_RunDeschedulerOnBoot wait_for_item "wait" sleep 15 done diff --git a/pkg/kube/cluster-update.sh b/pkg/kube/cluster-update.sh index d428af9a40..372b971a94 100644 --- a/pkg/kube/cluster-update.sh +++ b/pkg/kube/cluster-update.sh @@ -47,6 +47,12 @@ trigger_k3s_selfextraction() { /usr/bin/k3s check-config >> "$INSTALL_LOG" 2>&1 } +# shellcheck source=pkg/kube/descheduler-utils.sh +. /usr/bin/descheduler-utils.sh + +EdgeNodeInfoPath="/persist/status/zedagent/EdgeNodeInfo/global.json" +COMP_UPDATE_PATH="/usr/bin/update-component" + link_multus_into_k3s() { ln -s /var/lib/cni/bin/multus /var/lib/rancher/k3s/data/current/bin/multus } @@ -141,6 +147,49 @@ Update_CheckClusterComponents() { wait_for_item "update_cluster_post" } +# Update_RunDeschedulerOnBoot will run the descheduler to evict pods from the edge node +# on boot. 
This is to allow rebalancing apps via re-scheduling them with an aim to meet
+# affinity as specified in the pod config. Each gate below returns 0 without acting
+# when its condition is not met, leaving the marker unset so a later call retries.
+Update_RunDeschedulerOnBoot() {
+    # Currently only run once per boot, and only once the node info file exists
+    if [ -f /tmp/descheduler-ran-onboot ] || [ ! -f "$EdgeNodeInfoPath" ]; then
+        return 0
+    fi
+    # is api ready
+    if ! update_isClusterReady; then
+        return 0
+    fi
+    # Don't run unless it has been installed
+    if ! descheduler_install; then
+        return 0
+    fi
+    # node ready and allowing scheduling
+    node=$(jq -r '.DeviceName' < "$EdgeNodeInfoPath" | tr -d '\n' | tr '[:upper:]' '[:lower:]')
+    node_count_ready=$(kubectl get "node/${node}" | grep -v SchedulingDisabled | grep -cw Ready)
+    if [ "$node_count_ready" -ne 1 ]; then
+        return 0
+    fi
+    # Ensure all infrastructure pods are online on node. jq prints one true/false
+    # per daemonset, so this expects three ready in longhorn and one in kubevirt.
+    lhStatus=$(kubectl -n longhorn-system get daemonsets -o json | jq '.items[].status | .numberReady==.desiredNumberScheduled' | tr -d '\n')
+    if [ "$lhStatus" != "truetruetrue" ]; then
+        return 0
+    fi
+    kvStatus=$(kubectl -n kubevirt get daemonsets -o json | jq '.items[].status | .numberReady==.desiredNumberScheduled' | tr -d '\n')
+    if [ "$kvStatus" != "true" ]; then
+        return 0
+    fi
+    # Job lives persistently in cluster, cleanup after old runs (retry later if stuck)
+    if kubectl -n kube-system get job/descheduler-job; then
+        kubectl -n kube-system delete job/descheduler-job || return 0
+    fi
+    # Mark this boot as done only when the job was accepted, so failures are retried
+    if kubectl apply -f /etc/descheduler-job.yaml; then
+        touch /tmp/descheduler-ran-onboot
+    fi
+}
+
 update_isClusterReady() {
     if ! kubectl cluster-info; then
         return 1
@@ -152,6 +201,7 @@ update_isClusterReady() {
     return 0
 }
 
+
 #
 # Handle kube component updates
 #