Skip to content

Commit

Permalink
Kubevirt: Run Descheduler on node boot to rebalance cluster apps
Browse files Browse the repository at this point in the history
Update_RunDeschedulerOnBoot will run the descheduler to
evict pods from the edge node on boot.
This is to allow rebalancing apps via re-scheduling them
with an aim to meet affinity as specified in the pod config.

This path includes a series of gates to ensure the destination
node is available as a scheduling destination.
- Wait for the Kubernetes API to be available.
- Wait until the node is online and uncordoned.
- Wait until the infrastructure is ready (KubeVirt/Longhorn).

Signed-off-by: Andrew Durbin <[email protected]>
  • Loading branch information
andrewd-zededa committed Feb 5, 2025
1 parent 8af3f5e commit e2603b4
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/kube/cluster-init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,7 @@ fi
check_and_remove_excessive_k3s_logs
check_and_run_vnc
Update_CheckClusterComponents
Update_RunDeschedulerOnBoot
wait_for_item "wait"
sleep 15
done
50 changes: 50 additions & 0 deletions pkg/kube/cluster-update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ trigger_k3s_selfextraction() {
/usr/bin/k3s check-config >> "$INSTALL_LOG" 2>&1
}

# Pull in the descheduler helper functions.
# NOTE(review): presumably this is what defines descheduler_install, which
# Update_RunDeschedulerOnBoot calls -- confirm against descheduler-utils.sh.
# shellcheck source=pkg/kube/descheduler-utils.sh
. /usr/bin/descheduler-utils.sh

# JSON status file from which Update_RunDeschedulerOnBoot reads the
# '.DeviceName' field to derive this node's kubernetes node name.
EdgeNodeInfoPath="/persist/status/zedagent/EdgeNodeInfo/global.json"
# Path of the component update binary.
# NOTE(review): its callers are outside this view -- confirm how it is invoked.
COMP_UPDATE_PATH="/usr/bin/update-component"

# link_multus_into_k3s creates a symbolic link to the multus CNI binary
# inside the bin directory of the current k3s data dir.
link_multus_into_k3s() {
    ln -s \
        "/var/lib/cni/bin/multus" \
        "/var/lib/rancher/k3s/data/current/bin/multus"
}
Expand Down Expand Up @@ -141,6 +147,49 @@ Update_CheckClusterComponents() {
wait_for_item "update_cluster_post"
}

# Update_RunDeschedulerOnBoot applies the descheduler job once per boot so that
# pods can be evicted from this edge node and re-scheduled with the aim of
# meeting the affinity specified in the pod config.
#
# The function is a sequence of gates. Every gate that is not yet satisfied
# returns early WITHOUT recording completion, so the function can simply be
# called again later. Completion is recorded (marker file
# /tmp/descheduler-ran-onboot) only after the job has actually been applied.
#
# Globals:   EdgeNodeInfoPath (read) - JSON file holding '.DeviceName'
# Arguments: none
# Returns:   0 on every early-return path; otherwise the status of the final
#            'touch' of the marker file
Update_RunDeschedulerOnBoot() {
    # Currently only run once per boot
    if [ -f /tmp/descheduler-ran-onboot ]; then
        return
    fi

    # Quoted on purpose: with an empty/unset path an unquoted test collapses
    # to '[ ! -f ]', which is false and would let execution fall through.
    if [ ! -f "$EdgeNodeInfoPath" ]; then
        return
    fi
    # is api ready
    if ! update_isClusterReady; then
        return
    fi
    # Don't run unless it has been installed
    if ! descheduler_install; then
        return
    fi
    # node ready and allowing scheduling
    # The node name is the lower-cased device name.
    node=$(jq -r '.DeviceName' < "$EdgeNodeInfoPath" | tr -d '\n' | tr '[:upper:]' '[:lower:]')
    # jq -r prints "null" for a missing field; neither that nor an empty
    # string is a usable node name.
    if [ -z "$node" ] || [ "$node" = "null" ]; then
        return
    fi
    # Count lines that report Ready (whole word, so NotReady does not match)
    # and are not cordoned (SchedulingDisabled).
    node_count_ready=$(kubectl get "node/${node}" | grep -v SchedulingDisabled | grep -cw Ready)
    if [ "$node_count_ready" -ne 1 ]; then
        return
    fi
    # Ensure all infrastructure pods are online on node
    # One "true" is emitted per daemonset whose numberReady equals
    # desiredNumberScheduled; exactly three ready daemonsets are required in
    # longhorn-system and exactly one in kubevirt.
    lhStatus=$(kubectl -n longhorn-system get daemonsets -o json | jq '.items[].status | .numberReady==.desiredNumberScheduled' | tr -d '\n')
    if [ "$lhStatus" != "truetruetrue" ]; then
        return
    fi
    kvStatus=$(kubectl -n kubevirt get daemonsets -o json | jq '.items[].status | .numberReady==.desiredNumberScheduled' | tr -d '\n')
    if [ "$kvStatus" != "true" ]; then
        return
    fi
    # Job lives persistently in cluster, cleanup after old runs
    if kubectl -n kube-system get job/descheduler-job; then
        # If the old job cannot be removed, do not record completion;
        # a later call retries.
        if ! kubectl -n kube-system delete job/descheduler-job; then
            return
        fi
    fi
    # Only mark the run as done once the job was really submitted; a failed
    # apply must not disable the descheduler for the rest of this boot.
    if ! kubectl apply -f /etc/descheduler-job.yaml; then
        return
    fi
    touch /tmp/descheduler-ran-onboot
}

update_isClusterReady() {
if ! kubectl cluster-info; then
return 1
Expand All @@ -152,6 +201,7 @@ update_isClusterReady() {
return 0
}


#
# Handle kube component updates
#
Expand Down

0 comments on commit e2603b4

Please sign in to comment.