Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kubevirt: upgrade k3s,multus,kubevirt,cdi,longhorn #4501

Merged
merged 2 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
1 change: 1 addition & 0 deletions .spdxignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ pkg/kube/descheduler_rbac.yaml
pkg/kube/lh-cfg-v1.6.2.yaml
pkg/vtpm/swtpm-vtpm/vendor/
pkg/dom0-ztools/rootfs/usr/bin/rungetty.sh
pkg/kube/update-component/vendor/
7 changes: 7 additions & 0 deletions pkg/kube/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,20 @@ COPY cert-gen /plugins/cert-gen
WORKDIR /plugins/cert-gen
RUN GO111MODULE=on CGO_ENABLED=0 go build -v -ldflags "-s -w" -o /out/usr/bin/cert-gen .

COPY update-component /plugins/update-component
WORKDIR /plugins/update-component
RUN GO111MODULE=on go build -v -ldflags "-s -w" -mod=vendor -o /out/usr/bin/update-component .

FROM scratch
COPY --from=build /out/ /
COPY cluster-init.sh /usr/bin/
COPY cluster-utils.sh /usr/bin/
COPY cgconfig.conf /etc

# upgrades
COPY cluster-update.sh /usr/bin/
COPY update-component/expected_versions.yaml /etc/
COPY update-component/settings_longhorn.yaml /etc/

# k3s
COPY install-etcdctl.sh /usr/bin/
Expand Down
3 changes: 3 additions & 0 deletions pkg/kube/cluster-init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,8 @@ logmsg "Using ZFS persistent storage"

setup_prereqs

Update_CheckNodeComponents


if [ -f /var/lib/convert-to-single-node ]; then
logmsg "remove /var/lib and copy saved single node /var/lib"
Expand Down Expand Up @@ -931,6 +933,7 @@ fi
check_kubeconfig_yaml_files
check_and_remove_excessive_k3s_logs
check_and_run_vnc
Update_CheckClusterComponents
wait_for_item "wait"
sleep 15
done
215 changes: 215 additions & 0 deletions pkg/kube/cluster-update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,222 @@
#
# Copyright (c) 2024 Zededa, Inc.
# SPDX-License-Identifier: Apache-2.0
# K3s release this script installs (passed to the k3s installer as
# INSTALL_K3S_VERSION) and compares against the output of k3s_get_version.
K3S_VERSION=v1.28.5+k3s1

#
# Handle any migrations needed due to updated cluster-init.sh
# KUBE_VERSION is expected to be bumped any time:
# - a migration is needed (new path for something)
# - a version bump of: K3s, multus, kubevirt, cdi, longhorn
#
# APPLIED_KUBE_VERSION_PATH is the file that records the applied version:
# it is written by update_Version_Set and read by update_Version_Get.
#
KUBE_VERSION=1
APPLIED_KUBE_VERSION_PATH="/var/lib/applied-kube-version"
# update_Version_Set <version>
# Record <version> as the applied kube version by overwriting
# $APPLIED_KUBE_VERSION_PATH with it (one line, newline terminated).
update_Version_Set() {
    version="${1}"
    echo "${version}" >"${APPLIED_KUBE_VERSION_PATH}"
}

# update_Version_Get
# Print the applied kube version stored in $APPLIED_KUBE_VERSION_PATH.
# Prints "0" when that file does not exist yet (first boot).
# Returns: 0 on first boot, otherwise the exit status of cat.
update_Version_Get() {
    if [ ! -f "$APPLIED_KUBE_VERSION_PATH" ]; then
        # First Boot: nothing has been applied yet. Return right away so we
        # do not fall through and cat a file that is known to be missing
        # (which only produced an error message and a non-zero status).
        echo "0"
        return 0
    fi
    cat "$APPLIED_KUBE_VERSION_PATH"
}

#
# update_Failed()
# Report failure if Status == COMP_STATUS_FAILED and DestinationKubeUpdateVersion == KUBE_VERSION
# This allows:
# - update retry control for a given version
# - recovery update if the eve os version is updated to another release (with a different cluster-init.sh)
#
# Returns: 0 when the published status records a failed update to
#          $KUBE_VERSION, 1 otherwise (including when no status file exists).
#
UPDATE_STATUS_PATH=/persist/status/zedkube/KubeClusterUpdateStatus/global.json
update_Failed() {
    if [ ! -f "$UPDATE_STATUS_PATH" ]; then
        return 1
    fi
    # Status 4 is what the header above calls COMP_STATUS_FAILED.
    # jq --arg always binds $gen as a *string*, so the stored version is
    # converted with tostring before comparing; this matches whether the
    # publisher serialized DestinationKubeUpdateVersion as a JSON number or
    # as a JSON string.
    if [ "$(jq --arg gen "$KUBE_VERSION" '.Status==4 and (.DestinationKubeUpdateVersion|tostring)==$gen' < "$UPDATE_STATUS_PATH")" = "true" ]; then
        return 0
    fi
    return 1
}

# trigger_k3s_selfextraction
# Invoke a k3s CLI subcommand (check-config) so that, per the original
# author's note, the k3s binaries get self-extracted. Both stdout and stderr
# of the command are appended to $INSTALL_LOG.
trigger_k3s_selfextraction() {
    /usr/bin/k3s check-config 1>>"$INSTALL_LOG" 2>&1
}

# link_multus_into_k3s
# Make the multus CNI binary visible inside the current k3s data directory by
# symlinking it into .../data/current/bin.
link_multus_into_k3s() {
    # -f: replace a link left behind by an earlier run so that calling this
    # again (e.g. a retried k3s install) does not fail with "File exists".
    ln -sf /var/lib/cni/bin/multus /var/lib/rancher/k3s/data/current/bin/multus
}

# update_k3s
# Download and install k3s $K3S_VERSION into /var/lib/k3s/bin using the
# upstream installer (service enable/start are skipped), link the installed
# binaries into /usr/bin, trigger the k3s self-extraction, link multus into
# the k3s data dir and finally drop the /var/lib/k3s_installed_unpacked marker.
#
# NOTE(review): the installer pipeline's result is not checked here and the
# marker file is touched unconditionally; Update_CheckNodeComponents verifies
# the installed version afterwards. Confirm whether other callers rely on the
# marker before changing that.
update_k3s() {
    logmsg "Installing K3S version $K3S_VERSION"
    mkdir -p /var/lib/k3s/bin

    # Installer options are handed over through the environment of "sh -".
    /usr/bin/curl -sfL https://get.k3s.io |
        INSTALL_K3S_VERSION=${K3S_VERSION} \
        INSTALL_K3S_SKIP_ENABLE=true \
        INSTALL_K3S_SKIP_START=true \
        INSTALL_K3S_BIN_DIR=/var/lib/k3s/bin \
        sh -
    sleep 5

    logmsg "Initializing K3S version $K3S_VERSION"
    ln -s /var/lib/k3s/bin/* /usr/bin
    trigger_k3s_selfextraction
    link_multus_into_k3s
    touch /var/lib/k3s_installed_unpacked
}

# k3s_get_version: print the installed k3s version in the form "vW.X.Y+k3sZ"
# (no trailing newline). When /var/lib/k3s/bin/k3s is not present the
# placeholder "v0.0.0+k3s0" is printed instead.
k3s_get_version() {
    if [ -f /var/lib/k3s/bin/k3s ]; then
        # Third field of the line starting with "k3s" in the --version output.
        /var/lib/k3s/bin/k3s --version | awk '$1=="k3s" {print $3}' | tr -d '\n'
    else
        echo "v0.0.0+k3s0"
    fi
}

# Run on every boot before k3s starts.
# Does nothing when the applied kube version already equals $KUBE_VERSION or
# when update_Failed reports a failed update. Otherwise, if the installed k3s
# version differs from $K3S_VERSION, (re)installs k3s through update_k3s and
# publishes the outcome through publishUpdateStatus.
Update_CheckNodeComponents() {
    applied_version=$(update_Version_Get)
    if [ "$KUBE_VERSION" = "$applied_version" ] || update_Failed; then
        return 0
    fi
    logmsg "update_HandleNode: version:$KUBE_VERSION appliedversion:$applied_version continuing"

    # Handle version specific node migrations here

    # Handle node specific updates, just k3s for now
    if [ "$(k3s_get_version)" = "$K3S_VERSION" ]; then
        # Already on the wanted k3s release, nothing to install.
        return 0
    fi

    publishUpdateStatus "k3s" "download"
    update_k3s

    # Re-read the version to find out whether the install actually took.
    current_k3s_version=$(k3s_get_version)
    if [ "$current_k3s_version" = "$K3S_VERSION" ]; then
        logmsg "k3s installed and unpacked or copied"
        publishUpdateStatus "k3s" "completed"
    else
        logmsg "k3s version mismatch after install:$current_k3s_version"
        publishUpdateStatus "k3s" "failed" "version mismatch after install:$current_k3s_version"
    fi
}

# Run on every boot after k3s is started.
# Skips all work when the applied kube version already equals $KUBE_VERSION,
# when update_Failed reports a failed update, or when the cluster is not
# ready. Otherwise walks the cluster-wide components in order, waits for each
# to be ready on its existing version and upgrades the ones that are not
# running the expected version. The first failed upgrade stops the walk.
#
# $KUBE_VERSION is recorded as applied only when no component upgrade failed;
# recording it after a failure would make the next run believe the update was
# applied and would bypass the update_Failed retry control.
Update_CheckClusterComponents() {
    wait_for_item "update_cluster_pre"

    applied_version=$(update_Version_Get)
    if [ "$KUBE_VERSION" = "$applied_version" ]; then
        return
    fi

    if update_Failed; then
        return
    fi

    if ! update_isClusterReady; then
        return
    fi
    logmsg "update_HandleCluster: version:$KUBE_VERSION appliedversion:$applied_version continuing"

    # Name of the component whose upgrade failed, empty while all is well.
    update_failed_comp=""

    # Handle cluster wide component updates
    for comp in multus kubevirt cdi longhorn; do
        # Never start an upgrade on top of a component that is not healthy.
        while ! update_Component_CheckReady "$comp"; do
            logmsg "Component: $comp not ready on existing version"
            sleep 60
        done
        logmsg "Component: $comp ready on existing version"
        if update_Component_IsRunningExpectedVersion "$comp"; then
            logmsg "Component:$comp running expected version, continuing"
            publishUpdateStatus "$comp" "completed"
            continue
        fi
        if ! update_Component "$comp"; then
            logmsg "Not continuing with further updates after component:${comp} update failed"
            update_failed_comp=$comp
            break
        fi
    done

    if [ -z "$update_failed_comp" ]; then
        update_Version_Set "$KUBE_VERSION"
    fi
    wait_for_item "update_cluster_post"
}

# update_isClusterReady
# Returns 0 only when both "kubectl cluster-info" and the update-component
# API readiness probe succeed; returns 1 as soon as either of them fails.
update_isClusterReady() {
    kubectl cluster-info || return 1
    update_Helper_APIResponding || return 1
    return 0
}

#
# Handle kube component updates
#
# COMP_UPDATE_PATH is the update-component helper executable that the
# update_Helper_* / update_Component* functions in this file invoke.
COMP_UPDATE_PATH="/usr/bin/update-component"

# update_Helper_APIResponding
# Ask the update-component helper whether the kubernetes API is responding.
# Returns: 0 when the helper exits successfully, 1 otherwise.
update_Helper_APIResponding() {
    # Quoted so the helper path is never word-split or glob-expanded.
    if "$COMP_UPDATE_PATH" --check-api-ready; then
        return 0
    fi
    return 1
}
# update_Component_CheckReady <component>
# Ask the update-component helper whether <component> is ready.
# Returns: 0 when the helper exits successfully, 1 otherwise.
update_Component_CheckReady() {
    comp=$1
    # Quoted so the helper path is never word-split or glob-expanded.
    if "$COMP_UPDATE_PATH" --versions-file /etc/expected_versions.yaml --component "$comp" --check-comp-ready; then
        return 0
    fi
    return 1
}
# update_Component_Uptime <component>
# Run the update-component helper with --get-uptime for <component>; whatever
# the helper prints goes to stdout and its exit status is returned unchanged.
update_Component_Uptime() {
    comp=$1
    # Quoted so the helper path is never word-split or glob-expanded.
    "$COMP_UPDATE_PATH" --versions-file /etc/expected_versions.yaml --component "$comp" --get-uptime
}
# update_Component_IsRunningExpectedVersion <component>
# Ask the update-component helper to compare the running version of
# <component> with the one listed in /etc/expected_versions.yaml.
# Returns: 0 when the helper exits successfully, 1 otherwise.
update_Component_IsRunningExpectedVersion() {
    comp=$1
    # Quoted so the helper path is never word-split or glob-expanded.
    if "$COMP_UPDATE_PATH" --versions-file /etc/expected_versions.yaml --component "$comp" --compare; then
        return 0
    fi
    return 1
}

# update_Component <component>
# Publish "in_progress", then run the update-component helper with --upgrade
# for <component>. Publishes "completed" on success; on failure logs a pointer
# to the upgrade log and publishes "failed".
# Returns: 0 when the helper succeeded, 1 otherwise.
update_Component() {
    comp=$1
    # Run go app to check and apply updates and block until new version is ready
    publishUpdateStatus "$comp" "in_progress"
    # Quoted so the helper path is never word-split or glob-expanded.
    if "$COMP_UPDATE_PATH" --versions-file /etc/expected_versions.yaml --component "$comp" --upgrade; then
        publishUpdateStatus "$comp" "completed"
        return 0
    fi
    upgrade_log_path="/persist/kubelog/upgrade-component.log"
    logmsg "update_Component comp:${comp} error starting update, see $upgrade_log_path"
    publishUpdateStatus "$comp" "failed" "error in $upgrade_log_path"
    return 1
}

# publishUpdateStatus <component> <status> [error string]
# Publish the update status of <component> for this node by invoking pillar's
# zedkube binary (pubKubeClusterUpdateStatus) with $KUBE_VERSION as the
# destination version. Nothing is published while the applied version is "0".
# A non-zero exit of zedkube is only logged.
publishUpdateStatus() {
    component=$1
    status=$2
    # The error string is optional. The previous check used the file test -x
    # ("exists and is executable") where a plain non-empty string check was
    # meant, which dropped any error text that happened to name an executable.
    errorstr=""
    if [ -n "${3:-}" ]; then
        errorstr=$3
    fi

    # If gen==0, then we are in the initial boot not updating, just installing first versions at most-likely first
    # boot of the device. Don't publish as this will trigger zedagent to claim baseos_updating.
    cur_version=$(update_Version_Get)
    if [ "$cur_version" = "0" ]; then
        return
    fi

    node=$(jq -r '.DeviceName' < /persist/status/zedagent/EdgeNodeInfo/global.json | tr -d '\n')
    logmsg "publishUpdateStatus() $node $component $status"

    # zedkube lives in the pillar container rootfs and needs pillar's libraries.
    pillarRootfs=/hostfs/containers/services/pillar/rootfs
    LD_LIBRARY_PATH=${pillarRootfs}/usr/lib/ ${pillarRootfs}/opt/zededa/bin/zedkube pubKubeClusterUpdateStatus "$node" "$component" "$status" "$KUBE_VERSION" "$errorstr"
    rc=$?
    if [ "$rc" -ne 0 ]; then
        logmsg "publishUpdateStatus() $node $component $status in error:$rc"
    fi
}
90 changes: 90 additions & 0 deletions pkg/kube/update-component/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Kube "update-component" helper tool

update-component is a short-lived utility process that provides a convenient interface for Kubernetes
component upgrades and status information.

The interface specifies a series of generalized upgrade methods. Backend handlers are implemented for the
"infrastructure" components that the kube service currently installs in HV=kubevirt EVE builds.

Supported Components: cdi, kubevirt, longhorn, multus

Upgrade Interface:

1. GetVersion - returns a string version
1. UpgradeSupported - accepts a source and a destination version and checks with the component backend
   whether the upgrade is supported. Some components enforce a strict maximum version distance for upgrades,
   e.g. v1.0.0 -> v3.1.0 is not supported.
1. Uptime - returns the time a component has been ready at a given version
1. Ready - returns nil if the component is online
1. UpgradeStart - initiates a component upgrade to requested version.

## Options

### General Arguments

--component : string component name
--versions-file : path to a single level yaml file defining a list of `<component> : "<expected version>"`

### Optional Arguments

-f Force: skip uptime checks and version constraints

### Check Kubernetes API Ready "--check-api-ready"

Check if the Kubernetes API is responding (rc 0 for success), e.g.:

```sh
$ /usr/bin/update-component --check-api-ready
$ echo $?
0
```

### Check Component Ready "--check-comp-ready"

Check if the component is ready, according to its daemonsets (rc 0 for success), e.g.:

```sh
$ /usr/bin/update-component --versions-file /etc/expected_versions.yaml --component longhorn --check-comp-ready
$ echo $?
0
```

### Check Component Uptime "--get-uptime"

Print the component uptime in seconds, e.g.:

```sh
$ /usr/bin/update-component --versions-file /etc/expected_versions.yaml --component longhorn --get-uptime
623011
```

### Compare Component Version Against Expected "--compare"

Only compare the running version against the expected version: rc 0 if they match, rc 1 if they do not, e.g.:

```sh
$ /usr/bin/update-component --versions-file /etc/expected_versions.yaml --component longhorn --compare
$ echo $?
0
```

### Execute Component Upgrade "--upgrade"

Begin the component upgrade to the version listed for it in --versions-file, e.g.:

```sh
$ /usr/bin/update-component --versions-file /etc/expected_versions.yaml --component "$comp" --upgrade
$ echo $?
0
```

## Logging

By default this tool logs to /persist/kubelog/upgrade-component.log

Example Output:

```text
2024/11/19 19:44:30 Component:multus ready:true running:v3.9.3 expected_version:v3.9.3 uptime_seconds:569.930566
2024/11/19 19:44:32 Component:kubevirt ready:true running:v1.1.0-dirty expected_version:v1.1.0-dirty uptime_seconds:478.254250
2024/11/19 19:44:33 Component:cdi ready:true running:v1.57.1 expected_version:v1.57.1 uptime_seconds:499.523674
2024/11/19 19:44:34 Component:longhorn ready:true running:v1.6.3 expected_version:v1.6.3 uptime_seconds:553.801213
```

## EVE Runtime Usage

After the kube service container has started and k3s has been started, the main run loop will call
Update_CheckClusterComponents which checks a series of prerequisites:

- the applied overall kube version (integer in /var/lib/applied-kube-version) differs from the requested version (KUBE_VERSION) defined in cluster-update.sh
- the previous update to the requested version has not been marked as failed

If both of the above checks pass and the cluster is ready, cluster-update proceeds to check component health and initiates upgrades serially.
After all component upgrades are complete, the applied overall kube version is set to the requested version.
Loading
Loading