From 62687a55feb4f8b73e6984a013d619ace050b730 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 26 Jul 2024 11:37:12 -0700 Subject: [PATCH] [spark-rapids] update for 2024/07 (#1204) * [spark-rapids] update for 2024/07 * no reboots * added dkms signing code * added utility functions for checking OS * cleaned up URL generation a bit * using full path to bdconfig so that the script works from `sudo -i` prompt * fixed error messages to indicate that secure boot with driver signing is available on 2.2 * cleaned up remove_old_backports * clean apt cache before exit * remove emacs temp file * Some nits * grammar correction maybe * saying nvidia-smi a lot ; add existence check before running it * remove upgrade_kernel function ; dataproc kernel version should never change. * I meant oldoldstable, not oldstable * DKMS packages sign with the wrong key, so installed key to the correct path * Repaired Ubuntu18 installer * reducing noise from credentials function * removing mok symlink on ubuntu * installing from .run file for ubuntu18 --- .gitignore | 3 + spark-rapids/spark-rapids.sh | 473 +++++++++++++++++++++++------------ 2 files changed, 321 insertions(+), 155 deletions(-) diff --git a/.gitignore b/.gitignore index dec37a9a6..7950ab8d6 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ # Ignore all bazel-* symlinks. There is no full list since this can change # based on the name of the directory bazel is cloned into. /bazel-* + +# Emacs +*~ diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index e0c73e77e..82a973aaa 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -20,15 +20,190 @@ set -euxo pipefail +function os_id() { + grep '^ID=' /etc/os-release | cut -d= -f2 | xargs +} + +function os_version() { + grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs +} + +function is_debian() { + [[ "$(os_id)" == 'debian' ]] +} + +function is_debian10() { + is_debian && [[ "$(os_version)" == '10'* ]] +} + +function is_debian11() { + is_debian && [[ "$(os_version)" == '11'* ]] +} + +function is_debian12() { + is_debian && [[ "$(os_version)" == '12'* ]] +} + +function is_ubuntu() { + [[ "$(os_id)" == 'ubuntu' ]] +} + +function is_ubuntu18() { + is_ubuntu && [[ "$(os_version)" == '18.04'* ]] +} + +function is_ubuntu20() { + is_ubuntu && [[ "$(os_version)" == '20.04'* ]] +} + +function is_ubuntu22() { + is_ubuntu && [[ "$(os_version)" == '22.04'* ]] +} + +function is_rocky() { + [[ "$(os_id)" == 'rocky' ]] +} + +function is_rocky8() { + is_rocky && [[ "$(os_version)" == '8'* ]] +} + +function is_rocky9() { + is_rocky && [[ "$(os_version)" == '9'* ]] +} + +function os_vercat() { + if is_ubuntu ; then + os_version | sed -e 's/[^0-9]//g' + elif is_rocky ; then + os_version | sed -e 's/[^0-9].*$//g' + else + os_version + fi +} + function get_metadata_attribute() { local -r attribute_name=$1 - local -r default_value=$2 + local -r default_value="${2:-}" /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" } +CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" +PSN="$(get_metadata_attribute private_secret_name)" +readonly PSN +function configure_dkms_certs() { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + else + modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3" + fi + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key modulus" + fi + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert modulus" + fi + + return + fi + + + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" + + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + # symlink private key and copy public cert from volatile storage for DKMS + if is_ubuntu ; then + mkdir -p /var/lib/shim-signed/mok + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv + cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der + else + mkdir -p /var/lib/dkms/ + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key + cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub + fi +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >2 + return 0 + fi + echo "WARN -- PURGING SIGNING MATERIAL -- WARN" >2 + echo "future dkms runs will not use correct signing key" >2 + rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv +} + +function add_contrib_components() { + if ! is_debian ; then + return + fi + if is_debian12 ; then + # Include in sources file components on which nvidia-open-kernel-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list + fi +} + +# Short name for nvidia urls +if is_rocky ; then + shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" +else + shortname="$(os_id)$(os_vercat)" +fi +readonly shortname + +# Detect dataproc image version from its various names +if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" +fi + # Fetch Linux Family distro and Dataproc Image version readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') -readonly DATAPROC_IMAGE_VERSION=$(/usr/share/google/get_metadata_value image|grep -Eo 'dataproc-[0-9]-[0-9]'|grep -Eo '[0-9]-[0-9]'|sed -e 's/-/./g') # Fetch SPARK config readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) @@ -58,9 +233,7 @@ CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 # EXCEPTIONS # Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) if [[ "${OS_NAME}" == "ubuntu" ]]; then - UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04 - UBUNTU_VERSION=${UBUNTU_VERSION%.*} - if [[ "${UBUNTU_VERSION}" == "18" ]]; then + if is_ubuntu18 ; then CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 @@ -164,6 +337,8 @@ EOF systemctl enable --now install-headers.service } +readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' +readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { @@ -171,82 +346,144 @@ function install_nvidia_gpu_driver() { ## common steps for all linux family distros readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*} - ## installation steps based OS_NAME - if [[ ${OS_NAME} == "debian" ]]; then + ## For Debian & Ubuntu + readonly LOCAL_INSTALLER_DEB="cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" + readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" + readonly DIST_KEYRING_DIR="/var/cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local" + + ## installation steps based OS + if is_debian ; then - DEBIAN_VERSION=$(lsb_release -r|awk '{print $2}') # 10 or 11 export DEBIAN_FRONTEND=noninteractive execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - readonly LOCAL_INSTALLER_DEB="cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb + "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb dpkg -i /tmp/local-installer.deb - cp /var/cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/ + rm /tmp/local-installer.deb + cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ - ## EXCEPTION - if [[ ${DEBIAN_VERSION} == 12 ]]; then - sed -i '0,/Components: main/s//& contrib/' /etc/apt/sources.list.d/debian.sources - fi + add_contrib_components - add-apt-repository contrib execute_with_retries "apt-get update" ## EXCEPTION - if [[ ${DEBIAN_VERSION} == 10 ]]; then - apt remove -y libglvnd0 - apt install -y ca-certificates-java + if is_debian10 ; then + apt-get remove -y libglvnd0 + apt-get install -y ca-certificates-java fi + configure_dkms_certs execute_with_retries "apt-get install -y -q nvidia-kernel-open-dkms" - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" + clear_dkms_key + execute_with_retries \ + "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" + execute_with_retries \ + "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" + + modprobe nvidia # enable a systemd service that updates kernel headers after reboot setup_systemd_update_headers - - elif [[ ${OS_NAME} == "ubuntu" ]]; then - UBUNTU_VERSION=$(lsb_release -r|awk '{print $2}') # 20.04 or 22.04 - UBUNTU_VERSION=${UBUNTU_VERSION%.*} # 20 or 22 + elif is_ubuntu ; then execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - readonly UBUNTU_REPO_CUDA_PIN="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-ubuntu${UBUNTU_VERSION}04.pin" - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 - - readonly LOCAL_INSTALLER_DEB="cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb - - dpkg -i /tmp/local-installer.deb - cp /var/cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/ - execute_with_retries "apt-get update" - - execute_with_retries "apt-get install -y -q --no-install-recommends nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open" - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" + # Ubuntu 18.04 is not supported by new style NV debs; install from .run files + github + if is_ubuntu18 ; then + + # fetch .run file + curl -o driver.run \ + "https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run" + # Install all but kernel driver + bash driver.run --no-kernel-modules --silent --install-libglvnd + rm driver.run + + WORKDIR=/opt/install-nvidia-driver + mkdir -p "${WORKDIR}" + pushd $_ + # Fetch open souce kernel module with corresponding tag + git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \ + --branch "${NVIDIA_DRIVER_VERSION}" --single-branch + cd ${WORKDIR}/open-gpu-kernel-modules + # + # build kernel modules + # + make -j$(nproc) modules \ + > /var/log/open-gpu-kernel-modules-build.log \ + 2> /var/log/open-gpu-kernel-modules-build_error.log + configure_dkms_certs + # sign + for module in $(find kernel-open -name '*.ko'); do + /lib/modules/$(uname -r)/build/scripts/sign-file sha256 \ + "${CA_TMPDIR}/db.rsa" \ + "${CA_TMPDIR}/db.der" \ + "${module}" + done + clear_dkms_key + # install + make modules_install \ + >> /var/log/open-gpu-kernel-modules-build.log \ + 2>> /var/log/open-gpu-kernel-modules-build_error.log + depmod -a + modprobe nvidia + popd + + # + # Install CUDA + # + cuda_runfile="cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \ + -o cuda.run + bash cuda.run --silent --toolkit --no-opengl-libs + rm cuda.run + else + # Install from repo provided by NV + readonly UBUNTU_REPO_CUDA_PIN="${NVIDIA_REPO_URL}/cuda-${shortname}.pin" + + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 + + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb + + dpkg -i /tmp/local-installer.deb + rm /tmp/local-installer.deb + cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + execute_with_retries "apt-get update" + + execute_with_retries "apt-get install -y -q --no-install-recommends dkms" + configure_dkms_certs + for pkg in "nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open" \ + "cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" \ + "cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" ; do + execute_with_retries "apt-get install -y -q --no-install-recommends ${pkg}" + done + clear_dkms_key + + modprobe nvidia + fi - modprobe nvidia # enable a systemd service that updates kernel headers after reboot setup_systemd_update_headers - elif [[ ${OS_NAME} == "rocky" ]]; then + elif is_rocky ; then # Ensure the Correct Kernel Development Packages are Installed - execute_with_retries "yum install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r)" + execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*" + execute_with_retries "dnf -y -q install pciutils kernel-devel gcc" - ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8.8 or 9.1 - ROCKY_VERSION=${ROCKY_VERSION%.*} # 8 or 9 - - readonly NVIDIA_ROCKY_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel${ROCKY_VERSION}/x86_64/cuda-rhel${ROCKY_VERSION}.repo" + readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" execute_with_retries "dnf clean all" + configure_dkms_certs execute_with_retries "dnf -y -q module install nvidia-driver:latest-dkms" + clear_dkms_key execute_with_retries "dnf -y -q install cuda-toolkit" modprobe nvidia @@ -294,7 +531,7 @@ function set_hadoop_property() { local -r config_file=$1 local -r property=$2 local -r value=$3 - bdconfig set_property \ + /usr/local/bin/bdconfig set_property \ --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ --name "${property}" --value "${value}" \ --clobber @@ -430,15 +667,16 @@ function setup_gpu_yarn() { exit 1 fi - # This configuration should be ran on all nodes + # This configuration should be run on all nodes # regardless if they have attached GPUs configure_yarn # Detect NVIDIA GPU if (lspci | grep -q NVIDIA); then # if this is called without the MIG script then the drivers are not installed - if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l); then - NUM_MIG_GPUS=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l` + nv_smi="/usr/bin/nvidia-smi" + if (test -f "${nv_smi}" && "${nv_smi}" --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l); then + NUM_MIG_GPUS="$($nv_smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)" if [[ $NUM_MIG_GPUS -eq 1 ]]; then if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then IS_MIG_ENABLED=1 @@ -449,9 +687,9 @@ function setup_gpu_yarn() { fi fi - if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then + if is_debian || is_ubuntu ; then execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - elif [[ ${OS_NAME} == rocky ]]; then + elif is_rocky ; then echo "kernel devel and headers not required on rocky. installing from binary" fi @@ -485,141 +723,63 @@ function setup_gpu_yarn() { done } -function upgrade_kernel() { - # Determine which kernel is installed - if [[ "${OS_NAME}" == "debian" ]]; then - CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( / Debian (\S+) / )'` - elif [[ "${OS_NAME}" == "ubuntu" ]]; then - CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( /^Linux version (\S+) / )'` - elif [[ ${OS_NAME} == rocky ]]; then - KERN_VER=$(yum info --installed kernel | awk '/^Version/ {print $3}') - KERN_REL=$(yum info --installed kernel | awk '/^Release/ {print $3}') - # something like 4.18.0-425.10.1.el8_7 - CURRENT_KERNEL_VERSION="${KERN_VER}-${KERN_REL}" - else - echo "unsupported OS: ${OS_NAME}!" - exit -1 - fi - - # Get latest version available in repos - if [[ "${OS_NAME}" == "debian" ]]; then - apt-get -qq update - TARGET_VERSION=$(apt-cache show --no-all-versions linux-image-amd64 | awk '/^Version/ {print $2}') - elif [[ "${OS_NAME}" == "ubuntu" ]]; then - apt-get -qq update - LATEST_VERSION=$(apt-cache show --no-all-versions linux-image-gcp | awk '/^Version/ {print $2}') - TARGET_VERSION=`echo ${LATEST_VERSION} | perl -ne 'printf(q{%s-%s-gcp},/(\d+\.\d+\.\d+)\.(\d+)/)'` - elif [[ "${OS_NAME}" == "rocky" ]]; then - if yum info --available kernel ; then - KERN_VER=$(yum info --available kernel | awk '/^Version/ {print $3}') - KERN_REL=$(yum info --available kernel | awk '/^Release/ {print $3}') - TARGET_VERSION="${KERN_VER}-${KERN_REL}" - else - TARGET_VERSION="${CURRENT_KERNEL_VERSION}" - fi - fi - - # Skip this script if we are already on the target version - if [[ "${CURRENT_KERNEL_VERSION}" == "${TARGET_VERSION}" ]]; then - echo "target kernel version [${TARGET_VERSION}] is installed" - - # Reboot may have interrupted dpkg. Bring package system to a good state - if [[ "${OS_NAME}" == "debian" || "${OS_NAME}" == "ubuntu" ]]; then - dpkg --configure -a - fi - - return 0 - fi - - # Install the latest kernel - if [[ ${OS_NAME} == debian ]]; then - apt-get install -y linux-image-amd64 - elif [[ "${OS_NAME}" == "ubuntu" ]]; then - apt-get install -y linux-image-gcp - elif [[ "${OS_NAME}" == "rocky" ]]; then - dnf -y -q install kernel - fi - - # Make it possible to reboot before init actions are complete - #1033 - DP_ROOT=/usr/local/share/google/dataproc - STARTUP_SCRIPT="${DP_ROOT}/startup-script.sh" - POST_HDFS_STARTUP_SCRIPT="${DP_ROOT}/post-hdfs-startup-script.sh" - - for startup_script in ${STARTUP_SCRIPT} ${POST_HDFS_STARTUP_SCRIPT} ; do - sed -i -e 's:/usr/bin/env bash:/usr/bin/env bash\nexit 0:' ${startup_script} - done - - cp /var/log/dataproc-initialization-script-0.log /var/log/dataproc-initialization-script-0.log.0 - - systemctl reboot -} - # Verify if compatible linux distros and secure boot options are used function check_os_and_secure_boot() { - if [[ "${OS_NAME}" == "debian" ]]; then - DEBIAN_VERSION=$(lsb_release -r | awk '{print $2}') # 10 or 11 - if [[ "${DEBIAN_VERSION}" != "10" && "${DEBIAN_VERSION}" != "11" && "${DEBIAN_VERSION}" != "12" ]]; then - echo "Error: The Debian version (${DEBIAN_VERSION}) is not supported. Please use a compatible Debian version." + if is_debian ; then + if ! is_debian10 && ! is_debian11 && ! is_debian12 ; then + echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." exit 1 fi - elif [[ "${OS_NAME}" == "ubuntu" ]]; then - UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04 - UBUNTU_VERSION=${UBUNTU_VERSION%.*} - if [[ "${UBUNTU_VERSION}" != "18" && "${UBUNTU_VERSION}" != "20" && "${UBUNTU_VERSION}" != "22" ]]; then - echo "Error: The Ubuntu version (${UBUNTU_VERSION}) is not supported. Please use a compatible Ubuntu version." + elif is_ubuntu ; then + if ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ; then + echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." exit 1 fi - elif [[ "${OS_NAME}" == "rocky" ]]; then - ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8 or 9 - ROCKY_VERSION=${ROCKY_VERSION%.*} - if [[ "${ROCKY_VERSION}" != "8" && "${ROCKY_VERSION}" != "9" ]]; then - echo "Error: The Rocky Linux version (${ROCKY_VERSION}) is not supported. Please use a compatible Rocky Linux version." + elif is_rocky ; then + if ! is_rocky8 && ! is_rocky9 ; then + echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." exit 1 fi fi - if [[ "${SECURE_BOOT}" == "enabled" ]]; then - echo "Error: Secure Boot is enabled. Please disable Secure Boot while creating the cluster." + if [[ "${SECURE_BOOT}" == "enabled" && $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then + echo "Error: Secure Boot is not supported before image 2.2. Please disable Secure Boot while creating the cluster." exit 1 + elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then + echo "Secure boot is enabled, but no signing material provided." + echo "Please either disable secure boot or provide signing material as per" + echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" + return 1 fi } -# Detect dataproc image version from its various names -if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" -fi - function remove_old_backports { # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this # problem, we will remove any reference to backports repos older than oldstable # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 + oldoldstable=$(curl -s https://deb.debian.org/debian/dists/oldoldstable/Release | awk '/^Codename/ {print $2}'); oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}'); stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}'); - matched_files="$(grep -rsil '\-backports' /etc/apt/sources.list*)" + matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) + if [[ -n "$matched_files" ]]; then - for filename in "$matched_files"; do - grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \ - sed -i -e 's/^.*-backports.*$//' "$filename" + for filename in "${matched_files[@]}"; do + # Fetch from archive.debian.org for ${oldoldstable}-backports + perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports } + {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}" done fi } function main() { - if [[ ${OS_NAME} == debian ]] && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then + if is_debian && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then remove_old_backports fi check_os_and_secure_boot - if [[ "${OS_NAME}" == "rocky" ]]; then - if dnf list kernel-devel-$(uname -r) && dnf list kernel-headers-$(uname -r); then - echo "kernel devel and headers packages are available. Proceed without kernel upgrade." - else - upgrade_kernel - fi - fi setup_gpu_yarn if [[ "${RUNTIME}" == "SPARK" ]]; then install_spark_rapids @@ -635,6 +795,9 @@ function main() { systemctl restart hadoop-yarn-${svc}.service fi done + if is_debian || is_ubuntu ; then + apt-get clean + fi } main