Skip to content

Commit

Permalink
[spark-rapids] Update MIG script (#1102)
Browse files Browse the repository at this point in the history
* update mig script

Signed-off-by: Suraj Aralihalli <[email protected]>

* add header service

Signed-off-by: Suraj Aralihalli <[email protected]>

* add support for ubuntu18

Signed-off-by: Suraj Aralihalli <[email protected]>

---------

Signed-off-by: Suraj Aralihalli <[email protected]>
  • Loading branch information
SurajAralihalli authored Oct 26, 2023
1 parent db3bd88 commit b3a9e92
Showing 1 changed file with 128 additions and 138 deletions.
266 changes: 128 additions & 138 deletions spark-rapids/mig.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,64 +32,30 @@ function get_metadata_attribute() {
/usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
}

OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
readonly OS_NAME

# Dataproc role
ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
readonly ROLE

# Parameters for NVIDIA-provided Debian GPU driver
readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION='495.29.05'
readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION}.run"
NVIDIA_DEBIAN_GPU_DRIVER_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_URL}")
readonly NVIDIA_DEBIAN_GPU_DRIVER_URL

readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'

## CUDA Version
CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '11.5')
readonly CUDA_VERSION
readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION_PREFIX=${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION%%.*}

readonly -A DEFAULT_NVIDIA_DEBIAN_CUDA_URLS=(
[10.1]="${NVIDIA_BASE_DL_URL}/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run"
[10.2]="${NVIDIA_BASE_DL_URL}/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run"
[11.0]="${NVIDIA_BASE_DL_URL}/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run"
[11.1]="${NVIDIA_BASE_DL_URL}/cuda/11.1.0/local_installers/cuda_11.1.0_455.23.05_linux.run"
[11.2]="${NVIDIA_BASE_DL_URL}/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run"
[11.5]="${NVIDIA_BASE_DL_URL}/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run"
[11.6]="${NVIDIA_BASE_DL_URL}/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run"
[11.7]="${NVIDIA_BASE_DL_URL}/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run"
[11.8]="${NVIDIA_BASE_DL_URL}/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run")

readonly DEFAULT_NVIDIA_DEBIAN_CUDA_URL=${DEFAULT_NVIDIA_DEBIAN_CUDA_URLS["${CUDA_VERSION}"]}
NVIDIA_DEBIAN_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_DEBIAN_CUDA_URL}")
readonly NVIDIA_DEBIAN_CUDA_URL
# Parameters for NVIDIA-provided Ubuntu GPU driver
NVIDIA_UBUNTU_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64"
NVIDIA_UBUNTU_REPO_CUDA_PIN="${NVIDIA_UBUNTU_REPO_URL}/cuda-ubuntu1804.pin"
readonly NVIDIA_UBUNTU_REPO_KEY_PACKAGE="${NVIDIA_UBUNTU_REPO_URL}/cuda-keyring_1.0-1_all.deb"

SECURE_BOOT="disabled"
SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')

#echo ${DATAPROC_IMAGE_VERSION}
# Fetch Linux Family distro and Dataproc Image version
readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
DATAPROC_IMAGE_VERSION=$(/usr/share/google/get_metadata_value image|grep -Eo 'dataproc-[0-9]-[0-9]'|grep -Eo '[0-9]-[0-9]'|sed -e 's/-/./g')
echo "${DATAPROC_IMAGE_VERSION}" >> /usr/local/share/startup-mig-log

if [[ ${DATAPROC_IMAGE_VERSION} == 2.1 ]]; then
echo "${DATAPROC_IMAGE_VERSION}" >> /usr/local/share/startup-mig-log
NVIDIA_UBUNTU_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu2004/x86_64"
NVIDIA_UBUNTU_REPO_CUDA_PIN="${NVIDIA_UBUNTU_REPO_URL}/cuda-ubuntu2004.pin"
# CUDA version and Driver version config
CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.2.2') #12.2.2
NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '535.104.05') #535.104.05
CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2

# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
if [[ "${OS_NAME}" == "ubuntu" ]]; then
UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04
UBUNTU_VERSION=${UBUNTU_VERSION%.*}
if [[ "${UBUNTU_VERSION}" == "18" ]]; then
CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1
NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02
CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1
fi
fi

# Parameter for NVIDIA-provided Rocky Linux GPU driver
readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/rhel8/x86_64/cuda-rhel8.repo"

# Whether to install NVIDIA-provided or OS-provided GPU driver
GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
readonly GPU_DRIVER_PROVIDER
SECURE_BOOT="disabled"
SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')

function execute_with_retries() {
local -r cmd=$1
Expand All @@ -102,103 +68,100 @@ function execute_with_retries() {
return 1
}

# Installs and enables a systemd oneshot unit (install-headers.service) that
# installs the Linux headers matching the running kernel at boot.
# The headers are needed to recompile kernel modules on Ubuntu and Debian,
# which are necessary for the functioning of nvidia-smi.
#
# Arguments: none
# Side effects:
#   - writes /lib/systemd/system/install-headers.service
#   - reloads systemd, then enables and immediately starts the unit
function setup_systemd_update_headers() {
  # The heredoc delimiter is quoted so the unit text is written verbatim:
  # every `$...` below is meant for the bash started by ExecStart, not for
  # this script.
  #
  # Wants=network-online.target is required in addition to After=: After=
  # only orders the unit and does not pull the target in, so without Wants=
  # apt-get could run before the network is up.
  #
  # The ExecStart loop tries the install up to 3 times, 5 seconds apart.
  cat <<'EOF' >/lib/systemd/system/install-headers.service
[Unit]
Description=Install Linux headers for the current kernel
Wants=network-online.target
After=network-online.target
[Service]
ExecStart=/bin/bash -c 'count=0; while [ $count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-$(/bin/uname -r) && break; count=$((count+1)); sleep 5; done'
Type=oneshot
RemainAfterExit=yes
[Install]
WantedBy=multi-user.target
EOF

  # Reload systemd to recognize the new unit file
  systemctl daemon-reload

  # Enable the unit for subsequent boots and run it once right now
  systemctl enable --now install-headers.service
}

# Install NVIDIA GPU driver provided by NVIDIA
function install_nvidia_gpu_driver() {
if [[ ${OS_NAME} == debian ]]; then
curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
"${NVIDIA_UBUNTU_REPO_KEY_PACKAGE}" -o /tmp/cuda-keyring.deb
dpkg -i "/tmp/cuda-keyring.deb"

curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
"${NVIDIA_DEBIAN_GPU_DRIVER_URL}" -o driver.run
bash "./driver.run" --silent --install-libglvnd
## common steps for all linux family distros
readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*}

## installation steps based on OS_NAME
if [[ ${OS_NAME} == "debian" ]]; then

DEBIAN_VERSION=$(lsb_release -r|awk '{print $2}') # 10 or 11
export DEBIAN_FRONTEND=noninteractive

execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"

readonly LOCAL_INSTALLER_DEB="cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb"
curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
"${NVIDIA_DEBIAN_CUDA_URL}" -o cuda.run
bash "./cuda.run" --silent --toolkit --no-opengl-libs
elif [[ ${OS_NAME} == ubuntu ]]; then
# With secure boot enabled we need to install additional modules; see issue: https://github.com/GoogleCloudDataproc/initialization-actions/issues/1043
# See the [guide](https://cloud.google.com/compute/docs/gpus/install-drivers-gpu#secure-boot) for detailed information.
if [[ ${SECURE_BOOT} == enabled ]]; then
NVIDIA_DRIVER_VERSION=$(apt-cache search 'linux-modules-nvidia-[0-9]+-gcp$' | awk '{print $1}' | sort | tail -n 1 | head -n 1 | awk -F"-" '{print $4}')
apt install linux-modules-nvidia-${NVIDIA_DRIVER_VERSION}-gcp -y
apt install nvidia-driver-${NVIDIA_DRIVER_VERSION} -y

echo """
Package: nsight-compute
Pin: origin *ubuntu.com*
Pin-Priority: -1
Package: nsight-systems
Pin: origin *ubuntu.com*
Pin-Priority: -1
Package: nvidia-modprobe
Pin: release l=NVIDIA CUDA
Pin-Priority: 600
Package: nvidia-settings
Pin: release l=NVIDIA CUDA
Pin-Priority: 600
Package: *
Pin: release l=NVIDIA CUDA
Pin-Priority: 100
""" > /etc/apt/preferences.d/cuda-repository-pin-600

apt install software-properties-common -y

apt-key adv --fetch-keys ${NVIDIA_UBUNTU_REPO_URL}/3bf863cc.pub
add-apt-repository "deb ${NVIDIA_UBUNTU_REPO_URL} /"

# CUDA_DRIVER_VERSION should be like "525.60.13-1"
CUDA_DRIVER_VERSION=$(apt-cache madison cuda-drivers | awk '{print $3}' | sort -r | while read line; do
if dpkg --compare-versions $(dpkg-query -f='${Version}\n' -W nvidia-driver-${NVIDIA_DRIVER_VERSION}) ge $line ; then
echo "$line"
break
fi
done)
"https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb

# apt-get install -y cuda-drivers-${NVIDIA_DRIVER_VERSION} cuda-drivers=${CUDA_DRIVER_VERSION}
apt install -y cuda-drivers-${NVIDIA_DRIVER_VERSION}=${CUDA_DRIVER_VERSION} cuda-drivers=${CUDA_DRIVER_VERSION}
dpkg -i /tmp/local-installer.deb
cp /var/cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/
add-apt-repository contrib
execute_with_retries "apt-get update"

apt-get remove dkms && apt-mark hold dkms
if [[ ${DEBIAN_VERSION} == 10 ]]; then
apt remove -y libglvnd0
fi

# the $line should be "cuda-runtime-12-0,cuda-drivers 525.85.12"
CUDA_VERSION=$(apt-cache showpkg cuda-drivers | grep -o 'cuda-runtime-[0-9][0-9]-[0-9],cuda-drivers [0-9\.]*' | while read line; do
if dpkg --compare-versions ${CUDA_DRIVER_VERSION} ge $(echo $line | grep -Eo '[[:digit:]]+\.[[:digit:]]+') ; then
echo $(echo $line | grep -Eo '[[:digit:]]+-[[:digit:]]')
break
fi
done)
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"

apt install -y cuda-${CUDA_VERSION}
# enable a systemd service that updates kernel headers after reboot
setup_systemd_update_headers

elif [[ ${OS_NAME} == "ubuntu" ]]; then

else
curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
"${NVIDIA_UBUNTU_REPO_KEY_PACKAGE}" -o /tmp/cuda-keyring.deb
dpkg -i "/tmp/cuda-keyring.deb"
curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
"${NVIDIA_UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600
UBUNTU_VERSION=$(lsb_release -r|awk '{print $2}') # 20.04 or 22.04
UBUNTU_VERSION=${UBUNTU_VERSION%.*} # 20 or 22

add-apt-repository "deb ${NVIDIA_UBUNTU_REPO_URL} /"
execute_with_retries "apt-get update"
execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"

if [[ -n "${CUDA_VERSION}" ]]; then
local -r cuda_package=cuda-toolkit-${CUDA_VERSION//./-}
else
local -r cuda_package=cuda-toolkit
fi
# Without --no-install-recommends this takes a very long time.
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION_PREFIX}"
execute_with_retries "apt-get install -y -q --no-install-recommends ${cuda_package}"
fi
elif [[ ${OS_NAME} == rocky ]]; then
readonly UBUNTU_REPO_CUDA_PIN="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-ubuntu${UBUNTU_VERSION}04.pin"
curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
"${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600

readonly LOCAL_INSTALLER_DEB="cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb"
curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
"https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb

dpkg -i /tmp/local-installer.deb
cp /var/cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/
execute_with_retries "apt-get update"

execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"

# enable a systemd service that updates kernel headers after reboot
setup_systemd_update_headers

elif [[ ${OS_NAME} == "rocky" ]]; then

ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8.8 or 9.1
ROCKY_VERSION=${ROCKY_VERSION%.*} # 8 or 9

readonly NVIDIA_ROCKY_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel${ROCKY_VERSION}/x86_64/cuda-rhel${ROCKY_VERSION}.repo"
execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
execute_with_retries "dnf clean all"
# Always install the latest cuda/driver version because old driver version 495 has issues
execute_with_retries "dnf install -y -q nvidia-driver nvidia-settings cuda-driver"
execute_with_retries "dnf -y -q module install nvidia-driver:${NVIDIA_DRIVER_VERSION_PREFIX}"
execute_with_retries "dnf -y -q install cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
modprobe nvidia

else
echo "Unsupported OS: '${OS_NAME}'"
exit 1
Expand Down Expand Up @@ -289,12 +252,39 @@ function upgrade_kernel() {
systemctl reboot
}

function main() {
# Verify that a compatible Linux distribution and release are in use and that
# Secure Boot is disabled.
#
# Globals read:
#   OS_NAME      lower-case distributor id ("debian", "ubuntu" or "rocky")
#   SECURE_BOOT  the check fails when this is "enabled"
# Globals written (only the one matching OS_NAME):
#   DEBIAN_VERSION  release as reported by `lsb_release -r`
#   UBUNTU_VERSION  release with the part after the last dot stripped
#   ROCKY_VERSION   release with the part after the last dot stripped
# Outputs: an error message on stdout when a check fails
# Exits:   the whole script with status 1 when the distribution or release is
#          not supported, or when Secure Boot is enabled
function check_os_and_secure_boot() {
  case "${OS_NAME}" in
    debian)
      DEBIAN_VERSION=$(lsb_release -r | awk '{print $2}') # 10 or 11
      if [[ "${DEBIAN_VERSION}" != "10" && "${DEBIAN_VERSION}" != "11" ]]; then
        echo "Error: The Debian version (${DEBIAN_VERSION}) is not supported. Please use a compatible Debian version."
        exit 1
      fi
      ;;
    ubuntu)
      UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04
      UBUNTU_VERSION=${UBUNTU_VERSION%.*}                  # 20
      if [[ "${UBUNTU_VERSION}" != "18" && "${UBUNTU_VERSION}" != "20" && "${UBUNTU_VERSION}" != "22" ]]; then
        echo "Error: The Ubuntu version (${UBUNTU_VERSION}) is not supported. Please use a compatible Ubuntu version."
        exit 1
      fi
      ;;
    rocky)
      ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8.8 or 9.1
      ROCKY_VERSION=${ROCKY_VERSION%.*}                   # 8 or 9
      if [[ "${ROCKY_VERSION}" != "8" && "${ROCKY_VERSION}" != "9" ]]; then
        echo "Error: The Rocky Linux version (${ROCKY_VERSION}) is not supported. Please use a compatible Rocky Linux version."
        exit 1
      fi
      ;;
    *)
      # Any other distribution is rejected here instead of silently passing
      # the check.
      echo "Unsupported OS: '${OS_NAME}'"
      exit 1
      ;;
  esac

  if [[ "${SECURE_BOOT}" == "enabled" ]]; then
    echo "Error: Secure Boot is enabled. Please disable Secure Boot while creating the cluster."
    exit 1
  fi
}

function main() {

check_os_and_secure_boot

if [[ "${OS_NAME}" == "rocky" ]]; then
if dnf list kernel-devel-$(uname -r) && dnf list kernel-headers-$(uname -r); then
Expand Down

0 comments on commit b3a9e92

Please sign in to comment.