diff --git a/gpu/Dockerfile b/gpu/Dockerfile
index 1127293e1..05724eb8c 100644
--- a/gpu/Dockerfile
+++ b/gpu/Dockerfile
@@ -15,8 +15,10 @@ RUN apt-get -qq update \
curl jq less screen > /dev/null 2>&1 && apt-get clean
# Install bazel signing key, repo and package
-ENV bazel_kr_path=/usr/share/keyrings/bazel-release.pub.gpg
-ENV bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8"
+ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg \
+ bazel_version=7.4.0 \
+ bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" \
+ DEBIAN_FRONTEND=noninteractive
RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \
| gpg --dearmor -o "${bazel_kr_path}" \
@@ -24,10 +26,14 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \
| dd of=/etc/apt/sources.list.d/bazel.list status=none \
&& apt-get update -qq
-RUN apt-get autoremove -y -qq && \
- apt-get install -y -qq default-jdk python3-setuptools bazel > /dev/null 2>&1 && \
+RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
+ apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
apt-get clean
+# Set bazel-${bazel_version} as the default bazel alternative in this container
+RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \
+ update-alternatives --set bazel /usr/bin/bazel-${bazel_version}
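+# (illustrative check) /usr/bin/bazel now resolves to the versioned binary, so
+# `bazel --version` inside the container is expected to report ${bazel_version}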
+
# Install here any utilities you find useful when troubleshooting
RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-get clean
diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 25efb2a49..66964b4d1 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
#
# This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
@@ -53,16 +54,16 @@ function os_vercat() ( set +x
else os_version ; fi ; )
function repair_old_backports {
- if ge_debian12 || ! is_debuntu ; then return ; fi
+ if ! is_debuntu ; then return ; fi
# This script uses 'apt-get update' and is therefore potentially dependent on
# backports repositories which have been archived. In order to mitigate this
# problem, we will use archive.debian.org for the oldoldstable repo
# https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
debdists="https://deb.debian.org/debian/dists"
- oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
- oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}');
- stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}');
+ oldoldstable=$(curl ${curl_retry_args} "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
+ oldstable=$( curl ${curl_retry_args} "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}');
+ stable=$( curl ${curl_retry_args} "${debdists}/stable/Release" | awk '/^Codename/ {print $2}');
matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )
@@ -94,6 +95,7 @@ function print_metadata_value_if_exists() {
return ${return_code}
}
+# replicates /usr/share/google/get_metadata_value
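+# e.g. (illustrative): get_metadata_value "attributes/dataproc-role" queries
+# http://metadata.google.internal/computeMetadata/v1/instance/attributes/dataproc-role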
function get_metadata_value() (
set +x
local readonly varname=$1
@@ -117,7 +119,7 @@ function get_metadata_attribute() (
get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
)
-OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
+OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
readonly OS_NAME
@@ -126,58 +128,92 @@ ROLE="$(get_metadata_attribute dataproc-role)"
readonly ROLE
# CUDA version and Driver version
+# https://docs.nvidia.com/deploy/cuda-compatibility/
# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
# https://developer.nvidia.com/cuda-downloads
-# Rocky8: 12.0: 525.147.05
+
+# Minimum supported version for the open kernel driver is 515.43.04
+# https://github.com/NVIDIA/open-gpu-kernel-modules/tags
readonly -A DRIVER_FOR_CUDA=(
- ["11.8"]="560.35.03"
- ["12.0"]="525.60.13" ["12.4"]="560.35.03" ["12.6"]="560.35.03"
+ ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
+ ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
+ ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
+ ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
+ ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06"
+ ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142"
)
-# https://developer.nvidia.com/cudnn-downloads
-if is_debuntu ; then
-readonly -A CUDNN_FOR_CUDA=(
- ["11.8"]="9.5.1.17"
- ["12.0"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.6"]="9.5.1.17"
+readonly -A DRIVER_SUBVER=(
+ ["410"]="410.104" ["415"]="415.27" ["418"]="418.113"
+ ["430"]="430.64" ["435"]="435.21" ["440"]="440.100"
+ ["450"]="450.119.03" ["455"]="455.45.01" ["460"]="460.91.03"
+ ["465"]="465.31" ["470"]="470.256.02" ["495"]="495.46"
+ ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05"
+ ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06"
+ ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03"
+ ["565"]="565.77"
)
-elif is_rocky ; then
-# rocky:
-# 12.0: 8.8.1.3
-# 12.1: 8.9.3.28
-# 12.2: 8.9.7.29
-# 12.3: 9.0.0.312
-# 12.4: 9.1.1.17
-# 12.5: 9.2.1.18
-# 12.6: 9.5.1.17
+# https://developer.nvidia.com/cudnn-downloads
readonly -A CUDNN_FOR_CUDA=(
- ["11.8"]="9.5.1.17"
- ["12.0"]="8.8.1.3" ["12.4"]="9.1.1.17" ["12.6"]="9.5.1.17"
+ ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5"
+ ["11.0"]="8.0.4" ["11.1"]="8.0.5" ["11.2"]="8.1.1"
+ ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" ["11.5"]="8.3.1.22"
+ ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17"
+ ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5"
+ ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18"
+ ["12.6"]="9.6.0.74"
)
-fi
# https://developer.nvidia.com/nccl/nccl-download
-# 12.2: 2.19.3, 12.5: 2.21.5
readonly -A NCCL_FOR_CUDA=(
- ["11.8"]="2.15.5"
- ["12.0"]="2.16.5" ["12.4"]="2.23.4" ["12.6"]="2.23.4"
+ ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3"
+ ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4"
+ ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12"
+ ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3"
+ ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4"
+ ["12.5"]="2.22.3" ["12.6"]="2.23.4"
)
readonly -A CUDA_SUBVER=(
- ["11.8"]="11.8.0"
- ["12.0"]="12.0.0" ["12.4"]="12.4.1" ["12.6"]="12.6.2"
+ ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89"
+ ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2"
+ ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2"
+ ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0"
+ ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2"
+ ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1"
+ ["12.6"]="12.6.3"
)
-RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-readonly DEFAULT_CUDA_VERSION='12.4'
-CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
-if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then
- # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27
- CUDA_VERSION="${DEFAULT_CUDA_VERSION}"
-fi
+function set_cuda_version() {
+ case "${DATAPROC_IMAGE_VERSION}" in
+    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # CUDA 12.1.1 (driver v530.30.02) is the latest supported on Ubuntu 18
+ "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
+ "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
+ * )
+ echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
+ exit 1
+ ;;
+ esac
+ local cuda_url
+ cuda_url=$(get_metadata_attribute 'cuda-url' '')
+ if [[ -n "${cuda_url}" ]] ; then
+ # if cuda-url metadata variable has been passed, extract default version from url
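+    # e.g. (illustrative) a cuda-url ending in .../local_installers/cuda_12.4.1_550.54.15_linux.run
+    # yields a default version of 12.4.1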
+ local CUDA_URL_VERSION
+ CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
+ if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
+ DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION}"
+ fi
+ fi
+ readonly DEFAULT_CUDA_VERSION
-if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then
- # Only CUDA 12.0 supported on older debuntu
- CUDA_VERSION="12.0"
-fi
-readonly CUDA_VERSION
-readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}"
+ CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+ if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then
+ CUDA_FULL_VERSION="${CUDA_VERSION}"
+ CUDA_VERSION="${CUDA_VERSION%.*}"
+ fi
+ readonly CUDA_VERSION
+ if ( ! test -v CUDA_FULL_VERSION ) ; then
+ CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
+ fi
+ readonly CUDA_FULL_VERSION
+}
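+
+# (Illustrative) either form may be supplied as cluster metadata, e.g.
+#   gcloud dataproc clusters create ... --metadata cuda-version=12.4
+#   gcloud dataproc clusters create ... --metadata cuda-version=12.4.1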
function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
@@ -187,45 +223,76 @@ function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
-DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}"
-if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then
- DEFAULT_DRIVER="560.28.03" ; fi
-if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03" ; fi
-if ( is_rocky && le_cuda11 ) ; then DEFAULT_DRIVER="525.147.05" ; fi
-if ( is_ubuntu20 && le_cuda11 ) ; then DEFAULT_DRIVER="535.183.06" ; fi
-if ( is_rocky9 && ge_cuda12 ) ; then DEFAULT_DRIVER="565.57.01" ; fi
-DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
-
-readonly DRIVER_VERSION
-readonly DRIVER=${DRIVER_VERSION%%.*}
-
-readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
-readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
-
-# Parameters for NVIDIA-provided cuDNN library
-readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
-function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
- CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
-elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
- # cuDNN v8 is not distribution for ubuntu20+, debian12
- CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
- # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
- CUDNN_VERSION="8.8.0.121"
-fi
-readonly CUDNN_VERSION
+function set_driver_version() {
+ local gpu_driver_url
+ gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
+
+ local cuda_url
+ cuda_url=$(get_metadata_attribute 'cuda-url' '')
+
+ local nv_xf86_x64_base="https://us.download.nvidia.com/XFree86/Linux-x86_64"
+
+ local DEFAULT_DRIVER
+ # Take default from gpu-driver-url metadata value
+ if [[ -n "${gpu_driver_url}" ]] ; then
+ DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')"
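+    # e.g. (illustrative) .../NVIDIA-Linux-x86_64-560.35.03.run yields 560.35.03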
+ if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
+ # Take default from cuda-url metadata value as a backup
+ elif [[ -n "${cuda_url}" ]] ; then
+ local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
+ if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
+ major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
+ driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
+ if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+ # use the version indicated by the cuda url as the default if it exists
+ DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
+ elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+ # use the maximum sub-version available for the major version indicated in cuda url as the default
+ DEFAULT_DRIVER="${driver_max_maj_version}"
+ fi
+ fi
+ fi
+
+ if ( ! test -v DEFAULT_DRIVER ) ; then
+ # If a default driver version has not been extracted, use the default for this version of CUDA
+ DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
+ fi
-readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
-readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+ DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
-# Parameters for NVIDIA-provided Debian GPU driver
-readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+ readonly DRIVER_VERSION
+ readonly DRIVER="${DRIVER_VERSION%%.*}"
+
+ export DRIVER_VERSION DRIVER
+
+ gpu_driver_url="${nv_xf86_x64_base}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+ if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+ echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
+ exit 1
+ fi
+}
+
+function set_cudnn_version() {
+ readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
+ readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
+ readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+ # Parameters for NVIDIA-provided cuDNN library
+ readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+ CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+ # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
+ if ( is_rocky && version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then
+ CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
+ elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
+    # cuDNN v8 is not distributed for ubuntu20+, debian12
+ CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+ elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
+ # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+ CUDNN_VERSION="8.8.0.121"
+ fi
+ readonly CUDNN_VERSION
+}
-readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
# Short name for urls
if is_ubuntu22 ; then
@@ -250,47 +317,112 @@ else
nccl_shortname="${shortname}"
fi
-# Parameters for NVIDIA-provided package repositories
-readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
+function set_nv_urls() {
+ # Parameters for NVIDIA-provided package repositories
+ readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+ readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
-# Parameters for NVIDIA-provided NCCL library
-readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
-NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
-readonly NCCL_REPO_URL
-readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
+ # Parameter for NVIDIA-provided Rocky Linux GPU driver
+ readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+}
function set_cuda_runfile_url() {
- local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}"
- local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}"
-
- if ge_cuda12 ; then
- if ( le_debian11 || le_ubuntu18 ) ; then
- RUNFILE_DRIVER_VERSION="525.60.13"
- RUNFILE_CUDA_VERSION="12.0.0"
- elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then
- RUNFILE_DRIVER_VERSION="525.147.05"
- RUNFILE_CUDA_VERSION="12.0.0"
+ local MAX_DRIVER_VERSION
+ local MAX_CUDA_VERSION
+
+ MIN_OPEN_DRIVER_VER="515.43.04"
+ local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
+ local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
+
+ if is_cuda12 ; then
+ if is_debian12 ; then
+ MIN_DRIVER_VERSION="545.23.06"
+ MIN_CUDA_VERSION="12.3.0"
+ elif is_debian10 ; then
+ MAX_DRIVER_VERSION="555.42.02"
+ MAX_CUDA_VERSION="12.5.0"
+ elif is_ubuntu18 ; then
+ MAX_DRIVER_VERSION="530.30.02"
+ MAX_CUDA_VERSION="12.1.1"
+ fi
+ elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+ if le_debian10 ; then
+ # cuda 11 is not supported for <= debian10
+ MAX_CUDA_VERSION="0"
+ MAX_DRIVER_VERSION="0"
fi
else
- RUNFILE_DRIVER_VERSION="520.61.05"
- RUNFILE_CUDA_VERSION="11.8.0"
+ echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}"
fi
- readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run"
- CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}"
- DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}"
- readonly DEFAULT_NVIDIA_CUDA_URL
+ if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+ echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}"
+ elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
+ echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}"
+ fi
+ if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
+ echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}"
+ elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
+ echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}"
+ fi
+
+ # driver version named in cuda runfile filename
+ # (these may not be actual driver versions - see https://us.download.nvidia.com/XFree86/Linux-x86_64/)
+ readonly -A drv_for_cuda=(
+ ["10.0.130"]="410.48"
+ ["10.1.234"]="418.87.00"
+ ["10.2.89"]="440.33.01"
+ ["11.0.3"]="450.51.06"
+ ["11.1.1"]="455.32.00"
+ ["11.2.2"]="460.32.03"
+ ["11.3.1"]="465.19.01"
+ ["11.4.4"]="470.82.01"
+ ["11.5.2"]="495.29.05"
+ ["11.6.2"]="510.47.03"
+ ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
+ ["11.8.0"]="520.61.05"
+ ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
+ ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
+ ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
+ ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
+ ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/
+ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
+ ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
+ )
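+  # e.g. (illustrative) with CUDA_FULL_VERSION=12.6.3 the table above selects
+  # cuda_12.6.3_560.35.05_linux.run under ${NVIDIA_BASE_DL_URL}/cuda/12.6.3/local_installers/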
+
+ # Verify that the file with the indicated combination exists
+ local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
+ CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
+ local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
+ local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
- readonly NVIDIA_CUDA_URL
-}
-set_cuda_runfile_url
+ if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+ echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+ if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then
+ echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead"
+ fi
+ exit 1
+ fi
+
+ readonly NVIDIA_CUDA_URL
-# Parameter for NVIDIA-provided Rocky Linux GPU driver
-readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+ CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+ readonly CUDA_RUNFILE
+
+ if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
+ echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
+ elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
+ echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}"
+ elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
+ echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
+ elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
+ echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
+ fi
+}
+function set_cudnn_tarball_url() {
CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
@@ -310,13 +442,12 @@ if ( version_ge "${CUDA_VERSION}" "12.0" ); then
fi
readonly CUDNN_TARBALL
readonly CUDNN_TARBALL_URL
+}
# Whether to install NVIDIA-provided or OS-provided GPU driver
GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
readonly GPU_DRIVER_PROVIDER
-# Stackdriver GPU agent parameters
-readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
readonly INSTALL_GPU_AGENT
@@ -336,7 +467,7 @@ function execute_with_retries() (
if [[ "$cmd" =~ "^apt-get install" ]] ; then
apt-get -y clean
- apt-get -y autoremove
+ apt-get -o DPkg::Lock::Timeout=60 -y autoremove
fi
for ((i = 0; i < 3; i++)); do
set -x
@@ -348,34 +479,32 @@ function execute_with_retries() (
return 1
)
-CUDA_KEYRING_PKG_INSTALLED="0"
function install_cuda_keyring_pkg() {
- if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
+ is_complete cuda-keyring-installed && return
local kr_ver=1.1
- curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+ curl ${curl_retry_args} \
"${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
-o "${tmpdir}/cuda-keyring.deb"
dpkg -i "${tmpdir}/cuda-keyring.deb"
rm -f "${tmpdir}/cuda-keyring.deb"
- CUDA_KEYRING_PKG_INSTALLED="1"
+ mark_complete cuda-keyring-installed
}
function uninstall_cuda_keyring_pkg() {
apt-get purge -yq cuda-keyring
- CUDA_KEYRING_PKG_INSTALLED="0"
+ mark_incomplete cuda-keyring-installed
}
-CUDA_LOCAL_REPO_INSTALLED="0"
function install_local_cuda_repo() {
- if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
- CUDA_LOCAL_REPO_INSTALLED="1"
+ is_complete install-local-cuda-repo && return
+
pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
readonly DIST_KEYRING_DIR="/var/${pkgname}"
- curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+ curl ${curl_retry_args} \
"${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
@@ -383,47 +512,46 @@ function install_local_cuda_repo() {
cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
if is_ubuntu ; then
- curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+ curl ${curl_retry_args} \
"${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
-o /etc/apt/preferences.d/cuda-repository-pin-600
fi
+
+ mark_complete install-local-cuda-repo
}
function uninstall_local_cuda_repo(){
apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
- CUDA_LOCAL_REPO_INSTALLED="0"
+ mark_incomplete install-local-cuda-repo
}
-CUDNN_LOCAL_REPO_INSTALLED="0"
-CUDNN_PKG_NAME=""
function install_local_cudnn_repo() {
- if [[ "${CUDNN_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
- pkgname="cudnn-local-repo-${shortname}-${CUDNN}"
+ is_complete install-local-cudnn-repo && return
+ pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
CUDNN_PKG_NAME="${pkgname}"
local_deb_fn="${pkgname}_1.0-1_amd64.deb"
- local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN}/local_installers/${local_deb_fn}"
+ local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
# ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
- curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+ curl ${curl_retry_args} \
"${local_deb_url}" -o "${tmpdir}/local-installer.deb"
dpkg -i "${tmpdir}/local-installer.deb"
rm -f "${tmpdir}/local-installer.deb"
- cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+ cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
- CUDNN_LOCAL_REPO_INSTALLED="1"
+ mark_complete install-local-cudnn-repo
}
function uninstall_local_cudnn_repo() {
apt-get purge -yq "${CUDNN_PKG_NAME}"
- CUDNN_LOCAL_REPO_INSTALLED="0"
+ mark_incomplete install-local-cudnn-repo
}
-CUDNN8_LOCAL_REPO_INSTALLED="0"
-CUDNN8_PKG_NAME=""
function install_local_cudnn8_repo() {
- if [[ "${CUDNN8_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
+ is_complete install-local-cudnn8-repo && return
+
if is_ubuntu ; then cudnn8_shortname="ubuntu2004"
elif is_debian ; then cudnn8_shortname="debian11"
else return 0 ; fi
@@ -437,61 +565,163 @@ function install_local_cudnn8_repo() {
deb_fn="${pkgname}_1.0-1_amd64.deb"
local_deb_fn="${tmpdir}/${deb_fn}"
- local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
- curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
- "${local_deb_url}" -o "${local_deb_fn}"
+ local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
+
+ # cache the cudnn package
+ cache_fetched_package "${local_deb_url}" \
+ "${pkg_bucket}/nvidia/cudnn/${CUDNN8_CUDA_VER}/${deb_fn}" \
+ "${local_deb_fn}"
+
+ local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
+ # If we are using a ram disk, mount another where we will unpack the cudnn local installer
+ if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
+ mkdir -p "${cudnn_path}"
+ mount -t tmpfs tmpfs "${cudnn_path}"
+ fi
dpkg -i "${local_deb_fn}"
rm -f "${local_deb_fn}"
- cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
- CUDNN8_LOCAL_REPO_INSTALLED="1"
+ cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
+ mark_complete install-local-cudnn8-repo
}
function uninstall_local_cudnn8_repo() {
apt-get purge -yq "${CUDNN8_PKG_NAME}"
- CUDNN8_LOCAL_REPO_INSTALLED="0"
+ mark_incomplete install-local-cudnn8-repo
}
function install_nvidia_nccl() {
+ readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+ readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+
+ is_complete nccl && return
+
+ if is_cuda11 && is_debian12 ; then
+ echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
+ return
+ fi
+
local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
- if is_rocky ; then
- execute_with_retries \
- dnf -y -q install \
- "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}"
- sync
- elif is_ubuntu ; then
- install_cuda_keyring_pkg
+ mkdir -p "${workdir}"
+ pushd "${workdir}"
- apt-get update -qq
+ test -d "${workdir}/nccl" || {
+ local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
+ curl ${curl_retry_args} \
+ "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
+ | tar xz
+ mv "nccl-${NCCL_VERSION}-1" nccl
+ }
- if is_ubuntu18 ; then
- execute_with_retries \
- apt-get install -q -y \
- libnccl2 libnccl-dev
- sync
+ local build_path
+ if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
+ build_path="nccl/build/pkg/rpm/x86_64" ; fi
+
+ test -d "${workdir}/nccl/build" || {
+ local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
+ local local_tarball="${workdir}/${build_tarball}"
+ local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}"
+
+    if [[ "$(hostname -s)" =~ ^test && "$(nproc)" -lt 32 ]] ; then
+ # when running with fewer than 32 cores, yield to in-progress build
+ sleep $(( ( RANDOM % 11 ) + 10 ))
+ if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+ local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
+ local build_start_epoch="$(date -d "${build_start_time}" +%s)"
+ local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
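+        # (descriptive) the .building object is a best-effort GCS build lock; if it
+        # outlives the 45 minute timeout it is treated as stale and removed below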
+ while gsutil ls -L "${gcs_tarball}.building" ; do
+ local now_epoch="$(date -u +%s)"
+ if (( now_epoch > timeout_epoch )) ; then
+ # detect unexpected build failure after 45m
+ gsutil rm "${gcs_tarball}.building"
+ break
+ fi
+ sleep 5m
+ done
+ fi
+ fi
+
+ output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+ if echo "${output}" | grep -q "${gcs_tarball}" ; then
+ # cache hit - unpack from cache
+ echo "cache hit"
+ gcloud storage cat "${gcs_tarball}" | tar xvz
else
- execute_with_retries \
- apt-get install -q -y \
- "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}"
- sync
+ # build and cache
+ touch "${local_tarball}.building"
+ gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+ building_file="${gcs_tarball}.building"
+ pushd nccl
+ # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
+ install_build_dependencies
+
+ # https://github.com/NVIDIA/nccl/blob/master/README.md
+ # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+      # Fermi: SM_20, compute_20
+ # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+ # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+ # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+      # The following architectures are supported by the open kernel driver
+ # Volta: SM_70,SM_72, compute_70,compute_72
+ # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+ # The following architectures are supported by CUDA v11.8+
+ # Ada: SM_89, compute_89
+ # Hopper: SM_90,SM_90a compute_90,compute_90a
+ # Blackwell: SM_100, compute_100
+ NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+ NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+ if version_gt "${CUDA_VERSION}" "11.6" ; then
+ NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
+ if version_ge "${CUDA_VERSION}" "11.8" ; then
+ NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
+ if version_ge "${CUDA_VERSION}" "12.0" ; then
+ NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+
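+      # NVCC_GENCODE is exported below and read by NCCL's make (per the NCCL install
+      # docs linked above), restricting the architectures compiled into the packages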
+ if is_debuntu ; then
+ # These packages are required to build .deb packages from source
+ execute_with_retries \
+ apt-get install -y -qq build-essential devscripts debhelper fakeroot
+ export NVCC_GENCODE
+ execute_with_retries make -j$(nproc) pkg.debian.build
+ elif is_rocky ; then
+ # These packages are required to build .rpm packages from source
+ execute_with_retries \
+ dnf -y -q install rpm-build rpmdevtools
+ export NVCC_GENCODE
+ execute_with_retries make -j$(nproc) pkg.redhat.build
+ fi
+ tar czvf "${local_tarball}" "../${build_path}"
+ make clean
+ popd
+ tar xzvf "${local_tarball}"
+ gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+ if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+ building_file=""
+ rm "${local_tarball}"
fi
- else
- echo "Unsupported OS: '${OS_NAME}'"
- # NB: this tarball is 10GB in size, but can be used to install NCCL on non-ubuntu systems
- # wget https://developer.download.nvidia.com/hpc-sdk/24.7/nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
- # tar xpzf nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
- # nvhpc_2024_247_Linux_x86_64_cuda_multi/install
- return
+ }
+
+ if is_debuntu ; then
+ dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
+ elif is_rocky ; then
+ rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
fi
+
+ popd
+ mark_complete nccl
}
function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
function install_nvidia_cudnn() {
+ is_complete cudnn && return
+ if le_debian10 ; then return ; fi
local major_version
major_version="${CUDNN_VERSION%%.*}"
local cudnn_pkg_version
@@ -515,19 +745,21 @@ function install_nvidia_cudnn() {
if ge_debian12 && is_src_os ; then
apt-get -y install nvidia-cudnn
else
- local CUDNN="${CUDNN_VERSION%.*}"
if is_cudnn8 ; then
- install_local_cudnn8_repo
+ add_repo_cuda
apt-get update -qq
+ # Ignore version requested and use the latest version in the package index
+ cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"
execute_with_retries \
apt-get -y install --no-install-recommends \
"libcudnn8=${cudnn_pkg_version}" \
"libcudnn8-dev=${cudnn_pkg_version}"
- sync
+
+ sync
elif is_cudnn9 ; then
- install_cuda_keyring_pkg
+ install_cuda_keyring_pkg
apt-get update -qq
@@ -536,19 +768,12 @@ function install_nvidia_cudnn() {
"libcudnn9-cuda-${CUDA_VERSION%%.*}" \
"libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
"libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
- sync
+
+ sync
else
echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
fi
fi
- elif is_ubuntu ; then
- local -a packages
- packages=(
- "libcudnn${major_version}=${cudnn_pkg_version}"
- "libcudnn${major_version}-dev=${cudnn_pkg_version}")
- execute_with_retries \
- apt-get install -q -y --no-install-recommends "${packages[*]}"
- sync
else
echo "Unsupported OS: '${OS_NAME}'"
exit 1
@@ -557,13 +782,85 @@ function install_nvidia_cudnn() {
ldconfig
echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
+ mark_complete cudnn
+}
+
+function install_pytorch() {
+ is_complete pytorch && return
+
+ local env
+ env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
+ local mc3=/opt/conda/miniconda3
+ local envpath="${mc3}/envs/${env}"
+ if [[ "${env}" == "base" ]]; then
+    echo "WARNING: installing to the base environment is known to cause solve issues" ; envpath="${mc3}" ; fi
+ # Set numa node to 0 for all GPUs
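+  # (illustrative path) e.g. echo 0 > /sys/module/nvidia/drivers/pci:nvidia/0000:00:04.0/numa_node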
+ for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
+
+ local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+ local local_tarball="${workdir}/${build_tarball}"
+ local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+
+  if [[ "$(hostname -s)" =~ ^test && "$(nproc)" -lt 32 ]] ; then
+ # when running with fewer than 32 cores, yield to in-progress build
+ sleep $(( ( RANDOM % 11 ) + 10 ))
+ if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+ local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
+ local build_start_epoch="$(date -d "${build_start_time}" +%s)"
+ local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
+ while gsutil ls -L "${gcs_tarball}.building" ; do
+ local now_epoch="$(date -u +%s)"
+ if (( now_epoch > timeout_epoch )) ; then
+ # detect unexpected build failure after 45m
+ gsutil rm "${gcs_tarball}.building"
+ break
+ fi
+ sleep 5m
+ done
+ fi
+ fi
+
+ output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+ if echo "${output}" | grep -q "${gcs_tarball}" ; then
+ # cache hit - unpack from cache
+ echo "cache hit"
+ mkdir -p "${envpath}"
+ gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+ else
+ touch "${local_tarball}.building"
+ gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+ building_file="${gcs_tarball}.building"
+ local verb=create
+ if test -d "${envpath}" ; then verb=install ; fi
+ cudart_spec="cuda-cudart"
+ if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+
+ # Install pytorch and company to this environment
+ "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+ -c conda-forge -c nvidia -c rapidsai \
+ numba pytorch tensorflow[and-cuda] rapids pyspark \
+ "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+
+ # Install jupyter kernel in this environment
+ "${envpath}/bin/python3" -m pip install ipykernel
+
+ # package environment and cache in GCS
+ pushd "${envpath}"
+ tar czf "${local_tarball}" .
+ popd
+ gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+ if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+ building_file=""
+ fi
+
+ # register the environment as a selectable kernel
+ "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"
+
+ mark_complete pytorch
}
-CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-PSN="$(get_metadata_attribute private_secret_name)"
-readonly PSN
function configure_dkms_certs() {
- if [[ -z "${PSN}" ]]; then
+ if test -v PSN && [[ -z "${PSN}" ]]; then
echo "No signing secret provided. skipping";
return 0
fi
@@ -575,28 +872,27 @@ function configure_dkms_certs() {
echo "Private key material exists"
local expected_modulus_md5sum
- expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum)
+ expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
if [[ -n "${expected_modulus_md5sum}" ]]; then
modulus_md5sum="${expected_modulus_md5sum}"
- else
- modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
- fi
- # Verify that cert md5sum matches expected md5sum
- if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then
- echo "unmatched rsa key modulus"
- fi
- ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
+ # Verify that cert md5sum matches expected md5sum
+ if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+ echo "unmatched rsa key"
+ fi
- # Verify that key md5sum matches expected md5sum
- if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then
- echo "unmatched x509 cert modulus"
+ # Verify that key md5sum matches expected md5sum
+ if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
+ echo "unmatched x509 cert"
+ fi
+ else
+ modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
fi
+ ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
return
fi
-
# Retrieve cloud secrets keys
local sig_priv_secret_name
sig_priv_secret_name="${PSN}"
@@ -623,16 +919,14 @@ function configure_dkms_certs() {
| base64 --decode \
| dd status=none of="${CA_TMPDIR}/db.der"
- # symlink private key and copy public cert from volatile storage for DKMS
- if is_ubuntu ; then
- mkdir -p /var/lib/shim-signed/mok
- ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv
- cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der
- else
- mkdir -p /var/lib/dkms/
- ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
- cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub
- fi
+ local mok_directory="$(dirname "${mok_key}")"
+ mkdir -p "${mok_directory}"
+
+ # symlink private key and copy public cert from volatile storage to DKMS directory
+ ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+ cp -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+ modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
}
function clear_dkms_key {
@@ -640,10 +934,11 @@ function clear_dkms_key {
echo "No signing secret provided. skipping" >&2
return 0
fi
- rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv
+ rm -rf "${CA_TMPDIR}" "${mok_key}"
}
function add_contrib_component() {
+ if ! is_debuntu ; then return ; fi
if ge_debian12 ; then
# Include in sources file components on which nvidia-kernel-open-dkms depends
local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
@@ -668,76 +963,129 @@ function add_nonfree_components() {
fi
}
+#
+# Install package signing key and add corresponding repository
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
function add_repo_nvidia_container_toolkit() {
- if is_debuntu ; then
- local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
- local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list
- # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
- test -f "${kr_path}" ||
- curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
- | gpg --dearmor -o "${kr_path}"
+ local nvctk_root="https://nvidia.github.io/libnvidia-container"
+ local signing_key_url="${nvctk_root}/gpgkey"
+ local repo_data
- test -f "${sources_list_path}" ||
- curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
- | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \
- | tee "${sources_list_path}"
- fi
+ if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
+ else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+
+ os_add_repo nvidia-container-toolkit \
+ "${signing_key_url}" \
+ "${repo_data}" \
+ "no"
}
function add_repo_cuda() {
if is_debuntu ; then
- local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
- local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
- echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
- | sudo tee "${sources_list_path}"
- curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
- -o "${kr_path}"
+ if version_le "${CUDA_VERSION}" 11.6 ; then
+ local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
+ local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
+ echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
+ | sudo tee "${sources_list_path}"
+ curl ${curl_retry_args} "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
+ -o "${kr_path}"
+ else
+ install_cuda_keyring_pkg # 11.7+, 12.0+
+ fi
elif is_rocky ; then
execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
- execute_with_retries "dnf clean all"
fi
}
readonly uname_r=$(uname -r)
+
function build_driver_from_github() {
- if is_ubuntu ; then
- mok_key=/var/lib/shim-signed/mok/MOK.priv
- mok_der=/var/lib/shim-signed/mok/MOK.der
- else
- mok_key=/var/lib/dkms/mok.key
- mok_der=/var/lib/dkms/mok.pub
- fi
- workdir=/opt/install-nvidia-driver
- mkdir -p "${workdir}"
+  # The non-GPL driver will already have been built on rocky8, or when the driver version predates the open kernel modules; nothing to do here
+ if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then return 0 ; fi
pushd "${workdir}"
test -d "${workdir}/open-gpu-kernel-modules" || {
tarball_fn="${DRIVER_VERSION}.tar.gz"
- curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+ curl ${curl_retry_args} \
"https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
| tar xz
mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
}
- cd open-gpu-kernel-modules
- time make -j$(nproc) modules \
- > /var/log/open-gpu-kernel-modules-build.log \
- 2> /var/log/open-gpu-kernel-modules-build_error.log
- sync
+ local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+ test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+ local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+ local local_tarball="${workdir}/${build_tarball}"
+ local build_dir
+ if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
+ then build_dir="${modulus_md5sum}"
+ else build_dir="unsigned" ; fi
+
+ local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+    if [[ "$(hostname -s)" =~ ^test && "$(nproc)" -lt 32 ]] ; then
+ # when running with fewer than 32 cores, yield to in-progress build
+ sleep $(( ( RANDOM % 11 ) + 10 ))
+ if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+ local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
+ local build_start_epoch="$(date -d "${build_start_time}" +%s)"
+ local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
+ while gsutil ls -L "${gcs_tarball}.building" ; do
+ local now_epoch="$(date -u +%s)"
+ if (( now_epoch > timeout_epoch )) ; then
+ # detect unexpected build failure after 45m
+ gsutil rm "${gcs_tarball}.building"
+ break
+ fi
+ sleep 5m
+ done
+ fi
+ fi
- if [[ -n "${PSN}" ]]; then
- #configure_dkms_certs
- for module in $(find kernel-open -name '*.ko'); do
- "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
- "${mok_key}" \
- "${mok_der}" \
- "${module}"
- done
- #clear_dkms_key
- fi
+ if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+ echo "cache hit"
+ else
+ # build the kernel modules
+ touch "${local_tarball}.building"
+ gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+ building_file="${gcs_tarball}.building"
+ pushd open-gpu-kernel-modules
+ install_build_dependencies
+ if ( is_cuda11 && is_ubuntu22 ) ; then
+ echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
+ exit 1
+ fi
+ execute_with_retries make -j$(nproc) modules \
+ > kernel-open/build.log \
+ 2> kernel-open/build_error.log
+ # Sign kernel modules
+ if [[ -n "${PSN}" ]]; then
+ configure_dkms_certs
+        for module in $(find kernel-open -name '*.ko'); do
+ "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
+ "${mok_key}" \
+ "${mok_der}" \
+ "${module}"
+ done
+ clear_dkms_key
+ fi
+ make modules_install \
+ >> kernel-open/build.log \
+ 2>> kernel-open/build_error.log
+ # Collect build logs and installed binaries
+ tar czvf "${local_tarball}" \
+ "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
+ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+ gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+ if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+ building_file=""
+ rm "${local_tarball}"
+ make clean
+ popd
+ fi
+ gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+ depmod -a
+ }
- make modules_install \
- >> /var/log/open-gpu-kernel-modules-build.log \
- 2>> /var/log/open-gpu-kernel-modules-build_error.log
popd
}
@@ -760,12 +1108,12 @@ function build_driver_from_packages() {
add_contrib_component
apt-get update -qq
execute_with_retries apt-get install -y -qq --no-install-recommends dkms
- #configure_dkms_certs
+ configure_dkms_certs
execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
sync
elif is_rocky ; then
- #configure_dkms_certs
+ configure_dkms_certs
if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
echo "nvidia-driver:${DRIVER}-dkms installed successfully"
else
@@ -773,26 +1121,142 @@ function build_driver_from_packages() {
fi
sync
fi
- #clear_dkms_key
+ clear_dkms_key
}
function install_nvidia_userspace_runfile() {
- if test -f "${tmpdir}/userspace-complete" ; then return ; fi
- curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
- "${USERSPACE_URL}" -o "${tmpdir}/userspace.run"
- execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}"
- rm -f "${tmpdir}/userspace.run"
- touch "${tmpdir}/userspace-complete"
+  # Parameters for the NVIDIA-provided userspace driver runfile
+ readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+ readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+ USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+ readonly USERSPACE_FILENAME
+
+  # This .run file contains NV's OpenGL implementation as well as
+  # NVIDIA-optimized implementations of the gtk+ 2 and 3 stacks (not
+  # including glib, https://docs.gtk.org/glib/), and what appears to
+  # be a copy of the source from the kernel-open directory of, for
+  # example, DRIVER_VERSION=560.35.03
+ #
+ # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
+ #
+ # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
+ # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
+ is_complete userspace && return
+ local local_fn="${tmpdir}/userspace.run"
+
+ cache_fetched_package "${USERSPACE_URL}" \
+ "${pkg_bucket}/nvidia/${USERSPACE_FILENAME}" \
+ "${local_fn}"
+
+ local runfile_args
+ runfile_args=""
+ local cache_hit="0"
+ local local_tarball
+
+ if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then
+ local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+ test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+ local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+ local_tarball="${workdir}/${build_tarball}"
+ local build_dir
+ if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
+ then build_dir="${modulus_md5sum}"
+ else build_dir="unsigned" ; fi
+
+ local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+      if [[ "$(hostname -s)" =~ ^test && "$(nproc)" -lt 32 ]] ; then
+ # when running with fewer than 32 cores, yield to in-progress build
+ sleep $(( ( RANDOM % 11 ) + 10 ))
+ if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+ local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
+ local build_start_epoch="$(date -d "${build_start_time}" +%s)"
+ local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
+ while gsutil ls -L "${gcs_tarball}.building" ; do
+ local now_epoch="$(date -u +%s)"
+ if (( now_epoch > timeout_epoch )) ; then
+ # detect unexpected build failure after 45m
+ gsutil rm "${gcs_tarball}.building"
+ break
+ fi
+ sleep 5m
+ done
+ fi
+ fi
+
+ if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+ cache_hit="1"
+ if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
+ runfile_args="${runfile_args} --no-kernel-modules"
+ fi
+ echo "cache hit"
+ else
+ # build the kernel modules
+ touch "${local_tarball}.building"
+ gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+ building_file="${gcs_tarball}.building"
+ install_build_dependencies
+ configure_dkms_certs
+ local signing_options
+ signing_options=""
+ if [[ -n "${PSN}" ]]; then
+ signing_options="--module-signing-hash sha256 \
+ --module-signing-x509-hash sha256 \
+ --module-signing-secret-key \"${mok_key}\" \
+ --module-signing-public-key \"${mok_der}\" \
+ --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
+ "
+ fi
+ runfile_args="${signing_options}"
+ if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
+ runfile_args="${runfile_args} --no-dkms"
+ fi
+ fi
+ }
+ elif version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
+ runfile_args="--no-kernel-modules"
+ fi
+
+ execute_with_retries bash "${local_fn}" -e -q \
+ ${runfile_args} \
+ --ui=none \
+ --install-libglvnd \
+ --tmpdir="${tmpdir}"
+
+ if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then
+ if [[ "${cache_hit}" == "1" ]] ; then
+ gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+ depmod -a
+ else
+ clear_dkms_key
+ tar czvf "${local_tarball}" \
+ /var/log/nvidia-installer.log \
+ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+ gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+ if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+ building_file=""
+ fi
+ fi
+
+ rm -f "${local_fn}"
+ mark_complete userspace
sync
}
function install_cuda_runfile() {
- if test -f "${tmpdir}/cuda-complete" ; then return ; fi
- time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
- "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run"
- execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}"
- rm -f "${tmpdir}/cuda.run"
- touch "${tmpdir}/cuda-complete"
+ is_complete cuda && return
+
+ local local_fn="${tmpdir}/cuda.run"
+
+ cache_fetched_package "${NVIDIA_CUDA_URL}" \
+ "${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \
+ "${local_fn}"
+
+ execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
+ rm -f "${local_fn}"
+ mark_complete cuda
sync
}
@@ -808,18 +1272,19 @@ function install_cuda_toolkit() {
if is_debuntu ; then
# if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
- sync
elif is_rocky ; then
# rocky9: cuda-11-[7,8], cuda-12-[1..6]
execute_with_retries dnf -y -q install "${cudatk_package}"
- sync
fi
+ sync
}
function load_kernel_module() {
# for some use cases, the kernel module needs to be removed before first use of nvidia-smi
for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
- rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+ ( set +e
+ rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+ )
done
depmod -a
@@ -830,67 +1295,109 @@ function load_kernel_module() {
# TODO: if peermem is available, also modprobe nvidia-peermem
}
+function install_cuda(){
+ is_complete cuda-repo && return
+ if [[ "${gpu_count}" == "0" ]] ; then return ; fi
+
+ if ( ge_debian12 && is_src_os ) ; then
+    echo "CUDA is installed with the driver on ${_shortname}"
+ return 0
+ fi
+
+ # The OS package distributions are unreliable
+ install_cuda_runfile
+
+ # Includes CUDA packages
+ add_repo_cuda
+
+ mark_complete cuda-repo
+}
+
+function install_nvidia_container_toolkit() {
+ is_complete install-nvtk && return
+
+ local container_runtime_default
+ if command -v docker ; then container_runtime_default='docker'
+ elif command -v containerd ; then container_runtime_default='containerd'
+ elif command -v crio ; then container_runtime_default='crio'
+ else container_runtime_default='' ; fi
+ CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
+
+ if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
+
+ add_repo_nvidia_container_toolkit
+ if is_debuntu ; then
+ execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
+ execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi
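+  # (illustrative) e.g. with docker, `nvidia-ctk runtime configure` updates
+  # /etc/docker/daemon.json to register the nvidia runtime before the restart below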
+ nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
+ systemctl restart "${CONTAINER_RUNTIME}"
+
+ mark_complete install-nvtk
+}
+
# Install NVIDIA GPU driver provided by NVIDIA
function install_nvidia_gpu_driver() {
+ is_complete gpu-driver && return
+ if [[ "${gpu_count}" == "0" ]] ; then return ; fi
+
if ( ge_debian12 && is_src_os ) ; then
add_nonfree_components
- add_repo_nvidia_container_toolkit
apt-get update -qq
- #configure_dkms_certs
apt-get -yq install \
- nvidia-container-toolkit \
- dkms \
- nvidia-open-kernel-dkms \
- nvidia-open-kernel-support \
- nvidia-smi \
- libglvnd0 \
- libcuda1
- #clear_dkms_key
- elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then
-
- install_nvidia_userspace_runfile
+ dkms \
+ nvidia-open-kernel-dkms \
+ nvidia-open-kernel-support \
+ nvidia-smi \
+ libglvnd0 \
+ libcuda1
+ echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
+ return 0
+ fi
- build_driver_from_github
+  # OS driver packages do not produce a reliable driver ; use the runfile instead
+ install_nvidia_userspace_runfile
- install_cuda_runfile
- elif is_debuntu ; then
- install_cuda_keyring_pkg
+ build_driver_from_github
- build_driver_from_packages
+ echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+ mark_complete gpu-driver
+}
- install_cuda_toolkit
- elif is_rocky ; then
- add_repo_cuda
+function install_ops_agent(){
+ is_complete ops-agent && return
- build_driver_from_packages
+ mkdir -p /opt/google
+ cd /opt/google
+ # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
+ curl ${curl_retry_args} -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
+ execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
- install_cuda_toolkit
- else
- echo "Unsupported OS: '${OS_NAME}'"
- exit 1
- fi
- ldconfig
- if is_src_os ; then
- echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully"
- else
- echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
- fi
+ mark_complete ops-agent
}
# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
function install_gpu_agent() {
- if ! command -v pip; then
- execute_with_retries "apt-get install -y -qq python-pip"
+ # Stackdriver GPU agent parameters
+# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
+ local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
+ if ( ! command -v pip && is_debuntu ) ; then
+ execute_with_retries "apt-get install -y -qq python3-pip"
fi
local install_dir=/opt/gpu-utilization-agent
mkdir -p "${install_dir}"
- curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+ curl ${curl_retry_args} \
"${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
- curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+ curl ${curl_retry_args} \
"${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
| sed -e 's/-u --format=/--format=/' \
| dd status=none of="${install_dir}/report_gpu_metrics.py"
- execute_with_retries pip install -r "${install_dir}/requirements.txt"
+ local venv="${install_dir}/venv"
+ /opt/conda/miniconda3/bin/python3 -m venv "${venv}"
+  (
+    source "${venv}/bin/activate"
+    python3 -m pip install --upgrade pip
+    execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
+  )
sync
# Generate GPU service.
@@ -901,7 +1408,7 @@ Description=GPU Utilization Metric Agent
[Service]
Type=simple
PIDFile=/run/gpu_agent.pid
-ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"'
+ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
User=root
Group=root
WorkingDirectory=/
@@ -926,8 +1433,9 @@ function set_hadoop_property() {
--clobber
}
-function configure_yarn() {
- if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+function configure_yarn_resources() {
+ if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
+ if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml"
fi
set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
@@ -941,11 +1449,13 @@ function configure_yarn() {
# This configuration should be applied only if GPU is attached to the node
function configure_yarn_nodemanager() {
- set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+ if [[ "${gpu_count}" == "0" ]] ; then return ; fi
+ set_hadoop_property 'yarn-site.xml' \
+ 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
set_hadoop_property 'yarn-site.xml' \
'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
set_hadoop_property 'yarn-site.xml' \
- 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
+ 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
set_hadoop_property 'yarn-site.xml' \
'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
set_hadoop_property 'yarn-site.xml' \
@@ -953,9 +1463,9 @@ function configure_yarn_nodemanager() {
set_hadoop_property 'yarn-site.xml' \
'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
set_hadoop_property 'yarn-site.xml' \
- 'yarn.nodemanager.container-executor.class' \
- 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
- set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+ 'yarn.nodemanager.container-executor.class' 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+ set_hadoop_property 'yarn-site.xml' \
+ 'yarn.nodemanager.linux-container-executor.group' 'yarn'
# Fix local dirs access permissions
local yarn_local_dirs=()
@@ -970,13 +1480,11 @@ function configure_yarn_nodemanager() {
}
function configure_gpu_exclusive_mode() {
- # check if running spark 3, if not, enable GPU exclusive mode
- local spark_version
- spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
- if [[ ${spark_version} != 3.* ]]; then
- # include exclusive mode on GPU
- nvsmi -c EXCLUSIVE_PROCESS
- fi
+ if [[ "${gpu_count}" == "0" ]] ; then return ; fi
+ # only run this function when spark < 3.0
+ if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
+ # include exclusive mode on GPU
+ nvsmi -c EXCLUSIVE_PROCESS
}
function fetch_mig_scripts() {
@@ -988,6 +1496,7 @@ function fetch_mig_scripts() {
}
function configure_gpu_script() {
+ if [[ "${gpu_count}" == "0" ]] ; then return ; fi
# Download GPU discovery script
local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
mkdir -p ${spark_gpu_script_dir}
@@ -1014,21 +1523,59 @@ function configure_gpu_script() {
# See the License for the specific language governing permissions and
# limitations under the License.
#
+# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
+
+set -e
+resources_json="/dev/shm/nvidia/gpusResources.json"
+if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi
+
+mkdir -p "$(dirname ${resources_json})"
ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))')
-echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
+echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}"
EOF
chmod a+rx "${gpus_resources_script}"
local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
+ if version_lt "${SPARK_VERSION}" "3.0" ; then return ; fi
+
if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then
echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}"
fi
+ local executor_cores
+ executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
+  local executor_memory_gb
+ executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
+ local task_cpus=2
+ local gpu_amount
+# gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+ gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
+
+  cat >>"${spark_defaults_conf}" <<EOF
+ if [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
elif [[ ! -f "${nvsmi}" ]] ; then echo "nvidia-smi not installed" >&2 ; return 0
elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
else nvsmi_works="1" ; fi
- if [[ "$1" == "-L" ]] ; then
+ if test -v 1 && [[ "$1" == "-L" ]] ; then
local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi
@@ -1074,14 +1621,23 @@ function nvsmi() {
"${nvsmi}" $*
}
-function install_dependencies() {
+function install_build_dependencies() {
+ is_complete build-dependencies && return
+
if is_debuntu ; then
- execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen
+ if is_ubuntu22 && is_cuda12 ; then
+ # On ubuntu22, the default compiler does not build some kernel module versions
+ # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+ execute_with_retries apt-get install -y -qq gcc-12
+ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+ update-alternatives --set gcc /usr/bin/gcc-12
+ fi
+
elif is_rocky ; then
- execute_with_retries dnf -y -q install pciutils gcc screen
+ execute_with_retries dnf -y -q install gcc
local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
- local install_log="${tmpdir}/install.log"
set +e
eval "${dnf_cmd}" > "${install_log}" 2>&1
local retval="$?"
@@ -1104,12 +1660,113 @@ function install_dependencies() {
execute_with_retries "${dnf_cmd}"
fi
+ mark_complete build-dependencies
+}
+
+function is_complete() {
+ phase="$1"
+ test -f "${workdir}/complete/${phase}"
}
+function mark_complete() {
+ phase="$1"
+ touch "${workdir}/complete/${phase}"
+}
+
+function mark_incomplete() {
+ phase="$1"
+ rm -f "${workdir}/complete/${phase}"
+}
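These helpers implement the idempotency pattern used throughout the script: each phase checks a marker file under ${workdir}/complete/ before doing any work and writes it on success. A minimal sketch of the pattern; install_example_component is a hypothetical phase name, not a function in this change:

  function install_example_component() {
    is_complete example-component && return   # skip work finished on a previous run
    # ... perform the installation steps here ...
    mark_complete example-component           # record success under ${workdir}/complete/
  }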
+
+function install_dependencies() {
+ is_complete install-dependencies && return 0
+
+ pkg_list="pciutils screen"
+ if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
+ elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi
+ mark_complete install-dependencies
+}
+
+function prepare_gpu_env(){
+ #set_support_matrix
+
+  # if set, this variable contains a GCS path to a build-in-progress indicator
+ building_file=""
+
+ set_cuda_version
+ set_driver_version
+
+ set +e
+ gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
+ set -e
+
+ nvsmi_works="0"
+
+ if is_cuda11 ; then gcc_ver="11"
+ elif is_cuda12 ; then gcc_ver="12" ; fi
+
+ if ! test -v DEFAULT_RAPIDS_RUNTIME ; then
+ readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
+ fi
+
+ # Set variables from metadata
+ RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+ INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
+ INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')"
+ readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH
+
+ # determine whether we have nvidia-smi installed and working
+ nvsmi
+
+ set_nv_urls
+ set_cuda_runfile_url
+ set_cudnn_version
+ set_cudnn_tarball_url
+}
+
+# Hold all NVIDIA-related packages to prevent unintentional upgrades by services such as unattended-upgrades
+# Users should run apt-mark unhold when they wish to upgrade these packages
+function hold_nvidia_packages() {
+ if ! is_debuntu ; then return ; fi
+
+ apt-mark hold nvidia-* > /dev/null 2>&1
+ apt-mark hold libnvidia-* > /dev/null 2>&1
+ if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+ apt-mark hold xserver-xorg-video-nvidia*
+ fi
+}
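To upgrade the held driver packages deliberately later, the holds can be listed and released first. A sketch, run as root, assuming only that apt-mark is available on the image:

  # list currently held NVIDIA packages, then release the holds before upgrading
  apt-mark showhold | grep -i nvidia
  apt-mark unhold $(apt-mark showhold | grep -i nvidia)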
+
+function check_secure_boot() {
+ local SECURE_BOOT="disabled"
+ SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
+
+ PSN="$(get_metadata_attribute private_secret_name)"
+ readonly PSN
+
+ if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+ echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
+ exit 1
+ elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+ echo "Secure boot is enabled, but no signing material provided."
+ echo "Please either disable secure boot or provide signing material as per"
+ echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+ return 1
+ fi
+
+ CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
+ readonly CA_TMPDIR
+
+ if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+ mok_der=/var/lib/shim-signed/mok/MOK.der
+ else mok_key=/var/lib/dkms/mok.key
+ mok_der=/var/lib/dkms/mok.pub ; fi
+}
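The mok_key and mok_der paths located above point at the Machine Owner Key material used when kernel modules must be signed under Secure Boot; DKMS normally consumes them automatically. As an illustrative sketch only, a module could also be signed manually with the kernel's sign-file helper, assuming matching kernel headers are installed:

  # hedged example: sign a built module with the MOK located by check_secure_boot
  "/usr/src/linux-headers-$(uname -r)/scripts/sign-file" sha256 "${mok_key}" "${mok_der}" nvidia.ko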
+
+
function main() {
# This configuration should be run on all nodes
# regardless if they have attached GPUs
- configure_yarn
+ configure_yarn_resources
# Detect NVIDIA GPU
if (lspci | grep -q NVIDIA); then
@@ -1132,15 +1789,20 @@ function main() {
# if mig is enabled drivers would have already been installed
if [[ $IS_MIG_ENABLED -eq 0 ]]; then
install_nvidia_gpu_driver
-
+ install_nvidia_container_toolkit
+ install_cuda
load_kernel_module
if [[ -n ${CUDNN_VERSION} ]]; then
install_nvidia_nccl
install_nvidia_cudnn
fi
+ case "${INCLUDE_PYTORCH^^}" in
+ "1" | "YES" | "TRUE" ) install_pytorch ;;
+ esac
#Install GPU metrics collection in Stackdriver if needed
if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+ #install_ops_agent
install_gpu_agent
echo 'GPU metrics agent successfully deployed.'
else
@@ -1152,18 +1814,23 @@ function main() {
rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
done
- MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")"
if test -n "$(nvsmi -L)" ; then
- # cache the result of the gpu query
+ # cache the result of the gpu query
ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))')
echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
+ chmod a+r "/var/run/nvidia-gpu-index.txt"
fi
+ MIG_GPU_LIST="$(nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n "")"
NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
# enable MIG on every GPU
- for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do
- nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
- done
+ for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do
+ if version_le "${CUDA_VERSION}" "11.6" ; then
+ nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
+ else
+ nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+ fi
+ done
NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
@@ -1174,6 +1841,7 @@ function main() {
fi
configure_yarn_nodemanager
+ install_spark_rapids
configure_gpu_script
configure_gpu_isolation
elif [[ "${ROLE}" == "Master" ]]; then
@@ -1182,11 +1850,23 @@ function main() {
fi
# Restart YARN services if they are running already
- if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then
- systemctl restart hadoop-yarn-resourcemanager.service
- fi
- if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then
- systemctl restart hadoop-yarn-nodemanager.service
+ for svc in resourcemanager nodemanager; do
+ if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
+ systemctl restart hadoop-yarn-${svc}.service
+ fi
+ done
+}
+
+function cache_fetched_package() {
+ local src_url="$1"
+ local gcs_fn="$2"
+ local local_fn="$3"
+
+ if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+ time gcloud storage cp "${gcs_fn}" "${local_fn}"
+ else
+ time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \
+ gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
fi
}
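cache_fetched_package first checks the cluster's temp bucket for a previously cached copy and only falls back to the upstream URL on a miss, uploading the result for the next node to reuse. An illustrative call; the URL and object names below are placeholders:

  cache_fetched_package "https://example.com/downloads/some-package.tgz" \
    "${pkg_bucket}/some-package.tgz" \
    "${tmpdir}/some-package.tgz"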
@@ -1214,7 +1894,7 @@ function clean_up_sources_lists() {
local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
rm -f "${bigtop_kr_path}"
- curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+ curl ${curl_retry_args} \
"${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
@@ -1228,7 +1908,7 @@ function clean_up_sources_lists() {
local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
rm -f "${adoptium_kr_path}"
- curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+ curl ${curl_retry_args} "${key_url}" \
| gpg --dearmor -o "${adoptium_kr_path}"
echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
> /etc/apt/sources.list.d/adoptium.list
@@ -1242,7 +1922,7 @@ function clean_up_sources_lists() {
local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
rm -f "${docker_kr_path}"
- curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+ curl ${curl_retry_args} "${docker_key_url}" \
| gpg --dearmor -o "${docker_kr_path}"
echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
> ${docker_repo_file}
@@ -1252,7 +1932,7 @@ function clean_up_sources_lists() {
#
if ls /etc/apt/sources.list.d/google-cloud*.list ; then
rm -f /usr/share/keyrings/cloud.google.gpg
- curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+ curl ${curl_retry_args} https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
list_file="/etc/apt/sources.list.d/${list}.list"
if [[ -f "${list_file}" ]]; then
@@ -1268,7 +1948,7 @@ function clean_up_sources_lists() {
keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
rm -f /usr/share/keyrings/cran-r.gpg
- curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+ curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
fi
@@ -1278,7 +1958,7 @@ function clean_up_sources_lists() {
#
if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
rm -f /usr/share/keyrings/mysql.gpg
- curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+ curl ${curl_retry_args} 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
gpg --dearmor -o /usr/share/keyrings/mysql.gpg
sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
fi
@@ -1288,12 +1968,17 @@ function clean_up_sources_lists() {
}
function exit_handler() {
- set +ex
- echo "Exit handler invoked"
-
# Purge private key material until next grant
clear_dkms_key
+ # clean up incomplete build indicators
+ if test -n "${building_file}" ; then
+ if gcloud storage ls "${building_file}" ; then gcloud storage rm "${building_file}" || true ; fi
+ fi
+
+ set +ex
+ echo "Exit handler invoked"
+
# Clear pip cache
pip cache purge || echo "unable to purge pip cache"
@@ -1303,8 +1988,8 @@ function exit_handler() {
pip config unset global.cache-dir || echo "unable to unset global pip cache"
# Clean up shared memory mounts
- for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
- if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ; then
+ for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
+ if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
umount -f ${shmdir}
fi
done
@@ -1316,10 +2001,11 @@ function exit_handler() {
if is_debuntu ; then
# Clean up OS package cache
apt-get -y -qq clean
- apt-get -y -qq autoremove
+ apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
# re-hold systemd package
if ge_debian12 ; then
apt-mark hold systemd libsystemd0 ; fi
+ hold_nvidia_packages
else
dnf clean all
fi
@@ -1330,22 +2016,23 @@ function exit_handler() {
/usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
/usr/lib \
/opt/nvidia/* \
- /usr/local/cuda-1?.? \
/opt/conda/miniconda3 | sort -h
elif is_debian ; then
- du -hs \
- /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
- /usr/lib \
- /usr/local/cuda-1?.? \
- /opt/conda/miniconda3 | sort -h
+ du -x -hs \
+ /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
+ /var/lib/{docker,mysql,} \
+ /opt/nvidia/* \
+ /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+ /usr/bin \
+ /usr \
+ /var \
+ / 2>/dev/null | sort -h
else
du -hs \
/var/lib/docker \
- /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
+ /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
/usr/lib64/google-cloud-sdk \
- /usr/lib \
/opt/nvidia/* \
- /usr/local/cuda-1?.? \
/opt/conda/miniconda3
fi
@@ -1362,11 +2049,12 @@ function exit_handler() {
perl -e '@siz=( sort { $a => $b }
map { (split)[2] =~ /^(\d+)/ }
grep { m:^/: } );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting};
print( " samples-taken: ", scalar @siz, $/,
- "maximum-disk-used: $max", $/,
- "minimum-disk-used: $min", $/,
- " increased-by: $inc", $/ )' < "/run/disk-usage.log"
+ "starting-disk-used: $starting", $/,
+ "maximum-disk-used: $max", $/,
+ "minimum-disk-used: $min", $/,
+ " increased-by: $inc", $/ )' < "/run/disk-usage.log"
echo "exit_handler has completed"
@@ -1382,26 +2070,40 @@ print( " samples-taken: ", scalar @siz, $/,
}
function set_proxy(){
- export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)"
+ METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+
+ if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
+
+ export METADATA_HTTP_PROXY
export http_proxy="${METADATA_HTTP_PROXY}"
export https_proxy="${METADATA_HTTP_PROXY}"
export HTTP_PROXY="${METADATA_HTTP_PROXY}"
export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
- export no_proxy=metadata.google.internal,169.254.169.254
- export NO_PROXY=metadata.google.internal,169.254.169.254
+ no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
+ local no_proxy_svc
+ for no_proxy_svc in compute secretmanager dns servicedirectory logging \
+ bigquery composer pubsub bigquerydatatransfer dataflow \
+ storage datafusion ; do
+ no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
+ done
+
+ export NO_PROXY="${no_proxy}"
}
function mount_ramdisk(){
local free_mem
free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
- if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+ if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi
# Write to a ramdisk instead of churning the persistent disk
tmpdir="/mnt/shm"
- mkdir -p "${tmpdir}"
+ mkdir -p "${tmpdir}/pkgs_dirs"
mount -t tmpfs tmpfs "${tmpdir}"
+ # Download conda packages to tmpfs
+ /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+
# Clear pip cache
# TODO: make this conditional on which OSs have pip without cache purge
pip cache purge || echo "unable to purge pip cache"
@@ -1417,33 +2119,69 @@ function mount_ramdisk(){
fi
}
-function prepare_to_install(){
- nvsmi_works="0"
- readonly bdcfg="/usr/local/bin/bdconfig"
- tmpdir=/tmp/
- if ! is_debuntu && ! is_rocky ; then
- echo "Unsupported OS: '$(os_name)'"
- exit 1
+function harden_sshd_config() {
+ # disable sha1 and md5 use in kex and kex-gss features
+ declare -A feature_map=(["kex"]="kexalgorithms")
+ if ( is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ) ; then
+ feature_map["kex-gss"]="gssapikexalgorithms"
fi
+ for ftr in "${!feature_map[@]}" ; do
+ local feature=${feature_map[$ftr]}
+ local sshd_config_line
+ sshd_config_line="${feature} $(
+ (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
+ ssh -Q "${ftr}" ) \
+ | sort -u | grep -v -ie sha1 -e md5 | paste -sd "," -)"
+
+ grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
+ echo "$sshd_config_line" >> /tmp/sshd_config_new
+ # TODO: test whether sshd will reload with this change before mv
+ mv -f /tmp/sshd_config_new /etc/ssh/sshd_config
+ done
+ local svc=ssh
+ if is_rocky ; then svc="sshd" ; fi
+ systemctl reload "${svc}"
+}
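After the reload, the effective algorithm lists can be confirmed directly from sshd's runtime configuration; a quick check, assuming the feature names configured above:

  # show the kex algorithm lists sshd is actually offering after the reload
  sshd -T | grep -Ei '^(kexalgorithms|gssapikexalgorithms) '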
+
+function prepare_to_install(){
+  # Verify OS compatibility and Secure Boot state
+ check_os
+ check_secure_boot
+
+ curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"
- repair_old_backports
+ prepare_gpu_env
+ workdir=/opt/install-dpgce
+ tmpdir=/tmp/
+ temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+ readonly temp_bucket
+ readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+ readonly bdcfg="/usr/local/bin/bdconfig"
export DEBIAN_FRONTEND=noninteractive
+ mkdir -p "${workdir}/complete"
trap exit_handler EXIT
+ set_proxy
mount_ramdisk
- install_log="${tmpdir}/install.log"
- set_proxy
+ readonly install_log="${tmpdir}/install.log"
+
+ is_complete prepare.common && return
+
+ harden_sshd_config
if is_debuntu ; then
+ repair_old_backports
clean_up_sources_lists
- apt-get update -qq
+ apt-get update -qq --allow-releaseinfo-change
apt-get -y clean
- sleep 5s
- apt-get -y -qq autoremove
+ apt-get -o DPkg::Lock::Timeout=60 -y autoremove
if ge_debian12 ; then
apt-mark unhold systemd libsystemd0 ; fi
+ if is_ubuntu ; then
+ while ! command -v gcloud ; do sleep 5s ; done
+ fi
else
dnf clean all
fi
@@ -1453,15 +2191,147 @@ function prepare_to_install(){
time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
) fi
- configure_dkms_certs
-
install_dependencies
# Monitor disk usage in a screen session
df / > "/run/disk-usage.log"
touch "/run/keep-running-df"
- screen -d -m -US keep-running-df \
+ screen -d -m -LUS keep-running-df \
bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+
+ mark_complete prepare.common
+}
+
+function check_os() {
+ if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
+ echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+ exit 1
+ elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then
+ echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+ exit 1
+ elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
+ echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+ exit 1
+ fi
+
+ SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+ readonly SPARK_VERSION
+ if version_lt "${SPARK_VERSION}" "3.1" || \
+ version_ge "${SPARK_VERSION}" "4.0" ; then
+ echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+ exit 1
+ fi
+
+ # Detect dataproc image version
+ if (! test -v DATAPROC_IMAGE_VERSION) ; then
+ if test -v DATAPROC_VERSION ; then
+ DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+ else
+      # When building custom images, neither of the above variables
+      # is defined, so we need to make a reasonable guess
+
+ if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+ elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+ elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+ else echo "Unknown dataproc image version" ; exit 1 ; fi
+ fi
+ fi
+}
+
+#
+# Generate repo file under /etc/apt/sources.list.d/
+#
+function apt_add_repo() {
+ local -r repo_name="$1"
+ local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+ local -r include_src="${4:-yes}"
+ local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+ local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
+
+ echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
+ if [[ "${include_src}" == "yes" ]] ; then
+ echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
+ fi
+
+ apt-get update -qq
+}
+
+#
+# Generate repo file under /etc/yum.repos.d/
+#
+function dnf_add_repo() {
+ local -r repo_name="$1"
+ local -r repo_url="$3" # "http(s)://host/path/filename.repo"
+ local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+ local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
+
+ curl ${curl_retry_args} "${repo_url}" \
+ | dd of="${repo_path}" status=progress
+}
+
+#
+# Keyrings default to
+# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
+# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL)
+#
+function os_add_repo() {
+ local -r repo_name="$1"
+ local -r signing_key_url="$2"
+ local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+ local kr_path
+ if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+ else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
+
+ mkdir -p "$(dirname "${kr_path}")"
+
+ curl ${curl_retry_args} "${signing_key_url}" \
+ | gpg --import --no-default-keyring --keyring "${kr_path}"
+
+ if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
+ else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
+}
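Because the repo helpers take positional arguments (name, signing key URL, repo data, include-src flag, keyring path, repo file path), a hypothetical invocation makes the calling convention clearer; the vendor name and URLs below are placeholders:

  os_add_repo "example-vendor" \
    "https://example.com/keys/example-vendor.pub" \
    "https://example.com/apt stable main" \
    "no"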
+
+
+readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+
+function install_spark_rapids() {
+ if [[ "${RAPIDS_RUNTIME}" != "SPARK" ]]; then return ; fi
+
+ # Update SPARK RAPIDS config
+ local DEFAULT_SPARK_RAPIDS_VERSION
+ DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+ local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
+
+ # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
+ local -r scala_ver="2.12"
+
+ if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
+ DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+ fi
+
+ readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
+ readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
+
+ local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
+ local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+ local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
+
+ local jar_basename
+
+ jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+ cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+ "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+ "/usr/lib/spark/jars/${jar_basename}"
+
+ jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+ cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+ "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+ "/usr/lib/spark/jars/${jar_basename}"
+
+ jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
+ cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+ "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+ "/usr/lib/spark/jars/${jar_basename}"
}
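The pinned RAPIDS and XGBoost versions above are only defaults; they can be overridden through the spark-rapids-version and xgboost-version metadata attributes read at the top of the function, for example (cluster name is a placeholder, other required flags omitted):

  gcloud dataproc clusters create example-gpu-cluster \
    --metadata=spark-rapids-version=24.08.1,xgboost-version=1.7.6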
prepare_to_install
diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh
index 7545c1a1e..37982bfe4 100644
--- a/gpu/manual-test-runner.sh
+++ b/gpu/manual-test-runner.sh
@@ -4,15 +4,16 @@
#
# To run the script, the following will bootstrap
#
-# git clone git@github.com:LLC-Technologies-Collier/initialization-actions
-# git checkout gpu-20241121
+# git clone git@github.com:GoogleCloudDataproc/initialization-actions
# cd initialization-actions
+# git checkout 2024.12
# cp gpu/env.json.sample env.json
# vi env.json
# docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest .
# time docker run -it gpu-init-actions-runner:latest gpu/manual-test-runner.sh
#
# The bazel run(s) happen in separate screen windows.
+# To create a new screen window, press ^a c
# To see a list of screen windows, press ^a "
# Num Name
#
diff --git a/gpu/run-bazel-tests.sh b/gpu/run-bazel-tests.sh
index 8e7cd663d..ae717bf5b 100644
--- a/gpu/run-bazel-tests.sh
+++ b/gpu/run-bazel-tests.sh
@@ -17,7 +17,6 @@ declare -a TESTS_TO_RUN=('gpu:test_gpu')
time bazel test \
--jobs="${max_parallel_tests}" \
--local_test_jobs="${max_parallel_tests}" \
- --flaky_test_attempts=3 \
--action_env="INTERNAL_IP_SSH=true" \
--test_output="errors" \
--test_arg="--image_version=${IMAGE_VERSION}" \
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index f8438915f..47b4c7d61 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -6,25 +6,92 @@
from integration_tests.dataproc_test_case import DataprocTestCase
+DEFAULT_TIMEOUT = 45 # minutes
+DEFAULT_CUDA_VERSION = "12.4"
class NvidiaGpuDriverTestCase(DataprocTestCase):
COMPONENT = "gpu"
INIT_ACTIONS = ["gpu/install_gpu_driver.sh"]
GPU_L4 = "type=nvidia-l4"
GPU_T4 = "type=nvidia-tesla-t4"
- GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a
- GPU_A100 = "type=nvidia-tesla-a100"
- GPU_H100 = "type=nvidia-h100-80gb,count=8"
+ GPU_V100 = "type=nvidia-tesla-v100"
+ GPU_A100 = "type=nvidia-tesla-a100,count=2"
+ GPU_H100 = "type=nvidia-h100-80gb,count=2"
+
+ # Tests for PyTorch
+ TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"
+
+ # Tests for TensorFlow
+ TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"
+
+ def assert_instance_command(self,
+ instance,
+ cmd,
+ timeout_in_minutes=DEFAULT_TIMEOUT):
+
+ retry_count = 5
+
+ ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format(
+ instance, self.cluster_zone, cmd)
+
+ while retry_count > 0:
+ try:
+ ret_code, stdout, stderr = self.assert_command( ssh_cmd, timeout_in_minutes )
+ return ret_code, stdout, stderr
+ except Exception as e:
+ print("An error occurred: ", e)
+ retry_count -= 1
+ if retry_count > 0:
+ time.sleep(10)
+ continue
+ else:
+ raise
def verify_instance(self, name):
# Verify that nvidia-smi works
- time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience
+ import random
+ # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions
+ time.sleep( 3 + random.randint(1, 30) )
self.assert_instance_command(name, "nvidia-smi", 1)
def verify_pyspark(self, name):
# Verify that pyspark works
self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+ def verify_pytorch(self, name):
+ test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
+ self.TORCH_TEST_SCRIPT_FILE_NAME)
+ self.upload_test_file(test_filename, name)
+
+ conda_env="dpgce"
+
+ # until the numa node is selected, every time the GPU is accessed
+ # from pytorch, log noise about numa node not being selected is
+ # printed to the console. Selecting numa node before the python is
+ # executed improves readability of the diagnostic information.
+
+ verify_cmd = \
+ "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \
+ "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \
+ "${envpath}/bin/python {}".format(
+ self.TORCH_TEST_SCRIPT_FILE_NAME)
+ self.assert_instance_command(name, verify_cmd)
+ self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
+
+ def verify_tensorflow(self, name):
+ test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
+ self.TF_TEST_SCRIPT_FILE_NAME)
+ self.upload_test_file(test_filename, name)
+ # all on a single numa node
+ conda_env="dpgce"
+ verify_cmd = \
+ "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \
+ "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \
+ "${envpath}/bin/python {}".format(
+ self.TF_TEST_SCRIPT_FILE_NAME)
+ self.assert_instance_command(name, verify_cmd)
+ self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)
+
def verify_mig_instance(self, name):
self.assert_instance_command(name,
"/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
@@ -41,49 +108,71 @@ def verify_instance_nvcc(self, name, cuda_version):
self.assert_instance_command(
name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) )
+ def verify_instance_pyspark(self, name):
+ # Verify that pyspark works
+ self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+
+ def verify_instance_cuda_version(self, name, cuda_version):
+ self.assert_instance_command(
+ name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) )
+
+ def verify_instance_driver_version(self, name, driver_version):
+ self.assert_instance_command(
+ name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) )
+
def verify_instance_spark(self):
+ self.assert_dataproc_job(
+ self.getClusterName(),
+ "spark",
+ "--jars=file:///usr/lib/spark/examples/jars/spark-examples.jar " \
+ + "--class=org.apache.spark.examples.SparkPi " \
+ + " -- 1000"
+ )
self.assert_dataproc_job(
self.getClusterName(),
"spark",
"--jars=file:///usr/lib/spark/examples/jars/spark-examples.jar " \
+ "--class=org.apache.spark.examples.ml.JavaIndexToStringExample " \
- + "--properties=" \
- + "spark.executor.resource.gpu.amount=1," \
- + "spark.executor.cores=6," \
- + "spark.executor.memory=4G," \
- + "spark.task.resource.gpu.amount=0.333," \
- + "spark.task.cpus=2," \
+ + "--properties="\
+ + "spark.executor.resource.gpu.amount=1,"\
+ + "spark.executor.cores=6,"\
+ + "spark.executor.memory=4G,"\
+ + "spark.plugins=com.nvidia.spark.SQLPlugin,"\
+ + "spark.executor.resource.gpu.discoveryScript=/usr/lib/spark/scripts/gpu/getGpusResources.sh,"\
+ + "spark.dynamicAllocation.enabled=false,"\
+ + "spark.sql.autoBroadcastJoinThreshold=10m,"\
+ + "spark.sql.files.maxPartitionBytes=512m,"\
+ + "spark.task.resource.gpu.amount=0.333,"\
+ + "spark.task.cpus=2,"\
+ "spark.yarn.unmanagedAM.enabled=false"
)
+ self.assert_dataproc_job(
+ self.getClusterName(),
+ "spark",
+ "--jars=file:///usr/lib/spark/examples/jars/spark-examples.jar " \
+ + "--class=org.apache.spark.examples.ml.JavaIndexToStringExample " \
+ + "--properties="\
+ + "spark.driver.resource.gpu.amount=1,"\
+ + "spark.driver.resource.gpu.discoveryScript=/usr/lib/spark/scripts/gpu/getGpusResources.sh,"\
+ + "spark.executor.resource.gpu.amount=1,"\
+ + "spark.executor.resource.gpu.discoveryScript=/usr/lib/spark/scripts/gpu/getGpusResources.sh"
+ )
- @parameterized.parameters(
- ("SINGLE", ["m"], GPU_T4, None, None),
-# ("STANDARD", ["m"], GPU_T4, None, None),
- ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"),
- )
- def test_install_gpu_default_agent(self, configuration, machine_suffixes,
- master_accelerator, worker_accelerator,
- driver_provider):
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
- metadata = None
- if driver_provider is not None:
- metadata = "gpu-driver-provider={}".format(driver_provider)
- self.createCluster(
- configuration,
- self.INIT_ACTIONS,
- machine_type="n1-highmem-8",
- master_accelerator=master_accelerator,
- worker_accelerator=worker_accelerator,
- metadata=metadata,
- timeout_in_minutes=90,
- boot_disk_size="50GB")
- for machine_suffix in machine_suffixes:
- machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
- self.verify_instance(machine_name)
- if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ):
- self.verify_pyspark(machine_name)
+ def verify_driver_signature(self, name):
+ cert_path='/var/lib/dkms/mok.pub'
+ if self.getImageOs() == 'ubuntu':
+ cert_path='/var/lib/shim-signed/mok/MOK.der'
+
+ cert_verification_cmd = """
+perl -Mv5.10 -e '
+my $cert = ( qx{openssl x509 -inform DER -in {} -text}
+ =~ /Serial Number:.*? +(.+?)\s*$/ms );
+my $kmod = ( qx{modinfo nvidia}
+ =~ /^sig_key:\s+(\S+)/ms );
+exit 1 unless $cert eq lc $kmod
+'
+"""
+ self.assert_instance_command( name, cert_verification_cmd.format(cert_path) )
@parameterized.parameters(
("SINGLE", ["m"], GPU_T4, None, None),
@@ -91,38 +180,44 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
def test_install_gpu_without_agent(self, configuration, machine_suffixes,
master_accelerator, worker_accelerator,
driver_provider):
-
self.skipTest("No need to regularly test not installing the agent")
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
metadata = "install-gpu-agent=false"
+ if configuration == 'SINGLE' \
+ and self.getImageOs() == 'rocky' \
+ and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+ # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+ self.skipTest("known to fail")
+
if driver_provider is not None:
metadata += ",gpu-driver-provider={}".format(driver_provider)
self.createCluster(
configuration,
self.INIT_ACTIONS,
- machine_type="n1-highmem-8",
+ machine_type="n1-standard-16",
master_accelerator=master_accelerator,
worker_accelerator=worker_accelerator,
metadata=metadata,
- timeout_in_minutes=30,
+ timeout_in_minutes=90,
boot_disk_size="50GB")
for machine_suffix in machine_suffixes:
- self.verify_instance("{}-{}".format(self.getClusterName(),
- machine_suffix))
+ machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+ self.verify_instance(machine_name)
@parameterized.parameters(
- ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
+ ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"),
# ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"),
)
def test_install_gpu_with_agent(self, configuration, machine_suffixes,
master_accelerator, worker_accelerator,
driver_provider):
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+ self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
+
+ if configuration == 'KERBEROS' \
+ and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+ # ('KERBEROS fails with image version <= 2.1')
+ self.skipTest("known to fail")
metadata = "install-gpu-agent=true"
if driver_provider is not None:
@@ -130,59 +225,66 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
self.createCluster(
configuration,
self.INIT_ACTIONS,
- machine_type="n1-highmem-8",
+ machine_type="n1-standard-16",
master_accelerator=master_accelerator,
worker_accelerator=worker_accelerator,
metadata=metadata,
- timeout_in_minutes=30,
+ timeout_in_minutes=90,
boot_disk_size="50GB",
scopes="https://www.googleapis.com/auth/monitoring.write")
for machine_suffix in machine_suffixes:
- self.verify_instance("{}-{}".format(self.getClusterName(),
- machine_suffix))
- self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),
- machine_suffix))
+ machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+ self.verify_instance(machine_name)
+ self.verify_instance_gpu_agent(machine_name)
@parameterized.parameters(
-# ("SINGLE", ["m"], GPU_T4, None, "12.0"),
- ("SINGLE", ["m"], GPU_T4, None, "11.8"),
+ ("SINGLE", ["m"], GPU_T4, None, "12.4"),
+# ("SINGLE", ["m"], GPU_T4, None, "11.8"),
("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
+ ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
)
def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
master_accelerator, worker_accelerator,
cuda_version):
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
- if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
- and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
- self.skipTest("CUDA == 12.0 not supported on debian 12")
+ if configuration == 'KERBEROS' \
+ and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+ # ('KERBEROS fails with image version <= 2.1')
+ self.skipTest("known to fail")
- if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+ if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
- self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+ self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
- if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
- and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+ if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+ self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
+ if configuration == 'SINGLE' \
+ and self.getImageOs() == 'rocky' \
+ and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+ # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+ self.skipTest("known to fail")
+
metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
self.createCluster(
configuration,
self.INIT_ACTIONS,
- machine_type="n1-highmem-8",
+ machine_type="n1-standard-16",
master_accelerator=master_accelerator,
worker_accelerator=worker_accelerator,
metadata=metadata,
- timeout_in_minutes=30,
+ timeout_in_minutes=90,
boot_disk_size="50GB")
+
for machine_suffix in machine_suffixes:
machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
self.verify_instance(machine_name)
self.verify_instance_nvcc(machine_name, cuda_version)
+ self.verify_instance_pyspark(machine_name)
+ self.verify_instance_spark()
@parameterized.parameters(
("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"),
@@ -192,37 +294,34 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
def test_install_gpu_with_mig(self, configuration, machine_suffixes,
master_accelerator, worker_accelerator,
driver_provider, cuda_version):
-
- self.skipTest("Test is known to fail. Skipping so that we can exercise others")
-
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
- if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
- and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
- self.skipTest("CUDA == 12.0 not supported on debian 12")
-
- if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+ # Operation [projects/.../regions/.../operations/...] failed:
+ # Invalid value for field 'resource.machineType': \
+ # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \
+ # 'machineTypes/a3-highgpu-2g'. \
+ # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature..
+ # ('This use case not thoroughly tested')
+ self.skipTest("known to fail")
+
+ if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
- self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+ self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
- if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
- and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+ if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+ self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version)
self.createCluster(
configuration,
self.INIT_ACTIONS,
- master_machine_type="a3-highgpu-8g",
+ master_machine_type="a3-highgpu-2g",
worker_machine_type="a2-highgpu-2g",
master_accelerator=master_accelerator,
worker_accelerator=worker_accelerator,
metadata=metadata,
- timeout_in_minutes=30,
+ timeout_in_minutes=90,
boot_disk_size="50GB",
startup_script="gpu/mig.sh")
@@ -236,12 +335,12 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
)
def test_gpu_allocation(self, configuration, master_accelerator,
worker_accelerator, driver_provider):
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
- and configuration == 'SINGLE':
- self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+ if configuration == 'SINGLE' \
+ and self.getImageOs() == 'rocky' \
+ and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+ # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+ self.skipTest("known to fail")
metadata = None
if driver_provider is not None:
@@ -251,11 +350,11 @@ def test_gpu_allocation(self, configuration, master_accelerator,
configuration,
self.INIT_ACTIONS,
metadata=metadata,
- machine_type="n1-highmem-8",
+ machine_type="n1-standard-16",
master_accelerator=master_accelerator,
worker_accelerator=worker_accelerator,
boot_disk_size="50GB",
- timeout_in_minutes=30)
+ timeout_in_minutes=90)
self.verify_instance_spark()
@@ -270,43 +369,92 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
master_accelerator, worker_accelerator,
cuda_version):
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
- if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
- and configuration == 'SINGLE':
- self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty")
-
- if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
- and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
- self.skipTest("CUDA == 12.0 not supported on debian 12")
-
- if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+ if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
- self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+ self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
- if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
- and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+ if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
- self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+ self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
+ if configuration == 'SINGLE' \
+ and self.getImageOs() == 'rocky' \
+ and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+ # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+ self.skipTest("known to fail")
metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
self.createCluster(
configuration,
self.INIT_ACTIONS,
- machine_type="n1-highmem-8",
+ machine_type="n1-standard-16",
master_accelerator=master_accelerator,
worker_accelerator=worker_accelerator,
metadata=metadata,
- timeout_in_minutes=30,
+ timeout_in_minutes=90,
boot_disk_size="50GB",
scopes="https://www.googleapis.com/auth/monitoring.write")
+
for machine_suffix in machine_suffixes:
- self.verify_instance("{}-{}".format(self.getClusterName(),
- machine_suffix))
- self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),
- machine_suffix))
+ machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+ self.verify_instance(machine_name)
+ self.verify_instance_gpu_agent(machine_name)
+ self.verify_instance_spark()
+
+ @parameterized.parameters(
+# ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''),
+# ("STANDARD", ["m"], GPU_T4, None, "12.0"),
+# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'),
+ ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'),
+# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'),
+# ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'),
+# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
+# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
+ )
+ def untested_driver_signing(self, configuration, machine_suffixes,
+ master_accelerator, worker_accelerator,
+ cuda_version, image_os, image_version):
+
+ if configuration == 'KERBEROS' \
+ and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+ # ('KERBEROS fails with image version <= 2.1')
+ self.skipTest("known to fail")
+
+ kvp_array=[]
+ import os
+
+ if "private_secret_name" in os.environ:
+      for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version', 'modulus_md5sum']:
+ kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) )
+
+ if kvp_array[0] == "public_secret_name=":
+ self.skipTest("This test only runs when signing environment has been configured in presubmit.sh")
+ else:
+ self.skipTest("This test only runs when signing environment has been configured in presubmit.sh")
+
+ metadata = ",".join( kvp_array )
+
+ if self.getImageOs() != image_os:
+ self.skipTest("This test is only run on os {}".format(image_os))
+ if self.getImageVersion() != image_version:
+ self.skipTest("This test is only run on Dataproc Image Version {}".format(image_os))
+
+ self.createCluster(
+ configuration,
+ self.INIT_ACTIONS,
+ machine_type="n1-standard-16",
+ master_accelerator=master_accelerator,
+ worker_accelerator=worker_accelerator,
+ metadata=metadata,
+ timeout_in_minutes=90,
+ boot_disk_size="50GB",
+ scopes="https://www.googleapis.com/auth/monitoring.write")
+ for machine_suffix in machine_suffixes:
+ hostname="{}-{}".format(self.getClusterName(),machine_suffix)
+ self.verify_instance(hostname)
+ self.verify_instance_gpu_agent(hostname)
+# self.verify_driver_signature(hostname)
self.verify_instance_spark()
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 936718498..8f08472bd 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -23,7 +23,7 @@
INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "false").lower() == "true"
-DEFAULT_TIMEOUT = 15 # minutes
+DEFAULT_TIMEOUT = 45 # minutes
class DataprocTestCase(parameterized.TestCase):
@@ -178,9 +178,9 @@ def createCluster(self,
args.append("--zone={}".format(self.cluster_zone))
if not FLAGS.skip_cleanup:
- args.append("--max-age=60m")
+ args.append("--max-age=120m")
- args.append("--max-idle=25m")
+ args.append("--max-idle=60m")
cmd = "{} dataproc clusters create {} {}".format(
"gcloud beta" if beta else "gcloud", self.name, " ".join(args))