From 1376d171ee00954bac679d35e9881afb2da024be Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 18 Feb 2025 09:32:50 -0800 Subject: [PATCH] [gpu] Exercise installer with all supported GPU types (#1302) * [gpu] Exercise installer with all supported GPU types * enabling GPU agent by default * for cuda repo, add known signing keys to trustdb * move variable definition to prepare function * add some TODO notes for installing this script as a custom image * spark supports maximum 1 GPU per executor * printing warning when old cuda is used with ubuntu22 * built a list of PCI devices for each supported GPU type * improvements to disk usage metrics display * Exercised script with variety of images * Supported on 2.0.27-debian10 * * added fallback to using gsutil when gcloud sdk < 402 * * adoptium, cran-r and gcloud pgp databases updated * * writing multi-valued gpg databases using `gpg --import --no-default-keyring --keyring ...` vice `gpg --dearmor -o` * * evaluating variable name for keyring path when writing sources.list file * driver builds for P4, P100, and V100 now tested * * build driver from proprietary source when PCI Device ID < 0x1E00 (Turing) * * separating open vs non-free kmod builds into their own gcs path * renamed completion variable for nvidia container toolkit from nvtk to nvctk * removed export of METADATA_HTTP_PROXY variable, since it's only used in this one function --- gpu/install_gpu_driver.sh | 264 +++++++++++++++++++++++++------------- 1 file changed, 175 insertions(+), 89 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 66964b4d1..49c55e651 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -449,7 +449,7 @@ GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') readonly GPU_DRIVER_PROVIDER # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver -INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') +INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'true') readonly INSTALL_GPU_AGENT # Dataproc configurations @@ -628,15 +628,20 @@ function install_nvidia_nccl() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then - local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" - local build_start_epoch="$(date -d "${build_start_time}" +%s)" - local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while gsutil ls -L "${gcs_tarball}.building" ; do + if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then + local build_start_time build_start_epoch timeout_epoch + if [[ "${gsutil_cmd}" =~ "gsutil" ]] ; then + build_start_time="$(${gsutil_cmd} ls -L "${gcs_tarball}.building" | awk -F': +' '/Creation time/ {print $2}')" + else + build_start_time="$(${gsutil_cmd} ls -j "${gcs_tarball}.building" | jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" + fi + build_start_epoch="$(date -d "${build_start_time}" +%s)" + timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes + while ${gsutil_cmd} ls -L "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - gsutil rm "${gcs_tarball}.building" + ${gsutil_cmd} rm "${gcs_tarball}.building" break fi sleep 5m @@ -644,15 +649,15 @@ function install_nvidia_nccl() { fi fi - output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + output=$(${gsutil_cmd} ls "${gcs_tarball}" 2>&1 || echo '') if echo "${output}" | grep -q "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" - gcloud storage cat "${gcs_tarball}" | tar xvz + ${gsutil_cmd} cat "${gcs_tarball}" | tar xvz else # build and cache touch "${local_tarball}.building" - gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install @@ -699,8 +704,8 @@ function install_nvidia_nccl() { make clean popd tar xzvf "${local_tarball}" - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" + if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" fi @@ -804,15 +809,20 @@ function install_pytorch() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then - local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" - local build_start_epoch="$(date -d "${build_start_time}" +%s)" - local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while gsutil ls -L "${gcs_tarball}.building" ; do + if ${gsutil_cmd} ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then + local build_start_time build_start_epoch timeout_epoch + if [[ "${gsutil_cmd}" =~ "gsutil" ]] ; then + build_start_time="$(${gsutil_cmd} ls -L "${gcs_tarball}.building" | awk -F': +' '/Creation time/ {print $2}')" + else + build_start_time="$(${gsutil_cmd} ls -j "${gcs_tarball}.building" | jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" + fi + build_start_epoch="$(date -d "${build_start_time}" +%s)" + timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes + while ${gsutil_cmd} ls -L "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - gsutil rm "${gcs_tarball}.building" + ${gsutil_cmd} rm "${gcs_tarball}.building" break fi sleep 5m @@ -820,15 +830,15 @@ function install_pytorch() { fi fi - output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + output=$(${gsutil_cmd} ls "${gcs_tarball}" 2>&1 || echo '') if echo "${output}" | grep -q "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" mkdir -p "${envpath}" - gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz + ${gsutil_cmd} cat "${gcs_tarball}" | tar -C "${envpath}" -xz else touch "${local_tarball}.building" - gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" local verb=create if test -d "${envpath}" ; then verb=install ; fi @@ -848,8 +858,8 @@ function install_pytorch() { pushd "${envpath}" tar czf "${local_tarball}" . popd - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" + if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi building_file="" fi @@ -987,8 +997,11 @@ function add_repo_cuda() { local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ | sudo tee "${sources_list_path}" - curl ${curl_retry_args} "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ - -o "${kr_path}" + + for keyid in "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" ; do + curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + done else install_cuda_keyring_pkg # 11.7+, 12.0+ fi @@ -997,11 +1010,13 @@ function add_repo_cuda() { fi } -readonly uname_r=$(uname -r) - function build_driver_from_github() { - # non-GPL driver will have been built on rocky8 or if driver version is prior to open kernel version - if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then return 0 ; fi + # non-GPL driver will have been built on rocky8, or when driver + # version is prior to open driver min, or GPU architecture is prior + # to Turing + if ( is_rocky8 \ + || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ + || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) ; then return 0 ; fi pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { tarball_fn="${DRIVER_VERSION}.tar.gz" @@ -1025,15 +1040,20 @@ function build_driver_from_github() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then - local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" - local build_start_epoch="$(date -d "${build_start_time}" +%s)" - local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while gsutil ls -L "${gcs_tarball}.building" ; do + if ${gsutil_cmd} ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then + local build_start_time build_start_epoch timeout_epoch + if [[ "${gsutil_cmd}" =~ "gsutil" ]] ; then + build_start_time="$(${gsutil_cmd} ls -L "${gcs_tarball}.building" | awk -F': +' '/Creation time/ {print $2}')" + else + build_start_time="$(${gsutil_cmd} ls -j "${gcs_tarball}.building" | jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" + fi + build_start_epoch="$(date -d "${build_start_time}" +%s)" + timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes + while ${gsutil_cmd} ls -L "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - gsutil rm "${gcs_tarball}.building" + ${gsutil_cmd} rm "${gcs_tarball}.building" break fi sleep 5m @@ -1041,12 +1061,12 @@ function build_driver_from_github() { fi fi - if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + if ${gsutil_cmd} ls "${gcs_tarball}" 2>&1 ; then echo "cache hit" else # build the kernel modules touch "${local_tarball}.building" - gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd open-gpu-kernel-modules install_build_dependencies @@ -1075,14 +1095,14 @@ function build_driver_from_github() { tar czvf "${local_tarball}" \ "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" + if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" make clean popd fi - gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv depmod -a } @@ -1155,10 +1175,15 @@ function install_nvidia_userspace_runfile() { local cache_hit="0" local local_tarball - if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then + # Build nonfree driver on rocky8, or when driver version is prior to + # open driver min, or when GPU architecture is prior to Turing + if ( is_rocky8 \ + || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ + || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) + then local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { - local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}_nonfree.tar.gz" local_tarball="${workdir}/${build_tarball}" local build_dir if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] @@ -1170,15 +1195,20 @@ function install_nvidia_userspace_runfile() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then - local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" - local build_start_epoch="$(date -d "${build_start_time}" +%s)" - local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while gsutil ls -L "${gcs_tarball}.building" ; do + if ${gsutil_cmd} ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then + local build_start_time build_start_epoch timeout_epoch + if [[ "${gsutil_cmd}" =~ "gsutil" ]] ; then + build_start_time="$(${gsutil_cmd} ls -L "${gcs_tarball}.building" | awk -F': +' '/Creation time/ {print $2}')" + else + build_start_time="$(${gsutil_cmd} ls -j "${gcs_tarball}.building" | jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" + fi + build_start_epoch="$(date -d "${build_start_time}" +%s)" + timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes + while ${gsutil_cmd} ls -L "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - gsutil rm "${gcs_tarball}.building" + ${gsutil_cmd} rm "${gcs_tarball}.building" break fi sleep 5m @@ -1186,7 +1216,7 @@ function install_nvidia_userspace_runfile() { fi fi - if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + if ${gsutil_cmd} ls "${gcs_tarball}" ; then cache_hit="1" if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then runfile_args="${runfile_args} --no-kernel-modules" @@ -1195,7 +1225,7 @@ function install_nvidia_userspace_runfile() { else # build the kernel modules touch "${local_tarball}.building" - gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" install_build_dependencies configure_dkms_certs @@ -1225,17 +1255,20 @@ function install_nvidia_userspace_runfile() { --install-libglvnd \ --tmpdir="${tmpdir}" - if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then + # On rocky8, or when driver version is prior to open driver min, or when GPU architecture is prior to Turing + if ( is_rocky8 \ + || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ + || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) ; then if [[ "${cache_hit}" == "1" ]] ; then - gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv depmod -a else clear_dkms_key tar czvf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" + if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi building_file="" fi fi @@ -1314,7 +1347,7 @@ function install_cuda(){ } function install_nvidia_container_toolkit() { - is_complete install-nvtk && return + is_complete install-nvctk && return local container_runtime_default if command -v docker ; then container_runtime_default='docker' @@ -1332,7 +1365,7 @@ function install_nvidia_container_toolkit() { nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" systemctl restart "${CONTAINER_RUNTIME}" - mark_complete install-nvtk + mark_complete install-nvctk } # Install NVIDIA GPU driver provided by NVIDIA @@ -1370,6 +1403,8 @@ function install_ops_agent(){ cd /opt/google # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation curl ${curl_retry_args} -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + local expected="038d98644e4c4a7969d26da790946720d278c8d49bb82b677f550c2a2b858411 add-google-cloud-ops-agent-repo.sh" + execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install mark_complete ops-agent @@ -1434,7 +1469,12 @@ function set_hadoop_property() { } function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then + # TODO: when running this script to customize an image, this file + # needs to be written *after* bdutil completes + + return 0 + fi # pre-init scripts if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" fi @@ -1538,9 +1578,10 @@ EOF chmod a+rx "${gpus_resources_script}" - local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" if version_lt "${SPARK_VERSION}" "3.0" ; then return ; fi + local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" + local spark_defaults_dir="$(dirname "${spark_defaults_conf}")" if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}" fi @@ -1553,6 +1594,9 @@ EOF # gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" gpu_amount="$(perl -e "print 1 / ${executor_cores}")" + # TODO: when running this script to customize an image, this file + # needs to be written *after* bdutil completes + cat >>"${spark_defaults_conf}" < "0" ]] ; then + # N.B.: https://pci-ids.ucw.cz/v2.2/pci.ids.xz + pci_device_id="$(grep -h -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | head -1 | awk -F: '{print $2}')" + pci_device_id_int="$((16#${pci_device_id}))" + case "${pci_device_id}" in + "15F8" ) gpu_type="nvidia-tesla-p100" ;; + "1BB3" ) gpu_type="nvidia-tesla-p4" ;; + "1DB1" ) gpu_type="nvidia-tesla-v100" ;; + "1EB8" ) gpu_type="nvidia-tesla-t4" ;; + "20*" ) gpu_type="nvidia-tesla-a100" ;; + "23*" ) gpu_type="nvidia-h100" ;; # install does not begin with image 2.0.68-debian10/cuda11.1 + "27B8" ) gpu_type="nvidia-l4" ;; # install does not complete with image 2.0.68-debian10/cuda11.1 + esac + + ACCELERATOR="type=${gpu_type},count=${gpu_count}" + fi + nvsmi_works="0" if is_cuda11 ; then gcc_ver="11" @@ -1862,11 +1929,11 @@ function cache_fetched_package() { local gcs_fn="$2" local local_fn="$3" - if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then - time gcloud storage cp "${gcs_fn}" "${local_fn}" + if ${gsutil_cmd} ls "${gcs_fn}" 2>&1 ; then + time ${gsutil_cmd} cp "${gcs_fn}" "${local_fn}" else time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \ - gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + ${gsutil_cmd} cp "${local_fn}" "${gcs_fn}" ; ) fi } @@ -1881,8 +1948,8 @@ function clean_up_sources_lists() { local regional_bigtop_repo_uri regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | - sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | - grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | + sed -E "s#/dataproc-bigtop-repo(-dev)?/#/goog-dataproc-bigtop-repo\\1-${region}/#" | + grep -E "deb .*goog-dataproc-bigtop-repo(-dev)?-${region}.* dataproc contrib" | cut -d ' ' -f 2 | head -1) @@ -1908,12 +1975,17 @@ function clean_up_sources_lists() { local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" rm -f "${adoptium_kr_path}" - curl ${curl_retry_args} "${key_url}" \ - | gpg --dearmor -o "${adoptium_kr_path}" + local -r old_adoptium_list="/etc/apt/sources.list.d/adoptopenjdk.list" + if test -f "${old_adoptium_list}" ; then + rm -f "${old_adoptium_list}" + fi + for keyid in "0x3b04d753c9050d9a5d343f39843c48a565f8f04b" "0x35baa0b33e9eb396f59ca838c0ba5ce6dc6315a3" ; do + curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" \ + | gpg --import --no-default-keyring --keyring "${adoptium_kr_path}" + done echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ > /etc/apt/sources.list.d/adoptium.list - # # docker # @@ -1923,20 +1995,22 @@ function clean_up_sources_lists() { rm -f "${docker_kr_path}" curl ${curl_retry_args} "${docker_key_url}" \ - | gpg --dearmor -o "${docker_kr_path}" + | gpg --import --no-default-keyring --keyring "${docker_kr_path}" echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ > ${docker_repo_file} # # google cloud + logging/monitoring # - if ls /etc/apt/sources.list.d/google-cloud*.list ; then - rm -f /usr/share/keyrings/cloud.google.gpg - curl ${curl_retry_args} https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + local gcloud_kr_path="/usr/share/keyrings/cloud.google.gpg" + if ls /etc/apt/sources.list.d/google-clou*.list ; then + rm -f "${gcloud_kr_path}" + curl ${curl_retry_args} https://packages.cloud.google.com/apt/doc/apt-key.gpg \ + | gpg --import --no-default-keyring --keyring "${gcloud_kr_path}" for list in google-cloud google-cloud-logging google-cloud-monitoring ; do list_file="/etc/apt/sources.list.d/${list}.list" if [[ -f "${list_file}" ]]; then - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" + sed -i -e "s:deb https:deb [signed-by=${gcloud_kr_path}] https:g" "${list_file}" fi done fi @@ -1945,12 +2019,13 @@ function clean_up_sources_lists() { # cran-r # if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then - keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" - if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi - rm -f /usr/share/keyrings/cran-r.gpg - curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ - gpg --dearmor -o /usr/share/keyrings/cran-r.gpg - sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list + local cranr_kr_path="/usr/share/keyrings/cran-r.gpg" + rm -f "${cranr_kr_path}" + for keyid in "0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" "0xe298a3a825c0d65dfd57cbb651716619e084dab9" ; do + curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" \ + | gpg --import --no-default-keyring --keyring "${cranr_kr_path}" + done + sed -i -e "s:deb http:deb [signed-by=${cranr_kr_path}] http:g" /etc/apt/sources.list.d/cran-r.list fi # @@ -1973,7 +2048,7 @@ function exit_handler() { # clean up incomplete build indicators if test -n "${building_file}" ; then - if gcloud storage ls "${building_file}" ; then gcloud storage rm "${building_file}" || true ; fi + if ${gsutil_cmd} ls "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi fi set +ex @@ -2046,15 +2121,19 @@ function exit_handler() { #/dev/vda2 7096908 2611344 4182932 39% / df / | tee -a "/run/disk-usage.log" - perl -e '@siz=( sort { $a => $b } - map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting}; -print( " samples-taken: ", scalar @siz, $/, + perl -e '($first, @samples) = grep { m:^/: } ; + unshift(@samples,$first); $final=$samples[-1]; + ($starting)=(split(/\s+/,$first))[2] =~ /^(\d+)/; + ($ending)=(split(/\s+/,$final))[2] =~ /^(\d+)/; + @siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } @samples ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$starting; +print( " samples-taken: ", scalar @siz, $/, "starting-disk-used: $starting", $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" + " ending-disk-used: $ending", $/, + " maximum-disk-used: $max", $/, + " minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" echo "exit_handler has completed" @@ -2074,7 +2153,6 @@ function set_proxy(){ if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi - export METADATA_HTTP_PROXY export http_proxy="${METADATA_HTTP_PROXY}" export https_proxy="${METADATA_HTTP_PROXY}" export HTTP_PROXY="${METADATA_HTTP_PROXY}" @@ -2144,10 +2222,18 @@ function harden_sshd_config() { } function prepare_to_install(){ + readonly uname_r=$(uname -r) # Verify OS compatability and Secure boot state check_os check_secure_boot + # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be + # used as a more performant replacement for `gsutil` + gsutil_cmd="gcloud storage" + gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" + if version_lt "${gcloud_sdk_version}" "402.0.0" ; then + gsutil_cmd="gsutil -o GSUtil:check_hashes=never" + fi curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" prepare_gpu_env