diff --git a/horovod/horovod.sh b/horovod/horovod.sh index 410b32012..3404a40c7 100644 --- a/horovod/horovod.sh +++ b/horovod/horovod.sh @@ -22,10 +22,10 @@ set -euxo pipefail readonly DEFAULT_HOROVOD_VERSION="0.21.2" readonly DEFAULT_TENSORFLOW_VERSION="2.4.1" -readonly DEFAULT_PYTORCH_VERSION="1.11.0" +readonly DEFAULT_PYTORCH_VERSION="1.7.1" readonly DEFAULT_TORCHVISION_VERSION="0.8.2" readonly DEFAULT_MXNET_VERSION="1.7.0.post1" -readonly DEFAULT_CUDA_VERSION="11.0" +readonly DEFAULT_CUDA_VERSION="12.4" HOROVOD_VERSION="$(/usr/share/google/get_metadata_value attributes/horovod-version || echo ${DEFAULT_HOROVOD_VERSION})" readonly HOROVOD_VERSION diff --git a/horovod/test_horovod.py b/horovod/test_horovod.py index 511dbade1..8037a38a8 100644 --- a/horovod/test_horovod.py +++ b/horovod/test_horovod.py @@ -1,3 +1,5 @@ +import pkg_resources + from absl.testing import absltest from absl.testing import parameterized @@ -9,6 +11,7 @@ class HorovodTestCase(DataprocTestCase): INIT_ACTIONS = ["horovod/horovod.sh"] GPU_INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] + INIT_ACTIONS GPU_P100 = "type=nvidia-tesla-p100" + GPU_T4 = "type=nvidia-tesla-t4" TENSORFLOW_TEST_SCRIPT = "scripts/verify_tensorflow.py" PYTORCH_TEST_SCRIPT = "scripts/verify_pytorch.py" @@ -26,6 +29,8 @@ def _submit_spark_job(self, script): def test_horovod_cpu(self, configuration, controller): if self.getImageOs() == 'rocky': self.skipTest("Not supported in Rocky Linux-based images") + if self.getImageVersion() > pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in Dataproc image version 2.1 and 2.2") metadata = "" if controller == "mpi": @@ -44,16 +49,18 @@ def test_horovod_cpu(self, configuration, controller): def test_horovod_gpu(self, configuration, controller): if self.getImageOs() == 'rocky': self.skipTest("Not supported in Rocky Linux-based images") + if self.getImageVersion() > pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in Dataproc image version 2.1 and 2.2") - metadata = "cuda-version=11.1,cudnn-version=8.0.5.39,gpu-driver-provider=NVIDIA" + metadata = "cuda-version=12.4,cudnn-version=9.1.0.70,gpu-driver-provider=NVIDIA" self.createCluster( configuration, self.GPU_INIT_ACTIONS, timeout_in_minutes=60, machine_type="n1-standard-8", - master_accelerator=self.GPU_P100, - worker_accelerator=self.GPU_P100, + master_accelerator=self.GPU_T4, + worker_accelerator=self.GPU_T4, metadata=metadata)