Skip to content

Commit

Permalink
Horovod test fix (#1264)
Browse files Browse the repository at this point in the history
* Fix the Horovod test by using the correct framework versions, and skip the tests on Dataproc image versions 2.1 and 2.2

* Correct the GPU test metadata (cuda-version and cudnn-version)

* Correct the accelerator type used by the GPU test (nvidia-tesla-p100 to nvidia-tesla-t4)
  • Loading branch information
prince-cs authored Nov 15, 2024
1 parent c063e5f commit da3d8c1
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 5 deletions.
4 changes: 2 additions & 2 deletions horovod/horovod.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ set -euxo pipefail

readonly DEFAULT_HOROVOD_VERSION="0.21.2"
readonly DEFAULT_TENSORFLOW_VERSION="2.4.1"
-readonly DEFAULT_PYTORCH_VERSION="1.11.0"
+readonly DEFAULT_PYTORCH_VERSION="1.7.1"
readonly DEFAULT_TORCHVISION_VERSION="0.8.2"
readonly DEFAULT_MXNET_VERSION="1.7.0.post1"
-readonly DEFAULT_CUDA_VERSION="11.0"
+readonly DEFAULT_CUDA_VERSION="12.4"

HOROVOD_VERSION="$(/usr/share/google/get_metadata_value attributes/horovod-version || echo ${DEFAULT_HOROVOD_VERSION})"
readonly HOROVOD_VERSION
Expand Down
13 changes: 10 additions & 3 deletions horovod/test_horovod.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pkg_resources

from absl.testing import absltest
from absl.testing import parameterized

Expand All @@ -9,6 +11,7 @@ class HorovodTestCase(DataprocTestCase):
INIT_ACTIONS = ["horovod/horovod.sh"]
GPU_INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] + INIT_ACTIONS
GPU_P100 = "type=nvidia-tesla-p100"
GPU_T4 = "type=nvidia-tesla-t4"

TENSORFLOW_TEST_SCRIPT = "scripts/verify_tensorflow.py"
PYTORCH_TEST_SCRIPT = "scripts/verify_pytorch.py"
Expand All @@ -26,6 +29,8 @@ def _submit_spark_job(self, script):
def test_horovod_cpu(self, configuration, controller):
if self.getImageOs() == 'rocky':
self.skipTest("Not supported in Rocky Linux-based images")
if self.getImageVersion() > pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in Dataproc image version 2.1 and 2.2")

metadata = ""
if controller == "mpi":
Expand All @@ -44,16 +49,18 @@ def test_horovod_cpu(self, configuration, controller):
def test_horovod_gpu(self, configuration, controller):
if self.getImageOs() == 'rocky':
self.skipTest("Not supported in Rocky Linux-based images")
if self.getImageVersion() > pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in Dataproc image version 2.1 and 2.2")

-metadata = "cuda-version=11.1,cudnn-version=8.0.5.39,gpu-driver-provider=NVIDIA"
+metadata = "cuda-version=12.4,cudnn-version=9.1.0.70,gpu-driver-provider=NVIDIA"

self.createCluster(
configuration,
self.GPU_INIT_ACTIONS,
timeout_in_minutes=60,
machine_type="n1-standard-8",
-master_accelerator=self.GPU_P100,
-worker_accelerator=self.GPU_P100,
+master_accelerator=self.GPU_T4,
+worker_accelerator=self.GPU_T4,
metadata=metadata)


Expand Down

0 comments on commit da3d8c1

Please sign in to comment.