6 changes: 3 additions & 3 deletions test/prototype/moe_training/test_everything.sh
@@ -12,9 +12,9 @@ IS_ROCM=$(rocm-smi --version || true)
 # These tests do not work on ROCm yet
 if [ -z "$IS_ROCM" ]
 then
-./test/prototype/moe_training/test_fsdp.sh
-./test/prototype/moe_training/test_tp.sh
-./test/prototype/moe_training/test_fsdp_tp.sh
+./test_fsdp.sh
+./test_tp.sh
+./test_fsdp_tp.sh
 fi
 
 echo "all tests successful"
12 changes: 9 additions & 3 deletions test/prototype/moe_training/test_fsdp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py
 #
 #######################################################################
 
@@ -45,7 +45,14 @@
 )
 
 
-def test_moe_float8_training_fsdp():
+@pytest.mark.parametrize(
+    "target_fqns",
+    [
+        ["experts"],
+        ["experts,shared_expert"],
+    ],
+)
+def test_moe_float8_training_fsdp(target_fqns: list[str]):
     assert torch.cuda.is_available()
 
     # setup distributed for fsdp
@@ -55,7 +62,6 @@ def test_moe_float8_training_fsdp():
     set_token_group_alignment_size_m(16)
 
     # define model args
-    target_fqns = ["experts"]
     model_args = MoEArgs(
         num_experts=8,
     )
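Each parametrize case is a list of FQN strings, and the new second case packs two FQNs into one comma-separated entry ("experts,shared_expert"). A minimal sketch of how such an entry could be flattened before matching module FQNs — the splitting helper here is a hypothetical illustration, not necessarily how the torchao conversion code consumes it:

    import pytest

    @pytest.mark.parametrize(
        "target_fqns",
        [
            ["experts"],
            ["experts,shared_expert"],
        ],
    )
    def test_flatten_target_fqns(target_fqns: list[str]):
        # An entry may bundle several FQNs separated by commas;
        # split them out before comparing against module FQNs.
        fqns = [fqn for entry in target_fqns for fqn in entry.split(",")]
        assert all("," not in fqn for fqn in fqns)

This sketch runs under plain pytest (no torchrun needed) since it touches no distributed state.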
2 changes: 1 addition & 1 deletion test/prototype/moe_training/test_fsdp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py -s
5 changes: 2 additions & 3 deletions test/prototype/moe_training/test_fsdp_tp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py
 #
 #######################################################################
 
@@ -67,8 +67,7 @@
     "target_fqns",
    [
         ["experts"],
-        # TODO: investigate hang when shared_expert is converted
-        # ["experts,shared_expert"],
+        ["experts,shared_expert"],
     ],
 )
 def test_moe_float8_training_fsdp_tp(target_fqns: list[str]):
2 changes: 1 addition & 1 deletion test/prototype/moe_training/test_fsdp_tp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=4 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py -s
5 changes: 2 additions & 3 deletions test/prototype/moe_training/test_tp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py
 #
 #######################################################################
 
@@ -67,8 +67,7 @@
     "target_fqns",
     [
         ["experts"],
-        # TODO: investigate hang when shared_expert is converted
-        # ["experts,shared_expert"],
+        ["experts,shared_expert"],
     ],
 )
 def test_moe_float8_training_tp(target_fqns: list[str]):
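Together with the matching change in test_fsdp_tp.py above, this re-enables the ["experts,shared_expert"] case and drops the TODO about investigating a hang, so shared-expert conversion is now exercised under both parallelism setups, presumably because the hang has since been resolved.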
2 changes: 1 addition & 1 deletion test/prototype/moe_training/test_tp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py -s