6 changes: 3 additions & 3 deletions test/prototype/moe_training/test_everything.sh
@@ -12,9 +12,9 @@ IS_ROCM=$(rocm-smi --version || true)
 # These tests do not work on ROCm yet
 if [ -z "$IS_ROCM" ]
 then
-./test/prototype/moe_training/test_fsdp.sh
-./test/prototype/moe_training/test_tp.sh
-./test/prototype/moe_training/test_fsdp_tp.sh
+./test_fsdp.sh
+./test_tp.sh
+./test_fsdp_tp.sh
 fi
 
 echo "all tests successful"
12 changes: 9 additions & 3 deletions test/prototype/moe_training/test_fsdp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py
 #
 #######################################################################
 
@@ -45,7 +45,14 @@
 )
 
 
-def test_moe_float8_training_fsdp():
+@pytest.mark.parametrize(
+    "target_fqns",
+    [
+        ["experts"],
+        ["experts,shared_expert"],
+    ],
+)
+def test_moe_float8_training_fsdp(target_fqns: list[str]):
     assert torch.cuda.is_available()
 
     # setup distributed for fsdp
@@ -55,7 +62,6 @@ def test_moe_float8_training_fsdp():
     set_token_group_alignment_size_m(16)
 
     # define model args
-    target_fqns = ["experts"]
     model_args = MoEArgs(
         num_experts=8,
     )
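Each parametrize case is a list of FQN strings, and the new second case packs two FQNs into one comma-separated entry ("experts,shared_expert"). A minimal sketch of how such an entry could be flattened before matching module FQNs — the splitting helper here is a hypothetical illustration, not necessarily how the torchao conversion code consumes it:

    import pytest

    @pytest.mark.parametrize(
        "target_fqns",
        [
            ["experts"],
            ["experts,shared_expert"],
        ],
    )
    def test_flatten_target_fqns(target_fqns: list[str]):
        # An entry may bundle several FQNs separated by commas;
        # split them out before comparing against module FQNs.
        fqns = [fqn for entry in target_fqns for fqn in entry.split(",")]
        assert all("," not in fqn for fqn in fqns)

This sketch runs under plain pytest (no torchrun needed) since it touches no distributed state.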
2 changes: 1 addition & 1 deletion test/prototype/moe_training/test_fsdp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py -s
5 changes: 2 additions & 3 deletions test/prototype/moe_training/test_fsdp_tp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py
 #
 #######################################################################
 
@@ -67,8 +67,7 @@
     "target_fqns",
    [
         ["experts"],
-        # TODO: investigate hang when shared_expert is converted
-        # ["experts,shared_expert"],
+        ["experts,shared_expert"],
     ],
 )
 def test_moe_float8_training_fsdp_tp(target_fqns: list[str]):
2 changes: 1 addition & 1 deletion test/prototype/moe_training/test_fsdp_tp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=4 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py -s
5 changes: 2 additions & 3 deletions test/prototype/moe_training/test_tp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py
 #
 #######################################################################
 
@@ -67,8 +67,7 @@
     "target_fqns",
     [
         ["experts"],
-        # TODO: investigate hang when shared_expert is converted
-        # ["experts,shared_expert"],
+        ["experts,shared_expert"],
     ],
 )
 def test_moe_float8_training_tp(target_fqns: list[str]):
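Together with the matching change in test_fsdp_tp.py above, this re-enables the ["experts,shared_expert"] case and drops the TODO about investigating a hang, so shared-expert conversion is now exercised under both parallelism setups, presumably because the hang has since been resolved.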
2 changes: 1 addition & 1 deletion test/prototype/moe_training/test_tp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py -s