tests: add llama 3.3 70b 2 nodes tests (#4391)

xinhe-nv · web-flow · commit 750f412b8f51 · 2025-05-21T12:42:45.000+08:00
* add llama 3.3 70b 2 nodes tests

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

* remove enable_overlap_scheduler parameter

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

---------

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
@@ -1551,20 +1551,19 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
 
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(8)
-@pytest.mark.parametrize("model_name,model_path", [
-    pytest.param('DeepSeek-V3', 'DeepSeek-V3', marks=skip_pre_hopper),
-])
+@skip_pre_hopper
+@skip_post_blackwell
+@pytest.mark.parametrize("model_path", ['DeepSeek-V3'])
 def test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus(
-        llm_root, llm_venv, model_name, model_path):
+        llm_root, llm_venv, model_path):
     # "RCCA https://nvbugs/5163844"
-    print(f"Testing {model_name}.")
+    print(f"Testing {model_path}.")
     example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
     run_cmd = [
         "trtllm-llmapi-launch",
         "python3",
         str(example_root / "quickstart_advanced.py"),
-        "--model_dir",
-        f"{llm_models_root()}/{model_path}",
+        f"--model_dir={llm_models_root()}/{model_path}",
         "--moe_ep_size=8",
         "--tp_size=16",
         "--use_cuda_graph",
@@ -2063,4 +2062,30 @@ def test_ptp_scaffolding(llm_root, llm_venv, model_name, model_path):
     ])
 
 
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.skip_less_device(4)  # NOTE(review): command passes --tp_size=16 and the DeepSeek 2-node test gates on 8 devices; confirm 4 is intended
+@pytest.mark.parametrize("model_path", [
+    pytest.param('llama-3.3-models/Llama-3.3-70B-Instruct',
+                 marks=skip_pre_hopper),
+    pytest.param('Llama-4-Maverick-17B-128E-Instruct', marks=skip_pre_hopper),
+])
+def test_ptp_quickstart_advanced_llama_2nodes(llm_root, llm_venv, model_path):
+    print(f"Testing {model_path}.")
+    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    run_cmd = [  # argv for quickstart_advanced.py, started through trtllm-llmapi-launch
+        "trtllm-llmapi-launch",
+        "python3",
+        str(example_root / "quickstart_advanced.py"),
+        f"--model_dir={llm_models_root()}/{model_path}",  # model_path is joined onto llm_models_root()
+        "--moe_ep_size=8",  # NOTE(review): same value as the DeepSeek-V3 command; confirm it applies to both Llama params
+        "--tp_size=16",
+        "--use_cuda_graph",
+        f"--kv_cache_fraction={_MEM_FRACTION_50}",
+        "--max_batch_size=32",
+        "--max_num_tokens=2048",
+        "--disable_kv_cache_reuse",
+    ]
+    check_call(" ".join(run_cmd), shell=True, env=llm_venv._new_env)  # joined into one shell string; env comes from llm_venv
+
+
 # End of Pivot-To-Python examples
diff --git a/tests/integration/test_lists/qa/llm_multinodes_function_test.txt b/tests/integration/test_lists/qa/llm_multinodes_function_test.txt
@@ -2,5 +2,6 @@ examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp
 examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-infer]
 examples/test_mixtral.py::test_llm_mixtral_2nodes_8gpus[Mixtral-8x22B-v0.1-plugin-renormalize-tensor_parallel-build]
 examples/test_mixtral.py::test_llm_mixtral_2nodes_8gpus[Mixtral-8x22B-v0.1-plugin-renormalize-tensor_parallel-infer]
-test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus[DeepSeek-V3-DeepSeek-V3]
+test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus[DeepSeek-V3]
+test_e2e.py::test_ptp_quickstart_advanced_llama_2nodes[llama-3.3-models/Llama-3.3-70B-Instruct]
 test_e2e.py::test_openai_multinodes_chat_tp16pp1