
Commit d20ab7d

add heuristic for xqa hopper spec-dec kernel, add test for fp8 llama3 ckpt
Signed-off-by: Jhao-Ting Chen <[email protected]>
1 parent 039f7e3 commit d20ab7d

File tree

5 files changed: +110 additions, -5 deletions

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h
cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp
tests/integration/defs/examples/test_eagle.py
tests/integration/test_lists/test-db/l0_h100.yml

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h

Lines changed: 79 additions & 0 deletions
@@ -368,5 +368,84 @@ inline int computeMultiBlockCount(XQAParams const& xqaParams, int batch_size, in
     return multi_block_count;
 }
 
+inline int computeMultiBlockCountSpecDecGMMA(
+    XQAParams const& xqaParams, int batch_size, int multiprocessor_count, int specDecBlocks)
+{
+    auto const userSpecified = tensorrt_llm::common::getEnvXqaBlocksPerSequence();
+    if (userSpecified.has_value())
+    {
+        return userSpecified.value();
+    }
+    int multi_block_count = 1;
+
+    // Skip large batch sizes.
+    TLLM_CHECK_WITH_INFO(batch_size <= 32, "Multi-block tuning should only be used for batch size <= 32");
+
+    int num_kv_heads = xqaParams.num_kv_heads;
+    int history_length = xqaParams.max_past_kv_length;
+
+    // gridDim = dim3{specDecBlocks, multi_block, nbKVHeads * xqaParams.batch_size}
+    int single_block_count = specDecBlocks * num_kv_heads * batch_size;
+    double wave_count = (double) single_block_count / (double) multiprocessor_count;
+
+    // Multi-block tuning for low CTA counts: populate CTAs up to at most 1 wave of SMs.
+    if (wave_count < 1)
+    {
+        auto highestPowerof2 = [](int x)
+        {
+            x |= x >> 1;
+            x |= x >> 2;
+            x |= x >> 4;
+            x |= x >> 8;
+            x |= x >> 16;
+            return x ^ (x >> 1);
+        };
+
+        // Calculate the maximum number of blocks that still populates at most 1 wave.
+        multi_block_count = floor(multiprocessor_count / single_block_count);
+        // Make multi_block_count a power of 2 for tuning convenience.
+        multi_block_count = highestPowerof2(multi_block_count);
+        // Make multi_block_count at most 64 and at least 1.
+        multi_block_count = std::min(multi_block_count, 64);
+        multi_block_count = std::max(multi_block_count, 1);
+
+        // Tune only when the original CTA count is too small, multi_block_count is too big,
+        // and history length < 2^16.
+        // For Hopper, most parts have 114, 132, or 144 SMs; H20 has about 78.
+        // single_block_count = [1..8]
+        // multi_block_count = [16,32,64,128]
+        // history_length = [1024..65536]
+        if (single_block_count <= 8 && multi_block_count >= 16 && history_length < 65536)
+        {
+            if (history_length <= 1024)
+            {
+                // For history length <= 1024 and low CTA counts, scaling is not effective,
+                // so we set a hard limit of multi_block_count = 4.
+                multi_block_count = std::min(multi_block_count, 4);
+            }
+            else if (history_length < 65536)
+            {
+                // At single_block_count == 8, multi_block_count can only be 16 (SM / 8 ~= 16).
+                // Tune only 1024 < kvlen <= 8192.
+                if (single_block_count == 8 && history_length <= 8192)
+                {
+                    multi_block_count >>= 1;
+                }
+                else
+                {
+                    auto getLog2 = [](int x) { return x ? 31 - __builtin_clz(x) : -1; };
+                    auto history_length_log2 = getLog2(history_length);
+                    multi_block_count >>= 3 - (history_length_log2 - 10) / 2;
+                    // 2^15 (< 65536)      -> shift 1
+                    // 2^13, 2^14          -> shift 2
+                    // 2^11, 2^12 (> 1024) -> shift 3
+                }
+            }
+        }
+        TLLM_CHECK_WITH_INFO((multi_block_count * single_block_count) <= multiprocessor_count,
+            "The adjusted multi-block count exceeds the number of SMs; adding an additional wave may cause a perf drop.");
+    }
+    return multi_block_count;
+}
+
 } // namespace kernels
 } // namespace tensorrt_llm
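
For intuition, here is a standalone sketch (not part of this commit) that mirrors the heuristic's arithmetic for a hypothetical request shape: an assumed 132-SM Hopper part, batch size 1, 8 KV heads, and one spec-dec block. All helper names and sample values below are illustrative assumptions, not code or numbers from the change.

// Hypothetical standalone sketch mirroring computeMultiBlockCountSpecDecGMMA; values are assumed.
#include <algorithm>
#include <cstdio>

// Same power-of-2 rounding as the lambda in the kernel heuristic.
static int highestPowerOf2(int x)
{
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return x ^ (x >> 1);
}

static int pickMultiBlock(int smCount, int specDecBlocks, int numKvHeads, int batchSize, int historyLength)
{
    int singleBlockCount = specDecBlocks * numKvHeads * batchSize; // CTAs with multi_block == 1
    if (singleBlockCount >= smCount)
    {
        return 1; // already at least one full wave of SMs
    }
    int multiBlock = highestPowerOf2(smCount / singleBlockCount); // fill at most one wave
    multiBlock = std::min(std::max(multiBlock, 1), 64);
    if (singleBlockCount <= 8 && multiBlock >= 16 && historyLength < 65536)
    {
        if (historyLength <= 1024)
        {
            multiBlock = std::min(multiBlock, 4); // scaling is not effective for short history
        }
        else if (singleBlockCount == 8 && historyLength <= 8192)
        {
            multiBlock >>= 1;
        }
        else
        {
            int log2Len = 31 - __builtin_clz(historyLength); // GCC/Clang builtin, as in the kernel code
            multiBlock >>= 3 - (log2Len - 10) / 2;           // 2^11..2^12 -> 3, 2^13..2^14 -> 2, 2^15 -> 1
        }
    }
    return multiBlock;
}

int main()
{
    int const histories[] = {512, 2048, 8192, 32768}; // assumed sample history lengths
    for (int historyLength : histories)
    {
        // Assumed shape: 132 SMs, 1 spec-dec block, 8 KV heads, batch size 1.
        printf("history=%6d -> multi_block=%d\n", historyLength, pickMultiBlock(132, 1, 8, 1, historyLength));
    }
    return 0;
}

With these assumed inputs the sketch prints 4, 8, 8, and 8, i.e. 32 or 64 CTAs instead of 8, still within a single wave.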

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp

Lines changed: 6 additions & 0 deletions
@@ -380,6 +380,12 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
     {
         multi_block = computeMultiBlockCount(xqaParams, xqaParams.batch_size, multiprocessor_count);
     }
+    // A WAR to enable Hopper XQA multi-token multi_block mode for low batch size
+    if (isSpecDec && isGMMAKernel && xqaParams.batch_size <= 32)
+    {
+        multi_block
+            = computeMultiBlockCountSpecDecGMMA(xqaParams, xqaParams.batch_size, multiprocessor_count, specDecBlocks);
+    }
     uint32_t const nbKVHeads = xqaParams.num_kv_heads;
     auto const gridDim = (isGMMAKernel ? dim3{specDecBlocks, multi_block, nbKVHeads * xqaParams.batch_size}
                                        : dim3{multi_block, nbKVHeads, xqaParams.batch_size});
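
To see why this WAR matters at low batch size, the short comparison below uses assumed values only (132 SMs, one spec-dec block, 8 KV heads, batch size 1, and a heuristic pick of multi_block = 8; none of these numbers come from the commit) to contrast the GMMA grid dim3{specDecBlocks, multi_block, nbKVHeads * batch_size} with and without the extra multi_block dimension.

// Illustrative only: CTA count and wave occupancy for the GMMA spec-dec grid, with assumed values.
#include <cstdio>

int main()
{
    int const smCount = 132;     // assumed H100 SM count
    int const specDecBlocks = 1; // assumed spec-dec blocks per sequence
    int const nbKVHeads = 8;     // assumed number of KV heads
    int const batchSize = 1;

    int const ctasWithoutWar = specDecBlocks * 1 * nbKVHeads * batchSize; // multi_block == 1
    int const ctasWithWar = specDecBlocks * 8 * nbKVHeads * batchSize;    // assumed heuristic pick of 8

    printf("multi_block=1: %3d CTAs (%.0f%% of one wave)\n", ctasWithoutWar, 100.0 * ctasWithoutWar / smCount);
    printf("multi_block=8: %3d CTAs (%.0f%% of one wave)\n", ctasWithWar, 100.0 * ctasWithWar / smCount);
    return 0;
}

Splitting the KV history across multi_block CTAs raises occupancy from roughly 6% to roughly 48% of one wave in this assumed case, which is the low-batch gap the heuristic targets.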

cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp

Lines changed: 1 addition & 0 deletions
@@ -705,6 +705,7 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32
         mUseSpecDecoding = useSpecDecoding;
         // change mMultiBlockMode to default
         mMultiBlockMode = mUseSpecDecoding ? false : true;
+        // if the Hopper XQA kernel is enabled, multi-block mode will be turned on in decoderXQAImplJIT::runImpl
     }
 
     [[maybe_unused]] MlaParams<T> mla_params;

tests/integration/defs/examples/test_eagle.py

Lines changed: 22 additions & 4 deletions
@@ -96,14 +96,16 @@ def test_llm_eagle_1gpu(batch_size, data_type, use_dynamic_tree,
 # TODO: remove skip_post_blackwell after Speculative decoding is supported.
 @skip_post_blackwell
 @skip_pre_ada
+@pytest.mark.parametrize("use_dynamic_tree", [False, True],
+                         ids=['eagle1', 'eagle2'])
 @pytest.mark.parametrize("batch_size", [8], ids=['bs8'])
 @pytest.mark.parametrize("data_type", ['float16'])
 @pytest.mark.parametrize("eagle_model_roots", ["llama3.1-eagle-8b-hf_v0.5"],
                          indirect=True)
-def test_llm_eagle_1gpu_modelopt_ckpt(batch_size, data_type, eagle_model_roots,
-                                      eagle_example_root, llm_datasets_root,
-                                      llm_rouge_root, llm_venv, cmodel_dir,
-                                      engine_dir):
+def test_llm_eagle_1gpu_modelopt_ckpt(batch_size, data_type, use_dynamic_tree,
+                                      eagle_model_roots, eagle_example_root,
+                                      llm_datasets_root, llm_rouge_root,
+                                      llm_venv, cmodel_dir, engine_dir):
     print("Build engines...")
     model_name = "eagle"

@@ -141,6 +143,22 @@ def test_llm_eagle_1gpu_modelopt_ckpt(batch_size, data_type, eagle_model_roots,
 
     venv_check_call(llm_venv, run_cmd)
 
+    print("Run summarize...")
+    summary_cmd = [
+        f"{eagle_example_root}/../summarize.py", "--test_trt_llm",
+        "--hf_model_dir", f"{eagle_model_roots}", "--tokenizer_dir",
+        f"{eagle_model_roots}", f"--engine_dir={engine_dir}",
+        "--check_accuracy", "--tensorrt_llm_rouge1_threshold=24",
+        "--eagle_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], [0, 0, 0, 0], [0, 1, 1], [0, 0, 6], [0, 3, 0], [5, 0], [1, 3], [0, 0, 7], [0, 0, 8], [0, 0, 9], [6, 0], [0, 4, 0], [1, 4], [7, 0], [0, 1, 2], [2, 0, 0], [3, 1], [2, 2], [8, 0], [0, 5, 0], [1, 5], [1, 0, 1], [0, 2, 1], [9, 0], [0, 6, 0], [0, 0, 0, 1], [1, 6], [0, 7, 0]]",
+        f"--max_ite=40", f"--batch_size={batch_size}",
+        f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
+    ]
+    if use_dynamic_tree:
+        summary_cmd.extend(
+            [f"--eagle_dynamic_tree_max_top_k={3}", "--eagle_use_dynamic_tree"])
+
+    venv_check_call(llm_venv, summary_cmd)
+
 
 def test_with_dummy_eagle(hf_model_root,
                           use_dynamic_tree,

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 2 additions & 1 deletion
@@ -220,7 +220,8 @@ l0_h100:
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
-  - examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] # 9 mins
+  - examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8-eagle1] # 9 mins
+  - examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8-eagle2] # 9 mins
   - examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
   - examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] # 5 mins
   - accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
