@@ -694,7 +694,7 @@ size_t AttentionOp::getWorkspaceSizeForContext(nvinfer1::DataType type, int32_t
     else if (mFmhaDispatcher->isSeparateQAndKvInput())
     {
         // Paged context fmha
-        q_buf_2_size = (mFP8ContextFMHA ? 1 : size) * max_num_tokens * local_hidden_units_qo;
+        q_buf_2_size = (isFP8ContextFmhaInput() ? 1 : size) * max_num_tokens * local_hidden_units_qo;
     }
 
     size_t const k_buf_2_size = mEnableContextFMHA ? 0 : size * batch_size * kv_seq_length * local_hidden_units_kv;
@@ -705,16 +705,16 @@ size_t AttentionOp::getWorkspaceSizeForContext(nvinfer1::DataType type, int32_t
     size_t const qk_buf_float_size
         = mEnableContextFMHA ? 0 : sizeof(float) * batch_size * mNumHeads * input_seq_length * kv_seq_length;
     size_t const fp8_qkv_buffer_size
-        = mFP8ContextFMHA && mEnableContextFMHA && !mFmhaDispatcher->isSeparateQAndKvInput()
+        = isFP8ContextFmhaInput() && mEnableContextFMHA && !mFmhaDispatcher->isSeparateQAndKvInput()
         ? max_num_tokens * size_t(local_hidden_units_qo + 2 * local_hidden_units_kv)
         : 0;
     size_t const padding_offset_size = mEnableContextFMHA ? 0 : sizeof(int) * max_num_tokens;
     size_t const encoder_padding_offset_size = mEnableContextFMHA ? 0 : sizeof(int) * max_num_tokens;
     // Each token holds (batch_idx, token_idx_in_seq) int2.
     size_t const tokens_info_size = sizeof(int2) * max_num_tokens;
     size_t const fmha_scheduler_counter = mEnableContextFMHA ? sizeof(uint32_t) : 0;
-    size_t const fmha_bmm1_scale_size = mFP8ContextFMHA ? sizeof(float) * 2 : 0;
-    size_t const fmha_bmm2_scale_size = mFP8ContextFMHA ? sizeof(float) : 0;
+    size_t const fmha_bmm1_scale_size = isFP8ContextFmhaInput() ? sizeof(float) * 2 : 0;
+    size_t const fmha_bmm2_scale_size = isFP8ContextFmhaInput() ? sizeof(float) : 0;
 
     // cp workspace size upper bound
     size_t const cpMaxPaddedSequenceLength = max_num_tokens + batch_size * (mCpSize - 1);
@@ -1254,6 +1254,7 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
         = mEnableContextFMHA ? 0 : sizeof(T) * params.batch_size * params.input_seq_length * kv_seq_length;
     size_t const cu_seqlens_size = sizeof(int) * (params.batch_size + 1);
     size_t const rotary_inv_freq_size = sizeof(float) * params.batch_size * mRotaryEmbeddingDim / 2;
+    TLLM_CHECK_WITH_INFO(isFP8ContextFmhaInput() == mFmhaDispatcher->isFP8InputQ(), "internal error");
     size_t q_buf_2_size = 0;
     if (!mEnableContextFMHA)
     {
@@ -1263,7 +1264,7 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
     else if (mFmhaDispatcher->isSeparateQAndKvInput())
     {
         // Paged context fmha
-        q_buf_2_size = (mFP8ContextFMHA ? 1 : sizeof(T)) * params.num_tokens * local_hidden_units_qo;
+        q_buf_2_size = (isFP8ContextFmhaInput() ? 1 : sizeof(T)) * params.num_tokens * local_hidden_units_qo;
     }
 
     size_t const k_buf_2_size
@@ -1278,7 +1279,7 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
         ? 0
         : sizeof(float) * params.batch_size * mNumHeads * params.input_seq_length * kv_seq_length;
     size_t const fp8_qkv_buffer_size
-        = mEnableContextFMHA && mFP8ContextFMHA && !mFmhaDispatcher->isSeparateQAndKvInput()
+        = mEnableContextFMHA && isFP8ContextFmhaInput() && !mFmhaDispatcher->isSeparateQAndKvInput()
         ? params.num_tokens * (local_hidden_units_qo + 2 * local_hidden_units_kv)
         : 0;
     size_t const padding_offset_size
@@ -1288,8 +1289,8 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
     // Each token holds (batch_idx, token_idx_in_seq) int2.
     size_t const tokens_info_size = sizeof(int2) * params.num_tokens;
     size_t const fmha_scheduler_counter = mEnableContextFMHA ? sizeof(uint32_t) : 0;
-    size_t const fmha_bmm1_scale_size = mFP8ContextFMHA ? sizeof(float) * 2 : 0;
-    size_t const fmha_bmm2_scale_size = mFP8ContextFMHA ? sizeof(float) : 0;
+    size_t const fmha_bmm1_scale_size = isFP8ContextFmhaInput() ? sizeof(float) * 2 : 0;
+    size_t const fmha_bmm2_scale_size = isFP8ContextFmhaInput() ? sizeof(float) : 0;
 
     // cp workspace size upper bound
     size_t const cpMaxPadedSequenceLength = params.num_tokens + params.batch_size * (mCpSize - 1);
@@ -1514,7 +1515,7 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
     preprocessingParams.position_shift_enabled = mPosShiftEnabled;
     preprocessingParams.cache_type = cache_type;
     preprocessingParams.separate_q_kv_output = enablePagedKVContextFMHA || isCrossAttention();
-    preprocessingParams.quantized_fp8_output = mFP8ContextFMHA;
+    preprocessingParams.quantized_fp8_output = isFP8ContextFmhaInput();
     preprocessingParams.generation_phase = false;
     preprocessingParams.multi_processor_count = mMultiProcessorCount;
 
@@ -1614,8 +1615,8 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
     // TODO: set it correctly for contiguous kv buffer (cross-attention).
     fmhaParams.totalKvSeqLen = isCrossAttention() ? params.num_encoder_tokens : params.num_tokens;
     // Device buffer pointers.
-    fmhaParams.qkvPtr = mFP8ContextFMHA ? reinterpret_cast<void const*>(fp8_qkv_buffer)
-                                        : reinterpret_cast<void const*>(attention_input);
+    fmhaParams.qkvPtr = isFP8ContextFmhaInput() ? reinterpret_cast<void const*>(fp8_qkv_buffer)
+                                                : reinterpret_cast<void const*>(attention_input);
     fmhaParams.qPtr = reinterpret_cast<void const*>(q_buf_2_);
     // TODO: add contiguous kv buffer (cross-attention).
     fmhaParams.kvPtr = nullptr;
@@ -2423,6 +2424,12 @@ int AttentionOp::initialize() noexcept
         if (mKVCacheQuantMode.hasFp8KvCache())
        {
            fmhaParams.dataTypeKv = DATA_TYPE_E4M3;
+            // Trtllm-gen kernels force the use of FP8 FMHA kernels when the FP8 KV cache is used.
+            // FP8 Q/KV inputs and FP8/BF16/FP16 outputs are supported.
+            if (mUseTllmGen)
+            {
+                fmhaParams.dataType = DATA_TYPE_E4M3;
+            }
         }
         // TODO: add FP4 KV cache support.
     }
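
Note on the helper this change introduces: the hunks above replace direct checks of mFP8ContextFMHA with isFP8ContextFmhaInput(), and the new TLLM_CHECK_WITH_INFO ties that helper to mFmhaDispatcher->isFP8InputQ(). The helper's definition is not part of the hunks shown; as a minimal sketch only, assuming it simply folds the trtllm-gen FP8-KV-cache rule (see the added comment in initialize()) into the existing flag, it could look like this:

// Hypothetical sketch, not part of this commit: the actual definition of
// isFP8ContextFmhaInput() is outside the hunks shown above.
bool AttentionOp::isFP8ContextFmhaInput() const
{
    // FP8 context FMHA input is either requested explicitly (mFP8ContextFMHA)
    // or forced by the trtllm-gen backend when the KV cache is stored in FP8.
    return mFP8ContextFMHA || (mUseTllmGen && mKVCacheQuantMode.hasFp8KvCache());
}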