Commit 7ec2a1c

Aya-ZIbra authored and facebook-github-bot committed
Padded KV Partial Prefill Case (#4848)
Summary:
X-link: facebookresearch/FBGEMM#1879
Pull Request resolved: #4848

The current KV padding only supported the full prefill case (D78967317). This diff adds partial prefill support as well. Coverage added in the tests.

WIP: upstreaming this (D78967317 and this diff).

Reviewed By: sryap

Differential Revision: D82080682

fbshipit-source-id: 7a6c7a0d3c32245e5c13864b1f0cfe37d8d254c4
1 parent 23f944c · commit 7ec2a1c

File tree

3 files changed (+46, -15 lines)

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/fmha_fusion.hpp

Lines changed: 23 additions & 0 deletions
@@ -636,6 +636,29 @@ apply_variable_length_offset(Shape const& shape, Coord const& coord) {
   return cute::make_tuple(result_shape, result_offset);
 }
 
+template <class Shape, class Idx>
+CUTE_HOST_DEVICE constexpr auto apply_variable_length_paddedkv(
+    Shape const& shape,
+    Idx const& idx,
+    int kv_length) {
+  // Use a position counter to track which element we're processing
+  int position_counter = 0;
+
+  return transform_leaf(shape, [&](auto const& s) {
+    if constexpr (is_variable_length_v<decltype(s)>) {
+      int current_pos = position_counter++;
+      if (current_pos == 1) {
+        return kv_length;
+      } else {
+        return s.cumulative_length[idx + 1] - s.cumulative_length[idx];
+      }
+    } else {
+      position_counter++;
+      return s;
+    }
+  });
+}
+
 } // namespace cutlass::fmha::collective
 
 namespace cute {
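For intuition (not part of the commit): a minimal Python sketch of the position-aware rule implemented by apply_variable_length_paddedkv above. The previous call site replaced every variable-length extent with seqlen_kv, which is only correct for full prefill where the Q and KV lengths coincide; the helper instead overwrites only the second variable-length leaf (K/V) and keeps Q's jagged length. The VariableLength class and the example shape layout (Q, KV, head_dim, batch) below are illustrative assumptions, not FBGEMM/CUTLASS types.

# Illustrative sketch only: mirrors the position-counter logic of
# apply_variable_length_paddedkv; VariableLength and the shape layout are
# assumptions made for this example.
from dataclasses import dataclass

@dataclass
class VariableLength:
    cumulative_length: list[int]  # prefix sums over the batch, length = batch + 1

def apply_variable_length_paddedkv(shape, idx, kv_length):
    position_counter = 0
    result = []
    for s in shape:
        if isinstance(s, VariableLength):
            current_pos = position_counter
            position_counter += 1
            if current_pos == 1:
                # Second variable-length leaf (K/V): use the caller-provided length.
                result.append(kv_length)
            else:
                # Other variable-length leaves (e.g. Q): keep the jagged length for batch idx.
                result.append(s.cumulative_length[idx + 1] - s.cumulative_length[idx])
        else:
            position_counter += 1
            result.append(s)
    return tuple(result)

# Batch 1: Q stays jagged (104 - 64 = 40 tokens), K/V is forced to kv_length = 96.
shape = (VariableLength([0, 64, 104]), VariableLength([0, 128, 256]), 128, 2)
print(apply_variable_length_paddedkv(shape, idx=1, kv_length=96))  # -> (40, 96, 128, 2)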

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp

Lines changed: 3 additions & 7 deletions
@@ -255,13 +255,9 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
     // If seqlen_kv is provided, use it to determine the sequence length for
     // key-value pairs
     if (params.seqlen_kv != nullptr) {
-      return transform_leaf(problem_shape, [&](auto const& s) {
-        if constexpr (is_variable_length_v<decltype(s)>) {
-          return params.seqlen_kv[batch_idx];
-        } else {
-          return s;
-        }
-      });
+      // Position-aware replacement that only replaces K/V (position 1)
+      return apply_variable_length_paddedkv(
+          problem_shape, batch_idx, params.seqlen_kv[batch_idx]);
     } else {
       // Fall back to the original behavior
       return apply_variable_length(params.problem_shape, batch_idx);

fbgemm_gpu/experimental/gen_ai/test/attention/blackwell_fmha_test.py

Lines changed: 20 additions & 8 deletions
@@ -438,11 +438,13 @@ def test_decode(
             (
                 kv_padding,
                 batch_size,
+                q_heads,
                 causal,
                 window_size,
             )
             for kv_padding in [128, 256, 512, 1024]
             for batch_size in [2, 8]
+            for q_heads in [8, 16]
             for causal in [True, False]
             for window_size in [(-1, -1), (0, 0), (0, 128), (128, 0), (1024, 0)]
         ]
@@ -451,6 +453,7 @@ def test_jagged_vs_padded_kv(
         self,
         kv_padding: int,
         batch_size: int,
+        q_heads: int,
        causal: bool,
        window_size: tuple[int, int] = (-1, -1),
    ) -> None:
@@ -465,11 +468,9 @@ def test_jagged_vs_padded_kv(
         # kv_padding = 128
         seqlen_q = kv_padding  # Maximum sequence length (padded size)
         device = torch.accelerator.current_accelerator()
-        q_heads = 1
         kv_heads = 1
         head_dim = 128
         dtype = torch.bfloat16
-        causal = False
 
         # Create tensors
         q_padded = torch.randn(
@@ -499,11 +500,14 @@ def test_jagged_vs_padded_kv(
             device=device,
         ).to(dtype)
 
-        qk_padding_mask = generate_random_padding_mask(
+        k_padding_mask = generate_random_padding_mask(
+            kv_padding, batch_size, device, mode="random", zero_lengths=False
+        )
+        q_padding_mask = generate_random_padding_mask(
             kv_padding, batch_size, device, mode="third", zero_lengths=False
         )
         # # Always have seqlen_k >= seqlen_q
-        # key_padding_mask[:, :seqlen_q] |= query_padding_mask
+        k_padding_mask[:, :seqlen_q] |= q_padding_mask
         (
             q_unpad,
             k_unpad,
@@ -524,8 +528,8 @@ def test_jagged_vs_padded_kv(
             q_padded,
             k_padded,
             v_padded,
-            qk_padding_mask,
-            qk_padding_mask,
+            q_padding_mask,
+            k_padding_mask,
         )
         # Create variable length sequences
         cu_seqlens_k_padded = torch.zeros(
@@ -546,6 +550,9 @@ def test_jagged_vs_padded_kv(
             print(f"jagged cu_seqlens_k: {cu_seqlens_k_jagged}")
             print(f"padded cu_seqlens_k: {cu_seqlens_k_padded}")
             print(f"seqlen_kv: {seqused_k}")
+            print(f"max_seqlen_q: {max_seqlen_q}")
+            print(f"max_seqlen_k: {max_seqlen_k}")
+            print(f"q_unpad: {q_unpad.shape}")
 
         # Scenario A: Jagged KV with cu_seqlens_k
         out_jagged = cutlass_blackwell_fmha_func(
@@ -554,7 +561,7 @@ def test_jagged_vs_padded_kv(
             v_unpad,
             cu_seqlens_q=cu_seqlens_q,
             cu_seqlens_k=cu_seqlens_k_jagged,
-            max_seq_len_q=seqlen_q,
+            max_seq_len_q=max_seqlen_q,
             max_seq_len_k=max_seqlen_k,
             causal=causal,
             window_size=window_size,
@@ -571,12 +578,17 @@ def test_jagged_vs_padded_kv(
             v_,
             cu_seqlens_q=cu_seqlens_q,
             cu_seqlens_k=cu_seqlens_k_padded,
-            max_seq_len_q=seqlen_q,
+            max_seq_len_q=max_seqlen_q,
             max_seq_len_k=max_seqlen_k,
             causal=causal,
             window_size=window_size,
             seqlen_kv=seqused_k,
         )
+        if DEBUG:
+            print(f"out_jagged: {out_jagged}")
+            print(f"k_: {k_.shape}")
+            print(f"v_: {v_.shape}")
+            print(f"out_padded: {out_padded}")
 
         # # Compare outputs
         diff = (out_jagged - out_padded).abs().max().item()
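As a companion to the test above (a sketch, not the test's own helpers): how the padded-KV metadata fits together, assuming a boolean key padding mask like the one produced by generate_random_padding_mask. The names seqused_k, cu_seqlens_k_padded, cu_seqlens_k_jagged, and kv_padding mirror the test; the per-batch lengths used here are made up for illustration.

import torch

batch_size, kv_padding = 2, 128
k_padding_mask = torch.zeros(batch_size, kv_padding, dtype=torch.bool)
k_padding_mask[0, :50] = True   # batch 0: 50 valid K/V tokens
k_padding_mask[1, :96] = True   # batch 1: 96 valid K/V tokens

# Per-batch valid K/V lengths, passed to the kernel as seqlen_kv (seqused_k).
seqused_k = k_padding_mask.sum(dim=1, dtype=torch.int32)               # tensor([50, 96])

# Offsets into the padded K/V buffer: every batch owns a full kv_padding slot.
cu_seqlens_k_padded = torch.arange(batch_size + 1, dtype=torch.int32) * kv_padding
# tensor([  0, 128, 256])

# Offsets for the jagged layout: only the valid tokens are packed back to back.
cu_seqlens_k_jagged = torch.zeros(batch_size + 1, dtype=torch.int32)
cu_seqlens_k_jagged[1:] = torch.cumsum(seqused_k, dim=0)               # tensor([  0,  50, 146])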
