
Commit 7937cb8

renganxu authored and facebook-github-bot committed
Support general K in group gemm (#4636)
Summary:
Pull Request resolved: #4636

X-link: facebookresearch/FBGEMM#1668

The existing group gemm does not work when K is not a multiple of 8, because TMA load cannot be used in that case. This diff adds support for this general case by applying a mask when loading the data.

With this change, when K is not a multiple of 8, using fuse_scatter_add results in larger numerical discrepancies that cause the unit test to fail. This is most likely due to the atomic add operations in scatter_add or increased rounding errors with negative numbers, since the unit test uses randn; if the unit test uses rand to generate only positive numbers, the test passes. For now, in this case we disable fused scatter_add in the unit test and do not allow it in the group gemm implementation.

Reviewed By: sgrigory

Differential Revision: D79393881

fbshipit-source-id: a04ad8ed1be64ac31b5db325524b44e6eabf7de3
1 parent 5d384e8 commit 7937cb8
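The approach described in the summary, masking the K-dimension loads instead of relying on TMA, can be sketched in isolation. The following is a minimal, standalone Triton example, not the FBGEMM kernel; the kernel name, data layout, and block sizes are illustrative, but the K mask combined with other=0.0 on tl.load is the same pattern the diff adds to the non-TMA path.

# Minimal sketch (assumed names and shapes): a GEMM whose K loop tolerates a
# ragged final tile because out-of-range K elements are masked and loaded as 0.
import torch
import triton
import triton.language as tl


@triton.jit
def masked_k_gemm_kernel(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
):
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_K):
        offs_k = k_offset + tl.arange(0, BLOCK_K)
        k_mask = offs_k[None, :] < K  # masks the ragged final K tile
        a = tl.load(
            a_ptr + offs_m[:, None] * K + offs_k[None, :],
            mask=(offs_m[:, None] < M) & k_mask,
            other=0.0,
        )
        b = tl.load(
            b_ptr + offs_n[:, None] * K + offs_k[None, :],
            mask=(offs_n[:, None] < N) & k_mask,
            other=0.0,
        )
        acc += tl.dot(a, b.T)  # b is stored as (N, K), as in the grouped kernel
    c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], acc, mask=c_mask)


def masked_k_gemm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # a: (M, K), b: (N, K); returns a @ b.T even when K % BLOCK_K != 0.
    M, K = a.shape
    N = b.shape[0]
    c = torch.empty((M, N), device=a.device, dtype=torch.float32)
    grid = (triton.cdiv(M, 32), triton.cdiv(N, 32))
    masked_k_gemm_kernel[grid](a, b, c, M, N, K, BLOCK_M=32, BLOCK_N=32, BLOCK_K=32)
    return c

For example, masked_k_gemm(torch.randn(64, 100, device="cuda"), torch.randn(32, 100, device="cuda")) handles K=100 even though 100 is not a multiple of the 32-element K block.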

File tree

2 files changed: +51 -16 lines changed

fbgemm_gpu/experimental/gemm/test/grouped_gemm_test.py

Lines changed: 7 additions & 2 deletions
@@ -156,7 +156,7 @@ def msg(s: str) -> str:
         G=st.sampled_from([1, 4, 16, 128]),
         M=st.sampled_from([0, 128, 2048, 16384]),
         N=st.sampled_from([256]),
-        K=st.sampled_from([256]),
+        K=st.sampled_from([100, 256, 257]),
         warp_specialization=st.sampled_from(
             [True, False] if torch.cuda.is_available() and _HAS_WS_SUPPORT else [False]
         ),
@@ -179,9 +179,14 @@ def test_grouped_gemm_bf16(
         warp_specialization: bool,
         fuse_scatter_add: bool,
     ) -> None:
+        if K % 8 != 0:
+            # When K is not a multiple of 8, using fuse_scatter_add has large numerical discrepancy,
+            # possibly due to atomic add in scatter_add or larger rounding error with negative numbers.
+            fuse_scatter_add = False
+
         torch.manual_seed(0)
 
-        device = torch.device("cuda")
+        device = torch.accelerator.current_accelerator()
         a = torch.randn(M, K, dtype=torch.bfloat16, device=device)
         b = torch.randn(N * G, K, dtype=torch.bfloat16, device=device)
         m_ends, _ = torch.sort(
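The comment added to the test can be illustrated with a small standalone example (not part of the diff): bf16 accumulation is order dependent, so the unordered atomic adds that a fused scatter_add may perform can shift results, and with signed randn inputs cancellation makes the relative discrepancy larger than with positive-only rand inputs.

# Illustration only: accumulating the same values in bf16 in two different orders,
# rounding after every addition (as repeated atomic adds do), usually does not give
# bit-identical results.
import torch

torch.manual_seed(0)
vals = torch.randn(4096)


def bf16_serial_sum(x: torch.Tensor) -> float:
    acc = torch.tensor(0.0, dtype=torch.bfloat16)
    for v in x:
        acc = acc + v.to(torch.bfloat16)  # result is rounded to bf16 at each step
    return float(acc)


print(bf16_serial_sum(vals), bf16_serial_sum(vals.flip(0)))  # typically differ slightly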

fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py

Lines changed: 44 additions & 14 deletions
@@ -149,11 +149,10 @@ def early_config_prune(configs, named_args, dtsize=None, dtype=None, **kwargs):
             config.num_consumer_groups,
             kw.get("USE_TMA_LOAD_ON_SCALES", False),
         )
-        G, M, N, K = (
+        G, M, N = (
             named_args["G"],
             named_args["M_BUCKET"],
             named_args["N"],
-            named_args["K"],
         )
 
         # 1. make sure we have enough smem
@@ -198,11 +197,7 @@ def early_config_prune(configs, named_args, dtsize=None, dtype=None, **kwargs):
         if BLOCK_N < 128 and M * N_TILES > 2 * num_sm:
             continue
 
-        # 6. make sure K can be evenly divided
-        if K % BLOCK_K != 0:
-            continue
-
-        # 7. make sure we can partition for ws
+        # 6. make sure we can partition for ws
         if use_warp_specialization:
             if num_warps != 4:
                 continue
@@ -302,8 +297,9 @@ def _fbgemm_grouped_gemm(
         tile_n_idx = gidx // num_m_tiles
 
         accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-        tl.static_assert(K % BLOCK_SIZE_K == 0)
+
         if USE_TMA_LOAD:
+            tl.static_assert(K % BLOCK_SIZE_K == 0)
             m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
             n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
             for k_offset in range(0, K, BLOCK_SIZE_K):
@@ -338,8 +334,18 @@ def _fbgemm_grouped_gemm(
                 + offs_k[None, :]
             )
             for k_offset in range(0, K, BLOCK_SIZE_K):
-                a = tl.load(a_ptrs, mask=offs_am[:, None] < m_size)
-                b = tl.load(b_ptrs, mask=offs_bn[:, None] < n_size)
+                updated_k_offset = k_offset + offs_k
+                updated_k_offset_mask = updated_k_offset[None, :] < K  # type: ignore[16]
+                a = tl.load(
+                    a_ptrs,
+                    mask=((offs_am[:, None] < m_size) & updated_k_offset_mask),
+                    other=0.0,
+                )
+                b = tl.load(
+                    b_ptrs,
+                    mask=((offs_bn[:, None] < n_size) & updated_k_offset_mask),
+                    other=0.0,
+                )
                 accumulator += tl.dot(a, b.T)
                 a_ptrs += BLOCK_SIZE_K
                 b_ptrs += BLOCK_SIZE_K
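The other=0.0 in these masked loads is what keeps the ragged final K tile from corrupting the result: masked-off elements load as zero and contribute nothing to the accumulation. A quick PyTorch check of that equivalence (illustrative, not part of the kernel):

import torch
import torch.nn.functional as F

M, N, K, BLOCK_K = 64, 32, 100, 32               # K is not a multiple of BLOCK_K
a = torch.randn(M, K)
b = torch.randn(N, K)

pad = (-K) % BLOCK_K                             # elements needed to reach the next multiple of BLOCK_K
a_pad = F.pad(a, (0, pad))                       # zero-pad the K dimension, like mask + other=0.0
b_pad = F.pad(b, (0, pad))

print(torch.allclose(a @ b.T, a_pad @ b_pad.T))  # True: zero-padded tiles add nothing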
@@ -960,20 +966,28 @@ def _grouped_gemm(
 
     if USE_TMA_LOAD and not utils.HAS_TMA_DESC:
         USE_TMA_LOAD = False
-        warnings.warn("TMA load is disabled as there is no TMA descriptor support!")
+        warnings.warn(
+            "TMA load is disabled as there is no TMA descriptor support!", stacklevel=2
+        )
 
     if USE_TMA_STORE and not utils.HAS_TMA_DESC:
         USE_TMA_STORE = False
-        warnings.warn("TMA store is disabled as there is no TMA descriptor support!")
+        warnings.warn(
+            "TMA store is disabled as there is no TMA descriptor support!", stacklevel=2
+        )
 
     # TODO(shikaili): Check the readniess of WS on ROCm side in Meta's Triton.
     if use_warp_specialization and torch.version.hip:
-        warnings.warn("Warp specialization is disabled as it is not supported on ROCm.")
+        warnings.warn(
+            "Warp specialization is disabled as it is not supported on ROCm.",
+            stacklevel=2,
+        )
         use_warp_specialization = False
 
     if use_warp_specialization and not _HAS_WS_SUPPORT:
         warnings.warn(
-            "Warp specialization is disabled as the Triton build in current environment doesn't have such support. Please build from https://github.com/facebookexperimental/triton/tree/ws-3.2.x to enable it for best performance on Nvidia's SM90 GPUs."
+            "Warp specialization is disabled as the Triton build in current environment doesn't have such support. Please build from https://github.com/facebookexperimental/triton/tree/ws-3.2.x to enable it for best performance on Nvidia's SM90 GPUs.",
+            stacklevel=2,
         )
         use_warp_specialization = False
 
@@ -991,6 +1005,22 @@ def _grouped_gemm(
     N = w.shape[0] // G
     assert K == w.shape[1]
 
+    if K % 8 != 0:
+        use_warp_specialization = False
+        USE_TMA_LOAD = False
+        USE_TMA_STORE = False
+        warnings.warn(
+            f"TMA load and warp specialization are disabled since K is not a multiple of 8: {K=}.",
+            stacklevel=2,
+        )
+        assert (
+            x_scale is None
+        ), f"Quantisation is not supported yet when K is not a multiple of 8: {K=}"
+
+        assert (
+            output_tensor is None
+        ), f"Fused scatter add has large rounding error when K is not a multiple of 8: {K=}"
+
     if output_tensor is None:
         FUSE_SCATTER_ADD = False
         assert scatter_add_indices is None
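Separately from the K handling, the warnings touched in this file now pass stacklevel=2. A minimal standard-library sketch (not FBGEMM code) of what that changes: the reported source line becomes the caller of the helper rather than the warnings.warn call inside it.

import warnings


def helper(flag: bool) -> None:
    if not flag:
        # stacklevel=2 skips helper's own frame when reporting the warning location
        warnings.warn("feature disabled", stacklevel=2)


def caller() -> None:
    helper(False)  # the UserWarning is attributed to this line


caller()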
