@@ -51,6 +51,20 @@ get_kernel_via_heuristics(int total_M, int max_N, int max_K, int G) {

// Use heuristics to pick the best kernel implementation.
if (arch == 10) {
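// arch == 10 selects the SM100 (Blackwell) path served by the
// *_sm100_impl kernels below.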
// Llama4 shapes
if ((max_N == 5120 && max_K == 1024) || (max_N == 2048 && max_K == 5120)) {
if (total_M <= 256) {
return f8f8bf16_rowwise_grouped_256_32_128_2_1_1_10_f;
} else if (total_M <= 512) {
return f8f8bf16_rowwise_grouped_256_64_128_2_1_1_10_f;
} else if (total_M <= 1024) {
return f8f8bf16_rowwise_grouped_256_128_128_2_1_1_10_f;
} else {
return f8f8bf16_rowwise_grouped_256_256_128_2_1_1_10_f;
}
}

// Fallback to legacy heuristic.
if (total_M <= 64 || (total_M <= 256 && max_N <= 1024)) {
if (max_K <= 4096) {
return f8f8bf16_rowwise_grouped_256_32_128_2_1_1_10_f;
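The new branch special-cases the two Llama4 shapes named in the comment and tiers the pick by total_M, falling back to the legacy heuristic for everything else. A minimal sketch of that selection logic in isolation; the pick_llama4_sm100_kernel helper and the main() sweep are illustrative, not part of the PR:

#include <initializer_list>
#include <iostream>
#include <string>

// Illustrative stand-in for the new Llama4 branch, to make the tiering
// visible on its own. The helper name is hypothetical; the returned
// strings are the kernel names from the diff. Note how the second
// numeric field of the name grows with total_M (32 -> 64 -> 128 -> 256)
// while every other field stays fixed.
std::string pick_llama4_sm100_kernel(int total_M, int max_N, int max_K) {
  const bool is_llama4_shape =
      (max_N == 5120 && max_K == 1024) || (max_N == 2048 && max_K == 5120);
  if (!is_llama4_shape) {
    return "<legacy heuristic>"; // fall through to the pre-existing logic
  }
  if (total_M <= 256) {
    return "f8f8bf16_rowwise_grouped_256_32_128_2_1_1_10_f";
  } else if (total_M <= 512) {
    return "f8f8bf16_rowwise_grouped_256_64_128_2_1_1_10_f";
  } else if (total_M <= 1024) {
    return "f8f8bf16_rowwise_grouped_256_128_128_2_1_1_10_f";
  }
  return "f8f8bf16_rowwise_grouped_256_256_128_2_1_1_10_f";
}

int main() {
  // Sweep a few batch sizes for the (N, K) = (5120, 1024) Llama4 shape.
  for (int total_M : {128, 384, 768, 4096}) {
    std::cout << total_M << " -> "
              << pick_llama4_sm100_kernel(total_M, 5120, 1024) << "\n";
  }
}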
@@ -0,0 +1,55 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "f8f8bf16_rowwise_grouped_common.cuh"

namespace fbgemm_gpu {

at::Tensor f8f8bf16_rowwise_grouped_128_256_128_2_1_1_10_f(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor output,
std::optional<at::Tensor> zero_start_index_M,
std::optional<at::Tensor> M_sizes) {
// Dispatch this kernel to the correct underlying implementation.
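// Reading the kernel naming convention, the template arguments below
// appear to be: input container type, CTA tile extents (128, 256, 128),
// cluster shape (2, 1, 1), target SM architecture (10 = SM100), and a
// boolean schedule flag (false). This mapping is inferred from the
// name, not confirmed against the implementation.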
return f8f8bf16_rowwise_grouped_sm100_impl<
at::Tensor,
128,
256,
128,
2,
1,
1,
10,
false>(XQ, WQ, x_scale, w_scale, output, zero_start_index_M, M_sizes);
}

at::Tensor f8f8bf16_rowwise_grouped_128_256_128_2_1_1_10_f(
at::TensorList XQ,
at::TensorList WQ,
at::TensorList x_scale,
at::TensorList w_scale,
at::Tensor output,
std::optional<at::Tensor> zero_start_index_M,
std::optional<at::Tensor> M_sizes) {
// Dispatch this kernel to the correct underlying implementation.
return f8f8bf16_rowwise_grouped_sm100_impl<
at::TensorList,
128,
256,
128,
2,
1,
1,
10,
false>(XQ, WQ, x_scale, w_scale, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
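The two overloads differ only in how the grouped operands arrive: either stacked in single tensors, with group boundaries described by M_sizes or zero_start_index_M, or as per-group tensor lists. A hedged caller-side sketch for the stacked variant; the wrapper name is hypothetical, and the exact semantics of M_sizes are assumed from the signature, not verified:

#include <optional>
#include <utility>
#include <ATen/ATen.h>

// Hypothetical wrapper showing how the stacked-tensor overload would be
// invoked: XQ is assumed to hold all groups concatenated along M, with
// M_sizes carrying the per-group row counts (one entry per group).
at::Tensor grouped_gemm_stacked(
    const at::Tensor& XQ,
    const at::Tensor& WQ,
    const at::Tensor& x_scale,
    const at::Tensor& w_scale,
    at::Tensor output,
    const at::Tensor& M_sizes) {
  return fbgemm_gpu::f8f8bf16_rowwise_grouped_128_256_128_2_1_1_10_f(
      XQ, WQ, x_scale, w_scale, std::move(output),
      /*zero_start_index_M=*/std::nullopt,
      /*M_sizes=*/M_sizes);
}

The TensorList overload covers the case where each group lives in its own allocation; both paths converge on the same templated implementation, so the tile and cluster configuration is identical either way.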
@@ -64,6 +64,24 @@ at::Tensor f8f8bf16_rowwise_grouped_128_128_128_2_1_1_10_f(
std::optional<at::Tensor> zero_start_index_M,
std::optional<at::Tensor> M_sizes);

at::Tensor f8f8bf16_rowwise_grouped_128_256_128_2_1_1_10_f(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor output,
std::optional<at::Tensor> zero_start_index_M,
std::optional<at::Tensor> M_sizes);

at::Tensor f8f8bf16_rowwise_grouped_128_256_128_2_1_1_10_f(
at::TensorList XQ,
at::TensorList WQ,
at::TensorList x_scale,
at::TensorList w_scale,
at::Tensor output,
std::optional<at::Tensor> zero_start_index_M,
std::optional<at::Tensor> M_sizes);

at::Tensor f8f8bf16_rowwise_grouped_256_32_128_2_1_1_10_f(
at::Tensor XQ,
at::Tensor WQ,