#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
+ #include <fmt/core.h>

#include "bf16bf16bf16_grouped/bf16bf16bf16_grouped_manifest.cuh"
+ #include "fbgemm_gpu/quantize/common/tuning_cache.hpp"
+ #include "fbgemm_gpu/quantize/common/utils.h"

namespace fbgemm_gpu {

#if CUDART_VERSION >= 12000

- // BF16 grouped cutlass kernel dispatch.
+ namespace {
+ TuningCache& getTuningCache() {
+   // This kernel has multiple APIs templated based on InputType, so we use this
+   // to have a single cache instance across APIs.
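+   // The function-local static below is initialized once, on first use, and that
+   // initialization is thread-safe under C++11, so concurrent callers share one cache.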
+   static TuningCache cache("bf16bf16bf16_grouped");
+   return cache;
+ }
+ } // namespace
+
template <typename InputType>
- at::Tensor dispatch_bf16_grouped_kernel(
-     int G,
-     int total_M,
-     int N,
-     int K,
-     InputType X, // BF16
-     InputType W, // BF16
-     at::Tensor output,
-     std::optional<at::Tensor> zero_start_index_M = std::nullopt,
-     std::optional<at::Tensor> M_sizes = std::nullopt) {
+ Kernel_bf16bf16bf16_grouped<InputType>
+ get_kernel_via_heuristic(int G, int total_M, int N, int K) {
  // Use heuristics to pick best kernel implementation.

  // Llama4 128E
  if (G == 128) {
    if (N == 5120 && K == 1024) {
      if (total_M <= 128) {
-       return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_16_128_2_1_1_f;
      } else if (total_M <= 256) {
-       return bf16bf16bf16_grouped_128_32_128_2_1_1_t(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_32_128_2_1_1_t;
      } else if (total_M <= 2048) {
-       return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_16_128_2_1_1_f;
      } else if (total_M <= 4096) {
-       return bf16bf16bf16_grouped_128_32_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_32_128_2_1_1_f;
      } else if (total_M <= 8192) {
-       return bf16bf16bf16_grouped_128_64_128_1_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_64_128_1_1_1_f;
      } else if (total_M <= 16384) {
-       return bf16bf16bf16_grouped_128_128_128_2_1_1_t(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_128_128_2_1_1_t;
      } else {
-       return bf16bf16bf16_grouped_128_256_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_256_128_2_1_1_f;
      }
    }

    if (N == 2048 && K == 5120) {
      if (total_M <= 2048) {
-       return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_16_128_2_1_1_f;
      } else {
-       return bf16bf16bf16_grouped_128_128_128_2_1_1_t(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_128_128_2_1_1_t;
      }
    }
  }
@@ -71,71 +65,102 @@ at::Tensor dispatch_bf16_grouped_kernel(
  if (G == 16) {
    if (N == 5120 && K == 1024) {
      if (total_M <= 32) {
-       return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_16_128_2_1_1_f;
      } else if (total_M <= 64) {
-       return bf16bf16bf16_grouped_128_32_128_2_1_1_t(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_32_128_2_1_1_t;
      } else if (total_M <= 256) {
-       return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_16_128_2_1_1_f;
      } else if (total_M <= 512) {
-       return bf16bf16bf16_grouped_128_32_128_2_1_1_t(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_32_128_2_1_1_t;
      } else if (total_M <= 1024) {
-       return bf16bf16bf16_grouped_128_64_128_2_1_1_t(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_64_128_2_1_1_t;
      } else {
-       return bf16bf16bf16_grouped_128_256_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_256_128_2_1_1_f;
      }
    }

    if (N == 2048 && K == 5120) {
      if (total_M <= 16) {
-       return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_16_128_2_1_1_f;
      } else if (total_M <= 64) {
-       return bf16bf16bf16_grouped_128_32_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_32_128_2_1_1_f;
      } else if (total_M <= 256) {
-       return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_16_128_2_1_1_f;
      } else if (total_M <= 512) {
-       return bf16bf16bf16_grouped_128_32_128_2_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_32_128_2_1_1_f;
      } else if (total_M <= 1024) {
-       return bf16bf16bf16_grouped_128_64_128_1_1_1_f(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_64_128_1_1_1_f;
      } else {
-       return bf16bf16bf16_grouped_128_128_128_2_1_1_t(
-           X, W, output, zero_start_index_M, M_sizes);
+       return bf16bf16bf16_grouped_128_128_128_2_1_1_t;
      }
    }
  }

  // Fallback to legacy heuristic for now.
  if (total_M <= 16) {
-   return bf16bf16bf16_grouped_128_16_128_1_1_1_f(
-       X, W, output, zero_start_index_M, M_sizes);
+   return bf16bf16bf16_grouped_128_16_128_1_1_1_f;
  } else if (total_M <= 32) {
-   return bf16bf16bf16_grouped_128_32_128_1_1_1_f(
-       X, W, output, zero_start_index_M, M_sizes);
+   return bf16bf16bf16_grouped_128_32_128_1_1_1_f;
  } else if (total_M <= 64) {
-   return bf16bf16bf16_grouped_128_64_128_1_1_1_f(
-       X, W, output, zero_start_index_M, M_sizes);
+   return bf16bf16bf16_grouped_128_64_128_1_1_1_f;
  } else if (total_M <= 128) {
-   return bf16bf16bf16_grouped_128_128_128_1_1_1_f(
-       X, W, output, zero_start_index_M, M_sizes);
+   return bf16bf16bf16_grouped_128_128_128_1_1_1_f;
  } else if (total_M <= 512) {
-   return bf16bf16bf16_grouped_256_128_128_2_1_1_f(
-       X, W, output, zero_start_index_M, M_sizes);
+   return bf16bf16bf16_grouped_256_128_128_2_1_1_f;
  } else {
-   return bf16bf16bf16_grouped_128_256_128_2_1_1_f(
-       X, W, output, zero_start_index_M, M_sizes);
+   return bf16bf16bf16_grouped_128_256_128_2_1_1_f;
  }
}

+ template <typename InputType>
+ Kernel_bf16bf16bf16_grouped<InputType> get_kernel_via_tuning(
+     int G,
+     int total_M,
+     int N,
+     int K,
+     InputType X, // BF16
+     InputType W, // BF16
+     at::Tensor output,
+     std::optional<at::Tensor> zero_start_index_M = std::nullopt,
+     std::optional<at::Tensor> M_sizes = std::nullopt) {
+   auto& cache = getTuningCache();
+
+   // Reduce the amount of auto tuning by rounding total_M up to the next power of 2.
+   total_M = nextPowerOf2(total_M);
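+   // e.g. a total_M of 1000 is bucketed to 1024, so nearby shapes share one
+   // cache entry and are only tuned once.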
+   // Use the (total_M, N, K, G) shape as the key.
+   const std::string shape_key = fmt::format("{}_{}_{}_{}", total_M, N, K, G);
+   const auto& kernels = get_bf16bf16bf16_grouped_kernels<InputType>();
+   auto kernel = cache.findBestKernelMaybeAutotune(
+       shape_key, kernels, X, W, output, zero_start_index_M, M_sizes);
+
+   return kernel;
+ }
+
+ // BF16 grouped cutlass kernel dispatch.
+ template <typename InputType>
+ at::Tensor dispatch_bf16_grouped_kernel(
+     int G,
+     int total_M,
+     int N,
+     int K,
+     InputType X, // BF16
+     InputType W, // BF16
+     at::Tensor output,
+     std::optional<at::Tensor> zero_start_index_M = std::nullopt,
+     std::optional<at::Tensor> M_sizes = std::nullopt) {
+   // Select kernel to run via heuristics or tuning.
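+   // Autotuning is opt-in: when FBGEMM_AUTOTUNE_ENABLE is set in the environment,
+   // the tuning cache is consulted (and may benchmark the candidate kernels for a
+   // shape it has not seen yet); otherwise the static heuristic table above is used.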
+   auto kernel = [&]() {
+     if (std::getenv("FBGEMM_AUTOTUNE_ENABLE")) {
+       return get_kernel_via_tuning(
+           G, total_M, N, K, X, W, output, zero_start_index_M, M_sizes);
+     } else {
+       return get_kernel_via_heuristic<InputType>(G, total_M, N, K);
+     }
+   }();
+   // Invoke kernel
+   return kernel(X, W, output, zero_start_index_M, M_sizes);
+ }
+
template <typename OutputType>
OutputType _bf16bf16bf16_grouped(at::TensorList X, at::TensorList W) {
  at::Tensor Y;