
Commit ccbdff4

cthi authored and facebook-github-bot committed
Support tuning cache for Cutlass FP8 GEMM (#4301)
Summary:
Pull Request resolved: #4301
X-link: facebookresearch/FBGEMM#1377

This diff adds support for the tuning cache to the kernel. There should be no performance changes to the existing heuristics.

- I refactored the kernel dispatch logic to return the kernel function instead, as that removes some duplication around the kernel invocation.
- The next diff in this stack (D75820688) will add the new kernels, to keep this review easier.
- Note that we are having some issues with adding the new kernels: this kernel actually compiles 12 variants for each configuration; see D75820688 for more context. So for now we won't add the new kernels in D75820688, but we can still onboard the op to auto tuning in case someone wants to compile them locally. Will revisit D75820688 later.

Reviewed By: q10, jiawenliu64

Differential Revision: D75541025
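The tuning path added below keys a per-op TuningCache on an "M_N_K" string, with M first rounded up to the next power of two so that nearby shapes reuse one tuned entry, and it is only taken when the FBGEMM_AUTOTUNE_ENABLE environment variable is set. A minimal standalone sketch of that keying idea follows; it is not fbgemm code: nextPowerOf2Demo stands in for the nextPowerOf2 helper from fbgemm_gpu/quantize/common/utils.h, and plain string concatenation replaces the fmt::format call used in the diff.

#include <iostream>
#include <string>

// Illustrative stand-in for nextPowerOf2 from fbgemm_gpu/quantize/common/utils.h.
static int nextPowerOf2Demo(int x) {
  int p = 1;
  while (p < x) {
    p <<= 1;
  }
  return p;
}

int main() {
  int M = 3000, N = 4096, K = 4096;
  // Round M up so that, e.g., all M in 2049..4096 map to the same cache entry.
  M = nextPowerOf2Demo(M);
  // The diff uses fmt::format("{}_{}_{}", M, N, K); concatenation gives the same key.
  const std::string shape_key =
      std::to_string(M) + "_" + std::to_string(N) + "_" + std::to_string(K);
  std::cout << shape_key << std::endl; // prints "4096_4096_4096"
  return 0;
}

With this rounding, every M between 2049 and 4096 for a given (N, K) shares one autotuned kernel choice, which keeps the number of tuning runs bounded.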
1 parent 9086c6e commit ccbdff4

File tree

2 files changed: +147 −92 lines changed

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise.cu

Lines changed: 124 additions & 92 deletions
@@ -10,166 +10,198 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 // clang-format on
+#include <fmt/core.h>

 #include "f8f8bf16_rowwise/f8f8bf16_rowwise_manifest.cuh"
+#include "fbgemm_gpu/quantize/common/tuning_cache.hpp"
+#include "fbgemm_gpu/quantize/common/utils.h"

 namespace fbgemm_gpu {

 #if CUDART_VERSION >= 12000

 // FP8 Rowwise Cutlass kernel dispatch.
-at::Tensor dispatch_fp8_rowwise_kernel(
-    at::Tensor XQ,
-    at::Tensor WQ,
-    at::Tensor x_scale,
-    at::Tensor w_scale,
-    bool use_fast_accum,
-    std::optional<at::Tensor> bias = std::nullopt,
-    std::optional<at::Tensor> output = std::nullopt) {
-  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
-  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
-  int K = XQ.size(-1);
-  static int arch = -1;
-  // Avoid expensive cudaGetDeviceProperties call.
-  if (arch < 0) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, 0);
-    if (prop.major >= 10) {
-      arch = 10;
-      int runtimeVersion;
-      C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
-      TORCH_CHECK(
-          runtimeVersion >= 12080,
-          "FP8 GEMM on sm100a or above requires cuda >= 12.8");
-    } else {
-      arch = 9;
-    }
-  }
-
+Kernel_f8f8bf16_rowwise
+get_kernel_via_heuristic(int arch, int M, int N, int K, bool use_fast_accum) {
   // Use shape heuristics to dispatch to optimized kernel configuration.
   if (arch == 10) {
     if (M <= 128) {
       if (N <= 1024) {
-        return f8f8bf16_rowwise_128_32_128_1_1_1_10_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_32_128_1_1_1_10_f_f;
       } else {
-        return f8f8bf16_rowwise_128_64_128_1_1_1_10_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_64_128_1_1_1_10_f_f;
       }
     } else if (M <= 1024) {
       if (N <= 1024) {
-        return f8f8bf16_rowwise_128_256_128_2_1_1_10_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_256_128_2_1_1_10_f_f;
       } else {
-        return f8f8bf16_rowwise_128_128_128_2_2_1_10_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_128_128_2_2_1_10_f_f;
       }
     } else if (M <= 2048) {
-      return f8f8bf16_rowwise_128_256_128_2_1_1_10_f_f(
-          XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+      return f8f8bf16_rowwise_128_256_128_2_1_1_10_f_f;
     } else {
       if (N <= 1024) {
-        return f8f8bf16_rowwise_128_256_128_1_2_1_10_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_256_128_1_2_1_10_f_f;
       } else {
-        return f8f8bf16_rowwise_128_256_128_2_1_1_10_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_256_128_2_1_1_10_f_f;
       }
     }
   } else {
     if (M <= 16) {
-      return f8f8bf16_rowwise_64_16_128_1_1_1_9_f_f(
-          XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+      return f8f8bf16_rowwise_64_16_128_1_1_1_9_f_f;
     } else if (M <= 32) {
       if (N <= 4096) {
-        return f8f8bf16_rowwise_64_16_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_16_128_1_1_1_9_f_f;
       } else {
-        return f8f8bf16_rowwise_64_32_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_32_128_2_1_1_9_f_f;
       }
     } else if (M <= 64) {
       if (N <= 2048) {
-        return f8f8bf16_rowwise_64_16_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_16_128_1_1_1_9_f_f;
       } else if (N <= 4096) {
-        return f8f8bf16_rowwise_64_32_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_32_128_2_1_1_9_f_f;
       } else {
-        return f8f8bf16_rowwise_64_64_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_64_128_2_1_1_9_f_f;
       }
     } else if (M <= 128) {
       if (N <= 1024) {
-        return f8f8bf16_rowwise_64_16_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_16_128_1_1_1_9_f_f;
       } else if (N <= 2048) {
-        return f8f8bf16_rowwise_64_32_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_32_128_2_1_1_9_f_f;
       } else if (N <= 4096) {
-        return f8f8bf16_rowwise_64_64_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_64_128_2_1_1_9_f_f;
       } else {
-        return f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f;
       }
     } else if (M <= 256) {
       if (N <= 1024) {
-        return f8f8bf16_rowwise_64_32_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_32_128_2_1_1_9_f_f;
       } else if (N <= 2048) {
-        return f8f8bf16_rowwise_64_64_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_64_128_2_1_1_9_f_f;
       } else if (N <= 4096) {
-        return f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f;
       } else {
-        return f8f8bf16_rowwise_64_256_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_256_128_1_1_1_9_f_f;
       }
     } else if (M <= 512) {
       if (N <= 1024) {
-        return f8f8bf16_rowwise_64_64_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_64_128_2_1_1_9_f_f;
       } else if (N <= 2048) {
-        return f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f;
       } else if (N <= 4096 || use_fast_accum == false) {
-        return f8f8bf16_rowwise_64_256_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_256_128_1_1_1_9_f_f;
       } else {
-        return f8f8bf16_rowwise_128_256_128_2_1_1_9_f_t(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_256_128_2_1_1_9_f_t;
       }
     } else if (M <= 1024) {
       if (N <= 1024) {
-        return f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f;
       } else if (N <= 2048 || use_fast_accum == false) {
-        return f8f8bf16_rowwise_64_256_128_1_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_256_128_1_1_1_9_f_f;
       } else {
-        return f8f8bf16_rowwise_128_256_128_2_1_1_9_f_t(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_256_128_2_1_1_9_f_t;
       }
     } else {
       if (M <= 2048 && N <= 1024) {
-        return f8f8bf16_rowwise_64_256_128_2_1_1_9_f_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_64_256_128_2_1_1_9_f_f;
       } else if (K <= 4096 || use_fast_accum == false) {
-        return f8f8bf16_rowwise_128_128_128_2_1_1_9_t_f(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_128_128_2_1_1_9_t_f;
       } else if (M > 8192 && N > 8192) {
-        return f8f8bf16_rowwise_128_256_128_4_4_1_9_f_t(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_256_128_4_4_1_9_f_t;
       } else {
-        return f8f8bf16_rowwise_128_256_128_2_1_1_9_f_t(
-            XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+        return f8f8bf16_rowwise_128_256_128_2_1_1_9_f_t;
       }
     }
   }
 }

+Kernel_f8f8bf16_rowwise get_kernel_via_tuning(
+    int arch,
+    int M,
+    int N,
+    int K,
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    bool use_fast_accum,
+    std::optional<at::Tensor> bias = std::nullopt,
+    std::optional<at::Tensor> output = std::nullopt) {
+  // One cache per kernel type
+  static TuningCache cache("f8f8bf16_rowwise");
+
+  // Reducing amount of auto tuning by rounding up M to next power of 2.
+  M = nextPowerOf2(M);
+  // Use (M, N, K) shape as the key.
+  const std::string shape_key = fmt::format("{}_{}_{}", M, N, K);
+  const auto& kernels = get_f8f8bf16_rowwise_kernels(arch);
+  auto kernel = cache.findBestKernelMaybeAutotune(
+      shape_key,
+      kernels,
+      XQ,
+      WQ,
+      x_scale,
+      w_scale,
+      use_fast_accum,
+      bias,
+      output);
+
+  return kernel;
+}
+
+// FP8 Rowwise Cutlass kernel dispatch.
+at::Tensor dispatch_fp8_rowwise_kernel(
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    bool use_fast_accum,
+    std::optional<at::Tensor> bias = std::nullopt,
+    std::optional<at::Tensor> output = std::nullopt) {
+  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
+  int K = XQ.size(-1);
+
+  static int arch = -1;
+  // Avoid expensive cudaGetDeviceProperties call.
+  if (arch < 0) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+    if (prop.major >= 10) {
+      arch = 10;
+      int runtimeVersion;
+      C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
+      TORCH_CHECK(
+          runtimeVersion >= 12080,
+          "FP8 GEMM on sm100a or above requires cuda >= 12.8");
+    } else {
+      arch = 9;
+    }
+  }
+
+  // Select kernel to run via heuristics or tuning.
+  auto kernel = [&]() {
+    if (std::getenv("FBGEMM_AUTOTUNE_ENABLE")) {
+      return get_kernel_via_tuning(
+          arch,
+          M,
+          N,
+          K,
+          XQ,
+          WQ,
+          x_scale,
+          w_scale,
+          use_fast_accum,
+          bias,
+          output);
+    } else {
+      return get_kernel_via_heuristic(arch, M, N, K, use_fast_accum);
+    }
+  }();
+  // Invoke kernel
+  return kernel(XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
+}
+
 void f8f8bf16_rowwise_out(
     at::Tensor XQ, // FP8
     at::Tensor WQ, // FP8
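Callers of dispatch_fp8_rowwise_kernel are unchanged by this refactor; the only new switch is the FBGEMM_AUTOTUNE_ENABLE environment variable checked with std::getenv above. A hedged sketch of flipping that switch from host code before the first dispatch (setenv is the POSIX call; exporting the variable in the launching shell works just as well, and enable_fbgemm_fp8_autotune is a hypothetical helper, not part of this diff):

#include <cstdlib>

// Sketch only: the dispatch above checks std::getenv("FBGEMM_AUTOTUNE_ENABLE")
// for a non-null result, so any value works once the variable is set.
void enable_fbgemm_fp8_autotune() {
  setenv("FBGEMM_AUTOTUNE_ENABLE", "1", /*overwrite=*/1);
}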

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise/f8f8bf16_rowwise_manifest.cuh

Lines changed: 23 additions & 0 deletions
@@ -135,4 +135,27 @@ at::Tensor f8f8bf16_rowwise_128_256_128_2_1_1_10_f_f(
     bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt);
+
+using Kernel_f8f8bf16_rowwise = at::Tensor (*)(
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    bool,
+    std::optional<at::Tensor>,
+    std::optional<at::Tensor>);
+
+inline const std::unordered_map<std::string, Kernel_f8f8bf16_rowwise>&
+get_f8f8bf16_rowwise_kernels(int arch) {
+  static const std::unordered_map<std::string, Kernel_f8f8bf16_rowwise>
+      kernelsSM90 = {};
+  static const std::unordered_map<std::string, Kernel_f8f8bf16_rowwise>
+      kernelsSM100 = {};
+  if (arch == 10) {
+    return kernelsSM100;
+  } else {
+    return kernelsSM90;
+  }
+}
+
 } // namespace fbgemm_gpu
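The kernel maps above ship empty in this commit; populating them is deferred to D75820688 because of the compile-time blowup mentioned in the summary. For anyone compiling the extra kernels locally, onboarding amounts to adding name-to-function-pointer entries to these maps. A hypothetical sketch using two SM90 instantiations already declared in this manifest (the string keys simply mirroring the kernel names is an assumption here, not a convention fixed by this diff):

static const std::unordered_map<std::string, Kernel_f8f8bf16_rowwise>
    kernelsSM90 = {
        // Keyed by kernel name; the function pointers are existing SM90
        // instantiations declared earlier in this manifest.
        {"f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f",
         f8f8bf16_rowwise_64_128_128_1_1_1_9_f_f},
        {"f8f8bf16_rowwise_128_256_128_2_1_1_9_f_t",
         f8f8bf16_rowwise_128_256_128_2_1_1_9_f_t},
};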
