
Commit 16dd8b3

jiawenliu64 authored and facebook-github-bot committed
Add output as an option in CUTLASS grouped GEMM (#4931)
Summary:
X-link: facebookresearch/FBGEMM#1954

Enable output as an option in CUTLASS grouped GEMM, since pretraining needs to pass an empty, preallocated output tensor for the fprop and dgrad use cases.

Differential Revision: D83126291
1 parent 826064d commit 16dd8b3
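
For reference, a minimal caller-side sketch of the new optional-output path, based on the signature declared in quantize.cpp in this diff. The wrapper function, tensor shapes, and buffer handling below are illustrative assumptions, not part of the commit:

#include <optional>
#include <ATen/ATen.h>

// Declaration matching the one added to quantize.cpp in this commit.
namespace fbgemm_gpu {
at::Tensor bf16bf16bf16_grouped_stacked(
    at::Tensor X,
    at::Tensor W,
    at::Tensor M_sizes,
    std::optional<at::Tensor> Y = std::nullopt);
} // namespace fbgemm_gpu

// Illustrative wrapper only: X is [total_M, K] bf16, W is [G, N, K] bf16,
// and M_sizes holds the per-group row counts on the same device as X.
at::Tensor forward_with_reused_buffer(
    const at::Tensor& X,
    const at::Tensor& W,
    const at::Tensor& M_sizes,
    at::Tensor& persistent_out) { // preallocated flat [total_M * N] bf16 buffer
  // Default path (unchanged by this commit): omit Y and the kernel allocates
  // a fresh bf16 output of total_M * N elements itself.
  //   return fbgemm_gpu::bf16bf16bf16_grouped_stacked(X, W, M_sizes);

  // New path: pass the preallocated buffer as the optional Y argument so
  // pretraining fprop/dgrad can write into caller-owned memory.
  return fbgemm_gpu::bf16bf16bf16_grouped_stacked(
      X, W, M_sizes, persistent_out);
}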

File tree

5 files changed: +255 −42 lines changed


fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/bf16bf16bf16_grouped.cu

Lines changed: 20 additions & 6 deletions
@@ -345,8 +345,11 @@ at::Tensor bf16bf16bf16_grouped_cat(at::TensorList X, at::TensorList W) {
   return _bf16bf16bf16_grouped<at::Tensor>(X, W);
 }
 
-at::Tensor
-bf16bf16bf16_grouped_stacked(at::Tensor X, at::Tensor W, at::Tensor M_sizes) {
+at::Tensor bf16bf16bf16_grouped_stacked(
+    at::Tensor X,
+    at::Tensor W,
+    at::Tensor M_sizes,
+    std::optional<at::Tensor> Y) {
   int64_t total_M = X.size(0);
   int64_t N = W.size(1);
   int64_t K = W.size(2);
@@ -356,14 +359,21 @@ bf16bf16bf16_grouped_stacked(at::Tensor X, at::Tensor W, at::Tensor M_sizes) {
       "M_sizes must be on same device as inputs.");
   TORCH_CHECK(
       W.dim() == 3 && W.size(0) == G, "Weights should be shape [G, N, K].")
-  at::Tensor Y = at::empty(total_M * N, X.options().dtype(at::kBFloat16));
+
+  at::Tensor output_tensor;
+  if (Y.has_value()) {
+    output_tensor = Y.value();
+  } else {
+    output_tensor = at::empty(total_M * N, X.options().dtype(at::kBFloat16));
+  }
+
   // Early exit for empty inputs.
   if (total_M == 0) {
-    return Y.view({total_M, N});
+    return output_tensor.view({total_M, N});
   }
   // Return continuous view of output.
   at::Tensor out = dispatch_bf16_grouped_kernel<at::Tensor>(
-      G, total_M, N, K, X, W, Y, std::nullopt, M_sizes);
+      G, total_M, N, K, X, W, output_tensor, std::nullopt, M_sizes);
   return out.view({total_M, N});
 }
 
@@ -411,7 +421,11 @@ at::Tensor bf16bf16bf16_grouped_dynamic(
       "CUDA version is older than 12.0"); // requires CUDA>=12
 }
 
-at::Tensor bf16bf16bf16_grouped_stacked(at::Tensor, at::Tensor, at::Tensor) {
+at::Tensor bf16bf16bf16_grouped_stacked(
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    std::optional<at::Tensor>) {
   throw std::runtime_error(
       "CUDA version is older than 12.0"); // requires CUDA>=12
 }
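
As far as this hunk shows, a caller-supplied Y is consumed via Y.value() with no shape, dtype, or device validation, so a caller reusing a buffer may want to assert that it matches what the kernel would otherwise allocate. A hedged sketch of such a guard, which is not part of this commit:

#include <ATen/ATen.h>

// Illustrative caller-side guard mirroring the kernel's own allocation:
// a contiguous bf16 buffer of total_M * N elements on X's device.
void check_preallocated_output(
    const at::Tensor& X, const at::Tensor& W, const at::Tensor& Y) {
  const int64_t total_M = X.size(0);
  const int64_t N = W.size(1);
  TORCH_CHECK(Y.is_contiguous(), "Preallocated output must be contiguous.");
  TORCH_CHECK(
      Y.scalar_type() == at::kBFloat16, "Preallocated output must be bf16.");
  TORCH_CHECK(
      Y.numel() == total_M * N, "Preallocated output has the wrong size.");
  TORCH_CHECK(
      Y.device() == X.device(), "Preallocated output is on the wrong device.");
}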

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/bf16bf16bf16_grouped_grad.cu

Lines changed: 30 additions & 12 deletions
@@ -300,8 +300,11 @@ at::Tensor dispatch_bf16_grouped_kernel(
   return kernel(X, W, output, M_sizes);
 }
 
-at::Tensor
-bf16bf16bf16_grouped_grad(at::Tensor X, at::Tensor W, at::Tensor M_sizes) {
+at::Tensor bf16bf16bf16_grouped_grad(
+    at::Tensor X,
+    at::Tensor W,
+    at::Tensor M_sizes,
+    std::optional<at::Tensor> Y) {
   int64_t total_M = X.size(0);
   int64_t N = W.size(1);
   int64_t K = W.size(2);
@@ -315,20 +318,29 @@ bf16bf16bf16_grouped_grad(at::Tensor X, at::Tensor W, at::Tensor M_sizes) {
   TORCH_CHECK(X.stride(-1) == 1, "Activation memory layout must be row-major.");
   TORCH_CHECK(W.stride(-2) == 1, "Weight memory layout must be column-major.");
 
-  at::Tensor Y = at::empty(total_M * N, X.options().dtype(at::kBFloat16));
+  at::Tensor output_tensor;
+  if (Y.has_value()) {
+    output_tensor = Y.value();
+  } else {
+    output_tensor = at::empty(total_M * N, X.options().dtype(at::kBFloat16));
+  }
   // Early exit for empty inputs.
   if (total_M == 0) {
-    return Y.view({total_M, N});
+    return output_tensor.view({total_M, N});
   }
   // Return continuous view of output.
-  at::Tensor out =
-      dispatch_bf16_grouped_kernel(G, total_M, N, K, X, W, Y, M_sizes);
+  at::Tensor out = dispatch_bf16_grouped_kernel(
+      G, total_M, N, K, X, W, output_tensor, M_sizes);
   return out.view({total_M, N});
 }
 
 #else
 
-at::Tensor bf16bf16bf16_grouped_grad(at::Tensor, at::Tensor, at::Tensor) {
+at::Tensor bf16bf16bf16_grouped_grad(
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    std::optional<at::Tensor>) {
   throw std::runtime_error(
       "CUDA version is older than 12.0"); // requires CUDA>=12
 }
@@ -338,12 +350,18 @@ at::Tensor bf16bf16bf16_grouped_grad(at::Tensor, at::Tensor, at::Tensor) {
 at::Tensor bf16bf16bf16_grouped_grad_meta(
     at::Tensor X,
     at::Tensor W,
-    at::Tensor /* M_sizes */) {
+    at::Tensor /* M_sizes */,
+    std::optional<at::Tensor> Y) {
   const at::SymInt total_M = X.sym_size(0);
   const at::SymInt N = W.sym_size(1);
-  at::Tensor Y =
-      at::empty_symint({total_M, N}, X.options().dtype(at::kBFloat16));
-  return Y;
+
+  if (Y.has_value()) {
+    return Y.value();
+  } else {
+    at::Tensor output =
+        at::empty_symint({total_M, N}, X.options().dtype(at::kBFloat16));
+    return output;
+  }
 }
 
 TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
@@ -356,7 +374,7 @@ TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
 
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
-      "bf16bf16bf16_grouped_grad(Tensor X, Tensor W, Tensor M_sizes) -> Tensor");
+      "bf16bf16bf16_grouped_grad(Tensor X, Tensor W, Tensor M_sizes, Tensor? Y=None) -> Tensor");
 }
 
 } // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp

Lines changed: 15 additions & 6 deletions
@@ -76,8 +76,11 @@ at::Tensor bf16bf16bf16_grouped_dynamic(
     at::Tensor X,
     at::Tensor W,
     at::Tensor zero_start_index_M);
-at::Tensor
-bf16bf16bf16_grouped_stacked(at::Tensor X, at::Tensor W, at::Tensor M_sizes);
+at::Tensor bf16bf16bf16_grouped_stacked(
+    at::Tensor X,
+    at::Tensor W,
+    at::Tensor M_sizes,
+    std::optional<at::Tensor> Y = std::nullopt);
 at::Tensor f8f8bf16_rowwise(
     at::Tensor XQ,
     at::Tensor WQ,
@@ -781,12 +784,18 @@ at::Tensor bf16bf16bf16_grouped_dynamic_meta(
 at::Tensor bf16bf16bf16_grouped_stacked_meta(
     at::Tensor X,
     at::Tensor W,
-    at::Tensor /* M_sizes */) {
+    at::Tensor /* M_sizes */,
+    std::optional<at::Tensor> Y) {
   const at::SymInt total_M = X.sym_size(0);
   const at::SymInt N = W.sym_size(1);
-  at::Tensor Y =
-      at::empty_symint({total_M, N}, X.options().dtype(at::kBFloat16));
-  return Y;
+
+  if (Y.has_value()) {
+    return Y.value();
+  } else {
+    at::Tensor output =
+        at::empty_symint({total_M, N}, X.options().dtype(at::kBFloat16));
+    return output;
+  }
 }
 
 at::Tensor f8f8bf16_rowwise_grouped_stacked_meta(

fbgemm_gpu/experimental/gen_ai/src/quantize/quantize_defs.cpp

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
       "bf16bf16bf16_grouped_dynamic(Tensor X, Tensor W, Tensor zero_start_index_M) -> Tensor");
   m.def(
-      "bf16bf16bf16_grouped_stacked(Tensor X, Tensor W, Tensor M_sizes) -> Tensor");
+      "bf16bf16bf16_grouped_stacked(Tensor X, Tensor W, Tensor M_sizes, Tensor? Y=None) -> Tensor");
   m.def(
       "f8f8bf16_blockwise(Tensor XQ, Tensor WQ, Tensor x_scale, Tensor w_scale, int block_m=128, int block_n=128, int block_k=128) -> Tensor");
   m.def(
