
Commit 3cea487: 2025-09-19 nightly release (9b66962)

Author: pytorchbot
1 parent: 5206781

46 files changed (+88, -2669 lines changed)


fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py

Lines changed: 1 addition & 27 deletions
@@ -486,16 +486,10 @@ def print_kernels(kernels: Optional[List[str]]) -> List[QuantizeOpBase]:
     default=None,
     help="If set with grouped mode, repeat input shapes this many times. Comma separated list of groups to benchmark",
 )
-@click.option(
-    "--total-K",
-    default=None,
-    help="If set, adjusts the K values to sum to this number. "
-    "This can help simulate real grouped workloads in backward wgrad.",
-)
 @click.option(
     "--total-M",
     default=None,
-    help="If set, adjusts the M values to sum to this number. "
+    help="If set, Adjusts the M values to sum to this number. "
     "This can help simulate real grouped workloads.",
 )
 @click.option(
@@ -548,7 +542,6 @@ def invoke_main(
     pair_nk: bool,
     grouped: bool,
     groups: Optional[str],
-    total_k: Optional[str],
     total_m: Optional[str],
     no_cuda_graph: bool,
     use_rotating_buffer_bench: bool,
@@ -560,14 +553,6 @@ def invoke_main(
 ):
     if enable_amd_env_vars:
         set_amd_env_vars()
-
-    # Validate that total_m and total_k are mutually exclusive
-    if total_m is not None and total_k is not None:
-        raise ValueError(
-            "total_m and total_k cannot be specified at the same time. "
-            "Please provide only one of them."
-        )
-
     # If kernel filter is provided, parse it. Else, benchmark all kernels.
     all_kernels = kernels.strip().split(",") if kernels else None
     quantize_ops = collect_kernels_to_profile(all_kernels)
@@ -644,17 +629,6 @@ def invoke_main(
             for g in groups_list
             for b, _, n, k in MNK
         ]
-    elif total_k:
-        MNK = [
-            [
-                [b] * g,
-                [m] * g,
-                [n] * g,
-                generate_group_tensor(g, int(total_k)),
-            ]
-            for g in groups_list
-            for b, m, n, _ in MNK
-        ]
     else:
         MNK = [
             [[b] * g, [m] * g, [n] * g, [k] * g]
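For context, the removed `--total-K` branch used `generate_group_tensor` to distribute a single target sum across the benchmark groups; the surviving `--total-M` flag describes the analogous adjustment for M ("adjusts the M values to sum to this number"). Below is a minimal sketch of that split-a-total-across-groups idea; the helper is illustrative only and is not FBGEMM's actual `generate_group_tensor`.

```python
import random
from typing import List

def split_total_across_groups(total: int, groups: int) -> List[int]:
    """Illustrative only: split `total` into `groups` positive parts that sum to `total`."""
    if groups <= 1:
        return [total]
    # Random cut points turn the interval [0, total] into `groups` segments.
    cuts = sorted(random.sample(range(1, total), groups - 1))
    bounds = [0] + cuts + [total]
    return [hi - lo for lo, hi in zip(bounds, bounds[1:])]

# Example: simulate a grouped workload whose M values sum to 8192 over 4 groups.
m_values = split_total_across_groups(8192, 4)
assert sum(m_values) == 8192
```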

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 1 addition & 47 deletions
@@ -2084,7 +2084,7 @@ def cuda(self) -> bool:
 @register_quantize_op
 class BF16GroupedGrad(QuantizeOpBase):
     """
-    BF16 grouped matmul with dgrad inputs in pretraining backed by cutlass
+    BF16 grouped matmul with grad inputs backed by cutlass
     """

     def preprocess(self, x, w):
@@ -2126,52 +2126,6 @@ def cuda(self) -> bool:
         return True


-@register_quantize_op
-class BF16GroupedWGrad(QuantizeOpBase):
-    """
-    BF16 grouped matmul with wgrad inputs in pretraining backed by cutlass
-    """
-
-    def preprocess(self, x, w):
-        # Get K values for each group
-        k_values = [xi.shape[1] for xi in x]  # K dimension for each group
-
-        # Convert k_values into sizes tensor
-        k_sizes = torch.tensor(k_values).to(dtype=torch.int64, device=x[0].device)
-
-        x = torch.concat(x, dim=1).contiguous()  # shape: (M, G*K)
-        w = torch.concat(w, dim=1).contiguous()  # shape: (N, G*K)
-
-        # Transpose the follows to simulate wgrad shapes
-        x = x.t().contiguous()  # shape: (G*K, M)
-        w = w.t().contiguous()  # shape: (G*K, N)
-
-        # Return processed tensors
-        return x, w, k_sizes
-
-    def quantize(self, x, w, k_sizes):
-        return x, w, k_sizes
-
-    def compute(self, x, w, k_sizes):
-        return torch.ops.fbgemm.bf16bf16bf16_grouped_wgrad(x, w, k_sizes)
-
-    def quantize_and_compute(self, x, w, k_sizes):
-        x, w, k_sizes = self.quantize(x, w, k_sizes)
-        return self.compute(x, w, k_sizes)
-
-    @property
-    def name(self) -> str:
-        return "bf16_grouped_wgrad"
-
-    @property
-    def hip(self) -> bool:
-        return False
-
-    @property
-    def cuda(self) -> bool:
-        return True
-
-
 @register_quantize_op
 class BF16GroupedStacked(QuantizeOpBase):
     """

fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache_convert.cu

Lines changed: 12 additions & 12 deletions
@@ -23,6 +23,7 @@
 #endif

 #include "fbgemm_gpu/utils/cuda_block_count.h"
+#include "fbgemm_gpu/utils/kernel_launcher.cuh"
 #include "fbgemm_gpu/utils/vec_quant.cuh"

 #include <torch/torch.h>
@@ -47,12 +48,12 @@ namespace fbgemm_gpu {
  * 32-63 to convert the V tensors. NV only has threads 0-31 per warp.
  */
 __global__ void convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace_kernel(
-    at::PackedTensorAccessor64<uint8_t, 5, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor64<uint8_t, 5, at::RestrictPtrTraits>
         cache_K, // [N_H_L][B][MAX_T][N_KVH][D_H]
-    at::PackedTensorAccessor64<uint8_t, 5, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor64<uint8_t, 5, at::RestrictPtrTraits>
         cache_V, // [N_H_L][B][MAX_T][N_KVH][D_H]
-    at::PackedTensorAccessor64<int32_t, 5, at::RestrictPtrTraits> qparam_K,
-    at::PackedTensorAccessor64<int32_t, 5, at::RestrictPtrTraits> qparam_V) {
+    pta::PackedTensorAccessor64<int32_t, 5, at::RestrictPtrTraits> qparam_K,
+    pta::PackedTensorAccessor64<int32_t, 5, at::RestrictPtrTraits> qparam_V) {
   auto N_KVH = cache_K.size(3);
   auto MAX_T = cache_K.size(2);
   auto D_H = cache_K.size(4);
@@ -133,17 +134,16 @@ void convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace(
   dim3 blocks(N_H_L, B, std::max<int32_t>(1, kMaxBlocks / (B * N_H_L)));
   dim3 threads(kThreadsPerWarp, kWarpsPerBlock);

-  convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace_kernel<<<
+  FBGEMM_LAUNCH_KERNEL(
+      (convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace_kernel),
       blocks,
       threads,
       0,
-      at::cuda::getCurrentCUDAStream()>>>(
-      cache_K.packed_accessor64<uint8_t, 5, at::RestrictPtrTraits>(),
-      cache_V.packed_accessor64<uint8_t, 5, at::RestrictPtrTraits>(),
-      qparam_K.packed_accessor64<int32_t, 5, at::RestrictPtrTraits>(),
-      qparam_V.packed_accessor64<int32_t, 5, at::RestrictPtrTraits>());
-
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
+      at::cuda::getCurrentCUDAStream(),
+      PTA_B(cache_K, uint8_t, 5, 64),
+      PTA_B(cache_V, uint8_t, 5, 64),
+      PTA_B(qparam_K, int32_t, 5, 64),
+      PTA_B(qparam_V, int32_t, 5, 64));
 }
 #else
 void convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace(
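For background, this kernel rewrites FP8 KV-cache bytes in place because NVIDIA's e4m3fn and AMD's e4m3fnuz encodings assign different values to the same bit pattern (they differ in exponent bias and zero/NaN handling). A small illustration, assuming a PyTorch build that exposes both float8 dtypes:

```python
import torch

# Reinterpret the same raw byte under both FP8 encodings.
raw = torch.tensor([0x40], dtype=torch.uint8)
print(raw.view(torch.float8_e4m3fn).float())    # 2.0 (exponent bias 7)
print(raw.view(torch.float8_e4m3fnuz).float())  # 1.0 (exponent bias 8)
```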
