@@ -2328,6 +2328,7 @@ def _kernel_quantize_fp8_row(
     M,
     N,
     K,
+    K_fp8,  # used when padding
     stride_ab,
     stride_am,
     stride_an,
@@ -2364,7 +2365,8 @@ def _kernel_quantize_fp8_row(
         B (int): Size of dimension 0
         M (int): Size of dimension 1
         N (int): Size of dimension 2
-        K (int): Size of dimension 3
+        K (int): Size of dimension 3 (input row size)
+        K_fp8 (int): Size of dimension 3 for A_fp8 (output row size, can be >= K)
         stride_ab (int): Stride of b dimension of A.
         stride_am (int): Stride of m dimension of A.
         stride_an (int): Stride of n dimension of A.
@@ -2433,21 +2435,31 @@ def _kernel_quantize_fp8_row(
     tl.store(A_scale + pid, 1.0 / a_scale)
     n_offset = tl.arange(0, BLOCK_SIZE)

-    for _k in range(0, tl.cdiv(K, BLOCK_SIZE)):
+    # Write quantized values for the first K elements (from A), padding the rest with zeros up to K_fp8.
+    for _k in range(0, tl.cdiv(K_fp8, BLOCK_SIZE)):
+        # For the first K elements, use A; for the rest, use 0.
+        # Compute the start of the current tile within the row.
+        tile_start = _k * BLOCK_SIZE
+        # Calculate masks for the input (K_in) and output (K_fp8) ranges.
+        mask_in_A = (n_offset + tile_start) < K_in
+        mask_in_A_fp8 = (n_offset + tile_start) < K_fp8
+
+        # Load from A where in range, else 0 (we iterate all the way to K_fp8).
         a = tl.load(
-            A + a_offset_base + n_offset * stride_ak,
-            mask=n_offset < K_in,
+            A + a_offset_base + (n_offset + tile_start) * stride_ak,
+            mask=mask_in_A & mask_in_A_fp8,
             other=0.0,
         )
+        # For elements >= K_in, a is 0.
         a_fp8 = a * a_scale
         # Clamp A to fp8 range to make sure there's no overflow.
         # This is required for AMD. Nvidia's default saturation
         # handles it, but it's nice to have anyway.
         a_fp8 = tl.clamp(a_fp8, -MAX_FP8, MAX_FP8).to(TL_FP8_DTYPE)
+        # For padded elements (>= K_in), a_fp8 is already 0.
         tl.store(
-            A_fp8 + a_fp8_offset_base + n_offset * stride_ok,
+            A_fp8 + a_fp8_offset_base + (n_offset + tile_start) * stride_ok,
             a_fp8,
-            mask=n_offset < K,
+            mask=mask_in_A_fp8,
         )
-        n_offset += BLOCK_SIZE

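
For intuition, the padded loop above is equivalent per row to the following plain-PyTorch sketch. quantize_row_padded is a hypothetical reference helper written for this review, not part of the patch; it assumes row holds the K_in valid input elements and that k_fp8 >= row.numel():

import torch

def quantize_row_padded(row: torch.Tensor, scale: float, k_fp8: int, max_fp8: float) -> torch.Tensor:
    # Quantize the valid elements and zero-pad the row out to k_fp8,
    # mirroring the masked load/store in _kernel_quantize_fp8_row.
    out = torch.zeros(k_fp8, dtype=torch.float32)
    out[: row.numel()] = (row.float() * scale).clamp(-max_fp8, max_fp8)
    return out  # indices >= row.numel() stay zero, matching the kernel's padding
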
@@ -2456,6 +2468,9 @@ def triton_quantize_fp8_row(
     a: Tensor,
     scale_ub: Optional[Tensor] = None,
     zero_start_index_M: Optional[Tensor] = None,
+    pad_rows_to_multiple_of: Optional[
+        int
+    ] = None,  # TODO(yyetim): Add a test case to validate padding and zero fill
 ) -> Tuple[Tensor, Tensor]:
     """
     Call the triton quantize fp8 row kernel to quantize a tensor to fp8 with row-wise scalings.
@@ -2464,6 +2479,7 @@ def triton_quantize_fp8_row(
         a (Tensor): higher precision input tensor of 4 dimensions.
         scale_ub (Tensor): Maximum allowed value for scale.
         zero_start_index_M (Tensor): Indicates number of nonzero elements in each row.
+        pad_rows_to_multiple_of (int): Pad the last dimension of each row up to a multiple of this value. Useful for downstream kernels that require specific sizes (e.g., a multiple of 16).

     Returns:
         torch.Tensor: fp8 scaled tensor.
@@ -2485,7 +2501,18 @@ def triton_quantize_fp8_row(
     pt_dtype, tl_dtype, max_fp8, eps = get_fp8_constants()
     num_rows = a.numel() // a.shape[-1]
     a_scale = torch.empty((num_rows), dtype=torch.float32, device=a.device)
-    a_fp8 = torch.empty(a.shape, device=a.device, dtype=pt_dtype)
+    # If pad_rows_to_multiple_of is provided, pad the last dimension to be a multiple of it.
+    if pad_rows_to_multiple_of is not None:
+        last_dim = a.shape[-1]
+        padded_last_dim = (
+            (last_dim + pad_rows_to_multiple_of - 1) // pad_rows_to_multiple_of
+        ) * pad_rows_to_multiple_of
+        a_fp8 = torch.empty(
+            (*a.shape[:-1], padded_last_dim), device=a.device, dtype=pt_dtype
+        )
+        a_shape = (*a_shape[:-1], padded_last_dim)
+    else:
+        a_fp8 = torch.empty(a.shape, device=a.device, dtype=pt_dtype)

     # If input tensor is sufficiently large, we need to use int64 indexing.
     use_int64 = a.numel() > (2**31 - 1)
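
The padded size above is the standard ceil-division round-up, ((last_dim + m - 1) // m) * m. A small self-contained check of that arithmetic; round_up is a hypothetical helper used only for illustration:

def round_up(last_dim: int, multiple: int) -> int:
    # Round last_dim up to the nearest multiple of `multiple`.
    return ((last_dim + multiple - 1) // multiple) * multiple

assert round_up(100, 16) == 112  # padded up
assert round_up(112, 16) == 112  # already aligned, unchanged
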
@@ -2504,6 +2531,7 @@
         a.shape[1],
         a.shape[2],
         a.shape[3],
+        a_fp8.shape[3],
         a.stride(0),
         a.stride(1),
         a.stride(2),
@@ -2908,6 +2936,7 @@ def quantize_fp8_row(
     zero_start_index_M: Optional[Tensor] = None,
     use_triton: bool = True,
     output_device: Optional[torch.device] = None,
+    pad_rows_to_multiple_of: Optional[int] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize a to fp8 with row-wise scalings and optionally move to output device.
@@ -2928,7 +2957,12 @@ def quantize_fp8_row(
         logger.info("Triton does not support cpu, falling back to torch ops.")
         use_triton = False
     if use_triton:
-        return triton_quantize_fp8_row(a, scale_ub, zero_start_index_M)
+        return triton_quantize_fp8_row(
+            a,
+            scale_ub,
+            zero_start_index_M,
+            pad_rows_to_multiple_of=pad_rows_to_multiple_of,
+        )
     # else use pytorch implementation.
     if not output_device:
         output_device = a.device
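
With the argument now forwarded, a minimal usage sketch of the new padding path. This assumes quantize_fp8_row is imported from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm (the module this diff appears to patch) and a CUDA device is available; since the TODO above notes tests are still pending, treat the shapes as the intended behavior rather than a verified result:

import torch
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_row

a = torch.randn(2, 3, 4, 100, device="cuda", dtype=torch.bfloat16)
a_fp8, a_scale = quantize_fp8_row(a, pad_rows_to_multiple_of=16)
# The last dimension is rounded up to a multiple of 16 and zero-filled past K.
assert a_fp8.shape == (2, 3, 4, 112)
assert a_scale.numel() == 2 * 3 * 4  # one scale per row
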
@@ -2958,18 +2992,28 @@
 def quantize_fp8_row_meta(
     a: Tensor,
     scale_ub: Optional[Tensor] = None,
+    zero_start_index_M: Optional[Tensor] = None,
     use_triton: bool = True,
     output_device: Optional[torch.device] = None,
+    pad_rows_to_multiple_of: Optional[int] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Shape function for torch compile."""
     if output_device is None:
         output_device = a.device
     a_shape = a.shape
-    # Flatten to 2D since each row of each potential batch gets a scale.
     dtype = get_fp8_constants()[0]
-    fake_out = torch.empty(a.shape, device=output_device, dtype=dtype)
+    if pad_rows_to_multiple_of is not None:
+        last_dim = a.shape[-1]
+        padded_last_dim = (
+            (last_dim + pad_rows_to_multiple_of - 1) // pad_rows_to_multiple_of
+        ) * pad_rows_to_multiple_of
+        fake_out = torch.empty(
+            (*a.shape[:-1], padded_last_dim), device=output_device, dtype=dtype
+        )
+    else:
+        fake_out = torch.empty(a.shape, device=output_device, dtype=dtype)
     fake_scale = torch.empty(a_shape[:-1], device=output_device, dtype=torch.float32)
     return fake_out, fake_scale


 @triton.autotune(