
Commit 3cea487: 2025-09-19 nightly release (9b66962)

Author: pytorchbot
1 parent: 5206781

46 files changed (+88, -2669 lines changed)


fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py

Lines changed: 1 addition & 27 deletions
@@ -486,16 +486,10 @@ def print_kernels(kernels: Optional[List[str]]) -> List[QuantizeOpBase]:
     default=None,
     help="If set with grouped mode, repeat input shapes this many times. Comma separated list of groups to benchmark",
 )
-@click.option(
-    "--total-K",
-    default=None,
-    help="If set, adjusts the K values to sum to this number. "
-    "This can help simulate real grouped workloads in backward wgrad.",
-)
 @click.option(
     "--total-M",
     default=None,
-    help="If set, adjusts the M values to sum to this number. "
+    help="If set, Adjusts the M values to sum to this number. "
     "This can help simulate real grouped workloads.",
 )
 @click.option(
@@ -548,7 +542,6 @@ def invoke_main(
     pair_nk: bool,
     grouped: bool,
     groups: Optional[str],
-    total_k: Optional[str],
     total_m: Optional[str],
     no_cuda_graph: bool,
     use_rotating_buffer_bench: bool,
@@ -560,14 +553,6 @@ def invoke_main(
 ):
     if enable_amd_env_vars:
         set_amd_env_vars()
-
-    # Validate that total_m and total_k are mutually exclusive
-    if total_m is not None and total_k is not None:
-        raise ValueError(
-            "total_m and total_k cannot be specified at the same time. "
-            "Please provide only one of them."
-        )
-
     # If kernel filter is provided, parse it. Else, benchmark all kernels.
     all_kernels = kernels.strip().split(",") if kernels else None
     quantize_ops = collect_kernels_to_profile(all_kernels)
@@ -644,17 +629,6 @@ def invoke_main(
             for g in groups_list
             for b, _, n, k in MNK
         ]
-    elif total_k:
-        MNK = [
-            [
-                [b] * g,
-                [m] * g,
-                [n] * g,
-                generate_group_tensor(g, int(total_k)),
-            ]
-            for g in groups_list
-            for b, m, n, _ in MNK
-        ]
     else:
         MNK = [
             [[b] * g, [m] * g, [n] * g, [k] * g]
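For context, the removed `--total-K` branch used `generate_group_tensor` to distribute a single target sum across the benchmark groups; the surviving `--total-M` flag describes the analogous adjustment for M ("adjusts the M values to sum to this number"). Below is a minimal sketch of that split-a-total-across-groups idea; the helper is illustrative only and is not FBGEMM's actual `generate_group_tensor`.

```python
import random
from typing import List

def split_total_across_groups(total: int, groups: int) -> List[int]:
    """Illustrative only: split `total` into `groups` positive parts that sum to `total`."""
    if groups <= 1:
        return [total]
    # Random cut points turn the interval [0, total] into `groups` segments.
    cuts = sorted(random.sample(range(1, total), groups - 1))
    bounds = [0] + cuts + [total]
    return [hi - lo for lo, hi in zip(bounds, bounds[1:])]

# Example: simulate a grouped workload whose M values sum to 8192 over 4 groups.
m_values = split_total_across_groups(8192, 4)
assert sum(m_values) == 8192
```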

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 1 addition & 47 deletions
@@ -2084,7 +2084,7 @@ def cuda(self) -> bool:
 @register_quantize_op
 class BF16GroupedGrad(QuantizeOpBase):
     """
-    BF16 grouped matmul with dgrad inputs in pretraining backed by cutlass
+    BF16 grouped matmul with grad inputs backed by cutlass
     """

     def preprocess(self, x, w):
@@ -2126,52 +2126,6 @@ def cuda(self) -> bool:
         return True


-@register_quantize_op
-class BF16GroupedWGrad(QuantizeOpBase):
-    """
-    BF16 grouped matmul with wgrad inputs in pretraining backed by cutlass
-    """
-
-    def preprocess(self, x, w):
-        # Get K values for each group
-        k_values = [xi.shape[1] for xi in x]  # K dimension for each group
-
-        # Convert k_values into sizes tensor
-        k_sizes = torch.tensor(k_values).to(dtype=torch.int64, device=x[0].device)
-
-        x = torch.concat(x, dim=1).contiguous()  # shape: (M, G*K)
-        w = torch.concat(w, dim=1).contiguous()  # shape: (N, G*K)
-
-        # Transpose the follows to simulate wgrad shapes
-        x = x.t().contiguous()  # shape: (G*K, M)
-        w = w.t().contiguous()  # shape: (G*K, N)
-
-        # Return processed tensors
-        return x, w, k_sizes
-
-    def quantize(self, x, w, k_sizes):
-        return x, w, k_sizes
-
-    def compute(self, x, w, k_sizes):
-        return torch.ops.fbgemm.bf16bf16bf16_grouped_wgrad(x, w, k_sizes)
-
-    def quantize_and_compute(self, x, w, k_sizes):
-        x, w, k_sizes = self.quantize(x, w, k_sizes)
-        return self.compute(x, w, k_sizes)
-
-    @property
-    def name(self) -> str:
-        return "bf16_grouped_wgrad"
-
-    @property
-    def hip(self) -> bool:
-        return False
-
-    @property
-    def cuda(self) -> bool:
-        return True
-
-
 @register_quantize_op
 class BF16GroupedStacked(QuantizeOpBase):
     """

fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache_convert.cu

Lines changed: 12 additions & 12 deletions
@@ -23,6 +23,7 @@
 #endif

 #include "fbgemm_gpu/utils/cuda_block_count.h"
+#include "fbgemm_gpu/utils/kernel_launcher.cuh"
 #include "fbgemm_gpu/utils/vec_quant.cuh"

 #include <torch/torch.h>
@@ -47,12 +48,12 @@ namespace fbgemm_gpu {
  * 32-63 to convert the V tensors. NV only has threads 0-31 per warp.
  */
 __global__ void convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace_kernel(
-    at::PackedTensorAccessor64<uint8_t, 5, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor64<uint8_t, 5, at::RestrictPtrTraits>
         cache_K, // [N_H_L][B][MAX_T][N_KVH][D_H]
-    at::PackedTensorAccessor64<uint8_t, 5, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor64<uint8_t, 5, at::RestrictPtrTraits>
         cache_V, // [N_H_L][B][MAX_T][N_KVH][D_H]
-    at::PackedTensorAccessor64<int32_t, 5, at::RestrictPtrTraits> qparam_K,
-    at::PackedTensorAccessor64<int32_t, 5, at::RestrictPtrTraits> qparam_V) {
+    pta::PackedTensorAccessor64<int32_t, 5, at::RestrictPtrTraits> qparam_K,
+    pta::PackedTensorAccessor64<int32_t, 5, at::RestrictPtrTraits> qparam_V) {
   auto N_KVH = cache_K.size(3);
   auto MAX_T = cache_K.size(2);
   auto D_H = cache_K.size(4);
@@ -133,17 +134,16 @@ void convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace(
   dim3 blocks(N_H_L, B, std::max<int32_t>(1, kMaxBlocks / (B * N_H_L)));
   dim3 threads(kThreadsPerWarp, kWarpsPerBlock);

-  convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace_kernel<<<
+  FBGEMM_LAUNCH_KERNEL(
+      (convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace_kernel),
       blocks,
       threads,
       0,
-      at::cuda::getCurrentCUDAStream()>>>(
-      cache_K.packed_accessor64<uint8_t, 5, at::RestrictPtrTraits>(),
-      cache_V.packed_accessor64<uint8_t, 5, at::RestrictPtrTraits>(),
-      qparam_K.packed_accessor64<int32_t, 5, at::RestrictPtrTraits>(),
-      qparam_V.packed_accessor64<int32_t, 5, at::RestrictPtrTraits>());
-
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
+      at::cuda::getCurrentCUDAStream(),
+      PTA_B(cache_K, uint8_t, 5, 64),
+      PTA_B(cache_V, uint8_t, 5, 64),
+      PTA_B(qparam_K, int32_t, 5, 64),
+      PTA_B(qparam_V, int32_t, 5, 64));
 }
 #else
 void convert_e4m3fn_kv_cache_to_e4m3fnuz_inplace(
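For background, this kernel rewrites FP8 KV-cache bytes in place because NVIDIA's e4m3fn and AMD's e4m3fnuz encodings assign different values to the same bit pattern (they differ in exponent bias and zero/NaN handling). A small illustration, assuming a PyTorch build that exposes both float8 dtypes:

```python
import torch

# Reinterpret the same raw byte under both FP8 encodings.
raw = torch.tensor([0x40], dtype=torch.uint8)
print(raw.view(torch.float8_e4m3fn).float())    # 2.0 (exponent bias 7)
print(raw.view(torch.float8_e4m3fnuz).float())  # 1.0 (exponent bias 8)
```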
