support swapAB for m_grouped_fp8_gemm_nt_masked

root · root · commit f2e2357fb642 · 2025-09-11T22:14:35.000+08:00
diff --git a/csrc/jit_kernels/heuristics/common.hpp b/csrc/jit_kernels/heuristics/common.hpp
@@ -152,11 +152,17 @@ static GemmConfig get_best_config(const GemmType& gemm_type, const KernelType& k
 
     // Select M/N block sizes
     // TODO: support `% 16 == 8` block size on SM90
-    const auto& block_ms = gemm_type == GemmType::MGroupedContiguous ?
+
+    std::vector<int> block_ms = gemm_type == GemmType::MGroupedContiguous ?
         std::vector{get_mk_alignment_for_contiguous_layout()} : std::vector{64, 128, 256};
     std::vector<int> block_ns;
     for (int i = 16; i <= 256; i += 16)
         block_ns.push_back(i);
+    if(get_env<int>("ENABLE_SWAPAB")){
+        block_ms = std::vector{32};  // 32, 64
+        block_ns = std::vector{256}; // 64, 128, 256 
+    }
+    
 
     // K block size is selected in a fixed manner
     const auto& block_k = 128 / static_cast<int>(c10::elementSize(ab_dtype));
diff --git a/csrc/jit_kernels/heuristics/sm90.hpp b/csrc/jit_kernels/heuristics/sm90.hpp
@@ -42,9 +42,15 @@ struct SM90ArchSpec {
 
         // Too many scaling factors in a single block: `block_n > block_k and std::gcd(block_n, block_k) != block_n - block_k`
         // Or too many register spills
-        if (block_n > 128 and kernel_type == KernelType::Kernel1D2D and (block_n != 144 and block_n != 160 and block_n != 192))
-            return false;
 
+        if(get_env<int>("ENABLE_SWAPAB")){
+            if (block_n != 64 and block_n != 128 and block_n != 256)
+                return false;
+        }else{
+           if (block_n > 128 and kernel_type == KernelType::Kernel1D2D and (block_n != 144 and block_n != 160 and block_n != 192))
+                return false;
+        }
+        
         // Avoid bank conflicts for FP32 output
         if (cd_dtype == torch::kFloat and block_n % 16 == 0)
             return false;
@@ -77,7 +83,13 @@ struct SM90ArchSpec {
 
     static ThreadConfig get_thread_config(const KernelType& kernel_type,
                                           const int& block_m, const int& block_n) {
-        return ThreadConfig::sm90(128, (block_m == 64 ? 1 : 2) * 128);
+        int tile = 64;  // Block dimension that sizes the second `ThreadConfig::sm90` argument; always overwritten by one of the branches below
+        if(get_env<int>("ENABLE_SWAPAB")){  // Non-zero `ENABLE_SWAPAB`: size by `block_n` instead of `block_m`
+            tile = block_n;
+        }else{
+            tile = block_m;
+        }
+        return ThreadConfig::sm90(128, (tile > 64 ? 2 : 1) * 128);  // Second argument is 256 when tile > 64, else 128. NOTE(review): the removed line used `block_m == 64 ? 1 : 2`, so `block_m < 64` now yields 128 instead of 256 even without swapAB - confirm this is intended
     }
 
     static int get_smem_cd_size(const KernelType& kernel_type,
@@ -102,7 +114,8 @@ struct SM90ArchSpec {
 
     static int get_extra_sfb_smem_size(const int& m, const int& n, const int& k,
                                        const int& block_m, const int& block_n, const int& block_k) {
-        const auto& use_uniform_sfb = block_k % block_n == 0 ? 1 : 2;
+        const auto& use_uniform_sfb = get_env<int>("ENABLE_SWAPAB") ? (block_n / 64):(block_k % block_n == 0 ? 1 : 2);  // Multiplier on the per-K-block float count: `block_n / 64` (integer division) when `ENABLE_SWAPAB` is non-zero, otherwise the previous 1-or-2 rule. NOTE(review): `block_n < 64` gives 0 (no extra bytes) on the swapAB path - presumably excluded by the swapAB block-size filter in this file; confirm
+
         return align<int>(ceil_div(k, block_k) * static_cast<int>(sizeof(float)) * use_uniform_sfb, 8);
     }
 
diff --git a/csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp b/csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp
@@ -29,13 +29,19 @@ class SM90FP8Gemm1D2DRuntime final: public LaunchRuntime<SM90FP8Gemm1D2DRuntime>
     };
 
     static std::string generate_impl(const Args& args) {
+
+    const char* kernel_name =
+        get_env<int>("ENABLE_SWAPAB") ?
+            "swapAB_sm90_fp8_gemm_1d2d_impl" :
+            "sm90_fp8_gemm_1d2d_impl";
+
         return fmt::format(R"(
 #include <deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh>
 
 using namespace deep_gemm;
 
 static void __instantiate_kernel() {{
-    auto ptr = reinterpret_cast<void*>(&sm90_fp8_gemm_1d2d_impl<
+    auto ptr = reinterpret_cast<void*>(&{}<
         {}, {}, {},
         {},
         {}, {}, {},
@@ -47,6 +53,7 @@ static void __instantiate_kernel() {{
     >);
 }};
 )",
+        kernel_name,
         // TODO: add CD dtype
         get_compiled_dim(args.m, 'm', args.compiled_dims), get_compiled_dim(args.n, 'n', args.compiled_dims), get_compiled_dim(args.k, 'k', args.compiled_dims),
         args.num_groups,
diff --git a/deep_gemm/include/deep_gemm/common/sm90_utils.cuh b/deep_gemm/include/deep_gemm/common/sm90_utils.cuh
@@ -144,6 +144,17 @@ struct SM90_U32x2_STSM_N {
     }
 };
 
+template <typename dtype_t>  // dtype_t: value type whose leading 32 bits are stored by copy(); assumes sizeof(dtype_t) >= 4 - TODO confirm at call sites
+struct SM90_U32x2_STSM_T  // Wrapper around the `.trans` form of `stmatrix ... x2`; presumably the transposed sibling of SM90_U32x2_STSM_N - confirm
+{
+    __device__ __forceinline__ static void copy(dtype_t src_0, dtype_t src_1, void* smem_dst)  // Issues one stmatrix that writes src_0 and src_1 to the address in smem_dst
+    {
+        const uint32_t src[2] = {*reinterpret_cast<uint32_t*>(&src_0), *reinterpret_cast<uint32_t*>(&src_1)};  // Reinterpret the first 4 bytes of each argument as a 32-bit register operand
+        asm volatile("stmatrix.sync.aligned.x2.m8n8.shared.b16.trans [%0], {%1, %2};\n" ::"l"(smem_dst), "r"(src[0]),  // Per the PTX ISA: `.x2` = two 8x8 matrices of 16-bit elements, `.trans` = stored transposed; `.sync.aligned` is a warp-wide instruction
+            "r"(src[1]));
+    }
+};
+
 __forceinline__ __device__ void warpgroup_arrive() {
     asm volatile("wgmma.fence.sync.aligned;\n" ::: "memory");
 }
diff --git a/deep_gemm/include/deep_gemm/common/utils.cuh b/deep_gemm/include/deep_gemm/common/utils.cuh
@@ -122,6 +122,12 @@ __device__  __forceinline__ float ld_shared(const float* ptr) {
     return ret;
 }
 
+__device__  __forceinline__ float2 ld_shared(const float2* __restrict__ ptr) {  // float2 overload: reads both components from `ptr` with a single `ld.shared.v2.f32`
+    float2 ret;
+    asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : "l"(ptr));  // NOTE(review): PTX vector loads require the address to be aligned to the full access size (8 bytes here) - callers must guarantee this
+    return ret;  // ret.x holds the first loaded float, ret.y the second
+}
+
 __device__ __forceinline__ void st_shared(const float* ptr, float val) {
     asm volatile("st.shared.f32 [%0], %1;" :: "l"(ptr), "f"(val));
 }
diff --git a/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh b/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh
diff --git a/tests/generators.py b/tests/generators.py

Original file line number	Diff line number	Diff line change
`@@ -122,6 +122,12 @@ __device__ __forceinline__ float ld_shared(const float* ptr) {`
`122`	`122`	`return ret;`
`123`	`123`	`}`
`124`	`124`
	`125`	`+__device__ __forceinline__ float2 ld_shared(const float2* __restrict__ ptr) {`
	`126`	`+ float2 ret;`
	`127`	`+ asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : "l"(ptr));`
	`128`	`+ return ret;`
	`129`	`+}`
	`130`	`+`
`125`	`131`	`__device__ __forceinline__ void st_shared(const float* ptr, float val) {`
`126`	`132`	`asm volatile("st.shared.f32 [%0], %1;" :: "l"(ptr), "f"(val));`
`127`	`133`	`}`