Add ping-pong (double) buffer for the loaded B fragment (warp_loaded_frag_B_)

baoqiwen · baoqiwen · commit a29ff4124e97 · 2025-07-15T19:49:05.000+08:00
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h
@@ -209,7 +209,7 @@ class Wint2xMmaMultistage :
     WarpTransformedFragmentA warp_frag_A_[2];
 
     /// Pair of B fragments used to overlap shared memory loads and math instructions
-    WarpLoadedFragmentB warp_loaded_frag_B_;
+    WarpLoadedFragmentB warp_loaded_frag_B_[2];
     WarpTransformedFragmentB warp_frag_B_;
   };
 
@@ -691,10 +691,10 @@ class Wint2xMmaMultistage :
       int warp_k_compute_offset_B = warp_mma_k % Base::kWarpGemmIterationsPerLoadForB;
       int warp_mma_k_for_B = warp_mma_k / Base::kWarpGemmIterationsPerLoadForB;
 
-      if (warp_k_compute_offset_B  == Base::kWarpGemmIterationsPerLoadForB - 1) {
+      if (warp_k_compute_offset_B == Base::kWarpGemmIterationsPerLoadForB - 1) {
         // Load the next warp-tile's B fragment from shared memory
         this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k_for_B + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_);
+        this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[(warp_mma_k_for_B + 1) % 2]);
         ++this->warp_tile_iterator_B_;
 
         warp_dequantizer_.load(pipe_state.warp_frag_local_scale_);
@@ -718,6 +718,16 @@ class Wint2xMmaMultistage :
       //     static_cast<int>(reg_uint8_ptr[14]), static_cast<int>(reg_uint8_ptr[15]),
       //     sizeof_bits<typename PipeState::WarpLoadedFragmentB>::value / 8);
 
+      if (warp_k_compute_offset_B == 0) {
+        warp_dequantizer_.dequantize(pipe_state.warp_frag_local_scale_,
+                                     pipe_state.warp_frag_code_scale_,
+                                     pipe_state.warp_frag_code_zp_,
+                                     pipe_state.warp_frag_super_scale_,
+                                     pipe_state.warp_loaded_frag_B_[warp_mma_k_for_B % 2],
+                                     pipe_state.warp_frag_B_,
+                                     (stage - Base::kStages + 2) * Shape::kK);
+      }
+
       if (Detail::kStagedAccumulation) {
         //CUTLASS_TRACE_DEVICE(" [MMa-kStagedAccumulation][stage=%d] warp_mma_k=%d, warp_k_compute_offset_B=%d", stage, warp_mma_k, warp_k_compute_offset_B);
         warp_mma_(
@@ -814,16 +824,6 @@ class Wint2xMmaMultistage :
         iterator_B.clear_mask(gemm_k_iterations == 0);
         quant_params_accessor_B_.clear_mask(mma_quant_args, gemm_k_iterations == 0);
       }
-
-      if (warp_k_compute_offset_B == Base::kWarpGemmIterationsPerLoadForB - 1) {
-        warp_dequantizer_.dequantize(pipe_state.warp_frag_local_scale_,
-                                     pipe_state.warp_frag_code_scale_,
-                                     pipe_state.warp_frag_code_zp_,
-                                     pipe_state.warp_frag_super_scale_,
-                                     pipe_state.warp_loaded_frag_B_,
-                                     pipe_state.warp_frag_B_,
-                                     (stage - Base::kStages + 2) * Shape::kK);
-      }
     }
   }
 
@@ -861,7 +861,7 @@ class Wint2xMmaMultistage :
 
     // Load first warp-tile's B fragment from shared memory
     this->warp_tile_iterator_B_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_);
+    this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[0]);
     ++this->warp_tile_iterator_B_;
 
 #if 0
@@ -907,14 +907,6 @@ class Wint2xMmaMultistage :
     }
 #endif
 
-    warp_dequantizer_.dequantize(pipe_state.warp_frag_local_scale_,
-                                 pipe_state.warp_frag_code_scale_,
-                                 pipe_state.warp_frag_code_zp_,
-                                 pipe_state.warp_frag_super_scale_,
-                                 pipe_state.warp_loaded_frag_B_,
-                                 pipe_state.warp_frag_B_,
-                                 0);
-
 #if 0
     if (TransformBAfterLDS::result_type::kElements == 64) {
       CUTLASS_TRACE_DEVICE(" TransformBAfterLDS::result_type::kElements: 64, %d bytes", sizeof_bits<typename TransformBAfterLDS::result_type>::value / 8);