Commit bbd42d3 ("fix the conflict")
Parent: e789537
7 files changed: +103 -18 lines

csrc/gpu/moe/fused_moe/cutlass_kernels/moe_gemm/fused_moe_cutlass_kernel.h

Lines changed: 12 additions & 3 deletions

@@ -211,6 +211,8 @@ struct MoeFCGemm {
     // Only used by device-level operator
     GemmCoord* host_problem_sizes;
 
+    int group_size;
+
     //
     // Methods
     //
@@ -220,6 +222,7 @@
     Arguments()
         : problem_count(0),
           threadblock_count(0),
+          group_size(-1),
           ptr_A(nullptr),
           ptr_B(nullptr),
           weight_scales(nullptr),
@@ -243,10 +246,12 @@
               int64_t* total_rows_before_expert,
               int64_t gemm_n,
               int64_t gemm_k,
+              int group_size,
               GemmCoord* host_problem_sizes = nullptr)
         : problem_count(problem_count),
           threadblock_count(threadblock_count),
           output_op(output_op),
+          group_size(group_size),
           ptr_A(const_cast<ElementA*>(ptr_A)),
           ptr_B(const_cast<ElementB*>(ptr_B)),
           weight_scales(const_cast<ElementScale*>(weight_scales)),
@@ -280,6 +285,8 @@
     ElementC* ptr_C;
     ElementC* ptr_D;
 
+    int group_size;
+
     //
     // Methods
     //
@@ -290,7 +297,8 @@
           ptr_B(nullptr),
           weight_scales(nullptr),
           ptr_C(nullptr),
-          ptr_D(nullptr) {}
+          ptr_D(nullptr),
+          group_size(-1) {}
 
     CUTLASS_HOST_DEVICE
     Params(Arguments const& args,
@@ -308,7 +316,8 @@
           ptr_B(args.ptr_B),
           weight_scales(args.weight_scales),
           ptr_C(args.ptr_C),
-          ptr_D(args.ptr_D) {}
+          ptr_D(args.ptr_D),
+          group_size(args.group_size) {}
 
     CUTLASS_HOST_DEVICE
     void update(Arguments const& args,
@@ -498,7 +507,7 @@
     auto CreateMMA = [&]() {
       if constexpr (use_dq_gemm<Mma>::value)
         return Mma(
-            shared_storage.main_loop, -1, thread_idx, warp_idx, lane_idx);
+            shared_storage.main_loop, params.group_size, thread_idx, warp_idx, lane_idx);
       else
         return Mma(
             shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
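Note: group_size here is the weight-only quantization group width along the reduction (K) dimension: every group_size weights share one dequantization scale, and -1 keeps the old per-channel behaviour (one scale per output column), which is why both constructors default to -1 and why CreateMMA previously hardcoded it. A minimal sketch of the scheme this parameter describes, using hypothetical names rather than the actual CUTLASS mainloop code:

// Hypothetical illustration (not the CUTLASS mainloop): group-wise
// weight-only dequantization of one K-length column of B.
#include <cstdint>
#include <vector>

std::vector<float> dequantize_column(const std::vector<int8_t>& q,  // quantized weights, length K
                                     const std::vector<float>& scales,
                                     int group_size) {
  const int k = static_cast<int>(q.size());
  const int gs = (group_size == -1) ? k : group_size;  // -1: one scale per column
  std::vector<float> w(k);
  for (int i = 0; i < k; ++i) {
    w[i] = static_cast<float>(q[i]) * scales[i / gs];  // scale shared within each group
  }
  return w;
}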

csrc/gpu/moe/fused_moe/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels.h

Lines changed: 4 additions & 0 deletions

@@ -37,6 +37,7 @@ class MoeGemmRunner {
                          int64_t gemm_k,
                          int num_experts,
                          std::string activation_type,
+                         const int32_t weightonly_group_size,
                          cudaStream_t stream);
 
   void moe_gemm(const T* A,
@@ -48,6 +49,7 @@
                 int64_t gemm_n,
                 int64_t gemm_k,
                 int num_experts,
+                int group_size,
                 cudaStream_t stream);
 
  private:
@@ -62,6 +64,7 @@
                         int64_t gemm_n,
                         int64_t gemm_k,
                         int num_experts,
+                        int group_size,
                         CutlassGemmConfig gemm_config,
                         cudaStream_t stream,
                         int* occupancy = nullptr);
@@ -77,6 +80,7 @@
                 int64_t gemm_n,
                 int64_t gemm_k,
                 int num_experts,
+                int group_size,
                 cudaStream_t stream);
 
  private:
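Note: every public entry point of MoeGemmRunner now takes the group size explicitly instead of assuming per-channel scales. A caller-side sketch of validating the value before it reaches the runner; checked_group_size is a hypothetical helper, not part of this PR:

// Hypothetical pre-flight check before calling MoeGemmRunner::moe_gemm:
// group-wise scales only make sense when K splits evenly into groups.
#include <cassert>
#include <cstdint>

inline int32_t checked_group_size(int32_t weightonly_group_size, int64_t gemm_k) {
  if (weightonly_group_size == -1) return -1;    // per-channel, the pre-PR default
  assert(gemm_k % weightonly_group_size == 0);   // whole groups along K
  return weightonly_group_size;
}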

csrc/gpu/moe/fused_moe/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_template.h

Lines changed: 23 additions & 3 deletions

@@ -66,6 +66,7 @@ void generic_moe_gemm_kernelLauncher(const T* A,
                                      int64_t gemm_n,
                                      int64_t gemm_k,
                                      int num_experts,
+                                     int group_size,
                                      CutlassGemmConfig gemm_config,
                                      const int multi_processor_count,
                                      cudaStream_t stream,
@@ -191,7 +192,8 @@ void generic_moe_gemm_kernelLauncher(const T* A,
       reinterpret_cast<ElementType*>(C),
       total_rows_before_expert,
       gemm_n,
-      gemm_k);
+      gemm_k,
+      group_size);
 
   GemmGrouped gemm;
 
@@ -237,6 +239,7 @@ struct dispatch_stages {
                         int64_t gemm_n,
                         int64_t gemm_k,
                         int num_experts,
+                        int group_size,
                         CutlassGemmConfig gemm_config,
                         int multi_processor_count,
                         cudaStream_t stream,
@@ -271,6 +274,7 @@ struct dispatch_stages<T,
                         int64_t gemm_n,
                         int64_t gemm_k,
                         int num_experts,
+                        int group_size,
                         CutlassGemmConfig gemm_config,
                         int multi_processor_count,
                         cudaStream_t stream,
@@ -290,6 +294,7 @@ struct dispatch_stages<T,
         gemm_n,
         gemm_k,
         num_experts,
+        group_size,
         gemm_config,
         multi_processor_count,
         stream,
@@ -320,6 +325,7 @@ struct dispatch_stages<T,
                         int64_t gemm_n,
                         int64_t gemm_k,
                         int num_experts,
+                        int group_size,
                         CutlassGemmConfig gemm_config,
                         int multi_processor_count,
                         cudaStream_t stream,
@@ -339,6 +345,7 @@ struct dispatch_stages<T,
         gemm_n,
         gemm_k,
         num_experts,
+        group_size,
         gemm_config,
         multi_processor_count,
         stream,
@@ -361,6 +368,7 @@ void dispatch_gemm_config(const T* A,
                           int64_t gemm_n,
                           int64_t gemm_k,
                           int num_experts,
+                          int group_size,
                           CutlassGemmConfig gemm_config,
                           int multi_processor_count,
                           cudaStream_t stream,
@@ -382,6 +390,7 @@ void dispatch_gemm_config(const T* A,
         gemm_n,                 \
         gemm_k,                 \
         num_experts,            \
+        group_size,             \
         gemm_config,            \
         multi_processor_count,  \
         stream,                 \
@@ -419,6 +428,7 @@ void dispatch_gemm_config(const T* A,
         gemm_n,                 \
         gemm_k,                 \
         num_experts,            \
+        group_size,             \
         gemm_config,            \
         multi_processor_count,  \
         stream,                 \
@@ -444,6 +454,7 @@ void dispatch_moe_gemm_to_cutlass(const T* A,
                                   int64_t gemm_n,
                                   int64_t gemm_k,
                                   int num_experts,
+                                  int group_size,
                                   CutlassGemmConfig gemm_config,
                                   int sm_version,
                                   int multi_processor_count,
@@ -489,6 +500,7 @@ void dispatch_moe_gemm_to_cutlass(const T* A,
                                   int64_t gemm_n,
                                   int64_t gemm_k,
                                   int num_experts,
+                                  int group_size,
                                   CutlassGemmConfig gemm_config,
                                   int sm_version,
                                   int multi_processor_count,
@@ -555,6 +567,7 @@ void dispatch_moe_gemm_to_cutlass(const T* A,
                                   int64_t gemm_n,
                                   int64_t gemm_k,
                                   int num_experts,
+                                  int group_size,
                                   CutlassGemmConfig gemm_config,
                                   int sm_version,
                                   int multi_processor_count,
@@ -602,6 +615,7 @@ void MoeGemmRunner<T, WeightType>::dispatch_to_arch<EpilogueTag>(
     int64_t gemm_n,
     int64_t gemm_k,
     int num_experts,
+    int group_size,
     CutlassGemmConfig gemm_config,
     cudaStream_t stream,
     int* occupancy) {
@@ -617,6 +631,7 @@ void MoeGemmRunner<T, WeightType>::dispatch_to_arch<EpilogueTag>(
         gemm_n,                  \
         gemm_k,                  \
         num_experts,             \
+        group_size,              \
         gemm_config,             \
         sm_,                     \
         multi_processor_count_,  \
@@ -647,11 +662,12 @@ void MoeGemmRunner<T, WeightType>::run_gemm<EpilogueTag>(
     int64_t gemm_n,
     int64_t gemm_k,
     int num_experts,
+    int group_size,
     cudaStream_t stream) {
   static constexpr bool is_weight_only = !std::is_same<T, WeightType>::value;
   static constexpr bool only_simt_configs = std::is_same<T, float>::value;
   std::vector<CutlassGemmConfig> candidate_configs =
-      get_candidate_configs(sm_, -1, is_weight_only, only_simt_configs, true);
+      get_candidate_configs(sm_, group_size, is_weight_only, only_simt_configs, true);
   static constexpr int warm_time = 5;
   static constexpr int test_time = 10;
   auto& gemmConfigManager = GemmConfigManager::Instance();
@@ -670,7 +686,6 @@ void MoeGemmRunner<T, WeightType>::run_gemm<EpilogueTag>(
   int profile_total_rows =
       std::min(gemmConfigManager.nextPowerOfTwo(total_rows),
                gemmConfigManager.getMaxProfileM());
-
   for (size_t ii = 0; ii < candidate_configs.size(); ++ii) {
     for (int i = 0; i < warm_time; i++) {
       dispatch_to_arch<EpilogueTag>(A,
@@ -748,6 +763,7 @@ void MoeGemmRunner<T, WeightType>::moe_gemm_bias_act(
     int64_t gemm_k,
     int num_experts,
     std::string activation_type,
+    const int32_t weightonly_group_size,
     cudaStream_t stream) {
   if (activation_type == "none") {
     if (biases) {
@@ -761,6 +777,7 @@
                                gemm_n,
                                gemm_k,
                                num_experts,
+                               weightonly_group_size,
                                stream);
     } else {
       run_gemm<EpilogueOpNoBias>(A,
@@ -773,6 +790,7 @@
                                  gemm_n,
                                  gemm_k,
                                  num_experts,
+                                 weightonly_group_size,
                                  stream);
     }
   }
@@ -788,6 +806,7 @@ void MoeGemmRunner<T, WeightType>::moe_gemm(const T* A,
                                             int64_t gemm_n,
                                             int64_t gemm_k,
                                             int num_experts,
+                                            int group_size,
                                             cudaStream_t stream) {
   run_gemm<EpilogueOpNoBias>(A,
                              B,
@@ -799,5 +818,6 @@ void MoeGemmRunner<T, WeightType>::moe_gemm(const T* A,
                              gemm_n,
                              gemm_k,
                              num_experts,
+                             group_size,
                              stream);
 }
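Note: beyond parameter plumbing, the one behavioural change in this file is in run_gemm, where get_candidate_configs now receives the real group size instead of a hardcoded -1, so the auto-tuning loop only profiles kernel configurations valid for group-wise scales. A hypothetical compatibility predicate illustrating why the config search needs this value (the actual get_candidate_configs logic is not shown in this diff):

// Hypothetical filter: a threadblock tile's K extent should align with the
// quantization groups so a scale group is never split awkwardly across tiles.
bool tile_compatible(int tile_k, int group_size) {
  if (group_size == -1) return true;  // per-channel: any tile shape works
  return group_size % tile_k == 0 || tile_k % group_size == 0;
}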

csrc/gpu/moe/fused_moe/fused_moe.cu

Lines changed: 6 additions & 0 deletions

@@ -57,6 +57,7 @@ void FusedMoeKernel(const paddle::Tensor& input,
                     const paddle::optional<paddle::Tensor>& ffn2_scale,
                     const paddle::optional<paddle::Tensor>& ffn2_bias,
                     const std::string& quant_method,
+                    const int weightonly_group_size,
                     const int moe_topk,
                     const bool group_moe,
                     const bool norm_topk_prob,
@@ -86,6 +87,7 @@ void FusedMoeKernel(const paddle::Tensor& input,
       ffn2_scale ? ffn2_scale.get_ptr() : nullptr,
       ffn2_bias ? ffn2_bias.get_ptr() : nullptr,
       nullptr,
+      weightonly_group_size,
       moe_topk,
       group_moe,
       norm_topk_prob,
@@ -105,6 +107,7 @@ std::vector<paddle::Tensor> FusedExpertMoe(
     const paddle::optional<paddle::Tensor>& ffn2_bias,
     const paddle::optional<paddle::Tensor>& ffn2_scale,
     const std::string& quant_method,
+    const int weightonly_group_size,
     const int moe_topk,
     const bool norm_topk_prob,
     const bool group_moe) {
@@ -122,6 +125,7 @@ std::vector<paddle::Tensor> FusedExpertMoe(
                     ffn2_scale,
                     ffn2_bias,
                     quant_method,
+                    weightonly_group_size,
                     moe_topk,
                     group_moe,
                     norm_topk_prob,
@@ -137,6 +141,7 @@ std::vector<paddle::Tensor> FusedExpertMoe(
                     ffn2_scale,
                     ffn2_bias,
                     quant_method,
+                    weightonly_group_size,
                     moe_topk,
                     group_moe,
                     norm_topk_prob,
@@ -184,6 +189,7 @@ PD_BUILD_OP(fused_expert_moe)
                 paddle::Optional("ffn2_scale")})
     .Outputs({"output"})
     .Attrs({"quant_method:std::string",
+            "weightonly_group_size:int",
             "moe_topk:int",
             "norm_topk_prob:bool",
             "group_moe:bool"})

csrc/gpu/moe/fused_moe/moe/fused_moe_helper.h

Lines changed: 7 additions & 0 deletions

@@ -126,6 +126,7 @@ class MoeHelper {
                  const paddle::Tensor *ffn2_scale,
                  const paddle::Tensor *ffn2_bias,
                  const paddle::Tensor *moe_token_type_ids,
+                 const int weightonly_group_size,
                  const int moe_topk,
                  const bool group_moe,
                  const bool norm_topk_prob,
@@ -304,6 +305,7 @@ class MoeHelper {
           hidden_size,
           num_experts,
           "none",
+          weightonly_group_size,
           stream);
     } else if (gemm_method_ == "weight_only_int4") {
       int4_moe_gemm_runner_->moe_gemm_bias_act(
@@ -319,6 +321,7 @@ class MoeHelper {
           hidden_size,
           num_experts,
           "none",
+          weightonly_group_size,
           stream);
     } else {
       fp16_moe_gemm_runner_->moe_gemm_bias_act(
@@ -333,6 +336,7 @@ class MoeHelper {
           hidden_size,
           num_experts,
           "none",
+          weightonly_group_size,
           stream);
     }
 
@@ -356,6 +360,7 @@ class MoeHelper {
           hidden_size,
           inter_size / 2,
           num_experts,
+          weightonly_group_size,
           stream);
     } else if (gemm_method_ == "weight_only_int4") {
       int4_moe_gemm_runner_->moe_gemm(
@@ -369,6 +374,7 @@ class MoeHelper {
           hidden_size,
           inter_size / 2,
           num_experts,
+          weightonly_group_size,
           stream);
     } else {
       fp16_moe_gemm_runner_->moe_gemm(
@@ -381,6 +387,7 @@ class MoeHelper {
           hidden_size,
           inter_size / 2,
           num_experts,
+          weightonly_group_size,
           stream);
     }
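Note: MoeHelper forwards weightonly_group_size unchanged to whichever runner gemm_method_ selects (weight_only_int8, weight_only_int4, or the fp16 fallback). One bookkeeping consequence, assumed here rather than shown in the diff: group-wise quantization grows each expert's scale tensor from one scale per output channel to k / group_size scales per output channel. A small sketch of that accounting:

// Assumed scale bookkeeping for group-wise weight-only MoE quantization
// (shapes illustrative; the scale layout is not part of this diff):
//   per-channel (group_size == -1): scales[num_experts][n]
//   group-wise  (group_size == g):  scales[num_experts][k / g][n]
#include <cstdint>

inline int64_t scale_rows_per_expert(int64_t k, int group_size) {
  return (group_size == -1) ? 1 : k / group_size;  // scale rows along K
}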
