
Commit b536f8e

Author: root
Commit message: fix tune
1 parent 9c85186 commit b536f8e

File tree: 9 files changed, +68 -76 lines


csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h

Lines changed: 3 additions & 4 deletions

@@ -296,16 +296,15 @@ struct MoeFCGemm
 
     static Status can_implement(Arguments const& args)
     {
-        std::cout << "modified can_implement" << std::endl;
         if (platform::is_same<uint8_t, ElementB>::value || platform::is_same<uint4b_t, ElementB>::value)
         {
             if (args.weight_scales == nullptr)
             {
                 // CUTLASS_TRACE_HOST("MoeFCGemm::can_implement() - weight scales are required for uint8_t and uint4b_t");
                 printf("MoeFCGemm::can_implement() - weight scales are required for uint8_t and uint4b_t \n");
-                printf("temporarily changed to success \n");
-                return Status::kSuccess;
-                // return Status::kInvalid;
+                // printf("temporarily changed to success \n");
+                // return Status::kSuccess;
+                return Status::kInvalid;
             }
         }
         else if (args.weight_scales != nullptr)
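Note: this restores the original contract that weight-only (uint8_t / uint4b_t) arguments without weight scales are rejected. A minimal sketch of how the host-side launcher consumes that status, paraphrased from the launcher code touched later in this same commit (not a new API):

    // With kInvalid restored, a missing weight_scales pointer now fails fast at launch time
    // instead of silently launching an unscaled grouped GEMM.
    GemmGrouped gemm;
    auto can_implement = gemm.can_implement(args);
    PADDLE_ENFORCE(can_implement == cutlass::Status::kSuccess,
        "MoE FC kernel will fail for params.");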

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h

Lines changed: 4 additions & 4 deletions

@@ -53,10 +53,10 @@ enum class CutlassTileConfig
 
     // Warp configs for M=128
     CtaShape128x64x64_WarpShape64x32x64, // 9
-    CtaShape128x128x64_WarpShape64x32x64,
-    CtaShape128x128x64_WarpShape64x64x64,
-    CtaShape128x128x64_WarpShape128x32x64,
-    CtaShape128x256x64_WarpShape64x64x64,
+    CtaShape128x128x64_WarpShape64x32x64, // 10
+    CtaShape128x128x64_WarpShape64x64x64, // 11
+    CtaShape128x128x64_WarpShape128x32x64, // 12
+    CtaShape128x256x64_WarpShape64x64x64, // 13
 
     // Warp configs for M=256
     CtaShape256x128x64_WarpShape64x64x64,
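Note: the added "// 10" through "// 13" comments only record each entry's ordinal, matching the existing "// 9" annotation, so the raw integers printed by the tuner can be mapped back to a tile shape by eye. A tiny illustration (assuming, as the comments imply, the enum uses default consecutive values):

    // The profiler in moe/moe.cu prints the chosen tile config as its ordinal:
    auto tile = cutlass_extensions::CutlassTileConfig::CtaShape128x128x64_WarpShape64x64x64;
    std::cout << "tile_config : " << static_cast<int>(tile) << std::endl; // 11, per the annotations above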

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp

Lines changed: 3 additions & 3 deletions

@@ -153,10 +153,10 @@ std::vector<CutlassTileConfig> get_candidate_tiles(
     case CutlassGemmType::WeightOnly:
         if (sm >= 75)
         {
-            std::cout << "added some configs" << std::endl;
+            std::cout << "all configs" << std::endl;
             return {
-                // CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64, // these two configs are relatively slow
-                // CutlassTileConfig::CtaShape16x256x64_WarpShape16x64x64,
+                CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64, // these two configs are relatively slow
+                CutlassTileConfig::CtaShape16x256x64_WarpShape16x64x64,
                 CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64,
                 CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64,
                 CutlassTileConfig::CtaShape64x128x64_WarpShape64x64x64,
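Note: re-enabling the two CtaShape16x... entries only widens the candidate list returned by get_candidate_tiles; the GEMM profiler still times every candidate and keeps the fastest, so slower shapes simply lose the timing race instead of being hard-coded out. A minimal sketch of that selection step, paraphrasing the profiling loop in moe/moe.cu (runSingleProfile and Profile come from that file; std::numeric_limits needs <limits>):

    float best_time = std::numeric_limits<float>::max();
    int best_profile_id = -1;
    for (int i = 0; i < static_cast<int>(profiles.size()); ++i)
    {
        // Each candidate tile/stage combination is timed on the real problem size m.
        float const candidate_time = runSingleProfile(m, profiles[i], profile_workspace, stream);
        if (candidate_time < best_time)
        {
            best_time = candidate_time;
            best_profile_id = i;
        }
    }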

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h

Lines changed: 11 additions & 10 deletions

@@ -173,9 +173,6 @@ struct genericMoeGemmKernelLauncher
     PADDLE_ENFORCE(occupancy > 0, "GPU lacks the shared memory resources to run GroupedGEMM kernel");
     int const threadblock_count = multi_processor_count * occupancy;
 
-    if (weight_scales == nullptr) {
-        std::cout << "why don't I understand this anymore !!!!!!!!!" << std::endl;
-    }
     int const group_size = gemm_k;
     typename GemmGrouped::Arguments args(num_experts, threadblock_count, group_size, epilogue_op,
         reinterpret_cast<ElementType const*>(A), reinterpret_cast<CutlassWeightType const*>(B),
@@ -185,7 +182,6 @@ struct genericMoeGemmKernelLauncher
 
     GemmGrouped gemm;
 
-    std::cout << "gemm can_imple" << std::endl;
     auto can_implement = gemm.can_implement(args);
     PADDLE_ENFORCE(can_implement == cutlass::Status::kSuccess,
         "MoE FC kernel will fail for params.");
@@ -268,7 +264,6 @@ void dispatchGemmConfig(T const* A, WeightType const* B, GemmOutputType const* w
     cutlass_extensions::CutlassGemmConfig gemm_config, int multi_processor_count, bool use_fused_moe,
     float const** alpha_scale_ptr_array, cudaStream_t stream, int* occupancy = nullptr)
 {
-    // std::cout << "changed it to 3,3,3,3 again" << std::endl;
     switch (gemm_config.stages)
     {
     case 2:
@@ -397,6 +392,12 @@ void dispatchMoeGemmToCutlass(T const* A, WeightType const* B, GemmOutputType co
             use_fused_moe, alpha_scale_ptr_array, stream, occupancy);
         break;
     // newly added
+    case cutlass_extensions::CutlassTileConfig::CtaShape64x128x64_WarpShape64x64x64:
+        dispatchGemmConfig<T, WeightType, GemmOutputType, arch, EpilogueTag, cutlass::gemm::GemmShape<64, 128, 64>,
+            cutlass::gemm::GemmShape<64, 64, 64>>(A, B, weight_scales, biases, bias_is_broadcast, C,
+            total_tokens_including_expert, total_rows, gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count,
+            use_fused_moe, alpha_scale_ptr_array, stream, occupancy);
+        break;
     case cutlass_extensions::CutlassTileConfig::CtaShape64x128x64_WarpShape32x64x64:
         dispatchGemmConfig<T, WeightType, GemmOutputType, arch, EpilogueTag, cutlass::gemm::GemmShape<64, 128, 64>,
             cutlass::gemm::GemmShape<32, 64, 64>>(A, B, weight_scales, biases, bias_is_broadcast, C,
@@ -844,11 +845,11 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::moeGemmBiasAct(T c
             total_tokens_including_expert, hopper_input, total_rows, gemm_n, gemm_k, num_experts, use_fused_moe,
             alpha_scale_ptr_array, stream, chosen_conf);
         break;
-    case ActivationType::Geglu:
-        runGemm<cutlass_extensions::EpilogueOpDefaultFtGelu>(A, B, weight_scales, biases, bias_is_broadcast, C,
-            total_tokens_including_expert, hopper_input, total_rows, gemm_n, gemm_k, num_experts, use_fused_moe,
-            alpha_scale_ptr_array, stream, chosen_conf);
-        break;
+    // case ActivationType::Geglu:
+    //     runGemm<cutlass_extensions::EpilogueOpDefaultFtGelu>(A, B, weight_scales, biases, bias_is_broadcast, C,
+    //         total_tokens_including_expert, hopper_input, total_rows, gemm_n, gemm_k, num_experts, use_fused_moe,
+    //         alpha_scale_ptr_array, stream, chosen_conf);
+    //     break;
     case ActivationType::InvalidType: PADDLE_THROW("Activation type for fpA_intB must be valid."); break;
     default: PADDLE_THROW("Invalid activation type."); break;
     }
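Note: the new CtaShape64x128x64_WarpShape64x64x64 case is needed because every tile config that get_candidate_tiles can hand to the tuner must have a matching branch in dispatchMoeGemmToCutlass. The general pattern for wiring up a further CtaShape entry is (a skeleton only; the shape letters M, N, K, Wm, Wn, Wk and the enum name are placeholders, not real identifiers):

    case cutlass_extensions::CutlassTileConfig::CtaShapeMxNxK_WarpShapeWmxWnxWk: // hypothetical entry
        dispatchGemmConfig<T, WeightType, GemmOutputType, arch, EpilogueTag,
            cutlass::gemm::GemmShape<M, N, K>,      // CTA tile, copied from the enum name
            cutlass::gemm::GemmShape<Wm, Wn, Wk>>(  // warp tile, copied from the enum name
            A, B, weight_scales, biases, bias_is_broadcast, C, total_tokens_including_expert,
            total_rows, gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count,
            use_fused_moe, alpha_scale_ptr_array, stream, occupancy);
        break;

Separately, with the ActivationType::Geglu case commented out in moeGemmBiasAct, a Geglu activation now falls through to the default branch and trips PADDLE_THROW("Invalid activation type.").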

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu

Lines changed: 18 additions & 18 deletions

@@ -66,7 +66,7 @@
 
 
 #include "paddle/phi/core/enforce.h"
-
+#include "moe/utils.h"
 
 using namespace tensorrt_llm::kernels;
 using namespace tensorrt_llm::common;
@@ -573,7 +573,7 @@ void topkGatingSoftmaxKernelLauncher(float const* input, float* output, float* s
     default:
     {
         static constexpr int TPB = 256;
-        // PADDLE_CHECK(softmax_temp_output != nullptr);
+        PADDLE_CHECK(softmax_temp_output != nullptr);
         moeSoftmax<TPB><<<num_rows, TPB, 0, stream>>>(input, nullptr, softmax_temp_output, num_experts);
         moeTopK<TPB><<<num_rows, TPB, 0, stream>>>(softmax_temp_output, nullptr, output, indices, source_row,
             num_experts, k, startk, endk, start_expert, end_expert, norm_mode);
@@ -1662,9 +1662,10 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, ScaleBiasType, Enable>::gemm1
     int64_t const* total_tokens_including_expert = expert_first_token_offset + 1;
 
     if (using_hopper_gemm1)
-    {
-        // PADDLE_CHECK(config.is_sm90);
-        // PADDLE_CHECK(!use_ampere_activation_fusion);
+    {
+        std::cout << "sm 90 swiglu goes through here" << std::endl;
+        PADDLE_CHECK(config.is_sm90);
+        PADDLE_CHECK(!use_ampere_activation_fusion);
         bool has_different_gemm_output_type = using_hopper_gemm1 && !std::is_same_v<T, OutputType>;
         bool const has_intermediate = has_different_gemm_output_type || is_gated_activation;
         // PADDLE_ENFORCE(has_intermediate || input != output, "Input and output buffers are overlapping");
@@ -1691,8 +1692,8 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, ScaleBiasType, Enable>::gemm1
     }
     else if (use_fp8)
     {
-        // PADDLE_CHECK(!use_ampere_activation_fusion);
-        // PADDLE_CHECK(!config.is_sm90);
+        PADDLE_CHECK(!use_ampere_activation_fusion);
+        PADDLE_CHECK(!config.is_sm90);
 
         alpha_scale_ptr_array
             = computeFP8DequantScale(alpha_scale_ptr_array, num_experts_per_node, fc1_fp8_dequant, stream);
@@ -1710,16 +1711,17 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, ScaleBiasType, Enable>::gemm1
     }
     else if (!is_gated_activation)
     {
-        // PADDLE_CHECK(!use_ampere_activation_fusion);
-        // PADDLE_CHECK(!config.is_sm90);
+        PADDLE_CHECK(!use_ampere_activation_fusion);
+        PADDLE_CHECK(!config.is_sm90);
+        std::cout << "sm 80 swiglu goes through here, is_gated_activation" << std::endl;
         gemm_runner.moeGemmBiasAct(input, fc1_expert_weights, nullptr, nullptr, false,
             output, total_tokens_including_expert, HopperGroupedGemmInput{}, expanded_num_rows, fc1_out_size,
             hidden_size, num_experts_per_node, fc1_activation_type, false, nullptr, stream, config);
     }
     else
     {
-        // PADDLE_CHECK(!config.is_sm90);
-        // PADDLE_CHECK(is_gated_activation);
+        PADDLE_CHECK(!config.is_sm90);
+        PADDLE_CHECK(is_gated_activation);
         PADDLE_ENFORCE(
             !use_ampere_activation_fusion || input != output, "Input and output buffers are overlapping");
 
@@ -2331,15 +2333,13 @@ void GemmProfilerBackend::runProfiler(
         hopper_input_template.configureWorkspace(
             static_cast<int8_t*>(hopper_workspace), num_experts_per_node, gemm_workspace, workspaces.back());
     }
-    if (scale_1 == nullptr) {
-        std::cout << "I don't understand " << std::endl;
-    }
 
     QuantParams quant_params;
-    if (mWType == paddle::DataType::INT8)
-    {
+    if (QuantMode == "weight_only_int8" || QuantMode == "weight_only_int4")
+    {
         PADDLE_CHECK(scale_1 && scale_2);
         quant_params = QuantParams::Int(scale_1, scale_2);
+
     }
     else if (mWType == paddle::DataType::FLOAT8_E4M3FN)
     {
@@ -2350,7 +2350,7 @@ void GemmProfilerBackend::runProfiler(
 
     mInterface->is_profiler = true;
     if (mGemmToProfile == GemmToProfile::GEMM_1)
-    {
+    {
         mInterface->gemm1(inputs, //
             outputs, //
             intermediate, //
@@ -2373,7 +2373,7 @@ void GemmProfilerBackend::runProfiler(
             tactic);
     }
     else
-    {
+    {
         PADDLE_CHECK(mGemmToProfile == GemmToProfile::GEMM_2);
         mInterface->gemm2(inputs, //
             intermediate, //
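Note: besides re-enabling the PADDLE_CHECK guards, the profiler's quantization gate now keys on the QuantMode string (set in GemmProfilerBackend::init, see moe_kernels.h below) rather than the weight tensor's dtype, so both weight_only_int8 and weight_only_int4 take the integer-scale path. For reference, the gemm1 branch structure the restored checks describe is, in outline (a condensed paraphrase of the code above, not new logic):

    if (using_hopper_gemm1)            // SM90 grouped-GEMM path
    {
        PADDLE_CHECK(config.is_sm90);                 // config must actually be an SM90 config
        PADDLE_CHECK(!use_ampere_activation_fusion);  // Ampere activation fusion is not used here
    }
    else if (use_fp8)                  // FP8 path, non-SM90 config
    {
        PADDLE_CHECK(!use_ampere_activation_fusion);
        PADDLE_CHECK(!config.is_sm90);
    }
    else if (!is_gated_activation)     // plain (non-gated) activation, non-SM90 config
    {
        PADDLE_CHECK(!use_ampere_activation_fusion);
        PADDLE_CHECK(!config.is_sm90);
    }
    else                               // gated activation, non-SM90 config
    {
        PADDLE_CHECK(!config.is_sm90);
        PADDLE_CHECK(is_gated_activation);
    }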

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h

Lines changed: 1 addition & 1 deletion

@@ -453,7 +453,7 @@ struct GemmProfilerBackend
         mK = k;
         mExpertHiddenSize = hidden_size;
         mExpertInterSize = inter_size;
-        // mActivationType = activation_type;
+        mActivationType = tensorrt_llm::ActivationType::Swiglu; // fixed to Swiglu
         mBias = bias;
         mParallelismConfig = parallelism_config;
         QuantMode = quant_mode;

csrc/gpu/moe/tensorrt-llm-moe/moe/deepseek_v3.py

Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@
     for i in range(10):
         paddle.device.synchronize()
         start = time.time()
+
         out = trt_llm_fused_moe(
             tmp_out, # input
             # batch_input,

csrc/gpu/moe/tensorrt-llm-moe/moe/moe.cu

Lines changed: 13 additions & 17 deletions

@@ -6,7 +6,7 @@
 #include "tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h"
 #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h"
 #include "cutlass_helper.h"
-#include "utils.h"
+#include "moe/utils.h"
 #include "profile.h"
 
 // profile section ***************************************
@@ -118,8 +118,8 @@ public:
 
     mProfiler->mGemmToProfile = gemm_idx;
     // TODO: support more dtypes and expert parallelism
-    auto parallelism_config = kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank);
-    mProfiler->init(*mKernelRunner, mProfiler->mGemmToProfile,
+    auto parallelism_config = kernels::MOEParallelismConfig(1, 0, 1, 0);
+    mProfiler->init(*mKernelRunner.get(), mProfiler->mGemmToProfile,
         mActivationDtype,
         mWeightDtype,
         mOutputDtype, num_experts, top_k, hidden_size, inter_size,
@@ -129,6 +129,7 @@ public:
     size_t tmp_workspace_size = mProfiler->getWorkspaceSize(mMaxDimM);
     auto const cu_malloc_status = cudaMalloc(&profile_workspace, tmp_workspace_size);
 
+    PADDLE_ENFORCE(cu_malloc_status == cudaSuccess, "Can't allocate tmp workspace for MOE GEMM tactics profiling.");
 
     if (cu_malloc_status != cudaSuccess) {
         std::cout << "Can't allocate tmp workspace for MOE GEMM tactics profiling." << std::endl;
@@ -141,7 +142,7 @@ public:
     }
 
     auto const cu_free = cudaFree(profile_workspace);
-    // TORCH_CHECK(cu_free == cudaSuccess, "Can't free tmp workspace for MOE GEMM profiling.");
+    PADDLE_ENFORCE(cu_free == cudaSuccess, "Can't free tmp workspace for MOE GEMM profiling.");
     }
 
     std::vector<Profile> getFilteredConfigs(std::vector<Profile> tactics, int sm) {
@@ -180,8 +181,7 @@ public:
     float runSingleProfile(int64_t const m, Profile const& profile, char* profile_workspace, cudaStream_t stream)
     {
         constexpr int warmup = 5;
-        constexpr int runs = 15;
-
+        constexpr int runs = 20;
         // warmup
         for (int i = 0; i < warmup; ++i)
         {
@@ -224,6 +224,7 @@ public:
         try
         {
             candidate_time = runSingleProfile(m, profile, profile_workspace, stream);
+            std::cout << "i : " << i << std::endl;
             std::cout << "candidate_time : " << candidate_time << std::endl;
             std::cout << "tile_config : " << static_cast<int>(profile.tile_config) << std::endl;
             std::cout << "stages : " << static_cast<int>(profile.stages) << std::endl;
@@ -266,6 +267,7 @@ public:
         int64_t inter_size = fc2_expert_weights.shape()[1];
 
         int num_experts = static_cast<int>(fc2_expert_weights.shape()[0] * ep_size);
+        std::cout << "num_experts : " << num_experts << std::endl;
 
         std::sort(num_token_buckets.begin(), num_token_buckets.end());
         mMinDimM = num_token_buckets.front();
@@ -279,7 +281,8 @@ public:
             = {profiler_backend::GemmToProfile::GEMM_1, profiler_backend::GemmToProfile::GEMM_2};
 
         for (auto const& gemm_idx : gemm_idxes)
-        {
+        {
+            std::cout << "********************* start gemm profile *****************" << std::endl;
             runProfileGemmIdx(hidden_size, inter_size, num_experts, static_cast<int>(top_k), static_cast<int>(tp_size),
                 static_cast<int>(tp_rank), static_cast<int>(ep_size), static_cast<int>(ep_rank), num_token_buckets,
                 gemm_idx, stream);
@@ -298,6 +301,7 @@ public:
         int64_t inter_size = fc2_expert_weights.shape()[1];
         auto gemm_id_moe1 = GemmIDMoe{profiler_backend::GemmToProfile::GEMM_1, hidden_size, inter_size,
             static_cast<int>(num_experts), static_cast<int>(top_k)};
+
         auto gemm_id_moe2 = GemmIDMoe{profiler_backend::GemmToProfile::GEMM_2, hidden_size, inter_size,
             static_cast<int>(num_experts), static_cast<int>(top_k)};
 
@@ -628,11 +632,8 @@ Tensor trt_llm_fused_moe_helper(Tensor input_activations,
         /* moe_runner= */ moe_runner_ptr,
         /* quant_method= */ quant_method);
 
-    // std::vector<int64_t> num_token_buckets = get_power_of_2_num_tokens_buckets(tune_max_num_tokens);
-    // std::cout << "num_token_buckets : " << tune_max_num_tokens << std::endl;
-
-    std::vector<int64_t> num_token_buckets = {1024};
-    std::cout << "only tuning 1024" << std::endl;
+    std::vector<int64_t> num_token_buckets = get_power_of_2_num_tokens_buckets(tune_max_num_tokens);
+    std::cout << "num_token_buckets : " << tune_max_num_tokens << std::endl;
     profiler.runProfile(fc2_expert_weights, k, 1, 0, 1, 0, num_token_buckets);
     // The profile results (num_tokens and the matching profile_ids) need to be written to the local efficientllm_op_configs path; code for that still has to be added here.
     profiler.saveProfileResultsToFile(profile_file);
@@ -666,11 +667,6 @@ Tensor trt_llm_fused_moe_helper(Tensor input_activations,
         auto [tactic1, tactic2] = selectTacticsForArch(moe_runner_ptr);
         moe_runner_ptr->setTactic(std::make_optional(tactic1), std::make_optional(tactic2));
     }
-
-
-    // std::vector<int64_t> profile_ids = {20, 19};
-    // setRunnerProfiles(moe_runner_ptr, profile_ids, quant_method);
-    // std::cout << "I set tactics 20 19" << std::endl;
 
 
     kernels::MOEExpertScaleNormalizationMode normalization_mode_enum = getNormalizationMode(normalization_mode);
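Note: with this change the tuner again profiles a sweep of token-bucket sizes instead of the single hard-coded {1024} bucket. A minimal sketch of what a power-of-two bucket generator such as get_power_of_2_num_tokens_buckets is assumed to produce (the real helper lives elsewhere in this repo; this is an illustrative stand-in, not its actual implementation):

    #include <cstdint>
    #include <vector>

    // Illustrative stand-in: power-of-two token buckets 1, 2, 4, ... up to tune_max_num_tokens.
    std::vector<int64_t> power_of_2_buckets_sketch(int64_t tune_max_num_tokens)
    {
        std::vector<int64_t> buckets;
        for (int64_t b = 1; b <= tune_max_num_tokens; b *= 2)
        {
            buckets.push_back(b);
        }
        return buckets;
    }

Each bucket is then passed to runProfile, which times both GEMM_1 and GEMM_2 for every candidate tactic at that token count and saves the winners via saveProfileResultsToFile.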

0 commit comments
