
Commit b536f8e

Author: root
Commit message: fix tune
1 parent 9c85186 commit b536f8e

File tree: 9 files changed, +68 -76 lines


csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h

Lines changed: 3 additions & 4 deletions

@@ -296,16 +296,15 @@ struct MoeFCGemm
 
     static Status can_implement(Arguments const& args)
     {
-        std::cout << "modified can_implement" << std::endl;
         if (platform::is_same<uint8_t, ElementB>::value || platform::is_same<uint4b_t, ElementB>::value)
         {
             if (args.weight_scales == nullptr)
             {
                 // CUTLASS_TRACE_HOST("MoeFCGemm::can_implement() - weight scales are required for uint8_t and uint4b_t");
                 printf("MoeFCGemm::can_implement() - weight scales are required for uint8_t and uint4b_t \n");
-                printf("temporarily changed to success \n");
-                return Status::kSuccess;
-                // return Status::kInvalid;
+                // printf("temporarily changed to success \n");
+                // return Status::kSuccess;
+                return Status::kInvalid;
             }
         }
         else if (args.weight_scales != nullptr)
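Note: this restores the original contract that weight-only (uint8_t / uint4b_t) arguments without weight scales are rejected. A minimal sketch of how the host-side launcher consumes that status, paraphrased from the launcher code touched later in this same commit (not a new API):

    // With kInvalid restored, a missing weight_scales pointer now fails fast at launch time
    // instead of silently launching an unscaled grouped GEMM.
    GemmGrouped gemm;
    auto can_implement = gemm.can_implement(args);
    PADDLE_ENFORCE(can_implement == cutlass::Status::kSuccess,
        "MoE FC kernel will fail for params.");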

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h

Lines changed: 4 additions & 4 deletions

@@ -53,10 +53,10 @@ enum class CutlassTileConfig
 
     // Warp configs for M=128
     CtaShape128x64x64_WarpShape64x32x64, // 9
-    CtaShape128x128x64_WarpShape64x32x64,
-    CtaShape128x128x64_WarpShape64x64x64,
-    CtaShape128x128x64_WarpShape128x32x64,
-    CtaShape128x256x64_WarpShape64x64x64,
+    CtaShape128x128x64_WarpShape64x32x64, // 10
+    CtaShape128x128x64_WarpShape64x64x64, // 11
+    CtaShape128x128x64_WarpShape128x32x64, // 12
+    CtaShape128x256x64_WarpShape64x64x64, // 13
 
     // Warp configs for M=256
     CtaShape256x128x64_WarpShape64x64x64,
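Note: the added "// 10" through "// 13" comments only record each entry's ordinal, matching the existing "// 9" annotation, so the raw integers printed by the tuner can be mapped back to a tile shape by eye. A tiny illustration (assuming, as the comments imply, the enum uses default consecutive values):

    // The profiler in moe/moe.cu prints the chosen tile config as its ordinal:
    auto tile = cutlass_extensions::CutlassTileConfig::CtaShape128x128x64_WarpShape64x64x64;
    std::cout << "tile_config : " << static_cast<int>(tile) << std::endl; // 11, per the annotations above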

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp

Lines changed: 3 additions & 3 deletions

@@ -153,10 +153,10 @@ std::vector<CutlassTileConfig> get_candidate_tiles(
     case CutlassGemmType::WeightOnly:
         if (sm >= 75)
         {
-            std::cout << "added some configs" << std::endl;
+            std::cout << "all configs" << std::endl;
             return {
-                // CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64, // these two configs are relatively slow
-                // CutlassTileConfig::CtaShape16x256x64_WarpShape16x64x64,
+                CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64, // these two configs are relatively slow
+                CutlassTileConfig::CtaShape16x256x64_WarpShape16x64x64,
                 CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64,
                 CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64,
                 CutlassTileConfig::CtaShape64x128x64_WarpShape64x64x64,
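Note: re-enabling the two CtaShape16x... entries only widens the candidate list returned by get_candidate_tiles; the GEMM profiler still times every candidate and keeps the fastest, so slower shapes simply lose the timing race instead of being hard-coded out. A minimal sketch of that selection step, paraphrasing the profiling loop in moe/moe.cu (runSingleProfile and Profile come from that file; std::numeric_limits needs <limits>):

    float best_time = std::numeric_limits<float>::max();
    int best_profile_id = -1;
    for (int i = 0; i < static_cast<int>(profiles.size()); ++i)
    {
        // Each candidate tile/stage combination is timed on the real problem size m.
        float const candidate_time = runSingleProfile(m, profiles[i], profile_workspace, stream);
        if (candidate_time < best_time)
        {
            best_time = candidate_time;
            best_profile_id = i;
        }
    }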

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h

Lines changed: 11 additions & 10 deletions

@@ -173,9 +173,6 @@ struct genericMoeGemmKernelLauncher
     PADDLE_ENFORCE(occupancy > 0, "GPU lacks the shared memory resources to run GroupedGEMM kernel");
     int const threadblock_count = multi_processor_count * occupancy;
 
-    if (weight_scales == nullptr) {
-        std::cout << "why don't I understand this anymore !!!!!!!!!" << std::endl;
-    }
     int const group_size = gemm_k;
     typename GemmGrouped::Arguments args(num_experts, threadblock_count, group_size, epilogue_op,
         reinterpret_cast<ElementType const*>(A), reinterpret_cast<CutlassWeightType const*>(B),
@@ -185,7 +182,6 @@ struct genericMoeGemmKernelLauncher
 
     GemmGrouped gemm;
 
-    std::cout << "gemm can_imple" << std::endl;
     auto can_implement = gemm.can_implement(args);
     PADDLE_ENFORCE(can_implement == cutlass::Status::kSuccess,
         "MoE FC kernel will fail for params.");
@@ -268,7 +264,6 @@ void dispatchGemmConfig(T const* A, WeightType const* B, GemmOutputType const* w
     cutlass_extensions::CutlassGemmConfig gemm_config, int multi_processor_count, bool use_fused_moe,
     float const** alpha_scale_ptr_array, cudaStream_t stream, int* occupancy = nullptr)
 {
-    // std::cout << "changed it to 3,3,3,3 again" << std::endl;
     switch (gemm_config.stages)
     {
     case 2:
@@ -397,6 +392,12 @@ void dispatchMoeGemmToCutlass(T const* A, WeightType const* B, GemmOutputType co
             use_fused_moe, alpha_scale_ptr_array, stream, occupancy);
         break;
     // newly added
+    case cutlass_extensions::CutlassTileConfig::CtaShape64x128x64_WarpShape64x64x64:
+        dispatchGemmConfig<T, WeightType, GemmOutputType, arch, EpilogueTag, cutlass::gemm::GemmShape<64, 128, 64>,
+            cutlass::gemm::GemmShape<64, 64, 64>>(A, B, weight_scales, biases, bias_is_broadcast, C,
+            total_tokens_including_expert, total_rows, gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count,
+            use_fused_moe, alpha_scale_ptr_array, stream, occupancy);
+        break;
     case cutlass_extensions::CutlassTileConfig::CtaShape64x128x64_WarpShape32x64x64:
         dispatchGemmConfig<T, WeightType, GemmOutputType, arch, EpilogueTag, cutlass::gemm::GemmShape<64, 128, 64>,
             cutlass::gemm::GemmShape<32, 64, 64>>(A, B, weight_scales, biases, bias_is_broadcast, C,
@@ -844,11 +845,11 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::moeGemmBiasAct(T c
             total_tokens_including_expert, hopper_input, total_rows, gemm_n, gemm_k, num_experts, use_fused_moe,
             alpha_scale_ptr_array, stream, chosen_conf);
         break;
-    case ActivationType::Geglu:
-        runGemm<cutlass_extensions::EpilogueOpDefaultFtGelu>(A, B, weight_scales, biases, bias_is_broadcast, C,
-            total_tokens_including_expert, hopper_input, total_rows, gemm_n, gemm_k, num_experts, use_fused_moe,
-            alpha_scale_ptr_array, stream, chosen_conf);
-        break;
+    // case ActivationType::Geglu:
+    //     runGemm<cutlass_extensions::EpilogueOpDefaultFtGelu>(A, B, weight_scales, biases, bias_is_broadcast, C,
+    //         total_tokens_including_expert, hopper_input, total_rows, gemm_n, gemm_k, num_experts, use_fused_moe,
+    //         alpha_scale_ptr_array, stream, chosen_conf);
+    //     break;
     case ActivationType::InvalidType: PADDLE_THROW("Activation type for fpA_intB must be valid."); break;
     default: PADDLE_THROW("Invalid activation type."); break;
     }
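Note: the new CtaShape64x128x64_WarpShape64x64x64 case is needed because every tile config that get_candidate_tiles can hand to the tuner must have a matching branch in dispatchMoeGemmToCutlass. The general pattern for wiring up a further CtaShape entry is (a skeleton only; the shape letters M, N, K, Wm, Wn, Wk and the enum name are placeholders, not real identifiers):

    case cutlass_extensions::CutlassTileConfig::CtaShapeMxNxK_WarpShapeWmxWnxWk: // hypothetical entry
        dispatchGemmConfig<T, WeightType, GemmOutputType, arch, EpilogueTag,
            cutlass::gemm::GemmShape<M, N, K>,      // CTA tile, copied from the enum name
            cutlass::gemm::GemmShape<Wm, Wn, Wk>>(  // warp tile, copied from the enum name
            A, B, weight_scales, biases, bias_is_broadcast, C, total_tokens_including_expert,
            total_rows, gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count,
            use_fused_moe, alpha_scale_ptr_array, stream, occupancy);
        break;

Separately, with the ActivationType::Geglu case commented out in moeGemmBiasAct, a Geglu activation now falls through to the default branch and trips PADDLE_THROW("Invalid activation type.").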

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu

Lines changed: 18 additions & 18 deletions

@@ -66,7 +66,7 @@
 
 
 #include "paddle/phi/core/enforce.h"
-
+#include "moe/utils.h"
 
 using namespace tensorrt_llm::kernels;
 using namespace tensorrt_llm::common;
@@ -573,7 +573,7 @@ void topkGatingSoftmaxKernelLauncher(float const* input, float* output, float* s
     default:
     {
         static constexpr int TPB = 256;
-        // PADDLE_CHECK(softmax_temp_output != nullptr);
+        PADDLE_CHECK(softmax_temp_output != nullptr);
         moeSoftmax<TPB><<<num_rows, TPB, 0, stream>>>(input, nullptr, softmax_temp_output, num_experts);
         moeTopK<TPB><<<num_rows, TPB, 0, stream>>>(softmax_temp_output, nullptr, output, indices, source_row,
             num_experts, k, startk, endk, start_expert, end_expert, norm_mode);
@@ -1662,9 +1662,10 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, ScaleBiasType, Enable>::gemm1
     int64_t const* total_tokens_including_expert = expert_first_token_offset + 1;
 
     if (using_hopper_gemm1)
-    {
-        // PADDLE_CHECK(config.is_sm90);
-        // PADDLE_CHECK(!use_ampere_activation_fusion);
+    {
+        std::cout << "sm 90 swiglu goes through here" << std::endl;
+        PADDLE_CHECK(config.is_sm90);
+        PADDLE_CHECK(!use_ampere_activation_fusion);
         bool has_different_gemm_output_type = using_hopper_gemm1 && !std::is_same_v<T, OutputType>;
         bool const has_intermediate = has_different_gemm_output_type || is_gated_activation;
         // PADDLE_ENFORCE(has_intermediate || input != output, "Input and output buffers are overlapping");
@@ -1691,8 +1692,8 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, ScaleBiasType, Enable>::gemm1
     }
     else if (use_fp8)
     {
-        // PADDLE_CHECK(!use_ampere_activation_fusion);
-        // PADDLE_CHECK(!config.is_sm90);
+        PADDLE_CHECK(!use_ampere_activation_fusion);
+        PADDLE_CHECK(!config.is_sm90);
 
         alpha_scale_ptr_array
             = computeFP8DequantScale(alpha_scale_ptr_array, num_experts_per_node, fc1_fp8_dequant, stream);
@@ -1710,16 +1711,17 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, ScaleBiasType, Enable>::gemm1
     }
     else if (!is_gated_activation)
     {
-        // PADDLE_CHECK(!use_ampere_activation_fusion);
-        // PADDLE_CHECK(!config.is_sm90);
+        PADDLE_CHECK(!use_ampere_activation_fusion);
+        PADDLE_CHECK(!config.is_sm90);
+        std::cout << "sm 80 swiglu goes through here, is_gated_activation" << std::endl;
         gemm_runner.moeGemmBiasAct(input, fc1_expert_weights, nullptr, nullptr, false,
             output, total_tokens_including_expert, HopperGroupedGemmInput{}, expanded_num_rows, fc1_out_size,
             hidden_size, num_experts_per_node, fc1_activation_type, false, nullptr, stream, config);
     }
     else
     {
-        // PADDLE_CHECK(!config.is_sm90);
-        // PADDLE_CHECK(is_gated_activation);
+        PADDLE_CHECK(!config.is_sm90);
+        PADDLE_CHECK(is_gated_activation);
         PADDLE_ENFORCE(
             !use_ampere_activation_fusion || input != output, "Input and output buffers are overlapping");
 
@@ -2331,15 +2333,13 @@ void GemmProfilerBackend::runProfiler(
         hopper_input_template.configureWorkspace(
             static_cast<int8_t*>(hopper_workspace), num_experts_per_node, gemm_workspace, workspaces.back());
     }
-    if (scale_1 == nullptr) {
-        std::cout << "I don't understand " << std::endl;
-    }
 
     QuantParams quant_params;
-    if (mWType == paddle::DataType::INT8)
-    {
+    if (QuantMode == "weight_only_int8" || QuantMode == "weight_only_int4")
+    {
         PADDLE_CHECK(scale_1 && scale_2);
         quant_params = QuantParams::Int(scale_1, scale_2);
+
     }
     else if (mWType == paddle::DataType::FLOAT8_E4M3FN)
     {
@@ -2350,7 +2350,7 @@ void GemmProfilerBackend::runProfiler(
 
     mInterface->is_profiler = true;
     if (mGemmToProfile == GemmToProfile::GEMM_1)
-    {
+    {
         mInterface->gemm1(inputs, //
             outputs, //
             intermediate, //
@@ -2373,7 +2373,7 @@ void GemmProfilerBackend::runProfiler(
             tactic);
     }
     else
-    {
+    {
         PADDLE_CHECK(mGemmToProfile == GemmToProfile::GEMM_2);
         mInterface->gemm2(inputs, //
             intermediate, //
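Note: besides re-enabling the PADDLE_CHECK guards, the profiler's quantization gate now keys on the QuantMode string (set in GemmProfilerBackend::init, see moe_kernels.h below) rather than the weight tensor's dtype, so both weight_only_int8 and weight_only_int4 take the integer-scale path. For reference, the gemm1 branch structure the restored checks describe is, in outline (a condensed paraphrase of the code above, not new logic):

    if (using_hopper_gemm1)            // SM90 grouped-GEMM path
    {
        PADDLE_CHECK(config.is_sm90);                 // config must actually be an SM90 config
        PADDLE_CHECK(!use_ampere_activation_fusion);  // Ampere activation fusion is not used here
    }
    else if (use_fp8)                  // FP8 path, non-SM90 config
    {
        PADDLE_CHECK(!use_ampere_activation_fusion);
        PADDLE_CHECK(!config.is_sm90);
    }
    else if (!is_gated_activation)     // plain (non-gated) activation, non-SM90 config
    {
        PADDLE_CHECK(!use_ampere_activation_fusion);
        PADDLE_CHECK(!config.is_sm90);
    }
    else                               // gated activation, non-SM90 config
    {
        PADDLE_CHECK(!config.is_sm90);
        PADDLE_CHECK(is_gated_activation);
    }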

csrc/gpu/moe/tensorrt-llm-moe/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h

Lines changed: 1 addition & 1 deletion

@@ -453,7 +453,7 @@ struct GemmProfilerBackend
         mK = k;
         mExpertHiddenSize = hidden_size;
         mExpertInterSize = inter_size;
-        // mActivationType = activation_type;
+        mActivationType = tensorrt_llm::ActivationType::Swiglu; // fixed to Swiglu
         mBias = bias;
         mParallelismConfig = parallelism_config;
         QuantMode = quant_mode;

csrc/gpu/moe/tensorrt-llm-moe/moe/deepseek_v3.py

Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@
     for i in range(10):
         paddle.device.synchronize()
         start = time.time()
+
         out = trt_llm_fused_moe(
             tmp_out, # input
             # batch_input,

csrc/gpu/moe/tensorrt-llm-moe/moe/moe.cu

Lines changed: 13 additions & 17 deletions

@@ -6,7 +6,7 @@
 #include "tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h"
 #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h"
 #include "cutlass_helper.h"
-#include "utils.h"
+#include "moe/utils.h"
 #include "profile.h"
 
 // profile section ***************************************
@@ -118,8 +118,8 @@ public:
 
     mProfiler->mGemmToProfile = gemm_idx;
     // TODO: support more dtypes and expert parallelism
-    auto parallelism_config = kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank);
-    mProfiler->init(*mKernelRunner, mProfiler->mGemmToProfile,
+    auto parallelism_config = kernels::MOEParallelismConfig(1, 0, 1, 0);
+    mProfiler->init(*mKernelRunner.get(), mProfiler->mGemmToProfile,
         mActivationDtype,
         mWeightDtype,
         mOutputDtype, num_experts, top_k, hidden_size, inter_size,
@@ -129,6 +129,7 @@ public:
     size_t tmp_workspace_size = mProfiler->getWorkspaceSize(mMaxDimM);
     auto const cu_malloc_status = cudaMalloc(&profile_workspace, tmp_workspace_size);
 
+    PADDLE_ENFORCE(cu_malloc_status == cudaSuccess, "Can't allocate tmp workspace for MOE GEMM tactics profiling.");
 
     if (cu_malloc_status != cudaSuccess) {
         std::cout << "Can't allocate tmp workspace for MOE GEMM tactics profiling." << std::endl;
@@ -141,7 +142,7 @@ public:
     }
 
     auto const cu_free = cudaFree(profile_workspace);
-    // TORCH_CHECK(cu_free == cudaSuccess, "Can't free tmp workspace for MOE GEMM profiling.");
+    PADDLE_ENFORCE(cu_free == cudaSuccess, "Can't free tmp workspace for MOE GEMM profiling.");
     }
 
     std::vector<Profile> getFilteredConfigs(std::vector<Profile> tactics, int sm) {
@@ -180,8 +181,7 @@ public:
     float runSingleProfile(int64_t const m, Profile const& profile, char* profile_workspace, cudaStream_t stream)
     {
         constexpr int warmup = 5;
-        constexpr int runs = 15;
-
+        constexpr int runs = 20;
         // warmup
         for (int i = 0; i < warmup; ++i)
         {
@@ -224,6 +224,7 @@ public:
         try
         {
             candidate_time = runSingleProfile(m, profile, profile_workspace, stream);
+            std::cout << "i : " << i << std::endl;
             std::cout << "candidate_time : " << candidate_time << std::endl;
             std::cout << "tile_config : " << static_cast<int>(profile.tile_config) << std::endl;
             std::cout << "stages : " << static_cast<int>(profile.stages) << std::endl;
@@ -266,6 +267,7 @@ public:
         int64_t inter_size = fc2_expert_weights.shape()[1];
 
         int num_experts = static_cast<int>(fc2_expert_weights.shape()[0] * ep_size);
+        std::cout << "num_experts : " << num_experts << std::endl;
 
         std::sort(num_token_buckets.begin(), num_token_buckets.end());
         mMinDimM = num_token_buckets.front();
@@ -279,7 +281,8 @@ public:
             = {profiler_backend::GemmToProfile::GEMM_1, profiler_backend::GemmToProfile::GEMM_2};
 
         for (auto const& gemm_idx : gemm_idxes)
-        {
+        {
+            std::cout << "********************* start gemm profile *****************" << std::endl;
             runProfileGemmIdx(hidden_size, inter_size, num_experts, static_cast<int>(top_k), static_cast<int>(tp_size),
                 static_cast<int>(tp_rank), static_cast<int>(ep_size), static_cast<int>(ep_rank), num_token_buckets,
                 gemm_idx, stream);
@@ -298,6 +301,7 @@ public:
         int64_t inter_size = fc2_expert_weights.shape()[1];
         auto gemm_id_moe1 = GemmIDMoe{profiler_backend::GemmToProfile::GEMM_1, hidden_size, inter_size,
             static_cast<int>(num_experts), static_cast<int>(top_k)};
+
         auto gemm_id_moe2 = GemmIDMoe{profiler_backend::GemmToProfile::GEMM_2, hidden_size, inter_size,
             static_cast<int>(num_experts), static_cast<int>(top_k)};
 
@@ -628,11 +632,8 @@ Tensor trt_llm_fused_moe_helper(Tensor input_activations,
         /* moe_runner= */ moe_runner_ptr,
         /* quant_method= */ quant_method);
 
-    // std::vector<int64_t> num_token_buckets = get_power_of_2_num_tokens_buckets(tune_max_num_tokens);
-    // std::cout << "num_token_buckets : " << tune_max_num_tokens << std::endl;
-
-    std::vector<int64_t> num_token_buckets = {1024};
-    std::cout << "only tuning 1024" << std::endl;
+    std::vector<int64_t> num_token_buckets = get_power_of_2_num_tokens_buckets(tune_max_num_tokens);
+    std::cout << "num_token_buckets : " << tune_max_num_tokens << std::endl;
     profiler.runProfile(fc2_expert_weights, k, 1, 0, 1, 0, num_token_buckets);
     // The profile results (num_tokens and the matching profile_ids) need to be written to the local efficientllm_op_configs path; code for that still has to be added here.
     profiler.saveProfileResultsToFile(profile_file);
@@ -666,11 +667,6 @@ Tensor trt_llm_fused_moe_helper(Tensor input_activations,
         auto [tactic1, tactic2] = selectTacticsForArch(moe_runner_ptr);
         moe_runner_ptr->setTactic(std::make_optional(tactic1), std::make_optional(tactic2));
     }
-
-
-    // std::vector<int64_t> profile_ids = {20, 19};
-    // setRunnerProfiles(moe_runner_ptr, profile_ids, quant_method);
-    // std::cout << "I set tactics 20 19" << std::endl;
 
 
     kernels::MOEExpertScaleNormalizationMode normalization_mode_enum = getNormalizationMode(normalization_mode);
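Note: with this change the tuner again profiles a sweep of token-bucket sizes instead of the single hard-coded {1024} bucket. A minimal sketch of what a power-of-two bucket generator such as get_power_of_2_num_tokens_buckets is assumed to produce (the real helper lives elsewhere in this repo; this is an illustrative stand-in, not its actual implementation):

    #include <cstdint>
    #include <vector>

    // Illustrative stand-in: power-of-two token buckets 1, 2, 4, ... up to tune_max_num_tokens.
    std::vector<int64_t> power_of_2_buckets_sketch(int64_t tune_max_num_tokens)
    {
        std::vector<int64_t> buckets;
        for (int64_t b = 1; b <= tune_max_num_tokens; b *= 2)
        {
            buckets.push_back(b);
        }
        return buckets;
    }

Each bucket is then passed to runProfile, which times both GEMM_1 and GEMM_2 for every candidate tactic at that token count and saves the winners via saveProfileResultsToFile.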

0 commit comments
