Commit 464669e

luanyundu authored and committed
Merge branch 'main' into a2_layour
2 parents c156c5d + 2a2aef2

180 files changed: +4234 additions, -27796 deletions

csrc/deepep/deep_ep.cpp

Lines changed: 11 additions & 12 deletions
@@ -87,8 +87,7 @@ Buffer::get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts, std:
     const int notify_send_data_size =
         num_experts * EXPERT_DATA_SIZE + server_num + MAX_BATCH_SIZE * (1 + 2 * server_num + num_topk);
     /*
-    The notify send data is constructed by 8 parameters and
-    the parameters are ordered as follows:
+    The notify send data is constructed by 8 parameters and the parameters are ordered as follows:
     1. the number of the tokens that every expert received from this NPU.
        size:[numExpert]
     2. The number of tokens received by each server from this NPU (deduplicated).
@@ -217,11 +216,11 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>

     int send_per_group = 3;  // (send_to_expert_num, send_to_expert_offset, send_rank_tokens)

-    auto send_data = at::zeros({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));
+    auto send_data = torch::empty({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));
     int64_t send_count = send_per_group * num_local_experts * num_ranks;

-    auto send_data_offset = at::zeros({num_experts}, at::dtype(at::kInt).device(x.device()));
-    at::Tensor recv_data = at::zeros({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));
+    auto send_data_offset = torch::empty({num_experts}, at::dtype(at::kInt).device(x.device()));
+    at::Tensor recv_data = torch::empty({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));

     // get ep name
     char hcom_ep_name[HCOMM_NAME_LEN];
@@ -243,7 +242,7 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>

     auto options_cpu = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
     std::vector<int32_t> local_expert_acc(num_experts, 0);
-    auto send_token_idx_cpu = at::zeros({num_tokens, num_topk}, options_cpu);
+    auto send_token_idx_cpu = torch::empty({num_tokens, num_topk}, options_cpu);
     auto send_token_idx_ptr = send_token_idx_cpu.data_ptr<int>();

     auto topk_idx_cpu = new_topk_idx.to(at::kCPU);
@@ -261,8 +260,8 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>

     EP_HOST_ASSERT(recv_data.dim() == 1 and recv_data.is_contiguous());
     EP_HOST_ASSERT(recv_data.size(0) % num_experts == 0);
-    at::Tensor recv_offset_cpu = at::zeros({num_experts}, options_cpu);
-    at::Tensor recv_count_cpu = at::zeros({num_experts}, options_cpu);
+    at::Tensor recv_offset_cpu = torch::empty({num_experts}, options_cpu);
+    at::Tensor recv_count_cpu = torch::empty({num_experts}, options_cpu);
     auto recv_data_cpu = recv_data.to(at::kCPU);
     auto recv_data_ptr = recv_data_cpu.data_ptr<int>();
     auto recv_count_ptr = recv_count_cpu.data_ptr<int>();
@@ -303,10 +302,10 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>
     auto recv_count = recv_count_cpu.to(x.device());

     int num_recv_tokens = (total_recv_tokens == 0) ? 1 : total_recv_tokens;
-    auto expandx_out = use_quant ? at::zeros({num_recv_tokens, hidden}, at::dtype(at::kChar).device(x.device()))
-                                 : at::zeros({num_recv_tokens, hidden}, x.options());
-    auto dynamic_scales_out = at::zeros({num_recv_tokens}, at::dtype(at::kFloat).device(x.device()));
-    auto expand_idx_out = at::zeros({num_recv_tokens * 3}, at::dtype(at::kInt).device(x.device()));
+    auto expandx_out = use_quant ? torch::empty({num_recv_tokens, hidden}, at::dtype(at::kChar).device(x.device()))
+                                 : torch::empty({num_recv_tokens, hidden}, x.options());
+    auto dynamic_scales_out = torch::empty({num_recv_tokens}, at::dtype(at::kFloat).device(x.device()));
+    auto expand_idx_out = torch::empty({num_recv_tokens * 3}, at::dtype(at::kInt).device(x.device()));

     EXEC_NPU_CMD(aclnnCamMoeDispatchNormal, new_x, expert_ids, send_data_offset, send_token_idx, recv_offset,
                  recv_count, hcom_ep_name,
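
Note on the allocation change above: torch::empty allocates without the zero-fill pass that at::zeros performs, which is safe only when every element is written before it is read, presumably the case for these dispatch buffers (they are filled by host-side loops, copy_, or the NPU op itself). A minimal sketch of the difference, with illustrative names and sizes rather than the ones in deep_ep.cpp:

    #include <torch/torch.h>

    void alloc_patterns()
    {
        auto opts = torch::dtype(torch::kInt);      // CPU in this sketch; the real code targets x.device()
        auto zeroed = at::zeros({1024}, opts);      // allocates and zero-fills
        auto scratch = torch::empty({1024}, opts);  // allocates only; contents are undefined
        scratch.copy_(zeroed);                      // safe: every element is written before any read
    }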

csrc/deepep/ops/op_host/fused_deep_moe_tiling.cpp

Lines changed: 134 additions & 47 deletions
Large diffs are not rendered by default.

csrc/deepep/ops/op_kernel/fused_deep_moe.h

Lines changed: 42 additions & 44 deletions
@@ -9,35 +9,34 @@
 #ifndef FUSED_DEEP_MOE_H
 #define FUSED_DEEP_MOE_H

-#include <kernel_operator.h>
 #include "lib/matmul_intf.h"
+#include <kernel_operator.h>

-#include "../utils/op_kernel/operator/catlass/catlass/catlass.hpp"
-#include "../utils/op_kernel/operator/catlass/catlass/arch/arch.hpp"
-#include "../utils/op_kernel/operator/catlass/catlass/layout/layout.hpp"
-#include "../utils/op_kernel/operator/catlass/catlass/epilogue/tile/tile_broadcast_mul.hpp"
-#include "../utils/op_kernel/operator/catlass/catlass/epilogue/tile/tile_broadcast_one_blk.hpp"
-#include "../utils/op_kernel/operator/catlass/catlass/epilogue/tile/tile_swizzle.hpp"
-#include "../utils/op_kernel/operator/catlass/catlass/gemm/block/block_swizzle.hpp"
-#include "../utils/op_kernel/operator/catlass/catlass/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_multistage_workspace.hpp"
-#include "../utils/op_kernel/operator/catlass/catlass/gemm/gemm_type.hpp"
+#include "../utils/op_kernel/operator/catlass/act/act.hpp"
+#include "../utils/op_kernel/operator/catlass/act/arch/arch.hpp"
+#include "../utils/op_kernel/operator/catlass/act/layout/layout.hpp"
+#include "../utils/op_kernel/operator/catlass/act/epilogue/tile/tile_broadcast_mul.hpp"
+#include "../utils/op_kernel/operator/catlass/act/epilogue/tile/tile_broadcast_one_blk.hpp"
+#include "../utils/op_kernel/operator/catlass/act/epilogue/tile/tile_swizzle.hpp"
+#include "../utils/op_kernel/operator/catlass/act/gemm/block/block_swizzle.hpp"
+#include "../utils/op_kernel/operator/catlass/act/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_multistage_workspace.hpp"
+#include "../utils/op_kernel/operator/catlass/act/gemm/gemm_type.hpp"
 #include "../utils/op_kernel/operator/epilogue/dispatch_policy.h"
 #include "../utils/op_kernel/operator/gemm/dispatch_policy.h"
 #include "../utils/op_kernel/operator/epilogue/block/block_epilogue.h"
 #include "../utils/op_kernel/operator/gemm/block/block_mmad.h"
 #include "../utils/op_kernel/operator/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h"

 #include "../utils/op_kernel/operator/cam_moe_distribute_combine/op_kernel/a3/cam_moe_distribute_dispatch.h"
-#include "../utils/op_kernel/operator/cam_moe_distribute_combine/op_kernel/a3/cam_moe_distribute_combine.h"

 #include "fused_deep_moe_tiling.h"
 #include "fused_deep_moe_base.h"

 #define ENABLE_GMM2_COMBINE
-constexpr uint32_t GMM1_HIDDEN_SIZE = 4096;
-constexpr uint32_t TOKEN_LENGTH = 7168;
+#define GMM1_HIDDEN_SIZE 4096
+#define TOKEN_LENGTH 7168

-using namespace Catlass;
+using namespace Act;

 using MmadAtlasA2Custom =
     Gemm::MmadAtlasA2PreloadAsyncWithCallback<CUSTOM_PRELOAD_STAGES, CUSTOM_L1_STAGES, CUSTOM_L0A_STAGES,
@@ -60,16 +59,16 @@ using Gmm2DispatchPolicy =

 template <uint32_t EXEC_FLAG, typename XType_, class L1TileShape_, class L0TileShape_, class EpilogueTileShape_,
           class BlockScheduler_, class DispatchPolicy_ = MmadAtlasA2Custom>
-CATLASS_DEVICE void GmmDeqSwigluQuant(GemmCoord problemShape, uint32_t groupCount, GM_ADDR gmGroupList, GM_ADDR gmA,
-                                      layout::RowMajor layoutA, GM_ADDR gmB, layout::zN layoutB, GM_ADDR gmScale,
-                                      layout::VectorLayout layoutScale, GM_ADDR gmPerTokenScale,
-                                      layout::VectorLayout layoutPerTokenScale, GM_ADDR gmD, layout::RowMajor layoutD,
-                                      GM_ADDR gmDequantScale, layout::VectorLayout layoutDequantScale,
-                                      GM_ADDR gmWorkspace, GM_ADDR gmX, GM_ADDR debugGm, GM_ADDR gmexpertIds,
-                                      GM_ADDR gmExpandIdx, GM_ADDR gmEpSendCount, GM_ADDR gmResvered,
-                                      uint32_t epRankSize, uint32_t epRankId, uint32_t moeExpertNum,
-                                      uint32_t moeExpertNumPerRank, uint32_t sharedExpertNum,
-                                      uint32_t sharedExpertRankNum, uint32_t quantMode, uint32_t globalBs, uint32_t bs)
+ACT_DEVICE void GmmDeqSwigluQuant(GemmCoord problemShape, uint32_t groupCount, GM_ADDR gmGroupList, GM_ADDR gmA,
+                                  layout::RowMajor layoutA, GM_ADDR gmB, layout::zN layoutB, GM_ADDR gmScale,
+                                  layout::VectorLayout layoutScale, GM_ADDR gmPerTokenScale,
+                                  layout::VectorLayout layoutPerTokenScale, GM_ADDR gmD, layout::RowMajor layoutD,
+                                  GM_ADDR gmDequantScale, layout::VectorLayout layoutDequantScale, GM_ADDR gmWorkspace,
+                                  GM_ADDR gmX, GM_ADDR debugGm, GM_ADDR gmexpertIds, GM_ADDR gmExpandIdx,
+                                  GM_ADDR gmEpSendCount, GM_ADDR gmResvered, uint32_t epRankSize, uint32_t epRankId,
+                                  uint32_t moeExpertNum, uint32_t moeExpertNumPerRank, uint32_t sharedExpertNum,
+                                  uint32_t sharedExpertRankNum, uint32_t quantMode, uint32_t globalBs, uint32_t bs,
+                                  uint32_t topK)
 {
     using ArchTag = Arch::AtlasA2;
     using DispatchPolicy = DispatchPolicy_;
@@ -149,7 +148,8 @@ CATLASS_DEVICE void GmmDeqSwigluQuant(GemmCoord problemShape, uint32_t groupCoun
                                           sharedExpertRankNum,
                                           quantMode,
                                           globalBs,
-                                          bs};
+                                          bs,
+                                          topK};
     // call a kernel
     GemmKernel gemm;
     gemm(params);
@@ -178,11 +178,11 @@ CATLASS_DEVICE void GmmDeqSwigluQuant(GemmCoord problemShape, uint32_t groupCoun

 template <TemplateMC2TypeClass, class L1TileShape_, class L0TileShape_, class EpilogueTileShape_, class BlockScheduler_,
           class DispatchPolicy_ = MmadAtlasA2Custom>
-CATLASS_DEVICE void GmmDeq(GemmCoord problemShape, uint32_t groupCount, GM_ADDR gmGroupList, GM_ADDR gmA,
-                           layout::RowMajor layoutA, GM_ADDR gmB, layout::nZ layoutB, GM_ADDR gmScale,
-                           layout::VectorLayout layoutScale, GM_ADDR gmPerTokenScale,
-                           layout::VectorLayout layoutPerTokenScale, GM_ADDR gmD, layout::RowMajor layoutD,
-                           GM_ADDR gmWorkspace, void *combiner)
+ACT_DEVICE void GmmDeq(GemmCoord problemShape, uint32_t groupCount, GM_ADDR gmGroupList, GM_ADDR gmA,
+                       layout::RowMajor layoutA, GM_ADDR gmB, layout::nZ layoutB, GM_ADDR gmScale,
+                       layout::VectorLayout layoutScale, GM_ADDR gmPerTokenScale,
+                       layout::VectorLayout layoutPerTokenScale, GM_ADDR gmD, layout::RowMajor layoutD,
+                       GM_ADDR gmWorkspace, void *combiner)
 {
     using ArchTag = Arch::AtlasA2;
     using DispatchPolicy = DispatchPolicy_;
@@ -196,7 +196,7 @@ CATLASS_DEVICE void GmmDeq(GemmCoord problemShape, uint32_t groupCount, GM_ADDR
     using BlockMmad = Gemm::Block::BlockMmad<DispatchPolicy, L1TileShape, L0TileShape, AType, BType, CType>;

     constexpr uint32_t ubStages = 1;
-    using EpilogueDispatchPolicy = Catlass::Epilogue::EpilogueAtlasA2PerTokenDequant<ubStages>;
+    using EpilogueDispatchPolicy = Epilogue::EpilogueAtlasA2PerTokenDequant<ubStages, EXEC_FLAG>;
     using ScaleType = Gemm::GemmType<float, layout::VectorLayout>;
     using PerTokenScaleType = Gemm::GemmType<float, layout::VectorLayout>;
     using DType = Gemm::GemmType<ExpandXType, layout::RowMajor>;
@@ -214,23 +214,20 @@ CATLASS_DEVICE void GmmDeq(GemmCoord problemShape, uint32_t groupCount, GM_ADDR
     using TileCopy = Epilogue::Tile::TileCopy<ArchTag, CType, ScaleType, PerTokenScaleType, DType>;
     using TileScheduler = Epilogue::Tile::EpilogueHorizontalTileSwizzle;

-    using BlockEpilogue =
-        Catlass::Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy, CType, ScaleType, PerTokenScaleType, DType,
-                                                TileRowBroadcastMul, TileBroadcastOneBlk, TileOneBlkColumnBroadcastMul,
-                                                TileCopy, TileScheduler>;
+    using BlockEpilogue = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy, CType, ScaleType, PerTokenScaleType,
+                                                         DType, TileRowBroadcastMul, TileBroadcastOneBlk,
+                                                         TileOneBlkColumnBroadcastMul, TileCopy, TileScheduler>;

     using BlockScheduler = BlockScheduler_;

     // kernel level
     using ElementGroupList = int64_t;
-    using GemmKernel =
-        Gemm::Kernel::GroupedMatmulSliceMPerTokenDequantMultiStageWorkspace<BlockMmad, BlockEpilogue, BlockScheduler,
-                                                                            WORKSPACE_STAGES, ElementGroupList>;
+    using GemmKernel = Gemm::Kernel::GroupedMatmulSliceMPerTokenDequantMultiStageWorkspace<
+        TemplateMC2TypeFunc, BlockMmad, BlockEpilogue, BlockScheduler, WORKSPACE_STAGES, ElementGroupList>;

     typename GemmKernel::Params params{
         problemShape, groupCount, gmGroupList, gmA, layoutA, gmB, layoutB, gmScale,
-        layoutScale, gmPerTokenScale, layoutPerTokenScale, gmD, layoutD, gmWorkspace,
-    };
+        layoutScale, gmPerTokenScale, layoutPerTokenScale, gmD, layoutD, gmWorkspace, combiner};

     // call a kernel
     GemmKernel gemm;
@@ -282,6 +279,7 @@ class FusedDeepMoe
     uint32_t quantMode_{0};
     uint32_t globalBs_{0};
     uint32_t bs_{0};
+    uint32_t maxBs_{0};
     uint32_t topK_{0};

     AscendC::TPipe *tpipe_{nullptr};
@@ -324,12 +322,13 @@ __aicore__ inline void FusedDeepMoe<TemplateMC2TypeFunc>::Init(
     globalBs_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.globalBs;
     bs_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.bs;
     topK_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.k;
+    maxBs_ = globalBs_ / epRankSize_;

     bool isShareExpert = (epRankId_ < sharedExpertRankNum_);
     if (isShareExpert) {
-        m_ = bs_ * epRankSize_ / sharedExpertRankNum_;
+        m_ = maxBs_ * epRankSize_ / sharedExpertRankNum_;
     } else {
-        m_ = bs_ * epRankSize_ * (topK_ < moeExpertNumPerRank_ ? topK_ : moeExpertNumPerRank_);
+        m_ = maxBs_ * epRankSize_ * (topK_ < moeExpertNumPerRank_ ? topK_ : moeExpertNumPerRank_);
     }

     n_ = GMM1_HIDDEN_SIZE;
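
The hunk above decouples the GMM1 row bound from the batch actually submitted this step: m_ is now sized from maxBs_ = globalBs_ / epRankSize_, the largest per-rank batch, rather than the current bs_. A sketch of the sizing arithmetic, mirroring Init() above with illustrative inputs (the real values come from the tiling data):

    #include <algorithm>
    #include <cstdint>

    // Worst-case GMM1 row count for one rank, as computed in Init() above.
    uint32_t Gmm1Rows(uint32_t globalBs, uint32_t epRankSize, uint32_t epRankId,
                      uint32_t sharedExpertRankNum, uint32_t topK, uint32_t moeExpertNumPerRank)
    {
        uint32_t maxBs = globalBs / epRankSize;  // per-rank batch upper bound
        if (epRankId < sharedExpertRankNum) {
            // shared-expert ranks: the full global batch divided among them
            return maxBs * epRankSize / sharedExpertRankNum;
        }
        // MoE ranks: rows scale with min(topK, moeExpertNumPerRank), as in the hunk above
        return maxBs * epRankSize * std::min(topK, moeExpertNumPerRank);
    }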
@@ -421,8 +420,7 @@ __aicore__ inline void FusedDeepMoe<TemplateMC2TypeFunc>::Process()
                           layoutPerTokenScale1, gmX2, layoutX2, gmPerTokenScale2, layoutPerTokenScale2,
                           gmWorkspace, gmX_, gmSmoothScales_, gmexpertIds_, gmExpandIdx, gmEpSendCount,
                           gmResvered, epRankSize_, epRankId_, moeExpertNum_, moeExpertNumPerRank_,
-                          sharedExpertNum_, sharedExpertRankNum_, quantMode_, globalBs_, bs_);
-
+                          sharedExpertNum_, sharedExpertRankNum_, quantMode_, globalBs_, bs_, topK_);
 #ifdef ENABLE_GMM2_COMBINE
     AscendC::PipeBarrier<PIPE_ALL>();
     Arch::CrossCoreFlag gmm1AivFinished{0};

csrc/deepep/ops/utils/.DS_Store

-6 KB
Binary file not shown.

csrc/deepep/ops/utils/op_kernel/operator/cam_moe_distribute_combine/op_kernel/a3/cam_moe_distribute_combine.h

Lines changed: 10 additions & 5 deletions
@@ -8,6 +8,7 @@
 */
 #ifndef CAM_MOE_DISTRIBUTE_COMBINE_H
 #define CAM_MOE_DISTRIBUTE_COMBINE_H
+#define OPT_RANK_OFFSET 512

 #include "kernel_operator.h"
 #include "kernel_tiling/kernel_tiling.h"
@@ -29,7 +30,6 @@ constexpr uint64_t WIN_STATE_OFFSET = 512 * 1024;
 constexpr uint64_t STATE_WIN_OFFSET = 900 * 1024;
 constexpr uint16_t SEND_SYNC_EVENT_ID = 9;
 constexpr uint16_t RECV_SYNC_EVENT_ID = 10;
-constexpr uint32_t OPT_RANK_OFFSET = 512;

 template <AscendC::HardEvent event>
 __aicore__ inline void SyncFunc()
@@ -246,7 +246,11 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::Init(
         selfDataStatusTensor[coreIdx_ * UB_ALIGN]);
     __asm__ __volatile__("");
     dataState_ = selfDataStatusTensor(coreIdx_ * UB_ALIGN);
-    selfDataStatusTensor(coreIdx_ * UB_ALIGN) = 1 - dataState_;
+    if (dataState_ == 0) {
+        selfDataStatusTensor(coreIdx_ * UB_ALIGN) = 1;
+    } else {
+        selfDataStatusTensor(coreIdx_ * UB_ALIGN) = 0;
+    }
     __asm__ __volatile__("");
     DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
         selfDataStatusTensor[coreIdx_ * UB_ALIGN]);
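
For a flag that only ever holds 0 or 1, the new branch writes the same ping-pong sequence as the old 1 - dataState_; it just makes the two states explicit. A minimal equivalent, under that 0/1 assumption:

    #include <cstdint>

    // Ping-pong flag flip as in Init() above; for values in {0, 1} both
    // the branch and the subtraction produce the same result.
    uint32_t NextDataState(uint32_t dataState)
    {
        return (dataState == 0) ? 1 : 0;  // equivalent to: 1 - dataState
    }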
@@ -372,15 +376,16 @@ template <TemplateMC2TypeClass>
 __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::AlltoAllBuffInit()
 {
     tpipe_->Reset();
+    uint32_t bsMulTopkSizeAligned = Ceil(axisBS_ * axisK_ * sizeof(int32_t), UB_ALIGN) * UB_ALIGN;  // prevent unaligned UB buffers
     tpipe_->InitBuffer(readStateBuf_, UB_ALIGN);
     tpipe_->InitBuffer(statusBuf_, sendRankNum_ * UB_ALIGN);
-    tpipe_->InitBuffer(expertIdsBuf_, axisBS_ * axisK_ * sizeof(int32_t));
-    tpipe_->InitBuffer(expandScalesBuf_, axisBS_ * axisK_ * sizeof(float));
+    tpipe_->InitBuffer(expertIdsBuf_, bsMulTopkSizeAligned);
+    tpipe_->InitBuffer(expandScalesBuf_, bsMulTopkSizeAligned);
     tpipe_->InitBuffer(tokenBuf_, axisH_ * sizeof(ExpandXType));
     tpipe_->InitBuffer(rowTmpFloatBuf_, axisHFloatSize_);  // 7168 * 4 = 28672
     tpipe_->InitBuffer(mulBuf_, axisHFloatSize_);          // 7168 * 4 = 28672
     tpipe_->InitBuffer(sumFloatBuf_, axisHFloatSize_);     // 7168 * 4 = 28672
-    tpipe_->InitBuffer(indexCountsBuf_, axisBS_ * axisK_ * sizeof(int32_t));
+    tpipe_->InitBuffer(indexCountsBuf_, bsMulTopkSizeAligned);
     tpipe_->InitBuffer(moeSumQueue_, BUFFER_NUM, axisHExpandXTypeSize_);
     tpipe_->InitBuffer(gatherMaskOutBuf_, epWorldSize_ * sizeof(float));
     tpipe_->InitBuffer(gatherTmpBuf_, sizeof(uint32_t));  // 4
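
The new bsMulTopkSizeAligned rounds the BS * K byte count up to a multiple of UB_ALIGN before InitBuffer, so batch/top-k combinations whose product is not UB-aligned no longer produce misaligned unified-buffer allocations. A sketch of the round-up idiom, assuming Ceil is the usual divide-rounding-up helper and a 32-byte UB_ALIGN (both assumptions for this sketch, not values taken from this file):

    #include <cstdint>

    constexpr uint32_t UB_ALIGN = 32;  // assumed here; use the kernel's real constant

    // Round a byte count up to the next UB_ALIGN boundary, mirroring
    // Ceil(n, UB_ALIGN) * UB_ALIGN in AlltoAllBuffInit() above.
    constexpr uint32_t AlignUp(uint32_t bytes)
    {
        return (bytes + UB_ALIGN - 1) / UB_ALIGN * UB_ALIGN;
    }

    static_assert(AlignUp(96) == 96, "already aligned: unchanged");
    static_assert(AlignUp(100) == 128, "25 int32 entries round up to a full 32-byte line");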
