
Commit 2a2aef2

oagniqgnat authored
Fix the severe performance degradation issue of the top9 dispatch in normal mode compared to top8. (#117)
Co-authored-by: oagniqgnat <tangqingao@huawei.com>
1 parent dd005e3 commit 2a2aef2
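
The fix is mechanical and applied uniformly below: buffers that at::zeros used to allocate and zero-fill are switched to torch::empty, which only allocates. Presumably the zero-fill was pure overhead because the subsequent kernels (or host-side loops) overwrite every element before it is read. A minimal sketch of the pattern, assuming only libtorch; make_dispatch_output and its parameters are illustrative names, not code from this repo:

#include <torch/torch.h>

// at::zeros allocates *and* launches a fill; torch::empty only allocates.
// When a later kernel writes every element anyway, the fill is wasted work
// on each dispatch call.
torch::Tensor make_dispatch_output(int64_t num_tokens, int64_t hidden,
                                   const torch::Device &device) {
    // Before: auto out = at::zeros({num_tokens, hidden}, at::dtype(at::kInt).device(device));
    // After: allocation only; contents are undefined until written.
    return torch::empty({num_tokens, hidden}, at::dtype(at::kInt).device(device));
}

The trade-off is that a torch::empty buffer must be fully written before it is read; an uninitialized read would return garbage rather than zeros.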

File tree

2 files changed: +12 −12 lines

csrc/deepep/deep_ep.cpp

Lines changed: 11 additions & 11 deletions
@@ -77,7 +77,7 @@ Buffer::get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts, std:
     auto device = new_topk_idx.device();
     auto num_tokens_per_expert = at::zeros({num_experts}, at::dtype(at::kInt).device(device));
     auto num_tokens_per_rank = at::zeros({num_ranks}, at::dtype(at::kInt).device(device));
-    auto is_token_in_rank = at::zeros({num_tokens, num_ranks}, at::dtype(at::kInt).device(device));
+    auto is_token_in_rank = torch::empty({num_tokens, num_ranks}, at::dtype(at::kInt).device(device));
 
     EXEC_NPU_CMD(aclnnDispatchLayout, new_topk_idx, num_tokens, num_ranks, num_experts, num_topk, num_tokens_per_rank,
                  num_tokens_per_expert, is_token_in_rank);
@@ -183,11 +183,11 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>
 
     int send_per_group = 3; // (send_to_expert_num, send_to_expert_offset, send_rank_tokens)
 
-    auto send_data = at::zeros({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));
+    auto send_data = torch::empty({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));
     int64_t send_count = send_per_group * num_local_experts * num_ranks;
 
-    auto send_data_offset = at::zeros({num_experts}, at::dtype(at::kInt).device(x.device()));
-    at::Tensor recv_data = at::zeros({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));
+    auto send_data_offset = torch::empty({num_experts}, at::dtype(at::kInt).device(x.device()));
+    at::Tensor recv_data = torch::empty({num_experts * send_per_group}, at::dtype(at::kInt).device(x.device()));
 
     // get ep name
     char hcom_ep_name[HCOMM_NAME_LEN];
@@ -209,7 +209,7 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>
 
     auto options_cpu = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
     std::vector<int32_t> local_expert_acc(num_experts, 0);
-    auto send_token_idx_cpu = at::zeros({num_tokens, num_topk}, options_cpu);
+    auto send_token_idx_cpu = torch::empty({num_tokens, num_topk}, options_cpu);
     auto send_token_idx_ptr = send_token_idx_cpu.data_ptr<int>();
 
     auto topk_idx_cpu = new_topk_idx.to(at::kCPU);
@@ -227,8 +227,8 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>
 
     EP_HOST_ASSERT(recv_data.dim() == 1 and recv_data.is_contiguous());
     EP_HOST_ASSERT(recv_data.size(0) % num_experts == 0);
-    at::Tensor recv_offset_cpu = at::zeros({num_experts}, options_cpu);
-    at::Tensor recv_count_cpu = at::zeros({num_experts}, options_cpu);
+    at::Tensor recv_offset_cpu = torch::empty({num_experts}, options_cpu);
+    at::Tensor recv_count_cpu = torch::empty({num_experts}, options_cpu);
     auto recv_data_cpu = recv_data.to(at::kCPU);
     auto recv_data_ptr = recv_data_cpu.data_ptr<int>();
     auto recv_count_ptr = recv_count_cpu.data_ptr<int>();
@@ -269,10 +269,10 @@ Buffer::intranode_dispatch(const at::Tensor &x, const std::optional<at::Tensor>
     auto recv_count = recv_count_cpu.to(x.device());
 
     int num_recv_tokens = (total_recv_tokens == 0) ? 1 : total_recv_tokens;
-    auto expandx_out = use_quant ? at::zeros({num_recv_tokens, hidden}, at::dtype(at::kChar).device(x.device()))
-                                 : at::zeros({num_recv_tokens, hidden}, x.options());
-    auto dynamic_scales_out = at::zeros({num_recv_tokens}, at::dtype(at::kFloat).device(x.device()));
-    auto expand_idx_out = at::zeros({num_recv_tokens * 3}, at::dtype(at::kInt).device(x.device()));
+    auto expandx_out = use_quant ? torch::empty({num_recv_tokens, hidden}, at::dtype(at::kChar).device(x.device()))
+                                 : torch::empty({num_recv_tokens, hidden}, x.options());
+    auto dynamic_scales_out = torch::empty({num_recv_tokens}, at::dtype(at::kFloat).device(x.device()));
+    auto expand_idx_out = torch::empty({num_recv_tokens * 3}, at::dtype(at::kInt).device(x.device()));
 
     EXEC_NPU_CMD(aclnnCamMoeDispatchNormal, new_x, expert_ids, send_data_offset, send_token_idx, recv_offset,
                  recv_count, hcom_ep_name,
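
The swap also covers the CPU-side staging tensors (send_token_idx_cpu, recv_offset_cpu, recv_count_cpu), presumably safe because the dispatch path assigns every element through data_ptr before anything reads them. A sketch of that host-side pattern under the same assumption; count_per_expert and recv_data_vals are illustrative names, not the repo's code:

#include <torch/torch.h>
#include <vector>

// Mirrors the recv_count_cpu pattern above: torch::empty skips the memset,
// and the loop writes every slot exactly once before the tensor is used.
torch::Tensor count_per_expert(const std::vector<int> &recv_data_vals, int num_experts) {
    auto options_cpu = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
    auto recv_count_cpu = torch::empty({num_experts}, options_cpu);
    auto recv_count_ptr = recv_count_cpu.data_ptr<int>();
    for (int e = 0; e < num_experts; ++e) {
        recv_count_ptr[e] = recv_data_vals.at(e);  // every element written
    }
    return recv_count_cpu;
}

The device-side outputs (expandx_out, dynamic_scales_out, expand_idx_out) rely on the aclnnCamMoeDispatchNormal kernel to fill them, so their pre-kernel contents are never observed.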

tests/python/deepep/test_intranode.py

Lines changed: 1 addition & 1 deletion
@@ -245,7 +245,7 @@ def test_diagnose(
     for current_x in filter(lambda elem: elem is not None, (x_pure_rand, x)):
         if local_rank == 0:
             print(
-                f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, with top-k ...',
+                f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, with top-k {num_topk} ...',
                 flush=True,
             )
         dispatch_args = {
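
Including the actual top-k value in the log line makes top-8 and top-9 runs distinguishable in the test output, which matters when reproducing the regression this commit addresses.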
