
Commit 89015dc

fix all-reduce accuracy bug

Signed-off-by: whx-sjtu <2952154980@qq.com>
1 parent 14618f0

2 files changed: 6 additions, 3 deletions

vllm_ascend/models/glm4_moe.py (5 additions, 2 deletions)

@@ -29,7 +29,8 @@
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tp_group, split_tensor_along_last_dim,
-    tensor_model_parallel_reduce_scatter)
+    tensor_model_parallel_reduce_scatter,
+    tensor_model_parallel_all_reduce)
 from vllm.model_executor.models.glm4_moe import Glm4MoeForCausalLM, Glm4MoeDecoderLayer, Glm4MoeModel, Glm4MoeAttention, Glm4MoeMLP
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
 from vllm.forward_context import get_forward_context
@@ -93,7 +94,7 @@ def __init__(
             intermediate_size=intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
-            reduce_results=True,
+            reduce_results=False,
             prefix=f"{prefix}.shared_experts",
         )
     else:
@@ -131,6 +132,8 @@ def forward(
         hidden_states = (
             experts_hidden_states[0] * self.routed_scaling_factor +
             experts_hidden_states[1])
+        if self.tp_size > 1:
+            hidden_states = tensor_model_parallel_all_reduce(hidden_states)

         return hidden_states

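Why deferring the reduction is safe: a sum all-reduce is linear, so reducing each expert branch separately and then combining gives the same result as combining the per-rank partials first and reducing once, which also removes one collective from this path. A minimal sketch simulating that equivalence with numpy (the variable names, scale value, and the simulated all_reduce are illustrative, not vLLM code):

import numpy as np

RANKS = 4
rng = np.random.default_rng(0)
# Hypothetical per-rank partial outputs of the routed and shared experts.
routed = [rng.standard_normal(8) for _ in range(RANKS)]
shared = [rng.standard_normal(8) for _ in range(RANKS)]
scale = 2.5  # stands in for self.routed_scaling_factor

def all_reduce(partials):
    # Simulates a sum all-reduce across tensor-parallel ranks.
    return sum(partials)

# Old flow: each branch reduced on its own, then combined on every rank.
per_branch = all_reduce(routed) * scale + all_reduce(shared)
# New flow: combine per-rank partials first, reduce once at the end.
single = all_reduce([r * scale + s for r, s in zip(routed, shared)])

assert np.allclose(per_branch, single)  # same result, one collective fewer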
vllm_ascend/ops/fused_moe.py (1 addition, 1 deletion)

@@ -1454,7 +1454,7 @@ def forward(
         else:
             final_hidden_states = e_hidden_states

-        if tp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
+        if self.reduce_results and (tp_size > 1 and fused_moe_state == FusedMoEState.AllGather):
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)
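The second change makes the fused-MoE op honor its reduce_results flag instead of always reducing in the AllGather state. A hedged sketch of the gating pattern, with illustrative names (FusedMoESketch, is_all_gather_state, and the injected all_reduce are stand-ins, not the vllm_ascend API):

# The op reduces internally only when built with reduce_results=True; a
# caller that passes False (as the GLM-4 shared experts now do) gets back
# per-rank partials and performs the single all-reduce itself after
# combining branches.
class FusedMoESketch:
    def __init__(self, reduce_results: bool = True):
        self.reduce_results = reduce_results

    def forward(self, final_hidden_states, tp_size, is_all_gather_state,
                all_reduce):
        # Mirrors the changed condition: the pre-existing tp_size and
        # AllGather checks still apply, but only when reduce_results is set.
        if self.reduce_results and (tp_size > 1 and is_all_gather_state):
            final_hidden_states = all_reduce(final_hidden_states)
        return final_hidden_states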
