@@ -54,7 +54,7 @@ def permute(
         topk_weights: torch.Tensor,
         expert_map: torch.Tensor,
         num_experts: int,
-        use_a8: bool,
+        apply_a8_quantization: bool,
     ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         """Pre-process before MLP.
 
@@ -65,6 +65,7 @@ def permute(
             expert_map (torch.Tensor): Tensor of shape (global_num_experts, )
                 Mapping from global expert IDs to local expert IDs.
             num_experts (int): Number of local experts (experts on this device).
+            apply_a8_quantization (bool): Whether to apply A8 quantization (W4A8 and W8A8).
 
         Returns:
-            tuple[torch.Tensor, torch.Tensor, int]: Return a tuple containing:
+            tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]: Return a tuple containing:
@@ -73,6 +74,8 @@ def permute(
                 hidden_states based on topk_ids.
             - expert_tokens (torch.Tensor): Tensor of shape (num_experts, )
                 Number of tokens assigned to each expert.
+            - dynamic_scale (torch.Tensor, optional): Tensor of shape (num_experts, )
+                Dynamic scale for each expert, used for quantization.
             - group_list_type (int): Type of group list, 0 for `cumsum`
                 and 1 for `count`. This is mainly for `npu_grouped_matmul`
                 to determine how to handle the output.
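
The `cumsum`/`count` values named in this docstring are two encodings of the same routing information. A minimal sketch of the difference, assuming plain PyTorch tensors (the real values come from the NPU dispatch kernels, and per the docstring it is `npu_grouped_matmul` that interprets the flag):

```python
import torch

# Hypothetical routing: 6 tokens across 4 local experts.
topk_ids = torch.tensor([0, 2, 2, 1, 0, 3])

# group_list_type == 1 ("count"): number of tokens assigned to each expert.
counts = torch.bincount(topk_ids, minlength=4)  # tensor([2, 1, 2, 1])

# group_list_type == 0 ("cumsum"): running end offsets per expert,
# i.e. the prefix sum of the counts.
offsets = torch.cumsum(counts, dim=0)           # tensor([2, 3, 5, 6])
```
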
@@ -160,7 +163,7 @@ def permute(
         topk_weights: torch.Tensor,
         expert_map: torch.Tensor,  # noqa: F841
         num_experts: int,
-        use_a8: bool,
+        apply_a8_quantization: bool,
     ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         num_tokens = hidden_states.shape[0]
 
@@ -221,7 +224,7 @@ def permute(
         topk_weights: torch.Tensor,
         expert_map: torch.Tensor,
         num_experts: int,
-        use_a8: bool,
+        apply_a8_quantization: bool,
     ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         num_tokens = hidden_states.shape[0]
 
@@ -378,7 +381,7 @@ def permute(
         topk_weights: torch.Tensor,
         expert_map: torch.Tensor,
         num_experts: int,
-        use_a8: bool,
+        apply_a8_quantization: bool,
     ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         # Store tensors needed for post_process
         self.topk_ids = topk_ids
@@ -392,7 +395,7 @@ def permute(
             "moe_expert_num": self.moe_config.num_experts,
             "global_bs": 0,
             "scales": None,
-            "quant_mode": 2 if use_a8 else 0,
+            "quant_mode": 2 if apply_a8_quantization else 0,
             "group_ep": self.mc2_comm_name,
             "ep_world_size": self.moe_config.ep_size,
             "ep_rank_id": self.moe_config.ep_rank,
@@ -536,13 +539,15 @@ def permute(
         topk_weights: torch.Tensor,
         expert_map: torch.Tensor,
         num_experts: int,
-        use_a8: bool,
+        apply_a8_quantization: bool,
     ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
-        results = self.token_dispatcher.token_dispatch(hidden_states,
-                                                       topk_weights,
-                                                       topk_ids,
-                                                       None,
-                                                       log2phy=None)
+        results = self.token_dispatcher.token_dispatch(
+            hidden_states,
+            topk_weights,
+            topk_ids,
+            None,
+            log2phy=None,
+            with_quant=apply_a8_quantization)
         return results["hidden_states"], results["group_list"], results[
             "dynamic_scale"], results["group_list_type"]
 
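
With this change, all `permute` variants share one four-element return contract, so callers can unpack them uniformly. A hypothetical usage sketch (the `dispatcher` object and the leading positional parameters are assumptions; only the trailing parameters appear in this diff):

```python
permuted_states, group_list, dynamic_scale, group_list_type = dispatcher.permute(
    hidden_states,
    topk_ids,
    topk_weights,
    expert_map,
    num_experts,
    apply_a8_quantization=True,  # W4A8/W8A8 path
)
# On the unquantized path (apply_a8_quantization=False), dynamic_scale is None.
```
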