Commit fc83278

refactor moe_comm_method selection process
Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
1 parent 723d460 commit fc83278

10 files changed: 92 additions, 75 deletions


tests/ut/ops/test_fused_ops.py

Lines changed: 2 additions & 1 deletion
@@ -23,6 +23,7 @@
 from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase

 from tests.ut.base import TestBase
+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
                                        AscendUnquantizedFusedMoEMethod)
 from vllm_ascend.ops.moe.experts_selector import select_experts
@@ -497,7 +498,7 @@ def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant,
                                                      mock_get_forward_context):

         mock_forward_context = MagicMock()
-        mock_forward_context.moe_comm_method_name = "mc2commimpl"
+        mock_forward_context.moe_comm_type = MoECommType.MC2
         mock_get_forward_context.return_value = mock_forward_context

         mock_is_310p.return_value = False

tests/ut/worker/test_model_runner_v1.py

Lines changed: 10 additions & 9 deletions
@@ -15,6 +15,7 @@

 import pytest

+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.utils import AscendSocVersion
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

@@ -24,21 +25,21 @@
     "soc_version, enable_expert_parallel, world_size, num_tokens, mc2_tokens_capacity, quant_type, expected_method",
     [
         # Case 1: Expert parallel is disabled, should always be 'allgather'
-        (AscendSocVersion.A2, False, 8, 100, 256, None, "allgather"),
-        (AscendSocVersion.A3, False, 16, 500, 256, None, "allgather"),
+        (AscendSocVersion.A2, False, 8, 100, 256, None, MoECommType.ALLGATHER),
+        (AscendSocVersion.A3, False, 16, 500, 256, None, MoECommType.ALLGATHER),

         # Case 2: A2 SOC with w4a8_dynamic -> use alltoall when not mc2
-        (AscendSocVersion.A2, True, 8, 100, 256, "w4a8_dynamic", "alltoall"),
-        (AscendSocVersion.A2, True, 16, 257, 256, "w4a8_dynamic", "alltoall"),
-        (AscendSocVersion.A2, True, 16, 100, 256, "w4a8_dynamic", "mc2"),  # meets mc2 condition
+        (AscendSocVersion.A2, True, 8, 100, 256, "w4a8_dynamic", MoECommType.ALLTOALL),
+        (AscendSocVersion.A2, True, 16, 257, 256, "w4a8_dynamic", MoECommType.ALLTOALL),
+        (AscendSocVersion.A2, True, 16, 100, 256, "w4a8_dynamic", MoECommType.MC2),  # meets mc2 condition

         # Case 3: A2 SOC without w4a8_dynamic -> fallback to allgather
-        (AscendSocVersion.A2, True, 8, 100, 256, None, "allgather"),
-        (AscendSocVersion.A2, True, 16, 257, 256, None, "allgather"),
+        (AscendSocVersion.A2, True, 8, 100, 256, None, MoECommType.ALLGATHER),
+        (AscendSocVersion.A2, True, 16, 257, 256, None, MoECommType.ALLGATHER),

         # Case 4: A3 SOC
-        (AscendSocVersion.A3, True, 8, 100, 256, None, "mc2"),
-        (AscendSocVersion.A3, True, 8, 257, 256, None, "alltoall"),
+        (AscendSocVersion.A3, True, 8, 100, 256, None, MoECommType.MC2),
+        (AscendSocVersion.A3, True, 8, 257, 256, None, MoECommType.ALLGATHER),
     ])
 # yapf: enable
 def test_select_moe_comm_method(soc_version, enable_expert_parallel,

vllm_ascend/ascend_forward_context.py

Lines changed: 16 additions & 2 deletions
@@ -22,6 +22,16 @@ class FusedMoEState(Enum):
     All2AllSeq = 5


+class MoECommType(Enum):
+    ALLGATHER = "AllGather"
+    MC2 = "MC2"
+    ALLTOALL = "AlltoAll"
+    NAIVE_MULTICAST = "NaiveMulticast"
+
+    def __str__(self):
+        return self.value + "CommImpl"
+
+
 # TODO(zzzzwwjj): add soc_version to choose branch
 def _get_fused_moe_state(ep_size: int, with_prefill: bool,
                          is_deepseek_v3_r1: bool):
@@ -52,7 +62,7 @@ def set_ascend_forward_context(
         with_prefill: bool = True,
         in_profile_run: bool = False,
         reserved_mc2_mask: Optional[torch.Tensor] = None,
-        moe_comm_method: str = "",
+        moe_comm_type: Optional[MoECommType] = None,
         num_actual_tokens: Optional[int] = None,
         aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
         batch_descriptor: Optional[BatchDescriptor] = None,
@@ -72,7 +82,11 @@ def set_ascend_forward_context(
             batch_descriptor=batch_descriptor,
     ):
        forward_context = get_forward_context()
-        forward_context.moe_comm_method_name = moe_comm_method + "commimpl"
+
+        from vllm_ascend.ops.moe.moe_comm_method import get_moe_comm_method
+        forward_context.moe_comm_type = moe_comm_type
+        forward_context.moe_comm_method = get_moe_comm_method(moe_comm_type)
+
        forward_context.with_prefill = with_prefill
        tp_world_size = get_tensor_model_parallel_world_size()
        ep_size = (get_ep_group().world_size if
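
The enum's __str__ is what ties the new type to the existing implementation classes: str(MoECommType.MC2) evaluates to "MC2CommImpl", the class name that the registry in moe_comm_method.py is keyed by, and set_ascend_forward_context now stores both the enum and the resolved method on the forward context. A minimal standalone sketch of that name mapping (the enum body is copied from the hunk above; the loop at the bottom is purely illustrative):

from enum import Enum


class MoECommType(Enum):
    """Mirrors the enum added in vllm_ascend/ascend_forward_context.py."""
    ALLGATHER = "AllGather"
    MC2 = "MC2"
    ALLTOALL = "AlltoAll"
    NAIVE_MULTICAST = "NaiveMulticast"

    def __str__(self):
        # The string form doubles as the registry key / impl class name.
        return self.value + "CommImpl"


for comm_type in MoECommType:
    # e.g. MC2 -> "MC2CommImpl", NAIVE_MULTICAST -> "NaiveMulticastCommImpl"
    print(comm_type.name, "->", str(comm_type))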

vllm_ascend/ops/common_fused_moe.py

Lines changed: 7 additions & 18 deletions
@@ -29,14 +29,13 @@
     FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)

 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
                                               determine_default_log2phy_map)
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.moe.experts_selector import select_experts
-from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
-                                                 AlltoAllCommImpl, MC2CommImpl,
-                                                 NaiveMulticastCommImpl)
+from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
 from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p

 original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
@@ -276,13 +275,7 @@ def __init__(self, *args, **kwargs):
         if self.dynamic_eplb:
             self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)

-        for method in {
-                AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl,
-                NaiveMulticastCommImpl
-        }:
-            setattr(
-                self, method.__name__.lower(),
-                method(moe_config=self.moe_config))  # type: ignore[abstract]
+        setup_moe_comm_method(self.moe_config)

     def update_expert_map(self, new_expert_map):
         self.expert_map = new_expert_map
@@ -306,8 +299,8 @@ def maybe_all_reduce_tensor_model_parallel(
         outputs since each rank only has partial outputs.
         """
         forward_context = get_forward_context()
-        moe_comm_method_name = forward_context.moe_comm_method_name
-        if moe_comm_method_name in {"alltoallcommimpl", "mc2commimpl"}:
+        moe_comm_type = forward_context.moe_comm_type
+        if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2}:
             return final_hidden_states
         else:
             return tensor_model_parallel_all_reduce(final_hidden_states)
@@ -317,10 +310,6 @@ def forward_impl(self, hidden_states: torch.Tensor,
         assert self.quant_method is not None

         forward_context = get_forward_context()
-        moe_comm_method_name = forward_context.moe_comm_method_name
-
-        forward_context.moe_comm_method = getattr(self, moe_comm_method_name)
-
         hidden_states, router_logits = forward_context.moe_comm_method.prepare(
             hidden_states=hidden_states, router_logits=router_logits)

@@ -436,8 +425,8 @@ def forward(

         # NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
         forward_context = get_forward_context()
-        moe_comm_method_name = forward_context.moe_comm_method_name
-        if moe_comm_method_name in {"alltoallcommimpl", "mc2commimpl"}:
+        moe_comm_type = forward_context.moe_comm_type
+        if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2}:
             shared_out = tensor_model_parallel_all_reduce(shared_out)

         fused_out = super().forward(
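
For the layer code, the practical effect is that forward_impl no longer looks the implementation up on self via getattr; it is resolved once when the forward context is set and simply read back here, and the string comparisons against "alltoallcommimpl"/"mc2commimpl" become enum membership checks. A rough, self-contained sketch of the new call shape, using a SimpleNamespace and a stub class as hypothetical stand-ins for the real forward context and MC2CommImpl:

from types import SimpleNamespace


class _StubCommImpl:
    # Hypothetical stand-in for a comm impl such as MC2CommImpl;
    # only the prepare() hook used by forward_impl() is sketched.
    def prepare(self, hidden_states, router_logits):
        return hidden_states, router_logits


# set_ascend_forward_context() now fills in both fields up front ...
forward_context = SimpleNamespace(
    moe_comm_type="MC2",              # MoECommType.MC2 in the real code
    moe_comm_method=_StubCommImpl(),  # resolved via get_moe_comm_method()
)

# ... so the layer's forward path only reads them; no getattr(self, name) anymore.
hidden_states, router_logits = forward_context.moe_comm_method.prepare(
    hidden_states=[1.0, 2.0], router_logits=[0.3, 0.7])
print(hidden_states, router_logits)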

vllm_ascend/ops/fused_moe.py

Lines changed: 2 additions & 13 deletions
@@ -41,9 +41,7 @@
                                           determine_default_log2phy_map)
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.moe.experts_selector import select_experts
-from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
-                                                 AlltoAllCommImpl, MC2CommImpl,
-                                                 NaiveMulticastCommImpl)
+from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
 from vllm_ascend.ops.sequence_parallel import MetadataForPadding
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                                get_all_reduce_merge_state,
@@ -329,13 +327,7 @@ def __init__(
         self.moe_config.mc2_group = get_mc2_group()
         self.moe_config.num_global_redundant_experts = self.global_redundant_expert_num

-        for method in {
-                AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl,
-                NaiveMulticastCommImpl
-        }:
-            setattr(
-                self, method.__name__.lower(),
-                method(moe_config=self.moe_config))  # type: ignore[abstract]
+        setup_moe_comm_method(self.moe_config)

     def update_expert_map(self, new_expert_map):
         self.expert_map = new_expert_map
@@ -402,9 +394,6 @@ def forward(self,
             mc2_mask = chunk_mc2_mask[tp_rank]
             replace_allreduce = True

-        moe_comm_method_name = forward_context.moe_comm_method_name
-        forward_context.moe_comm_method = getattr(self, moe_comm_method_name)
-
         hidden_states, router_logits = forward_context.moe_comm_method.prepare(
             hidden_states=hidden_states,
             router_logits=router_logits,

vllm_ascend/ops/moe/moe_comm_method.py

Lines changed: 21 additions & 1 deletion
@@ -13,14 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
+from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Dict, Optional

 import torch
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe import FusedMoEConfig

+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.ops.moe.fused_moe_prepare_and_finalize import (
     FusedMoEPrepareAndFinalizeWithAll2All,
     FusedMoEPrepareAndFinalizeWithAllGather, FusedMoEPrepareAndFinalizeWithMC2,
@@ -30,6 +32,24 @@
     TokenDispatcherWithAllGather,
     TokenDispatcherWithMC2)

+_MoECommMethods: Dict[str, MoECommMethod] = {}
+
+
+def _register_moe_comm_method(moe_comm_method: MoECommMethod):
+    _MoECommMethods[moe_comm_method.__class__.__name__] = moe_comm_method
+
+
+def get_moe_comm_method(
+        name: Optional[MoECommType]) -> Optional[MoECommMethod]:
+    return _MoECommMethods.get(str(name))
+
+
+def setup_moe_comm_method(moe_config):
+    _register_moe_comm_method(AlltoAllCommImpl(moe_config))
+    _register_moe_comm_method(AllGatherCommImpl(moe_config))
+    _register_moe_comm_method(MC2CommImpl(moe_config))
+    _register_moe_comm_method(NaiveMulticastCommImpl(moe_config))
+

 class MoECommMethod(ABC):
     """Base class for MoE communication methods."""

vllm_ascend/ops/moe/moe_mlp.py

Lines changed: 2 additions & 1 deletion
@@ -21,6 +21,7 @@
 from torch.nn.functional import pad
 from vllm.forward_context import get_forward_context

+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.utils import dispose_tensor, is_310p


@@ -76,7 +77,7 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
         bias1, bias2 = None, None
         _output_dtype = w2_scale.dtype

-    is_mc2 = get_forward_context().moe_comm_method_name == "mc2commimpl"
+    is_mc2 = get_forward_context().moe_comm_type == MoECommType.MC2
     if w1_scale_bias is None and is_mc2:
         if w1_scale.dtype != torch.float32:
             w1_scale = w1_scale.to(torch.float32)

vllm_ascend/spec_decode/eagle_proposer.py

Lines changed: 6 additions & 6 deletions
@@ -117,11 +117,11 @@ def dummy_run(self,
                   skip_attn: bool = False,
                   num_reqs: int = 0,
                   num_tokens_across_dp: Optional[torch.Tensor] = None):
-        moe_comm_method = self.runner._select_moe_comm_method(
+        moe_comm_type = self.runner._select_moe_comm_method(
            num_tokens, with_prefill)
        with set_ascend_forward_context(None,
                                        self.vllm_config,
-                                        moe_comm_method=moe_comm_method,
+                                        moe_comm_type=moe_comm_type,
                                        num_tokens=num_tokens):
            self.model(
                input_ids=self.input_ids[:num_tokens],
@@ -454,7 +454,7 @@ def _propose(
        with_prefill = attn_metadata.attn_state not in [
            AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
        ]
-        moe_comm_method = self.runner._select_moe_comm_method(
+        moe_comm_type = self.runner._select_moe_comm_method(
            num_input_tokens, with_prefill)

        # copy inputs to buffer for cudagraph
@@ -463,7 +463,7 @@
        attn_metadata.block_tables = block_table.to(device)
        with set_ascend_forward_context(attn_metadata,
                                        self.vllm_config,
-                                        moe_comm_method=moe_comm_method,
+                                        moe_comm_type=moe_comm_type,
                                        num_tokens=num_input_tokens):
            last_hidden_states, hidden_states = self.model(
                input_ids=self.input_ids[:num_input_tokens],
@@ -495,7 +495,7 @@
        else:
            input_batch_size = batch_size

-        moe_comm_method = self.runner._select_moe_comm_method(
+        moe_comm_type = self.runner._select_moe_comm_method(
            input_batch_size, False)

        attn_metadata.num_actual_tokens = batch_size
@@ -568,7 +568,7 @@
        # Run the model.
        with set_ascend_forward_context(attn_metadata,
                                        self.vllm_config,
-                                        moe_comm_method=moe_comm_method,
+                                        moe_comm_type=moe_comm_type,
                                        num_tokens=input_batch_size):

            last_hidden_states, hidden_states = self.model(

vllm_ascend/spec_decode/mtp_proposer.py

Lines changed: 4 additions & 4 deletions
@@ -113,7 +113,7 @@ def dummy_run(self,
         _) = self.runner._sync_metadata_across_dp(num_tokens,
                                                   with_prefill, False)

-        moe_comm_method = self.runner._select_moe_comm_method(
+        moe_comm_type = self.runner._select_moe_comm_method(
            num_tokens, with_prefill)

        is_running_torchair = self.torchair_graph_enabled and \
@@ -146,7 +146,7 @@ def dummy_run(self,
                with_prefill=with_prefill,
                num_tokens_across_dp=num_tokens_across_dp,
                reserved_mc2_mask=self.runner.reserved_mc2_mask,
-                moe_comm_method=moe_comm_method,
+                moe_comm_type=moe_comm_type,
                in_profile_run=self.runner.in_profile_run,
                num_actual_tokens=0):
            if is_running_torchair:
@@ -416,7 +416,7 @@ def _propose(
        num_tokens_across_dp = self.runner.num_tokens_across_dp
        with_prefill = self.runner.with_prefill

-        moe_comm_method = self.runner._select_moe_comm_method(
+        moe_comm_type = self.runner._select_moe_comm_method(
            num_input_tokens, with_prefill)

        for step in range(self.num_speculative_tokens):
@@ -427,7 +427,7 @@
                with_prefill=with_prefill,
                num_tokens_across_dp=num_tokens_across_dp,
                reserved_mc2_mask=self.runner.reserved_mc2_mask,
-                moe_comm_method=moe_comm_method,
+                moe_comm_type=moe_comm_type,
                in_profile_run=self.runner.in_profile_run,
                num_actual_tokens=num_tokens):
            with ProfileExecuteDuration().capture_async('mtp_forward'):