 from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
                                                   AlltoAllCommImpl, MC2CommImpl,
                                                   NaiveMulticastCommImpl)
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, is_310p,
+                               npu_stream_switch, npu_wait_stream)
 
 original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
 
@@ -426,24 +427,35 @@ def __init__(
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
+        self.shared_expert_stream = None
+        ascend_config = get_ascend_config()
+        self.enable_multistream_moe = ascend_config.enable_multistream_moe
+        if self.enable_multistream_moe:
+            self.shared_expert_stream = torch.npu.Stream()
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        shared_out = self._shared_experts(hidden_states)
-
-        # NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
-        forward_context = get_forward_context()
-        moe_comm_method_name = forward_context.moe_comm_method_name
-        if moe_comm_method_name in {"alltoallcommimpl", "mc2commimpl"}:
-            shared_out = tensor_model_parallel_all_reduce(shared_out)
+        # Make sure the shared experts stream begins after hidden_states are ready.
+        npu_wait_stream(self.shared_expert_stream, torch.npu.current_stream(), enabled=self.enable_multistream_moe)
+        with npu_stream_switch(self.shared_expert_stream, enabled=self.enable_multistream_moe):
+            # Use a separate stream to run shared experts.
+            shared_out = self._shared_experts(hidden_states)
+
+            # NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
+            forward_context = get_forward_context()
+            moe_comm_method_name = forward_context.moe_comm_method_name
+            if moe_comm_method_name in {"alltoallcommimpl", "mc2commimpl"}:
+                shared_out = tensor_model_parallel_all_reduce(shared_out)
 
         fused_out = super().forward(
             hidden_states=hidden_states,
             router_logits=router_logits,
         )
+        # Make sure the default stream waits for the shared experts stream to finish.
+        npu_wait_stream(torch.npu.current_stream(), self.shared_expert_stream, enabled=self.enable_multistream_moe)
         return shared_out, fused_out
 
 
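For context, the sketch below shows the overlap pattern from this diff in isolation; it is not the vllm-ascend implementation. It assumes npu_stream_switch and npu_wait_stream behave like the stream_switch/wait_stream helpers defined here (no-ops when enabled=False, otherwise a stream switch and a stream-to-stream wait), and it uses torch.cuda streams as a stand-in for torch.npu so it runs without an Ascend device. The OverlappedShared module and its layer names are hypothetical.

# Minimal sketch of the multistream overlap pattern; helper names, the
# OverlappedShared module, and the use of torch.cuda streams are stand-ins,
# not the vllm-ascend implementation.
import contextlib

import torch


@contextlib.contextmanager
def stream_switch(stream, *, enabled: bool):
    """Run enclosed ops on `stream` when enabled; otherwise do nothing."""
    if not enabled or stream is None:
        yield
        return
    with torch.cuda.stream(stream):
        yield


def wait_stream(waiting, waited, *, enabled: bool):
    """Make `waiting` wait for all work already queued on `waited`."""
    if enabled and waiting is not None and waited is not None:
        waiting.wait_stream(waited)


class OverlappedShared(torch.nn.Module):
    """Toy module: `shared_mlp` runs on a side stream while `routed_mlp`
    stays on the default stream, so the two can overlap."""

    def __init__(self, hidden: int, enable_overlap: bool):
        super().__init__()
        self.shared_mlp = torch.nn.Linear(hidden, hidden)
        self.routed_mlp = torch.nn.Linear(hidden, hidden)
        self.enable_overlap = enable_overlap
        self.side_stream = torch.cuda.Stream() if enable_overlap else None

    def forward(self, x: torch.Tensor):
        # Side stream must not start before `x` is ready on the default stream.
        wait_stream(self.side_stream, torch.cuda.current_stream(),
                    enabled=self.enable_overlap)
        with stream_switch(self.side_stream, enabled=self.enable_overlap):
            shared_out = self.shared_mlp(x)
        # Runs on the default stream, overlapping with the side stream above.
        routed_out = self.routed_mlp(x)
        # Default stream must not read `shared_out` before the side stream is done.
        wait_stream(torch.cuda.current_stream(), self.side_stream,
                    enabled=self.enable_overlap)
        return shared_out, routed_out


if __name__ == "__main__" and torch.cuda.is_available():
    model = OverlappedShared(64, enable_overlap=True).cuda()
    x = torch.randn(8, 64, device="cuda")
    shared, routed = model(x)
    torch.cuda.synchronize()
    print(shared.shape, routed.shape)

The two waits are the key design point: the first keeps the side stream from reading the input before the default stream has produced it, and the second keeps the default stream from returning the shared-expert output before the side stream has finished writing it; everything queued in between is free to overlap.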