 from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
                                                   AlltoAllCommImpl, MC2CommImpl,
                                                   NaiveMulticastCommImpl)
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, is_310p,
-                               npu_stream_switch, npu_wait_stream)
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, npu_stream_switch
 
 
 original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
 
@@ -439,8 +438,10 @@ def forward(
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Make sure the shared experts stream begins after hidden_states are ready.
-        npu_wait_stream(self.shared_expert_stream, torch.npu.current_stream(), enabled=self.multistream_overlap_shared_expert)
-        with npu_stream_switch(self.shared_expert_stream, enabled=self.multistream_overlap_shared_expert):
+        if self.multistream_overlap_shared_expert:
+            self.shared_expert_stream.wait_stream(torch.npu.current_stream())
+        with npu_stream_switch(self.shared_expert_stream,
+                               enabled=self.multistream_overlap_shared_expert):
             # Use a separate stream to run shared experts.
             shared_out = self._shared_experts(hidden_states)
 
@@ -455,7 +456,8 @@ def forward(
             router_logits=router_logits,
         )
         # Make sure the default stream waits for the shared experts stream to finish.
-        npu_wait_stream(torch.npu.current_stream(), self.shared_expert_stream, enabled=self.multistream_overlap_shared_expert)
+        if self.multistream_overlap_shared_expert:
+            torch.npu.current_stream().wait_stream(self.shared_expert_stream)
         return shared_out, fused_out
 
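For reference, the synchronization pattern this diff settles on is: the shared-expert stream waits for the default stream before reading hidden_states, the shared experts run under that side stream while the routed experts proceed on the default stream, and the default stream waits for the side stream before the outputs are returned. Below is a minimal sketch of that pattern, assuming torch_npu is installed (so torch.npu streams are available) and reusing the npu_stream_switch helper kept in the import above; the standalone function and the shared_experts/routed_experts callables are illustrative names, not part of the patch.

import torch
import torch_npu  # noqa: F401  # assumed to register the torch.npu backend
from vllm_ascend.utils import npu_stream_switch


def overlapped_shared_expert(shared_experts, routed_experts, hidden_states,
                             shared_expert_stream, overlap_enabled=True):
    if overlap_enabled:
        # The side stream must not start until hidden_states is ready on the
        # default stream (replaces the removed npu_wait_stream(side, default, ...)).
        shared_expert_stream.wait_stream(torch.npu.current_stream())
    with npu_stream_switch(shared_expert_stream, enabled=overlap_enabled):
        # Shared experts run on the side stream when overlap is enabled.
        shared_out = shared_experts(hidden_states)
    # Routed experts keep running on the default stream, overlapping the above.
    fused_out = routed_experts(hidden_states)
    if overlap_enabled:
        # The default stream must not read shared_out until the side stream is
        # done (replaces the removed npu_wait_stream(default, side, ...)).
        torch.npu.current_stream().wait_stream(shared_expert_stream)
    return shared_out, fused_out

Calling wait_stream directly on the stream objects drops the need for the npu_wait_stream wrapper, and guarding those calls with the overlap flag keeps the single-stream path free of extra synchronization.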