 from vllm_ascend.ops.fused_moe import select_experts
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendSocVersion,
                                dispose_tensor, get_ascend_soc_version,
-                               npu_stream_switch, npu_wait_tensor)
+                               npu_stream_switch, npu_wait_tensor,
+                               super_kernel)

 CHUNK_SIZE: int = ascend_envs.VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE

@@ -853,125 +854,130 @@ def apply(
         shared_experts: Optional[Any] = None,
         quantized_x_for_share: Optional[Any] = None,
         dynamic_scale_for_share: Optional[Any] = None,
+        prefix: str = "",
         **kwargs,
     ) -> torch.Tensor:
         assert router_logits.shape[
             1] == global_num_experts, "Number of global experts mismatch"
-
-        # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
-        if global_num_experts == 256:
-            topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
-                router_logits,
-                k=top_k,  # top_k is currently set to 8
-                bias=e_score_correction_bias,
-                k_group=topk_group,  # fixed: 4
-                group_count=num_expert_group,  # fixed: 8
-                group_select_mode=1,  # 0: max within the group; 1: topk2.sum (fixed)
-                renorm=0,  # 0: softmax->topk (fixed); 1: topk->softmax
-                norm_type=1,  # 0: softmax; 1: sigmoid (fixed)
-                # out_flag=False,  # todo new api; whether to emit the third output
-                # y2_flag=False,  # old api; whether to emit the third output
-                routed_scaling_factor=1,
-                eps=float(1e-20))
-        else:
-            topk_weights, topk_ids = select_experts(
-                hidden_states=x,
-                router_logits=router_logits,
-                top_k=top_k,
-                use_grouped_topk=use_grouped_topk,
-                renormalize=renormalize,
-                topk_group=topk_group,
-                num_expert_group=num_expert_group,
-                custom_routing_function=custom_routing_function,
-                scoring_func=scoring_func,
-                e_score_correction_bias=e_score_correction_bias,
-            )
-
-        fused_moe_state = get_forward_context().fused_moe_state
-        shared_gate_up, shared_dequant_scale = None, None
-        if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
-            with npu_stream_switch("moe_secondary", 0):
-                npu_wait_tensor(quantized_x_for_share, router_logits)
-                share_up_out, _ = shared_experts.gate_up_proj(
-                    (quantized_x_for_share, dynamic_scale_for_share))
-                shared_gate_up, shared_dequant_scale = share_up_out[
-                    0], share_up_out[1]
-
-        # this is a naive implementation for experts load balance so as
-        # to avoid accumulating too many tokens on a single rank.
-        # currently it is only activated when doing profile runs.
-        if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
-
-        topk_weights = topk_weights.to(x.dtype)
-
-        if fused_moe_state == FusedMoEState.MC2:
-            return fused_experts_with_mc2(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                w1_scale=layer.w13_weight_scale_fp32,
-                w2_scale=layer.w2_weight_scale,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                top_k=top_k,
-                expert_map=expert_map,
-                moe_all_to_all_group_name=self.moe_all_to_all_group_name,
-                log2phy=log2phy,
-                global_redundant_expert_num=global_redundant_expert_num,
-                shared_experts=shared_experts,
-                is_torchair=self.torchair_graph_enabled,
-                quantized_x_for_share=shared_gate_up,
-                dynamic_scale_for_share=shared_dequant_scale,
-                mc2_mask=kwargs.get("mc2_mask", None))
-        elif fused_moe_state == FusedMoEState.MC2_PREFILL:
-            return fused_prefill_experts_with_mc2(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                w1_scale=layer.w13_weight_scale_fp32,
-                w2_scale=layer.w2_weight_scale,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                top_k=top_k,
-                expert_map=expert_map,
-                moe_all_to_all_group_name=self.moe_all_to_all_group_name,
-                log2phy=log2phy,
-                global_redundant_expert_num=global_redundant_expert_num,
-                shared_experts=shared_experts,
-                is_torchair=self.torchair_graph_enabled,
-                quantized_x_for_share=shared_gate_up,
-                dynamic_scale_for_share=shared_dequant_scale,
-                mc2_mask=kwargs.get("mc2_mask", None))
-        elif fused_moe_state == FusedMoEState.AllGather:
-            return fused_experts(hidden_states=x,
-                                 w1=layer.w13_weight,
-                                 w1_scale=layer.w13_weight_scale,
-                                 w2=layer.w2_weight,
-                                 w2_scale=layer.w2_weight_scale,
-                                 topk_weights=topk_weights,
-                                 topk_ids=topk_ids,
-                                 top_k=top_k,
-                                 expert_map=expert_map)
-        else:
-            # The current implementation of deepseek moe splits hidden_states
-            # according to tp_size before they are fed into the fused_moe module.
-            # Therefore, all2all is needed no matter how dp/tp is set so as to
-            # dispatch/combine tokens.
-            return fused_experts_with_all2all(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w1_scale=layer.w13_weight_scale,
-                w2=layer.w2_weight,
-                w2_scale=layer.w2_weight_scale,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                top_k=top_k,
-                expert_map=expert_map,
-                ep_group=self.ep_group,
-                log2phy=log2phy,
-                global_redundant_expert_num=global_redundant_expert_num,
-            )
+        if shared_experts is not None:
+            router_logits = router_logits.float()
+        with super_kernel(prefix,
+                          "stream-fusion=1",
+                          enabled=shared_experts is not None):
+            # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
+            if global_num_experts == 256:
+                topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
+                    router_logits,
+                    k=top_k,  # top_k is currently set to 8
+                    bias=e_score_correction_bias,
+                    k_group=topk_group,  # fixed: 4
+                    group_count=num_expert_group,  # fixed: 8
+                    group_select_mode=1,  # 0: max within the group; 1: topk2.sum (fixed)
+                    renorm=0,  # 0: softmax->topk (fixed); 1: topk->softmax
+                    norm_type=1,  # 0: softmax; 1: sigmoid (fixed)
+                    # out_flag=False,  # todo new api; whether to emit the third output
+                    # y2_flag=False,  # old api; whether to emit the third output
+                    routed_scaling_factor=1,
+                    eps=float(1e-20))
+            else:
+                topk_weights, topk_ids = select_experts(
+                    hidden_states=x,
+                    router_logits=router_logits,
+                    top_k=top_k,
+                    use_grouped_topk=use_grouped_topk,
+                    renormalize=renormalize,
+                    topk_group=topk_group,
+                    num_expert_group=num_expert_group,
+                    custom_routing_function=custom_routing_function,
+                    scoring_func=scoring_func,
+                    e_score_correction_bias=e_score_correction_bias,
+                )
+
+            fused_moe_state = get_forward_context().fused_moe_state
+            shared_gate_up, shared_dequant_scale = None, None
+            if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
+                with npu_stream_switch("moe_secondary", 0):
+                    npu_wait_tensor(quantized_x_for_share, router_logits)
+                    share_up_out, _ = shared_experts.gate_up_proj(
+                        (quantized_x_for_share, dynamic_scale_for_share))
+                    shared_gate_up, shared_dequant_scale = share_up_out[
+                        0], share_up_out[1]
+
+            # this is a naive implementation for experts load balance so as
+            # to avoid accumulating too many tokens on a single rank.
+            # currently it is only activated when doing profile runs.
+            if enable_force_load_balance:
+                topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
+
+            topk_weights = topk_weights.to(x.dtype)
+
+            if fused_moe_state == FusedMoEState.MC2:
+                return fused_experts_with_mc2(
+                    hidden_states=x,
+                    w1=layer.w13_weight,
+                    w2=layer.w2_weight,
+                    w1_scale=layer.w13_weight_scale_fp32,
+                    w2_scale=layer.w2_weight_scale,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    top_k=top_k,
+                    expert_map=expert_map,
+                    moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+                    log2phy=log2phy,
+                    global_redundant_expert_num=global_redundant_expert_num,
+                    shared_experts=shared_experts,
+                    is_torchair=self.torchair_graph_enabled,
+                    quantized_x_for_share=shared_gate_up,
+                    dynamic_scale_for_share=shared_dequant_scale,
+                    mc2_mask=kwargs.get("mc2_mask", None))
+            elif fused_moe_state == FusedMoEState.MC2_PREFILL:
+                return fused_prefill_experts_with_mc2(
+                    hidden_states=x,
+                    w1=layer.w13_weight,
+                    w2=layer.w2_weight,
+                    w1_scale=layer.w13_weight_scale_fp32,
+                    w2_scale=layer.w2_weight_scale,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    top_k=top_k,
+                    expert_map=expert_map,
+                    moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+                    log2phy=log2phy,
+                    global_redundant_expert_num=global_redundant_expert_num,
+                    shared_experts=shared_experts,
+                    is_torchair=self.torchair_graph_enabled,
+                    quantized_x_for_share=shared_gate_up,
+                    dynamic_scale_for_share=shared_dequant_scale,
+                    mc2_mask=kwargs.get("mc2_mask", None))
+            elif fused_moe_state == FusedMoEState.AllGather:
+                return fused_experts(hidden_states=x,
+                                     w1=layer.w13_weight,
+                                     w1_scale=layer.w13_weight_scale,
+                                     w2=layer.w2_weight,
+                                     w2_scale=layer.w2_weight_scale,
+                                     topk_weights=topk_weights,
+                                     topk_ids=topk_ids,
+                                     top_k=top_k,
+                                     expert_map=expert_map)
+            else:
+                # The current implementation of deepseek moe splits hidden_states
+                # according to tp_size before they are fed into the fused_moe module.
+                # Therefore, all2all is needed no matter how dp/tp is set so as to
+                # dispatch/combine tokens.
+                return fused_experts_with_all2all(
+                    hidden_states=x,
+                    w1=layer.w13_weight,
+                    w1_scale=layer.w13_weight_scale,
+                    w2=layer.w2_weight,
+                    w2_scale=layer.w2_weight_scale,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    top_k=top_k,
+                    expert_map=expert_map,
+                    ep_group=self.ep_group,
+                    log2phy=log2phy,
+                    global_redundant_expert_num=global_redundant_expert_num,
+                )

     def process_weights_after_loading(self, layer):
         if self.transpose_weight:
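
The diff above wraps the MoE gating/dispatch block in a `super_kernel` scope so it can be fused when shared experts are present. Below is a minimal, hypothetical stand-in (not the real `vllm_ascend.utils.super_kernel`) that only mirrors the calling convention visible in the diff, so the control flow can be traced off-device; the `prefix` string, the stub name, and the routing placeholder are illustrative assumptions.

# Hypothetical stand-in for vllm_ascend.utils.super_kernel, kept only to
# illustrate the calling convention used in the diff above; the real helper
# opens an NPU super-kernel fusion scope instead of printing.
from contextlib import contextmanager


@contextmanager
def super_kernel_stub(prefix, option, enabled=True):
    # When disabled (e.g. no shared experts), act as a plain no-op context.
    if not enabled:
        yield
        return
    print(f"enter super-kernel scope {prefix!r} ({option})")
    try:
        yield
    finally:
        print(f"exit super-kernel scope {prefix!r}")


# Usage mirroring the patched apply(): fuse routing only when shared experts exist.
shared_experts = object()  # placeholder; any non-None value enables the scope
with super_kernel_stub("model.layers.3.mlp", "stream-fusion=1",
                       enabled=shared_experts is not None):
    pass  # npu_moe_gating_top_k / select_experts would run here on-device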