39 | 39 | from vllm.distributed import (get_dp_group, get_pp_group,
40 | 40 |                               get_tensor_model_parallel_rank,
41 | 41 |                               get_tensor_model_parallel_world_size,
42 |    | -                             tensor_model_parallel_reduce_scatter,
43 |    | -                             split_tensor_along_last_dim, get_tp_group)
   | 42 | +                             get_tp_group, split_tensor_along_last_dim,
   | 43 | +                             tensor_model_parallel_reduce_scatter)
44 | 44 | from vllm.forward_context import get_forward_context
45 | 45 | from vllm.model_executor.layers.activation import SiluAndMul
46 | 46 | from vllm.model_executor.layers.layernorm import RMSNorm

68 | 68 | from vllm.sequence import IntermediateTensors
69 | 69 |
70 | 70 | from vllm_ascend.ascend_config import get_ascend_config
   | 71 | +from vllm_ascend.attention.attention_v1 import AscendAttentionState
71 | 72 | from vllm_ascend.ops.fused_moe import AscendFusedMoE
72 | 73 | from vllm_ascend.quantization.quant_config import AscendLinearMethod
73 | 74 | from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
@@ -539,9 +540,24 @@ def forward(
539 | 540 |         else:
540 | 541 |             hidden_states_or_q_c = hidden_states
541 | 542 |         is_mtp_model = attn_metadata is not None and attn_metadata.is_mtp_model
542 |     | -       if self.torchair_graph_enabled and not is_mtp_model:
    | 543 | +       with_decode = attn_metadata is not None and attn_metadata.attn_state in [
    | 544 | +           AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
    | 545 | +       ]
    | 546 | +       with_optimization_prefill = self.enable_prefill_optimizations and not with_decode
    | 547 | +       if self.torchair_graph_enabled and not is_mtp_model and not with_optimization_prefill:
    | 548 | +           if self.enable_prefill_optimizations and self.debug_layer_idx > 3 and self.debug_layer_idx < 61:
    | 549 | +               hidden_states_or_q_c = get_tp_group().all_gather(
    | 550 | +                   hidden_states_or_q_c, 0)
    | 551 | +               hidden_states = get_tp_group().all_gather(hidden_states, 0)
543 | 552 |         if envs.VLLM_USE_V1:
544 |     | -           output_shape = hidden_states.shape
    | 553 | +           if not self.enable_prefill_optimizations or self.debug_layer_idx < 3:
    | 554 | +               output_shape = hidden_states.shape
    | 555 | +           else:
    | 556 | +               num_tokens = hidden_states.shape[0]
    | 557 | +               rows = num_tokens // self.tp_size
    | 558 | +               if num_tokens % self.tp_size:
    | 559 | +                   rows += 1
    | 560 | +               output_shape = (rows, hidden_states.shape[1])
545 | 561 |             output = torch.empty(output_shape,
546 | 562 |                                  dtype=hidden_states_or_q_c.dtype,
547 | 563 |                                  device=hidden_states_or_q_c.device)
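Note on the new output_shape branch above: when the prefill-optimization path is taken, each tensor-parallel rank keeps only its reduce-scattered slice of the token dimension, so the output buffer is sized by a ceil division of the token count by the TP world size. Below is a minimal standalone sketch of just that sizing rule; tp_size and the tensor shapes here are illustrative assumptions, not values taken from this PR.

# Minimal sketch (not part of the PR): per-rank output sizing after a
# reduce-scatter over the token dimension, mirroring the diff above.
import torch

def reduce_scatter_rows(hidden_states: torch.Tensor, tp_size: int) -> tuple:
    # rows = ceil(num_tokens / tp_size); uneven token counts are rounded up.
    num_tokens = hidden_states.shape[0]
    rows = num_tokens // tp_size
    if num_tokens % tp_size:
        rows += 1
    return (rows, hidden_states.shape[1])

# Example: 10 tokens across 4 ranks -> every rank allocates 3 rows.
print(reduce_scatter_rows(torch.empty(10, 4096), tp_size=4))  # (3, 4096)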