from vllm_ascend.quantization.quant_config import AscendLinearMethod
from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
from vllm_ascend.utils import dispose_tensor
+ from vllm_ascend.attention.attention_v1 import AscendAttentionState

VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2

@@ -502,7 +503,9 @@ def forward(
            positions: torch.Tensor,
            hidden_states: torch.Tensor,
            kv_cache: Optional[torch.Tensor] = None,
-             attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
+             attn_metadata: Optional[AttentionMetadata] = None,
+             rotary_cos: Optional[torch.Tensor] = None,
+             rotary_sin: Optional[torch.Tensor] = None) -> torch.Tensor:
        if self.q_lora_rank is not None:
            ckq = self.q_a_proj(hidden_states)[0]
            hidden_states_or_q_c = self.q_a_layernorm(ckq)
@@ -516,6 +519,8 @@ def forward(
                                 dtype=hidden_states_or_q_c.dtype,
                                 device=hidden_states_or_q_c.device)
            forward_kwargs['output'] = output
+             forward_kwargs['rotary_cos'] = rotary_cos
+             forward_kwargs['rotary_sin'] = rotary_sin

            output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
@@ -607,6 +612,8 @@ def forward(
        residual: Optional[torch.Tensor],
        kv_cache: Optional[torch.Tensor] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
+         rotary_cos: Optional[torch.Tensor] = None,
+         rotary_sin: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Self Attention
        if residual is None:
@@ -626,6 +633,8 @@ def forward(
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            attn_metadata=attn_metadata,
+             rotary_cos=rotary_cos,
+             rotary_sin=rotary_sin,
        )

        if hidden_states.dtype == torch.float16:
@@ -703,9 +712,43 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))

+         ascend_config = get_ascend_config()
+         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+
+         rope_theta = getattr(config, "rope_theta", 10000)
+         rope_scaling = getattr(config, "rope_scaling", None)
+         max_position_embeddings = getattr(config, "max_position_embeddings",
+                                           8192)
+         if rope_scaling:
+             rope_scaling["rope_type"] = 'deepseek_yarn'
+         self.rotary_emb = get_rope(config.qk_rope_head_dim,
+                                    rotary_dim=config.qk_rope_head_dim,
+                                    max_position=max_position_embeddings,
+                                    base=rope_theta,
+                                    rope_scaling=rope_scaling,
+                                    is_neox_style=False)
+
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

+     def prepare_decoder_rotary_cos_sin(
+             self, attn_metadata: Optional[AttentionMetadata] = None
+     ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+         if (attn_metadata is not None and attn_metadata.num_decodes is not None and
+                 attn_metadata.attn_state):
+             has_decode = attn_metadata.num_decodes > 0
+             running_in_graph = self.torchair_graph_enabled and attn_metadata.attn_state in [
+                 AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding]
+             if has_decode and running_in_graph:
+                 cos = self.rotary_emb.cos_cached
+                 sin = self.rotary_emb.sin_cached
+                 cos = cos[attn_metadata.decode.input_positions]
+                 sin = sin[attn_metadata.decode.input_positions]
+                 cos = cos[:, None, None, :]
+                 sin = sin[:, None, None, :]
+                 return cos, sin
+         return None, None
+
    def forward(
        self,
        input_ids: torch.Tensor,
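A quick note on what the helper added in the hunk above is doing: it only gathers rows of the cached rotary tables at the decode positions and adds two broadcast dimensions, so every decoder layer can rotate its rope portion without touching the tables again. The sketch below is a minimal, self-contained illustration of that pattern, not the vllm-ascend implementation; the table layout (each frequency duplicated across its even/odd pair, as interleaved is_neox_style=False rotation expects), the tensor shapes, and the q_pe name are all assumptions, and the deepseek_yarn scaling applied by the real get_rope is ignored.

import torch

# Assumed sizes for illustration only.
max_position, rope_dim = 1024, 64
num_decode_tokens, num_heads = 4, 16

# Stand-ins for rotary_emb.cos_cached / rotary_emb.sin_cached: one row per
# position, each frequency duplicated across its even/odd pair.
inv_freq = 1.0 / (10000 ** (torch.arange(0, rope_dim, 2).float() / rope_dim))
angles = torch.outer(torch.arange(max_position).float(), inv_freq)
cos_cached = angles.cos().repeat_interleave(2, dim=-1)  # [max_position, rope_dim]
sin_cached = angles.sin().repeat_interleave(2, dim=-1)  # [max_position, rope_dim]

# What prepare_decoder_rotary_cos_sin does, schematically: one gather per step,
# plus broadcast dims, instead of one gather per decoder layer.
input_positions = torch.randint(0, max_position, (num_decode_tokens,))
cos = cos_cached[input_positions][:, None, None, :]  # [tokens, 1, 1, rope_dim]
sin = sin_cached[input_positions][:, None, None, :]  # [tokens, 1, 1, rope_dim]

def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    """(x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...): interleaved rotation helper."""
    return torch.stack((-x[..., 1::2], x[..., ::2]), dim=-1).flatten(-2)

# A layer can then rotate the rope portion of its queries/keys by broadcasting
# against the shared cos/sin tensors (assumed [tokens, heads, 1, rope_dim] here).
q_pe = torch.randn(num_decode_tokens, num_heads, 1, rope_dim)
q_rotated = q_pe * cos + rotate_every_two(q_pe) * sin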
@@ -726,13 +769,17 @@ def forward(
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]

+         # With torchair graph mode and the v1 engine, precomputing cos and sin
+         # once here avoids recomputing them in every decode layer.
+         rotary_cos, rotary_sin = self.prepare_decoder_rotary_cos_sin(attn_metadata)
+
        for i in range(self.start_layer, self.end_layer):
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions, hidden_states, residual,
                kv_caches[i -
                          self.start_layer] if kv_caches is not None else None,
-                 attn_metadata)
+                 attn_metadata, rotary_cos, rotary_sin)

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
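The comment added in this last hunk captures the intent of the whole change: with the torchair graph enabled on the v1 engine, the per-step cos/sin lookup moves out of the decoder layers and happens once in the model forward. The toy sketch below shows just that plumbing in isolation; the classes are hypothetical stand-ins, not the real vllm-ascend modules, and the attention body is stubbed out.

from typing import Optional, Tuple
import torch

class ToyAttention:
    def forward(self, hidden_states: torch.Tensor,
                rotary_cos: Optional[torch.Tensor] = None,
                rotary_sin: Optional[torch.Tensor] = None) -> torch.Tensor:
        # With cos/sin supplied (graph decode path) the impl can skip its own
        # table lookup; with None it would fall back to computing them itself,
        # mirroring the Optional defaults added in the diff.
        return hidden_states

class ToyDecoderLayer:
    def __init__(self) -> None:
        self.self_attn = ToyAttention()

    def forward(self, hidden_states: torch.Tensor,
                rotary_cos: Optional[torch.Tensor] = None,
                rotary_sin: Optional[torch.Tensor] = None) -> torch.Tensor:
        # The layer only threads the precomputed tensors through to attention.
        return self.self_attn.forward(hidden_states,
                                      rotary_cos=rotary_cos,
                                      rotary_sin=rotary_sin)

class ToyModel:
    def __init__(self, num_layers: int = 4) -> None:
        self.layers = [ToyDecoderLayer() for _ in range(num_layers)]

    def prepare_decoder_rotary_cos_sin(
            self) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
        # Placeholder for the single gather shown in the diff.
        return torch.ones(1, 1, 1, 64), torch.zeros(1, 1, 1, 64)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # One gather per forward pass ...
        rotary_cos, rotary_sin = self.prepare_decoder_rotary_cos_sin()
        # ... reused by every layer instead of being recomputed num_layers times.
        for layer in self.layers:
            hidden_states = layer.forward(hidden_states,
                                          rotary_cos=rotary_cos,
                                          rotary_sin=rotary_sin)
        return hidden_states

ToyModel().forward(torch.randn(2, 8))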