 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
+from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
 from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch

@@ -654,43 +655,67 @@ def _forward_prefill(
     ) -> torch.Tensor:
         assert attn_metadata.prefill is not None
         assert len(kv_c_and_k_pe_cache) > 1
+        query = torch.cat([q_nope, q_pe], dim=-1)
         num_tokens = q_nope.size(0)
         attn_output = torch.empty(num_tokens,
                                   self.num_heads,
                                   self.v_head_dim,
-                                  dtype=q_nope.dtype,
-                                  device=q_nope.device)
-        if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
-            query = torch.cat((q_nope, q_pe), dim=-1)
-            key = torch.cat((k_nope, k_pe), dim=-1)
-            torch_npu._npu_flash_attention(
+                                  dtype=query.dtype,
+                                  device=query.device)
+        k_pe = k_pe.expand((*k_nope.shape[:-1], -1))
+        # Only two kinds of input are possible here: chunked prefill or PrefillNoCache
+        ascend_config = get_ascend_config()
+
+        if attn_metadata.attn_state in [
+                AscendAttentionState.ChunkedPrefill,
+                AscendAttentionState.SpecDecoding,
+                AscendAttentionState.PrefillCacheHit
+        ] and not ascend_config.chunked_prefill_for_mla:
+
+            attn_output_torch = torch.empty(num_tokens,
+                                            self.num_heads * self.v_head_dim,
+                                            dtype=query.dtype,
+                                            device=query.device)
+            # The current requests are chunked in prefill; disable flash attention for chunked prefill
+            vanilla_chunked_prefill_mla(
+                output=attn_output_torch,
                 query=query,
-                key=key,
-                value=value,
-                mask=attn_metadata.attn_mask,
-                seq_len=attn_metadata.prefill.context_lens,
-                scale_value=self.scale,
-                num_heads=self.num_heads,
-                num_kv_heads=self.num_heads,
-                out=attn_output)
-        else:
+                kv_cache=kv_c_and_k_pe_cache,
+                block_tables=attn_metadata.prefill.block_table,
+                query_lens=attn_metadata.prefill.query_lens,
+                context_lens=attn_metadata.prefill.context_lens,
+                kv_b_proj=self.kv_b_proj,
+                max_query_len=attn_metadata.prefill.max_query_len,
+                max_context_len=attn_metadata.prefill.max_seq_lens,
+                nope_dim=self.qk_nope_head_dim,
+                rope_dim=self.qk_rope_head_dim,
+                v_head_dim=self.v_head_dim,
+                scale=self.scale,
+                alibi_slopes=None,
+                causal=True)
+        elif attn_metadata.attn_state in [
+                AscendAttentionState.ChunkedPrefill,
+                AscendAttentionState.SpecDecoding,
+                AscendAttentionState.PrefillCacheHit
+        ]:
+            query = torch.cat([q_nope, q_pe], dim=-1)
             attn_lse = torch.empty(self.num_heads,
                                    num_tokens,
                                    dtype=torch.float32,
                                    device=q_nope.device)
-            self.prefill_mask = torch.triu(
-                torch.ones(512, 512, device=q_nope.device, dtype=q_nope.dtype),
+            mask = torch.triu(
+                torch.ones(512, 512, device=query.device, dtype=query.dtype),
                 1)  # 512: mask only support 512
             if attn_metadata.num_prefills > 1:
-                self.prefill_mask = self.prefill_mask.unsqueeze(0).repeat(
-                    attn_metadata.num_prefills, 1, 1)
+                mask = mask.unsqueeze(0).repeat(attn_metadata.num_prefills, 1,
+                                                1)
             torch_npu.atb.npu_ring_mla(
                 q_nope=q_nope,
                 q_rope=q_pe,
                 k_nope=k_nope,
                 k_rope=k_pe,
                 value=value,
-                mask=self.prefill_mask,
+                mask=mask,
                 seqlen=torch.tensor(attn_metadata.prefill.query_lens,
                                     dtype=torch.int32),
                 head_num=self.num_heads,
@@ -705,10 +730,34 @@ def _forward_prefill(
                 output=attn_output,
                 softmax_lse=attn_lse)
             attn_output, attn_lse = self._compute_prefill_context( \
-                q_nope, q_pe, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)
+                query, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)

+        elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
+            key = torch.cat((k_nope, k_pe), dim=-1)
+            torch_npu._npu_flash_attention(
+                query=query,
+                key=key,
+                value=value,
+                mask=attn_metadata.attn_mask,
+                seq_len=attn_metadata.prefill.context_lens,
+                scale_value=self.scale,
+                num_heads=self.num_heads,
+                num_kv_heads=self.num_heads,
+                out=attn_output)
+            attn_output = attn_output.view(-1, self.num_heads, self.v_head_dim)
+        else:
+            raise RuntimeError(
+                "Unexpected path reached, AscendMLAImpl should only have PrefillNoCache, PrefillCacheHit, ChunkedPrefill and SpecDecoding scenario in forward prefill, please file a bug to vllm-ascend !"
+            )
         attn_output = attn_output.reshape(
             [num_tokens, self.num_heads * self.v_head_dim])
+        if attn_metadata.attn_state in [
+                AscendAttentionState.ChunkedPrefill,
+                AscendAttentionState.SpecDecoding,
+                AscendAttentionState.PrefillCacheHit
+        ] and not ascend_config.chunked_prefill_for_mla:
+            attn_output = attn_output_torch
+
         return attn_output

     def exec_kv_decode(
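As a reviewer aid, here is a minimal, self-contained sketch of the dispatch this change introduces in _forward_prefill: chunked-prefill-like states fall back to vanilla_chunked_prefill_mla when chunked_prefill_for_mla is disabled, otherwise they use the fused ring-MLA kernel plus context merging, and PrefillNoCache keeps the plain flash-attention path. The AttnState enum and select_prefill_path helper below are illustrative stand-ins, not part of the vllm-ascend API.

from enum import Enum, auto


class AttnState(Enum):
    # Mirrors the AscendAttentionState values handled above (illustrative copy).
    ChunkedPrefill = auto()
    SpecDecoding = auto()
    PrefillCacheHit = auto()
    PrefillNoCache = auto()


# States that share the chunked-prefill handling in the new branches.
CHUNKED_LIKE = {
    AttnState.ChunkedPrefill,
    AttnState.SpecDecoding,
    AttnState.PrefillCacheHit,
}


def select_prefill_path(state: AttnState, chunked_prefill_for_mla: bool) -> str:
    """Return which kernel the prefill forward pass would use (names only)."""
    if state in CHUNKED_LIKE and not chunked_prefill_for_mla:
        # Pure-torch fallback added by this change.
        return "vanilla_chunked_prefill_mla"
    if state in CHUNKED_LIKE:
        # Fused ring-MLA kernel plus merging of the cached prefill context.
        return "npu_ring_mla + _compute_prefill_context"
    if state is AttnState.PrefillNoCache:
        # Plain flash attention over key = cat([k_nope, k_pe], dim=-1).
        return "_npu_flash_attention"
    raise RuntimeError("unexpected attention state in prefill")


if __name__ == "__main__":
    print(select_prefill_path(AttnState.ChunkedPrefill, chunked_prefill_for_mla=False))
    print(select_prefill_path(AttnState.SpecDecoding, chunked_prefill_for_mla=True))
    print(select_prefill_path(AttnState.PrefillNoCache, chunked_prefill_for_mla=True))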
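The new k_pe = k_pe.expand((*k_nope.shape[:-1], -1)) line broadcasts the single shared rope key across the head dimension without copying data. A small sketch, with toy shapes assumed purely for illustration:

import torch

# Toy shapes only: k_nope is [num_tokens, num_heads, qk_nope_head_dim],
# k_pe is one shared rope key per token, [num_tokens, 1, qk_rope_head_dim].
k_nope = torch.randn(4, 16, 128)
k_pe = torch.randn(4, 1, 64)

# Same expression as in the diff: expand k_pe to one copy per head (a view,
# nothing is duplicated), so the rope part can be concatenated per head.
k_pe = k_pe.expand((*k_nope.shape[:-1], -1))
key = torch.cat((k_nope, k_pe), dim=-1)

print(k_pe.shape)  # torch.Size([4, 16, 64])
print(key.shape)   # torch.Size([4, 16, 192])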