Commit f4d4c0a

[V0.9.1] torchair_graph bugfix when chunked_prefill is true (#1748)

### What this PR does / why we need it?
When torchair_graph and chunked_prefill are both enabled, the decode kv_cache entries were not being saved; this patch also writes the decode portion of the kv_cache.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

---------
Signed-off-by: fems14 <1804143737@qq.com>
Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
Co-authored-by: SlightwindSec <slightwindsec@gmail.com>
1 parent a8294b6 commit f4d4c0a

File tree: 1 file changed, +13 −2 lines


vllm_ascend/attention/mla_v1.py

Lines changed: 13 additions & 2 deletions

```diff
@@ -1197,7 +1197,7 @@ def forward(
             prefill_hs, cos, sin, kv_cache,
             attn_metadata.slot_mapping[num_decode_tokens:])

-        kv_c_normed = prefill_k_nope[:num_actual_toks, ...]
+        kv_c_normed_prefill = prefill_k_nope[:num_actual_toks, ...]
         prefill_k_c_normed = prefill_k_nope
         prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads,
                                          -1)
@@ -1215,12 +1215,23 @@ def forward(
         ) > 0 and attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
             slots = attn_metadata.slot_mapping
             # NOTE: Separate the kv cache in advance to avoid OOM or other issues
-            torch_npu._npu_reshape_and_cache(key=kv_c_normed.view(
+            torch_npu._npu_reshape_and_cache(key=kv_c_normed_prefill.view(
                 num_tokens, self.num_kv_heads, -1),
                                              value=prefill_k_pe,
                                              key_cache=kv_cache[0],
                                              value_cache=kv_cache[1],
                                              slot_indices=slots)
+
+        if kv_cache[0].numel(
+        ) > 0 and attn_metadata.attn_state == AscendAttentionState.ChunkedPrefill and has_decode:
+            slots = attn_metadata.slot_mapping[:num_decode_tokens]
+            k_c_normed_decode = kv_c_normed[:num_decode_tokens]
+            torch_npu._npu_reshape_and_cache(key=k_c_normed_decode.view(
+                num_decode_tokens, self.num_kv_heads, -1),
+                                             value=decode_k_pe,
+                                             key_cache=kv_cache[0],
+                                             value_cache=kv_cache[1],
+                                             slot_indices=slots)
         else:
             kv_c_normed = kv_c_normed.view(
                 [num_actual_toks, self.num_kv_heads, -1])
```
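The intent of the hunk above can be illustrated with a minimal, self-contained sketch. This is plain Python, not the real vllm-ascend code: `reshape_and_cache` and `write_kv` are hypothetical stand-ins for `torch_npu._npu_reshape_and_cache` and the `forward` path. The key assumption, taken from the diff, is that decode tokens occupy the first `num_decode_tokens` entries of `slot_mapping`, so in a mixed chunked-prefill batch the decode kv entries need their own cache write in addition to the prefill write.

```python
# Sketch only (mock cache as Python lists); shows the slot_mapping split
# the bugfix relies on, not the actual NPU kernel behavior.

def reshape_and_cache(keys, values, key_cache, value_cache, slot_indices):
    """Scatter per-token key/value entries into flat cache slots."""
    for k, v, slot in zip(keys, values, slot_indices):
        key_cache[slot] = k
        value_cache[slot] = v

def write_kv(slot_mapping, kv_c_normed, k_pe, num_decode_tokens,
             key_cache, value_cache, chunked_prefill, has_decode):
    # Prefill tokens come after the decode tokens in the batch.
    reshape_and_cache(kv_c_normed[num_decode_tokens:],
                      k_pe[num_decode_tokens:],
                      key_cache, value_cache,
                      slot_mapping[num_decode_tokens:])
    # The fix: with chunked prefill and decode in the same batch, the
    # decode kv entries must be written too, via the first
    # num_decode_tokens entries of slot_mapping.
    if chunked_prefill and has_decode:
        reshape_and_cache(kv_c_normed[:num_decode_tokens],
                          k_pe[:num_decode_tokens],
                          key_cache, value_cache,
                          slot_mapping[:num_decode_tokens])

# Tiny demo: 2 decode tokens (slots 5, 6) then 3 prefill tokens (slots 0-2).
key_cache = [None] * 8
value_cache = [None] * 8
slot_mapping = [5, 6, 0, 1, 2]
kv = [f"k{i}" for i in range(5)]
pe = [f"p{i}" for i in range(5)]
write_kv(slot_mapping, kv, pe, 2, key_cache, value_cache,
         chunked_prefill=True, has_decode=True)
print(key_cache[5], key_cache[0])  # → k0 k2
```

Without the `chunked_prefill and has_decode` branch, slots 5 and 6 would stay empty, which mirrors the missing decode kv_cache writes the patch addresses.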
