 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple, Type

+import numpy as np
 import torch
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
-from vllm.attention.backends.utils import CommonAttentionState
+from vllm.attention.backends.utils import PAD_SLOT_ID, CommonAttentionState
 from vllm.config import get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.worker.gpu_input_batch import InputBatch

+from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.utils import \
     AscendCommonAttentionMetadata as CommonAttentionMetadata
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
@@ -52,7 +54,7 @@ def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
     def get_metadata_cls() -> Type["AscendMetadata"]:
         return AscendMetadata

     @staticmethod
     def get_state_cls() -> Type["CommonAttentionState"]:
         return CommonAttentionState

@@ -140,6 +142,8 @@ class AscendMetadata:
     num_input_tokens: int = 0  # Number of tokens including padding.

     enable_dbo_across_dp: bool = False
+    with_prefill_across_dp: bool = False
+    use_torchair_graph: bool = False

     def split_metadata_for_multistream(
         self,
@@ -163,6 +167,32 @@ def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
         return False

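+    # Copy per-request block tables into the fixed-shape block-table buffer
+    # used by the graph runner; tables longer than the captured max_blocks
+    # are truncated.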
+    def _get_graph_runner_block_tables(
+            self, num_seqs: int, block_tables: torch.Tensor) -> torch.Tensor:
+
+        max_batch_size, max_blocks = self.runner.graph_block_tables.shape
+        assert max_batch_size >= num_seqs
+
+        if isinstance(self.runner.graph_block_tables, np.ndarray):
+            graph_block_tables = torch.zeros((max_batch_size, max_blocks),
+                                             dtype=block_tables.dtype,
+                                             device=block_tables.device)
+        else:
+            graph_block_tables = self.runner.graph_block_tables.to(
+                device=block_tables.device, dtype=block_tables.dtype)
+
+        num_blocks = block_tables.size(1)
+        if num_blocks <= max_blocks:
+            graph_block_tables[:num_seqs, :num_blocks] = \
+                block_tables[:num_seqs, :num_blocks]
+        else:
+            graph_block_tables[:num_seqs, :max_blocks] = \
+                block_tables[:num_seqs, :max_blocks]
+
+        return graph_block_tables[:num_seqs, :max_blocks]
+
     def build(self,
               num_reqs,
               num_actual_tokens,
@@ -178,7 +208,7 @@ def build(self,
                                           block_table[:num_reqs])

         query_start_loc = common_attn_metadata.query_start_loc
-        seq_lens = common_attn_metadata.seq_lens
+        seq_lens = common_attn_metadata.seq_lens  # type: ignore
         # TODO: Refactor these two param to common metadata in runners,
         # preparing for the hybrid KV groups feature
         query_lens = common_attn_metadata.query_lens or self.runner.query_lens
@@ -188,6 +218,41 @@ def build(self,
         slot_mapping = self.runner.slot_mapping[:num_actual_tokens]
         attn_mask = self.runner.attn_mask
         attn_state = self.runner.attn_state
+        query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
+        query_start_loc = query_start_loc_cpu.to(self.runner.device,
+                                                 non_blocking=True)
+
+        graph_pad_size = kwargs.get("graph_pad_size", -1)
+        with_prefill_across_dp = kwargs["with_prefill_across_dp"]
+        use_torchair_graph = graph_pad_size != -1
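+        # When no DP rank is running prefill and the torchair graph is used
+        # for decode / spec-decode, pad seq_lens, slot_mapping and the block
+        # table with graph_pad_size extra entries so the shapes match the
+        # captured graph.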
+        if not with_prefill_across_dp:
+            if use_torchair_graph and self.runner.attn_state in [
+                    AscendAttentionState.DecodeOnly,
+                    AscendAttentionState.SpecDecoding
+            ]:
+                num_seqs = len(seq_lens)
+                if graph_pad_size != 0:
+                    pad_value = 1
+                    padded_seq_lens = (seq_lens.tolist() +
+                                       [pad_value] * graph_pad_size)
+                else:
+                    padded_seq_lens = seq_lens.tolist()
+
+                seq_lens = torch.from_numpy(
+                    np.array(padded_seq_lens).astype(np.int32))
+                padding = torch.full((graph_pad_size, ),
+                                     PAD_SLOT_ID,
+                                     dtype=slot_mapping.dtype,
+                                     device=slot_mapping.device)
+                slot_mapping = torch.cat([slot_mapping, padding])
+                block_table_padding = torch.zeros(
+                    (graph_pad_size, ) + block_table.shape[1:],
+                    dtype=block_table.dtype,
+                    device=block_table.device)
+                block_table = torch.cat([block_table, block_table_padding],
+                                        dim=0)
+                block_table = self._get_graph_runner_block_tables(
+                    num_seqs + graph_pad_size, block_table)

         attn_metadata = AscendMetadata(
             num_actual_tokens=num_actual_tokens,
@@ -200,7 +265,44 @@ def build(self,
             slot_mapping=slot_mapping,
             attn_mask=attn_mask,
             attn_state=attn_state,
-            enable_dbo_across_dp=enable_dbo_across_dp)
+            enable_dbo_across_dp=enable_dbo_across_dp,
+            with_prefill_across_dp=with_prefill_across_dp,
+            use_torchair_graph=use_torchair_graph)
+        return attn_metadata
+
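+    # Build placeholder decode-only metadata (zero block table, unit
+    # seq_lens, PAD_SLOT_ID slot mapping) for torchair graph capture and
+    # dummy runs.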
+    def build_torchair_graph_dummy(self, num_reqs: int,
+                                   num_actual_tokens: int):
+        device = self.runner.device
+        _, max_blocks = self.runner.graph_block_tables.shape
+        block_table = torch.zeros((num_reqs, max_blocks),
+                                  dtype=torch.int32,
+                                  device=device)
+        block_table = self._get_graph_runner_block_tables(
+            num_reqs, block_table)
+        seq_lens = torch.ones(num_reqs, dtype=torch.int32, device=device)
+        slot_mapping = torch.full((num_reqs, ),
+                                  PAD_SLOT_ID,
+                                  dtype=torch.int32,
+                                  device=device)
+        query_start_loc = torch.full((num_reqs, ),
+                                     -1,
+                                     dtype=torch.int32,
+                                     device=device)
+
+        query_lens = torch.ones(num_reqs, dtype=torch.int32, device=device)
+        attn_mask = self.runner.attn_mask
+
+        attn_metadata = AscendMetadata(
+            num_actual_tokens=num_actual_tokens,
+            block_tables=block_table,
+            query_start_loc=query_start_loc,
+            query_lens=query_lens,
+            seq_lens=seq_lens,
+            seq_lens_list=seq_lens.tolist(),
+            max_query_len=query_lens.max().item(),
+            slot_mapping=slot_mapping,
+            attn_mask=attn_mask,
+            attn_state=AscendAttentionState.DecodeOnly)
         return attn_metadata

     def build_dummy_metadata(self, num_actual_tokens, num_reqs,
@@ -248,6 +350,7 @@ def __init__(
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
+        prefix: Optional[str] = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -267,11 +370,30 @@ def __init__(
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
         self.key_cache = None
         self.value_cache = None
+        ascend_config = get_ascend_config()
+        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled

         vllm_config = get_current_vllm_config()
         self.full_graph = vllm_config.compilation_config.full_cuda_graph
         self.block_size = vllm_config.cache_config.block_size

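+    # Write key/value into the paged KV cache via scatter updates; used on
+    # the torchair graph path instead of the fused _npu_reshape_and_cache op.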
+    @staticmethod
+    def update_kv_cache(key: torch.Tensor, value: torch.Tensor,
+                        key_cache: torch.Tensor, value_cache: torch.Tensor,
+                        slot_indices: torch.Tensor) -> None:
+        # Convert flat slot indices into (block_idx, block_offset) pairs
+        # based on the cache block size.
+        block_size = key_cache.shape[1]
+        slot_indices = slot_indices.view(-1, 1, 1).to(torch.int64)
+        block_idx = torch.div(slot_indices, block_size, rounding_mode='floor')
+        block_offset = slot_indices % block_size
+        indices = torch.cat([block_idx, block_offset], dim=2)
+        indices = indices.npu()
+
+        # [blocknum, blocksize, numKvHeads, headDims]
+        # -> [blocknum, blocksize, numKvHeads * headDims]
+        torch_npu.npu_scatter_nd_update_(key_cache, indices, key)
+        torch_npu.npu_scatter_nd_update_(value_cache, indices, value)
+
     def forward(
         self,
         layer: AttentionLayer,
@@ -320,12 +442,19 @@ def forward(
             if self.key_cache is None:
                 self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
             slots = attn_metadata.slot_mapping
-            torch_npu._npu_reshape_and_cache(
-                key=key[:num_actual_tokens],
-                value=value[:num_actual_tokens],
-                key_cache=self.key_cache,
-                value_cache=self.value_cache,
-                slot_indices=slots)
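+            # Torchair graph mode without prefill across DP: write the cache
+            # with update_kv_cache; otherwise use the fused reshape-and-cache
+            # kernel.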
+            if not attn_metadata.with_prefill_across_dp and self.torchair_graph_enabled:
+                self.update_kv_cache(key=key,
+                                     value=value,
+                                     key_cache=self.key_cache,
+                                     value_cache=self.value_cache,
+                                     slot_indices=slots.to(torch.int64))
+            else:
+                torch_npu._npu_reshape_and_cache(
+                    key=key[:num_actual_tokens],
+                    value=value[:num_actual_tokens],
+                    key_cache=self.key_cache,
+                    value_cache=self.value_cache,
+                    slot_indices=slots)

         if hasattr(layer, 'quant_method'):
             # TODO: Add attr (num_prefills, prefill_metadata, decode_metadata) to AscendMetadata
@@ -363,10 +492,28 @@ def forward(
                 scale_value=self.scale,
                 out=output)
         elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
-            graph_params = get_graph_params()
-
-            forward_context = get_forward_context()
-            if not forward_context.capturing:
+            if self.torchair_graph_enabled:
+                # Reshape query to BSH layout: (batch, 1, num_heads * head_size).
+                query = query.view(-1, 1, self.num_heads * self.head_size)
+                # [blocknum, blocksize, numKvHeads, headDims]
+                # -> [blocknum, blocksize, numKvHeads * headDims]
+                key_cache = self.key_cache.view(  # type: ignore
+                    *self.key_cache.shape[:-2], -1)  # type: ignore
+                value_cache = self.value_cache.view(  # type: ignore
+                    *self.value_cache.shape[:-2], -1)  # type: ignore
+
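+                # Incremental (decode) attention over the paged KV cache;
+                # block_table and actual_seq_lengths select the cache blocks
+                # and valid lengths for each sequence.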
+                output = torch_npu.npu_incre_flash_attention(
+                    query=query,
+                    key=key_cache,
+                    value=value_cache,
+                    num_heads=self.num_heads,
+                    num_key_value_heads=self.num_kv_heads,
+                    input_layout='BSH',
+                    scale_value=self.scale,
+                    actual_seq_lengths=attn_metadata.seq_lens_list,
+                    block_table=attn_metadata.block_tables,
+                    block_size=kv_cache[0].shape[1],
+                )
+            elif not get_forward_context().capturing:
                 torch_npu._npu_paged_attention(
                     query=query,
                     key_cache=self.key_cache,
@@ -384,6 +531,7 @@ def forward(
                 event = torch.npu.ExternalEvent()
                 event.wait(stream)
                 event.reset(stream)
+                graph_params = get_graph_params()
                 graph_params.events[num_tokens].append(event)

                 graph_params.attn_params[num_tokens].append((