From 66923d7d5fdba06b0585030a815b8a4e464bd1c6 Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Tue, 16 Sep 2025 19:52:01 +0800 Subject: [PATCH 01/14] chunked prefill: integrate splitfuse operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_v1.py | 25 ++++++++++++++++--------- vllm_ascend/worker/model_runner_v1.py | 3 +-- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 10a2f6a416..ead9431283 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -456,18 +456,25 @@ def _forward_v1_style( attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) - torch_npu._npu_paged_attention_splitfuse( + num_block, block_size, head_num, head_dim = self.key_cache.shape + key = self.key_cache.view(num_block, block_size, -1) + value = self.value_cache.view(num_block, block_size, -1) + + output, _ = torch_npu.npu_fused_infer_attention_score( query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - mask=attn_metadata.attn_mask, + key=key, + value=value, + atten_mask=attn_metadata.attn_mask.to(device=query.device), block_table=attn_metadata.block_tables, - seq_len=attn_metadata.query_lens, - context_lens=attn_metadata.seq_lens, - num_kv_heads=self.num_kv_heads, + input_layout="TND", + block_size=block_size, + actual_seq_lengths=attn_metadata.query_start_loc[1:], + actual_seq_lengths_kv=attn_metadata.seq_lens, + num_key_value_heads=self.num_kv_heads, num_heads=self.num_heads, - scale_value=self.scale, - out=output) + scale=self.scale, + sparse_mode=3, + ) return output def forward( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 6e42da1367..7745605ce1 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -818,8 +818,7 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - return self.attn_mask_builder.get_splitfuse_attn_mask( - seq_lens, position, self.dtype, self.device) + return torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8) # Prefill without cache situation.
elif attn_state == AscendAttentionState.PrefillNoCache: max_seq_len = max(seq_lens, default=0) From b0eaf77ca1d35fabcd21b5db38b8d2dbb74aa9ad Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Tue, 16 Sep 2025 20:09:45 +0800 Subject: [PATCH 02/14] splitfuse access optimize Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_mask.py | 18 ++---------------- vllm_ascend/attention/attention_v1.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 3 ++- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index a0e63349b1..46ebdedf14 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -44,6 +44,7 @@ def __init__( self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask + self.chunked_prefill_attn_mask = torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -66,24 +67,9 @@ def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype, def get_splitfuse_attn_mask( self, - seq_lens: torch.Tensor, - position: torch.Tensor, - dtype: torch.dtype, device: torch.device, ) -> torch.Tensor: - if dtype not in [torch.float16, torch.bfloat16]: - raise ValueError( - "splitfuse_attn_mask now only supports bf16 and fp16") - max_seq_len = max(seq_lens, default=0) - self._update_attn_cache(max_seq_len, dtype) - # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation - # is not the same. Fix this in the future when kernel is ready. - mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(dtype) - attn_mask = torch.index_select(self.attn_mask_cache, - dim=0, - index=position)[:, :max_seq_len] - attn_mask *= mask_scale_factor - return attn_mask.contiguous().to(device, non_blocking=True) + return self.chunked_prefill_attn_mask.to(device) def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index ead9431283..8048675146 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -464,7 +464,7 @@ def _forward_v1_style( query=query, key=key, value=value, - atten_mask=attn_metadata.attn_mask.to(device=query.device), + atten_mask=attn_metadata.attn_mask, block_table=attn_metadata.block_tables, input_layout="TND", block_size=block_size, diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 7745605ce1..f05a06c935 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -818,7 +818,8 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - return torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8) + return self.attn_mask_builder.get_splitfuse_attn_mask( + self.device) # Prefill without cache situation. 
elif attn_state == AscendAttentionState.PrefillNoCache: max_seq_len = max(seq_lens, default=0) From 613c9275b20c7df7504d12cd0d6017cc7f32a75d Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Tue, 16 Sep 2025 20:27:30 +0800 Subject: [PATCH 03/14] optimize again Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_mask.py | 6 +++--- vllm_ascend/worker/model_runner_v1.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index 46ebdedf14..fed51f9068 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -39,12 +39,13 @@ def __init__( self, max_seq_len: int, dtype: torch.dtype, + device: torch.device, ): attn_mask = _generate_attn_mask(max_seq_len, dtype) self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask - self.chunked_prefill_attn_mask = torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8) + self.chunked_prefill_attn_mask = torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8).to(device) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -67,9 +68,8 @@ def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype, def get_splitfuse_attn_mask( self, - device: torch.device, ) -> torch.Tensor: - return self.chunked_prefill_attn_mask.to(device) + return self.chunked_prefill_attn_mask def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index f05a06c935..932709e056 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -302,7 +302,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): ) self.attn_mask_builder = AttentionMaskBuilder( - self.model_config.max_model_len, self.dtype) + self.model_config.max_model_len, self.dtype, self.device) # Set up speculative decoding. self.spec_attn_mask = None @@ -818,8 +818,7 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - return self.attn_mask_builder.get_splitfuse_attn_mask( - self.device) + return self.attn_mask_builder.get_splitfuse_attn_mask() # Prefill without cache situation. 
elif attn_state == AscendAttentionState.PrefillNoCache: max_seq_len = max(seq_lens, default=0) From a678de968e5b8b218b1786a990dcfd9973f5dd9b Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Fri, 19 Sep 2025 15:49:42 +0800 Subject: [PATCH 04/14] improve compatibility Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_mask.py | 33 ++++++++++++++++++++++--- vllm_ascend/utils.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 19 +++++++++++--- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index fed51f9068..daa0ce24fc 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -39,13 +39,19 @@ def __init__( self, max_seq_len: int, dtype: torch.dtype, - device: torch.device, + device: torch.device = None, ): attn_mask = _generate_attn_mask(max_seq_len, dtype) self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask - self.chunked_prefill_attn_mask = torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8).to(device) + self.device = device + if self.device: + #NOTE: New compressed mask needs to be sent to certain device, + # so device needs to be passed here. + assigned_mask_dim = 2048 + self.chunked_prefill_attn_mask = torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim), diagonal=1 + ).to(torch.int8).to(device) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -68,8 +74,29 @@ def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype, def get_splitfuse_attn_mask( self, + seq_lens: torch.Tensor = None, + position: torch.Tensor = None, + dtype: torch.dtype = None, + device: torch.device = None, ) -> torch.Tensor: - return self.chunked_prefill_attn_mask + if self.device: + return self.chunked_prefill_attn_mask + else: + if dtype not in [torch.float16, torch.bfloat16]: + raise ValueError( + "splitfuse_attn_mask now only supports bf16 and fp16") + max_seq_len = max(seq_lens, default=0) + self._update_attn_cache(max_seq_len, dtype) + # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation + # is not the same. Fix this in the future when kernel is ready. 
+ mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(dtype) + attn_mask = torch.index_select(self.attn_mask_cache, + dim=0, + index=position)[:, :max_seq_len] + attn_mask *= mask_scale_factor + return attn_mask.contiguous().to(device, non_blocking=True) + + def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 06e1a2bb8c..114f8402a5 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -60,7 +60,7 @@ _SLEEP_MODE_ENABLED = None _CURRENT_STREAM = None _ASCEND_CUSTOMOP_IS_REIGISTERED = False - +_CURRENT_TORCH_NPU_VERSION = torch_npu.__version__ def is_310p(): global _IS_310P diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 932709e056..c6da02959c 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -112,7 +112,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, AscendSocVersion, ProfileExecuteDuration, get_ascend_soc_version, is_310p, - lmhead_tp_enable) + lmhead_tp_enable, verify_torch_npu_version) from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: @@ -301,8 +301,15 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): use_mla=self.model_config.use_mla, ) - self.attn_mask_builder = AttentionMaskBuilder( - self.model_config.max_model_len, self.dtype, self.device) + pta_version_support_compressed_mask = "2.7.1.dev20250918" + self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask") + + if self.compressed_mask: + self.attn_mask_builder = AttentionMaskBuilder( + self.scheduler_config.max_num_batched_tokens, self.dtype, self.device) + else: + self.attn_mask_builder = AttentionMaskBuilder( + self.model_config.max_model_len, self.dtype) # Set up speculative decoding. self.spec_attn_mask = None @@ -818,7 +825,11 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - return self.attn_mask_builder.get_splitfuse_attn_mask() + if selkf.compressed_mas: + return self.attn_mask_builder.get_splitfuse_attn_mask() + else: + return self.attn_mask_builder.get_splitfuse_attn_mask( + seq_lens, position, self.dtype, self.device) # Prefill without cache situation. elif attn_state == AscendAttentionState.PrefillNoCache: max_seq_len = max(seq_lens, default=0) From 7020e3a9f9d8f90e8916be81c9e78ae621c4377a Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Fri, 19 Sep 2025 15:52:22 +0800 Subject: [PATCH 05/14] fix for dp padding Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_v1.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 8048675146..97aba6d49f 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -568,12 +568,14 @@ def forward( output) # Normal V1 situation. 
else: + num_tokens = attn_metadata.query_start_loc[-1] + query = query[:num_tokens] output = self._forward_v1_style(query, attn_metadata, output) # to make in-place change to the output tensor if hasattr(layer, 'quant_method') and use_kv_cache_int8: output = output.view(num_tokens, self.num_heads, self.head_size) - ori_output[:, :, :] = output[:num_tokens, :, :] + ori_output[:num_tokens, :, :] = output[:num_tokens, :, :] return output.view(num_tokens, self.hidden_size) From 8bdda1d675863c842b8517ced0dfb358296ac194 Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Fri, 19 Sep 2025 16:02:54 +0800 Subject: [PATCH 06/14] add compatibility for attention v1 Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_v1.py | 55 ++++++++++++++++++--------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 97aba6d49f..3995d20b3b 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -37,7 +37,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, - nd_to_nz_2d, nd_to_nz_spec) + nd_to_nz_2d, nd_to_nz_spec, verify_torch_npu_version) def wait_for_kv_layer_from_connector(layer_name: str): @@ -304,6 +304,9 @@ def __init__( self.key_cache = None self.value_cache = None + pta_version_support_compressed_mask = "2.7.1.dev20250918" + self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask") + def _forward_prefill_no_cache( self, query: torch.Tensor, @@ -456,25 +459,39 @@ def _forward_v1_style( attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) - num_block, block_size, head_num, head_dim = self.key_cache.shape - key = self.key_cache.view(num_block, block_size, -1) - value = self.value_cache.view(num_block, block_size, -1) + if self.compressed_mask: + num_block, block_size, head_num, head_dim = self.key_cache.shape + key = self.key_cache.view(num_block, block_size, -1) + value = self.value_cache.view(num_block, block_size, -1) - output, _ = torch_npu.npu_fused_infer_attention_score( - query=query, - key=key, - value=value, - atten_mask=attn_metadata.attn_mask, - block_table=attn_metadata.block_tables, - input_layout="TND", - block_size=block_size, - actual_seq_lengths=attn_metadata.query_start_loc[1:], - actual_seq_lengths_kv=attn_metadata.seq_lens, - num_key_value_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale=self.scale, - sparse_mode=3, - ) + output, _ = torch_npu.npu_fused_infer_attention_score( + query=query, + key=key, + value=value, + atten_mask=attn_metadata.attn_mask, + block_table=attn_metadata.block_tables, + input_layout="TND", + block_size=block_size, + actual_seq_lengths=attn_metadata.query_start_loc[1:], + actual_seq_lengths_kv=attn_metadata.seq_lens, + num_key_value_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale=self.scale, + sparse_mode=3, + ) + else: + torch_npu._npu_paged_attention_splitfuse( + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + mask=attn_metadata.attn_mask, + block_table=attn_metadata.block_tables, + seq_len=attn_metadata.query_lens, + context_lens=attn_metadata.seq_lens, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + out=output) return output def forward( From 6ef548d540b54a531ee21ca231d2365d050df13f Mon Sep 17 00:00:00 
2001 From: tangtianyi Date: Fri, 19 Sep 2025 17:45:10 +0800 Subject: [PATCH 07/14] fix by review Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_mask.py | 7 +++++-- vllm_ascend/attention/attention_v1.py | 2 ++ vllm_ascend/worker/model_runner_v1.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index daa0ce24fc..d4e205d1ad 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -41,14 +41,17 @@ def __init__( dtype: torch.dtype, device: torch.device = None, ): + # NOTE: The device argument specifies the target NPU + # to be used for the newly added FIA operator. + # Only pass this parameter when using the new FIA operator. + attn_mask = _generate_attn_mask(max_seq_len, dtype) self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask self.device = device if self.device: - #NOTE: New compressed mask needs to be sent to certain device, - # so device needs to be passed here. + assigned_mask_dim = 2048 self.chunked_prefill_attn_mask = torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim), diagonal=1 ).to(torch.int8).to(device) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 3995d20b3b..c326724747 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -460,6 +460,8 @@ def _forward_v1_style( attn_metadata.seq_lens.to(device=query.device) if self.compressed_mask: + # TODO:The npu_fused_infer_attention_score op is planned to + # be utilized in a wider range in upcoming versions. num_block, block_size, head_num, head_dim = self.key_cache.shape key = self.key_cache.view(num_block, block_size, -1) value = self.value_cache.view(num_block, block_size, -1) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c6da02959c..738719b2a2 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -825,7 +825,7 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. 
if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - if selkf.compressed_mas: + if self.compressed_mask: return self.attn_mask_builder.get_splitfuse_attn_mask() else: return self.attn_mask_builder.get_splitfuse_attn_mask( From 821253cb6fd79b1fa9f3cd4c40d3b876d179f714 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Fri, 19 Sep 2025 22:55:43 +0800 Subject: [PATCH 08/14] change to cann version Signed-off-by: Angazenn --- vllm_ascend/attention/attention_mask.py | 3 +-- vllm_ascend/utils.py | 1 - vllm_ascend/worker/model_runner_v1.py | 9 +++------ 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index d4e205d1ad..3d402dde5f 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -50,8 +50,7 @@ def __init__( self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask self.device = device - if self.device: - + if torch.version.cann.startswith("8.3"): assigned_mask_dim = 2048 self.chunked_prefill_attn_mask = torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim), diagonal=1 ).to(torch.int8).to(device) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 114f8402a5..78a3d9becc 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -60,7 +60,6 @@ _SLEEP_MODE_ENABLED = None _CURRENT_STREAM = None _ASCEND_CUSTOMOP_IS_REIGISTERED = False -_CURRENT_TORCH_NPU_VERSION = torch_npu.__version__ def is_310p(): global _IS_310P diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 738719b2a2..41eff4eed1 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -112,7 +112,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, AscendSocVersion, ProfileExecuteDuration, get_ascend_soc_version, is_310p, - lmhead_tp_enable, verify_torch_npu_version) + lmhead_tp_enable) from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: @@ -301,10 +301,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): use_mla=self.model_config.use_mla, ) - pta_version_support_compressed_mask = "2.7.1.dev20250918" - self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask") - - if self.compressed_mask: + if torch.version.cann.startswith("8.3"): self.attn_mask_builder = AttentionMaskBuilder( self.scheduler_config.max_num_batched_tokens, self.dtype, self.device) else: @@ -825,7 +822,7 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. 
if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - if self.compressed_mask: + if torch.version.cann.startswith("8.3"): return self.attn_mask_builder.get_splitfuse_attn_mask() else: return self.attn_mask_builder.get_splitfuse_attn_mask( From 35acbb2808bbd531db1e2e8a3b2a5ae13dc13dd7 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Fri, 19 Sep 2025 23:06:01 +0800 Subject: [PATCH 09/14] remove another compressed Signed-off-by: Angazenn --- vllm_ascend/attention/attention_v1.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index c326724747..ad4a2fbba5 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -304,9 +304,6 @@ def __init__( self.key_cache = None self.value_cache = None - pta_version_support_compressed_mask = "2.7.1.dev20250918" - self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask") - def _forward_prefill_no_cache( self, query: torch.Tensor, @@ -459,7 +456,7 @@ def _forward_v1_style( attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) - if self.compressed_mask: + if torch.version.cann.startswith("8.3"): # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. num_block, block_size, head_num, head_dim = self.key_cache.shape From a366af6c129aaeb02a034a46bb7efb83fefb30ca Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 08:24:06 +0800 Subject: [PATCH 10/14] fix lint Signed-off-by: Angazenn --- vllm_ascend/attention/attention_mask.py | 16 +++++++-------- vllm_ascend/attention/attention_v1.py | 26 ++++++++++++------------- vllm_ascend/utils.py | 1 + vllm_ascend/worker/model_runner_v1.py | 3 ++- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index 3d402dde5f..3819d8b8df 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -41,7 +41,7 @@ def __init__( dtype: torch.dtype, device: torch.device = None, ): - # NOTE: The device argument specifies the target NPU + # NOTE: The device argument specifies the target NPU # to be used for the newly added FIA operator. # Only pass this parameter when using the new FIA operator. @@ -52,8 +52,9 @@ def __init__( self.device = device if torch.version.cann.startswith("8.3"): assigned_mask_dim = 2048 - self.chunked_prefill_attn_mask = torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim), diagonal=1 - ).to(torch.int8).to(device) + self.chunked_prefill_attn_mask = torch.triu( + torch.ones(assigned_mask_dim, assigned_mask_dim), + diagonal=1).to(torch.int8).to(device) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -91,15 +92,14 @@ def get_splitfuse_attn_mask( self._update_attn_cache(max_seq_len, dtype) # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation # is not the same. Fix this in the future when kernel is ready. 
- mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(dtype) + mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor( + dtype) attn_mask = torch.index_select(self.attn_mask_cache, - dim=0, - index=position)[:, :max_seq_len] + dim=0, + index=position)[:, :max_seq_len] attn_mask *= mask_scale_factor return attn_mask.contiguous().to(device, non_blocking=True) - - def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: self._seq_len_cached = seqlen diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index ad4a2fbba5..1ea5119c61 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -37,7 +37,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, - nd_to_nz_2d, nd_to_nz_spec, verify_torch_npu_version) + nd_to_nz_2d, nd_to_nz_spec) def wait_for_kv_layer_from_connector(layer_name: str): @@ -477,20 +477,20 @@ def _forward_v1_style( num_heads=self.num_heads, scale=self.scale, sparse_mode=3, - ) + ) else: torch_npu._npu_paged_attention_splitfuse( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - mask=attn_metadata.attn_mask, - block_table=attn_metadata.block_tables, - seq_len=attn_metadata.query_lens, - context_lens=attn_metadata.seq_lens, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - out=output) + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + mask=attn_metadata.attn_mask, + block_table=attn_metadata.block_tables, + seq_len=attn_metadata.query_lens, + context_lens=attn_metadata.seq_lens, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + out=output) return output def forward( diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 78a3d9becc..06e1a2bb8c 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -61,6 +61,7 @@ _CURRENT_STREAM = None _ASCEND_CUSTOMOP_IS_REIGISTERED = False + def is_310p(): global _IS_310P if _IS_310P is None: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 41eff4eed1..659541e300 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -303,7 +303,8 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): if torch.version.cann.startswith("8.3"): self.attn_mask_builder = AttentionMaskBuilder( - self.scheduler_config.max_num_batched_tokens, self.dtype, self.device) + self.scheduler_config.max_num_batched_tokens, self.dtype, + self.device) else: self.attn_mask_builder = AttentionMaskBuilder( self.model_config.max_model_len, self.dtype) From 5102efe3652bbe0fd2b2407df55a41107be36deb Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 10:23:28 +0800 Subject: [PATCH 11/14] fix mypy Signed-off-by: Angazenn --- vllm_ascend/attention/attention_v1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 1ea5119c61..cd4bd57967 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -460,8 +460,8 @@ def _forward_v1_style( # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. 
num_block, block_size, head_num, head_dim = self.key_cache.shape - key = self.key_cache.view(num_block, block_size, -1) - value = self.value_cache.view(num_block, block_size, -1) + key = self.key_cache.view(num_block, block_size, -1) # type: ignore + value = self.value_cache.view(num_block, block_size, -1) # type: ignore output, _ = torch_npu.npu_fused_infer_attention_score( query=query, From 5565a33e11364a04c3b2cfd5776492551c8ba018 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 11:44:41 +0800 Subject: [PATCH 12/14] fix mypy Signed-off-by: Angazenn --- vllm_ascend/attention/attention_v1.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index cd4bd57967..0d8f01579d 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -460,8 +460,10 @@ def _forward_v1_style( # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. num_block, block_size, head_num, head_dim = self.key_cache.shape - key = self.key_cache.view(num_block, block_size, -1) # type: ignore - value = self.value_cache.view(num_block, block_size, -1) # type: ignore + key = self.key_cache.view( # type: ignore + num_block, block_size, -1) + value = self.value_cache.view( # type: ignore + num_block, block_size, -1) output, _ = torch_npu.npu_fused_infer_attention_score( query=query, From 8ac3f51c02ec6802b0eccff40adcebe3cf2f0733 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 12:16:20 +0800 Subject: [PATCH 13/14] fix mypy Signed-off-by: Angazenn --- vllm_ascend/attention/attention_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 0d8f01579d..1db0ae3a33 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -459,7 +459,7 @@ def _forward_v1_style( if torch.version.cann.startswith("8.3"): # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. - num_block, block_size, head_num, head_dim = self.key_cache.shape + num_block, block_size, _, _ = self.key_cache.shape # type: ignore key = self.key_cache.view( # type: ignore num_block, block_size, -1) value = self.value_cache.view( # type: ignore From 3be8a40c40b30a03e747a0258c71747162b3ee5f Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 12:44:03 +0800 Subject: [PATCH 14/14] fix ut Signed-off-by: Angazenn --- vllm_ascend/attention/attention_mask.py | 2 +- vllm_ascend/attention/attention_v1.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index 3819d8b8df..cf92affd38 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -82,7 +82,7 @@ def get_splitfuse_attn_mask( dtype: torch.dtype = None, device: torch.device = None, ) -> torch.Tensor: - if self.device: + if torch.version.cann.startswith("8.3"): return self.chunked_prefill_attn_mask else: if dtype not in [torch.float16, torch.bfloat16]: diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 1db0ae3a33..bc7f69ce5a 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -586,8 +586,12 @@ def forward( output) # Normal V1 situation. 
else: - num_tokens = attn_metadata.query_start_loc[-1] - query = query[:num_tokens] + if torch.version.cann.startswith("8.3"): + # npu_fused_infer_attention_score does not support cases + # where query.shape[0] != attn_metadata.query_start_loc[-1]. + # Thus we need unpad it here. + num_tokens = attn_metadata.query_start_loc[-1] + query = query[:num_tokens] output = self._forward_v1_style(query, attn_metadata, output) # to make in-place change to the output tensor
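Note (not part of any patch above): the following is a minimal, consolidated Python sketch of where the series ends up, included only as a reading aid. The free-standing names build_chunked_prefill_mask and chunked_prefill_attention are hypothetical stand-ins for the corresponding AttentionMaskBuilder and Ascend attention backend methods; the arguments simply restate what the diffs add.

# Consolidated sketch of the dispatch introduced by this series.
# Helper names are hypothetical; the real code lives in
# AttentionMaskBuilder and the Ascend attention backend impl.
import torch
import torch_npu


def build_chunked_prefill_mask(device):
    # New compressed mask: a fixed 2048x2048 upper-triangular int8 mask,
    # built once and kept on the target NPU.
    assigned_mask_dim = 2048
    return torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim),
                      diagonal=1).to(torch.int8).to(device)


def chunked_prefill_attention(query, key_cache, value_cache, attn_metadata,
                              num_heads, num_kv_heads, scale, output):
    # On the new path the caller first unpads query to
    # attn_metadata.query_start_loc[-1] tokens, since
    # npu_fused_infer_attention_score requires query.shape[0] to match it.
    if torch.version.cann.startswith("8.3"):
        # CANN 8.3+: fused infer attention with the compressed mask,
        # TND layout and per-request query/KV sequence lengths.
        num_block, block_size, _, _ = key_cache.shape
        key = key_cache.view(num_block, block_size, -1)
        value = value_cache.view(num_block, block_size, -1)
        output, _ = torch_npu.npu_fused_infer_attention_score(
            query=query,
            key=key,
            value=value,
            atten_mask=attn_metadata.attn_mask,
            block_table=attn_metadata.block_tables,
            input_layout="TND",
            block_size=block_size,
            actual_seq_lengths=attn_metadata.query_start_loc[1:],
            actual_seq_lengths_kv=attn_metadata.seq_lens,
            num_key_value_heads=num_kv_heads,
            num_heads=num_heads,
            scale=scale,
            sparse_mode=3,
        )
    else:
        # Older CANN: keep the legacy paged-attention splitfuse kernel and
        # the per-position, dtype-scaled mask from get_splitfuse_attn_mask.
        torch_npu._npu_paged_attention_splitfuse(
            query=query,
            key_cache=key_cache,
            value_cache=value_cache,
            mask=attn_metadata.attn_mask,
            block_table=attn_metadata.block_tables,
            seq_len=attn_metadata.query_lens,
            context_lens=attn_metadata.seq_lens,
            num_kv_heads=num_kv_heads,
            num_heads=num_heads,
            scale_value=scale,
            out=output)
    return output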