From 66923d7d5fdba06b0585030a815b8a4e464bd1c6 Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Tue, 16 Sep 2025 19:52:01 +0800 Subject: [PATCH 01/14] chunked prefill: integrate splitfuse operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_v1.py | 25 ++++++++++++++++--------- vllm_ascend/worker/model_runner_v1.py | 3 +-- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 10a2f6a416..ead9431283 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -456,18 +456,25 @@ def _forward_v1_style( attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) - torch_npu._npu_paged_attention_splitfuse( + num_block, block_size, head_num, head_dim = self.key_cache.shape + key = self.key_cache.view(num_block, block_size, -1) + value = self.value_cache.view(num_block, block_size, -1) + + output, _ = torch_npu.npu_fused_infer_attention_score( query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - mask=attn_metadata.attn_mask, + key=key, + value=value, + atten_mask=attn_metadata.attn_mask.to(device=query.device), block_table=attn_metadata.block_tables, - seq_len=attn_metadata.query_lens, - context_lens=attn_metadata.seq_lens, - num_kv_heads=self.num_kv_heads, + input_layout="TND", + block_size=block_size, + actual_seq_lengths=attn_metadata.query_start_loc[1:], + actual_seq_lengths_kv=attn_metadata.seq_lens, + num_key_value_heads=self.num_kv_heads, num_heads=self.num_heads, - scale_value=self.scale, - out=output) + scale=self.scale, + sparse_mode=3, + ) return output def forward( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 6e42da1367..7745605ce1 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -818,8 +818,7 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - return self.attn_mask_builder.get_splitfuse_attn_mask( - seq_lens, position, self.dtype, self.device) + return torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8) # Prefill without cache situation.
elif attn_state == AscendAttentionState.PrefillNoCache: max_seq_len = max(seq_lens, default=0) From b0eaf77ca1d35fabcd21b5db38b8d2dbb74aa9ad Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Tue, 16 Sep 2025 20:09:45 +0800 Subject: [PATCH 02/14] splitfuse access optimize Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_mask.py | 18 ++---------------- vllm_ascend/attention/attention_v1.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 3 ++- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index a0e63349b1..46ebdedf14 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -44,6 +44,7 @@ def __init__( self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask + self.chunked_prefill_attn_mask = torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -66,24 +67,9 @@ def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype, def get_splitfuse_attn_mask( self, - seq_lens: torch.Tensor, - position: torch.Tensor, - dtype: torch.dtype, device: torch.device, ) -> torch.Tensor: - if dtype not in [torch.float16, torch.bfloat16]: - raise ValueError( - "splitfuse_attn_mask now only supports bf16 and fp16") - max_seq_len = max(seq_lens, default=0) - self._update_attn_cache(max_seq_len, dtype) - # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation - # is not the same. Fix this in the future when kernel is ready. - mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(dtype) - attn_mask = torch.index_select(self.attn_mask_cache, - dim=0, - index=position)[:, :max_seq_len] - attn_mask *= mask_scale_factor - return attn_mask.contiguous().to(device, non_blocking=True) + return self.chunked_prefill_attn_mask.to(device) def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index ead9431283..8048675146 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -464,7 +464,7 @@ def _forward_v1_style( query=query, key=key, value=value, - atten_mask=attn_metadata.attn_mask.to(device=query.device), + atten_mask=attn_metadata.attn_mask, block_table=attn_metadata.block_tables, input_layout="TND", block_size=block_size, diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 7745605ce1..f05a06c935 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -818,7 +818,8 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - return torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8) + return self.attn_mask_builder.get_splitfuse_attn_mask( + self.device) # Prefill without cache situation. 
elif attn_state == AscendAttentionState.PrefillNoCache: max_seq_len = max(seq_lens, default=0) From 613c9275b20c7df7504d12cd0d6017cc7f32a75d Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Tue, 16 Sep 2025 20:27:30 +0800 Subject: [PATCH 03/14] optimize again Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_mask.py | 6 +++--- vllm_ascend/worker/model_runner_v1.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index 46ebdedf14..fed51f9068 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -39,12 +39,13 @@ def __init__( self, max_seq_len: int, dtype: torch.dtype, + device: torch.device, ): attn_mask = _generate_attn_mask(max_seq_len, dtype) self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask - self.chunked_prefill_attn_mask = torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8) + self.chunked_prefill_attn_mask = torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8).to(device) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -67,9 +68,8 @@ def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype, def get_splitfuse_attn_mask( self, - device: torch.device, ) -> torch.Tensor: - return self.chunked_prefill_attn_mask.to(device) + return self.chunked_prefill_attn_mask def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index f05a06c935..932709e056 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -302,7 +302,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): ) self.attn_mask_builder = AttentionMaskBuilder( - self.model_config.max_model_len, self.dtype) + self.model_config.max_model_len, self.dtype, self.device) # Set up speculative decoding. self.spec_attn_mask = None @@ -818,8 +818,7 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - return self.attn_mask_builder.get_splitfuse_attn_mask( - self.device) + return self.attn_mask_builder.get_splitfuse_attn_mask() # Prefill without cache situation. 
elif attn_state == AscendAttentionState.PrefillNoCache: max_seq_len = max(seq_lens, default=0) From a678de968e5b8b218b1786a990dcfd9973f5dd9b Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Fri, 19 Sep 2025 15:49:42 +0800 Subject: [PATCH 04/14] improve compatibility Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_mask.py | 33 ++++++++++++++++++++++--- vllm_ascend/utils.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 19 +++++++++++--- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index fed51f9068..daa0ce24fc 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -39,13 +39,19 @@ def __init__( self, max_seq_len: int, dtype: torch.dtype, - device: torch.device, + device: torch.device = None, ): attn_mask = _generate_attn_mask(max_seq_len, dtype) self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask - self.chunked_prefill_attn_mask = torch.triu(torch.ones(2048, 2048), diagonal=1).to(torch.int8).to(device) + self.device = device + if self.device: + #NOTE: New compressed mask needs to be sent to certain device, + # so device needs to be passed here. + assigned_mask_dim = 2048 + self.chunked_prefill_attn_mask = torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim), diagonal=1 + ).to(torch.int8).to(device) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -68,8 +74,29 @@ def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype, def get_splitfuse_attn_mask( self, + seq_lens: torch.Tensor = None, + position: torch.Tensor = None, + dtype: torch.dtype = None, + device: torch.device = None, ) -> torch.Tensor: - return self.chunked_prefill_attn_mask + if self.device: + return self.chunked_prefill_attn_mask + else: + if dtype not in [torch.float16, torch.bfloat16]: + raise ValueError( + "splitfuse_attn_mask now only supports bf16 and fp16") + max_seq_len = max(seq_lens, default=0) + self._update_attn_cache(max_seq_len, dtype) + # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation + # is not the same. Fix this in the future when kernel is ready. 
+ mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(dtype) + attn_mask = torch.index_select(self.attn_mask_cache, + dim=0, + index=position)[:, :max_seq_len] + attn_mask *= mask_scale_factor + return attn_mask.contiguous().to(device, non_blocking=True) + + def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 06e1a2bb8c..114f8402a5 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -60,7 +60,7 @@ _SLEEP_MODE_ENABLED = None _CURRENT_STREAM = None _ASCEND_CUSTOMOP_IS_REIGISTERED = False - +_CURRENT_TORCH_NPU_VERSION = torch_npu.__version__ def is_310p(): global _IS_310P diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 932709e056..c6da02959c 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -112,7 +112,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, AscendSocVersion, ProfileExecuteDuration, get_ascend_soc_version, is_310p, - lmhead_tp_enable) + lmhead_tp_enable, verify_torch_npu_version) from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: @@ -301,8 +301,15 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): use_mla=self.model_config.use_mla, ) - self.attn_mask_builder = AttentionMaskBuilder( - self.model_config.max_model_len, self.dtype, self.device) + pta_version_support_compressed_mask = "2.7.1.dev20250918" + self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask") + + if self.compressed_mask: + self.attn_mask_builder = AttentionMaskBuilder( + self.scheduler_config.max_num_batched_tokens, self.dtype, self.device) + else: + self.attn_mask_builder = AttentionMaskBuilder( + self.model_config.max_model_len, self.dtype) # Set up speculative decoding. self.spec_attn_mask = None @@ -818,7 +825,11 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - return self.attn_mask_builder.get_splitfuse_attn_mask() + if selkf.compressed_mas: + return self.attn_mask_builder.get_splitfuse_attn_mask() + else: + return self.attn_mask_builder.get_splitfuse_attn_mask( + seq_lens, position, self.dtype, self.device) # Prefill without cache situation. elif attn_state == AscendAttentionState.PrefillNoCache: max_seq_len = max(seq_lens, default=0) From 7020e3a9f9d8f90e8916be81c9e78ae621c4377a Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Fri, 19 Sep 2025 15:52:22 +0800 Subject: [PATCH 05/14] fix for dp padding Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_v1.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 8048675146..97aba6d49f 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -568,12 +568,14 @@ def forward( output) # Normal V1 situation. 
else: + num_tokens = attn_metadata.query_start_loc[-1] + query = query[:num_tokens] output = self._forward_v1_style(query, attn_metadata, output) # to make in-place change to the output tensor if hasattr(layer, 'quant_method') and use_kv_cache_int8: output = output.view(num_tokens, self.num_heads, self.head_size) - ori_output[:, :, :] = output[:num_tokens, :, :] + ori_output[:num_tokens, :, :] = output[:num_tokens, :, :] return output.view(num_tokens, self.hidden_size) From 8bdda1d675863c842b8517ced0dfb358296ac194 Mon Sep 17 00:00:00 2001 From: tangtianyi Date: Fri, 19 Sep 2025 16:02:54 +0800 Subject: [PATCH 06/14] add compatibility for attention v1 Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_v1.py | 55 ++++++++++++++++++--------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 97aba6d49f..3995d20b3b 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -37,7 +37,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, - nd_to_nz_2d, nd_to_nz_spec) + nd_to_nz_2d, nd_to_nz_spec, verify_torch_npu_version) def wait_for_kv_layer_from_connector(layer_name: str): @@ -304,6 +304,9 @@ def __init__( self.key_cache = None self.value_cache = None + pta_version_support_compressed_mask = "2.7.1.dev20250918" + self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask") + def _forward_prefill_no_cache( self, query: torch.Tensor, @@ -456,25 +459,39 @@ def _forward_v1_style( attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) - num_block, block_size, head_num, head_dim = self.key_cache.shape - key = self.key_cache.view(num_block, block_size, -1) - value = self.value_cache.view(num_block, block_size, -1) + if self.compressed_mask: + num_block, block_size, head_num, head_dim = self.key_cache.shape + key = self.key_cache.view(num_block, block_size, -1) + value = self.value_cache.view(num_block, block_size, -1) - output, _ = torch_npu.npu_fused_infer_attention_score( - query=query, - key=key, - value=value, - atten_mask=attn_metadata.attn_mask, - block_table=attn_metadata.block_tables, - input_layout="TND", - block_size=block_size, - actual_seq_lengths=attn_metadata.query_start_loc[1:], - actual_seq_lengths_kv=attn_metadata.seq_lens, - num_key_value_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale=self.scale, - sparse_mode=3, - ) + output, _ = torch_npu.npu_fused_infer_attention_score( + query=query, + key=key, + value=value, + atten_mask=attn_metadata.attn_mask, + block_table=attn_metadata.block_tables, + input_layout="TND", + block_size=block_size, + actual_seq_lengths=attn_metadata.query_start_loc[1:], + actual_seq_lengths_kv=attn_metadata.seq_lens, + num_key_value_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale=self.scale, + sparse_mode=3, + ) + else: + torch_npu._npu_paged_attention_splitfuse( + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + mask=attn_metadata.attn_mask, + block_table=attn_metadata.block_tables, + seq_len=attn_metadata.query_lens, + context_lens=attn_metadata.seq_lens, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + out=output) return output def forward( From 6ef548d540b54a531ee21ca231d2365d050df13f Mon Sep 17 00:00:00 
2001 From: tangtianyi Date: Fri, 19 Sep 2025 17:45:10 +0800 Subject: [PATCH 07/14] fix by review Signed-off-by: tangtianyi --- vllm_ascend/attention/attention_mask.py | 7 +++++-- vllm_ascend/attention/attention_v1.py | 2 ++ vllm_ascend/worker/model_runner_v1.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index daa0ce24fc..d4e205d1ad 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -41,14 +41,17 @@ def __init__( dtype: torch.dtype, device: torch.device = None, ): + # NOTE: The device argument specifies the target NPU + # to be used for the newly added FIA operator. + # Only pass this parameter when using the new FIA operator. + attn_mask = _generate_attn_mask(max_seq_len, dtype) self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask self.device = device if self.device: - #NOTE: New compressed mask needs to be sent to certain device, - # so device needs to be passed here. + assigned_mask_dim = 2048 self.chunked_prefill_attn_mask = torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim), diagonal=1 ).to(torch.int8).to(device) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 3995d20b3b..c326724747 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -460,6 +460,8 @@ def _forward_v1_style( attn_metadata.seq_lens.to(device=query.device) if self.compressed_mask: + # TODO:The npu_fused_infer_attention_score op is planned to + # be utilized in a wider range in upcoming versions. num_block, block_size, head_num, head_dim = self.key_cache.shape key = self.key_cache.view(num_block, block_size, -1) value = self.value_cache.view(num_block, block_size, -1) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c6da02959c..738719b2a2 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -825,7 +825,7 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. 
if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - if selkf.compressed_mas: + if self.compressed_mask: return self.attn_mask_builder.get_splitfuse_attn_mask() else: return self.attn_mask_builder.get_splitfuse_attn_mask( From 821253cb6fd79b1fa9f3cd4c40d3b876d179f714 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Fri, 19 Sep 2025 22:55:43 +0800 Subject: [PATCH 08/14] change to cann version Signed-off-by: Angazenn --- vllm_ascend/attention/attention_mask.py | 3 +-- vllm_ascend/utils.py | 1 - vllm_ascend/worker/model_runner_v1.py | 9 +++------ 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index d4e205d1ad..3d402dde5f 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -50,8 +50,7 @@ def __init__( self._seq_len_cached = attn_mask.shape[0] self.attn_mask_cache = attn_mask self.device = device - if self.device: - + if torch.version.cann.startswith("8.3"): assigned_mask_dim = 2048 self.chunked_prefill_attn_mask = torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim), diagonal=1 ).to(torch.int8).to(device) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 114f8402a5..78a3d9becc 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -60,7 +60,6 @@ _SLEEP_MODE_ENABLED = None _CURRENT_STREAM = None _ASCEND_CUSTOMOP_IS_REIGISTERED = False -_CURRENT_TORCH_NPU_VERSION = torch_npu.__version__ def is_310p(): global _IS_310P diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 738719b2a2..41eff4eed1 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -112,7 +112,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, AscendSocVersion, ProfileExecuteDuration, get_ascend_soc_version, is_310p, - lmhead_tp_enable, verify_torch_npu_version) + lmhead_tp_enable) from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: @@ -301,10 +301,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): use_mla=self.model_config.use_mla, ) - pta_version_support_compressed_mask = "2.7.1.dev20250918" - self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask") - - if self.compressed_mask: + if torch.version.cann.startswith("8.3"): self.attn_mask_builder = AttentionMaskBuilder( self.scheduler_config.max_num_batched_tokens, self.dtype, self.device) else: @@ -825,7 +822,7 @@ def _make_attention_mask(self, seq_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. 
if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: - if self.compressed_mask: + if torch.version.cann.startswith("8.3"): return self.attn_mask_builder.get_splitfuse_attn_mask() else: return self.attn_mask_builder.get_splitfuse_attn_mask( From 35acbb2808bbd531db1e2e8a3b2a5ae13dc13dd7 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Fri, 19 Sep 2025 23:06:01 +0800 Subject: [PATCH 09/14] remove another compressed Signed-off-by: Angazenn --- vllm_ascend/attention/attention_v1.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index c326724747..ad4a2fbba5 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -304,9 +304,6 @@ def __init__( self.key_cache = None self.value_cache = None - pta_version_support_compressed_mask = "2.7.1.dev20250918" - self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask") - def _forward_prefill_no_cache( self, query: torch.Tensor, @@ -459,7 +456,7 @@ def _forward_v1_style( attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) - if self.compressed_mask: + if torch.version.cann.startswith("8.3"): # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. num_block, block_size, head_num, head_dim = self.key_cache.shape From a366af6c129aaeb02a034a46bb7efb83fefb30ca Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 08:24:06 +0800 Subject: [PATCH 10/14] fix lint Signed-off-by: Angazenn --- vllm_ascend/attention/attention_mask.py | 16 +++++++-------- vllm_ascend/attention/attention_v1.py | 26 ++++++++++++------------- vllm_ascend/utils.py | 1 + vllm_ascend/worker/model_runner_v1.py | 3 ++- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index 3d402dde5f..3819d8b8df 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -41,7 +41,7 @@ def __init__( dtype: torch.dtype, device: torch.device = None, ): - # NOTE: The device argument specifies the target NPU + # NOTE: The device argument specifies the target NPU # to be used for the newly added FIA operator. # Only pass this parameter when using the new FIA operator. @@ -52,8 +52,9 @@ def __init__( self.device = device if torch.version.cann.startswith("8.3"): assigned_mask_dim = 2048 - self.chunked_prefill_attn_mask = torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim), diagonal=1 - ).to(torch.int8).to(device) + self.chunked_prefill_attn_mask = torch.triu( + torch.ones(assigned_mask_dim, assigned_mask_dim), + diagonal=1).to(torch.int8).to(device) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -91,15 +92,14 @@ def get_splitfuse_attn_mask( self._update_attn_cache(max_seq_len, dtype) # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation # is not the same. Fix this in the future when kernel is ready. 
- mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(dtype) + mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor( + dtype) attn_mask = torch.index_select(self.attn_mask_cache, - dim=0, - index=position)[:, :max_seq_len] + dim=0, + index=position)[:, :max_seq_len] attn_mask *= mask_scale_factor return attn_mask.contiguous().to(device, non_blocking=True) - - def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: self._seq_len_cached = seqlen diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index ad4a2fbba5..1ea5119c61 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -37,7 +37,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, - nd_to_nz_2d, nd_to_nz_spec, verify_torch_npu_version) + nd_to_nz_2d, nd_to_nz_spec) def wait_for_kv_layer_from_connector(layer_name: str): @@ -477,20 +477,20 @@ def _forward_v1_style( num_heads=self.num_heads, scale=self.scale, sparse_mode=3, - ) + ) else: torch_npu._npu_paged_attention_splitfuse( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - mask=attn_metadata.attn_mask, - block_table=attn_metadata.block_tables, - seq_len=attn_metadata.query_lens, - context_lens=attn_metadata.seq_lens, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - out=output) + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + mask=attn_metadata.attn_mask, + block_table=attn_metadata.block_tables, + seq_len=attn_metadata.query_lens, + context_lens=attn_metadata.seq_lens, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + out=output) return output def forward( diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 78a3d9becc..06e1a2bb8c 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -61,6 +61,7 @@ _CURRENT_STREAM = None _ASCEND_CUSTOMOP_IS_REIGISTERED = False + def is_310p(): global _IS_310P if _IS_310P is None: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 41eff4eed1..659541e300 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -303,7 +303,8 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): if torch.version.cann.startswith("8.3"): self.attn_mask_builder = AttentionMaskBuilder( - self.scheduler_config.max_num_batched_tokens, self.dtype, self.device) + self.scheduler_config.max_num_batched_tokens, self.dtype, + self.device) else: self.attn_mask_builder = AttentionMaskBuilder( self.model_config.max_model_len, self.dtype) From 5102efe3652bbe0fd2b2407df55a41107be36deb Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 10:23:28 +0800 Subject: [PATCH 11/14] fix mypy Signed-off-by: Angazenn --- vllm_ascend/attention/attention_v1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 1ea5119c61..cd4bd57967 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -460,8 +460,8 @@ def _forward_v1_style( # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. 
num_block, block_size, head_num, head_dim = self.key_cache.shape - key = self.key_cache.view(num_block, block_size, -1) - value = self.value_cache.view(num_block, block_size, -1) + key = self.key_cache.view(num_block, block_size, -1) # type: ignore + value = self.value_cache.view(num_block, block_size, -1) # type: ignore output, _ = torch_npu.npu_fused_infer_attention_score( query=query, From 5565a33e11364a04c3b2cfd5776492551c8ba018 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 11:44:41 +0800 Subject: [PATCH 12/14] fix mypy Signed-off-by: Angazenn --- vllm_ascend/attention/attention_v1.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index cd4bd57967..0d8f01579d 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -460,8 +460,10 @@ def _forward_v1_style( # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. num_block, block_size, head_num, head_dim = self.key_cache.shape - key = self.key_cache.view(num_block, block_size, -1) # type: ignore - value = self.value_cache.view(num_block, block_size, -1) # type: ignore + key = self.key_cache.view( # type: ignore + num_block, block_size, -1) + value = self.value_cache.view( # type: ignore + num_block, block_size, -1) output, _ = torch_npu.npu_fused_infer_attention_score( query=query, From 8ac3f51c02ec6802b0eccff40adcebe3cf2f0733 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 12:16:20 +0800 Subject: [PATCH 13/14] fix mypy Signed-off-by: Angazenn --- vllm_ascend/attention/attention_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 0d8f01579d..1db0ae3a33 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -459,7 +459,7 @@ def _forward_v1_style( if torch.version.cann.startswith("8.3"): # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. - num_block, block_size, head_num, head_dim = self.key_cache.shape + num_block, block_size, _, _ = self.key_cache.shape # type: ignore key = self.key_cache.view( # type: ignore num_block, block_size, -1) value = self.value_cache.view( # type: ignore From 3be8a40c40b30a03e747a0258c71747162b3ee5f Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 20 Sep 2025 12:44:03 +0800 Subject: [PATCH 14/14] fix ut Signed-off-by: Angazenn --- vllm_ascend/attention/attention_mask.py | 2 +- vllm_ascend/attention/attention_v1.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index 3819d8b8df..cf92affd38 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -82,7 +82,7 @@ def get_splitfuse_attn_mask( dtype: torch.dtype = None, device: torch.device = None, ) -> torch.Tensor: - if self.device: + if torch.version.cann.startswith("8.3"): return self.chunked_prefill_attn_mask else: if dtype not in [torch.float16, torch.bfloat16]: diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 1db0ae3a33..bc7f69ce5a 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -586,8 +586,12 @@ def forward( output) # Normal V1 situation. 
else: - num_tokens = attn_metadata.query_start_loc[-1] - query = query[:num_tokens] + if torch.version.cann.startswith("8.3"): + # npu_fused_infer_attention_score does not support cases + # where query.shape[0] != attn_metadata.query_start_loc[-1]. + # Thus we need unpad it here. + num_tokens = attn_metadata.query_start_loc[-1] + query = query[:num_tokens] output = self._forward_v1_style(query, attn_metadata, output) # to make in-place change to the output tensor
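Note (not part of any patch above): the following is a minimal, consolidated Python sketch of where the series ends up, included only as a reading aid. The free-standing names build_chunked_prefill_mask and chunked_prefill_attention are hypothetical stand-ins for the corresponding AttentionMaskBuilder and Ascend attention backend methods; the arguments simply restate what the diffs add.

# Consolidated sketch of the dispatch introduced by this series.
# Helper names are hypothetical; the real code lives in
# AttentionMaskBuilder and the Ascend attention backend impl.
import torch
import torch_npu


def build_chunked_prefill_mask(device):
    # New compressed mask: a fixed 2048x2048 upper-triangular int8 mask,
    # built once and kept on the target NPU.
    assigned_mask_dim = 2048
    return torch.triu(torch.ones(assigned_mask_dim, assigned_mask_dim),
                      diagonal=1).to(torch.int8).to(device)


def chunked_prefill_attention(query, key_cache, value_cache, attn_metadata,
                              num_heads, num_kv_heads, scale, output):
    # On the new path the caller first unpads query to
    # attn_metadata.query_start_loc[-1] tokens, since
    # npu_fused_infer_attention_score requires query.shape[0] to match it.
    if torch.version.cann.startswith("8.3"):
        # CANN 8.3+: fused infer attention with the compressed mask,
        # TND layout and per-request query/KV sequence lengths.
        num_block, block_size, _, _ = key_cache.shape
        key = key_cache.view(num_block, block_size, -1)
        value = value_cache.view(num_block, block_size, -1)
        output, _ = torch_npu.npu_fused_infer_attention_score(
            query=query,
            key=key,
            value=value,
            atten_mask=attn_metadata.attn_mask,
            block_table=attn_metadata.block_tables,
            input_layout="TND",
            block_size=block_size,
            actual_seq_lengths=attn_metadata.query_start_loc[1:],
            actual_seq_lengths_kv=attn_metadata.seq_lens,
            num_key_value_heads=num_kv_heads,
            num_heads=num_heads,
            scale=scale,
            sparse_mode=3,
        )
    else:
        # Older CANN: keep the legacy paged-attention splitfuse kernel and
        # the per-position, dtype-scaled mask from get_splitfuse_attn_mask.
        torch_npu._npu_paged_attention_splitfuse(
            query=query,
            key_cache=key_cache,
            value_cache=value_cache,
            mask=attn_metadata.attn_mask,
            block_table=attn_metadata.block_tables,
            seq_len=attn_metadata.query_lens,
            context_lens=attn_metadata.seq_lens,
            num_kv_heads=num_kv_heads,
            num_heads=num_heads,
            scale_value=scale,
            out=output)
    return output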