From 6008f1228d569fbb9e743a5358d06c177ef57653 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Sun, 20 Jul 2025 10:58:29 +0800
Subject: [PATCH 1/2] [CI] Fix broken CI

Signed-off-by: wangxiyuan
---
 vllm_ascend/attention/attention_v1.py          | 37 +++++++++++++++-
 .../attention/attention_v1_torchair.py         | 38 +++++++++++++++-
 vllm_ascend/attention/mla_v1.py                | 43 +++++++++++++++++--
 3 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 7d7f488f47..b0e9f3b5b2 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -31,7 +31,7 @@
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec)
+                               nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -43,6 +43,8 @@ def get_name() -> str:
 
     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendAttentionBackendImpl092
         return AscendAttentionBackendImpl
 
     @staticmethod
@@ -222,7 +224,6 @@ def __init__(
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
@@ -437,6 +438,38 @@ def forward(
         return output.view(num_tokens, self.hidden_size)
 
 
+class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+        use_irope: bool = False,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+        )
+
+
 def unified_ascend_attention_with_output(
     query: torch.Tensor,
     key: torch.Tensor,
diff --git a/vllm_ascend/attention/attention_v1_torchair.py b/vllm_ascend/attention/attention_v1_torchair.py
index 9d9b91bad6..0c50290e5a 100644
--- a/vllm_ascend/attention/attention_v1_torchair.py
+++ b/vllm_ascend/attention/attention_v1_torchair.py
@@ -29,7 +29,7 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d)
+                               nd_to_nz_2d, vllm_version_is)
 
 
 class AscendAttentionTorchairBackend(AttentionBackend):
@@ -41,6 +41,8 @@ def get_name() -> str:
 
     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendAttentionTorchairBackendImpl092
         return AscendAttentionTorchairBackendImpl
 
     @staticmethod
@@ -333,7 +335,6 @@ def __init__(
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
@@ -501,3 +502,36 @@ def forward(
                 "to use ascend scheduler.")
 
         return output.view(num_tokens, self.hidden_size)
+
+
+class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
+                                            ):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+        use_irope: bool = False,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+        )
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index a8fb7bc68f..c70195b3ae 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1,11 +1,12 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, TypeVar
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
+                    TypeVar)
 
 import numpy as np
 import torch
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata,
+                                              AttentionMetadata, AttentionType,
                                               MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import get_current_vllm_config
@@ -20,7 +21,8 @@
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import npu_prefetch, npu_stream_switch, npu_wait_tensor
+from vllm_ascend.utils import (npu_prefetch, npu_stream_switch,
+                               npu_wait_tensor, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import InputBatch
 
 if TYPE_CHECKING:
@@ -66,6 +68,8 @@ def get_kv_cache_shape(num_blocks: int, block_size: int, num_kv_heads: int,
 
     @staticmethod
     def get_impl_cls() -> Type["MLAAttentionImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendMLAImpl092
         return AscendMLAImpl
 
 
@@ -533,7 +537,6 @@ def __init__(
             alibi_slopes: Optional[list[float]],
             sliding_window: Optional[int],
             kv_cache_dtype: str,
-            blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
             kv_sharing_target_layer_name: Optional[str] = None,
@@ -1226,3 +1229,35 @@ def forward(
             output[:num_decode_tokens] = output_decode
 
         return output_padded
+
+
+class AscendMLAImpl092(AscendMLAImpl):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+        use_irope: bool = False,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+        )

From 0407ffc5b580c5f0e743017e6f4170d80c2d3b66 Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Sun, 20 Jul 2025 11:37:26 +0800
Subject: [PATCH 2/2] Fix ut

Signed-off-by: Yikun Jiang
---
 tests/ut/attention/test_attention_v1.py | 11 +++++++--
 vllm_ascend/attention/mla_v1.py         | 31 ++++++++++++-------------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py
index e9ce36e2e0..51fbae233d 100644
--- a/tests/ut/attention/test_attention_v1.py
+++ b/tests/ut/attention/test_attention_v1.py
@@ -3,12 +3,15 @@
 import torch
 
 from tests.ut.base import TestBase
+from vllm_ascend.attention.attention_v1 import \
+    AscendAttentionBackendImpl092  # isort: skip
 from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                 AscendAttentionBackendImpl,
                                                 AscendAttentionMetadataBuilder,
                                                 AscendAttentionState,
                                                 AscendMetadata,
                                                 CommonAttentionState)
+from vllm_ascend.utils import vllm_version_is
 
 
 class TestAscendAttentionBackend(TestBase):
@@ -17,8 +20,12 @@ def test_get_name(self):
         self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")
 
     def test_get_impl_cls(self):
-        self.assertEqual(AscendAttentionBackend.get_impl_cls(),
-                         AscendAttentionBackendImpl)
+        if vllm_version_is("0.9.2"):
+            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
+                             AscendAttentionBackendImpl092)
+        else:
+            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
+                             AscendAttentionBackendImpl)
 
     def test_get_metadata_cls(self):
         self.assertEqual(AscendAttentionBackend.get_metadata_cls(),
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index c70195b3ae..37e9454c46 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1233,21 +1233,20 @@ def forward(
 
 class AscendMLAImpl092(AscendMLAImpl):
 
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
+    def __init__(self,
+                 num_heads: int,
+                 head_size: int,
+                 scale: float,
+                 num_kv_heads: int,
+                 alibi_slopes: Optional[List[float]],
+                 sliding_window: Optional[int],
+                 kv_cache_dtype: str,
+                 blocksparse_params: Optional[Dict[str, Any]] = None,
+                 logits_soft_cap: Optional[float] = None,
+                 attn_type: str = AttentionType.DECODER,
+                 kv_sharing_target_layer_name: Optional[str] = None,
+                 use_irope: bool = False,
+                 **kwargs) -> None:
         super().__init__(
             num_heads=num_heads,
             head_size=head_size,
@@ -1260,4 +1259,4 @@ def __init__(
             attn_type=attn_type,
             kv_sharing_target_layer_name=kv_sharing_target_layer_name,
             use_irope=use_irope,
-        )
+            **kwargs)