112 | 112 |  from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
113 | 113 |                                 AscendSocVersion, ProfileExecuteDuration,
114 | 114 |                                 get_ascend_soc_version, is_310p,
115 |     | -                              lmhead_tp_enable, verify_torch_npu_version)
    | 115 | +                              lmhead_tp_enable)
116 | 116 |  from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
117 | 117 |  
118 | 118 |  if TYPE_CHECKING:
@@ -301,10 +301,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
301 | 301 |              use_mla=self.model_config.use_mla,
302 | 302 |          )
303 | 303 |  
304 |     | -        pta_version_support_compressed_mask = "2.7.1.dev20250918"
305 |     | -        self.compressed_mask = verify_torch_npu_version(pta_version_support_compressed_mask, "compressed mask")
306 |     | -
307 |     | -        if self.compressed_mask:
    | 304 | +        if torch.version.cann.startswith("8.3"):
308 | 305 |              self.attn_mask_builder = AttentionMaskBuilder(
309 | 306 |                  self.scheduler_config.max_num_batched_tokens, self.dtype, self.device)
310 | 307 |          else:
@@ -825,7 +822,7 @@ def _make_attention_mask(self, seq_lens, position,
825 | 822 |                               attn_state) -> torch.Tensor:
826 | 823 |          # Chunk Prefill situation.
827 | 824 |          if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla:
828 |     | -            if self.compressed_mask:
    | 825 | +            if torch.version.cann.startswith("8.3"):
829 | 826 |                  return self.attn_mask_builder.get_splitfuse_attn_mask()
830 | 827 |              else:
831 | 828 |                  return self.attn_mask_builder.get_splitfuse_attn_mask(
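
For reviewers, here is a minimal sketch of the version gate this change adopts: the compressed splitfuse attention mask path is now selected by checking the installed CANN toolkit version string (torch.version.cann, populated when torch_npu is imported), instead of probing the torch_npu wheel version with verify_torch_npu_version. The helper name supports_compressed_mask and the defensive getattr fallback are illustrative assumptions and are not part of this patch.

import torch
import torch_npu  # noqa: F401  # assumes an Ascend environment; torch_npu exposes the CANN version on torch.version


def supports_compressed_mask() -> bool:
    """Illustrative helper (not in the patch): mirrors the new CANN 8.3 gate."""
    # torch.version.cann is a version string such as "8.3.RC1" once torch_npu is imported;
    # fall back to False when the attribute is absent, e.g. on a non-Ascend host.
    cann_version = getattr(torch.version, "cann", None)
    return isinstance(cann_version, str) and cann_version.startswith("8.3")

With this gate, both the AttentionMaskBuilder construction in __init__ and the chunked-prefill branch of _make_attention_mask take the compressed-mask path only on CANN 8.3, while older toolkits keep the previous mask construction unchanged.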