
Commit 6d46c39

[CI] upgrade to vllm 0.9.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 01e3d59 commit 6d46c39

10 files changed: +72 additions, -125 deletions
tests/utils.py

Lines changed: 2 additions & 5 deletions
@@ -46,14 +46,11 @@
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import FlexibleArgumentParser, GB_bytes, get_open_port
 
-from vllm_ascend.utils import vllm_version_is
 
 from .model_utils import TextTextLogprobs
 
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm.model_executor.model_loader.loader import get_model_loader  # type: ignore[import] # isort: skip
-else:
-    from vllm.model_executor.model_loader import get_model_loader
+
+from vllm.model_executor.model_loader import get_model_loader
 
 VLLM_PATH = Path(__file__).parent.parent
 """Path to root of the vLLM repository."""

vllm_ascend/attention/attention_v1.py

Lines changed: 5 additions & 9 deletions
@@ -30,7 +30,6 @@
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
-from vllm_ascend.utils import vllm_version_is
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -142,14 +141,11 @@ def reorder_batch(self, input_batch: "InputBatch",
 
     def build(self, num_reqs, num_actual_tokens, max_query_len,
               common_prefix_len):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[
+            0].get_device_tensor()
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
 
         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
vllm_ascend/attention/mla_v1.py

Lines changed: 5 additions & 9 deletions
@@ -16,7 +16,6 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 if TYPE_CHECKING:
@@ -239,14 +238,11 @@ def build(self,
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
         device = self.runner.device
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[
+            0].get_device_tensor()
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(

vllm_ascend/ops/fused_moe.py

Lines changed: 39 additions & 63 deletions
@@ -28,13 +28,11 @@
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
 
-from vllm_ascend.utils import vllm_version_is
 
-if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-    from vllm.model_executor.layers.fused_moe.layer import (
-        FusedMoEParallelConfig, MoEConfig)
-else:
-    MoEConfig = None
+
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoEParallelConfig, MoEConfig)
+
 
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
@@ -587,10 +585,9 @@ def select_experts(
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
     def __init__(self, moe: MoEConfig = None):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            super().__init__()
-        else:
-            super().__init__(moe=moe)
+
+
+        super().__init__(moe=moe)
         vllm_config = get_current_vllm_config()
 
         ep_group = get_ep_group()
@@ -731,23 +728,16 @@ def __init__(
         params_dtype = torch.get_default_dtype()
 
         vllm_config = get_current_vllm_config()
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.ep_size = get_ep_group().world_size
-            self.tp_size = get_etp_group().world_size
-            self.dp_size = (dp_size if dp_size is not None else
-                            get_dp_group().world_size)
-            self.dp_rank = (0 if self.dp_size == 1 else
-                            get_dp_group().rank_in_group)
-        else:
-            self.moe_parallel_config: FusedMoEParallelConfig = (
-                FusedMoEParallelConfig.make(
-                    tp_size_=(tp_size if tp_size is not None else
-                              get_tensor_model_parallel_world_size()),
-                    dp_size_=(dp_size if dp_size is not None else
-                              get_dp_group().world_size),
-                    vllm_parallel_config=vllm_config.parallel_config))
 
-            self.moe_parallel_config.ep_size = get_ep_group().world_size
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.moe_parallel_config.ep_size = get_ep_group().world_size
 
         self.top_k = top_k
         self.num_experts = num_experts
@@ -772,54 +762,40 @@ def __init__(
             self.local_num_experts, self.expert_map = determine_expert_map(
                 self.ep_size,
                 get_ep_group().rank_in_group, self.global_num_experts)
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = get_etp_group().rank_in_group
-                self.ep_rank = get_ep_group().rank_in_group
-            else:
-                self.moe_parallel_config.tp_rank = get_etp_group(
-                ).rank_in_group
-                self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
+
+            self.moe_parallel_config.tp_rank = get_etp_group(
+            ).rank_in_group
+            self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
 
         else:
             # Adjust TP size for DP attention
             # haven't test its functionality yet, may remove in the future
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = self.tp_size * self.dp_rank
-                self.ep_rank = 0
-                self.tp_size = self.tp_size * self.dp_size
-                self.ep_size = 1
-            else:
-                self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
-                self.moe_parallel_config.ep_rank = 0
-                self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
-                self.moe_parallel_config.ep_size = 1
+
+            self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
+            self.moe_parallel_config.ep_rank = 0
+            self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
+            self.moe_parallel_config.ep_size = 1
 
             self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                        None)
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            if quant_config is None:
-                self.quant_method: Optional[QuantizeMethodBase] = (
-                    AscendUnquantizedFusedMoEMethod())
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
-        else:
-            moe = MoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-            )
 
-            if quant_config is None:
-                self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
+        moe = MoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            # TODO (bnell): this needs to be fixed for quantized types.
+            in_dtype=params_dtype,
+        )
+
+        if quant_config is None:
+            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)
 
         assert self.quant_method is not None
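The removed 0.8.5 branch stored ep_size/tp_size/dp_size directly on the layer, while the retained path keeps them on a FusedMoEParallelConfig and mutates that object. The sketch below uses toy classes, not the real vLLM types, and assumes the layer reads the sizes back through properties that delegate to the shared config:

    from dataclasses import dataclass

    @dataclass
    class ParallelConfigSketch:
        # Toy stand-in for FusedMoEParallelConfig.
        tp_size: int = 1
        dp_size: int = 1
        ep_size: int = 1

    class MoELayerSketch:
        # Toy stand-in for a fused-MoE layer that delegates to its config.
        def __init__(self, cfg: ParallelConfigSketch):
            self.moe_parallel_config = cfg

        @property
        def ep_size(self) -> int:
            return self.moe_parallel_config.ep_size

    layer = MoELayerSketch(ParallelConfigSketch())
    layer.moe_parallel_config.ep_size = 8   # what the diff above does
    assert layer.ep_size == 8               # later reads see the updated value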
vllm_ascend/patch/platform/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.platform import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
File renamed without changes.
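
The gate above is an exact version match. A minimal sketch of what a helper like vllm_version_is could look like (hypothetical; the real implementation lives in vllm_ascend/utils.py and is not part of this diff):

    import vllm

    def vllm_version_is(target: str) -> bool:
        # True only when the installed vLLM version string matches exactly,
        # which is why "0.8.5" and "0.8.5.post1" both had to be listed before.
        return vllm.__version__ == target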

vllm_ascend/patch/worker/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.worker import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.worker import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
File renamed without changes.

vllm_ascend/worker/model_runner.py

Lines changed: 4 additions & 10 deletions
@@ -64,7 +64,6 @@
     _init_attn_metadata_from_tensor_dict,
     _init_sampling_metadata_from_tensor_dict)
 
-from vllm_ascend.utils import vllm_version_is
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -1017,10 +1016,8 @@ def save_sharded_state(
         pattern: Optional[str] = None,
         max_size: Optional[int] = None,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import ShardedStateLoader  # type: ignore[import] # isort: skip # noqa
-        else:
-            from vllm.model_executor.model_loader import ShardedStateLoader
+
+        from vllm.model_executor.model_loader import ShardedStateLoader
         ShardedStateLoader.save_model(
             self.model,
             path,
@@ -1032,11 +1029,8 @@ def save_tensorized_model(
         self,
         tensorizer_config: TensorizerConfig,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import \
-                TensorizerLoader  # type: ignore # noqa
-        else:
-            from vllm.model_executor.model_loader import \
+
+        from vllm.model_executor.model_loader import \
             TensorizerLoader  # type: ignore # noqa
         TensorizerLoader.save_model(
             self.model,

vllm_ascend/worker/model_runner_v1.py

Lines changed: 13 additions & 25 deletions
@@ -61,7 +61,6 @@
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
-from vllm_ascend.utils import vllm_version_is
 
 if TYPE_CHECKING:
     import xgrammar as xgr  # type: ignore[import-untyped]
@@ -210,16 +209,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         # Request states.
         self.requests: Dict[str, CachedRequestState] = {}
         # Persistent batch.
-        # Remove this after we drop 0.8.5 support
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_blocks_per_req=self.max_num_blocks_per_req,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-            )
+
 
         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,
@@ -573,10 +563,8 @@ def _process_reqs(
 
         block_table_indices = (req_indices * self.max_num_blocks_per_req +
                                positions_np // self.block_size)
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
-        else:
-            block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
+
+        block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
         block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
         block_offsets = positions_np % self.block_size
         np.add(block_numbers * self.block_size,
@@ -1182,16 +1170,16 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
         import torch_npu
         kv_caches: Dict[str, torch.Tensor] = {}
-        if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-                block_size=self.cache_config.block_size,
-            )
+
+        self.input_batch = InputBatch(
+            max_num_reqs=self.max_num_reqs,
+            max_model_len=self.model_config.max_model_len,
+            max_num_batched_tokens=self.max_num_tokens,
+            device=self.device,
+            pin_memory=True,
+            vocab_size=self.model_config.get_vocab_size(),
+            block_size=self.cache_config.block_size,
+        )
 
         for kv_cache_group in kv_cache_config.kv_cache_groups:
             kv_cache_spec = kv_cache_group.kv_cache_spec
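
The context around the retained lookup in _process_reqs shows how a token's KV block is located: flatten the (num_reqs, max_num_blocks_per_req) CPU block table and index it with req_index * max_num_blocks_per_req + position // block_size. A small self-contained example of that arithmetic with toy values (illustrative only):

    import numpy as np

    # Toy block table: 2 requests, up to 4 blocks per request, block_size 16.
    max_num_blocks_per_req = 4
    block_size = 16
    block_table_cpu = np.array([[10, 11, 12, 13],
                                [20, 21, 22, 23]])

    # Token at position 35 of request 1 sits in that request's block 35 // 16 == 2.
    req_indices = np.array([1])
    positions_np = np.array([35])

    block_table_indices = (req_indices * max_num_blocks_per_req +
                           positions_np // block_size)
    block_numbers = block_table_cpu.flatten()[block_table_indices]
    block_offsets = positions_np % block_size

    assert block_numbers[0] == 22 and block_offsets[0] == 3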

0 commit comments