
Commit c14a251

Merge branch 'main' into async-sched-dp
2 parents: 2fdce73 + 88d7af6

File tree: 6 files changed, +42 −16 lines

tests/ut/core/test_scheduler.py

Lines changed: 2 additions & 1 deletion
@@ -8,6 +8,7 @@
                          SchedulerConfig, SpeculativeConfig, VllmConfig)
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.sampling_params import SamplingParams
+from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -38,7 +39,7 @@ def create_requests(
     max_tokens: int = 16,
     stop_token_ids: Optional[list[int]] = None,
     block_size: int = 3,
-    hash_fn=hash,
+    hash_fn=sha256,
 ):
     init_none_hash(hash_fn)
     prompt_logprobs = PROMPT_LOGPROBS
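
The same swap from Python's built-in hash() to vllm.utils.sha256 appears again in tests/ut/kv_connector/utils.py below. A minimal sketch of the resulting call pattern, using only the names visible in these hunks (the surrounding request construction is omitted); the presumed motivation is that sha256 is deterministic across processes, while the built-in hash() is salted per interpreter run:

    # Sketch only: how the test helpers now wire sha256 into vLLM's block hashing.
    from vllm.utils import sha256
    from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash

    block_size = 3
    init_none_hash(sha256)  # initialize the sentinel "none" hash with sha256
    block_hasher = get_request_block_hasher(block_size, sha256)  # per-request block hasher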

tests/ut/kv_connector/utils.py

Lines changed: 3 additions & 2 deletions
@@ -10,6 +10,7 @@
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                          ModelConfig, SchedulerConfig, VllmConfig)
+from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.scheduler import Scheduler
@@ -129,10 +130,10 @@ def create_request(
     """Make dummy request for testing."""
     global _none_hash_initialized
     if not _none_hash_initialized:
-        init_none_hash(hash)
+        init_none_hash(sha256)
         _none_hash_initialized = True

-    block_hasher = get_request_block_hasher(block_size, hash)
+    block_hasher = get_request_block_hasher(block_size, sha256)

     kv_transfer_params: Optional[dict[str, Any]] = None

tests/ut/worker/test_worker_v1.py

Lines changed: 12 additions & 2 deletions
@@ -1008,7 +1008,10 @@ def test_load_model_sleep_mode_assertion_error(self, mock_allocator_class):

     @patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything")
     @patch("vllm_ascend.worker.worker_v1.logger")
-    def test_compile_or_warm_up_model_with_eager_mode(self, mock_logger,
+    @patch("torch_npu._npu_matmul_add_fp32")
+    def test_compile_or_warm_up_model_with_eager_mode(self,
+                                                      mock_npu_matmul_add,
+                                                      mock_logger,
                                                       mock_seed_everything):
         """Test compile_or_warm_up_model method - eager mode"""
         from vllm_ascend.worker.worker_v1 import NPUWorker
@@ -1050,10 +1053,14 @@ def test_compile_or_warm_up_model_with_eager_mode(self, mock_logger,
         # Verify seed setting
         mock_seed_everything.assert_called_once_with(12345)

+        # Verify calls
+        mock_npu_matmul_add.assert_called_once()
+
     @patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything")
     @patch("vllm_ascend.worker.worker_v1.logger")
+    @patch("torch_npu._npu_matmul_add_fp32")
     def test_compile_or_warm_up_model_with_graph_capture(
-            self, mock_logger, mock_seed_everything):
+            self, mock_npu_matmul_add, mock_logger, mock_seed_everything):
         """Test compile_or_warm_up_model method - with graph capture enabled"""
         from vllm_ascend.worker.worker_v1 import NPUWorker

@@ -1086,6 +1093,9 @@ def test_compile_or_warm_up_model_with_graph_capture(
         # Verify seed setting
         mock_seed_everything.assert_called_once_with(67890)

+        # Verify calls
+        mock_npu_matmul_add.assert_called_once()
+
     @patch("vllm_ascend.worker.worker_v1.CaMemAllocator")
     def test_initialize_from_config_with_sleep_mode(self,
                                                     mock_allocator_class):
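
A note on the argument order above: stacked unittest.mock.patch decorators inject their mocks bottom-up, so the newly added @patch("torch_npu._npu_matmul_add_fp32"), being closest to the function, supplies the first mock argument after self, ahead of mock_logger and mock_seed_everything. A minimal standalone sketch of that rule (the patched targets here are arbitrary stand-ins, not the real test targets):

    from unittest.mock import MagicMock, patch

    class Demo:

        @patch("os.getcwd")        # outermost decorator -> last mock argument
        @patch("os.path.exists")   # innermost decorator -> first mock argument
        def run(self, mock_exists, mock_getcwd):
            # Both targets arrive as MagicMock instances, in bottom-up order.
            assert isinstance(mock_exists, MagicMock)
            assert isinstance(mock_getcwd, MagicMock)

    Demo().run()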

vllm_ascend/models/qwen3_moe.py

Lines changed: 0 additions & 6 deletions
@@ -20,7 +20,6 @@
 from typing import Optional, Union

 import torch
-import torch_npu
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.compilation.decorators import support_torch_compile
@@ -277,11 +276,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
-        # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache) may cause performance degradation at runtime.
-        x = torch.rand((2, 4), dtype=torch.float16).npu()
-        weight = torch.rand((2, 4), dtype=torch.float16).npu()
-        c = torch.rand((4, 4), dtype=torch.float32).npu()
-        torch_npu._npu_matmul_add_fp32(x, weight, c)

     def forward(
         self,

vllm_ascend/ops/linear.py

Lines changed: 16 additions & 5 deletions
@@ -62,6 +62,7 @@ def __init__(
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         self.comm_group = None
         if prefix.find("gate_up_proj") != -1 and mlp_tp_enable():
@@ -88,7 +89,8 @@ def __init__(
                          params_dtype,
                          quant_config,
                          prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)

         self.gather_output = gather_output

@@ -137,6 +139,7 @@ def __init__(
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         if prefix.find("down_proj") != -1 and mlp_tp_enable():
             comm_group = get_mlp_tp_group()
@@ -156,6 +159,7 @@ def __init__(
             self.forward_type = "normal"
         self.comm_group = comm_group

+        # TODO: check for disable_tp
         self.tp_size = self.comm_group.world_size
         self.tp_rank = self.comm_group.rank_in_group

@@ -171,7 +175,8 @@ def __init__(
                          params_dtype,
                          quant_config,
                          prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)

         self.input_is_parallel = input_is_parallel
         self.reduce_results = reduce_results
@@ -392,6 +397,7 @@ def __init__(
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         if prefix.find("gate_up_proj") != -1 and mlp_tp_enable():
             comm_group = get_mlp_tp_group()
@@ -403,6 +409,7 @@ def __init__(
             comm_group = get_tp_group()
             self.forward_type = "normal_tp"
         self.comm_group = comm_group
+        # TODO: check for disable_tp
         self.tp_rank = comm_group.rank_in_group
         self.tp_size = comm_group.world_size

@@ -418,7 +425,8 @@ def __init__(
                          params_dtype=params_dtype,
                          quant_config=quant_config,
                          prefix=prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)

     def forward(
         self,
@@ -498,6 +506,7 @@ def __init__(
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         if dense_optim_enable():
             self.forward_type = "dense_optim"
@@ -511,6 +520,7 @@ def __init__(
             total_num_kv_heads = total_num_heads
         self.total_num_kv_heads = total_num_kv_heads
         # Divide the weight matrix along the last dimension.
+        # TODO: check for disable_tp
         tp_size = self.comm_group.world_size
         self.num_heads = divide(self.total_num_heads, tp_size)
         if tp_size >= self.total_num_kv_heads:
@@ -537,7 +547,8 @@ def __init__(
                          params_dtype=params_dtype,
                          quant_config=quant_config,
                          prefix=prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)

     def forward(
         self,
@@ -611,4 +622,4 @@ def __init__(
         self.quant_method = quant_config.get_quant_method(self,
                                                           prefix=prefix)
         self.return_bias = return_bias
-        self.disable_tp = disable_tp
+        self.disable_tp = disable_tp
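
The pattern repeated across these hunks: every vllm-ascend linear wrapper gains a keyword-only disable_tp argument and forwards it to the vLLM base class, which already stores it (last hunk); honoring it when deriving tp_size/tp_rank is still marked TODO. A condensed, hypothetical sketch of the forwarding shape only (BaseLinear and AscendLinear are placeholder names, not the real classes):

    # Hypothetical skeleton of the plumbing added in this file.
    class BaseLinear:

        def __init__(self, *, return_bias: bool = True, disable_tp: bool = False):
            self.return_bias = return_bias
            self.disable_tp = disable_tp  # stored by the base class

    class AscendLinear(BaseLinear):

        def __init__(self, prefix: str = "", *, return_bias: bool = True,
                     disable_tp: bool = False):
            # TODO in the diff: tp_size/tp_rank are still taken from the comm
            # group and do not yet honor disable_tp.
            super().__init__(return_bias=return_bias, disable_tp=disable_tp)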

vllm_ascend/worker/worker_v1.py

Lines changed: 9 additions & 0 deletions
@@ -250,10 +250,19 @@ def compile_or_warm_up_model(self) -> None:
             self.model_runner._dummy_run(size)
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
+        # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache)
+        # may cause performance degradation at runtime.
+        self._warm_up_atb()
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         NPUPlatform.seed_everything(self.model_config.seed)

+    def _warm_up_atb(self):
+        x = torch.rand((2, 4), dtype=torch.float16).npu()
+        weight = torch.rand((2, 4), dtype=torch.float16).npu()
+        c = torch.rand((4, 4), dtype=torch.float32).npu()
+        torch_npu._npu_matmul_add_fp32(x, weight, c)
+
     def get_model(self) -> nn.Module:
         return self.model_runner.get_model()
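
This moves the ATB warm-up out of Qwen3MoE's constructor (removed above) and into the worker, so it runs once at start-up regardless of the model and of eager versus graph-capture mode. A minimal sketch of how the hook can be verified, mirroring the unit tests above (assumes torch_npu is importable and an NPU device is available, since the helper allocates tensors via .npu(); worker construction is elided and this is not the real test code):

    from unittest.mock import patch

    def check_atb_warm_up(worker) -> None:
        # Patch the private ATB op, run the warm-up helper, and confirm it fired once.
        with patch("torch_npu._npu_matmul_add_fp32") as mock_op:
            worker._warm_up_atb()
            mock_op.assert_called_once()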
