3 files changed: +16 −10 lines changed

@@ -429,7 +429,8 @@ def test_execute_dummy_batch(self):
         worker.execute_dummy_batch()

         # Verify call
-        mock_model_runner._dummy_run.assert_called_once_with(1)
+        mock_model_runner._dummy_run.assert_called_once_with(
+            1, uniform_decode=True, force_attention=True)

     @patch("vllm_ascend.worker.worker_v1.envs_vllm")
     @patch("vllm_ascend.worker.worker_v1.logger")
@@ -2377,11 +2377,11 @@ def _dummy_run(
             with_prefill = True

         attn_metadata = self._build_attention_metadata(
-            with_prefill,
-            num_reqs,
-            num_tokens,
-            max_query_len,
-            force_attention,
+            create_mixed_batch=False,
+            num_reqs=num_reqs,
+            num_tokens=num_tokens,
+            max_query_len=max_query_len,
+            force_attention=force_attention,
         )

         if not self.in_profile_run and self.dynamic_eplb:
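Beyond readability, the switch to keyword arguments documents that the first parameter of `_build_attention_metadata` is `create_mixed_batch`; the old positional call slotted `with_prefill` into that position. A hedged sketch of the pattern, with an assumed and heavily simplified signature (the real method constructs attention metadata objects, not a dict):

```python
# Assumed signature for illustration only: keyword-only parameters make it
# impossible to pass with_prefill into the create_mixed_batch slot by accident.
def _build_attention_metadata(*, create_mixed_batch: bool, num_reqs: int,
                              num_tokens: int, max_query_len: int,
                              force_attention: bool) -> dict:
    # Stand-in body; the real method builds backend-specific metadata.
    return {
        "create_mixed_batch": create_mixed_batch,
        "num_reqs": num_reqs,
        "num_tokens": num_tokens,
        "max_query_len": max_query_len,
        "force_attention": force_attention,
    }

# OK: every argument is named at the call site.
meta = _build_attention_metadata(create_mixed_batch=False, num_reqs=1,
                                 num_tokens=1, max_query_len=1,
                                 force_attention=False)

# TypeError: positional arguments are rejected, which is the point.
# _build_attention_metadata(False, 1, 1, 1, False)
```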
@@ -25,7 +25,8 @@
 import torch_npu
 import vllm.envs as envs_vllm
 from torch_npu.op_plugin.atb._atb_ops import _register_atb_extensions
-from vllm.config import VllmConfig
+from torch_npu.profiler import dynamic_profile as dp
+from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
@@ -333,9 +334,13 @@ def pin_lora(self, lora_id: int) -> bool:
         return self.model_runner.pin_lora(lora_id)

     def execute_dummy_batch(self) -> None:
-        self.model_runner._dummy_run(1,
-                                     uniform_decode=True,
-                                     force_attention=True)
+        uniform_decode = self.compilation_config.cudagraph_mode in [
+            CUDAGraphMode.FULL, CUDAGraphMode.PIECEWISE
+        ]
+        force_attention = self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
+        self.model_runner._dummy_run(num_tokens=1,
+                                     uniform_decode=uniform_decode,
+                                     force_attention=force_attention)

     def _init_worker_distributed_environment(self) -> None:
         """Initialize the distributed environment."""