23
23
from vllm .logger import logger
24
24
from vllm .multimodal import MULTIMODAL_REGISTRY , MultiModalRegistry
25
25
from vllm .utils import cdiv
26
+ from vllm .v1 .core .kv_cache_manager import KVCacheBlocks
26
27
from vllm .v1 .core .sched .output import NewRequestData , SchedulerOutput
27
28
from vllm .v1 .core .sched .scheduler import Scheduler
28
- from vllm .v1 .core .kv_cache_manager import KVCacheBlocks
29
- from vllm .v1 .core .sched .utils import check_stop
30
- from vllm .v1 .engine import EngineCoreOutput , EngineCoreOutputs , EngineCoreEventType
29
+ from vllm .v1 .engine import EngineCoreEventType , EngineCoreOutputs
31
30
from vllm .v1 .kv_cache_interface import KVCacheConfig
32
31
from vllm .v1 .outputs import ModelRunnerOutput
33
32
from vllm .v1 .request import Request , RequestStatus
@@ -55,7 +54,6 @@ def __init__(
55
54
self .scheduled_req_ids : set [str ] = set ()
56
55
self .running : list [Request ] = []
57
56
58
-
59
57
def schedule (self ) -> SchedulerOutput :
60
58
if self .scheduler_config .chunked_prefill_enabled :
61
59
return super ().schedule ()
@@ -129,7 +127,7 @@ def skip_cur_request():
129
127
130
128
# Total computed tokens (local + external).
131
129
num_computed_tokens = (num_native_computed_tokens +
132
- num_external_computed_tokens )
130
+ num_external_computed_tokens )
133
131
else :
134
132
# P/D: skip checking prefix cache if loaded from remote kvs.
135
133
new_computed_blocks = KVCacheBlocks .create_empty ()
@@ -154,7 +152,7 @@ def skip_cur_request():
154
152
# requests, which have output tokens.
155
153
num_new_tokens = request .num_tokens - num_computed_tokens
156
154
max_tokens_in_kvcache = (self .kv_cache_config .num_blocks *
157
- self .block_size )
155
+ self .block_size )
158
156
prompt_limit = min (prompt_limit , max_tokens_in_kvcache )
159
157
160
158
# Finish request that exceeds prompt_limit or kv cache size.
@@ -166,7 +164,8 @@ def skip_cur_request():
166
164
prompt_limit ,
167
165
)
168
166
request .status = RequestStatus .FINISHED_IGNORED
169
- self .finished_req_ids .add (request .request_id ) # type: ignore
167
+ self .finished_req_ids .add ( # type: ignore
168
+ request .request_id ) # type: ignore
170
169
self .waiting .popleft ()
171
170
continue
172
171
@@ -175,12 +174,11 @@ def skip_cur_request():
175
174
skip_cur_request ()
176
175
continue
177
176
assert num_new_tokens > 0
178
-
179
177
if vllm_version_is ("0.9.0" ):
180
178
blocks = computed_blocks .blocks
181
179
else :
182
180
blocks = computed_blocks .blocks [0 ]
183
-
181
+
184
182
watermark = getattr (self .scheduler_config , "watermark" , 0.01 )
185
183
if not self ._check_watermark_for_prefill (request , num_new_tokens ,
186
184
blocks , watermark ):
@@ -194,8 +192,7 @@ def skip_cur_request():
194
192
num_native_computed_tokens ,
195
193
new_computed_blocks = computed_blocks ,
196
194
num_lookahead_tokens = self .num_lookahead_tokens ,
197
- delay_cache_blocks = load_kv_async
198
- )
195
+ delay_cache_blocks = load_kv_async )
199
196
if new_blocks is None :
200
197
# The request cannot be scheduled.
201
198
break
@@ -221,7 +218,7 @@ def skip_cur_request():
221
218
self .running .append (request )
222
219
if self .log_stats :
223
220
request .record_event (EngineCoreEventType .SCHEDULED ,
224
- scheduled_timestamp )
221
+ scheduled_timestamp )
225
222
self .scheduled_req_ids .add (request .request_id )
226
223
# Check request status.
227
224
if request .status == RequestStatus .WAITING :
@@ -299,8 +296,7 @@ def skip_cur_request():
299
296
request ,
300
297
num_new_tokens ,
301
298
num_draft_tokens = num_draft_tokens ,
302
- num_lookahead_tokens = self .num_lookahead_tokens
303
- )
299
+ num_lookahead_tokens = self .num_lookahead_tokens )
304
300
if new_blocks is None :
305
301
# The request cannot be scheduled.
306
302
# Preempt the lowest-priority request.
@@ -310,7 +306,8 @@ def skip_cur_request():
310
306
preempted_req .num_computed_tokens = 0
311
307
if self .log_stats :
312
308
preempted_req .record_event (
313
- EngineCoreEventType .PREEMPTED , scheduled_timestamp )
309
+ EngineCoreEventType .PREEMPTED ,
310
+ scheduled_timestamp )
314
311
self .waiting .appendleft (preempted_req )
315
312
preempted_reqs .append (preempted_req )
316
313
if preempted_req == request :
@@ -344,7 +341,7 @@ def skip_cur_request():
344
341
del request .spec_token_ids [num_scheduled_spec_tokens :]
345
342
scheduled_spec_decode_tokens [request .request_id ] = (
346
343
request .spec_token_ids )
347
-
344
+
348
345
# Record scheduled LoRA requests.
349
346
if self .lora_config and request .lora_request :
350
347
scheduled_loras .add (request .lora_request .lora_int_id )
0 commit comments