File tree Expand file tree Collapse file tree 1 file changed +5
-1
lines changed Expand file tree Collapse file tree 1 file changed +5
-1
lines changed Original file line number Diff line number Diff line change @@ -189,6 +189,10 @@ def __init__(self,
189
189
self .block_size - 1 ) // self .block_size
190
190
self .chunked_prefill_enabled = scheduler_config .chunked_prefill_enabled
191
191
if self .chunked_prefill_enabled :
192
+ if vllm_config .kv_transfer_config .is_kv_consumer :
193
+ max_chunked_size = scheduler_config .max_num_seqs * self .block_size
194
+ else :
195
+ max_chunked_size = 128 * 1024
192
196
self .chunked_prefill_workspace_size = min (
193
197
# Make sure there is enough for 8 full length requests or at least
194
198
# 4 pages of cache per request
@@ -202,7 +206,7 @@ def __init__(self,
202
206
# which would result in up-projected context being
203
207
# 2*(192*128)*(64*1024) = 3gb
204
208
# (assuming 192 QK head dim, 128 heads, and fp16)
205
- 128 * 1024 )
209
+ max_chunked_size )
206
210
assert self .chunked_prefill_workspace_size >= \
207
211
scheduler_config .max_num_seqs * self .block_size
208
212
self .chunked_prefill_workspace = torch .empty (
You can’t perform that action at this time.
0 commit comments