Commit bf03457

bugfix_mla
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
1 parent f5f88f7 commit bf03457

File tree

1 file changed: +5 −1 lines

vllm_ascend/attention/mla_v1.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -189,6 +189,10 @@ def __init__(self,
                           self.block_size - 1) // self.block_size
         self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
         if self.chunked_prefill_enabled:
+            if vllm_config.kv_transfer_config.is_kv_consumer:
+                max_chunked_size = scheduler_config.max_num_seqs * self.block_size
+            else:
+                max_chunked_size = 128 * 1024
             self.chunked_prefill_workspace_size = min(
                 # Make sure there is enough for 8 full-length requests or at
                 # least 4 pages of cache per request
@@ -202,7 +206,7 @@ def __init__(self,
                 # which would result in up-projected context being
                 # 2*(192*128)*(64*1024) = 3gb
                 # (assuming 192 QK head dim, 128 heads, and fp16)
-                128 * 1024)
+                max_chunked_size)
             assert self.chunked_prefill_workspace_size >= \
                 scheduler_config.max_num_seqs * self.block_size
             self.chunked_prefill_workspace = torch.empty(
```
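The net effect: a KV-consumer instance in a disaggregated-prefill deployment (which only decodes, so it never runs a long chunked prefill itself) no longer reserves the full 128 * 1024-token workspace; it shrinks to the scheduler minimum, which is exactly the lower bound the existing assert enforces. The sketch below replays that sizing arithmetic outside of vLLM. `SchedulerConfig` and `KVTransferConfig` here are simplified stand-in dataclasses, not the real vLLM config objects (the real attribute path is `vllm_config.kv_transfer_config`), and the `max(8 * max_model_len, ...)` term is taken from the surrounding upstream code rather than from this diff itself.

```python
# Standalone sketch of the workspace sizing after this patch.
# SchedulerConfig / KVTransferConfig are simplified stand-ins for the
# real vLLM config objects; only the arithmetic mirrors mla_v1.py.
from dataclasses import dataclass


@dataclass
class SchedulerConfig:
    max_num_seqs: int = 256
    max_model_len: int = 32 * 1024


@dataclass
class KVTransferConfig:
    is_kv_consumer: bool = False


def chunked_prefill_workspace_size(scheduler: SchedulerConfig,
                                   kv_transfer: KVTransferConfig,
                                   block_size: int = 128) -> int:
    # A KV consumer in a disaggregated setup only decodes, so its
    # workspace cap can shrink to the scheduler minimum; all other
    # roles keep the original 128k-token cap.
    if kv_transfer.is_kv_consumer:
        max_chunked_size = scheduler.max_num_seqs * block_size
    else:
        max_chunked_size = 128 * 1024

    workspace_size = min(
        # Enough for 8 full-length requests, or at least 4 pages of
        # cache per request (term taken from the surrounding
        # upstream code, not this diff).
        max(8 * scheduler.max_model_len,
            4 * scheduler.max_num_seqs * block_size),
        max_chunked_size)

    # The assert from the patched file still holds for consumers,
    # because max_chunked_size is exactly max_num_seqs * block_size.
    assert workspace_size >= scheduler.max_num_seqs * block_size
    return workspace_size


# Consumer: 256 seqs * 128 block size = 32768 tokens of workspace,
# versus the 131072-token (128 * 1024) cap a producer still gets.
print(chunked_prefill_workspace_size(SchedulerConfig(),
                                     KVTransferConfig(is_kv_consumer=True)))
```

For scale, the comment in the diff prices the worst case at 2 bytes (fp16) × (192 × 128) up-projected dims × (64 × 1024) tokens = 3,221,225,472 bytes, the "3gb" figure, which is why shrinking the cap on consumers is a meaningful memory saving.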
