vllm_ascend/attention/mla_v1.py (6 additions, 1 deletion)

@@ -189,6 +189,11 @@ def __init__(self,
                            self.block_size - 1) // self.block_size
         self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
         if self.chunked_prefill_enabled:
+            if vllm_config.kv_transfer_config is not None and \
+                    vllm_config.kv_transfer_config.is_kv_consumer:
+                max_chunked_size = scheduler_config.max_num_seqs * self.block_size

Collaborator: Is the OOM caused by the batch size being greater than 1024?

+            else:
+                max_chunked_size = 128 * 1024

Collaborator: Constants like this should be defined at the top of the file in uppercase, and don't forget to add a comment for them as well (see the sketch after this diff).

             self.chunked_prefill_workspace_size = min(
                 # Make sure there is enough for 8 full-length requests or at least
                 # 4 pages of cache per request
@@ -202,7 +207,7 @@ def __init__(self,
                 # which would result in up-projected context being
                 # 2*(192*128)*(64*1024) = 3gb
                 # (assuming 192 QK head dim, 128 heads, and fp16)
-                128 * 1024)
+                max_chunked_size)
             assert self.chunked_prefill_workspace_size >= \
                 scheduler_config.max_num_seqs * self.block_size
             self.chunked_prefill_workspace = torch.empty(
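
A minimal sketch of the refactor the reviewer is asking for, assuming the literal is hoisted to a commented, uppercase module-level constant; the names DEFAULT_MAX_CHUNKED_PREFILL_TOKENS and pick_max_chunked_size are hypothetical and only illustrate the suggestion, they are not part of the PR.

# Default cap on the chunked-prefill workspace, in tokens. Hypothetical name,
# defined once at the top of the file with a comment, as the reviewer suggests.
DEFAULT_MAX_CHUNKED_PREFILL_TOKENS = 128 * 1024


def pick_max_chunked_size(is_kv_consumer: bool, max_num_seqs: int,
                          block_size: int) -> int:
    # Mirrors the branch added in mla_v1.py, but reads the named constant
    # instead of the inline 128 * 1024 literal.
    if is_kv_consumer:
        # KV consumers cap the workspace at one block per scheduled sequence.
        return max_num_seqs * block_size
    return DEFAULT_MAX_CHUNKED_PREFILL_TOKENS


# Example values (assumed, not from the PR):
print(pick_max_chunked_size(True, max_num_seqs=256, block_size=128))   # 32768
print(pick_max_chunked_size(False, max_num_seqs=256, block_size=128))  # 131072
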
vllm_ascend/worker/worker_v1.py (8 additions, 1 deletion)

@@ -175,7 +175,14 @@ def determine_available_memory(self) -> int:
                 "not properly cleaned up before initializing the vLLM instance.")

         # Get the peak memory allocation recorded by torch
-        peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"]
+        chunked_prefill_workspace_size = self.vllm_config.cache_config.block_size * \
+            self.vllm_config.scheduler_config.max_num_seqs * 1024 * 1024
+        if self.vllm_config.scheduler_config.enable_chunked_prefill:
+            peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"] + \
+                chunked_prefill_workspace_size
+        else:
+            peak_memory = torch_npu.npu.memory_stats(
+            )["allocated_bytes.all.peak"]
         # TODO: this func doesn't need to be implemented after empty_cache in
         # Worker.determine_num_available_blocks() is unified
         NPUPlatform.empty_cache()
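
To make the new reservation concrete, a quick sanity check of its size; the config values below are assumptions chosen for illustration, not taken from the PR.

# Reproduce the workspace term added in determine_available_memory().
block_size = 128      # assumed cache_config.block_size
max_num_seqs = 256    # assumed scheduler_config.max_num_seqs

# block_size * max_num_seqs * 1024 * 1024 bytes are added on top of the
# torch-recorded peak whenever chunked prefill is enabled.
chunked_prefill_workspace_size = block_size * max_num_seqs * 1024 * 1024
print(chunked_prefill_workspace_size / 1024 ** 3)  # 32.0 (GiB) with these values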