From bf034579eace6bef5d786bd6eb22714337963f52 Mon Sep 17 00:00:00 2001
From: wangxiaoteng
Date: Sat, 13 Sep 2025 15:47:24 +0800
Subject: [PATCH 1/4] bugfix_mla

Signed-off-by: wangxiaoteng
---
 vllm_ascend/attention/mla_v1.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index b6ff26ab4a..326689849f 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -189,6 +189,10 @@ def __init__(self,
                            self.block_size - 1) // self.block_size
         self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
         if self.chunked_prefill_enabled:
+            if vllm_config.kv_transfer_config.is_kv_consumer:
+                max_chunked_size = scheduler_config.max_num_seqs * self.block_size
+            else:
+                max_chunked_size = 128 * 1024
             self.chunked_prefill_workspace_size = min(
                 # Max sure there is enough for 8 full length request or at least
                 # 4 pages of cache per request
@@ -202,7 +206,7 @@ def __init__(self,
                 # which would result in up-projected context being
                 # 2*(192*128)*(64*1024) = 3gb
                 # (assuming 192 QK head dim, 128 heads, and fp16)
-                128 * 1024)
+                max_chunked_size)
             assert self.chunked_prefill_workspace_size >= \
                 scheduler_config.max_num_seqs * self.block_size
             self.chunked_prefill_workspace = torch.empty(

From 405cd8a389e5f1ec16ab9a1994310b6bd6fae5cb Mon Sep 17 00:00:00 2001
From: wangxiaoteng
Date: Tue, 16 Sep 2025 21:15:20 +0800
Subject: [PATCH 2/4] bugfix_mla

Signed-off-by: wangxiaoteng
---
 vllm_ascend/attention/mla_v1.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index 326689849f..a26c8f0864 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -189,7 +189,8 @@ def __init__(self,
                            self.block_size - 1) // self.block_size
         self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
         if self.chunked_prefill_enabled:
-            if vllm_config.kv_transfer_config.is_kv_consumer:
+            if vllm_config.kv_transfer_config is not None and\
+                    vllm_config.kv_transfer_config.is_kv_consumer:
                 max_chunked_size = scheduler_config.max_num_seqs * self.block_size
             else:
                 max_chunked_size = 128 * 1024
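Note on patches 1/4 and 2/4: a minimal standalone sketch of the sizing
logic the two patches converge on. The inner max(...) term sits between
the two hunks and is reconstructed here from its comments; the helper
name, signature, and standalone form are illustrative assumptions, not
code from the patch. The rationale in the comment (KV consumers in a
disaggregated deployment never run large chunked prefills) is inferred
from the change, not stated in the commits.

def chunked_prefill_workspace_size(max_model_len: int, max_num_seqs: int,
                                   block_size: int,
                                   is_kv_consumer: bool) -> int:
    # A KV consumer presumably sees at most max_num_seqs * block_size
    # prefill tokens per step, so the generic 128k-token cap would
    # reserve far more workspace than it can ever use.
    cap = max_num_seqs * block_size if is_kv_consumer else 128 * 1024
    size = min(
        # enough for 8 full-length requests, or at least 4 pages of
        # cache per request (reconstructed from the hunk's comments)
        max(8 * max_model_len, 4 * max_num_seqs * block_size),
        cap)
    # mirrors the assert that follows the min(...) in mla_v1.py
    assert size >= max_num_seqs * block_size
    return size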
From dce78b8a1fe34ab3afd432f8366331932d006a9a Mon Sep 17 00:00:00 2001
From: wangxiaoteng
Date: Wed, 17 Sep 2025 10:19:20 +0800
Subject: [PATCH 3/4] bugfix_mla

Signed-off-by: wangxiaoteng
---
 vllm_ascend/worker/worker_v1.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 2f3423c781..3f87c1bf40 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -175,7 +175,13 @@ def determine_available_memory(self) -> int:
                 "not properly cleaned up before initializing the vLLM instance.")

         # Get the peak memory allocation recorded by torch
-        peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"]
+        chunked_prefill_workspace_size = self.vllm_config.cache_config.block_size * \
+            self.vllm_config.scheduler_config.max_num_seqs * 1024 * 1024
+        if self.vllm_config.scheduler_config.enable_chunked_prefill:
+            peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"] + \
+                chunked_prefill_workspace_size
+        else:
+            peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"]
         # TODO: don`t need impl this func after empty_cache in
         # Worker.determine_num_available_blocks() unified`
         NPUPlatform.empty_cache()

From 98d234ab3e68df844e7956750ed7ff9059858c8d Mon Sep 17 00:00:00 2001
From: wangxiaoteng
Date: Wed, 17 Sep 2025 10:26:17 +0800
Subject: [PATCH 4/4] bugfix_mla

Signed-off-by: wangxiaoteng
---
 vllm_ascend/worker/worker_v1.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 3f87c1bf40..d52bf36085 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -181,7 +181,8 @@ def determine_available_memory(self) -> int:
             peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"] + \
                 chunked_prefill_workspace_size
         else:
-            peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"]
+            peak_memory = torch_npu.npu.memory_stats(
+            )["allocated_bytes.all.peak"]
         # TODO: don`t need impl this func after empty_cache in
         # Worker.determine_num_available_blocks() unified`
         NPUPlatform.empty_cache()
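Note on patches 3/4 and 4/4: the adjusted accounting in
determine_available_memory(), distilled into a standalone sketch. The
function name and parameters are illustrative, and the rationale (the
chunked-prefill workspace is apparently not yet allocated during the
profiling run, so its size has to be charged against the recorded peak
by hand) is inferred from the change, not stated in the commits.

def adjusted_peak_memory(peak_allocated: int, block_size: int,
                         max_num_seqs: int,
                         enable_chunked_prefill: bool) -> int:
    # Workspace headroom, matching the expression added in patch 3/4.
    workspace = block_size * max_num_seqs * 1024 * 1024
    if enable_chunked_prefill:
        # Charge the workspace against the profiled peak so the KV-cache
        # allocator leaves room for it later.
        return peak_allocated + workspace
    return peak_allocated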