
Commit 02e9248

fix: allocation failure for cache_tensor despite sufficient mbuf
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
1 parent 4f34a29 commit 02e9248

2 files changed: 26 additions & 16 deletions

vllm_ascend/distributed/llmdatadist_connector_v1.py

Lines changed: 17 additions & 14 deletions
@@ -25,9 +25,9 @@
 from vllm.v1.request import Request
 
 import llm_datadist  # type: ignore
-from llm_datadist import LLMException, LLMStatusCode
+from llm_datadist import LLMConfig, LLMException, LLMStatusCode
 
-import vllm_ascend.envs as envs
+import vllm_ascend.envs as envs_ascend
 from vllm_ascend.attention.mla_v1 import AscendMLAMetadata
 
 TORCH_DTYPE_TO_NPU_DTYPE = {
@@ -41,7 +41,7 @@
     torch.int32: llm_datadist.DataType.DT_INT32,
 }
 
-GLOBAL_RANKTABLE = envs.GLOBAL_RANKTABLE
+GLOBAL_RANKTABLE = envs_ascend.LLMDATADIST_GLOBAL_RANKTABLE
 
 
 class ServerRole(enum.Enum):
@@ -289,16 +289,19 @@ def __init__(self, role: llm_datadist.LLMRole, local_rank: int,
                     self.role, self.cluster_id)
 
     def prepare_data_dist(self):
-        # TODO: The maximum size of the mbuf for the llm datadist. We need to
-        # find an appropriate value to minimize memory waste.
-        options = {
-            "llm.SyncKvCacheWaitTime": envs.LLMDATADIST_SYNC_CACHE_WAIT_TIME,
-            "ge.flowGraphMemMaxSize": f"{int(2.25*1024*1024*1024):d}",
+        buff_size = envs_ascend.LLMDATADIST_BUFFSIZE_MB * 1024 * 1024
+        llm_config = LLMConfig()
+        llm_config.ge_options = {
+            "llm.SyncKvCacheWaitTime":
+            envs_ascend.LLMDATADIST_SYNC_CACHE_WAIT_TIME,
+            "ge.flowGraphMemMaxSize": f"{buff_size:d}",
             "ge.exec.deviceId": str(self.local_rank),
         }
+        llm_config.buf_pool_cfg = '{"buf_cfg": [{"total_size":2097152,"blk_size":256,"max_buf_size":256}]}'
         if self.role == llm_datadist.LLMRole.PROMPT:
-            options["llm.listenIpInfo"] = f"{self.local_device_ip}:26000"
-        self.datadist_engine.init(options)
+            llm_config.listen_ip_info = f"{self.local_device_ip}:26000"
+        engine_options = llm_config.generate_options()
+        self.datadist_engine.init(engine_options)
         logger.info("llm_datadist init done")
         self.kv_transfer = self.datadist_engine.kv_cache_manager
 
@@ -869,7 +872,7 @@ def get_num_new_matched_tokens(
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> int:
+    ) -> tuple[int, bool]:
         """
         Get number of new tokens that can be loaded from the external KV cache
        beyond the num_computed_tokens.
@@ -887,15 +890,15 @@ def get_num_new_matched_tokens(
         # the block granularity. And it expects the returned blocks and
         # num_computed_tokens to also be aligned with the block granularity.
 
-        # NOTE: only request in waiting queue will come here. we use datadist
+        # NOTE: only requests in waiting queue will come here. we use datadist
         # pull cache to do transfer, so we don't align to block_size in prefill,
         # we won't have extra new matched tokens; in decode, new request kv
         # cache will be transferred from prefill, so num_computed_tokens = 0,
         # and extra new matched tokens should be len(request.prompt_token_ids) -
         # 1
         if self.kv_role == llm_datadist.LLMRole.PROMPT:
-            return 0
-        return len(request.prompt_token_ids) - 1
+            return 0, False
+        return len(request.prompt_token_ids) - 1, False
 
     def update_state_after_alloc(self, request: "Request",
                                  num_external_tokens: int):
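
Note on the size change above: the old code hard-coded ge.flowGraphMemMaxSize to 2.25 GiB, while the new path derives it from LLMDATADIST_BUFFSIZE_MB. Below is a minimal, self-contained sketch of that MB-to-bytes conversion; the env var name comes from this commit, and the value shown is simply the 2560 MB default rather than a recommendation.

    import os

    # Example value only; matches the default added in envs.py in this commit.
    os.environ.setdefault("LLMDATADIST_BUFFSIZE_MB", "2560")

    # Same MB -> bytes conversion that prepare_data_dist() now performs.
    buff_size = int(os.environ["LLMDATADIST_BUFFSIZE_MB"]) * 1024 * 1024
    ge_options = {
        "ge.flowGraphMemMaxSize": f"{buff_size:d}",  # 2560 MB -> "2684354560"
    }
    print(ge_options["ge.flowGraphMemMaxSize"])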

vllm_ascend/envs.py

Lines changed: 9 additions & 2 deletions
@@ -96,6 +96,15 @@
     # 5000ms.
     "LLMDATADIST_SYNC_CACHE_WAIT_TIME":
     lambda: os.getenv("LLMDATADIST_SYNC_CACHE_WAIT_TIME", "5000"),
+    # The path to the llmdatadist global rank table. If not set, the default
+    # value is None. This results in an error if the global rank table is
+    # required but not specified.
+    "LLMDATADIST_GLOBAL_RANKTABLE":
+    lambda: os.getenv("LLMDATADIST_GLOBAL_RANKTABLE", None),
+    # The buffer size in MB for llmdatadist communication. If not set, the
+    # default value is 2560 MB.
+    "LLMDATADIST_BUFFSIZE_MB":
+    lambda: int(os.getenv("LLMDATADIST_BUFFSIZE_MB", 2560)),
     # The version of vllm is installed. This value is used for developers who
     # installed vllm from source locally. In this case, the version of vllm is
     # usually changed. For example, if the version of vllm is "0.9.0", but when
@@ -133,8 +142,6 @@
     # value to False to disable the optimized model.
     "USE_OPTIMIZED_MODEL":
     lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
-    "GLOBAL_RANKTABLE":
-    lambda: os.getenv("GLOBAL_RANKTABLE", None)
 }
 
 # end-env-vars-definition
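
The two new entries behave like the existing ones: each is a lambda that reads the process environment at lookup time. A small sketch of how they resolve when neither variable is set (mirrors the lambdas added above, nothing more):

    import os

    # Behavior sketch of the two env_variables entries added in this commit.
    env_variables = {
        "LLMDATADIST_GLOBAL_RANKTABLE":
        lambda: os.getenv("LLMDATADIST_GLOBAL_RANKTABLE", None),
        "LLMDATADIST_BUFFSIZE_MB":
        lambda: int(os.getenv("LLMDATADIST_BUFFSIZE_MB", 2560)),
    }

    print(env_variables["LLMDATADIST_GLOBAL_RANKTABLE"]())  # None when unset
    print(env_variables["LLMDATADIST_BUFFSIZE_MB"]())       # 2560 when unset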
