from vllm.v1.request import Request

import llm_datadist  # type: ignore
-from llm_datadist import LLMException, LLMStatusCode
+from llm_datadist import LLMConfig, LLMException, LLMStatusCode

-import vllm_ascend.envs as envs
+import vllm_ascend.envs as envs_ascend
from vllm_ascend.attention.mla_v1 import AscendMLAMetadata

TORCH_DTYPE_TO_NPU_DTYPE = {
    torch.int32: llm_datadist.DataType.DT_INT32,
}

-GLOBAL_RANKTABLE = envs.GLOBAL_RANKTABLE
+GLOBAL_RANKTABLE = envs_ascend.LLMDATADIST_GLOBAL_RANKTABLE


class ServerRole(enum.Enum):
@@ -289,16 +289,19 @@ def __init__(self, role: llm_datadist.LLMRole, local_rank: int,
                    self.role, self.cluster_id)

    def prepare_data_dist(self):
-        # TODO: The maximum size of the mbuf for the llm datadist. We need to
-        # find an appropriate value to minimize memory waste.
-        options = {
-            "llm.SyncKvCacheWaitTime": envs.LLMDATADIST_SYNC_CACHE_WAIT_TIME,
-            "ge.flowGraphMemMaxSize": f"{int(2.25 * 1024 * 1024 * 1024):d}",
+        buff_size = envs_ascend.LLMDATADIST_BUFFSIZE_MB * 1024 * 1024
+        llm_config = LLMConfig()
+        llm_config.ge_options = {
+            "llm.SyncKvCacheWaitTime":
+            envs_ascend.LLMDATADIST_SYNC_CACHE_WAIT_TIME,
+            "ge.flowGraphMemMaxSize": f"{buff_size:d}",
            "ge.exec.deviceId": str(self.local_rank),
        }
+        llm_config.buf_pool_cfg = '{"buf_cfg": [{"total_size":2097152,"blk_size":256,"max_buf_size":256}]}'
        if self.role == llm_datadist.LLMRole.PROMPT:
-            options["llm.listenIpInfo"] = f"{self.local_device_ip}:26000"
-        self.datadist_engine.init(options)
+            llm_config.listen_ip_info = f"{self.local_device_ip}:26000"
+        engine_options = llm_config.generate_options()
+        self.datadist_engine.init(engine_options)
        logger.info("llm_datadist init done")
        self.kv_transfer = self.datadist_engine.kv_cache_manager
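
For clarity, here is the new initialization path from this hunk in isolation. It is a minimal sketch using only the attribute names that appear in the diff; it assumes the Ascend llm_datadist package is available, and the concrete values (buffer size, wait time, device id, listen address) are illustrative stand-ins for the env-driven values, not defaults.

# Minimal sketch of the new prepare_data_dist flow; all literal values are examples.
from llm_datadist import LLMConfig

buff_size_mb = 2048  # example stand-in for LLMDATADIST_BUFFSIZE_MB
llm_config = LLMConfig()
llm_config.ge_options = {
    "llm.SyncKvCacheWaitTime": "5000",  # example stand-in for LLMDATADIST_SYNC_CACHE_WAIT_TIME
    "ge.flowGraphMemMaxSize": f"{buff_size_mb * 1024 * 1024:d}",
    "ge.exec.deviceId": "0",
}
# Only the prefill (PROMPT) role advertises a listen address.
llm_config.listen_ip_info = "127.0.0.1:26000"
# generate_options() produces the options that are then handed to datadist_engine.init().
engine_options = llm_config.generate_options()
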
@@ -869,7 +872,7 @@ def get_num_new_matched_tokens(
        self,
        request: "Request",
        num_computed_tokens: int,
-    ) -> int:
+    ) -> tuple[int, bool]:
        """
        Get number of new tokens that can be loaded from the external KV cache
        beyond the num_computed_tokens.
@@ -887,15 +890,15 @@ def get_num_new_matched_tokens(
        # the block granularity. And it expects the returned blocks and
        # num_computed_tokens to also be aligned with the block granularity.

-        # NOTE: only request in waiting queue will come here. we use datadist
+        # NOTE: only requests in waiting queue will come here. we use datadist
        # pull cache to do transfer, so we don't align to block_size in prefill,
        # we won't have extra new matched tokens; in decode, new request kv
        # cache will be transferred from prefill, so num_computed_tokens = 0,
        # and extra new matched tokens should be len(request.prompt_token_ids) -
        # 1
        if self.kv_role == llm_datadist.LLMRole.PROMPT:
-            return 0
-        return len(request.prompt_token_ids) - 1
+            return 0, False
+        return len(request.prompt_token_ids) - 1, False

    def update_state_after_alloc(self, request: "Request",
                                 num_external_tokens: int):
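
The NOTE comment explains the arithmetic behind the new tuple return. As a standalone sketch: the helper name and role flag below are hypothetical, and treating the second tuple element as a "load asynchronously" flag is an assumption, not something stated in this diff.

# Hypothetical helper mirroring the new return contract; not the connector itself.
def new_matched_tokens(is_prompt_role: bool,
                       prompt_token_ids: list[int]) -> tuple[int, bool]:
    if is_prompt_role:
        # Prefill: datadist pull-cache needs no block alignment, so no extra tokens.
        return 0, False
    # Decode: all but the last prompt token are transferred from prefill; False
    # assumes the load is reported as synchronous.
    return len(prompt_token_ids) - 1, False

print(new_matched_tokens(False, list(range(100))))  # (99, False)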