Commit 9dad411

update control stream
1 parent d280301 commit 9dad411

8 files changed: +25 −54 lines

fastdeploy/worker/gcu_model_runner.py
Lines changed: 2 additions & 6 deletions

@@ -151,8 +151,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
         """
         Process inputs for prefill tasks and insert it to share_inputs buffer
         """
-        if "caches" not in self.share_inputs:
-            self.initialize_kv_cache()
 
         if req_dicts[-1].disaggregate_info is not None and req_dicts[-1].disaggregate_info["role"] == "prefill":
             os.environ["PREFILL_NODE_ONE_STEP_STOP"] = "1"
@@ -1035,11 +1033,11 @@ def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
         Args:
             num_gpu_blocks:
         """
+        self.parallel_config.do_profile = False
         self.num_gcu_blocks = num_gpu_blocks
 
         # Reset block table and kv cache with global block num
-        if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
-            self.initialize_kv_cache()
+        self.initialize_kv_cache()
 
         # Reset free list
         free_list = list(
@@ -1057,8 +1055,6 @@ def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
             }
         )
 
-        self.parallel_config.do_profile = False
-
         if self.speculative_method in ["mtp"]:
             self.proposer.update_block_num(num_gpu_blocks)
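The net effect on `update_share_input_block_num` is that profiling mode is cleared before the cache is rebuilt, and `initialize_kv_cache()` now runs unconditionally instead of only when prefix caching and non-mixed splitwise roles are both off. A minimal sketch of the resulting control flow (illustrative class and attribute names, not the FastDeploy source verbatim):

```python
class ModelRunnerSketch:
    """Hypothetical stand-in for the GCU/GPU model runners in this commit."""

    def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
        # Leave profile mode first, so initialize_kv_cache() allocates the
        # real post-profiling cache rather than a throwaway one.
        self.parallel_config.do_profile = False
        self.num_gpu_blocks = num_gpu_blocks

        # Previously guarded by:
        #   if not (enable_prefix_caching or splitwise_role != "mixed"): ...
        # Now the cache is always rebuilt with the profiled block count.
        self.initialize_kv_cache()
```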

fastdeploy/worker/gcu_worker.py
Lines changed: 2 additions & 6 deletions

@@ -98,9 +98,9 @@ def get_model(self) -> nn.Layer:
         """ """
         return self.model_runner.get_model()
 
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+    def initialize_cache(self, num_gpu_blocks: int) -> None:
         """ """
-        pass
+        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
 
     def execute_model(
         self,
@@ -134,7 +134,3 @@ def check_health(self) -> bool:
     def cal_theortical_kvcache(self) -> int:
         """ """
         return self.model_runner.cal_theortical_kvcache()
-
-    def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
-        """ """
-        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
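Across the worker classes, the formerly separate `reinitialize_kv_cache` is folded into `initialize_cache`, which also loses its unused `num_cpu_blocks` parameter. A hedged sketch of the unified entry point (the class body is illustrative, not the full worker):

```python
class GcuWorkerSketch:
    """Hypothetical stand-in showing the consolidated cache entry point."""

    def __init__(self, model_runner):
        self.model_runner = model_runner

    def initialize_cache(self, num_gpu_blocks: int) -> None:
        # The old no-op initialize_cache(num_gpu_blocks, num_cpu_blocks) and
        # the separate reinitialize_kv_cache() collapse into one method that
        # forwards the profiled GPU block count to the model runner.
        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
```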

fastdeploy/worker/gpu_model_runner.py
Lines changed: 3 additions & 13 deletions

@@ -193,9 +193,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
         Process inputs for prefill tasks and insert it to share_inputs buffer
         TODO(gongshaotian): Refactor this func
         """
-        # NOTE(luotingdan): Lazy initialize kv cache
-        if "caches" not in self.share_inputs:
-            self.initialize_kv_cache()
 
         # NOTE(luotingdan): Set environment variable of prefill node
         if req_dicts[-1].disaggregate_info is not None and req_dicts[-1].disaggregate_info["role"] == "prefill":
@@ -739,7 +736,6 @@ def initialize_kv_cache(self) -> None:
 
         else:
             for i in range(self.model_config.num_hidden_layers):
-
                 cache_kvs[f"key_caches_{i}"] = paddle.full(
                     shape=kv_cache_shape,
                     fill_value=0,
@@ -999,9 +995,6 @@ def capture_model(self) -> None:
         time_before_capture = time.perf_counter()
         expected_decode_len = 1
         capture_sizes = self.cudagraph_capture_sizes.copy()
-        need_init_cache = "caches" not in self.share_inputs
-        if need_init_cache:
-            self.initialize_kv_cache()
         for batch_size in sorted(capture_sizes, reverse=True):
             self._dummy_run(
                 num_tokens=self.parallel_config.max_num_batched_tokens,
@@ -1010,8 +1003,7 @@
                 expected_decode_len=expected_decode_len,
             )
             logger.info(f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}")
-        if need_init_cache:
-            self.clear_cache()
+
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
@@ -1237,6 +1229,7 @@ def profile_run(self) -> None:
 
         if self.speculative_method in ["mtp"]:
             self.proposer.clear_dummy_input()
+        self.parallel_config.do_profile = False
 
     def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
         """
@@ -1247,8 +1240,7 @@ def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
         self.num_gpu_blocks = num_gpu_blocks
 
         # Reset block table and kv cache with global block num
-        if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
-            self.initialize_kv_cache()
+        self.initialize_kv_cache()
 
         # Reset free list
         free_list = list(
@@ -1266,8 +1258,6 @@
             }
         )
 
-        self.parallel_config.do_profile = False
-
         if self.speculative_method in ["mtp"]:
             self.proposer.update_block_num(num_gpu_blocks)
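Beyond the same `update_share_input_block_num` change, the GPU runner drops lazy KV-cache creation from `insert_prefill_inputs` and `capture_model`, and `profile_run` now ends by clearing `do_profile`. The implied ordering contract, shown as hypothetical driver code (not part of the repository):

```python
def start_runner_sketch(runner, profiled_num_blocks: int) -> None:
    """Illustrative startup order implied by this commit."""
    # 1. Profile: ends with runner.parallel_config.do_profile = False.
    runner.profile_run()
    # 2. Build the real KV cache from the profiled block count.
    runner.update_share_input_block_num(profiled_num_blocks)
    # 3. Capture CUDA graphs: capture_model() no longer creates or clears a
    #    temporary cache, so "caches" must already be in share_inputs here.
    runner.capture_model()
```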

fastdeploy/worker/gpu_worker.py
Lines changed: 4 additions & 7 deletions

@@ -165,9 +165,10 @@ def get_model(self) -> nn.Layer:
         """Get current model"""
         return self.model_runner.get_model()
 
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
-        """Initizlize the KV Cache"""
-        pass
+    def initialize_cache(self, num_gpu_blocks: int) -> None:
+        """Initizlize the KV Cache with accurate num_gpu_blocks"""
+        # accurate cache size
+        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
 
     def execute_model(
         self,
@@ -198,7 +199,3 @@ def check_health(self) -> bool:
     def cal_theortical_kvcache(self) -> int:
         """Calculate the block memory required"""
         return self.model_runner.cal_theortical_kvcache()
-
-    def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
-        """Reinitialize the kv cache using the parameters from the profile"""
-        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)

fastdeploy/worker/iluvatar_model_runner.py
Lines changed: 2 additions & 7 deletions

@@ -141,9 +141,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
         Process inputs for prefill tasks and insert it to share_inputs buffer
         TODO(gongshaotian): Refactor this func
         """
-        # NOTE(luotingdan): Lazy initialize kv cache
-        if "caches" not in self.share_inputs:
-            self.initialize_kv_cache()
 
         # NOTE(luotingdan): Set environment variable of prefill node
         if req_dicts[-1].disaggregate_info is not None and req_dicts[-1].disaggregate_info["role"] == "prefill":
@@ -1013,11 +1010,11 @@ def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
         Args:
             num_gpu_blocks:
         """
+        self.parallel_config.do_profile = False
         self.num_gpu_blocks = num_gpu_blocks
 
         # Reset block table and kv cache with global block num
-        if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
-            self.initialize_kv_cache()
+        self.initialize_kv_cache()
 
         # Reset free list
         free_list = list(
@@ -1035,8 +1032,6 @@ def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
             }
         )
 
-        self.parallel_config.do_profile = False
-
     def cal_theortical_kvcache(self):
         """
         Calculate the total block memory required at the model level

fastdeploy/worker/iluvatar_worker.py
Lines changed: 2 additions & 6 deletions

@@ -99,9 +99,9 @@ def get_model(self) -> nn.Layer:
         """ """
         return self.model_runner.get_model()
 
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+    def initialize_cache(self, num_gpu_blocks: int) -> None:
         """ """
-        pass
+        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
 
     def execute_model(
         self,
@@ -135,7 +135,3 @@ def check_health(self) -> bool:
     def cal_theortical_kvcache(self) -> int:
         """ """
         return self.model_runner.cal_theortical_kvcache()
-
-    def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
-        """ """
-        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)

fastdeploy/worker/worker_base.py
Lines changed: 1 addition & 1 deletion

@@ -64,7 +64,7 @@ def init_device(self) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+    def initialize_cache(self, num_gpu_blocks: int) -> None:
         """Initizlize the KV Cache with the given size in blocks."""
         raise NotImplementedError
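This abstract-interface change is what the per-device workers above implement: `num_cpu_blocks` disappears and only the profiled GPU block count is passed through. A minimal sketch of a concrete subclass against the new signature (`ExampleWorker` and its runner are hypothetical):

```python
from abc import ABC, abstractmethod


class WorkerBaseSketch(ABC):
    @abstractmethod
    def initialize_cache(self, num_gpu_blocks: int) -> None:
        """Initialize the KV Cache with the given size in blocks."""
        raise NotImplementedError


class ExampleWorker(WorkerBaseSketch):
    def __init__(self, model_runner):
        self.model_runner = model_runner

    def initialize_cache(self, num_gpu_blocks: int) -> None:
        # Only the GPU block count, measured during profiling, is forwarded;
        # there is no CPU-block argument in the new interface.
        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
```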

fastdeploy/worker/worker_process.py
Lines changed: 9 additions & 8 deletions

@@ -347,7 +347,7 @@ def event_loop_normal(self) -> None:
 
         self.exist_prefill_task_signal.value[0] = self.worker.prefill_finished()
 
-    def determine_num_available_blocks(self) -> None:
+    def initialize_kv_cache(self) -> None:
         """Profiles the peak memory usage of the model to determine how many
         KV blocks may be allocated without OOMs.
 
@@ -403,10 +403,7 @@ def determine_num_available_blocks(self) -> None:
         # logger.info will write in worker_process.log
         # Need `print` to triger engine->check_worker_initialize_status->detect_thread
         print(f"------- num_blocks_global: {num_blocks_local} --------")
-        # 4. Updata share inputs
-        self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_local)
-
-    def graph_optimize_and_warm_up_model(self) -> None:
+        # wait engine launch cache_manager
         if self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed":
             launched_cache_manager_signal_data = np.zeros([1], dtype=np.int32)
             self.launched_cache_manager_signal = IPCSignal(
@@ -418,6 +415,10 @@
             )
             while np.any(self.launched_cache_manager_signal.value[0] <= 0):
                 time.sleep(0.01)
+        # 4. init kv_cache with accurate num_blocks
+        self.worker.initialize_cache(num_gpu_blocks=num_blocks_local)
+
+    def graph_optimize_and_warm_up_model(self) -> None:
         self.worker.graph_optimize_and_warm_up_model()
 
     def init_device(self) -> None:
@@ -731,11 +732,11 @@ def run_worker_proc() -> None:
 
     # Load model
     worker_proc.load_model()
-    logger.info("determine_num_available_blocks")
-    worker_proc.determine_num_available_blocks()
+    # logger.info("determine_num_available_blocks")
+    worker_proc.initialize_kv_cache()
 
     # Trigger CUDAGraph capture
-    worker_proc.graph_optimize_and_warm_up_model()
+    worker_proc.worker.graph_optimize_and_warm_up_model()
 
     # Initialize health status
     worker_proc.init_health_status()
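This is the control-stream change the commit title refers to: profiling, the wait for the engine's cache manager, and cache construction now all live in one `initialize_kv_cache` step that runs before graph capture. A hedged sketch of the resulting startup order (names mirror the diff; the body is illustrative):

```python
def run_worker_proc_sketch(worker_proc) -> None:
    """Illustrative startup sequence after this commit, not the full source."""
    worker_proc.load_model()
    # Renamed from determine_num_available_blocks(): profiles peak memory,
    # optionally waits for the engine's launched_cache_manager IPC signal,
    # then calls worker.initialize_cache(num_gpu_blocks=num_blocks_local).
    worker_proc.initialize_kv_cache()
    # Graph capture is now invoked directly on the inner worker object.
    worker_proc.worker.graph_optimize_and_warm_up_model()
    worker_proc.init_health_status()
```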
