-
Notifications
You must be signed in to change notification settings - Fork 571
[Feature] Support using prefix-caching + cudagraph for inference #2924
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
94e5403
6080e07
560ab7d
a8da2cd
e0ffe2e
d280301
9dad411
bf28bcf
d8b9317
2fda709
ad3ca95
b609160
b12667b
1bf753d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -183,6 +183,7 @@ def start(self, api_server_pid=None): | |
engine_worker_queue_port=self.cfg.engine_worker_queue_port, | ||
pid_suffix=self.ipc_signal_suffix, | ||
) | ||
self.launched_cache_manager_signal.value[0] = 1 | ||
|
||
self.worker_proc = self._start_worker_service() | ||
console_logger.info("Waitting worker processes ready...") | ||
|
@@ -217,9 +218,6 @@ def start(self, api_server_pid=None): | |
# Start TokenProcessor thread | ||
self.token_processor.run() | ||
|
||
if self.do_profile: | ||
self._stop_profile() | ||
|
||
if self.cfg.splitwise_role != "mixed": | ||
# 单机逻辑 | ||
self.engine_worker_queue.available_prefill_instances.put(1) | ||
|
@@ -849,6 +847,17 @@ def _init_worker_signals(self): | |
create=True, | ||
) | ||
|
||
# launched_cache_manager_signal 用于感知engine是否启动了cache_manager | ||
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed": | ||
gongshaotian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
launched_cache_manager_signal_data = np.zeros([1], dtype=np.int32) | ||
self.launched_cache_manager_signal = IPCSignal( | ||
name="launched_cache_manager_signal", | ||
array=launched_cache_manager_signal_data, | ||
dtype=np.int32, | ||
suffix=self.ipc_signal_suffix, | ||
create=True, | ||
) | ||
|
||
# worker_live_signal 用于engine感知各worker进程是否存活,记录每个step 时间 | ||
worker_healthy_live_recorded_time_array = np.zeros(shape=[self.cfg.worker_num_per_node], dtype=np.int32) | ||
self.worker_healthy_live_signal = IPCSignal( | ||
|
@@ -1133,6 +1142,7 @@ def _stop_profile(self): | |
engine_worker_queue_port=self.cfg.engine_worker_queue_port, | ||
pid_suffix=self.ipc_signal_suffix, | ||
) | ||
self.launched_cache_manager_signal.value[0] = 1 | ||
gongshaotian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def check_health(self, time_interval_threashold=30): | ||
""" | ||
|
@@ -1168,6 +1178,9 @@ def detect_thread(): | |
self.worker_init_status["layer_loadding"] = progress | ||
if self.worker_init_status["layer_loadding"] == self.cfg.model_config.num_layers - 1: | ||
self.worker_init_status["finished"] = True | ||
elif match := re.search(r"num_blocks_global", line): | ||
if self.do_profile: | ||
self._stop_profile() | ||
|
||
self.checking_worker_status_thread = threading.Thread(target=detect_thread, daemon=True) | ||
self.checking_worker_status_thread.start() | ||
Comment on lines
1181
to
1183
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这几行也需要删掉 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这个删掉了之后,engine启动那边加载模型的进度条就没有了,只能在workerlog.0中看到加载进度。 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 目前版本engine感知到worker启动结束,也需要这个线程去判断,是需要更改现有engine感知worker启动的方式吗? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 范围标错了 |
||
|
Uh oh!
There was an error while loading. Please reload this page.