2626
2727import queue
2828import threading
29- from typing import Dict , List , Union
29+ from typing import Dict , List , Optional , Union
3030
3131import triton_python_backend_utils as pb_utils
3232from vllm .config import VllmConfig
33- from vllm .engine .metrics import StatLoggerBase as VllmStatLoggerBase
34- from vllm .engine .metrics import Stats as VllmStats
35- from vllm .engine .metrics import SupportsMetricsInfo , build_1_2_5_buckets
33+ from vllm .v1 .metrics .loggers import StatLoggerBase , build_1_2_5_buckets
34+ from vllm .v1 .metrics .stats import IterationStats , SchedulerStats
3635
3736
3837class TritonMetrics :
@@ -161,13 +160,35 @@ def __init__(self, labels: List[str], max_model_len: int):
161160 )
162161
163162
164- class VllmStatLogger (VllmStatLoggerBase ):
# Adapts VllmStatLogger construction to vLLM's StatLoggerFactory interface
# (a callable taking (vllm_config, engine_index)), while retaining handles
# to every logger it creates so they can all be shut down together.
class VllmStatLoggerFactory:
    """Factory producing one VllmStatLogger per vLLM engine.

    vLLM's v1 engine accepts a list of stat-logger factories and calls each
    one once per engine with (vllm_config, engine_index). This class closes
    over the Triton metric labels and the backend logger, and keeps a
    registry of every logger it hands out so finalize() can stop their
    metrics threads at shutdown.
    """

    def __init__(self, labels, log_logger):
        # Triton metric labels (e.g. model name/version) shared by all loggers.
        self._labels = labels
        # Backend logger passed through for error reporting in the stat loggers.
        self._log_logger = log_logger
        # Every logger created via __call__, retained for finalize().
        self._instances_list = []

    def __call__(self, vllm_config, engine_index):
        """StatLoggerFactory entry point: build and register one logger."""
        stat_logger = VllmStatLogger(
            self._labels, self._log_logger, vllm_config, engine_index
        )
        self._instances_list.append(stat_logger)
        return stat_logger

    def finalize(self):
        """Shut down the metrics thread of every logger created so far."""
        # Only non-None loggers are ever appended, so no None check is needed.
        for stat_logger in self._instances_list:
            stat_logger.finalize()
181+
182+
183+ class VllmStatLogger (StatLoggerBase ):
165184 """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""
166185
167- def __init__ (self , labels : Dict , vllm_config : VllmConfig , log_logger ) -> None :
186+ def __init__ (
187+ self , labels : Dict , log_logger , vllm_config : VllmConfig , engine_index : int
188+ ) -> None :
168189 # Tracked stats over current local logging interval.
169190 # local_interval not used here. It's for vLLM logs to stdout.
170- super ().__init__ (local_interval = 0 , vllm_config = vllm_config )
191+ super ().__init__ (vllm_config = vllm_config , engine_index = engine_index )
171192 self .metrics = TritonMetrics (
172193 labels = labels , max_model_len = vllm_config .model_config .max_model_len
173194 )
@@ -176,12 +197,9 @@ def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None:
176197 # Starting the metrics thread. It allows vLLM to keep making progress
177198 # while reporting metrics to triton metrics service.
178199 self ._logger_queue = queue .Queue ()
179- self ._logger_thread = threading .Thread (target = self .logger_loop )
200+ self ._logger_thread = threading .Thread (target = self ._logger_loop )
180201 self ._logger_thread .start ()
181202
182- def info (self , type : str , obj : SupportsMetricsInfo ) -> None :
183- pass
184-
185203 def _log_counter (self , counter , data : Union [int , float ]) -> None :
186204 """Convenience function for logging to counter.
187205
@@ -208,7 +226,12 @@ def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None
208226 for datum in data :
209227 self ._logger_queue .put_nowait ((histogram , "observe" , datum ))
210228
211- def log (self , stats : VllmStats ) -> None :
229+ def record (
230+ self ,
231+ scheduler_stats : Optional [SchedulerStats ],
232+ iteration_stats : Optional [IterationStats ],
233+ engine_idx : int = 0 ,
234+ ) -> None :
212235 """Report stats to Triton metrics server.
213236
214237 Args:
@@ -217,38 +240,54 @@ def log(self, stats: VllmStats) -> None:
217240 Returns:
218241 None
219242 """
243+
244+ # Parse finished request stats into lists
245+ e2e_latency : List [float ] = []
246+ num_prompt_tokens : List [int ] = []
247+ num_generation_tokens : List [int ] = []
248+ for finished_req in iteration_stats .finished_requests :
249+ e2e_latency .append (finished_req .e2e_latency )
250+ num_prompt_tokens .append (finished_req .num_prompt_tokens )
251+ num_generation_tokens .append (finished_req .num_generation_tokens )
252+
220253 # The list of vLLM metrics reporting to Triton is also documented here.
221254 # https://github.com/triton-inference-server/vllm_backend/blob/main/README.md#triton-metrics
222255 counter_metrics = [
223- (self .metrics .counter_prompt_tokens , stats .num_prompt_tokens_iter ),
224- (self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter ),
256+ (self .metrics .counter_prompt_tokens , iteration_stats .num_prompt_tokens ),
257+ (
258+ self .metrics .counter_generation_tokens ,
259+ iteration_stats .num_generation_tokens ,
260+ ),
225261 ]
226262 histogram_metrics = [
227263 (
228264 self .metrics .histogram_time_to_first_token ,
229- stats .time_to_first_tokens_iter ,
265+ iteration_stats .time_to_first_tokens_iter ,
230266 ),
231267 (
232268 self .metrics .histogram_time_per_output_token ,
233- stats . time_per_output_tokens_iter ,
269+ iteration_stats . inter_token_latencies_iter ,
234270 ),
235- (self .metrics .histogram_e2e_time_request , stats . time_e2e_requests ),
271+ (self .metrics .histogram_e2e_time_request , e2e_latency ),
236272 (
237273 self .metrics .histogram_num_prompt_tokens_request ,
238- stats . num_prompt_tokens_requests ,
274+ num_prompt_tokens ,
239275 ),
240276 (
241277 self .metrics .histogram_num_generation_tokens_request ,
242- stats . num_generation_tokens_requests ,
278+ num_generation_tokens ,
243279 ),
244- (self .metrics .histogram_n_request , stats . n_requests ),
280+ (self .metrics .histogram_n_request , iteration_stats . n_params_iter ),
245281 ]
246282 for metric , data in counter_metrics :
247283 self ._log_counter (metric , data )
248284 for metric , data in histogram_metrics :
249285 self ._log_histogram (metric , data )
250286
251- def logger_loop (self ):
287+ def log_engine_initialized (self ) -> None :
288+ pass
289+
290+ def _logger_loop (self ):
252291 while True :
253292 item = self ._logger_queue .get ()
254293 # To signal shutdown a None item will be added to the queue.
0 commit comments