2929import triton_python_backend_utils as pb_utils
3030from vllm .engine .metrics import StatLoggerBase as VllmStatLoggerBase
3131from vllm .engine .metrics import Stats as VllmStats
32- from vllm .engine .metrics import SupportsMetricsInfo
32+ from vllm .engine .metrics import SupportsMetricsInfo , build_1_2_5_buckets
3333
3434
3535class TritonMetrics :
36- def __init__ (self , labels ):
36+ def __init__ (self , labels : List [ str ], max_model_len : int ):
3737 # Initialize metric families
3838 # Iteration stats
3939 self .counter_prompt_tokens_family = pb_utils .MetricFamily (
@@ -56,6 +56,34 @@ def __init__(self, labels):
5656 description = "Histogram of time per output token in seconds." ,
5757 kind = pb_utils .MetricFamily .HISTOGRAM ,
5858 )
59+ # Request stats
60+ # Latency
61+ self .histogram_e2e_time_request_family = pb_utils .MetricFamily (
62+ name = "vllm:e2e_request_latency_seconds" ,
63+ description = "Histogram of end to end request latency in seconds." ,
64+ kind = pb_utils .MetricFamily .HISTOGRAM ,
65+ )
66+ # Metadata
67+ self .histogram_num_prompt_tokens_request_family = pb_utils .MetricFamily (
68+ name = "vllm:request_prompt_tokens" ,
69+ description = "Number of prefill tokens processed." ,
70+ kind = pb_utils .MetricFamily .HISTOGRAM ,
71+ )
72+ self .histogram_num_generation_tokens_request_family = pb_utils .MetricFamily (
73+ name = "vllm:request_generation_tokens" ,
74+ description = "Number of generation tokens processed." ,
75+ kind = pb_utils .MetricFamily .HISTOGRAM ,
76+ )
77+ self .histogram_best_of_request_family = pb_utils .MetricFamily (
78+ name = "vllm:request_params_best_of" ,
79+ description = "Histogram of the best_of request parameter." ,
80+ kind = pb_utils .MetricFamily .HISTOGRAM ,
81+ )
82+ self .histogram_n_request_family = pb_utils .MetricFamily (
83+ name = "vllm:request_params_n" ,
84+ description = "Histogram of the n request parameter." ,
85+ kind = pb_utils .MetricFamily .HISTOGRAM ,
86+ )
5987
6088 # Initialize metrics
6189 # Iteration stats
@@ -65,7 +93,7 @@ def __init__(self, labels):
6593 self .counter_generation_tokens = self .counter_generation_tokens_family .Metric (
6694 labels = labels
6795 )
68- # Use the same bucket boundaries from vLLM sample metrics.
96+ # Use the same bucket boundaries as the vLLM sample metrics, as an example.
6997 # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
7098 self .histogram_time_to_first_token = (
7199 self .histogram_time_to_first_token_family .Metric (
@@ -110,16 +138,43 @@ def __init__(self, labels):
110138 ],
111139 )
112140 )
141+ # Request stats
142+ # Latency
143+ self .histogram_e2e_time_request = self .histogram_e2e_time_request_family .Metric (
144+ labels = labels ,
145+ buckets = [1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ],
146+ )
147+ # Metadata
148+ self .histogram_num_prompt_tokens_request = (
149+ self .histogram_num_prompt_tokens_request_family .Metric (
150+ labels = labels ,
151+ buckets = build_1_2_5_buckets (max_model_len ),
152+ )
153+ )
154+ self .histogram_num_generation_tokens_request = (
155+ self .histogram_num_generation_tokens_request_family .Metric (
156+ labels = labels ,
157+ buckets = build_1_2_5_buckets (max_model_len ),
158+ )
159+ )
160+ self .histogram_best_of_request = self .histogram_best_of_request_family .Metric (
161+ labels = labels ,
162+ buckets = [1 , 2 , 5 , 10 , 20 ],
163+ )
164+ self .histogram_n_request = self .histogram_n_request_family .Metric (
165+ labels = labels ,
166+ buckets = [1 , 2 , 5 , 10 , 20 ],
167+ )
113168
114169
115170class VllmStatLogger (VllmStatLoggerBase ):
116171 """VllmStatLogger is an adapter between the vLLM stats collector and the Triton metrics provider."""
117172
118173 # local_interval is not used here; it is for vLLM's logging to stdout.
119- def __init__ (self , labels : Dict , local_interval : float = 0 ) -> None :
174+ def __init__ (self , labels : Dict , max_model_len : int ) -> None :
120175 # Tracked stats over current local logging interval.
121- super ().__init__ (local_interval )
122- self .metrics = TritonMetrics (labels = labels )
176+ super ().__init__ (local_interval = 0 )
177+ self .metrics = TritonMetrics (labels , max_model_len )
123178
124179 def info (self , type : str , obj : SupportsMetricsInfo ) -> None :
125180 pass
@@ -159,16 +214,35 @@ def log(self, stats: VllmStats) -> None:
159214 Returns:
160215 None
161216 """
162- self ._log_counter (
163- self .metrics .counter_prompt_tokens , stats .num_prompt_tokens_iter
164- )
165- self ._log_counter (
166- self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter
167- )
168- self ._log_histogram (
169- self .metrics .histogram_time_to_first_token , stats .time_to_first_tokens_iter
170- )
171- self ._log_histogram (
172- self .metrics .histogram_time_per_output_token ,
173- stats .time_per_output_tokens_iter ,
174- )
217+ # The list of vLLM metrics reported to Triton is also documented here:
218+ # https://github.com/triton-inference-server/vllm_backend/blob/main/README.md#triton-metrics
219+ counter_metrics = [
220+ (self .metrics .counter_prompt_tokens , stats .num_prompt_tokens_iter ),
221+ (self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter ),
222+ ]
223+ histogram_metrics = [
224+ (
225+ self .metrics .histogram_time_to_first_token ,
226+ stats .time_to_first_tokens_iter ,
227+ ),
228+ (
229+ self .metrics .histogram_time_per_output_token ,
230+ stats .time_per_output_tokens_iter ,
231+ ),
232+ (self .metrics .histogram_e2e_time_request , stats .time_e2e_requests ),
233+ (
234+ self .metrics .histogram_num_prompt_tokens_request ,
235+ stats .num_prompt_tokens_requests ,
236+ ),
237+ (
238+ self .metrics .histogram_num_generation_tokens_request ,
239+ stats .num_generation_tokens_requests ,
240+ ),
241+ (self .metrics .histogram_best_of_request , stats .best_of_requests ),
242+ (self .metrics .histogram_n_request , stats .n_requests ),
243+ ]
244+
245+ for metric , data in counter_metrics :
246+ self ._log_counter (metric , data )
247+ for metric , data in histogram_metrics :
248+ self ._log_histogram (metric , data )
0 commit comments