From b87a1cd583049e5b0696e43f2ebaffc10b3a3c7f Mon Sep 17 00:00:00 2001 From: Pavloveuge Date: Sun, 4 May 2025 05:10:23 +0300 Subject: [PATCH 1/3] add new metrics reporting --- src/utils/metrics.py | 171 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index ecb044d..a593f70 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -29,6 +29,7 @@ from typing import Dict, List, Union import triton_python_backend_utils as pb_utils + from vllm.config import VllmConfig from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase from vllm.engine.metrics import Stats as VllmStats @@ -49,6 +50,16 @@ def __init__(self, labels: List[str], max_model_len: int): description="Number of generation tokens processed.", kind=pb_utils.MetricFamily.COUNTER, ) + self.counter_preemption_tokens_family = pb_utils.MetricFamily( + name="vllm:num_preemptions_total", + description="Number of preemption tokens processed.", + kind=pb_utils.MetricFamily.COUNTER, + ) + self.histogram_iteration_tokens_total_family = pb_utils.MetricFamily( + name="vllm:iteration_tokens_total", + description="Histogram of number of tokens per engine_step.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) self.histogram_time_to_first_token_family = pb_utils.MetricFamily( name="vllm:time_to_first_token_seconds", description="Histogram of time to first token in seconds.", @@ -66,6 +77,26 @@ def __init__(self, labels: List[str], max_model_len: int): description="Histogram of end to end request latency in seconds.", kind=pb_utils.MetricFamily.HISTOGRAM, ) + self.histogram_queue_time_request_family = pb_utils.MetricFamily( + name="vllm:request_queue_time_seconds", + description="Histogram of time spent in WAITING phase for request.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_inference_time_request_family = pb_utils.MetricFamily( + name="vllm:request_inference_time_seconds", + description="Histogram of time spent in RUNNING phase for request.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_prefill_time_request_family = pb_utils.MetricFamily( + name="vllm:request_prefill_time_seconds", + description="Histogram of time spent in PREFILL phase for request.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_decode_time_request_family = pb_utils.MetricFamily( + name="vllm:request_decode_time_seconds", + description="Histogram of time spent in DECODE phase for request.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) # Metadata self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily( name="vllm:request_prompt_tokens", @@ -82,6 +113,45 @@ def __init__(self, labels: List[str], max_model_len: int): description="Histogram of the n request parameter.", kind=pb_utils.MetricFamily.HISTOGRAM, ) + # System stats + # Scheduler State + self.gauge_scheduler_running_family = pb_utils.MetricFamily( + name="vllm:num_requests_running", + description="Number of requests currently running on GPU.", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.gauge_scheduler_waiting_family = pb_utils.MetricFamily( + name="vllm:num_requests_waiting", + description="Number of requests waiting to be processed.", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.gauge_scheduler_swapped_family = pb_utils.MetricFamily( + name="vllm:num_requests_swapped", + description="Number of requests swapped to CPU.", + kind=pb_utils.MetricFamily.GAUGE, + ) + # KV Cache Usage in % + self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily( + 
name="vllm:gpu_cache_usage_perc", + description="GPU KV-cache usage. 1 means 100 percent usage.", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily( + name="vllm:cpu_cache_usage_perc", + description="CPU KV-cache usage. 1 means 100 percent usage.", + kind=pb_utils.MetricFamily.GAUGE, + ) + # Prefix caching block hit rate + self.gauge_cpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily( + name="vllm:cpu_prefix_cache_hit_rate", + description="CPU prefix cache block hit rate.", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.gauge_gpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily( + name="vllm:gpu_prefix_cache_hit_rate", + description="GPU prefix cache block hit rate.", + kind=pb_utils.MetricFamily.GAUGE, + ) # Initialize metrics # Iteration stats @@ -91,8 +161,19 @@ def __init__(self, labels: List[str], max_model_len: int): self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( labels=labels ) + self.counter_preemption_tokens = self.counter_preemption_tokens_family.Metric( + labels=labels + ) + # Use the same bucket boundaries from vLLM sample metrics as an example. # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96 + self.histogram_iteration_tokens_total = ( + self.histogram_iteration_tokens_total_family.Metric( + labels=labels, + buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096], + ) + ) + self.histogram_time_to_first_token = ( self.histogram_time_to_first_token_family.Metric( labels=labels, @@ -142,6 +223,30 @@ def __init__(self, labels: List[str], max_model_len: int): labels=labels, buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], ) + self.histogram_prefill_time_request = ( + self.histogram_prefill_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + ) + self.histogram_decode_time_request = ( + self.histogram_decode_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + ) + self.histogram_inference_time_request = ( + self.histogram_inference_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + ) + self.histogram_queue_time_request = ( + self.histogram_queue_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + ) # Metadata self.histogram_num_prompt_tokens_request = ( self.histogram_num_prompt_tokens_request_family.Metric( @@ -159,6 +264,31 @@ def __init__(self, labels: List[str], max_model_len: int): labels=labels, buckets=[1, 2, 5, 10, 20], ) + # System stats + # Scheduler State + self.gauge_num_requests_running = self.gauge_scheduler_running_family.Metric( + labels=labels + ) + self.gauge_num_requests_waiting = self.gauge_scheduler_waiting_family.Metric( + labels=labels + ) + self.gauge_num_requests_swapped = self.gauge_scheduler_swapped_family.Metric( + labels=labels + ) + # KV Cache Usage in % + self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric( + labels=labels + ) + self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric( + labels=labels + ) + # Prefix caching block hit rate + self.gauge_cpu_prefix_cache_hit_rate = ( + self.gauge_cpu_prefix_cache_hit_rate_family.Metric(labels=labels) + ) + self.gauge_gpu_prefix_cache_hit_rate = ( + self.gauge_gpu_prefix_cache_hit_rate_family.Metric(labels=labels) + ) class 
VllmStatLogger(VllmStatLoggerBase): @@ -182,6 +312,18 @@ def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None: def info(self, type: str, obj: SupportsMetricsInfo) -> None: pass + def _log_gauge(self, gauge, data: Union[int, float]) -> None: + """Convenience function for logging to gauge. + + Args: + gauge: A gauge metric instance. + data: An int or float to set as the current gauge value. + + Returns: + None + """ + self._logger_queue.put_nowait((gauge, "set", data)) + def _log_counter(self, counter, data: Union[int, float]) -> None: """Convenience function for logging to counter. @@ -222,8 +364,10 @@ def log(self, stats: VllmStats) -> None: counter_metrics = [ (self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter), (self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter), + (self.metrics.counter_preemption_tokens, stats.num_preemption_iter), ] histogram_metrics = [ + (self.metrics.histogram_iteration_tokens_total, [stats.num_tokens_iter]), ( self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter, @@ -233,6 +377,13 @@ def log(self, stats: VllmStats) -> None: stats.time_per_output_tokens_iter, ), (self.metrics.histogram_e2e_time_request, stats.time_e2e_requests), + (self.metrics.histogram_prefill_time_request, stats.time_prefill_requests), + (self.metrics.histogram_decode_time_request, stats.time_decode_requests), + ( + self.metrics.histogram_inference_time_request, + stats.time_inference_requests, + ), + (self.metrics.histogram_queue_time_request, stats.time_queue_requests), ( self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests, @@ -243,10 +394,27 @@ def log(self, stats: VllmStats) -> None: ), (self.metrics.histogram_n_request, stats.n_requests), ] + gauge_metrics = [ + (self.metrics.gauge_num_requests_running, stats.num_running_sys), + (self.metrics.gauge_num_requests_waiting, stats.num_waiting_sys), + (self.metrics.gauge_num_requests_swapped, stats.num_swapped_sys), + (self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys), + (self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys), + ( + self.metrics.gauge_cpu_prefix_cache_hit_rate, + stats.cpu_prefix_cache_hit_rate, + ), + ( + self.metrics.gauge_gpu_prefix_cache_hit_rate, + stats.gpu_prefix_cache_hit_rate, + ), + ] for metric, data in counter_metrics: self._log_counter(metric, data) for metric, data in histogram_metrics: self._log_histogram(metric, data) + for metric, data in gauge_metrics: + self._log_gauge(metric, data) def logger_loop(self): while True: @@ -259,6 +427,8 @@ def logger_loop(self): metric.increment(data) elif command == "observe": metric.observe(data) + elif command == "set": + metric.set(data) else: self.log_logger.log_error(f"Undefined command name: {command}") @@ -268,3 +438,4 @@ def finalize(self): if self._logger_thread is not None: self._logger_thread.join() self._logger_thread = None + self._logger_thread = None From 1a1b66e71812b2b9a09f49dd057dc55067de2adb Mon Sep 17 00:00:00 2001 From: Pavloveuge Date: Sun, 4 May 2025 05:24:31 +0300 Subject: [PATCH 2/3] update readme --- README.md | 85 ++++++++++++++++++++++++++++++++++++++++++++ src/utils/metrics.py | 2 -- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b88f9ec..6be683b 100644 --- a/README.md +++ b/README.md @@ -227,12 +227,24 @@ VLLM stats are reported by the metrics endpoint in fields that are prefixed with counter_prompt_tokens # Number of generation tokens processed. 
counter_generation_tokens +# Number of preemption tokens processed. +counter_preemption_tokens +# Histogram of number of tokens per engine_step. +histogram_iteration_tokens # Histogram of time to first token in seconds. histogram_time_to_first_token # Histogram of time per output token in seconds. histogram_time_per_output_token # Histogram of end to end request latency in seconds. histogram_e2e_time_request +# Histogram of time spent in WAITING phase for request. +histogram_queue_time_request +# Histogram of time spent in RUNNING phase for request. +histogram_inference_time_request +# Histogram of time spent in PREFILL phase for request. +histogram_prefill_time_request +# Histogram of time spent in DECODE phase for request. +histogram_decode_time_request # Number of prefill tokens processed. histogram_num_prompt_tokens_request # Number of generation tokens processed. @@ -241,6 +253,20 @@ histogram_num_generation_tokens_request histogram_best_of_request # Histogram of the n request parameter. histogram_n_request +# Number of requests currently running on GPU. +gauge_scheduler_running +# Number of requests waiting to be processed. +gauge_scheduler_waiting +# Number of requests swapped to CPU. +gauge_scheduler_swapped +# GPU KV-cache usage. 1 means 100 percent usage. +gauge_gpu_cache_usage +# CPU KV-cache usage. 1 means 100 percent usage. +gauge_cpu_cache_usage +# CPU prefix cache block hit rate. +gauge_cpu_prefix_cache_hit_rate +# GPU prefix cache block hit rate. +gauge_gpu_prefix_cache_hit_rate ``` Your output for these fields should look similar to the following: ```bash @@ -250,6 +276,37 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10 # HELP vllm:generation_tokens_total Number of generation tokens processed. # TYPE vllm:generation_tokens_total counter vllm:generation_tokens_total{model="vllm_model",version="1"} 16 +# HELP vllm:num_preemptions_total Number of preemption tokens processed. +# TYPE vllm:num_preemptions_total counter +vllm:num_preemptions_total{model="vllm_model",version="1"} 0 +# HELP vllm:num_requests_running Number of requests currently running on GPU. +# TYPE vllm:num_requests_running gauge +vllm:num_requests_running{model="vllm_model",version="1"} 0 +# HELP vllm:num_requests_waiting Number of requests waiting to be processed. +# TYPE vllm:num_requests_waiting gauge +vllm:num_requests_waiting{model="vllm_model",version="1"} 0 +# HELP vllm:num_requests_swapped Number of requests swapped to CPU. +# TYPE vllm:num_requests_swapped gauge +vllm:num_requests_swapped{model="vllm_model",version="1"} 0 +# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage. +# TYPE vllm:gpu_cache_usage_perc gauge +vllm:gpu_cache_usage_perc{model="vllm_model",version="1"} 0 +# HELP vllm:cpu_cache_usage_perc CPU KV-cache usage. 1 means 100 percent usage. +# TYPE vllm:cpu_cache_usage_perc gauge +vllm:cpu_cache_usage_perc{model="vllm_model",version="1"} 0 +# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate. +# TYPE vllm:cpu_prefix_cache_hit_rate gauge +vllm:cpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1 +# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate. +# TYPE vllm:gpu_prefix_cache_hit_rate gauge +vllm:gpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1 +# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
+# TYPE vllm:iteration_tokens_total histogram +vllm:iteration_tokens_total_count{model="vllm_model",version="1"} 10 +vllm:iteration_tokens_total_sum{model="vllm_model",version="1"} 12 +vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="1"} 9 +... +vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="+Inf"} 10 # HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds. # TYPE vllm:time_to_first_token_seconds histogram vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1 @@ -271,6 +328,34 @@ vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.086861848 vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1 ... vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_queue_time_seconds Histogram of time spent in WAITING phase for request. +# TYPE vllm:request_queue_time_seconds histogram +vllm:request_queue_time_seconds_count{model="vllm_model",version="1"} 1 +vllm:request_queue_time_seconds_sum{model="vllm_model",version="1"} 0.0045166015625 +vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_inference_time_seconds Histogram of time spent in RUNNING phase for request +# TYPE vllm:request_inference_time_seconds histogram +vllm:request_inference_time_seconds_count{model="vllm_model",version="1"} 1 +vllm:request_inference_time_seconds_sum{model="vllm_model",version="1"} 0.1418392658233643 +vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_prefill_time_seconds Histogram of time spent in PREFILL phase for request. +# TYPE vllm:request_prefill_time_seconds histogram +vllm:request_prefill_time_seconds_count{model="vllm_model",version="1"} 1 +vllm:request_prefill_time_seconds_sum{model="vllm_model",version="1"} 0.05302977561950684 +vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_decode_time_seconds Histogram of time spent in DECODE phase for request. +# TYPE vllm:request_decode_time_seconds histogram +vllm:request_decode_time_seconds_count{model="vllm_model",version="1"} 1 +vllm:request_decode_time_seconds_sum{model="vllm_model",version="1"} 0.08880949020385742 +vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 # HELP vllm:request_prompt_tokens Number of prefill tokens processed. 
# TYPE vllm:request_prompt_tokens histogram vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1 diff --git a/src/utils/metrics.py b/src/utils/metrics.py index a593f70..d4a74a9 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -29,7 +29,6 @@ from typing import Dict, List, Union import triton_python_backend_utils as pb_utils - from vllm.config import VllmConfig from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase from vllm.engine.metrics import Stats as VllmStats @@ -438,4 +437,3 @@ def finalize(self): if self._logger_thread is not None: self._logger_thread.join() self._logger_thread = None - self._logger_thread = None From 7a0096ef3378bd5f596783ced8f7c8f4afa767e3 Mon Sep 17 00:00:00 2001 From: Pavloveuge Date: Tue, 13 May 2025 18:06:18 +0300 Subject: [PATCH 3/3] update test --- .../metrics_test/vllm_metrics_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index b8ddeb4..393d75a 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -131,6 +131,8 @@ def test_vllm_metrics(self): # (5, 812, 9, 5, 1515, 3497, 4, 50118, 50118, 133, 812, 9, 1470, 16, 5, 812) # (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5) self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48) + # vllm:num_preemptions_total + self.assertEqual(metrics_dict["vllm:num_preemptions_total"], 0) # vllm:time_to_first_token_seconds self.assertEqual( metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts @@ -139,6 +141,10 @@ def test_vllm_metrics(self): self.assertEqual( metrics_dict["vllm:time_to_first_token_seconds_bucket"], total_prompts ) + # vllm:iteration_tokens_total + self.assertEqual(metrics_dict["vllm:iteration_tokens_total_count"], 18) + self.assertGreater(metrics_dict["vllm:iteration_tokens_total_sum"], 0) + self.assertEqual(metrics_dict["vllm:iteration_tokens_total_bucket"], 18) # vllm:time_per_output_token_seconds self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45) self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0) @@ -151,6 +157,16 @@ def test_vllm_metrics(self): self.assertEqual( metrics_dict["vllm:e2e_request_latency_seconds_bucket"], total_prompts ) + request_time_metrics_list = [ + "vllm:request_queue_time_seconds", + "vllm:request_inference_time_seconds", + "vllm:request_prefill_time_seconds", + "vllm:request_decode_time_seconds", + ] + for metric_name in request_time_metrics_list: + self.assertEqual(metrics_dict[f"{metric_name}_count"], total_prompts) + self.assertGreater(metrics_dict[f"{metric_name}_sum"], 0) + self.assertEqual(metrics_dict[f"{metric_name}_bucket"], total_prompts) # vllm:request_prompt_tokens self.assertEqual( metrics_dict["vllm:request_prompt_tokens_count"], total_prompts