
Commit 2450773

Merge pull request #64 from aish1331/prometheus-metrics-client
Added Prometheus client to get model server metrics
2 parents 4551420 + dc7b7ef commit 2450773

File tree

14 files changed, +539 -82 lines changed


config.yml

Lines changed: 5 additions & 0 deletions
@@ -11,3 +11,8 @@ tokenizer:
   pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
 data:
   type: shareGPT
+metrics_client:
+  type: prometheus
+  prometheus:
+    url: http://localhost:9090
+    scrape_interval: 15
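
To see how this new block is consumed, here is a minimal standalone sketch (not code from this commit) that parses the metrics_client section with the pydantic models the commit adds to inference_perf/config.py further down. The models are re-declared locally so the snippet runs on its own; it assumes PyYAML and pydantic are installed.

# Sketch: parse the new `metrics_client` block into the configuration models
# introduced by this commit (re-declared here for a self-contained example).
from enum import Enum
from typing import Optional

import yaml
from pydantic import BaseModel


class MetricsClientType(Enum):
    PROMETHEUS = "prometheus"
    DEFAULT = "default"


class PrometheusClientConfig(BaseModel):
    scrape_interval: Optional[int] = 15
    url: str = "http://localhost:9090"


class MetricsClientConfig(BaseModel):
    type: MetricsClientType
    prometheus: Optional[PrometheusClientConfig] = None


raw = yaml.safe_load(
    """
metrics_client:
  type: prometheus
  prometheus:
    url: http://localhost:9090
    scrape_interval: 15
"""
)
cfg = MetricsClientConfig(**raw["metrics_client"])
print(cfg.type, cfg.prometheus.url, cfg.prometheus.scrape_interval)
# MetricsClientType.PROMETHEUS http://localhost:9090 15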

inference_perf/client/base.py

Lines changed: 83 additions & 5 deletions
@@ -12,9 +12,82 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from abc import ABC, abstractmethod
-from typing import Tuple
+from typing import List, Tuple, TypedDict
+
+from pydantic import BaseModel
 from inference_perf.datagen import InferenceData
-from inference_perf.reportgen import ReportGenerator
+
+
+class RequestMetric(BaseModel):
+    stage_id: int
+    prompt_tokens: int
+    output_tokens: int
+    time_per_request: float
+
+
+class ModelServerPrometheusMetric:
+    def __init__(self, name: str, op: str, type: str, filters: str) -> None:
+        self.name = name
+        self.op = op
+        self.type = type
+        self.filters = filters
+
+
+class ModelServerMetrics(BaseModel):
+    # Throughput
+    prompt_tokens_per_second: float = 0.0
+    output_tokens_per_second: float = 0.0
+    requests_per_second: float = 0.0
+
+    # Latency
+    avg_request_latency: float = 0.0
+    median_request_latency: float = 0.0
+    p90_request_latency: float = 0.0
+    p99_request_latency: float = 0.0
+    avg_time_to_first_token: float = 0.0
+    median_time_to_first_token: float = 0.0
+    p90_time_to_first_token: float = 0.0
+    p99_time_to_first_token: float = 0.0
+    avg_time_per_output_token: float = 0.0
+    median_time_per_output_token: float = 0.0
+    p90_time_per_output_token: float = 0.0
+    p99_time_per_output_token: float = 0.0
+
+    # Request
+    total_requests: int = 0
+    avg_prompt_tokens: int = 0
+    avg_output_tokens: int = 0
+    avg_queue_length: int = 0
+
+
+# PrometheusMetricMetadata stores the mapping of metrics to their model server names and types
+# and the filters to be applied to them.
+# This is used to generate Prometheus query for the metrics.
+class PrometheusMetricMetadata(TypedDict):
+    # Throughput
+    prompt_tokens_per_second: ModelServerPrometheusMetric
+    output_tokens_per_second: ModelServerPrometheusMetric
+    requests_per_second: ModelServerPrometheusMetric
+
+    # Latency
+    avg_request_latency: ModelServerPrometheusMetric
+    median_request_latency: ModelServerPrometheusMetric
+    p90_request_latency: ModelServerPrometheusMetric
+    p99_request_latency: ModelServerPrometheusMetric
+    avg_time_to_first_token: ModelServerPrometheusMetric
+    median_time_to_first_token: ModelServerPrometheusMetric
+    p90_time_to_first_token: ModelServerPrometheusMetric
+    p99_time_to_first_token: ModelServerPrometheusMetric
+    avg_time_per_output_token: ModelServerPrometheusMetric
+    median_time_per_output_token: ModelServerPrometheusMetric
+    p90_time_per_output_token: ModelServerPrometheusMetric
+    p99_time_per_output_token: ModelServerPrometheusMetric
+
+    # Request
+    total_requests: ModelServerPrometheusMetric
+    avg_prompt_tokens: ModelServerPrometheusMetric
+    avg_output_tokens: ModelServerPrometheusMetric
+    avg_queue_length: ModelServerPrometheusMetric
 
 
 class ModelServerClient(ABC):
@@ -23,9 +96,14 @@ def __init__(self, *args: Tuple[int, ...]) -> None:
         pass
 
     @abstractmethod
-    def set_report_generator(self, reportgen: ReportGenerator) -> None:
-        self.reportgen = reportgen
+    async def process_request(self, data: InferenceData, stage_id: int) -> None:
+        raise NotImplementedError
 
     @abstractmethod
-    async def process_request(self, data: InferenceData, stage_id: int) -> None:
+    def get_request_metrics(self) -> List[RequestMetric]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_prometheus_metric_metadata(self) -> PrometheusMetricMetadata:
+        # assumption: all metrics clients have metrics exported in Prometheus format
        raise NotImplementedError
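
The comment above notes that PrometheusMetricMetadata is used to generate Prometheus queries; the query construction itself lives in the Prometheus metrics client added by this commit (inference_perf/metrics/prometheus_client.py, not expanded in this view). As a rough, illustrative sketch only — assuming `op` selects the aggregation, `filters` is a PromQL label matcher, and a fixed query window — one way such metadata could translate to PromQL:

# Illustrative sketch, not the commit's implementation: map a (name, op, type, filters)
# metric description to a PromQL expression.
def build_promql(name: str, op: str, metric_type: str, filters: str, window: str = "5m") -> str:
    selector = f"{name}{{{filters}}}"
    if op == "rate":
        return f"rate({selector}[{window}])"
    if op == "increase":
        return f"increase({selector}[{window}])"
    if metric_type == "histogram":
        if op == "mean":
            return f"rate({name}_sum{{{filters}}}[{window}]) / rate({name}_count{{{filters}}}[{window}])"
        quantile = {"median": 0.5, "p90": 0.9, "p99": 0.99}[op]
        return f"histogram_quantile({quantile}, sum by (le) (rate({name}_bucket{{{filters}}}[{window}])))"
    # gauges (and anything unhandled) fall back to an average over the window
    return f"avg_over_time({selector}[{window}])"


# Example: the p90 time-to-first-token metric registered by vllm_client.py below
print(build_promql("vllm:time_to_first_token_seconds", "p90", "histogram", "model_name='llama'"))
# histogram_quantile(0.9, sum by (le) (rate(vllm:time_to_first_token_seconds_bucket{model_name='llama'}[5m])))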

inference_perf/client/mock_client.py

Lines changed: 8 additions & 8 deletions
@@ -11,27 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from inference_perf.datagen import InferenceData
-from inference_perf.reportgen import ReportGenerator, RequestMetric
-from .base import ModelServerClient
 import asyncio
+from typing import List
+from inference_perf.datagen import InferenceData
+from .base import ModelServerClient, RequestMetric
 
 
 class MockModelServerClient(ModelServerClient):
     def __init__(self) -> None:
-        pass
-
-    def set_report_generator(self, reportgen: ReportGenerator) -> None:
-        self.reportgen = reportgen
+        self.request_metrics: List[RequestMetric] = list()
 
     async def process_request(self, payload: InferenceData, stage_id: int) -> None:
         print("Processing request - " + str(payload.data) + " for stage - " + str(stage_id))
         await asyncio.sleep(3)
-        self.reportgen.collect_request_metrics(
+        self.request_metrics.append(
             RequestMetric(
                 stage_id=stage_id,
                 prompt_tokens=0,
                 output_tokens=0,
                 time_per_request=3,
             )
         )
+
+    def get_request_metrics(self) -> List[RequestMetric]:
+        return self.request_metrics

inference_perf/client/vllm_client.py

Lines changed: 69 additions & 6 deletions
@@ -12,11 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from inference_perf.datagen import InferenceData
-from inference_perf.reportgen import ReportGenerator, RequestMetric
 from inference_perf.config import APIType, CustomTokenizerConfig
 from inference_perf.utils import CustomTokenizer
-from .base import ModelServerClient
-from typing import Any, Optional
+from .base import ModelServerClient, ModelServerPrometheusMetric, PrometheusMetricMetadata, RequestMetric
+from typing import Any, Optional, List
 import aiohttp
 import json
 import time
@@ -42,9 +41,67 @@ def __init__(self, uri: str, model_name: str, tokenizer: Optional[CustomTokenize
             print("Falling back to usage metrics.")
         else:
             print("Tokenizer path is empty. Falling back to usage metrics.")
+        self.request_metrics: List[RequestMetric] = list()
 
-    def set_report_generator(self, reportgen: ReportGenerator) -> None:
-        self.reportgen = reportgen
+        self.prometheus_metric_metadata: PrometheusMetricMetadata = {
+            "avg_queue_length": ModelServerPrometheusMetric(
+                "vllm:num_requests_waiting", "mean", "gauge", "model_name='%s'" % self.model_name
+            ),
+            "avg_time_to_first_token": ModelServerPrometheusMetric(
+                "vllm:time_to_first_token_seconds", "mean", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "median_time_to_first_token": ModelServerPrometheusMetric(
+                "vllm:time_to_first_token_seconds", "median", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "p90_time_to_first_token": ModelServerPrometheusMetric(
+                "vllm:time_to_first_token_seconds", "p90", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "p99_time_to_first_token": ModelServerPrometheusMetric(
+                "vllm:time_to_first_token_seconds", "p99", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "avg_time_per_output_token": ModelServerPrometheusMetric(
+                "vllm:time_per_output_token_seconds", "mean", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "median_time_per_output_token": ModelServerPrometheusMetric(
+                "vllm:time_per_output_token_seconds", "median", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "p90_time_per_output_token": ModelServerPrometheusMetric(
+                "vllm:time_per_output_token_seconds", "p90", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "p99_time_per_output_token": ModelServerPrometheusMetric(
+                "vllm:time_per_output_token_seconds", "p99", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "avg_prompt_tokens": ModelServerPrometheusMetric(
+                "vllm:prompt_tokens_total", "mean", "counter", "model_name='%s'" % self.model_name
+            ),
+            "prompt_tokens_per_second": ModelServerPrometheusMetric(
+                "vllm:prompt_tokens_total", "rate", "counter", "model_name='%s'" % self.model_name
+            ),
+            "avg_output_tokens": ModelServerPrometheusMetric(
+                "vllm:generation_tokens_total", "mean", "counter", "model_name='%s'" % self.model_name
+            ),
+            "output_tokens_per_second": ModelServerPrometheusMetric(
+                "vllm:generation_tokens_total", "rate", "counter", "model_name='%s'" % self.model_name
+            ),
+            "total_requests": ModelServerPrometheusMetric(
+                "vllm:e2e_request_latency_seconds_count", "increase", "counter", "model_name='%s'" % self.model_name
+            ),
+            "requests_per_second": ModelServerPrometheusMetric(
+                "vllm:e2e_request_latency_seconds_count", "rate", "counter", "model_name='%s'" % self.model_name
+            ),
+            "avg_request_latency": ModelServerPrometheusMetric(
+                "vllm:e2e_request_latency_seconds", "mean", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "median_request_latency": ModelServerPrometheusMetric(
+                "vllm:e2e_request_latency_seconds", "median", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "p90_request_latency": ModelServerPrometheusMetric(
+                "vllm:e2e_request_latency_seconds", "p90", "histogram", "model_name='%s'" % self.model_name
+            ),
+            "p99_request_latency": ModelServerPrometheusMetric(
+                "vllm:e2e_request_latency_seconds", "p99", "histogram", "model_name='%s'" % self.model_name
+            ),
+        }
 
     def _create_payload(self, payload: InferenceData) -> dict[str, Any]:
         if payload.type == APIType.Completion:
@@ -93,7 +150,7 @@ async def process_request(self, data: InferenceData, stage_id: int) -> None:
                         prompt_tokens = usage.get("prompt_tokens", 0)
                         output_tokens = usage.get("completion_tokens", 0)
 
-                        self.reportgen.collect_request_metrics(
+                        self.request_metrics.append(
                             RequestMetric(
                                 stage_id=stage_id,
                                 prompt_tokens=prompt_tokens,
@@ -105,3 +162,9 @@ async def process_request(self, data: InferenceData, stage_id: int) -> None:
                         print(await response.text())
         except aiohttp.ClientConnectorError as e:
             print("vLLM Server connection error:\n", str(e))
+
+    def get_request_metrics(self) -> List[RequestMetric]:
+        return self.request_metrics
+
+    def get_prometheus_metric_metadata(self) -> PrometheusMetricMetadata:
+        return self.prometheus_metric_metadata
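
The metadata above is ultimately resolved against the Prometheus server configured in config.yml by the PrometheusMetricsClient (inference_perf/metrics/prometheus_client.py, imported in main.py below but not expanded in this view). As a hedged sketch of the underlying mechanics only, one of these metrics can be fetched through Prometheus's standard instant-query HTTP API; the model name here is a placeholder:

# Sketch only: query a self-deployed Prometheus instance for one of the vLLM metrics above.
# The URL matches the default added to config.yml; "llama" is a placeholder model name.
import requests

PROMETHEUS_URL = "http://localhost:9090"


def instant_query(promql: str) -> float | None:
    resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": promql}, timeout=10)
    resp.raise_for_status()
    result = resp.json()["data"]["result"]
    # Each series carries a [timestamp, value] pair; return the first value if present.
    return float(result[0]["value"][1]) if result else None


# Requests per second for the benchmarked model over the last 5 minutes
print(instant_query("rate(vllm:e2e_request_latency_seconds_count{model_name='llama'}[5m])"))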

inference_perf/config.py

Lines changed: 14 additions & 3 deletions
@@ -38,6 +38,11 @@ class LoadType(Enum):
     POISSON = "poisson"
 
 
+class MetricsClientType(Enum):
+    PROMETHEUS = "prometheus"
+    DEFAULT = "default"
+
+
 class LoadStage(BaseModel):
     rate: int = 1
     duration: int = 1
@@ -66,8 +71,14 @@ class ReportConfig(BaseModel):
     pass
 
 
-class MetricsConfig(BaseModel):
-    url: str
+class PrometheusClientConfig(BaseModel):
+    scrape_interval: Optional[int] = 15
+    url: str = "http://localhost:9090"
+
+
+class MetricsClientConfig(BaseModel):
+    type: MetricsClientType
+    prometheus: Optional[PrometheusClientConfig] = None
 
 
 class VLLMConfig(BaseModel):
@@ -86,7 +97,7 @@ class Config(BaseModel):
     data: Optional[DataConfig] = DataConfig()
     load: Optional[LoadConfig] = LoadConfig(stages=[LoadStage()])
     report: Optional[ReportConfig] = ReportConfig()
-    metrics: Optional[MetricsConfig] = MetricsConfig(url="")
+    metrics_client: Optional[MetricsClientConfig] = None
     storage: Optional[StorageConfig] = StorageConfig()
     vllm: Optional[VLLMConfig] = None
     tokenizer: Optional[CustomTokenizerConfig] = None

inference_perf/main.py

Lines changed: 20 additions & 14 deletions
@@ -11,14 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import time
 from typing import List
 from inference_perf.loadgen import LoadGenerator
-from inference_perf.config import DataGenType
+from inference_perf.config import DataGenType, MetricsClientType
 from inference_perf.datagen import DataGenerator, MockDataGenerator, HFShareGPTDataGenerator
 from inference_perf.client import ModelServerClient, vLLMModelServerClient
+from inference_perf.metrics.base import MetricsClient, PerfRuntimeParameters
+from inference_perf.metrics.prometheus_client import PrometheusMetricsClient
 from inference_perf.client.storage import StorageClient, GoogleCloudStorageClient
 from inference_perf.reportgen import ReportGenerator, ReportFile
-from inference_perf.metrics import MockMetricsClient
 from inference_perf.config import read_config
 import asyncio
 
@@ -35,13 +37,12 @@ def __init__(
         self.loadgen = loadgen
         self.reportgen = reportgen
         self.storage_clients = storage_clients
-        self.client.set_report_generator(self.reportgen)
 
     def run(self) -> None:
         asyncio.run(self.loadgen.run(self.client))
 
-    def generate_reports(self) -> List[ReportFile]:
-        return asyncio.run(self.reportgen.generate_reports())
+    def generate_reports(self, runtime_parameters: PerfRuntimeParameters) -> List[ReportFile]:
+        return asyncio.run(self.reportgen.generate_reports(runtime_parameters=runtime_parameters))
 
     def save_reports(self, reports: List[ReportFile]) -> None:
         for storage_client in self.storage_clients:
@@ -53,7 +54,7 @@ def main_cli() -> None:
 
     # Define Model Server Client
     if config.vllm:
-        client = vLLMModelServerClient(
+        model_server_client = vLLMModelServerClient(
             uri=config.vllm.url, model_name=config.vllm.model_name, tokenizer=config.tokenizer, api_type=config.vllm.api
         )
     else:
@@ -76,13 +77,13 @@ def main_cli() -> None:
         raise Exception("load config missing")
 
     # Define Metrics Client
-    if config.metrics:
-        metricsclient = MockMetricsClient(uri=config.metrics.url)
-    else:
-        raise Exception("metrics config missing")
+    metrics_client: MetricsClient | None = None
+    if config.metrics_client:
+        if config.metrics_client.type == MetricsClientType.PROMETHEUS and config.metrics_client.prometheus:
+            metrics_client = PrometheusMetricsClient(config=config.metrics_client.prometheus)
 
     # Define Report Generator
-    reportgen = ReportGenerator(metricsclient)
+    reportgen = ReportGenerator(metrics_client)
 
     # Define Storage Clients
     storage_clients: List[StorageClient] = []
@@ -91,13 +92,18 @@ def main_cli() -> None:
         storage_clients.append(GoogleCloudStorageClient(config=config.storage.google_cloud_storage))
 
     # Setup Perf Test Runner
-    perfrunner = InferencePerfRunner(client, loadgen, reportgen, storage_clients)
+    perfrunner = InferencePerfRunner(model_server_client, loadgen, reportgen, storage_clients)
+
+    start_time = time.time()
 
     # Run Perf Test
    perfrunner.run()
 
-    # Generate Reports
-    reports = perfrunner.generate_reports()
+    end_time = time.time()
+    duration = end_time - start_time  # Calculate the duration of the test
+
+    # Generate Report after the tests
+    reports = perfrunner.generate_reports(PerfRuntimeParameters(start_time, duration, model_server_client))
 
     # Save Reports
     perfrunner.save_reports(reports=reports)
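
The start_time and duration captured above give the metrics client a window to scope its queries to the benchmark run. PerfRuntimeParameters itself is defined in inference_perf/metrics/base.py, which is not among the diffs expanded here, so the snippet below is only an assumption-laden illustration of the idea: using the captured window to bound a Prometheus range query. All names and parameters in it are hypothetical, not the commit's implementation.

# Hypothetical illustration: bound a Prometheus range query to the benchmark window.
# Not the commit's PerfRuntimeParameters or PrometheusMetricsClient code.
import time

import requests


def query_range_over_run(prometheus_url: str, promql: str, start_time: float, duration: float, step: int = 15) -> list:
    params = {
        "query": promql,
        "start": start_time,           # unix timestamp when the run began
        "end": start_time + duration,  # unix timestamp when the run ended
        "step": step,                  # e.g. the configured scrape_interval
    }
    resp = requests.get(f"{prometheus_url}/api/v1/query_range", params=params, timeout=10)
    resp.raise_for_status()
    return resp.json()["data"]["result"]


start = time.time()
# ... the load generator would run here ...
duration = time.time() - start
series = query_range_over_run(
    "http://localhost:9090",
    "rate(vllm:generation_tokens_total{model_name='llama'}[1m])",
    start,
    duration,
)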

inference_perf/metrics/README.md

Lines changed: 2 additions & 3 deletions
@@ -5,10 +5,9 @@ This repository provides clients to query performance metrics from various moni
 ## Supported Monitoring Platforms
 
 **Available now**:
-- None
+- Self Deployed Prometheus
 
 **Todo**:
 - Google Cloud Monitoring
 - AWS CloudWatch
-- Azure Monitor
-- Self Deployed Prometheus
+- Azure Monitor

inference_perf/metrics/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .base import MetricsClient, MetricsSummary
+from .base import MetricsClient
 from .mock_client import MockMetricsClient
 
-__all__ = ["MetricsClient", "MetricsSummary", "MockMetricsClient"]
+__all__ = ["MetricsClient", "MockMetricsClient"]
