From b9e8b8542036a90edeeb83c69e07f595d30dc594 Mon Sep 17 00:00:00 2001
From: Stefan Bueringer
Date: Tue, 13 Jun 2023 15:47:27 +0200
Subject: [PATCH 1/2] Add some metrics to debug at scale

---
Review notes (text between "---" and the diff is ignored by git am):
- LatencyAdapter.Observe now passes u.Host instead of u.String() as the
  value of the "host" label. Both latency histograms declare the labels
  {"verb", "host"}; feeding the full URL into "host" mislabels the data and
  creates a separate time series for every distinct request URL.
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this series; hunk line counts were re-checked against the hunk
  headers. The "index" blob ids are kept from the original series and no
  longer match the post-image, so apply without --3way.

 pkg/metrics/client_go_adapter.go | 92 +++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 6 deletions(-)

diff --git a/pkg/metrics/client_go_adapter.go b/pkg/metrics/client_go_adapter.go
index ff28998c44..9ddf887dca 100644
--- a/pkg/metrics/client_go_adapter.go
+++ b/pkg/metrics/client_go_adapter.go
@@ -18,17 +18,53 @@ package metrics
 
 import (
 	"context"
+	"net/url"
+	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
 	clientmetrics "k8s.io/client-go/tools/metrics"
 )
 
-// this file contains setup logic to initialize the myriad of places
-// that client-go registers metrics. We copy the names and formats
-// from Kubernetes so that we match the core controllers.
-
 var (
-	// client metrics.
+	// requestLatency is a Prometheus Histogram metric type partitioned by
+	// "verb", and "host" labels. It is used for the rest client latency metrics.
+	requestLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "rest_client_request_duration_seconds",
+			Help:    "Request latency in seconds. Broken down by verb, and host.",
+			Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
+		},
+		[]string{"verb", "host"},
+	)
+
+	requestSize = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "rest_client_request_size_bytes",
+			Help: "Request size in bytes. Broken down by verb and host.",
+			// 64 bytes to 16MB
+			Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
+		},
+		[]string{"verb", "host"},
+	)
+
+	responseSize = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "rest_client_response_size_bytes",
+			Help: "Response size in bytes. Broken down by verb and host.",
+			// 64 bytes to 16MB
+			Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
+		},
+		[]string{"verb", "host"},
+	)
+
+	rateLimiterLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "rest_client_rate_limiter_duration_seconds",
+			Help:    "Client side rate limiter latency in seconds. Broken down by verb, and host.",
+			Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
+		},
+		[]string{"verb", "host"},
+	)
 
 	requestResult = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
@@ -37,6 +73,14 @@ var (
 		},
 		[]string{"code", "method", "host"},
 	)
+
+	requestRetry = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "rest_client_request_retries_total",
+			Help: "Number of request retries, partitioned by status code, verb, and host.",
+		},
+		[]string{"code", "verb", "host"},
+	)
 )
 
 func init() {
@@ -46,11 +90,21 @@ func init() {
 // registerClientMetrics sets up the client latency metrics from client-go.
 func registerClientMetrics() {
 	// register the metrics with our registry
+	Registry.MustRegister(requestLatency)
+	Registry.MustRegister(requestSize)
+	Registry.MustRegister(responseSize)
+	Registry.MustRegister(rateLimiterLatency)
 	Registry.MustRegister(requestResult)
+	Registry.MustRegister(requestRetry)
 
 	// register the metrics with client-go
 	clientmetrics.Register(clientmetrics.RegisterOpts{
-		RequestResult: &resultAdapter{metric: requestResult},
+		RequestLatency:     &LatencyAdapter{metric: requestLatency},
+		RequestSize:        &sizeAdapter{metric: requestSize},
+		ResponseSize:       &sizeAdapter{metric: responseSize},
+		RateLimiterLatency: &LatencyAdapter{metric: rateLimiterLatency},
+		RequestResult:      &resultAdapter{metric: requestResult},
+		RequestRetry:       &retryAdapter{requestRetry},
 	})
 }
 
@@ -62,6 +116,24 @@ func registerClientMetrics() {
 // copied (more-or-less directly) from k8s.io/kubernetes setup code
 // (which isn't anywhere in an easily-importable place).
 
+// LatencyAdapter implements LatencyMetric.
+type LatencyAdapter struct {
+	metric *prometheus.HistogramVec
+}
+
+// Observe records the latency for the given verb and the host of the request URL.
+func (l *LatencyAdapter) Observe(_ context.Context, verb string, u url.URL, latency time.Duration) {
+	l.metric.WithLabelValues(verb, u.Host).Observe(latency.Seconds())
+}
+
+type sizeAdapter struct {
+	metric *prometheus.HistogramVec
+}
+
+func (s *sizeAdapter) Observe(ctx context.Context, verb string, host string, size float64) {
+	s.metric.WithLabelValues(verb, host).Observe(size)
+}
+
 type resultAdapter struct {
 	metric *prometheus.CounterVec
 }
@@ -69,3 +141,11 @@ type resultAdapter struct {
 func (r *resultAdapter) Increment(_ context.Context, code, method, host string) {
 	r.metric.WithLabelValues(code, method, host).Inc()
 }
+
+type retryAdapter struct {
+	metric *prometheus.CounterVec
+}
+
+func (r *retryAdapter) IncrementRetry(_ context.Context, code, method, host string) {
+	r.metric.WithLabelValues(code, method, host).Inc()
+}
From af02ca51359900363c845103dcf75d3c43c9fa2d Mon Sep 17 00:00:00 2001
From: Stefan Bueringer
Date: Thu, 29 Jun 2023 13:51:13 +0200
Subject: [PATCH 2/2] optimized performance: dropped size metrics, dropped buckets

---
Review notes (text between "---" and the diff is ignored by git am):
- The context line of LatencyAdapter.Observe in the last hunk uses u.Host so
  that this patch still applies on top of the corrected PATCH 1/2.

 pkg/metrics/client_go_adapter.go | 37 +++-----------------------------
 1 file changed, 3 insertions(+), 34 deletions(-)

diff --git a/pkg/metrics/client_go_adapter.go b/pkg/metrics/client_go_adapter.go
index 9ddf887dca..e7d3afe878 100644
--- a/pkg/metrics/client_go_adapter.go
+++ b/pkg/metrics/client_go_adapter.go
@@ -18,6 +18,7 @@ package metrics
 
 import (
 	"context"
+	"math"
 	"net/url"
 	"time"
 
@@ -32,27 +33,7 @@ var (
 		prometheus.HistogramOpts{
 			Name:    "rest_client_request_duration_seconds",
 			Help:    "Request latency in seconds. Broken down by verb, and host.",
-			Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
-		},
-		[]string{"verb", "host"},
-	)
-
-	requestSize = prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Name: "rest_client_request_size_bytes",
-			Help: "Request size in bytes. Broken down by verb and host.",
-			// 64 bytes to 16MB
-			Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
-		},
-		[]string{"verb", "host"},
-	)
-
-	responseSize = prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Name: "rest_client_response_size_bytes",
-			Help: "Response size in bytes. Broken down by verb and host.",
-			// 64 bytes to 16MB
-			Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
+			Buckets: []float64{math.Inf(+1)}, // Intentionally using minimum buckets for better performance / lower cardinality.
 		},
 		[]string{"verb", "host"},
 	)
@@ -61,7 +42,7 @@ var (
 		prometheus.HistogramOpts{
 			Name:    "rest_client_rate_limiter_duration_seconds",
 			Help:    "Client side rate limiter latency in seconds. Broken down by verb, and host.",
-			Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
+			Buckets: []float64{math.Inf(+1)}, // Intentionally using minimum buckets for better performance / lower cardinality.
 		},
 		[]string{"verb", "host"},
 	)
@@ -91,8 +72,6 @@ func init() {
 func registerClientMetrics() {
 	// register the metrics with our registry
 	Registry.MustRegister(requestLatency)
-	Registry.MustRegister(requestSize)
-	Registry.MustRegister(responseSize)
 	Registry.MustRegister(rateLimiterLatency)
 	Registry.MustRegister(requestResult)
 	Registry.MustRegister(requestRetry)
@@ -100,8 +79,6 @@ func registerClientMetrics() {
 	// register the metrics with client-go
 	clientmetrics.Register(clientmetrics.RegisterOpts{
 		RequestLatency:     &LatencyAdapter{metric: requestLatency},
-		RequestSize:        &sizeAdapter{metric: requestSize},
-		ResponseSize:       &sizeAdapter{metric: responseSize},
 		RateLimiterLatency: &LatencyAdapter{metric: rateLimiterLatency},
 		RequestResult:      &resultAdapter{metric: requestResult},
 		RequestRetry:       &retryAdapter{requestRetry},
@@ -126,14 +103,6 @@ func (l *LatencyAdapter) Observe(_ context.Context, verb string, u url.URL, late
 	l.metric.WithLabelValues(verb, u.Host).Observe(latency.Seconds())
 }
 
-type sizeAdapter struct {
-	metric *prometheus.HistogramVec
-}
-
-func (s *sizeAdapter) Observe(ctx context.Context, verb string, host string, size float64) {
-	s.metric.WithLabelValues(verb, host).Observe(size)
-}
-
 type resultAdapter struct {
 	metric *prometheus.CounterVec
 }