From db7312180248797cd183a117003fb630edd47ede Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 9 May 2025 00:33:01 -0700 Subject: [PATCH 1/2] Enable metrics provider --- pkg/config/controller.go | 5 + pkg/controller/controller.go | 14 ++ pkg/internal/controller/controller.go | 53 +++--- pkg/internal/controller/controller_test.go | 3 + pkg/internal/controller/metrics/metrics.go | 18 -- pkg/manager/internal.go | 1 - pkg/manager/manager.go | 10 ++ pkg/metrics/leaderelection.go | 47 ++---- pkg/metrics/provider.go | 187 +++++++++++++++++++++ pkg/metrics/registry.go | 13 +- 10 files changed, 277 insertions(+), 74 deletions(-) create mode 100644 pkg/metrics/provider.go diff --git a/pkg/config/controller.go b/pkg/config/controller.go index a5655593ef..3d64d30e50 100644 --- a/pkg/config/controller.go +++ b/pkg/config/controller.go @@ -20,6 +20,7 @@ import ( "time" "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) // Controller contains configuration options for controllers. It only includes options @@ -66,6 +67,10 @@ type Controller struct { // Note: This flag is disabled by default until a future version. It's currently in beta. UsePriorityQueue *bool + // MetricsProvider allows users to override the location where controller metrics are emitted. + // By default, metrics are emitted to a pre-configured Prometheus registry + MetricsProvider metrics.ControllerMetricsProvider + // Logger is the logger controllers should use. Logger logr.Logger } diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 9de959b48f..7a56aeeb26 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -25,6 +25,7 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/controller/priorityqueue" @@ -84,6 +85,10 @@ type TypedOptions[request comparable] struct { // Logger will be used to build a default LogConstructor if unset. Logger logr.Logger + // MetricsProvider allows users to override the location where controller metrics are emitted. + // By default, metrics are emitted to a pre-configured Prometheus registry + MetricsProvider metrics.ControllerMetricsProvider + // LogConstructor is used to construct a logger used for this controller and passed // to each reconciliation via the context field. LogConstructor func(request *request) logr.Logger @@ -101,6 +106,10 @@ func (options *TypedOptions[request]) DefaultFromConfig(config config.Controller options.Logger = config.Logger } + if options.MetricsProvider == nil { + options.MetricsProvider = config.MetricsProvider + } + if options.SkipNameValidation == nil { options.SkipNameValidation = config.SkipNameValidation } @@ -196,6 +205,10 @@ func NewTypedUnmanaged[request comparable](name string, options TypedOptions[req } } + if options.MetricsProvider == nil { + options.MetricsProvider = metrics.NewPrometheusProvider() + } + if options.LogConstructor == nil { log := options.Logger.WithValues( "controller", name, @@ -250,6 +263,7 @@ func NewTypedUnmanaged[request comparable](name string, options TypedOptions[req MaxConcurrentReconciles: options.MaxConcurrentReconciles, CacheSyncTimeout: options.CacheSyncTimeout, Name: name, + MetricsProvider: options.MetricsProvider, LogConstructor: options.LogConstructor, RecoverPanic: options.RecoverPanic, LeaderElected: options.NeedLeaderElection, diff --git a/pkg/internal/controller/controller.go b/pkg/internal/controller/controller.go index 9fa7ec71e1..27b84ebc23 100644 --- a/pkg/internal/controller/controller.go +++ b/pkg/internal/controller/controller.go @@ -30,9 +30,9 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/client-go/util/workqueue" + "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/controller/priorityqueue" - ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" @@ -89,6 +89,9 @@ type Controller[request comparable] struct { // outside the context of a reconciliation. LogConstructor func(request *request) logr.Logger + // MetricsProvider is used to route metrics that are fired due to controller reconciles + MetricsProvider metrics.ControllerMetricsProvider + // RecoverPanic indicates whether the panic caused by reconcile should be recovered. // Defaults to true. RecoverPanic *bool @@ -101,7 +104,7 @@ type Controller[request comparable] struct { func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ reconcile.Result, err error) { defer func() { if r := recover(); r != nil { - ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Inc() + c.MetricsProvider.ReconcilePanics().Inc(map[string]string{labelKeyController: c.Name}) if c.RecoverPanic == nil || *c.RecoverPanic { for _, fn := range utilruntime.PanicHandlers { @@ -294,30 +297,32 @@ func (c *Controller[request]) processNextWorkItem(ctx context.Context) bool { // period. defer c.Queue.Done(obj) - ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Add(1) - defer ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Add(-1) + c.MetricsProvider.ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, 1) + defer c.MetricsProvider.ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, -1) c.reconcileHandler(ctx, obj, priority) return true } const ( - labelError = "error" - labelRequeueAfter = "requeue_after" - labelRequeue = "requeue" - labelSuccess = "success" + labelKeyController = "controller" + labelKeyResult = "result" + labelError = "error" + labelRequeueAfter = "requeue_after" + labelRequeue = "requeue" + labelSuccess = "success" ) func (c *Controller[request]) initMetrics() { - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Add(0) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Add(0) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Add(0) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Add(0) - ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0) - ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0) - ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0) - ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles)) - ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0) + c.MetricsProvider.ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}, 0) + c.MetricsProvider.ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}, 0) + c.MetricsProvider.ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}, 0) + c.MetricsProvider.ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}, 0) + c.MetricsProvider.ReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0) + c.MetricsProvider.TerminalReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0) + c.MetricsProvider.ReconcilePanics().Add(map[string]string{labelKeyController: c.Name}, 0) + c.MetricsProvider.WorkerCount().Set(map[string]string{labelKeyController: c.Name}, float64(c.MaxConcurrentReconciles)) + c.MetricsProvider.ActiveWorkers().Set(map[string]string{labelKeyController: c.Name}, 0) } func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, priority int) { @@ -341,12 +346,12 @@ func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, switch { case err != nil: if errors.Is(err, reconcile.TerminalError(nil)) { - ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Inc() + c.MetricsProvider.TerminalReconcileErrors().Inc(map[string]string{"controller": c.Name}) } else { c.Queue.AddWithOpts(priorityqueue.AddOpts{RateLimited: true, Priority: priority}, req) } - ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Inc() - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Inc() + c.MetricsProvider.ReconcileErrors().Inc(map[string]string{labelKeyController: c.Name}) + c.MetricsProvider.ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}) if !result.IsZero() { log.Info("Warning: Reconciler returned both a non-zero result and a non-nil error. The result will always be ignored if the error is non-nil and the non-nil error causes requeuing with exponential backoff. For more details, see: https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile#Reconciler") } @@ -359,17 +364,17 @@ func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, // to result.RequestAfter c.Queue.Forget(req) c.Queue.AddWithOpts(priorityqueue.AddOpts{After: result.RequeueAfter, Priority: priority}, req) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Inc() + c.MetricsProvider.ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}) case result.Requeue: //nolint: staticcheck // We have to handle it until it is removed log.V(5).Info("Reconcile done, requeueing") c.Queue.AddWithOpts(priorityqueue.AddOpts{RateLimited: true, Priority: priority}, req) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Inc() + c.MetricsProvider.ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}) default: log.V(5).Info("Reconcile successful") // Finally, if no error occurs we Forget this item so it does not // get queued again until another change happens. c.Queue.Forget(req) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Inc() + c.MetricsProvider.ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}) } } @@ -380,7 +385,7 @@ func (c *Controller[request]) GetLogger() logr.Logger { // updateMetrics updates prometheus metrics within the controller. func (c *Controller[request]) updateMetrics(reconcileTime time.Duration) { - ctrlmetrics.ReconcileTime.WithLabelValues(c.Name).Observe(reconcileTime.Seconds()) + c.MetricsProvider.ReconcileTime().Observe(map[string]string{labelKeyController: c.Name}, reconcileTime.Seconds()) } // ReconcileIDFromContext gets the reconcileID from the current context. diff --git a/pkg/internal/controller/controller_test.go b/pkg/internal/controller/controller_test.go index 3fde5da9c8..431aeae11e 100644 --- a/pkg/internal/controller/controller_test.go +++ b/pkg/internal/controller/controller_test.go @@ -43,6 +43,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/handler" ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics" "sigs.k8s.io/controller-runtime/pkg/internal/log" + "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" ) @@ -75,6 +76,7 @@ var _ = Describe("controller", func() { NewQueue: func(string, workqueue.TypedRateLimiter[reconcile.Request]) workqueue.TypedRateLimitingInterface[reconcile.Request] { return queue }, + MetricsProvider: metrics.NewPrometheusProvider(), LogConstructor: func(_ *reconcile.Request) logr.Logger { return log.RuntimeLog.WithName("controller").WithName("test") }, @@ -354,6 +356,7 @@ var _ = Describe("controller", func() { NewQueue: func(string, workqueue.TypedRateLimiter[TestRequest]) workqueue.TypedRateLimitingInterface[TestRequest] { return queue }, + MetricsProvider: metrics.NewPrometheusProvider(), LogConstructor: func(*TestRequest) logr.Logger { return log.RuntimeLog.WithName("controller").WithName("test") }, diff --git a/pkg/internal/controller/metrics/metrics.go b/pkg/internal/controller/metrics/metrics.go index 450e9ae25b..7f41ce01a8 100644 --- a/pkg/internal/controller/metrics/metrics.go +++ b/pkg/internal/controller/metrics/metrics.go @@ -20,8 +20,6 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/collectors" - "sigs.k8s.io/controller-runtime/pkg/metrics" ) var ( @@ -81,19 +79,3 @@ var ( Help: "Number of currently used workers per controller", }, []string{"controller"}) ) - -func init() { - metrics.Registry.MustRegister( - ReconcileTotal, - ReconcileErrors, - TerminalReconcileErrors, - ReconcilePanics, - ReconcileTime, - WorkerCount, - ActiveWorkers, - // expose process metrics like CPU, Memory, file descriptor usage etc. - collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}), - // expose all Go runtime metrics like GC stats, memory stats etc. - collectors.NewGoCollector(collectors.WithGoCollectorRuntimeMetrics(collectors.MetricsAll)), - ) -} diff --git a/pkg/manager/internal.go b/pkg/manager/internal.go index e5204a7506..e47f3082d2 100644 --- a/pkg/manager/internal.go +++ b/pkg/manager/internal.go @@ -35,7 +35,6 @@ import ( "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index c3ae317b04..7511ec9cc3 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -33,6 +33,7 @@ import ( "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/metrics" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/cache" @@ -223,6 +224,10 @@ type Options struct { // Metrics are the metricsserver.Options that will be used to create the metricsserver.Server. Metrics metricsserver.Options + // LeaderEelectionMetricProvider allows users to override the location where leader election metrics are emitted. + // By default, metrics are emitted to a pre-configured Prometheus registry + LeaderElectionMetricProvider metrics.LeaderElectionMetricsProvider + // HealthProbeBindAddress is the TCP address that the controller should bind to // for serving health probes // It can be set to "0" or "" to disable serving the health probe. @@ -401,6 +406,11 @@ func New(config *rest.Config, options Options) (Manager, error) { if err != nil { return nil, err } + leaderElectionMetricsProvider := options.LeaderElectionMetricProvider + if leaderElectionMetricsProvider == nil { + leaderElectionMetricsProvider = metrics.NewPrometheusProvider() + } + metrics.SetLeaderElectionProvider(leaderElectionMetricsProvider) // Create health probes listener. This will throw an error if the bind // address is invalid or already in use. diff --git a/pkg/metrics/leaderelection.go b/pkg/metrics/leaderelection.go index 61e1009d32..2b6b979052 100644 --- a/pkg/metrics/leaderelection.go +++ b/pkg/metrics/leaderelection.go @@ -1,47 +1,34 @@ package metrics import ( - "github.com/prometheus/client_golang/prometheus" "k8s.io/client-go/tools/leaderelection" ) -// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/clientgo/leaderelection -// which registers metrics to the k8s legacy Registry. We require very -// similar functionality, but must register metrics to a different Registry. - -var ( - leaderGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "leader_election_master_status", - Help: "Gauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.", - }, []string{"name"}) - - leaderSlowpathCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "leader_election_slowpath_total", - Help: "Total number of slow path exercised in renewing leader leases. 'name' is the string used to identify the lease. Please make sure to group by name.", - }, []string{"name"}) -) - -func init() { - Registry.MustRegister(leaderGauge) - leaderelection.SetProvider(leaderelectionMetricsProvider{}) +// SetLeaderElectionProvider sets the leader election provider leveraged by client-go +func SetLeaderElectionProvider(provider LeaderElectionMetricsProvider) { + leaderelection.SetProvider(leaderElectionMetricsProvider{provider: provider}) } -type leaderelectionMetricsProvider struct{} +type leaderElectionMetricsProvider struct { + provider LeaderElectionMetricsProvider +} -func (leaderelectionMetricsProvider) NewLeaderMetric() leaderelection.LeaderMetric { - return leaderElectionPrometheusAdapter{} +func (l leaderElectionMetricsProvider) NewLeaderMetric() leaderelection.LeaderMetric { + return leaderElectionMetricAdapter(l) } -type leaderElectionPrometheusAdapter struct{} +type leaderElectionMetricAdapter struct { + provider LeaderElectionMetricsProvider +} -func (s leaderElectionPrometheusAdapter) On(name string) { - leaderGauge.WithLabelValues(name).Set(1.0) +func (l leaderElectionMetricAdapter) On(name string) { + l.provider.LeaderGauge().Set(map[string]string{"name": name}, 1) } -func (s leaderElectionPrometheusAdapter) Off(name string) { - leaderGauge.WithLabelValues(name).Set(0.0) +func (l leaderElectionMetricAdapter) Off(name string) { + l.provider.LeaderGauge().Set(map[string]string{"name": name}, 0) } -func (leaderElectionPrometheusAdapter) SlowpathExercised(name string) { - leaderSlowpathCounter.WithLabelValues(name).Inc() +func (l leaderElectionMetricAdapter) SlowpathExercised(name string) { + l.provider.SlowpathExercised().Inc(map[string]string{"name": name}) } diff --git a/pkg/metrics/provider.go b/pkg/metrics/provider.go new file mode 100644 index 0000000000..d9b9448c5d --- /dev/null +++ b/pkg/metrics/provider.go @@ -0,0 +1,187 @@ +package metrics + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics" +) + +// LeaderElectionMetricsProvider is an interface that provides methods for firing leader election metrics +type LeaderElectionMetricsProvider interface { + LeaderGauge() GaugeMetric + SlowpathExercised() CounterMetric +} + +// ControllerMetricsProvider is an interface that provides methods for firing controller metrics +type ControllerMetricsProvider interface { + // ReconcileTotal is a prometheus counter metrics which holds the total + // number of reconciliations per controller. It has two labels. controller label refers + // to the controller name and result label refers to the reconcile result i.e + // success, error, requeue, requeue_after. + ReconcileTotal() CounterMetric + // ReconcileErrors is a prometheus counter metrics which holds the total + // number of errors from the Reconciler. + ReconcileErrors() CounterMetric + // TerminalReconcileErrors is a prometheus counter metrics which holds the total + // number of terminal errors from the Reconciler. + TerminalReconcileErrors() CounterMetric + // ReconcilePanics is a prometheus counter metrics which holds the total + // number of panics from the Reconciler. + ReconcilePanics() CounterMetric + // ReconcileTime is a prometheus metric which keeps track of the duration + // of reconciliations. + ReconcileTime() ObservationMetric + // WorkerCount is a prometheus metric which holds the number of + // concurrent reconciles per controller. + WorkerCount() GaugeMetric + // ActiveWorkers is a prometheus metric which holds the number + // of active workers per controller. + ActiveWorkers() GaugeMetric +} + +// ObservationMetric is a metric that stores the set of observed values +type ObservationMetric interface { + Observe(map[string]string, float64) +} + +// GaugeMetric is a metric that gets set and can be changed dynamically at runtime +type GaugeMetric interface { + Set(map[string]string, float64) + Add(map[string]string, float64) +} + +// CounterMetric is a metric that gets incremented monotonically +type CounterMetric interface { + Inc(map[string]string) + Add(map[string]string, float64) +} + +var once sync.Once + +// PrometheusProvider is a metrics.ControllerMetricsProvider and a metrics.LeaderElectionMetricsProvider +// that registers and fires prometheus metrics in response to leader election and controller events +type PrometheusProvider struct { + reconcileTotal *prometheus.CounterVec + reconcileErrors *prometheus.CounterVec + terminalReconcileErrors *prometheus.CounterVec + reconcilePanics *prometheus.CounterVec + reconcileTime *prometheus.HistogramVec + workerCount *prometheus.GaugeVec + activeWorkers *prometheus.GaugeVec + leaderGauge *prometheus.GaugeVec + leaderSlowpathCounter *prometheus.CounterVec +} + +// NewPrometheusProvider creates a PrometheusProvider +func NewPrometheusProvider() *PrometheusProvider { + leaderGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "leader_election_master_status", + Help: "Gauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.", + }, []string{"name"}) + leaderSlowpathCounter := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "leader_election_slowpath_total", + Help: "Total number of slow path exercised in renewing leader leases. 'name' is the string used to identify the lease. Please make sure to group by name.", + }, []string{"name"}) + once.Do(func() { + Registry.MustRegister( + metrics.ReconcileTotal, + metrics.ReconcileErrors, + metrics.TerminalReconcileErrors, + metrics.ReconcilePanics, + metrics.ReconcileTime, + metrics.WorkerCount, + metrics.ActiveWorkers, + leaderGauge, + leaderSlowpathCounter, + ) + }) + return &PrometheusProvider{ + reconcileTotal: metrics.ReconcileTotal, + reconcileErrors: metrics.ReconcileErrors, + terminalReconcileErrors: metrics.TerminalReconcileErrors, + reconcilePanics: metrics.ReconcilePanics, + reconcileTime: metrics.ReconcileTime, + workerCount: metrics.WorkerCount, + activeWorkers: metrics.ActiveWorkers, + leaderGauge: leaderGauge, + leaderSlowpathCounter: leaderSlowpathCounter, + } +} + +type prometheusCounterAdapter struct { + *prometheus.CounterVec +} + +func (p *prometheusCounterAdapter) Inc(labels map[string]string) { + p.With(labels).Inc() +} + +func (p *prometheusCounterAdapter) Add(labels map[string]string, val float64) { + p.With(labels).Add(val) +} + +type prometheusGaugeAdapter struct { + *prometheus.GaugeVec +} + +func (p *prometheusGaugeAdapter) Set(labels map[string]string, val float64) { + p.With(labels).Set(val) +} + +func (p *prometheusGaugeAdapter) Add(labels map[string]string, val float64) { + p.With(labels).Add(val) +} + +type prometheusHistogramAdapter struct { + *prometheus.HistogramVec +} + +func (p *prometheusHistogramAdapter) Observe(labels map[string]string, val float64) { + p.With(labels).Observe(val) +} + +// ReconcileTotal returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcileTotal() CounterMetric { + return &prometheusCounterAdapter{CounterVec: p.reconcileTotal} +} + +// ReconcileErrors returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcileErrors() CounterMetric { + return &prometheusCounterAdapter{CounterVec: p.reconcileErrors} +} + +// TerminalReconcileErrors returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) TerminalReconcileErrors() CounterMetric { + return &prometheusCounterAdapter{CounterVec: p.terminalReconcileErrors} +} + +// ReconcilePanics returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcilePanics() CounterMetric { + return &prometheusCounterAdapter{CounterVec: p.reconcilePanics} +} + +// ReconcileTime returns a Prometheus histogram that fulfills the ObservationMetric interface +func (p PrometheusProvider) ReconcileTime() ObservationMetric { + return &prometheusHistogramAdapter{HistogramVec: p.reconcileTime} +} + +// WorkerCount returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusProvider) WorkerCount() GaugeMetric { + return &prometheusGaugeAdapter{GaugeVec: p.workerCount} +} + +// ActiveWorkers returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusProvider) ActiveWorkers() GaugeMetric { + return &prometheusGaugeAdapter{GaugeVec: p.activeWorkers} +} + +// LeaderGauge returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusProvider) LeaderGauge() GaugeMetric { + return &prometheusGaugeAdapter{GaugeVec: p.leaderGauge} +} + +// SlowpathExercised returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) SlowpathExercised() CounterMetric { + return &prometheusCounterAdapter{CounterVec: p.leaderSlowpathCounter} +} diff --git a/pkg/metrics/registry.go b/pkg/metrics/registry.go index ce17124d53..000bba00f8 100644 --- a/pkg/metrics/registry.go +++ b/pkg/metrics/registry.go @@ -16,7 +16,10 @@ limitations under the License. package metrics -import "github.com/prometheus/client_golang/prometheus" +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" +) // RegistererGatherer combines both parts of the API of a Prometheus // registry, both the Registerer and the Gatherer interfaces. @@ -28,3 +31,11 @@ type RegistererGatherer interface { // Registry is a prometheus registry for storing metrics within the // controller-runtime. var Registry RegistererGatherer = prometheus.NewRegistry() + +func init() { + Registry.MustRegister( // expose process metrics like CPU, Memory, file descriptor usage etc. + collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}), + // expose all Go runtime metrics like GC stats, memory stats etc. + collectors.NewGoCollector(collectors.WithGoCollectorRuntimeMetrics(collectors.MetricsAll)), + ) +} From 9103ac460320e6d333573b64d4c6c866e42df7ea Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 9 May 2025 08:53:28 -0700 Subject: [PATCH 2/2] Use a global setter --- pkg/config/controller.go | 5 - pkg/controller/controller.go | 14 -- pkg/controller/metrics/metrics.go | 195 ++++++++++++++++++ pkg/controller/priorityqueue/metrics.go | 2 +- pkg/controller/priorityqueue/metrics_test.go | 2 +- pkg/controller/priorityqueue/priorityqueue.go | 4 +- pkg/internal/controller/controller.go | 43 ++-- pkg/internal/controller/controller_test.go | 36 ++-- pkg/internal/controller/metrics/metrics.go | 81 -------- pkg/internal/metrics/metrics.go | 68 ++++++ pkg/internal/metrics/workqueue.go | 170 --------------- pkg/manager/manager.go | 10 - pkg/metrics/leaderelection.go | 83 +++++++- pkg/metrics/provider.go | 187 ----------------- pkg/metrics/workqueue.go | 172 +++++++++++++++ 15 files changed, 554 insertions(+), 518 deletions(-) create mode 100644 pkg/controller/metrics/metrics.go delete mode 100644 pkg/internal/controller/metrics/metrics.go create mode 100644 pkg/internal/metrics/metrics.go delete mode 100644 pkg/internal/metrics/workqueue.go delete mode 100644 pkg/metrics/provider.go diff --git a/pkg/config/controller.go b/pkg/config/controller.go index 3d64d30e50..a5655593ef 100644 --- a/pkg/config/controller.go +++ b/pkg/config/controller.go @@ -20,7 +20,6 @@ import ( "time" "github.com/go-logr/logr" - "sigs.k8s.io/controller-runtime/pkg/metrics" ) // Controller contains configuration options for controllers. It only includes options @@ -67,10 +66,6 @@ type Controller struct { // Note: This flag is disabled by default until a future version. It's currently in beta. UsePriorityQueue *bool - // MetricsProvider allows users to override the location where controller metrics are emitted. - // By default, metrics are emitted to a pre-configured Prometheus registry - MetricsProvider metrics.ControllerMetricsProvider - // Logger is the logger controllers should use. Logger logr.Logger } diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 7a56aeeb26..9de959b48f 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -25,7 +25,6 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" "k8s.io/utils/ptr" - "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/controller/priorityqueue" @@ -85,10 +84,6 @@ type TypedOptions[request comparable] struct { // Logger will be used to build a default LogConstructor if unset. Logger logr.Logger - // MetricsProvider allows users to override the location where controller metrics are emitted. - // By default, metrics are emitted to a pre-configured Prometheus registry - MetricsProvider metrics.ControllerMetricsProvider - // LogConstructor is used to construct a logger used for this controller and passed // to each reconciliation via the context field. LogConstructor func(request *request) logr.Logger @@ -106,10 +101,6 @@ func (options *TypedOptions[request]) DefaultFromConfig(config config.Controller options.Logger = config.Logger } - if options.MetricsProvider == nil { - options.MetricsProvider = config.MetricsProvider - } - if options.SkipNameValidation == nil { options.SkipNameValidation = config.SkipNameValidation } @@ -205,10 +196,6 @@ func NewTypedUnmanaged[request comparable](name string, options TypedOptions[req } } - if options.MetricsProvider == nil { - options.MetricsProvider = metrics.NewPrometheusProvider() - } - if options.LogConstructor == nil { log := options.Logger.WithValues( "controller", name, @@ -263,7 +250,6 @@ func NewTypedUnmanaged[request comparable](name string, options TypedOptions[req MaxConcurrentReconciles: options.MaxConcurrentReconciles, CacheSyncTimeout: options.CacheSyncTimeout, Name: name, - MetricsProvider: options.MetricsProvider, LogConstructor: options.LogConstructor, RecoverPanic: options.RecoverPanic, LeaderElected: options.NeedLeaderElection, diff --git a/pkg/controller/metrics/metrics.go b/pkg/controller/metrics/metrics.go new file mode 100644 index 0000000000..c04c8aaad0 --- /dev/null +++ b/pkg/controller/metrics/metrics.go @@ -0,0 +1,195 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + internalmetrics "sigs.k8s.io/controller-runtime/pkg/internal/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + // reconcileTotal is a prometheus counter metrics which holds the total + // number of reconciliations per controller. It has two labels. controller label refers + // to the controller name and result label refers to the reconcile result i.e + // success, error, requeue, requeue_after. + reconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_total", + Help: "Total number of reconciliations per controller", + }, []string{"controller", "result"}) + + // reconcileErrors is a prometheus counter metrics which holds the total + // number of errors from the Reconciler. + reconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_errors_total", + Help: "Total number of reconciliation errors per controller", + }, []string{"controller"}) + + // terminalReconcileErrors is a prometheus counter metrics which holds the total + // number of terminal errors from the Reconciler. + terminalReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_terminal_reconcile_errors_total", + Help: "Total number of terminal reconciliation errors per controller", + }, []string{"controller"}) + + // reconcilePanics is a prometheus counter metrics which holds the total + // number of panics from the Reconciler. + reconcilePanics = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_panics_total", + Help: "Total number of reconciliation panics per controller", + }, []string{"controller"}) + + // reconcileTime is a prometheus metric which keeps track of the duration + // of reconciliations. + reconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "controller_runtime_reconcile_time_seconds", + Help: "Length of time per reconciliation per controller", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, + 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"controller"}) + + // workerCount is a prometheus metric which holds the number of + // concurrent reconciles per controller. + workerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "controller_runtime_max_concurrent_reconciles", + Help: "Maximum number of concurrent reconciles per controller", + }, []string{"controller"}) + + // activeWorkers is a prometheus metric which holds the number + // of active workers per controller. + activeWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "controller_runtime_active_workers", + Help: "Number of currently used workers per controller", + }, []string{"controller"}) +) + +// ControllerMetricsProvider is an interface that provides methods for firing controller metrics +type ControllerMetricsProvider interface { + // ReconcileTotal is a prometheus counter metrics which holds the total + // number of reconciliations per controller. It has two labels. controller label refers + // to the controller name and result label refers to the reconcile result i.e + // success, error, requeue, requeue_after. + ReconcileTotal() internalmetrics.CounterMetric + // ReconcileErrors is a prometheus counter metrics which holds the total + // number of errors from the Reconciler. + ReconcileErrors() internalmetrics.CounterMetric + // TerminalReconcileErrors is a prometheus counter metrics which holds the total + // number of terminal errors from the Reconciler. + TerminalReconcileErrors() internalmetrics.CounterMetric + // ReconcilePanics is a prometheus counter metrics which holds the total + // number of panics from the Reconciler. + ReconcilePanics() internalmetrics.CounterMetric + // ReconcileTime is a prometheus metric which keeps track of the duration + // of reconciliations. + ReconcileTime() internalmetrics.HistogramMetric + // WorkerCount is a prometheus metric which holds the number of + // concurrent reconciles per controller. + WorkerCount() internalmetrics.GaugeMetric + // ActiveWorkers is a prometheus metric which holds the number + // of active workers per controller. + ActiveWorkers() internalmetrics.GaugeMetric +} + +// PrometheusProvider is a metrics.ControllerMetricsProvider and a metrics.LeaderElectionMetricsProvider +// that registers and fires prometheus metrics in response to leader election and controller events +type PrometheusProvider struct { + reconcileTotal *prometheus.CounterVec + reconcileErrors *prometheus.CounterVec + terminalReconcileErrors *prometheus.CounterVec + reconcilePanics *prometheus.CounterVec + reconcileTime *prometheus.HistogramVec + workerCount *prometheus.GaugeVec + activeWorkers *prometheus.GaugeVec +} + +// NewPrometheusProvider creates a PrometheusProvider +func NewPrometheusProvider() *PrometheusProvider { + return &PrometheusProvider{ + reconcileTotal: reconcileTotal, + reconcileErrors: reconcileErrors, + terminalReconcileErrors: terminalReconcileErrors, + reconcilePanics: reconcilePanics, + reconcileTime: reconcileTime, + workerCount: workerCount, + activeWorkers: activeWorkers, + } +} + +// ReconcileTotal returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcileTotal() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcileTotal} +} + +// ReconcileErrors returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcileErrors() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcileErrors} +} + +// TerminalReconcileErrors returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) TerminalReconcileErrors() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.terminalReconcileErrors} +} + +// ReconcilePanics returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcilePanics() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcilePanics} +} + +// ReconcileTime returns a Prometheus histogram that fulfills the ObservationMetric interface +func (p PrometheusProvider) ReconcileTime() internalmetrics.HistogramMetric { + return &internalmetrics.PrometheusHistogramAdapter{HistogramVec: p.reconcileTime} +} + +// WorkerCount returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusProvider) WorkerCount() internalmetrics.GaugeMetric { + return &internalmetrics.PrometheusGaugeAdapter{GaugeVec: p.workerCount} +} + +// ActiveWorkers returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusProvider) ActiveWorkers() internalmetrics.GaugeMetric { + return &internalmetrics.PrometheusGaugeAdapter{GaugeVec: p.activeWorkers} +} + +func init() { + metrics.Registry.MustRegister( + reconcileTotal, + reconcileErrors, + terminalReconcileErrors, + reconcilePanics, + reconcileTime, + workerCount, + activeWorkers, + ) +} + +var controllerMetricsProvider ControllerMetricsProvider = NewPrometheusProvider() + +// SetControllerMetricsProvider assigns a provider to the ControllerMetricsProvider for exposing controller metrics. +// The PrometheusProvider will be used by default if the provider is not overridden +func SetControllerMetricsProvider(provider ControllerMetricsProvider) { + controllerMetricsProvider = provider +} + +// GetControllerMetricsProvider returns the controller metrics provider being used by the controller reconciliation +func GetControllerMetricsProvider() ControllerMetricsProvider { + return controllerMetricsProvider +} diff --git a/pkg/controller/priorityqueue/metrics.go b/pkg/controller/priorityqueue/metrics.go index 967a252dfb..d84a9b19c6 100644 --- a/pkg/controller/priorityqueue/metrics.go +++ b/pkg/controller/priorityqueue/metrics.go @@ -6,7 +6,7 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" - "sigs.k8s.io/controller-runtime/pkg/internal/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) // This file is mostly a copy of unexported code from diff --git a/pkg/controller/priorityqueue/metrics_test.go b/pkg/controller/priorityqueue/metrics_test.go index 3be3989d89..7292f5d81d 100644 --- a/pkg/controller/priorityqueue/metrics_test.go +++ b/pkg/controller/priorityqueue/metrics_test.go @@ -4,7 +4,7 @@ import ( "sync" "k8s.io/client-go/util/workqueue" - "sigs.k8s.io/controller-runtime/pkg/internal/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) func newFakeMetricsProvider() *fakeMetricsProvider { diff --git a/pkg/controller/priorityqueue/priorityqueue.go b/pkg/controller/priorityqueue/priorityqueue.go index c3f77a6f39..29e6321d1a 100644 --- a/pkg/controller/priorityqueue/priorityqueue.go +++ b/pkg/controller/priorityqueue/priorityqueue.go @@ -12,7 +12,7 @@ import ( "k8s.io/utils/clock" "k8s.io/utils/ptr" - "sigs.k8s.io/controller-runtime/pkg/internal/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) // AddOpts describes the options for adding items to the queue. @@ -56,7 +56,7 @@ func New[T comparable](name string, o ...Opt[T]) PriorityQueue[T] { } if opts.MetricProvider == nil { - opts.MetricProvider = metrics.WorkqueueMetricsProvider{} + opts.MetricProvider = metrics.PrometheusWorkqueueMetricsProvider{} } pq := &priorityqueue[T]{ diff --git a/pkg/internal/controller/controller.go b/pkg/internal/controller/controller.go index 27b84ebc23..85d8b07072 100644 --- a/pkg/internal/controller/controller.go +++ b/pkg/internal/controller/controller.go @@ -30,7 +30,7 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/client-go/util/workqueue" - "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/controller/metrics" "sigs.k8s.io/controller-runtime/pkg/controller/priorityqueue" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -89,9 +89,6 @@ type Controller[request comparable] struct { // outside the context of a reconciliation. LogConstructor func(request *request) logr.Logger - // MetricsProvider is used to route metrics that are fired due to controller reconciles - MetricsProvider metrics.ControllerMetricsProvider - // RecoverPanic indicates whether the panic caused by reconcile should be recovered. // Defaults to true. RecoverPanic *bool @@ -104,7 +101,7 @@ type Controller[request comparable] struct { func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ reconcile.Result, err error) { defer func() { if r := recover(); r != nil { - c.MetricsProvider.ReconcilePanics().Inc(map[string]string{labelKeyController: c.Name}) + metrics.GetControllerMetricsProvider().ReconcilePanics().Inc(map[string]string{labelKeyController: c.Name}) if c.RecoverPanic == nil || *c.RecoverPanic { for _, fn := range utilruntime.PanicHandlers { @@ -297,8 +294,8 @@ func (c *Controller[request]) processNextWorkItem(ctx context.Context) bool { // period. defer c.Queue.Done(obj) - c.MetricsProvider.ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, 1) - defer c.MetricsProvider.ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, -1) + metrics.GetControllerMetricsProvider().ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, 1) + defer metrics.GetControllerMetricsProvider().ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, -1) c.reconcileHandler(ctx, obj, priority) return true @@ -314,15 +311,15 @@ const ( ) func (c *Controller[request]) initMetrics() { - c.MetricsProvider.ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}, 0) - c.MetricsProvider.ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}, 0) - c.MetricsProvider.ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}, 0) - c.MetricsProvider.ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}, 0) - c.MetricsProvider.ReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0) - c.MetricsProvider.TerminalReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0) - c.MetricsProvider.ReconcilePanics().Add(map[string]string{labelKeyController: c.Name}, 0) - c.MetricsProvider.WorkerCount().Set(map[string]string{labelKeyController: c.Name}, float64(c.MaxConcurrentReconciles)) - c.MetricsProvider.ActiveWorkers().Set(map[string]string{labelKeyController: c.Name}, 0) + metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}, 0) + metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}, 0) + metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}, 0) + metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}, 0) + metrics.GetControllerMetricsProvider().ReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0) + metrics.GetControllerMetricsProvider().TerminalReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0) + metrics.GetControllerMetricsProvider().ReconcilePanics().Add(map[string]string{labelKeyController: c.Name}, 0) + metrics.GetControllerMetricsProvider().WorkerCount().Set(map[string]string{labelKeyController: c.Name}, float64(c.MaxConcurrentReconciles)) + metrics.GetControllerMetricsProvider().ActiveWorkers().Set(map[string]string{labelKeyController: c.Name}, 0) } func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, priority int) { @@ -346,12 +343,12 @@ func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, switch { case err != nil: if errors.Is(err, reconcile.TerminalError(nil)) { - c.MetricsProvider.TerminalReconcileErrors().Inc(map[string]string{"controller": c.Name}) + metrics.GetControllerMetricsProvider().TerminalReconcileErrors().Inc(map[string]string{"controller": c.Name}) } else { c.Queue.AddWithOpts(priorityqueue.AddOpts{RateLimited: true, Priority: priority}, req) } - c.MetricsProvider.ReconcileErrors().Inc(map[string]string{labelKeyController: c.Name}) - c.MetricsProvider.ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}) + metrics.GetControllerMetricsProvider().ReconcileErrors().Inc(map[string]string{labelKeyController: c.Name}) + metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}) if !result.IsZero() { log.Info("Warning: Reconciler returned both a non-zero result and a non-nil error. The result will always be ignored if the error is non-nil and the non-nil error causes requeuing with exponential backoff. For more details, see: https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile#Reconciler") } @@ -364,17 +361,17 @@ func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, // to result.RequestAfter c.Queue.Forget(req) c.Queue.AddWithOpts(priorityqueue.AddOpts{After: result.RequeueAfter, Priority: priority}, req) - c.MetricsProvider.ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}) + metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}) case result.Requeue: //nolint: staticcheck // We have to handle it until it is removed log.V(5).Info("Reconcile done, requeueing") c.Queue.AddWithOpts(priorityqueue.AddOpts{RateLimited: true, Priority: priority}, req) - c.MetricsProvider.ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}) + metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}) default: log.V(5).Info("Reconcile successful") // Finally, if no error occurs we Forget this item so it does not // get queued again until another change happens. c.Queue.Forget(req) - c.MetricsProvider.ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}) + metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}) } } @@ -385,7 +382,7 @@ func (c *Controller[request]) GetLogger() logr.Logger { // updateMetrics updates prometheus metrics within the controller. func (c *Controller[request]) updateMetrics(reconcileTime time.Duration) { - c.MetricsProvider.ReconcileTime().Observe(map[string]string{labelKeyController: c.Name}, reconcileTime.Seconds()) + metrics.GetControllerMetricsProvider().ReconcileTime().Observe(map[string]string{labelKeyController: c.Name}, reconcileTime.Seconds()) } // ReconcileIDFromContext gets the reconcileID from the current context. diff --git a/pkg/internal/controller/controller_test.go b/pkg/internal/controller/controller_test.go index 431aeae11e..e8362b34e4 100644 --- a/pkg/internal/controller/controller_test.go +++ b/pkg/internal/controller/controller_test.go @@ -38,12 +38,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/cache/informertest" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllertest" + ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/controller/metrics" "sigs.k8s.io/controller-runtime/pkg/controller/priorityqueue" "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" - ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics" "sigs.k8s.io/controller-runtime/pkg/internal/log" - "sigs.k8s.io/controller-runtime/pkg/metrics" + intmetrics "sigs.k8s.io/controller-runtime/pkg/internal/metrics" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" ) @@ -76,7 +76,6 @@ var _ = Describe("controller", func() { NewQueue: func(string, workqueue.TypedRateLimiter[reconcile.Request]) workqueue.TypedRateLimitingInterface[reconcile.Request] { return queue }, - MetricsProvider: metrics.NewPrometheusProvider(), LogConstructor: func(_ *reconcile.Request) logr.Logger { return log.RuntimeLog.WithName("controller").WithName("test") }, @@ -356,7 +355,6 @@ var _ = Describe("controller", func() { NewQueue: func(string, workqueue.TypedRateLimiter[TestRequest]) workqueue.TypedRateLimitingInterface[TestRequest] { return queue }, - MetricsProvider: metrics.NewPrometheusProvider(), LogConstructor: func(*TestRequest) logr.Logger { return log.RuntimeLog.WithName("controller").WithName("test") }, @@ -813,13 +811,13 @@ var _ = Describe("controller", func() { var reconcileTotal dto.Metric BeforeEach(func() { - ctrlmetrics.ReconcileTotal.Reset() + ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).CounterVec.Reset() reconcileTotal.Reset() }) It("should get updated on successful reconciliation", func() { Expect(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal)).To(Succeed()) if reconcileTotal.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile total not reset") } @@ -838,7 +836,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{}, nil) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal)).To(Succeed()) if actual := reconcileTotal.GetCounter().GetValue(); actual != 1.0 { return fmt.Errorf("metric reconcile total expected: %v and got: %v", 1.0, actual) } @@ -848,7 +846,7 @@ var _ = Describe("controller", func() { It("should get updated on reconcile errors", func() { Expect(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal)).To(Succeed()) if reconcileTotal.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile total not reset") } @@ -867,7 +865,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{}, fmt.Errorf("expected error: reconcile")) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal)).To(Succeed()) if actual := reconcileTotal.GetCounter().GetValue(); actual != 1.0 { return fmt.Errorf("metric reconcile total expected: %v and got: %v", 1.0, actual) } @@ -877,7 +875,7 @@ var _ = Describe("controller", func() { It("should get updated when reconcile returns with retry enabled", func() { Expect(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "retry").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "retry").Write(&reconcileTotal)).To(Succeed()) if reconcileTotal.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile total not reset") } @@ -897,7 +895,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{Requeue: true}, nil) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "requeue").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "requeue").Write(&reconcileTotal)).To(Succeed()) if actual := reconcileTotal.GetCounter().GetValue(); actual != 1.0 { return fmt.Errorf("metric reconcile total expected: %v and got: %v", 1.0, actual) } @@ -907,7 +905,7 @@ var _ = Describe("controller", func() { It("should get updated when reconcile returns with retryAfter enabled", func() { Expect(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "retry_after").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "retry_after").Write(&reconcileTotal)).To(Succeed()) if reconcileTotal.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile total not reset") } @@ -926,7 +924,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{RequeueAfter: 5 * time.Hour}, nil) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "requeue_after").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "requeue_after").Write(&reconcileTotal)).To(Succeed()) if actual := reconcileTotal.GetCounter().GetValue(); actual != 1.0 { return fmt.Errorf("metric reconcile total expected: %v and got: %v", 1.0, actual) } @@ -938,9 +936,9 @@ var _ = Describe("controller", func() { Context("should update prometheus metrics", func() { It("should requeue a Request if there is an error and continue processing items", func() { var reconcileErrs dto.Metric - ctrlmetrics.ReconcileErrors.Reset() + ctrlmetrics.GetControllerMetricsProvider().ReconcileErrors().(*intmetrics.PrometheusCounterAdapter).Reset() Expect(func() error { - Expect(ctrlmetrics.ReconcileErrors.WithLabelValues(ctrl.Name).Write(&reconcileErrs)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileErrors().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name).Write(&reconcileErrs)).To(Succeed()) if reconcileErrs.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile errors not reset") } @@ -959,7 +957,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{}, fmt.Errorf("expected error: reconcile")) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileErrors.WithLabelValues(ctrl.Name).Write(&reconcileErrs)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileErrors().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name).Write(&reconcileErrs)).To(Succeed()) if reconcileErrs.GetCounter().GetValue() != 1.0 { return fmt.Errorf("metrics not updated") } @@ -977,10 +975,10 @@ var _ = Describe("controller", func() { It("should add a reconcile time to the reconcile time histogram", func() { var reconcileTime dto.Metric - ctrlmetrics.ReconcileTime.Reset() + ctrlmetrics.GetControllerMetricsProvider().ReconcileTime().(*intmetrics.PrometheusHistogramAdapter).Reset() Expect(func() error { - histObserver := ctrlmetrics.ReconcileTime.WithLabelValues(ctrl.Name) + histObserver := ctrlmetrics.GetControllerMetricsProvider().ReconcileTime().(*intmetrics.PrometheusHistogramAdapter).WithLabelValues(ctrl.Name) hist := histObserver.(prometheus.Histogram) Expect(hist.Write(&reconcileTime)).To(Succeed()) if reconcileTime.GetHistogram().GetSampleCount() != uint64(0) { @@ -1006,7 +1004,7 @@ var _ = Describe("controller", func() { Eventually(func() int { return queue.NumRequeues(request) }).Should(Equal(0)) Eventually(func() error { - histObserver := ctrlmetrics.ReconcileTime.WithLabelValues(ctrl.Name) + histObserver := ctrlmetrics.GetControllerMetricsProvider().ReconcileTime().(*intmetrics.PrometheusHistogramAdapter).WithLabelValues(ctrl.Name) hist := histObserver.(prometheus.Histogram) Expect(hist.Write(&reconcileTime)).To(Succeed()) if reconcileTime.GetHistogram().GetSampleCount() == uint64(0) { diff --git a/pkg/internal/controller/metrics/metrics.go b/pkg/internal/controller/metrics/metrics.go deleted file mode 100644 index 7f41ce01a8..0000000000 --- a/pkg/internal/controller/metrics/metrics.go +++ /dev/null @@ -1,81 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" -) - -var ( - // ReconcileTotal is a prometheus counter metrics which holds the total - // number of reconciliations per controller. It has two labels. controller label refers - // to the controller name and result label refers to the reconcile result i.e - // success, error, requeue, requeue_after. - ReconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "controller_runtime_reconcile_total", - Help: "Total number of reconciliations per controller", - }, []string{"controller", "result"}) - - // ReconcileErrors is a prometheus counter metrics which holds the total - // number of errors from the Reconciler. - ReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "controller_runtime_reconcile_errors_total", - Help: "Total number of reconciliation errors per controller", - }, []string{"controller"}) - - // TerminalReconcileErrors is a prometheus counter metrics which holds the total - // number of terminal errors from the Reconciler. - TerminalReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "controller_runtime_terminal_reconcile_errors_total", - Help: "Total number of terminal reconciliation errors per controller", - }, []string{"controller"}) - - // ReconcilePanics is a prometheus counter metrics which holds the total - // number of panics from the Reconciler. - ReconcilePanics = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "controller_runtime_reconcile_panics_total", - Help: "Total number of reconciliation panics per controller", - }, []string{"controller"}) - - // ReconcileTime is a prometheus metric which keeps track of the duration - // of reconciliations. - ReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "controller_runtime_reconcile_time_seconds", - Help: "Length of time per reconciliation per controller", - Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, - 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, - NativeHistogramBucketFactor: 1.1, - NativeHistogramMaxBucketNumber: 100, - NativeHistogramMinResetDuration: 1 * time.Hour, - }, []string{"controller"}) - - // WorkerCount is a prometheus metric which holds the number of - // concurrent reconciles per controller. - WorkerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "controller_runtime_max_concurrent_reconciles", - Help: "Maximum number of concurrent reconciles per controller", - }, []string{"controller"}) - - // ActiveWorkers is a prometheus metric which holds the number - // of active workers per controller. - ActiveWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "controller_runtime_active_workers", - Help: "Number of currently used workers per controller", - }, []string{"controller"}) -) diff --git a/pkg/internal/metrics/metrics.go b/pkg/internal/metrics/metrics.go new file mode 100644 index 0000000000..826c7d75f0 --- /dev/null +++ b/pkg/internal/metrics/metrics.go @@ -0,0 +1,68 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import "github.com/prometheus/client_golang/prometheus" + +// HistogramMetric is a metric that stores the set of observed values +type HistogramMetric interface { + Observe(map[string]string, float64) +} + +// GaugeMetric is a metric that gets set and can be changed dynamically at runtime +type GaugeMetric interface { + Set(map[string]string, float64) + Add(map[string]string, float64) +} + +// CounterMetric is a metric that gets incremented monotonically +type CounterMetric interface { + Inc(map[string]string) + Add(map[string]string, float64) +} + +type PrometheusCounterAdapter struct { + *prometheus.CounterVec +} + +func (p *PrometheusCounterAdapter) Inc(labels map[string]string) { + p.With(labels).Inc() +} + +func (p *PrometheusCounterAdapter) Add(labels map[string]string, val float64) { + p.With(labels).Add(val) +} + +type PrometheusGaugeAdapter struct { + *prometheus.GaugeVec +} + +func (p *PrometheusGaugeAdapter) Set(labels map[string]string, val float64) { + p.With(labels).Set(val) +} + +func (p *PrometheusGaugeAdapter) Add(labels map[string]string, val float64) { + p.With(labels).Add(val) +} + +type PrometheusHistogramAdapter struct { + *prometheus.HistogramVec +} + +func (p *PrometheusHistogramAdapter) Observe(labels map[string]string, val float64) { + p.With(labels).Observe(val) +} diff --git a/pkg/internal/metrics/workqueue.go b/pkg/internal/metrics/workqueue.go deleted file mode 100644 index 402319817b..0000000000 --- a/pkg/internal/metrics/workqueue.go +++ /dev/null @@ -1,170 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "strconv" - "time" - - "github.com/prometheus/client_golang/prometheus" - "k8s.io/client-go/util/workqueue" - "sigs.k8s.io/controller-runtime/pkg/metrics" -) - -// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/workqueue -// which registers metrics to the k8s legacy Registry. We require very -// similar functionality, but must register metrics to a different Registry. - -// Metrics subsystem and all keys used by the workqueue. -const ( - WorkQueueSubsystem = metrics.WorkQueueSubsystem - DepthKey = metrics.DepthKey - AddsKey = metrics.AddsKey - QueueLatencyKey = metrics.QueueLatencyKey - WorkDurationKey = metrics.WorkDurationKey - UnfinishedWorkKey = metrics.UnfinishedWorkKey - LongestRunningProcessorKey = metrics.LongestRunningProcessorKey - RetriesKey = metrics.RetriesKey -) - -var ( - depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: DepthKey, - Help: "Current depth of workqueue by workqueue and priority", - }, []string{"name", "controller", "priority"}) - - adds = prometheus.NewCounterVec(prometheus.CounterOpts{ - Subsystem: WorkQueueSubsystem, - Name: AddsKey, - Help: "Total number of adds handled by workqueue", - }, []string{"name", "controller"}) - - latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Subsystem: WorkQueueSubsystem, - Name: QueueLatencyKey, - Help: "How long in seconds an item stays in workqueue before being requested", - Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), - NativeHistogramBucketFactor: 1.1, - NativeHistogramMaxBucketNumber: 100, - NativeHistogramMinResetDuration: 1 * time.Hour, - }, []string{"name", "controller"}) - - workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Subsystem: WorkQueueSubsystem, - Name: WorkDurationKey, - Help: "How long in seconds processing an item from workqueue takes.", - Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), - NativeHistogramBucketFactor: 1.1, - NativeHistogramMaxBucketNumber: 100, - NativeHistogramMinResetDuration: 1 * time.Hour, - }, []string{"name", "controller"}) - - unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: UnfinishedWorkKey, - Help: "How many seconds of work has been done that " + - "is in progress and hasn't been observed by work_duration. Large " + - "values indicate stuck threads. One can deduce the number of stuck " + - "threads by observing the rate at which this increases.", - }, []string{"name", "controller"}) - - longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: LongestRunningProcessorKey, - Help: "How many seconds has the longest running " + - "processor for workqueue been running.", - }, []string{"name", "controller"}) - - retries = prometheus.NewCounterVec(prometheus.CounterOpts{ - Subsystem: WorkQueueSubsystem, - Name: RetriesKey, - Help: "Total number of retries handled by workqueue", - }, []string{"name", "controller"}) -) - -func init() { - metrics.Registry.MustRegister(depth) - metrics.Registry.MustRegister(adds) - metrics.Registry.MustRegister(latency) - metrics.Registry.MustRegister(workDuration) - metrics.Registry.MustRegister(unfinished) - metrics.Registry.MustRegister(longestRunningProcessor) - metrics.Registry.MustRegister(retries) - - workqueue.SetProvider(WorkqueueMetricsProvider{}) -} - -type WorkqueueMetricsProvider struct{} - -func (WorkqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric { - return depth.WithLabelValues(name, name, "") // no priority -} - -func (WorkqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric { - return adds.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric { - return latency.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric { - return workDuration.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric { - return unfinished.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric { - return longestRunningProcessor.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric { - return retries.WithLabelValues(name, name) -} - -type MetricsProviderWithPriority interface { - workqueue.MetricsProvider - - NewDepthMetricWithPriority(name string) DepthMetricWithPriority -} - -// DepthMetricWithPriority represents a depth metric with priority. -type DepthMetricWithPriority interface { - Inc(priority int) - Dec(priority int) -} - -var _ MetricsProviderWithPriority = WorkqueueMetricsProvider{} - -func (WorkqueueMetricsProvider) NewDepthMetricWithPriority(name string) DepthMetricWithPriority { - return &depthWithPriorityMetric{lvs: []string{name, name}} -} - -type depthWithPriorityMetric struct { - lvs []string -} - -func (g *depthWithPriorityMetric) Inc(priority int) { - depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Inc() -} - -func (g *depthWithPriorityMetric) Dec(priority int) { - depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Dec() -} diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 7511ec9cc3..c3ae317b04 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -33,7 +33,6 @@ import ( "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" - "sigs.k8s.io/controller-runtime/pkg/metrics" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/cache" @@ -224,10 +223,6 @@ type Options struct { // Metrics are the metricsserver.Options that will be used to create the metricsserver.Server. Metrics metricsserver.Options - // LeaderEelectionMetricProvider allows users to override the location where leader election metrics are emitted. - // By default, metrics are emitted to a pre-configured Prometheus registry - LeaderElectionMetricProvider metrics.LeaderElectionMetricsProvider - // HealthProbeBindAddress is the TCP address that the controller should bind to // for serving health probes // It can be set to "0" or "" to disable serving the health probe. @@ -406,11 +401,6 @@ func New(config *rest.Config, options Options) (Manager, error) { if err != nil { return nil, err } - leaderElectionMetricsProvider := options.LeaderElectionMetricProvider - if leaderElectionMetricsProvider == nil { - leaderElectionMetricsProvider = metrics.NewPrometheusProvider() - } - metrics.SetLeaderElectionProvider(leaderElectionMetricsProvider) // Create health probes listener. This will throw an error if the bind // address is invalid or already in use. diff --git a/pkg/metrics/leaderelection.go b/pkg/metrics/leaderelection.go index 2b6b979052..89f500b78d 100644 --- a/pkg/metrics/leaderelection.go +++ b/pkg/metrics/leaderelection.go @@ -1,19 +1,92 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package metrics import ( + "github.com/prometheus/client_golang/prometheus" "k8s.io/client-go/tools/leaderelection" + internalmetrics "sigs.k8s.io/controller-runtime/pkg/internal/metrics" +) + +var ( + leaderGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "leader_election_master_status", + Help: "Gauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.", + }, []string{"name"}) + leaderSlowpathCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "leader_election_slowpath_total", + Help: "Total number of slow path exercised in renewing leader leases. 'name' is the string used to identify the lease. Please make sure to group by name.", + }, []string{"name"}) ) -// SetLeaderElectionProvider sets the leader election provider leveraged by client-go -func SetLeaderElectionProvider(provider LeaderElectionMetricsProvider) { - leaderelection.SetProvider(leaderElectionMetricsProvider{provider: provider}) +func init() { + Registry.MustRegister(leaderGauge, leaderSlowpathCounter) + SetLeaderElectionMetricsProvider(NewPrometheusLeaderElectionMetricsProvider()) +} + +var leaderElectionMetricsProvider LeaderElectionMetricsProvider + +// SetLeaderElectionMetricsProvider sets the leader election provider leveraged by client-go +func SetLeaderElectionMetricsProvider(provider LeaderElectionMetricsProvider) { + leaderElectionMetricsProvider = provider + leaderelection.SetProvider(leaderElectionMetricsInternalProvider{provider: provider}) +} + +// GetLeaderElectionMetricsProvider returns the leader election metrics provider +func GetLeaderElectionMetricsProvider() LeaderElectionMetricsProvider { + return leaderElectionMetricsProvider +} + +// LeaderElectionMetricsProvider is an interface that provides methods for firing leader election metrics +type LeaderElectionMetricsProvider interface { + LeaderGauge() internalmetrics.GaugeMetric + SlowpathExercised() internalmetrics.CounterMetric +} + +// PrometheusLeaderElectionMetricsProvider is a metrics.LeaderElectionMetricsProvider +// that fires prometheus metrics in response to leader election and controller events +type PrometheusLeaderElectionMetricsProvider struct { + leaderGauge *prometheus.GaugeVec + leaderSlowpathCounter *prometheus.CounterVec +} + +// NewPrometheusLeaderElectionMetricsProvider creates a PrometheusLeaderElectionMetricsProvider +func NewPrometheusLeaderElectionMetricsProvider() *PrometheusLeaderElectionMetricsProvider { + return &PrometheusLeaderElectionMetricsProvider{ + leaderGauge: leaderGauge, + leaderSlowpathCounter: leaderSlowpathCounter, + } +} + +// LeaderGauge returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusLeaderElectionMetricsProvider) LeaderGauge() internalmetrics.GaugeMetric { + return &internalmetrics.PrometheusGaugeAdapter{GaugeVec: p.leaderGauge} +} + +// SlowpathExercised returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusLeaderElectionMetricsProvider) SlowpathExercised() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.leaderSlowpathCounter} } -type leaderElectionMetricsProvider struct { +type leaderElectionMetricsInternalProvider struct { provider LeaderElectionMetricsProvider } -func (l leaderElectionMetricsProvider) NewLeaderMetric() leaderelection.LeaderMetric { +func (l leaderElectionMetricsInternalProvider) NewLeaderMetric() leaderelection.LeaderMetric { return leaderElectionMetricAdapter(l) } diff --git a/pkg/metrics/provider.go b/pkg/metrics/provider.go deleted file mode 100644 index d9b9448c5d..0000000000 --- a/pkg/metrics/provider.go +++ /dev/null @@ -1,187 +0,0 @@ -package metrics - -import ( - "sync" - - "github.com/prometheus/client_golang/prometheus" - "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics" -) - -// LeaderElectionMetricsProvider is an interface that provides methods for firing leader election metrics -type LeaderElectionMetricsProvider interface { - LeaderGauge() GaugeMetric - SlowpathExercised() CounterMetric -} - -// ControllerMetricsProvider is an interface that provides methods for firing controller metrics -type ControllerMetricsProvider interface { - // ReconcileTotal is a prometheus counter metrics which holds the total - // number of reconciliations per controller. It has two labels. controller label refers - // to the controller name and result label refers to the reconcile result i.e - // success, error, requeue, requeue_after. - ReconcileTotal() CounterMetric - // ReconcileErrors is a prometheus counter metrics which holds the total - // number of errors from the Reconciler. - ReconcileErrors() CounterMetric - // TerminalReconcileErrors is a prometheus counter metrics which holds the total - // number of terminal errors from the Reconciler. - TerminalReconcileErrors() CounterMetric - // ReconcilePanics is a prometheus counter metrics which holds the total - // number of panics from the Reconciler. - ReconcilePanics() CounterMetric - // ReconcileTime is a prometheus metric which keeps track of the duration - // of reconciliations. - ReconcileTime() ObservationMetric - // WorkerCount is a prometheus metric which holds the number of - // concurrent reconciles per controller. - WorkerCount() GaugeMetric - // ActiveWorkers is a prometheus metric which holds the number - // of active workers per controller. - ActiveWorkers() GaugeMetric -} - -// ObservationMetric is a metric that stores the set of observed values -type ObservationMetric interface { - Observe(map[string]string, float64) -} - -// GaugeMetric is a metric that gets set and can be changed dynamically at runtime -type GaugeMetric interface { - Set(map[string]string, float64) - Add(map[string]string, float64) -} - -// CounterMetric is a metric that gets incremented monotonically -type CounterMetric interface { - Inc(map[string]string) - Add(map[string]string, float64) -} - -var once sync.Once - -// PrometheusProvider is a metrics.ControllerMetricsProvider and a metrics.LeaderElectionMetricsProvider -// that registers and fires prometheus metrics in response to leader election and controller events -type PrometheusProvider struct { - reconcileTotal *prometheus.CounterVec - reconcileErrors *prometheus.CounterVec - terminalReconcileErrors *prometheus.CounterVec - reconcilePanics *prometheus.CounterVec - reconcileTime *prometheus.HistogramVec - workerCount *prometheus.GaugeVec - activeWorkers *prometheus.GaugeVec - leaderGauge *prometheus.GaugeVec - leaderSlowpathCounter *prometheus.CounterVec -} - -// NewPrometheusProvider creates a PrometheusProvider -func NewPrometheusProvider() *PrometheusProvider { - leaderGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "leader_election_master_status", - Help: "Gauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.", - }, []string{"name"}) - leaderSlowpathCounter := prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "leader_election_slowpath_total", - Help: "Total number of slow path exercised in renewing leader leases. 'name' is the string used to identify the lease. Please make sure to group by name.", - }, []string{"name"}) - once.Do(func() { - Registry.MustRegister( - metrics.ReconcileTotal, - metrics.ReconcileErrors, - metrics.TerminalReconcileErrors, - metrics.ReconcilePanics, - metrics.ReconcileTime, - metrics.WorkerCount, - metrics.ActiveWorkers, - leaderGauge, - leaderSlowpathCounter, - ) - }) - return &PrometheusProvider{ - reconcileTotal: metrics.ReconcileTotal, - reconcileErrors: metrics.ReconcileErrors, - terminalReconcileErrors: metrics.TerminalReconcileErrors, - reconcilePanics: metrics.ReconcilePanics, - reconcileTime: metrics.ReconcileTime, - workerCount: metrics.WorkerCount, - activeWorkers: metrics.ActiveWorkers, - leaderGauge: leaderGauge, - leaderSlowpathCounter: leaderSlowpathCounter, - } -} - -type prometheusCounterAdapter struct { - *prometheus.CounterVec -} - -func (p *prometheusCounterAdapter) Inc(labels map[string]string) { - p.With(labels).Inc() -} - -func (p *prometheusCounterAdapter) Add(labels map[string]string, val float64) { - p.With(labels).Add(val) -} - -type prometheusGaugeAdapter struct { - *prometheus.GaugeVec -} - -func (p *prometheusGaugeAdapter) Set(labels map[string]string, val float64) { - p.With(labels).Set(val) -} - -func (p *prometheusGaugeAdapter) Add(labels map[string]string, val float64) { - p.With(labels).Add(val) -} - -type prometheusHistogramAdapter struct { - *prometheus.HistogramVec -} - -func (p *prometheusHistogramAdapter) Observe(labels map[string]string, val float64) { - p.With(labels).Observe(val) -} - -// ReconcileTotal returns a Prometheus counter that fulfills the CounterMetric interface -func (p PrometheusProvider) ReconcileTotal() CounterMetric { - return &prometheusCounterAdapter{CounterVec: p.reconcileTotal} -} - -// ReconcileErrors returns a Prometheus counter that fulfills the CounterMetric interface -func (p PrometheusProvider) ReconcileErrors() CounterMetric { - return &prometheusCounterAdapter{CounterVec: p.reconcileErrors} -} - -// TerminalReconcileErrors returns a Prometheus counter that fulfills the CounterMetric interface -func (p PrometheusProvider) TerminalReconcileErrors() CounterMetric { - return &prometheusCounterAdapter{CounterVec: p.terminalReconcileErrors} -} - -// ReconcilePanics returns a Prometheus counter that fulfills the CounterMetric interface -func (p PrometheusProvider) ReconcilePanics() CounterMetric { - return &prometheusCounterAdapter{CounterVec: p.reconcilePanics} -} - -// ReconcileTime returns a Prometheus histogram that fulfills the ObservationMetric interface -func (p PrometheusProvider) ReconcileTime() ObservationMetric { - return &prometheusHistogramAdapter{HistogramVec: p.reconcileTime} -} - -// WorkerCount returns a Prometheus gauge that fulfills the GaugeMetric interface -func (p PrometheusProvider) WorkerCount() GaugeMetric { - return &prometheusGaugeAdapter{GaugeVec: p.workerCount} -} - -// ActiveWorkers returns a Prometheus gauge that fulfills the GaugeMetric interface -func (p PrometheusProvider) ActiveWorkers() GaugeMetric { - return &prometheusGaugeAdapter{GaugeVec: p.activeWorkers} -} - -// LeaderGauge returns a Prometheus gauge that fulfills the GaugeMetric interface -func (p PrometheusProvider) LeaderGauge() GaugeMetric { - return &prometheusGaugeAdapter{GaugeVec: p.leaderGauge} -} - -// SlowpathExercised returns a Prometheus counter that fulfills the CounterMetric interface -func (p PrometheusProvider) SlowpathExercised() CounterMetric { - return &prometheusCounterAdapter{CounterVec: p.leaderSlowpathCounter} -} diff --git a/pkg/metrics/workqueue.go b/pkg/metrics/workqueue.go index cd7ccc773e..b606caa13f 100644 --- a/pkg/metrics/workqueue.go +++ b/pkg/metrics/workqueue.go @@ -16,6 +16,18 @@ limitations under the License. package metrics +import ( + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/client-go/util/workqueue" +) + +// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/workqueue +// which registers metrics to the k8s legacy Registry. We require very +// similar functionality, but must register metrics to a different Registry. + // Metrics subsystem and all keys used by the workqueue. const ( WorkQueueSubsystem = "workqueue" @@ -27,3 +39,163 @@ const ( LongestRunningProcessorKey = "longest_running_processor_seconds" RetriesKey = "retries_total" ) + +var ( + depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: DepthKey, + Help: "Current depth of workqueue by workqueue and priority", + }, []string{"name", "controller", "priority"}) + + adds = prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: WorkQueueSubsystem, + Name: AddsKey, + Help: "Total number of adds handled by workqueue", + }, []string{"name", "controller"}) + + latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: WorkQueueSubsystem, + Name: QueueLatencyKey, + Help: "How long in seconds an item stays in workqueue before being requested", + Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"name", "controller"}) + + workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: WorkQueueSubsystem, + Name: WorkDurationKey, + Help: "How long in seconds processing an item from workqueue takes.", + Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"name", "controller"}) + + unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: UnfinishedWorkKey, + Help: "How many seconds of work has been done that " + + "is in progress and hasn't been observed by work_duration. Large " + + "values indicate stuck threads. One can deduce the number of stuck " + + "threads by observing the rate at which this increases.", + }, []string{"name", "controller"}) + + longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: LongestRunningProcessorKey, + Help: "How many seconds has the longest running " + + "processor for workqueue been running.", + }, []string{"name", "controller"}) + + retries = prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: WorkQueueSubsystem, + Name: RetriesKey, + Help: "Total number of retries handled by workqueue", + }, []string{"name", "controller"}) +) + +func init() { + Registry.MustRegister( + depth, + adds, + latency, + workDuration, + unfinished, + longestRunningProcessor, + retries, + ) + SetWorkqueueMetricsProvider(NewPrometheusWorkqueueMetricsProvider()) +} + +var workqueueMetricsProvider workqueue.MetricsProvider + +// SetWorkqueueMetricsProvider sets the workqueue metrics provider leveraged by client-go +func SetWorkqueueMetricsProvider(provider workqueue.MetricsProvider) { + workqueueMetricsProvider = provider + workqueue.SetProvider(provider) +} + +// GetWorkqueueMetricsProvider returns the workqueue metrics provider +func GetWorkqueueMetricsProvider() workqueue.MetricsProvider { + return workqueueMetricsProvider +} + +// PrometheusWorkqueueMetricsProvider implements the metrics provider for exposing workqueue metrics from client-go +type PrometheusWorkqueueMetricsProvider struct{} + +// NewPrometheusWorkqueueMetricsProvider returns a new PrometheusWorkqueueMetricsProvider +func NewPrometheusWorkqueueMetricsProvider() *PrometheusWorkqueueMetricsProvider { + return &PrometheusWorkqueueMetricsProvider{} +} + +// NewDepthMetric creates a Gauge metric from the depth GaugeVec +func (PrometheusWorkqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric { + return depth.WithLabelValues(name, name, "") // no priority +} + +// NewAddsMetric creates a Counter metric from the adds CounterVec +func (PrometheusWorkqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric { + return adds.WithLabelValues(name, name) +} + +// NewLatencyMetric creates a Histogram metric from the latency HistogramVec +func (PrometheusWorkqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric { + return latency.WithLabelValues(name, name) +} + +// NewWorkDurationMetric creates a Histogram metric from the workDuration HistogramVec +func (PrometheusWorkqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric { + return workDuration.WithLabelValues(name, name) +} + +// NewUnfinishedWorkSecondsMetric creates a Gauge metric from the unfinished GaugeVec +func (PrometheusWorkqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric { + return unfinished.WithLabelValues(name, name) +} + +// NewLongestRunningProcessorSecondsMetric creates a Gauge metric from the longestRunningProcessor GaugeVec +func (PrometheusWorkqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric { + return longestRunningProcessor.WithLabelValues(name, name) +} + +// NewRetriesMetric creates a Counter metric from the retries CounterVec +func (PrometheusWorkqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric { + return retries.WithLabelValues(name, name) +} + +// MetricsProviderWithPriority implements the MetricsProvider and adds a NewDepthMetricWithPriority interface method +// +//nolint:revive +type MetricsProviderWithPriority interface { + workqueue.MetricsProvider + + // NewDepthMetricWithPriority creates a implementation of DepthMetricWithPriority + NewDepthMetricWithPriority(name string) DepthMetricWithPriority +} + +// DepthMetricWithPriority represents a depth metric with priority. +type DepthMetricWithPriority interface { + Inc(priority int) + Dec(priority int) +} + +var _ MetricsProviderWithPriority = PrometheusWorkqueueMetricsProvider{} + +// NewDepthMetricWithPriority returns a DepthMetricWithPriority from the WorkqueueMetricsProvider +func (PrometheusWorkqueueMetricsProvider) NewDepthMetricWithPriority(name string) DepthMetricWithPriority { + return &depthWithPriorityMetric{lvs: []string{name, name}} +} + +type depthWithPriorityMetric struct { + lvs []string +} + +func (g *depthWithPriorityMetric) Inc(priority int) { + depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Inc() +} + +func (g *depthWithPriorityMetric) Dec(priority int) { + depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Dec() +}