Skip to content

Commit 25718e3

Browse files
authored
proxy: Define service_info metric showing the run state (#12749)
## Problem

Monitoring dashboards show aggregates of all proxy instances, including terminating ones. This can skew the results or make graphs less readable. Also, alerts must be tuned to ignore certain signals from terminating proxies.

## Summary of changes

Add a `service_info` metric, currently with one label, `state`, showing whether an instance is in the `init`, `running`, or `terminating` state. The metric can be joined with other metrics to filter the presented time series.
1 parent ac8f44c commit 25718e3

File tree

6 files changed

+75
-5
lines changed

6 files changed

+75
-5
lines changed

libs/metrics/src/lib.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,12 @@ impl<L: LabelGroup> InfoMetric<L> {
129129
}
130130
}
131131

132+
/// `Default` for a gauge-backed [`InfoMetric`] whose label group is itself `Default`.
///
/// This lets a metric group that contains an `InfoMetric<L>` field derive `Default`
/// instead of spelling out an initializer for the field.
impl<L: LabelGroup + Default> Default for InfoMetric<L, GaugeState> {
133+
/// Builds the metric via [`InfoMetric::new`], using `L::default()` as the initial label.
fn default() -> Self {
134+
InfoMetric::new(L::default())
135+
}
136+
}
137+
132138
impl<L: LabelGroup, M: MetricType<Metadata = ()>> InfoMetric<L, M> {
133139
pub fn with_metric(label: L, metric: M) -> Self {
134140
Self {

proxy/src/binary/local_proxy.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use crate::config::{
2929
};
3030
use crate::control_plane::locks::ApiLocks;
3131
use crate::http::health_server::AppMetrics;
32-
use crate::metrics::{Metrics, ThreadPoolMetrics};
32+
use crate::metrics::{Metrics, ServiceInfo, ThreadPoolMetrics};
3333
use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo};
3434
use crate::scram::threadpool::ThreadPool;
3535
use crate::serverless::cancel_set::CancelSet;
@@ -207,6 +207,11 @@ pub async fn run() -> anyhow::Result<()> {
207207
endpoint_rate_limiter,
208208
);
209209

210+
Metrics::get()
211+
.service
212+
.info
213+
.set_label(ServiceInfo::running());
214+
210215
match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
211216
// exit immediately on maintenance task completion
212217
Either::Left((Some(res), _)) => match crate::error::flatten_err(res)? {},

proxy/src/binary/pg_sni_router.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use utils::project_git_version;
2626
use utils::sentry_init::init_sentry;
2727

2828
use crate::context::RequestContext;
29-
use crate::metrics::{Metrics, ThreadPoolMetrics};
29+
use crate::metrics::{Metrics, ServiceInfo, ThreadPoolMetrics};
3030
use crate::pglb::TlsRequired;
3131
use crate::pqproto::FeStartupPacket;
3232
use crate::protocol2::ConnectionInfo;
@@ -135,6 +135,12 @@ pub async fn run() -> anyhow::Result<()> {
135135
cancellation_token.clone(),
136136
))
137137
.map(crate::error::flatten_err);
138+
139+
Metrics::get()
140+
.service
141+
.info
142+
.set_label(ServiceInfo::running());
143+
138144
let signals_task = tokio::spawn(crate::signals::handle(cancellation_token, || {}));
139145

140146
// the signal task cant ever succeed.

proxy/src/binary/proxy.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ use crate::config::{
4040
};
4141
use crate::context::parquet::ParquetUploadArgs;
4242
use crate::http::health_server::AppMetrics;
43-
use crate::metrics::Metrics;
43+
use crate::metrics::{Metrics, ServiceInfo};
4444
use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
4545
use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
4646
use crate::redis::kv_ops::RedisKVClient;
@@ -590,6 +590,11 @@ pub async fn run() -> anyhow::Result<()> {
590590
}
591591
}
592592

593+
Metrics::get()
594+
.service
595+
.info
596+
.set_label(ServiceInfo::running());
597+
593598
let maintenance = loop {
594599
// get one complete task
595600
match futures::future::select(

proxy/src/metrics.rs

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,16 @@ use std::sync::{Arc, OnceLock};
22

33
use lasso::ThreadedRodeo;
44
use measured::label::{
5-
FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet,
5+
FixedCardinalitySet, LabelGroupSet, LabelGroupVisitor, LabelName, LabelSet, LabelValue,
6+
StaticLabelSet,
67
};
78
use measured::metric::histogram::Thresholds;
89
use measured::metric::name::MetricName;
910
use measured::{
1011
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
1112
MetricGroup,
1213
};
13-
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec};
14+
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec, InfoMetric};
1415
use tokio::time::{self, Instant};
1516

1617
use crate::control_plane::messages::ColdStartInfo;
@@ -25,6 +26,9 @@ pub struct Metrics {
2526

2627
#[metric(namespace = "wake_compute_lock")]
2728
pub wake_compute_lock: ApiLockMetrics,
29+
30+
#[metric(namespace = "service")]
31+
pub service: ServiceMetrics,
2832
}
2933

3034
static SELF: OnceLock<Metrics> = OnceLock::new();
@@ -660,3 +664,43 @@ pub struct ThreadPoolMetrics {
660664
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
661665
pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
662666
}
667+
668+
/// Service-level metrics, registered on [`Metrics`] under the `service` namespace.
///
/// `Default` is derived, so the contained info metric starts out with
/// `ServiceInfo::default()` as its label.
#[derive(MetricGroup, Default)]
669+
pub struct ServiceMetrics {
670+
/// Info metric whose label describes the current run state of this instance.
pub info: InfoMetric<ServiceInfo>,
671+
}
672+
673+
/// Label group attached to the service info metric.
///
/// It currently carries a single label, `state`. The derived `Default` yields
/// the default [`ServiceState`].
#[derive(Default)]
674+
pub struct ServiceInfo {
675+
/// Run state reported for this instance.
pub state: ServiceState,
676+
}
677+
678+
// Constructors for the non-default states; both are `const fn`.
impl ServiceInfo {
679+
/// Returns a label group whose `state` is [`ServiceState::Running`].
pub const fn running() -> Self {
680+
ServiceInfo {
681+
state: ServiceState::Running,
682+
}
683+
}
684+
685+
/// Returns a label group whose `state` is [`ServiceState::Terminating`].
pub const fn terminating() -> Self {
686+
ServiceInfo {
687+
state: ServiceState::Terminating,
688+
}
689+
}
690+
}
691+
692+
// Hand-written `LabelGroup` implementation: the group consists of exactly one label.
impl LabelGroup for ServiceInfo {
693+
/// Reports the group's only label, named `state`, to the visitor with the
/// current value of `self.state`.
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
694+
// Label name is built once as a constant rather than on every visit.
const STATE: &LabelName = LabelName::from_str("state");
695+
v.write_value(STATE, &self.state);
696+
}
697+
}
698+
699+
/// Run state of a service instance, used as the value of the `state` label.
// NOTE(review): the exact text each variant is rendered as in the exported
// metric is decided by the `FixedCardinalityLabel` derive — confirm before
// relying on it in dashboards or alert queries.
#[derive(FixedCardinalityLabel, Clone, Copy, Debug, Default)]
700+
#[label(singleton = "state")]
701+
pub enum ServiceState {
702+
/// Default state; reported until another state is set explicitly.
#[default]
703+
Init,
704+
/// Produced by `ServiceInfo::running()`.
Running,
705+
/// Produced by `ServiceInfo::terminating()`.
Terminating,
706+
}

proxy/src/signals.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ use anyhow::bail;
44
use tokio_util::sync::CancellationToken;
55
use tracing::{info, warn};
66

7+
use crate::metrics::{Metrics, ServiceInfo};
8+
79
/// Handle unix signals appropriately.
810
pub async fn handle<F>(
911
token: CancellationToken,
@@ -28,10 +30,12 @@ where
2830
// Shut down the whole application.
2931
_ = interrupt.recv() => {
3032
warn!("received SIGINT, exiting immediately");
33+
Metrics::get().service.info.set_label(ServiceInfo::terminating());
3134
bail!("interrupted");
3235
}
3336
_ = terminate.recv() => {
3437
warn!("received SIGTERM, shutting down once all existing connections have closed");
38+
Metrics::get().service.info.set_label(ServiceInfo::terminating());
3539
token.cancel();
3640
}
3741
}

0 commit comments

Comments
 (0)