From 12c54327b421421ffad696b50e1b4013e4044f76 Mon Sep 17 00:00:00 2001 From: terassyi Date: Mon, 3 Jun 2024 11:01:57 +0900 Subject: [PATCH] refactor metrics Signed-off-by: terassyi --- sartd/src/kubernetes/src/context.rs | 4 +- sartd/src/kubernetes/src/controller.rs | 1 + .../src/kubernetes/src/controller/metrics.rs | 92 +++++++++++++++++++ .../controller/reconciler/address_block.rs | 33 ++++++- sartd/src/kubernetes/src/fixture.rs | 3 +- sartd/src/kubernetes/src/lib.rs | 1 + sartd/src/kubernetes/src/metrics.rs | 90 ++++++++++++++++++ 7 files changed, 219 insertions(+), 5 deletions(-) create mode 100644 sartd/src/kubernetes/src/controller/metrics.rs create mode 100644 sartd/src/kubernetes/src/metrics.rs diff --git a/sartd/src/kubernetes/src/context.rs b/sartd/src/kubernetes/src/context.rs index 5f95c5b..6437414 100644 --- a/sartd/src/kubernetes/src/context.rs +++ b/sartd/src/kubernetes/src/context.rs @@ -12,7 +12,9 @@ pub use kube::{ use serde::Serialize; use tokio::sync::RwLock; -use sartd_trace::{error::TraceableError, metrics::Metrics}; +use sartd_trace::error::TraceableError; + +use crate::metrics::Metrics; pub trait Ctx { fn metrics(&self) -> &Metrics; diff --git a/sartd/src/kubernetes/src/controller.rs b/sartd/src/kubernetes/src/controller.rs index d98ddcf..51e5f22 100644 --- a/sartd/src/kubernetes/src/controller.rs +++ b/sartd/src/kubernetes/src/controller.rs @@ -3,3 +3,4 @@ pub mod error; pub mod reconciler; pub mod server; pub mod webhook; +mod metrics; diff --git a/sartd/src/kubernetes/src/controller/metrics.rs b/sartd/src/kubernetes/src/controller/metrics.rs new file mode 100644 index 0000000..11ba105 --- /dev/null +++ b/sartd/src/kubernetes/src/controller/metrics.rs @@ -0,0 +1,92 @@ +use kube::Resource; +use prometheus::Registry; +use prometheus::{histogram_opts, opts, HistogramVec, IntCounter, IntCounterVec}; +use sartd_trace::error::TraceableError; +use tokio::time::Instant; + +#[derive(Debug, Clone)] +pub struct Metrics { + pub reconciliations: IntCounterVec, + pub failures: IntCounterVec, + pub reconcile_duration: HistogramVec, +} + +impl Default for Metrics { + fn default() -> Self { + let reconcile_duration = HistogramVec::new( + histogram_opts!( + "sart_controller_reconcile_duration_seconds", + "The duration of reconcile to complete in seconds" + ) + .buckets(vec![0.01, 0.1, 0.25, 0.5, 1., 5., 15., 60.]), + &[], + ) + .unwrap(); + let failures = IntCounterVec::new( + opts!( + "sart_controller_reconciliation_errors_total", + "reconciliation errors", + ), + &["resource", "instance", "error"], + ) + .unwrap(); + let reconciliations = IntCounterVec::new( + opts!( + "sart_controller_reconciliation_total", + "Total count of reconciliations", + ), + &["resource", "instance"], + ) + .unwrap(); + Metrics { + reconciliations, + failures, + reconcile_duration, + } + } +} + +impl Metrics { + pub fn register(self, registry: &Registry) -> Result { + Ok(self) + } + + pub fn reconcile_failure, E: TraceableError>( + &self, + resource: &T, + e: &E, + ) { + self.failures + .with_label_values(&[ + &resource.object_ref(&()).kind.unwrap(), + &resource.object_ref(&()).name.unwrap(), + e.metric_label().as_ref(), + ]) + .inc() + } + + pub fn reconciliation>(&self, resource: &T) { + self.reconciliations + .with_label_values(&[ + &resource.object_ref(&()).kind.unwrap(), + &resource.object_ref(&()).name.unwrap(), + ]) + .inc() + } +} + +/// Smart function duration measurer +/// +/// Relies on Drop to calculate duration and register the observation in the histogram +pub struct ReconcileMeasurer { + start: Instant, + metric: HistogramVec, +} + +impl Drop for ReconcileMeasurer { + fn drop(&mut self) { + #[allow(clippy::cast_precision_loss)] + let duration = self.start.elapsed().as_millis() as f64 / 1000.0; + self.metric.with_label_values(&[]).observe(duration); + } +} diff --git a/sartd/src/kubernetes/src/controller/reconciler/address_block.rs b/sartd/src/kubernetes/src/controller/reconciler/address_block.rs index ea6fc0a..a8a9c26 100644 --- a/sartd/src/kubernetes/src/controller/reconciler/address_block.rs +++ b/sartd/src/kubernetes/src/controller/reconciler/address_block.rs @@ -27,6 +27,7 @@ pub async fn reconciler( ctx: Arc>>, ) -> Result { let address_blocks = Api::::all(ctx.client().clone()); + ctx.inner.metrics().reconciliation(ab.as_ref()); finalizer( &address_blocks, @@ -34,8 +35,8 @@ pub async fn reconciler( ab, |event| async { match event { - Event::Apply(ab) => reconcile(&address_blocks, &ab, ctx.clone()).await, - Event::Cleanup(ab) => cleanup(&address_blocks, &ab, ctx.clone()).await, + Event::Apply(ab) => reconcile_with_metrics(&address_blocks, &ab, ctx.clone()).await, + Event::Cleanup(ab) => cleanup_with_metrics(&address_blocks, &ab, ctx.clone()).await, } }, ) @@ -43,6 +44,34 @@ pub async fn reconciler( .map_err(|e| Error::Finalizer(Box::new(e))) } +async fn reconcile_with_metrics( + api: &Api, + ab: &AddressBlock, + ctx: Arc>>, +) -> Result { + match reconcile(api, ab, ctx.clone()).await { + Ok(action) => Ok(action), + Err(e) => { + ctx.inner.metrics().reconcile_failure(ab, &e); + Err(e) + } + } +} + +async fn cleanup_with_metrics( + api: &Api, + ab: &AddressBlock, + ctx: Arc>>, +) -> Result { + match cleanup(api, ab, ctx.clone()).await { + Ok(action) => Ok(action), + Err(e) => { + ctx.inner.metrics().reconcile_failure(ab, &e); + Err(e) + } + } +} + #[tracing::instrument(skip_all, fields(trace_id))] async fn reconcile( _api: &Api, diff --git a/sartd/src/kubernetes/src/fixture.rs b/sartd/src/kubernetes/src/fixture.rs index 1bac053..ecabaeb 100644 --- a/sartd/src/kubernetes/src/fixture.rs +++ b/sartd/src/kubernetes/src/fixture.rs @@ -20,7 +20,6 @@ pub mod reconciler { Client, Resource, ResourceExt, }; use prometheus::Registry; - use sartd_trace::metrics::Metrics; use serde::Serialize; use crate::{ @@ -43,7 +42,7 @@ pub mod reconciler { RouterIdSelector, SpeakerConfig, CLUSTER_BGP_FINALIZER, }, node_bgp::{NodeBGP, NodeBGPSpec}, - }, + }, metrics::Metrics, }; pub type ApiServerHandle = tower_test::mock::Handle, Response>; diff --git a/sartd/src/kubernetes/src/lib.rs b/sartd/src/kubernetes/src/lib.rs index bb663b3..7e8821c 100644 --- a/sartd/src/kubernetes/src/lib.rs +++ b/sartd/src/kubernetes/src/lib.rs @@ -4,4 +4,5 @@ pub mod controller; pub mod crd; pub mod error; pub mod fixture; +pub mod metrics; pub mod util; diff --git a/sartd/src/kubernetes/src/metrics.rs b/sartd/src/kubernetes/src/metrics.rs new file mode 100644 index 0000000..62c9743 --- /dev/null +++ b/sartd/src/kubernetes/src/metrics.rs @@ -0,0 +1,90 @@ +use kube::Resource; +use prometheus::Registry; +use prometheus::{histogram_opts, opts, HistogramVec, IntCounter, IntCounterVec}; +use sartd_trace::error::TraceableError; +use tokio::time::Instant; + +#[derive(Debug, Clone)] +pub struct Metrics { + pub reconciliations: IntCounterVec, + pub failures: IntCounterVec, + pub reconcile_duration: HistogramVec, +} + +impl Default for Metrics { + fn default() -> Self { + let reconcile_duration = HistogramVec::new( + histogram_opts!( + "sart_controller_reconcile_duration_seconds", + "The duration of reconcile to complete in seconds" + ) + .buckets(vec![0.01, 0.1, 0.25, 0.5, 1., 5., 15., 60.]), + &[], + ) + .unwrap(); + let failures = IntCounterVec::new( + opts!( + "sart_controller_reconciliation_errors_total", + "reconciliation errors", + ), + &["resource", "instance", "error"], + ) + .unwrap(); + let reconciliations = IntCounterVec::new( + opts!( + "sart_controller_reconciliation_total", + "Total count of reconciliations", + ), + &["resource", "instance"], + ) + .unwrap(); + Metrics { + reconciliations, + failures, + reconcile_duration, + } + } +} + +impl Metrics { + pub fn register(self, registry: &Registry) -> Result { + Ok(self) + } + + pub fn reconcile_failure, E: TraceableError>( + &self, + resource: &T, + e: &E, + ) { + self.failures + .with_label_values(&[ + &resource.object_ref(&()).kind.unwrap(), + &resource.object_ref(&()).name.unwrap(), + e.metric_label().as_ref(), + ]) + .inc() + } + + pub fn reconciliation>(&self, resource: &T) { + self.reconciliations.with_label_values(&[ + &resource.object_ref(&()).kind.unwrap(), + &resource.object_ref(&()).name.unwrap(), + ]).inc() + } +} + +/// Smart function duration measurer +/// +/// Relies on Drop to calculate duration and register the observation in the histogram +pub struct ReconcileMeasurer { + start: Instant, + metric: HistogramVec, +} + +impl Drop for ReconcileMeasurer { + fn drop(&mut self) { + #[allow(clippy::cast_precision_loss)] + let duration = self.start.elapsed().as_millis() as f64 / 1000.0; + self.metric.with_label_values(&[]).observe(duration); + } +}