From 2c3a22cb496d36406e244d77ee90d5435e7e5030 Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Mon, 4 Sep 2023 16:19:46 +0200 Subject: [PATCH] small performance improvements for telemetry (#3656) * check span names from a HashSet instead of iterating through an array of strings * move the SpanMetricsExporter to a tracing Layer (#3669) SpanMetricsExporter is an OpenTelemetry Exporter, which means that if it is loaded, then the entire OpenTelemetry handling infrastructure is loaded, and especially the part that allocates data for the entire list of spans. Unfortunately, the SpanMetricsExporter was always loaded, even if we do not export the metrics outside of the router, which means that there's a constant overhead of telemetry even when it is not used. This moves the SpanMetricsExporter to a lightweight tracing-subscriber Layer which performs the same busy/idle accounting as OpenTelemetryLayer, then generates the same events as before when the span closes --- .changesets/fix_geal_telemetry_perf.md | 5 + .../metrics/span_metrics_exporter.rs | 191 ++++++++++++------ apollo-router/src/plugins/telemetry/mod.rs | 7 +- apollo-router/src/plugins/telemetry/reload.rs | 12 +- 4 files changed, 142 insertions(+), 73 deletions(-) create mode 100644 .changesets/fix_geal_telemetry_perf.md diff --git a/.changesets/fix_geal_telemetry_perf.md b/.changesets/fix_geal_telemetry_perf.md new file mode 100644 index 0000000000..f20ceee31a --- /dev/null +++ b/.changesets/fix_geal_telemetry_perf.md @@ -0,0 +1,5 @@ +### small performance improvements for telemetry ([PR #3656](https://github.com/apollographql/router/pull/3656)) + +The SpanMetricsExporter, used to report span timings hade a few inefficiencies in the way it recognized spans, and it brought a constant overhead to the router usage, even when telemetry was not configured. It has now been isolated and optimized + +By [@Geal](https://github.com/Geal) in https://github.com/apollographql/router/pull/3656 \ No newline at end of file diff --git a/apollo-router/src/plugins/telemetry/metrics/span_metrics_exporter.rs b/apollo-router/src/plugins/telemetry/metrics/span_metrics_exporter.rs index 1c2aa6642a..5e6778ab74 100644 --- a/apollo-router/src/plugins/telemetry/metrics/span_metrics_exporter.rs +++ b/apollo-router/src/plugins/telemetry/metrics/span_metrics_exporter.rs @@ -1,11 +1,13 @@ -use async_trait::async_trait; -use futures::future::BoxFuture; -use futures::FutureExt; -use opentelemetry::sdk::export::trace::ExportResult; -use opentelemetry::sdk::export::trace::SpanData; -use opentelemetry::sdk::export::trace::SpanExporter; -use opentelemetry::Key; -use opentelemetry::Value; +use std::collections::HashSet; +use std::time::Instant; + +use tracing_core::field::Visit; +use tracing_core::span; +use tracing_core::Field; +use tracing_core::Subscriber; +use tracing_subscriber::layer::Context; +use tracing_subscriber::registry::LookupSpan; +use tracing_subscriber::Layer; use crate::axum_factory::utils::REQUEST_SPAN_NAME; use crate::plugins::telemetry::EXECUTION_SPAN_NAME; @@ -13,69 +15,126 @@ use crate::plugins::telemetry::SUBGRAPH_SPAN_NAME; use crate::plugins::telemetry::SUPERGRAPH_SPAN_NAME; use crate::services::QUERY_PLANNING_SPAN_NAME; -const SPAN_NAMES: &[&str] = &[ - REQUEST_SPAN_NAME, - SUPERGRAPH_SPAN_NAME, - SUBGRAPH_SPAN_NAME, - QUERY_PLANNING_SPAN_NAME, - EXECUTION_SPAN_NAME, -]; - -const BUSY_NS_ATTRIBUTE_NAME: Key = Key::from_static_str("busy_ns"); -const IDLE_NS_ATTRIBUTE_NAME: Key = Key::from_static_str("idle_ns"); -const SUBGRAPH_ATTRIBUTE_NAME: Key = Key::from_static_str("apollo.subgraph.name"); - -#[derive(Debug, Default)] -pub(crate) struct Exporter {} -#[async_trait] -impl SpanExporter for Exporter { - /// Export spans metrics to real metrics - fn export(&mut self, batch: Vec) -> BoxFuture<'static, ExportResult> { - for span in batch - .into_iter() - .filter(|s| SPAN_NAMES.contains(&s.name.as_ref())) - { - let busy = span - .attributes - .get(&BUSY_NS_ATTRIBUTE_NAME) - .and_then(|attr| match attr { - Value::I64(v) => Some(*v), - _ => None, - }) - .unwrap_or_default(); - let idle = span - .attributes - .get(&IDLE_NS_ATTRIBUTE_NAME) - .and_then(|attr| match attr { - Value::I64(v) => Some(*v), - _ => None, - }) - .unwrap_or_default(); - let duration = span - .end_time - .duration_since(span.start_time) - .unwrap_or_default() - .as_secs_f64(); +const SUBGRAPH_ATTRIBUTE_NAME: &str = "apollo.subgraph.name"; + +#[derive(Debug)] +pub(crate) struct SpanMetricsLayer { + span_names: HashSet<&'static str>, +} + +impl Default for SpanMetricsLayer { + fn default() -> Self { + Self { + span_names: [ + REQUEST_SPAN_NAME, + SUPERGRAPH_SPAN_NAME, + SUBGRAPH_SPAN_NAME, + QUERY_PLANNING_SPAN_NAME, + EXECUTION_SPAN_NAME, + ] + .into(), + } + } +} + +impl Layer for SpanMetricsLayer +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ + fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("Span not found, this is a bug"); + let mut extensions = span.extensions_mut(); + + let name = attrs.metadata().name(); + if self.span_names.contains(name) && extensions.get_mut::().is_none() { + let mut timings = Timings::new(); + if name == SUBGRAPH_SPAN_NAME { + attrs.values().record(&mut ValueVisitor { + timings: &mut timings, + }); + } + extensions.insert(Timings::new()); + } + } + + fn on_record(&self, _span: &span::Id, _values: &span::Record<'_>, _ctx: Context<'_, S>) {} + + fn on_close(&self, id: span::Id, ctx: Context<'_, S>) { + let span = ctx.span(&id).expect("Span not found, this is a bug"); + let mut extensions = span.extensions_mut(); + + if let Some(timings) = extensions.get_mut::() { + let duration = timings.start.elapsed().as_secs_f64(); // Convert it in seconds - let idle: f64 = idle as f64 / 1_000_000_000_f64; - let busy: f64 = busy as f64 / 1_000_000_000_f64; - if span.name == SUBGRAPH_SPAN_NAME { - let subgraph_name = span - .attributes - .get(&SUBGRAPH_ATTRIBUTE_NAME) - .map(|name| name.as_str()) - .unwrap_or_default(); - ::tracing::info!(histogram.apollo_router_span = duration, kind = %"duration", span = %span.name, subgraph = %subgraph_name); - ::tracing::info!(histogram.apollo_router_span = idle, kind = %"idle", span = %span.name, subgraph = %subgraph_name); - ::tracing::info!(histogram.apollo_router_span = busy, kind = %"busy", span = %span.name, subgraph = %subgraph_name); + let idle: f64 = timings.idle as f64 / 1_000_000_000_f64; + let busy: f64 = timings.busy as f64 / 1_000_000_000_f64; + let name = span.metadata().name(); + if let Some(subgraph_name) = timings.subgraph.take() { + ::tracing::info!(histogram.apollo_router_span = duration, kind = %"duration", span = %name, subgraph = %subgraph_name); + ::tracing::info!(histogram.apollo_router_span = idle, kind = %"idle", span = %name, subgraph = %subgraph_name); + ::tracing::info!(histogram.apollo_router_span = busy, kind = %"busy", span = %name, subgraph = %subgraph_name); } else { - ::tracing::info!(histogram.apollo_router_span = duration, kind = %"duration", span = %span.name); - ::tracing::info!(histogram.apollo_router_span = idle, kind = %"idle", span = %span.name); - ::tracing::info!(histogram.apollo_router_span = busy, kind = %"busy", span = %span.name); + ::tracing::info!(histogram.apollo_router_span = duration, kind = %"duration", span = %name); + ::tracing::info!(histogram.apollo_router_span = idle, kind = %"idle", span = %name); + ::tracing::info!(histogram.apollo_router_span = busy, kind = %"busy", span = %name); } } + } - async { Ok(()) }.boxed() + fn on_enter(&self, id: &span::Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("Span not found, this is a bug"); + let mut extensions = span.extensions_mut(); + + if let Some(timings) = extensions.get_mut::() { + let now = Instant::now(); + timings.idle += (now - timings.last).as_nanos() as i64; + timings.last = now; + } + } + + fn on_exit(&self, id: &span::Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("Span not found, this is a bug"); + let mut extensions = span.extensions_mut(); + + if let Some(timings) = extensions.get_mut::() { + let now = Instant::now(); + timings.busy += (now - timings.last).as_nanos() as i64; + timings.last = now; + } + } +} + +struct Timings { + idle: i64, + busy: i64, + last: Instant, + start: Instant, + subgraph: Option, +} + +impl Timings { + fn new() -> Self { + Self { + idle: 0, + busy: 0, + last: Instant::now(), + start: Instant::now(), + subgraph: None, + } + } +} + +struct ValueVisitor<'a> { + timings: &'a mut Timings, +} + +impl<'a> Visit for ValueVisitor<'a> { + fn record_debug(&mut self, _field: &Field, _value: &dyn std::fmt::Debug) {} + + fn record_str(&mut self, field: &Field, value: &str) { + if field.name() == SUBGRAPH_ATTRIBUTE_NAME { + self.timings.subgraph = Some(value.to_string()); + } } } diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index b61d5fbf21..f6a8ba5202 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -66,6 +66,7 @@ use self::metrics::AttributesForwardConf; use self::metrics::MetricsAttributesConf; use self::reload::reload_fmt; use self::reload::reload_metrics; +use self::reload::LayeredRegistry; use self::reload::NullFieldFormatter; use self::reload::OPENTELEMETRY_TRACER_HANDLE; use self::tracing::apollo_telemetry::APOLLO_PRIVATE_DURATION_NS; @@ -622,8 +623,6 @@ impl Telemetry { builder = setup_tracing(builder, &tracing_config.datadog, trace_config)?; builder = setup_tracing(builder, &tracing_config.otlp, trace_config)?; builder = setup_tracing(builder, &config.apollo, trace_config)?; - // For metrics - builder = builder.with_simple_exporter(metrics::span_metrics_exporter::Exporter::default()); let tracer_provider = builder.build(); Ok(tracer_provider) @@ -672,10 +671,10 @@ impl Telemetry { dyn Layer< ::tracing_subscriber::layer::Layered< OpenTelemetryLayer< - ::tracing_subscriber::Registry, + LayeredRegistry, ReloadTracer<::opentelemetry::sdk::trace::Tracer>, >, - ::tracing_subscriber::Registry, + LayeredRegistry, >, > + Send + Sync, diff --git a/apollo-router/src/plugins/telemetry/reload.rs b/apollo-router/src/plugins/telemetry/reload.rs index 1c66ccf4ef..50ce48747a 100644 --- a/apollo-router/src/plugins/telemetry/reload.rs +++ b/apollo-router/src/plugins/telemetry/reload.rs @@ -15,6 +15,7 @@ use tracing_subscriber::util::SubscriberInitExt; use tracing_subscriber::EnvFilter; use tracing_subscriber::Registry; +use super::metrics::span_metrics_exporter::SpanMetricsLayer; use crate::plugins::telemetry::formatters::filter_metric_events; use crate::plugins::telemetry::formatters::text::TextFormatter; use crate::plugins::telemetry::formatters::FilteringFormatter; @@ -22,7 +23,10 @@ use crate::plugins::telemetry::metrics; use crate::plugins::telemetry::metrics::layer::MetricsLayer; use crate::plugins::telemetry::tracing::reload::ReloadTracer; -type LayeredTracer = Layered>, Registry>; +pub(crate) type LayeredRegistry = Layered; + +type LayeredTracer = + Layered>, LayeredRegistry>; // These handles allow hot tracing of layers. They have complex type definitions because tracing has // generic types in the layer definition. @@ -96,6 +100,7 @@ pub(crate) fn init_telemetry(log_level: &str) -> Result<()> { // Env filter is separate because of https://github.com/tokio-rs/tracing/issues/1629 // the tracing registry is only created once tracing_subscriber::registry() + .with(SpanMetricsLayer::default()) .with(opentelemetry_layer) .with(fmt_layer) .with(metrics_layer) @@ -128,8 +133,9 @@ pub(super) fn reload_metrics(layer: MetricsLayer) { #[allow(clippy::type_complexity)] pub(super) fn reload_fmt( layer: Box< - dyn Layer>, Registry>> - + Send + dyn Layer< + Layered>, LayeredRegistry>, + > + Send + Sync, >, ) {