Skip to content

Commit

Permalink
small performance improvements for telemetry (#3656)
Browse files Browse the repository at this point in the history
* check span names from a HashSet instead of iterating through an array of strings
* move the SpanMetricsExporter to a tracing Layer (#3669)
SpanMetricsExporter is an OpenTelemetry Exporter, which means that if it
is loaded, then the entire OpenTelemetry handling infrastructure is
loaded, and especially the part that allocates data for the entire list
of spans.
Unfortunately, the SpanMetricsExporter was always loaded, even if we do
not export the metrics outside of the router, which means that there's a
constant overhead of telemetry even when it is not used. This moves the
SpanMetricsExporter to a lightweight tracing-subscriber Layer which
performs the same busy/idle accounting as OpenTelemetryLayer, then
generates the same events as before when the span closes
  • Loading branch information
Geal authored and garypen committed Sep 12, 2023
1 parent 22bf32e commit 2c3a22c
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 73 deletions.
5 changes: 5 additions & 0 deletions .changesets/fix_geal_telemetry_perf.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### small performance improvements for telemetry ([PR #3656](https://github.com/apollographql/router/pull/3656))

The SpanMetricsExporter, used to report span timings hade a few inefficiencies in the way it recognized spans, and it brought a constant overhead to the router usage, even when telemetry was not configured. It has now been isolated and optimized

By [@Geal](https://github.com/Geal) in https://github.com/apollographql/router/pull/3656
191 changes: 125 additions & 66 deletions apollo-router/src/plugins/telemetry/metrics/span_metrics_exporter.rs
Original file line number Diff line number Diff line change
@@ -1,81 +1,140 @@
use async_trait::async_trait;
use futures::future::BoxFuture;
use futures::FutureExt;
use opentelemetry::sdk::export::trace::ExportResult;
use opentelemetry::sdk::export::trace::SpanData;
use opentelemetry::sdk::export::trace::SpanExporter;
use opentelemetry::Key;
use opentelemetry::Value;
use std::collections::HashSet;
use std::time::Instant;

use tracing_core::field::Visit;
use tracing_core::span;
use tracing_core::Field;
use tracing_core::Subscriber;
use tracing_subscriber::layer::Context;
use tracing_subscriber::registry::LookupSpan;
use tracing_subscriber::Layer;

use crate::axum_factory::utils::REQUEST_SPAN_NAME;
use crate::plugins::telemetry::EXECUTION_SPAN_NAME;
use crate::plugins::telemetry::SUBGRAPH_SPAN_NAME;
use crate::plugins::telemetry::SUPERGRAPH_SPAN_NAME;
use crate::services::QUERY_PLANNING_SPAN_NAME;

const SPAN_NAMES: &[&str] = &[
REQUEST_SPAN_NAME,
SUPERGRAPH_SPAN_NAME,
SUBGRAPH_SPAN_NAME,
QUERY_PLANNING_SPAN_NAME,
EXECUTION_SPAN_NAME,
];

const BUSY_NS_ATTRIBUTE_NAME: Key = Key::from_static_str("busy_ns");
const IDLE_NS_ATTRIBUTE_NAME: Key = Key::from_static_str("idle_ns");
const SUBGRAPH_ATTRIBUTE_NAME: Key = Key::from_static_str("apollo.subgraph.name");

#[derive(Debug, Default)]
pub(crate) struct Exporter {}
#[async_trait]
impl SpanExporter for Exporter {
/// Export spans metrics to real metrics
fn export(&mut self, batch: Vec<SpanData>) -> BoxFuture<'static, ExportResult> {
for span in batch
.into_iter()
.filter(|s| SPAN_NAMES.contains(&s.name.as_ref()))
{
let busy = span
.attributes
.get(&BUSY_NS_ATTRIBUTE_NAME)
.and_then(|attr| match attr {
Value::I64(v) => Some(*v),
_ => None,
})
.unwrap_or_default();
let idle = span
.attributes
.get(&IDLE_NS_ATTRIBUTE_NAME)
.and_then(|attr| match attr {
Value::I64(v) => Some(*v),
_ => None,
})
.unwrap_or_default();
let duration = span
.end_time
.duration_since(span.start_time)
.unwrap_or_default()
.as_secs_f64();
const SUBGRAPH_ATTRIBUTE_NAME: &str = "apollo.subgraph.name";

#[derive(Debug)]
pub(crate) struct SpanMetricsLayer {
span_names: HashSet<&'static str>,
}

impl Default for SpanMetricsLayer {
fn default() -> Self {
Self {
span_names: [
REQUEST_SPAN_NAME,
SUPERGRAPH_SPAN_NAME,
SUBGRAPH_SPAN_NAME,
QUERY_PLANNING_SPAN_NAME,
EXECUTION_SPAN_NAME,
]
.into(),
}
}
}

impl<S> Layer<S> for SpanMetricsLayer
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("Span not found, this is a bug");
let mut extensions = span.extensions_mut();

let name = attrs.metadata().name();
if self.span_names.contains(name) && extensions.get_mut::<Timings>().is_none() {
let mut timings = Timings::new();
if name == SUBGRAPH_SPAN_NAME {
attrs.values().record(&mut ValueVisitor {
timings: &mut timings,
});
}
extensions.insert(Timings::new());
}
}

fn on_record(&self, _span: &span::Id, _values: &span::Record<'_>, _ctx: Context<'_, S>) {}

fn on_close(&self, id: span::Id, ctx: Context<'_, S>) {
let span = ctx.span(&id).expect("Span not found, this is a bug");
let mut extensions = span.extensions_mut();

if let Some(timings) = extensions.get_mut::<Timings>() {
let duration = timings.start.elapsed().as_secs_f64();

// Convert it in seconds
let idle: f64 = idle as f64 / 1_000_000_000_f64;
let busy: f64 = busy as f64 / 1_000_000_000_f64;
if span.name == SUBGRAPH_SPAN_NAME {
let subgraph_name = span
.attributes
.get(&SUBGRAPH_ATTRIBUTE_NAME)
.map(|name| name.as_str())
.unwrap_or_default();
::tracing::info!(histogram.apollo_router_span = duration, kind = %"duration", span = %span.name, subgraph = %subgraph_name);
::tracing::info!(histogram.apollo_router_span = idle, kind = %"idle", span = %span.name, subgraph = %subgraph_name);
::tracing::info!(histogram.apollo_router_span = busy, kind = %"busy", span = %span.name, subgraph = %subgraph_name);
let idle: f64 = timings.idle as f64 / 1_000_000_000_f64;
let busy: f64 = timings.busy as f64 / 1_000_000_000_f64;
let name = span.metadata().name();
if let Some(subgraph_name) = timings.subgraph.take() {
::tracing::info!(histogram.apollo_router_span = duration, kind = %"duration", span = %name, subgraph = %subgraph_name);
::tracing::info!(histogram.apollo_router_span = idle, kind = %"idle", span = %name, subgraph = %subgraph_name);
::tracing::info!(histogram.apollo_router_span = busy, kind = %"busy", span = %name, subgraph = %subgraph_name);
} else {
::tracing::info!(histogram.apollo_router_span = duration, kind = %"duration", span = %span.name);
::tracing::info!(histogram.apollo_router_span = idle, kind = %"idle", span = %span.name);
::tracing::info!(histogram.apollo_router_span = busy, kind = %"busy", span = %span.name);
::tracing::info!(histogram.apollo_router_span = duration, kind = %"duration", span = %name);
::tracing::info!(histogram.apollo_router_span = idle, kind = %"idle", span = %name);
::tracing::info!(histogram.apollo_router_span = busy, kind = %"busy", span = %name);
}
}
}

async { Ok(()) }.boxed()
fn on_enter(&self, id: &span::Id, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("Span not found, this is a bug");
let mut extensions = span.extensions_mut();

if let Some(timings) = extensions.get_mut::<Timings>() {
let now = Instant::now();
timings.idle += (now - timings.last).as_nanos() as i64;
timings.last = now;
}
}

fn on_exit(&self, id: &span::Id, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("Span not found, this is a bug");
let mut extensions = span.extensions_mut();

if let Some(timings) = extensions.get_mut::<Timings>() {
let now = Instant::now();
timings.busy += (now - timings.last).as_nanos() as i64;
timings.last = now;
}
}
}

struct Timings {
idle: i64,
busy: i64,
last: Instant,
start: Instant,
subgraph: Option<String>,
}

impl Timings {
fn new() -> Self {
Self {
idle: 0,
busy: 0,
last: Instant::now(),
start: Instant::now(),
subgraph: None,
}
}
}

struct ValueVisitor<'a> {
timings: &'a mut Timings,
}

impl<'a> Visit for ValueVisitor<'a> {
fn record_debug(&mut self, _field: &Field, _value: &dyn std::fmt::Debug) {}

fn record_str(&mut self, field: &Field, value: &str) {
if field.name() == SUBGRAPH_ATTRIBUTE_NAME {
self.timings.subgraph = Some(value.to_string());
}
}
}
7 changes: 3 additions & 4 deletions apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ use self::metrics::AttributesForwardConf;
use self::metrics::MetricsAttributesConf;
use self::reload::reload_fmt;
use self::reload::reload_metrics;
use self::reload::LayeredRegistry;
use self::reload::NullFieldFormatter;
use self::reload::OPENTELEMETRY_TRACER_HANDLE;
use self::tracing::apollo_telemetry::APOLLO_PRIVATE_DURATION_NS;
Expand Down Expand Up @@ -622,8 +623,6 @@ impl Telemetry {
builder = setup_tracing(builder, &tracing_config.datadog, trace_config)?;
builder = setup_tracing(builder, &tracing_config.otlp, trace_config)?;
builder = setup_tracing(builder, &config.apollo, trace_config)?;
// For metrics
builder = builder.with_simple_exporter(metrics::span_metrics_exporter::Exporter::default());

let tracer_provider = builder.build();
Ok(tracer_provider)
Expand Down Expand Up @@ -672,10 +671,10 @@ impl Telemetry {
dyn Layer<
::tracing_subscriber::layer::Layered<
OpenTelemetryLayer<
::tracing_subscriber::Registry,
LayeredRegistry,
ReloadTracer<::opentelemetry::sdk::trace::Tracer>,
>,
::tracing_subscriber::Registry,
LayeredRegistry,
>,
> + Send
+ Sync,
Expand Down
12 changes: 9 additions & 3 deletions apollo-router/src/plugins/telemetry/reload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@ use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::EnvFilter;
use tracing_subscriber::Registry;

use super::metrics::span_metrics_exporter::SpanMetricsLayer;
use crate::plugins::telemetry::formatters::filter_metric_events;
use crate::plugins::telemetry::formatters::text::TextFormatter;
use crate::plugins::telemetry::formatters::FilteringFormatter;
use crate::plugins::telemetry::metrics;
use crate::plugins::telemetry::metrics::layer::MetricsLayer;
use crate::plugins::telemetry::tracing::reload::ReloadTracer;

type LayeredTracer = Layered<OpenTelemetryLayer<Registry, ReloadTracer<Tracer>>, Registry>;
pub(crate) type LayeredRegistry = Layered<SpanMetricsLayer, Registry>;

type LayeredTracer =
Layered<OpenTelemetryLayer<LayeredRegistry, ReloadTracer<Tracer>>, LayeredRegistry>;

// These handles allow hot tracing of layers. They have complex type definitions because tracing has
// generic types in the layer definition.
Expand Down Expand Up @@ -96,6 +100,7 @@ pub(crate) fn init_telemetry(log_level: &str) -> Result<()> {
// Env filter is separate because of https://github.com/tokio-rs/tracing/issues/1629
// the tracing registry is only created once
tracing_subscriber::registry()
.with(SpanMetricsLayer::default())
.with(opentelemetry_layer)
.with(fmt_layer)
.with(metrics_layer)
Expand Down Expand Up @@ -128,8 +133,9 @@ pub(super) fn reload_metrics(layer: MetricsLayer) {
#[allow(clippy::type_complexity)]
pub(super) fn reload_fmt(
layer: Box<
dyn Layer<Layered<OpenTelemetryLayer<Registry, ReloadTracer<Tracer>>, Registry>>
+ Send
dyn Layer<
Layered<OpenTelemetryLayer<LayeredRegistry, ReloadTracer<Tracer>>, LayeredRegistry>,
> + Send
+ Sync,
>,
) {
Expand Down

0 comments on commit 2c3a22c

Please sign in to comment.