From 514bb868a63e36d091a6a3b969d155bce900fcf3 Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Thu, 8 Sep 2022 15:29:00 +0100 Subject: [PATCH] FTV1 support (#1514) Adds FTV1 support. A new OpenTelemetry exporter has been added that will convert regular traces to Apollo traces. A buffer of spans is collected on the server side, which retains spans until the root request span is completed. Once a request is completed, the trace is reconstructed and sent to Apollo. Span attributes that are only relevant to Apollo tracing are prefixed with `apollo_private.` and are filtered out of other APM data. @glasser has given some guidance on how we should improve tracing, but this will be left to follow-up tickets, as this PR is large and has been ongoing for a significant period. - [ ] [Don't send ftv1 traces to free tier users](https://github.com/apollographql/router/issues/1728). - [ ] [Only send ftv1 traces that are interesting](https://github.com/apollographql/router/issues/1729). As an aside, this PR demonstrates that spans can be used for Apollo tracing, and that we could move to a native OTel-based solution in the future. Signed-off-by: Benjamin Coenen <5719034+bnjjj@users.noreply.github.com> Co-authored-by: bryn Co-authored-by: Benjamin Coenen <5719034+bnjjj@users.noreply.github.com> Co-authored-by: o0Ignition0o Co-authored-by: Coenen Benjamin Co-authored-by: Jesse Rosenberger Co-authored-by: Gary Pennington --- Cargo.lock | 10 +- NEXT_CHANGELOG.md | 38 + about.toml | 3 +- apollo-router-scaffold/src/plugin.rs | 1 - apollo-router/Cargo.toml | 13 +- apollo-router/src/axum_http_server_factory.rs | 12 +- ...nfiguration__tests__schema_generation.snap | 114 ++- apollo-router/src/context.rs | 5 + apollo-router/src/http_ext.rs | 15 +- apollo-router/src/plugin/serde.rs | 30 + apollo-router/src/plugins/telemetry/apollo.rs | 200 ++++- .../src/plugins/telemetry/apollo_exporter.rs | 207 +++++ apollo-router/src/plugins/telemetry/config.rs | 67 +- .../src/plugins/telemetry/metrics/apollo.rs | 256 +----- ...cs__apollo__studio__test__aggregation.snap | 2 + .../telemetry/metrics/apollo/studio.rs | 149 ++-- .../src/plugins/telemetry/metrics/mod.rs | 4 +- ..._apollo__test__apollo_metrics_exclude.snap | 5 +- ...t__apollo_metrics_multiple_operations.snap | 4 +- ...o__test__apollo_metrics_parse_failure.snap | 4 +- ...test__apollo_metrics_single_operation.snap | 5 +- ...est__apollo_metrics_unknown_operation.snap | 4 +- ...st__apollo_metrics_validation_failure.snap | 4 +- apollo-router/src/plugins/telemetry/mod.rs | 839 +++++++++++------- .../src/plugins/telemetry/tracing/apollo.rs | 40 +- .../telemetry/tracing/apollo_telemetry.rs | 712 +++++++++------ .../src/plugins/telemetry/tracing/datadog.rs | 8 +- .../src/plugins/telemetry/tracing/jaeger.rs | 4 +- .../src/plugins/telemetry/tracing/mod.rs | 72 ++ .../src/plugins/telemetry/tracing/otlp.rs | 12 +- .../src/plugins/telemetry/tracing/zipkin.rs | 8 +- apollo-router/src/query_planner/mod.rs | 34 +- apollo-router/src/spec/query.rs | 2 +- apollo-router/tests/integration_tests.rs | 30 +- apollo-router/tests/jaeger_test.rs | 4 +- ...ation_tests__traced_basic_composition.snap | 727 +++++++++------ ...tegration_tests__traced_basic_request.snap | 52 +- apollo-spaceport/build.rs | 28 +- apollo-spaceport/src/lib.rs | 20 + deny.toml | 1 + .../source/configuration/apollo-telemetry.mdx | 27 +- docs/source/customizations/native.mdx | 2 + licenses.html | 1 + rust-toolchain.toml | 1 - 44 files changed, 2524 insertions(+), 1252 deletions(-) create mode 100644 
apollo-router/src/plugins/telemetry/apollo_exporter.rs diff --git a/Cargo.lock b/Cargo.lock index d02f873b68..dd17a9a25d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,7 +148,8 @@ dependencies = [ "atty", "axum", "backtrace", - "buildstructor 0.4.1", + "base64 0.13.0", + "buildstructor 0.5.0", "bytes", "clap 3.2.20", "ctor", @@ -194,6 +195,7 @@ dependencies = [ "paste", "pin-project-lite", "prometheus", + "rand", "regex", "reqwest", "rhai", @@ -646,9 +648,9 @@ dependencies = [ [[package]] name = "buildstructor" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a7772d3542812693473268c2308979967035162dd36d053ec728e6c947ba93f" +checksum = "54027423064fb9ead112911b05eccb6484070f5ec778610906458603e0673381" dependencies = [ "lazy_static", "proc-macro2", @@ -2554,6 +2556,8 @@ dependencies = [ "console 0.15.1", "linked-hash-map", "once_cell", + "pest", + "pest_derive", "serde", "similar", "yaml-rust", diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 7bef3238a6..6c01f10318 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -26,7 +26,45 @@ By [@USERNAME](https://github.com/USERNAME) in https://github.com/apollographql/ # [x.x.x] (unreleased) - 2022-mm-dd ## ❗ BREAKING ❗ +### Span client_name and client_version attributes renamed ([#1514](https://github.com/apollographql/router/issues/1514)) +OpenTelemetry attributes should be grouped by `.` rather than `_`; therefore, the following attributes have changed: + +* `client_name` => `client.name` +* `client_version` => `client.version` + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/1514 + ## 🚀 Features + +### Add federated tracing support to Apollo Studio usage reporting ([#1514](https://github.com/apollographql/router/issues/1514)) + +Add support for [federated tracing](https://www.apollographql.com/docs/federation/metrics/) in Apollo Studio: + +```yaml +telemetry: + apollo: + # The percentage of requests that will include field-level instrumentation in traces sent to Apollo Studio. + # This is expensive and should be left at a low value. 
+ # This cannot be higher than tracing->trace_config->sampler + field_level_instrumentation_sampler: 0.01 # (default) + + # Include HTTP request and response headers in traces sent to Apollo Studio + send_headers: # other possible values are all, only (with an array), except (with an array), none (by default) + except: # Send all headers except referer + - referer + + # Send variable values in Apollo in traces sent to Apollo Studio + send_variable_values: # other possible values are all, only (with an array), except (with an array), none (by default) + except: # Send all variable values except for variable named first + - first + tracing: + trace_config: + sampler: 0.5 # The percentage of requests that will generate traces (a rate or `always_on` or `always_off`) +``` + +By [@BrynCooke](https://github.com/BrynCooke) & [@bnjjj](https://github.com/bnjjj) & [@o0Ignition0o](https://github.com/o0Ignition0o) in https://github.com/apollographql/router/pull/1514 + + ## 🐛 Fixes ## 🛠 Maintenance diff --git a/about.toml b/about.toml index d88422d071..b2418efc4c 100644 --- a/about.toml +++ b/about.toml @@ -8,7 +8,8 @@ accepted = [ "LicenseRef-ELv2", "LicenseRef-ring", "MIT", - "MPL-2.0" + "MPL-2.0", + "Unicode-DFS-2016" ] # Ignore non plublished crates, such as xtask for example diff --git a/apollo-router-scaffold/src/plugin.rs b/apollo-router-scaffold/src/plugin.rs index 7419a9ddd1..2b25b28026 100644 --- a/apollo-router-scaffold/src/plugin.rs +++ b/apollo-router-scaffold/src/plugin.rs @@ -98,7 +98,6 @@ fn create_plugin(name: &str, template_path: &Option) -> Result<()> { Value::Boolean(true), ); - dbg!(¶ms); desc.scaffold_with_parameters(params)?; let mod_path = mod_path(); diff --git a/apollo-router/Cargo.toml b/apollo-router/Cargo.toml index b60163deb1..bc68297b02 100644 --- a/apollo-router/Cargo.toml +++ b/apollo-router/Cargo.toml @@ -34,7 +34,8 @@ async-trait = "0.1.57" atty = "0.2.14" axum = { version = "0.5.15", features = ["headers", "json", "original-uri"] } backtrace = "0.3.66" -buildstructor = "0.4.1" +base64 = "0.13.0" +buildstructor = "0.5.0" bytes = "1.2.1" clap = { version = "3.2.20", default-features = false, features = [ "env", @@ -67,6 +68,7 @@ jsonschema = { version = "0.16.0", default-features = false } lazy_static = "1.4.0" libc = "0.2.132" lru = "0.7.8" +mediatype = "0.19.9" mockall = "0.11.2" miette = { version = "5.3.0", features = ["fancy"] } mime = "0.3.16" @@ -112,7 +114,9 @@ opentelemetry-zipkin = { version = "0.15.0", default-features = false, features ] } opentelemetry-prometheus = "0.10.0" paste = "1.0.9" +pin-project-lite = "0.2.9" prometheus = "0.13" +rand = "0.8.5" rhai = { version = "1.9.1", features = ["sync", "serde", "internals"] } regex = "1.6.0" reqwest = { version = "0.11.11", default-features = false, features = [ @@ -165,9 +169,8 @@ tracing-subscriber = { version = "0.3.11", features = ["env-filter", "json"] } url = { version = "2.3.0", features = ["serde"] } urlencoding = "2.1.2" +uuid = { version = "1.1.2", features = ["serde", "v4"] } yaml-rust = "0.4.5" -pin-project-lite = "0.2.9" -mediatype = "0.19.9" [target.'cfg(macos)'.dependencies] uname = "0.1.1" @@ -176,7 +179,7 @@ uname = "0.1.1" uname = "0.1.1" [dev-dependencies] -insta = { version = "1.19.1", features = [ "json" ] } +insta = { version = "1.19.1", features = [ "json", "redactions" ] } jsonpath_lib = "0.3.0" maplit = "1.0.2" mockall = "0.11.2" @@ -199,8 +202,6 @@ tracing-subscriber = { version = "0.3", default-features = false, features = [ "fmt", ] } tracing-test = "0.2.2" -uuid = { version = 
"1.1.2", features = ["serde", "v4"] } -url = "2.3.0" walkdir = "2.3.2" [[test]] name = "integration_tests" diff --git a/apollo-router/src/axum_http_server_factory.rs b/apollo-router/src/axum_http_server_factory.rs index b5c5e0b391..34731dbe9c 100644 --- a/apollo-router/src/axum_http_server_factory.rs +++ b/apollo-router/src/axum_http_server_factory.rs @@ -170,7 +170,9 @@ where .layer( TraceLayer::new_for_http() .make_span_with(PropagatingMakeSpan::new()) - .on_response(|resp: &Response<_>, _duration: Duration, span: &Span| { + .on_response(|resp: &Response<_>, duration: Duration, span: &Span| { + // Duration here is instant based + span.record("apollo_private.duration_ns", &(duration.as_nanos() as i64)); if resp.status() >= StatusCode::BAD_REQUEST { span.record( "otel.status_code", @@ -821,6 +823,8 @@ impl PropagatingMakeSpan { impl MakeSpan for PropagatingMakeSpan { fn make_span(&mut self, request: &http::Request) -> Span { + // This method needs to be moved to the telemetry plugin once we have a hook for the http request. + // Before we make the span we need to attach span info that may have come in from the request. let context = global::get_text_map_propagator(|propagator| { propagator.extract(&opentelemetry_http::HeaderExtractor(request.headers())) @@ -838,7 +842,8 @@ impl MakeSpan for PropagatingMakeSpan { uri = %request.uri(), version = ?request.version(), "otel.kind" = %SpanKind::Server, - "otel.status_code" = %opentelemetry::trace::StatusCode::Unset.as_str() + "otel.status_code" = %opentelemetry::trace::StatusCode::Unset.as_str(), + "apollo_private.duration_ns" = tracing::field::Empty ) } else { // No remote span, we can go ahead and create the span without context. @@ -849,7 +854,8 @@ impl MakeSpan for PropagatingMakeSpan { uri = %request.uri(), version = ?request.version(), "otel.kind" = %SpanKind::Server, - "otel.status_code" = %opentelemetry::trace::StatusCode::Unset.as_str() + "otel.status_code" = %opentelemetry::trace::StatusCode::Unset.as_str(), + "apollo_private.duration_ns" = tracing::field::Empty ) } } diff --git a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap index 3d52988045..b6c04dff4a 100644 --- a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap +++ b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap @@ -468,19 +468,131 @@ expression: "&schema" "apollo": { "type": "object", "properties": { + "buffer_size": { + "description": "The buffer size for sending traces to Apollo. 
Increase this if you are experiencing lost traces.", + "default": 10000, + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, "client_name_header": { + "description": "The name of the header to extract from requests when populating 'client nane' for traces and metrics in Apollo Studio.", "default": "apollographql-client-name", "type": "string", "nullable": true }, "client_version_header": { + "description": "The name of the header to extract from requests when populating 'client version' for traces and metrics in Apollo Studio.", "default": "apollographql-client-version", "type": "string", "nullable": true }, "endpoint": { + "description": "The Apollo Studio endpoint for exporting traces and metrics.", "type": "string", "nullable": true + }, + "field_level_instrumentation_sampler": { + "description": "Enable field level instrumentation for subgraphs via ftv1. ftv1 tracing can cause performance issues as it is transmitted in band with subgraph responses. 0.0 will result in no field level instrumentation. 1.0 will result in always instrumentation. Value MUST be less than global sampling rate", + "anyOf": [ + { + "description": "Sample a given fraction. Fractions >= 1 will always sample.", + "type": "number", + "format": "double" + }, + { + "type": "string", + "enum": [ + "always_on", + "always_off" + ] + } + ], + "nullable": true + }, + "send_headers": { + "description": "To configure which request header names and values are included in trace data that's sent to Apollo Studio.", + "oneOf": [ + { + "type": "string", + "enum": [ + "none", + "all" + ] + }, + { + "type": "object", + "required": [ + "only" + ], + "properties": { + "only": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + }, + { + "type": "object", + "required": [ + "except" + ], + "properties": { + "except": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + } + ] + }, + "send_variable_values": { + "description": "To configure which GraphQL variable values are included in trace data that's sent to Apollo Studio", + "oneOf": [ + { + "type": "string", + "enum": [ + "none", + "all" + ] + }, + { + "type": "object", + "required": [ + "only" + ], + "properties": { + "only": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + }, + { + "type": "object", + "required": [ + "except" + ], + "properties": { + "except": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + } + ] } }, "additionalProperties": false, @@ -1813,7 +1925,7 @@ expression: "&schema" "sampler": { "anyOf": [ { - "description": "Sample a given fraction of traces. Fractions >= 1 will always sample. If the parent span is sampled, then it's child spans will automatically be sampled. Fractions < 0 are treated as zero, but spans may still be sampled if their parent is.", + "description": "Sample a given fraction. Fractions >= 1 will always sample.", "type": "number", "format": "double" }, diff --git a/apollo-router/src/context.rs b/apollo-router/src/context.rs index e4c02b4336..21c7f839b5 100644 --- a/apollo-router/src/context.rs +++ b/apollo-router/src/context.rs @@ -4,6 +4,7 @@ //! allows additional data to be passed back and forth along the request invocation pipeline. 
use std::sync::Arc; +use std::time::Instant; use dashmap::mapref::multiple::RefMulti; use dashmap::mapref::multiple::RefMutMulti; @@ -30,6 +31,9 @@ pub(crate) type Entries = Arc>; pub struct Context { // Allows adding custom entries to the context. entries: Entries, + + /// Creation time + pub(crate) created_at: Instant, } impl Context { @@ -37,6 +41,7 @@ impl Context { pub fn new() -> Self { Context { entries: Default::default(), + created_at: Instant::now(), } } } diff --git a/apollo-router/src/http_ext.rs b/apollo-router/src/http_ext.rs index 1c7b13c2b8..b17c2b4945 100644 --- a/apollo-router/src/http_ext.rs +++ b/apollo-router/src/http_ext.rs @@ -13,6 +13,7 @@ use axum::body::boxed; use axum::response::IntoResponse; use bytes::Bytes; use http::header; +use http::header::HeaderName; use http::HeaderValue; use multimap::MultiMap; @@ -260,11 +261,15 @@ impl Request { method: http::Method, body: T, ) -> http::Result> { - let mut req = http::request::Builder::new() - .method(method) - .uri(uri) - .body(body)?; - *req.headers_mut() = header_map(headers)?; + let mut builder = http::request::Builder::new().method(method).uri(uri); + for (key, values) in headers { + let header_name: HeaderName = key.try_into()?; + for value in values { + let header_value: HeaderValue = value.try_into()?; + builder = builder.header(header_name.clone(), header_value); + } + } + let req = builder.body(body)?; Ok(Self { inner: req }) } } diff --git a/apollo-router/src/plugin/serde.rs b/apollo-router/src/plugin/serde.rs index 5ef8012d1c..3f454a827c 100644 --- a/apollo-router/src/plugin/serde.rs +++ b/apollo-router/src/plugin/serde.rs @@ -11,6 +11,7 @@ use http::HeaderValue; use regex::Regex; use serde::de; use serde::de::Error; +use serde::de::SeqAccess; use serde::de::Visitor; use serde::Deserializer; @@ -46,6 +47,34 @@ where deserializer.deserialize_option(OptionHeaderNameVisitor) } +pub fn deserialize_vec_header_name<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + struct VecHeaderNameVisitor; + + impl<'de> Visitor<'de> for VecHeaderNameVisitor { + type Value = Vec; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("struct HeaderName") + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + let mut result = Vec::new(); + while let Some(element) = seq.next_element::()? { + let header_name = HeaderNameVisitor::default().visit_string(element)?; + result.push(header_name); + } + Ok(result) + } + } + deserializer.deserialize_seq(VecHeaderNameVisitor) +} + pub fn deserialize_option_header_value<'de, D>( deserializer: D, ) -> Result, D::Error> @@ -79,6 +108,7 @@ where deserializer.deserialize_option(OptionHeaderValueVisitor) } +#[derive(Default)] struct HeaderNameVisitor; impl<'de> Visitor<'de> for HeaderNameVisitor { diff --git a/apollo-router/src/plugins/telemetry/apollo.rs b/apollo-router/src/plugins/telemetry/apollo.rs index c4bb01f98a..c16031187b 100644 --- a/apollo-router/src/plugins/telemetry/apollo.rs +++ b/apollo-router/src/plugins/telemetry/apollo.rs @@ -1,26 +1,49 @@ //! Configuration for apollo telemetry. 
// This entire file is license key functionality +use std::collections::HashMap; +use std::ops::AddAssign; +use std::time::SystemTime; + +use apollo_spaceport::ReferencedFieldsForType; +use apollo_spaceport::ReportHeader; +use apollo_spaceport::StatsContext; +use apollo_spaceport::Trace; +use derivative::Derivative; use http::header::HeaderName; +use itertools::Itertools; use schemars::JsonSchema; use serde::Deserialize; +use serde::Serialize; use url::Url; +use super::metrics::apollo::studio::ContextualizedStats; +use super::metrics::apollo::studio::SingleStats; +use super::metrics::apollo::studio::SingleStatsReport; +use super::tracing::apollo::TracesReport; use crate::plugin::serde::deserialize_header_name; +use crate::plugin::serde::deserialize_vec_header_name; +use crate::plugins::telemetry::config::SamplerOption; -#[derive(Debug, Clone, Deserialize, JsonSchema)] +#[derive(Derivative)] +#[derivative(Debug)] +#[derive(Clone, Deserialize, JsonSchema)] #[serde(deny_unknown_fields)] pub(crate) struct Config { + /// The Apollo Studio endpoint for exporting traces and metrics. #[schemars(with = "Option")] pub(crate) endpoint: Option, + /// The Apollo Studio API key. #[schemars(skip)] #[serde(skip, default = "apollo_key")] pub(crate) apollo_key: Option, + /// The Apollo Studio graph reference. #[schemars(skip)] #[serde(skip, default = "apollo_graph_reference")] pub(crate) apollo_graph_ref: Option, + /// The name of the header to extract from requests when populating 'client nane' for traces and metrics in Apollo Studio. #[schemars(with = "Option", default = "client_name_header_default_str")] #[serde( deserialize_with = "deserialize_header_name", @@ -28,6 +51,7 @@ pub(crate) struct Config { )] pub(crate) client_name_header: HeaderName, + /// The name of the header to extract from requests when populating 'client version' for traces and metrics in Apollo Studio. #[schemars(with = "Option", default = "client_version_header_default_str")] #[serde( deserialize_with = "deserialize_header_name", @@ -35,6 +59,22 @@ pub(crate) struct Config { )] pub(crate) client_version_header: HeaderName, + /// The buffer size for sending traces to Apollo. Increase this if you are experiencing lost traces. + #[serde(default = "default_buffer_size")] + pub(crate) buffer_size: usize, + + /// Enable field level instrumentation for subgraphs via ftv1. ftv1 tracing can cause performance issues as it is transmitted in band with subgraph responses. + /// 0.0 will result in no field level instrumentation. 1.0 will result in always instrumentation. + /// Value MUST be less than global sampling rate + pub(crate) field_level_instrumentation_sampler: Option, + + /// To configure which request header names and values are included in trace data that's sent to Apollo Studio. + #[serde(default)] + pub(crate) send_headers: ForwardHeaders, + /// To configure which GraphQL variable values are included in trace data that's sent to Apollo Studio + #[serde(default)] + pub(crate) send_variable_values: ForwardValues, + // This'll get overridden if a user tries to set it. // The purpose is to allow is to pass this in to the plugin. 
#[schemars(skip)] @@ -65,6 +105,10 @@ fn client_version_header_default() -> HeaderName { HeaderName::from_static(client_version_header_default_str()) } +pub(crate) const fn default_buffer_size() -> usize { + 10000 +} + impl Default for Config { fn default() -> Self { Self { @@ -74,6 +118,160 @@ impl Default for Config { client_name_header: client_name_header_default(), client_version_header: client_version_header_default(), schema_id: "".to_string(), + buffer_size: default_buffer_size(), + field_level_instrumentation_sampler: Some(SamplerOption::TraceIdRatioBased(0.01)), + send_headers: ForwardHeaders::None, + send_variable_values: ForwardValues::None, } } } + +#[derive(Debug, Clone, Deserialize, JsonSchema)] +#[serde(deny_unknown_fields, rename_all = "snake_case")] +pub(crate) enum ForwardHeaders { + None, + All, + #[serde(deserialize_with = "deserialize_vec_header_name")] + #[schemars(with = "Vec")] + Only(Vec), + #[schemars(with = "Vec")] + #[serde(deserialize_with = "deserialize_vec_header_name")] + Except(Vec), +} + +impl Default for ForwardHeaders { + fn default() -> Self { + Self::None + } +} + +#[derive(Debug, Clone, Deserialize, JsonSchema)] +#[serde(deny_unknown_fields, rename_all = "snake_case")] +pub(crate) enum ForwardValues { + None, + All, + Only(Vec), + Except(Vec), +} + +impl Default for ForwardValues { + fn default() -> Self { + Self::None + } +} + +#[derive(Debug, Serialize)] +pub(crate) enum SingleReport { + Stats(SingleStatsReport), + Traces(TracesReport), +} + +#[derive(Default, Debug, Serialize)] +pub(crate) struct Report { + pub(crate) traces_per_query: HashMap, + pub(crate) operation_count: u64, +} + +impl Report { + #[cfg(test)] + pub(crate) fn new(reports: Vec) -> Report { + let mut aggregated_report = Report::default(); + for report in reports { + aggregated_report += report; + } + aggregated_report + } + + pub(crate) fn into_report(self, header: ReportHeader) -> apollo_spaceport::Report { + let mut report = apollo_spaceport::Report { + header: Some(header), + end_time: Some(SystemTime::now().into()), + operation_count: self.operation_count, + ..Default::default() + }; + + for (key, traces_and_stats) in self.traces_per_query { + report.traces_per_query.insert(key, traces_and_stats.into()); + } + report + } +} + +impl AddAssign for Report { + fn add_assign(&mut self, report: SingleReport) { + match report { + SingleReport::Stats(stats) => self.add_assign(stats), + SingleReport::Traces(traces) => self.add_assign(traces), + } + } +} + +impl AddAssign for Report { + fn add_assign(&mut self, report: TracesReport) { + self.operation_count += report.traces.len() as u64; + for (operation_signature, trace) in report.traces { + self.traces_per_query + .entry(operation_signature) + .or_default() + .traces + .push(trace); + } + } +} + +impl AddAssign for Report { + fn add_assign(&mut self, report: SingleStatsReport) { + for (k, v) in report.stats { + *self.traces_per_query.entry(k).or_default() += v; + } + + self.operation_count += report.operation_count; + } +} + +#[derive(Default, Debug, Serialize)] +pub(crate) struct TracesAndStats { + pub(crate) traces: Vec, + #[serde(with = "vectorize")] + pub(crate) stats_with_context: HashMap, + pub(crate) referenced_fields_by_type: HashMap, +} + +impl From for apollo_spaceport::TracesAndStats { + fn from(stats: TracesAndStats) -> Self { + Self { + stats_with_context: stats.stats_with_context.into_values().map_into().collect(), + referenced_fields_by_type: stats.referenced_fields_by_type, + trace: stats.traces, + ..Default::default() 
+ } + } +} + +impl AddAssign for TracesAndStats { + fn add_assign(&mut self, stats: SingleStats) { + *self + .stats_with_context + .entry(stats.stats_with_context.context.clone()) + .or_default() += stats.stats_with_context; + + // No merging required here because references fields by type will always be the same for each stats report key. + self.referenced_fields_by_type = stats.referenced_fields_by_type; + } +} + +pub(crate) mod vectorize { + use serde::Serialize; + use serde::Serializer; + + pub(crate) fn serialize<'a, T, K, V, S>(target: T, ser: S) -> Result + where + S: Serializer, + T: IntoIterator, + K: Serialize + 'a, + V: Serialize + 'a, + { + let container: Vec<_> = target.into_iter().collect(); + serde::Serialize::serialize(&container, ser) + } +} diff --git a/apollo-router/src/plugins/telemetry/apollo_exporter.rs b/apollo-router/src/plugins/telemetry/apollo_exporter.rs new file mode 100644 index 0000000000..32e65e2b3b --- /dev/null +++ b/apollo-router/src/plugins/telemetry/apollo_exporter.rs @@ -0,0 +1,207 @@ +//! Configuration for apollo telemetry exporter. +// This entire file is license key functionality +use std::time::Duration; + +use apollo_spaceport::ReportHeader; +use apollo_spaceport::Reporter; +use apollo_spaceport::ReporterError; +use async_trait::async_trait; +use deadpool::managed; +use deadpool::managed::Pool; +use deadpool::Runtime; +use futures::channel::mpsc; +use futures::stream::StreamExt; +use sys_info::hostname; +use tower::BoxError; +use url::Url; + +use super::apollo::Report; +use super::apollo::SingleReport; +// use crate::plugins::telemetry::apollo::ReportBuilder; + +const DEFAULT_QUEUE_SIZE: usize = 65_536; +// Do not set to 5 secs because it's also the default value for the BatchSpanProcesseur of tracing. +// It's less error prone to set a different value to let us compute traces and metrics +pub(crate) const EXPORTER_TIMEOUT_DURATION: Duration = Duration::from_secs(6); + +#[derive(Clone)] +pub(crate) enum Sender { + Noop, + Spaceport(mpsc::Sender), +} + +impl Sender { + pub(crate) fn send(&self, metrics: SingleReport) { + match &self { + Sender::Noop => {} + Sender::Spaceport(channel) => { + if let Err(err) = channel.to_owned().try_send(metrics) { + tracing::warn!( + "could not send metrics to spaceport, metric will be dropped: {}", + err + ); + } + } + } + } +} + +impl Default for Sender { + fn default() -> Self { + Sender::Noop + } +} + +pub(crate) struct ApolloExporter { + tx: mpsc::Sender, +} + +impl ApolloExporter { + pub(crate) fn new( + endpoint: &Url, + apollo_key: &str, + apollo_graph_ref: &str, + schema_id: &str, + ) -> Result { + let apollo_key = apollo_key.to_string(); + // Desired behavior: + // * Metrics are batched with a timeout. + // * If we cannot connect to spaceport metrics are discarded and a warning raised. + // * When the stream of metrics finishes we terminate the thread. + // * If the exporter is dropped the remaining records are flushed. 
+ let (tx, mut rx) = mpsc::channel::(DEFAULT_QUEUE_SIZE); + + let header = apollo_spaceport::ReportHeader { + graph_ref: apollo_graph_ref.to_string(), + hostname: hostname()?, + agent_version: format!( + "{}@{}", + std::env!("CARGO_PKG_NAME"), + std::env!("CARGO_PKG_VERSION") + ), + runtime_version: "rust".to_string(), + uname: get_uname()?, + executable_schema_id: schema_id.to_string(), + ..Default::default() + }; + + // Deadpool gives us connection pooling to spaceport + // It also significantly simplifies initialisation of the connection and gives us options in the future for configuring timeouts. + let pool = deadpool::managed::Pool::::builder(ReporterManager { + endpoint: endpoint.clone(), + }) + .create_timeout(Some(Duration::from_secs(5))) + .wait_timeout(Some(Duration::from_secs(5))) + .runtime(Runtime::Tokio1) + .build() + .unwrap(); + + // This is the thread that actually sends metrics + tokio::spawn(async move { + let timeout = tokio::time::interval(EXPORTER_TIMEOUT_DURATION); + let mut report = Report::default(); + + tokio::pin!(timeout); + + loop { + tokio::select! { + single_report = rx.next() => { + if let Some(r) = single_report { + report += r; + } else { + tracing::info!("terminating apollo exporter"); + break; + } + }, + _ = timeout.tick() => { + Self::send_report(&pool, &apollo_key, &header, std::mem::take(&mut report)).await; + } + }; + } + + Self::send_report(&pool, &apollo_key, &header, report).await; + }); + Ok(ApolloExporter { tx }) + } + + pub(crate) fn provider(&self) -> Sender { + Sender::Spaceport(self.tx.clone()) + } + + async fn send_report( + pool: &Pool, + apollo_key: &str, + header: &ReportHeader, + report: Report, + ) { + if report.operation_count == 0 && report.traces_per_query.is_empty() { + return; + } + + match pool.get().await { + Ok(mut reporter) => { + let report = report.into_report(header.clone()); + match reporter + .submit(apollo_spaceport::ReporterRequest { + apollo_key: apollo_key.to_string(), + report: Some(report), + }) + .await + { + Ok(_) => {} + Err(e) => { + tracing::warn!("failed to submit stats to spaceport: {}", e); + } + }; + } + Err(err) => { + tracing::warn!( + "stats discarded as unable to get connection to spaceport: {}", + err + ); + } + }; + } +} + +pub(crate) struct ReporterManager { + endpoint: Url, +} + +#[async_trait] +impl managed::Manager for ReporterManager { + type Type = Reporter; + type Error = ReporterError; + + async fn create(&self) -> Result { + let url = self.endpoint.to_string(); + Ok(Reporter::try_new(url).await?) 
+ } + + async fn recycle(&self, _r: &mut Reporter) -> managed::RecycleResult { + Ok(()) + } +} + +#[cfg(not(target_os = "windows"))] +pub(crate) fn get_uname() -> Result { + let u = uname::uname()?; + Ok(format!( + "{}, {}, {}, {}, {},", + u.sysname, u.nodename, u.release, u.version, u.machine + )) +} + +#[cfg(target_os = "windows")] +pub(crate) fn get_uname() -> Result { + // Best we can do on windows right now + let sysname = sys_info::os_type().unwrap_or_else(|_| "Windows".to_owned()); + let nodename = sys_info::hostname().unwrap_or_else(|_| "unknown".to_owned()); + let release = sys_info::os_release().unwrap_or_else(|_| "unknown".to_owned()); + let version = "unknown"; + let machine = "unknown"; + Ok(format!( + "{}, {}, {}, {}, {}", + sysname, nodename, release, version, machine + )) +} diff --git a/apollo-router/src/plugins/telemetry/config.rs b/apollo-router/src/plugins/telemetry/config.rs index 52f16b8089..0de11ef8f8 100644 --- a/apollo-router/src/plugins/telemetry/config.rs +++ b/apollo-router/src/plugins/telemetry/config.rs @@ -13,6 +13,12 @@ use super::metrics::MetricsAttributesConf; use super::*; use crate::plugins::telemetry::metrics; +#[derive(thiserror::Error, Debug)] +pub(crate) enum Error { + #[error("field level instrumentation sampler must sample less frequently than tracing level sampler")] + InvalidFieldLevelInstrumentationSampler, +} + pub(crate) trait GenericWith where Self: Sized, @@ -160,9 +166,7 @@ impl From for opentelemetry::Array { #[derive(Clone, Debug, Deserialize, JsonSchema)] #[serde(deny_unknown_fields, untagged)] pub(crate) enum SamplerOption { - /// Sample a given fraction of traces. Fractions >= 1 will always sample. If the parent span is - /// sampled, then it's child spans will automatically be sampled. Fractions < 0 are treated as - /// zero, but spans may still be sampled if their parent is. + /// Sample a given fraction. Fractions >= 1 will always sample. TraceIdRatioBased(f64), Always(Sampler), } @@ -170,9 +174,9 @@ pub(crate) enum SamplerOption { #[derive(Clone, Debug, Deserialize, JsonSchema)] #[serde(deny_unknown_fields, rename_all = "snake_case")] pub(crate) enum Sampler { - /// Always sample the trace + /// Always sample AlwaysOn, - /// Never sample the trace + /// Never sample AlwaysOff, } @@ -272,3 +276,56 @@ impl From<&Trace> for opentelemetry::sdk::trace::Config { fn parent_based(sampler: opentelemetry::sdk::trace::Sampler) -> opentelemetry::sdk::trace::Sampler { opentelemetry::sdk::trace::Sampler::ParentBased(Box::new(sampler)) } + +impl Conf { + pub(crate) fn calculate_field_level_instrumentation_ratio(&self) -> Result { + Ok( + match ( + self.tracing + .clone() + .unwrap_or_default() + .trace_config + .unwrap_or_default() + .sampler, + self.apollo + .clone() + .unwrap_or_default() + .field_level_instrumentation_sampler, + ) { + // Error conditions + ( + Some(SamplerOption::TraceIdRatioBased(global_ratio)), + Some(SamplerOption::TraceIdRatioBased(field_ratio)), + ) if field_ratio > global_ratio => { + Err(Error::InvalidFieldLevelInstrumentationSampler)? 
+ } + ( + Some(SamplerOption::Always(Sampler::AlwaysOff)), + Some(SamplerOption::Always(Sampler::AlwaysOn)), + ) => Err(Error::InvalidFieldLevelInstrumentationSampler)?, + ( + Some(SamplerOption::Always(Sampler::AlwaysOff)), + Some(SamplerOption::TraceIdRatioBased(ratio)), + ) if ratio != 0.0 => Err(Error::InvalidFieldLevelInstrumentationSampler)?, + ( + Some(SamplerOption::TraceIdRatioBased(ratio)), + Some(SamplerOption::Always(Sampler::AlwaysOn)), + ) if ratio != 1.0 => Err(Error::InvalidFieldLevelInstrumentationSampler)?, + + // Happy paths + (_, Some(SamplerOption::TraceIdRatioBased(ratio))) if ratio == 0.0 => 0.0, + (Some(SamplerOption::TraceIdRatioBased(ratio)), _) if ratio == 0.0 => 0.0, + (_, Some(SamplerOption::Always(Sampler::AlwaysOn))) => 1.0, + ( + Some(SamplerOption::TraceIdRatioBased(global_ratio)), + Some(SamplerOption::TraceIdRatioBased(field_ratio)), + ) => field_ratio / global_ratio, + ( + Some(SamplerOption::Always(Sampler::AlwaysOn)), + Some(SamplerOption::TraceIdRatioBased(field_ratio)), + ) => field_ratio, + (_, _) => 0.0, + }, + ) + } +} diff --git a/apollo-router/src/plugins/telemetry/metrics/apollo.rs b/apollo-router/src/plugins/telemetry/metrics/apollo.rs index c192b1eaab..77d47d87ef 100644 --- a/apollo-router/src/plugins/telemetry/metrics/apollo.rs +++ b/apollo-router/src/plugins/telemetry/metrics/apollo.rs @@ -2,24 +2,11 @@ // With regards to ELv2 licensing, this entire file is license key functionality use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; -use std::time::Duration; -use apollo_spaceport::ReportHeader; -use apollo_spaceport::Reporter; -use apollo_spaceport::ReporterError; -use async_trait::async_trait; -use deadpool::managed; -use deadpool::managed::Pool; -use deadpool::Runtime; -use futures::channel::mpsc; -use futures::stream::StreamExt; -use studio::Report; -use studio::SingleReport; -use sys_info::hostname; use tower::BoxError; -use url::Url; use crate::plugins::telemetry::apollo::Config; +use crate::plugins::telemetry::apollo_exporter::ApolloExporter; use crate::plugins::telemetry::config::MetricsCommon; use crate::plugins::telemetry::metrics::MetricsBuilder; use crate::plugins::telemetry::metrics::MetricsConfigurator; @@ -27,36 +14,6 @@ use crate::plugins::telemetry::metrics::MetricsConfigurator; mod duration_histogram; pub(crate) mod studio; -const DEFAULT_QUEUE_SIZE: usize = 65_536; - -#[derive(Clone)] -pub(crate) enum Sender { - Noop, - Spaceport(mpsc::Sender), -} - -impl Sender { - pub(crate) fn send(&self, metrics: SingleReport) { - match &self { - Sender::Noop => {} - Sender::Spaceport(channel) => { - if let Err(err) = channel.to_owned().try_send(metrics) { - tracing::warn!( - "could not send metrics to spaceport, metric will be dropped: {}", - err - ); - } - } - } - } -} - -impl Default for Sender { - fn default() -> Self { - Sender::Noop - } -} - impl MetricsConfigurator for Config { fn apply( &self, @@ -76,7 +33,7 @@ impl MetricsConfigurator for Config { if !ENABLED.swap(true, Ordering::Relaxed) { tracing::info!("Apollo Studio usage reporting is enabled. 
See https://go.apollo.dev/o/data for details"); } - let exporter = ApolloMetricsExporter::new(endpoint, key, reference, schema_id)?; + let exporter = ApolloExporter::new(endpoint, key, reference, schema_id)?; builder .with_apollo_metrics_collector(exporter.provider()) @@ -90,170 +47,23 @@ impl MetricsConfigurator for Config { } } -#[cfg(not(target_os = "windows"))] -fn get_uname() -> Result { - let u = uname::uname()?; - Ok(format!( - "{}, {}, {}, {}, {},", - u.sysname, u.nodename, u.release, u.version, u.machine - )) -} - -#[cfg(target_os = "windows")] -fn get_uname() -> Result { - // Best we can do on windows right now - let sysname = sys_info::os_type().unwrap_or_else(|_| "Windows".to_owned()); - let nodename = sys_info::hostname().unwrap_or_else(|_| "unknown".to_owned()); - let release = sys_info::os_release().unwrap_or_else(|_| "unknown".to_owned()); - let version = "unknown"; - let machine = "unknown"; - Ok(format!( - "{}, {}, {}, {}, {}", - sysname, nodename, release, version, machine - )) -} - -struct ApolloMetricsExporter { - tx: mpsc::Sender, -} - -impl ApolloMetricsExporter { - fn new( - endpoint: &Url, - apollo_key: &str, - apollo_graph_ref: &str, - schema_id: &str, - ) -> Result { - let apollo_key = apollo_key.to_string(); - // Desired behavior: - // * Metrics are batched with a timeout. - // * If we cannot connect to spaceport metrics are discarded and a warning raised. - // * When the stream of metrics finishes we terminate the thread. - // * If the exporter is dropped the remaining records are flushed. - let (tx, mut rx) = mpsc::channel::(DEFAULT_QUEUE_SIZE); - - let header = apollo_spaceport::ReportHeader { - graph_ref: apollo_graph_ref.to_string(), - hostname: hostname()?, - agent_version: format!( - "{}@{}", - std::env!("CARGO_PKG_NAME"), - std::env!("CARGO_PKG_VERSION") - ), - runtime_version: "rust".to_string(), - uname: get_uname()?, - executable_schema_id: schema_id.to_string(), - ..Default::default() - }; - - // Deadpool gives us connection pooling to spaceport - // It also significantly simplifies initialisation of the connection and gives us options in the future for configuring timeouts. - let pool = deadpool::managed::Pool::::builder(ReporterManager { - endpoint: endpoint.clone(), - }) - .create_timeout(Some(Duration::from_secs(5))) - .wait_timeout(Some(Duration::from_secs(5))) - .runtime(Runtime::Tokio1) - .build() - .unwrap(); - - // This is the thread that actually sends metrics - tokio::spawn(async move { - let timeout = tokio::time::interval(Duration::from_secs(5)); - let mut report = Report::default(); - tokio::pin!(timeout); - - loop { - tokio::select! 
{ - single_report = rx.next() => { - if let Some(r) = single_report { - report += r; - } else { - break; - } - }, - _ = timeout.tick() => { - Self::send_report(&pool, &apollo_key, &header, std::mem::take(&mut report)).await; - } - }; - } - - Self::send_report(&pool, &apollo_key, &header, report).await; - }); - Ok(ApolloMetricsExporter { tx }) - } - - pub(crate) fn provider(&self) -> Sender { - Sender::Spaceport(self.tx.clone()) - } - - async fn send_report( - pool: &Pool, - apollo_key: &str, - header: &ReportHeader, - report: Report, - ) { - if report.operation_count == 0 { - return; - } - - match pool.get().await { - Ok(mut reporter) => { - let report = report.into_report(header.clone()); - match reporter - .submit(apollo_spaceport::ReporterRequest { - apollo_key: apollo_key.to_string(), - report: Some(report), - }) - .await - { - Ok(_) => {} - Err(e) => { - tracing::warn!("failed to submit stats to spaceport: {}", e); - } - }; - } - Err(err) => { - tracing::warn!( - "stats discarded as unable to get connection to spaceport: {}", - err - ); - } - }; - } -} - -pub(crate) struct ReporterManager { - endpoint: Url, -} - -#[async_trait] -impl managed::Manager for ReporterManager { - type Type = Reporter; - type Error = ReporterError; - - async fn create(&self) -> Result { - let url = self.endpoint.to_string(); - Ok(Reporter::try_new(url).await?) - } - - async fn recycle(&self, _r: &mut Reporter) -> managed::RecycleResult { - Ok(()) - } -} - #[cfg(test)] mod test { use std::future::Future; + use std::time::Duration; + use futures::stream::StreamExt; use http::header::HeaderName; use tower::ServiceExt; use super::super::super::config; + use super::studio::SingleStatsReport; use super::*; use crate::plugin::Plugin; use crate::plugin::PluginInit; use crate::plugins::telemetry::apollo; + use crate::plugins::telemetry::apollo::default_buffer_size; + use crate::plugins::telemetry::apollo_exporter::Sender; use crate::plugins::telemetry::Telemetry; use crate::plugins::telemetry::STUDIO_EXCLUDE; use crate::Context; @@ -268,7 +78,9 @@ mod test { apollo_graph_ref: None, client_name_header: HeaderName::from_static("name_header"), client_version_header: HeaderName::from_static("version_header"), + buffer_size: default_buffer_size(), schema_id: "schema_sha".to_string(), + ..Default::default() }) .await?; assert!(matches!(plugin.apollo_metrics_sender, Sender::Noop)); @@ -286,7 +98,10 @@ mod test { async fn apollo_metrics_single_operation() -> Result<(), BoxError> { let query = "query {topProducts{name}}"; let results = get_metrics_for_request(query, None, None).await?; - insta::with_settings!({sort_maps => true}, { + let mut settings = insta::Settings::clone_current(); + settings.set_sort_maps(true); + settings.add_redaction("[].request_id", "[REDACTED]"); + settings.bind(|| { insta::assert_json_snapshot!(results); }); Ok(()) @@ -296,7 +111,10 @@ mod test { async fn apollo_metrics_multiple_operations() -> Result<(), BoxError> { let query = "query {topProducts{name}} query {topProducts{name}}"; let results = get_metrics_for_request(query, None, None).await?; - insta::with_settings!({sort_maps => true}, { + let mut settings = insta::Settings::clone_current(); + settings.set_sort_maps(true); + settings.add_redaction("[].request_id", "[REDACTED]"); + settings.bind(|| { insta::assert_json_snapshot!(results); }); Ok(()) @@ -306,7 +124,10 @@ mod test { async fn apollo_metrics_parse_failure() -> Result<(), BoxError> { let query = "garbage"; let results = get_metrics_for_request(query, None, None).await?; - 
insta::with_settings!({sort_maps => true}, { + let mut settings = insta::Settings::clone_current(); + settings.set_sort_maps(true); + settings.add_redaction("[].request_id", "[REDACTED]"); + settings.bind(|| { insta::assert_json_snapshot!(results); }); Ok(()) @@ -316,9 +137,10 @@ mod test { async fn apollo_metrics_unknown_operation() -> Result<(), BoxError> { let query = "query {topProducts{name}}"; let results = get_metrics_for_request(query, Some("UNKNOWN"), None).await?; - insta::with_settings!({sort_maps => true}, { - insta::assert_json_snapshot!(results); - }); + let mut settings = insta::Settings::clone_current(); + settings.set_sort_maps(true); + settings.add_redaction("[].request_id", "[REDACTED]"); + settings.bind(|| insta::assert_json_snapshot!(results)); Ok(()) } @@ -326,7 +148,10 @@ mod test { async fn apollo_metrics_validation_failure() -> Result<(), BoxError> { let query = "query {topProducts{unknown}}"; let results = get_metrics_for_request(query, None, None).await?; - insta::with_settings!({sort_maps => true}, { + let mut settings = insta::Settings::clone_current(); + settings.set_sort_maps(true); + settings.add_redaction("[].request_id", "[REDACTED]"); + settings.bind(|| { insta::assert_json_snapshot!(results); }); @@ -339,7 +164,10 @@ mod test { let context = Context::new(); context.insert(STUDIO_EXCLUDE, true)?; let results = get_metrics_for_request(query, None, Some(context)).await?; - insta::with_settings!({sort_maps => true}, { + let mut settings = insta::Settings::clone_current(); + settings.set_sort_maps(true); + settings.add_redaction("[].request_id", "[REDACTED]"); + settings.bind(|| { insta::assert_json_snapshot!(results); }); @@ -350,7 +178,7 @@ mod test { query: &str, operation_name: Option<&str>, context: Option, - ) -> Result, BoxError> { + ) -> Result, BoxError> { let _ = tracing_subscriber::fmt::try_init(); let mut plugin = create_plugin().await?; // Replace the apollo metrics sender so we can test metrics collection. 
@@ -375,15 +203,19 @@ mod test { .await .unwrap(); + let default_latency = Duration::from_millis(100); let results = rx .collect::>() .await .into_iter() - .map(|mut m| { - m.traces_and_stats.iter_mut().for_each(|(_k, v)| { - v.stats_with_context.query_latency_stats.latency = Duration::from_millis(100) - }); - m + .filter_map(|m| match m { + apollo::SingleReport::Stats(mut m) => { + m.stats.iter_mut().for_each(|(_k, v)| { + v.stats_with_context.query_latency_stats.latency = default_latency + }); + Some(m) + } + apollo::SingleReport::Traces(_) => None, }) .collect(); Ok(results) @@ -396,7 +228,9 @@ mod test { apollo_graph_ref: Some("ref".to_string()), client_name_header: HeaderName::from_static("name_header"), client_version_header: HeaderName::from_static("version_header"), + buffer_size: default_buffer_size(), schema_id: "schema_sha".to_string(), + ..Default::default() }) } diff --git a/apollo-router/src/plugins/telemetry/metrics/apollo/snapshots/apollo_router__plugins__telemetry__metrics__apollo__studio__test__aggregation.snap b/apollo-router/src/plugins/telemetry/metrics/apollo/snapshots/apollo_router__plugins__telemetry__metrics__apollo__studio__test__aggregation.snap index 5496dd50f4..1733a5147c 100644 --- a/apollo-router/src/plugins/telemetry/metrics/apollo/snapshots/apollo_router__plugins__telemetry__metrics__apollo__studio__test__aggregation.snap +++ b/apollo-router/src/plugins/telemetry/metrics/apollo/snapshots/apollo_router__plugins__telemetry__metrics__apollo__studio__test__aggregation.snap @@ -1,10 +1,12 @@ --- source: apollo-router/src/plugins/telemetry/metrics/apollo/studio.rs +assertion_line: 264 expression: aggregated_metrics --- { "traces_per_query": { "report_key_1": { + "traces": [], "stats_with_context": [ [ { diff --git a/apollo-router/src/plugins/telemetry/metrics/apollo/studio.rs b/apollo-router/src/plugins/telemetry/metrics/apollo/studio.rs index 3fab826a70..c6243f93ab 100644 --- a/apollo-router/src/plugins/telemetry/metrics/apollo/studio.rs +++ b/apollo-router/src/plugins/telemetry/metrics/apollo/studio.rs @@ -1,53 +1,46 @@ use std::collections::HashMap; +use std::ops::Add; use std::ops::AddAssign; use std::time::Duration; -use std::time::SystemTime; use apollo_spaceport::ReferencedFieldsForType; -use apollo_spaceport::ReportHeader; use apollo_spaceport::StatsContext; -use itertools::Itertools; use serde::Serialize; +use uuid::Uuid; use super::duration_histogram::DurationHistogram; -impl Report { - #[cfg(test)] - fn new(reports: Vec) -> Report { - let mut aggregated_report = Report::default(); - for report in reports { - aggregated_report += report; - } - aggregated_report - } - - pub(crate) fn into_report(self, header: ReportHeader) -> apollo_spaceport::Report { - let mut report = apollo_spaceport::Report { - header: Some(header), - end_time: Some(SystemTime::now().into()), - operation_count: self.operation_count, - ..Default::default() - }; - - for (key, traces_and_stats) in self.traces_per_query { - report.traces_per_query.insert(key, traces_and_stats.into()); - } - report - } -} - #[derive(Default, Debug, Serialize)] -pub(crate) struct SingleReport { - pub(crate) traces_and_stats: HashMap, +pub(crate) struct SingleStatsReport { + pub(crate) request_id: Uuid, + pub(crate) stats: HashMap, pub(crate) operation_count: u64, } #[derive(Default, Debug, Serialize)] -pub(crate) struct SingleTracesAndStats { +pub(crate) struct SingleStats { pub(crate) stats_with_context: SingleContextualizedStats, pub(crate) referenced_fields_by_type: HashMap, } +#[derive(Default, Debug, 
Serialize)] +pub(crate) struct Stats { + pub(crate) stats_with_context: ContextualizedStats, + pub(crate) referenced_fields_by_type: HashMap, +} + +impl Add for SingleStats { + type Output = Stats; + + fn add(self, rhs: SingleStats) -> Self::Output { + Stats { + stats_with_context: self.stats_with_context + rhs.stats_with_context, + // No merging required here because references fields by type will always be the same for each stats report key. + referenced_fields_by_type: rhs.referenced_fields_by_type, + } + } +} + #[derive(Default, Debug, Serialize)] pub(crate) struct SingleContextualizedStats { pub(crate) context: StatsContext, @@ -55,6 +48,18 @@ pub(crate) struct SingleContextualizedStats { pub(crate) per_type_stat: HashMap, } +impl Add for SingleContextualizedStats { + type Output = ContextualizedStats; + + fn add(self, stats: SingleContextualizedStats) -> Self::Output { + let mut res = ContextualizedStats::default(); + res += self; + res += stats; + + res + } +} + // TODO Make some of these fields bool #[derive(Default, Debug, Serialize)] pub(crate) struct SingleQueryLatencyStats { @@ -71,6 +76,17 @@ pub(crate) struct SingleQueryLatencyStats { pub(crate) without_field_instrumentation: bool, } +impl Add for SingleQueryLatencyStats { + type Output = QueryLatencyStats; + fn add(self, stats: SingleQueryLatencyStats) -> Self::Output { + let mut res = QueryLatencyStats::default(); + res += self; + res += stats; + + res + } +} + #[derive(Default, Debug, Serialize)] pub(crate) struct SinglePathErrorStats { pub(crate) children: HashMap, @@ -92,41 +108,6 @@ pub(crate) struct SingleFieldStat { pub(crate) latency: Duration, } -#[derive(Default, Serialize)] -pub(crate) struct Report { - traces_per_query: HashMap, - pub(crate) operation_count: u64, -} - -impl AddAssign for Report { - fn add_assign(&mut self, report: SingleReport) { - for (k, v) in report.traces_and_stats { - *self.traces_per_query.entry(k).or_default() += v; - } - - self.operation_count += report.operation_count; - } -} - -#[derive(Default, Debug, Serialize)] -pub(crate) struct TracesAndStats { - #[serde(with = "vectorize")] - pub(crate) stats_with_context: HashMap, - pub(crate) referenced_fields_by_type: HashMap, -} - -impl AddAssign for TracesAndStats { - fn add_assign(&mut self, stats: SingleTracesAndStats) { - *self - .stats_with_context - .entry(stats.stats_with_context.context.clone()) - .or_default() += stats.stats_with_context; - - // No merging required here because references fields by type will always be the same for each stats report key. 
- self.referenced_fields_by_type = stats.referenced_fields_by_type; - } -} - #[derive(Default, Debug, Serialize)] pub(crate) struct ContextualizedStats { context: StatsContext, @@ -244,16 +225,6 @@ impl From for apollo_spaceport::ContextualizedStats { } } -impl From for apollo_spaceport::TracesAndStats { - fn from(stats: TracesAndStats) -> Self { - Self { - stats_with_context: stats.stats_with_context.into_values().map_into().collect(), - referenced_fields_by_type: stats.referenced_fields_by_type, - ..Default::default() - } - } -} - impl From for apollo_spaceport::QueryLatencyStats { fn from(stats: QueryLatencyStats) -> Self { Self { @@ -313,22 +284,6 @@ impl From for apollo_spaceport::FieldStat { } } -pub(crate) mod vectorize { - use serde::Serialize; - use serde::Serializer; - - pub(crate) fn serialize<'a, T, K, V, S>(target: T, ser: S) -> Result - where - S: Serializer, - T: IntoIterator, - K: Serialize + 'a, - V: Serialize + 'a, - { - let container: Vec<_> = target.into_iter().collect(); - serde::Serialize::serialize(&container, ser) - } -} - #[cfg(test)] mod test { use std::collections::HashMap; @@ -337,6 +292,7 @@ mod test { use apollo_spaceport::ReferencedFieldsForType; use super::*; + use crate::plugins::telemetry::apollo::Report; #[test] fn test_aggregation() { @@ -377,17 +333,18 @@ mod test { client_name: &str, client_version: &str, stats_report_key: &str, - ) -> SingleReport { + ) -> SingleStatsReport { // This makes me sad. Really this should have just been a case of generate a couple of metrics using // a prop testing library and then assert that things got merged OK. But in practise everything was too hard to use let mut count = Count::default(); - SingleReport { + SingleStatsReport { + request_id: Uuid::default(), operation_count: count.inc_u64(), - traces_and_stats: HashMap::from([( + stats: HashMap::from([( stats_report_key.to_string(), - SingleTracesAndStats { + SingleStats { stats_with_context: SingleContextualizedStats { context: StatsContext { client_name: client_name.to_string(), diff --git a/apollo-router/src/plugins/telemetry/metrics/mod.rs b/apollo-router/src/plugins/telemetry/metrics/mod.rs index ca41e3dc8c..dd9b937348 100644 --- a/apollo-router/src/plugins/telemetry/metrics/mod.rs +++ b/apollo-router/src/plugins/telemetry/metrics/mod.rs @@ -26,8 +26,8 @@ use crate::graphql::Request; use crate::plugin::serde::deserialize_header_name; use crate::plugin::serde::deserialize_json_query; use crate::plugin::serde::deserialize_regex; +use crate::plugins::telemetry::apollo_exporter::Sender; use crate::plugins::telemetry::config::MetricsCommon; -use crate::plugins::telemetry::metrics::apollo::Sender; use crate::router_factory::Endpoint; use crate::Context; use crate::ListenAddr; @@ -469,7 +469,7 @@ impl MetricsBuilder { } pub(crate) fn apollo_metrics_provider(&mut self) -> Sender { - std::mem::take(&mut self.apollo_metrics) + self.apollo_metrics.clone() } } diff --git a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_exclude.snap b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_exclude.snap index 978ae96461..c0b851333c 100644 --- a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_exclude.snap +++ b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_exclude.snap @@ 
-1,11 +1,12 @@ --- source: apollo-router/src/plugins/telemetry/metrics/apollo.rs -assertion_line: 332 +assertion_line: 170 expression: results --- [ { - "traces_and_stats": {}, + "request_id": "[REDACTED]", + "stats": {}, "operation_count": 1 } ] diff --git a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_multiple_operations.snap b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_multiple_operations.snap index 234b082917..fcc0ca9827 100644 --- a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_multiple_operations.snap +++ b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_multiple_operations.snap @@ -1,10 +1,12 @@ --- source: apollo-router/src/plugins/telemetry/metrics/apollo.rs +assertion_line: 117 expression: results --- [ { - "traces_and_stats": { + "request_id": "[REDACTED]", + "stats": { "## GraphQLValidationFailure\n": { "stats_with_context": { "context": { diff --git a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_parse_failure.snap b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_parse_failure.snap index 39c1f34f1c..ffc74f55e6 100644 --- a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_parse_failure.snap +++ b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_parse_failure.snap @@ -1,10 +1,12 @@ --- source: apollo-router/src/plugins/telemetry/metrics/apollo.rs +assertion_line: 130 expression: results --- [ { - "traces_and_stats": { + "request_id": "[REDACTED]", + "stats": { "## GraphQLParseFailure\n": { "stats_with_context": { "context": { diff --git a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_single_operation.snap b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_single_operation.snap index b88e28606f..abe4703d48 100644 --- a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_single_operation.snap +++ b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_single_operation.snap @@ -1,11 +1,12 @@ --- source: apollo-router/src/plugins/telemetry/metrics/apollo.rs -assertion_line: 279 +assertion_line: 104 expression: results --- [ { - "traces_and_stats": { + "request_id": "[REDACTED]", + "stats": { "# -\n{topProducts{name}}": { "stats_with_context": { "context": { diff --git a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_unknown_operation.snap b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_unknown_operation.snap index 5f256987bc..47a1850bc7 100644 --- 
a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_unknown_operation.snap +++ b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_unknown_operation.snap @@ -1,10 +1,12 @@ --- source: apollo-router/src/plugins/telemetry/metrics/apollo.rs +assertion_line: 142 expression: results --- [ { - "traces_and_stats": { + "request_id": "[REDACTED]", + "stats": { "## GraphQLUnknownOperationName\n": { "stats_with_context": { "context": { diff --git a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_validation_failure.snap b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_validation_failure.snap index 234b082917..69384135a0 100644 --- a/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_validation_failure.snap +++ b/apollo-router/src/plugins/telemetry/metrics/snapshots/apollo_router__plugins__telemetry__metrics__apollo__test__apollo_metrics_validation_failure.snap @@ -1,10 +1,12 @@ --- source: apollo-router/src/plugins/telemetry/metrics/apollo.rs +assertion_line: 154 expression: results --- [ { - "traces_and_stats": { + "request_id": "[REDACTED]", + "stats": { "## GraphQLValidationFailure\n": { "stats_with_context": { "context": { diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 1beb177927..3021b5a1c2 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -1,7 +1,8 @@ //! Telemetry plugin. 
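//!
//! Wires OpenTelemetry metrics and tracing into the router's request pipeline and, when an
//! Apollo key and graph ref are configured, also reports usage statistics and federated
//! (ftv1) traces to Apollo Studio.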
// With regards to ELv2 licensing, this entire file is license key functionality +use std::collections::BTreeMap; use std::collections::HashMap; -use std::error::Error; +use std::error::Error as Errors; use std::fmt; use std::sync::atomic::AtomicU8; use std::sync::atomic::Ordering; @@ -9,6 +10,7 @@ use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use ::tracing::field; use ::tracing::info_span; use ::tracing::subscriber::set_global_default; use ::tracing::Span; @@ -20,8 +22,9 @@ use futures::future::BoxFuture; use futures::stream::once; use futures::FutureExt; use futures::StreamExt; +use http::header; +use http::HeaderMap; use http::HeaderValue; -use metrics::apollo::Sender; use multimap::MultiMap; use once_cell::sync::OnceCell; use opentelemetry::global; @@ -31,18 +34,27 @@ use opentelemetry::sdk::propagation::TextMapCompositePropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; use opentelemetry::sdk::trace::Builder; use opentelemetry::trace::SpanKind; +use opentelemetry::trace::TraceContextExt; use opentelemetry::trace::TracerProvider; use opentelemetry::KeyValue; +use rand::Rng; use router_bridge::planner::UsageReporting; +use serde_json_bytes::ByteString; +use serde_json_bytes::Map; +use serde_json_bytes::Value; use tower::BoxError; use tower::ServiceBuilder; use tower::ServiceExt; +use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; use tracing_subscriber::registry::LookupSpan; use tracing_subscriber::EnvFilter; use tracing_subscriber::Registry; use url::Url; +use self::apollo::ForwardValues; +use self::apollo::SingleReport; +use self::apollo_exporter::Sender; use self::config::Conf; use self::metrics::AttributesForwardConf; use self::metrics::MetricsAttributesConf; @@ -50,12 +62,13 @@ use crate::executable::GLOBAL_ENV_FILTER; use crate::layers::ServiceBuilderExt; use crate::plugin::Plugin; use crate::plugin::PluginInit; +use crate::plugins::telemetry::apollo::ForwardHeaders; use crate::plugins::telemetry::config::MetricsCommon; use crate::plugins::telemetry::config::Trace; use crate::plugins::telemetry::metrics::apollo::studio::SingleContextualizedStats; use crate::plugins::telemetry::metrics::apollo::studio::SingleQueryLatencyStats; -use crate::plugins::telemetry::metrics::apollo::studio::SingleReport; -use crate::plugins::telemetry::metrics::apollo::studio::SingleTracesAndStats; +use crate::plugins::telemetry::metrics::apollo::studio::SingleStats; +use crate::plugins::telemetry::metrics::apollo::studio::SingleStatsReport; use crate::plugins::telemetry::metrics::AggregateMeterProvider; use crate::plugins::telemetry::metrics::BasicMetrics; use crate::plugins::telemetry::metrics::MetricsBuilder; @@ -68,6 +81,8 @@ use crate::router_factory::Endpoint; use crate::services::execution; use crate::services::subgraph; use crate::services::supergraph; +use crate::subgraph::Request; +use crate::subgraph::Response; use crate::Context; use crate::ExecutionRequest; use crate::ListenAddr; @@ -77,17 +92,20 @@ use crate::SupergraphRequest; use crate::SupergraphResponse; pub(crate) mod apollo; +pub(crate) mod apollo_exporter; pub(crate) mod config; mod metrics; mod otlp; mod tracing; - -static SUPERGRAPH_SPAN_NAME: &str = "supergraph"; -static CLIENT_NAME: &str = "apollo_telemetry::client_name"; -static CLIENT_VERSION: &str = "apollo_telemetry::client_version"; +pub(crate) const REQUEST_SPAN_NAME: &str = "request"; +pub(crate) const SUPERGRAPH_SPAN_NAME: &str = "supergraph"; +pub(crate) const 
SUBGRAPH_SPAN_NAME: &str = "subgraph"; +const CLIENT_NAME: &str = "apollo_telemetry::client_name"; +const CLIENT_VERSION: &str = "apollo_telemetry::client_version"; const ATTRIBUTES: &str = "apollo_telemetry::metrics_attributes"; const SUBGRAPH_ATTRIBUTES: &str = "apollo_telemetry::subgraph_metrics_attributes"; -pub(crate) static STUDIO_EXCLUDE: &str = "apollo_telemetry::studio::exclude"; +pub(crate) const STUDIO_EXCLUDE: &str = "apollo_telemetry::studio::exclude"; +pub(crate) const FTV1_DO_NOT_SAMPLE: &str = "apollo_telemetry::studio::ftv1_do_not_sample"; const DEFAULT_SERVICE_NAME: &str = "apollo-router"; static TELEMETRY_LOADED: OnceCell = OnceCell::new(); @@ -103,7 +121,8 @@ pub struct Telemetry { meter_provider: AggregateMeterProvider, custom_endpoints: MultiMap, spaceport_shutdown: Option>, - apollo_metrics_sender: metrics::apollo::Sender, + apollo_metrics_sender: apollo_exporter::Sender, + field_level_instrumentation_ratio: f64, } #[derive(Debug)] @@ -170,8 +189,21 @@ impl Plugin for Telemetry { let config_map_res = config.clone(); ServiceBuilder::new() .instrument(Self::supergraph_service_span( + self.field_level_instrumentation_ratio, config.apollo.clone().unwrap_or_default(), )) + .map_response(|resp: SupergraphResponse| { + if let Ok(Some(usage_reporting)) = + resp.context.get::<_, UsageReporting>(USAGE_REPORTING) + { + // Record the operation signature on the router span + Span::current().record( + "apollo_private.operation_signature", + &usage_reporting.stats_report_key.as_str(), + ); + } + resp + }) .map_future_with_request_data( move |req: &SupergraphRequest| { Self::populate_context(config.clone(), req); @@ -184,7 +216,7 @@ impl Plugin for Telemetry { let start = Instant::now(); async move { let mut result: Result = fut.await; - result = Self::update_metrics( + result = Self::update_otel_metrics( config.clone(), ctx.clone(), metrics.clone(), @@ -192,66 +224,9 @@ impl Plugin for Telemetry { start.elapsed(), ) .await; - match result { - Err(e) => { - if !matches!(sender, Sender::Noop) { - Self::update_apollo_metrics( - &ctx, - sender, - true, - start.elapsed(), - ); - } - let mut metric_attrs = Vec::new(); - // Fill attributes from error - if let Some(subgraph_attributes_conf) = config - .metrics - .as_ref() - .and_then(|m| m.common.as_ref()) - .and_then(|c| c.attributes.as_ref()) - .and_then(|c| c.router.as_ref()) - { - metric_attrs.extend( - subgraph_attributes_conf - .get_attributes_from_error(&e) - .into_iter() - .map(|(k, v)| KeyValue::new(k, v)), - ); - } - - metrics.http_requests_error_total.add(1, &metric_attrs); - - Err(e) - } - Ok(router_response) => { - let mut has_errors = - !router_response.response.status().is_success(); - Ok(router_response.map(move |response_stream| { - let sender = sender.clone(); - let ctx = ctx.clone(); - - response_stream - .map(move |response| { - if !response.errors.is_empty() { - has_errors = true; - } - - if !response.has_next.unwrap_or(false) - && !matches!(sender, Sender::Noop) - { - Self::update_apollo_metrics( - &ctx, - sender.clone(), - has_errors, - start.elapsed(), - ); - } - response - }) - .boxed() - })) - } - } + Self::update_metrics_on_last_response( + &ctx, config, metrics, sender, start, result, + ) } }, ) @@ -262,6 +237,13 @@ impl Plugin for Telemetry { fn execution_service(&self, service: execution::BoxService) -> execution::BoxService { ServiceBuilder::new() .instrument(move |req: &ExecutionRequest| { + // disable ftv1 sampling for deferred queries + let do_not_sample_reason = if 
req.query_plan.root.contains_condition_or_defer() { + req.context.insert(FTV1_DO_NOT_SAMPLE, true).unwrap(); + "query is deferred" + } else { + "" + }; let query = req .supergraph_request .body() @@ -277,7 +259,8 @@ impl Plugin for Telemetry { info_span!("execution", graphql.document = query.as_str(), graphql.operation.name = operation_name.as_str(), - "otel.kind" = %SpanKind::Internal + "otel.kind" = %SpanKind::Internal, + ftv1.do_not_sample_reason = do_not_sample_reason ) }) .service(service) @@ -287,72 +270,10 @@ impl Plugin for Telemetry { fn subgraph_service(&self, name: &str, service: subgraph::BoxService) -> subgraph::BoxService { let metrics = BasicMetrics::new(&self.meter_provider); let subgraph_attribute = KeyValue::new("subgraph", name.to_string()); + let subgraph_metrics_conf_req = self.create_subgraph_metrics_conf(name); + let subgraph_metrics_conf_resp = subgraph_metrics_conf_req.clone(); let name = name.to_owned(); - let subgraph_metrics = Arc::new( - self.config - .metrics - .as_ref() - .and_then(|m| m.common.as_ref()) - .and_then(|c| c.attributes.as_ref()) - .and_then(|c| c.subgraph.as_ref()) - .map(|subgraph_cfg| { - macro_rules! extend_config { - ($forward_kind: ident) => {{ - let mut cfg = subgraph_cfg - .all - .as_ref() - .and_then(|a| a.$forward_kind.clone()) - .unwrap_or_default(); - if let Some(subgraphs) = &subgraph_cfg.subgraphs { - cfg.extend( - subgraphs - .get(&name) - .and_then(|s| s.$forward_kind.clone()) - .unwrap_or_default(), - ); - } - - cfg - }}; - } - macro_rules! merge_config { - ($forward_kind: ident) => {{ - let mut cfg = subgraph_cfg - .all - .as_ref() - .and_then(|a| a.$forward_kind.clone()) - .unwrap_or_default(); - if let Some(subgraphs) = &subgraph_cfg.subgraphs { - cfg.merge( - subgraphs - .get(&name) - .and_then(|s| s.$forward_kind.clone()) - .unwrap_or_default(), - ); - } - - cfg - }}; - } - let insert = extend_config!(insert); - let context = extend_config!(context); - let request = merge_config!(request); - let response = merge_config!(response); - let errors = merge_config!(errors); - - AttributesForwardConf { - insert: (!insert.is_empty()).then(|| insert), - request: (request.header.is_some() || request.body.is_some()) - .then(|| request), - response: (response.header.is_some() || response.body.is_some()) - .then(|| response), - errors: (errors.extensions.is_some() || errors.include_messages) - .then(|| errors), - context: (!context.is_empty()).then(|| context), - } - }), - ); - let subgraph_metrics_conf = subgraph_metrics.clone(); + let apollo_handler = self.apollo_handler(); ServiceBuilder::new() .instrument(move |req: &SubgraphRequest| { let query = req @@ -368,107 +289,41 @@ impl Plugin for Telemetry { .clone() .unwrap_or_default(); - info_span!("subgraph", + info_span!(SUBGRAPH_SPAN_NAME, name = name.as_str(), graphql.document = query.as_str(), graphql.operation.name = operation_name.as_str(), "otel.kind" = %SpanKind::Internal, + "apollo_private.ftv1" = field::Empty ) }) + .map_request(move |req| apollo_handler.request_ftv1(req)) + .map_response(move |resp| apollo_handler.store_ftv1(resp)) .map_future_with_request_data( move |sub_request: &SubgraphRequest| { - let subgraph_metrics_conf = subgraph_metrics_conf.clone(); - let mut attributes = HashMap::new(); - if let Some(subgraph_attributes_conf) = &*subgraph_metrics_conf { - attributes.extend(subgraph_attributes_conf.get_attributes_from_request( - sub_request.subgraph_request.headers(), - sub_request.subgraph_request.body(), - )); - attributes.extend( - subgraph_attributes_conf - 
.get_attributes_from_context(&sub_request.context), - ); - } - sub_request - .context - .insert(SUBGRAPH_ATTRIBUTES, attributes) - .unwrap(); - + Self::store_subgraph_request_attributes( + subgraph_metrics_conf_req.clone(), + sub_request, + ); sub_request.context.clone() }, move |context: Context, f: BoxFuture<'static, Result>| { let metrics = metrics.clone(); let subgraph_attribute = subgraph_attribute.clone(); - let subgraph_metrics = subgraph_metrics.clone(); + let subgraph_metrics_conf = subgraph_metrics_conf_resp.clone(); // Using Instant because it is guaranteed to be monotonically increasing. let now = Instant::now(); - f.map(move |r: Result| { - let subgraph_metrics_conf = subgraph_metrics.clone(); - let mut metric_attrs = context - .get::<_, HashMap>(SUBGRAPH_ATTRIBUTES) - .ok() - .flatten() - .map(|attrs| { - attrs - .into_iter() - .map(|(attr_name, attr_value)| { - KeyValue::new(attr_name, attr_value) - }) - .collect::>() - }) - .unwrap_or_default(); - metric_attrs.push(subgraph_attribute.clone()); - // Fill attributes from context - if let Some(subgraph_attributes_conf) = &*subgraph_metrics_conf { - metric_attrs.extend( - subgraph_attributes_conf - .get_attributes_from_context(&context) - .into_iter() - .map(|(k, v)| KeyValue::new(k, v)), - ); - } - - match &r { - Ok(response) => { - metric_attrs.push(KeyValue::new( - "status", - response.response.status().as_u16().to_string(), - )); - - // Fill attributes from response - if let Some(subgraph_attributes_conf) = &*subgraph_metrics_conf { - metric_attrs.extend( - subgraph_attributes_conf - .get_attributes_from_response( - response.response.headers(), - response.response.body(), - ) - .into_iter() - .map(|(k, v)| KeyValue::new(k, v)), - ); - } - - metrics.http_requests_total.add(1, &metric_attrs); - } - Err(err) => { - // Fill attributes from error - if let Some(subgraph_attributes_conf) = &*subgraph_metrics_conf { - metric_attrs.extend( - subgraph_attributes_conf - .get_attributes_from_error(err) - .into_iter() - .map(|(k, v)| KeyValue::new(k, v)), - ); - } - - metrics.http_requests_error_total.add(1, &metric_attrs); - } - } - metrics - .http_requests_duration - .record(now.elapsed().as_secs_f64(), &metric_attrs); - r + f.map(move |result: Result| { + Self::store_subgraph_response_attributes( + &context, + metrics, + subgraph_attribute, + subgraph_metrics_conf, + now, + &result, + ); + result }) }, ) @@ -589,12 +444,16 @@ impl Telemetry { Ok(true) })?; + let field_level_instrumentation_ratio = + config.calculate_field_level_instrumentation_ratio()?; + let plugin = Ok(Telemetry { spaceport_shutdown: shutdown_tx, custom_endpoints: builder.custom_endpoints(), _metrics_exporters: builder.exporters(), meter_provider: builder.meter_provider(), apollo_metrics_sender: builder.apollo_metrics_provider(), + field_level_instrumentation_ratio, config, }); @@ -663,9 +522,8 @@ impl Telemetry { builder = setup_tracing(builder, &tracing_config.zipkin, trace_config)?; builder = setup_tracing(builder, &tracing_config.datadog, trace_config)?; builder = setup_tracing(builder, &tracing_config.otlp, trace_config)?; - // TODO Apollo tracing at some point in the future. - // This is the shell of what was previously used to transmit metrics, but will in future be useful for sending traces. 
- // builder = setup_tracing(builder, &config.apollo, trace_config)?; + builder = setup_tracing(builder, &config.apollo, trace_config)?; + let tracer_provider = builder.build(); Ok(tracer_provider) } @@ -707,11 +565,9 @@ impl Telemetry { } fn supergraph_service_span( + field_level_instrumentation_ratio: f64, config: apollo::Config, ) -> impl Fn(&SupergraphRequest) -> Span + Clone { - let client_name_header = config.client_name_header; - let client_version_header = config.client_version_header; - move |request: &SupergraphRequest| { let http_request = &request.supergraph_request; let headers = http_request.headers(); @@ -722,95 +578,129 @@ impl Telemetry { .clone() .unwrap_or_default(); let client_name = headers - .get(&client_name_header) + .get(&config.client_name_header) .cloned() .unwrap_or_else(|| HeaderValue::from_static("")); let client_version = headers - .get(&client_version_header) + .get(&config.client_version_header) .cloned() .unwrap_or_else(|| HeaderValue::from_static("")); + let span = info_span!( SUPERGRAPH_SPAN_NAME, graphql.document = query.as_str(), // TODO add graphql.operation.type graphql.operation.name = operation_name.as_str(), - client_name = client_name.to_str().unwrap_or_default(), - client_version = client_version.to_str().unwrap_or_default(), - "otel.kind" = %SpanKind::Internal + client.name = client_name.to_str().unwrap_or_default(), + client.version = client_version.to_str().unwrap_or_default(), + otel.kind = %SpanKind::Internal, + apollo_private.field_level_instrumentation_ratio = field_level_instrumentation_ratio, + apollo_private.operation_signature = field::Empty, + apollo_private.graphql.variables = field::Empty, + apollo_private.http.request_headers = field::Empty ); + + if is_span_sampled(&request.context) { + span.record( + "apollo_private.graphql.variables", + &Self::filter_variables_values( + &request.supergraph_request.body().variables, + &config.send_variable_values, + ) + .as_str(), + ); + span.record( + "apollo_private.http.request_headers", + &Self::filter_headers( + request.supergraph_request.headers(), + &config.send_headers, + ) + .as_str(), + ); + } + span } } - fn update_apollo_metrics( - context: &Context, - sender: Sender, - has_errors: bool, - duration: Duration, - ) { - let metrics = if let Some(usage_reporting) = context - .get::<_, UsageReporting>(USAGE_REPORTING) - .unwrap_or_default() - { - let operation_count = operation_count(&usage_reporting.stats_report_key); - let persisted_query_hit = context - .get::<_, bool>("persisted_query_hit") - .unwrap_or_default(); - - if context - .get(STUDIO_EXCLUDE) - .map_or(false, |x| x.unwrap_or_default()) - { - // The request was excluded don't report the details, but do report the operation count - SingleReport { - operation_count, - ..Default::default() - } - } else { - metrics::apollo::studio::SingleReport { - operation_count, - traces_and_stats: HashMap::from([( - usage_reporting.stats_report_key.to_string(), - SingleTracesAndStats { - stats_with_context: SingleContextualizedStats { - context: StatsContext { - client_name: context - .get(CLIENT_NAME) - .unwrap_or_default() - .unwrap_or_default(), - client_version: context - .get(CLIENT_VERSION) - .unwrap_or_default() - .unwrap_or_default(), - }, - query_latency_stats: SingleQueryLatencyStats { - latency: duration, - has_errors, - persisted_query_hit, - ..Default::default() - }, - ..Default::default() - }, - referenced_fields_by_type: usage_reporting - .referenced_fields_by_type - .into_iter() - .map(|(k, v)| (k, convert(v))) - .collect(), - 
}, - )]), + fn filter_headers(headers: &HeaderMap, forward_rules: &ForwardHeaders) -> String { + let headers_map = headers + .iter() + .filter(|(name, _value)| { + name != &header::AUTHORIZATION + && name != &header::COOKIE + && name != &header::SET_COOKIE + }) + .map(|(name, value)| { + if match &forward_rules { + ForwardHeaders::None => false, + ForwardHeaders::All => true, + ForwardHeaders::Only(only) => only.contains(name), + ForwardHeaders::Except(except) => !except.contains(name), + } { + ( + name.to_string(), + value.to_str().unwrap_or("").to_string(), + ) + } else { + (name.to_string(), "".to_string()) } + }) + .fold(BTreeMap::new(), |mut acc, (name, value)| { + acc.entry(name).or_insert_with(Vec::new).push(value); + acc + }); + + match serde_json::to_string(&headers_map) { + Ok(result) => result, + Err(_err) => { + ::tracing::warn!( + "could not serialize header, trace will not have header information" + ); + Default::default() } - } else { - // Usage reporting was missing, so it counts as one operation. - SingleReport { - operation_count: 1, - ..Default::default() + } + } + + fn filter_variables_values( + variables: &Map, + forward_rules: &ForwardValues, + ) -> String { + #[allow(clippy::mutable_key_type)] // False positive lint + let variables = variables + .iter() + .map(|(name, value)| { + if match &forward_rules { + ForwardValues::None => false, + ForwardValues::All => true, + ForwardValues::Only(only) => only.contains(&name.as_str().to_string()), + ForwardValues::Except(except) => !except.contains(&name.as_str().to_string()), + } { + ( + name, + serde_json::to_string(value).unwrap_or_else(|_| "".to_string()), + ) + } else { + (name, "".to_string()) + } + }) + .fold(BTreeMap::new(), |mut acc, (name, value)| { + acc.entry(name).or_insert_with(Vec::new).push(value); + acc + }); + + match serde_json::to_string(&variables) { + Ok(result) => result, + Err(_err) => { + ::tracing::warn!( + "could not serialize variables, trace will not have variables information" + ); + Default::default() } - }; - sender.send(metrics); + } } - async fn update_metrics( + async fn update_otel_metrics( config: Arc, context: Context, metrics: BasicMetrics, @@ -932,6 +822,318 @@ impl Telemetry { let _ = context.insert(ATTRIBUTES, attributes); } } + + fn apollo_handler(&self) -> ApolloFtv1Handler { + let mut rng = rand::thread_rng(); + + if rng.gen_ratio((self.field_level_instrumentation_ratio * 100.0) as u32, 100) { + ApolloFtv1Handler::Enabled + } else { + ApolloFtv1Handler::Disabled + } + } + + fn create_subgraph_metrics_conf(&self, name: &str) -> Arc> { + Arc::new( + self.config + .metrics + .as_ref() + .and_then(|m| m.common.as_ref()) + .and_then(|c| c.attributes.as_ref()) + .and_then(|c| c.subgraph.as_ref()) + .map(|subgraph_cfg| { + macro_rules! extend_config { + ($forward_kind: ident) => {{ + let mut cfg = subgraph_cfg + .all + .as_ref() + .and_then(|a| a.$forward_kind.clone()) + .unwrap_or_default(); + if let Some(subgraphs) = &subgraph_cfg.subgraphs { + cfg.extend( + subgraphs + .get(&name.to_owned()) + .and_then(|s| s.$forward_kind.clone()) + .unwrap_or_default(), + ); + } + + cfg + }}; + } + macro_rules! 
merge_config { + ($forward_kind: ident) => {{ + let mut cfg = subgraph_cfg + .all + .as_ref() + .and_then(|a| a.$forward_kind.clone()) + .unwrap_or_default(); + if let Some(subgraphs) = &subgraph_cfg.subgraphs { + cfg.merge( + subgraphs + .get(&name.to_owned()) + .and_then(|s| s.$forward_kind.clone()) + .unwrap_or_default(), + ); + } + + cfg + }}; + } + let insert = extend_config!(insert); + let context = extend_config!(context); + let request = merge_config!(request); + let response = merge_config!(response); + let errors = merge_config!(errors); + + AttributesForwardConf { + insert: (!insert.is_empty()).then(|| insert), + request: (request.header.is_some() || request.body.is_some()) + .then(|| request), + response: (response.header.is_some() || response.body.is_some()) + .then(|| response), + errors: (errors.extensions.is_some() || errors.include_messages) + .then(|| errors), + context: (!context.is_empty()).then(|| context), + } + }), + ) + } + + fn store_subgraph_request_attributes( + attribute_forward_config: Arc>, + sub_request: &Request, + ) { + let mut attributes = HashMap::new(); + if let Some(subgraph_attributes_conf) = &*attribute_forward_config { + attributes.extend(subgraph_attributes_conf.get_attributes_from_request( + sub_request.subgraph_request.headers(), + sub_request.subgraph_request.body(), + )); + attributes + .extend(subgraph_attributes_conf.get_attributes_from_context(&sub_request.context)); + } + sub_request + .context + .insert(SUBGRAPH_ATTRIBUTES, attributes) + .unwrap(); + } + + fn store_subgraph_response_attributes( + context: &Context, + metrics: BasicMetrics, + subgraph_attribute: KeyValue, + attribute_forward_config: Arc>, + now: Instant, + result: &Result, + ) { + let mut metric_attrs = context + .get::<_, HashMap>(SUBGRAPH_ATTRIBUTES) + .ok() + .flatten() + .map(|attrs| { + attrs + .into_iter() + .map(|(attr_name, attr_value)| KeyValue::new(attr_name, attr_value)) + .collect::>() + }) + .unwrap_or_default(); + metric_attrs.push(subgraph_attribute); + // Fill attributes from context + if let Some(subgraph_attributes_conf) = &*attribute_forward_config { + metric_attrs.extend( + subgraph_attributes_conf + .get_attributes_from_context(context) + .into_iter() + .map(|(k, v)| KeyValue::new(k, v)), + ); + } + + match &result { + Ok(response) => { + metric_attrs.push(KeyValue::new( + "status", + response.response.status().as_u16().to_string(), + )); + + // Fill attributes from response + if let Some(subgraph_attributes_conf) = &*attribute_forward_config { + metric_attrs.extend( + subgraph_attributes_conf + .get_attributes_from_response( + response.response.headers(), + response.response.body(), + ) + .into_iter() + .map(|(k, v)| KeyValue::new(k, v)), + ); + } + + metrics.http_requests_total.add(1, &metric_attrs); + } + Err(err) => { + // Fill attributes from error + if let Some(subgraph_attributes_conf) = &*attribute_forward_config { + metric_attrs.extend( + subgraph_attributes_conf + .get_attributes_from_error(err) + .into_iter() + .map(|(k, v)| KeyValue::new(k, v)), + ); + } + + metrics.http_requests_error_total.add(1, &metric_attrs); + } + } + metrics + .http_requests_duration + .record(now.elapsed().as_secs_f64(), &metric_attrs); + } + + #[allow(clippy::too_many_arguments)] + fn update_metrics_on_last_response( + ctx: &Context, + config: Arc, + metrics: BasicMetrics, + sender: Sender, + start: Instant, + result: Result, + ) -> Result { + match result { + Err(e) => { + if !matches!(sender, Sender::Noop) { + Self::update_apollo_metrics(ctx, sender, true, 
start.elapsed()); + } + let mut metric_attrs = Vec::new(); + // Fill attributes from error + if let Some(subgraph_attributes_conf) = config + .metrics + .as_ref() + .and_then(|m| m.common.as_ref()) + .and_then(|c| c.attributes.as_ref()) + .and_then(|c| c.router.as_ref()) + { + metric_attrs.extend( + subgraph_attributes_conf + .get_attributes_from_error(&e) + .into_iter() + .map(|(k, v)| KeyValue::new(k, v)), + ); + } + + metrics.http_requests_error_total.add(1, &metric_attrs); + + Err(e) + } + Ok(router_response) => { + let mut has_errors = !router_response.response.status().is_success(); + Ok(router_response.map(move |response_stream| { + let sender = sender.clone(); + let ctx = ctx.clone(); + + response_stream + .map(move |response| { + if !response.errors.is_empty() { + has_errors = true; + } + + if !response.has_next.unwrap_or(false) + && !matches!(sender, Sender::Noop) + { + Self::update_apollo_metrics( + &ctx, + sender.clone(), + has_errors, + start.elapsed(), + ); + } + response + }) + .boxed() + })) + } + } + } + + fn update_apollo_metrics( + context: &Context, + sender: Sender, + has_errors: bool, + duration: Duration, + ) { + if is_span_sampled(context) { + ::tracing::trace!("span is sampled then skip the apollo metrics"); + return; + } + let metrics = if let Some(usage_reporting) = context + .get::<_, UsageReporting>(USAGE_REPORTING) + .unwrap_or_default() + { + let operation_count = operation_count(&usage_reporting.stats_report_key); + let persisted_query_hit = context + .get::<_, bool>("persisted_query_hit") + .unwrap_or_default(); + + if context + .get(STUDIO_EXCLUDE) + .map_or(false, |x| x.unwrap_or_default()) + { + // The request was excluded don't report the details, but do report the operation count + SingleStatsReport { + operation_count, + ..Default::default() + } + } else { + SingleStatsReport { + request_id: uuid::Uuid::from_bytes( + Span::current() + .context() + .span() + .span_context() + .trace_id() + .to_bytes(), + ), + operation_count, + stats: HashMap::from([( + usage_reporting.stats_report_key.to_string(), + SingleStats { + stats_with_context: SingleContextualizedStats { + context: StatsContext { + client_name: context + .get(CLIENT_NAME) + .unwrap_or_default() + .unwrap_or_default(), + client_version: context + .get(CLIENT_VERSION) + .unwrap_or_default() + .unwrap_or_default(), + }, + query_latency_stats: SingleQueryLatencyStats { + latency: duration, + has_errors, + persisted_query_hit, + ..Default::default() + }, + ..Default::default() + }, + referenced_fields_by_type: usage_reporting + .referenced_fields_by_type + .into_iter() + .map(|(k, v)| (k, convert(v))) + .collect(), + }, + )]), + } + } + } else { + // Usage reporting was missing, so it counts as one operation. + SingleStatsReport { + operation_count: 1, + ..Default::default() + } + }; + sender.send(SingleReport::Stats(metrics)); + } } // Planner errors return stats report key that start with `## ` @@ -967,8 +1169,51 @@ fn handle_error>(err: T) { } } +pub(crate) fn is_span_sampled(context: &Context) -> bool { + Span::current().context().span().span_context().is_sampled() + && !context + .get(FTV1_DO_NOT_SAMPLE) + .unwrap_or_default() + .unwrap_or(false) +} + register_plugin!("apollo", "telemetry", Telemetry); +/// This enum is a partial cleanup of the telemetry plugin logic. 
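/// When `Enabled` and the current span is sampled, subgraph requests carry the
/// `apollo-federation-include-trace: ftv1` header; the base64-encoded trace returned in the
/// response `extensions` is then recorded on the subgraph span as `apollo_private.ftv1` so
/// the Apollo exporter can attach it to the reported trace. `Disabled` leaves requests and
/// responses untouched.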
+/// +#[derive(Copy, Clone)] +enum ApolloFtv1Handler { + Enabled, + Disabled, +} + +impl ApolloFtv1Handler { + fn request_ftv1(&self, mut req: SubgraphRequest) -> SubgraphRequest { + if let ApolloFtv1Handler::Enabled = self { + if is_span_sampled(&req.context) { + req.subgraph_request.headers_mut().insert( + "apollo-federation-include-trace", + HeaderValue::from_static("ftv1"), + ); + } + } + req + } + + fn store_ftv1(&self, resp: SubgraphResponse) -> SubgraphResponse { + // Stash the FTV1 data + if let ApolloFtv1Handler::Enabled = self { + if let Some(serde_json_bytes::Value::String(ftv1)) = + resp.response.body().extensions.get("ftv1") + { + // Record the ftv1 trace for processing later + Span::current().record("apollo_private.ftv1", &ftv1.as_str()); + } + } + resp + } +} + // // Please ensure that any tests added to the tests module use the tokio multi-threaded test executor. // diff --git a/apollo-router/src/plugins/telemetry/tracing/apollo.rs b/apollo-router/src/plugins/telemetry/tracing/apollo.rs index 857dc1badc..ffc3b0d503 100644 --- a/apollo-router/src/plugins/telemetry/tracing/apollo.rs +++ b/apollo-router/src/plugins/telemetry/tracing/apollo.rs @@ -1,39 +1,49 @@ //! Tracing configuration for apollo telemetry. // With regards to ELv2 licensing, this entire file is license key functionality +use apollo_spaceport::Trace; use opentelemetry::sdk::trace::Builder; +use serde::Serialize; use tower::BoxError; use crate::plugins::telemetry::apollo::Config; -use crate::plugins::telemetry::config::Trace; +use crate::plugins::telemetry::config; use crate::plugins::telemetry::tracing::apollo_telemetry; -use crate::plugins::telemetry::tracing::apollo_telemetry::SpaceportConfig; -use crate::plugins::telemetry::tracing::apollo_telemetry::StudioGraph; use crate::plugins::telemetry::tracing::TracingConfigurator; impl TracingConfigurator for Config { - fn apply(&self, builder: Builder, trace_config: &Trace) -> Result { + fn apply(&self, builder: Builder, trace_config: &config::Trace) -> Result { tracing::debug!("configuring Apollo tracing"); Ok(match self { Config { endpoint: Some(endpoint), apollo_key: Some(key), apollo_graph_ref: Some(reference), + schema_id, + buffer_size, + field_level_instrumentation_sampler, .. 
} => { - tracing::debug!("configuring exporter to Spaceport"); - let exporter = apollo_telemetry::new_pipeline() - .with_trace_config(trace_config.into()) - .with_graph_config(&Some(StudioGraph { - reference: reference.clone(), - key: key.clone(), - })) - .with_spaceport_config(&Some(SpaceportConfig { - collector: endpoint.to_string(), - })) - .build_exporter()?; + tracing::debug!("configuring exporter to Studio"); + + let exporter = apollo_telemetry::Exporter::builder() + .trace_config(trace_config.clone()) + .endpoint(endpoint.clone()) + .apollo_key(key) + .apollo_graph_ref(reference) + .schema_id(schema_id) + .buffer_size(*buffer_size) + .and_field_execution_sampler(field_level_instrumentation_sampler.clone()) + .build()?; builder.with_batch_exporter(exporter, opentelemetry::runtime::Tokio) } _ => builder, }) } } + +// List of signature and trace by request_id +#[derive(Default, Debug, Serialize)] +pub(crate) struct TracesReport { + // signature and trace + pub(crate) traces: Vec<(String, Trace)>, +} diff --git a/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs b/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs index 301abf049f..996f051ef1 100644 --- a/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs +++ b/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs @@ -1,311 +1,493 @@ -//! # Apollo-Telemetry Span Exporter -//! -//! The apollo-telemetry [`SpanExporter`] sends [`Report`]s to its configured -//! [`Reporter`] instance. By default it will write to the Apollo Ingress. -//! -//! [`SpanExporter`]: SpanExporter -//! [`Span`]: crate::trace::Span -//! [`Report`]: apollo_spaceport::report::Report -//! [`Reporter`]: apollo_spaceport::Reporter -//! -//! # Examples -//! -//! ```ignore -//! use apollo_router::apollo_telemetry; -//! use opentelemetry::trace::Tracer; -//! use opentelemetry::global::shutdown_tracer_provider; -//! -//! fn main() { -//! let tracer = apollo_telemetry::new_pipeline() -//! .install_simple(); -//! -//! tracer.in_span("doing_work", |cx| { -//! // Traced app logic here... -//! }); -//! -//! shutdown_tracer_provider(); // sending remaining spans -//! } -//! 
``` +use std::borrow::Cow; use std::collections::HashMap; -use std::fmt::Debug; -use std::str::FromStr; - -use apollo_spaceport::Reporter; +use std::io::Cursor; +use std::time::SystemTimeError; + +use apollo_spaceport::trace::http::Values; +use apollo_spaceport::trace::query_plan_node::FetchNode; +use apollo_spaceport::trace::query_plan_node::FlattenNode; +use apollo_spaceport::trace::query_plan_node::ParallelNode; +use apollo_spaceport::trace::query_plan_node::ResponsePathElement; +use apollo_spaceport::trace::query_plan_node::SequenceNode; +use apollo_spaceport::trace::Details; +use apollo_spaceport::trace::Http; +use apollo_spaceport::trace::QueryPlanNode; +use apollo_spaceport::Message; use async_trait::async_trait; use derivative::Derivative; -use opentelemetry::global; -use opentelemetry::runtime::Tokio; -use opentelemetry::sdk; +use lru::LruCache; use opentelemetry::sdk::export::trace::ExportResult; use opentelemetry::sdk::export::trace::SpanData; use opentelemetry::sdk::export::trace::SpanExporter; -use opentelemetry::sdk::export::ExportError; -use opentelemetry::trace::TracerProvider; -use schemars::JsonSchema; -use serde::Deserialize; -use serde::Serialize; -use tokio::task::JoinError; - -const DEFAULT_SERVER_URL: &str = "https://127.0.0.1:50051"; - -pub(crate) fn default_collector() -> String { - DEFAULT_SERVER_URL.to_string() -} - -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)] -#[serde(deny_unknown_fields, rename_all = "snake_case")] -pub(crate) struct SpaceportConfig { - #[serde(default = "default_collector")] - pub(crate) collector: String, +use opentelemetry::trace::SpanId; +use opentelemetry::Key; +use opentelemetry::Value; +use thiserror::Error; +use url::Url; + +use crate::plugins::telemetry::apollo::SingleReport; +use crate::plugins::telemetry::apollo_exporter::ApolloExporter; +use crate::plugins::telemetry::apollo_exporter::Sender; +use crate::plugins::telemetry::config; +use crate::plugins::telemetry::config::Sampler; +use crate::plugins::telemetry::config::SamplerOption; +use crate::plugins::telemetry::tracing::apollo::TracesReport; +use crate::plugins::telemetry::BoxError; +use crate::plugins::telemetry::REQUEST_SPAN_NAME; +use crate::plugins::telemetry::SUBGRAPH_SPAN_NAME; +use crate::plugins::telemetry::SUPERGRAPH_SPAN_NAME; +use crate::query_planner::FETCH_SPAN_NAME; +use crate::query_planner::FLATTEN_SPAN_NAME; +use crate::query_planner::PARALLEL_SPAN_NAME; +use crate::query_planner::SEQUENCE_SPAN_NAME; + +const APOLLO_PRIVATE_DURATION_NS: Key = Key::from_static_str("apollo_private.duration_ns"); +const APOLLO_PRIVATE_SENT_TIME_OFFSET: Key = + Key::from_static_str("apollo_private.sent_time_offset"); +const APOLLO_PRIVATE_GRAPHQL_VARIABLES: Key = + Key::from_static_str("apollo_private.graphql.variables"); +const APOLLO_PRIVATE_HTTP_REQUEST_HEADERS: Key = + Key::from_static_str("apollo_private.http.request_headers"); +const APOLLO_PRIVATE_OPERATION_SIGNATURE: Key = + Key::from_static_str("apollo_private.operation_signature"); +const APOLLO_PRIVATE_FTV1: Key = Key::from_static_str("apollo_private.ftv1"); +const APOLLO_PRIVATE_PATH: Key = Key::from_static_str("apollo_private.path"); +const FTV1_DO_NOT_SAMPLE_REASON: Key = Key::from_static_str("ftv1.do_not_sample_reason"); +const SERVICE_NAME: Key = Key::from_static_str("service.name"); +const CLIENT_NAME: Key = Key::from_static_str("client.name"); +const CLIENT_VERSION: Key = Key::from_static_str("client.version"); +const HTTP_METHOD: Key = Key::from_static_str("http.method"); + +#[derive(Error, Debug)] 
+pub(crate) enum Error { + #[error("subgraph protobuf decode error")] + ProtobufDecode(#[from] apollo_spaceport::DecodeError), + + #[error("subgraph trace payload was not base64")] + Base64Decode(#[from] base64::DecodeError), + + #[error("ftv1 span attribute should have been a string")] + Ftv1SpanAttribute, + + #[error("there were multiple tracing errors")] + MultipleErrors(Vec), + + #[error("duration could not be calculated")] + SystemTime(#[from] SystemTimeError), + + #[error("this trace should not be sampled")] + DoNotSample(Cow<'static, str>), } -#[allow(dead_code)] -#[derive(Clone, Derivative, Deserialize, Serialize, JsonSchema)] +/// A [`SpanExporter`] that writes to [`Reporter`]. +/// +/// [`SpanExporter`]: super::SpanExporter +/// [`Reporter`]: apollo_spaceport::Reporter +#[derive(Derivative)] #[derivative(Debug)] -pub(crate) struct StudioGraph { - #[serde(skip, default = "apollo_graph_reference")] - pub(crate) reference: String, - - #[serde(skip, default = "apollo_key")] +pub(crate) struct Exporter { + trace_config: config::Trace, + spans_by_parent_id: LruCache>, + endpoint: Url, + schema_id: String, #[derivative(Debug = "ignore")] - pub(crate) key: String, -} - -fn apollo_key() -> String { - std::env::var("APOLLO_KEY") - .expect("cannot set up usage reporting if the APOLLO_KEY environment variable is not set") -} - -fn apollo_graph_reference() -> String { - std::env::var("APOLLO_GRAPH_REF").expect( - "cannot set up usage reporting if the APOLLO_GRAPH_REF environment variable is not set", - ) -} - -impl Default for SpaceportConfig { - fn default() -> Self { - Self { - collector: default_collector(), - } - } -} -/// Pipeline builder -#[derive(Debug)] -pub(crate) struct PipelineBuilder { - graph_config: Option, - spaceport_config: Option, - trace_config: Option, + apollo_sender: Sender, + field_execution_weight: f64, } -/// Create a new apollo telemetry exporter pipeline builder. -pub(crate) fn new_pipeline() -> PipelineBuilder { - PipelineBuilder::default() +enum TreeData { + Request(Result, Error>), + Supergraph { + http: Http, + client_name: Option, + client_version: Option, + operation_signature: String, + }, + QueryPlan(QueryPlanNode), + Trace(Result>, Error>), } -impl Default for PipelineBuilder { - /// Return the default pipeline builder. - fn default() -> Self { - Self { - graph_config: None, - spaceport_config: None, - trace_config: None, - } - } -} - -#[allow(dead_code)] -impl PipelineBuilder { - const DEFAULT_BATCH_SIZE: usize = 65_536; - const DEFAULT_QUEUE_SIZE: usize = 65_536; - - /// Assign the SDK trace configuration. 
- #[allow(dead_code)] - pub(crate) fn with_trace_config(mut self, config: sdk::trace::Config) -> Self { - self.trace_config = Some(config); - self - } - - /// Assign graph identification configuration - pub(crate) fn with_graph_config(mut self, config: &Option) -> Self { - self.graph_config = config.clone(); - self +#[buildstructor::buildstructor] +impl Exporter { + #[builder] + pub(crate) fn new( + trace_config: config::Trace, + endpoint: Url, + apollo_key: String, + apollo_graph_ref: String, + schema_id: String, + buffer_size: usize, + field_execution_sampler: Option, + ) -> Result { + let apollo_exporter = + ApolloExporter::new(&endpoint, &apollo_key, &apollo_graph_ref, &schema_id)?; + Ok(Self { + spans_by_parent_id: LruCache::new(buffer_size), + trace_config, + endpoint, + schema_id, + apollo_sender: apollo_exporter.provider(), + field_execution_weight: match field_execution_sampler { + Some(SamplerOption::Always(Sampler::AlwaysOn)) => 1.0, + Some(SamplerOption::Always(Sampler::AlwaysOff)) => 0.0, + Some(SamplerOption::TraceIdRatioBased(ratio)) => 1.0 / ratio, + None => 0.0, + }, + }) } - /// Assign spaceport reporting configuration - pub(crate) fn with_spaceport_config(mut self, config: &Option) -> Self { - self.spaceport_config = config.clone(); - self - } + fn extract_root_trace( + &mut self, + span: &SpanData, + child_nodes: Vec, + ) -> Result, Error> { + let variables = span + .attributes + .get(&APOLLO_PRIVATE_GRAPHQL_VARIABLES) + .map(|data| data.as_str()) + .unwrap_or_default(); + let variables_json = if variables != "{}" { + serde_json::from_str(&variables).unwrap_or_default() + } else { + HashMap::new() + }; - /// Install the apollo telemetry exporter pipeline with the recommended defaults. - pub(crate) fn install_batch(mut self) -> Result { - let exporter = self.build_exporter()?; + let details = Details { + variables_json, + ..Default::default() + }; - // Users can override the default batch and queue sizes, but they can't - // set them to be lower than our specified defaults; - let queue_size = match std::env::var("OTEL_BSP_MAX_QUEUE_SIZE") - .ok() - .and_then(|queue_size| usize::from_str(&queue_size).ok()) - { - Some(v) => { - let result = usize::max(PipelineBuilder::DEFAULT_QUEUE_SIZE, v); - if result > v { - tracing::warn!( - "Ignoring 'OTEL_BSP_MAX_QUEUE_SIZE' setting. Cannot set max queue size lower than {}", - PipelineBuilder::DEFAULT_QUEUE_SIZE - ); - } - result - } - None => PipelineBuilder::DEFAULT_QUEUE_SIZE, + let http = self.extract_http_data(span); + + let mut root_trace = apollo_spaceport::Trace { + start_time: Some(span.start_time.into()), + end_time: Some(span.end_time.into()), + duration_ns: span + .attributes + .get(&APOLLO_PRIVATE_DURATION_NS) + .and_then(Self::extract_i64) + .map(|e| e as u64) + .unwrap_or_default(), + root: None, + details: Some(details), + http: Some(http), + ..Default::default() }; - let batch_size = match std::env::var("OTEL_BSP_MAX_EXPORT_BATCH_SIZE") - .ok() - .and_then(|batch_size| usize::from_str(&batch_size).ok()) - { - Some(v) => { - let result = usize::max(PipelineBuilder::DEFAULT_BATCH_SIZE, v); - if result > v { - tracing::warn!( - "Ignoring 'OTEL_BSP_MAX_EXPORT_BATCH_SIZE' setting. 
Cannot set max export batch size lower than {}", - PipelineBuilder::DEFAULT_BATCH_SIZE - ); + + for node in child_nodes { + match node { + TreeData::QueryPlan(query_plan) => { + root_trace.query_plan = Some(Box::new(query_plan)) } - // Batch size must be <= queue size - if result > queue_size { - tracing::warn!( - "Clamping 'OTEL_BSP_MAX_EXPORT_BATCH_SIZE' setting to {}. Cannot set max export batch size greater than max queue size", - queue_size - ); - queue_size - } else { - result + TreeData::Supergraph { + http, + client_name, + client_version, + operation_signature, + } => { + root_trace + .http + .as_mut() + .expect("http was extracted earlier, qed") + .request_headers = http.request_headers; + root_trace.client_name = client_name.unwrap_or_default(); + root_trace.client_version = client_version.unwrap_or_default(); + root_trace.field_execution_weight = self.field_execution_weight; + // This will be moved out later + root_trace.signature = operation_signature; } + _ => panic!("should never have had other node types"), } - None => PipelineBuilder::DEFAULT_BATCH_SIZE, - }; - let batch = sdk::trace::BatchSpanProcessor::builder(exporter, Tokio) - .with_max_queue_size(queue_size) - .with_max_export_batch_size(batch_size) - .build(); - - let mut provider_builder = sdk::trace::TracerProvider::builder().with_span_processor(batch); - if let Some(config) = self.trace_config.take() { - provider_builder = provider_builder.with_config(config); } - let provider = provider_builder.build(); - - let tracer = provider.versioned_tracer( - "apollo-opentelemetry", - Some(env!("CARGO_PKG_VERSION")), - None, - ); - // This code will hang unless we execute from a separate - // thread. See: - // https://github.com/apollographql/router/issues/331 - // https://github.com/open-telemetry/opentelemetry-rust/issues/536 - // for more details and description. - let jh = tokio::task::spawn_blocking(|| { - opentelemetry::global::force_flush_tracer_provider(); - opentelemetry::global::set_tracer_provider(provider); - }); - futures::executor::block_on(jh)?; - Ok(tracer) + Ok(Box::new(root_trace)) } - // XXX CANNOT USE SIMPLE WITH OUR IMPLEMENTATION AS NO RUNTIME EXISTS - // WHEN TRYING TO EXPORT... - /// Install the apollo telemetry exporter pipeline with the recommended defaults. - #[allow(dead_code)] - pub(crate) fn install_simple(mut self) -> Result { - let exporter = self.build_exporter()?; - - let mut provider_builder = - sdk::trace::TracerProvider::builder().with_simple_exporter(exporter); - if let Some(config) = self.trace_config.take() { - provider_builder = provider_builder.with_config(config); - } - let provider = provider_builder.build(); - - let tracer = provider.versioned_tracer( - "apollo-opentelemetry", - Some(env!("CARGO_PKG_VERSION")), - None, - ); - let _prev_global_provider = global::set_tracer_provider(provider); - - Ok(tracer) + fn extract_trace(&mut self, span: SpanData) -> Result, Error> { + self.extract_data_from_spans(&span, &span)? + .pop() + .and_then(|node| { + if let TreeData::Request(trace) = node { + Some(trace) + } else { + None + } + }) + .expect("root trace must exist because it is constructed on the request span, qed") } - /// Create a client to talk to our spaceport and return an exporter. 
- pub(crate) fn build_exporter(&self) -> Result { - let collector = match self.spaceport_config.clone() { - Some(cfg) => cfg.collector, - None => DEFAULT_SERVER_URL.to_string(), - }; - let graph = self.graph_config.clone(); - - tracing::debug!("collector: {}", collector); - tracing::debug!("graph: {:?}", graph); + fn extract_data_from_spans( + &mut self, + root_span: &SpanData, + span: &SpanData, + ) -> Result, Error> { + let (mut child_nodes, errors) = self + .spans_by_parent_id + .pop_entry(&span.span_context.span_id()) + .map(|(_, spans)| spans) + .unwrap_or_default() + .into_iter() + .map(|span| self.extract_data_from_spans(root_span, &span)) + .fold((Vec::new(), Vec::new()), |(mut oks, mut errors), next| { + match next { + Ok(mut children) => oks.append(&mut children), + Err(err) => errors.push(err), + } + (oks, errors) + }); + if !errors.is_empty() { + return Err(Error::MultipleErrors(errors)); + } + if let Some(Value::String(reason)) = span.attributes.get(&FTV1_DO_NOT_SAMPLE_REASON) { + if !reason.is_empty() { + return Err(Error::DoNotSample(reason.clone())); + } + } - Ok(Exporter::new(collector, graph)) + Ok(match span.name.as_ref() { + PARALLEL_SPAN_NAME => vec![TreeData::QueryPlan(QueryPlanNode { + node: Some(apollo_spaceport::trace::query_plan_node::Node::Parallel( + ParallelNode { + nodes: child_nodes + .into_iter() + .filter_map(|child| match child { + TreeData::QueryPlan(node) => Some(node), + _ => None, + }) + .collect(), + }, + )), + })], + SEQUENCE_SPAN_NAME => vec![TreeData::QueryPlan(QueryPlanNode { + node: Some(apollo_spaceport::trace::query_plan_node::Node::Sequence( + SequenceNode { + nodes: child_nodes + .into_iter() + .filter_map(|child| match child { + TreeData::QueryPlan(node) => Some(node), + _ => None, + }) + .collect(), + }, + )), + })], + FETCH_SPAN_NAME => { + let (trace_parsing_failed, trace) = match child_nodes.pop() { + Some(TreeData::Trace(Ok(trace))) => (false, trace), + Some(TreeData::Trace(Err(_err))) => (true, None), + _ => (false, None), + }; + let service_name = (span + .attributes + .get(&SERVICE_NAME) + .cloned() + .unwrap_or_else(|| Value::String("unknown service".into())) + .as_str()) + .to_string(); + vec![TreeData::QueryPlan(QueryPlanNode { + node: Some(apollo_spaceport::trace::query_plan_node::Node::Fetch( + Box::new(FetchNode { + service_name, + trace_parsing_failed, + trace, + sent_time_offset: span + .attributes + .get(&APOLLO_PRIVATE_SENT_TIME_OFFSET) + .and_then(Self::extract_i64) + .map(|f| f as u64) + .unwrap_or_default(), + sent_time: Some(span.start_time.into()), + received_time: Some(span.end_time.into()), + }), + )), + })] + } + FLATTEN_SPAN_NAME => { + vec![TreeData::QueryPlan(QueryPlanNode { + node: Some(apollo_spaceport::trace::query_plan_node::Node::Flatten( + Box::new(FlattenNode { + response_path: span + .attributes + .get(&APOLLO_PRIVATE_PATH) + .and_then(Self::extract_string) + .map(|v| { + v.split('/').filter(|v|!v.is_empty() && *v != "@").map(|v| { + if let Ok(index) = v.parse::() { + ResponsePathElement { id: Some(apollo_spaceport::trace::query_plan_node::response_path_element::Id::Index(index))} + } else { + ResponsePathElement { id: Some(apollo_spaceport::trace::query_plan_node::response_path_element::Id::FieldName(v.to_string())) } + } + }).collect() + }).unwrap_or_default(), + node: child_nodes + .into_iter() + .filter_map(|child| match child { + TreeData::QueryPlan(node) => Some(Box::new(node)), + _ => None, + }) + .next(), + }), + )), + })] + } + SUBGRAPH_SPAN_NAME => { + 
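                // The subgraph span carries any ftv1 payload in its `apollo_private.ftv1`
                // attribute (a base64-encoded protobuf); decode it here so the enclosing
                // fetch node can attach it to the query plan.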
vec![TreeData::Trace(self.find_ftv1_trace(span))] + } + SUPERGRAPH_SPAN_NAME => { + //Currently some data is in the supergraph span as we don't have the a request hook in plugin. + child_nodes.push(TreeData::Supergraph { + http: self.extract_http_data(span), + client_name: span + .attributes + .get(&CLIENT_NAME) + .and_then(Self::extract_string), + client_version: span + .attributes + .get(&CLIENT_VERSION) + .and_then(Self::extract_string), + operation_signature: span + .attributes + .get(&APOLLO_PRIVATE_OPERATION_SIGNATURE) + .and_then(Self::extract_string) + .unwrap_or_default(), + }); + child_nodes + } + REQUEST_SPAN_NAME => { + vec![TreeData::Request( + self.extract_root_trace(span, child_nodes), + )] + } + _ => child_nodes, + }) } -} - -/// A [`SpanExporter`] that writes to [`Reporter`]. -/// -/// [`SpanExporter`]: super::SpanExporter -/// [`Reporter`]: apollo_spaceport::Reporter -#[derive(Debug)] -#[allow(dead_code)] -pub(crate) struct Exporter { - collector: String, - graph: Option, - reporter: tokio::sync::OnceCell, - normalized_queries: HashMap, -} -impl Exporter { - /// Create a new apollo telemetry `Exporter`. - pub(crate) fn new(collector: String, graph: Option) -> Self { - Self { - collector, - graph, - reporter: tokio::sync::OnceCell::new(), - normalized_queries: HashMap::new(), + fn extract_string(v: &Value) -> Option { + if let Value::String(v) = v { + Some(v.to_string()) + } else { + None } } -} - -/// Apollo Telemetry exporter's error -#[derive(thiserror::Error, Debug)] -#[error(transparent)] -pub(crate) struct ApolloError(#[from] apollo_spaceport::ReporterError); -impl From for ApolloError { - fn from(error: std::io::Error) -> Self { - ApolloError(error.into()) + fn extract_i64(v: &Value) -> Option { + if let Value::I64(v) = v { + Some(*v) + } else { + None + } } -} -impl From for ApolloError { - fn from(error: JoinError) -> Self { - ApolloError(error.into()) + fn find_ftv1_trace( + &mut self, + span: &SpanData, + ) -> Result>, Error> { + span.attributes + .get(&APOLLO_PRIVATE_FTV1) + .map(|data| { + if let Value::String(data) = data { + Ok(Box::new(apollo_spaceport::Trace::decode(Cursor::new( + base64::decode(data.to_string())?, + ))?)) + } else { + Err(Error::Ftv1SpanAttribute) + } + }) + .transpose() } -} -impl ExportError for ApolloError { - fn exporter_name(&self) -> &'static str { - "apollo-telemetry" + fn extract_http_data(&self, span: &SpanData) -> Http { + let method = match span + .attributes + .get(&HTTP_METHOD) + .map(|data| data.as_str()) + .unwrap_or_default() + .as_ref() + { + "OPTIONS" => apollo_spaceport::trace::http::Method::Options, + "GET" => apollo_spaceport::trace::http::Method::Get, + "HEAD" => apollo_spaceport::trace::http::Method::Head, + "POST" => apollo_spaceport::trace::http::Method::Post, + "PUT" => apollo_spaceport::trace::http::Method::Put, + "DELETE" => apollo_spaceport::trace::http::Method::Delete, + "TRACE" => apollo_spaceport::trace::http::Method::Trace, + "CONNECT" => apollo_spaceport::trace::http::Method::Connect, + "PATCH" => apollo_spaceport::trace::http::Method::Patch, + _ => apollo_spaceport::trace::http::Method::Unknown, + }; + let headers = span + .attributes + .get(&APOLLO_PRIVATE_HTTP_REQUEST_HEADERS) + .map(|data| data.as_str()) + .unwrap_or_default(); + let request_headers = serde_json::from_str::>>(&headers) + .unwrap_or_default() + .into_iter() + .map(|(header_name, value)| (header_name.to_lowercase(), Values { value })) + .collect(); + + Http { + method: method.into(), + request_headers, + response_headers: 
Default::default(), + status_code: 0, + } } } #[async_trait] impl SpanExporter for Exporter { /// Export spans to apollo telemetry - async fn export(&mut self, _batch: Vec) -> ExportResult { - todo!("Apollo tracing is not yet implemented"); - //return ExportResult::Ok(()); + async fn export(&mut self, batch: Vec) -> ExportResult { + // Exporting to apollo means that we must have complete trace as the entire trace must be built. + // We do what we can, and if there are any traces that are not complete then we keep them for the next export event. + // We may get spans that simply don't complete. These need to be cleaned up after a period. It's the price of using ftv1. + + // Note that apollo-tracing won't really work with defer/stream/live queries. In this situation it's difficult to know when a request has actually finished. + let mut traces: Vec<(String, apollo_spaceport::Trace)> = Vec::new(); + for span in batch { + if span.name == REQUEST_SPAN_NAME { + match self.extract_trace(span) { + Ok(mut trace) => { + let mut operation_signature = Default::default(); + std::mem::swap(&mut trace.signature, &mut operation_signature); + if !operation_signature.is_empty() { + traces.push((operation_signature, *trace)); + } + } + Err(Error::MultipleErrors(errors)) => { + if let Some(Error::DoNotSample(reason)) = errors.first() { + tracing::debug!( + "sampling is disabled on this trace: {}, skipping", + reason + ); + } else { + tracing::error!( + "failed to construct trace: {}, skipping", + Error::MultipleErrors(errors) + ); + } + } + Err(error) => { + tracing::error!("failed to construct trace: {}, skipping", error); + } + } + } else { + // Not a root span, we may need it later so stash it. + + // This is sad, but with LRU there is no `get_insert_mut` so a double lookup is required + // It is safe to expect the entry to exist as we just inserted it, however capacity of the LRU must not be 0. + self.spans_by_parent_id + .get_or_insert(span.parent_span_id, Vec::new); + self.spans_by_parent_id + .get_mut(&span.parent_span_id) + .expect("capacity of cache was zero") + .push(span); + } + } + self.apollo_sender + .send(SingleReport::Traces(TracesReport { traces })); + + return ExportResult::Ok(()); } } diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog.rs b/apollo-router/src/plugins/telemetry/tracing/datadog.rs index 1dacfe529a..285d3dce68 100644 --- a/apollo-router/src/plugins/telemetry/tracing/datadog.rs +++ b/apollo-router/src/plugins/telemetry/tracing/datadog.rs @@ -1,4 +1,5 @@ //! Configuration for datadog tracing. 
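//!
//! A minimal configuration sketch; the `telemetry.tracing.datadog` path and the `endpoint`
//! value are assumed from this module's `Config` rather than verified here:
//!
//! ```yaml
//! telemetry:
//!   tracing:
//!     datadog:
//!       endpoint: default
//! ```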
+use opentelemetry::sdk::trace::BatchSpanProcessor; use opentelemetry::sdk::trace::Builder; use schemars::JsonSchema; use serde::Deserialize; @@ -9,6 +10,7 @@ use super::deser_endpoint; use super::AgentEndpoint; use crate::plugins::telemetry::config::GenericWith; use crate::plugins::telemetry::config::Trace; +use crate::plugins::telemetry::tracing::SpanProcessorExt; use crate::plugins::telemetry::tracing::TracingConfigurator; #[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)] @@ -36,7 +38,11 @@ impl TracingConfigurator for Config { .with(&trace_config.service_name, |b, n| b.with_service_name(n)) .with_trace_config(trace_config.into()) .build_exporter()?; - Ok(builder.with_batch_exporter(exporter, opentelemetry::runtime::Tokio)) + Ok(builder.with_span_processor( + BatchSpanProcessor::builder(exporter, opentelemetry::runtime::Tokio) + .build() + .filtered(), + )) } } diff --git a/apollo-router/src/plugins/telemetry/tracing/jaeger.rs b/apollo-router/src/plugins/telemetry/tracing/jaeger.rs index 3e9411c92f..b0246e056f 100644 --- a/apollo-router/src/plugins/telemetry/tracing/jaeger.rs +++ b/apollo-router/src/plugins/telemetry/tracing/jaeger.rs @@ -16,6 +16,7 @@ use super::deser_endpoint; use super::AgentEndpoint; use crate::plugins::telemetry::config::GenericWith; use crate::plugins::telemetry::config::Trace; +use crate::plugins::telemetry::tracing::SpanProcessorExt; use crate::plugins::telemetry::tracing::TracingConfigurator; #[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)] @@ -110,7 +111,8 @@ impl TracingConfigurator for Config { Ok(builder.with_span_processor( BatchSpanProcessor::builder(exporter, opentelemetry::runtime::Tokio) .with(&self.scheduled_delay, |b, d| b.with_scheduled_delay(*d)) - .build(), + .build() + .filtered(), )) } } diff --git a/apollo-router/src/plugins/telemetry/tracing/mod.rs b/apollo-router/src/plugins/telemetry/tracing/mod.rs index 8fd5a1b300..04e47f7654 100644 --- a/apollo-router/src/plugins/telemetry/tracing/mod.rs +++ b/apollo-router/src/plugins/telemetry/tracing/mod.rs @@ -1,4 +1,11 @@ +use opentelemetry::sdk::export::trace::SpanData; use opentelemetry::sdk::trace::Builder; +use opentelemetry::sdk::trace::EvictedHashMap; +use opentelemetry::sdk::trace::Span; +use opentelemetry::sdk::trace::SpanProcessor; +use opentelemetry::trace::TraceResult; +use opentelemetry::Context; +use opentelemetry::KeyValue; use reqwest::Url; use schemars::JsonSchema; use serde::Deserialize; @@ -70,3 +77,68 @@ where let url = parse_url_for_endpoint(s).map_err(serde::de::Error::custom)?; Ok(AgentEndpoint::Url(url)) } + +#[derive(Debug)] +struct ApolloFilterSpanProcessor { + delegate: T, +} + +pub(crate) static APOLLO_PRIVATE_PREFIX: &str = "apollo_private."; + +impl SpanProcessor for ApolloFilterSpanProcessor { + fn on_start(&self, span: &mut Span, cx: &Context) { + self.delegate.on_start(span, cx); + } + + fn on_end(&self, span: SpanData) { + if span + .attributes + .iter() + .any(|(key, _)| key.as_str().starts_with(APOLLO_PRIVATE_PREFIX)) + { + let attributes_len = span.attributes.len(); + let span = SpanData { + attributes: span + .attributes + .into_iter() + .filter(|(k, _)| !k.as_str().starts_with(APOLLO_PRIVATE_PREFIX)) + .fold( + EvictedHashMap::new(attributes_len as u32, attributes_len), + |mut m, (k, v)| { + m.insert(KeyValue::new(k, v)); + m + }, + ), + ..span + }; + + self.delegate.on_end(span); + } else { + self.delegate.on_end(span); + } + } + + fn force_flush(&self) -> TraceResult<()> { + self.delegate.force_flush() + } + + fn shutdown(&mut self) -> 
TraceResult<()> { + self.delegate.shutdown() + } +} + +trait SpanProcessorExt +where + Self: Sized + SpanProcessor, +{ + fn filtered(self) -> ApolloFilterSpanProcessor<Self>; +} + +impl<T: SpanProcessor> SpanProcessorExt for T +where + Self: Sized, +{ + fn filtered(self) -> ApolloFilterSpanProcessor<Self> { + ApolloFilterSpanProcessor { delegate: self } + } +} diff --git a/apollo-router/src/plugins/telemetry/tracing/otlp.rs b/apollo-router/src/plugins/telemetry/tracing/otlp.rs index 5f1d98e7f5..656d8f3970 100644 --- a/apollo-router/src/plugins/telemetry/tracing/otlp.rs +++ b/apollo-router/src/plugins/telemetry/tracing/otlp.rs @@ -1,20 +1,26 @@ //! Configuration for Otlp tracing. use std::result::Result; +use opentelemetry::sdk::trace::BatchSpanProcessor; use opentelemetry::sdk::trace::Builder; use opentelemetry_otlp::SpanExporterBuilder; use tower::BoxError; use crate::plugins::telemetry::config::Trace; +use crate::plugins::telemetry::tracing::SpanProcessorExt; use crate::plugins::telemetry::tracing::TracingConfigurator; impl TracingConfigurator for super::super::otlp::Config { fn apply(&self, builder: Builder, _trace_config: &Trace) -> Result<Builder, BoxError> { tracing::debug!("configuring Otlp tracing"); let exporter: SpanExporterBuilder = self.exporter()?; - Ok(builder.with_batch_exporter( - exporter.build_span_exporter()?, - opentelemetry::runtime::Tokio, + Ok(builder.with_span_processor( + BatchSpanProcessor::builder( + exporter.build_span_exporter()?, + opentelemetry::runtime::Tokio, + ) + .build() + .filtered(), )) } } diff --git a/apollo-router/src/plugins/telemetry/tracing/zipkin.rs b/apollo-router/src/plugins/telemetry/tracing/zipkin.rs index 880580943d..1dd6efab7e 100644 --- a/apollo-router/src/plugins/telemetry/tracing/zipkin.rs +++ b/apollo-router/src/plugins/telemetry/tracing/zipkin.rs @@ -1,4 +1,5 @@ //! Configuration for zipkin tracing. +use opentelemetry::sdk::trace::BatchSpanProcessor; use opentelemetry::sdk::trace::Builder; use schemars::JsonSchema; use serde::Deserialize; @@ -11,6 +12,7 @@ use super::AgentDefault; use super::AgentEndpoint; use crate::plugins::telemetry::config::GenericWith; use crate::plugins::telemetry::config::Trace; +use crate::plugins::telemetry::tracing::SpanProcessorExt; use crate::plugins::telemetry::tracing::TracingConfigurator; #[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)] @@ -61,6 +63,10 @@ impl TracingConfigurator for Config { }) .init_exporter()?; - Ok(builder.with_batch_exporter(exporter, opentelemetry::runtime::Tokio)) + Ok(builder.with_span_processor( + BatchSpanProcessor::builder(exporter, opentelemetry::runtime::Tokio) + .build() + .filtered(), + )) } } diff --git a/apollo-router/src/query_planner/mod.rs b/apollo-router/src/query_planner/mod.rs index 9b2da1f807..ca1d97417a 100644 --- a/apollo-router/src/query_planner/mod.rs +++ b/apollo-router/src/query_planner/mod.rs @@ -32,6 +32,11 @@ mod bridge_query_planner; mod caching_query_planner; mod selection; +pub(crate) const FETCH_SPAN_NAME: &str = "fetch"; +pub(crate) const FLATTEN_SPAN_NAME: &str = "flatten"; +pub(crate) const SEQUENCE_SPAN_NAME: &str = "sequence"; +pub(crate) const PARALLEL_SPAN_NAME: &str = "parallel"; + /// Query planning options.
#[derive(Clone, Eq, Hash, PartialEq, Debug, Default)] pub(crate) struct QueryPlanOptions { @@ -142,6 +147,17 @@ impl PlanNode { } } + pub(crate) fn contains_condition_or_defer(&self) -> bool { + match self { + Self::Sequence { nodes } => nodes.iter().any(|n| n.contains_condition_or_defer()), + Self::Parallel { nodes } => nodes.iter().any(|n| n.contains_condition_or_defer()), + Self::Flatten(node) => node.node.contains_condition_or_defer(), + Self::Fetch(..) => false, + Self::Defer { .. } => true, + Self::Condition { .. } => true, + } + } + pub(crate) fn contains_defer(&self) -> bool { match self { Self::Sequence { nodes } => nodes.iter().any(|n| n.contains_defer()), @@ -338,7 +354,7 @@ impl PlanNode { PlanNode::Sequence { nodes } => { value = parent_value.clone(); errors = Vec::new(); - let span = tracing::info_span!("sequence"); + let span = tracing::info_span!(SEQUENCE_SPAN_NAME); for node in nodes { let (v, subselect, err) = node .execute_recursively(parameters, current_dir, &value, sender.clone()) @@ -354,7 +370,7 @@ impl PlanNode { value = Value::default(); errors = Vec::new(); - let span = tracing::info_span!("parallel"); + let span = tracing::info_span!(PARALLEL_SPAN_NAME); let mut stream: stream::FuturesUnordered<_> = nodes .iter() .map(|plan| { @@ -379,15 +395,19 @@ impl PlanNode { } } PlanNode::Flatten(FlattenNode { path, node }) => { + // Note that the span must be `info` as we need to pick this up in apollo tracing + let current_dir = current_dir.join(path); let (v, subselect, err) = node .execute_recursively( parameters, // this is the only command that actually changes the "current dir" - &current_dir.join(path), + &current_dir, parent_value, sender, ) - .instrument(tracing::trace_span!("flatten")) + .instrument( + tracing::info_span!(FLATTEN_SPAN_NAME, apollo_private.path = %current_dir), + ) .await; value = v; @@ -395,11 +415,15 @@ impl PlanNode { subselection = subselect; } PlanNode::Fetch(fetch_node) => { + let fetch_time_offset = + parameters.context.created_at.elapsed().as_nanos() as i64; match fetch_node .fetch_node(parameters, parent_value, current_dir) .instrument(tracing::info_span!( - "fetch", + FETCH_SPAN_NAME, "otel.kind" = %SpanKind::Internal, + "service.name" = fetch_node.service_name.as_str(), + "apollo_private.sent_time_offset" = fetch_time_offset )) .await { diff --git a/apollo-router/src/spec/query.rs b/apollo-router/src/spec/query.rs index 58f2f5195c..511e983098 100644 --- a/apollo-router/src/spec/query.rs +++ b/apollo-router/src/spec/query.rs @@ -133,7 +133,7 @@ impl Query { failfast_debug!("can't find operation for {:?}", operation_name); } } else { - failfast_debug!("invalid type for data in response."); + failfast_debug!("invalid type for data in response.
data: {:#?}", data); + } response.data = Some(Value::default()); diff --git a/apollo-router/tests/integration_tests.rs b/apollo-router/tests/integration_tests.rs index 1d6f706198..b62284c273 100644 --- a/apollo-router/tests/integration_tests.rs +++ b/apollo-router/tests/integration_tests.rs @@ -16,6 +16,8 @@ use apollo_router::services::supergraph; use http::header::ACCEPT; use http::Method; use http::StatusCode; +use insta::internals::Content; +use insta::internals::Redaction; use maplit::hashmap; use serde_json::to_string_pretty; use serde_json_bytes::json; @@ -116,7 +118,9 @@ async fn traced_basic_request() { "products".to_string()=>1, }, ); - insta::assert_json_snapshot!(get_spans()); + insta::assert_json_snapshot!(get_spans(), { + ".**.children.*.record.entries[]" => redact_dynamic() + }); } #[test_span(tokio::test)] @@ -130,7 +134,9 @@ async fn traced_basic_composition() { "accounts".to_string()=>1, }, ); - insta::assert_json_snapshot!(get_spans()); + insta::assert_json_snapshot!(get_spans(), { + ".**.children.*.record.entries[]" => redact_dynamic() + }); } #[tokio::test(flavor = "multi_thread")] @@ -860,3 +866,23 @@ impl ValueExt for Value { } } } + +// Useful to redact request_id in snapshots because it's not deterministic +fn redact_dynamic() -> Redaction { + insta::dynamic_redaction(|value, _path| { + if let Some(value_slice) = value.as_slice() { + if value_slice.get(0).and_then(|v| v.as_str()) == Some("request.id") { + return Content::Seq(vec![ + value_slice.get(0).unwrap().clone(), + Content::String("[REDACTED]".to_string()), + ]); + } + if value_slice.get(0).and_then(|v| v.as_str()) + == Some("apollo_private.sent_time_offset") + { + return Content::Seq(vec![value_slice.get(0).unwrap().clone(), Content::I64(0)]); + } + } + value + }) +} diff --git a/apollo-router/tests/jaeger_test.rs b/apollo-router/tests/jaeger_test.rs index 911eb49d05..d3c0a27bc5 100644 --- a/apollo-router/tests/jaeger_test.rs +++ b/apollo-router/tests/jaeger_test.rs @@ -114,13 +114,13 @@ fn verify_router_span_fields(trace: &Value) -> Result<(), BoxError> { ); assert_eq!( router_span - .select_path("$.tags[?(@.key == 'client_name')].value")? + .select_path("$.tags[?(@.key == 'client.name')].value")? .get(0), Some(&&Value::String("custom_name".to_string())) ); assert_eq!( router_span - .select_path("$.tags[?(@.key == 'client_version')].value")? + .select_path("$.tags[?(@.key == 'client.version')].value")?
.get(0), Some(&&Value::String("1.0".to_string())) ); diff --git a/apollo-router/tests/snapshots/integration_tests__traced_basic_composition.snap b/apollo-router/tests/snapshots/integration_tests__traced_basic_composition.snap index 29b7382253..0d4ca68504 100644 --- a/apollo-router/tests/snapshots/integration_tests__traced_basic_composition.snap +++ b/apollo-router/tests/snapshots/integration_tests__traced_basic_composition.snap @@ -30,16 +30,32 @@ expression: get_spans() "" ], [ - "client_name", + "client.name", "" ], [ - "client_version", + "client.version", "" ], [ "otel.kind", "internal" + ], + [ + "apollo_private.field_level_instrumentation_ratio", + 0.0 + ], + [ + "apollo_private.graphql.variables", + "{\"reviewsForAuthorAuthorId\":[\"\"],\"topProductsFirst\":[\"\"]}" + ], + [ + "apollo_private.http.request_headers", + "{\"content-type\":[\"\"]}" + ], + [ + "apollo_private.operation_signature", + "# -\n{topProducts{name reviews{author{id name}id product{name}}upc}}" ] ], "metadata": { @@ -51,9 +67,13 @@ expression: get_spans() "names": [ "graphql.document", "graphql.operation.name", - "client_name", - "client_version", - "otel.kind" + "client.name", + "client.version", + "otel.kind", + "apollo_private.field_level_instrumentation_ratio", + "apollo_private.operation_signature", + "apollo_private.graphql.variables", + "apollo_private.http.request_headers" ] } } @@ -131,6 +151,10 @@ expression: get_spans() [ "otel.kind", "internal" + ], + [ + "ftv1.do_not_sample_reason", + "" ] ], "metadata": { @@ -142,7 +166,8 @@ expression: get_spans() "names": [ "graphql.document", "graphql.operation.name", - "otel.kind" + "otel.kind", + "ftv1.do_not_sample_reason" ] } } @@ -170,6 +195,14 @@ expression: get_spans() [ "otel.kind", "internal" + ], + [ + "service.name", + "products" + ], + [ + "apollo_private.sent_time_offset", + 0 ] ], "metadata": { @@ -179,7 +212,9 @@ expression: get_spans() "module_path": "apollo_router::query_planner", "fields": { "names": [ - "otel.kind" + "otel.kind", + "service.name", + "apollo_private.sent_time_offset" ] } } @@ -232,7 +267,8 @@ expression: get_spans() "name", "graphql.document", "graphql.operation.name", - "otel.kind" + "otel.kind", + "apollo_private.ftv1" ] } } @@ -349,206 +385,27 @@ expression: get_spans() } } }, - "apollo_router::query_planner::fetch": { - "name": "apollo_router::query_planner::fetch", + "apollo_router::query_planner::flatten": { + "name": "apollo_router::query_planner::flatten", "record": { "entries": [ [ - "otel.kind", - "internal" + "apollo_private.path", + "/topProducts/@" ] ], "metadata": { - "name": "fetch", + "name": "flatten", "target": "apollo_router::query_planner", "level": "INFO", "module_path": "apollo_router::query_planner", "fields": { "names": [ - "otel.kind" + "apollo_private.path" ] } } }, - "children": { - "apollo_router::query_planner::fetch::make_variables": { - "name": "apollo_router::query_planner::fetch::make_variables", - "record": { - "entries": [], - "metadata": { - "name": "make_variables", - "target": "apollo_router::query_planner::fetch", - "level": "DEBUG", - "module_path": "apollo_router::query_planner::fetch", - "fields": { - "names": [] - } - } - }, - "children": {} - }, - "apollo_router::plugins::telemetry::subgraph": { - "name": "apollo_router::plugins::telemetry::subgraph", - "record": { - "entries": [ - [ - "name", - "reviews" - ], - [ - "graphql.document", - "query($representations:[_Any!]!){_entities(representations:$representations){...on Product{reviews{id product{__typename upc}author{__typename 
id}}}}}" - ], - [ - "graphql.operation.name", - "" - ], - [ - "otel.kind", - "internal" - ] - ], - "metadata": { - "name": "subgraph", - "target": "apollo_router::plugins::telemetry", - "level": "INFO", - "module_path": "apollo_router::plugins::telemetry", - "fields": { - "names": [ - "name", - "graphql.document", - "graphql.operation.name", - "otel.kind" - ] - } - } - }, - "children": { - "apollo_router::services::subgraph_service::body_compression": { - "name": "apollo_router::services::subgraph_service::body_compression", - "record": { - "entries": [], - "metadata": { - "name": "body_compression", - "target": "apollo_router::services::subgraph_service", - "level": "DEBUG", - "module_path": "apollo_router::services::subgraph_service", - "fields": { - "names": [] - } - } - }, - "children": {} - }, - "apollo_router::services::subgraph_service::subgraph_request": { - "name": "apollo_router::services::subgraph_service::subgraph_request", - "record": { - "entries": [ - [ - "otel.kind", - "client" - ], - [ - "net.peer.name", - "reviews.demo.starstuff.dev" - ], - [ - "net.peer.port", - "443" - ], - [ - "http.route", - "/" - ], - [ - "net.transport", - "ip_tcp" - ] - ], - "metadata": { - "name": "subgraph_request", - "target": "apollo_router::services::subgraph_service", - "level": "INFO", - "module_path": "apollo_router::services::subgraph_service", - "fields": { - "names": [ - "otel.kind", - "net.peer.name", - "net.peer.port", - "http.route", - "net.transport" - ] - } - } - }, - "children": {} - }, - "apollo_router::services::subgraph_service::aggregate_response_data": { - "name": "apollo_router::services::subgraph_service::aggregate_response_data", - "record": { - "entries": [], - "metadata": { - "name": "aggregate_response_data", - "target": "apollo_router::services::subgraph_service", - "level": "DEBUG", - "module_path": "apollo_router::services::subgraph_service", - "fields": { - "names": [] - } - } - }, - "children": {} - }, - "apollo_router::services::subgraph_service::parse_subgraph_response": { - "name": "apollo_router::services::subgraph_service::parse_subgraph_response", - "record": { - "entries": [], - "metadata": { - "name": "parse_subgraph_response", - "target": "apollo_router::services::subgraph_service", - "level": "DEBUG", - "module_path": "apollo_router::services::subgraph_service", - "fields": { - "names": [] - } - } - }, - "children": {} - } - } - }, - "apollo_router::query_planner::fetch::response_insert": { - "name": "apollo_router::query_planner::fetch::response_insert", - "record": { - "entries": [], - "metadata": { - "name": "response_insert", - "target": "apollo_router::query_planner::fetch", - "level": "DEBUG", - "module_path": "apollo_router::query_planner::fetch", - "fields": { - "names": [] - } - } - }, - "children": {} - } - } - }, - "apollo_router::query_planner::parallel": { - "name": "apollo_router::query_planner::parallel", - "record": { - "entries": [], - "metadata": { - "name": "parallel", - "target": "apollo_router::query_planner", - "level": "INFO", - "module_path": "apollo_router::query_planner", - "fields": { - "names": [] - } - } - }, "children": { "apollo_router::query_planner::fetch": { "name": "apollo_router::query_planner::fetch", @@ -557,6 +414,14 @@ expression: get_spans() [ "otel.kind", "internal" + ], + [ + "service.name", + "reviews" + ], + [ + "apollo_private.sent_time_offset", + 0 ] ], "metadata": { @@ -566,7 +431,9 @@ expression: get_spans() "module_path": "apollo_router::query_planner", "fields": { "names": [ - "otel.kind" + "otel.kind", + 
"service.name", + "apollo_private.sent_time_offset" ] } } @@ -594,11 +461,11 @@ expression: get_spans() "entries": [ [ "name", - "products" + "reviews" ], [ "graphql.document", - "query($representations:[_Any!]!){_entities(representations:$representations){...on Product{name}}}" + "query($representations:[_Any!]!){_entities(representations:$representations){...on Product{reviews{id product{__typename upc}author{__typename id}}}}}" ], [ "graphql.operation.name", @@ -619,7 +486,8 @@ expression: get_spans() "name", "graphql.document", "graphql.operation.name", - "otel.kind" + "otel.kind", + "apollo_private.ftv1" ] } } @@ -651,7 +519,7 @@ expression: get_spans() ], [ "net.peer.name", - "products.demo.starstuff.dev" + "reviews.demo.starstuff.dev" ], [ "net.peer.port", @@ -735,91 +603,87 @@ expression: get_spans() "children": {} } } - }, - "apollo_router::query_planner::fetch": { - "name": "apollo_router::query_planner::fetch", + } + } + }, + "apollo_router::query_planner::parallel": { + "name": "apollo_router::query_planner::parallel", + "record": { + "entries": [], + "metadata": { + "name": "parallel", + "target": "apollo_router::query_planner", + "level": "INFO", + "module_path": "apollo_router::query_planner", + "fields": { + "names": [] + } + } + }, + "children": { + "apollo_router::query_planner::flatten": { + "name": "apollo_router::query_planner::flatten", "record": { "entries": [ [ - "otel.kind", - "internal" + "apollo_private.path", + "/topProducts/@/reviews/@/product" ] ], "metadata": { - "name": "fetch", + "name": "flatten", "target": "apollo_router::query_planner", "level": "INFO", "module_path": "apollo_router::query_planner", "fields": { "names": [ - "otel.kind" + "apollo_private.path" ] } } }, "children": { - "apollo_router::query_planner::fetch::make_variables": { - "name": "apollo_router::query_planner::fetch::make_variables", - "record": { - "entries": [], - "metadata": { - "name": "make_variables", - "target": "apollo_router::query_planner::fetch", - "level": "DEBUG", - "module_path": "apollo_router::query_planner::fetch", - "fields": { - "names": [] - } - } - }, - "children": {} - }, - "apollo_router::plugins::telemetry::subgraph": { - "name": "apollo_router::plugins::telemetry::subgraph", + "apollo_router::query_planner::fetch": { + "name": "apollo_router::query_planner::fetch", "record": { "entries": [ [ - "name", - "accounts" - ], - [ - "graphql.document", - "query($representations:[_Any!]!){_entities(representations:$representations){...on User{name}}}" + "otel.kind", + "internal" ], [ - "graphql.operation.name", - "" + "service.name", + "products" ], [ - "otel.kind", - "internal" + "apollo_private.sent_time_offset", + 0 ] ], "metadata": { - "name": "subgraph", - "target": "apollo_router::plugins::telemetry", + "name": "fetch", + "target": "apollo_router::query_planner", "level": "INFO", - "module_path": "apollo_router::plugins::telemetry", + "module_path": "apollo_router::query_planner", "fields": { "names": [ - "name", - "graphql.document", - "graphql.operation.name", - "otel.kind" + "otel.kind", + "service.name", + "apollo_private.sent_time_offset" ] } } }, "children": { - "apollo_router::services::subgraph_service::body_compression": { - "name": "apollo_router::services::subgraph_service::body_compression", + "apollo_router::query_planner::fetch::make_variables": { + "name": "apollo_router::query_planner::fetch::make_variables", "record": { "entries": [], "metadata": { - "name": "body_compression", - "target": "apollo_router::services::subgraph_service", + "name": 
"make_variables", + "target": "apollo_router::query_planner::fetch", "level": "DEBUG", - "module_path": "apollo_router::services::subgraph_service", + "module_path": "apollo_router::query_planner::fetch", "fields": { "names": [] } @@ -827,58 +691,220 @@ expression: get_spans() }, "children": {} }, - "apollo_router::services::subgraph_service::subgraph_request": { - "name": "apollo_router::services::subgraph_service::subgraph_request", + "apollo_router::plugins::telemetry::subgraph": { + "name": "apollo_router::plugins::telemetry::subgraph", "record": { "entries": [ [ - "otel.kind", - "client" + "name", + "products" ], [ - "net.peer.name", - "accounts.demo.starstuff.dev" + "graphql.document", + "query($representations:[_Any!]!){_entities(representations:$representations){...on Product{name}}}" ], [ - "net.peer.port", - "443" - ], - [ - "http.route", - "/" + "graphql.operation.name", + "" ], [ - "net.transport", - "ip_tcp" + "otel.kind", + "internal" ] ], "metadata": { - "name": "subgraph_request", - "target": "apollo_router::services::subgraph_service", + "name": "subgraph", + "target": "apollo_router::plugins::telemetry", "level": "INFO", - "module_path": "apollo_router::services::subgraph_service", + "module_path": "apollo_router::plugins::telemetry", "fields": { "names": [ + "name", + "graphql.document", + "graphql.operation.name", "otel.kind", - "net.peer.name", - "net.peer.port", - "http.route", - "net.transport" + "apollo_private.ftv1" ] } } }, - "children": {} + "children": { + "apollo_router::services::subgraph_service::body_compression": { + "name": "apollo_router::services::subgraph_service::body_compression", + "record": { + "entries": [], + "metadata": { + "name": "body_compression", + "target": "apollo_router::services::subgraph_service", + "level": "DEBUG", + "module_path": "apollo_router::services::subgraph_service", + "fields": { + "names": [] + } + } + }, + "children": {} + }, + "apollo_router::services::subgraph_service::subgraph_request": { + "name": "apollo_router::services::subgraph_service::subgraph_request", + "record": { + "entries": [ + [ + "otel.kind", + "client" + ], + [ + "net.peer.name", + "products.demo.starstuff.dev" + ], + [ + "net.peer.port", + "443" + ], + [ + "http.route", + "/" + ], + [ + "net.transport", + "ip_tcp" + ] + ], + "metadata": { + "name": "subgraph_request", + "target": "apollo_router::services::subgraph_service", + "level": "INFO", + "module_path": "apollo_router::services::subgraph_service", + "fields": { + "names": [ + "otel.kind", + "net.peer.name", + "net.peer.port", + "http.route", + "net.transport" + ] + } + } + }, + "children": {} + }, + "apollo_router::services::subgraph_service::aggregate_response_data": { + "name": "apollo_router::services::subgraph_service::aggregate_response_data", + "record": { + "entries": [], + "metadata": { + "name": "aggregate_response_data", + "target": "apollo_router::services::subgraph_service", + "level": "DEBUG", + "module_path": "apollo_router::services::subgraph_service", + "fields": { + "names": [] + } + } + }, + "children": {} + }, + "apollo_router::services::subgraph_service::parse_subgraph_response": { + "name": "apollo_router::services::subgraph_service::parse_subgraph_response", + "record": { + "entries": [], + "metadata": { + "name": "parse_subgraph_response", + "target": "apollo_router::services::subgraph_service", + "level": "DEBUG", + "module_path": "apollo_router::services::subgraph_service", + "fields": { + "names": [] + } + } + }, + "children": {} + } + } }, - 
"apollo_router::services::subgraph_service::aggregate_response_data": { - "name": "apollo_router::services::subgraph_service::aggregate_response_data", + "apollo_router::query_planner::fetch::response_insert": { + "name": "apollo_router::query_planner::fetch::response_insert", "record": { "entries": [], "metadata": { - "name": "aggregate_response_data", - "target": "apollo_router::services::subgraph_service", + "name": "response_insert", + "target": "apollo_router::query_planner::fetch", "level": "DEBUG", - "module_path": "apollo_router::services::subgraph_service", + "module_path": "apollo_router::query_planner::fetch", + "fields": { + "names": [] + } + } + }, + "children": {} + } + } + } + } + }, + "apollo_router::query_planner::flatten": { + "name": "apollo_router::query_planner::flatten", + "record": { + "entries": [ + [ + "apollo_private.path", + "/topProducts/@/reviews/@/author" + ] + ], + "metadata": { + "name": "flatten", + "target": "apollo_router::query_planner", + "level": "INFO", + "module_path": "apollo_router::query_planner", + "fields": { + "names": [ + "apollo_private.path" + ] + } + } + }, + "children": { + "apollo_router::query_planner::fetch": { + "name": "apollo_router::query_planner::fetch", + "record": { + "entries": [ + [ + "otel.kind", + "internal" + ], + [ + "service.name", + "accounts" + ], + [ + "apollo_private.sent_time_offset", + 0 + ] + ], + "metadata": { + "name": "fetch", + "target": "apollo_router::query_planner", + "level": "INFO", + "module_path": "apollo_router::query_planner", + "fields": { + "names": [ + "otel.kind", + "service.name", + "apollo_private.sent_time_offset" + ] + } + } + }, + "children": { + "apollo_router::query_planner::fetch::make_variables": { + "name": "apollo_router::query_planner::fetch::make_variables", + "record": { + "entries": [], + "metadata": { + "name": "make_variables", + "target": "apollo_router::query_planner::fetch", + "level": "DEBUG", + "module_path": "apollo_router::query_planner::fetch", "fields": { "names": [] } @@ -886,15 +912,146 @@ expression: get_spans() }, "children": {} }, - "apollo_router::services::subgraph_service::parse_subgraph_response": { - "name": "apollo_router::services::subgraph_service::parse_subgraph_response", + "apollo_router::plugins::telemetry::subgraph": { + "name": "apollo_router::plugins::telemetry::subgraph", + "record": { + "entries": [ + [ + "name", + "accounts" + ], + [ + "graphql.document", + "query($representations:[_Any!]!){_entities(representations:$representations){...on User{name}}}" + ], + [ + "graphql.operation.name", + "" + ], + [ + "otel.kind", + "internal" + ] + ], + "metadata": { + "name": "subgraph", + "target": "apollo_router::plugins::telemetry", + "level": "INFO", + "module_path": "apollo_router::plugins::telemetry", + "fields": { + "names": [ + "name", + "graphql.document", + "graphql.operation.name", + "otel.kind", + "apollo_private.ftv1" + ] + } + } + }, + "children": { + "apollo_router::services::subgraph_service::body_compression": { + "name": "apollo_router::services::subgraph_service::body_compression", + "record": { + "entries": [], + "metadata": { + "name": "body_compression", + "target": "apollo_router::services::subgraph_service", + "level": "DEBUG", + "module_path": "apollo_router::services::subgraph_service", + "fields": { + "names": [] + } + } + }, + "children": {} + }, + "apollo_router::services::subgraph_service::subgraph_request": { + "name": "apollo_router::services::subgraph_service::subgraph_request", + "record": { + "entries": [ + [ + "otel.kind", + 
"client" + ], + [ + "net.peer.name", + "accounts.demo.starstuff.dev" + ], + [ + "net.peer.port", + "443" + ], + [ + "http.route", + "/" + ], + [ + "net.transport", + "ip_tcp" + ] + ], + "metadata": { + "name": "subgraph_request", + "target": "apollo_router::services::subgraph_service", + "level": "INFO", + "module_path": "apollo_router::services::subgraph_service", + "fields": { + "names": [ + "otel.kind", + "net.peer.name", + "net.peer.port", + "http.route", + "net.transport" + ] + } + } + }, + "children": {} + }, + "apollo_router::services::subgraph_service::aggregate_response_data": { + "name": "apollo_router::services::subgraph_service::aggregate_response_data", + "record": { + "entries": [], + "metadata": { + "name": "aggregate_response_data", + "target": "apollo_router::services::subgraph_service", + "level": "DEBUG", + "module_path": "apollo_router::services::subgraph_service", + "fields": { + "names": [] + } + } + }, + "children": {} + }, + "apollo_router::services::subgraph_service::parse_subgraph_response": { + "name": "apollo_router::services::subgraph_service::parse_subgraph_response", + "record": { + "entries": [], + "metadata": { + "name": "parse_subgraph_response", + "target": "apollo_router::services::subgraph_service", + "level": "DEBUG", + "module_path": "apollo_router::services::subgraph_service", + "fields": { + "names": [] + } + } + }, + "children": {} + } + } + }, + "apollo_router::query_planner::fetch::response_insert": { + "name": "apollo_router::query_planner::fetch::response_insert", "record": { "entries": [], "metadata": { - "name": "parse_subgraph_response", - "target": "apollo_router::services::subgraph_service", + "name": "response_insert", + "target": "apollo_router::query_planner::fetch", "level": "DEBUG", - "module_path": "apollo_router::services::subgraph_service", + "module_path": "apollo_router::query_planner::fetch", "fields": { "names": [] } @@ -903,22 +1060,6 @@ expression: get_spans() "children": {} } } - }, - "apollo_router::query_planner::fetch::response_insert": { - "name": "apollo_router::query_planner::fetch::response_insert", - "record": { - "entries": [], - "metadata": { - "name": "response_insert", - "target": "apollo_router::query_planner::fetch", - "level": "DEBUG", - "module_path": "apollo_router::query_planner::fetch", - "fields": { - "names": [] - } - } - }, - "children": {} } } } diff --git a/apollo-router/tests/snapshots/integration_tests__traced_basic_request.snap b/apollo-router/tests/snapshots/integration_tests__traced_basic_request.snap index 7a9e52a92b..ab56e90925 100644 --- a/apollo-router/tests/snapshots/integration_tests__traced_basic_request.snap +++ b/apollo-router/tests/snapshots/integration_tests__traced_basic_request.snap @@ -30,16 +30,32 @@ expression: get_spans() "" ], [ - "client_name", + "client.name", "" ], [ - "client_version", + "client.version", "" ], [ "otel.kind", "internal" + ], + [ + "apollo_private.field_level_instrumentation_ratio", + 0.0 + ], + [ + "apollo_private.graphql.variables", + "{\"reviewsForAuthorAuthorId\":[\"\"],\"topProductsFirst\":[\"\"]}" + ], + [ + "apollo_private.http.request_headers", + "{\"content-type\":[\"\"]}" + ], + [ + "apollo_private.operation_signature", + "# -\n{topProducts{name name}}" ] ], "metadata": { @@ -51,9 +67,13 @@ expression: get_spans() "names": [ "graphql.document", "graphql.operation.name", - "client_name", - "client_version", - "otel.kind" + "client.name", + "client.version", + "otel.kind", + "apollo_private.field_level_instrumentation_ratio", + 
"apollo_private.operation_signature", + "apollo_private.graphql.variables", + "apollo_private.http.request_headers" ] } } @@ -131,6 +151,10 @@ expression: get_spans() [ "otel.kind", "internal" + ], + [ + "ftv1.do_not_sample_reason", + "" ] ], "metadata": { @@ -142,7 +166,8 @@ expression: get_spans() "names": [ "graphql.document", "graphql.operation.name", - "otel.kind" + "otel.kind", + "ftv1.do_not_sample_reason" ] } } @@ -155,6 +180,14 @@ expression: get_spans() [ "otel.kind", "internal" + ], + [ + "service.name", + "products" + ], + [ + "apollo_private.sent_time_offset", + 0 ] ], "metadata": { @@ -164,7 +197,9 @@ expression: get_spans() "module_path": "apollo_router::query_planner", "fields": { "names": [ - "otel.kind" + "otel.kind", + "service.name", + "apollo_private.sent_time_offset" ] } } @@ -217,7 +252,8 @@ expression: get_spans() "name", "graphql.document", "graphql.operation.name", - "otel.kind" + "otel.kind", + "apollo_private.ftv1" ] } } diff --git a/apollo-spaceport/build.rs b/apollo-spaceport/build.rs index de33310a4e..853f8b2abf 100644 --- a/apollo-spaceport/build.rs +++ b/apollo-spaceport/build.rs @@ -42,13 +42,27 @@ fn main() -> Result<(), Box> { let proto_files = vec!["proto/agents.proto", "proto/reports.proto"]; tonic_build::configure() - .type_attribute("ContextualizedStats", "#[derive(serde::Serialize)]") - .type_attribute("StatsContext", "#[derive(serde::Serialize)]") - .type_attribute("QueryLatencyStats", "#[derive(serde::Serialize)]") - .type_attribute("TypeStat", "#[derive(serde::Serialize)]") - .type_attribute("PathErrorStats", "#[derive(serde::Serialize)]") - .type_attribute("FieldStat", "#[derive(serde::Serialize)]") - .type_attribute("ReferencedFieldsForType", "#[derive(serde::Serialize)]") + .field_attribute( + "Trace.start_time", + "#[serde(serialize_with = \"crate::serialize_timestamp\")]", + ) + .field_attribute( + "Trace.end_time", + "#[serde(serialize_with = \"crate::serialize_timestamp\")]", + ) + .field_attribute( + "FetchNode.sent_time", + "#[serde(serialize_with = \"crate::serialize_timestamp\")]", + ) + .field_attribute( + "FetchNode.received_time", + "#[serde(serialize_with = \"crate::serialize_timestamp\")]", + ) + .field_attribute( + "Report.end_time", + "#[serde(serialize_with = \"crate::serialize_timestamp\")]", + ) + .type_attribute(".", "#[derive(serde::Serialize)]") .type_attribute("StatsContext", "#[derive(Eq, Hash)]") .build_server(true) .compile(&proto_files, &["."])?; diff --git a/apollo-spaceport/src/lib.rs b/apollo-spaceport/src/lib.rs index b3ca54a2c8..1c70891b7e 100644 --- a/apollo-spaceport/src/lib.rs +++ b/apollo-spaceport/src/lib.rs @@ -14,8 +14,10 @@ use std::error::Error; use agent::reporter_client::ReporterClient; pub use agent::*; +pub use prost::*; pub use prost_types::Timestamp; pub use report::*; +use serde::ser::SerializeStruct; use sys_info::hostname; use tokio::task::JoinError; use tonic::codegen::http::uri::InvalidUri; @@ -190,3 +192,21 @@ impl Reporter { self.client.add(Request::new(request)).await } } + +pub fn serialize_timestamp( + timestamp: &Option, + serializer: S, +) -> Result +where + S: serde::Serializer, +{ + match timestamp { + Some(ts) => { + let mut ts_strukt = serializer.serialize_struct("Timestamp", 2)?; + ts_strukt.serialize_field("seconds", &ts.seconds)?; + ts_strukt.serialize_field("nanos", &ts.nanos)?; + ts_strukt.end() + } + None => serializer.serialize_none(), + } +} diff --git a/deny.toml b/deny.toml index e049878b0b..505e548970 100644 --- a/deny.toml +++ b/deny.toml @@ -45,6 +45,7 @@ allow = [ 
"MIT", "MPL-2.0", "LicenseRef-ELv2", + "Unicode-DFS-2016" ] copyleft = "warn" allow-osi-fsf-free = "neither" diff --git a/docs/source/configuration/apollo-telemetry.mdx b/docs/source/configuration/apollo-telemetry.mdx index c04ce963fa..8a0ff82137 100644 --- a/docs/source/configuration/apollo-telemetry.mdx +++ b/docs/source/configuration/apollo-telemetry.mdx @@ -38,15 +38,28 @@ To connect the Apollo Router to an external Spaceport instance, specify its endp ```yaml title="router.yaml" telemetry: - # Optional Apollo telemetry configuration. - apollo: - - # Optional external Spaceport URL. - # If not specified, an in-process Spaceport is used. - endpoint: "https://my-spaceport" - + apollo: + # The percentage of requests will include HTTP request and response headers in traces sent to Apollo Studio. + # This is expensive and should be left at a low value. + # This cannot be higher than tracing->trace_config->sampler + field_level_instrumentation_sampler: 0.01 # (default) + + # Include HTTP request and response headers in traces sent to Apollo Studio + send_headers: # other possible values are all, only (with an array), except (with an array), none (by default) + except: # Send all headers except referer + - referer + + # Include variable values in Apollo in traces sent to Apollo Studio + send_variable_values: # other possible values are all, only (with an array), except (with an array), none (by default) + except: # Send all variable values except for variable named first + - first + tracing: + trace_config: + sampler: 0.5 # The percentage of requests that will generate traces (a rate or `always_on` or `always_off`) ``` +Note that `field_level_instrumentation_sampler` may not sample at a greater rate than `trace_config/sampler`.**** + ## Running Spaceport externally (not recommended) Running spaceport as a separate process currently requires building from [source](https://github.com/apollographql/router/tree/main/apollo-spaceport). diff --git a/docs/source/customizations/native.mdx b/docs/source/customizations/native.mdx index 34e2fd808c..83799eca0d 100644 --- a/docs/source/customizations/native.mdx +++ b/docs/source/customizations/native.mdx @@ -250,3 +250,5 @@ After the new configuration is deemed valid, the router shifts to it. The previo ### Testing plugins Unit testing of a plugin is typically most helpful and there are extensive examples of plugin testing in the examples and plugins directories. + +> If you need an unique identifier for your request, we generate a `RequestId` in the orinating request extensions. You can get it by writing `req.originating_request.extensions().get::().expect("request id must be set").clone()` in a `map_request` callback. \ No newline at end of file diff --git a/licenses.html b/licenses.html index 4c6fe48d28..a759c0d865 100644 --- a/licenses.html +++ b/licenses.html @@ -8351,6 +8351,7 @@

Used by:

                              Apache License
                         Version 2.0, January 2004
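
As an aside to the `RequestId` note added in docs/source/customizations/native.mdx above: the lookup it describes is a typed `http::Extensions` read. The following is a minimal, self-contained sketch of that pattern only; the `RequestId(String)` stand-in type and the inserted value are assumptions for illustration, not the router's real `RequestId` type or API.

    use http::Extensions;

    // Stand-in for the router's RequestId type; illustration only.
    #[derive(Clone, Debug)]
    struct RequestId(String);

    fn main() {
        let mut extensions = Extensions::new();
        // In the router this value is inserted into the originating request's extensions.
        extensions.insert(RequestId("example-id".into()));
        // Mirrors `req.originating_request.extensions().get::<RequestId>()...`
        // from the native.mdx note above.
        let request_id = extensions
            .get::<RequestId>()
            .expect("request id must be set")
            .clone();
        println!("{request_id:?}");
    }
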
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index c639581df2..c5e2e445bb 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,4 @@
 [toolchain]
-# If updating this, you *must* also change the rust value in .tool-versions
 # renovate-automation: rustc version
 channel = "1.61.0"
 components = [ "rustfmt", "clippy" ]
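
As an appendix to the apollo_telemetry exporter change above: its comments describe stashing non-root spans keyed by parent span id and only assembling a trace once the root request span arrives. The sketch below illustrates that buffering idea only, with made-up types and a made-up root span name; it is not the router's implementation, which groups spans recursively, caps the stash with an LRU, and converts the result into an Apollo report.

    use std::collections::HashMap;

    // Illustrative stand-ins, not the router's real span types.
    #[derive(Debug, Clone)]
    struct Span {
        id: u64,
        parent_id: u64, // 0 means "no parent", i.e. a root span
        name: String,
    }

    // Assumed root span name; the exporter keys on its REQUEST_SPAN_NAME constant.
    const ROOT_NAME: &str = "request";

    /// Stash spans by parent id; when a root span arrives, drain its direct
    /// children and emit them together as one "trace".
    fn export(batch: Vec<Span>, stash: &mut HashMap<u64, Vec<Span>>) -> Vec<Vec<Span>> {
        let mut traces = Vec::new();
        for span in batch {
            if span.name == ROOT_NAME {
                let mut trace = stash.remove(&span.id).unwrap_or_default();
                trace.push(span);
                traces.push(trace);
            } else {
                stash.entry(span.parent_id).or_default().push(span);
            }
        }
        traces
    }

    fn main() {
        let mut stash = HashMap::new();
        let batch = vec![
            Span { id: 2, parent_id: 1, name: "fetch".into() },
            Span { id: 1, parent_id: 0, name: "request".into() },
        ];
        // The real exporter collects descendants recursively; gathering direct
        // children is enough here to show the buffering idea.
        let traces = export(batch, &mut stash);
        println!("{traces:#?}");
    }
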