From a21256c020984aee0b9365b7c38a073c0d01dda2 Mon Sep 17 00:00:00 2001 From: bryn Date: Thu, 12 Oct 2023 16:43:04 +0100 Subject: [PATCH] Unify resource handling The telemetry code had separate logic for constructing resources to pass to tracing and metrics exporters. This has now been unified and tests added. As a consequence, the default service.name for the router in tracing has now been brought into line with the otel spec, where if unspecified the value will be `unknown_service`. --- .changesets/fix_bryn_unify_resources.md | 15 + ...nfiguration__tests__schema_generation.snap | 70 +++- apollo-router/src/plugins/telemetry/config.rs | 129 ++----- .../src/plugins/telemetry/metrics/mod.rs | 34 +- apollo-router/src/plugins/telemetry/mod.rs | 1 + .../src/plugins/telemetry/resource.rs | 324 ++++++++++++++++++ .../src/plugins/telemetry/tracing/jaeger.rs | 2 - .../src/plugins/telemetry/tracing/zipkin.rs | 1 - docs/source/configuration/metrics.mdx | 57 +-- docs/source/configuration/tracing.mdx | 72 ++-- 10 files changed, 525 insertions(+), 180 deletions(-) create mode 100644 .changesets/fix_bryn_unify_resources.md create mode 100644 apollo-router/src/plugins/telemetry/resource.rs diff --git a/.changesets/fix_bryn_unify_resources.md b/.changesets/fix_bryn_unify_resources.md new file mode 100644 index 00000000000..d97c82b2f88 --- /dev/null +++ b/.changesets/fix_bryn_unify_resources.md @@ -0,0 +1,15 @@ +### Bring Otel `service.name` into line with the Otel spec ([PR #4034](https://github.com/apollographql/router/pull/4034)) + +Handling of Otel `service.name` has been brought into line with the [Otel spec](https://opentelemetry.io/docs/concepts/sdk-configuration/general-sdk-configuration/#otel_service_name) across traces and metrics. + +Service name discovery is handled in the following order: +1. `OTEL_SERVICE_NAME` env +2. `OTEL_RESOURCE_ATTRIBUTES` env +3. `router.yaml` `service_name` +4. 
`router.yaml` `resources` (attributes) + +If none of the above are found then the service name will be set to `unknown_service:apollo_router` or `unknown_service` if the executable name cannot be determined. + +Users who have not explicitly configured their service name should do so either via the yaml config file or via the `OTEL_SERVICE_NAME` environment variable. + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/4034 diff --git a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap index d43e9f7a1d0..04c81a9a1d4 100644 --- a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap +++ b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap @@ -4427,11 +4427,65 @@ expression: "&schema" "additionalProperties": false }, "resources": { - "description": "Resources", + "description": "Otel configuration via resource", "default": {}, "type": "object", "additionalProperties": { - "type": "string" + "anyOf": [ + { + "description": "bool values", + "type": "boolean" + }, + { + "description": "i64 values", + "type": "integer", + "format": "int64" + }, + { + "description": "f64 values", + "type": "number", + "format": "double" + }, + { + "description": "String values", + "type": "string" + }, + { + "description": "Array of homogeneous values", + "anyOf": [ + { + "description": "Array of bools", + "type": "array", + "items": { + "type": "boolean" + } + }, + { + "description": "Array of integers", + "type": "array", + "items": { + "type": "integer", + "format": "int64" + } + }, + { + "description": "Array of floats", + "type": "array", + "items": { + "type": "number", + "format": "double" + } + }, + { + "description": "Array of strings", + "type": "array", + "items": { + "type": "string" + 
} + } + ] + } + ] } }, "service_name": { @@ -5081,7 +5135,7 @@ expression: "&schema" "type": "object", "properties": { "attributes": { - "description": "Default attributes", + "description": "The resources configured on the tracing pipeline", "default": {}, "type": "object", "additionalProperties": { @@ -5212,13 +5266,15 @@ expression: "&schema" }, "service_name": { "description": "The trace service name", - "default": "router", - "type": "string" + "default": null, + "type": "string", + "nullable": true }, "service_namespace": { "description": "The trace service namespace", - "default": "", - "type": "string" + "default": null, + "type": "string", + "nullable": true } }, "additionalProperties": false diff --git a/apollo-router/src/plugins/telemetry/config.rs b/apollo-router/src/plugins/telemetry/config.rs index 20007ccf66e..a197085e45b 100644 --- a/apollo-router/src/plugins/telemetry/config.rs +++ b/apollo-router/src/plugins/telemetry/config.rs @@ -1,15 +1,10 @@ //! Configuration for the telemetry plugin. 
use std::collections::BTreeMap; -use std::env; use std::io::IsTerminal; use axum::headers::HeaderName; -use opentelemetry::sdk::resource::EnvResourceDetector; -use opentelemetry::sdk::resource::ResourceDetector; use opentelemetry::sdk::trace::SpanLimits; -use opentelemetry::sdk::Resource; use opentelemetry::Array; -use opentelemetry::KeyValue; use opentelemetry::Value; use regex::Regex; use schemars::JsonSchema; @@ -22,6 +17,7 @@ use crate::configuration::ConfigurationError; use crate::plugin::serde::deserialize_option_header_name; use crate::plugin::serde::deserialize_regex; use crate::plugins::telemetry::metrics; +use crate::plugins::telemetry::resource::ConfigResource; #[derive(thiserror::Error, Debug)] pub(crate) enum Error { @@ -89,8 +85,8 @@ pub(crate) struct MetricsCommon { pub(crate) service_name: Option, /// Set a service.namespace attribute in your metrics pub(crate) service_namespace: Option, - /// Resources - pub(crate) resources: HashMap, + /// Otel configuration via resource + pub(crate) resources: BTreeMap, /// Custom buckets for histograms pub(crate) buckets: Vec, /// Experimental metrics to know more about caching strategies @@ -123,7 +119,7 @@ impl Default for MetricsCommon { attributes: Default::default(), service_name: None, service_namespace: None, - resources: HashMap::new(), + resources: BTreeMap::new(), buckets: vec![ 0.001, 0.005, 0.015, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 5.0, 10.0, ], @@ -349,9 +345,9 @@ pub(crate) struct RequestPropagation { #[non_exhaustive] pub(crate) struct Trace { /// The trace service name - pub(crate) service_name: String, + pub(crate) service_name: Option, /// The trace service namespace - pub(crate) service_namespace: String, + pub(crate) service_namespace: Option, /// The sampler, always_on, always_off or a decimal between 0.0 and 1.0 pub(crate) sampler: SamplerOption, /// Whether to use parent based sampling @@ -366,10 +362,34 @@ pub(crate) struct Trace { pub(crate) max_attributes_per_event: u32, /// The 
maximum attributes per link before discarding pub(crate) max_attributes_per_link: u32, - /// Default attributes + /// The resources configured on the tracing pipeline pub(crate) attributes: BTreeMap, } +impl ConfigResource for Trace { + fn service_name(&self) -> Option { + self.service_name.clone() + } + fn service_namespace(&self) -> Option { + self.service_namespace.clone() + } + fn resource(&self) -> &BTreeMap { + &self.attributes + } +} + +impl ConfigResource for MetricsCommon { + fn service_name(&self) -> Option { + self.service_name.clone() + } + fn service_namespace(&self) -> Option { + self.service_namespace.clone() + } + fn resource(&self) -> &BTreeMap { + &self.resources + } +} + fn default_parent_based_sampler() -> bool { true } @@ -381,7 +401,7 @@ fn default_sampler() -> SamplerOption { impl Default for Trace { fn default() -> Self { Self { - service_name: "router".to_string(), + service_name: Default::default(), service_namespace: Default::default(), sampler: default_sampler(), parent_based_sampler: default_parent_based_sampler(), @@ -563,58 +583,8 @@ impl From<&Trace> for opentelemetry::sdk::trace::Config { trace_config = trace_config.with_max_attributes_per_event(config.max_attributes_per_event); trace_config = trace_config.with_max_attributes_per_link(config.max_attributes_per_link); - let mut resource_defaults = vec![]; - resource_defaults.push(KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAME, - config.service_name.clone(), - )); - resource_defaults.push(KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAMESPACE, - config.service_namespace.clone(), - )); - resource_defaults.push(KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_VERSION, - std::env!("CARGO_PKG_VERSION"), - )); - - if let Some(executable_name) = std::env::current_exe().ok().and_then(|path| { - path.file_name() - .and_then(|p| p.to_str().map(|s| s.to_string())) - }) { - resource_defaults.push(KeyValue::new( - 
opentelemetry_semantic_conventions::resource::PROCESS_EXECUTABLE_NAME, - executable_name, - )); - } - // Take the default first, then config, then env resources, then env variable. Last entry wins - let resource = Resource::new(resource_defaults) - .merge(&Resource::new( - config - .attributes - .iter() - .map(|(k, v)| { - KeyValue::new( - opentelemetry::Key::from(k.clone()), - opentelemetry::Value::from(v.clone()), - ) - }) - .collect::>(), - )) - .merge(&EnvResourceDetector::new().detect(Duration::from_secs(0))) - .merge(&Resource::new( - env::var("OTEL_SERVICE_NAME") - .ok() - .map(|v| { - vec![KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAME, - v, - )] - }) - .unwrap_or_default(), - )); - - trace_config = trace_config.with_resource(resource); + trace_config = trace_config.with_resource(config.to_resource()); trace_config } } @@ -679,8 +649,6 @@ impl Conf { #[cfg(test)] mod tests { - use opentelemetry::sdk::trace::Config; - use opentelemetry_semantic_conventions::resource::SERVICE_NAME; use serde_json::json; use super::*; @@ -837,35 +805,4 @@ mod tests { AttributeValue::try_from(json!([1.1, true])).expect_err("mixed conversion must fail"); AttributeValue::try_from(json!([true, "bar"])).expect_err("mixed conversion must fail"); } - - #[test] - fn test_service_name() { - let router_config = Trace { - service_name: "foo".to_string(), - ..Default::default() - }; - let otel_config: Config = (&router_config).into(); - assert_eq!( - Some(Value::String("foo".into())), - otel_config.resource.get(SERVICE_NAME) - ); - - // Env should take precedence - env::set_var("OTEL_RESOURCE_ATTRIBUTES", "service.name=bar"); - let otel_config: Config = (&router_config).into(); - assert_eq!( - Some(Value::String("bar".into())), - otel_config.resource.get(SERVICE_NAME) - ); - - // Env should take precedence - env::set_var("OTEL_SERVICE_NAME", "bif"); - let otel_config: Config = (&router_config).into(); - assert_eq!( - Some(Value::String("bif".into())), - 
otel_config.resource.get(SERVICE_NAME) - ); - env::remove_var("OTEL_SERVICE_NAME"); - env::remove_var("OTEL_RESOURCE_ATTRIBUTES"); - } } diff --git a/apollo-router/src/plugins/telemetry/metrics/mod.rs b/apollo-router/src/plugins/telemetry/metrics/mod.rs index 351988030c0..767bada8820 100644 --- a/apollo-router/src/plugins/telemetry/metrics/mod.rs +++ b/apollo-router/src/plugins/telemetry/metrics/mod.rs @@ -11,7 +11,6 @@ use opentelemetry::sdk::metrics::reader::AggregationSelector; use opentelemetry::sdk::metrics::Aggregation; use opentelemetry::sdk::metrics::InstrumentKind; use opentelemetry::sdk::resource::ResourceDetector; -use opentelemetry::sdk::resource::SdkProvidedResourceDetector; use opentelemetry::sdk::Resource; use opentelemetry_api::KeyValue; use regex::Regex; @@ -29,6 +28,7 @@ use crate::plugins::telemetry::apollo_exporter::Sender; use crate::plugins::telemetry::config::AttributeValue; use crate::plugins::telemetry::config::Conf; use crate::plugins::telemetry::config::MetricsCommon; +use crate::plugins::telemetry::resource::ConfigResource; use crate::router_factory::Endpoint; use crate::Context; use crate::ListenAddr; @@ -37,7 +37,6 @@ pub(crate) mod apollo; pub(crate) mod otlp; pub(crate) mod prometheus; pub(crate) mod span_metrics_exporter; -static UNKNOWN_SERVICE: &str = "unknown_service"; #[derive(Debug, Clone, Deserialize, JsonSchema, Default)] #[serde(deny_unknown_fields, default)] @@ -475,36 +474,7 @@ impl ResourceDetector for ConfigResourceDetector { impl MetricsBuilder { pub(crate) fn new(config: &Conf) -> Self { - let metrics_common_config = &config.metrics.common; - - let mut resource = Resource::from_detectors( - Duration::from_secs(0), - vec![ - Box::new(ConfigResourceDetector(metrics_common_config.clone())), - Box::new(SdkProvidedResourceDetector), - Box::new(opentelemetry::sdk::resource::EnvResourceDetector::new()), - ], - ); - - // Otel resources can be initialized from env variables, there is an override mechanism, but it's broken for 
service name as it will always override service.name - // If the service name is set to unknown service then override it from the config - if resource.get(opentelemetry_semantic_conventions::resource::SERVICE_NAME) - == Some(UNKNOWN_SERVICE.into()) - { - if let Some(service_name) = Resource::from_detectors( - Duration::from_secs(0), - vec![Box::new(ConfigResourceDetector( - metrics_common_config.clone(), - ))], - ) - .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME) - { - resource = resource.merge(&mut Resource::new(vec![KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAME, - service_name, - )])); - } - } + let resource = config.metrics.common.to_resource(); Self { resource: resource.clone(), diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index f385762b927..cb8eb2fdde4 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -129,6 +129,7 @@ pub(crate) mod formatters; pub(crate) mod metrics; mod otlp; pub(crate) mod reload; +mod resource; pub(crate) mod tracing; pub(crate) mod utils; diff --git a/apollo-router/src/plugins/telemetry/resource.rs b/apollo-router/src/plugins/telemetry/resource.rs new file mode 100644 index 00000000000..42f54eb6809 --- /dev/null +++ b/apollo-router/src/plugins/telemetry/resource.rs @@ -0,0 +1,324 @@ +use std::collections::BTreeMap; +use std::env; +use std::time::Duration; + +use opentelemetry::sdk::resource::EnvResourceDetector; +use opentelemetry::sdk::resource::ResourceDetector; +use opentelemetry::sdk::Resource; +use opentelemetry::KeyValue; + +use crate::plugins::telemetry::config::AttributeValue; +const UNKNOWN_SERVICE: &str = "unknown_service"; +const OTEL_SERVICE_NAME: &str = "OTEL_SERVICE_NAME"; + +struct EnvServiceNameDetector; +// Used instead of SdkProvidedResourceDetector +impl ResourceDetector for EnvServiceNameDetector { + fn detect(&self, _timeout: Duration) -> Resource { + match 
env::var(OTEL_SERVICE_NAME) { + Ok(service_name) if !service_name.is_empty() => Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )]), + Ok(_) | Err(_) => Resource::new(vec![]), // return empty resource + } + } +} + +pub(crate) trait ConfigResource { + fn service_name(&self) -> Option; + fn service_namespace(&self) -> Option; + + fn resource(&self) -> &BTreeMap; + + fn to_resource(&self) -> Resource { + let config_resource_detector = ConfigResourceDetector { + service_name: self.service_name(), + service_namespace: self.service_namespace(), + resources: self.resource().clone(), + }; + + // Last one wins + let resource = Resource::from_detectors( + Duration::from_secs(0), + vec![ + Box::new(config_resource_detector), + Box::new(EnvResourceDetector::new()), + Box::new(EnvServiceNameDetector), + ], + ); + + // Default service name + if resource + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME) + .is_none() + { + let executable_name = executable_name(); + resource.merge(&Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + executable_name + .map(|executable_name| format!("{}:{}", UNKNOWN_SERVICE, executable_name)) + .unwrap_or_else(|| UNKNOWN_SERVICE.to_string()), + )])) + } else { + resource + } + } +} + +fn executable_name() -> Option { + let executable_name = std::env::current_exe().ok().and_then(|path| { + path.file_name() + .and_then(|p| p.to_str().map(|s| s.to_string())) + }); + executable_name +} + +struct ConfigResourceDetector { + service_name: Option, + service_namespace: Option, + resources: BTreeMap, +} + +impl ResourceDetector for ConfigResourceDetector { + fn detect(&self, _timeout: Duration) -> Resource { + let mut config_resources = vec![]; + + // For config resources last entry wins + + // Add any other resources from config + for (key, value) in self.resources.iter() { + config_resources.push(KeyValue::new(key.clone(), 
value.clone())); + } + + // Some other basic resources + config_resources.push(KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_VERSION, + std::env!("CARGO_PKG_VERSION"), + )); + if let Some(executable_name) = executable_name() { + config_resources.push(KeyValue::new( + opentelemetry_semantic_conventions::resource::PROCESS_EXECUTABLE_NAME, + executable_name, + )); + } + + // Service namespace + if let Some(service_namespace) = self.service_namespace.clone() { + config_resources.push(KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAMESPACE, + service_namespace.to_string(), + )); + } + + if let Some(service_name) = self.service_name.clone().or_else(|| { + // Yaml resources + if let Some(AttributeValue::String(name)) = self + .resources + .get(&opentelemetry_semantic_conventions::resource::SERVICE_NAME.to_string()) + { + Some(name.clone()) + } else { + None + } + }) { + config_resources.push(KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name.to_string(), + )); + } + Resource::new(config_resources) + } +} + +#[cfg(test)] +mod test { + use std::collections::BTreeMap; + use std::env; + + use opentelemetry_api::Key; + + use crate::plugins::telemetry::config::AttributeValue; + use crate::plugins::telemetry::resource::ConfigResource; + + struct TestConfig { + service_name: Option, + service_namespace: Option, + resources: BTreeMap, + } + impl ConfigResource for TestConfig { + fn service_name(&self) -> Option { + self.service_name.clone() + } + fn service_namespace(&self) -> Option { + self.service_namespace.clone() + } + fn resource(&self) -> &BTreeMap { + &self.resources + } + } + + #[test] + fn test_empty() { + let test_config = TestConfig { + service_name: None, + service_namespace: None, + resources: Default::default(), + }; + let resource = test_config.to_resource(); + assert!(resource + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME) + .unwrap() + .as_str() + 
.starts_with("unknown_service:apollo_router")); + assert!(resource + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAMESPACE) + .is_none()); + assert_eq!( + resource.get(opentelemetry_semantic_conventions::resource::SERVICE_VERSION), + Some(std::env!("CARGO_PKG_VERSION").into()) + ); + + assert!(resource + .get(opentelemetry_semantic_conventions::resource::PROCESS_EXECUTABLE_NAME) + .expect("expected excutable name") + .as_str() + .contains("apollo")); + } + + #[test] + fn test_config_resources() { + let test_config = TestConfig { + service_name: None, + service_namespace: None, + resources: BTreeMap::from_iter(vec![ + ( + opentelemetry_semantic_conventions::resource::SERVICE_NAME.to_string(), + AttributeValue::String("override-service-name".to_string()), + ), + ( + opentelemetry_semantic_conventions::resource::SERVICE_NAMESPACE.to_string(), + AttributeValue::String("override-namespace".to_string()), + ), + ( + "extra-key".to_string(), + AttributeValue::String("extra-value".to_string()), + ), + ]), + }; + let resource = test_config.to_resource(); + assert_eq!( + resource.get(opentelemetry_semantic_conventions::resource::SERVICE_NAME), + Some("override-service-name".into()) + ); + assert_eq!( + resource.get(opentelemetry_semantic_conventions::resource::SERVICE_NAMESPACE), + Some("override-namespace".into()) + ); + assert_eq!( + resource.get(Key::from_static_str("extra-key")), + Some("extra-value".into()) + ); + } + + #[test] + fn test_service_name_service_namespace() { + let test_config = TestConfig { + service_name: Some("override-service-name".to_string()), + service_namespace: Some("override-namespace".to_string()), + resources: BTreeMap::new(), + }; + let resource = test_config.to_resource(); + assert_eq!( + resource.get(opentelemetry_semantic_conventions::resource::SERVICE_NAME), + Some("override-service-name".into()) + ); + assert_eq!( + resource.get(opentelemetry_semantic_conventions::resource::SERVICE_NAMESPACE), + 
Some("override-namespace".into()) + ); + } + + #[test] + fn test_service_name_override() { + // Order of precedence + // OTEL_SERVICE_NAME env + // OTEL_RESOURCE_ATTRIBUTES env + // config service_name + // config resources + // unknown_service:executable_name + // unknown_service (Untested as it can't happen) + + assert!(TestConfig { + service_name: None, + service_namespace: None, + resources: Default::default(), + } + .to_resource() + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME) + .unwrap() + .as_str() + .starts_with("unknown_service:apollo_router")); + + assert_eq!( + TestConfig { + service_name: None, + service_namespace: None, + resources: BTreeMap::from_iter(vec![( + opentelemetry_semantic_conventions::resource::SERVICE_NAME.to_string(), + AttributeValue::String("yaml-resource".to_string()), + )]), + } + .to_resource() + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME), + Some("yaml-resource".into()) + ); + + assert_eq!( + TestConfig { + service_name: Some("yaml-service-name".to_string()), + service_namespace: None, + resources: BTreeMap::from_iter(vec![( + opentelemetry_semantic_conventions::resource::SERVICE_NAME.to_string(), + AttributeValue::String("yaml-resource".to_string()), + )]), + } + .to_resource() + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME), + Some("yaml-service-name".into()) + ); + + env::set_var("OTEL_RESOURCE_ATTRIBUTES", "service.name=env-resource"); + assert_eq!( + TestConfig { + service_name: Some("yaml-service-name".to_string()), + service_namespace: None, + resources: BTreeMap::from_iter(vec![( + opentelemetry_semantic_conventions::resource::SERVICE_NAME.to_string(), + AttributeValue::String("yaml-resource".to_string()), + )]), + } + .to_resource() + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME), + Some("env-resource".into()) + ); + + env::set_var("OTEL_SERVICE_NAME", "env-service-name"); + assert_eq!( + TestConfig { + service_name: 
Some("yaml-service-name".to_string()), + service_namespace: None, + resources: BTreeMap::from_iter(vec![( + opentelemetry_semantic_conventions::resource::SERVICE_NAME.to_string(), + AttributeValue::String("yaml-resource".to_string()), + )]), + } + .to_resource() + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME), + Some("env-service-name".into()) + ); + + env::remove_var("OTEL_SERVICE_NAME"); + env::remove_var("OTEL_RESOURCE_ATTRIBUTES"); + } +} diff --git a/apollo-router/src/plugins/telemetry/tracing/jaeger.rs b/apollo-router/src/plugins/telemetry/tracing/jaeger.rs index c915cdca172..b06ab7472a2 100644 --- a/apollo-router/src/plugins/telemetry/tracing/jaeger.rs +++ b/apollo-router/src/plugins/telemetry/tracing/jaeger.rs @@ -96,7 +96,6 @@ impl TracingConfigurator for Config { tracing::info!("Configuring Jaeger tracing: {} (agent)", batch_processor); let exporter = opentelemetry_jaeger::new_agent_pipeline() .with_trace_config(trace_config.into()) - .with_service_name(trace_config.service_name.clone()) .with(&agent.endpoint.to_socket(), |b, s| b.with_endpoint(s)) .build_async_agent_exporter(opentelemetry::runtime::Tokio)?; Ok(builder.with_span_processor( @@ -118,7 +117,6 @@ impl TracingConfigurator for Config { let exporter = opentelemetry_jaeger::new_collector_pipeline() .with_trace_config(trace_config.into()) - .with_service_name(trace_config.service_name.clone()) .with(&collector.username, |b, u| b.with_username(u)) .with(&collector.password, |b, p| b.with_password(p)) .with( diff --git a/apollo-router/src/plugins/telemetry/tracing/zipkin.rs b/apollo-router/src/plugins/telemetry/tracing/zipkin.rs index 589c5f7bca0..e34632d8c37 100644 --- a/apollo-router/src/plugins/telemetry/tracing/zipkin.rs +++ b/apollo-router/src/plugins/telemetry/tracing/zipkin.rs @@ -43,7 +43,6 @@ impl TracingConfigurator for Config { let exporter = opentelemetry_zipkin::new_pipeline() .with_trace_config(trace_config.into()) - 
.with_service_name(trace_config.service_name.clone()) .with(&self.endpoint.to_uri(&DEFAULT_ENDPOINT), |b, endpoint| { b.with_collector_endpoint(endpoint.to_string()) }) diff --git a/docs/source/configuration/metrics.mdx b/docs/source/configuration/metrics.mdx index b7c02db97ee..a302f2cc274 100644 --- a/docs/source/configuration/metrics.mdx +++ b/docs/source/configuration/metrics.mdx @@ -4,6 +4,41 @@ title: Collecting metrics in the Apollo Router The Apollo Router provides built-in support for metrics collection via [Prometheus](#using-prometheus) and [OpenTelemetry Collector](#using-opentelemetry-collector). +## Common configuration + +### Service name + +```yaml title="router.yaml" +telemetry: + metrics: + common: + # (Optional) Set the service name to easily find metrics related to the apollo-router in your metrics dashboards + service_name: "router" +``` + +Service name discovery is handled in the following order: +1. `OTEL_SERVICE_NAME` env +2. `OTEL_RESOURCE_ATTRIBUTES` env +3. `router.yaml` `service_name` +4. `router.yaml` `resources` (attributes) + +If none of the above are found then the service name will be set to `unknown_service:apollo_router` or `unknown_service` if the executable name cannot be determined. + +### Resource + +A Resource is a set of key-value pairs that provide additional information to an exporter. APMs may interpret and display resource information. + +```yaml title="router.yaml" +telemetry: + metrics: + common: + resources: + "environment.name": "production" + "environment.namespace": "{env.MY_K8_NAMESPACE_ENV_VARIABLE}" +``` + +> [See OpenTelemetry conventions for resources.](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/resource/semantic_conventions/README.md) + ## Using Prometheus You can use [Prometheus and Grafana](https://prometheus.io/docs/visualization/grafana/) to collect metrics and visualize the router metrics. 
@@ -11,11 +46,6 @@ You can use [Prometheus and Grafana](https://prometheus.io/docs/visualization/gr ```yaml title="router.yaml" telemetry: metrics: - common: - # (Optional, default to "apollo-router") Set the service name to easily find metrics related to the apollo-router in your metrics dashboards - service_name: "apollo-router" - # (Optional) - service_namespace: "apollo" prometheus: # By setting this endpoint you enable the Prometheus exporter # All our endpoints exposed by plugins are namespaced by the name of the plugin @@ -279,20 +309,3 @@ telemetry: - 20.00 ``` -## Adding custom resources - -Resources are similar to [attributes](#adding-custom-attributeslabels), but there are more globals. They're configured directly on the metrics exporter, which means they're always present on each of your metrics. - -As an example, it can be useful to set a `environment_name` resource to help you identify metrics related to a particular environment: - -```yaml title="router.yaml" -telemetry: - metrics: - common: - resources: - environment_name: "production" -``` - -> [See OpenTelemetry conventions for resources.](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/resource/semantic_conventions/README.md) -> -> For example, if you want to use a Datadog agent and specify a service name, you should set the `service.name` resource as shown above and described in the conventions document. diff --git a/docs/source/configuration/tracing.mdx b/docs/source/configuration/tracing.mdx index 0681c2298f5..ba0464ff13e 100644 --- a/docs/source/configuration/tracing.mdx +++ b/docs/source/configuration/tracing.mdx @@ -24,39 +24,54 @@ Span data is sent to a collector such as [Jaeger](https://www.jaegertracing.io/) ## Common configuration -### Trace config - -In your router's YAML config file, the `trace_config` section contains common configuration that's used by all exporters. 
This section is optional, and it falls back on the values of environment variables specified by the [OpenTelemetry spec](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/sdk-environment-variables.md) if `service_name` is not set. +### Service name ```yaml title="router.yaml" telemetry: tracing: trace_config: + # (Optional) Set the service name to easily find traces related to the apollo-router in your metrics dashboards service_name: "router" - service_namespace: "apollo" - # Optional. Either a float between 0 and 1 or 'always_on' or 'always_off' - sampler: 0.1 +``` - # Optional. Use a parent based sampler. This enables remote spans help make a decision on if a span is sampeld or not. - # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/sdk.md#parentbased - parent_based_sampler: false +Service name discovery is handled in the following order: +1. `OTEL_SERVICE_NAME` env +2. `OTEL_RESOURCE_ATTRIBUTES` env +3. `router.yaml` `service_name` +4. `router.yaml` `resources` (attributes) - # Optional limits - max_attributes_per_event: 10 - max_attributes_per_link: 10 - max_attributes_per_span: 10 - max_events_per_span: 10 - max_links_per_span: 10 +If none of the above are found then the service name will be set to `unknown_service:apollo_router` or `unknown_service` if the executable name cannot be determined. + +### Resource + +A Resource is a set of key-value pairs that provide additional global information exported with your traces. - # Attributes particular to an exporter that have not - # been explicitly handled in Router configuration. 
+```yaml title="router.yaml" +telemetry: + tracing: + trace_config: attributes: - some.config.attribute: "config value" + "environment.name": "production" + "environment.namespace": "{env.MY_K8_NAMESPACE_ENV_VARIABLE}" +``` + +> [See OpenTelemetry conventions for resources.](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/resource/semantic_conventions/README.md) + +### Sampling + +To prevent sending too many traces to your APM you may want to enable sampling. + +```yaml title="router.yaml" +telemetry: + tracing: + trace_config: + sampler: always_on # (default) all requests are sampled (always_on|always_off|<0.0-1.0>) + parent_based_sampler: true # (default) If an incoming span has OpenTelemetry headers then the request will always be sampled. ``` -If `service_name` is set, then environment variables are not used. However, you can embed environment variables into your router config using Unix `${key:default}` syntax. +Setting `sampler` to `0.1` will result in only 10% of your requests being sampled. -If no environment variable is set and `service_name` is not present then `router` is used as the default service name. +`parent_based_sampler` enables clients to make the sampling decision. This guarantees that a trace that starts at a client will also have spans at the router. You may wish to turn this off if your router is exposed directly to the internet. ### Propagation @@ -91,6 +106,23 @@ telemetry: Specifying explicit propagation is generally only required if you're using an exporter that supports multiple trace ID formats (e.g., OpenTelemetry Collector, Jaeger, or OpenTracing compatible exporters). +### Span limits + +You may set limits on spans to prevent sending too much data to your APM. 
+ +```yaml title="router.yaml" +telemetry: + tracing: + trace_config: + + # Optional limits + max_attributes_per_event: 10 + max_attributes_per_link: 10 + max_attributes_per_span: 10 + max_events_per_span: 10 + max_links_per_span: 10 +``` + ## Trace ID > This is part of an experimental feature, it means any time until it's stabilized (without the prefix `experimental_`) we might change the configuration shape or adding/removing features.