diff --git a/.changesets/maint_renee_router_297_monotonic_counters.md b/.changesets/maint_renee_router_297_monotonic_counters.md new file mode 100644 index 0000000000..dffc9b6f3b --- /dev/null +++ b/.changesets/maint_renee_router_297_monotonic_counters.md @@ -0,0 +1,13 @@ +### Docs-deprecate several metrics ([PR #6350](https://github.com/apollographql/router/pull/6350)) + +These metrics are deprecated in favor of better, OTel-compatible alternatives: + +- `apollo_router_deduplicated_subscriptions_total` - use the `apollo.router.operations.subscriptions` metric's `subscriptions.deduplicated` attribute. +- `apollo_authentication_failure_count` - use the `apollo.router.operations.authentication.jwt` metric's `authentication.jwt.failed` attribute. +- `apollo_authentication_success_count` - use the `apollo.router.operations.authentication.jwt` metric instead. If the `authentication.jwt.failed` attribute is *absent* or `false`, the authentication succeeded. +- `apollo_require_authentication_failure_count` - use the `http.server.request.duration` metric's `http.response.status_code` attribute. Requests with authentication failures have HTTP status code 401. +- `apollo_router_timeout` - this metric conflates timed-out requests from client to the router, and requests from the router to subgraphs. Timed-out requests have HTTP status code 504. Use the `http.response.status_code` attribute on the `http.server.request.duration` metric to identify timed-out router requests, and the same attribute on the `http.client.request.duration` metric to identify timed-out subgraph requests. + +The deprecated metrics will continue to work in the 1.x release line. + +By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6350 diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index 15f452ed28..4408d24c5c 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -170,10 +170,12 @@ where match res { Some(v) => { - tracing::info!( - monotonic_counter.apollo_router_cache_hit_count = 1u64, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Memory), + u64_counter!( + "apollo_router_cache_hit_count", + "Number of cache hits", + 1, + kind = self.caller, + storage = CacheStorageName::Memory.to_string() ); let duration = instant_memory.elapsed().as_secs_f64(); tracing::info!( @@ -190,10 +192,12 @@ where kind = %self.caller, storage = &tracing::field::display(CacheStorageName::Memory), ); - tracing::info!( - monotonic_counter.apollo_router_cache_miss_count = 1u64, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Memory), + u64_counter!( + "apollo_router_cache_miss_count", + "Number of cache misses", + 1, + kind = self.caller, + storage = CacheStorageName::Memory.to_string() ); let instant_redis = Instant::now(); @@ -214,10 +218,12 @@ where Some(v) => { self.insert_in_memory(key.clone(), v.0.clone()).await; - tracing::info!( - monotonic_counter.apollo_router_cache_hit_count = 1u64, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Redis), + u64_counter!( + "apollo_router_cache_hit_count", + "Number of cache hits", + 1, + kind = self.caller, + storage = CacheStorageName::Redis.to_string() ); let duration = instant_redis.elapsed().as_secs_f64(); tracing::info!( @@ -228,10 +234,12 @@ where Some(v.0) } None => { - tracing::info!( - monotonic_counter.apollo_router_cache_miss_count = 1u64, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Redis), + u64_counter!( + "apollo_router_cache_miss_count", + "Number of cache misses", + 1, + kind = self.caller, + storage = CacheStorageName::Redis.to_string() ); let duration = instant_redis.elapsed().as_secs_f64(); tracing::info!( diff --git a/apollo-router/src/graphql/request.rs b/apollo-router/src/graphql/request.rs index 1e51262dbf..fc572f70cb 100644 --- a/apollo-router/src/graphql/request.rs +++ b/apollo-router/src/graphql/request.rs @@ -202,9 +202,11 @@ impl Request { mode = %BatchingMode::BatchHttpLink // Only supported mode right now ); - tracing::info!( - monotonic_counter.apollo.router.operations.batching = 1u64, - mode = %BatchingMode::BatchHttpLink // Only supported mode right now + u64_counter!( + "apollo.router.operations.batching", + "Total requests with batched operations", + 1, + mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now ); for entry in value .as_array() @@ -229,9 +231,11 @@ impl Request { mode = "batch_http_link" // Only supported mode right now ); - tracing::info!( - monotonic_counter.apollo.router.operations.batching = 1u64, - mode = "batch_http_link" // Only supported mode right now + u64_counter!( + "apollo.router.operations.batching", + "Total requests with batched operations", + 1, + mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now ); for entry in value .as_array() diff --git a/apollo-router/src/metrics/mod.rs b/apollo-router/src/metrics/mod.rs index e24317cd06..4b2aa0b888 100644 --- a/apollo-router/src/metrics/mod.rs +++ b/apollo-router/src/metrics/mod.rs @@ -496,7 +496,9 @@ pub(crate) fn meter_provider() -> AggregateMeterProvider { } #[macro_export] -/// Get or create a u64 monotonic counter metric and add a value to it +/// Get or create a `u64` monotonic counter metric and add a value to it. +/// +/// Each metric needs a description. /// /// This macro is a replacement for the telemetry crate's MetricsLayer. We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. /// The reason for this is that the MetricsLayer has: @@ -506,6 +508,33 @@ pub(crate) fn meter_provider() -> AggregateMeterProvider { /// * Imperfect mapping to metrics API that can only be checked at runtime. /// /// New metrics should be added using these macros. +/// +/// # Examples +/// ```ignore +/// // Count a thing: +/// u64_counter!( +/// "apollo.router.operations.frobbles", +/// "The amount of frobbles we've operated on", +/// 1 +/// ); +/// // Count a thing with attributes: +/// u64_counter!( +/// "apollo.router.operations.frobbles", +/// "The amount of frobbles we've operated on", +/// 1, +/// frobbles.color = "blue" +/// ); +/// // Count a thing with dynamic attributes: +/// let attributes = [ +/// opentelemetry::KeyValue::new("frobbles.color".to_string(), "blue".into()), +/// ]; +/// u64_counter!( +/// "apollo.router.operations.frobbles", +/// "The amount of frobbles we've operated on", +/// 1, +/// attributes +/// ); +/// ``` #[allow(unused_macros)] macro_rules! u64_counter { ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { diff --git a/apollo-router/src/notification.rs b/apollo-router/src/notification.rs index 7cfba87e7a..ac1f33645d 100644 --- a/apollo-router/src/notification.rs +++ b/apollo-router/src/notification.rs @@ -510,7 +510,11 @@ where match Pin::new(&mut this.msg_receiver).poll_next(cx) { Poll::Ready(Some(Err(BroadcastStreamRecvError::Lagged(_)))) => { - tracing::info!(monotonic_counter.apollo_router_skipped_event_count = 1u64,); + u64_counter!( + "apollo_router_skipped_event_count", + "Amount of events dropped from the internal message queue", + 1u64 + ); self.poll_next(cx) } Poll::Ready(None) => Poll::Ready(None), diff --git a/apollo-router/src/plugins/authentication/mod.rs b/apollo-router/src/plugins/authentication/mod.rs index 9f9abc8040..0239e1f005 100644 --- a/apollo-router/src/plugins/authentication/mod.rs +++ b/apollo-router/src/plugins/authentication/mod.rs @@ -539,8 +539,6 @@ fn authenticate( jwks_manager: &JwksManager, request: router::Request, ) -> ControlFlow { - const AUTHENTICATION_KIND: &str = "JWT"; - // We are going to do a lot of similar checking so let's define a local function // to help reduce repetition fn failure_message( @@ -549,17 +547,16 @@ fn authenticate( status: StatusCode, ) -> ControlFlow { // This is a metric and will not appear in the logs - tracing::info!( - monotonic_counter.apollo_authentication_failure_count = 1u64, - kind = %AUTHENTICATION_KIND + u64_counter!( + "apollo_authentication_failure_count", + "Number of requests with failed JWT authentication (deprecated)", + 1, + kind = "JWT" ); - tracing::info!( - monotonic_counter - .apollo - .router - .operations - .authentication - .jwt = 1, + u64_counter!( + "apollo.router.operations.authentication.jwt", + "Number of requests with JWT authentication", + 1, authentication.jwt.failed = true ); tracing::info!(message = %error, "jwt authentication failure"); @@ -662,11 +659,17 @@ fn authenticate( ); } // This is a metric and will not appear in the logs - tracing::info!( - monotonic_counter.apollo_authentication_success_count = 1u64, - kind = %AUTHENTICATION_KIND + u64_counter!( + "apollo_authentication_success_count", + "Number of requests with successful JWT authentication (deprecated)", + 1, + kind = "JWT" + ); + u64_counter!( + "apollo.router.operations.jwt", + "Number of requests with JWT authentication", + 1 ); - tracing::info!(monotonic_counter.apollo.router.operations.jwt = 1u64); return ControlFlow::Continue(request); } diff --git a/apollo-router/src/plugins/authentication/subgraph.rs b/apollo-router/src/plugins/authentication/subgraph.rs index 568aa9c8ac..4a0bcd4d65 100644 --- a/apollo-router/src/plugins/authentication/subgraph.rs +++ b/apollo-router/src/plugins/authentication/subgraph.rs @@ -409,17 +409,21 @@ impl SigningParamsConfig { } fn increment_success_counter(subgraph_name: &str) { - tracing::info!( - monotonic_counter.apollo.router.operations.authentication.aws.sigv4 = 1u64, + u64_counter!( + "apollo.router.operations.authentication.aws.sigv4", + "Number of subgraph requests signed with AWS SigV4", + 1, authentication.aws.sigv4.failed = false, - subgraph.service.name = %subgraph_name, + subgraph.service.name = subgraph_name.to_string() ); } fn increment_failure_counter(subgraph_name: &str) { - tracing::info!( - monotonic_counter.apollo.router.operations.authentication.aws.sigv4 = 1u64, + u64_counter!( + "apollo.router.operations.authentication.aws.sigv4", + "Number of subgraph requests signed with AWS SigV4", + 1, authentication.aws.sigv4.failed = true, - subgraph.service.name = %subgraph_name, + subgraph.service.name = subgraph_name.to_string() ); } diff --git a/apollo-router/src/plugins/authorization/mod.rs b/apollo-router/src/plugins/authorization/mod.rs index 331641a726..63b2062a05 100644 --- a/apollo-router/src/plugins/authorization/mod.rs +++ b/apollo-router/src/plugins/authorization/mod.rs @@ -556,8 +556,10 @@ impl Plugin for AuthorizationPlugin { Ok(ControlFlow::Continue(request)) } else { // This is a metric and will not appear in the logs - tracing::info!( - monotonic_counter.apollo_require_authentication_failure_count = 1u64, + u64_counter!( + "apollo_require_authentication_failure_count", + "Number of unauthenticated requests (deprecated)", + 1 ); tracing::error!("rejecting unauthenticated request"); let response = supergraph::Response::error_builder() @@ -588,11 +590,13 @@ impl Plugin for AuthorizationPlugin { let needs_requires_scopes = request.context.contains_key(REQUIRED_SCOPES_KEY); if needs_authenticated || needs_requires_scopes { - tracing::info!( - monotonic_counter.apollo.router.operations.authorization = 1u64, + u64_counter!( + "apollo.router.operations.authorization", + "Number of subgraph requests requiring authorization", + 1, authorization.filtered = filtered, authorization.needs_authenticated = needs_authenticated, - authorization.needs_requires_scopes = needs_requires_scopes, + authorization.needs_requires_scopes = needs_requires_scopes ); } diff --git a/apollo-router/src/plugins/subscription.rs b/apollo-router/src/plugins/subscription.rs index 50d5e78ead..1d342f586f 100644 --- a/apollo-router/src/plugins/subscription.rs +++ b/apollo-router/src/plugins/subscription.rs @@ -503,10 +503,12 @@ impl Service for CallbackService { }; // Keep the subscription to the client opened payload.subscribed = Some(true); - tracing::info!( - monotonic_counter.apollo.router.operations.subscriptions.events = 1u64, - subscriptions.mode="callback" - ); + u64_counter!( + "apollo.router.operations.subscriptions.events", + "Number of subscription events", + 1, + subscriptions.mode = "callback" + ); handle.send_sync(payload)?; Ok(router::Response { @@ -626,10 +628,12 @@ impl Service for CallbackService { }); } }; - tracing::info!( - monotonic_counter.apollo.router.operations.subscriptions.events = 1u64, - subscriptions.mode="callback", - subscriptions.complete=true + u64_counter!( + "apollo.router.operations.subscriptions.events", + "Number of subscription events", + 1, + subscriptions.mode = "callback", + subscriptions.complete = true ); if let Err(_err) = handle.send_sync( graphql::Response::builder().errors(errors).build(), diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 14ba7199f9..da5361e1c1 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -131,7 +131,6 @@ use crate::plugins::telemetry::reload::OPENTELEMETRY_TRACER_HANDLE; use crate::plugins::telemetry::tracing::apollo_telemetry::decode_ftv1_trace; use crate::plugins::telemetry::tracing::apollo_telemetry::APOLLO_PRIVATE_OPERATION_SIGNATURE; use crate::plugins::telemetry::tracing::TracingConfigurator; -use crate::plugins::telemetry::utils::TracingUtils; use crate::query_planner::OperationKind; use crate::register_private_plugin; use crate::router_factory::Endpoint; @@ -1747,28 +1746,32 @@ impl Telemetry { } fn plugin_metrics(config: &Arc) { - let metrics_prom_used = config.exporters.metrics.prometheus.enabled; - let metrics_otlp_used = MetricsConfigurator::enabled(&config.exporters.metrics.otlp); - let tracing_otlp_used = TracingConfigurator::enabled(&config.exporters.tracing.otlp); - let tracing_datadog_used = config.exporters.tracing.datadog.enabled(); - let tracing_jaeger_used = config.exporters.tracing.jaeger.enabled(); - let tracing_zipkin_used = config.exporters.tracing.zipkin.enabled(); - - if metrics_prom_used - || metrics_otlp_used - || tracing_jaeger_used - || tracing_otlp_used - || tracing_zipkin_used - || tracing_datadog_used - { - ::tracing::info!( - monotonic_counter.apollo.router.operations.telemetry = 1u64, - telemetry.metrics.otlp = metrics_otlp_used.or_empty(), - telemetry.metrics.prometheus = metrics_prom_used.or_empty(), - telemetry.tracing.otlp = tracing_otlp_used.or_empty(), - telemetry.tracing.datadog = tracing_datadog_used.or_empty(), - telemetry.tracing.jaeger = tracing_jaeger_used.or_empty(), - telemetry.tracing.zipkin = tracing_zipkin_used.or_empty(), + let mut attributes = Vec::new(); + if MetricsConfigurator::enabled(&config.exporters.metrics.otlp) { + attributes.push(KeyValue::new("telemetry.metrics.otlp", true)); + } + if config.exporters.metrics.prometheus.enabled { + attributes.push(KeyValue::new("telemetry.metrics.prometheus", true)); + } + if TracingConfigurator::enabled(&config.exporters.tracing.otlp) { + attributes.push(KeyValue::new("telemetry.tracing.otlp", true)); + } + if config.exporters.tracing.datadog.enabled() { + attributes.push(KeyValue::new("telemetry.tracing.datadog", true)); + } + if config.exporters.tracing.jaeger.enabled() { + attributes.push(KeyValue::new("telemetry.tracing.jaeger", true)); + } + if config.exporters.tracing.zipkin.enabled() { + attributes.push(KeyValue::new("telemetry.tracing.zipkin", true)); + } + + if !attributes.is_empty() { + u64_counter!( + "apollo.router.operations.telemetry", + "Telemetry exporters enabled", + 1, + attributes ); } } diff --git a/apollo-router/src/plugins/traffic_shaping/timeout/future.rs b/apollo-router/src/plugins/traffic_shaping/timeout/future.rs index 8a390b393e..eda4100198 100644 --- a/apollo-router/src/plugins/traffic_shaping/timeout/future.rs +++ b/apollo-router/src/plugins/traffic_shaping/timeout/future.rs @@ -49,7 +49,11 @@ where match Pin::new(&mut this.sleep).poll(cx) { Poll::Pending => Poll::Pending, Poll::Ready(_) => { - tracing::info!(monotonic_counter.apollo_router_timeout = 1u64,); + u64_counter!( + "apollo_router_timeout", + "Number of timed out client requests", + 1 + ); Poll::Ready(Err(Elapsed::new().into())) } } diff --git a/apollo-router/src/protocols/websocket.rs b/apollo-router/src/protocols/websocket.rs index bd556232ac..13700e84ce 100644 --- a/apollo-router/src/protocols/websocket.rs +++ b/apollo-router/src/protocols/websocket.rs @@ -300,13 +300,10 @@ where request: graphql::Request, heartbeat_interval: Option, ) -> Result, graphql::Error> { - tracing::info!( - monotonic_counter - .apollo - .router - .operations - .subscriptions - .events = 1u64, + u64_counter!( + "apollo.router.operations.subscriptions.events", + "Number of subscription events", + 1, subscriptions.mode = "passthrough" ); @@ -443,13 +440,10 @@ where tracing::trace!("cannot shutdown sink: {err:?}"); }; - tracing::info!( - monotonic_counter - .apollo - .router - .operations - .subscriptions - .events = 1u64, + u64_counter!( + "apollo.router.operations.subscriptions.events", + "Number of subscription events", + 1, subscriptions.mode = "passthrough", subscriptions.complete = true ); diff --git a/apollo-router/src/query_planner/execution.rs b/apollo-router/src/query_planner/execution.rs index bd23f549ea..9a5b647350 100644 --- a/apollo-router/src/query_planner/execution.rs +++ b/apollo-router/src/query_planner/execution.rs @@ -82,7 +82,11 @@ impl QueryPlan { ) .await; if !deferred_fetches.is_empty() { - tracing::info!(monotonic_counter.apollo.router.operations.defer = 1u64); + u64_counter!( + "apollo.router.operations.defer", + "Number of requests that request deferred data", + 1 + ); } Response::builder().data(value).errors(errors).build() diff --git a/apollo-router/src/query_planner/fetch.rs b/apollo-router/src/query_planner/fetch.rs index 47069283ca..3ec89c1e95 100644 --- a/apollo-router/src/query_planner/fetch.rs +++ b/apollo-router/src/query_planner/fetch.rs @@ -521,7 +521,11 @@ impl FetchNode { self.response_at_path(parameters.schema, current_dir, paths, response); if let Some(id) = &self.id { if let Some(sender) = parameters.deferred_fetches.get(id.as_str()) { - tracing::info!(monotonic_counter.apollo.router.operations.defer.fetch = 1u64); + u64_counter!( + "apollo.router.operations.defer.fetch", + "Number of deferred responses fetched from subgraphs", + 1 + ); if let Err(e) = sender.clone().send((value.clone(), errors.clone())) { tracing::error!("error sending fetch result at path {} and id {:?} for deferred response building: {}", current_dir, self.id, e); } diff --git a/apollo-router/src/services/layers/persisted_queries/mod.rs b/apollo-router/src/services/layers/persisted_queries/mod.rs index 457ea67820..48c4dcba1d 100644 --- a/apollo-router/src/services/layers/persisted_queries/mod.rs +++ b/apollo-router/src/services/layers/persisted_queries/mod.rs @@ -122,7 +122,11 @@ impl PersistedQueryLayer { .context .extensions() .with_lock(|mut lock| lock.insert(UsedQueryIdFromManifest)); - tracing::info!(monotonic_counter.apollo.router.operations.persisted_queries = 1u64); + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1 + ); Ok(request) } else if manifest_poller.augmenting_apq_with_pre_registration_and_no_safelisting() { // The query ID isn't in our manifest, but we have APQ enabled @@ -131,8 +135,10 @@ impl PersistedQueryLayer { // safelist later for log_unknown!) Ok(request) } else { - tracing::info!( - monotonic_counter.apollo.router.operations.persisted_queries = 1u64, + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1, persisted_queries.not_found = true ); // if APQ is not enabled, return an error indicating the query was not found @@ -209,28 +215,38 @@ impl PersistedQueryLayer { match manifest_poller.action_for_freeform_graphql(Ok(&doc.ast)) { FreeformGraphQLAction::Allow => { - tracing::info!(monotonic_counter.apollo.router.operations.persisted_queries = 1u64,); + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1 + ); Ok(request) } FreeformGraphQLAction::Deny => { - tracing::info!( - monotonic_counter.apollo.router.operations.persisted_queries = 1u64, - persisted_queries.safelist.rejected.unknown = false, + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1, + persisted_queries.safelist.rejected.unknown = false ); Err(supergraph_err_operation_not_in_safelist(request)) } // Note that this might even include complaining about an operation that came via APQs. FreeformGraphQLAction::AllowAndLog => { - tracing::info!( - monotonic_counter.apollo.router.operations.persisted_queries = 1u64, + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1, persisted_queries.logged = true ); log_unknown_operation(operation_body); Ok(request) } FreeformGraphQLAction::DenyAndLog => { - tracing::info!( - monotonic_counter.apollo.router.operations.persisted_queries = 1u64, + u64_counter!( + "apollo.router.operations.persisted_queries", + "Total requests with persisted queries enabled", + 1, persisted_queries.safelist.rejected.unknown = true, persisted_queries.logged = true ); diff --git a/apollo-router/src/services/router/service.rs b/apollo-router/src/services/router/service.rs index e5792c1a4d..acc67e332d 100644 --- a/apollo-router/src/services/router/service.rs +++ b/apollo-router/src/services/router/service.rs @@ -378,8 +378,10 @@ impl RouterService { Ok(RouterResponse { response, context }) } else { - tracing::info!( - monotonic_counter.apollo.router.graphql_error = 1u64, + u64_counter!( + "apollo.router.graphql_error", + "Number of GraphQL error responses returned by the router", + 1, code = "INVALID_ACCEPT_HEADER" ); // Useful for selector in spans/instruments/events @@ -799,12 +801,18 @@ impl RouterService { for (code, count) in map { match code { None => { - tracing::info!(monotonic_counter.apollo.router.graphql_error = count,); + u64_counter!( + "apollo.router.graphql_error", + "Number of GraphQL error responses returned by the router", + count + ); } Some(code) => { - tracing::info!( - monotonic_counter.apollo.router.graphql_error = count, - code = code + u64_counter!( + "apollo.router.graphql_error", + "Number of GraphQL error responses returned by the router", + count, + code = code.to_string() ); } } diff --git a/apollo-router/src/services/subgraph_service.rs b/apollo-router/src/services/subgraph_service.rs index 9dbb9fb773..dc93499143 100644 --- a/apollo-router/src/services/subgraph_service.rs +++ b/apollo-router/src/services/subgraph_service.rs @@ -300,16 +300,20 @@ impl tower::Service for SubgraphService { })?; stream_tx.send(Box::pin(handle.into_stream())).await?; - tracing::info!( - monotonic_counter.apollo.router.operations.subscriptions = 1u64, - subscriptions.mode = %"callback", + u64_counter!( + "apollo.router.operations.subscriptions", + "Total requests with subscription operations", + 1, + subscriptions.mode = "callback", subscriptions.deduplicated = !created, - subgraph.service.name = service_name, + subgraph.service.name = service_name.clone() ); if !created { - tracing::info!( - monotonic_counter.apollo_router_deduplicated_subscriptions_total = 1u64, - mode = %"callback", + u64_counter!( + "apollo_router_deduplicated_subscriptions_total", + "Total deduplicated subscription requests (deprecated)", + 1, + mode = "callback" ); // Dedup happens here return Ok(SubgraphResponse::builder() @@ -507,19 +511,23 @@ async fn call_websocket( let (handle, created) = notify .create_or_subscribe(subscription_hash.clone(), false) .await?; - tracing::info!( - monotonic_counter.apollo.router.operations.subscriptions = 1u64, - subscriptions.mode = %"passthrough", + u64_counter!( + "apollo.router.operations.subscriptions", + "Total requests with subscription operations", + 1, + subscriptions.mode = "passthrough", subscriptions.deduplicated = !created, - subgraph.service.name = service_name, + subgraph.service.name = service_name.clone() ); if !created { subscription_stream_tx .send(Box::pin(handle.into_stream())) .await?; - tracing::info!( - monotonic_counter.apollo_router_deduplicated_subscriptions_total = 1u64, - mode = %"passthrough", + u64_counter!( + "apollo_router_deduplicated_subscriptions_total", + "Total deduplicated subscription requests (deprecated)", + 1, + mode = "passthrough" ); // Dedup happens here @@ -868,9 +876,14 @@ pub(crate) async fn process_batch( subgraph = &service ); - tracing::info!(monotonic_counter.apollo.router.operations.batching = 1u64, - mode = %BatchingMode::BatchHttpLink, // Only supported mode right now - subgraph = &service + u64_counter!( + "apollo.router.operations.batching", + "Total requests with batched operations", + 1, + // XXX(@goto-bus-stop): Should these be `batching.mode`, `batching.subgraph`? + // Also, other metrics use a different convention to report the subgraph name + mode = BatchingMode::BatchHttpLink.to_string(), // Only supported mode right now + subgraph = service.clone() ); // Perform the actual fetch. If this fails then we didn't manage to make the call at all, so we can't do anything with it. diff --git a/apollo-router/src/state_machine.rs b/apollo-router/src/state_machine.rs index e3ce6c3a67..ed5765d4e3 100644 --- a/apollo-router/src/state_machine.rs +++ b/apollo-router/src/state_machine.rs @@ -557,13 +557,20 @@ where #[cfg(test)] self.notify_updated.notify_one(); - tracing::debug!( - monotonic_counter.apollo_router_state_change_total = 1u64, + tracing::info!( event = event_name, state = ?state, previous_state, "state machine transitioned" ); + u64_counter!( + "apollo_router_state_change_total", + "Router state changes", + 1, + event = event_name, + state = format!("{state:?}"), + previous_state = previous_state + ); // If we've errored then exit even if there are potentially more messages if matches!(&state, Stopped | Errored(_)) { diff --git a/apollo-router/src/uplink/mod.rs b/apollo-router/src/uplink/mod.rs index 6a8974699e..2ea483daf4 100644 --- a/apollo-router/src/uplink/mod.rs +++ b/apollo-router/src/uplink/mod.rs @@ -210,7 +210,7 @@ where Response: Send + 'static + Debug, TransformedResponse: Send + 'static + Debug, { - let query = query_name::(); + let query_name = query_name::(); let (sender, receiver) = channel(2); let client = match reqwest::Client::builder() .no_gzip() @@ -245,10 +245,12 @@ where .await { Ok(response) => { - tracing::info!( - monotonic_counter.apollo_router_uplink_fetch_count_total = 1u64, + u64_counter!( + "apollo_router_uplink_fetch_count_total", + "Total number of requests to Apollo Uplink", + 1u64, status = "success", - query + query = query_name ); match response { UplinkResponse::New { @@ -294,10 +296,12 @@ where } } Err(err) => { - tracing::info!( - monotonic_counter.apollo_router_uplink_fetch_count_total = 1u64, + u64_counter!( + "apollo_router_uplink_fetch_count_total", + "Total number of requests to Apollo Uplink", + 1u64, status = "failure", - query + query = query_name ); if let Err(e) = sender.send(Err(err)).await { tracing::debug!("failed to send error to uplink stream. This is likely to be because the router is shutting down: {e}"); diff --git a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx index 9bcf4c83fa..dead9fb70b 100644 --- a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx @@ -16,11 +16,10 @@ These instruments can be consumed by configuring a [metrics exporter](/router/co - `apollo_router_http_request_duration_seconds_bucket` - HTTP subgraph request duration, attributes: - `subgraph`: (Optional) The subgraph being queried - `apollo_router_http_requests_total` - Total number of HTTP requests by HTTP status -- `apollo_router_timeout` - Number of triggered timeouts ### GraphQL -- `apollo_router_graphql_error` - counts GraphQL errors in responses, attributes: +- `apollo.router.graphql_error` - counts GraphQL errors in responses, attributes: - `code`: error code ### Session @@ -122,8 +121,17 @@ The initial call to Uplink during router startup is not reflected in metrics. The following metrics have been deprecated and should not be used. -- `apollo_router_span` - **Deprecated**—use `apollo_router_processing_time` instead. -- `apollo_router_http_request_retry_total` **Deprecated**- Number of subgraph requests retried. You should use: +- `apollo_router_span` - **Deprecated**: use `apollo_router_processing_time` instead. +- `apollo_router_deduplicated_subscriptions_total` - **Deprecated**: use the `apollo.router.operations.subscriptions` metric's `subscriptions.deduplicated` attribute. +- `apollo_authentication_failure_count` - **Deprecated**: use the `apollo.router.operations.authentication.jwt` metric's `authentication.jwt.failed` attribute. +- `apollo_authentication_success_count` - **Deprecated**: use the `apollo.router.operations.authentication.jwt` metric instead. If the `authentication.jwt.failed` attribute is *absent* or `false`, the authentication succeeded. +- `apollo_require_authentication_failure_count` - **Deprecated**: use the `http.server.request.duration` metric's `http.response.status_code` attribute. Requests with authentication failures have HTTP status code 401. +- `apollo_router_timeout` - **Deprecated**: this metric conflates timed-out requests from client to the router, and requests from the router to subgraphs. Timed-out requests have HTTP status code 504. Use the `http.response.status_code` attribute on the `http.server.request.duration` metric to identify timed-out router requests, and the same attribute on the `http.client.request.duration` metric to identify timed-out subgraph requests. +- `apollo_router_http_request_retry_total` **Deprecated**: this can be achieved with custom telemetry instead. See [below for an example](#migrate-from-apollo_router_http_request_retry_total). + +#### Migrate from `apollo_router_http_request_retry_total` + +The below configuration filters the deprecated metric, and defines a custom histogram recording the number of retried HTTP requests to subgraphs. ```yaml title="config.router.yaml" telemetry: @@ -158,4 +166,4 @@ telemetry: subgraph.name: true supergraph.operation.name: supergraph_operation_name: string -``` \ No newline at end of file +```