From 055faac4daa2999ce8d2b48df4bc651d647107f7 Mon Sep 17 00:00:00 2001 From: bryn Date: Wed, 27 Sep 2023 10:56:22 +0100 Subject: [PATCH 1/3] Metrics and root span creation now happen first Our metrics are currently mutated at the supergraph service. This is a throwback from when the router service did not exist. Move operation count metrics to the first thing that happens in the pipeline. Note that this won't catch disconnects because they are reliant on the status code of the response, which will stop processing before the metric is incremented. Fixes #3915 --- .changesets/fix_bryn_move_telemetry.md | 12 +++++++++++ .../axum_factory/axum_http_server_factory.rs | 16 +++++++++++++-- apollo-router/src/plugins/telemetry/mod.rs | 12 ----------- apollo-router/tests/common.rs | 17 ++++++++++------ apollo-router/tests/metrics_tests.rs | 20 ++++++++++++++----- 5 files changed, 52 insertions(+), 25 deletions(-) create mode 100644 .changesets/fix_bryn_move_telemetry.md diff --git a/.changesets/fix_bryn_move_telemetry.md b/.changesets/fix_bryn_move_telemetry.md new file mode 100644 index 0000000000..f128f62cdf --- /dev/null +++ b/.changesets/fix_bryn_move_telemetry.md @@ -0,0 +1,12 @@ +### Ensure that telemetry happens first ([Issue #3915](https://github.com/apollographql/router/issues/3915)) + +Telemetry related logic is now moved to the first thing in the router pipeline. + +Previously the metric `apollo.router.operations` may have missed some requests if they were failed at the router stage. +In addition, some logic happened before root spans were created, which would have caused missing traces. + +`apollo.router.operations` and root spans are now the first thing that happens in the router pipeline for graphql requests. + + + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/3919 diff --git a/apollo-router/src/axum_factory/axum_http_server_factory.rs b/apollo-router/src/axum_factory/axum_http_server_factory.rs index 0b414f25b1..08ea743f76 100644 --- a/apollo-router/src/axum_factory/axum_http_server_factory.rs +++ b/apollo-router/src/axum_factory/axum_http_server_factory.rs @@ -402,9 +402,10 @@ where (license, Instant::now(), Arc::new(AtomicU64::new(0))), license_handler, )) - .layer(TraceLayer::new_for_http().make_span_with(PropagatingMakeSpan { license })) .layer(Extension(service_factory)) - .layer(cors); + .layer(cors) + .layer(TraceLayer::new_for_http().make_span_with(PropagatingMakeSpan { license })) + .layer(middleware::from_fn(metrics_handler)); let route = endpoints_on_main_listener .into_iter() @@ -414,6 +415,17 @@ where Ok(ListenAddrAndRouter(listener, route)) } +async fn metrics_handler(request: Request, next: Next) -> Response { + let resp = next.run(request).await; + u64_counter!( + "apollo.router.operations", + "The number of graphql operations performed by the Router", + 1, + "http.response.status_code" = resp.status().as_u16() as i64 + ); + resp +} + async fn license_handler( State((license, start, delta)): State<(LicenseState, Instant, Arc)>, request: Request, diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 4c76dcf5b7..9a35aeabee 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -833,12 +833,6 @@ impl Telemetry { if !parts.status.is_success() { metric_attrs.push(KeyValue::new("error", parts.status.to_string())); } - u64_counter!( - "apollo.router.operations", - "The number of graphql operations performed by the Router", - 1, - "http.response.status_code" = parts.status.as_u16() as i64 - ); let response = http::Response::from_parts( parts, once(ready(first_response.unwrap_or_default())) @@ -850,12 +844,6 @@ impl Telemetry { } Err(err) => { metric_attrs.push(KeyValue::new("status", "500")); - u64_counter!( - "apollo.router.operations", - "The number of graphql operations performed by the Router", - 1, - "http.response.status_code" = 500 - ); Err(err) } }; diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index f7eb55fb82..352962deb8 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -334,7 +334,7 @@ impl IntegrationTest { pub fn execute_default_query( &self, ) -> impl std::future::Future { - self.execute_query_internal(None) + self.execute_query_internal(&json!({"query":"query {topProducts{name}}","variables":{}})) } #[allow(dead_code)] @@ -342,21 +342,26 @@ impl IntegrationTest { &self, query: &Value, ) -> impl std::future::Future { - self.execute_query_internal(Some(query)) + self.execute_query_internal(query) + } + + #[allow(dead_code)] + pub fn execute_bad_query( + &self, + ) -> impl std::future::Future { + self.execute_query_internal(&json!({"garbage":{}})) } fn execute_query_internal( &self, - query: Option<&Value>, + query: &Value, ) -> impl std::future::Future { assert!( self.router.is_some(), "router was not started, call `router.start().await; router.assert_started().await`" ); - let default_query = &json!({"query":"query {topProducts{name}}","variables":{}}); - let query = query.unwrap_or(default_query).clone(); let dispatch = self.subscriber.clone(); - + let query = query.clone(); async move { let span = info_span!("client_request"); let span_id = span.context().span().span_context().trace_id().to_string(); diff --git a/apollo-router/tests/metrics_tests.rs b/apollo-router/tests/metrics_tests.rs index 5dc48a2aed..60b1b7f9a0 100644 --- a/apollo-router/tests/metrics_tests.rs +++ b/apollo-router/tests/metrics_tests.rs @@ -11,7 +11,7 @@ const PROMETHEUS_CONFIG: &str = include_str!("fixtures/prometheus.router.yaml"); const SUBGRAPH_AUTH_CONFIG: &str = include_str!("fixtures/subgraph_auth.router.yaml"); #[tokio::test(flavor = "multi_thread")] -async fn test_metrics_reloading() -> Result<(), BoxError> { +async fn test_metrics_reloading() { let mut router = IntegrationTest::builder() .config(PROMETHEUS_CONFIG) .build() @@ -68,12 +68,10 @@ async fn test_metrics_reloading() -> Result<(), BoxError> { router.assert_metrics_contains(r#"apollo_router_uplink_fetch_duration_seconds_count{kind="unchanged",query="License",url="https://uplink.api.apollographql.com/",otel_scope_name="apollo/router"}"#, Some(Duration::from_secs(120))).await; router.assert_metrics_contains(r#"apollo_router_uplink_fetch_count_total{query="License",status="success",otel_scope_name="apollo/router"}"#, Some(Duration::from_secs(1))).await; } - - Ok(()) } #[tokio::test(flavor = "multi_thread")] -async fn test_subgraph_auth_metrics() -> Result<(), BoxError> { +async fn test_subgraph_auth_metrics() { let mut router = IntegrationTest::builder() .config(SUBGRAPH_AUTH_CONFIG) .build() @@ -108,6 +106,18 @@ async fn test_subgraph_auth_metrics() -> Result<(), BoxError> { ); router.assert_metrics_contains(r#"apollo_router_operations_authentication_aws_sigv4_total{authentication_aws_sigv4_failed="false",subgraph_service_name="products",otel_scope_name="apollo/router"} 2"#, None).await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_metrics_bad_query() { + let mut router = IntegrationTest::builder() + .config(SUBGRAPH_AUTH_CONFIG) + .build() + .await; - Ok(()) + router.start().await; + router.assert_started().await; + // This query won't make it to the supergraph service + router.execute_bad_query().await; + router.assert_metrics_contains(r#"apollo_router_operations_total{http_response_status_code="400",otel_scope_name="apollo/router"} 1"#, None).await; } From bfb7417e987f1b39b67133b69f83e0f871f52cd9 Mon Sep 17 00:00:00 2001 From: bryn Date: Wed, 27 Sep 2023 11:37:59 +0100 Subject: [PATCH 2/3] Lint --- apollo-router/tests/metrics_tests.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/apollo-router/tests/metrics_tests.rs b/apollo-router/tests/metrics_tests.rs index 60b1b7f9a0..fc130d321c 100644 --- a/apollo-router/tests/metrics_tests.rs +++ b/apollo-router/tests/metrics_tests.rs @@ -1,7 +1,6 @@ use std::time::Duration; use serde_json::json; -use tower::BoxError; use crate::common::IntegrationTest; From 4d67075301b56ba15ed323014917efbfb4f3ccd4 Mon Sep 17 00:00:00 2001 From: bryn Date: Wed, 27 Sep 2023 11:51:25 +0100 Subject: [PATCH 3/3] Add comment to explain layer ordering --- apollo-router/src/axum_factory/axum_http_server_factory.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apollo-router/src/axum_factory/axum_http_server_factory.rs b/apollo-router/src/axum_factory/axum_http_server_factory.rs index 08ea743f76..29fde9a0f4 100644 --- a/apollo-router/src/axum_factory/axum_http_server_factory.rs +++ b/apollo-router/src/axum_factory/axum_http_server_factory.rs @@ -404,6 +404,8 @@ where )) .layer(Extension(service_factory)) .layer(cors) + // Telemetry layers MUST be last. This means that they will be hit first during execution of the pipeline + // Adding layers after telemetry will cause us to lose metrics and spans. .layer(TraceLayer::new_for_http().make_span_with(PropagatingMakeSpan { license })) .layer(middleware::from_fn(metrics_handler));