From 8625eb73c7df600b258dfe8c8671585e86f09630 Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Wed, 27 Sep 2023 14:26:58 +0100 Subject: [PATCH] Move `apollo.router.operations` to the outer layer of the router pipeline. (#3919) Our metrics are currently mutated at the supergraph service. This is a throwback from when the router service did not exist. Move operation count metrics to the first thing that happens in the pipeline. Note that this won't catch disconnects because they are reliant on the status code of the response, which will stop processing before the metric is incremented. This also move the creation of the root span to the just after so that cors and extensions are traced. Fixes #3915 --- **Checklist** Complete the checklist (and note appropriate exceptions) before the PR is marked ready-for-review. - [ ] Changes are compatible[^1] - [ ] Documentation[^2] completed - [ ] Performance impact assessed and acceptable - Tests added and passing[^3] - [ ] Unit Tests - [ ] Integration Tests - [ ] Manual Tests **Exceptions** *Note any exceptions here* **Notes** [^1]: It may be appropriate to bring upcoming changes to the attention of other (impacted) groups. Please endeavour to do this before seeking PR approval. The mechanism for doing this will vary considerably, so use your judgement as to how and when to do this. [^2]: Configuration is an important part of many changes. Where applicable please try to document configuration examples. [^3]: Tick whichever testing boxes are applicable. If you are adding Manual Tests, please document the manual testing (extensively) in the Exceptions. --------- Co-authored-by: bryn --- .changesets/fix_bryn_move_telemetry.md | 12 +++++++++++ .../axum_factory/axum_http_server_factory.rs | 18 ++++++++++++++-- apollo-router/src/plugins/telemetry/mod.rs | 12 ----------- apollo-router/tests/common.rs | 17 +++++++++------ apollo-router/tests/metrics_tests.rs | 21 +++++++++++++------ 5 files changed, 54 insertions(+), 26 deletions(-) create mode 100644 .changesets/fix_bryn_move_telemetry.md diff --git a/.changesets/fix_bryn_move_telemetry.md b/.changesets/fix_bryn_move_telemetry.md new file mode 100644 index 0000000000..f128f62cdf --- /dev/null +++ b/.changesets/fix_bryn_move_telemetry.md @@ -0,0 +1,12 @@ +### Ensure that telemetry happens first ([Issue #3915](https://github.com/apollographql/router/issues/3915)) + +Telemetry related logic is now moved to the first thing in the router pipeline. + +Previously the metric `apollo.router.operations` may have missed some requests if they were failed at the router stage. +In addition, some logic happened before root spans were created, which would have caused missing traces. + +`apollo.router.operations` and root spans are now the first thing that happens in the router pipeline for graphql requests. + + + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/3919 diff --git a/apollo-router/src/axum_factory/axum_http_server_factory.rs b/apollo-router/src/axum_factory/axum_http_server_factory.rs index 0b414f25b1..29fde9a0f4 100644 --- a/apollo-router/src/axum_factory/axum_http_server_factory.rs +++ b/apollo-router/src/axum_factory/axum_http_server_factory.rs @@ -402,9 +402,12 @@ where (license, Instant::now(), Arc::new(AtomicU64::new(0))), license_handler, )) - .layer(TraceLayer::new_for_http().make_span_with(PropagatingMakeSpan { license })) .layer(Extension(service_factory)) - .layer(cors); + .layer(cors) + // Telemetry layers MUST be last. This means that they will be hit first during execution of the pipeline + // Adding layers after telemetry will cause us to lose metrics and spans. + .layer(TraceLayer::new_for_http().make_span_with(PropagatingMakeSpan { license })) + .layer(middleware::from_fn(metrics_handler)); let route = endpoints_on_main_listener .into_iter() @@ -414,6 +417,17 @@ where Ok(ListenAddrAndRouter(listener, route)) } +async fn metrics_handler(request: Request, next: Next) -> Response { + let resp = next.run(request).await; + u64_counter!( + "apollo.router.operations", + "The number of graphql operations performed by the Router", + 1, + "http.response.status_code" = resp.status().as_u16() as i64 + ); + resp +} + async fn license_handler( State((license, start, delta)): State<(LicenseState, Instant, Arc)>, request: Request, diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 4c76dcf5b7..9a35aeabee 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -833,12 +833,6 @@ impl Telemetry { if !parts.status.is_success() { metric_attrs.push(KeyValue::new("error", parts.status.to_string())); } - u64_counter!( - "apollo.router.operations", - "The number of graphql operations performed by the Router", - 1, - "http.response.status_code" = parts.status.as_u16() as i64 - ); let response = http::Response::from_parts( parts, once(ready(first_response.unwrap_or_default())) @@ -850,12 +844,6 @@ impl Telemetry { } Err(err) => { metric_attrs.push(KeyValue::new("status", "500")); - u64_counter!( - "apollo.router.operations", - "The number of graphql operations performed by the Router", - 1, - "http.response.status_code" = 500 - ); Err(err) } }; diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index f7eb55fb82..352962deb8 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -334,7 +334,7 @@ impl IntegrationTest { pub fn execute_default_query( &self, ) -> impl std::future::Future { - self.execute_query_internal(None) + self.execute_query_internal(&json!({"query":"query {topProducts{name}}","variables":{}})) } #[allow(dead_code)] @@ -342,21 +342,26 @@ impl IntegrationTest { &self, query: &Value, ) -> impl std::future::Future { - self.execute_query_internal(Some(query)) + self.execute_query_internal(query) + } + + #[allow(dead_code)] + pub fn execute_bad_query( + &self, + ) -> impl std::future::Future { + self.execute_query_internal(&json!({"garbage":{}})) } fn execute_query_internal( &self, - query: Option<&Value>, + query: &Value, ) -> impl std::future::Future { assert!( self.router.is_some(), "router was not started, call `router.start().await; router.assert_started().await`" ); - let default_query = &json!({"query":"query {topProducts{name}}","variables":{}}); - let query = query.unwrap_or(default_query).clone(); let dispatch = self.subscriber.clone(); - + let query = query.clone(); async move { let span = info_span!("client_request"); let span_id = span.context().span().span_context().trace_id().to_string(); diff --git a/apollo-router/tests/metrics_tests.rs b/apollo-router/tests/metrics_tests.rs index 5dc48a2aed..fc130d321c 100644 --- a/apollo-router/tests/metrics_tests.rs +++ b/apollo-router/tests/metrics_tests.rs @@ -1,7 +1,6 @@ use std::time::Duration; use serde_json::json; -use tower::BoxError; use crate::common::IntegrationTest; @@ -11,7 +10,7 @@ const PROMETHEUS_CONFIG: &str = include_str!("fixtures/prometheus.router.yaml"); const SUBGRAPH_AUTH_CONFIG: &str = include_str!("fixtures/subgraph_auth.router.yaml"); #[tokio::test(flavor = "multi_thread")] -async fn test_metrics_reloading() -> Result<(), BoxError> { +async fn test_metrics_reloading() { let mut router = IntegrationTest::builder() .config(PROMETHEUS_CONFIG) .build() @@ -68,12 +67,10 @@ async fn test_metrics_reloading() -> Result<(), BoxError> { router.assert_metrics_contains(r#"apollo_router_uplink_fetch_duration_seconds_count{kind="unchanged",query="License",url="https://uplink.api.apollographql.com/",otel_scope_name="apollo/router"}"#, Some(Duration::from_secs(120))).await; router.assert_metrics_contains(r#"apollo_router_uplink_fetch_count_total{query="License",status="success",otel_scope_name="apollo/router"}"#, Some(Duration::from_secs(1))).await; } - - Ok(()) } #[tokio::test(flavor = "multi_thread")] -async fn test_subgraph_auth_metrics() -> Result<(), BoxError> { +async fn test_subgraph_auth_metrics() { let mut router = IntegrationTest::builder() .config(SUBGRAPH_AUTH_CONFIG) .build() @@ -108,6 +105,18 @@ async fn test_subgraph_auth_metrics() -> Result<(), BoxError> { ); router.assert_metrics_contains(r#"apollo_router_operations_authentication_aws_sigv4_total{authentication_aws_sigv4_failed="false",subgraph_service_name="products",otel_scope_name="apollo/router"} 2"#, None).await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_metrics_bad_query() { + let mut router = IntegrationTest::builder() + .config(SUBGRAPH_AUTH_CONFIG) + .build() + .await; - Ok(()) + router.start().await; + router.assert_started().await; + // This query won't make it to the supergraph service + router.execute_bad_query().await; + router.assert_metrics_contains(r#"apollo_router_operations_total{http_response_status_code="400",otel_scope_name="apollo/router"} 1"#, None).await; }