From b9632bb97f9bdbf5a275c83820aabaaa95b6b35c Mon Sep 17 00:00:00 2001 From: Iryna Shestak Date: Wed, 4 Dec 2024 13:55:56 +0100 Subject: [PATCH 1/5] convert panics to internal errors in federation crate There were several panic!() marcos still in use in `apollo-federation`. Instead of panicking, these functions now return the expected FederationError. If you're searching in the code, you'll still see a few panics. They are in tests, in macros or function annotated with `#[cfg(test)]`, or in [unused] composition part of the codebase. More specifically: * in internally used [CLI](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/cli/src/main.rs#L214) * in tests: * [operation tests](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/src/operation/tests/mod.rs) * [build query graph tests](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/src/query_graph/build_query_graph.rs#L2448) * [api schema tests](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/tests/api_schema.rs) * [build query plan support tests](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/tests/query_plan/build_query_plan_support.rs) * [context build query plan test](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/tests/query_plan/build_query_plan_tests/context.rs) * [fetch operation name tests](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/tests/query_plan/build_query_plan_tests/fetch_operation_names.rs) * test helpers annotated with `#[cfg(test)]`: * [connectors' json selection test helper](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/src/sources/connect/json_selection/apply_to.rs#L116) * [connectors selection! macro](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/src/sources/connect/json_selection/helpers.rs#L10) * [argument merge strategies](https://github.com/apollographql/router/blob/85f99f19e2a2907bd5597687773d68d63644a0c6/apollo-federation/src/schema/argument_composition_strategies.rs) used solely in composition --- .../src/query_plan/fetch_dependency_graph.rs | 9 ++++++--- apollo-federation/src/query_plan/query_planner.rs | 5 +++-- .../src/sources/connect/json_selection/apply_to.rs | 2 ++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/apollo-federation/src/query_plan/fetch_dependency_graph.rs b/apollo-federation/src/query_plan/fetch_dependency_graph.rs index b91699ecc4..9a68f4f77e 100644 --- a/apollo-federation/src/query_plan/fetch_dependency_graph.rs +++ b/apollo-federation/src/query_plan/fetch_dependency_graph.rs @@ -32,6 +32,7 @@ use crate::bail; use crate::display_helpers::DisplayOption; use crate::error::FederationError; use crate::error::SingleFederationError; +use crate::internal_error; use crate::link::graphql_definition::DeferDirectiveArguments; use crate::operation::ArgumentList; use crate::operation::ContainmentOptions; @@ -1718,12 +1719,12 @@ impl FetchDependencyGraph { children.push(child_index); } else { let Some(child_defer_ref) = &child.defer_ref else { - panic!( + return Err(internal_error!( "{} has defer_ref `{}`, so its child {} cannot have a top-level defer_ref.", node.display(node_index), DisplayOption(node.defer_ref.as_ref()), child.display(child_index), - ); + )); }; if !node.selection_set.selection_set.selections.is_empty() { @@ -3402,7 +3403,9 @@ impl DeferTracking { if let Some(parent_ref) = &defer_context.current_defer_ref { let Some(parent_info) = self.deferred.get_mut(parent_ref) else { - panic!("Cannot find info for parent {parent_ref} or {label}"); + return Err(internal_error!( + "Cannot find info for parent {parent_ref} or {label}" + )); }; parent_info.deferred.insert(label.clone()); diff --git a/apollo-federation/src/query_plan/query_planner.rs b/apollo-federation/src/query_plan/query_planner.rs index ed2a4b2589..7a167d2626 100644 --- a/apollo-federation/src/query_plan/query_planner.rs +++ b/apollo-federation/src/query_plan/query_planner.rs @@ -16,6 +16,7 @@ use super::fetch_dependency_graph::FetchIdGenerator; use super::ConditionNode; use crate::error::FederationError; use crate::error::SingleFederationError; +use crate::internal_error; use crate::operation::normalize_operation; use crate::operation::NamedFragments; use crate::operation::NormalizedDefer; @@ -443,10 +444,10 @@ impl QueryPlanner { .root_kinds_to_nodes()? .get(&normalized_operation.root_kind) else { - panic!( + return Err(internal_error!( "Shouldn't have a {0} operation if the subgraphs don't have a {0} root", normalized_operation.root_kind - ); + )); }; let operation_compression = if self.config.generate_query_fragments { diff --git a/apollo-federation/src/sources/connect/json_selection/apply_to.rs b/apollo-federation/src/sources/connect/json_selection/apply_to.rs index 08cb5d0fcb..5c76c91414 100644 --- a/apollo-federation/src/sources/connect/json_selection/apply_to.rs +++ b/apollo-federation/src/sources/connect/json_selection/apply_to.rs @@ -92,6 +92,8 @@ impl ApplyToError { })) } + // This macro is useful for tests, but it absolutely should never be used with + // dynamic input at runtime, since it panics for any input that's not JSON. #[cfg(test)] fn from_json(json: &JSON) -> Self { if let JSON::Object(error) = json { From 797c65dcb29d1b80ee02ad20cdfc7b12c80fc0b7 Mon Sep 17 00:00:00 2001 From: Iryna Shestak Date: Wed, 4 Dec 2024 14:08:43 +0100 Subject: [PATCH 2/5] remove backtrace capturing in federation crate We were adding backtrace captures to SingleFederationErrors in order to help with debugging unexpected errors while were still developing the rust query planner. Capturing backtraces is quite often [a slow and memory intensive process](https://doc.rust-lang.org/stable/std/backtrace/struct.Backtrace.html#method.capture), so we want to remove this additional functionality for rust query planner's GA. Since we now have a lot of confidence in this implementation and have not faced a panic across millions of operations that were tested and compared, this is a safe thing to remove. --- apollo-federation/src/error/mod.rs | 28 ++++--------------- apollo-federation/src/operation/rebase.rs | 7 ++--- .../src/query_plan/fetch_dependency_graph.rs | 7 ++--- .../src/query_planner/bridge_query_planner.rs | 5 +--- 4 files changed, 12 insertions(+), 35 deletions(-) diff --git a/apollo-federation/src/error/mod.rs b/apollo-federation/src/error/mod.rs index 8d1ceb156b..db775b92d5 100644 --- a/apollo-federation/src/error/mod.rs +++ b/apollo-federation/src/error/mod.rs @@ -1,4 +1,3 @@ -use std::backtrace::Backtrace; use std::cmp::Ordering; use std::fmt::Display; use std::fmt::Formatter; @@ -523,8 +522,8 @@ pub struct MultipleFederationErrors { impl MultipleFederationErrors { pub fn push(&mut self, error: FederationError) { match error { - FederationError::SingleFederationError { inner, .. } => { - self.errors.push(inner); + FederationError::SingleFederationError(error) => { + self.errors.push(error); } FederationError::MultipleFederationErrors(errors) => { self.errors.extend(errors.errors); @@ -591,22 +590,14 @@ impl Display for AggregateFederationError { } } -/// Work around thiserror, which when an error field has a type named `Backtrace` -/// "helpfully" implements `Error::provides` even though that API is not stable yet: -/// -type ThiserrorTrustMeThisIsTotallyNotABacktrace = Backtrace; - // PORT_NOTE: Often times, JS functions would either throw/return a GraphQLError, return a vector // of GraphQLErrors, or take a vector of GraphQLErrors and group them together under an // AggregateGraphQLError which itself would have a specific error message and code, and throw that. // We represent all these cases with an enum, and delegate to the members. #[derive(thiserror::Error)] pub enum FederationError { - #[error("{inner}")] - SingleFederationError { - inner: SingleFederationError, - trace: ThiserrorTrustMeThisIsTotallyNotABacktrace, - }, + #[error(transparent)] + SingleFederationError(#[from] SingleFederationError), #[error(transparent)] MultipleFederationErrors(#[from] MultipleFederationErrors), #[error(transparent)] @@ -616,22 +607,13 @@ pub enum FederationError { impl std::fmt::Debug for FederationError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::SingleFederationError { inner, trace } => write!(f, "{inner}\n{trace}"), + Self::SingleFederationError(inner) => std::fmt::Debug::fmt(inner, f), Self::MultipleFederationErrors(inner) => std::fmt::Debug::fmt(inner, f), Self::AggregateFederationError(inner) => std::fmt::Debug::fmt(inner, f), } } } -impl From for FederationError { - fn from(inner: SingleFederationError) -> Self { - Self::SingleFederationError { - inner, - trace: Backtrace::capture(), - } - } -} - impl From for FederationError { fn from(value: DiagnosticList) -> Self { SingleFederationError::from(value).into() diff --git a/apollo-federation/src/operation/rebase.rs b/apollo-federation/src/operation/rebase.rs index 09b8799067..99fd841438 100644 --- a/apollo-federation/src/operation/rebase.rs +++ b/apollo-federation/src/operation/rebase.rs @@ -150,10 +150,9 @@ impl FederationError { fn is_rebase_error(&self) -> bool { matches!( self, - crate::error::FederationError::SingleFederationError { - inner: crate::error::SingleFederationError::InternalRebaseError(_), - .. - } + crate::error::FederationError::SingleFederationError( + crate::error::SingleFederationError::InternalRebaseError(_) + ) ) } } diff --git a/apollo-federation/src/query_plan/fetch_dependency_graph.rs b/apollo-federation/src/query_plan/fetch_dependency_graph.rs index 9a68f4f77e..10bbdeef16 100644 --- a/apollo-federation/src/query_plan/fetch_dependency_graph.rs +++ b/apollo-federation/src/query_plan/fetch_dependency_graph.rs @@ -2677,10 +2677,9 @@ impl FetchDependencyGraphNode { let operation = operation_compression.compress(&self.subgraph_name, subgraph_schema, operation)?; let operation_document = operation.try_into().map_err(|err| match err { - FederationError::SingleFederationError { - inner: SingleFederationError::InvalidGraphQL { diagnostics }, - .. - } => FederationError::internal(format!( + FederationError::SingleFederationError(SingleFederationError::InvalidGraphQL { + diagnostics, + }) => FederationError::internal(format!( "Query planning produced an invalid subgraph operation.\n{diagnostics}" )), _ => err, diff --git a/apollo-router/src/query_planner/bridge_query_planner.rs b/apollo-router/src/query_planner/bridge_query_planner.rs index cc058b5555..549e9bddd9 100644 --- a/apollo-router/src/query_planner/bridge_query_planner.rs +++ b/apollo-router/src/query_planner/bridge_query_planner.rs @@ -183,10 +183,7 @@ impl PlannerMode { let result = QueryPlanner::new(schema.federation_supergraph(), config); match &result { - Err(FederationError::SingleFederationError { - inner: error, - trace: _, - }) => match error { + Err(FederationError::SingleFederationError(error)) => match error { SingleFederationError::UnsupportedFederationVersion { .. } => { metric_rust_qp_init(Some(UNSUPPORTED_FED1)); } From 8e9e23b2ebf61ed5bd31c037fe476f55dd009ae8 Mon Sep 17 00:00:00 2001 From: Iryna Shestak Date: Wed, 4 Dec 2024 14:41:14 +0100 Subject: [PATCH 3/5] remove catch_unwind around the rust query planner Since we are now confident in rust query planner's implementation, we no longer need to catch_unwind when planning an operation in Rust. --- .../maint_lrlna_remove_catch_unwind.md | 15 +++++ apollo-router/src/executable.rs | 13 +---- .../src/query_planner/dual_query_planner.rs | 58 ++++++------------- 3 files changed, 33 insertions(+), 53 deletions(-) create mode 100644 .changesets/maint_lrlna_remove_catch_unwind.md diff --git a/.changesets/maint_lrlna_remove_catch_unwind.md b/.changesets/maint_lrlna_remove_catch_unwind.md new file mode 100644 index 0000000000..0475d08cc0 --- /dev/null +++ b/.changesets/maint_lrlna_remove_catch_unwind.md @@ -0,0 +1,15 @@ + +### Remove catch_unwind wrapper around the native query planner ([PR #6397](https://github.com/apollographql/router/pull/6397)) + +As part of internal maintenance of the query planner, we are removing the +catch_unwind wrapper around the native query planner. This wrapper was used as +an extra safeguard for potential panics the native planner could produce. The +native query planner no longer has any code paths that could panic. We have also +not witnessed a panic in the last four months, having processed 560 million real +user operations through the native planner. + +This maintenance work also removes backtrace capture for federation errors which +was used for debugging and is no longer necessary as we have the confidence in +the native planner's implementation. + +By [@lrlna](https://github.com/lrlna) in https://github.com/apollographql/router/pull/6397 diff --git a/apollo-router/src/executable.rs b/apollo-router/src/executable.rs index a67e294a7f..2dce46b128 100644 --- a/apollo-router/src/executable.rs +++ b/apollo-router/src/executable.rs @@ -1,6 +1,5 @@ //! Main entry point for CLI command to start server. -use std::cell::Cell; use std::env; use std::fmt::Debug; use std::net::SocketAddr; @@ -751,20 +750,10 @@ fn setup_panic_handler() { } else { tracing::error!("{}", e) } - if !USING_CATCH_UNWIND.get() { - // Once we've panic'ed the behaviour of the router is non-deterministic - // We've logged out the panic details. Terminate with an error code - std::process::exit(1); - } + std::process::exit(1); })); } -// TODO: once the Rust query planner does not use `todo!()` anymore, -// remove this and the use of `catch_unwind` to call it. -thread_local! { - pub(crate) static USING_CATCH_UNWIND: Cell = const { Cell::new(false) }; -} - static COPIED: AtomicBool = AtomicBool::new(false); fn copy_args_to_env() { diff --git a/apollo-router/src/query_planner/dual_query_planner.rs b/apollo-router/src/query_planner/dual_query_planner.rs index 9e5250a447..831bdebe45 100644 --- a/apollo-router/src/query_planner/dual_query_planner.rs +++ b/apollo-router/src/query_planner/dual_query_planner.rs @@ -15,6 +15,7 @@ use apollo_compiler::validation::Valid; use apollo_compiler::ExecutableDocument; use apollo_compiler::Name; use apollo_compiler::Node; +use apollo_federation::error::FederationError; use apollo_federation::query_plan::query_planner::QueryPlanOptions; use apollo_federation::query_plan::query_planner::QueryPlanner; use apollo_federation::query_plan::QueryPlan; @@ -24,7 +25,6 @@ use super::fetch::SubgraphOperation; use super::subscription::SubscriptionNode; use super::FlattenNode; use crate::error::format_bridge_errors; -use crate::executable::USING_CATCH_UNWIND; use crate::query_planner::bridge_query_planner::metric_query_planning_plan_duration; use crate::query_planner::bridge_query_planner::JS_QP_MODE; use crate::query_planner::bridge_query_planner::RUST_QP_MODE; @@ -80,47 +80,23 @@ impl BothModeComparisonJob { } fn execute(self) { - // TODO: once the Rust query planner does not use `todo!()` anymore, - // remove `USING_CATCH_UNWIND` and this use of `catch_unwind`. - let rust_result = std::panic::catch_unwind(|| { - let name = self - .operation_name - .clone() - .map(Name::try_from) - .transpose()?; - USING_CATCH_UNWIND.set(true); - - let start = Instant::now(); - - // No question mark operator or macro from here … - let result = + let start = Instant::now(); + + let rust_result = self + .operation_name + .as_deref() + .map(|n| Name::new(n).map_err(FederationError::from)) + .transpose() + .and_then(|operation| { self.rust_planner - .build_query_plan(&self.document, name, self.plan_options); - - let elapsed = start.elapsed().as_secs_f64(); - metric_query_planning_plan_duration(RUST_QP_MODE, elapsed); - - metric_query_planning_plan_both_comparison_duration(RUST_QP_MODE, elapsed); - metric_query_planning_plan_both_comparison_duration(JS_QP_MODE, self.js_duration); - - // … to here, so the thread can only eiher reach here or panic. - // We unset USING_CATCH_UNWIND in both cases. - USING_CATCH_UNWIND.set(false); - result - }) - .unwrap_or_else(|panic| { - USING_CATCH_UNWIND.set(false); - Err(apollo_federation::error::FederationError::internal( - format!( - "query planner panicked: {}", - panic - .downcast_ref::() - .map(|s| s.as_str()) - .or_else(|| panic.downcast_ref::<&str>().copied()) - .unwrap_or_default() - ), - )) - }); + .build_query_plan(&self.document, operation, self.plan_options) + }); + + let elapsed = start.elapsed().as_secs_f64(); + metric_query_planning_plan_duration(RUST_QP_MODE, elapsed); + + metric_query_planning_plan_both_comparison_duration(RUST_QP_MODE, elapsed); + metric_query_planning_plan_both_comparison_duration(JS_QP_MODE, self.js_duration); let name = self.operation_name.as_deref(); let operation_desc = if let Ok(operation) = self.document.operations.get(name) { From 7d629059cad60c4f8ccf514607e08fa059a7a4da Mon Sep 17 00:00:00 2001 From: Iryna Shestak Date: Fri, 6 Dec 2024 13:31:58 +0100 Subject: [PATCH 4/5] use bail!() instead of internal_error!() --- .../src/query_plan/fetch_dependency_graph.rs | 12 +++++------- apollo-federation/src/query_plan/query_planner.rs | 6 +++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/apollo-federation/src/query_plan/fetch_dependency_graph.rs b/apollo-federation/src/query_plan/fetch_dependency_graph.rs index 10bbdeef16..0b0763ae82 100644 --- a/apollo-federation/src/query_plan/fetch_dependency_graph.rs +++ b/apollo-federation/src/query_plan/fetch_dependency_graph.rs @@ -1719,12 +1719,12 @@ impl FetchDependencyGraph { children.push(child_index); } else { let Some(child_defer_ref) = &child.defer_ref else { - return Err(internal_error!( + bail!( "{} has defer_ref `{}`, so its child {} cannot have a top-level defer_ref.", node.display(node_index), DisplayOption(node.defer_ref.as_ref()), child.display(child_index), - )); + ); }; if !node.selection_set.selection_set.selections.is_empty() { @@ -2679,9 +2679,9 @@ impl FetchDependencyGraphNode { let operation_document = operation.try_into().map_err(|err| match err { FederationError::SingleFederationError(SingleFederationError::InvalidGraphQL { diagnostics, - }) => FederationError::internal(format!( + }) => internal_error!( "Query planning produced an invalid subgraph operation.\n{diagnostics}" - )), + ), _ => err, })?; @@ -3402,9 +3402,7 @@ impl DeferTracking { if let Some(parent_ref) = &defer_context.current_defer_ref { let Some(parent_info) = self.deferred.get_mut(parent_ref) else { - return Err(internal_error!( - "Cannot find info for parent {parent_ref} or {label}" - )); + bail!("Cannot find info for parent {parent_ref} or {label}") }; parent_info.deferred.insert(label.clone()); diff --git a/apollo-federation/src/query_plan/query_planner.rs b/apollo-federation/src/query_plan/query_planner.rs index 7a167d2626..72417f9b86 100644 --- a/apollo-federation/src/query_plan/query_planner.rs +++ b/apollo-federation/src/query_plan/query_planner.rs @@ -14,9 +14,9 @@ use tracing::trace; use super::fetch_dependency_graph::FetchIdGenerator; use super::ConditionNode; +use crate::bail; use crate::error::FederationError; use crate::error::SingleFederationError; -use crate::internal_error; use crate::operation::normalize_operation; use crate::operation::NamedFragments; use crate::operation::NormalizedDefer; @@ -444,10 +444,10 @@ impl QueryPlanner { .root_kinds_to_nodes()? .get(&normalized_operation.root_kind) else { - return Err(internal_error!( + bail!( "Shouldn't have a {0} operation if the subgraphs don't have a {0} root", normalized_operation.root_kind - )); + ) }; let operation_compression = if self.config.generate_query_fragments { From b838fd3a0a212cbce64bdd2ffb6290f7d9849f57 Mon Sep 17 00:00:00 2001 From: Iryna Shestak Date: Fri, 6 Dec 2024 13:53:25 +0100 Subject: [PATCH 5/5] add back the comment in executable.rs --- apollo-router/src/executable.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apollo-router/src/executable.rs b/apollo-router/src/executable.rs index 2dce46b128..afe70ff552 100644 --- a/apollo-router/src/executable.rs +++ b/apollo-router/src/executable.rs @@ -750,6 +750,9 @@ fn setup_panic_handler() { } else { tracing::error!("{}", e) } + + // Once we've panic'ed the behaviour of the router is non-deterministic + // We've logged out the panic details. Terminate with an error code std::process::exit(1); })); }