diff --git a/.changesets/maint_lrlna_remove_catch_unwind.md b/.changesets/maint_lrlna_remove_catch_unwind.md new file mode 100644 index 0000000000..0475d08cc0 --- /dev/null +++ b/.changesets/maint_lrlna_remove_catch_unwind.md @@ -0,0 +1,15 @@ + +### Remove catch_unwind wrapper around the native query planner ([PR #6397](https://github.com/apollographql/router/pull/6397)) + +As part of internal maintenance of the query planner, we are removing the +catch_unwind wrapper around the native query planner. This wrapper was used as +an extra safeguard for potential panics the native planner could produce. The +native query planner no longer has any code paths that could panic. We have also +not witnessed a panic in the last four months, having processed 560 million real +user operations through the native planner. + +This maintenance work also removes backtrace capture for federation errors which +was used for debugging and is no longer necessary as we have the confidence in +the native planner's implementation. + +By [@lrlna](https://github.com/lrlna) in https://github.com/apollographql/router/pull/6397 diff --git a/apollo-federation/src/error/mod.rs b/apollo-federation/src/error/mod.rs index 8d1ceb156b..db775b92d5 100644 --- a/apollo-federation/src/error/mod.rs +++ b/apollo-federation/src/error/mod.rs @@ -1,4 +1,3 @@ -use std::backtrace::Backtrace; use std::cmp::Ordering; use std::fmt::Display; use std::fmt::Formatter; @@ -523,8 +522,8 @@ pub struct MultipleFederationErrors { impl MultipleFederationErrors { pub fn push(&mut self, error: FederationError) { match error { - FederationError::SingleFederationError { inner, .. } => { - self.errors.push(inner); + FederationError::SingleFederationError(error) => { + self.errors.push(error); } FederationError::MultipleFederationErrors(errors) => { self.errors.extend(errors.errors); @@ -591,22 +590,14 @@ impl Display for AggregateFederationError { } } -/// Work around thiserror, which when an error field has a type named `Backtrace` -/// "helpfully" implements `Error::provides` even though that API is not stable yet: -/// -type ThiserrorTrustMeThisIsTotallyNotABacktrace = Backtrace; - // PORT_NOTE: Often times, JS functions would either throw/return a GraphQLError, return a vector // of GraphQLErrors, or take a vector of GraphQLErrors and group them together under an // AggregateGraphQLError which itself would have a specific error message and code, and throw that. // We represent all these cases with an enum, and delegate to the members. #[derive(thiserror::Error)] pub enum FederationError { - #[error("{inner}")] - SingleFederationError { - inner: SingleFederationError, - trace: ThiserrorTrustMeThisIsTotallyNotABacktrace, - }, + #[error(transparent)] + SingleFederationError(#[from] SingleFederationError), #[error(transparent)] MultipleFederationErrors(#[from] MultipleFederationErrors), #[error(transparent)] @@ -616,22 +607,13 @@ pub enum FederationError { impl std::fmt::Debug for FederationError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::SingleFederationError { inner, trace } => write!(f, "{inner}\n{trace}"), + Self::SingleFederationError(inner) => std::fmt::Debug::fmt(inner, f), Self::MultipleFederationErrors(inner) => std::fmt::Debug::fmt(inner, f), Self::AggregateFederationError(inner) => std::fmt::Debug::fmt(inner, f), } } } -impl From for FederationError { - fn from(inner: SingleFederationError) -> Self { - Self::SingleFederationError { - inner, - trace: Backtrace::capture(), - } - } -} - impl From for FederationError { fn from(value: DiagnosticList) -> Self { SingleFederationError::from(value).into() diff --git a/apollo-federation/src/query_plan/fetch_dependency_graph.rs b/apollo-federation/src/query_plan/fetch_dependency_graph.rs index c7ee7e8e22..14e259aa68 100644 --- a/apollo-federation/src/query_plan/fetch_dependency_graph.rs +++ b/apollo-federation/src/query_plan/fetch_dependency_graph.rs @@ -32,6 +32,7 @@ use crate::bail; use crate::display_helpers::DisplayOption; use crate::error::FederationError; use crate::error::SingleFederationError; +use crate::internal_error; use crate::link::graphql_definition::DeferDirectiveArguments; use crate::operation::ArgumentList; use crate::operation::ContainmentOptions; @@ -1709,7 +1710,7 @@ impl FetchDependencyGraph { children.push(child_index); } else { let Some(child_defer_ref) = &child.defer_ref else { - panic!( + bail!( "{} has defer_ref `{}`, so its child {} cannot have a top-level defer_ref.", node.display(node_index), DisplayOption(node.defer_ref.as_ref()), @@ -2666,12 +2667,11 @@ impl FetchDependencyGraphNode { }; let operation = operation_compression.compress(operation)?; let operation_document = operation.try_into().map_err(|err| match err { - FederationError::SingleFederationError { - inner: SingleFederationError::InvalidGraphQL { diagnostics }, - .. - } => FederationError::internal(format!( + FederationError::SingleFederationError(SingleFederationError::InvalidGraphQL { + diagnostics, + }) => internal_error!( "Query planning produced an invalid subgraph operation.\n{diagnostics}" - )), + ), _ => err, })?; @@ -3392,7 +3392,7 @@ impl DeferTracking { if let Some(parent_ref) = &defer_context.current_defer_ref { let Some(parent_info) = self.deferred.get_mut(parent_ref) else { - panic!("Cannot find info for parent {parent_ref} or {label}"); + bail!("Cannot find info for parent {parent_ref} or {label}") }; parent_info.deferred.insert(label.clone()); diff --git a/apollo-federation/src/query_plan/query_planner.rs b/apollo-federation/src/query_plan/query_planner.rs index 0c94adde1d..abf514382f 100644 --- a/apollo-federation/src/query_plan/query_planner.rs +++ b/apollo-federation/src/query_plan/query_planner.rs @@ -14,6 +14,7 @@ use tracing::trace; use super::fetch_dependency_graph::FetchIdGenerator; use super::ConditionNode; +use crate::bail; use crate::error::FederationError; use crate::error::SingleFederationError; use crate::operation::normalize_operation; @@ -432,10 +433,10 @@ impl QueryPlanner { .root_kinds_to_nodes()? .get(&normalized_operation.root_kind) else { - panic!( + bail!( "Shouldn't have a {0} operation if the subgraphs don't have a {0} root", normalized_operation.root_kind - ); + ) }; let operation_compression = if self.config.generate_query_fragments { diff --git a/apollo-federation/src/sources/connect/json_selection/apply_to.rs b/apollo-federation/src/sources/connect/json_selection/apply_to.rs index 08cb5d0fcb..5c76c91414 100644 --- a/apollo-federation/src/sources/connect/json_selection/apply_to.rs +++ b/apollo-federation/src/sources/connect/json_selection/apply_to.rs @@ -92,6 +92,8 @@ impl ApplyToError { })) } + // This macro is useful for tests, but it absolutely should never be used with + // dynamic input at runtime, since it panics for any input that's not JSON. #[cfg(test)] fn from_json(json: &JSON) -> Self { if let JSON::Object(error) = json { diff --git a/apollo-router/src/executable.rs b/apollo-router/src/executable.rs index a67e294a7f..afe70ff552 100644 --- a/apollo-router/src/executable.rs +++ b/apollo-router/src/executable.rs @@ -1,6 +1,5 @@ //! Main entry point for CLI command to start server. -use std::cell::Cell; use std::env; use std::fmt::Debug; use std::net::SocketAddr; @@ -751,18 +750,11 @@ fn setup_panic_handler() { } else { tracing::error!("{}", e) } - if !USING_CATCH_UNWIND.get() { - // Once we've panic'ed the behaviour of the router is non-deterministic - // We've logged out the panic details. Terminate with an error code - std::process::exit(1); - } - })); -} -// TODO: once the Rust query planner does not use `todo!()` anymore, -// remove this and the use of `catch_unwind` to call it. -thread_local! { - pub(crate) static USING_CATCH_UNWIND: Cell = const { Cell::new(false) }; + // Once we've panic'ed the behaviour of the router is non-deterministic + // We've logged out the panic details. Terminate with an error code + std::process::exit(1); + })); } static COPIED: AtomicBool = AtomicBool::new(false); diff --git a/apollo-router/src/query_planner/bridge_query_planner.rs b/apollo-router/src/query_planner/bridge_query_planner.rs index dac65efe45..4c820299f5 100644 --- a/apollo-router/src/query_planner/bridge_query_planner.rs +++ b/apollo-router/src/query_planner/bridge_query_planner.rs @@ -170,10 +170,7 @@ impl PlannerMode { let result = QueryPlanner::new(schema.federation_supergraph(), config); match &result { - Err(FederationError::SingleFederationError { - inner: error, - trace: _, - }) => match error { + Err(FederationError::SingleFederationError(error)) => match error { SingleFederationError::UnsupportedFederationVersion { .. } => { metric_rust_qp_init(Some(UNSUPPORTED_FED1)); } diff --git a/apollo-router/src/query_planner/dual_query_planner.rs b/apollo-router/src/query_planner/dual_query_planner.rs index 9e5250a447..831bdebe45 100644 --- a/apollo-router/src/query_planner/dual_query_planner.rs +++ b/apollo-router/src/query_planner/dual_query_planner.rs @@ -15,6 +15,7 @@ use apollo_compiler::validation::Valid; use apollo_compiler::ExecutableDocument; use apollo_compiler::Name; use apollo_compiler::Node; +use apollo_federation::error::FederationError; use apollo_federation::query_plan::query_planner::QueryPlanOptions; use apollo_federation::query_plan::query_planner::QueryPlanner; use apollo_federation::query_plan::QueryPlan; @@ -24,7 +25,6 @@ use super::fetch::SubgraphOperation; use super::subscription::SubscriptionNode; use super::FlattenNode; use crate::error::format_bridge_errors; -use crate::executable::USING_CATCH_UNWIND; use crate::query_planner::bridge_query_planner::metric_query_planning_plan_duration; use crate::query_planner::bridge_query_planner::JS_QP_MODE; use crate::query_planner::bridge_query_planner::RUST_QP_MODE; @@ -80,47 +80,23 @@ impl BothModeComparisonJob { } fn execute(self) { - // TODO: once the Rust query planner does not use `todo!()` anymore, - // remove `USING_CATCH_UNWIND` and this use of `catch_unwind`. - let rust_result = std::panic::catch_unwind(|| { - let name = self - .operation_name - .clone() - .map(Name::try_from) - .transpose()?; - USING_CATCH_UNWIND.set(true); - - let start = Instant::now(); - - // No question mark operator or macro from here … - let result = + let start = Instant::now(); + + let rust_result = self + .operation_name + .as_deref() + .map(|n| Name::new(n).map_err(FederationError::from)) + .transpose() + .and_then(|operation| { self.rust_planner - .build_query_plan(&self.document, name, self.plan_options); - - let elapsed = start.elapsed().as_secs_f64(); - metric_query_planning_plan_duration(RUST_QP_MODE, elapsed); - - metric_query_planning_plan_both_comparison_duration(RUST_QP_MODE, elapsed); - metric_query_planning_plan_both_comparison_duration(JS_QP_MODE, self.js_duration); - - // … to here, so the thread can only eiher reach here or panic. - // We unset USING_CATCH_UNWIND in both cases. - USING_CATCH_UNWIND.set(false); - result - }) - .unwrap_or_else(|panic| { - USING_CATCH_UNWIND.set(false); - Err(apollo_federation::error::FederationError::internal( - format!( - "query planner panicked: {}", - panic - .downcast_ref::() - .map(|s| s.as_str()) - .or_else(|| panic.downcast_ref::<&str>().copied()) - .unwrap_or_default() - ), - )) - }); + .build_query_plan(&self.document, operation, self.plan_options) + }); + + let elapsed = start.elapsed().as_secs_f64(); + metric_query_planning_plan_duration(RUST_QP_MODE, elapsed); + + metric_query_planning_plan_both_comparison_duration(RUST_QP_MODE, elapsed); + metric_query_planning_plan_both_comparison_duration(JS_QP_MODE, self.js_duration); let name = self.operation_name.as_deref(); let operation_desc = if let Ok(operation) = self.document.operations.get(name) {