diff --git a/nexus/db-queries/src/db/sec_store.rs b/nexus/db-queries/src/db/sec_store.rs index 1c63a48463..f8fd4ab86d 100644 --- a/nexus/db-queries/src/db/sec_store.rs +++ b/nexus/db-queries/src/db/sec_store.rs @@ -7,9 +7,13 @@ use crate::db::{self, model::Generation}; use anyhow::Context; use async_trait::async_trait; +use dropshot::HttpError; +use futures::TryFutureExt; +use omicron_common::backoff; use slog::Logger; use std::fmt; use std::sync::Arc; +use std::time::Duration; use steno::SagaId; /// Implementation of [`steno::SecStore`] backed by the Omicron CockroachDB @@ -53,16 +57,74 @@ impl steno::SecStore for CockroachDbSecStore { } async fn record_event(&self, event: steno::SagaNodeEvent) { - debug!(&self.log, "recording saga event"; + let log = self.log.new(o!( "saga_id" => event.saga_id.to_string(), - "node_id" => ?event.node_id, - "event_type" => ?event.event_type, - ); + "node_id" => event.node_id.to_string(), + "event_type" => format!("{:?}", event.event_type), + )); + + debug!(&log, "recording saga event"); let our_event = db::saga_types::SagaNodeEvent::new(event, self.sec_id); - // TODO-robustness This should be wrapped with a retry loop rather than - // unwrapping the result. See omicron#2416. - self.datastore.saga_create_event(&our_event).await.unwrap(); + backoff::retry_notify_ext( + // This is an internal service query to CockroachDB. + backoff::retry_policy_internal_service(), + || { + // An interesting question is how to handle errors. + // + // In general, there are some kinds of database errors that are + // temporary/server errors (e.g. network failures), and some + // that are permanent/client errors (e.g. conflict during + // insertion). The permanent ones would require operator + // intervention to fix. + // + // However, there is no way to bubble up errors here, and for + // good reason: it is inherent to the nature of sagas that + // progress is durably recorded. So within *this* code there is + // no option but to retry forever. (Below, however, we do mark + // errors that likely require operator intervention.) + // + // At a higher level, callers should plan for the fact that + // record_event could potentially loop forever. See + // https://github.com/oxidecomputer/omicron/issues/5406 and the + // note in `nexus/src/app/saga.rs`'s `execute_saga` for more + // details. + self.datastore + .saga_create_event(&our_event) + .map_err(backoff::BackoffError::transient) + }, + move |error, call_count, total_duration| { + let http_error = HttpError::from(error.clone()); + if http_error.status_code.is_client_error() { + error!( + &log, + "client error while recording saga event (likely \ + requires operator intervention), retrying anyway"; + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } else if total_duration > Duration::from_secs(20) { + warn!( + &log, + "server error while recording saga event, retrying"; + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } else { + info!( + &log, + "server error while recording saga event, retrying"; + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } + }, + ) + .await + .expect("the above backoff retries forever") } async fn saga_update(&self, id: SagaId, update: steno::SagaCachedState) { diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs index 93d22df7e1..8a717839f0 100644 --- a/nexus/src/app/saga.rs +++ b/nexus/src/app/saga.rs @@ -213,6 +213,18 @@ impl super::Nexus { let runnable_saga = self.create_runnable_saga(dag).await?; // Actually run the saga to completion. + // + // XXX: This may loop forever in case `SecStore::record_event` fails. + // Ideally, `run_saga` wouldn't both start the saga and wait for it to + // be finished -- instead, it would start off the saga, and then return + // a notification channel that the caller could use to decide: + // + // - either to .await until completion + // - or to stop waiting after a certain period, while still letting the + // saga run in the background. + // + // For more, see https://github.com/oxidecomputer/omicron/issues/5406 + // and the note in `sec_store.rs`'s `record_event`. self.run_saga(runnable_saga).await } }