Skip to content

Commit

Permalink
[nexus-db-queries] add a retry loop to saga record_event (#5390)
Browse files Browse the repository at this point in the history
We observed during a mupdate on Friday that Nexus panicked because of
network flakiness. This is an attempt to address that by adding a retry
loop.

Fixes #2416.
  • Loading branch information
sunshowers authored Apr 4, 2024
1 parent 4b5ddcc commit 1f1a63f
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 7 deletions.
76 changes: 69 additions & 7 deletions nexus/db-queries/src/db/sec_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
use crate::db::{self, model::Generation};
use anyhow::Context;
use async_trait::async_trait;
use dropshot::HttpError;
use futures::TryFutureExt;
use omicron_common::backoff;
use slog::Logger;
use std::fmt;
use std::sync::Arc;
use std::time::Duration;
use steno::SagaId;

/// Implementation of [`steno::SecStore`] backed by the Omicron CockroachDB
Expand Down Expand Up @@ -53,16 +57,74 @@ impl steno::SecStore for CockroachDbSecStore {
}

async fn record_event(&self, event: steno::SagaNodeEvent) {
debug!(&self.log, "recording saga event";
let log = self.log.new(o!(
"saga_id" => event.saga_id.to_string(),
"node_id" => ?event.node_id,
"event_type" => ?event.event_type,
);
"node_id" => event.node_id.to_string(),
"event_type" => format!("{:?}", event.event_type),
));

debug!(&log, "recording saga event");
let our_event = db::saga_types::SagaNodeEvent::new(event, self.sec_id);

// TODO-robustness This should be wrapped with a retry loop rather than
// unwrapping the result. See omicron#2416.
self.datastore.saga_create_event(&our_event).await.unwrap();
backoff::retry_notify_ext(
// This is an internal service query to CockroachDB.
backoff::retry_policy_internal_service(),
|| {
// An interesting question is how to handle errors.
//
// In general, there are some kinds of database errors that are
// temporary/server errors (e.g. network failures), and some
// that are permanent/client errors (e.g. conflict during
// insertion). The permanent ones would require operator
// intervention to fix.
//
// However, there is no way to bubble up errors here, and for
// good reason: it is inherent to the nature of sagas that
// progress is durably recorded. So within *this* code there is
// no option but to retry forever. (Below, however, we do mark
// errors that likely require operator intervention.)
//
// At a higher level, callers should plan for the fact that
// record_event could potentially loop forever. See
// https://github.com/oxidecomputer/omicron/issues/5406 and the
// note in `nexus/src/app/saga.rs`'s `execute_saga` for more
// details.
self.datastore
.saga_create_event(&our_event)
.map_err(backoff::BackoffError::transient)
},
move |error, call_count, total_duration| {
let http_error = HttpError::from(error.clone());
if http_error.status_code.is_client_error() {
error!(
&log,
"client error while recording saga event (likely \
requires operator intervention), retrying anyway";
"error" => &error,
"call_count" => call_count,
"total_duration" => ?total_duration,
);
} else if total_duration > Duration::from_secs(20) {
warn!(
&log,
"server error while recording saga event, retrying";
"error" => &error,
"call_count" => call_count,
"total_duration" => ?total_duration,
);
} else {
info!(
&log,
"server error while recording saga event, retrying";
"error" => &error,
"call_count" => call_count,
"total_duration" => ?total_duration,
);
}
},
)
.await
.expect("the above backoff retries forever")
}

async fn saga_update(&self, id: SagaId, update: steno::SagaCachedState) {
Expand Down
12 changes: 12 additions & 0 deletions nexus/src/app/saga.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,18 @@ impl super::Nexus {
let runnable_saga = self.create_runnable_saga(dag).await?;

// Actually run the saga to completion.
//
// XXX: This may loop forever in case `SecStore::record_event` fails.
// Ideally, `run_saga` wouldn't both start the saga and wait for it to
// be finished -- instead, it would start off the saga, and then return
// a notification channel that the caller could use to decide:
//
// - either to .await until completion
// - or to stop waiting after a certain period, while still letting the
// saga run in the background.
//
// For more, see https://github.com/oxidecomputer/omicron/issues/5406
// and the note in `sec_store.rs`'s `record_event`.
self.run_saga(runnable_saga).await
}
}

0 comments on commit 1f1a63f

Please sign in to comment.