Skip to content

Commit

Permalink
Make backoff policy explicit, with short / long options (#1996)
Browse files Browse the repository at this point in the history
* Make backoff policy explicit

* rename
  • Loading branch information
smklein authored Nov 30, 2022
1 parent 7cab5cf commit eebdd20
Show file tree
Hide file tree
Showing 16 changed files with 51 additions and 58 deletions.
31 changes: 21 additions & 10 deletions common/src/backoff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,33 @@ pub use ::backoff::future::{retry, retry_notify};
pub use ::backoff::Error as BackoffError;
pub use ::backoff::{backoff::Backoff, ExponentialBackoff, Notify};

/// Return a backoff policy appropriate for retrying internal services
/// indefinitely.
pub fn internal_service_policy() -> ::backoff::ExponentialBackoff {
/// Return a backoff policy for querying internal services which may not be up
/// for a relatively long amount of time.
pub fn retry_policy_long() -> ::backoff::ExponentialBackoff {
const INITIAL_INTERVAL: Duration = Duration::from_millis(250);
const MAX_INTERVAL: Duration = Duration::from_secs(60 * 60);
internal_service_policy_with_max(MAX_INTERVAL)
internal_service_policy_with_max(INITIAL_INTERVAL, MAX_INTERVAL)
}

/// Return a backoff policy for querying conditions that are expected to
/// complete in a relatively shorter amount of time than
/// [retry_policy_long].
pub fn retry_policy_short() -> ::backoff::ExponentialBackoff {
const INITIAL_INTERVAL: Duration = Duration::from_millis(50);
const MAX_INTERVAL: Duration = Duration::from_secs(1);
internal_service_policy_with_max(INITIAL_INTERVAL, MAX_INTERVAL)
}

pub fn internal_service_policy_with_max(
max_duration: Duration,
fn internal_service_policy_with_max(
initial_interval: Duration,
max_interval: Duration,
) -> ::backoff::ExponentialBackoff {
const INITIAL_INTERVAL: Duration = Duration::from_millis(250);
let current_interval = initial_interval;
::backoff::ExponentialBackoff {
current_interval: INITIAL_INTERVAL,
initial_interval: INITIAL_INTERVAL,
current_interval,
initial_interval,
multiplier: 2.0,
max_interval: max_duration,
max_interval,
max_elapsed_time: None,
..backoff::ExponentialBackoff::default()
}
Expand Down
2 changes: 1 addition & 1 deletion nexus/src/app/oximeter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ impl super::Nexus {
);
};
backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_long(),
register,
log_registration_failure,
).await
Expand Down
2 changes: 1 addition & 1 deletion nexus/src/app/sagas/common_storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ pub async fn ensure_region_in_dataset(
};

let region = backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_long(),
create_region,
log_create_failure,
)
Expand Down
4 changes: 2 additions & 2 deletions nexus/src/db/saga_recovery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ use crate::db;
use futures::{future::BoxFuture, TryFutureExt};
use omicron_common::api::external::DataPageParams;
use omicron_common::api::external::Error;
use omicron_common::backoff::internal_service_policy;
use omicron_common::backoff::retry_notify;
use omicron_common::backoff::retry_policy_long;
use omicron_common::backoff::BackoffError;
use std::future::Future;
use std::pin::Pin;
Expand Down Expand Up @@ -92,7 +92,7 @@ where
// (pages) goes up. We'd be much more likely to finish the overall
// operation if we didn't throw away the results we did get each time.
let found_sagas = retry_notify(
internal_service_policy(),
retry_policy_long(),
|| async {
list_unfinished_sagas(&opctx, &datastore, &sec_id)
.await
Expand Down
2 changes: 1 addition & 1 deletion nexus/src/populate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ async fn populate(
) -> Result<(), String> {
for p in *ALL_POPULATORS {
let db_result = backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_long(),
|| async {
p.populate(opctx, datastore, args).await.map_err(|error| {
match &error {
Expand Down
2 changes: 1 addition & 1 deletion nexus/tests/integration_tests/disks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1154,7 +1154,7 @@ async fn query_for_metrics_until_they_exist(
path: &str,
) -> ResultsPage<Measurement> {
backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
|| async {
let measurements: ResultsPage<Measurement> =
objects_list_page_authz(client, path).await;
Expand Down
4 changes: 2 additions & 2 deletions oximeter/collector/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ impl Oximeter {
);
};
let agent = backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
make_agent,
log_client_failure,
)
Expand Down Expand Up @@ -503,7 +503,7 @@ impl Oximeter {
);
};
backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
notify_nexus,
log_notification_failure,
)
Expand Down
6 changes: 2 additions & 4 deletions sled-agent/src/bootstrap/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ use crate::server::Server as SledServer;
use crate::sp::SpHandle;
use omicron_common::address::Ipv6Subnet;
use omicron_common::api::external::{Error as ExternalError, MacAddr};
use omicron_common::backoff::{
internal_service_policy, retry_notify, BackoffError,
};
use omicron_common::backoff::{retry_notify, retry_policy_short, BackoffError};
use serde::{Deserialize, Serialize};
use slog::Logger;
use std::borrow::Cow;
Expand Down Expand Up @@ -331,7 +329,7 @@ impl Agent {
) -> Result<RackSecret, BootstrapError> {
let ddm_admin_client = DdmAdminClient::new(self.log.clone())?;
let rack_secret = retry_notify(
internal_service_policy(),
retry_policy_short(),
|| async {
let other_agents = {
// Manually build up a `HashSet` instead of `.collect()`ing
Expand Down
4 changes: 2 additions & 2 deletions sled-agent/src/bootstrap/ddm_admin_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use ddm_admin_client::types::Ipv6Prefix;
use ddm_admin_client::Client;
use omicron_common::address::Ipv6Subnet;
use omicron_common::address::SLED_PREFIX;
use omicron_common::backoff::internal_service_policy;
use omicron_common::backoff::retry_notify;
use omicron_common::backoff::retry_policy_short;
use slog::Logger;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
Expand Down Expand Up @@ -65,7 +65,7 @@ impl DdmAdminClient {
tokio::spawn(async move {
let prefix =
Ipv6Prefix { addr: address.net().network(), mask: SLED_PREFIX };
retry_notify(internal_service_policy(), || async {
retry_notify(retry_policy_short(), || async {
info!(
me.log, "Sending prefix to ddmd for advertisement";
"prefix" => ?prefix,
Expand Down
4 changes: 2 additions & 2 deletions sled-agent/src/bootstrap/rss_handle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ use crate::rack_setup::service::RackSetupService;
use crate::sp::SpHandle;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use omicron_common::backoff::internal_service_policy;
use omicron_common::backoff::retry_notify;
use omicron_common::backoff::retry_policy_short;
use omicron_common::backoff::BackoffError;
use slog::Logger;
use sprockets_host::Ed25519Certificate;
Expand Down Expand Up @@ -101,7 +101,7 @@ async fn initialize_sled_agent(
let log_failure = |error, _| {
warn!(log, "failed to start sled agent"; "error" => ?error);
};
retry_notify(internal_service_policy(), sled_agent_initialize, log_failure)
retry_notify(retry_policy_short(), sled_agent_initialize, log_failure)
.await?;
info!(log, "Peer agent initialized"; "peer" => %bootstrap_addr);
Ok(())
Expand Down
2 changes: 1 addition & 1 deletion sled-agent/src/illumos/svc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ mod inner {

let log_notification_failure = |_error, _delay| {};
backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
|| async {
let mut p = smf::Properties::new();
let properties = {
Expand Down
4 changes: 2 additions & 2 deletions sled-agent/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ async fn wait_for_http_server(
};

backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
|| async {
// This request is nonsensical - we don't expect an instance to
// exist - but getting a response that isn't a connection-based
Expand Down Expand Up @@ -610,7 +610,7 @@ impl Instance {
inner.log, "Adding service"; "smf_name" => &smf_instance_name
);
backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
|| async {
running_zone
.run_cmd(&[
Expand Down
22 changes: 6 additions & 16 deletions sled-agent/src/rack_setup/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ use internal_dns_client::multiclient::{DnsError, Updater as DnsUpdater};
use omicron_common::address::{
get_sled_address, ReservedRackSubnet, DNS_PORT, DNS_SERVER_PORT,
};
use omicron_common::backoff::{
internal_service_policy, retry_notify, BackoffError,
};
use omicron_common::backoff::{retry_notify, retry_policy_short, BackoffError};
use serde::{Deserialize, Serialize};
use slog::Logger;
use sprockets_host::Ed25519Certificate;
Expand Down Expand Up @@ -200,12 +198,8 @@ impl ServiceInner {
let log_failure = |error, _| {
warn!(self.log, "failed to create filesystem"; "error" => ?error);
};
retry_notify(
internal_service_policy(),
filesystem_put,
log_failure,
)
.await?;
retry_notify(retry_policy_short(), filesystem_put, log_failure)
.await?;
}
Ok(())
}
Expand Down Expand Up @@ -249,8 +243,7 @@ impl ServiceInner {
let log_failure = |error, _| {
warn!(self.log, "failed to initialize services"; "error" => ?error);
};
retry_notify(internal_service_policy(), services_put, log_failure)
.await?;
retry_notify(retry_policy_short(), services_put, log_failure).await?;
Ok(())
}

Expand Down Expand Up @@ -400,10 +393,7 @@ impl ServiceInner {
) -> Result<Vec<Ipv6Addr>, DdmError> {
let ddm_admin_client = DdmAdminClient::new(self.log.clone())?;
let addrs = retry_notify(
// TODO-correctness `internal_service_policy()` has potentially-long
// exponential backoff, which is probably not what we want. See
// https://github.com/oxidecomputer/omicron/issues/1270
internal_service_policy(),
retry_policy_short(),
|| async {
let peer_addrs =
ddm_admin_client.peer_addrs().await.map_err(|err| {
Expand Down Expand Up @@ -450,7 +440,7 @@ impl ServiceInner {
);
},
)
// `internal_service_policy()` retries indefinitely on transient errors
// `retry_policy_short()` retries indefinitely on transient errors
// (the only kind we produce), allowing us to `.unwrap()` without
// panicking
.await
Expand Down
6 changes: 2 additions & 4 deletions sled-agent/src/sim/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ use super::sled_agent::SledAgent;
use crate::nexus::NexusClient;
use crucible_agent_client::types::State as RegionState;

use omicron_common::backoff::{
internal_service_policy, retry_notify, BackoffError,
};
use omicron_common::backoff::{retry_notify, retry_policy_short, BackoffError};
use slog::{Drain, Logger};
use std::sync::Arc;

Expand Down Expand Up @@ -86,7 +84,7 @@ impl Server {
"error" => ?error);
};
retry_notify(
internal_service_policy(),
retry_policy_short(),
notify_nexus,
log_notification_failure,
)
Expand Down
8 changes: 2 additions & 6 deletions sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ use omicron_common::api::{
internal::nexus::DiskRuntimeState, internal::nexus::InstanceRuntimeState,
internal::nexus::UpdateArtifact,
};
use omicron_common::backoff::{
internal_service_policy_with_max, retry_notify, BackoffError,
};
use omicron_common::backoff::{retry_notify, retry_policy_short, BackoffError};
use slog::Logger;
use std::net::{Ipv6Addr, SocketAddrV6};
use std::process::Command;
Expand Down Expand Up @@ -524,9 +522,7 @@ impl SledAgent {
);
};
retry_notify(
internal_service_policy_with_max(
std::time::Duration::from_secs(1),
),
retry_policy_short(),
notify_nexus,
log_notification_failure,
)
Expand Down
6 changes: 3 additions & 3 deletions sled-agent/src/storage_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ impl DatasetInfo {
warn!(log, "cockroachdb not yet alive");
};
backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
check_health,
log_failure,
)
Expand Down Expand Up @@ -659,7 +659,7 @@ impl StorageWorker {
};
nexus_notifications.push_back(
backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
notify_nexus,
log_post_failure,
)
Expand Down Expand Up @@ -705,7 +705,7 @@ impl StorageWorker {
};
nexus_notifications.push_back(
backoff::retry_notify(
backoff::internal_service_policy(),
backoff::retry_policy_short(),
notify_nexus,
log_post_failure,
)
Expand Down

0 comments on commit eebdd20

Please sign in to comment.