From 7facfdf224c7e8fd069b58a4221313874dceb629 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 28 Sep 2023 05:12:21 +0000 Subject: [PATCH 01/66] wip --- Cargo.lock | 23 + Cargo.toml | 2 + sled-storage/Cargo.toml | 27 ++ sled-storage/src/dataset.rs | 120 +++++ sled-storage/src/disk.rs | 57 +++ sled-storage/src/dump_setup.rs | 795 +++++++++++++++++++++++++++++++++ sled-storage/src/error.rs | 0 sled-storage/src/lib.rs | 9 + 8 files changed, 1033 insertions(+) create mode 100644 sled-storage/Cargo.toml create mode 100644 sled-storage/src/dataset.rs create mode 100644 sled-storage/src/disk.rs create mode 100644 sled-storage/src/dump_setup.rs create mode 100644 sled-storage/src/error.rs create mode 100644 sled-storage/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index e5130b6b33..bdf2d44ea4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5265,6 +5265,29 @@ dependencies = [ "zone", ] +[[package]] +name = "omicron-sled-storage" +version = "0.1.0" +dependencies = [ + "async-trait", + "camino", + "derive_more", + "glob", + "illumos-utils", + "key-manager", + "nexus-client 0.1.0", + "omicron-common 0.1.0", + "schemars", + "serde", + "serde_json", + "sled-agent-client", + "sled-hardware", + "slog", + "thiserror", + "tokio", + "uuid", +] + [[package]] name = "omicron-test-utils" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 9498157b28..0b63984ea7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ members = [ "sled-agent-client", "sled-agent", "sled-hardware", + "sled-storage", "sp-sim", "test-utils", "tufaceous-lib", @@ -115,6 +116,7 @@ default-members = [ "sled-agent", "sled-agent-client", "sled-hardware", + "sled-storage", "sp-sim", "test-utils", "tufaceous", diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml new file mode 100644 index 0000000000..03f0f608de --- /dev/null +++ b/sled-storage/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "omicron-sled-storage" +version = "0.1.0" +edition = "2021" + +[dependencies] +async-trait.workspace = true +camino.workspace = true +derive_more.workspace = true +glob.workspace = true +illumos-utils.workspace = true +key-manager.workspace = true +# Needed strictly for parameter type conversion +# We could put this in the nexus-client instead +nexus-client.workspace = true +omicron-common.workspace = true +schemars = { workspace = true, features = [ "chrono", "uuid1" ] } +serde.workspace = true +serde_json.workspace = true +# Needed strictly for parameter type conversion +# We could put this in the sled-agent-client instead +sled-agent-client.workspace = true +sled-hardware.workspace = true +slog.workspace = true +thiserror.workspace = true +tokio.workspace = true +uuid.workspace = true diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs new file mode 100644 index 0000000000..e521dd963a --- /dev/null +++ b/sled-storage/src/dataset.rs @@ -0,0 +1,120 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use illumos_utils::zpool::ZpoolName; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::str::FromStr; + +/// The type of a dataset, and an auxiliary information necessary +/// to successfully launch a zone managing the associated data. 
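// [Editorial sketch, not part of this patch] The serde attributes on the enum
// just below ("type" tag, snake_case variant names) make a DatasetKind
// serialize as nothing but its tag. A self-contained stand-in enum (not the
// crate's type) showing the resulting JSON shape:
use serde::Serialize;

#[derive(Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
enum MiniDatasetKind {
    CockroachDb,
    Crucible,
}

fn serde_shape_example() {
    // Unit variants of an internally tagged enum serialize to the tag alone.
    assert_eq!(
        serde_json::to_string(&MiniDatasetKind::CockroachDb).unwrap(),
        r#"{"type":"cockroach_db"}"#
    );
    assert_eq!(
        serde_json::to_string(&MiniDatasetKind::Crucible).unwrap(),
        r#"{"type":"crucible"}"#
    );
}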
+#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, +)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum DatasetKind { + CockroachDb, + Crucible, + Clickhouse, + ClickhouseKeeper, + ExternalDns, + InternalDns, +} + +impl From for sled_agent_client::types::DatasetKind { + fn from(k: DatasetKind) -> Self { + use DatasetKind::*; + match k { + CockroachDb => Self::CockroachDb, + Crucible => Self::Crucible, + Clickhouse => Self::Clickhouse, + ClickhouseKeeper => Self::ClickhouseKeeper, + ExternalDns => Self::ExternalDns, + InternalDns => Self::InternalDns, + } + } +} + +impl From for nexus_client::types::DatasetKind { + fn from(k: DatasetKind) -> Self { + use DatasetKind::*; + match k { + CockroachDb => Self::Cockroach, + Crucible => Self::Crucible, + Clickhouse => Self::Clickhouse, + ClickhouseKeeper => Self::ClickhouseKeeper, + ExternalDns => Self::ExternalDns, + InternalDns => Self::InternalDns, + } + } +} + +impl std::fmt::Display for DatasetKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use DatasetKind::*; + let s = match self { + Crucible => "crucible", + CockroachDb { .. } => "cockroachdb", + Clickhouse => "clickhouse", + ClickhouseKeeper => "clickhouse_keeper", + ExternalDns { .. } => "external_dns", + InternalDns { .. } => "internal_dns", + }; + write!(f, "{}", s) + } +} + +#[derive( + Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Clone, JsonSchema, +)] +pub struct DatasetName { + // A unique identifier for the Zpool on which the dataset is stored. + pool_name: ZpoolName, + // A name for the dataset within the Zpool. + kind: DatasetKind, +} + +impl DatasetName { + pub fn new(pool_name: ZpoolName, kind: DatasetKind) -> Self { + Self { pool_name, kind } + } + + pub fn pool(&self) -> &ZpoolName { + &self.pool_name + } + + pub fn dataset(&self) -> &DatasetKind { + &self.kind + } + + pub fn full(&self) -> String { + format!("{}/{}", self.pool_name, self.kind) + } +} + +impl From for sled_agent_client::types::DatasetName { + fn from(n: DatasetName) -> Self { + Self { + pool_name: sled_agent_client::types::ZpoolName::from_str( + &n.pool().to_string(), + ) + .unwrap(), + kind: n.dataset().clone().into(), + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use uuid::Uuid; + + #[test] + fn serialize_dataset_name() { + let pool = ZpoolName::new_internal(Uuid::new_v4()); + let kind = DatasetKind::Crucible; + let name = DatasetName::new(pool, kind); + serde_json::to_string(&name).unwrap(); + } +} diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs new file mode 100644 index 0000000000..b78bd57af9 --- /dev/null +++ b/sled-storage/src/disk.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Disk related types + +use camino::Utf8PathBuf; +use illumos_utils::zpool::{ZpoolKind, ZpoolName}; +use omicron_common::disk::DiskIdentity; +use sled_hardware::{Disk, DiskVariant}; + +/// A wrapper around real disks or synthetic disks backed by a file +#[derive(PartialEq, Eq, Clone)] +pub(crate) enum DiskWrapper { + Real { disk: Disk, devfs_path: Utf8PathBuf }, + Synthetic { zpool_name: ZpoolName }, +} + +impl From for DiskWrapper { + fn from(disk: Disk) -> Self { + let devfs_path = disk.devfs_path().clone(); + Self::Real { disk, devfs_path } + } +} + +impl DiskWrapper { + fn identity(&self) -> DiskIdentity { + match self { + DiskWrapper::Real { disk, .. } => disk.identity().clone(), + DiskWrapper::Synthetic { zpool_name } => { + let id = zpool_name.id(); + DiskIdentity { + vendor: "synthetic-vendor".to_string(), + serial: format!("synthetic-serial-{id}"), + model: "synthetic-model".to_string(), + } + } + } + } + + fn variant(&self) -> DiskVariant { + match self { + DiskWrapper::Real { disk, .. } => disk.variant(), + DiskWrapper::Synthetic { zpool_name } => match zpool_name.kind() { + ZpoolKind::External => DiskVariant::U2, + ZpoolKind::Internal => DiskVariant::M2, + }, + } + } + + fn zpool_name(&self) -> &ZpoolName { + match self { + DiskWrapper::Real { disk, .. } => disk.zpool_name(), + DiskWrapper::Synthetic { zpool_name } => zpool_name, + } + } +} diff --git a/sled-storage/src/dump_setup.rs b/sled-storage/src/dump_setup.rs new file mode 100644 index 0000000000..ea51251f84 --- /dev/null +++ b/sled-storage/src/dump_setup.rs @@ -0,0 +1,795 @@ +use crate::disk::DiskWrapper; +use camino::Utf8PathBuf; +use derive_more::{AsRef, Deref, From}; +use illumos_utils::dumpadm::DumpAdmError; +use illumos_utils::zone::{AdmError, Zones}; +use illumos_utils::zpool::{ZpoolHealth, ZpoolName}; +use omicron_common::disk::DiskIdentity; +use sled_hardware::DiskVariant; +use slog::{debug, error, info, o, warn, Logger}; +use std::collections::{HashMap, HashSet}; +use std::ffi::OsString; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Weak}; +use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH}; +use tokio::sync::MutexGuard; + +pub struct DumpSetup { + worker: Arc>, + _poller: std::thread::JoinHandle<()>, + log: Logger, +} + +impl DumpSetup { + pub fn new(log: &Logger) -> Self { + let worker = Arc::new(std::sync::Mutex::new(DumpSetupWorker::new( + log.new(o!("component" => "DumpSetup-worker")), + ))); + let worker_weak = Arc::downgrade(&worker); + let log_poll = log.new(o!("component" => "DumpSetup-archival")); + let _poller = std::thread::spawn(move || { + Self::poll_file_archival(worker_weak, log_poll) + }); + let log = log.new(o!("component" => "DumpSetup")); + Self { worker, _poller, log } + } +} + +// we sure are passing a lot of Utf8PathBufs around, let's be careful about it +#[derive( + AsRef, Clone, Debug, Deref, Eq, From, Hash, Ord, PartialEq, PartialOrd, +)] +struct DumpSlicePath(Utf8PathBuf); +#[derive( + AsRef, Clone, Debug, Deref, Eq, From, Hash, Ord, PartialEq, PartialOrd, +)] +struct DebugDataset(Utf8PathBuf); +#[derive( + AsRef, Clone, Debug, Deref, Eq, From, Hash, Ord, PartialEq, PartialOrd, +)] +struct CoreDataset(Utf8PathBuf); + +#[derive(Deref)] +struct CoreZpool(ZpoolName); +#[derive(Deref)] +struct DebugZpool(ZpoolName); + +// only want to access these directories after they're mounted! 
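// [Editorial sketch, not part of this patch] The mountpoint guard below keys
// off the ZFS "mounted" property. A standalone equivalent of that check,
// assuming only that a `zfs` binary is on the PATH (the crate itself goes
// through illumos_utils and the zfs_get_prop helper defined later in this file):
fn dataset_is_mounted(dataset: &str) -> std::io::Result<bool> {
    // `zfs get -Hpo value mounted <dataset>` prints "yes" or "no".
    let output = std::process::Command::new("zfs")
        .args(["get", "-Hpo", "value", "mounted", dataset])
        .output()?;
    Ok(String::from_utf8_lossy(&output.stdout).trim() == "yes")
}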
+trait GetMountpoint: std::ops::Deref { + type NewType: From; + const MOUNTPOINT: &'static str; + fn mountpoint(&self) -> Result, ZfsGetError> { + if zfs_get_prop(self.to_string(), "mounted")? == "yes" { + Ok(Some(Self::NewType::from( + self.dataset_mountpoint(Self::MOUNTPOINT), + ))) + } else { + Ok(None) + } + } +} +impl GetMountpoint for DebugZpool { + type NewType = DebugDataset; + const MOUNTPOINT: &'static str = sled_hardware::disk::DUMP_DATASET; +} +impl GetMountpoint for CoreZpool { + type NewType = CoreDataset; + const MOUNTPOINT: &'static str = sled_hardware::disk::CRASH_DATASET; +} + +struct DumpSetupWorker { + core_dataset_names: Vec, + debug_dataset_names: Vec, + + chosen_dump_slice: Option, + chosen_debug_dir: Option, + chosen_core_dir: Option, + + known_dump_slices: Vec, + known_debug_dirs: Vec, + known_core_dirs: Vec, + + savecored_slices: HashSet, + + log: Logger, +} + +const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); + +impl DumpSetup { + pub(crate) async fn update_dumpdev_setup( + &self, + disks: &mut MutexGuard<'_, HashMap>, + ) { + let log = &self.log; + let mut m2_dump_slices = Vec::new(); + let mut u2_debug_datasets = Vec::new(); + let mut m2_core_datasets = Vec::new(); + for (_id, disk_wrapper) in disks.iter() { + match disk_wrapper { + DiskWrapper::Real { disk, .. } => match disk.variant() { + DiskVariant::M2 => { + match disk.dump_device_devfs_path(false) { + Ok(path) => { + m2_dump_slices.push(DumpSlicePath(path)) + } + Err(err) => { + warn!(log, "Error getting dump device devfs path: {err:?}"); + } + } + let name = disk.zpool_name(); + if let Ok(info) = illumos_utils::zpool::Zpool::get_info( + &name.to_string(), + ) { + if info.health() == ZpoolHealth::Online { + m2_core_datasets.push(CoreZpool(name.clone())); + } else { + warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); + } + } + } + DiskVariant::U2 => { + let name = disk.zpool_name(); + if let Ok(info) = illumos_utils::zpool::Zpool::get_info( + &name.to_string(), + ) { + if info.health() == ZpoolHealth::Online { + u2_debug_datasets + .push(DebugZpool(name.clone())); + } else { + warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); + } + } + } + }, + DiskWrapper::Synthetic { .. } => {} + } + } + + let savecore_lock = self.worker.clone(); + let log_tmp = log.new(o!("component" => "DumpSetup-mutex")); + tokio::task::spawn_blocking(move || match savecore_lock.lock() { + Ok(mut guard) => { + guard.update_disk_loadout( + m2_dump_slices, + u2_debug_datasets, + m2_core_datasets, + ); + } + Err(err) => { + error!(log_tmp, "DumpSetup mutex poisoned: {err:?}"); + } + }); + } + + fn poll_file_archival( + worker: Weak>, + log: Logger, + ) { + info!(log, "DumpSetup poll loop started."); + loop { + if let Some(mutex) = worker.upgrade() { + match mutex.lock() { + Ok(mut guard) => { + guard.reevaluate_choices(); + if let Err(err) = guard.archive_files() { + error!( + log, + "Failed to archive debug/dump files: {err:?}" + ); + } + } + Err(err) => { + error!( + log, + "DumpSetup mutex poisoned in poll thread: {err:?}" + ); + break; + } + } + } else { + info!( + log, + "DumpSetup weak pointer dropped, leaving poll loop." 
+ ); + break; + } + std::thread::sleep(ARCHIVAL_INTERVAL); + } + } +} + +#[derive(Debug, thiserror::Error)] +enum ZfsGetError { + #[error("Error executing 'zfs get' command: {0}")] + IoError(#[from] std::io::Error), + #[error("Output of 'zfs get' was not only not an integer string, it wasn't even UTF-8: {0}")] + Utf8(#[from] std::string::FromUtf8Error), + #[error("Error parsing output of 'zfs get' command as integer: {0}")] + Parse(#[from] std::num::ParseIntError), +} + +const ZFS_PROP_USED: &str = "used"; +const ZFS_PROP_AVAILABLE: &str = "available"; + +fn zfs_get_integer( + mountpoint_or_name: impl AsRef, + property: &str, +) -> Result { + zfs_get_prop(mountpoint_or_name, property)?.parse().map_err(Into::into) +} + +fn zfs_get_prop( + mountpoint_or_name: impl AsRef + Sized, + property: &str, +) -> Result { + let mountpoint = mountpoint_or_name.as_ref(); + let mut cmd = std::process::Command::new(illumos_utils::zfs::ZFS); + cmd.arg("get").arg("-Hpo").arg("value"); + cmd.arg(property); + cmd.arg(mountpoint); + let output = cmd.output()?; + Ok(String::from_utf8(output.stdout)?.trim().to_string()) +} + +const DATASET_USAGE_PERCENT_CHOICE: u64 = 70; +const DATASET_USAGE_PERCENT_CLEANUP: u64 = 80; + +fn below_thresh( + mountpoint: &Utf8PathBuf, + percent: u64, +) -> Result<(bool, u64), ZfsGetError> { + let used = zfs_get_integer(mountpoint, ZFS_PROP_USED)?; + let available = zfs_get_integer(mountpoint, ZFS_PROP_AVAILABLE)?; + let capacity = used + available; + let below = (used * 100) / capacity < percent; + Ok((below, used)) +} + +impl DumpSetupWorker { + fn new(log: Logger) -> Self { + Self { + core_dataset_names: vec![], + debug_dataset_names: vec![], + chosen_dump_slice: None, + chosen_debug_dir: None, + chosen_core_dir: None, + known_dump_slices: vec![], + known_debug_dirs: vec![], + known_core_dirs: vec![], + savecored_slices: Default::default(), + log, + } + } + + fn update_disk_loadout( + &mut self, + dump_slices: Vec, + debug_datasets: Vec, + core_datasets: Vec, + ) { + self.core_dataset_names = core_datasets; + self.debug_dataset_names = debug_datasets; + + self.known_dump_slices = dump_slices; + + self.reevaluate_choices(); + } + + // only allow mounted zfs datasets into 'known_*_dirs', + // such that we don't render them non-auto-mountable by zfs + fn update_mounted_dirs(&mut self) { + self.known_debug_dirs = self + .debug_dataset_names + .iter() + .flat_map(|ds| ds.mountpoint()) + .flatten() + .collect(); + self.known_core_dirs = self + .core_dataset_names + .iter() + .flat_map(|ds| ds.mountpoint()) + .flatten() + .collect(); + } + + fn reevaluate_choices(&mut self) { + self.update_mounted_dirs(); + + self.known_dump_slices.sort(); + // sort key: prefer to choose a dataset where there's already other + // dumps so we don't shotgun them across every U.2, but only if they're + // below a certain usage threshold. + self.known_debug_dirs.sort_by_cached_key( + |mountpoint: &DebugDataset| { + match below_thresh(mountpoint.as_ref(), DATASET_USAGE_PERCENT_CHOICE) { + Ok((below, used)) => { + let priority = if below { 0 } else { 1 }; + (priority, used, mountpoint.clone()) + } + Err(err) => { + error!(self.log, "Could not query zfs properties of debug dump dir: {err:?}"); + // deprioritize anything we get errors querying. 
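// [Editorial worked example, not part of this patch] How the sort key above
// plays out with DATASET_USAGE_PERCENT_CHOICE = 70:
//   used 30 GiB of 100 GiB -> (30 * 100) / 100 = 30 < 70 -> priority 0
//   used 80 GiB of 100 GiB -> 80 >= 70                   -> priority 1 (sorts after all priority-0 dirs)
//   zfs query error        -> (usize::MAX, u64::MAX, _)  -> sorts last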
+ (usize::MAX, u64::MAX, mountpoint.clone()) + } + } + }, + ); + self.known_core_dirs.sort_by_cached_key(|mnt| { + // these get archived periodically anyway, pick one with room + let available = zfs_get_integer(&**mnt, "available").unwrap_or(0); + (u64::MAX - available, mnt.clone()) + }); + + if let Some(x) = &self.chosen_debug_dir { + if !self.known_debug_dirs.contains(x) { + warn!(self.log, "Previously-chosen debug/dump dir {x:?} no longer exists in our view of reality"); + self.chosen_debug_dir = None; + } else { + match below_thresh(x.as_ref(), DATASET_USAGE_PERCENT_CLEANUP) { + Ok((true, _)) => {} + Ok((false, _)) => { + if self.known_debug_dirs.iter().any(|x| { + below_thresh( + x.as_ref(), + DATASET_USAGE_PERCENT_CHOICE, + ) + .unwrap_or((false, 0)) + .0 + }) { + info!(self.log, "Previously-chosen debug/dump dir {x:?} is over usage threshold, choosing a more vacant disk"); + self.chosen_debug_dir = None; + } else { + warn!(self.log, "All candidate debug/dump dirs are over usage threshold, removing older archived files"); + if let Err(err) = self.cleanup() { + error!(self.log, "Couldn't clean up any debug/dump dirs, may hit dataset quota in {x:?}: {err:?}"); + } else { + self.chosen_debug_dir = None; + } + } + } + Err(err) => { + error!(self.log, "Previously-chosen debug/dump dir {x:?} couldn't be queried for zfs properties! Choosing another. {err:?}"); + self.chosen_debug_dir = None; + } + } + } + } + if let Some(x) = &self.chosen_dump_slice { + if !self.known_dump_slices.contains(x) { + warn!(self.log, "Previously-chosen dump slice {x:?} no longer exists in our view of reality"); + self.chosen_dump_slice = None; + } + } + if let Some(x) = &self.chosen_core_dir { + if !self.known_core_dirs.contains(x) { + warn!(self.log, "Previously-chosen core dir {x:?} no longer exists in our view of reality"); + self.chosen_core_dir = None; + } + } + + if self.chosen_debug_dir.is_none() { + self.chosen_debug_dir = self.known_debug_dirs.first().cloned(); + } + + if self.chosen_core_dir.is_none() { + for core_dir in &self.known_core_dirs { + // tell the system to write *userspace process* cores here. + match illumos_utils::coreadm::coreadm(core_dir) { + Ok(()) => { + self.chosen_core_dir = Some(core_dir.clone()); + info!( + self.log, + "Set process core dump directory to {core_dir:?}" + ); + break; + } + Err(err) => { + error!(self.log, "Couldn't configure process core dump directory to {core_dir:?}: {err:?}"); + } + } + } + } + + if self.chosen_dump_slice.is_none() { + if self.chosen_debug_dir.is_some() { + for dump_slice in self.known_dump_slices.clone() { + // Let's try to see if it appears to have a kernel dump already + match illumos_utils::dumpadm::dump_flag_is_valid( + &dump_slice, + ) { + Ok(true) => { + debug!(self.log, "Dump slice {dump_slice:?} appears to have a valid header; will attempt to savecore"); + } + Ok(false) => { + info!(self.log, "Dump slice {dump_slice:?} appears to have already been saved"); + } + Err(err) => { + debug!(self.log, "Dump slice {dump_slice:?} appears to be unused: {err:?}"); + } + } + if let Ok(saved) = self.dumpadm_and_savecore(&dump_slice) { + if let Some(out) = saved { + info!(self.log, "Previous dump on slice {dump_slice:?} saved, configured slice as target for new dumps. {out:?}"); + } + self.chosen_dump_slice = Some(dump_slice); + break; + } + } + } else { + // Don't risk overwriting an existing kernel dump if there's + // already one there until we can attempt to savecore(8) + // it away and clear the flag to make room. 
+ for dump_slice in &self.known_dump_slices { + match illumos_utils::dumpadm::dump_flag_is_valid(dump_slice) + { + Ok(false) => { + // Have dumpadm write the config for crash dumps to be + // on this slice, at least, until a U.2 comes along. + match illumos_utils::dumpadm::dumpadm( + dump_slice, None, + ) { + Ok(_) => { + info!(self.log, "Using dump device {dump_slice:?} with no savecore destination (no U.2 debug zvol yet)"); + self.chosen_dump_slice = + Some(dump_slice.clone()); + break; + } + Err(err) => { + warn!(self.log, "Could not configure {dump_slice:?} as dump device: {err:?}"); + } + } + } + Ok(true) => { + warn!(self.log, "Not configuring {dump_slice:?} as it appears to contain a dump we cannot yet send to a U.2 debug zvol"); + } + Err(err) => { + debug!( + self.log, + "Dump slice {dump_slice:?} appears to be unused : {err:?}", + ); + } + } + } + } + } + + if let Some(debug_dir) = self.chosen_debug_dir.clone() { + let mut changed_slice = false; + for dump_slice in self.known_dump_slices.clone() { + if !self.savecored_slices.contains(&dump_slice) { + changed_slice = true; + // temporarily changes the system's dump slice so savecore(8) + // can update the header in the slice when it finishes... + match self.dumpadm_and_savecore(&dump_slice) { + Ok(saved) => { + if let Some(stdout) = &saved { + info!( + self.log, + "Saved dump from {dump_slice:?} to {debug_dir:?}: {stdout:?}" + ); + } else { + info!( + self.log, + "Set {dump_slice:?} as system dump slice", + ); + } + } + Err(err) => { + warn!(self.log, "Could not configure {dump_slice:?} as dump device with {debug_dir:?} as savecore destination: {err:?}"); + } + } + } + } + + // ...so then we restore the chosen dump slice for the system to use + // in the event of a kernel crash + if changed_slice { + if let Some(dump_slice) = &self.chosen_dump_slice { + if let Err(err) = + illumos_utils::dumpadm::dumpadm(dump_slice, None) + { + error!(self.log, "Could not restore dump slice to {dump_slice:?}: {err:?}"); + } + } + } + } + } + + fn archive_files(&self) -> std::io::Result<()> { + if let Some(debug_dir) = &self.chosen_debug_dir { + if self.known_core_dirs.is_empty() { + info!(self.log, "No core dump locations yet known."); + } + for core_dir in &self.known_core_dirs { + if let Ok(dir) = core_dir.read_dir() { + for entry in dir.flatten() { + if let Some(path) = entry.file_name().to_str() { + let dest = debug_dir.join(path); + + if let Err(err) = + Self::copy_sync_and_remove(&entry.path(), &dest) + { + error!( + self.log, + "Failed to archive {entry:?}: {err:?}" + ); + } else { + info!( + self.log, + "Relocated {entry:?} to {dest:?}" + ); + } + } else { + error!(self.log, "Non-UTF8 path found while archiving core dumps: {entry:?}"); + } + } + } + } + } else { + info!( + self.log, + "No archival destination for crash dumps yet chosen." 
+ ); + } + + if let Err(err) = self.archive_logs() { + if !matches!(err, ArchiveLogsError::NoDebugDirYet) { + error!( + self.log, + "Failure while trying to archive logs to debug dataset: {err:?}" + ); + } + } + + Ok(()) + } + + fn copy_sync_and_remove( + source: impl AsRef, + dest: impl AsRef, + ) -> std::io::Result<()> { + let source = source.as_ref(); + let dest = dest.as_ref(); + let mut dest_f = std::fs::File::create(&dest)?; + let mut src_f = std::fs::File::open(&source)?; + + std::io::copy(&mut src_f, &mut dest_f)?; + + dest_f.sync_all()?; + + drop(src_f); + drop(dest_f); + + std::fs::remove_file(source)?; + Ok(()) + } + + fn archive_logs(&self) -> Result<(), ArchiveLogsError> { + let debug_dir = self + .chosen_debug_dir + .as_ref() + .ok_or(ArchiveLogsError::NoDebugDirYet)?; + // zone crate's 'deprecated' functions collide if you try to enable + // its 'sync' and 'async' features simultaneously :( + let rt = + tokio::runtime::Runtime::new().map_err(ArchiveLogsError::Tokio)?; + let oxz_zones = rt.block_on(Zones::get())?; + self.archive_logs_inner( + debug_dir, + PathBuf::from("/var/svc/log"), + "global", + )?; + for zone in oxz_zones { + let logdir = zone.path().join("root/var/svc/log"); + let zone_name = zone.name(); + self.archive_logs_inner(debug_dir, logdir, zone_name)?; + } + Ok(()) + } + + fn archive_logs_inner( + &self, + debug_dir: &DebugDataset, + logdir: PathBuf, + zone_name: &str, + ) -> Result<(), ArchiveLogsError> { + let mut rotated_log_files = Vec::new(); + // patterns matching archived logs, e.g. foo.log.3 + // keep checking for greater numbers of digits until we don't find any + for n in 1..9 { + let pattern = logdir + .join(format!("*.log.{}", "[0-9]".repeat(n))) + .to_str() + .ok_or_else(|| ArchiveLogsError::Utf8(zone_name.to_string()))? + .to_string(); + rotated_log_files.extend(glob::glob(&pattern)?.flatten()); + } + let dest_dir = debug_dir.join(zone_name).into_std_path_buf(); + if !rotated_log_files.is_empty() { + std::fs::create_dir_all(&dest_dir)?; + let count = rotated_log_files.len(); + info!( + self.log, + "Archiving {count} log files from {zone_name} zone" + ); + } + for entry in rotated_log_files { + let src_name = entry.file_name().unwrap(); + // as we archive them, logadm will keep resetting to .log.0, + // so we need to maintain our own numbering in the dest dataset. + // we'll use the modified date of the rotated log file, or try + // falling back to the time of archival if that fails, and + // falling back to counting up from 0 if *that* somehow fails. + let mut n = entry + .metadata() + .and_then(|m| m.modified()) + .unwrap_or_else(|_| SystemTime::now()) + .duration_since(UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + let mut dest; + loop { + dest = dest_dir.join(src_name).with_extension(format!("{n}")); + if dest.exists() { + n += 1; + } else { + break; + } + } + if let Err(err) = Self::copy_sync_and_remove(&entry, dest) { + warn!(self.log, "Failed to archive {entry:?}: {err:?}"); + } + } + Ok(()) + } + + // Have dumpadm write the config for crash dumps to be + // on this slice, and then invoke savecore(8) to save any + // dump that's already present there. + // + // NOTE: because of the need to have dumpadm change the global + // state of which slice the system is using for dumps in order + // for savecore to behave the way we want (i.e. clear the flag + // after succeeding), we could hypothetically miss a dump if + // the kernel crashes again while savecore is still running. 
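// [Editorial note, not part of this patch] At the command level, the sequence
// this function and its caller wrap is roughly (flags shown from memory of
// dumpadm(8)/savecore(8), as an assumption rather than verified here):
//   dumpadm -d /dev/dsk/<slice>    # make <slice> the system dump device
//   savecore <debug_dataset_dir>   # archive any pending dump, clearing the slice's valid flag
//   dumpadm -d /dev/dsk/<chosen>   # the caller later restores the preferred slice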
+ fn dumpadm_and_savecore( + &mut self, + dump_slice: &DumpSlicePath, + ) -> Result, DumpAdmError> { + // TODO: untangle savecore from illumos_utils::dumpadm + assert!(self.chosen_debug_dir.is_some()); + + let savecore_dir = self.chosen_debug_dir.clone().unwrap().0; + + match illumos_utils::dumpadm::dumpadm(&dump_slice, Some(&savecore_dir)) + { + Ok(saved) => { + self.savecored_slices.insert(dump_slice.clone()); + Ok(saved) + } + Err(err) => Err(err), + } + } + + fn cleanup(&self) -> Result<(), CleanupError> { + let mut dir_info = Vec::new(); + for dir in &self.known_debug_dirs { + match Self::scope_dir_for_cleanup(dir) { + Ok(info) => { + dir_info.push((info, dir)); + } + Err(err) => { + error!(self.log, "Could not analyze {dir:?} for debug dataset cleanup task: {err:?}"); + } + } + } + if dir_info.is_empty() { + return Err(CleanupError::NoDatasetsToClean); + } + // find dir with oldest average time of files that must be deleted + // to achieve desired threshold, and reclaim that space. + dir_info.sort(); + 'outer: for (dir_info, dir) in dir_info { + let CleanupDirInfo { average_time: _, num_to_delete, file_list } = + dir_info; + for (_time, _bytes, path) in &file_list[..num_to_delete as usize] { + // if we are unable to remove a file, we cannot guarantee + // that we will reach our target size threshold, and suspect + // the i/o error *may* be an issue with the underlying disk, so + // we continue to the dataset with the next-oldest average age + // of files-to-delete in the sorted list. + if let Err(err) = std::fs::remove_file(&path) { + error!(self.log, "Couldn't delete {path:?} from debug dataset, skipping {dir:?}. {err:?}"); + continue 'outer; + } + } + // we made it through all the files we planned to remove, thereby + // freeing up enough space on one of the debug datasets for it to + // be chosen when reevaluating targets. + break; + } + Ok(()) + } + + fn scope_dir_for_cleanup( + debug_dir: &DebugDataset, + ) -> Result { + let used = zfs_get_integer(&**debug_dir, ZFS_PROP_USED)?; + let available = zfs_get_integer(&**debug_dir, ZFS_PROP_AVAILABLE)?; + let capacity = used + available; + + let target_used = capacity * DATASET_USAGE_PERCENT_CHOICE / 100; + + let mut file_list = Vec::new(); + // find all files in the debug dataset and sort by modified time + for path in glob::glob(debug_dir.join("**/*").as_str())?.flatten() { + let meta = std::fs::metadata(&path)?; + // we need this to be a Duration rather than SystemTime so we can + // do math to it later. + let time = meta.modified()?.duration_since(UNIX_EPOCH)?; + let size = meta.len(); + + file_list.push((time, size, path)) + } + file_list.sort(); + + // find how many old files must be deleted to get the dataset under + // the limit, and what the average age of that set is. 
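// [Editorial worked example, not part of this patch] e.g. used = 90 GiB and
// available = 10 GiB gives capacity = 100 GiB and target_used = 70 GiB, so the
// loop below keeps marking files oldest-first until just over 20 GiB
// (used - target_used) is slated for deletion.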
+ let mut possible_bytes = 0; + let mut total_time = Duration::ZERO; + let mut num_to_delete = 0; + for (time, size, _path) in &file_list { + if used - possible_bytes < target_used { + break; + } else { + total_time += *time; + num_to_delete += 1; + possible_bytes += size; + } + } + let average_time = + total_time.checked_div(num_to_delete).unwrap_or(Duration::MAX); + + Ok(CleanupDirInfo { average_time, num_to_delete, file_list }) + } +} + +#[derive(thiserror::Error, Debug)] +enum ArchiveLogsError { + #[error("Couldn't make an async runtime to get zone info: {0}")] + Tokio(std::io::Error), + #[error("I/O error: {0}")] + IoError(#[from] std::io::Error), + #[error("Error calling zoneadm: {0}")] + Zoneadm(#[from] AdmError), + #[error("Non-UTF8 zone path for zone {0}")] + Utf8(String), + #[error("Glob pattern invalid: {0}")] + Glob(#[from] glob::PatternError), + #[error( + "No debug dir into which we should archive logs has yet been chosen" + )] + NoDebugDirYet, +} + +#[derive(thiserror::Error, Debug)] +enum CleanupError { + #[error("No debug datasets were successfully evaluated for cleanup")] + NoDatasetsToClean, + #[error("Failed to query ZFS properties: {0}")] + ZfsError(#[from] ZfsGetError), + #[error("I/O error: {0}")] + IoError(#[from] std::io::Error), + #[error("Glob pattern invalid: {0}")] + Glob(#[from] glob::PatternError), + #[error("A file's observed modified time was before the Unix epoch: {0}")] + TimelineWentSideways(#[from] SystemTimeError), +} + +#[derive(Ord, PartialOrd, Eq, PartialEq)] +struct CleanupDirInfo { + average_time: Duration, + num_to_delete: u32, + file_list: Vec<(Duration, u64, PathBuf)>, +} diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs new file mode 100644 index 0000000000..48f335a36a --- /dev/null +++ b/sled-storage/src/lib.rs @@ -0,0 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Local storage abstraction for use by sled-agent + +pub(crate) mod dataset; +pub(crate) mod disk; +pub(crate) mod dump_setup; From f927d023e9bf470e50cf5bb08bc53906a17ec1f7 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 28 Sep 2023 06:40:30 +0000 Subject: [PATCH 02/66] wip --- sled-agent/src/storage_manager.rs | 25 ------- sled-storage/src/disk.rs | 10 +-- sled-storage/src/error.rs | 80 ++++++++++++++++++++ sled-storage/src/lib.rs | 3 + sled-storage/src/pool.rs | 36 +++++++++ sled-storage/src/state.rs | 118 ++++++++++++++++++++++++++++++ 6 files changed, 242 insertions(+), 30 deletions(-) create mode 100644 sled-storage/src/pool.rs create mode 100644 sled-storage/src/state.rs diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index bd71371396..e43f2d841d 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -38,7 +38,6 @@ use tokio::task::JoinHandle; use tokio::time::{interval, MissedTickBehavior}; use uuid::Uuid; -use illumos_utils::dumpadm::DumpHdrError; #[cfg(test)] use illumos_utils::{zfs::MockZfs as Zfs, zpool::MockZpool as Zpool}; #[cfg(not(test))] @@ -114,32 +113,8 @@ pub enum Error { #[error("Zpool Not Found: {0}")] ZpoolNotFound(String), - #[error("Failed to serialize toml (intended for {path:?}): {err}")] - Serialize { - path: Utf8PathBuf, - #[source] - err: toml::ser::Error, - }, - - #[error("Failed to deserialize toml from {path:?}: {err}")] - Deserialize { - path: Utf8PathBuf, - #[source] - err: toml::de::Error, - }, - - #[error("Failed to perform I/O: {message}: {err}")] - Io { - message: String, - #[source] - err: std::io::Error, - }, - #[error("Underlay not yet initialized")] UnderlayNotInitialized, - - #[error("Encountered error checking dump device flags: {0}")] - DumpHdr(#[from] DumpHdrError), } /// A ZFS storage pool. diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs index b78bd57af9..aef68528bf 100644 --- a/sled-storage/src/disk.rs +++ b/sled-storage/src/disk.rs @@ -10,8 +10,8 @@ use omicron_common::disk::DiskIdentity; use sled_hardware::{Disk, DiskVariant}; /// A wrapper around real disks or synthetic disks backed by a file -#[derive(PartialEq, Eq, Clone)] -pub(crate) enum DiskWrapper { +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum DiskWrapper { Real { disk: Disk, devfs_path: Utf8PathBuf }, Synthetic { zpool_name: ZpoolName }, } @@ -24,7 +24,7 @@ impl From for DiskWrapper { } impl DiskWrapper { - fn identity(&self) -> DiskIdentity { + pub fn identity(&self) -> DiskIdentity { match self { DiskWrapper::Real { disk, .. } => disk.identity().clone(), DiskWrapper::Synthetic { zpool_name } => { @@ -38,7 +38,7 @@ impl DiskWrapper { } } - fn variant(&self) -> DiskVariant { + pub fn variant(&self) -> DiskVariant { match self { DiskWrapper::Real { disk, .. } => disk.variant(), DiskWrapper::Synthetic { zpool_name } => match zpool_name.kind() { @@ -48,7 +48,7 @@ impl DiskWrapper { } } - fn zpool_name(&self) -> &ZpoolName { + pub fn zpool_name(&self) -> &ZpoolName { match self { DiskWrapper::Real { disk, .. } => disk.zpool_name(), DiskWrapper::Synthetic { zpool_name } => zpool_name, diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs index e69de29bb2..d2a2a473b1 100644 --- a/sled-storage/src/error.rs +++ b/sled-storage/src/error.rs @@ -0,0 +1,80 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Storage related errors + +use crate::dataset::DatasetName; +use camino::Utf8PathBuf; +use omicron_common::api::external::ByteCountRangeError; +use uuid::Uuid; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error(transparent)] + DiskError(#[from] sled_hardware::DiskError), + + // TODO: We could add the context of "why are we doint this op", maybe? + #[error(transparent)] + ZfsListDataset(#[from] illumos_utils::zfs::ListDatasetsError), + + #[error(transparent)] + ZfsEnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), + + #[error(transparent)] + ZfsSetValue(#[from] illumos_utils::zfs::SetValueError), + + #[error(transparent)] + ZfsGetValue(#[from] illumos_utils::zfs::GetValueError), + + #[error(transparent)] + GetZpoolInfo(#[from] illumos_utils::zpool::GetInfoError), + + #[error(transparent)] + Fstyp(#[from] illumos_utils::fstyp::Error), + + #[error(transparent)] + ZoneCommand(#[from] illumos_utils::running_zone::RunCommandError), + + #[error(transparent)] + ZoneBoot(#[from] illumos_utils::running_zone::BootError), + + #[error(transparent)] + ZoneEnsureAddress(#[from] illumos_utils::running_zone::EnsureAddressError), + + #[error(transparent)] + ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), + + #[error("No U.2 Zpools found")] + NoU2Zpool, + + #[error("Failed to parse UUID from {path}: {err}")] + ParseUuid { + path: Utf8PathBuf, + #[source] + err: uuid::Error, + }, + + #[error("Dataset {name:?} exists with a different uuid (has {old}, requested {new})")] + UuidMismatch { name: Box, old: Uuid, new: Uuid }, + + #[error("Error parsing pool {name}'s size: {err}")] + BadPoolSize { + name: String, + #[source] + err: ByteCountRangeError, + }, + + #[error("Failed to parse the dataset {name}'s UUID: {err}")] + ParseDatasetUuid { + name: String, + #[source] + err: uuid::Error, + }, + + #[error("Zpool Not Found: {0}")] + ZpoolNotFound(String), + + #[error("Underlay not yet initialized")] + UnderlayNotInitialized, +} diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs index 48f335a36a..a1bd4eecfb 100644 --- a/sled-storage/src/lib.rs +++ b/sled-storage/src/lib.rs @@ -7,3 +7,6 @@ pub(crate) mod dataset; pub(crate) mod disk; pub(crate) mod dump_setup; +pub mod error; +pub(crate) mod pool; +pub mod state; diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs new file mode 100644 index 0000000000..4a9960da4c --- /dev/null +++ b/sled-storage/src/pool.rs @@ -0,0 +1,36 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! ZFS storage pool + +use crate::error::Error; +use illumos_utils::zpool::{ZpoolInfo, ZpoolName}; +use omicron_common::disk::DiskIdentity; + +#[cfg(test)] +use illumos_utils::{zfs::MockZfs as Zfs, zpool::MockZpool as Zpool}; +#[cfg(not(test))] +use illumos_utils::{zfs::Zfs, zpool::Zpool}; + +/// A ZFS storage pool +#[derive(Debug, Clone)] +pub struct Pool { + name: ZpoolName, + info: ZpoolInfo, + parent: DiskIdentity, +} + +impl Pool { + /// Queries for an existing Zpool by name. + /// + /// Returns Ok if the pool exists. 
+ fn new(name: ZpoolName, parent: DiskIdentity) -> Result { + let info = Zpool::get_info(&name.to_string())?; + Ok(Pool { name, info, parent }) + } + + fn parent(&self) -> &DiskIdentity { + &self.parent + } +} diff --git a/sled-storage/src/state.rs b/sled-storage/src/state.rs new file mode 100644 index 0000000000..a7de70999e --- /dev/null +++ b/sled-storage/src/state.rs @@ -0,0 +1,118 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The internal state of the storage manager task + +use crate::disk::DiskWrapper; +use crate::pool::Pool; +use camino::Utf8PathBuf; +use illumos_utils::zpool::ZpoolName; +use omicron_common::disk::DiskIdentity; +use sled_hardware::DiskVariant; +use std::collections::BTreeMap; +use std::sync::Arc; +use uuid::Uuid; + +// The directory within the debug dataset in which bundles are created. +const BUNDLE_DIRECTORY: &str = "bundle"; + +// The directory for zone bundles. +const ZONE_BUNDLE_DIRECTORY: &str = "zone"; + +/// Storage related state +/// +/// This state is internal to the [`crate::StorageManager`] task. Clones +/// of this state, or subsets of it, can be retrieved by requests to the +/// `StorageManager` task from the [`crate::StorageManagerHandle`]. This state +/// is not `Sync`, and as such does not require any mutexes. However, we do +/// expect to share it relatively frequently, and we want copies of it to be +/// as cheaply made as possible. So any large state is stored inside `Arc`s. On +/// the other hand, we expect infrequent updates to this state, and as such, we +/// use [`std::sync::Arc::make_mut`] to implement clone on write functionality +/// inside the `StorageManager` task if there are any outstanding copies. +/// Therefore, we only pay the cost to update infrequently, and no locks are +/// required by callers when operating on cloned data. The only contention here +/// is for the refrence counters of the internal Arcs when `State` gets cloned +/// or dropped. +#[derive(Debug, Clone)] +pub struct State { + // All disks, real and synthetic, being managed by this sled + disks: Arc>, + + // A map of "Uuid" to "pool". + pools: Arc>, +} + +impl State { + /// Returns the identity of the boot disk. + /// + /// If this returns `None`, we have not processed the boot disk yet. + pub fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { + self.disks.iter().find_map(|(id, disk)| { + match disk { + // This is the "real" use-case: if we have real disks, query + // their properties to identify if they truly are the boot disk. + DiskWrapper::Real { disk, .. } => { + if disk.is_boot_disk() { + return Some((id.clone(), disk.zpool_name().clone())); + } + } + // This is the "less real" use-case: if we have synthetic disks, + // just label the first M.2-looking one as a "boot disk". + DiskWrapper::Synthetic { .. } => { + if matches!(disk.variant(), DiskVariant::M2) { + return Some((id.clone(), disk.zpool_name().clone())); + } + } + }; + None + }) + } + /// Returns all M.2 zpools + pub fn all_m2_zpools(&self) -> Vec { + self.all_zpools(DiskVariant::M2) + } + + /// Returns all U.2 zpools + pub fn all_u2_zpools(&self) -> Vec { + self.all_zpools(DiskVariant::U2) + } + + /// Returns all mountpoints within all M.2s for a particular dataset. 
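// [Editorial sketch, not part of this patch] A minimal illustration of the
// clone-on-write pattern described in the `State` docs above: readers hold
// cheap `Arc` clones, and `Arc::make_mut` deep-copies only while such a clone
// is still outstanding. The names here are illustrative, not the crate's.
fn cow_insert(
    disks: &mut std::sync::Arc<std::collections::BTreeMap<String, String>>,
    key: String,
    value: String,
) {
    // Copies the inner map only if another Arc still references it;
    // otherwise it mutates in place.
    std::sync::Arc::make_mut(disks).insert(key, value);
}

fn cow_example() {
    let mut state = std::sync::Arc::new(std::collections::BTreeMap::new());
    let reader_snapshot = state.clone(); // cheap: bumps a reference count
    cow_insert(&mut state, "disk0".to_string(), "oxp_example".to_string());
    assert!(reader_snapshot.is_empty()); // the reader's snapshot is unchanged
    assert_eq!(state.len(), 1);
}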
+ pub fn all_m2_mountpoints(&self, dataset: &str) -> Vec { + self.all_m2_zpools() + .iter() + .map(|zpool| zpool.dataset_mountpoint(dataset)) + .collect() + } + + /// Returns all mountpoints within all U.2s for a particular dataset. + pub fn all_u2_mountpoints(&self, dataset: &str) -> Vec { + self.all_u2_zpools() + .iter() + .map(|zpool| zpool.dataset_mountpoint(dataset)) + .collect() + } + + /// Returns all zpools of a particular variant + pub fn all_zpools(&self, variant: DiskVariant) -> Vec { + self.disks + .values() + .filter_map(|disk| { + if disk.variant() == variant { + return Some(disk.zpool_name().clone()); + } + None + }) + .collect() + } + + /// Return the directories for storing zone service bundles. + pub fn all_zone_bundle_directories(&self) -> Vec { + self.all_m2_mountpoints(sled_hardware::disk::M2_DEBUG_DATASET) + .into_iter() + .map(|p| p.join(BUNDLE_DIRECTORY).join(ZONE_BUNDLE_DIRECTORY)) + .collect() + } +} From 9818c05473bd431017950498f613671d18e0998e Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 28 Sep 2023 20:00:04 +0000 Subject: [PATCH 03/66] wip --- Cargo.lock | 2 + sled-agent/src/storage_manager.rs | 14 +- sled-hardware/src/disk.rs | 470 ++---------------------- sled-hardware/src/illumos/partitions.rs | 26 +- sled-storage/Cargo.toml | 5 + sled-storage/src/dataset.rs | 274 ++++++++++++++ sled-storage/src/disk.rs | 111 +++++- sled-storage/src/dump_setup.rs | 11 +- sled-storage/src/error.rs | 2 +- sled-storage/src/keyfile.rs | 68 ++++ sled-storage/src/lib.rs | 5 + sled-storage/src/pool.rs | 8 +- sled-storage/src/state.rs | 3 +- 13 files changed, 536 insertions(+), 463 deletions(-) create mode 100644 sled-storage/src/keyfile.rs diff --git a/Cargo.lock b/Cargo.lock index bdf2d44ea4..a448600863 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5277,6 +5277,8 @@ dependencies = [ "key-manager", "nexus-client 0.1.0", "omicron-common 0.1.0", + "omicron-test-utils", + "rand 0.8.5", "schemars", "serde", "serde_json", diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index e43f2d841d..3d3e544573 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -50,7 +50,7 @@ static KEY_MANAGER_READY: OnceLock<()> = OnceLock::new(); #[derive(thiserror::Error, Debug)] pub enum Error { #[error(transparent)] - DiskError(#[from] sled_hardware::DiskError), + DiskError(#[from] sled_hardware::PooledDiskError), // TODO: We could add the context of "why are we doint this op", maybe? 
#[error(transparent)] @@ -610,7 +610,7 @@ impl StorageWorker { &mut self, unparsed_disk: UnparsedDisk, queued_u2_drives: &mut Option>, - ) -> Result { + ) -> Result { match sled_hardware::Disk::new( &self.log, unparsed_disk.clone(), @@ -619,7 +619,7 @@ impl StorageWorker { .await { Ok(disk) => Ok(disk), - Err(sled_hardware::DiskError::KeyManager(err)) => { + Err(sled_hardware::PooledDiskError::KeyManager(err)) => { warn!( self.log, "Transient error: {err} - queuing disk {:?}", unparsed_disk @@ -630,7 +630,7 @@ impl StorageWorker { *queued_u2_drives = Some(HashSet::from([unparsed_disk.into()])); } - Err(sled_hardware::DiskError::KeyManager(err)) + Err(sled_hardware::PooledDiskError::KeyManager(err)) } Err(err) => { error!( @@ -651,7 +651,7 @@ impl StorageWorker { &mut self, zpool_name: ZpoolName, queued_u2_drives: &mut Option>, - ) -> Result<(), sled_hardware::DiskError> { + ) -> Result<(), sled_hardware::PooledDiskError> { let synthetic_id = DiskIdentity { vendor: "fake_vendor".to_string(), serial: "fake_serial".to_string(), @@ -666,7 +666,7 @@ impl StorageWorker { .await { Ok(()) => Ok(()), - Err(sled_hardware::DiskError::KeyManager(err)) => { + Err(sled_hardware::PooledDiskError::KeyManager(err)) => { warn!( self.log, "Transient error: {err} - queuing synthetic disk: {:?}", @@ -678,7 +678,7 @@ impl StorageWorker { *queued_u2_drives = Some(HashSet::from([zpool_name.into()])); } - Err(sled_hardware::DiskError::KeyManager(err)) + Err(sled_hardware::PooledDiskError::KeyManager(err)) } Err(err) => { error!( diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index aec99ae3f8..bea7e23c73 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -4,34 +4,14 @@ use camino::{Utf8Path, Utf8PathBuf}; use illumos_utils::fstyp::Fstyp; -use illumos_utils::zfs; -use illumos_utils::zfs::DestroyDatasetErrorVariant; -use illumos_utils::zfs::EncryptionDetails; -use illumos_utils::zfs::Keypath; -use illumos_utils::zfs::Mountpoint; -use illumos_utils::zfs::SizeDetails; -use illumos_utils::zfs::Zfs; use illumos_utils::zpool::Zpool; use illumos_utils::zpool::ZpoolKind; use illumos_utils::zpool::ZpoolName; -use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; -use rand::distributions::{Alphanumeric, DistString}; use slog::Logger; use slog::{info, warn}; -use std::sync::OnceLock; -use tokio::fs::{remove_file, File}; -use tokio::io::{AsyncSeekExt, AsyncWriteExt, SeekFrom}; use uuid::Uuid; -/// This path is intentionally on a `tmpfs` to prevent copy-on-write behavior -/// and to ensure it goes away on power off. -/// -/// We want minimize the time the key files are in memory, and so we rederive -/// the keys and recreate the files on demand when creating and mounting -/// encrypted filesystems. We then zero them and unlink them. -pub const KEYPATH_ROOT: &str = "/var/run/oxide/"; - cfg_if::cfg_if! { if #[cfg(target_os = "illumos")] { use crate::illumos::*; @@ -41,7 +21,7 @@ cfg_if::cfg_if! 
{ } #[derive(Debug, thiserror::Error)] -pub enum DiskError { +pub enum PooledDiskError { #[error("Cannot open {path} due to {error}")] IoError { path: Utf8PathBuf, error: std::io::Error }, #[error("Failed to open partition at {path} due to {error}")] @@ -51,10 +31,6 @@ pub enum DiskError { #[error("Requested partition {partition:?} not found on device {path}")] NotFound { path: Utf8PathBuf, partition: Partition }, #[error(transparent)] - DestroyFilesystem(#[from] illumos_utils::zfs::DestroyDatasetError), - #[error(transparent)] - EnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), - #[error(transparent)] ZpoolCreate(#[from] illumos_utils::zpool::CreateError), #[error("Cannot import zpool: {0}")] ZpoolImport(illumos_utils::zpool::Error), @@ -62,18 +38,6 @@ pub enum DiskError { CannotFormatMissingDevPath { path: Utf8PathBuf }, #[error("Formatting M.2 devices is not yet implemented")] CannotFormatM2NotImplemented, - #[error("KeyManager error: {0}")] - KeyManager(#[from] key_manager::Error), - #[error("Missing StorageKeyRequester when creating U.2 disk")] - MissingStorageKeyRequester, - #[error("Encrypted filesystem '{0}' missing 'oxide:epoch' property")] - CannotParseEpochProperty(String), - #[error("Encrypted dataset '{dataset}' cannot set 'oxide:agent' property: {err}")] - CannotSetAgentProperty { - dataset: String, - #[source] - err: Box, - }, } /// A partition (or 'slice') of a disk. @@ -126,17 +90,17 @@ impl DiskPaths { } // Finds the first 'variant' partition, and returns the path to it. - fn partition_device_path( + pub fn partition_device_path( &self, partitions: &[Partition], expected_partition: Partition, raw: bool, - ) -> Result { + ) -> Result { for (index, partition) in partitions.iter().enumerate() { if &expected_partition == partition { let path = self.partition_path(index, raw).ok_or_else(|| { - DiskError::NotFound { + PooledDiskError::NotFound { path: self.devfs_path.clone(), partition: expected_partition, } @@ -144,7 +108,7 @@ impl DiskPaths { return Ok(path); } } - Err(DiskError::NotFound { + Err(PooledDiskError::NotFound { path: self.devfs_path.clone(), partition: expected_partition, }) @@ -202,122 +166,33 @@ impl UnparsedDisk { } } -/// A physical disk conforming to the expected partition layout. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct Disk { - paths: DiskPaths, - slot: i64, - variant: DiskVariant, - identity: DiskIdentity, - is_boot_disk: bool, - partitions: Vec, - +/// A physical disk that is partitioned to contain exactly one zpool +/// +/// A PooledDisk relies on hardware specific information to be constructed +/// and is the highest level disk structure in the `sled-hardware` package. +/// The `sled-storage` package contains `Disk`s whose zpool and datasets can be +/// manipulated. This separation exists to remove the hardware dependent logic +/// from the ZFS related logic which can also operate on file backed zpools. +/// Doing things this way allows us to not put higher level concepts like +/// storage keys into this hardware related package. +pub struct PooledDisk { + pub paths: DiskPaths, + pub slot: i64, + pub variant: DiskVariant, + pub identity: DiskIdentity, + pub is_boot_disk: bool, + pub partitions: Vec, // This embeds the assumtion that there is exactly one parsed zpool per // disk. - zpool_name: ZpoolName, + pub zpool_name: ZpoolName, } -// Helper type for describing expected datasets and their optional quota. 
-#[derive(Clone, Copy, Debug)] -struct ExpectedDataset { - // Name for the dataset - name: &'static str, - // Optional quota, in _bytes_ - quota: Option, - // Identifies if the dataset should be deleted on boot - wipe: bool, - // Optional compression mode - compression: Option<&'static str>, -} - -impl ExpectedDataset { - const fn new(name: &'static str) -> Self { - ExpectedDataset { name, quota: None, wipe: false, compression: None } - } - - const fn quota(mut self, quota: usize) -> Self { - self.quota = Some(quota); - self - } - - const fn wipe(mut self) -> Self { - self.wipe = true; - self - } - - const fn compression(mut self, compression: &'static str) -> Self { - self.compression = Some(compression); - self - } -} - -pub const INSTALL_DATASET: &'static str = "install"; -pub const CRASH_DATASET: &'static str = "crash"; -pub const CLUSTER_DATASET: &'static str = "cluster"; -pub const CONFIG_DATASET: &'static str = "config"; -pub const M2_DEBUG_DATASET: &'static str = "debug"; -// TODO-correctness: This value of 100GiB is a pretty wild guess, and should be -// tuned as needed. -pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30); -// ditto. -pub const DUMP_DATASET_QUOTA: usize = 100 * (1 << 30); -// passed to zfs create -o compression= -pub const DUMP_DATASET_COMPRESSION: &'static str = "gzip-9"; - -// U.2 datasets live under the encrypted dataset and inherit encryption -pub const ZONE_DATASET: &'static str = "crypt/zone"; -pub const DUMP_DATASET: &'static str = "crypt/debug"; -pub const U2_DEBUG_DATASET: &'static str = "crypt/debug"; - -// This is the root dataset for all U.2 drives. Encryption is inherited. -pub const CRYPT_DATASET: &'static str = "crypt"; - -const U2_EXPECTED_DATASET_COUNT: usize = 2; -static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [ - // Stores filesystems for zones - ExpectedDataset::new(ZONE_DATASET).wipe(), - // For storing full kernel RAM dumps - ExpectedDataset::new(DUMP_DATASET) - .quota(DUMP_DATASET_QUOTA) - .compression(DUMP_DATASET_COMPRESSION), -]; - -const M2_EXPECTED_DATASET_COUNT: usize = 5; -static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ - // Stores software images. - // - // Should be duplicated to both M.2s. - ExpectedDataset::new(INSTALL_DATASET), - // Stores crash dumps. - ExpectedDataset::new(CRASH_DATASET), - // Stores cluter configuration information. - // - // Should be duplicated to both M.2s. - ExpectedDataset::new(CLUSTER_DATASET), - // Stores configuration data, including: - // - What services should be launched on this sled - // - Information about how to initialize the Sled Agent - // - (For scrimlets) RSS setup information - // - // Should be duplicated to both M.2s. - ExpectedDataset::new(CONFIG_DATASET), - // Store debugging data, such as service bundles. - ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA), -]; - -impl Disk { - /// Create a new Disk - /// - /// WARNING: In all cases where a U.2 is a possible `DiskVariant`, a - /// `StorageKeyRequester` must be passed so that disk encryption can - /// be used. The `StorageManager` for the sled-agent always has a - /// `StorageKeyRequester` available, and so the only place we should pass - /// `None` is for the M.2s touched by the Installinator. 
- pub async fn new( +impl PooledDisk { + /// Create a new PooledDisk + pub fn new( log: &Logger, unparsed_disk: UnparsedDisk, - key_requester: Option<&StorageKeyRequester>, - ) -> Result { + ) -> Result { let paths = &unparsed_disk.paths; let variant = unparsed_disk.variant; // Ensure the GPT has the right format. This does not necessarily @@ -335,13 +210,8 @@ impl Disk { )?; let zpool_name = Self::ensure_zpool_exists(log, variant, &zpool_path)?; - Self::ensure_zpool_ready( - log, - &zpool_name, - &unparsed_disk.identity, - key_requester, - ) - .await?; + Self::ensure_zpool_imported(log, &zpool_name)?; + Self::ensure_zpool_failmode_is_continue(log, &zpool_name)?; Ok(Self { paths: unparsed_disk.paths, @@ -354,29 +224,11 @@ impl Disk { }) } - pub async fn ensure_zpool_ready( - log: &Logger, - zpool_name: &ZpoolName, - disk_identity: &DiskIdentity, - key_requester: Option<&StorageKeyRequester>, - ) -> Result<(), DiskError> { - Self::ensure_zpool_imported(log, &zpool_name)?; - Self::ensure_zpool_failmode_is_continue(log, &zpool_name)?; - Self::ensure_zpool_has_datasets( - log, - &zpool_name, - disk_identity, - key_requester, - ) - .await?; - Ok(()) - } - fn ensure_zpool_exists( log: &Logger, variant: DiskVariant, zpool_path: &Utf8Path, - ) -> Result { + ) -> Result { let zpool_name = match Fstyp::get_zpool(&zpool_path) { Ok(zpool_name) => zpool_name, Err(_) => { @@ -407,7 +259,7 @@ impl Disk { }; Zpool::import(zpool_name.clone()).map_err(|e| { warn!(log, "Failed to import zpool {zpool_name}: {e}"); - DiskError::ZpoolImport(e) + PooledDiskError::ZpoolImport(e) })?; Ok(zpool_name) @@ -416,10 +268,10 @@ impl Disk { fn ensure_zpool_imported( log: &Logger, zpool_name: &ZpoolName, - ) -> Result<(), DiskError> { + ) -> Result<(), PooledDiskError> { Zpool::import(zpool_name.clone()).map_err(|e| { warn!(log, "Failed to import zpool {zpool_name}: {e}"); - DiskError::ZpoolImport(e) + PooledDiskError::ZpoolImport(e) })?; Ok(()) } @@ -427,7 +279,7 @@ impl Disk { fn ensure_zpool_failmode_is_continue( log: &Logger, zpool_name: &ZpoolName, - ) -> Result<(), DiskError> { + ) -> Result<(), PooledDiskError> { // Ensure failmode is set to `continue`. See // https://github.com/oxidecomputer/omicron/issues/2766 for details. The // short version is, each pool is only backed by one vdev. There is no @@ -440,212 +292,10 @@ impl Disk { log, "Failed to set failmode=continue on zpool {zpool_name}: {e}" ); - DiskError::ZpoolImport(e) + PooledDiskError::ZpoolImport(e) })?; Ok(()) } - - // Ensure that the zpool contains all the datasets we would like it to - // contain. 
- async fn ensure_zpool_has_datasets( - log: &Logger, - zpool_name: &ZpoolName, - disk_identity: &DiskIdentity, - key_requester: Option<&StorageKeyRequester>, - ) -> Result<(), DiskError> { - let (root, datasets) = match zpool_name.kind().into() { - DiskVariant::M2 => (None, M2_EXPECTED_DATASETS.iter()), - DiskVariant::U2 => { - (Some(CRYPT_DATASET), U2_EXPECTED_DATASETS.iter()) - } - }; - - let zoned = false; - let do_format = true; - - // Ensure the root encrypted filesystem exists - // Datasets below this in the hierarchy will inherit encryption - if let Some(dataset) = root { - let Some(key_requester) = key_requester else { - return Err(DiskError::MissingStorageKeyRequester); - }; - let mountpoint = zpool_name.dataset_mountpoint(dataset); - let keypath: Keypath = disk_identity.into(); - - let epoch = - if let Ok(epoch_str) = Zfs::get_oxide_value(dataset, "epoch") { - if let Ok(epoch) = epoch_str.parse::() { - epoch - } else { - return Err(DiskError::CannotParseEpochProperty( - dataset.to_string(), - )); - } - } else { - // We got an error trying to call `Zfs::get_oxide_value` - // which indicates that the dataset doesn't exist or there - // was a problem running the command. - // - // Note that `Zfs::get_oxide_value` will succeed even if - // the epoch is missing. `epoch_str` will show up as a dash - // (`-`) and will not parse into a `u64`. So we don't have - // to worry about that case here as it is handled above. - // - // If the error indicated that the command failed for some - // other reason, but the dataset actually existed, we will - // try to create the dataset below and that will fail. So - // there is no harm in just loading the latest secret here. - key_requester.load_latest_secret().await? - }; - - let key = - key_requester.get_key(epoch, disk_identity.clone()).await?; - - let mut keyfile = - KeyFile::create(keypath.clone(), key.expose_secret(), log) - .await - .map_err(|error| DiskError::IoError { - path: keypath.0.clone(), - error, - })?; - - let encryption_details = EncryptionDetails { keypath, epoch }; - - info!( - log, - "Ensuring encrypted filesystem: {} for epoch {}", - dataset, - epoch - ); - let result = Zfs::ensure_filesystem( - &format!("{}/{}", zpool_name, dataset), - Mountpoint::Path(mountpoint), - zoned, - do_format, - Some(encryption_details), - None, - ); - - keyfile.zero_and_unlink().await.map_err(|error| { - DiskError::IoError { path: keyfile.path().0.clone(), error } - })?; - - result?; - }; - - for dataset in datasets.into_iter() { - let mountpoint = zpool_name.dataset_mountpoint(dataset.name); - let name = &format!("{}/{}", zpool_name, dataset.name); - - // Use a value that's alive for the duration of this sled agent - // to answer the question: should we wipe this disk, or have - // we seen it before? - // - // If this value comes from a prior iteration of the sled agent, - // we opt to remove the corresponding dataset. - static AGENT_LOCAL_VALUE: OnceLock = OnceLock::new(); - let agent_local_value = AGENT_LOCAL_VALUE.get_or_init(|| { - Alphanumeric.sample_string(&mut rand::thread_rng(), 20) - }); - - if dataset.wipe { - match Zfs::get_oxide_value(name, "agent") { - Ok(v) if &v == agent_local_value => { - info!( - log, - "Skipping automatic wipe for dataset: {}", name - ); - } - Ok(_) | Err(_) => { - info!( - log, - "Automatically destroying dataset: {}", name - ); - Zfs::destroy_dataset(name).or_else(|err| { - // If we can't find the dataset, that's fine -- it might - // not have been formatted yet. 
- if let DestroyDatasetErrorVariant::NotFound = - err.err - { - Ok(()) - } else { - Err(err) - } - })?; - } - } - } - - let encryption_details = None; - let size_details = Some(SizeDetails { - quota: dataset.quota, - compression: dataset.compression, - }); - Zfs::ensure_filesystem( - name, - Mountpoint::Path(mountpoint), - zoned, - do_format, - encryption_details, - size_details, - )?; - - if dataset.wipe { - Zfs::set_oxide_value(name, "agent", agent_local_value) - .map_err(|err| DiskError::CannotSetAgentProperty { - dataset: name.clone(), - err: Box::new(err), - })?; - } - } - Ok(()) - } - - pub fn is_boot_disk(&self) -> bool { - self.is_boot_disk - } - - pub fn identity(&self) -> &DiskIdentity { - &self.identity - } - - pub fn variant(&self) -> DiskVariant { - self.variant - } - - pub fn devfs_path(&self) -> &Utf8PathBuf { - &self.paths.devfs_path - } - - pub fn zpool_name(&self) -> &ZpoolName { - &self.zpool_name - } - - pub fn boot_image_devfs_path( - &self, - raw: bool, - ) -> Result { - self.paths.partition_device_path( - &self.partitions, - Partition::BootImage, - raw, - ) - } - - pub fn dump_device_devfs_path( - &self, - raw: bool, - ) -> Result { - self.paths.partition_device_path( - &self.partitions, - Partition::DumpDevice, - raw, - ) - } - - pub fn slot(&self) -> i64 { - self.slot - } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -664,56 +314,6 @@ impl From for DiskVariant { } } -/// A file that wraps a zfs encryption key. -/// -/// We put this in a RAM backed filesystem and zero and delete it when we are -/// done with it. Unfortunately we cannot do this inside `Drop` because there is no -/// equivalent async drop. -pub struct KeyFile { - path: Keypath, - file: File, - log: Logger, -} - -impl KeyFile { - pub async fn create( - path: Keypath, - key: &[u8; 32], - log: &Logger, - ) -> std::io::Result { - // TODO: fix this to not truncate - // We want to overwrite any existing contents. - // If we truncate we may leave dirty pages around - // containing secrets. - let mut file = tokio::fs::OpenOptions::new() - .create(true) - .write(true) - .open(&path.0) - .await?; - file.write_all(key).await?; - info!(log, "Created keyfile {}", path); - Ok(KeyFile { path, file, log: log.clone() }) - } - - /// These keyfiles live on a tmpfs and we zero the file so the data doesn't - /// linger on the page in memory. - /// - /// It'd be nice to `impl Drop for `KeyFile` and then call `zero` - /// from within the drop handler, but async `Drop` isn't supported. - pub async fn zero_and_unlink(&mut self) -> std::io::Result<()> { - let zeroes = [0u8; 32]; - let _ = self.file.seek(SeekFrom::Start(0)).await?; - self.file.write_all(&zeroes).await?; - info!(self.log, "Zeroed and unlinked keyfile {}", self.path); - remove_file(&self.path().0).await?; - Ok(()) - } - - pub fn path(&self) -> &Keypath { - &self.path - } -} - #[cfg(test)] mod test { use super::*; @@ -825,7 +425,7 @@ mod test { paths .partition_device_path(&[], Partition::ZfsPool, false) .expect_err("Should not have found partition"), - DiskError::NotFound { .. }, + PooledDiskError::NotFound { .. }, )); } } diff --git a/sled-hardware/src/illumos/partitions.rs b/sled-hardware/src/illumos/partitions.rs index 950074bd3a..ee745fc78b 100644 --- a/sled-hardware/src/illumos/partitions.rs +++ b/sled-hardware/src/illumos/partitions.rs @@ -5,7 +5,7 @@ //! illumos-specific mechanisms for parsing disk info. 
use crate::illumos::gpt; -use crate::{DiskError, DiskPaths, DiskVariant, Partition}; +use crate::{DiskPaths, DiskVariant, Partition, PooledDiskError}; use camino::Utf8Path; use illumos_utils::zpool::ZpoolName; use slog::info; @@ -41,9 +41,9 @@ fn parse_partition_types( path: &Utf8Path, partitions: &Vec, expected_partitions: &[Partition; N], -) -> Result, DiskError> { +) -> Result, PooledDiskError> { if partitions.len() != N { - return Err(DiskError::BadPartitionLayout { + return Err(PooledDiskError::BadPartitionLayout { path: path.to_path_buf(), why: format!( "Expected {} partitions, only saw {}", @@ -54,7 +54,7 @@ fn parse_partition_types( } for i in 0..N { if partitions[i].index() != i { - return Err(DiskError::BadPartitionLayout { + return Err(PooledDiskError::BadPartitionLayout { path: path.to_path_buf(), why: format!( "The {i}-th partition has index {}", @@ -80,7 +80,7 @@ pub fn ensure_partition_layout( log: &Logger, paths: &DiskPaths, variant: DiskVariant, -) -> Result, DiskError> { +) -> Result, PooledDiskError> { internal_ensure_partition_layout::(log, paths, variant) } @@ -90,7 +90,7 @@ fn internal_ensure_partition_layout( log: &Logger, paths: &DiskPaths, variant: DiskVariant, -) -> Result, DiskError> { +) -> Result, PooledDiskError> { // Open the "Whole Disk" as a raw device to be parsed by the // libefi-illumos library. This lets us peek at the GPT before // making too many assumptions about it. @@ -114,7 +114,9 @@ fn internal_ensure_partition_layout( let dev_path = if let Some(dev_path) = &paths.dev_path { dev_path } else { - return Err(DiskError::CannotFormatMissingDevPath { path }); + return Err(PooledDiskError::CannotFormatMissingDevPath { + path, + }); }; match variant { DiskVariant::U2 => { @@ -129,12 +131,12 @@ fn internal_ensure_partition_layout( // the expected partitions? Or would it be wiser to infer // that this indicates an unexpected error conditions that // needs mitigation? - return Err(DiskError::CannotFormatM2NotImplemented); + return Err(PooledDiskError::CannotFormatM2NotImplemented); } } } Err(err) => { - return Err(DiskError::Gpt { + return Err(PooledDiskError::Gpt { path, error: anyhow::Error::new(err), }); @@ -197,7 +199,7 @@ mod test { DiskVariant::U2, ); match result { - Err(DiskError::CannotFormatMissingDevPath { .. }) => {} + Err(PooledDiskError::CannotFormatMissingDevPath { .. }) => {} _ => panic!("Should have failed with a missing dev path error"), } @@ -373,7 +375,7 @@ mod test { DiskVariant::M2, ) .expect_err("Should have failed parsing empty GPT"), - DiskError::BadPartitionLayout { .. } + PooledDiskError::BadPartitionLayout { .. } )); logctx.cleanup_successful(); @@ -398,7 +400,7 @@ mod test { DiskVariant::U2, ) .expect_err("Should have failed parsing empty GPT"), - DiskError::BadPartitionLayout { .. } + PooledDiskError::BadPartitionLayout { .. 
} )); logctx.cleanup_successful(); diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index 03f0f608de..ae9718382d 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -14,6 +14,7 @@ key-manager.workspace = true # We could put this in the nexus-client instead nexus-client.workspace = true omicron-common.workspace = true +rand.workspace = true schemars = { workspace = true, features = [ "chrono", "uuid1" ] } serde.workspace = true serde_json.workspace = true @@ -25,3 +26,7 @@ slog.workspace = true thiserror.workspace = true tokio.workspace = true uuid.workspace = true + +[dev-dependencies] +illumos-utils = { workspace = true, features = ["testing"] } +omicron-test-utils.workspace = true diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index e521dd963a..3c40dc10f0 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -2,10 +2,112 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +//! ZFS dataset related functionality + +use crate::keyfile::KeyFile; +use camino::Utf8PathBuf; +use illumos_utils::zfs::{ + self, DestroyDatasetErrorVariant, EncryptionDetails, Keypath, Mountpoint, + SizeDetails, Zfs, +}; use illumos_utils::zpool::ZpoolName; +use key_manager::StorageKeyRequester; +use omicron_common::disk::DiskIdentity; +use rand::distributions::{Alphanumeric, DistString}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_hardware::DiskVariant; +use slog::{info, Logger}; use std::str::FromStr; +use std::sync::OnceLock; + +pub const INSTALL_DATASET: &'static str = "install"; +pub const CRASH_DATASET: &'static str = "crash"; +pub const CLUSTER_DATASET: &'static str = "cluster"; +pub const CONFIG_DATASET: &'static str = "config"; +pub const M2_DEBUG_DATASET: &'static str = "debug"; +// TODO-correctness: This value of 100GiB is a pretty wild guess, and should be +// tuned as needed. +pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30); +// ditto. +pub const DUMP_DATASET_QUOTA: usize = 100 * (1 << 30); +// passed to zfs create -o compression= +pub const DUMP_DATASET_COMPRESSION: &'static str = "gzip-9"; + +// U.2 datasets live under the encrypted dataset and inherit encryption +pub const ZONE_DATASET: &'static str = "crypt/zone"; +pub const DUMP_DATASET: &'static str = "crypt/debug"; +pub const U2_DEBUG_DATASET: &'static str = "crypt/debug"; + +// This is the root dataset for all U.2 drives. Encryption is inherited. +pub const CRYPT_DATASET: &'static str = "crypt"; + +const U2_EXPECTED_DATASET_COUNT: usize = 2; +static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [ + // Stores filesystems for zones + ExpectedDataset::new(ZONE_DATASET).wipe(), + // For storing full kernel RAM dumps + ExpectedDataset::new(DUMP_DATASET) + .quota(DUMP_DATASET_QUOTA) + .compression(DUMP_DATASET_COMPRESSION), +]; + +const M2_EXPECTED_DATASET_COUNT: usize = 5; +static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ + // Stores software images. + // + // Should be duplicated to both M.2s. + ExpectedDataset::new(INSTALL_DATASET), + // Stores crash dumps. + ExpectedDataset::new(CRASH_DATASET), + // Stores cluter configuration information. + // + // Should be duplicated to both M.2s. 
+ ExpectedDataset::new(CLUSTER_DATASET), + // Stores configuration data, including: + // - What services should be launched on this sled + // - Information about how to initialize the Sled Agent + // - (For scrimlets) RSS setup information + // + // Should be duplicated to both M.2s. + ExpectedDataset::new(CONFIG_DATASET), + // Store debugging data, such as service bundles. + ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA), +]; + +// Helper type for describing expected datasets and their optional quota. +#[derive(Clone, Copy, Debug)] +struct ExpectedDataset { + // Name for the dataset + name: &'static str, + // Optional quota, in _bytes_ + quota: Option, + // Identifies if the dataset should be deleted on boot + wipe: bool, + // Optional compression mode + compression: Option<&'static str>, +} + +impl ExpectedDataset { + const fn new(name: &'static str) -> Self { + ExpectedDataset { name, quota: None, wipe: false, compression: None } + } + + const fn quota(mut self, quota: usize) -> Self { + self.quota = Some(quota); + self + } + + const fn wipe(mut self) -> Self { + self.wipe = true; + self + } + + const fn compression(mut self, compression: &'static str) -> Self { + self.compression = Some(compression); + self + } +} /// The type of a dataset, and an auxiliary information necessary /// to successfully launch a zone managing the associated data. @@ -105,6 +207,178 @@ impl From for sled_agent_client::types::DatasetName { } } +#[derive(Debug, thiserror::Error)] +pub enum DatasetError { + #[error("Cannot open {path} due to {error}")] + IoError { path: Utf8PathBuf, error: std::io::Error }, + #[error(transparent)] + DestroyFilesystem(#[from] illumos_utils::zfs::DestroyDatasetError), + #[error(transparent)] + EnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), + #[error("KeyManager error: {0}")] + KeyManager(#[from] key_manager::Error), + #[error("Missing StorageKeyRequester when creating U.2 disk")] + MissingStorageKeyRequester, + #[error("Encrypted filesystem '{0}' missing 'oxide:epoch' property")] + CannotParseEpochProperty(String), + #[error("Encrypted dataset '{dataset}' cannot set 'oxide:agent' property: {err}")] + CannotSetAgentProperty { + dataset: String, + #[source] + err: Box, + }, +} + +/// Ensure that the zpool contains all the datasets we would like it to +/// contain. +/// +/// WARNING: In all cases where a U.2 is a possible `DiskVariant`, a +/// `StorageKeyRequester` must be passed so that disk encryption can +/// be used. The `StorageManager` for the sled-agent always has a +/// `StorageKeyRequester` available, and so the only place we should pass +/// `None` is for the M.2s touched by the Installinator. 
+pub async fn ensure_zpool_has_datasets( + log: &Logger, + zpool_name: &ZpoolName, + disk_identity: &DiskIdentity, + key_requester: Option<&StorageKeyRequester>, +) -> Result<(), DatasetError> { + let (root, datasets) = match zpool_name.kind().into() { + DiskVariant::M2 => (None, M2_EXPECTED_DATASETS.iter()), + DiskVariant::U2 => (Some(CRYPT_DATASET), U2_EXPECTED_DATASETS.iter()), + }; + + let zoned = false; + let do_format = true; + + // Ensure the root encrypted filesystem exists + // Datasets below this in the hierarchy will inherit encryption + if let Some(dataset) = root { + let Some(key_requester) = key_requester else { + return Err(DatasetError::MissingStorageKeyRequester); + }; + let mountpoint = zpool_name.dataset_mountpoint(dataset); + let keypath: Keypath = disk_identity.into(); + + let epoch = + if let Ok(epoch_str) = Zfs::get_oxide_value(dataset, "epoch") { + if let Ok(epoch) = epoch_str.parse::() { + epoch + } else { + return Err(DatasetError::CannotParseEpochProperty( + dataset.to_string(), + )); + } + } else { + // We got an error trying to call `Zfs::get_oxide_value` + // which indicates that the dataset doesn't exist or there + // was a problem running the command. + // + // Note that `Zfs::get_oxide_value` will succeed even if + // the epoch is missing. `epoch_str` will show up as a dash + // (`-`) and will not parse into a `u64`. So we don't have + // to worry about that case here as it is handled above. + // + // If the error indicated that the command failed for some + // other reason, but the dataset actually existed, we will + // try to create the dataset below and that will fail. So + // there is no harm in just loading the latest secret here. + key_requester.load_latest_secret().await? + }; + + let key = key_requester.get_key(epoch, disk_identity.clone()).await?; + + let mut keyfile = + KeyFile::create(keypath.clone(), key.expose_secret(), log) + .await + .map_err(|error| DatasetError::IoError { + path: keypath.0.clone(), + error, + })?; + + let encryption_details = EncryptionDetails { keypath, epoch }; + + info!( + log, + "Ensuring encrypted filesystem: {} for epoch {}", dataset, epoch + ); + let result = Zfs::ensure_filesystem( + &format!("{}/{}", zpool_name, dataset), + Mountpoint::Path(mountpoint), + zoned, + do_format, + Some(encryption_details), + None, + ); + + keyfile.zero_and_unlink().await.map_err(|error| { + DatasetError::IoError { path: keyfile.path().0.clone(), error } + })?; + + result?; + }; + + for dataset in datasets.into_iter() { + let mountpoint = zpool_name.dataset_mountpoint(dataset.name); + let name = &format!("{}/{}", zpool_name, dataset.name); + + // Use a value that's alive for the duration of this sled agent + // to answer the question: should we wipe this disk, or have + // we seen it before? + // + // If this value comes from a prior iteration of the sled agent, + // we opt to remove the corresponding dataset. + static AGENT_LOCAL_VALUE: OnceLock = OnceLock::new(); + let agent_local_value = AGENT_LOCAL_VALUE.get_or_init(|| { + Alphanumeric.sample_string(&mut rand::thread_rng(), 20) + }); + + if dataset.wipe { + match Zfs::get_oxide_value(name, "agent") { + Ok(v) if &v == agent_local_value => { + info!(log, "Skipping automatic wipe for dataset: {}", name); + } + Ok(_) | Err(_) => { + info!(log, "Automatically destroying dataset: {}", name); + Zfs::destroy_dataset(name).or_else(|err| { + // If we can't find the dataset, that's fine -- it might + // not have been formatted yet. 
+ if let DestroyDatasetErrorVariant::NotFound = err.err { + Ok(()) + } else { + Err(err) + } + })?; + } + } + } + + let encryption_details = None; + let size_details = Some(SizeDetails { + quota: dataset.quota, + compression: dataset.compression, + }); + Zfs::ensure_filesystem( + name, + Mountpoint::Path(mountpoint), + zoned, + do_format, + encryption_details, + size_details, + )?; + + if dataset.wipe { + Zfs::set_oxide_value(name, "agent", agent_local_value).map_err( + |err| DatasetError::CannotSetAgentProperty { + dataset: name.clone(), + err: Box::new(err), + }, + )?; + } + } + Ok(()) +} + #[cfg(test)] mod test { use super::*; diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs index aef68528bf..d7e02d8c97 100644 --- a/sled-storage/src/disk.rs +++ b/sled-storage/src/disk.rs @@ -6,8 +6,15 @@ use camino::Utf8PathBuf; use illumos_utils::zpool::{ZpoolKind, ZpoolName}; +use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; -use sled_hardware::{Disk, DiskVariant}; +use sled_hardware::{ + DiskPaths, DiskVariant, Partition, PooledDisk, PooledDiskError, + UnparsedDisk, +}; +use slog::Logger; + +use crate::dataset; /// A wrapper around real disks or synthetic disks backed by a file #[derive(Debug, PartialEq, Eq, Clone)] @@ -55,3 +62,105 @@ impl DiskWrapper { } } } + +#[derive(Debug, thiserror::Error)] +pub enum DiskError { + #[error(transparent)] + Dataset(#[from] crate::dataset::DatasetError), + #[error(transparent)] + PooledDisk(#[from] sled_hardware::PooledDiskError), +} + +/// A physical disk conforming to the expected partition layout +/// and which contains provisioned zpools and datasets. This disk +/// is ready for usage by higher level software. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Disk { + paths: DiskPaths, + slot: i64, + variant: DiskVariant, + identity: DiskIdentity, + is_boot_disk: bool, + partitions: Vec, + + // This embeds the assumtion that there is exactly one parsed zpool per + // disk. 
+ zpool_name: ZpoolName, +} + +impl Disk { + pub async fn new( + log: &Logger, + unparsed_disk: UnparsedDisk, + key_requester: Option<&StorageKeyRequester>, + ) -> Result { + let disk = PooledDisk::new(log, unparsed_disk)?; + dataset::ensure_zpool_has_datasets( + log, + &disk.zpool_name, + &disk.identity, + key_requester, + ) + .await?; + Ok(disk.into()) + } + pub fn is_boot_disk(&self) -> bool { + self.is_boot_disk + } + + pub fn identity(&self) -> &DiskIdentity { + &self.identity + } + + pub fn variant(&self) -> DiskVariant { + self.variant + } + + pub fn devfs_path(&self) -> &Utf8PathBuf { + &self.paths.devfs_path + } + + pub fn zpool_name(&self) -> &ZpoolName { + &self.zpool_name + } + + pub fn boot_image_devfs_path( + &self, + raw: bool, + ) -> Result { + self.paths.partition_device_path( + &self.partitions, + Partition::BootImage, + raw, + ) + } + + pub fn dump_device_devfs_path( + &self, + raw: bool, + ) -> Result { + self.paths.partition_device_path( + &self.partitions, + Partition::DumpDevice, + raw, + ) + } + + pub fn slot(&self) -> i64 { + self.slot + } +} + +impl From for Disk { + fn from(pd: PooledDisk) -> Self { + Self { + paths: pd.paths, + slot: pd.slot, + variant: pd.variant, + identity: pd.identity, + is_boot_disk: pd.is_boot_disk, + partitions: pd.partitions, + zpool_name: pd.zpool_name, + } + } +} diff --git a/sled-storage/src/dump_setup.rs b/sled-storage/src/dump_setup.rs index ea51251f84..5befa8e8c8 100644 --- a/sled-storage/src/dump_setup.rs +++ b/sled-storage/src/dump_setup.rs @@ -1,3 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Dump dataset setup + +use crate::dataset::{CRASH_DATASET, DUMP_DATASET}; use crate::disk::DiskWrapper; use camino::Utf8PathBuf; use derive_more::{AsRef, Deref, From}; @@ -70,11 +77,11 @@ trait GetMountpoint: std::ops::Deref { } impl GetMountpoint for DebugZpool { type NewType = DebugDataset; - const MOUNTPOINT: &'static str = sled_hardware::disk::DUMP_DATASET; + const MOUNTPOINT: &'static str = DUMP_DATASET; } impl GetMountpoint for CoreZpool { type NewType = CoreDataset; - const MOUNTPOINT: &'static str = sled_hardware::disk::CRASH_DATASET; + const MOUNTPOINT: &'static str = CRASH_DATASET; } struct DumpSetupWorker { diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs index d2a2a473b1..04c4f7ec07 100644 --- a/sled-storage/src/error.rs +++ b/sled-storage/src/error.rs @@ -12,7 +12,7 @@ use uuid::Uuid; #[derive(thiserror::Error, Debug)] pub enum Error { #[error(transparent)] - DiskError(#[from] sled_hardware::DiskError), + DiskError(#[from] sled_hardware::PooledDiskError), // TODO: We could add the context of "why are we doint this op", maybe? #[error(transparent)] diff --git a/sled-storage/src/keyfile.rs b/sled-storage/src/keyfile.rs new file mode 100644 index 0000000000..396c860fc5 --- /dev/null +++ b/sled-storage/src/keyfile.rs @@ -0,0 +1,68 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Key file support for ZFS dataset encryption + +use illumos_utils::zfs::Keypath; +use slog::{info, Logger}; +use tokio::fs::{remove_file, File}; +use tokio::io::{AsyncSeekExt, AsyncWriteExt, SeekFrom}; + +/// This path is intentionally on a `tmpfs` to prevent copy-on-write behavior +/// and to ensure it goes away on power off. +/// +/// We want minimize the time the key files are in memory, and so we rederive +/// the keys and recreate the files on demand when creating and mounting +/// encrypted filesystems. We then zero them and unlink them. +pub const KEYPATH_ROOT: &str = "/var/run/oxide/"; + +/// A file that wraps a zfs encryption key. +/// +/// We put this in a RAM backed filesystem and zero and delete it when we are +/// done with it. Unfortunately we cannot do this inside `Drop` because there is no +/// equivalent async drop. +pub struct KeyFile { + path: Keypath, + file: File, + log: Logger, +} + +impl KeyFile { + pub async fn create( + path: Keypath, + key: &[u8; 32], + log: &Logger, + ) -> std::io::Result { + // TODO: fix this to not truncate + // We want to overwrite any existing contents. + // If we truncate we may leave dirty pages around + // containing secrets. + let mut file = tokio::fs::OpenOptions::new() + .create(true) + .write(true) + .open(&path.0) + .await?; + file.write_all(key).await?; + info!(log, "Created keyfile {}", path); + Ok(KeyFile { path, file, log: log.clone() }) + } + + /// These keyfiles live on a tmpfs and we zero the file so the data doesn't + /// linger on the page in memory. + /// + /// It'd be nice to `impl Drop for `KeyFile` and then call `zero` + /// from within the drop handler, but async `Drop` isn't supported. + pub async fn zero_and_unlink(&mut self) -> std::io::Result<()> { + let zeroes = [0u8; 32]; + let _ = self.file.seek(SeekFrom::Start(0)).await?; + self.file.write_all(&zeroes).await?; + info!(self.log, "Zeroed and unlinked keyfile {}", self.path); + remove_file(&self.path().0).await?; + Ok(()) + } + + pub fn path(&self) -> &Keypath { + &self.path + } +} diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs index a1bd4eecfb..783eaf6642 100644 --- a/sled-storage/src/lib.rs +++ b/sled-storage/src/lib.rs @@ -3,10 +3,15 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Local storage abstraction for use by sled-agent +//! +//! This abstraction operates at the ZFS level and relies on zpool setup on +//! hardware partitions from the `sled-hardware` crate. It utilizes the +//! `illumos-utils` crate to actually perform ZFS related OS calls. pub(crate) mod dataset; pub(crate) mod disk; pub(crate) mod dump_setup; pub mod error; +pub(crate) mod keyfile; pub(crate) mod pool; pub mod state; diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs index 4a9960da4c..1abf43c1de 100644 --- a/sled-storage/src/pool.rs +++ b/sled-storage/src/pool.rs @@ -9,9 +9,9 @@ use illumos_utils::zpool::{ZpoolInfo, ZpoolName}; use omicron_common::disk::DiskIdentity; #[cfg(test)] -use illumos_utils::{zfs::MockZfs as Zfs, zpool::MockZpool as Zpool}; +use illumos_utils::zpool::MockZpool as Zpool; #[cfg(not(test))] -use illumos_utils::{zfs::Zfs, zpool::Zpool}; +use illumos_utils::zpool::Zpool; /// A ZFS storage pool #[derive(Debug, Clone)] @@ -25,12 +25,12 @@ impl Pool { /// Queries for an existing Zpool by name. /// /// Returns Ok if the pool exists. 
- fn new(name: ZpoolName, parent: DiskIdentity) -> Result { + pub fn new(name: ZpoolName, parent: DiskIdentity) -> Result { let info = Zpool::get_info(&name.to_string())?; Ok(Pool { name, info, parent }) } - fn parent(&self) -> &DiskIdentity { + pub fn parent(&self) -> &DiskIdentity { &self.parent } } diff --git a/sled-storage/src/state.rs b/sled-storage/src/state.rs index a7de70999e..8a0be34f63 100644 --- a/sled-storage/src/state.rs +++ b/sled-storage/src/state.rs @@ -4,6 +4,7 @@ //! The internal state of the storage manager task +use crate::dataset::M2_DEBUG_DATASET; use crate::disk::DiskWrapper; use crate::pool::Pool; use camino::Utf8PathBuf; @@ -110,7 +111,7 @@ impl State { /// Return the directories for storing zone service bundles. pub fn all_zone_bundle_directories(&self) -> Vec { - self.all_m2_mountpoints(sled_hardware::disk::M2_DEBUG_DATASET) + self.all_m2_mountpoints(M2_DEBUG_DATASET) .into_iter() .map(|p| p.join(BUNDLE_DIRECTORY).join(ZONE_BUNDLE_DIRECTORY)) .collect() From ab57c4671e5a0ffdf7f050eb29a7ba3afa4c5fcc Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 28 Sep 2023 22:51:06 +0000 Subject: [PATCH 04/66] wip --- common/src/disk.rs | 2 +- sled-storage/src/error.rs | 3 +- sled-storage/src/lib.rs | 3 +- sled-storage/src/manager.rs | 110 ++++++++++++++++++++ sled-storage/src/pool.rs | 10 +- sled-storage/src/{state.rs => resources.rs} | 30 ++++-- 6 files changed, 141 insertions(+), 17 deletions(-) create mode 100644 sled-storage/src/manager.rs rename sled-storage/src/{state.rs => resources.rs} (81%) diff --git a/common/src/disk.rs b/common/src/disk.rs index 3ea8091326..3ae9c31e01 100644 --- a/common/src/disk.rs +++ b/common/src/disk.rs @@ -5,7 +5,7 @@ //! Disk related types shared among crates /// Uniquely identifies a disk. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] pub struct DiskIdentity { pub vendor: String, pub serial: String, diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs index 04c4f7ec07..fbf721fab7 100644 --- a/sled-storage/src/error.rs +++ b/sled-storage/src/error.rs @@ -5,6 +5,7 @@ //! Storage related errors use crate::dataset::DatasetName; +use crate::disk::DiskError; use camino::Utf8PathBuf; use omicron_common::api::external::ByteCountRangeError; use uuid::Uuid; @@ -12,7 +13,7 @@ use uuid::Uuid; #[derive(thiserror::Error, Debug)] pub enum Error { #[error(transparent)] - DiskError(#[from] sled_hardware::PooledDiskError), + DiskError(#[from] DiskError), // TODO: We could add the context of "why are we doint this op", maybe? #[error(transparent)] diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs index 783eaf6642..f923165896 100644 --- a/sled-storage/src/lib.rs +++ b/sled-storage/src/lib.rs @@ -13,5 +13,6 @@ pub(crate) mod disk; pub(crate) mod dump_setup; pub mod error; pub(crate) mod keyfile; +pub mod manager; pub(crate) mod pool; -pub mod state; +pub mod resources; diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs new file mode 100644 index 0000000000..dbbe5fb57a --- /dev/null +++ b/sled-storage/src/manager.rs @@ -0,0 +1,110 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
The storage manager task + +use std::collections::{BTreeSet, HashSet}; + +use crate::dataset::DatasetError; +use crate::disk::{Disk, DiskError, DiskWrapper}; +use crate::error::Error; +use crate::resources::StorageResources; +use derive_more::From; +use illumos_utils::zpool::{ZpoolKind, ZpoolName}; +use key_manager::StorageKeyRequester; +use omicron_common::disk::DiskIdentity; +use sled_hardware::{DiskVariant, UnparsedDisk}; +use slog::{error, info, o, warn, Logger}; +use tokio::sync::{mpsc, oneshot}; + +// The size of the mpsc bounded channel used to communicate +// between the `StorageHandle` and `StorageManager`. +const QUEUE_SIZE: usize = 256; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum StorageManagerStage { + WaitingForBootDisk, + WaitingForKeyManager, + QueuingDisks, + Normal, +} + +enum StorageRequest {} + +/// A mechanism for interacting with the [`StorageManager`] +pub struct StorageHandle { + tx: mpsc::Sender, +} + +/// The storage manager responsible for the state of the storage +/// on a sled. The storage manager runs in its own task and is interacted +/// with via the [`StorageHandle`]. +pub struct StorageManager { + log: Logger, + stage: StorageManagerStage, + rx: mpsc::Receiver, + resources: StorageResources, + queued_u2_drives: HashSet, + queued_synthetic_u2_drives: BTreeSet, + key_requester: StorageKeyRequester, +} + +impl StorageManager { + pub fn new( + log: &Logger, + key_requester: StorageKeyRequester, + ) -> (StorageManager, StorageHandle) { + let (tx, rx) = mpsc::channel(QUEUE_SIZE); + ( + StorageManager { + log: log.new(o!("component" => "StorageManager")), + stage: StorageManagerStage::WaitingForBootDisk, + rx, + resources: StorageResources::default(), + queued_u2_drives: HashSet::new(), + queued_synthetic_u2_drives: BTreeSet::new(), + key_requester, + }, + StorageHandle { tx }, + ) + } + + /// Add a disk to storage resources or queue it to be added later + async fn add_u2_disk( + &mut self, + unparsed_disk: UnparsedDisk, + ) -> Result<(), Error> { + if self.stage != StorageManagerStage::Normal { + self.queued_u2_drives.insert(unparsed_disk); + return Ok(()); + } + + match Disk::new( + &self.log, + unparsed_disk.clone(), + Some(&self.key_requester), + ) + .await + { + Ok(disk) => self.resources.insert_real_disk(disk), + Err(err @ DiskError::Dataset(DatasetError::KeyManager(_))) => { + warn!( + self.log, + "Transient error: {err} - queuing disk {:?}", unparsed_disk + ); + self.queued_u2_drives.insert(unparsed_disk); + self.stage = StorageManagerStage::QueuingDisks; + Err(err.into()) + } + Err(err) => { + error!( + self.log, + "Persistent error: {err} - not queueing disk {:?}", + unparsed_disk + ); + Err(err.into()) + } + } + } +} diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs index 1abf43c1de..a16722537d 100644 --- a/sled-storage/src/pool.rs +++ b/sled-storage/src/pool.rs @@ -16,9 +16,9 @@ use illumos_utils::zpool::Zpool; /// A ZFS storage pool #[derive(Debug, Clone)] pub struct Pool { - name: ZpoolName, - info: ZpoolInfo, - parent: DiskIdentity, + pub name: ZpoolName, + pub info: ZpoolInfo, + pub parent: DiskIdentity, } impl Pool { @@ -29,8 +29,4 @@ impl Pool { let info = Zpool::get_info(&name.to_string())?; Ok(Pool { name, info, parent }) } - - pub fn parent(&self) -> &DiskIdentity { - &self.parent - } } diff --git a/sled-storage/src/state.rs b/sled-storage/src/resources.rs similarity index 81% rename from sled-storage/src/state.rs rename to sled-storage/src/resources.rs index 8a0be34f63..0e874be522 100644 --- a/sled-storage/src/state.rs 
+++ b/sled-storage/src/resources.rs @@ -2,13 +2,15 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! The internal state of the storage manager task +//! Discovered and usable disks and zpools use crate::dataset::M2_DEBUG_DATASET; -use crate::disk::DiskWrapper; +use crate::disk::{Disk, DiskWrapper}; +use crate::error::Error; use crate::pool::Pool; use camino::Utf8PathBuf; use illumos_utils::zpool::ZpoolName; +use omicron_common::api::external::{ByteCount, ByteCountRangeError}; use omicron_common::disk::DiskIdentity; use sled_hardware::DiskVariant; use std::collections::BTreeMap; @@ -21,7 +23,7 @@ const BUNDLE_DIRECTORY: &str = "bundle"; // The directory for zone bundles. const ZONE_BUNDLE_DIRECTORY: &str = "zone"; -/// Storage related state +/// Storage related resources: disks and zpools /// /// This state is internal to the [`crate::StorageManager`] task. Clones /// of this state, or subsets of it, can be retrieved by requests to the @@ -34,10 +36,10 @@ const ZONE_BUNDLE_DIRECTORY: &str = "zone"; /// inside the `StorageManager` task if there are any outstanding copies. /// Therefore, we only pay the cost to update infrequently, and no locks are /// required by callers when operating on cloned data. The only contention here -/// is for the refrence counters of the internal Arcs when `State` gets cloned +/// is for the refrence counters of the internal Arcs when `StorageResources` gets cloned /// or dropped. -#[derive(Debug, Clone)] -pub struct State { +#[derive(Debug, Clone, Default)] +pub struct StorageResources { // All disks, real and synthetic, being managed by this sled disks: Arc>, @@ -45,7 +47,21 @@ pub struct State { pools: Arc>, } -impl State { +impl StorageResources { + /// Insert a disk and its zpool + pub(crate) fn insert_real_disk(&mut self, disk: Disk) -> Result<(), Error> { + let parent = disk.identity().clone(); + let zpool_name = disk.zpool_name().clone(); + let disk = DiskWrapper::Real { + disk: disk.clone(), + devfs_path: disk.devfs_path().clone(), + }; + Arc::make_mut(&mut self.disks).insert(disk.identity(), disk); + let zpool = Pool::new(zpool_name, parent)?; + Arc::make_mut(&mut self.pools).insert(zpool.name.id(), zpool); + Ok(()) + } + /// Returns the identity of the boot disk. /// /// If this returns `None`, we have not processed the boot disk yet. From 90ec972e636df289d39bab52eff57c09415300fa Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 29 Sep 2023 05:11:00 +0000 Subject: [PATCH 05/66] wip --- sled-storage/src/manager.rs | 129 ++++++++++++++++++++++++++++++++-- sled-storage/src/resources.rs | 13 ++++ 2 files changed, 138 insertions(+), 4 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index dbbe5fb57a..c792fde243 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -6,7 +6,7 @@ use std::collections::{BTreeSet, HashSet}; -use crate::dataset::DatasetError; +use crate::dataset::{self, DatasetError}; use crate::disk::{Disk, DiskError, DiskWrapper}; use crate::error::Error; use crate::resources::StorageResources; @@ -46,7 +46,7 @@ pub struct StorageManager { rx: mpsc::Receiver, resources: StorageResources, queued_u2_drives: HashSet, - queued_synthetic_u2_drives: BTreeSet, + queued_synthetic_u2_drives: HashSet, key_requester: StorageKeyRequester, } @@ -63,14 +63,14 @@ impl StorageManager { rx, resources: StorageResources::default(), queued_u2_drives: HashSet::new(), - queued_synthetic_u2_drives: BTreeSet::new(), + queued_synthetic_u2_drives: HashSet::new(), key_requester, }, StorageHandle { tx }, ) } - /// Add a disk to storage resources or queue it to be added later + /// Add a real U.2 disk to storage resources or queue it to be added later async fn add_u2_disk( &mut self, unparsed_disk: UnparsedDisk, @@ -107,4 +107,125 @@ impl StorageManager { } } } + + /// Add a synthetic U.2 disk to storage resources or queue it to be added later + async fn add_synthetic_u2_disk( + &mut self, + zpool_name: ZpoolName, + ) -> Result<(), Error> { + if self.stage != StorageManagerStage::Normal { + self.queued_synthetic_u2_drives.insert(zpool_name); + return Ok(()); + } + + let synthetic_id = DiskIdentity { + vendor: "fake_vendor".to_string(), + serial: "fake_serial".to_string(), + model: zpool_name.id().to_string(), + }; + match dataset::ensure_zpool_has_datasets( + &self.log, + &zpool_name, + &synthetic_id, + Some(&self.key_requester), + ) + .await + { + Ok(disk) => self.resources.insert_synthetic_disk(zpool_name), + Err(err @ DatasetError::KeyManager(_)) => { + warn!( + self.log, + "Transient error: {err} - queuing disk {:?}", synthetic_id + ); + self.queued_synthetic_u2_drives.insert(zpool_name); + self.stage = StorageManagerStage::QueuingDisks; + Err(DiskError::Dataset(err).into()) + } + Err(err) => { + error!( + self.log, + "Persistent error: {err} - not queueing disk {:?}", + synthetic_id + ); + Err(DiskError::Dataset(err).into()) + } + } + } +} + +/// All tests only use synthetic disks, but are expected to be run on illumos +/// systems. 
+#[cfg(all(test, target_os = "illumos"))] +mod tests { + use super::*; + use async_trait::async_trait; + use key_manager::{ + KeyManager, SecretRetriever, SecretRetrieverError, SecretState, + VersionedIkm, + }; + use uuid::Uuid; + + pub fn log() -> slog::Logger { + let drain = slog::Discard; + slog::Logger::root(drain, o!()) + } + + /// A [`key-manager::SecretRetriever`] that only returns hardcoded IKM for + /// epoch 0 + #[derive(Debug)] + struct HardcodedSecretRetriever {} + + #[async_trait] + impl SecretRetriever for HardcodedSecretRetriever { + async fn get_latest( + &self, + ) -> Result { + let epoch = 0; + let salt = [0u8; 32]; + let secret = [0x1d; 32]; + + Ok(VersionedIkm::new(epoch, salt, &secret)) + } + + /// We don't plan to do any key rotation before trust quorum is ready + async fn get( + &self, + epoch: u64, + ) -> Result { + if epoch != 0 { + return Err(SecretRetrieverError::NoSuchEpoch(epoch)); + } + Ok(SecretState::Current(self.get_latest().await?)) + } + } + + #[tokio::test] + async fn add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued() { + let (mut _key_manager, key_requester) = + KeyManager::new(&log(), HardcodedSecretRetriever {}); + let (mut manager, _) = StorageManager::new(&log(), key_requester); + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + assert_eq!(StorageManagerStage::WaitingForBootDisk, manager.stage); + manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); + assert!(manager.resources.all_u2_zpools().is_empty()); + assert_eq!( + manager.queued_synthetic_u2_drives, + HashSet::from([zpool_name.clone()]) + ); + + // Walk through other non-normal stages and enusre disk gets queued + for stage in [ + StorageManagerStage::WaitingForKeyManager, + StorageManagerStage::QueuingDisks, + ] { + manager.queued_synthetic_u2_drives.clear(); + manager.stage = stage; + manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); + assert!(manager.resources.all_u2_zpools().is_empty()); + assert_eq!( + manager.queued_synthetic_u2_drives, + HashSet::from([zpool_name.clone()]) + ); + } + } } diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 0e874be522..7601ac7b86 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -62,6 +62,19 @@ impl StorageResources { Ok(()) } + /// Insert a synthetic disk and its zpool + pub(crate) fn insert_synthetic_disk( + &mut self, + zpool_name: ZpoolName, + ) -> Result<(), Error> { + let disk = DiskWrapper::Synthetic { zpool_name: zpool_name.clone() }; + let parent = disk.identity().clone(); + Arc::make_mut(&mut self.disks).insert(disk.identity(), disk); + let zpool = Pool::new(zpool_name, parent)?; + Arc::make_mut(&mut self.pools).insert(zpool.name.id(), zpool); + Ok(()) + } + /// Returns the identity of the boot disk. /// /// If this returns `None`, we have not processed the boot disk yet. From 5c169ac81ad497f51006b6b3bf9f9fa66999f483 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 29 Sep 2023 05:18:29 +0000 Subject: [PATCH 06/66] wip --- sled-storage/Cargo.toml | 1 - sled-storage/src/pool.rs | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index ae9718382d..8c8ddeeb88 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -28,5 +28,4 @@ tokio.workspace = true uuid.workspace = true [dev-dependencies] -illumos-utils = { workspace = true, features = ["testing"] } omicron-test-utils.workspace = true diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs index a16722537d..203738b16a 100644 --- a/sled-storage/src/pool.rs +++ b/sled-storage/src/pool.rs @@ -5,14 +5,9 @@ //! ZFS storage pool use crate::error::Error; -use illumos_utils::zpool::{ZpoolInfo, ZpoolName}; +use illumos_utils::zpool::{Zpool, ZpoolInfo, ZpoolName}; use omicron_common::disk::DiskIdentity; -#[cfg(test)] -use illumos_utils::zpool::MockZpool as Zpool; -#[cfg(not(test))] -use illumos_utils::zpool::Zpool; - /// A ZFS storage pool #[derive(Debug, Clone)] pub struct Pool { From 255155ccfbbf165072b996e4080b428e29e844e5 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 29 Sep 2023 21:13:10 +0000 Subject: [PATCH 07/66] wip --- Cargo.lock | 1 + illumos-utils/src/zpool.rs | 24 +++++++++- sled-hardware/src/disk.rs | 6 +-- sled-hardware/src/illumos/partitions.rs | 2 +- sled-storage/Cargo.toml | 1 + sled-storage/src/dataset.rs | 54 +++++++++++++---------- sled-storage/src/manager.rs | 58 +++++++++++++++++++++---- 7 files changed, 107 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a448600863..26358b3459 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5271,6 +5271,7 @@ version = "0.1.0" dependencies = [ "async-trait", "camino", + "camino-tempfile", "derive_more", "glob", "illumos-utils", diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index 81ded2655e..68d5ebd3a2 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -39,6 +39,13 @@ pub struct CreateError { err: Error, } +#[derive(thiserror::Error, Debug)] +#[error("Failed to destroy zpool: {err}")] +pub struct DestroyError { + #[from] + err: Error, +} + #[derive(thiserror::Error, Debug)] #[error("Failed to list zpools: {err}")] pub struct ListError { @@ -167,7 +174,10 @@ pub struct Zpool {} #[cfg_attr(any(test, feature = "testing"), mockall::automock, allow(dead_code))] impl Zpool { - pub fn create(name: ZpoolName, vdev: &Utf8Path) -> Result<(), CreateError> { + pub fn create( + name: &ZpoolName, + vdev: &Utf8Path, + ) -> Result<(), CreateError> { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear(); cmd.env("LC_ALL", "C.UTF-8"); @@ -189,7 +199,17 @@ impl Zpool { Ok(()) } - pub fn import(name: ZpoolName) -> Result<(), Error> { + pub fn destroy(name: &ZpoolName) -> Result<(), DestroyError> { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear(); + cmd.env("LC_ALL", "C.UTF-8"); + cmd.arg(ZPOOL).arg("destroy"); + cmd.arg(&name.to_string()); + execute(&mut cmd).map_err(Error::from)?; + Ok(()) + } + + pub fn import(name: &ZpoolName) -> Result<(), Error> { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear(); cmd.env("LC_ALL", "C.UTF-8"); diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index bea7e23c73..541d7bd548 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -253,11 +253,11 @@ impl PooledDisk { DiskVariant::M2 => ZpoolName::new_internal(Uuid::new_v4()), DiskVariant::U2 => 
ZpoolName::new_external(Uuid::new_v4()), }; - Zpool::create(zpool_name.clone(), &zpool_path)?; + Zpool::create(&zpool_name, &zpool_path)?; zpool_name } }; - Zpool::import(zpool_name.clone()).map_err(|e| { + Zpool::import(&zpool_name).map_err(|e| { warn!(log, "Failed to import zpool {zpool_name}: {e}"); PooledDiskError::ZpoolImport(e) })?; @@ -269,7 +269,7 @@ impl PooledDisk { log: &Logger, zpool_name: &ZpoolName, ) -> Result<(), PooledDiskError> { - Zpool::import(zpool_name.clone()).map_err(|e| { + Zpool::import(&zpool_name).map_err(|e| { warn!(log, "Failed to import zpool {zpool_name}: {e}"); PooledDiskError::ZpoolImport(e) })?; diff --git a/sled-hardware/src/illumos/partitions.rs b/sled-hardware/src/illumos/partitions.rs index ee745fc78b..4b7e69057d 100644 --- a/sled-hardware/src/illumos/partitions.rs +++ b/sled-hardware/src/illumos/partitions.rs @@ -123,7 +123,7 @@ fn internal_ensure_partition_layout( info!(log, "Formatting zpool on disk {}", paths.devfs_path); // If a zpool does not already exist, create one. let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - Zpool::create(zpool_name, dev_path)?; + Zpool::create(&zpool_name, dev_path)?; return Ok(vec![Partition::ZfsPool]); } DiskVariant::M2 => { diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index 8c8ddeeb88..e1ba21db93 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -29,3 +29,4 @@ uuid.workspace = true [dev-dependencies] omicron-test-utils.workspace = true +camino-tempfile.workspace = true \ No newline at end of file diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 3c40dc10f0..99df582371 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -260,33 +260,39 @@ pub async fn ensure_zpool_has_datasets( let mountpoint = zpool_name.dataset_mountpoint(dataset); let keypath: Keypath = disk_identity.into(); - let epoch = - if let Ok(epoch_str) = Zfs::get_oxide_value(dataset, "epoch") { - if let Ok(epoch) = epoch_str.parse::() { - epoch - } else { - return Err(DatasetError::CannotParseEpochProperty( - dataset.to_string(), - )); - } + let epoch = if let Ok(epoch_str) = + Zfs::get_oxide_value(dataset, "epoch") + { + if let Ok(epoch) = epoch_str.parse::() { + epoch } else { - // We got an error trying to call `Zfs::get_oxide_value` - // which indicates that the dataset doesn't exist or there - // was a problem running the command. - // - // Note that `Zfs::get_oxide_value` will succeed even if - // the epoch is missing. `epoch_str` will show up as a dash - // (`-`) and will not parse into a `u64`. So we don't have - // to worry about that case here as it is handled above. - // - // If the error indicated that the command failed for some - // other reason, but the dataset actually existed, we will - // try to create the dataset below and that will fail. So - // there is no harm in just loading the latest secret here. - key_requester.load_latest_secret().await? - }; + return Err(DatasetError::CannotParseEpochProperty( + dataset.to_string(), + )); + } + } else { + // We got an error trying to call `Zfs::get_oxide_value` + // which indicates that the dataset doesn't exist or there + // was a problem running the command. + // + // Note that `Zfs::get_oxide_value` will succeed even if + // the epoch is missing. `epoch_str` will show up as a dash + // (`-`) and will not parse into a `u64`. So we don't have + // to worry about that case here as it is handled above. 
+ // + // If the error indicated that the command failed for some + // other reason, but the dataset actually existed, we will + // try to create the dataset below and that will fail. So + // there is no harm in just loading the latest secret here. + info!(log, "Loading latest secret"; "disk_id"=>#?disk_identity); + let epoch = key_requester.load_latest_secret().await?; + info!(log, "Loaded latest secret"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); + epoch + }; + info!(log, "Retrieving key"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); let key = key_requester.get_key(epoch, disk_identity.clone()).await?; + info!(log, "Got key"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); let mut keyfile = KeyFile::create(keypath.clone(), key.expose_secret(), log) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index c792fde243..543e47a9b7 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -15,7 +15,7 @@ use illumos_utils::zpool::{ZpoolKind, ZpoolName}; use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; use sled_hardware::{DiskVariant, UnparsedDisk}; -use slog::{error, info, o, warn, Logger}; +use slog::{debug, error, info, o, warn, Logger}; use tokio::sync::{mpsc, oneshot}; // The size of the mpsc bounded channel used to communicate @@ -114,6 +114,7 @@ impl StorageManager { zpool_name: ZpoolName, ) -> Result<(), Error> { if self.stage != StorageManagerStage::Normal { + info!(self.log, "Queuing synthetic U.2 drive: {zpool_name}"); self.queued_synthetic_u2_drives.insert(zpool_name); return Ok(()); } @@ -123,6 +124,8 @@ impl StorageManager { serial: "fake_serial".to_string(), model: zpool_name.id().to_string(), }; + + debug!(self.log, "Ensure zpool has datasets: {zpool_name}"); match dataset::ensure_zpool_has_datasets( &self.log, &zpool_name, @@ -131,7 +134,7 @@ impl StorageManager { ) .await { - Ok(disk) => self.resources.insert_synthetic_disk(zpool_name), + Ok(()) => self.resources.insert_synthetic_disk(zpool_name), Err(err @ DatasetError::KeyManager(_)) => { warn!( self.log, @@ -159,17 +162,17 @@ impl StorageManager { mod tests { use super::*; use async_trait::async_trait; + use camino::{Utf8Path, Utf8PathBuf}; + use camino_tempfile::tempdir; + use illumos_utils::zpool::Zpool; use key_manager::{ KeyManager, SecretRetriever, SecretRetrieverError, SecretState, VersionedIkm, }; + use omicron_test_utils::dev::test_setup_log; + use std::fs::File; use uuid::Uuid; - pub fn log() -> slog::Logger { - let drain = slog::Discard; - slog::Logger::root(drain, o!()) - } - /// A [`key-manager::SecretRetriever`] that only returns hardcoded IKM for /// epoch 0 #[derive(Debug)] @@ -199,11 +202,29 @@ mod tests { } } + // 64 MiB (min size of zpool) + const DISK_SIZE: u64 = 64 * 1024 * 1024; + + // Create a synthetic disk with a zpool backed by a file + fn new_disk(dir: &Utf8Path, zpool_name: &ZpoolName) -> Utf8PathBuf { + let path = dir.join(zpool_name.to_string()); + let file = File::create(&path).unwrap(); + file.set_len(DISK_SIZE).unwrap(); + drop(file); + Zpool::create(zpool_name, &path).unwrap(); + Zpool::import(zpool_name).unwrap(); + Zpool::set_failmode_continue(zpool_name).unwrap(); + path + } + #[tokio::test] async fn add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued() { + let logctx = test_setup_log( + "add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued", + ); let (mut _key_manager, key_requester) = - KeyManager::new(&log(), HardcodedSecretRetriever {}); - let (mut manager, _) = StorageManager::new(&log(), 
key_requester); + KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); + let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); let zpool_name = ZpoolName::new_external(Uuid::new_v4()); assert_eq!(StorageManagerStage::WaitingForBootDisk, manager.stage); manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); @@ -227,5 +248,24 @@ mod tests { HashSet::from([zpool_name.clone()]) ); } + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn ensure_u2_gets_added_to_resources() { + let logctx = test_setup_log("ensure_u2_gets_added_to_resources"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); + let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let _ = new_disk(dir.path(), &zpool_name); + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + manager.stage = StorageManagerStage::Normal; + manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); + assert_eq!(manager.resources.all_u2_zpools().len(), 1); + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); } } From 115510a8d094752c305ddba818c0b3acb919b2a9 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 29 Sep 2023 22:14:59 +0000 Subject: [PATCH 08/66] wip --- illumos-utils/Cargo.toml | 3 +++ illumos-utils/src/zfs.rs | 35 +++++++++++++++++++++++++++-------- sled-storage/Cargo.toml | 1 + sled-storage/src/keyfile.rs | 8 -------- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/illumos-utils/Cargo.toml b/illumos-utils/Cargo.toml index 3c0c2e7fc9..573d0be14b 100644 --- a/illumos-utils/Cargo.toml +++ b/illumos-utils/Cargo.toml @@ -42,3 +42,6 @@ toml.workspace = true [features] # Enable to generate MockZones testing = ["mockall"] +# Useful for tests that want real functionality and ability to run without +# pfexec +tmp_keypath = [] diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index ba8cd8c84a..382c01f9aa 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -20,7 +20,16 @@ pub const ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT: &str = "/zone"; pub const ZONE_ZFS_RAMDISK_DATASET: &str = "rpool/zone"; pub const ZFS: &str = "/usr/sbin/zfs"; + +/// This path is intentionally on a `tmpfs` to prevent copy-on-write behavior +/// and to ensure it goes away on power off. +/// +/// We want minimize the time the key files are in memory, and so we rederive +/// the keys and recreate the files on demand when creating and mounting +/// encrypted filesystems. We then zero them and unlink them. pub const KEYPATH_ROOT: &str = "/var/run/oxide/"; +// Use /tmp so we don't have to worry about running tests with pfexec +pub const TEST_KEYPATH_ROOT: &str = "/tmp"; /// Error returned by [`Zfs::list_datasets`]. 
#[derive(thiserror::Error, Debug)] @@ -135,19 +144,29 @@ impl fmt::Display for Keypath { } } +#[cfg(not(feature = "tmp_keypath"))] +impl From<&DiskIdentity> for Keypath { + fn from(id: &DiskIdentity) -> Self { + build_keypath(id, KEYPATH_ROOT) + } +} + +#[cfg(feature = "tmp_keypath")] impl From<&DiskIdentity> for Keypath { fn from(id: &DiskIdentity) -> Self { - let filename = format!( - "{}-{}-{}-zfs-aes-256-gcm.key", - id.vendor, id.serial, id.model - ); - let mut path = Utf8PathBuf::new(); - path.push(KEYPATH_ROOT); - path.push(filename); - Keypath(path) + build_keypath(id, TEST_KEYPATH_ROOT) } } +fn build_keypath(id: &DiskIdentity, root: &str) -> Keypath { + let filename = + format!("{}-{}-{}-zfs-aes-256-gcm.key", id.vendor, id.serial, id.model); + let mut path = Utf8PathBuf::new(); + path.push(root); + path.push(filename); + Keypath(path) +} + #[derive(Debug)] pub struct EncryptionDetails { pub keypath: Keypath, diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index e1ba21db93..11bd502183 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -28,5 +28,6 @@ tokio.workspace = true uuid.workspace = true [dev-dependencies] +illumos-utils = { workspace = true, features = ["tmp_keypath"] } omicron-test-utils.workspace = true camino-tempfile.workspace = true \ No newline at end of file diff --git a/sled-storage/src/keyfile.rs b/sled-storage/src/keyfile.rs index 396c860fc5..fcdbf8b3bf 100644 --- a/sled-storage/src/keyfile.rs +++ b/sled-storage/src/keyfile.rs @@ -9,14 +9,6 @@ use slog::{info, Logger}; use tokio::fs::{remove_file, File}; use tokio::io::{AsyncSeekExt, AsyncWriteExt, SeekFrom}; -/// This path is intentionally on a `tmpfs` to prevent copy-on-write behavior -/// and to ensure it goes away on power off. -/// -/// We want minimize the time the key files are in memory, and so we rederive -/// the keys and recreate the files on demand when creating and mounting -/// encrypted filesystems. We then zero them and unlink them. -pub const KEYPATH_ROOT: &str = "/var/run/oxide/"; - /// A file that wraps a zfs encryption key. /// /// We put this in a RAM backed filesystem and zero and delete it when we are From 0bc3aa0106d5cb5068547c2379c52465bd976044 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Mon, 2 Oct 2023 02:21:58 +0000 Subject: [PATCH 09/66] wip --- sled-storage/src/manager.rs | 66 +++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 543e47a9b7..813f552878 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -16,7 +16,7 @@ use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; use sled_hardware::{DiskVariant, UnparsedDisk}; use slog::{debug, error, info, o, warn, Logger}; -use tokio::sync::{mpsc, oneshot}; +use tokio::sync::{mpsc, oneshot, watch}; // The size of the mpsc bounded channel used to communicate // between the `StorageHandle` and `StorageManager`. 
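The hunks in this patch connect the `StorageManager` task to its clients through a bounded `mpsc` request channel plus a `watch` channel that publishes cloned `StorageResources` snapshots. As a point of reference only, the following is a minimal, self-contained sketch of that same pattern; it is not part of the patch, and the `Request`/`Resources` types, the queue depth, and the `main` wiring are illustrative stand-ins for the crate's real definitions (assumes tokio with its full feature set).

use tokio::sync::{mpsc, watch};

#[derive(Debug)]
enum Request {
    Add(u32),
}

#[derive(Clone, Debug, Default)]
struct Resources {
    disks: Vec<u32>,
}

struct Handle {
    tx: mpsc::Sender<Request>,
    updates: watch::Receiver<Resources>,
}

struct Manager {
    rx: mpsc::Receiver<Request>,
    resources: Resources,
    updates: watch::Sender<Resources>,
}

impl Manager {
    fn new() -> (Manager, Handle) {
        // A bounded channel applies backpressure to callers of the handle.
        let (tx, rx) = mpsc::channel(256);
        // The watch channel lets any number of clients observe the latest
        // cloned snapshot without querying the manager task directly.
        let (update_tx, update_rx) = watch::channel(Resources::default());
        (
            Manager { rx, resources: Resources::default(), updates: update_tx },
            Handle { tx, updates: update_rx },
        )
    }

    async fn run(mut self) {
        while let Some(req) = self.rx.recv().await {
            match req {
                Request::Add(disk) => self.resources.disks.push(disk),
            }
            // Publish the new snapshot; receivers pick it up via `changed()`.
            let _ = self.updates.send(self.resources.clone());
        }
    }
}

#[tokio::main]
async fn main() {
    let (manager, handle) = Manager::new();
    tokio::spawn(manager.run());

    let Handle { tx, mut updates } = handle;
    tx.send(Request::Add(1)).await.unwrap();
    updates.changed().await.unwrap();
    println!("disks: {:?}", updates.borrow().disks);
}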
@@ -30,11 +30,19 @@ pub enum StorageManagerStage { Normal, } -enum StorageRequest {} +enum StorageRequest { + AddDisk(UnparsedDisk), + AddSyntheticDisk(ZpoolName), + RemoveDisk(UnparsedDisk), + DisksChanged(Vec), + // NewFilesystem(NewFilesystemRequest), + KeyManagerReady, +} /// A mechanism for interacting with the [`StorageManager`] pub struct StorageHandle { tx: mpsc::Sender, + resource_updates: watch::Receiver, } /// The storage manager responsible for the state of the storage @@ -48,6 +56,7 @@ pub struct StorageManager { queued_u2_drives: HashSet, queued_synthetic_u2_drives: HashSet, key_requester: StorageKeyRequester, + resource_updates: watch::Sender, } impl StorageManager { @@ -56,20 +65,62 @@ impl StorageManager { key_requester: StorageKeyRequester, ) -> (StorageManager, StorageHandle) { let (tx, rx) = mpsc::channel(QUEUE_SIZE); + let resources = StorageResources::default(); + let (update_tx, update_rx) = watch::channel(resources.clone()); ( StorageManager { log: log.new(o!("component" => "StorageManager")), stage: StorageManagerStage::WaitingForBootDisk, rx, - resources: StorageResources::default(), + resources, queued_u2_drives: HashSet::new(), queued_synthetic_u2_drives: HashSet::new(), key_requester, + resource_updates: update_tx, }, - StorageHandle { tx }, + StorageHandle { tx, resource_updates: update_rx }, ) } + /// Run the main receive loop of the `StorageManager` + /// + /// This should be spawned into a tokio task + pub async fn run(&mut self) { + loop { + if let Err(e) = self.step().await { + warn!(self.log, "{e}"); + return; + } + } + } + + /// Process the next event + /// + /// This is useful for testing/debugging + pub async fn step(&mut self) -> Result<(), Error> { + // The sending side should never disappear + match self.rx.recv().await.unwrap() { + StorageRequest::AddDisk(unparsed_disk) => { + match unparsed_disk.variant() { + DiskVariant::U2 => self.add_u2_disk(unparsed_disk).await?, + DiskVariant::M2 => todo!(), + } + } + StorageRequest::AddSyntheticDisk(zpool_name) => { + match zpool_name.kind() { + ZpoolKind::External => { + self.add_synthetic_u2_disk(zpool_name).await? + } + ZpoolKind::Internal => todo!(), + } + } + StorageRequest::RemoveDisk(_unparsed_disk) => todo!(), + StorageRequest::DisksChanged(_unparsed_disks) => todo!(), + StorageRequest::KeyManagerReady => todo!(), + } + Ok(()) + } + /// Add a real U.2 disk to storage resources or queue it to be added later async fn add_u2_disk( &mut self, @@ -142,7 +193,7 @@ impl StorageManager { ); self.queued_synthetic_u2_drives.insert(zpool_name); self.stage = StorageManagerStage::QueuingDisks; - Err(DiskError::Dataset(err).into()) + Ok(()) } Err(err) => { error!( @@ -150,7 +201,7 @@ impl StorageManager { "Persistent error: {err} - not queueing disk {:?}", synthetic_id ); - Err(DiskError::Dataset(err).into()) + Ok(()) } } } @@ -260,8 +311,11 @@ mod tests { let zpool_name = ZpoolName::new_external(Uuid::new_v4()); let dir = tempdir().unwrap(); let _ = new_disk(dir.path(), &zpool_name); + // Spawn the key_manager so that it will respond to requests for encryption keys tokio::spawn(async move { key_manager.run().await }); + + // Set the stage to pretend we've progressed enough to have a key_manager available. manager.stage = StorageManagerStage::Normal; manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); assert_eq!(manager.resources.all_u2_zpools().len(), 1); From e2e7dc8e55d7dd17be941ad9ae091deba67fb1ab Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Mon, 2 Oct 2023 03:04:43 +0000 Subject: [PATCH 10/66] wip --- sled-storage/src/error.rs | 5 +++- sled-storage/src/manager.rs | 57 +++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs index fbf721fab7..70d7fe7c1e 100644 --- a/sled-storage/src/error.rs +++ b/sled-storage/src/error.rs @@ -4,7 +4,7 @@ //! Storage related errors -use crate::dataset::DatasetName; +use crate::dataset::{DatasetError, DatasetName}; use crate::disk::DiskError; use camino::Utf8PathBuf; use omicron_common::api::external::ByteCountRangeError; @@ -15,6 +15,9 @@ pub enum Error { #[error(transparent)] DiskError(#[from] DiskError), + #[error(transparent)] + DatasetError(#[from] DatasetError), + // TODO: We could add the context of "why are we doint this op", maybe? #[error(transparent)] ZfsListDataset(#[from] illumos_utils::zfs::ListDatasetsError), diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 813f552878..499d8edee2 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -89,7 +89,6 @@ impl StorageManager { loop { if let Err(e) = self.step().await { warn!(self.log, "{e}"); - return; } } } @@ -103,7 +102,7 @@ impl StorageManager { StorageRequest::AddDisk(unparsed_disk) => { match unparsed_disk.variant() { DiskVariant::U2 => self.add_u2_disk(unparsed_disk).await?, - DiskVariant::M2 => todo!(), + DiskVariant::M2 => self.add_m2_disk(unparsed_disk).await?, } } StorageRequest::AddSyntheticDisk(zpool_name) => { @@ -111,7 +110,9 @@ impl StorageManager { ZpoolKind::External => { self.add_synthetic_u2_disk(zpool_name).await? } - ZpoolKind::Internal => todo!(), + ZpoolKind::Internal => { + self.add_synthetic_m2_disk(zpool_name).await? 
+ } } } StorageRequest::RemoveDisk(_unparsed_disk) => todo!(), @@ -121,7 +122,7 @@ impl StorageManager { Ok(()) } - /// Add a real U.2 disk to storage resources or queue it to be added later + // Add a real U.2 disk to [`StorageResources`] or queue it to be added later async fn add_u2_disk( &mut self, unparsed_disk: UnparsedDisk, @@ -159,7 +160,53 @@ impl StorageManager { } } - /// Add a synthetic U.2 disk to storage resources or queue it to be added later + // Add a real U.2 disk to [`StorageResources`] + // + // + // We never queue M.2 drives, as they don't rely on [`KeyManager`] based + // encryption + async fn add_m2_disk( + &mut self, + unparsed_disk: UnparsedDisk, + ) -> Result<(), Error> { + let disk = Disk::new( + &self.log, + unparsed_disk.clone(), + Some(&self.key_requester), + ) + .await?; + self.resources.insert_real_disk(disk)?; + Ok(()) + } + + // Add a synthetic U.2 disk to [`StorageResources`] + // + // We never queue M.2 drives, as they don't rely on [`KeyManager`] based + // encryption + async fn add_synthetic_m2_disk( + &mut self, + zpool_name: ZpoolName, + ) -> Result<(), Error> { + let synthetic_id = DiskIdentity { + vendor: "fake_vendor".to_string(), + serial: "fake_serial".to_string(), + model: zpool_name.id().to_string(), + }; + + debug!(self.log, "Ensure zpool has datasets: {zpool_name}"); + dataset::ensure_zpool_has_datasets( + &self.log, + &zpool_name, + &synthetic_id, + Some(&self.key_requester), + ) + .await?; + self.resources.insert_synthetic_disk(zpool_name)?; + Ok(()) + } + + // Add a synthetic U.2 disk to [`StorageResources`] or queue it to be added + // later async fn add_synthetic_u2_disk( &mut self, zpool_name: ZpoolName, From 9a1e9164363818c6780ad9529eadb087230ccf84 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Mon, 2 Oct 2023 17:41:03 +0000 Subject: [PATCH 11/66] wip --- sled-storage/src/manager.rs | 140 +++++++++++++++++++++++++++++----- sled-storage/src/resources.rs | 25 +++++- 2 files changed, 144 insertions(+), 21 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 499d8edee2..2855345c3b 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -23,7 +23,7 @@ use tokio::sync::{mpsc, oneshot, watch}; const QUEUE_SIZE: usize = 256; #[derive(Debug, Clone, PartialEq, Eq)] -pub enum StorageManagerStage { +pub enum StorageManagerState { WaitingForBootDisk, WaitingForKeyManager, QueuingDisks, @@ -34,7 +34,7 @@ enum StorageRequest { AddDisk(UnparsedDisk), AddSyntheticDisk(ZpoolName), RemoveDisk(UnparsedDisk), - DisksChanged(Vec), + DisksChanged(HashSet), // NewFilesystem(NewFilesystemRequest), KeyManagerReady, } @@ -45,12 +45,76 @@ pub struct StorageHandle { resource_updates: watch::Receiver, } +impl StorageHandle { + /// Adds a disk and associated zpool to the storage manager. + pub async fn upsert_disk(&self, disk: UnparsedDisk) { + self.tx.send(StorageRequest::AddDisk(disk)).await.unwrap(); + } + + /// Adds a synthetic disk backed by a zpool to the storage manager. + pub async fn upsert_synthetic_disk(&self, pool: ZpoolName) { + self.tx.send(StorageRequest::AddSyntheticDisk(pool)).await.unwrap(); + } + + /// Removes a disk, if it's tracked by the storage manager, as well + /// as any associated zpools. + pub async fn delete_disk(&self, disk: UnparsedDisk) { + self.tx.send(StorageRequest::RemoveDisk(disk)).await.unwrap(); + } + + /// Ensures that the storage manager tracks exactly the provided disks. 
+ /// + /// This acts similar to a batch [Self::upsert_disk] for all new disks, and + /// [Self::delete_disk] for all removed disks. + /// + /// If errors occur, an arbitrary "one" of them will be returned, but a + /// best-effort attempt to add all disks will still be attempted. + pub async fn ensure_using_exactly_these_disks(&self, unparsed_disks: I) + where + I: IntoIterator, + { + self.tx + .send(StorageRequest::DisksChanged( + unparsed_disks.into_iter().collect(), + )) + .await + .unwrap(); + } + + /// Notify the [`StorageManager`] that the [`key_manager::KeyManager`] + /// has determined what [`key_manager::SecretRetriever`] to use and + /// it is now possible to retrieve secrets and construct keys. Note + /// that in cases of using the trust quorum, it is possible that the + /// [`key_manager::SecretRetriever`] is ready, but enough key shares cannot + /// be retrieved from other sleds. In this case, we still will be unable + /// to add the disks successfully. In the common case this is a transient + /// error. In other cases it may be fatal. However, that is outside the + /// scope of the cares of this module. + pub async fn key_manager_ready(&self) { + self.tx.send(StorageRequest::KeyManagerReady).await.unwrap(); + } + + /// Wait for a boot disk to be initialized + pub async fn wait_for_boot_disk(&mut self) -> (DiskIdentity, ZpoolName) { + loop { + // We panic if the sender is dropped, as this means + // the StorageManager has gone away, which it should not do. + self.resource_updates.changed().await.unwrap(); + // Limit any RWLock related cancellation issues by immediately cloning + let resources = self.resource_updates.borrow().clone(); + if let Some((disk_id, zpool_name)) = resources.boot_disk() { + return (disk_id, zpool_name); + } + } + } +} + /// The storage manager responsible for the state of the storage /// on a sled. The storage manager runs in its own task and is interacted /// with via the [`StorageHandle`]. pub struct StorageManager { log: Logger, - stage: StorageManagerStage, + state: StorageManagerState, rx: mpsc::Receiver, resources: StorageResources, queued_u2_drives: HashSet, @@ -70,7 +134,7 @@ impl StorageManager { ( StorageManager { log: log.new(o!("component" => "StorageManager")), - stage: StorageManagerStage::WaitingForBootDisk, + state: StorageManagerState::WaitingForBootDisk, rx, resources, queued_u2_drives: HashSet::new(), @@ -127,7 +191,7 @@ impl StorageManager { &mut self, unparsed_disk: UnparsedDisk, ) -> Result<(), Error> { - if self.stage != StorageManagerStage::Normal { + if self.state != StorageManagerState::Normal { self.queued_u2_drives.insert(unparsed_disk); return Ok(()); } @@ -139,14 +203,21 @@ impl StorageManager { ) .await { - Ok(disk) => self.resources.insert_real_disk(disk), + Ok(disk) => { + if self.resources.insert_real_disk(disk)? { + let _ = self + .resource_updates + .send_replace(self.resources.clone()); + } + Ok(()) + } Err(err @ DiskError::Dataset(DatasetError::KeyManager(_))) => { warn!( self.log, "Transient error: {err} - queuing disk {:?}", unparsed_disk ); self.queued_u2_drives.insert(unparsed_disk); - self.stage = StorageManagerStage::QueuingDisks; + self.state = StorageManagerState::QueuingDisks; Err(err.into()) } Err(err) => { @@ -175,7 +246,9 @@ impl StorageManager { Some(&self.key_requester), ) .await?; - self.resources.insert_real_disk(disk)?; + if self.resources.insert_real_disk(disk)? 
{ + let _ = self.resource_updates.send_replace(self.resources.clone()); + } Ok(()) } @@ -201,7 +274,9 @@ impl StorageManager { Some(&self.key_requester), ) .await?; - self.resources.insert_synthetic_disk(zpool_name)?; + if self.resources.insert_synthetic_disk(zpool_name)? { + let _ = self.resource_updates.send_replace(self.resources.clone()); + } Ok(()) } @@ -211,7 +286,7 @@ impl StorageManager { &mut self, zpool_name: ZpoolName, ) -> Result<(), Error> { - if self.stage != StorageManagerStage::Normal { + if self.state != StorageManagerState::Normal { info!(self.log, "Queuing synthetic U.2 drive: {zpool_name}"); self.queued_synthetic_u2_drives.insert(zpool_name); return Ok(()); @@ -232,14 +307,21 @@ impl StorageManager { ) .await { - Ok(()) => self.resources.insert_synthetic_disk(zpool_name), + Ok(()) => { + if self.resources.insert_synthetic_disk(zpool_name)? { + let _ = self + .resource_updates + .send_replace(self.resources.clone()); + } + Ok(()) + } Err(err @ DatasetError::KeyManager(_)) => { warn!( self.log, "Transient error: {err} - queuing disk {:?}", synthetic_id ); self.queued_synthetic_u2_drives.insert(zpool_name); - self.stage = StorageManagerStage::QueuingDisks; + self.state = StorageManagerState::QueuingDisks; Ok(()) } Err(err) => { @@ -324,7 +406,7 @@ mod tests { KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - assert_eq!(StorageManagerStage::WaitingForBootDisk, manager.stage); + assert_eq!(StorageManagerState::WaitingForBootDisk, manager.state); manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); assert!(manager.resources.all_u2_zpools().is_empty()); assert_eq!( @@ -334,11 +416,11 @@ mod tests { // Walk through other non-normal stages and enusre disk gets queued for stage in [ - StorageManagerStage::WaitingForKeyManager, - StorageManagerStage::QueuingDisks, + StorageManagerState::WaitingForKeyManager, + StorageManagerState::QueuingDisks, ] { manager.queued_synthetic_u2_drives.clear(); - manager.stage = stage; + manager.state = stage; manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); assert!(manager.resources.all_u2_zpools().is_empty()); assert_eq!( @@ -363,10 +445,34 @@ mod tests { tokio::spawn(async move { key_manager.run().await }); // Set the stage to pretend we've progressed enough to have a key_manager available. 
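// Outside of tests, the sled-agent is expected to drive this transition
// through the handle rather than by writing the field directly; a hedged
// sketch of that path, using the methods added in this patch:
//
//     handle.key_manager_ready().await;
//     let (disk_id, zpool_name) = handle.wait_for_boot_disk().await;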
- manager.stage = StorageManagerStage::Normal; + manager.state = StorageManagerState::Normal; manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); assert_eq!(manager.resources.all_u2_zpools().len(), 1); Zpool::destroy(&zpool_name).unwrap(); logctx.cleanup_successful(); } + + #[tokio::test] + async fn wait_for_bootdisk() { + let logctx = test_setup_log("ensure_u2_gets_added_to_resources"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); + let (mut manager, mut handle) = + StorageManager::new(&logctx.log, key_requester); + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Create a synthetic internal disk + let zpool_name = ZpoolName::new_internal(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let _ = new_disk(dir.path(), &zpool_name); + + handle.upsert_synthetic_disk(zpool_name.clone()).await; + handle.wait_for_boot_disk().await; + } } diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 7601ac7b86..fb57d742e3 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -49,30 +49,47 @@ pub struct StorageResources { impl StorageResources { /// Insert a disk and its zpool - pub(crate) fn insert_real_disk(&mut self, disk: Disk) -> Result<(), Error> { + /// + /// Return true, if data was changed, false otherwise + pub(crate) fn insert_real_disk( + &mut self, + disk: Disk, + ) -> Result { let parent = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); let disk = DiskWrapper::Real { disk: disk.clone(), devfs_path: disk.devfs_path().clone(), }; + if let Some(stored) = self.disks.get(&parent) { + if stored == &disk { + return Ok(false); + } + } Arc::make_mut(&mut self.disks).insert(disk.identity(), disk); let zpool = Pool::new(zpool_name, parent)?; Arc::make_mut(&mut self.pools).insert(zpool.name.id(), zpool); - Ok(()) + Ok(true) } /// Insert a synthetic disk and its zpool + /// + /// Return true, if data was changed, false otherwise pub(crate) fn insert_synthetic_disk( &mut self, zpool_name: ZpoolName, - ) -> Result<(), Error> { + ) -> Result { let disk = DiskWrapper::Synthetic { zpool_name: zpool_name.clone() }; let parent = disk.identity().clone(); + if let Some(stored) = self.disks.get(&parent) { + if stored == &disk { + return Ok(false); + } + } Arc::make_mut(&mut self.disks).insert(disk.identity(), disk); let zpool = Pool::new(zpool_name, parent)?; Arc::make_mut(&mut self.pools).insert(zpool.name.id(), zpool); - Ok(()) + Ok(true) } /// Returns the identity of the boot disk. From 30e16c802c645b964e3cf838c38f4ea0596eb6f5 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Mon, 2 Oct 2023 23:39:05 +0000 Subject: [PATCH 12/66] wip --- sled-storage/src/manager.rs | 163 +++++++++++++++++++++++++++++++++--- 1 file changed, 150 insertions(+), 13 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 2855345c3b..96119bd74e 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -17,6 +17,7 @@ use omicron_common::disk::DiskIdentity; use sled_hardware::{DiskVariant, UnparsedDisk}; use slog::{debug, error, info, o, warn, Logger}; use tokio::sync::{mpsc, oneshot, watch}; +use tokio::time::{interval, Duration, MissedTickBehavior}; // The size of the mpsc bounded channel used to communicate // between the `StorageHandle` and `StorageManager`. @@ -24,7 +25,6 @@ const QUEUE_SIZE: usize = 256; #[derive(Debug, Clone, PartialEq, Eq)] pub enum StorageManagerState { - WaitingForBootDisk, WaitingForKeyManager, QueuingDisks, Normal, @@ -37,6 +37,10 @@ enum StorageRequest { DisksChanged(HashSet), // NewFilesystem(NewFilesystemRequest), KeyManagerReady, + /// This will always grab the latest state after any new updates, as it + /// serializes through the `StorageManager` task. + /// This serialization is particularly useful for tests. + GetLatestResources(oneshot::Sender), } /// A mechanism for interacting with the [`StorageManager`] @@ -107,6 +111,20 @@ impl StorageHandle { } } } + + /// Wait for any storage resource changes + pub async fn wait_for_changes(&mut self) -> StorageResources { + self.resource_updates.changed().await.unwrap(); + self.resource_updates.borrow().clone() + } + + /// Retrieve the latest value of `StorageResources` from the + /// `StorageManager` task. + pub async fn get_latest_resources(&mut self) -> StorageResources { + let (tx, rx) = oneshot::channel(); + self.tx.send(StorageRequest::GetLatestResources(tx)).await.unwrap(); + rx.await.unwrap() + } } /// The storage manager responsible for the state of the storage @@ -134,7 +152,7 @@ impl StorageManager { ( StorageManager { log: log.new(o!("component" => "StorageManager")), - state: StorageManagerState::WaitingForBootDisk, + state: StorageManagerState::WaitingForKeyManager, rx, resources, queued_u2_drives: HashSet::new(), @@ -151,8 +169,22 @@ impl StorageManager { /// This should be spawned into a tokio task pub async fn run(&mut self) { loop { - if let Err(e) = self.step().await { - warn!(self.log, "{e}"); + const QUEUED_DISK_RETRY_TIMEOUT: Duration = Duration::from_secs(10); + let mut interval = interval(QUEUED_DISK_RETRY_TIMEOUT); + interval.set_missed_tick_behavior(MissedTickBehavior::Delay); + tokio::select! { + res = self.step() => { + if let Err(e) = res { + warn!(self.log, "{e}"); + } + } + _ = interval.tick(), + if self.state == StorageManagerState::QueuingDisks => + { + // We are going to try to configure these disks again + self.state = StorageManagerState::Normal; + self.add_queued_disks().await; + } } } } @@ -181,11 +213,83 @@ impl StorageManager { } StorageRequest::RemoveDisk(_unparsed_disk) => todo!(), StorageRequest::DisksChanged(_unparsed_disks) => todo!(), - StorageRequest::KeyManagerReady => todo!(), + StorageRequest::KeyManagerReady => { + self.state = StorageManagerState::Normal; + self.add_queued_disks().await; + } + StorageRequest::GetLatestResources(tx) => { + let _ = tx.send(self.resources.clone()); + } } Ok(()) } + // Loop through all queued disks inserting them into [`StorageResources`] + // unless we hit a transient error. 
If we hit a transient error, we return
+    // and wait for the next retry window to re-call this method. If we hit a
+    // permanent error we log it, but we continue inserting queued disks.
+    async fn add_queued_disks(&mut self) {
+        // Operate on queued real disks
+
+        // Disks that should be requeued.
+        let mut saved = HashSet::new();
+        let queued = std::mem::take(&mut self.queued_u2_drives);
+        let mut iter = queued.into_iter();
+        while let Some(disk) = iter.next() {
+            if self.state == StorageManagerState::QueuingDisks {
+                // We hit a transient error in a prior iteration.
+                saved.insert(disk);
+            } else {
+                // Try to add the disk. If there was a transient error the disk will
+                // have been requeued. If there was a permanent error, it will have been
+                // dropped. If there is another unexpected error, we will handle it and
+                // requeue ourselves.
+                if let Err(err) = self.add_u2_disk(disk.clone()).await {
+                    warn!(
+                        self.log,
+                        "Potentially transient error: {err} - requeuing disk {:?}",
+                        disk
+                    );
+                    saved.insert(disk);
+                }
+            }
+        }
+        // Merge any requeued disks from transient errors with saved disks here
+        self.queued_u2_drives.extend(saved);
+
+        // Operate on queued synthetic disks
+        if self.state == StorageManagerState::QueuingDisks {
+            return;
+        }
+
+        let mut saved = HashSet::new();
+        let queued = std::mem::take(&mut self.queued_synthetic_u2_drives);
+        let mut iter = queued.into_iter();
+        while let Some(zpool_name) = iter.next() {
+            if self.state == StorageManagerState::QueuingDisks {
+                // We hit a transient error in a prior iteration.
+                saved.insert(zpool_name);
+            } else {
+                // Try to add the disk. If there was a transient error the disk will
+                // have been requeued. If there was a permanent error, it will have been
+                // dropped. If there is another unexpected error, we will handle it and
+                // requeue ourselves.
+ if let Err(err) = + self.add_synthetic_u2_disk(zpool_name.clone()).await + { + warn!( + self.log, + "Potentially transient error: {err}: - requeing synthetic disk {:?}", + zpool_name + ); + saved.insert(zpool_name); + } + } + } + // Merge any requeued disks from transient errors with saved disks here + self.queued_synthetic_u2_drives.extend(saved); + } + // Add a real U.2 disk to [`StorageResources`] or queue it to be added later async fn add_u2_disk( &mut self, @@ -218,7 +322,7 @@ impl StorageManager { ); self.queued_u2_drives.insert(unparsed_disk); self.state = StorageManagerState::QueuingDisks; - Err(err.into()) + Ok(()) } Err(err) => { error!( @@ -226,7 +330,7 @@ impl StorageManager { "Persistent error: {err} - not queueing disk {:?}", unparsed_disk ); - Err(err.into()) + Ok(()) } } } @@ -406,7 +510,7 @@ mod tests { KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - assert_eq!(StorageManagerState::WaitingForBootDisk, manager.state); + assert_eq!(StorageManagerState::WaitingForKeyManager, manager.state); manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); assert!(manager.resources.all_u2_zpools().is_empty()); assert_eq!( @@ -415,10 +519,7 @@ mod tests { ); // Walk through other non-normal stages and enusre disk gets queued - for stage in [ - StorageManagerState::WaitingForKeyManager, - StorageManagerState::QueuingDisks, - ] { + for stage in [StorageManagerState::QueuingDisks] { manager.queued_synthetic_u2_drives.clear(); manager.state = stage; manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); @@ -454,7 +555,7 @@ mod tests { #[tokio::test] async fn wait_for_bootdisk() { - let logctx = test_setup_log("ensure_u2_gets_added_to_resources"); + let logctx = test_setup_log("wait_for_bootdisk"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); let (mut manager, mut handle) = @@ -474,5 +575,41 @@ mod tests { handle.upsert_synthetic_disk(zpool_name.clone()).await; handle.wait_for_boot_disk().await; + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn queued_disks_get_added_as_resources() { + let logctx = test_setup_log("queued_disks_get_added_as_resources"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); + let (mut manager, mut handle) = + StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Queue up a disks, as we haven't told the `StorageManager` that + // the `KeyManager` is ready yet. 
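// The read path used below, `get_latest_resources`, round-trips a request
// through the manager's queue and therefore reflects every request sent
// before it; `wait_for_changes`, by contrast, just blocks on the watch
// channel for the next published snapshot. A hedged sketch of that other
// path, as used by the notification tests later in this series:
//
//     let resources = handle.wait_for_changes().await;
//     assert_eq!(resources.all_u2_zpools().len(), 1);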
+ let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let _ = new_disk(dir.path(), &zpool_name); + handle.upsert_synthetic_disk(zpool_name.clone()).await; + let resources = handle.get_latest_resources().await; + assert!(resources.all_u2_zpools().is_empty()); + + // Now inform the storage manager that the key manager is ready + // The queued disk should be successfully added + handle.key_manager_ready().await; + let resources = handle.get_latest_resources().await; + assert_eq!(resources.all_u2_zpools().len(), 1); + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); } } From 51dcbcdd9c6bf0bd4ac976be159504b0bd4e628d Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 3 Oct 2023 16:45:24 +0000 Subject: [PATCH 13/66] wip --- sled-storage/src/manager.rs | 85 +++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 8 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 96119bd74e..2a7dcbda9b 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -181,8 +181,6 @@ impl StorageManager { _ = interval.tick(), if self.state == StorageManagerState::QueuingDisks => { - // We are going to try to configure these disks again - self.state = StorageManagerState::Normal; self.add_queued_disks().await; } } @@ -229,6 +227,7 @@ impl StorageManager { // and wait for the next retry window to re-call this method. If we hit a // permanent error we log it, but we continue inserting queued disks. async fn add_queued_disks(&mut self) { + self.state = StorageManagerState::Normal; // Operate on queued real disks // Disks that should be requeued. @@ -455,18 +454,30 @@ mod tests { }; use omicron_test_utils::dev::test_setup_log; use std::fs::File; + use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }; use uuid::Uuid; /// A [`key-manager::SecretRetriever`] that only returns hardcoded IKM for /// epoch 0 - #[derive(Debug)] - struct HardcodedSecretRetriever {} + #[derive(Debug, Default)] + struct HardcodedSecretRetriever { + inject_error: Arc, + } #[async_trait] impl SecretRetriever for HardcodedSecretRetriever { async fn get_latest( &self, ) -> Result { + if self.inject_error.load(Ordering::SeqCst) { + return Err(SecretRetrieverError::Bootstore( + "Timeout".to_string(), + )); + } + let epoch = 0; let salt = [0u8; 32]; let secret = [0x1d; 32]; @@ -479,6 +490,11 @@ mod tests { &self, epoch: u64, ) -> Result { + if self.inject_error.load(Ordering::SeqCst) { + return Err(SecretRetrieverError::Bootstore( + "Timeout".to_string(), + )); + } if epoch != 0 { return Err(SecretRetrieverError::NoSuchEpoch(epoch)); } @@ -507,7 +523,7 @@ mod tests { "add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued", ); let (mut _key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); let zpool_name = ZpoolName::new_external(Uuid::new_v4()); assert_eq!(StorageManagerState::WaitingForKeyManager, manager.state); @@ -536,7 +552,7 @@ mod tests { async fn ensure_u2_gets_added_to_resources() { let logctx = test_setup_log("ensure_u2_gets_added_to_resources"); let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); let zpool_name = 
ZpoolName::new_external(Uuid::new_v4()); let dir = tempdir().unwrap(); @@ -557,7 +573,7 @@ mod tests { async fn wait_for_bootdisk() { let logctx = test_setup_log("wait_for_bootdisk"); let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); let (mut manager, mut handle) = StorageManager::new(&logctx.log, key_requester); // Spawn the key_manager so that it will respond to requests for encryption keys @@ -583,7 +599,7 @@ mod tests { async fn queued_disks_get_added_as_resources() { let logctx = test_setup_log("queued_disks_get_added_as_resources"); let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever {}); + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); let (mut manager, mut handle) = StorageManager::new(&logctx.log, key_requester); @@ -612,4 +628,57 @@ mod tests { Zpool::destroy(&zpool_name).unwrap(); logctx.cleanup_successful(); } + + /// For this test, we are going to step through the msg recv loop directly + /// without running the `StorageManager` in a tokio task. + /// This allows us to control timing precisely. + #[tokio::test] + async fn queued_disks_get_requeued_on_secret_retriever_error() { + let logctx = test_setup_log("queued_disks_get_added_as_resources"); + let inject_error = Arc::new(AtomicBool::new(false)); + let (mut key_manager, key_requester) = KeyManager::new( + &logctx.log, + HardcodedSecretRetriever { inject_error: inject_error.clone() }, + ); + let (mut manager, handle) = + StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Queue up a disks, as we haven't told the `StorageManager` that + // the `KeyManager` is ready yet. + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let _ = new_disk(dir.path(), &zpool_name); + handle.upsert_synthetic_disk(zpool_name.clone()).await; + manager.step().await.unwrap(); + + // We can't wait for a reply through the handle as the storage manager task + // isn't actually running. We just check the resources directly. + assert!(manager.resources.all_u2_zpools().is_empty()); + + // Let's inject an error to the `SecretRetriever` to simulate a trust + // quorum timeout + inject_error.store(true, Ordering::SeqCst); + + // Now inform the storage manager that the key manager is ready + // The queued disk should not be added due to the error + handle.key_manager_ready().await; + manager.step().await.unwrap(); + assert!(manager.resources.all_u2_zpools().is_empty()); + + // Manually simulating a timer tick to add queued disks should also + // still hit the error + manager.add_queued_disks().await; + assert!(manager.resources.all_u2_zpools().is_empty()); + + // Clearing the injected error will cause the disk to get added + inject_error.store(false, Ordering::SeqCst); + manager.add_queued_disks().await; + assert_eq!(1, manager.resources.all_u2_zpools().len()); + + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } } From 96512174027ee73b0d77c2409dc23f3f488752db Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Tue, 3 Oct 2023 19:48:04 +0000 Subject: [PATCH 14/66] wip --- sled-agent/src/storage_manager.rs | 270 ------------------------------ sled-storage/src/manager.rs | 75 ++++++++- sled-storage/src/resources.rs | 34 +++- 3 files changed, 106 insertions(+), 273 deletions(-) diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 3d3e544573..68fb7df7df 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -47,97 +47,6 @@ use illumos_utils::{zfs::Zfs, zpool::Zpool}; // boot when the bootstore has detected it has a key share. static KEY_MANAGER_READY: OnceLock<()> = OnceLock::new(); -#[derive(thiserror::Error, Debug)] -pub enum Error { - #[error(transparent)] - DiskError(#[from] sled_hardware::PooledDiskError), - - // TODO: We could add the context of "why are we doint this op", maybe? - #[error(transparent)] - ZfsListDataset(#[from] illumos_utils::zfs::ListDatasetsError), - - #[error(transparent)] - ZfsEnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), - - #[error(transparent)] - ZfsSetValue(#[from] illumos_utils::zfs::SetValueError), - - #[error(transparent)] - ZfsGetValue(#[from] illumos_utils::zfs::GetValueError), - - #[error(transparent)] - GetZpoolInfo(#[from] illumos_utils::zpool::GetInfoError), - - #[error(transparent)] - Fstyp(#[from] illumos_utils::fstyp::Error), - - #[error(transparent)] - ZoneCommand(#[from] illumos_utils::running_zone::RunCommandError), - - #[error(transparent)] - ZoneBoot(#[from] illumos_utils::running_zone::BootError), - - #[error(transparent)] - ZoneEnsureAddress(#[from] illumos_utils::running_zone::EnsureAddressError), - - #[error(transparent)] - ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), - - #[error("No U.2 Zpools found")] - NoU2Zpool, - - #[error("Failed to parse UUID from {path}: {err}")] - ParseUuid { - path: Utf8PathBuf, - #[source] - err: uuid::Error, - }, - - #[error("Dataset {name:?} exists with a different uuid (has {old}, requested {new})")] - UuidMismatch { name: Box, old: Uuid, new: Uuid }, - - #[error("Error parsing pool {name}'s size: {err}")] - BadPoolSize { - name: String, - #[source] - err: ByteCountRangeError, - }, - - #[error("Failed to parse the dataset {name}'s UUID: {err}")] - ParseDatasetUuid { - name: String, - #[source] - err: uuid::Error, - }, - - #[error("Zpool Not Found: {0}")] - ZpoolNotFound(String), - - #[error("Underlay not yet initialized")] - UnderlayNotInitialized, -} - -/// A ZFS storage pool. -struct Pool { - name: ZpoolName, - info: ZpoolInfo, - parent: DiskIdentity, -} - -impl Pool { - /// Queries for an existing Zpool by name. - /// - /// Returns Ok if the pool exists. - fn new(name: ZpoolName, parent: DiskIdentity) -> Result { - let info = Zpool::get_info(&name.to_string())?; - Ok(Pool { name, info, parent }) - } - - fn parent(&self) -> &DiskIdentity { - &self.parent - } -} - // The type of a future which is used to send a notification to Nexus. type NotifyFut = Pin> + Send>>; @@ -154,179 +63,12 @@ struct UnderlayRequest { responder: oneshot::Sender>, } -#[derive(PartialEq, Eq, Clone)] -pub(crate) enum DiskWrapper { - Real { disk: Disk, devfs_path: Utf8PathBuf }, - Synthetic { zpool_name: ZpoolName }, -} - -impl From for DiskWrapper { - fn from(disk: Disk) -> Self { - let devfs_path = disk.devfs_path().clone(); - Self::Real { disk, devfs_path } - } -} - -impl DiskWrapper { - fn identity(&self) -> DiskIdentity { - match self { - DiskWrapper::Real { disk, .. 
} => disk.identity().clone(), - DiskWrapper::Synthetic { zpool_name } => { - let id = zpool_name.id(); - DiskIdentity { - vendor: "synthetic-vendor".to_string(), - serial: format!("synthetic-serial-{id}"), - model: "synthetic-model".to_string(), - } - } - } - } - - fn variant(&self) -> DiskVariant { - match self { - DiskWrapper::Real { disk, .. } => disk.variant(), - DiskWrapper::Synthetic { zpool_name } => match zpool_name.kind() { - ZpoolKind::External => DiskVariant::U2, - ZpoolKind::Internal => DiskVariant::M2, - }, - } - } - - fn zpool_name(&self) -> &ZpoolName { - match self { - DiskWrapper::Real { disk, .. } => disk.zpool_name(), - DiskWrapper::Synthetic { zpool_name } => zpool_name, - } - } -} - -#[derive(Clone)] -pub struct StorageResources { - // All disks, real and synthetic, being managed by this sled - disks: Arc>>, - - // A map of "Uuid" to "pool". - pools: Arc>>, -} - // The directory within the debug dataset in which bundles are created. const BUNDLE_DIRECTORY: &str = "bundle"; // The directory for zone bundles. const ZONE_BUNDLE_DIRECTORY: &str = "zone"; -impl StorageResources { - /// Creates a fabricated view of storage resources. - /// - /// Use this only when you want to reference the disks, but not actually - /// access them. Creates one internal and one external disk. - #[cfg(test)] - pub fn new_for_test() -> Self { - let new_disk_identity = || DiskIdentity { - vendor: "vendor".to_string(), - serial: Uuid::new_v4().to_string(), - model: "model".to_string(), - }; - - Self { - disks: Arc::new(Mutex::new(HashMap::from([ - ( - new_disk_identity(), - DiskWrapper::Synthetic { - zpool_name: ZpoolName::new_internal(Uuid::new_v4()), - }, - ), - ( - new_disk_identity(), - DiskWrapper::Synthetic { - zpool_name: ZpoolName::new_external(Uuid::new_v4()), - }, - ), - ]))), - pools: Arc::new(Mutex::new(HashMap::new())), - } - } - - /// Returns the identity of the boot disk. - /// - /// If this returns `None`, we have not processed the boot disk yet. - pub async fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { - let disks = self.disks.lock().await; - disks.iter().find_map(|(id, disk)| { - match disk { - // This is the "real" use-case: if we have real disks, query - // their properties to identify if they truly are the boot disk. - DiskWrapper::Real { disk, .. } => { - if disk.is_boot_disk() { - return Some((id.clone(), disk.zpool_name().clone())); - } - } - // This is the "less real" use-case: if we have synthetic disks, - // just label the first M.2-looking one as a "boot disk". - DiskWrapper::Synthetic { .. } => { - if matches!(disk.variant(), DiskVariant::M2) { - return Some((id.clone(), disk.zpool_name().clone())); - } - } - }; - None - }) - } - - // TODO: Could be generic over DiskVariant - - /// Returns all M.2 zpools - pub async fn all_m2_zpools(&self) -> Vec { - self.all_zpools(DiskVariant::M2).await - } - - /// Returns all U.2 zpools - pub async fn all_u2_zpools(&self) -> Vec { - self.all_zpools(DiskVariant::U2).await - } - - /// Returns all mountpoints within all M.2s for a particular dataset. - pub async fn all_m2_mountpoints(&self, dataset: &str) -> Vec { - let m2_zpools = self.all_m2_zpools().await; - m2_zpools - .iter() - .map(|zpool| zpool.dataset_mountpoint(dataset)) - .collect() - } - - /// Returns all mountpoints within all U.2s for a particular dataset. 
- pub async fn all_u2_mountpoints(&self, dataset: &str) -> Vec { - let u2_zpools = self.all_u2_zpools().await; - u2_zpools - .iter() - .map(|zpool| zpool.dataset_mountpoint(dataset)) - .collect() - } - - /// Returns all zpools of a particular variant - pub async fn all_zpools(&self, variant: DiskVariant) -> Vec { - let disks = self.disks.lock().await; - disks - .values() - .filter_map(|disk| { - if disk.variant() == variant { - return Some(disk.zpool_name().clone()); - } - None - }) - .collect() - } - - /// Return the directories for storing zone service bundles. - pub async fn all_zone_bundle_directories(&self) -> Vec { - self.all_m2_mountpoints(sled_hardware::disk::M2_DEBUG_DATASET) - .await - .into_iter() - .map(|p| p.join(BUNDLE_DIRECTORY).join(ZONE_BUNDLE_DIRECTORY)) - .collect() - } -} - /// Describes the access to the underlay used by the StorageManager. pub struct UnderlayAccess { pub nexus_client: NexusClientWithResolver, @@ -1392,15 +1134,3 @@ impl StorageManager { &self.inner.resources } } - -impl Drop for StorageManagerInner { - fn drop(&mut self) { - // NOTE: Ideally, with async drop, we'd await completion of the worker - // somehow. - // - // Without that option, we instead opt to simply cancel the worker - // task to ensure it does not remain alive beyond the StorageManager - // itself. - self.task.abort(); - } -} diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 2a7dcbda9b..f5304262e4 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -34,6 +34,7 @@ enum StorageRequest { AddDisk(UnparsedDisk), AddSyntheticDisk(ZpoolName), RemoveDisk(UnparsedDisk), + RemoveSyntheticDisk(ZpoolName), DisksChanged(HashSet), // NewFilesystem(NewFilesystemRequest), KeyManagerReady, @@ -66,6 +67,12 @@ impl StorageHandle { self.tx.send(StorageRequest::RemoveDisk(disk)).await.unwrap(); } + /// Removes a synthetic disk, if it's tracked by the storage manager, as + /// well as any associated zpools. + pub async fn delete_synthetic_disk(&self, pool: ZpoolName) { + self.tx.send(StorageRequest::RemoveSyntheticDisk(pool)).await.unwrap(); + } + /// Ensures that the storage manager tracks exactly the provided disks. 
/// /// This acts similar to a batch [Self::upsert_disk] for all new disks, and @@ -209,7 +216,12 @@ impl StorageManager { } } } - StorageRequest::RemoveDisk(_unparsed_disk) => todo!(), + StorageRequest::RemoveDisk(unparsed_disk) => { + self.remove_disk(unparsed_disk).await; + } + StorageRequest::RemoveSyntheticDisk(pool) => { + self.remove_synthetic_disk(pool).await; + } StorageRequest::DisksChanged(_unparsed_disks) => todo!(), StorageRequest::KeyManagerReady => { self.state = StorageManagerState::Normal; @@ -437,6 +449,24 @@ impl StorageManager { } } } + + // Delete a real disk + async fn remove_disk(&mut self, unparsed_disk: UnparsedDisk) { + // If the disk is a U.2, we want to first delete it from any queued disks + let _ = self.queued_u2_drives.remove(&unparsed_disk); + if self.resources.remove_real_disk(unparsed_disk) { + let _ = self.resource_updates.send_replace(self.resources.clone()); + } + } + + // Delete a synthetic disk + async fn remove_synthetic_disk(&mut self, pool: ZpoolName) { + // If the disk is a U.2, we want to first delete it from any queued disks + let _ = self.queued_synthetic_u2_drives.remove(&pool); + if self.resources.remove_synthetic_disk(pool) { + let _ = self.resource_updates.send_replace(self.resources.clone()); + } + } } /// All tests only use synthetic disks, but are expected to be run on illumos @@ -634,7 +664,9 @@ mod tests { /// This allows us to control timing precisely. #[tokio::test] async fn queued_disks_get_requeued_on_secret_retriever_error() { - let logctx = test_setup_log("queued_disks_get_added_as_resources"); + let logctx = test_setup_log( + "queued_disks_get_requeued_on_secret_retriever_error", + ); let inject_error = Arc::new(AtomicBool::new(false)); let (mut key_manager, key_requester) = KeyManager::new( &logctx.log, @@ -681,4 +713,43 @@ mod tests { Zpool::destroy(&zpool_name).unwrap(); logctx.cleanup_successful(); } + + #[tokio::test] + async fn delete_disk_triggers_notification() { + let logctx = test_setup_log("delete_disk_triggers_notification"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (mut manager, mut handle) = + StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Inform the storage manager that the key manager is ready, so disks + // don't get queued + handle.key_manager_ready().await; + + // Create and add a disk + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let _ = new_disk(dir.path(), &zpool_name); + handle.upsert_synthetic_disk(zpool_name.clone()).await; + + // Wait for the add disk notification + let resources = handle.wait_for_changes().await; + assert_eq!(resources.all_u2_zpools().len(), 1); + + // Delete the disk and wait for a notification + handle.delete_synthetic_disk(zpool_name.clone()).await; + let resources = handle.wait_for_changes().await; + assert!(resources.all_u2_zpools().is_empty()); + + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } } diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index fb57d742e3..82c588bd27 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -12,7 +12,7 @@ use camino::Utf8PathBuf; use illumos_utils::zpool::ZpoolName; use 
omicron_common::api::external::{ByteCount, ByteCountRangeError}; use omicron_common::disk::DiskIdentity; -use sled_hardware::DiskVariant; +use sled_hardware::{DiskVariant, UnparsedDisk}; use std::collections::BTreeMap; use std::sync::Arc; use uuid::Uuid; @@ -92,6 +92,38 @@ impl StorageResources { Ok(true) } + /// Delete a real disk and its zpool + /// + /// Return true, if data was changed, false otherwise + pub(crate) fn remove_real_disk(&mut self, disk: UnparsedDisk) -> bool { + if !self.disks.contains_key(disk.identity()) { + return false; + } + // Safe to unwrap as we just checked the key existed above + let parsed_disk = + Arc::make_mut(&mut self.disks).remove(disk.identity()).unwrap(); + Arc::make_mut(&mut self.pools).remove(&parsed_disk.zpool_name().id()); + true + } + + /// Delete a synthetic disk and its zpool + /// + /// Return true, if data was changed, false otherwise + pub(crate) fn remove_synthetic_disk( + &mut self, + zpool_name: ZpoolName, + ) -> bool { + let disk = DiskWrapper::Synthetic { zpool_name: zpool_name.clone() }; + if !self.disks.contains_key(&disk.identity()) { + return false; + } + // Safe to unwrap as we just checked the key existed above + let parsed_disk = + Arc::make_mut(&mut self.disks).remove(&disk.identity()).unwrap(); + Arc::make_mut(&mut self.pools).remove(&parsed_disk.zpool_name().id()); + true + } + /// Returns the identity of the boot disk. /// /// If this returns `None`, we have not processed the boot disk yet. From 4a38f9019d12ee6f5c5bd62455d42db7fa4ebfae Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 3 Oct 2023 22:07:44 +0000 Subject: [PATCH 15/66] wip --- sled-hardware/src/disk.rs | 1 + sled-storage/src/disk.rs | 229 ++++++++++++++---------- sled-storage/src/dump_setup.rs | 71 ++++---- sled-storage/src/manager.rs | 311 +++++++-------------------------- sled-storage/src/resources.rs | 81 ++------- 5 files changed, 248 insertions(+), 445 deletions(-) diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index 541d7bd548..aeaca9dc31 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -175,6 +175,7 @@ impl UnparsedDisk { /// from the ZFS related logic which can also operate on file backed zpools. /// Doing things this way allows us to not put higher level concepts like /// storage keys into this hardware related package. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct PooledDisk { pub paths: DiskPaths, pub slot: i64, diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs index d7e02d8c97..640d4e77f4 100644 --- a/sled-storage/src/disk.rs +++ b/sled-storage/src/disk.rs @@ -4,163 +4,214 @@ //! 
Disk related types -use camino::Utf8PathBuf; -use illumos_utils::zpool::{ZpoolKind, ZpoolName}; +use camino::{Utf8Path, Utf8PathBuf}; +use derive_more::From; +use illumos_utils::zpool::{Zpool, ZpoolKind, ZpoolName}; use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; use sled_hardware::{ - DiskPaths, DiskVariant, Partition, PooledDisk, PooledDiskError, - UnparsedDisk, + DiskVariant, Partition, PooledDisk, PooledDiskError, UnparsedDisk, }; use slog::Logger; +use std::fs::File; use crate::dataset; -/// A wrapper around real disks or synthetic disks backed by a file -#[derive(Debug, PartialEq, Eq, Clone)] -pub enum DiskWrapper { - Real { disk: Disk, devfs_path: Utf8PathBuf }, - Synthetic { zpool_name: ZpoolName }, +#[derive(Debug, thiserror::Error)] +pub enum DiskError { + #[error(transparent)] + Dataset(#[from] crate::dataset::DatasetError), + #[error(transparent)] + PooledDisk(#[from] sled_hardware::PooledDiskError), +} + +// A synthetic disk that acts as one "found" by the hardware and that is backed +// by a zpool +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SyntheticDisk { + identity: DiskIdentity, + zpool_name: ZpoolName, } -impl From for DiskWrapper { - fn from(disk: Disk) -> Self { - let devfs_path = disk.devfs_path().clone(); - Self::Real { disk, devfs_path } +impl SyntheticDisk { + // Create a zpool and import it for the synthetic disk + // Zpools willl be set to the min size of 64Mib + pub fn create_zpool( + dir: &Utf8Path, + zpool_name: &ZpoolName, + ) -> SyntheticDisk { + // 64 MiB (min size of zpool) + const DISK_SIZE: u64 = 64 * 1024 * 1024; + let path = dir.join(zpool_name.to_string()); + let file = File::create(&path).unwrap(); + file.set_len(DISK_SIZE).unwrap(); + drop(file); + Zpool::create(zpool_name, &path).unwrap(); + Zpool::import(zpool_name).unwrap(); + Zpool::set_failmode_continue(zpool_name).unwrap(); + Self::new(zpool_name.clone()) + } + + pub fn new(zpool_name: ZpoolName) -> SyntheticDisk { + let id = zpool_name.id(); + let identity = DiskIdentity { + vendor: "synthetic-vendor".to_string(), + serial: format!("synthetic-serial-{id}"), + model: "synthetic-model".to_string(), + }; + SyntheticDisk { identity, zpool_name } } } -impl DiskWrapper { - pub fn identity(&self) -> DiskIdentity { +// An [`UnparsedDisk`] disk learned about from the hardware or a wrapped zpool +#[derive(Debug, Clone, PartialEq, Eq, Hash, From)] +pub enum RawDisk { + Real(UnparsedDisk), + Synthetic(SyntheticDisk), +} + +impl RawDisk { + pub fn is_boot_disk(&self) -> bool { match self { - DiskWrapper::Real { disk, .. } => disk.identity().clone(), - DiskWrapper::Synthetic { zpool_name } => { - let id = zpool_name.id(); - DiskIdentity { - vendor: "synthetic-vendor".to_string(), - serial: format!("synthetic-serial-{id}"), - model: "synthetic-model".to_string(), - } + Self::Real(disk) => disk.is_boot_disk(), + Self::Synthetic(disk) => { + // Just label any M.2 the boot disk. + disk.zpool_name.kind() == ZpoolKind::Internal } } } - pub fn variant(&self) -> DiskVariant { + pub fn identity(&self) -> &DiskIdentity { match self { - DiskWrapper::Real { disk, .. } => disk.variant(), - DiskWrapper::Synthetic { zpool_name } => match zpool_name.kind() { - ZpoolKind::External => DiskVariant::U2, - ZpoolKind::Internal => DiskVariant::M2, - }, + Self::Real(disk) => &disk.identity(), + Self::Synthetic(disk) => &disk.identity, } } - pub fn zpool_name(&self) -> &ZpoolName { + pub fn variant(&self) -> DiskVariant { match self { - DiskWrapper::Real { disk, .. 
} => disk.zpool_name(), - DiskWrapper::Synthetic { zpool_name } => zpool_name, + Self::Real(disk) => disk.variant(), + Self::Synthetic(disk) => match disk.zpool_name.kind() { + ZpoolKind::External => DiskVariant::U2, + ZpoolKind::Internal => DiskVariant::M2, + }, } } } -#[derive(Debug, thiserror::Error)] -pub enum DiskError { - #[error(transparent)] - Dataset(#[from] crate::dataset::DatasetError), - #[error(transparent)] - PooledDisk(#[from] sled_hardware::PooledDiskError), -} - -/// A physical disk conforming to the expected partition layout -/// and which contains provisioned zpools and datasets. This disk -/// is ready for usage by higher level software. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct Disk { - paths: DiskPaths, - slot: i64, - variant: DiskVariant, - identity: DiskIdentity, - is_boot_disk: bool, - partitions: Vec, - - // This embeds the assumtion that there is exactly one parsed zpool per - // disk. - zpool_name: ZpoolName, +/// A physical [`PooledDisk`] or a [`SyntheticDisk`] that contains or is backed +/// by a single zpool and that has provisioned datasets. This disk is ready for +/// usage by higher level software. +#[derive(Debug, Clone, PartialEq, Eq, Hash, From)] +pub enum Disk { + Real(PooledDisk), + Synthetic(SyntheticDisk), } impl Disk { pub async fn new( log: &Logger, - unparsed_disk: UnparsedDisk, + raw_disk: RawDisk, key_requester: Option<&StorageKeyRequester>, ) -> Result { - let disk = PooledDisk::new(log, unparsed_disk)?; + let disk = match raw_disk { + RawDisk::Real(disk) => PooledDisk::new(log, disk)?.into(), + RawDisk::Synthetic(disk) => Disk::Synthetic(disk), + }; dataset::ensure_zpool_has_datasets( log, - &disk.zpool_name, - &disk.identity, + disk.zpool_name(), + disk.identity(), key_requester, ) .await?; - Ok(disk.into()) + Ok(disk) + } + + pub fn is_synthetic(&self) -> bool { + match self { + Self::Real(_) => false, + Self::Synthetic(_) => true, + } + } + + pub fn is_real(&self) -> bool { + !self.is_synthetic() } + pub fn is_boot_disk(&self) -> bool { - self.is_boot_disk + match self { + Self::Real(disk) => disk.is_boot_disk, + Self::Synthetic(disk) => { + // Just label any M.2 the boot disk. 
+ disk.zpool_name.kind() == ZpoolKind::Internal + } + } } pub fn identity(&self) -> &DiskIdentity { - &self.identity + match self { + Self::Real(disk) => &disk.identity, + Self::Synthetic(disk) => &disk.identity, + } } pub fn variant(&self) -> DiskVariant { - self.variant + match self { + Self::Real(disk) => disk.variant, + Self::Synthetic(disk) => match disk.zpool_name.kind() { + ZpoolKind::External => DiskVariant::U2, + ZpoolKind::Internal => DiskVariant::M2, + }, + } } pub fn devfs_path(&self) -> &Utf8PathBuf { - &self.paths.devfs_path + match self { + Self::Real(disk) => &disk.paths.devfs_path, + Self::Synthetic(_) => unreachable!(), + } } pub fn zpool_name(&self) -> &ZpoolName { - &self.zpool_name + match self { + Self::Real(disk) => &disk.zpool_name, + Self::Synthetic(disk) => &disk.zpool_name, + } } pub fn boot_image_devfs_path( &self, raw: bool, ) -> Result { - self.paths.partition_device_path( - &self.partitions, - Partition::BootImage, - raw, - ) + match self { + Self::Real(disk) => disk.paths.partition_device_path( + &disk.partitions, + Partition::BootImage, + raw, + ), + Self::Synthetic(_) => unreachable!(), + } } pub fn dump_device_devfs_path( &self, raw: bool, ) -> Result { - self.paths.partition_device_path( - &self.partitions, - Partition::DumpDevice, - raw, - ) + match self { + Self::Real(disk) => disk.paths.partition_device_path( + &disk.partitions, + Partition::DumpDevice, + raw, + ), + Self::Synthetic(_) => unreachable!(), + } } pub fn slot(&self) -> i64 { - self.slot - } -} - -impl From for Disk { - fn from(pd: PooledDisk) -> Self { - Self { - paths: pd.paths, - slot: pd.slot, - variant: pd.variant, - identity: pd.identity, - is_boot_disk: pd.is_boot_disk, - partitions: pd.partitions, - zpool_name: pd.zpool_name, + match self { + Self::Real(disk) => disk.slot, + Self::Synthetic(_) => unreachable!(), } } } diff --git a/sled-storage/src/dump_setup.rs b/sled-storage/src/dump_setup.rs index 5befa8e8c8..39c6aa2995 100644 --- a/sled-storage/src/dump_setup.rs +++ b/sled-storage/src/dump_setup.rs @@ -5,7 +5,7 @@ //! Dump dataset setup use crate::dataset::{CRASH_DATASET, DUMP_DATASET}; -use crate::disk::DiskWrapper; +use crate::disk::Disk; use camino::Utf8PathBuf; use derive_more::{AsRef, Deref, From}; use illumos_utils::dumpadm::DumpAdmError; @@ -106,50 +106,51 @@ const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); impl DumpSetup { pub(crate) async fn update_dumpdev_setup( &self, - disks: &mut MutexGuard<'_, HashMap>, + disks: &mut MutexGuard<'_, HashMap>, ) { let log = &self.log; let mut m2_dump_slices = Vec::new(); let mut u2_debug_datasets = Vec::new(); let mut m2_core_datasets = Vec::new(); - for (_id, disk_wrapper) in disks.iter() { - match disk_wrapper { - DiskWrapper::Real { disk, .. 
} => match disk.variant() { - DiskVariant::M2 => { - match disk.dump_device_devfs_path(false) { - Ok(path) => { - m2_dump_slices.push(DumpSlicePath(path)) - } - Err(err) => { - warn!(log, "Error getting dump device devfs path: {err:?}"); - } + for (_id, disk) in disks.iter() { + if disk.is_synthetic() { + // We only setup dump devices on real disks + continue; + } + match disk.variant() { + DiskVariant::M2 => { + match disk.dump_device_devfs_path(false) { + Ok(path) => m2_dump_slices.push(DumpSlicePath(path)), + Err(err) => { + warn!( + log, + "Error getting dump device devfs path: {err:?}" + ); } - let name = disk.zpool_name(); - if let Ok(info) = illumos_utils::zpool::Zpool::get_info( - &name.to_string(), - ) { - if info.health() == ZpoolHealth::Online { - m2_core_datasets.push(CoreZpool(name.clone())); - } else { - warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); - } + } + let name = disk.zpool_name(); + if let Ok(info) = + illumos_utils::zpool::Zpool::get_info(&name.to_string()) + { + if info.health() == ZpoolHealth::Online { + m2_core_datasets.push(CoreZpool(name.clone())); + } else { + warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); } } - DiskVariant::U2 => { - let name = disk.zpool_name(); - if let Ok(info) = illumos_utils::zpool::Zpool::get_info( - &name.to_string(), - ) { - if info.health() == ZpoolHealth::Online { - u2_debug_datasets - .push(DebugZpool(name.clone())); - } else { - warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); - } + } + DiskVariant::U2 => { + let name = disk.zpool_name(); + if let Ok(info) = + illumos_utils::zpool::Zpool::get_info(&name.to_string()) + { + if info.health() == ZpoolHealth::Online { + u2_debug_datasets.push(DebugZpool(name.clone())); + } else { + warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); } } - }, - DiskWrapper::Synthetic { .. } => {} + } } } diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index f5304262e4..efd4630c71 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -4,17 +4,16 @@ //! The storage manager task -use std::collections::{BTreeSet, HashSet}; +use std::collections::HashSet; -use crate::dataset::{self, DatasetError}; -use crate::disk::{Disk, DiskError, DiskWrapper}; +use crate::dataset::DatasetError; +use crate::disk::{Disk, DiskError, RawDisk}; use crate::error::Error; use crate::resources::StorageResources; -use derive_more::From; -use illumos_utils::zpool::{ZpoolKind, ZpoolName}; +use illumos_utils::zpool::ZpoolName; use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; -use sled_hardware::{DiskVariant, UnparsedDisk}; +use sled_hardware::DiskVariant; use slog::{debug, error, info, o, warn, Logger}; use tokio::sync::{mpsc, oneshot, watch}; use tokio::time::{interval, Duration, MissedTickBehavior}; @@ -31,11 +30,9 @@ pub enum StorageManagerState { } enum StorageRequest { - AddDisk(UnparsedDisk), - AddSyntheticDisk(ZpoolName), - RemoveDisk(UnparsedDisk), - RemoveSyntheticDisk(ZpoolName), - DisksChanged(HashSet), + AddDisk(RawDisk), + RemoveDisk(RawDisk), + DisksChanged(HashSet), // NewFilesystem(NewFilesystemRequest), KeyManagerReady, /// This will always grab the latest state after any new updates, as it @@ -52,27 +49,16 @@ pub struct StorageHandle { impl StorageHandle { /// Adds a disk and associated zpool to the storage manager. 
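/// Both hardware-backed and synthetic disks now arrive as a [`RawDisk`]; a
/// sketch of inserting a synthetic disk, assuming `handle` and a `zpool_name`
/// are in scope (the derived `From` impl on `RawDisk` supplies the `.into()`):
///
///     handle.upsert_disk(SyntheticDisk::new(zpool_name).into()).await;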
- pub async fn upsert_disk(&self, disk: UnparsedDisk) { + pub async fn upsert_disk(&self, disk: RawDisk) { self.tx.send(StorageRequest::AddDisk(disk)).await.unwrap(); } - /// Adds a synthetic disk backed by a zpool to the storage manager. - pub async fn upsert_synthetic_disk(&self, pool: ZpoolName) { - self.tx.send(StorageRequest::AddSyntheticDisk(pool)).await.unwrap(); - } - /// Removes a disk, if it's tracked by the storage manager, as well /// as any associated zpools. - pub async fn delete_disk(&self, disk: UnparsedDisk) { + pub async fn delete_disk(&self, disk: RawDisk) { self.tx.send(StorageRequest::RemoveDisk(disk)).await.unwrap(); } - /// Removes a synthetic disk, if it's tracked by the storage manager, as - /// well as any associated zpools. - pub async fn delete_synthetic_disk(&self, pool: ZpoolName) { - self.tx.send(StorageRequest::RemoveSyntheticDisk(pool)).await.unwrap(); - } - /// Ensures that the storage manager tracks exactly the provided disks. /// /// This acts similar to a batch [Self::upsert_disk] for all new disks, and @@ -80,14 +66,12 @@ impl StorageHandle { /// /// If errors occur, an arbitrary "one" of them will be returned, but a /// best-effort attempt to add all disks will still be attempted. - pub async fn ensure_using_exactly_these_disks(&self, unparsed_disks: I) + pub async fn ensure_using_exactly_these_disks(&self, raw_disks: I) where - I: IntoIterator, + I: IntoIterator, { self.tx - .send(StorageRequest::DisksChanged( - unparsed_disks.into_iter().collect(), - )) + .send(StorageRequest::DisksChanged(raw_disks.into_iter().collect())) .await .unwrap(); } @@ -142,8 +126,7 @@ pub struct StorageManager { state: StorageManagerState, rx: mpsc::Receiver, resources: StorageResources, - queued_u2_drives: HashSet, - queued_synthetic_u2_drives: HashSet, + queued_u2_drives: HashSet, key_requester: StorageKeyRequester, resource_updates: watch::Sender, } @@ -163,7 +146,6 @@ impl StorageManager { rx, resources, queued_u2_drives: HashSet::new(), - queued_synthetic_u2_drives: HashSet::new(), key_requester, resource_updates: update_tx, }, @@ -200,29 +182,14 @@ impl StorageManager { pub async fn step(&mut self) -> Result<(), Error> { // The sending side should never disappear match self.rx.recv().await.unwrap() { - StorageRequest::AddDisk(unparsed_disk) => { - match unparsed_disk.variant() { - DiskVariant::U2 => self.add_u2_disk(unparsed_disk).await?, - DiskVariant::M2 => self.add_m2_disk(unparsed_disk).await?, - } - } - StorageRequest::AddSyntheticDisk(zpool_name) => { - match zpool_name.kind() { - ZpoolKind::External => { - self.add_synthetic_u2_disk(zpool_name).await? - } - ZpoolKind::Internal => { - self.add_synthetic_m2_disk(zpool_name).await? 
- } - } - } - StorageRequest::RemoveDisk(unparsed_disk) => { - self.remove_disk(unparsed_disk).await; - } - StorageRequest::RemoveSyntheticDisk(pool) => { - self.remove_synthetic_disk(pool).await; + StorageRequest::AddDisk(raw_disk) => match raw_disk.variant() { + DiskVariant::U2 => self.add_u2_disk(raw_disk).await?, + DiskVariant::M2 => self.add_m2_disk(raw_disk).await?, + }, + StorageRequest::RemoveDisk(raw_disk) => { + self.remove_disk(raw_disk).await; } - StorageRequest::DisksChanged(_unparsed_disks) => todo!(), + StorageRequest::DisksChanged(_raw_disks) => todo!(), StorageRequest::KeyManagerReady => { self.state = StorageManagerState::Normal; self.add_queued_disks().await; @@ -267,59 +234,20 @@ impl StorageManager { } // Merge any requeued disks from transient errors with saved disks here self.queued_u2_drives.extend(saved); - - // Operate on queued synthetic disks - if self.state == StorageManagerState::QueuingDisks { - return; - } - - let mut saved = HashSet::new(); - let queued = std::mem::take(&mut self.queued_synthetic_u2_drives); - let mut iter = queued.into_iter(); - while let Some(zpool_name) = iter.next() { - if self.state == StorageManagerState::QueuingDisks { - // We hit a transient error in a prior iteration. - saved.insert(zpool_name); - } else { - // Try ot add the disk. If there was a transient error the disk will - // have been requeued. If there was a permanent error, it will have been - // dropped. If there is an another unexpected error, we will handle it and - // requeue ourselves. - if let Err(err) = - self.add_synthetic_u2_disk(zpool_name.clone()).await - { - warn!( - self.log, - "Potentially transient error: {err}: - requeing synthetic disk {:?}", - zpool_name - ); - saved.insert(zpool_name); - } - } - } - // Merge any requeued disks from transient errors with saved disks here - self.queued_synthetic_u2_drives.extend(saved); } // Add a real U.2 disk to [`StorageResources`] or queue it to be added later - async fn add_u2_disk( - &mut self, - unparsed_disk: UnparsedDisk, - ) -> Result<(), Error> { + async fn add_u2_disk(&mut self, raw_disk: RawDisk) -> Result<(), Error> { if self.state != StorageManagerState::Normal { - self.queued_u2_drives.insert(unparsed_disk); + self.queued_u2_drives.insert(raw_disk); return Ok(()); } - match Disk::new( - &self.log, - unparsed_disk.clone(), - Some(&self.key_requester), - ) - .await + match Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) + .await { Ok(disk) => { - if self.resources.insert_real_disk(disk)? { + if self.resources.insert_disk(disk)? 
{ let _ = self .resource_updates .send_replace(self.resources.clone()); @@ -329,9 +257,9 @@ impl StorageManager { Err(err @ DiskError::Dataset(DatasetError::KeyManager(_))) => { warn!( self.log, - "Transient error: {err} - queuing disk {:?}", unparsed_disk + "Transient error: {err} - queuing disk {:?}", raw_disk ); - self.queued_u2_drives.insert(unparsed_disk); + self.queued_u2_drives.insert(raw_disk); self.state = StorageManagerState::QueuingDisks; Ok(()) } @@ -339,7 +267,7 @@ impl StorageManager { error!( self.log, "Persistent error: {err} - not queueing disk {:?}", - unparsed_disk + raw_disk ); Ok(()) } @@ -351,119 +279,21 @@ impl StorageManager { // // We never queue M.2 drives, as they don't rely on [`KeyManager`] based // encryption - async fn add_m2_disk( - &mut self, - unparsed_disk: UnparsedDisk, - ) -> Result<(), Error> { - let disk = Disk::new( - &self.log, - unparsed_disk.clone(), - Some(&self.key_requester), - ) - .await?; - if self.resources.insert_real_disk(disk)? { + async fn add_m2_disk(&mut self, raw_disk: RawDisk) -> Result<(), Error> { + let disk = + Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) + .await?; + if self.resources.insert_disk(disk)? { let _ = self.resource_updates.send_replace(self.resources.clone()); } Ok(()) } - // Add a synthetic U.2 disk to [`StorageResources`] - // - // We never queue M.2 drives, as they don't rely on [`KeyManager`] based - // encryption - async fn add_synthetic_m2_disk( - &mut self, - zpool_name: ZpoolName, - ) -> Result<(), Error> { - let synthetic_id = DiskIdentity { - vendor: "fake_vendor".to_string(), - serial: "fake_serial".to_string(), - model: zpool_name.id().to_string(), - }; - - debug!(self.log, "Ensure zpool has datasets: {zpool_name}"); - dataset::ensure_zpool_has_datasets( - &self.log, - &zpool_name, - &synthetic_id, - Some(&self.key_requester), - ) - .await?; - if self.resources.insert_synthetic_disk(zpool_name)? { - let _ = self.resource_updates.send_replace(self.resources.clone()); - } - Ok(()) - } - - // Add a synthetic U.2 disk to [`StorageResources`] or queue it to be added - // later - async fn add_synthetic_u2_disk( - &mut self, - zpool_name: ZpoolName, - ) -> Result<(), Error> { - if self.state != StorageManagerState::Normal { - info!(self.log, "Queuing synthetic U.2 drive: {zpool_name}"); - self.queued_synthetic_u2_drives.insert(zpool_name); - return Ok(()); - } - - let synthetic_id = DiskIdentity { - vendor: "fake_vendor".to_string(), - serial: "fake_serial".to_string(), - model: zpool_name.id().to_string(), - }; - - debug!(self.log, "Ensure zpool has datasets: {zpool_name}"); - match dataset::ensure_zpool_has_datasets( - &self.log, - &zpool_name, - &synthetic_id, - Some(&self.key_requester), - ) - .await - { - Ok(()) => { - if self.resources.insert_synthetic_disk(zpool_name)? 
{ - let _ = self - .resource_updates - .send_replace(self.resources.clone()); - } - Ok(()) - } - Err(err @ DatasetError::KeyManager(_)) => { - warn!( - self.log, - "Transient error: {err} - queuing disk {:?}", synthetic_id - ); - self.queued_synthetic_u2_drives.insert(zpool_name); - self.state = StorageManagerState::QueuingDisks; - Ok(()) - } - Err(err) => { - error!( - self.log, - "Persistent error: {err} - not queueing disk {:?}", - synthetic_id - ); - Ok(()) - } - } - } - // Delete a real disk - async fn remove_disk(&mut self, unparsed_disk: UnparsedDisk) { - // If the disk is a U.2, we want to first delete it from any queued disks - let _ = self.queued_u2_drives.remove(&unparsed_disk); - if self.resources.remove_real_disk(unparsed_disk) { - let _ = self.resource_updates.send_replace(self.resources.clone()); - } - } - - // Delete a synthetic disk - async fn remove_synthetic_disk(&mut self, pool: ZpoolName) { + async fn remove_disk(&mut self, raw_disk: RawDisk) { // If the disk is a U.2, we want to first delete it from any queued disks - let _ = self.queued_synthetic_u2_drives.remove(&pool); - if self.resources.remove_synthetic_disk(pool) { + let _ = self.queued_u2_drives.remove(&raw_disk); + if self.resources.remove_disk(raw_disk) { let _ = self.resource_updates.send_replace(self.resources.clone()); } } @@ -473,9 +303,10 @@ impl StorageManager { /// systems. #[cfg(all(test, target_os = "illumos"))] mod tests { + use crate::disk::SyntheticDisk; + use super::*; use async_trait::async_trait; - use camino::{Utf8Path, Utf8PathBuf}; use camino_tempfile::tempdir; use illumos_utils::zpool::Zpool; use key_manager::{ @@ -483,7 +314,6 @@ mod tests { VersionedIkm, }; use omicron_test_utils::dev::test_setup_log; - use std::fs::File; use std::sync::{ atomic::{AtomicBool, Ordering}, Arc, @@ -532,21 +362,6 @@ mod tests { } } - // 64 MiB (min size of zpool) - const DISK_SIZE: u64 = 64 * 1024 * 1024; - - // Create a synthetic disk with a zpool backed by a file - fn new_disk(dir: &Utf8Path, zpool_name: &ZpoolName) -> Utf8PathBuf { - let path = dir.join(zpool_name.to_string()); - let file = File::create(&path).unwrap(); - file.set_len(DISK_SIZE).unwrap(); - drop(file); - Zpool::create(zpool_name, &path).unwrap(); - Zpool::import(zpool_name).unwrap(); - Zpool::set_failmode_continue(zpool_name).unwrap(); - path - } - #[tokio::test] async fn add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued() { let logctx = test_setup_log( @@ -556,25 +371,18 @@ mod tests { KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let raw_disk: RawDisk = SyntheticDisk::new(zpool_name).into(); assert_eq!(StorageManagerState::WaitingForKeyManager, manager.state); - manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); + manager.add_u2_disk(raw_disk.clone()).await.unwrap(); assert!(manager.resources.all_u2_zpools().is_empty()); - assert_eq!( - manager.queued_synthetic_u2_drives, - HashSet::from([zpool_name.clone()]) - ); + assert_eq!(manager.queued_u2_drives, HashSet::from([raw_disk.clone()])); - // Walk through other non-normal stages and enusre disk gets queued - for stage in [StorageManagerState::QueuingDisks] { - manager.queued_synthetic_u2_drives.clear(); - manager.state = stage; - manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); - assert!(manager.resources.all_u2_zpools().is_empty()); - assert_eq!( - manager.queued_synthetic_u2_drives, - 
HashSet::from([zpool_name.clone()]) - ); - } + // Check other non-normal stages and enusre disk gets queued + manager.queued_u2_drives.clear(); + manager.state = StorageManagerState::QueuingDisks; + manager.add_u2_disk(raw_disk.clone()).await.unwrap(); + assert!(manager.resources.all_u2_zpools().is_empty()); + assert_eq!(manager.queued_u2_drives, HashSet::from([raw_disk])); logctx.cleanup_successful(); } @@ -586,14 +394,14 @@ mod tests { let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); let zpool_name = ZpoolName::new_external(Uuid::new_v4()); let dir = tempdir().unwrap(); - let _ = new_disk(dir.path(), &zpool_name); + let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); // Spawn the key_manager so that it will respond to requests for encryption keys tokio::spawn(async move { key_manager.run().await }); // Set the stage to pretend we've progressed enough to have a key_manager available. manager.state = StorageManagerState::Normal; - manager.add_synthetic_u2_disk(zpool_name.clone()).await.unwrap(); + manager.add_u2_disk(disk).await.unwrap(); assert_eq!(manager.resources.all_u2_zpools().len(), 1); Zpool::destroy(&zpool_name).unwrap(); logctx.cleanup_successful(); @@ -617,9 +425,9 @@ mod tests { // Create a synthetic internal disk let zpool_name = ZpoolName::new_internal(Uuid::new_v4()); let dir = tempdir().unwrap(); - let _ = new_disk(dir.path(), &zpool_name); + let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); - handle.upsert_synthetic_disk(zpool_name.clone()).await; + handle.upsert_disk(disk).await; handle.wait_for_boot_disk().await; Zpool::destroy(&zpool_name).unwrap(); logctx.cleanup_successful(); @@ -645,8 +453,8 @@ mod tests { // the `KeyManager` is ready yet. let zpool_name = ZpoolName::new_external(Uuid::new_v4()); let dir = tempdir().unwrap(); - let _ = new_disk(dir.path(), &zpool_name); - handle.upsert_synthetic_disk(zpool_name.clone()).await; + let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + handle.upsert_disk(disk).await; let resources = handle.get_latest_resources().await; assert!(resources.all_u2_zpools().is_empty()); @@ -682,8 +490,8 @@ mod tests { // the `KeyManager` is ready yet. 
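        // `create_zpool` replaces the old `new_disk` helper: it backs the
        // zpool with a small file in the temp dir and imports it, so no
        // real device is needed for this test.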
let zpool_name = ZpoolName::new_external(Uuid::new_v4()); let dir = tempdir().unwrap(); - let _ = new_disk(dir.path(), &zpool_name); - handle.upsert_synthetic_disk(zpool_name.clone()).await; + let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + handle.upsert_disk(disk).await; manager.step().await.unwrap(); // We can't wait for a reply through the handle as the storage manager task @@ -737,15 +545,16 @@ mod tests { // Create and add a disk let zpool_name = ZpoolName::new_external(Uuid::new_v4()); let dir = tempdir().unwrap(); - let _ = new_disk(dir.path(), &zpool_name); - handle.upsert_synthetic_disk(zpool_name.clone()).await; + let disk: RawDisk = + SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + handle.upsert_disk(disk.clone()).await; // Wait for the add disk notification let resources = handle.wait_for_changes().await; assert_eq!(resources.all_u2_zpools().len(), 1); // Delete the disk and wait for a notification - handle.delete_synthetic_disk(zpool_name.clone()).await; + handle.delete_disk(disk).await; let resources = handle.wait_for_changes().await; assert!(resources.all_u2_zpools().is_empty()); diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 82c588bd27..07ee0a1ea5 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -5,12 +5,11 @@ //! Discovered and usable disks and zpools use crate::dataset::M2_DEBUG_DATASET; -use crate::disk::{Disk, DiskWrapper}; +use crate::disk::{Disk, RawDisk}; use crate::error::Error; use crate::pool::Pool; use camino::Utf8PathBuf; use illumos_utils::zpool::ZpoolName; -use omicron_common::api::external::{ByteCount, ByteCountRangeError}; use omicron_common::disk::DiskIdentity; use sled_hardware::{DiskVariant, UnparsedDisk}; use std::collections::BTreeMap; @@ -41,7 +40,7 @@ const ZONE_BUNDLE_DIRECTORY: &str = "zone"; #[derive(Debug, Clone, Default)] pub struct StorageResources { // All disks, real and synthetic, being managed by this sled - disks: Arc>, + disks: Arc>, // A map of "Uuid" to "pool". 
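    // Keyed by the pool's UUID (`ZpoolName::id()`); the value is the `Pool`
    // wrapper that caches the pool's `ZpoolInfo` and its parent disk's
    // `DiskIdentity`.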
pools: Arc>, @@ -51,42 +50,15 @@ impl StorageResources { /// Insert a disk and its zpool /// /// Return true, if data was changed, false otherwise - pub(crate) fn insert_real_disk( - &mut self, - disk: Disk, - ) -> Result { + pub(crate) fn insert_disk(&mut self, disk: Disk) -> Result { let parent = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); - let disk = DiskWrapper::Real { - disk: disk.clone(), - devfs_path: disk.devfs_path().clone(), - }; if let Some(stored) = self.disks.get(&parent) { if stored == &disk { return Ok(false); } } - Arc::make_mut(&mut self.disks).insert(disk.identity(), disk); - let zpool = Pool::new(zpool_name, parent)?; - Arc::make_mut(&mut self.pools).insert(zpool.name.id(), zpool); - Ok(true) - } - - /// Insert a synthetic disk and its zpool - /// - /// Return true, if data was changed, false otherwise - pub(crate) fn insert_synthetic_disk( - &mut self, - zpool_name: ZpoolName, - ) -> Result { - let disk = DiskWrapper::Synthetic { zpool_name: zpool_name.clone() }; - let parent = disk.identity().clone(); - if let Some(stored) = self.disks.get(&parent) { - if stored == &disk { - return Ok(false); - } - } - Arc::make_mut(&mut self.disks).insert(disk.identity(), disk); + Arc::make_mut(&mut self.disks).insert(disk.identity().clone(), disk); let zpool = Pool::new(zpool_name, parent)?; Arc::make_mut(&mut self.pools).insert(zpool.name.id(), zpool); Ok(true) @@ -95,7 +67,7 @@ impl StorageResources { /// Delete a real disk and its zpool /// /// Return true, if data was changed, false otherwise - pub(crate) fn remove_real_disk(&mut self, disk: UnparsedDisk) -> bool { + pub(crate) fn remove_disk(&mut self, disk: RawDisk) -> bool { if !self.disks.contains_key(disk.identity()) { return false; } @@ -106,47 +78,16 @@ impl StorageResources { true } - /// Delete a synthetic disk and its zpool - /// - /// Return true, if data was changed, false otherwise - pub(crate) fn remove_synthetic_disk( - &mut self, - zpool_name: ZpoolName, - ) -> bool { - let disk = DiskWrapper::Synthetic { zpool_name: zpool_name.clone() }; - if !self.disks.contains_key(&disk.identity()) { - return false; - } - // Safe to unwrap as we just checked the key existed above - let parsed_disk = - Arc::make_mut(&mut self.disks).remove(&disk.identity()).unwrap(); - Arc::make_mut(&mut self.pools).remove(&parsed_disk.zpool_name().id()); - true - } - /// Returns the identity of the boot disk. /// /// If this returns `None`, we have not processed the boot disk yet. pub fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { - self.disks.iter().find_map(|(id, disk)| { - match disk { - // This is the "real" use-case: if we have real disks, query - // their properties to identify if they truly are the boot disk. - DiskWrapper::Real { disk, .. } => { - if disk.is_boot_disk() { - return Some((id.clone(), disk.zpool_name().clone())); - } - } - // This is the "less real" use-case: if we have synthetic disks, - // just label the first M.2-looking one as a "boot disk". - DiskWrapper::Synthetic { .. } => { - if matches!(disk.variant(), DiskVariant::M2) { - return Some((id.clone(), disk.zpool_name().clone())); - } - } - }; - None - }) + for (id, disk) in self.disks.iter() { + if disk.is_boot_disk() { + return Some((id.clone(), disk.zpool_name().clone())); + } + } + None } /// Returns all M.2 zpools pub fn all_m2_zpools(&self) -> Vec { From 98cc812ebf7f437f17aea17e65055be12fce269d Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Wed, 4 Oct 2023 19:42:37 +0000 Subject: [PATCH 16/66] wip --- illumos-utils/src/zpool.rs | 2 +- sled-storage/src/disk.rs | 11 +++ sled-storage/src/manager.rs | 170 +++++++++++++++++++++++++--------- sled-storage/src/pool.rs | 2 +- sled-storage/src/resources.rs | 44 ++++----- 5 files changed, 160 insertions(+), 69 deletions(-) diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index 68d5ebd3a2..f0916b236a 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -96,7 +96,7 @@ impl FromStr for ZpoolHealth { } /// Describes a Zpool. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct ZpoolInfo { name: String, size: u64, diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs index 640d4e77f4..6121b267d0 100644 --- a/sled-storage/src/disk.rs +++ b/sled-storage/src/disk.rs @@ -97,6 +97,17 @@ impl RawDisk { }, } } + + pub fn is_synthetic(&self) -> bool { + match self { + Self::Real(_) => false, + Self::Synthetic(_) => true, + } + } + + pub fn is_real(&self) -> bool { + !self.is_synthetic() + } } /// A physical [`PooledDisk`] or a [`SyntheticDisk`] that contains or is backed diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index efd4630c71..7355530f2e 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -6,7 +6,7 @@ use std::collections::HashSet; -use crate::dataset::DatasetError; +use crate::dataset::{DatasetError, DatasetName}; use crate::disk::{Disk, DiskError, RawDisk}; use crate::error::Error; use crate::resources::StorageResources; @@ -17,6 +17,7 @@ use sled_hardware::DiskVariant; use slog::{debug, error, info, o, warn, Logger}; use tokio::sync::{mpsc, oneshot, watch}; use tokio::time::{interval, Duration, MissedTickBehavior}; +use uuid::Uuid; // The size of the mpsc bounded channel used to communicate // between the `StorageHandle` and `StorageManager`. @@ -29,11 +30,18 @@ pub enum StorageManagerState { Normal, } +#[derive(Debug)] +struct NewFilesystemRequest { + dataset_id: Uuid, + dataset_name: DatasetName, + responder: oneshot::Sender>, +} + enum StorageRequest { AddDisk(RawDisk), RemoveDisk(RawDisk), DisksChanged(HashSet), - // NewFilesystem(NewFilesystemRequest), + NewFilesystem(NewFilesystemRequest), KeyManagerReady, /// This will always grab the latest state after any new updates, as it /// serializes through the `StorageManager` task. @@ -170,7 +178,9 @@ impl StorageManager { _ = interval.tick(), if self.state == StorageManagerState::QueuingDisks => { - self.add_queued_disks().await; + if self.add_queued_disks().await { + let _ = self.resource_updates.send_replace(self.resources.clone()); + } } } } @@ -181,23 +191,31 @@ impl StorageManager { /// This is useful for testing/debugging pub async fn step(&mut self) -> Result<(), Error> { // The sending side should never disappear - match self.rx.recv().await.unwrap() { - StorageRequest::AddDisk(raw_disk) => match raw_disk.variant() { - DiskVariant::U2 => self.add_u2_disk(raw_disk).await?, - DiskVariant::M2 => self.add_m2_disk(raw_disk).await?, - }, + let should_send_updates = match self.rx.recv().await.unwrap() { + StorageRequest::AddDisk(raw_disk) => { + self.add_disk(raw_disk).await? 
+ } StorageRequest::RemoveDisk(raw_disk) => { - self.remove_disk(raw_disk).await; + self.remove_disk(raw_disk).await } - StorageRequest::DisksChanged(_raw_disks) => todo!(), + StorageRequest::DisksChanged(raw_disks) => { + self.ensure_using_exactly_these_disks(raw_disks).await + } + StorageRequest::NewFilesystem(_req) => todo!(), StorageRequest::KeyManagerReady => { self.state = StorageManagerState::Normal; - self.add_queued_disks().await; + self.add_queued_disks().await } StorageRequest::GetLatestResources(tx) => { let _ = tx.send(self.resources.clone()); + false } + }; + + if should_send_updates { + let _ = self.resource_updates.send_replace(self.resources.clone()); } + Ok(()) } @@ -205,9 +223,12 @@ impl StorageManager { // unless we hit a transient error. If we hit a transient error, we return // and wait for the next retry window to re-call this method. If we hit a // permanent error we log it, but we continue inserting queued disks. - async fn add_queued_disks(&mut self) { + // + // Return true if updates should be sent to watchers, false otherwise + async fn add_queued_disks(&mut self) -> bool { self.state = StorageManagerState::Normal; - // Operate on queued real disks + + let mut send_updates = false; // Disks that should be requeued. let mut saved = HashSet::new(); @@ -222,80 +243,137 @@ impl StorageManager { // have been requeued. If there was a permanent error, it will have been // dropped. If there is an another unexpected error, we will handle it and // requeue ourselves. - if let Err(err) = self.add_u2_disk(disk.clone()).await { - warn!( - self.log, - "Potentially transient error: {err}: - requeing disk {:?}", - disk - ); - saved.insert(disk); + match self.add_u2_disk(disk.clone()).await { + Err(err) => { + warn!( + self.log, + "Potentially transient error: {err}: requeuing disk"; + "disk_id" => ?disk.identity() + ); + saved.insert(disk); + } + Ok(true) => send_updates = true, + Ok(false) => (), } } } // Merge any requeued disks from transient errors with saved disks here self.queued_u2_drives.extend(saved); + send_updates + } + + // Add a disk to `StorageResources` if it is new and return Ok(true) if so + async fn add_disk(&mut self, raw_disk: RawDisk) -> Result { + match raw_disk.variant() { + DiskVariant::U2 => self.add_u2_disk(raw_disk).await, + DiskVariant::M2 => self.add_m2_disk(raw_disk).await, + } } - // Add a real U.2 disk to [`StorageResources`] or queue it to be added later - async fn add_u2_disk(&mut self, raw_disk: RawDisk) -> Result<(), Error> { + // Add a U.2 disk to [`StorageResources`] or queue it to be added later + async fn add_u2_disk(&mut self, raw_disk: RawDisk) -> Result { if self.state != StorageManagerState::Normal { self.queued_u2_drives.insert(raw_disk); - return Ok(()); + return Ok(false); } match Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) .await { - Ok(disk) => { - if self.resources.insert_disk(disk)? 
{ - let _ = self - .resource_updates - .send_replace(self.resources.clone()); - } - Ok(()) - } + Ok(disk) => self.resources.insert_disk(disk), Err(err @ DiskError::Dataset(DatasetError::KeyManager(_))) => { warn!( self.log, - "Transient error: {err} - queuing disk {:?}", raw_disk + "Transient error: {err}: queuing disk"; + "disk_id" => ?raw_disk.identity() ); self.queued_u2_drives.insert(raw_disk); self.state = StorageManagerState::QueuingDisks; - Ok(()) + Ok(false) } Err(err) => { error!( self.log, - "Persistent error: {err} - not queueing disk {:?}", - raw_disk + "Persistent error: {err}: not queueing disk"; + "disk_id" => ?raw_disk.identity() ); - Ok(()) + Ok(false) } } } - // Add a real U.2 disk to [`StorageResources`] + // Add a U.2 disk to [`StorageResources`] if new and return `Ok(true)` if so // // // We never queue M.2 drives, as they don't rely on [`KeyManager`] based // encryption - async fn add_m2_disk(&mut self, raw_disk: RawDisk) -> Result<(), Error> { + async fn add_m2_disk(&mut self, raw_disk: RawDisk) -> Result { let disk = Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) .await?; - if self.resources.insert_disk(disk)? { - let _ = self.resource_updates.send_replace(self.resources.clone()); - } - Ok(()) + self.resources.insert_disk(disk) } - // Delete a real disk - async fn remove_disk(&mut self, raw_disk: RawDisk) { + // Delete a real disk and return `true` if the disk was actually removed + async fn remove_disk(&mut self, raw_disk: RawDisk) -> bool { // If the disk is a U.2, we want to first delete it from any queued disks let _ = self.queued_u2_drives.remove(&raw_disk); - if self.resources.remove_disk(raw_disk) { - let _ = self.resource_updates.send_replace(self.resources.clone()); + self.resources.remove_disk(raw_disk.identity()) + } + + // Find all disks to remove that are not in raw_disks and remove them Then + // take the remaining disks and try to add them all. `StorageResources` will + // inform us if anything changed, and if so we return true, otherwise we + // return false. + async fn ensure_using_exactly_these_disks( + &mut self, + raw_disks: HashSet, + ) -> bool { + let mut should_update = false; + + // Clear out any queued U.2 disks that are real. + // We keep synthetic disks, as they are only added once. 
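+        // After this `retain`, anything still queued is synthetic; real
+        // disks are reconciled against `raw_disks` by the remove/add passes
+        // below (and re-queued there if the key manager is not ready yet).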
+ self.queued_u2_drives.retain(|d| d.is_synthetic()); + + let all_ids: HashSet<_> = + raw_disks.iter().map(|d| d.identity()).collect(); + + // Find all existing disks not in the current set + let to_remove: Vec = self + .resources + .disks + .keys() + .filter_map(|id| { + if !all_ids.contains(id) { + Some(id.clone()) + } else { + None + } + }) + .collect(); + + for id in to_remove { + if self.resources.remove_disk(&id) { + should_update = true; + } + } + + for raw_disk in raw_disks { + let disk_id = raw_disk.identity().clone(); + match self.add_disk(raw_disk).await { + Ok(true) => should_update = true, + Ok(false) => (), + Err(err) => { + warn!( + self.log, + "Failed to add disk to storage resources: {err}"; + "disk_id" => ?disk_id + ); + } + } } + + should_update } } diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs index 203738b16a..76eabedd12 100644 --- a/sled-storage/src/pool.rs +++ b/sled-storage/src/pool.rs @@ -9,7 +9,7 @@ use illumos_utils::zpool::{Zpool, ZpoolInfo, ZpoolName}; use omicron_common::disk::DiskIdentity; /// A ZFS storage pool -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct Pool { pub name: ZpoolName, pub info: ZpoolInfo, diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 07ee0a1ea5..8d739fae3f 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -25,25 +25,25 @@ const ZONE_BUNDLE_DIRECTORY: &str = "zone"; /// Storage related resources: disks and zpools /// /// This state is internal to the [`crate::StorageManager`] task. Clones -/// of this state, or subsets of it, can be retrieved by requests to the -/// `StorageManager` task from the [`crate::StorageManagerHandle`]. This state -/// is not `Sync`, and as such does not require any mutexes. However, we do -/// expect to share it relatively frequently, and we want copies of it to be -/// as cheaply made as possible. So any large state is stored inside `Arc`s. On -/// the other hand, we expect infrequent updates to this state, and as such, we -/// use [`std::sync::Arc::make_mut`] to implement clone on write functionality +/// of this state can be retrieved by requests to the `StorageManager` task +/// from the [`crate::StorageManagerHandle`]. This state is not `Sync`, and +/// as such does not require any mutexes. However, we do expect to share it +/// relatively frequently, and we want copies of it to be as cheaply made +/// as possible. So any large state is stored inside `Arc`s. On the other +/// hand, we expect infrequent updates to this state, and as such, we use +/// [`std::sync::Arc::make_mut`] to implement clone on write functionality /// inside the `StorageManager` task if there are any outstanding copies. /// Therefore, we only pay the cost to update infrequently, and no locks are /// required by callers when operating on cloned data. The only contention here -/// is for the refrence counters of the internal Arcs when `StorageResources` gets cloned -/// or dropped. +/// is for the refrence counters of the internal Arcs when `StorageResources` +/// gets cloned or dropped. #[derive(Debug, Clone, Default)] pub struct StorageResources { // All disks, real and synthetic, being managed by this sled - disks: Arc>, + pub disks: Arc>, // A map of "Uuid" to "pool". 
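    // Both maps are `pub` so the manager task and the tests can inspect
    // them directly, e.g. `resources.disks.keys()` in
    // `ensure_using_exactly_these_disks`.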
- pools: Arc>, + pub pools: Arc>, } impl StorageResources { @@ -51,15 +51,18 @@ impl StorageResources { /// /// Return true, if data was changed, false otherwise pub(crate) fn insert_disk(&mut self, disk: Disk) -> Result { - let parent = disk.identity().clone(); + let disk_id = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); - if let Some(stored) = self.disks.get(&parent) { - if stored == &disk { - return Ok(false); + let zpool = Pool::new(zpool_name, disk_id.clone())?; + if let Some(stored_disk) = self.disks.get(&disk_id) { + if let Some(stored_pool) = self.pools.get(&zpool.name.id()) { + if stored_disk == &disk && stored_pool == &zpool { + return Ok(false); + } } } - Arc::make_mut(&mut self.disks).insert(disk.identity().clone(), disk); - let zpool = Pool::new(zpool_name, parent)?; + // Either the disk or zpool changed + Arc::make_mut(&mut self.disks).insert(disk_id, disk); Arc::make_mut(&mut self.pools).insert(zpool.name.id(), zpool); Ok(true) } @@ -67,13 +70,12 @@ impl StorageResources { /// Delete a real disk and its zpool /// /// Return true, if data was changed, false otherwise - pub(crate) fn remove_disk(&mut self, disk: RawDisk) -> bool { - if !self.disks.contains_key(disk.identity()) { + pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) -> bool { + if !self.disks.contains_key(id) { return false; } // Safe to unwrap as we just checked the key existed above - let parsed_disk = - Arc::make_mut(&mut self.disks).remove(disk.identity()).unwrap(); + let parsed_disk = Arc::make_mut(&mut self.disks).remove(id).unwrap(); Arc::make_mut(&mut self.pools).remove(&parsed_disk.zpool_name().id()); true } From 8c38e8d75ece2b8b0d13a7d5a9610e19f4d7a292 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 4 Oct 2023 23:38:51 +0000 Subject: [PATCH 17/66] wip --- sled-storage/src/disk.rs | 12 ++- sled-storage/src/manager.rs | 144 +++++++++++++++++++++++++++++++++- sled-storage/src/resources.rs | 9 ++- 3 files changed, 160 insertions(+), 5 deletions(-) diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs index 6121b267d0..f5a0e60c8f 100644 --- a/sled-storage/src/disk.rs +++ b/sled-storage/src/disk.rs @@ -29,8 +29,8 @@ pub enum DiskError { // by a zpool #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct SyntheticDisk { - identity: DiskIdentity, - zpool_name: ZpoolName, + pub identity: DiskIdentity, + pub zpool_name: ZpoolName, } impl SyntheticDisk { @@ -98,6 +98,14 @@ impl RawDisk { } } + #[cfg(test)] + pub fn zpool_name(&self) -> &ZpoolName { + match self { + Self::Real(_) => unreachable!(), + Self::Synthetic(disk) => &disk.zpool_name, + } + } + pub fn is_synthetic(&self) -> bool { match self { Self::Real(_) => false, diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 7355530f2e..51706c5021 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -23,7 +23,7 @@ use uuid::Uuid; // between the `StorageHandle` and `StorageManager`. const QUEUE_SIZE: usize = 256; -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum StorageManagerState { WaitingForKeyManager, QueuingDisks, @@ -47,6 +47,17 @@ enum StorageRequest { /// serializes through the `StorageManager` task. /// This serialization is particularly useful for tests. 
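    ///
    /// A rough sketch of the two ways callers read state (illustrative
    /// only; it assumes a `StorageHandle` named `handle`):
    ///
    /// ```ignore
    /// // Serialized request/response: reflects every request handled so far.
    /// let snapshot = handle.get_latest_resources().await;
    ///
    /// // Watch-based: resolves only once the next update is published.
    /// let next = handle.wait_for_changes().await;
    /// ```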
GetLatestResources(oneshot::Sender), + + /// Get the internal task state of the manager + GetManagerState(oneshot::Sender), +} + +/// Data managed internally to the StorageManagerTask that can be useful +/// to clients for debugging purposes, and that isn't exposed in other ways. +#[derive(Debug, Clone)] +pub struct StorageManagerData { + state: StorageManagerState, + queued_u2_drives: HashSet, } /// A mechanism for interacting with the [`StorageManager`] @@ -124,6 +135,13 @@ impl StorageHandle { self.tx.send(StorageRequest::GetLatestResources(tx)).await.unwrap(); rx.await.unwrap() } + + /// Return internal data useful for debugging and testing + pub async fn get_manager_state(&mut self) -> StorageManagerData { + let (tx, rx) = oneshot::channel(); + self.tx.send(StorageRequest::GetManagerState(tx)).await.unwrap(); + rx.await.unwrap() + } } /// The storage manager responsible for the state of the storage @@ -210,6 +228,13 @@ impl StorageManager { let _ = tx.send(self.resources.clone()); false } + StorageRequest::GetManagerState(tx) => { + let _ = tx.send(StorageManagerData { + state: self.state, + queued_u2_drives: self.queued_u2_drives.clone(), + }); + false + } }; if should_send_updates { @@ -639,4 +664,121 @@ mod tests { Zpool::destroy(&zpool_name).unwrap(); logctx.cleanup_successful(); } + + #[tokio::test] + async fn ensure_using_exactly_these_disks() { + let logctx = test_setup_log("ensure_using_exactly_these_disks"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (mut manager, mut handle) = + StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Create a bunch of file backed external disks with zpools + let dir = tempdir().unwrap(); + let zpools: Vec = + (0..10).map(|_| ZpoolName::new_external(Uuid::new_v4())).collect(); + let disks: Vec = zpools + .iter() + .map(|zpool_name| { + SyntheticDisk::create_zpool(dir.path(), zpool_name).into() + }) + .collect(); + + // Add the first 3 disks, and ensure they get queued, as we haven't + // marked our key manager ready yet + handle + .ensure_using_exactly_these_disks(disks.iter().take(3).cloned()) + .await; + let state = handle.get_manager_state().await; + assert_eq!(state.queued_u2_drives.len(), 3); + assert_eq!(state.state, StorageManagerState::WaitingForKeyManager); + assert!(handle.get_latest_resources().await.all_u2_zpools().is_empty()); + + // Mark the key manager ready and wait for the storage update + handle.key_manager_ready().await; + let resources = handle.wait_for_changes().await; + let expected: HashSet<_> = + disks.iter().take(3).map(|d| d.identity()).collect(); + let actual: HashSet<_> = resources.disks.keys().collect(); + assert_eq!(expected, actual); + + // Add first three disks after the initial one. The returned resources + // should not contain the first disk. 
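+        // In other words, pass disks[1..=3]: disks[0] drops out because it
+        // is no longer requested, while disks[3] is newly added.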
+ handle + .ensure_using_exactly_these_disks( + disks.iter().skip(1).take(3).cloned(), + ) + .await; + let resources = handle.wait_for_changes().await; + let expected: HashSet<_> = + disks.iter().skip(1).take(3).map(|d| d.identity()).collect(); + let actual: HashSet<_> = resources.disks.keys().collect(); + assert_eq!(expected, actual); + + // Ensure the same set of disks and make sure no change occurs + // Note that we directly request the resources this time so we aren't + // waiting forever for a change notification. + handle + .ensure_using_exactly_these_disks( + disks.iter().skip(1).take(3).cloned(), + ) + .await; + let resources2 = handle.get_latest_resources().await; + assert_eq!(resources, resources2); + + // Add a disjoint set of disks and see that only they come through + handle + .ensure_using_exactly_these_disks( + disks.iter().skip(4).take(5).cloned(), + ) + .await; + let resources = handle.wait_for_changes().await; + let expected: HashSet<_> = + disks.iter().skip(4).take(5).map(|d| d.identity()).collect(); + let actual: HashSet<_> = resources.disks.keys().collect(); + assert_eq!(expected, actual); + + // Finally, change the zpool backing of the 5th disk to be that of the 10th + // and ensure that disk changes. Note that we don't change the identity + // of the 5th disk. + let mut modified_disk = disks[4].clone(); + if let RawDisk::Synthetic(disk) = &mut modified_disk { + disk.zpool_name = disks[9].zpool_name().clone(); + } else { + panic!(); + } + let mut expected: HashSet<_> = + disks.iter().skip(5).take(4).cloned().collect(); + expected.insert(modified_disk); + + handle + .ensure_using_exactly_these_disks(expected.clone().into_iter()) + .await; + let resources = handle.wait_for_changes().await; + + // Ensure the one modified disk changed as we expected + assert_eq!(5, resources.disks.len()); + //assert_eq!(5, resources.pools.len()); + for raw_disk in expected { + let disk = resources.disks.get(raw_disk.identity()).unwrap(); + assert_eq!(disk.zpool_name(), raw_disk.zpool_name()); + let pool = resources.pools.get(&disk.zpool_name().id()).unwrap(); + assert_eq!(&pool.name, disk.zpool_name()); + assert_eq!(raw_disk.identity(), &pool.parent); + } + + // Cleanup + for zpool in zpools { + Zpool::destroy(&zpool).unwrap(); + } + logctx.cleanup_successful(); + } } diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 8d739fae3f..212e9ad397 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -37,7 +37,7 @@ const ZONE_BUNDLE_DIRECTORY: &str = "zone"; /// required by callers when operating on cloned data. The only contention here /// is for the refrence counters of the internal Arcs when `StorageResources` /// gets cloned or dropped. -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, Default, PartialEq, Eq)] pub struct StorageResources { // All disks, real and synthetic, being managed by this sled pub disks: Arc>, @@ -56,9 +56,14 @@ impl StorageResources { let zpool = Pool::new(zpool_name, disk_id.clone())?; if let Some(stored_disk) = self.disks.get(&disk_id) { if let Some(stored_pool) = self.pools.get(&zpool.name.id()) { - if stored_disk == &disk && stored_pool == &zpool { + if stored_disk == &disk + && stored_pool.info.size() == zpool.info.size() + && stored_pool.name == zpool.name + { return Ok(false); } + } else { + // We must delete the stored pool which no longer matches our disk } } // Either the disk or zpool changed From b2f3146f8d26d89c31dfc667600e36b4a0c2190c Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Wed, 4 Oct 2023 23:46:29 +0000 Subject: [PATCH 18/66] wip --- sled-storage/src/manager.rs | 5 ++--- sled-storage/src/pool.rs | 3 ++- sled-storage/src/resources.rs | 31 +++++++++++-------------------- 3 files changed, 15 insertions(+), 24 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 51706c5021..c214566ecb 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -766,11 +766,10 @@ mod tests { // Ensure the one modified disk changed as we expected assert_eq!(5, resources.disks.len()); - //assert_eq!(5, resources.pools.len()); for raw_disk in expected { - let disk = resources.disks.get(raw_disk.identity()).unwrap(); + let (disk, pool) = + resources.disks.get(raw_disk.identity()).unwrap(); assert_eq!(disk.zpool_name(), raw_disk.zpool_name()); - let pool = resources.pools.get(&disk.zpool_name().id()).unwrap(); assert_eq!(&pool.name, disk.zpool_name()); assert_eq!(raw_disk.identity(), &pool.parent); } diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs index 76eabedd12..bac851df46 100644 --- a/sled-storage/src/pool.rs +++ b/sled-storage/src/pool.rs @@ -8,7 +8,8 @@ use crate::error::Error; use illumos_utils::zpool::{Zpool, ZpoolInfo, ZpoolName}; use omicron_common::disk::DiskIdentity; -/// A ZFS storage pool +/// A ZFS storage pool wrapper that tracks information returned from +/// `zpool` commands #[derive(Debug, Clone, PartialEq, Eq)] pub struct Pool { pub name: ZpoolName, diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 212e9ad397..2b9e7cffae 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -40,10 +40,7 @@ const ZONE_BUNDLE_DIRECTORY: &str = "zone"; #[derive(Debug, Clone, Default, PartialEq, Eq)] pub struct StorageResources { // All disks, real and synthetic, being managed by this sled - pub disks: Arc>, - - // A map of "Uuid" to "pool". - pub pools: Arc>, + pub disks: Arc>, } impl StorageResources { @@ -54,21 +51,16 @@ impl StorageResources { let disk_id = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); let zpool = Pool::new(zpool_name, disk_id.clone())?; - if let Some(stored_disk) = self.disks.get(&disk_id) { - if let Some(stored_pool) = self.pools.get(&zpool.name.id()) { - if stored_disk == &disk - && stored_pool.info.size() == zpool.info.size() - && stored_pool.name == zpool.name - { - return Ok(false); - } - } else { - // We must delete the stored pool which no longer matches our disk + if let Some((stored_disk, stored_pool)) = self.disks.get(&disk_id) { + if stored_disk == &disk + && stored_pool.info.size() == zpool.info.size() + && stored_pool.name == zpool.name + { + return Ok(false); } } // Either the disk or zpool changed - Arc::make_mut(&mut self.disks).insert(disk_id, disk); - Arc::make_mut(&mut self.pools).insert(zpool.name.id(), zpool); + Arc::make_mut(&mut self.disks).insert(disk_id, (disk, zpool)); Ok(true) } @@ -80,8 +72,7 @@ impl StorageResources { return false; } // Safe to unwrap as we just checked the key existed above - let parsed_disk = Arc::make_mut(&mut self.disks).remove(id).unwrap(); - Arc::make_mut(&mut self.pools).remove(&parsed_disk.zpool_name().id()); + Arc::make_mut(&mut self.disks).remove(id).unwrap(); true } @@ -89,7 +80,7 @@ impl StorageResources { /// /// If this returns `None`, we have not processed the boot disk yet. 
pub fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { - for (id, disk) in self.disks.iter() { + for (id, (disk, _)) in self.disks.iter() { if disk.is_boot_disk() { return Some((id.clone(), disk.zpool_name().clone())); } @@ -126,7 +117,7 @@ impl StorageResources { pub fn all_zpools(&self, variant: DiskVariant) -> Vec { self.disks .values() - .filter_map(|disk| { + .filter_map(|(disk, _)| { if disk.variant() == variant { return Some(disk.zpool_name().clone()); } From 1352bcf41b43990e3eb086fdf383923eb135352b Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 5 Oct 2023 00:28:57 +0000 Subject: [PATCH 19/66] wip --- sled-storage/src/manager.rs | 66 +++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index c214566ecb..d74a7ada0a 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -10,6 +10,8 @@ use crate::dataset::{DatasetError, DatasetName}; use crate::disk::{Disk, DiskError, RawDisk}; use crate::error::Error; use crate::resources::StorageResources; +use camino::Utf8PathBuf; +use illumos_utils::zfs::{Mountpoint, Zfs}; use illumos_utils::zpool::ZpoolName; use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; @@ -34,7 +36,7 @@ pub enum StorageManagerState { struct NewFilesystemRequest { dataset_id: Uuid, dataset_name: DatasetName, - responder: oneshot::Sender>, + responder: oneshot::Sender>, } enum StorageRequest { @@ -219,7 +221,14 @@ impl StorageManager { StorageRequest::DisksChanged(raw_disks) => { self.ensure_using_exactly_these_disks(raw_disks).await } - StorageRequest::NewFilesystem(_req) => todo!(), + StorageRequest::NewFilesystem(request) => { + let result = self.add_dataset(&request).await; + if result.is_err() { + warn!(self.log, "{result:?}"); + } + let _ = request.responder.send(result); + false + } StorageRequest::KeyManagerReady => { self.state = StorageManagerState::Normal; self.add_queued_disks().await @@ -400,6 +409,59 @@ impl StorageManager { should_update } + + // Attempts to add a dataset within a zpool, according to `request`. + async fn add_dataset( + &mut self, + request: &NewFilesystemRequest, + ) -> Result<(), Error> { + info!(self.log, "add_dataset: {:?}", request); + if !self + .resources + .disks + .values() + .any(|(_, pool)| &pool.name == request.dataset_name.pool()) + { + return Err(Error::ZpoolNotFound(format!( + "{}, looked up while trying to add dataset", + request.dataset_name.pool(), + ))); + } + + let zoned = true; + let fs_name = &request.dataset_name.full(); + let do_format = true; + let encryption_details = None; + let size_details = None; + Zfs::ensure_filesystem( + fs_name, + Mountpoint::Path(Utf8PathBuf::from("/data")), + zoned, + do_format, + encryption_details, + size_details, + )?; + // Ensure the dataset has a usable UUID. + if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { + if let Ok(id) = id_str.parse::() { + if id != request.dataset_id { + return Err(Error::UuidMismatch { + name: Box::new(request.dataset_name.clone()), + old: id, + new: request.dataset_id, + }); + } + return Ok(()); + } + } + Zfs::set_oxide_value( + &fs_name, + "uuid", + &request.dataset_id.to_string(), + )?; + + Ok(()) + } } /// All tests only use synthetic disks, but are expected to be run on illumos From 12245d546dbabbd89a670441c8a92d9f1fa2c7a8 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 5 Oct 2023 03:39:40 +0000 Subject: [PATCH 20/66] wip --- sled-agent/src/storage_manager.rs | 208 ------------------------------ sled-storage/src/manager.rs | 49 ++++++- 2 files changed, 48 insertions(+), 209 deletions(-) diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 68fb7df7df..bbf89e41fb 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -113,45 +113,6 @@ impl QueuedDiskCreate { } impl StorageWorker { - // Ensures the named dataset exists as a filesystem with a UUID, optionally - // creating it if `do_format` is true. - // - // Returns the UUID attached to the ZFS filesystem. - fn ensure_dataset( - &mut self, - dataset_id: Uuid, - dataset_name: &DatasetName, - ) -> Result<(), Error> { - let zoned = true; - let fs_name = &dataset_name.full(); - let do_format = true; - let encryption_details = None; - let size_details = None; - Zfs::ensure_filesystem( - &dataset_name.full(), - Mountpoint::Path(Utf8PathBuf::from("/data")), - zoned, - do_format, - encryption_details, - size_details, - )?; - // Ensure the dataset has a usable UUID. - if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { - if let Ok(id) = id_str.parse::() { - if id != dataset_id { - return Err(Error::UuidMismatch { - name: Box::new(dataset_name.clone()), - old: id, - new: dataset_id, - }); - } - return Ok(()); - } - } - Zfs::set_oxide_value(&fs_name, "uuid", &dataset_id.to_string())?; - Ok(()) - } - // Adds a "notification to nexus" to `nexus_notifications`, // informing it about the addition of `pool_id` to this sled. async fn add_zpool_notify(&mut self, pool: &Pool, size: ByteCount) { @@ -712,30 +673,6 @@ impl StorageWorker { Ok(()) } - // Attempts to add a dataset within a zpool, according to `request`. - async fn add_dataset( - &mut self, - resources: &StorageResources, - request: &NewFilesystemRequest, - ) -> Result { - info!(self.log, "add_dataset: {:?}", request); - let mut pools = resources.pools.lock().await; - let pool = pools - .get_mut(&request.dataset_name.pool().id()) - .ok_or_else(|| { - Error::ZpoolNotFound(format!( - "{}, looked up while trying to add dataset", - request.dataset_name.pool(), - )) - })?; - let dataset_name = DatasetName::new( - pool.name.clone(), - request.dataset_name.dataset().clone(), - ); - self.ensure_dataset(request.dataset_id, &dataset_name)?; - Ok(dataset_name) - } - // Small wrapper around `Self::do_work_internal` that ensures we always // emit info to the log when we exit. async fn do_work( @@ -851,77 +788,6 @@ impl StorageWorker { } Ok(()) } - - async fn upsert_queued_disks( - &mut self, - resources: &StorageResources, - queued_u2_drives: &mut Option>, - ) { - let queued = queued_u2_drives.take(); - if let Some(queued) = queued { - for disk in queued { - if let Some(saved) = queued_u2_drives { - // We already hit a transient error and recreated our queue. - // Add any remaining queued disks back on the queue so we - // can try again later. 
- saved.insert(disk); - } else { - match self.upsert_queued_disk(disk, resources).await { - Ok(()) => {} - Err((_, None)) => { - // We already logged this as a persistent error in - // `add_new_disk` or `add_new_synthetic_disk` - } - Err((_, Some(disk))) => { - // We already logged this as a transient error in - // `add_new_disk` or `add_new_synthetic_disk` - *queued_u2_drives = Some(HashSet::from([disk])); - } - } - } - } - } - if queued_u2_drives.is_none() { - info!(self.log, "upserted all queued disks"); - } else { - warn!( - self.log, - "failed to upsert all queued disks - will try again" - ); - } - } - - // Attempt to upsert a queued disk. Return the disk and error if the upsert - // fails due to a transient error. Examples of transient errors are key - // manager errors which indicate that there are not enough sleds available - // to unlock the rack. - async fn upsert_queued_disk( - &mut self, - disk: QueuedDiskCreate, - resources: &StorageResources, - ) -> Result<(), (Error, Option)> { - let mut temp: Option> = None; - let res = match disk { - QueuedDiskCreate::Real(disk) => { - self.upsert_disk(&resources, disk, &mut temp).await - } - QueuedDiskCreate::Synthetic(zpool_name) => { - self.upsert_synthetic_disk(&resources, zpool_name, &mut temp) - .await - } - }; - if let Some(mut disks) = temp.take() { - assert!(res.is_err()); - assert_eq!(disks.len(), 1); - return Err(( - res.unwrap_err(), - disks.drain().next().unwrap().into(), - )); - } - // Any error at this point is not transient. - // We don't requeue the disk. - res.map_err(|e| (e, None)) - } } enum StorageWorkerRequest { @@ -997,64 +863,6 @@ impl StorageManager { &self.zone_bundler } - /// Ensures that the storage manager tracks exactly the provided disks. - /// - /// This acts similar to a batch [Self::upsert_disk] for all new disks, and - /// [Self::delete_disk] for all removed disks. - /// - /// If errors occur, an arbitrary "one" of them will be returned, but a - /// best-effort attempt to add all disks will still be attempted. - // Receiver implemented by [StorageWorker::ensure_using_exactly_these_disks] - pub async fn ensure_using_exactly_these_disks(&self, unparsed_disks: I) - where - I: IntoIterator, - { - self.inner - .tx - .send(StorageWorkerRequest::DisksChanged( - unparsed_disks.into_iter().collect::>(), - )) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send DisksChanged request"); - } - - /// Adds a disk and associated zpool to the storage manager. - // Receiver implemented by [StorageWorker::upsert_disk]. - pub async fn upsert_disk(&self, disk: UnparsedDisk) { - info!(self.inner.log, "Upserting disk: {disk:?}"); - self.inner - .tx - .send(StorageWorkerRequest::AddDisk(disk)) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send AddDisk request"); - } - - /// Removes a disk, if it's tracked by the storage manager, as well - /// as any associated zpools. - // Receiver implemented by [StorageWorker::delete_disk]. - pub async fn delete_disk(&self, disk: UnparsedDisk) { - info!(self.inner.log, "Deleting disk: {disk:?}"); - self.inner - .tx - .send(StorageWorkerRequest::RemoveDisk(disk)) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send RemoveDisk request"); - } - - /// Adds a synthetic zpool to the storage manager. - // Receiver implemented by [StorageWorker::upsert_synthetic_disk]. 
- pub async fn upsert_synthetic_disk(&self, name: ZpoolName) { - self.inner - .tx - .send(StorageWorkerRequest::AddSyntheticDisk(name)) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send AddSyntheticDisk request"); - } - /// Adds underlay access to the storage manager. pub async fn setup_underlay_access( &self, @@ -1117,20 +925,4 @@ impl StorageManager { Ok(dataset_name) } - - /// Inform the storage worker that the KeyManager is capable of retrieving - /// secrets now and that any queued disks can be upserted. - pub async fn key_manager_ready(&self) { - info!(self.inner.log, "KeyManger ready"); - self.inner - .tx - .send(StorageWorkerRequest::KeyManagerReady) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send KeyManagerReady request"); - } - - pub fn resources(&self) -> &StorageResources { - &self.inner.resources - } } diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index d74a7ada0a..e6395def6b 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -6,7 +6,7 @@ use std::collections::HashSet; -use crate::dataset::{DatasetError, DatasetName}; +use crate::dataset::{DatasetError, DatasetKind, DatasetName}; use crate::disk::{Disk, DiskError, RawDisk}; use crate::error::Error; use crate::resources::StorageResources; @@ -144,6 +144,18 @@ impl StorageHandle { self.tx.send(StorageRequest::GetManagerState(tx)).await.unwrap(); rx.await.unwrap() } + + pub async fn upsert_filesystem( + &self, + dataset_id: Uuid, + dataset_name: DatasetName, + ) -> Result<(), Error> { + let (tx, rx) = oneshot::channel(); + let request = + NewFilesystemRequest { dataset_id, dataset_name, responder: tx }; + self.tx.send(StorageRequest::NewFilesystem(request)).await.unwrap(); + rx.await.unwrap() + } } /// The storage manager responsible for the state of the storage @@ -842,4 +854,39 @@ mod tests { } logctx.cleanup_successful(); } + + #[tokio::test] + async fn upsert_filesystem() { + let logctx = test_setup_log("upsert_filesystem"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (mut manager, handle) = + StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + handle.key_manager_ready().await; + + // Create and add a disk + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let disk: RawDisk = + SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + handle.upsert_disk(disk.clone()).await; + + // Create a filesystem + let dataset_id = Uuid::new_v4(); + let dataset_name = + DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); + handle.upsert_filesystem(dataset_id, dataset_name).await.unwrap(); + + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } } From e8afd42d95c2219aec822b6edaade13c7c502721 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 6 Oct 2023 04:14:20 +0000 Subject: [PATCH 21/66] wip --- Cargo.lock | 53 +++++----- Cargo.toml | 1 + sled-agent/Cargo.toml | 1 + sled-agent/src/bootstrap/bootstore.rs | 104 +++++-------------- sled-agent/src/bootstrap/mod.rs | 4 +- sled-agent/src/lib.rs | 1 + sled-agent/src/long_running_tasks.rs | 142 ++++++++++++++++++++++++++ sled-hardware/src/illumos/mod.rs | 6 +- sled-storage/Cargo.toml | 4 +- sled-storage/src/dataset.rs | 2 +- sled-storage/src/lib.rs | 2 +- sled-storage/src/manager.rs | 10 +- 12 files changed, 215 insertions(+), 115 deletions(-) create mode 100644 sled-agent/src/long_running_tasks.rs diff --git a/Cargo.lock b/Cargo.lock index 26358b3459..6165b6963c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5246,6 +5246,7 @@ dependencies = [ "sha3", "sled-agent-client", "sled-hardware", + "sled-storage", "slog", "slog-async", "slog-dtrace", @@ -5265,32 +5266,6 @@ dependencies = [ "zone", ] -[[package]] -name = "omicron-sled-storage" -version = "0.1.0" -dependencies = [ - "async-trait", - "camino", - "camino-tempfile", - "derive_more", - "glob", - "illumos-utils", - "key-manager", - "nexus-client 0.1.0", - "omicron-common 0.1.0", - "omicron-test-utils", - "rand 0.8.5", - "schemars", - "serde", - "serde_json", - "sled-agent-client", - "sled-hardware", - "slog", - "thiserror", - "tokio", - "uuid", -] - [[package]] name = "omicron-test-utils" version = "0.1.0" @@ -7982,6 +7957,32 @@ dependencies = [ "uuid", ] +[[package]] +name = "sled-storage" +version = "0.1.0" +dependencies = [ + "async-trait", + "camino", + "camino-tempfile", + "derive_more", + "glob", + "illumos-utils", + "key-manager", + "nexus-client 0.1.0", + "omicron-common 0.1.0", + "omicron-test-utils", + "rand 0.8.5", + "schemars", + "serde", + "serde_json", + "sled-agent-client", + "sled-hardware", + "slog", + "thiserror", + "tokio", + "uuid", +] + [[package]] name = "slog" version = "2.7.0" diff --git a/Cargo.toml b/Cargo.toml index 0b63984ea7..2f1d0b012e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -319,6 +319,7 @@ similar-asserts = "1.5.0" sled = "0.34" sled-agent-client = { path = "sled-agent-client" } sled-hardware = { path = "sled-hardware" } +sled-storage = { path = "sled-storage" } slog = { version = "2.7", features = [ "dynamic-keys", "max_level_trace", "release_max_level_debug" ] } slog-async = "2.8" slog-dtrace = "0.2" diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index f172136726..e219461b9b 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -60,6 +60,7 @@ serde_json.workspace = true sha3.workspace = true sled-agent-client.workspace = true sled-hardware.workspace = true +sled-storage.workspace = true slog.workspace = true slog-async.workspace = true slog-dtrace.workspace = true diff --git a/sled-agent/src/bootstrap/bootstore.rs b/sled-agent/src/bootstrap/bootstore.rs index 17267bef55..3c7e860b4a 100644 --- a/sled-agent/src/bootstrap/bootstore.rs +++ b/sled-agent/src/bootstrap/bootstore.rs @@ -7,122 +7,74 @@ use super::config::BOOTSTORE_PORT; use super::server::StartError; -use crate::storage_manager::StorageResources; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use ddm_admin_client::Client as DdmAdminClient; use sled_hardware::underlay::BootstrapInterface; use sled_hardware::Baseboard; +use sled_storage::dataset::CLUSTER_DATASET; +use sled_storage::resources::StorageResources; use slog::Logger; use std::collections::BTreeSet; use std::net::Ipv6Addr; use std::net::SocketAddrV6; use std::time::Duration; -use tokio::task::JoinHandle; const 
BOOTSTORE_FSM_STATE_FILE: &str = "bootstore-fsm-state.json"; const BOOTSTORE_NETWORK_CONFIG_FILE: &str = "bootstore-network-config.json"; -pub(super) struct BootstoreHandles { - pub(super) node_handle: bootstore::NodeHandle, - - // These two are never used; we keep them to show ownership of the spawned - // tasks. - _node_task_handle: JoinHandle<()>, - _peer_update_task_handle: JoinHandle<()>, +pub fn new_bootstore_config( + storage_resources: &StorageResources, + baseboard: Baseboard, + global_zone_bootstrap_ip: Ipv6Addr, +) -> Result { + Ok(bootstore::Config { + id: baseboard, + addr: SocketAddrV6::new(global_zone_bootstrap_ip, BOOTSTORE_PORT, 0, 0), + time_per_tick: Duration::from_millis(250), + learn_timeout: Duration::from_secs(5), + rack_init_timeout: Duration::from_secs(300), + rack_secret_request_timeout: Duration::from_secs(5), + fsm_state_ledger_paths: bootstore_fsm_state_paths(&storage_resources)?, + network_config_ledger_paths: bootstore_network_config_paths( + &storage_resources, + )?, + }) } -impl BootstoreHandles { - pub(super) async fn spawn( - storage_resources: &StorageResources, - ddm_admin_client: DdmAdminClient, - baseboard: Baseboard, - global_zone_bootstrap_ip: Ipv6Addr, - base_log: &Logger, - ) -> Result { - let config = bootstore::Config { - id: baseboard, - addr: SocketAddrV6::new( - global_zone_bootstrap_ip, - BOOTSTORE_PORT, - 0, - 0, - ), - time_per_tick: Duration::from_millis(250), - learn_timeout: Duration::from_secs(5), - rack_init_timeout: Duration::from_secs(300), - rack_secret_request_timeout: Duration::from_secs(5), - fsm_state_ledger_paths: bootstore_fsm_state_paths( - &storage_resources, - ) - .await?, - network_config_ledger_paths: bootstore_network_config_paths( - &storage_resources, - ) - .await?, - }; - - let (mut node, node_handle) = - bootstore::Node::new(config, base_log).await; - - let join_handle = tokio::spawn(async move { node.run().await }); - - // Spawn a task for polling DDMD and updating bootstore - let peer_update_handle = - tokio::spawn(poll_ddmd_for_bootstore_peer_update( - base_log.new(o!("component" => "bootstore_ddmd_poller")), - node_handle.clone(), - ddm_admin_client, - )); - - Ok(Self { - node_handle, - _node_task_handle: join_handle, - _peer_update_task_handle: peer_update_handle, - }) - } -} - -async fn bootstore_fsm_state_paths( +fn bootstore_fsm_state_paths( storage: &StorageResources, ) -> Result, StartError> { let paths: Vec<_> = storage - .all_m2_mountpoints(sled_hardware::disk::CLUSTER_DATASET) - .await + .all_m2_mountpoints(CLUSTER_DATASET) .into_iter() .map(|p| p.join(BOOTSTORE_FSM_STATE_FILE)) .collect(); if paths.is_empty() { - return Err(StartError::MissingM2Paths( - sled_hardware::disk::CLUSTER_DATASET, - )); + return Err(StartError::MissingM2Paths(CLUSTER_DATASET)); } Ok(paths) } -async fn bootstore_network_config_paths( +fn bootstore_network_config_paths( storage: &StorageResources, ) -> Result, StartError> { let paths: Vec<_> = storage - .all_m2_mountpoints(sled_hardware::disk::CLUSTER_DATASET) - .await + .all_m2_mountpoints(CLUSTER_DATASET) .into_iter() .map(|p| p.join(BOOTSTORE_NETWORK_CONFIG_FILE)) .collect(); if paths.is_empty() { - return Err(StartError::MissingM2Paths( - sled_hardware::disk::CLUSTER_DATASET, - )); + return Err(StartError::MissingM2Paths(CLUSTER_DATASET)); } Ok(paths) } -async fn poll_ddmd_for_bootstore_peer_update( +pub async fn poll_ddmd_for_bootstore_peer_update( log: Logger, bootstore_node_handle: bootstore::NodeHandle, - ddmd_client: DdmAdminClient, ) { let mut current_peers: BTreeSet 
= BTreeSet::new(); // We're talking to a service's admin interface on localhost and @@ -132,7 +84,7 @@ async fn poll_ddmd_for_bootstore_peer_update( // We also use this timeout in the case of spurious ddmd failures // that require a reconnection from the ddmd_client. const RETRY: tokio::time::Duration = tokio::time::Duration::from_secs(5); - + let ddmd_client = DdmAdminClient::localhost(&log).unwrap(); loop { match ddmd_client .derive_bootstrap_addrs_from_prefixes(&[ @@ -154,7 +106,7 @@ async fn poll_ddmd_for_bootstore_peer_update( log, concat!( "Bootstore comms error: {}. ", - "bootstore::Node task must have paniced", + "bootstore::Node task must have panicked", ), e ); diff --git a/sled-agent/src/bootstrap/mod.rs b/sled-agent/src/bootstrap/mod.rs index 96e674acf3..5bf25b8521 100644 --- a/sled-agent/src/bootstrap/mod.rs +++ b/sled-agent/src/bootstrap/mod.rs @@ -4,7 +4,7 @@ //! Bootstrap-related utilities -mod bootstore; +pub(crate) mod bootstore; pub mod client; pub mod config; pub mod early_networking; @@ -14,7 +14,7 @@ pub(crate) mod params; mod pre_server; mod rack_ops; pub(crate) mod rss_handle; -mod secret_retriever; +pub mod secret_retriever; pub mod server; mod sprockets_server; mod views; diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 5c4dbd8310..376a154ed2 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -22,6 +22,7 @@ pub mod config; mod http_entrypoints; mod instance; mod instance_manager; +mod long_running_tasks; mod nexus; pub mod params; mod profile; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs new file mode 100644 index 0000000000..54e8ed7e18 --- /dev/null +++ b/sled-agent/src/long_running_tasks.rs @@ -0,0 +1,142 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! This module is responsible for spawning, starting, and managing long running +//! tasks and task driven subsystems. These tasks run for the remainder of the +//! sled-agent process from the moment they begin. Primarily they include the +//! "managers", like `StorageManager`, `InstanceManager`, etc..., and are used +//! by both the bootstrap agent and the sled-agent. +//! +//! We don't bother keeping track of the spawned tasks handles because we know +//! these tasks are supposed to run forever, and they can shutdown if their +//! handles are dropped. + +use crate::bootstrap::bootstore::{ + new_bootstore_config, poll_ddmd_for_bootstore_peer_update, +}; +use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; +use bootstore::schemes::v0 as bootstore; +use key_manager::{KeyManager, StorageKeyRequester}; +use sled_hardware::{HardwareManager, SledMode}; +use sled_storage::manager::{StorageHandle, StorageManager}; +use slog::{info, Logger}; +use std::net::Ipv6Addr; + +/// A mechanism for interacting with all long running tasks that can be shared +/// between the bootstrap-agent and sled-agent code. +#[derive(Clone)] +pub struct LongRunningTaskHandles { + /// A mechanism for retrieving storage keys. This interacts with the + /// [`KeyManager`] task. In the future, there may be other handles for + /// retrieving different types of keys. Separating the handles limits the + /// access for a given key type to the code that holds the handle. 
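// As a minimal, self-contained sketch of the task/handle pattern these
// long running tasks follow (hypothetical names `Worker`, `WorkerHandle`,
// and `WorkerRequest`; not part of this change): the task is spawned once,
// its `JoinHandle` is dropped, and callers keep only a cheap, cloneable
// handle that sends messages over an mpsc channel.
use tokio::sync::mpsc;

enum WorkerRequest {
    DoWork,
}

struct Worker {
    rx: mpsc::Receiver<WorkerRequest>,
}

#[derive(Clone)]
struct WorkerHandle {
    tx: mpsc::Sender<WorkerRequest>,
}

fn spawn_worker() -> WorkerHandle {
    let (tx, rx) = mpsc::channel(32);
    let mut worker = Worker { rx };
    // The task is expected to run for the life of the process, so the
    // returned `JoinHandle` is intentionally dropped.
    tokio::spawn(async move {
        while let Some(req) = worker.rx.recv().await {
            match req {
                WorkerRequest::DoWork => { /* do the work */ }
            }
        }
    });
    WorkerHandle { tx }
}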
+ pub storage_key_requester: StorageKeyRequester, + + /// A mechanism for talking to the [`StorageManager`] which is responsible + /// for establishing zpools on disks and managing their datasets. + pub storage_handle: StorageHandle, + + /// A mechanism for interacting with the hardware device tree + pub hardware_manager: HardwareManager, + + // A handle for interacting with the bootstore + pub bootstore: bootstore::NodeHandle, +} + +/// Spawn all long running tasks +pub async fn spawn_all( + log: &Logger, + sled_mode: SledMode, + global_zone_bootstrap_ip: Ipv6Addr, +) -> LongRunningTaskHandles { + let storage_key_requester = spawn_key_manager(log); + let mut storage_handle = + spawn_storage_manager(log, storage_key_requester.clone()); + let hardware_manager = spawn_hardware_manager(log, sled_mode); + + // Wait for the boot disk so that we can work with any ledgers, + // such as those needed by the bootstore and sled-agent + let _ = storage_handle.wait_for_boot_disk().await; + + let bootstore = spawn_bootstore_tasks( + log, + &mut storage_handle, + &hardware_manager, + global_zone_bootstrap_ip, + ) + .await; + + LongRunningTaskHandles { + storage_key_requester, + storage_handle, + hardware_manager, + bootstore, + } +} + +fn spawn_key_manager(log: &Logger) -> StorageKeyRequester { + info!(log, "Starting KeyManager"); + let secret_retriever = LrtqOrHardcodedSecretRetriever::new(); + let (mut key_manager, storage_key_requester) = + KeyManager::new(log, secret_retriever); + let key_manager_handle = + tokio::spawn(async move { key_manager.run().await }); + storage_key_requester +} + +fn spawn_storage_manager( + log: &Logger, + key_requester: StorageKeyRequester, +) -> StorageHandle { + info!(log, "Starting StorageManager"); + let (mut manager, handle) = StorageManager::new(log, key_requester); + tokio::spawn(async move { + manager.run().await; + }); + handle +} + +fn spawn_hardware_manager( + log: &Logger, + sled_mode: SledMode, +) -> HardwareManager { + // The `HardwareManager` does not use the the "task/handle" pattern + // and spawns its worker task inside `HardwareManager::new`. Instead of returning + // a handle to send messages to that task, the "Inner/Mutex" pattern is used + // which shares data between the task, the manager itself, and the users of the manager + // since the manager can be freely cloned and passed around. + // + // There are pros and cons to both methods, but the reason to mention it here is that + // the handle in this case is the `HardwareManager` itself. 
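// For contrast, a rough sketch of the "Inner/Mutex" pattern described in the
// comment above, with hypothetical names (`ExampleManager`, `ExampleInner`)
// that are not part of this change: shared state lives behind an
// `Arc<Mutex<_>>`, the worker is spawned inside `new`, and the cloneable
// manager itself doubles as the handle.
use std::sync::{Arc, Mutex};
use std::time::Duration;

#[derive(Default)]
struct ExampleInner {
    updates_seen: usize,
}

#[derive(Clone)]
struct ExampleManager {
    inner: Arc<Mutex<ExampleInner>>,
}

impl ExampleManager {
    fn new() -> Self {
        let inner = Arc::new(Mutex::new(ExampleInner::default()));
        let worker_inner = inner.clone();
        // Spawned here rather than by the caller, mirroring
        // `HardwareManager::new`; the `JoinHandle` is dropped.
        tokio::spawn(async move {
            loop {
                worker_inner.lock().unwrap().updates_seen += 1;
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        });
        Self { inner }
    }

    // Queries go straight to the shared state; no message passing involved.
    fn updates_seen(&self) -> usize {
        self.inner.lock().unwrap().updates_seen
    }
}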
+ info!(log, "Starting HardwareManager"; "sled_mode" => ?sled_mode); + HardwareManager::new(log, sled_mode).unwrap() +} + +async fn spawn_bootstore_tasks( + log: &Logger, + storage_handle: &mut StorageHandle, + hardware_manager: &HardwareManager, + global_zone_bootstrap_ip: Ipv6Addr, +) -> bootstore::NodeHandle { + let storage_resources = storage_handle.get_latest_resources().await; + let config = new_bootstore_config( + &storage_resources, + hardware_manager.baseboard(), + global_zone_bootstrap_ip, + ) + .unwrap(); + + // Create and spawn the bootstore + let (mut node, node_handle) = bootstore::Node::new(config, log).await; + tokio::spawn(async move { node.run().await }); + + // Spawn a task for polling DDMD and updating bootstore with peer addresses + let log = log.new(o!("component" => "bootstore_ddmd_poller")); + let node_handle2 = node_handle.clone(); + tokio::spawn(async move { + poll_ddmd_for_bootstore_peer_update(log, node_handle2).await + }); + + node_handle +} diff --git a/sled-hardware/src/illumos/mod.rs b/sled-hardware/src/illumos/mod.rs index 0364c98f14..0e49d6d776 100644 --- a/sled-hardware/src/illumos/mod.rs +++ b/sled-hardware/src/illumos/mod.rs @@ -589,11 +589,11 @@ async fn hardware_tracking_task( /// /// This structure provides interfaces for both querying and for receiving new /// events. +#[derive(Clone)] pub struct HardwareManager { log: Logger, inner: Arc>, tx: broadcast::Sender, - _worker: JoinHandle<()>, } impl HardwareManager { @@ -660,11 +660,11 @@ impl HardwareManager { let log2 = log.clone(); let inner2 = inner.clone(); let tx2 = tx.clone(); - let _worker = tokio::task::spawn(async move { + tokio::task::spawn(async move { hardware_tracking_task(log2, inner2, tx2).await }); - Ok(Self { log, inner, tx, _worker }) + Ok(Self { log, inner, tx }) } pub fn baseboard(&self) -> Baseboard { diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index 11bd502183..617a0a0fd7 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "omicron-sled-storage" +name = "sled-storage" version = "0.1.0" edition = "2021" @@ -30,4 +30,4 @@ uuid.workspace = true [dev-dependencies] illumos-utils = { workspace = true, features = ["tmp_keypath"] } omicron-test-utils.workspace = true -camino-tempfile.workspace = true \ No newline at end of file +camino-tempfile.workspace = true diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 99df582371..71e04a6935 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -237,7 +237,7 @@ pub enum DatasetError { /// be used. The `StorageManager` for the sled-agent always has a /// `StorageKeyRequester` available, and so the only place we should pass /// `None` is for the M.2s touched by the Installinator. -pub async fn ensure_zpool_has_datasets( +pub(crate) async fn ensure_zpool_has_datasets( log: &Logger, zpool_name: &ZpoolName, disk_identity: &DiskIdentity, diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs index f923165896..20f6442b9a 100644 --- a/sled-storage/src/lib.rs +++ b/sled-storage/src/lib.rs @@ -8,7 +8,7 @@ //! hardware partitions from the `sled-hardware` crate. It utilizes the //! `illumos-utils` crate to actually perform ZFS related OS calls. 
-pub(crate) mod dataset; +pub mod dataset; pub(crate) mod disk; pub(crate) mod dump_setup; pub mod error; diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index e6395def6b..7bcb8df192 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -6,7 +6,7 @@ use std::collections::HashSet; -use crate::dataset::{DatasetError, DatasetKind, DatasetName}; +use crate::dataset::{DatasetError, DatasetName}; use crate::disk::{Disk, DiskError, RawDisk}; use crate::error::Error; use crate::resources::StorageResources; @@ -16,7 +16,7 @@ use illumos_utils::zpool::ZpoolName; use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; use sled_hardware::DiskVariant; -use slog::{debug, error, info, o, warn, Logger}; +use slog::{error, info, o, warn, Logger}; use tokio::sync::{mpsc, oneshot, watch}; use tokio::time::{interval, Duration, MissedTickBehavior}; use uuid::Uuid; @@ -58,11 +58,12 @@ enum StorageRequest { /// to clients for debugging purposes, and that isn't exposed in other ways. #[derive(Debug, Clone)] pub struct StorageManagerData { - state: StorageManagerState, - queued_u2_drives: HashSet, + pub state: StorageManagerState, + pub queued_u2_drives: HashSet, } /// A mechanism for interacting with the [`StorageManager`] +#[derive(Clone)] pub struct StorageHandle { tx: mpsc::Sender, resource_updates: watch::Receiver, @@ -480,6 +481,7 @@ impl StorageManager { /// systems. #[cfg(all(test, target_os = "illumos"))] mod tests { + use crate::dataset::DatasetKind; use crate::disk::SyntheticDisk; use super::*; From f540c086f5cbc1dda34bfa3f85203e0826aa07ca Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 6 Oct 2023 05:33:07 +0000 Subject: [PATCH 22/66] wip --- sled-agent/src/bootstrap/pre_server.rs | 79 +++++++++----------------- sled-agent/src/bootstrap/server.rs | 1 - sled-agent/src/long_running_tasks.rs | 30 ++++++++-- sled-agent/src/services.rs | 18 +++--- sled-agent/src/sled_agent.rs | 2 +- sled-agent/src/zone_bundle.rs | 39 ++++++------- sled-storage/src/lib.rs | 2 +- sled-storage/src/manager.rs | 4 +- 8 files changed, 83 insertions(+), 92 deletions(-) diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 0899bdd82f..d7c3e9d5d8 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -11,9 +11,11 @@ #![allow(clippy::result_large_err)] use super::maghemite; -use super::secret_retriever::LrtqOrHardcodedSecretRetriever; use super::server::StartError; use crate::config::Config; +use crate::long_running_tasks::{ + spawn_all_longrunning_tasks, LongRunningTaskHandles, +}; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; use crate::storage_manager::StorageManager; @@ -29,8 +31,6 @@ use illumos_utils::zfs; use illumos_utils::zfs::Zfs; use illumos_utils::zone; use illumos_utils::zone::Zones; -use key_manager::KeyManager; -use key_manager::StorageKeyRequester; use omicron_common::address::Ipv6Subnet; use omicron_common::FileKv; use sled_hardware::underlay; @@ -38,6 +38,8 @@ use sled_hardware::DendriteAsic; use sled_hardware::HardwareManager; use sled_hardware::HardwareUpdate; use sled_hardware::SledMode; +use sled_storage::disk::SyntheticDisk; +use sled_storage::manager::StorageHandle; use slog::Drain; use slog::Logger; use std::net::IpAddr; @@ -200,36 +202,24 @@ impl BootstrapAgentStartup { // This should be a no-op if already enabled. 
BootstrapNetworking::enable_ipv6_forwarding().await?; - // Spawn the `KeyManager` which is needed by the the StorageManager to - // retrieve encryption keys. - let (storage_key_requester, key_manager_handle) = - spawn_key_manager_task(&base_log); - let sled_mode = sled_mode_from_config(&config)?; - // Start monitoring hardware. This is blocking so we use - // `spawn_blocking`; similar to above, we move some things in and (on - // success) it gives them back. - let (base_log, log, hardware_manager) = { - tokio::task::spawn_blocking(move || { - info!( - log, "Starting hardware monitor"; - "sled_mode" => ?sled_mode, - ); - let hardware_manager = - HardwareManager::new(&base_log, sled_mode) - .map_err(StartError::StartHardwareManager)?; - Ok::<_, StartError>((base_log, log, hardware_manager)) - }) - .await - .unwrap()? - }; + // Spawn all important long running tasks that live for the lifetime of + // the process and are used by both the bootstrap agent and sled agent + let long_running_task_handles = spawn_all_longrunning_tasks( + &base_log, + sled_mode, + startup_networking.global_zone_bootstrap_ip, + ) + .await; - // Create a `StorageManager` and (possibly) synthetic disks. - let storage_manager = - StorageManager::new(&base_log, storage_key_requester).await; - upsert_synthetic_zpools_if_needed(&log, &storage_manager, &config) - .await; + // Add some synthetic disks if necessary. + upsert_synthetic_zpools_if_needed( + &log, + &long_running_task_handles.storage_manager, + &config, + ) + .await; let global_zone_bootstrap_ip = startup_networking.global_zone_bootstrap_ip; @@ -242,7 +232,7 @@ impl BootstrapAgentStartup { config.skip_timesync, config.sidecar_revision.clone(), config.switch_zone_maghemite_links.clone(), - storage_manager.resources().clone(), + long_running_task_handles.storage_manager.clone(), storage_manager.zone_bundler().clone(), ); @@ -357,13 +347,10 @@ fn ensure_zfs_key_directory_exists(log: &Logger) -> Result<(), StartError> { // to create and mount encrypted datasets. 
info!( log, "Ensuring zfs key directory exists"; - "path" => sled_hardware::disk::KEYPATH_ROOT, + "path" => zfs::KEYPATH_ROOT, ); - std::fs::create_dir_all(sled_hardware::disk::KEYPATH_ROOT).map_err(|err| { - StartError::CreateZfsKeyDirectory { - dir: sled_hardware::disk::KEYPATH_ROOT, - err, - } + std::fs::create_dir_all(zfs::KEYPATH_ROOT).map_err(|err| { + StartError::CreateZfsKeyDirectory { dir: zfs::KEYPATH_ROOT, err } }) } @@ -387,7 +374,7 @@ fn ensure_zfs_ramdisk_dataset() -> Result<(), StartError> { async fn upsert_synthetic_zpools_if_needed( log: &Logger, - storage_manager: &StorageManager, + storage_manager: &StorageHandle, config: &Config, ) { if let Some(pools) = &config.zpools { @@ -397,7 +384,8 @@ async fn upsert_synthetic_zpools_if_needed( "Upserting synthetic zpool to Storage Manager: {}", pool.to_string() ); - storage_manager.upsert_synthetic_disk(pool.clone()).await; + let disk = SyntheticDisk::new(pool.clone()).into(); + storage_manager.upsert_disk(disk).await; } } } @@ -435,19 +423,6 @@ fn sled_mode_from_config(config: &Config) -> Result { Ok(sled_mode) } -fn spawn_key_manager_task( - log: &Logger, -) -> (StorageKeyRequester, JoinHandle<()>) { - let secret_retriever = LrtqOrHardcodedSecretRetriever::new(); - let (mut key_manager, storage_key_requester) = - KeyManager::new(log, secret_retriever); - - let key_manager_handle = - tokio::spawn(async move { key_manager.run().await }); - - (storage_key_requester, key_manager_handle) -} - #[derive(Debug, Clone)] pub(crate) struct BootstrapNetworking { pub(crate) bootstrap_etherstub: dladm::Etherstub, diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 0cbbf0678b..638aa51dee 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -13,7 +13,6 @@ use super::rack_ops::RackInitId; use super::views::SledAgentResponse; use super::BootstrapError; use super::RssAccessError; -use crate::bootstrap::bootstore::BootstoreHandles; use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::http_entrypoints::api as http_api; use crate::bootstrap::http_entrypoints::BootstrapServerContext; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 54e8ed7e18..cb82648a8c 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -16,8 +16,10 @@ use crate::bootstrap::bootstore::{ new_bootstore_config, poll_ddmd_for_bootstore_peer_update, }; use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; +use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; +use sled_agent_client::types::CleanupContext; use sled_hardware::{HardwareManager, SledMode}; use sled_storage::manager::{StorageHandle, StorageManager}; use slog::{info, Logger}; @@ -35,24 +37,29 @@ pub struct LongRunningTaskHandles { /// A mechanism for talking to the [`StorageManager`] which is responsible /// for establishing zpools on disks and managing their datasets. 
- pub storage_handle: StorageHandle, + pub storage_manager: StorageHandle, /// A mechanism for interacting with the hardware device tree pub hardware_manager: HardwareManager, // A handle for interacting with the bootstore pub bootstore: bootstore::NodeHandle, + + // A reference to the object used to manage zone bundles + pub zone_bundler: ZoneBundler, } /// Spawn all long running tasks -pub async fn spawn_all( +pub async fn spawn_all_longrunning_tasks( log: &Logger, sled_mode: SledMode, global_zone_bootstrap_ip: Ipv6Addr, ) -> LongRunningTaskHandles { let storage_key_requester = spawn_key_manager(log); - let mut storage_handle = + let mut storage_manager = spawn_storage_manager(log, storage_key_requester.clone()); + + // TODO: Does this need to run inside tokio::task::spawn_blocking? let hardware_manager = spawn_hardware_manager(log, sled_mode); // Wait for the boot disk so that we can work with any ledgers, @@ -67,9 +74,11 @@ pub async fn spawn_all( ) .await; + let zone_bundler = spawn_zone_bundler_tasks(log, &mut storage_handle); + LongRunningTaskHandles { storage_key_requester, - storage_handle, + storage_manager, hardware_manager, bootstore, } @@ -140,3 +149,16 @@ async fn spawn_bootstore_tasks( node_handle } + +// `ZoneBundler::new` spawns a periodic cleanup task that runs indefinitely +fn spawn_zone_bundler_tasks( + log: &Logger, + storage_handle: &mut StorageHandle, +) -> ZoneBundler { + let log = log.new(o!("component" => "ZoneBundler")); + let zone_bundler = ZoneBundler::new( + log, + storage_handle.clone(), + CleanupContext::default(), + ); +} diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 96cdf8222b..3fcbf717fa 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -5,7 +5,7 @@ //! Sled-local service management. //! //! For controlling zone-based storage services, refer to -//! [crate::storage_manager::StorageManager]. +//! [sled_hardware:manager::StorageManager]. //! //! For controlling virtual machine instances, refer to //! [crate::instance_manager::InstanceManager]. 
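// The hunks below switch ledger-path lookups to the pattern sketched here:
// ask the storage task for its latest `StorageResources` snapshot, then
// derive one path per M.2 config dataset. The `ledger_name` parameter and
// the helper name are placeholders, not part of this change.
async fn example_ledger_paths(
    storage: &sled_storage::manager::StorageHandle,
    ledger_name: &str,
) -> Vec<camino::Utf8PathBuf> {
    let resources = storage.get_latest_resources().await;
    resources
        .all_m2_mountpoints(sled_storage::dataset::CONFIG_DATASET)
        .into_iter()
        .map(|p| p.join(ledger_name))
        .collect()
}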
@@ -38,7 +38,6 @@ use crate::params::{ use crate::profile::*; use crate::smf_helper::Service; use crate::smf_helper::SmfHelper; -use crate::storage_manager::StorageResources; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; use anyhow::anyhow; @@ -88,12 +87,14 @@ use omicron_common::nexus_config::{ use once_cell::sync::OnceCell; use rand::prelude::SliceRandom; use rand::SeedableRng; -use sled_hardware::disk::ZONE_DATASET; use sled_hardware::is_gimlet; use sled_hardware::underlay; use sled_hardware::underlay::BOOTSTRAP_PREFIX; use sled_hardware::Baseboard; use sled_hardware::SledMode; +use sled_storage::dataset::{CONFIG_DATASET, ZONE_DATASET}; +use sled_storage::manager::StorageHandle; +use sled_storage::resources::StorageResources; use slog::Logger; use std::collections::HashSet; use std::collections::{BTreeMap, HashMap}; @@ -370,7 +371,7 @@ pub struct ServiceManagerInner { advertised_prefixes: Mutex>>, sled_info: OnceCell, switch_zone_bootstrap_address: Ipv6Addr, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, ledger_directory_override: OnceCell, image_directory_override: OnceCell, @@ -415,7 +416,7 @@ impl ServiceManager { skip_timesync: Option, sidecar_revision: SidecarRevision, switch_zone_maghemite_links: Vec, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, ) -> Self { let log = log.new(o!("component" => "ServiceManager")); @@ -470,13 +471,12 @@ impl ServiceManager { } async fn all_service_ledgers(&self) -> Vec { + let resources = self.inner.storage.get_latest_resources().await; if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(SERVICES_LEDGER_FILENAME)]; } - self.inner - .storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) - .await + resources + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(SERVICES_LEDGER_FILENAME)) .collect() diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 7e62f6a8a7..dc130524f6 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -90,7 +90,7 @@ pub enum Error { Instance(#[from] crate::instance_manager::Error), #[error("Error managing storage: {0}")] - Storage(#[from] crate::storage_manager::Error), + Storage(#[from] sled_storage::error::Error), #[error("Error updating: {0}")] Download(#[from] crate::updates::Error), diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 2eeb8ebe7d..ea7481bd6d 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -6,7 +6,6 @@ //! Tools for collecting and inspecting service bundles for zones. -use crate::storage_manager::StorageResources; use anyhow::anyhow; use anyhow::Context; use camino::FromPathBufError; @@ -22,6 +21,8 @@ use illumos_utils::zone::AdmError; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; +use sled_storage::dataset::U2_DEBUG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::cmp::Ord; use std::cmp::Ordering; @@ -148,20 +149,12 @@ pub struct ZoneBundler { inner: Arc>, // Channel for notifying the cleanup task that it should reevaluate. notify_cleanup: Arc, - // Tokio task handle running the period cleanup operation. - cleanup_task: Arc>, -} - -impl Drop for ZoneBundler { - fn drop(&mut self) { - self.cleanup_task.abort(); - } } // State shared between tasks, e.g., used when creating a bundle in different // tasks or between a creation and cleanup. 
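// Roughly, the notify-or-timer loop driving that periodic cleanup can be
// sketched as below; the period and the helper name are arbitrary choices
// for illustration, not part of this change.
use std::sync::Arc;
use tokio::sync::Notify;
use tokio::time::{interval, Duration};

fn spawn_example_cleanup_task(notify: Arc<Notify>) {
    tokio::spawn(async move {
        let mut ticker = interval(Duration::from_secs(600));
        loop {
            tokio::select! {
                // An explicit kick, e.g. from a `cleanup()` call on the handle.
                _ = notify.notified() => {}
                // A periodic pass even when nobody asked.
                _ = ticker.tick() => {}
            }
            // ... perform the actual cleanup pass here ...
        }
    });
}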
struct Inner { - resources: StorageResources, + storage_handle: StorageHandle, cleanup_context: CleanupContext, last_cleanup_at: Instant, } @@ -189,7 +182,8 @@ impl Inner { // that can exist but do not, i.e., those whose parent datasets already // exist; and returns those. async fn bundle_directories(&self) -> Vec { - let expected = self.resources.all_zone_bundle_directories().await; + let resources = self.storage_handle.get_latest_resources().await; + let expected = resources.all_zone_bundle_directories(); let mut out = Vec::with_capacity(expected.len()); for each in expected.into_iter() { if tokio::fs::create_dir_all(&each).await.is_ok() { @@ -249,26 +243,28 @@ impl ZoneBundler { /// Create a new zone bundler. /// /// This creates an object that manages zone bundles on the system. It can - /// be used to create bundles from running zones, and runs a period task to - /// clean them up to free up space. + /// be used to create bundles from running zones, and runs a periodic task + /// to clean them up to free up space. pub fn new( log: Logger, - resources: StorageResources, + storage_handle: StorageHandle, cleanup_context: CleanupContext, ) -> Self { let notify_cleanup = Arc::new(Notify::new()); let inner = Arc::new(Mutex::new(Inner { - resources, + storage_handle, cleanup_context, last_cleanup_at: Instant::now(), })); let cleanup_log = log.new(slog::o!("component" => "auto-cleanup-task")); let notify_clone = notify_cleanup.clone(); let inner_clone = inner.clone(); - let cleanup_task = Arc::new(tokio::task::spawn( - Self::periodic_cleanup(cleanup_log, inner_clone, notify_clone), + tokio::task::spawn(Self::periodic_cleanup( + cleanup_log, + inner_clone, + notify_clone, )); - Self { log, inner, notify_cleanup, cleanup_task } + Self { log, inner, notify_cleanup } } /// Trigger an immediate cleanup of low-priority zone bundles. @@ -353,10 +349,9 @@ impl ZoneBundler { ) -> Result { let inner = self.inner.lock().await; let storage_dirs = inner.bundle_directories().await; - let extra_log_dirs = inner - .resources - .all_u2_mountpoints(sled_hardware::disk::U2_DEBUG_DATASET) - .await + let resources = inner.storage_handle.get_latest_resources().await; + let extra_log_dirs = resources + .all_u2_mountpoints(U2_DEBUG_DATASET) .into_iter() .map(|p| p.join(zone.name())) .collect(); diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs index 20f6442b9a..0c1b383d7f 100644 --- a/sled-storage/src/lib.rs +++ b/sled-storage/src/lib.rs @@ -9,7 +9,7 @@ //! `illumos-utils` crate to actually perform ZFS related OS calls. pub mod dataset; -pub(crate) mod disk; +pub mod disk; pub(crate) mod dump_setup; pub mod error; pub(crate) mod keyfile; diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 7bcb8df192..7e2050084b 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -133,14 +133,14 @@ impl StorageHandle { /// Retrieve the latest value of `StorageResources` from the /// `StorageManager` task. 
- pub async fn get_latest_resources(&mut self) -> StorageResources { + pub async fn get_latest_resources(&self) -> StorageResources { let (tx, rx) = oneshot::channel(); self.tx.send(StorageRequest::GetLatestResources(tx)).await.unwrap(); rx.await.unwrap() } /// Return internal data useful for debugging and testing - pub async fn get_manager_state(&mut self) -> StorageManagerData { + pub async fn get_manager_state(&self) -> StorageManagerData { let (tx, rx) = oneshot::channel(); self.tx.send(StorageRequest::GetManagerState(tx)).await.unwrap(); rx.await.unwrap() From 1e34e7cbd1f624bedc40b503e537d08adc49bc56 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 6 Oct 2023 20:42:00 +0000 Subject: [PATCH 23/66] wip --- sled-agent/src/bootstrap/pre_server.rs | 47 +---- sled-agent/src/bootstrap/server.rs | 1 - sled-agent/src/hardware_monitor.rs | 270 +++++++++++++++++++++++++ sled-agent/src/hardware_monitor.sh | 1 + sled-agent/src/lib.rs | 1 + sled-agent/src/long_running_tasks.rs | 14 +- sled-storage/src/manager.rs | 2 +- 7 files changed, 282 insertions(+), 54 deletions(-) create mode 100644 sled-agent/src/hardware_monitor.rs create mode 100644 sled-agent/src/hardware_monitor.sh diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index d7c3e9d5d8..64d8119a33 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -104,41 +104,6 @@ impl BootstrapManagers { } } } - - // Observe the current hardware state manually. - // - // We use this when we're monitoring hardware for the first - // time, and if we miss notifications. - pub(super) async fn check_latest_hardware_snapshot( - &self, - sled_agent: Option<&SledAgent>, - log: &Logger, - ) { - let underlay_network = sled_agent.map(|sled_agent| { - sled_agent.notify_nexus_about_self(log); - sled_agent.switch_zone_underlay_info() - }); - info!( - log, "Checking current full hardware snapshot"; - "underlay_network_info" => ?underlay_network, - ); - if self.hardware.is_scrimlet_driver_loaded() { - let baseboard = self.hardware.baseboard(); - if let Err(e) = - self.service.activate_switch(underlay_network, baseboard).await - { - warn!(log, "Failed to activate switch: {e}"); - } - } else { - if let Err(e) = self.service.deactivate_switch().await { - warn!(log, "Failed to deactivate switch: {e}"); - } - } - - self.storage - .ensure_using_exactly_these_disks(self.hardware.disks()) - .await; - } } pub(super) struct BootstrapAgentStartup { @@ -148,7 +113,6 @@ pub(super) struct BootstrapAgentStartup { pub(super) base_log: Logger, pub(super) startup_log: Logger, pub(super) managers: BootstrapManagers, - pub(super) key_manager_handle: JoinHandle<()>, } impl BootstrapAgentStartup { @@ -202,6 +166,7 @@ impl BootstrapAgentStartup { // This should be a no-op if already enabled. BootstrapNetworking::enable_ipv6_forwarding().await?; + // Are we a gimlet or scrimlet? 
let sled_mode = sled_mode_from_config(&config)?; // Spawn all important long running tasks that live for the lifetime of @@ -233,7 +198,7 @@ impl BootstrapAgentStartup { config.sidecar_revision.clone(), config.switch_zone_maghemite_links.clone(), long_running_task_handles.storage_manager.clone(), - storage_manager.zone_bundler().clone(), + long_running_task_handles.zone_bundler.clone(), ); Ok(Self { @@ -242,12 +207,8 @@ impl BootstrapAgentStartup { ddm_admin_localhost_client, base_log, startup_log: log, - managers: BootstrapManagers { - hardware: hardware_manager, - storage: storage_manager, - service: service_manager, - }, - key_manager_handle, + service_manager, + long_running_task_handles, }) } } diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 638aa51dee..20778bd402 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -25,7 +25,6 @@ use crate::config::Config as SledConfig; use crate::config::ConfigError; use crate::server::Server as SledAgentServer; use crate::sled_agent::SledAgent; -use crate::storage_manager::StorageResources; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs new file mode 100644 index 0000000000..7304f10492 --- /dev/null +++ b/sled-agent/src/hardware_monitor.rs @@ -0,0 +1,270 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A task that listens for hardware events from the +//! [`sled_hardware::HardwareManager`] and dispatches them to other parts +//! of the bootstrap agent and sled-agent code. + +use crate::services::ServiceManager; +use crate::sled_agent::SledAgent; +use sled_hardware::{Baseboard, HardwareManager, HardwareUpdate}; +use sled_storage::disk::RawDisk; +use sled_storage::manager::StorageHandle; +use slog::Logger; +use std::fmt::Debug; +use tokio::sync::broadcast; +use tokio::sync::broadcast::error::RecvError; +use tokio::sync::mpsc; + +const QUEUE_SIZE: usize = 10; + +pub enum HardwareMonitorMsg { + SledAgentStarted(SledAgent), + ServiceManagerCreated(ServiceManager), +} + +impl Debug for HardwareMonitorMsg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + HardwareMonitorMsg::SledAgentStarted(_) => { + f.debug_struct("SledAgentStarted").finish() + } + HardwareMonitorMsg::ServiceManagerCreated(_) => { + f.debug_struct("ServiceManagerCreated").finish() + } + } + } +} + +// A thin wrapper around the the [`ServiceManager`] that caches the state +// whether or not the tofino is loaded if the [`ServiceManager`] doesn't exist +// yet. +enum TofinoManager { + Ready(ServiceManager), + NotReady { tofino_loaded: bool }, +} + +impl TofinoManager { + pub fn new() -> TofinoManager { + TofinoManager::NotReady { tofino_loaded: false } + } + + // Must only be called once on the transition from `NotReady` to `Ready`. + // Panics otherwise. 
+ // + // Returns whether the tofino was loaded or not + pub fn become_ready(&mut self, service_manager: ServiceManager) -> bool { + match self { + Self::Ready(_) => panic!("ServiceManager is already available"), + Self::NotReady { tofino_loaded } => { + *self = Self::Ready(service_manager); + *tofino_loaded + } + } + } +} + +#[derive(Clone)] +pub struct HardwareMonitorHandle { + tx: mpsc::Sender, +} + +pub struct HardwareMonitor { + log: Logger, + + baseboard: Baseboard, + + // Receive messages from the [`HardwareMonitorHandle`] + handle_rx: mpsc::Receiver, + + // Receive messages from the [`HardwareManager`] + hardware_rx: broadcast::Receiver, + + // A reference to the hardware manager + hardware_manager: HardwareManager, + + // A handle to [`sled_hardware::manager::StorageManger`] + storage_manager: StorageHandle, + + // A handle to the sled-agent + // + // This will go away once Nexus updates are polled: + // See: + // * https://github.com/oxidecomputer/omicron/issues/1917 + // * https://rfd.shared.oxide.computer/rfd/0433 + sled_agent: Option, + + // The [`ServiceManager`] is instantiated after we start the [`HardwareMonitor`] + // task. However, it is only used to load and unload the switch zone when thes + // state of the tofino changes. We keep track of the tofino state so that we + // can properly load the tofino when the [`ServiceManager`] becomes available + // available. + tofino_manager: TofinoManager, +} + +impl HardwareMonitor { + pub fn new( + log: &Logger, + hardware_manager: &mut HardwareManager, + storage_manager: &mut StorageHandle, + ) -> (HardwareMonitor, HardwareMonitorHandle) { + let baseboard = hardware_manager.baseboard(); + let (handle_tx, handle_rx) = mpsc::channel(QUEUE_SIZE); + let hardware_rx = hardware_manager.monitor(); + let log = log.new(o!("component" => "HardwareMonitor")); + let tofino_manager = TofinoManager::new(); + ( + HardwareMonitor { + log, + baseboard, + handle_rx, + hardware_rx, + hardware_manager: hardware_manager.clone(), + storage_manager: storage_manager.clone(), + sled_agent: None, + tofino_manager, + }, + HardwareMonitorHandle { tx: handle_tx }, + ) + } + + /// Run the main receive loop of the `StorageManager` + /// + /// This should be spawned into a tokio task + pub async fn run(&mut self) { + loop { + tokio::select! 
{ + Some(msg) = self.handle_rx.recv() => { + info!( + self.log, + "Received hardware monitor message"; + "msg" => ?msg + ); + self.handle_monitor_msg(msg).await; + } + update = self.hardware_rx.recv() => { + info!( + self.log, + "Received hardware update message"; + "update" => ?update, + ); + self.handle_hardware_update(update).await; + } + } + } + } + + // Handle a message from the [`HardwareMonitorHandle`] + async fn handle_monitor_msg(&mut self, msg: HardwareMonitorMsg) { + match msg { + HardwareMonitorMsg::SledAgentStarted(sled_agent) => { + self.sled_agent = Some(sled_agent); + self.check_latest_hardware_snapshot().await; + } + HardwareMonitorMsg::ServiceManagerCreated(service_manager) => { + let tofino_loaded = + self.tofino_manager.become_ready(service_manager); + if tofino_loaded { + self.activate_switch().await; + } + } + } + } + + // Handle an update from the [`HardwareMonitor`] + async fn handle_hardware_update( + &mut self, + update: Result, + ) { + match update { + Ok(update) => match update { + HardwareUpdate::TofinoLoaded => self.activate_switch().await, + HardwareUpdate::TofinoUnloaded => { + self.deactivate_switch().await + } + HardwareUpdate::TofinoDeviceChange => { + if let Some(sled_agent) = &mut self.sled_agent { + sled_agent.notify_nexus_about_self(&self.log); + } + } + HardwareUpdate::DiskAdded(disk) => { + self.storage_manager.upsert_disk(disk.into()).await; + } + HardwareUpdate::DiskRemoved(disk) => { + self.storage_manager.delete_disk(disk.into()).await; + } + }, + Err(broadcast::error::RecvError::Lagged(count)) => { + warn!(self.log, "Hardware monitor missed {count} messages"); + self.check_latest_hardware_snapshot().await; + } + Err(broadcast::error::RecvError::Closed) => { + // The `HardwareManager` monitoring task is an infinite loop - + // the only way for us to get `Closed` here is if it panicked, + // so we will propagate such a panic. + panic!("Hardware manager monitor task panicked"); + } + } + } + + async fn activate_switch(&mut self) { + match &mut self.tofino_manager { + TofinoManager::Ready(service_manager) => { + if let Err(e) = service_manager + .activate_switch( + self.sled_agent + .as_ref() + .map(|sa| sa.switch_zone_underlay_info()), + self.baseboard.clone(), + ) + .await + { + warn!(self.log, "Failed to activate switch: {e}"); + } + } + TofinoManager::NotReady { tofino_loaded } => { + *tofino_loaded = true; + } + } + } + + async fn deactivate_switch(&mut self) { + match &mut self.tofino_manager { + TofinoManager::Ready(service_manager) => { + if let Err(e) = service_manager.deactivate_switch().await { + warn!(self.log, "Failed to deactivate switch: {e}"); + } + } + TofinoManager::NotReady { tofino_loaded } => { + *tofino_loaded = false; + } + } + } + + // Observe the current hardware state manually. + // + // We use this when we're monitoring hardware for the first + // time, and if we miss notifications. 
+ async fn check_latest_hardware_snapshot(&mut self) { + let underlay_network = self.sled_agent.as_ref().map(|sled_agent| { + sled_agent.notify_nexus_about_self(&self.log); + sled_agent.switch_zone_underlay_info() + }); + info!( + self.log, "Checking current full hardware snapshot"; + "underlay_network_info" => ?underlay_network, + ); + if self.hardware_manager.is_scrimlet_driver_loaded() { + self.activate_switch().await; + } else { + self.deactivate_switch().await; + } + + self.storage_manager + .ensure_using_exactly_these_disks( + self.hardware_manager.disks().into_iter().map(RawDisk::from), + ) + .await; + } +} diff --git a/sled-agent/src/hardware_monitor.sh b/sled-agent/src/hardware_monitor.sh new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/sled-agent/src/hardware_monitor.sh @@ -0,0 +1 @@ + diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 376a154ed2..154a133272 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -19,6 +19,7 @@ pub mod common; // Modules for the non-simulated sled agent. pub mod bootstrap; pub mod config; +pub(crate) mod hardware_monitor; mod http_entrypoints; mod instance; mod instance_manager; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index cb82648a8c..b6c22ce633 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -19,7 +19,6 @@ use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; -use sled_agent_client::types::CleanupContext; use sled_hardware::{HardwareManager, SledMode}; use sled_storage::manager::{StorageHandle, StorageManager}; use slog::{info, Logger}; @@ -64,23 +63,24 @@ pub async fn spawn_all_longrunning_tasks( // Wait for the boot disk so that we can work with any ledgers, // such as those needed by the bootstore and sled-agent - let _ = storage_handle.wait_for_boot_disk().await; + let _ = storage_manager.wait_for_boot_disk().await; let bootstore = spawn_bootstore_tasks( log, - &mut storage_handle, + &mut storage_manager, &hardware_manager, global_zone_bootstrap_ip, ) .await; - let zone_bundler = spawn_zone_bundler_tasks(log, &mut storage_handle); + let zone_bundler = spawn_zone_bundler_tasks(log, &mut storage_manager); LongRunningTaskHandles { storage_key_requester, storage_manager, hardware_manager, bootstore, + zone_bundler, } } @@ -156,9 +156,5 @@ fn spawn_zone_bundler_tasks( storage_handle: &mut StorageHandle, ) -> ZoneBundler { let log = log.new(o!("component" => "ZoneBundler")); - let zone_bundler = ZoneBundler::new( - log, - storage_handle.clone(), - CleanupContext::default(), - ); + ZoneBundler::new(log, storage_handle.clone(), CleanupContext::default()) } diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 7e2050084b..e00c9ad4fa 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -617,7 +617,7 @@ mod tests { let logctx = test_setup_log("queued_disks_get_added_as_resources"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (mut manager, mut handle) = + let (mut manager, handle) = StorageManager::new(&logctx.log, key_requester); // Spawn the key_manager so that it will respond to requests for encryption keys From 9a3380ac0fab42e6045c2a69a01358de920f0f54 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 6 Oct 2023 21:02:46 +0000 Subject: [PATCH 24/66] wip --- sled-agent/src/bootstrap/pre_server.rs | 73 +++----------------------- sled-agent/src/hardware_monitor.rs | 20 ++++++- sled-agent/src/long_running_tasks.rs | 23 ++++++++ 3 files changed, 48 insertions(+), 68 deletions(-) diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 64d8119a33..bb9a040100 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -17,8 +17,6 @@ use crate::long_running_tasks::{ spawn_all_longrunning_tasks, LongRunningTaskHandles, }; use crate::services::ServiceManager; -use crate::sled_agent::SledAgent; -use crate::storage_manager::StorageManager; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; @@ -35,8 +33,6 @@ use omicron_common::address::Ipv6Subnet; use omicron_common::FileKv; use sled_hardware::underlay; use sled_hardware::DendriteAsic; -use sled_hardware::HardwareManager; -use sled_hardware::HardwareUpdate; use sled_hardware::SledMode; use sled_storage::disk::SyntheticDisk; use sled_storage::manager::StorageHandle; @@ -44,67 +40,6 @@ use slog::Drain; use slog::Logger; use std::net::IpAddr; use std::net::Ipv6Addr; -use tokio::sync::broadcast; -use tokio::task::JoinHandle; - -pub(super) struct BootstrapManagers { - pub(super) hardware: HardwareManager, - pub(super) storage: StorageManager, - pub(super) service: ServiceManager, -} - -impl BootstrapManagers { - pub(super) async fn handle_hardware_update( - &self, - update: Result, - sled_agent: Option<&SledAgent>, - log: &Logger, - ) { - match update { - Ok(update) => match update { - HardwareUpdate::TofinoLoaded => { - let baseboard = self.hardware.baseboard(); - if let Err(e) = self - .service - .activate_switch( - sled_agent.map(|sa| sa.switch_zone_underlay_info()), - baseboard, - ) - .await - { - warn!(log, "Failed to activate switch: {e}"); - } - } - HardwareUpdate::TofinoUnloaded => { - if let Err(e) = self.service.deactivate_switch().await { - warn!(log, "Failed to deactivate switch: {e}"); - } - } - HardwareUpdate::TofinoDeviceChange => { - if let Some(sled_agent) = sled_agent { - sled_agent.notify_nexus_about_self(log); - } - } - HardwareUpdate::DiskAdded(disk) => { - self.storage.upsert_disk(disk).await; - } - HardwareUpdate::DiskRemoved(disk) => { - self.storage.delete_disk(disk).await; - } - }, - Err(broadcast::error::RecvError::Lagged(count)) => { - warn!(log, "Hardware monitor missed {count} messages"); - self.check_latest_hardware_snapshot(sled_agent, log).await; - } - Err(broadcast::error::RecvError::Closed) => { - // The `HardwareManager` monitoring task is an infinite loop - - // the only way for us to get `Closed` here is if it panicked, - // so we will propagate such a panic. 
- panic!("Hardware manager monitor task panicked"); - } - } - } -} pub(super) struct BootstrapAgentStartup { pub(super) config: Config, @@ -112,7 +47,8 @@ pub(super) struct BootstrapAgentStartup { pub(super) ddm_admin_localhost_client: DdmAdminClient, pub(super) base_log: Logger, pub(super) startup_log: Logger, - pub(super) managers: BootstrapManagers, + pub(super) service_manager: ServiceManager, + pub(super) long_running_task_handles: LongRunningTaskHandles, } impl BootstrapAgentStartup { @@ -201,6 +137,11 @@ impl BootstrapAgentStartup { long_running_task_handles.zone_bundler.clone(), ); + long_running_task_handles + .hardware_monitor + .service_manager_ready(service_manager.clone()) + .await; + Ok(Self { config, global_zone_bootstrap_ip, diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs index 7304f10492..10e20ed92c 100644 --- a/sled-agent/src/hardware_monitor.rs +++ b/sled-agent/src/hardware_monitor.rs @@ -70,6 +70,22 @@ pub struct HardwareMonitorHandle { tx: mpsc::Sender, } +impl HardwareMonitorHandle { + pub async fn service_manager_ready(&self, service_manager: ServiceManager) { + self.tx + .send(HardwareMonitorMsg::ServiceManagerCreated(service_manager)) + .await + .unwrap(); + } + + pub async fn sled_agent_started(&self, sled_agent: SledAgent) { + self.tx + .send(HardwareMonitorMsg::SledAgentStarted(sled_agent)) + .await + .unwrap(); + } +} + pub struct HardwareMonitor { log: Logger, @@ -106,8 +122,8 @@ pub struct HardwareMonitor { impl HardwareMonitor { pub fn new( log: &Logger, - hardware_manager: &mut HardwareManager, - storage_manager: &mut StorageHandle, + hardware_manager: &HardwareManager, + storage_manager: &StorageHandle, ) -> (HardwareMonitor, HardwareMonitorHandle) { let baseboard = hardware_manager.baseboard(); let (handle_tx, handle_rx) = mpsc::channel(QUEUE_SIZE); diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index b6c22ce633..223289bb2e 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -16,6 +16,7 @@ use crate::bootstrap::bootstore::{ new_bootstore_config, poll_ddmd_for_bootstore_peer_update, }; use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; +use crate::hardware_monitor::{HardwareMonitor, HardwareMonitorHandle}; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; @@ -41,6 +42,10 @@ pub struct LongRunningTaskHandles { /// A mechanism for interacting with the hardware device tree pub hardware_manager: HardwareManager, + /// A mechanism for interacting with the task that monitors for hardware + /// updates from the [`HardwareManager`] + pub hardware_monitor: HardwareMonitorHandle, + // A handle for interacting with the bootstore pub bootstore: bootstore::NodeHandle, @@ -61,6 +66,10 @@ pub async fn spawn_all_longrunning_tasks( // TODO: Does this need to run inside tokio::task::spawn_blocking? 
let hardware_manager = spawn_hardware_manager(log, sled_mode); + // Start monitoring for hardware changes + let hardware_monitor = + spawn_hardware_monitor(log, &hardware_manager, &storage_manager); + // Wait for the boot disk so that we can work with any ledgers, // such as those needed by the bootstore and sled-agent let _ = storage_manager.wait_for_boot_disk().await; @@ -79,6 +88,7 @@ pub async fn spawn_all_longrunning_tasks( storage_key_requester, storage_manager, hardware_manager, + hardware_monitor, bootstore, zone_bundler, } @@ -122,6 +132,19 @@ fn spawn_hardware_manager( HardwareManager::new(log, sled_mode).unwrap() } +fn spawn_hardware_monitor( + log: &Logger, + hardware_manager: &HardwareManager, + storage_handle: &StorageHandle, +) -> HardwareMonitorHandle { + let (monitor, handle) = + HardwareMonitor::new(log, hardware_manager, storage_handle); + tokio::spawn(async move { + monitor.run().await; + }); + handle +} + async fn spawn_bootstore_tasks( log: &Logger, storage_handle: &mut StorageHandle, From b2c01e707c0acf8c7d1393dfd76c8d3917685912 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 6 Oct 2023 21:11:38 +0000 Subject: [PATCH 25/66] wip --- sled-agent/src/bootstrap/server.rs | 99 +++++------------------------- sled-agent/src/hardware_monitor.rs | 5 ++ 2 files changed, 20 insertions(+), 84 deletions(-) diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 20778bd402..4e07ee03b7 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -45,6 +45,8 @@ use serde::Deserialize; use serde::Serialize; use sled_hardware::underlay; use sled_hardware::HardwareUpdate; +use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::borrow::Cow; use std::io; @@ -175,68 +177,19 @@ impl Server { ddm_admin_localhost_client, base_log, startup_log, - managers, - key_manager_handle, + service_manager, + long_running_task_handles, } = BootstrapAgentStartup::run(config).await?; - // From this point on we will listen for hardware notifications and - // potentially start the switch zone and be notified of new disks; we - // are responsible for responding to updates from this point on. - let mut hardware_monitor = managers.hardware.monitor(); - let storage_resources = managers.storage.resources(); - - // Check the latest hardware snapshot; we could have missed events - // between the creation of the hardware manager and our subscription of - // its monitor. - managers.check_latest_hardware_snapshot(None, &startup_log).await; - - // Wait for our boot M.2 to show up. - wait_while_handling_hardware_updates( - wait_for_boot_m2(storage_resources, &startup_log), - &mut hardware_monitor, - &managers, - None, // No underlay network yet - &startup_log, - "waiting for boot M.2", - ) - .await; - - // Wait for the bootstore to start. - let bootstore_handles = wait_while_handling_hardware_updates( - BootstoreHandles::spawn( - storage_resources, - ddm_admin_localhost_client.clone(), - managers.hardware.baseboard(), - global_zone_bootstrap_ip, - &base_log, - ), - &mut hardware_monitor, - &managers, - None, // No underlay network yet - &startup_log, - "initializing bootstore", - ) - .await?; - // Do we have a StartSledAgentRequest stored in the ledger? 
- let maybe_ledger = wait_while_handling_hardware_updates( - async { - let paths = sled_config_paths(storage_resources).await?; - let maybe_ledger = - Ledger::>::new( - &startup_log, - paths, - ) - .await; - Ok::<_, StartError>(maybe_ledger) - }, - &mut hardware_monitor, - &managers, - None, // No underlay network yet + let paths = + sled_config_paths(&long_running_task_handles.storage_manager) + .await?; + let maybe_ledger = Ledger::>::new( &startup_log, - "loading sled-agent request from ledger", + paths, ) - .await?; + .await; // We don't yet _act_ on the `StartSledAgentRequest` if we have one, but // if we have one we init our `RssAccess` noting that we're already @@ -522,28 +475,6 @@ fn start_dropshot_server( Ok(http_server) } -/// Wait for at least the M.2 we booted from to show up. -/// -/// TODO-correctness Subsequent steps may assume all M.2s that will ever be -/// present are present once we return from this function; see -/// https://github.com/oxidecomputer/omicron/issues/3815. -async fn wait_for_boot_m2(storage_resources: &StorageResources, log: &Logger) { - // Wait for at least the M.2 we booted from to show up. - loop { - match storage_resources.boot_disk().await { - Some(disk) => { - info!(log, "Found boot disk M.2: {disk:?}"); - break; - } - None => { - info!(log, "Waiting for boot disk M.2..."); - tokio::time::sleep(core::time::Duration::from_millis(250)) - .await; - } - } - } -} - struct MissingM2Paths(&'static str); impl From for StartError { @@ -559,17 +490,17 @@ impl From for SledAgentServerStartError { } async fn sled_config_paths( - storage: &StorageResources, + storage: &StorageHandle, ) -> Result, MissingM2Paths> { - let paths: Vec<_> = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) - .await + let resources = storage.get_latest_resources().await; + let paths: Vec<_> = resources + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(SLED_AGENT_REQUEST_FILE)) .collect(); if paths.is_empty() { - return Err(MissingM2Paths(sled_hardware::disk::CONFIG_DATASET)); + return Err(MissingM2Paths(CONFIG_DATASET)); } Ok(paths) } diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs index 10e20ed92c..e296a3bdca 100644 --- a/sled-agent/src/hardware_monitor.rs +++ b/sled-agent/src/hardware_monitor.rs @@ -149,6 +149,11 @@ impl HardwareMonitor { /// /// This should be spawned into a tokio task pub async fn run(&mut self) { + // Check the latest hardware snapshot; we could have missed events + // between the creation of the hardware manager and our subscription of + // its monitor. + self.check_latest_hardware_snapshot().await; + loop { tokio::select! { Some(msg) = self.handle_rx.recv() => { From 327578ca51bd75f9905f9126f56cfb1fd1b8aaa0 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Fri, 6 Oct 2023 22:39:16 +0000 Subject: [PATCH 26/66] wip --- sled-agent/src/bootstrap/http_entrypoints.rs | 6 +- sled-agent/src/bootstrap/rack_ops.rs | 12 +- sled-agent/src/bootstrap/rss_handle.rs | 6 +- sled-agent/src/bootstrap/server.rs | 172 +--- sled-agent/src/instance.rs | 11 +- sled-agent/src/instance_manager.rs | 6 +- sled-agent/src/params.rs | 61 +- sled-agent/src/rack_setup/plan/service.rs | 17 +- sled-agent/src/rack_setup/plan/sled.rs | 9 +- sled-agent/src/rack_setup/service.rs | 26 +- sled-agent/src/server.rs | 9 +- sled-agent/src/sled_agent.rs | 49 +- sled-agent/src/storage/dataset.rs | 63 -- sled-agent/src/storage/dump_setup.rs | 76 +- sled-agent/src/storage/mod.rs | 1 - sled-storage/src/dump_setup.rs | 803 ------------------- sled-storage/src/lib.rs | 1 - 17 files changed, 156 insertions(+), 1172 deletions(-) delete mode 100644 sled-agent/src/storage/dataset.rs delete mode 100644 sled-storage/src/dump_setup.rs diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index c69bdeb0ce..7c32bf48a5 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -12,7 +12,6 @@ use super::BootstrapError; use super::RssAccessError; use crate::bootstrap::params::RackInitializeRequest; use crate::bootstrap::rack_ops::{RackInitId, RackResetId}; -use crate::storage_manager::StorageResources; use crate::updates::ConfigUpdates; use crate::updates::{Component, UpdateManager}; use bootstore::schemes::v0 as bootstore; @@ -25,6 +24,7 @@ use omicron_common::api::external::Error; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_hardware::Baseboard; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::Ipv6Addr; use tokio::sync::mpsc::error::TrySendError; @@ -33,7 +33,7 @@ use tokio::sync::{mpsc, oneshot}; pub(crate) struct BootstrapServerContext { pub(crate) base_log: Logger, pub(crate) global_zone_bootstrap_ip: Ipv6Addr, - pub(crate) storage_resources: StorageResources, + pub(crate) storage_manager: StorageHandle, pub(crate) bootstore_node_handle: bootstore::NodeHandle, pub(crate) baseboard: Baseboard, pub(crate) rss_access: RssAccess, @@ -50,7 +50,7 @@ impl BootstrapServerContext { self.rss_access.start_initializing( &self.base_log, self.global_zone_bootstrap_ip, - &self.storage_resources, + &self.storage_manager, &self.bootstore_node_handle, request, ) diff --git a/sled-agent/src/bootstrap/rack_ops.rs b/sled-agent/src/bootstrap/rack_ops.rs index b8721f8332..5cfd0b074a 100644 --- a/sled-agent/src/bootstrap/rack_ops.rs +++ b/sled-agent/src/bootstrap/rack_ops.rs @@ -8,11 +8,11 @@ use crate::bootstrap::http_entrypoints::RackOperationStatus; use crate::bootstrap::params::RackInitializeRequest; use crate::bootstrap::rss_handle::RssHandle; use crate::rack_setup::service::SetupServiceError; -use crate::storage_manager::StorageResources; use bootstore::schemes::v0 as bootstore; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::mem; use std::net::Ipv6Addr; @@ -171,7 +171,7 @@ impl RssAccess { &self, parent_log: &Logger, global_zone_bootstrap_ip: Ipv6Addr, - storage_resources: &StorageResources, + storage_manager: &StorageHandle, bootstore_node_handle: &bootstore::NodeHandle, request: RackInitializeRequest, ) -> Result { @@ -207,14 +207,14 @@ impl RssAccess { mem::drop(status); let parent_log = parent_log.clone(); - let storage_resources = 
storage_resources.clone(); + let storage_manager = storage_manager.clone(); let bootstore_node_handle = bootstore_node_handle.clone(); let status = Arc::clone(&self.status); tokio::spawn(async move { let result = rack_initialize( &parent_log, global_zone_bootstrap_ip, - storage_resources, + storage_manager, bootstore_node_handle, request, ) @@ -342,7 +342,7 @@ enum RssStatus { async fn rack_initialize( parent_log: &Logger, global_zone_bootstrap_ip: Ipv6Addr, - storage_resources: StorageResources, + storage_manager: StorageHandle, bootstore_node_handle: bootstore::NodeHandle, request: RackInitializeRequest, ) -> Result<(), SetupServiceError> { @@ -350,7 +350,7 @@ async fn rack_initialize( parent_log, request, global_zone_bootstrap_ip, - storage_resources, + storage_manager, bootstore_node_handle, ) .await diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index c82873d91d..5d9c01e7f2 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ -9,7 +9,6 @@ use super::params::StartSledAgentRequest; use crate::rack_setup::config::SetupServiceConfig; use crate::rack_setup::service::RackSetupService; use crate::rack_setup::service::SetupServiceError; -use crate::storage_manager::StorageResources; use ::bootstrap_agent_client::Client as BootstrapAgentClient; use bootstore::schemes::v0 as bootstore; use futures::stream::FuturesUnordered; @@ -17,6 +16,7 @@ use futures::StreamExt; use omicron_common::backoff::retry_notify; use omicron_common::backoff::retry_policy_local; use omicron_common::backoff::BackoffError; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::Ipv6Addr; use std::net::SocketAddrV6; @@ -46,7 +46,7 @@ impl RssHandle { log: &Logger, config: SetupServiceConfig, our_bootstrap_address: Ipv6Addr, - storage_resources: StorageResources, + storage_manager: StorageHandle, bootstore: bootstore::NodeHandle, ) -> Result<(), SetupServiceError> { let (tx, rx) = rss_channel(our_bootstrap_address); @@ -54,7 +54,7 @@ impl RssHandle { let rss = RackSetupService::new( log.new(o!("component" => "RSS")), config, - storage_resources, + storage_manager, tx, bootstore, ); diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 4e07ee03b7..94c326eef5 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -8,7 +8,6 @@ use super::config::BOOTSTRAP_AGENT_HTTP_PORT; use super::http_entrypoints; use super::params::RackInitializeRequest; use super::params::StartSledAgentRequest; -use super::pre_server::BootstrapManagers; use super::rack_ops::RackInitId; use super::views::SledAgentResponse; use super::BootstrapError; @@ -23,15 +22,15 @@ use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; use crate::bootstrap::sprockets_server::SprocketsServer; use crate::config::Config as SledConfig; use crate::config::ConfigError; +use crate::long_running_tasks::LongRunningTaskHandles; use crate::server::Server as SledAgentServer; +use crate::services::ServiceManager; use crate::sled_agent::SledAgent; -use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; use ddm_admin_client::DdmError; use dropshot::HttpServer; -use futures::Future; use futures::StreamExt; use illumos_utils::dladm; use illumos_utils::zfs; @@ -44,7 +43,6 @@ use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; use sled_hardware::underlay; -use 
sled_hardware::HardwareUpdate; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; use slog::Logger; @@ -52,7 +50,6 @@ use std::borrow::Cow; use std::io; use std::net::SocketAddr; use std::net::SocketAddrV6; -use tokio::sync::broadcast; use tokio::sync::mpsc; use tokio::sync::oneshot; use tokio::task::JoinHandle; @@ -206,9 +203,9 @@ impl Server { let bootstrap_context = BootstrapServerContext { base_log: base_log.clone(), global_zone_bootstrap_ip, - storage_resources: storage_resources.clone(), - bootstore_node_handle: bootstore_handles.node_handle.clone(), - baseboard: managers.hardware.baseboard(), + storage_manager: long_running_task_handles.storage_manager.clone(), + bootstore_node_handle: long_running_task_handles.bootstore.clone(), + baseboard: long_running_task_handles.hardware_manager.baseboard(), rss_access, updates: config.updates.clone(), sled_reset_tx, @@ -240,52 +237,31 @@ impl Server { // Do we have a persistent sled-agent request that we need to restore? let state = if let Some(ledger) = maybe_ledger { let sled_request = ledger.data(); - let sled_agent_server = wait_while_handling_hardware_updates( - start_sled_agent( - &config, - &sled_request.request, - &bootstore_handles.node_handle, - &managers, - &ddm_admin_localhost_client, - &base_log, - &startup_log, - ), - &mut hardware_monitor, - &managers, - None, // No underlay network yet + let sled_agent_server = start_sled_agent( + &config, + &sled_request.request, + long_running_task_handles.clone(), + service_manager, + &ddm_admin_localhost_client, + &base_log, &startup_log, - "restoring sled-agent (cold boot)", ) .await?; - let sled_agent = sled_agent_server.sled_agent(); - // We've created sled-agent; we need to (possibly) reconfigure the // switch zone, if we're a scrimlet, to give it our underlay network // information. - let underlay_network_info = sled_agent.switch_zone_underlay_info(); - info!( - startup_log, "Sled Agent started; rescanning hardware"; - "underlay_network_info" => ?underlay_network_info, - ); - managers - .check_latest_hardware_snapshot(Some(&sled_agent), &startup_log) + let sled_agent = sled_agent_server.sled_agent(); + long_running_task_handles + .hardware_monitor + .sled_agent_started(sled_agent.clone()) .await; // For cold boot specifically, we now need to load the services // we're responsible for, while continuing to handle hardware // notifications. This cannot fail: we retry indefinitely until // we're done loading services. - wait_while_handling_hardware_updates( - sled_agent.cold_boot_load_services(), - &mut hardware_monitor, - &managers, - Some(&sled_agent), - &startup_log, - "restoring sled-agent services (cold boot)", - ) - .await; - + sled_agent.cold_boot_load_services().await; SledAgentState::ServerStarted(sled_agent_server) } else { SledAgentState::Bootstrapping @@ -296,15 +272,13 @@ impl Server { // agent state. 
let inner = Inner { config, - hardware_monitor, state, sled_init_rx, sled_reset_rx, - managers, ddm_admin_localhost_client, - bootstore_handles, + long_running_task_handles, + service_manager, _sprockets_server_handle: sprockets_server_handle, - _key_manager_handle: key_manager_handle, base_log, }; let inner_task = tokio::spawn(inner.run()); @@ -378,8 +352,8 @@ impl From for StartError { async fn start_sled_agent( config: &SledConfig, request: &StartSledAgentRequest, - bootstore: &bootstore::NodeHandle, - managers: &BootstrapManagers, + long_running_task_handles: LongRunningTaskHandles, + service_manager: ServiceManager, ddmd_client: &DdmAdminClient, base_log: &Logger, log: &Logger, @@ -394,14 +368,17 @@ async fn start_sled_agent( if request.use_trust_quorum { info!(log, "KeyManager: using lrtq secret retriever"); let salt = request.hash_rack_id(); - LrtqOrHardcodedSecretRetriever::init_lrtq(salt, bootstore.clone()) + LrtqOrHardcodedSecretRetriever::init_lrtq( + salt, + long_running_task_handles.bootstore.clone(), + ) } else { info!(log, "KeyManager: using hardcoded secret retriever"); LrtqOrHardcodedSecretRetriever::init_hardcoded(); } // Inform the storage service that the key manager is available - managers.storage.key_manager_ready().await; + long_running_task_handles.storage_manager.key_manager_ready().await; // Start trying to notify ddmd of our sled prefix so it can // advertise it to other sleds. @@ -421,9 +398,8 @@ async fn start_sled_agent( config, base_log.clone(), request.clone(), - managers.service.clone(), - managers.storage.clone(), - bootstore.clone(), + long_running_task_handles.clone(), + service_manager, ) .await .map_err(SledAgentServerStartError::FailedStartingServer)?; @@ -432,7 +408,8 @@ async fn start_sled_agent( // Record this request so the sled agent can be automatically // initialized on the next boot. - let paths = sled_config_paths(managers.storage.resources()).await?; + let paths = + sled_config_paths(&long_running_task_handles.storage_manager).await?; let mut ledger = Ledger::new_with( &log, @@ -505,41 +482,6 @@ async fn sled_config_paths( Ok(paths) } -// Helper function to wait for `fut` while handling any updates about hardware. -async fn wait_while_handling_hardware_updates, T>( - fut: F, - hardware_monitor: &mut broadcast::Receiver, - managers: &BootstrapManagers, - sled_agent: Option<&SledAgent>, - log: &Logger, - log_phase: &str, -) -> T { - tokio::pin!(fut); - loop { - tokio::select! { - // Cancel-safe per the docs on `broadcast::Receiver::recv()`. - hardware_update = hardware_monitor.recv() => { - info!( - log, - "Handling hardware update message"; - "phase" => log_phase, - "update" => ?hardware_update, - ); - - managers.handle_hardware_update( - hardware_update, - sled_agent, - log, - ).await; - } - - // Cancel-safe: we're using a `&mut Future`; dropping the - // reference does not cancel the underlying future. 
- result = &mut fut => return result, - } - } -} - #[derive(Clone, Serialize, Deserialize, PartialEq, JsonSchema)] struct PersistentSledAgentRequest<'a> { request: Cow<'a, StartSledAgentRequest>, @@ -565,18 +507,16 @@ pub fn run_openapi() -> Result<(), String> { struct Inner { config: SledConfig, - hardware_monitor: broadcast::Receiver, state: SledAgentState, sled_init_rx: mpsc::Receiver<( StartSledAgentRequest, oneshot::Sender>, )>, sled_reset_rx: mpsc::Receiver>>, - managers: BootstrapManagers, ddm_admin_localhost_client: DdmAdminClient, - bootstore_handles: BootstoreHandles, + service_manager: ServiceManager, + long_running_task_handles: LongRunningTaskHandles, _sprockets_server_handle: JoinHandle<()>, - _key_manager_handle: JoinHandle<()>, base_log: Logger, } @@ -584,14 +524,7 @@ impl Inner { async fn run(mut self) { let log = self.base_log.new(o!("component" => "SledAgentMain")); loop { - // TODO-correctness We pause handling hardware update messages while - // we handle sled init/reset requests - is that okay? tokio::select! { - // Cancel-safe per the docs on `broadcast::Receiver::recv()`. - hardware_update = self.hardware_monitor.recv() => { - self.handle_hardware_update(hardware_update, &log).await; - } - // Cancel-safe per the docs on `mpsc::Receiver::recv()`. Some((request, response_tx)) = self.sled_init_rx.recv() => { self.handle_start_sled_agent_request( @@ -619,27 +552,6 @@ impl Inner { } } - async fn handle_hardware_update( - &self, - hardware_update: Result, - log: &Logger, - ) { - info!( - log, - "Handling hardware update message"; - "phase" => "bootstore-steady-state", - "update" => ?hardware_update, - ); - - self.managers - .handle_hardware_update( - hardware_update, - self.state.sled_agent(), - &log, - ) - .await; - } - async fn handle_start_sled_agent_request( &mut self, request: StartSledAgentRequest, @@ -651,8 +563,8 @@ impl Inner { let response = match start_sled_agent( &self.config, &request, - &self.bootstore_handles.node_handle, - &self.managers, + self.long_running_task_handles.clone(), + self.service_manager.clone(), &self.ddm_admin_localhost_client, &self.base_log, &log, @@ -663,11 +575,9 @@ impl Inner { // We've created sled-agent; we need to (possibly) // reconfigure the switch zone, if we're a scrimlet, to // give it our underlay network information. 
- self.managers - .check_latest_hardware_snapshot( - Some(server.sled_agent()), - log, - ) + self.long_running_task_handles + .hardware_monitor + .sled_agent_started(server.sled_agent().clone()) .await; self.state = SledAgentState::ServerStarted(server); @@ -725,11 +635,11 @@ impl Inner { async fn uninstall_sled_local_config(&self) -> Result<(), BootstrapError> { let config_dirs = self - .managers - .storage - .resources() - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .long_running_task_handles + .storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter(); for dir in config_dirs { diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index baf92af28a..88ece3e3b0 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -17,7 +17,6 @@ use crate::params::{ InstanceMigrationTargetParams, InstanceStateRequested, VpcFirewallRule, }; use crate::profile::*; -use crate::storage_manager::StorageResources; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; use anyhow::anyhow; @@ -40,7 +39,8 @@ use omicron_common::backoff; use propolis_client::Client as PropolisClient; use rand::prelude::SliceRandom; use rand::SeedableRng; -use sled_hardware::disk::ZONE_DATASET; +use sled_storage::dataset::ZONE_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::IpAddr; use std::net::{SocketAddr, SocketAddrV6}; @@ -243,7 +243,7 @@ struct InstanceInner { nexus_client: NexusClientWithResolver, // Storage resources - storage: StorageResources, + storage: StorageHandle, // Object used to collect zone bundles from this instance when terminated. zone_bundler: ZoneBundler, @@ -622,7 +622,7 @@ impl Instance { vnic_allocator: VnicAllocator, port_manager: PortManager, nexus_client: NexusClientWithResolver, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, ) -> Result { info!(log, "Instance::new w/initial HW: {:?}", initial); @@ -889,8 +889,9 @@ impl Instance { let mut rng = rand::rngs::StdRng::from_entropy(); let root = inner .storage - .all_u2_mountpoints(ZONE_DATASET) + .get_latest_resources() .await + .all_u2_mountpoints(ZONE_DATASET) .choose(&mut rng) .ok_or_else(|| Error::U2NotFound)? 
.clone(); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index bdd29e4d1f..c6310d28f2 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -12,7 +12,6 @@ use crate::params::{ InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, }; -use crate::storage_manager::StorageResources; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; use illumos_utils::dladm::Etherstub; @@ -21,6 +20,7 @@ use illumos_utils::opte::PortManager; use illumos_utils::vmm_reservoir; use omicron_common::api::external::ByteCount; use omicron_common::api::internal::nexus::InstanceRuntimeState; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::sync::{Arc, Mutex}; @@ -62,7 +62,7 @@ struct InstanceManagerInternal { vnic_allocator: VnicAllocator, port_manager: PortManager, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, } @@ -78,7 +78,7 @@ impl InstanceManager { nexus_client: NexusClientWithResolver, etherstub: Etherstub, port_manager: PortManager, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, ) -> Result { Ok(InstanceManager { diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index d0fa2fbe4d..e736793298 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -18,6 +18,7 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_hardware::Baseboard; pub use sled_hardware::DendriteAsic; +use sled_storage::dataset::DatasetName; use std::fmt::{Debug, Display, Formatter, Result as FormatResult}; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::time::Duration; @@ -210,64 +211,6 @@ pub struct Zpool { pub disk_type: DiskType, } -/// The type of a dataset, and an auxiliary information necessary -/// to successfully launch a zone managing the associated data. -#[derive( - Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, -)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum DatasetKind { - CockroachDb, - Crucible, - Clickhouse, - ClickhouseKeeper, - ExternalDns, - InternalDns, -} - -impl From for sled_agent_client::types::DatasetKind { - fn from(k: DatasetKind) -> Self { - use DatasetKind::*; - match k { - CockroachDb => Self::CockroachDb, - Crucible => Self::Crucible, - Clickhouse => Self::Clickhouse, - ClickhouseKeeper => Self::ClickhouseKeeper, - ExternalDns => Self::ExternalDns, - InternalDns => Self::InternalDns, - } - } -} - -impl From for nexus_client::types::DatasetKind { - fn from(k: DatasetKind) -> Self { - use DatasetKind::*; - match k { - CockroachDb => Self::Cockroach, - Crucible => Self::Crucible, - Clickhouse => Self::Clickhouse, - ClickhouseKeeper => Self::ClickhouseKeeper, - ExternalDns => Self::ExternalDns, - InternalDns => Self::InternalDns, - } - } -} - -impl std::fmt::Display for DatasetKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - use DatasetKind::*; - let s = match self { - Crucible => "crucible", - CockroachDb { .. } => "cockroachdb", - Clickhouse => "clickhouse", - ClickhouseKeeper => "clickhouse_keeper", - ExternalDns { .. } => "external_dns", - InternalDns { .. } => "internal_dns", - }; - write!(f, "{}", s) - } -} - /// Describes service-specific parameters. 
#[derive( Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, @@ -577,7 +520,7 @@ impl std::fmt::Display for ZoneType { )] pub struct DatasetRequest { pub id: Uuid, - pub name: crate::storage::dataset::DatasetName, + pub name: DatasetName, pub service_address: SocketAddrV6, } diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 2183aa7b63..01fababa4d 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -11,7 +11,6 @@ use crate::params::{ }; use crate::rack_setup::config::SetupServiceConfig as Config; use crate::storage::dataset::DatasetName; -use crate::storage_manager::StorageResources; use camino::Utf8PathBuf; use dns_service_client::types::DnsConfigParams; use illumos_utils::zpool::ZpoolName; @@ -35,6 +34,8 @@ use serde::{Deserialize, Serialize}; use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, }; +use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::{BTreeSet, HashMap, HashSet}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV6}; @@ -124,11 +125,12 @@ const RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan.json"; impl Plan { pub async fn load( log: &Logger, - storage: &StorageResources, + storage_manager: &StorageHandle, ) -> Result, PlanError> { - let paths: Vec = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + let paths: Vec = storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_SERVICE_PLAN_FILENAME)) .collect(); @@ -236,7 +238,7 @@ impl Plan { pub async fn create( log: &Logger, config: &Config, - storage: &StorageResources, + storage_manager: &StorageHandle, sleds: &HashMap, ) -> Result { let mut dns_builder = internal_dns::DnsConfigBuilder::new(); @@ -724,9 +726,10 @@ impl Plan { let plan = Self { services, dns_config }; // Once we've constructed a plan, write it down to durable storage. - let paths: Vec = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + let paths: Vec = storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_SERVICE_PLAN_FILENAME)) .collect(); diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index ea12f0db32..189216fd9b 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -13,6 +13,8 @@ use camino::Utf8PathBuf; use omicron_common::ledger::{self, Ledger, Ledgerable}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::{HashMap, HashSet}; use std::net::{Ipv6Addr, SocketAddrV6}; @@ -77,7 +79,7 @@ impl Plan { pub async fn create( log: &Logger, config: &Config, - storage: &StorageResources, + storage_manager: &StorageHandle, bootstrap_addrs: HashSet, use_trust_quorum: bool, ) -> Result { @@ -119,9 +121,10 @@ impl Plan { let plan = Self { rack_id, sleds, config: config.clone() }; // Once we've constructed a plan, write it down to durable storage. 
- let paths: Vec = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + let paths: Vec = storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_SLED_PLAN_FILENAME)) .collect(); diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 805c889295..42290f5ce8 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -73,7 +73,6 @@ use crate::rack_setup::plan::service::{ use crate::rack_setup::plan::sled::{ Plan as SledPlan, PlanError as SledPlanError, }; -use crate::storage_manager::StorageResources; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; @@ -93,6 +92,8 @@ use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, }; use sled_hardware::underlay::BootstrapInterface; +use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeSet; use std::collections::{HashMap, HashSet}; @@ -186,7 +187,7 @@ impl RackSetupService { pub(crate) fn new( log: Logger, config: Config, - storage_resources: StorageResources, + storage_manager: StorageHandle, local_bootstrap_agent: BootstrapAgentHandle, bootstore: bootstore::NodeHandle, ) -> Self { @@ -195,7 +196,7 @@ impl RackSetupService { if let Err(e) = svc .run( &config, - &storage_resources, + &storage_manager, local_bootstrap_agent, bootstore, ) @@ -741,7 +742,7 @@ impl ServiceInner { async fn run( &self, config: &Config, - storage_resources: &StorageResources, + storage_manager: &StorageHandle, local_bootstrap_agent: BootstrapAgentHandle, bootstore: bootstore::NodeHandle, ) -> Result<(), SetupServiceError> { @@ -752,9 +753,10 @@ impl ServiceInner { config.az_subnet(), )?; - let marker_paths: Vec = storage_resources - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + let marker_paths: Vec = storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_COMPLETED_FILENAME)) .collect(); @@ -775,7 +777,7 @@ impl ServiceInner { "RSS configuration looks like it has already been applied", ); - let sled_plan = SledPlan::load(&self.log, storage_resources) + let sled_plan = SledPlan::load(&self.log, storage_manager) .await? .expect("Sled plan should exist if completed marker exists"); if &sled_plan.config != config { @@ -783,7 +785,7 @@ impl ServiceInner { "Configuration changed".to_string(), )); } - let service_plan = ServicePlan::load(&self.log, storage_resources) + let service_plan = ServicePlan::load(&self.log, storage_manager) .await? .expect("Service plan should exist if completed marker exists"); @@ -817,7 +819,7 @@ impl ServiceInner { BootstrapAddressDiscovery::OnlyThese { addrs } => addrs.clone(), }; let maybe_sled_plan = - SledPlan::load(&self.log, storage_resources).await?; + SledPlan::load(&self.log, storage_manager).await?; if let Some(plan) = &maybe_sled_plan { let stored_peers: HashSet = plan.sleds.keys().map(|a| *a.ip()).collect(); @@ -849,7 +851,7 @@ impl ServiceInner { SledPlan::create( &self.log, config, - &storage_resources, + &storage_manager, bootstrap_addrs, config.trust_quorum_peers.is_some(), ) @@ -902,14 +904,14 @@ impl ServiceInner { }) .collect(); let service_plan = if let Some(plan) = - ServicePlan::load(&self.log, storage_resources).await? + ServicePlan::load(&self.log, storage_manager).await? 
{ plan } else { ServicePlan::create( &self.log, &config, - &storage_resources, + &storage_manager, &plan.sleds, ) .await? diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index 156547627c..c9828e7542 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -8,10 +8,9 @@ use super::config::Config; use super::http_entrypoints::api as http_api; use super::sled_agent::SledAgent; use crate::bootstrap::params::StartSledAgentRequest; +use crate::long_running_tasks::LongRunningTaskHandles; use crate::nexus::NexusClientWithResolver; use crate::services::ServiceManager; -use crate::storage_manager::StorageManager; -use bootstore::schemes::v0 as bootstore; use internal_dns::resolver::Resolver; use slog::Logger; use std::net::SocketAddr; @@ -39,9 +38,8 @@ impl Server { config: &Config, log: Logger, request: StartSledAgentRequest, + long_running_tasks_handles: LongRunningTaskHandles, services: ServiceManager, - storage: StorageManager, - bootstore: bootstore::NodeHandle, ) -> Result { info!(log, "setting up sled agent server"); @@ -63,8 +61,7 @@ impl Server { nexus_client, request, services, - storage, - bootstore, + long_running_tasks_handles, ) .await .map_err(|e| e.to_string())?; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index dc130524f6..475cbf8018 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -10,6 +10,7 @@ use crate::bootstrap::early_networking::{ use crate::bootstrap::params::StartSledAgentRequest; use crate::config::Config; use crate::instance_manager::InstanceManager; +use crate::long_running_tasks::LongRunningTaskHandles; use crate::nexus::{NexusClientWithResolver, NexusRequestQueue}; use crate::params::{ DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, @@ -18,11 +19,9 @@ use crate::params::{ VpcFirewallRule, ZoneBundleMetadata, Zpool, }; use crate::services::{self, ServiceManager}; -use crate::storage_manager::{self, StorageManager}; use crate::updates::{ConfigUpdates, UpdateManager}; use crate::zone_bundle; use crate::zone_bundle::BundleError; -use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use dropshot::HttpError; use illumos_utils::opte::params::{ @@ -46,6 +45,8 @@ use omicron_common::backoff::{ }; use sled_hardware::underlay; use sled_hardware::HardwareManager; +use sled_storage::dataset::DatasetName; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::net::{Ipv6Addr, SocketAddrV6}; @@ -200,7 +201,7 @@ struct SledAgentInner { subnet: Ipv6Subnet, // Component of Sled Agent responsible for storage and dataset management. - storage: StorageManager, + storage: StorageHandle, // Component of Sled Agent responsible for managing Propolis instances. instances: InstanceManager, @@ -254,8 +255,7 @@ impl SledAgent { nexus_client: NexusClientWithResolver, request: StartSledAgentRequest, services: ServiceManager, - storage: StorageManager, - bootstore: bootstore::NodeHandle, + long_running_task_handles: LongRunningTaskHandles, ) -> Result { // Pass the "parent_log" to all subcomponents that want to set their own // "component" value. @@ -268,12 +268,17 @@ impl SledAgent { )); info!(&log, "SledAgent::new(..) starting"); + let storage_manager = &long_running_task_handles.storage_manager; + // Configure a swap device of the configured size before other system setup. 
match config.swap_device_size_gb { Some(sz) if sz > 0 => { info!(log, "Requested swap device of size {} GiB", sz); - let boot_disk = - storage.resources().boot_disk().await.ok_or_else(|| { + let boot_disk = storage_manager + .get_latest_resources() + .await + .boot_disk() + .ok_or_else(|| { crate::swap_device::SwapDeviceError::BootDiskNotFound })?; crate::swap_device::ensure_swap_device( @@ -324,28 +329,13 @@ impl SledAgent { *sled_address.ip(), ); - storage - .setup_underlay_access(storage_manager::UnderlayAccess { - nexus_client: nexus_client.clone(), - sled_id: request.id, - }) - .await?; - - // TODO-correctness The bootstrap agent _also_ has a `HardwareManager`. - // We only use it for reading properties, but it's not `Clone`able - // because it's holding an inner task handle. Could we add a way to get - // a read-only handle to it, and have bootstrap agent give us that - // instead of creating a new full one ourselves? - let hardware = HardwareManager::new(&parent_log, services.sled_mode()) - .map_err(|e| Error::Hardware(e))?; - let instances = InstanceManager::new( parent_log.clone(), nexus_client.clone(), etherstub.clone(), port_manager.clone(), - storage.resources().clone(), - storage.zone_bundler().clone(), + storage_manager.clone(), + long_running_task_handles.zone_bundler.clone(), )?; match config.vmm_reservoir_percentage { @@ -378,7 +368,8 @@ impl SledAgent { // until we have this, as we need to know which switches have uplinks to // correctly set up services. let get_network_config = || async { - let serialized_config = bootstore + let serialized_config = long_running_task_handles + .bootstore .get_network_config() .await .map_err(|err| BackoffError::transient(err.to_string()))? @@ -421,14 +412,13 @@ impl SledAgent { rack_network_config.clone(), )?; - let zone_bundler = storage.zone_bundler().clone(); let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { id: request.id, subnet: request.subnet, - storage, + storage: long_running_task_handles.storage_manager.clone(), instances, - hardware, + hardware: long_running_task_handles.hardware_manager.clone(), updates, port_manager, services, @@ -442,7 +432,7 @@ impl SledAgent { // request queue? nexus_request_queue: NexusRequestQueue::new(), rack_network_config, - zone_bundler, + zone_bundler: long_running_task_handles.zone_bundler.clone(), }), log: log.clone(), }; @@ -462,6 +452,7 @@ impl SledAgent { /// Blocks until all services have started, retrying indefinitely on /// failure. pub(crate) async fn cold_boot_load_services(&self) { + info!(self.log, "Loading cold boot services"); retry_notify( retry_policy_internal_service_aggressive(), || async { diff --git a/sled-agent/src/storage/dataset.rs b/sled-agent/src/storage/dataset.rs deleted file mode 100644 index 4efc0f320a..0000000000 --- a/sled-agent/src/storage/dataset.rs +++ /dev/null @@ -1,63 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use crate::params::DatasetKind; -use illumos_utils::zpool::ZpoolName; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; -use std::str::FromStr; - -#[derive( - Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Clone, JsonSchema, -)] -pub struct DatasetName { - // A unique identifier for the Zpool on which the dataset is stored. - pool_name: ZpoolName, - // A name for the dataset within the Zpool. 
- kind: DatasetKind, -} - -impl DatasetName { - pub fn new(pool_name: ZpoolName, kind: DatasetKind) -> Self { - Self { pool_name, kind } - } - - pub fn pool(&self) -> &ZpoolName { - &self.pool_name - } - - pub fn dataset(&self) -> &DatasetKind { - &self.kind - } - - pub fn full(&self) -> String { - format!("{}/{}", self.pool_name, self.kind) - } -} - -impl From for sled_agent_client::types::DatasetName { - fn from(n: DatasetName) -> Self { - Self { - pool_name: sled_agent_client::types::ZpoolName::from_str( - &n.pool().to_string(), - ) - .unwrap(), - kind: n.dataset().clone().into(), - } - } -} - -#[cfg(test)] -mod test { - use super::*; - use uuid::Uuid; - - #[test] - fn serialize_dataset_name() { - let pool = ZpoolName::new_internal(Uuid::new_v4()); - let kind = DatasetKind::Crucible; - let name = DatasetName::new(pool, kind); - toml::to_string(&name).unwrap(); - } -} diff --git a/sled-agent/src/storage/dump_setup.rs b/sled-agent/src/storage/dump_setup.rs index 9b5edc0a7e..ea60998955 100644 --- a/sled-agent/src/storage/dump_setup.rs +++ b/sled-agent/src/storage/dump_setup.rs @@ -1,4 +1,3 @@ -use crate::storage_manager::DiskWrapper; use camino::Utf8PathBuf; use derive_more::{AsRef, Deref, From}; use illumos_utils::dumpadm::DumpAdmError; @@ -6,6 +5,8 @@ use illumos_utils::zone::{AdmError, Zones}; use illumos_utils::zpool::{ZpoolHealth, ZpoolName}; use omicron_common::disk::DiskIdentity; use sled_hardware::DiskVariant; +use sled_storage::dataset::{CRASH_DATASET, DUMP_DATASET}; +use sled_storage::disk::Disk; use slog::Logger; use std::collections::{HashMap, HashSet}; use std::ffi::OsString; @@ -70,11 +71,11 @@ trait GetMountpoint: std::ops::Deref { } impl GetMountpoint for DebugZpool { type NewType = DebugDataset; - const MOUNTPOINT: &'static str = sled_hardware::disk::DUMP_DATASET; + const MOUNTPOINT: &'static str = DUMP_DATASET; } impl GetMountpoint for CoreZpool { type NewType = CoreDataset; - const MOUNTPOINT: &'static str = sled_hardware::disk::CRASH_DATASET; + const MOUNTPOINT: &'static str = CRASH_DATASET; } struct DumpSetupWorker { @@ -99,50 +100,51 @@ const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); impl DumpSetup { pub(crate) async fn update_dumpdev_setup( &self, - disks: &mut MutexGuard<'_, HashMap>, + disks: &mut MutexGuard<'_, HashMap>, ) { let log = &self.log; let mut m2_dump_slices = Vec::new(); let mut u2_debug_datasets = Vec::new(); let mut m2_core_datasets = Vec::new(); - for (_id, disk_wrapper) in disks.iter() { - match disk_wrapper { - DiskWrapper::Real { disk, .. 
} => match disk.variant() { - DiskVariant::M2 => { - match disk.dump_device_devfs_path(false) { - Ok(path) => { - m2_dump_slices.push(DumpSlicePath(path)) - } - Err(err) => { - warn!(log, "Error getting dump device devfs path: {err:?}"); - } + for (_id, disk) in disks.iter() { + if disk.is_synthetic() { + // We only setup dump devices on real disks + continue; + } + match disk.variant() { + DiskVariant::M2 => { + match disk.dump_device_devfs_path(false) { + Ok(path) => m2_dump_slices.push(DumpSlicePath(path)), + Err(err) => { + warn!( + log, + "Error getting dump device devfs path: {err:?}" + ); } - let name = disk.zpool_name(); - if let Ok(info) = illumos_utils::zpool::Zpool::get_info( - &name.to_string(), - ) { - if info.health() == ZpoolHealth::Online { - m2_core_datasets.push(CoreZpool(name.clone())); - } else { - warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); - } + } + let name = disk.zpool_name(); + if let Ok(info) = + illumos_utils::zpool::Zpool::get_info(&name.to_string()) + { + if info.health() == ZpoolHealth::Online { + m2_core_datasets.push(CoreZpool(name.clone())); + } else { + warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); } } - DiskVariant::U2 => { - let name = disk.zpool_name(); - if let Ok(info) = illumos_utils::zpool::Zpool::get_info( - &name.to_string(), - ) { - if info.health() == ZpoolHealth::Online { - u2_debug_datasets - .push(DebugZpool(name.clone())); - } else { - warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); - } + } + DiskVariant::U2 => { + let name = disk.zpool_name(); + if let Ok(info) = + illumos_utils::zpool::Zpool::get_info(&name.to_string()) + { + if info.health() == ZpoolHealth::Online { + u2_debug_datasets.push(DebugZpool(name.clone())); + } else { + warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); } } - }, - DiskWrapper::Synthetic { .. } => {} + } } } diff --git a/sled-agent/src/storage/mod.rs b/sled-agent/src/storage/mod.rs index 74bd59a151..663ebe8274 100644 --- a/sled-agent/src/storage/mod.rs +++ b/sled-agent/src/storage/mod.rs @@ -4,5 +4,4 @@ //! Management of local storage -pub(crate) mod dataset; pub(crate) mod dump_setup; diff --git a/sled-storage/src/dump_setup.rs b/sled-storage/src/dump_setup.rs deleted file mode 100644 index 39c6aa2995..0000000000 --- a/sled-storage/src/dump_setup.rs +++ /dev/null @@ -1,803 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! 
Dump dataset setup - -use crate::dataset::{CRASH_DATASET, DUMP_DATASET}; -use crate::disk::Disk; -use camino::Utf8PathBuf; -use derive_more::{AsRef, Deref, From}; -use illumos_utils::dumpadm::DumpAdmError; -use illumos_utils::zone::{AdmError, Zones}; -use illumos_utils::zpool::{ZpoolHealth, ZpoolName}; -use omicron_common::disk::DiskIdentity; -use sled_hardware::DiskVariant; -use slog::{debug, error, info, o, warn, Logger}; -use std::collections::{HashMap, HashSet}; -use std::ffi::OsString; -use std::path::{Path, PathBuf}; -use std::sync::{Arc, Weak}; -use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH}; -use tokio::sync::MutexGuard; - -pub struct DumpSetup { - worker: Arc>, - _poller: std::thread::JoinHandle<()>, - log: Logger, -} - -impl DumpSetup { - pub fn new(log: &Logger) -> Self { - let worker = Arc::new(std::sync::Mutex::new(DumpSetupWorker::new( - log.new(o!("component" => "DumpSetup-worker")), - ))); - let worker_weak = Arc::downgrade(&worker); - let log_poll = log.new(o!("component" => "DumpSetup-archival")); - let _poller = std::thread::spawn(move || { - Self::poll_file_archival(worker_weak, log_poll) - }); - let log = log.new(o!("component" => "DumpSetup")); - Self { worker, _poller, log } - } -} - -// we sure are passing a lot of Utf8PathBufs around, let's be careful about it -#[derive( - AsRef, Clone, Debug, Deref, Eq, From, Hash, Ord, PartialEq, PartialOrd, -)] -struct DumpSlicePath(Utf8PathBuf); -#[derive( - AsRef, Clone, Debug, Deref, Eq, From, Hash, Ord, PartialEq, PartialOrd, -)] -struct DebugDataset(Utf8PathBuf); -#[derive( - AsRef, Clone, Debug, Deref, Eq, From, Hash, Ord, PartialEq, PartialOrd, -)] -struct CoreDataset(Utf8PathBuf); - -#[derive(Deref)] -struct CoreZpool(ZpoolName); -#[derive(Deref)] -struct DebugZpool(ZpoolName); - -// only want to access these directories after they're mounted! -trait GetMountpoint: std::ops::Deref { - type NewType: From; - const MOUNTPOINT: &'static str; - fn mountpoint(&self) -> Result, ZfsGetError> { - if zfs_get_prop(self.to_string(), "mounted")? 
== "yes" { - Ok(Some(Self::NewType::from( - self.dataset_mountpoint(Self::MOUNTPOINT), - ))) - } else { - Ok(None) - } - } -} -impl GetMountpoint for DebugZpool { - type NewType = DebugDataset; - const MOUNTPOINT: &'static str = DUMP_DATASET; -} -impl GetMountpoint for CoreZpool { - type NewType = CoreDataset; - const MOUNTPOINT: &'static str = CRASH_DATASET; -} - -struct DumpSetupWorker { - core_dataset_names: Vec, - debug_dataset_names: Vec, - - chosen_dump_slice: Option, - chosen_debug_dir: Option, - chosen_core_dir: Option, - - known_dump_slices: Vec, - known_debug_dirs: Vec, - known_core_dirs: Vec, - - savecored_slices: HashSet, - - log: Logger, -} - -const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); - -impl DumpSetup { - pub(crate) async fn update_dumpdev_setup( - &self, - disks: &mut MutexGuard<'_, HashMap>, - ) { - let log = &self.log; - let mut m2_dump_slices = Vec::new(); - let mut u2_debug_datasets = Vec::new(); - let mut m2_core_datasets = Vec::new(); - for (_id, disk) in disks.iter() { - if disk.is_synthetic() { - // We only setup dump devices on real disks - continue; - } - match disk.variant() { - DiskVariant::M2 => { - match disk.dump_device_devfs_path(false) { - Ok(path) => m2_dump_slices.push(DumpSlicePath(path)), - Err(err) => { - warn!( - log, - "Error getting dump device devfs path: {err:?}" - ); - } - } - let name = disk.zpool_name(); - if let Ok(info) = - illumos_utils::zpool::Zpool::get_info(&name.to_string()) - { - if info.health() == ZpoolHealth::Online { - m2_core_datasets.push(CoreZpool(name.clone())); - } else { - warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); - } - } - } - DiskVariant::U2 => { - let name = disk.zpool_name(); - if let Ok(info) = - illumos_utils::zpool::Zpool::get_info(&name.to_string()) - { - if info.health() == ZpoolHealth::Online { - u2_debug_datasets.push(DebugZpool(name.clone())); - } else { - warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); - } - } - } - } - } - - let savecore_lock = self.worker.clone(); - let log_tmp = log.new(o!("component" => "DumpSetup-mutex")); - tokio::task::spawn_blocking(move || match savecore_lock.lock() { - Ok(mut guard) => { - guard.update_disk_loadout( - m2_dump_slices, - u2_debug_datasets, - m2_core_datasets, - ); - } - Err(err) => { - error!(log_tmp, "DumpSetup mutex poisoned: {err:?}"); - } - }); - } - - fn poll_file_archival( - worker: Weak>, - log: Logger, - ) { - info!(log, "DumpSetup poll loop started."); - loop { - if let Some(mutex) = worker.upgrade() { - match mutex.lock() { - Ok(mut guard) => { - guard.reevaluate_choices(); - if let Err(err) = guard.archive_files() { - error!( - log, - "Failed to archive debug/dump files: {err:?}" - ); - } - } - Err(err) => { - error!( - log, - "DumpSetup mutex poisoned in poll thread: {err:?}" - ); - break; - } - } - } else { - info!( - log, - "DumpSetup weak pointer dropped, leaving poll loop." 
- ); - break; - } - std::thread::sleep(ARCHIVAL_INTERVAL); - } - } -} - -#[derive(Debug, thiserror::Error)] -enum ZfsGetError { - #[error("Error executing 'zfs get' command: {0}")] - IoError(#[from] std::io::Error), - #[error("Output of 'zfs get' was not only not an integer string, it wasn't even UTF-8: {0}")] - Utf8(#[from] std::string::FromUtf8Error), - #[error("Error parsing output of 'zfs get' command as integer: {0}")] - Parse(#[from] std::num::ParseIntError), -} - -const ZFS_PROP_USED: &str = "used"; -const ZFS_PROP_AVAILABLE: &str = "available"; - -fn zfs_get_integer( - mountpoint_or_name: impl AsRef, - property: &str, -) -> Result { - zfs_get_prop(mountpoint_or_name, property)?.parse().map_err(Into::into) -} - -fn zfs_get_prop( - mountpoint_or_name: impl AsRef + Sized, - property: &str, -) -> Result { - let mountpoint = mountpoint_or_name.as_ref(); - let mut cmd = std::process::Command::new(illumos_utils::zfs::ZFS); - cmd.arg("get").arg("-Hpo").arg("value"); - cmd.arg(property); - cmd.arg(mountpoint); - let output = cmd.output()?; - Ok(String::from_utf8(output.stdout)?.trim().to_string()) -} - -const DATASET_USAGE_PERCENT_CHOICE: u64 = 70; -const DATASET_USAGE_PERCENT_CLEANUP: u64 = 80; - -fn below_thresh( - mountpoint: &Utf8PathBuf, - percent: u64, -) -> Result<(bool, u64), ZfsGetError> { - let used = zfs_get_integer(mountpoint, ZFS_PROP_USED)?; - let available = zfs_get_integer(mountpoint, ZFS_PROP_AVAILABLE)?; - let capacity = used + available; - let below = (used * 100) / capacity < percent; - Ok((below, used)) -} - -impl DumpSetupWorker { - fn new(log: Logger) -> Self { - Self { - core_dataset_names: vec![], - debug_dataset_names: vec![], - chosen_dump_slice: None, - chosen_debug_dir: None, - chosen_core_dir: None, - known_dump_slices: vec![], - known_debug_dirs: vec![], - known_core_dirs: vec![], - savecored_slices: Default::default(), - log, - } - } - - fn update_disk_loadout( - &mut self, - dump_slices: Vec, - debug_datasets: Vec, - core_datasets: Vec, - ) { - self.core_dataset_names = core_datasets; - self.debug_dataset_names = debug_datasets; - - self.known_dump_slices = dump_slices; - - self.reevaluate_choices(); - } - - // only allow mounted zfs datasets into 'known_*_dirs', - // such that we don't render them non-auto-mountable by zfs - fn update_mounted_dirs(&mut self) { - self.known_debug_dirs = self - .debug_dataset_names - .iter() - .flat_map(|ds| ds.mountpoint()) - .flatten() - .collect(); - self.known_core_dirs = self - .core_dataset_names - .iter() - .flat_map(|ds| ds.mountpoint()) - .flatten() - .collect(); - } - - fn reevaluate_choices(&mut self) { - self.update_mounted_dirs(); - - self.known_dump_slices.sort(); - // sort key: prefer to choose a dataset where there's already other - // dumps so we don't shotgun them across every U.2, but only if they're - // below a certain usage threshold. - self.known_debug_dirs.sort_by_cached_key( - |mountpoint: &DebugDataset| { - match below_thresh(mountpoint.as_ref(), DATASET_USAGE_PERCENT_CHOICE) { - Ok((below, used)) => { - let priority = if below { 0 } else { 1 }; - (priority, used, mountpoint.clone()) - } - Err(err) => { - error!(self.log, "Could not query zfs properties of debug dump dir: {err:?}"); - // deprioritize anything we get errors querying. 
- (usize::MAX, u64::MAX, mountpoint.clone()) - } - } - }, - ); - self.known_core_dirs.sort_by_cached_key(|mnt| { - // these get archived periodically anyway, pick one with room - let available = zfs_get_integer(&**mnt, "available").unwrap_or(0); - (u64::MAX - available, mnt.clone()) - }); - - if let Some(x) = &self.chosen_debug_dir { - if !self.known_debug_dirs.contains(x) { - warn!(self.log, "Previously-chosen debug/dump dir {x:?} no longer exists in our view of reality"); - self.chosen_debug_dir = None; - } else { - match below_thresh(x.as_ref(), DATASET_USAGE_PERCENT_CLEANUP) { - Ok((true, _)) => {} - Ok((false, _)) => { - if self.known_debug_dirs.iter().any(|x| { - below_thresh( - x.as_ref(), - DATASET_USAGE_PERCENT_CHOICE, - ) - .unwrap_or((false, 0)) - .0 - }) { - info!(self.log, "Previously-chosen debug/dump dir {x:?} is over usage threshold, choosing a more vacant disk"); - self.chosen_debug_dir = None; - } else { - warn!(self.log, "All candidate debug/dump dirs are over usage threshold, removing older archived files"); - if let Err(err) = self.cleanup() { - error!(self.log, "Couldn't clean up any debug/dump dirs, may hit dataset quota in {x:?}: {err:?}"); - } else { - self.chosen_debug_dir = None; - } - } - } - Err(err) => { - error!(self.log, "Previously-chosen debug/dump dir {x:?} couldn't be queried for zfs properties! Choosing another. {err:?}"); - self.chosen_debug_dir = None; - } - } - } - } - if let Some(x) = &self.chosen_dump_slice { - if !self.known_dump_slices.contains(x) { - warn!(self.log, "Previously-chosen dump slice {x:?} no longer exists in our view of reality"); - self.chosen_dump_slice = None; - } - } - if let Some(x) = &self.chosen_core_dir { - if !self.known_core_dirs.contains(x) { - warn!(self.log, "Previously-chosen core dir {x:?} no longer exists in our view of reality"); - self.chosen_core_dir = None; - } - } - - if self.chosen_debug_dir.is_none() { - self.chosen_debug_dir = self.known_debug_dirs.first().cloned(); - } - - if self.chosen_core_dir.is_none() { - for core_dir in &self.known_core_dirs { - // tell the system to write *userspace process* cores here. - match illumos_utils::coreadm::coreadm(core_dir) { - Ok(()) => { - self.chosen_core_dir = Some(core_dir.clone()); - info!( - self.log, - "Set process core dump directory to {core_dir:?}" - ); - break; - } - Err(err) => { - error!(self.log, "Couldn't configure process core dump directory to {core_dir:?}: {err:?}"); - } - } - } - } - - if self.chosen_dump_slice.is_none() { - if self.chosen_debug_dir.is_some() { - for dump_slice in self.known_dump_slices.clone() { - // Let's try to see if it appears to have a kernel dump already - match illumos_utils::dumpadm::dump_flag_is_valid( - &dump_slice, - ) { - Ok(true) => { - debug!(self.log, "Dump slice {dump_slice:?} appears to have a valid header; will attempt to savecore"); - } - Ok(false) => { - info!(self.log, "Dump slice {dump_slice:?} appears to have already been saved"); - } - Err(err) => { - debug!(self.log, "Dump slice {dump_slice:?} appears to be unused: {err:?}"); - } - } - if let Ok(saved) = self.dumpadm_and_savecore(&dump_slice) { - if let Some(out) = saved { - info!(self.log, "Previous dump on slice {dump_slice:?} saved, configured slice as target for new dumps. {out:?}"); - } - self.chosen_dump_slice = Some(dump_slice); - break; - } - } - } else { - // Don't risk overwriting an existing kernel dump if there's - // already one there until we can attempt to savecore(8) - // it away and clear the flag to make room. 
- for dump_slice in &self.known_dump_slices { - match illumos_utils::dumpadm::dump_flag_is_valid(dump_slice) - { - Ok(false) => { - // Have dumpadm write the config for crash dumps to be - // on this slice, at least, until a U.2 comes along. - match illumos_utils::dumpadm::dumpadm( - dump_slice, None, - ) { - Ok(_) => { - info!(self.log, "Using dump device {dump_slice:?} with no savecore destination (no U.2 debug zvol yet)"); - self.chosen_dump_slice = - Some(dump_slice.clone()); - break; - } - Err(err) => { - warn!(self.log, "Could not configure {dump_slice:?} as dump device: {err:?}"); - } - } - } - Ok(true) => { - warn!(self.log, "Not configuring {dump_slice:?} as it appears to contain a dump we cannot yet send to a U.2 debug zvol"); - } - Err(err) => { - debug!( - self.log, - "Dump slice {dump_slice:?} appears to be unused : {err:?}", - ); - } - } - } - } - } - - if let Some(debug_dir) = self.chosen_debug_dir.clone() { - let mut changed_slice = false; - for dump_slice in self.known_dump_slices.clone() { - if !self.savecored_slices.contains(&dump_slice) { - changed_slice = true; - // temporarily changes the system's dump slice so savecore(8) - // can update the header in the slice when it finishes... - match self.dumpadm_and_savecore(&dump_slice) { - Ok(saved) => { - if let Some(stdout) = &saved { - info!( - self.log, - "Saved dump from {dump_slice:?} to {debug_dir:?}: {stdout:?}" - ); - } else { - info!( - self.log, - "Set {dump_slice:?} as system dump slice", - ); - } - } - Err(err) => { - warn!(self.log, "Could not configure {dump_slice:?} as dump device with {debug_dir:?} as savecore destination: {err:?}"); - } - } - } - } - - // ...so then we restore the chosen dump slice for the system to use - // in the event of a kernel crash - if changed_slice { - if let Some(dump_slice) = &self.chosen_dump_slice { - if let Err(err) = - illumos_utils::dumpadm::dumpadm(dump_slice, None) - { - error!(self.log, "Could not restore dump slice to {dump_slice:?}: {err:?}"); - } - } - } - } - } - - fn archive_files(&self) -> std::io::Result<()> { - if let Some(debug_dir) = &self.chosen_debug_dir { - if self.known_core_dirs.is_empty() { - info!(self.log, "No core dump locations yet known."); - } - for core_dir in &self.known_core_dirs { - if let Ok(dir) = core_dir.read_dir() { - for entry in dir.flatten() { - if let Some(path) = entry.file_name().to_str() { - let dest = debug_dir.join(path); - - if let Err(err) = - Self::copy_sync_and_remove(&entry.path(), &dest) - { - error!( - self.log, - "Failed to archive {entry:?}: {err:?}" - ); - } else { - info!( - self.log, - "Relocated {entry:?} to {dest:?}" - ); - } - } else { - error!(self.log, "Non-UTF8 path found while archiving core dumps: {entry:?}"); - } - } - } - } - } else { - info!( - self.log, - "No archival destination for crash dumps yet chosen." 
- ); - } - - if let Err(err) = self.archive_logs() { - if !matches!(err, ArchiveLogsError::NoDebugDirYet) { - error!( - self.log, - "Failure while trying to archive logs to debug dataset: {err:?}" - ); - } - } - - Ok(()) - } - - fn copy_sync_and_remove( - source: impl AsRef, - dest: impl AsRef, - ) -> std::io::Result<()> { - let source = source.as_ref(); - let dest = dest.as_ref(); - let mut dest_f = std::fs::File::create(&dest)?; - let mut src_f = std::fs::File::open(&source)?; - - std::io::copy(&mut src_f, &mut dest_f)?; - - dest_f.sync_all()?; - - drop(src_f); - drop(dest_f); - - std::fs::remove_file(source)?; - Ok(()) - } - - fn archive_logs(&self) -> Result<(), ArchiveLogsError> { - let debug_dir = self - .chosen_debug_dir - .as_ref() - .ok_or(ArchiveLogsError::NoDebugDirYet)?; - // zone crate's 'deprecated' functions collide if you try to enable - // its 'sync' and 'async' features simultaneously :( - let rt = - tokio::runtime::Runtime::new().map_err(ArchiveLogsError::Tokio)?; - let oxz_zones = rt.block_on(Zones::get())?; - self.archive_logs_inner( - debug_dir, - PathBuf::from("/var/svc/log"), - "global", - )?; - for zone in oxz_zones { - let logdir = zone.path().join("root/var/svc/log"); - let zone_name = zone.name(); - self.archive_logs_inner(debug_dir, logdir, zone_name)?; - } - Ok(()) - } - - fn archive_logs_inner( - &self, - debug_dir: &DebugDataset, - logdir: PathBuf, - zone_name: &str, - ) -> Result<(), ArchiveLogsError> { - let mut rotated_log_files = Vec::new(); - // patterns matching archived logs, e.g. foo.log.3 - // keep checking for greater numbers of digits until we don't find any - for n in 1..9 { - let pattern = logdir - .join(format!("*.log.{}", "[0-9]".repeat(n))) - .to_str() - .ok_or_else(|| ArchiveLogsError::Utf8(zone_name.to_string()))? - .to_string(); - rotated_log_files.extend(glob::glob(&pattern)?.flatten()); - } - let dest_dir = debug_dir.join(zone_name).into_std_path_buf(); - if !rotated_log_files.is_empty() { - std::fs::create_dir_all(&dest_dir)?; - let count = rotated_log_files.len(); - info!( - self.log, - "Archiving {count} log files from {zone_name} zone" - ); - } - for entry in rotated_log_files { - let src_name = entry.file_name().unwrap(); - // as we archive them, logadm will keep resetting to .log.0, - // so we need to maintain our own numbering in the dest dataset. - // we'll use the modified date of the rotated log file, or try - // falling back to the time of archival if that fails, and - // falling back to counting up from 0 if *that* somehow fails. - let mut n = entry - .metadata() - .and_then(|m| m.modified()) - .unwrap_or_else(|_| SystemTime::now()) - .duration_since(UNIX_EPOCH) - .map(|d| d.as_secs()) - .unwrap_or(0); - let mut dest; - loop { - dest = dest_dir.join(src_name).with_extension(format!("{n}")); - if dest.exists() { - n += 1; - } else { - break; - } - } - if let Err(err) = Self::copy_sync_and_remove(&entry, dest) { - warn!(self.log, "Failed to archive {entry:?}: {err:?}"); - } - } - Ok(()) - } - - // Have dumpadm write the config for crash dumps to be - // on this slice, and then invoke savecore(8) to save any - // dump that's already present there. - // - // NOTE: because of the need to have dumpadm change the global - // state of which slice the system is using for dumps in order - // for savecore to behave the way we want (i.e. clear the flag - // after succeeding), we could hypothetically miss a dump if - // the kernel crashes again while savecore is still running. 
- fn dumpadm_and_savecore( - &mut self, - dump_slice: &DumpSlicePath, - ) -> Result, DumpAdmError> { - // TODO: untangle savecore from illumos_utils::dumpadm - assert!(self.chosen_debug_dir.is_some()); - - let savecore_dir = self.chosen_debug_dir.clone().unwrap().0; - - match illumos_utils::dumpadm::dumpadm(&dump_slice, Some(&savecore_dir)) - { - Ok(saved) => { - self.savecored_slices.insert(dump_slice.clone()); - Ok(saved) - } - Err(err) => Err(err), - } - } - - fn cleanup(&self) -> Result<(), CleanupError> { - let mut dir_info = Vec::new(); - for dir in &self.known_debug_dirs { - match Self::scope_dir_for_cleanup(dir) { - Ok(info) => { - dir_info.push((info, dir)); - } - Err(err) => { - error!(self.log, "Could not analyze {dir:?} for debug dataset cleanup task: {err:?}"); - } - } - } - if dir_info.is_empty() { - return Err(CleanupError::NoDatasetsToClean); - } - // find dir with oldest average time of files that must be deleted - // to achieve desired threshold, and reclaim that space. - dir_info.sort(); - 'outer: for (dir_info, dir) in dir_info { - let CleanupDirInfo { average_time: _, num_to_delete, file_list } = - dir_info; - for (_time, _bytes, path) in &file_list[..num_to_delete as usize] { - // if we are unable to remove a file, we cannot guarantee - // that we will reach our target size threshold, and suspect - // the i/o error *may* be an issue with the underlying disk, so - // we continue to the dataset with the next-oldest average age - // of files-to-delete in the sorted list. - if let Err(err) = std::fs::remove_file(&path) { - error!(self.log, "Couldn't delete {path:?} from debug dataset, skipping {dir:?}. {err:?}"); - continue 'outer; - } - } - // we made it through all the files we planned to remove, thereby - // freeing up enough space on one of the debug datasets for it to - // be chosen when reevaluating targets. - break; - } - Ok(()) - } - - fn scope_dir_for_cleanup( - debug_dir: &DebugDataset, - ) -> Result { - let used = zfs_get_integer(&**debug_dir, ZFS_PROP_USED)?; - let available = zfs_get_integer(&**debug_dir, ZFS_PROP_AVAILABLE)?; - let capacity = used + available; - - let target_used = capacity * DATASET_USAGE_PERCENT_CHOICE / 100; - - let mut file_list = Vec::new(); - // find all files in the debug dataset and sort by modified time - for path in glob::glob(debug_dir.join("**/*").as_str())?.flatten() { - let meta = std::fs::metadata(&path)?; - // we need this to be a Duration rather than SystemTime so we can - // do math to it later. - let time = meta.modified()?.duration_since(UNIX_EPOCH)?; - let size = meta.len(); - - file_list.push((time, size, path)) - } - file_list.sort(); - - // find how many old files must be deleted to get the dataset under - // the limit, and what the average age of that set is. 
- let mut possible_bytes = 0; - let mut total_time = Duration::ZERO; - let mut num_to_delete = 0; - for (time, size, _path) in &file_list { - if used - possible_bytes < target_used { - break; - } else { - total_time += *time; - num_to_delete += 1; - possible_bytes += size; - } - } - let average_time = - total_time.checked_div(num_to_delete).unwrap_or(Duration::MAX); - - Ok(CleanupDirInfo { average_time, num_to_delete, file_list }) - } -} - -#[derive(thiserror::Error, Debug)] -enum ArchiveLogsError { - #[error("Couldn't make an async runtime to get zone info: {0}")] - Tokio(std::io::Error), - #[error("I/O error: {0}")] - IoError(#[from] std::io::Error), - #[error("Error calling zoneadm: {0}")] - Zoneadm(#[from] AdmError), - #[error("Non-UTF8 zone path for zone {0}")] - Utf8(String), - #[error("Glob pattern invalid: {0}")] - Glob(#[from] glob::PatternError), - #[error( - "No debug dir into which we should archive logs has yet been chosen" - )] - NoDebugDirYet, -} - -#[derive(thiserror::Error, Debug)] -enum CleanupError { - #[error("No debug datasets were successfully evaluated for cleanup")] - NoDatasetsToClean, - #[error("Failed to query ZFS properties: {0}")] - ZfsError(#[from] ZfsGetError), - #[error("I/O error: {0}")] - IoError(#[from] std::io::Error), - #[error("Glob pattern invalid: {0}")] - Glob(#[from] glob::PatternError), - #[error("A file's observed modified time was before the Unix epoch: {0}")] - TimelineWentSideways(#[from] SystemTimeError), -} - -#[derive(Ord, PartialOrd, Eq, PartialEq)] -struct CleanupDirInfo { - average_time: Duration, - num_to_delete: u32, - file_list: Vec<(Duration, u64, PathBuf)>, -} diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs index 0c1b383d7f..fc08579d77 100644 --- a/sled-storage/src/lib.rs +++ b/sled-storage/src/lib.rs @@ -10,7 +10,6 @@ pub mod dataset; pub mod disk; -pub(crate) mod dump_setup; pub mod error; pub(crate) mod keyfile; pub mod manager; From 9593047345ba5c84d5db302b5f44c3d2f8908815 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 6 Oct 2023 22:50:53 +0000 Subject: [PATCH 27/66] wip --- sled-agent/src/bootstrap/server.rs | 2 +- sled-agent/src/services.rs | 29 ++++++++++++++++++----------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 94c326eef5..c9e1002306 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -514,8 +514,8 @@ struct Inner { )>, sled_reset_rx: mpsc::Receiver>>, ddm_admin_localhost_client: DdmAdminClient, - service_manager: ServiceManager, long_running_task_handles: LongRunningTaskHandles, + service_manager: ServiceManager, _sprockets_server_handle: JoinHandle<()>, base_log: Logger, } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 3fcbf717fa..c22eae6baa 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -92,9 +92,8 @@ use sled_hardware::underlay; use sled_hardware::underlay::BOOTSTRAP_PREFIX; use sled_hardware::Baseboard; use sled_hardware::SledMode; -use sled_storage::dataset::{CONFIG_DATASET, ZONE_DATASET}; +use sled_storage::dataset::{CONFIG_DATASET, INSTALL_DATASET, ZONE_DATASET}; use sled_storage::manager::StorageHandle; -use sled_storage::resources::StorageResources; use slog::Logger; use std::collections::HashSet; use std::collections::{BTreeMap, HashMap}; @@ -1084,11 +1083,11 @@ impl ServiceManager { // If the boot disk exists, look for the image in the "install" dataset // there too. 
- if let Some((_, boot_zpool)) = self.inner.storage.boot_disk().await { - zone_image_paths.push( - boot_zpool - .dataset_mountpoint(sled_hardware::disk::INSTALL_DATASET), - ); + if let Some((_, boot_zpool)) = + self.inner.storage.get_latest_resources().await.boot_disk() + { + zone_image_paths + .push(boot_zpool.dataset_mountpoint(INSTALL_DATASET)); } let installed_zone = InstalledZone::install( @@ -2195,8 +2194,12 @@ impl ServiceManager { // Create zones that should be running let mut zone_requests = AllZoneRequests::default(); - let all_u2_roots = - self.inner.storage.all_u2_mountpoints(ZONE_DATASET).await; + let all_u2_roots = self + .inner + .storage + .get_latest_resources() + .await + .all_u2_mountpoints(ZONE_DATASET); for zone in zones_to_be_added { // Check if we think the zone should already be running let name = zone.zone_name(); @@ -2870,8 +2873,12 @@ impl ServiceManager { let root = if request.zone_type == ZoneType::Switch { Utf8PathBuf::from(ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT) } else { - let all_u2_roots = - self.inner.storage.all_u2_mountpoints(ZONE_DATASET).await; + let all_u2_roots = self + .inner + .storage + .get_latest_resources() + .await + .all_u2_mountpoints(ZONE_DATASET); let mut rng = rand::rngs::StdRng::from_entropy(); all_u2_roots .choose(&mut rng) From e64b5690121f3722c2fd00dbff91266006775c55 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 6 Oct 2023 23:33:13 +0000 Subject: [PATCH 28/66] wip --- sled-agent/src/bootstrap/server.rs | 2 +- sled-agent/src/hardware_monitor.rs | 11 +- sled-agent/src/hardware_monitor.sh | 1 - sled-agent/src/http_entrypoints.rs | 2 +- sled-agent/src/lib.rs | 1 - sled-agent/src/long_running_tasks.rs | 5 +- sled-agent/src/rack_setup/plan/service.rs | 7 +- sled-agent/src/rack_setup/plan/sled.rs | 6 +- sled-agent/src/sled_agent.rs | 32 +- sled-agent/src/storage_manager.rs | 928 ---------------------- sled-hardware/src/illumos/mod.rs | 1 - sled-storage/src/resources.rs | 17 +- 12 files changed, 50 insertions(+), 963 deletions(-) delete mode 100644 sled-agent/src/hardware_monitor.sh delete mode 100644 sled-agent/src/storage_manager.rs diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index c9e1002306..17d988e749 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -241,7 +241,7 @@ impl Server { &config, &sled_request.request, long_running_task_handles.clone(), - service_manager, + service_manager.clone(), &ddm_admin_localhost_client, &base_log, &startup_log, diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs index e296a3bdca..03677e4e6d 100644 --- a/sled-agent/src/hardware_monitor.rs +++ b/sled-agent/src/hardware_monitor.rs @@ -55,13 +55,12 @@ impl TofinoManager { // // Returns whether the tofino was loaded or not pub fn become_ready(&mut self, service_manager: ServiceManager) -> bool { - match self { + let tofino_loaded = match self { Self::Ready(_) => panic!("ServiceManager is already available"), - Self::NotReady { tofino_loaded } => { - *self = Self::Ready(service_manager); - *tofino_loaded - } - } + Self::NotReady { tofino_loaded } => *tofino_loaded, + }; + *self = Self::Ready(service_manager); + tofino_loaded } } diff --git a/sled-agent/src/hardware_monitor.sh b/sled-agent/src/hardware_monitor.sh deleted file mode 100644 index 8b13789179..0000000000 --- a/sled-agent/src/hardware_monitor.sh +++ /dev/null @@ -1 +0,0 @@ - diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 
440ccb73ee..f5b35ee77a 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -352,7 +352,7 @@ async fn zpools_get( rqctx: RequestContext, ) -> Result>, HttpError> { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.zpools_get().await.map_err(|e| Error::from(e))?)) + Ok(HttpResponseOk(sa.zpools_get().await)) } #[endpoint { diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 154a133272..9a3b6d4f1b 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -33,7 +33,6 @@ mod services; mod sled_agent; mod smf_helper; pub(crate) mod storage; -mod storage_manager; mod swap_device; mod updates; mod zone_bundle; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 223289bb2e..f322126714 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -99,8 +99,7 @@ fn spawn_key_manager(log: &Logger) -> StorageKeyRequester { let secret_retriever = LrtqOrHardcodedSecretRetriever::new(); let (mut key_manager, storage_key_requester) = KeyManager::new(log, secret_retriever); - let key_manager_handle = - tokio::spawn(async move { key_manager.run().await }); + tokio::spawn(async move { key_manager.run().await }); storage_key_requester } @@ -137,7 +136,7 @@ fn spawn_hardware_monitor( hardware_manager: &HardwareManager, storage_handle: &StorageHandle, ) -> HardwareMonitorHandle { - let (monitor, handle) = + let (mut monitor, handle) = HardwareMonitor::new(log, hardware_manager, storage_handle); tokio::spawn(async move { monitor.run().await; diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 01fababa4d..5a9b3939e4 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -6,11 +6,10 @@ use crate::bootstrap::params::StartSledAgentRequest; use crate::params::{ - DatasetKind, DatasetRequest, ServiceType, ServiceZoneRequest, - ServiceZoneService, ZoneType, + DatasetRequest, ServiceType, ServiceZoneRequest, ServiceZoneService, + ZoneType, }; use crate::rack_setup::config::SetupServiceConfig as Config; -use crate::storage::dataset::DatasetName; use camino::Utf8PathBuf; use dns_service_client::types::DnsConfigParams; use illumos_utils::zpool::ZpoolName; @@ -34,7 +33,7 @@ use serde::{Deserialize, Serialize}; use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, }; -use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::dataset::{DatasetKind, DatasetName, CONFIG_DATASET}; use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::{BTreeSet, HashMap, HashSet}; diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index 189216fd9b..2c8814403b 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -8,7 +8,6 @@ use crate::bootstrap::{ config::BOOTSTRAP_AGENT_RACK_INIT_PORT, params::StartSledAgentRequest, }; use crate::rack_setup::config::SetupServiceConfig as Config; -use crate::storage_manager::StorageResources; use camino::Utf8PathBuf; use omicron_common::ledger::{self, Ledger, Ledgerable}; use schemars::JsonSchema; @@ -56,11 +55,12 @@ pub struct Plan { impl Plan { pub async fn load( log: &Logger, - storage: &StorageResources, + storage: &StorageHandle, ) -> Result, PlanError> { let paths: Vec = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) 
.into_iter() .map(|p| p.join(RSS_SLED_PLAN_FILENAME)) .collect(); diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 475cbf8018..571ea45499 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -22,6 +22,7 @@ use crate::services::{self, ServiceManager}; use crate::updates::{ConfigUpdates, UpdateManager}; use crate::zone_bundle; use crate::zone_bundle::BundleError; +use bootstore::schemes::v0::NodeRequestError as BootstoreNodeRequestError; use camino::Utf8PathBuf; use dropshot::HttpError; use illumos_utils::opte::params::{ @@ -45,7 +46,6 @@ use omicron_common::backoff::{ }; use sled_hardware::underlay; use sled_hardware::HardwareManager; -use sled_storage::dataset::DatasetName; use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; @@ -112,7 +112,7 @@ pub enum Error { EarlyNetworkError(#[from] EarlyNetworkSetupError), #[error("Bootstore Error: {0}")] - Bootstore(#[from] bootstore::NodeRequestError), + Bootstore(#[from] BootstoreNodeRequestError), #[error("Failed to deserialize early network config: {0}")] EarlyNetworkDeserialize(serde_json::Error), @@ -340,10 +340,15 @@ impl SledAgent { match config.vmm_reservoir_percentage { Some(sz) if sz > 0 && sz < 100 => { - instances.set_reservoir_size(&hardware, sz).map_err(|e| { - error!(log, "Failed to set VMM reservoir size: {e}"); - e - })?; + instances + .set_reservoir_size( + &long_running_task_handles.hardware_manager, + sz, + ) + .map_err(|e| { + error!(log, "Failed to set VMM reservoir size: {e}"); + e + })?; } Some(sz) if sz == 0 => { warn!(log, "Not using VMM reservoir (size 0 bytes requested)"); @@ -729,9 +734,18 @@ impl SledAgent { } /// Gets the sled's current list of all zpools. - pub async fn zpools_get(&self) -> Result, Error> { - let zpools = self.inner.storage.get_zpools().await?; - Ok(zpools) + pub async fn zpools_get(&self) -> Vec { + self.inner + .storage + .get_latest_resources() + .await + .get_all_zpools() + .into_iter() + .map(|(name, variant)| Zpool { + id: name.id(), + disk_type: variant.into(), + }) + .collect() } /// Returns whether or not the sled believes itself to be a scrimlet diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs deleted file mode 100644 index bbf89e41fb..0000000000 --- a/sled-agent/src/storage_manager.rs +++ /dev/null @@ -1,928 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Management of sled-local storage. 
- -use crate::nexus::NexusClientWithResolver; -use crate::storage::dataset::DatasetName; -use crate::storage::dump_setup::DumpSetup; -use crate::zone_bundle::ZoneBundler; -use camino::Utf8PathBuf; -use derive_more::From; -use futures::stream::FuturesOrdered; -use futures::FutureExt; -use futures::StreamExt; -use illumos_utils::zpool::{ZpoolKind, ZpoolName}; -use illumos_utils::{zfs::Mountpoint, zpool::ZpoolInfo}; -use key_manager::StorageKeyRequester; -use nexus_client::types::PhysicalDiskDeleteRequest; -use nexus_client::types::PhysicalDiskKind; -use nexus_client::types::PhysicalDiskPutRequest; -use nexus_client::types::ZpoolPutRequest; -use omicron_common::api::external::{ByteCount, ByteCountRangeError}; -use omicron_common::backoff; -use omicron_common::disk::DiskIdentity; -use sled_hardware::{Disk, DiskVariant, UnparsedDisk}; -use slog::Logger; -use std::collections::hash_map; -use std::collections::HashMap; -use std::collections::HashSet; -use std::convert::TryFrom; -use std::pin::Pin; -use std::sync::Arc; -use std::sync::OnceLock; -use std::time::Duration; -use tokio::sync::{mpsc, oneshot, Mutex}; -use tokio::task::JoinHandle; -use tokio::time::{interval, MissedTickBehavior}; -use uuid::Uuid; - -#[cfg(test)] -use illumos_utils::{zfs::MockZfs as Zfs, zpool::MockZpool as Zpool}; -#[cfg(not(test))] -use illumos_utils::{zfs::Zfs, zpool::Zpool}; - -// A key manager can only become ready once. This occurs during RSS or cold -// boot when the bootstore has detected it has a key share. -static KEY_MANAGER_READY: OnceLock<()> = OnceLock::new(); - -// The type of a future which is used to send a notification to Nexus. -type NotifyFut = - Pin> + Send>>; - -#[derive(Debug)] -struct NewFilesystemRequest { - dataset_id: Uuid, - dataset_name: DatasetName, - responder: oneshot::Sender>, -} - -struct UnderlayRequest { - underlay: UnderlayAccess, - responder: oneshot::Sender>, -} - -// The directory within the debug dataset in which bundles are created. -const BUNDLE_DIRECTORY: &str = "bundle"; - -// The directory for zone bundles. -const ZONE_BUNDLE_DIRECTORY: &str = "zone"; - -/// Describes the access to the underlay used by the StorageManager. -pub struct UnderlayAccess { - pub nexus_client: NexusClientWithResolver, - pub sled_id: Uuid, -} - -// A worker that starts zones for pools as they are received. -struct StorageWorker { - log: Logger, - nexus_notifications: FuturesOrdered, - rx: mpsc::Receiver, - underlay: Arc>>, - - // A mechanism for requesting disk encryption keys from the - // [`key_manager::KeyManager`] - key_requester: StorageKeyRequester, - - // Invokes dumpadm(8) and savecore(8) when new disks are encountered - dump_setup: Arc, -} - -#[derive(Clone, Debug)] -enum NotifyDiskRequest { - Add { identity: DiskIdentity, variant: DiskVariant }, - Remove(DiskIdentity), -} - -#[derive(From, Clone, Debug, PartialEq, Eq, Hash)] -enum QueuedDiskCreate { - Real(UnparsedDisk), - Synthetic(ZpoolName), -} - -impl QueuedDiskCreate { - fn is_synthetic(&self) -> bool { - if let QueuedDiskCreate::Synthetic(_) = self { - true - } else { - false - } - } -} - -impl StorageWorker { - // Adds a "notification to nexus" to `nexus_notifications`, - // informing it about the addition of `pool_id` to this sled. - async fn add_zpool_notify(&mut self, pool: &Pool, size: ByteCount) { - // The underlay network is setup once at sled-agent startup. Before - // there is an underlay we want to avoid sending notifications to nexus for - // two reasons: - // 1. They can't possibly succeed - // 2. 
They increase the backoff time exponentially, so that once - // sled-agent does start it may take much longer to notify nexus - // than it would if we avoid this. This goes especially so for rack - // setup, when bootstrap agent is waiting an aribtrary time for RSS - // initialization. - if self.underlay.lock().await.is_none() { - return; - } - - let pool_id = pool.name.id(); - let DiskIdentity { vendor, serial, model } = pool.parent.clone(); - let underlay = self.underlay.clone(); - - let notify_nexus = move || { - let zpool_request = ZpoolPutRequest { - size: size.into(), - disk_vendor: vendor.clone(), - disk_serial: serial.clone(), - disk_model: model.clone(), - }; - let underlay = underlay.clone(); - - async move { - let underlay_guard = underlay.lock().await; - let Some(underlay) = underlay_guard.as_ref() else { - return Err(backoff::BackoffError::transient( - Error::UnderlayNotInitialized.to_string(), - )); - }; - let sled_id = underlay.sled_id; - let nexus_client = underlay.nexus_client.client().clone(); - drop(underlay_guard); - - nexus_client - .zpool_put(&sled_id, &pool_id, &zpool_request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - Ok(()) - } - }; - let log = self.log.clone(); - let name = pool.name.clone(); - let disk = pool.parent().clone(); - let log_post_failure = move |_, call_count, total_duration| { - if call_count == 0 { - info!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"); - } else if total_duration > std::time::Duration::from_secs(30) { - warn!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; - "total duration" => ?total_duration); - } - }; - self.nexus_notifications.push_back( - backoff::retry_notify_ext( - backoff::retry_policy_internal_service_aggressive(), - notify_nexus, - log_post_failure, - ) - .boxed(), - ); - } - - async fn ensure_using_exactly_these_disks( - &mut self, - resources: &StorageResources, - unparsed_disks: Vec, - queued_u2_drives: &mut Option>, - ) -> Result<(), Error> { - // Queue U.2 drives if necessary - // We clear all existing queued drives that are not synthetic and add - // new ones in the loop below - if let Some(queued) = queued_u2_drives { - info!( - self.log, - "Ensure exact disks: clearing non-synthetic queued disks." - ); - queued.retain(|d| d.is_synthetic()); - } - - let mut new_disks = HashMap::new(); - - // We may encounter errors while parsing any of the disks; keep track of - // any errors that occur and return any of them if something goes wrong. - // - // That being said, we should not prevent access to the other disks if - // only one failure occurs. - let mut err: Option = None; - - // Ensure all disks conform to the expected partition layout. - for disk in unparsed_disks.into_iter() { - if disk.variant() == DiskVariant::U2 { - if let Some(queued) = queued_u2_drives { - info!(self.log, "Queuing disk for upsert: {disk:?}"); - queued.insert(disk.into()); - continue; - } - } - match self.add_new_disk(disk, queued_u2_drives).await.map_err( - |err| { - warn!(self.log, "Could not ensure partitions: {err}"); - err - }, - ) { - Ok(disk) => { - new_disks.insert(disk.identity().clone(), disk); - } - Err(e) => { - warn!(self.log, "Cannot parse disk: {e}"); - err = Some(e.into()); - } - }; - } - - let mut disks = resources.disks.lock().await; - - // Remove disks that don't appear in the "new_disks" set. - // - // This also accounts for zpools and notifies Nexus. 
- let disks_to_be_removed = disks - .iter_mut() - .filter(|(key, old_disk)| { - // If this disk appears in the "new" and "old" set, it should - // only be removed if it has changed. - // - // This treats a disk changing in an unexpected way as a - // "removal and re-insertion". - match old_disk { - DiskWrapper::Real { disk, .. } => { - if let Some(new_disk) = new_disks.get(*key) { - // Changed Disk -> Disk should be removed. - new_disk != disk - } else { - // Real disk, not in the new set -> Disk should be removed. - true - } - } - // Synthetic disk -> Disk should NOT be removed. - DiskWrapper::Synthetic { .. } => false, - } - }) - .map(|(_key, disk)| disk.clone()) - .collect::>(); - - for disk in disks_to_be_removed { - if let Err(e) = self - .delete_disk_locked(&resources, &mut disks, &disk.identity()) - .await - { - warn!(self.log, "Failed to delete disk: {e}"); - err = Some(e); - } - } - - // Add new disks to `resources.disks`. - // - // This also accounts for zpools and notifies Nexus. - for (key, new_disk) in new_disks { - if let Some(old_disk) = disks.get(&key) { - // In this case, the disk should be unchanged. - // - // This assertion should be upheld by the filter above, which - // should remove disks that changed. - assert!(old_disk == &new_disk.into()); - } else { - let disk = DiskWrapper::Real { - disk: new_disk.clone(), - devfs_path: new_disk.devfs_path().clone(), - }; - if let Err(e) = - self.upsert_disk_locked(&resources, &mut disks, disk).await - { - warn!(self.log, "Failed to upsert disk: {e}"); - err = Some(e); - } - } - } - - if let Some(err) = err { - Err(err) - } else { - Ok(()) - } - } - - // Attempt to create a new disk via `sled_hardware::Disk::new()`. If the - // disk addition fails because the the key manager cannot load a secret, - // this indicates a transient error, and so we queue the disk so we can - // try again. - async fn add_new_disk( - &mut self, - unparsed_disk: UnparsedDisk, - queued_u2_drives: &mut Option>, - ) -> Result { - match sled_hardware::Disk::new( - &self.log, - unparsed_disk.clone(), - Some(&self.key_requester), - ) - .await - { - Ok(disk) => Ok(disk), - Err(sled_hardware::PooledDiskError::KeyManager(err)) => { - warn!( - self.log, - "Transient error: {err} - queuing disk {:?}", unparsed_disk - ); - if let Some(queued) = queued_u2_drives { - queued.insert(unparsed_disk.into()); - } else { - *queued_u2_drives = - Some(HashSet::from([unparsed_disk.into()])); - } - Err(sled_hardware::PooledDiskError::KeyManager(err)) - } - Err(err) => { - error!( - self.log, - "Persistent error: {err} - not queueing disk {:?}", - unparsed_disk - ); - Err(err) - } - } - } - - // Attempt to create a new synthetic disk via - // `sled_hardware::Disk::ensure_zpool_ready()`. If the disk addition fails - // because the the key manager cannot load a secret, this indicates a - // transient error, and so we queue the disk so we can try again. 
- async fn add_new_synthetic_disk( - &mut self, - zpool_name: ZpoolName, - queued_u2_drives: &mut Option>, - ) -> Result<(), sled_hardware::PooledDiskError> { - let synthetic_id = DiskIdentity { - vendor: "fake_vendor".to_string(), - serial: "fake_serial".to_string(), - model: zpool_name.id().to_string(), - }; - match sled_hardware::Disk::ensure_zpool_ready( - &self.log, - &zpool_name, - &synthetic_id, - Some(&self.key_requester), - ) - .await - { - Ok(()) => Ok(()), - Err(sled_hardware::PooledDiskError::KeyManager(err)) => { - warn!( - self.log, - "Transient error: {err} - queuing synthetic disk: {:?}", - zpool_name - ); - if let Some(queued) = queued_u2_drives { - queued.insert(zpool_name.into()); - } else { - *queued_u2_drives = - Some(HashSet::from([zpool_name.into()])); - } - Err(sled_hardware::PooledDiskError::KeyManager(err)) - } - Err(err) => { - error!( - self.log, - "Persistent error: {} - not queueing synthetic disk {:?}", - err, - zpool_name - ); - Err(err) - } - } - } - - async fn upsert_disk( - &mut self, - resources: &StorageResources, - disk: UnparsedDisk, - queued_u2_drives: &mut Option>, - ) -> Result<(), Error> { - // Queue U.2 drives if necessary - if let Some(queued) = queued_u2_drives { - if disk.variant() == DiskVariant::U2 { - info!(self.log, "Queuing disk for upsert: {disk:?}"); - queued.insert(disk.into()); - return Ok(()); - } - } - - info!(self.log, "Upserting disk: {disk:?}"); - - // Ensure the disk conforms to an expected partition layout. - let disk = - self.add_new_disk(disk, queued_u2_drives).await.map_err(|err| { - warn!(self.log, "Could not ensure partitions: {err}"); - err - })?; - - let mut disks = resources.disks.lock().await; - let disk = DiskWrapper::Real { - disk: disk.clone(), - devfs_path: disk.devfs_path().clone(), - }; - self.upsert_disk_locked(resources, &mut disks, disk).await - } - - async fn upsert_synthetic_disk( - &mut self, - resources: &StorageResources, - zpool_name: ZpoolName, - queued_u2_drives: &mut Option>, - ) -> Result<(), Error> { - // Queue U.2 drives if necessary - if let Some(queued) = queued_u2_drives { - if zpool_name.kind() == ZpoolKind::External { - info!( - self.log, - "Queuing synthetic disk for upsert: {zpool_name:?}" - ); - queued.insert(zpool_name.into()); - return Ok(()); - } - } - - info!(self.log, "Upserting synthetic disk for: {zpool_name:?}"); - - self.add_new_synthetic_disk(zpool_name.clone(), queued_u2_drives) - .await?; - let disk = DiskWrapper::Synthetic { zpool_name }; - let mut disks = resources.disks.lock().await; - self.upsert_disk_locked(resources, &mut disks, disk).await - } - - async fn upsert_disk_locked( - &mut self, - resources: &StorageResources, - disks: &mut tokio::sync::MutexGuard< - '_, - HashMap, - >, - disk: DiskWrapper, - ) -> Result<(), Error> { - disks.insert(disk.identity(), disk.clone()); - self.physical_disk_notify(NotifyDiskRequest::Add { - identity: disk.identity(), - variant: disk.variant(), - }) - .await; - self.upsert_zpool(&resources, disk.identity(), disk.zpool_name()) - .await?; - - self.dump_setup.update_dumpdev_setup(disks).await; - - Ok(()) - } - - async fn delete_disk( - &mut self, - resources: &StorageResources, - disk: UnparsedDisk, - ) -> Result<(), Error> { - info!(self.log, "Deleting disk: {disk:?}"); - // TODO: Don't we need to do some accounting, e.g. for all the information - // that's no longer accessible? Or is that up to Nexus to figure out at - // a later point-in-time? 
- // - // If we're storing zone images on the M.2s for internal services, how - // do we reconcile them? - let mut disks = resources.disks.lock().await; - self.delete_disk_locked(resources, &mut disks, disk.identity()).await - } - - async fn delete_disk_locked( - &mut self, - resources: &StorageResources, - disks: &mut tokio::sync::MutexGuard< - '_, - HashMap, - >, - key: &DiskIdentity, - ) -> Result<(), Error> { - if let Some(parsed_disk) = disks.remove(key) { - resources.pools.lock().await.remove(&parsed_disk.zpool_name().id()); - self.physical_disk_notify(NotifyDiskRequest::Remove(key.clone())) - .await; - } - - self.dump_setup.update_dumpdev_setup(disks).await; - - Ok(()) - } - - /// When the underlay becomes available, we need to notify nexus about any - /// discovered disks and pools, since we don't attempt to notify until there - /// is an underlay available. - async fn notify_nexus_about_existing_resources( - &mut self, - resources: &StorageResources, - ) -> Result<(), Error> { - let disks = resources.disks.lock().await; - for disk in disks.values() { - self.physical_disk_notify(NotifyDiskRequest::Add { - identity: disk.identity(), - variant: disk.variant(), - }) - .await; - } - - // We may encounter errors while processing any of the pools; keep track of - // any errors that occur and return any of them if something goes wrong. - // - // That being said, we should not prevent notification to nexus of the - // other pools if only one failure occurs. - let mut err: Option = None; - - let pools = resources.pools.lock().await; - for pool in pools.values() { - match ByteCount::try_from(pool.info.size()).map_err(|err| { - Error::BadPoolSize { name: pool.name.to_string(), err } - }) { - Ok(size) => self.add_zpool_notify(pool, size).await, - Err(e) => { - warn!(self.log, "Failed to notify nexus about pool: {e}"); - err = Some(e) - } - } - } - - if let Some(err) = err { - Err(err) - } else { - Ok(()) - } - } - - // Adds a "notification to nexus" to `self.nexus_notifications`, informing it - // about the addition/removal of a physical disk to this sled. - async fn physical_disk_notify(&mut self, disk: NotifyDiskRequest) { - // The underlay network is setup once at sled-agent startup. Before - // there is an underlay we want to avoid sending notifications to nexus for - // two reasons: - // 1. They can't possibly succeed - // 2. They increase the backoff time exponentially, so that once - // sled-agent does start it may take much longer to notify nexus - // than it would if we avoid this. This goes especially so for rack - // setup, when bootstrap agent is waiting an aribtrary time for RSS - // initialization. 
- if self.underlay.lock().await.is_none() { - return; - } - let underlay = self.underlay.clone(); - let disk2 = disk.clone(); - let notify_nexus = move || { - let disk = disk.clone(); - let underlay = underlay.clone(); - async move { - let underlay_guard = underlay.lock().await; - let Some(underlay) = underlay_guard.as_ref() else { - return Err(backoff::BackoffError::transient( - Error::UnderlayNotInitialized.to_string(), - )); - }; - let sled_id = underlay.sled_id; - let nexus_client = underlay.nexus_client.client().clone(); - drop(underlay_guard); - - match &disk { - NotifyDiskRequest::Add { identity, variant } => { - let request = PhysicalDiskPutRequest { - model: identity.model.clone(), - serial: identity.serial.clone(), - vendor: identity.vendor.clone(), - variant: match variant { - DiskVariant::U2 => PhysicalDiskKind::U2, - DiskVariant::M2 => PhysicalDiskKind::M2, - }, - sled_id, - }; - nexus_client - .physical_disk_put(&request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - } - NotifyDiskRequest::Remove(disk_identity) => { - let request = PhysicalDiskDeleteRequest { - model: disk_identity.model.clone(), - serial: disk_identity.serial.clone(), - vendor: disk_identity.vendor.clone(), - sled_id, - }; - nexus_client - .physical_disk_delete(&request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - } - } - Ok(()) - } - }; - let log = self.log.clone(); - // This notification is often invoked before Nexus has started - // running, so avoid flagging any errors as concerning until some - // time has passed. - let log_post_failure = move |_, call_count, total_duration| { - if call_count == 0 { - info!(log, "failed to notify nexus about {disk2:?}"); - } else if total_duration > std::time::Duration::from_secs(30) { - warn!(log, "failed to notify nexus about {disk2:?}"; - "total duration" => ?total_duration); - } - }; - self.nexus_notifications.push_back( - backoff::retry_notify_ext( - backoff::retry_policy_internal_service_aggressive(), - notify_nexus, - log_post_failure, - ) - .boxed(), - ); - } - - async fn upsert_zpool( - &mut self, - resources: &StorageResources, - parent: DiskIdentity, - pool_name: &ZpoolName, - ) -> Result<(), Error> { - let mut pools = resources.pools.lock().await; - let zpool = Pool::new(pool_name.clone(), parent)?; - - let pool = match pools.entry(pool_name.id()) { - hash_map::Entry::Occupied(mut entry) => { - // The pool already exists. - entry.get_mut().info = zpool.info; - return Ok(()); - } - hash_map::Entry::Vacant(entry) => entry.insert(zpool), - }; - info!(&self.log, "Storage manager processing zpool: {:#?}", pool.info); - - let size = ByteCount::try_from(pool.info.size()).map_err(|err| { - Error::BadPoolSize { name: pool_name.to_string(), err } - })?; - // Notify Nexus of the zpool. - self.add_zpool_notify(&pool, size).await; - Ok(()) - } - - // Small wrapper around `Self::do_work_internal` that ensures we always - // emit info to the log when we exit. - async fn do_work( - &mut self, - resources: StorageResources, - ) -> Result<(), Error> { - // We queue U.2 sleds until the StorageKeyRequester is ready to use. - let mut queued_u2_drives = Some(HashSet::new()); - loop { - match self.do_work_internal(&resources, &mut queued_u2_drives).await - { - Ok(()) => { - info!(self.log, "StorageWorker exited successfully"); - return Ok(()); - } - Err(e) => { - warn!( - self.log, - "StorageWorker encountered unexpected error: {}", e - ); - // ... for now, keep trying. 
- } - } - } - } - - async fn do_work_internal( - &mut self, - resources: &StorageResources, - queued_u2_drives: &mut Option>, - ) -> Result<(), Error> { - const QUEUED_DISK_RETRY_TIMEOUT: Duration = Duration::from_secs(5); - let mut interval = interval(QUEUED_DISK_RETRY_TIMEOUT); - interval.set_missed_tick_behavior(MissedTickBehavior::Delay); - loop { - tokio::select! { - _ = self.nexus_notifications.next(), - if !self.nexus_notifications.is_empty() => {}, - Some(request) = self.rx.recv() => { - // We want to queue failed requests related to the key manager - match self.handle_storage_worker_request( - resources, queued_u2_drives, request) - .await { - Err(Error::DiskError(_)) => { - // We already handle and log disk errors, no need to - // return here. - } - Err(e) => return Err(e), - Ok(()) => {} - } - } - _ = interval.tick(), if queued_u2_drives.is_some() && - KEY_MANAGER_READY.get().is_some()=> - { - self.upsert_queued_disks(resources, queued_u2_drives).await; - } - } - } - } - - async fn handle_storage_worker_request( - &mut self, - resources: &StorageResources, - queued_u2_drives: &mut Option>, - request: StorageWorkerRequest, - ) -> Result<(), Error> { - use StorageWorkerRequest::*; - match request { - AddDisk(disk) => { - self.upsert_disk(&resources, disk, queued_u2_drives).await?; - } - AddSyntheticDisk(zpool_name) => { - self.upsert_synthetic_disk( - &resources, - zpool_name, - queued_u2_drives, - ) - .await?; - } - RemoveDisk(disk) => { - self.delete_disk(&resources, disk).await?; - } - NewFilesystem(request) => { - let result = self.add_dataset(&resources, &request).await; - let _ = request.responder.send(result); - } - DisksChanged(disks) => { - self.ensure_using_exactly_these_disks( - &resources, - disks, - queued_u2_drives, - ) - .await?; - } - SetupUnderlayAccess(UnderlayRequest { underlay, responder }) => { - // If this is the first time establishing an - // underlay we should notify nexus of all existing - // disks and zpools. - // - // Instead of individual notifications, we should - // send a bulk notification as described in https:// - // github.com/oxidecomputer/omicron/issues/1917 - if self.underlay.lock().await.replace(underlay).is_none() { - self.notify_nexus_about_existing_resources(&resources) - .await?; - } - let _ = responder.send(Ok(())); - } - KeyManagerReady => { - let _ = KEY_MANAGER_READY.set(()); - self.upsert_queued_disks(resources, queued_u2_drives).await; - } - } - Ok(()) - } -} - -enum StorageWorkerRequest { - AddDisk(UnparsedDisk), - AddSyntheticDisk(ZpoolName), - RemoveDisk(UnparsedDisk), - DisksChanged(Vec), - NewFilesystem(NewFilesystemRequest), - SetupUnderlayAccess(UnderlayRequest), - KeyManagerReady, -} - -struct StorageManagerInner { - log: Logger, - - resources: StorageResources, - - tx: mpsc::Sender, - - // A handle to a worker which updates "pools". - task: JoinHandle>, -} - -/// A sled-local view of all attached storage. -#[derive(Clone)] -pub struct StorageManager { - inner: Arc, - zone_bundler: ZoneBundler, -} - -impl StorageManager { - /// Creates a new [`StorageManager`] which should manage local storage. 
- pub async fn new(log: &Logger, key_requester: StorageKeyRequester) -> Self { - let log = log.new(o!("component" => "StorageManager")); - let resources = StorageResources { - disks: Arc::new(Mutex::new(HashMap::new())), - pools: Arc::new(Mutex::new(HashMap::new())), - }; - let (tx, rx) = mpsc::channel(30); - - let zb_log = log.new(o!("component" => "ZoneBundler")); - let zone_bundler = - ZoneBundler::new(zb_log, resources.clone(), Default::default()); - - StorageManager { - inner: Arc::new(StorageManagerInner { - log: log.clone(), - resources: resources.clone(), - tx, - task: tokio::task::spawn(async move { - let dump_setup = Arc::new(DumpSetup::new(&log)); - let mut worker = StorageWorker { - log, - nexus_notifications: FuturesOrdered::new(), - rx, - underlay: Arc::new(Mutex::new(None)), - key_requester, - dump_setup, - }; - - worker.do_work(resources).await - }), - }), - zone_bundler, - } - } - - /// Return a reference to the object used to manage zone bundles. - /// - /// This can be cloned by other code wishing to create and manage their own - /// zone bundles. - pub fn zone_bundler(&self) -> &ZoneBundler { - &self.zone_bundler - } - - /// Adds underlay access to the storage manager. - pub async fn setup_underlay_access( - &self, - underlay: UnderlayAccess, - ) -> Result<(), Error> { - let (tx, rx) = oneshot::channel(); - self.inner - .tx - .send(StorageWorkerRequest::SetupUnderlayAccess(UnderlayRequest { - underlay, - responder: tx, - })) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send SetupUnderlayAccess request"); - rx.await.expect("Failed to await underlay setup") - } - - pub async fn get_zpools(&self) -> Result, Error> { - let disks = self.inner.resources.disks.lock().await; - let pools = self.inner.resources.pools.lock().await; - - let mut zpools = Vec::with_capacity(pools.len()); - - for (id, pool) in pools.iter() { - let disk_identity = &pool.parent; - let disk_type = if let Some(disk) = disks.get(&disk_identity) { - disk.variant().into() - } else { - // If the zpool claims to be attached to a disk that we - // don't know about, that's an error. - return Err(Error::ZpoolNotFound( - format!("zpool: {id} claims to be from unknown disk: {disk_identity:#?}") - )); - }; - zpools.push(crate::params::Zpool { id: *id, disk_type }); - } - - Ok(zpools) - } - - pub async fn upsert_filesystem( - &self, - dataset_id: Uuid, - dataset_name: DatasetName, - ) -> Result { - let (tx, rx) = oneshot::channel(); - let request = - NewFilesystemRequest { dataset_id, dataset_name, responder: tx }; - - self.inner - .tx - .send(StorageWorkerRequest::NewFilesystem(request)) - .await - .map_err(|e| e.to_string()) - .expect("Storage worker bug (not alive)"); - let dataset_name = rx.await.expect( - "Storage worker bug (dropped responder without responding)", - )?; - - Ok(dataset_name) - } -} diff --git a/sled-hardware/src/illumos/mod.rs b/sled-hardware/src/illumos/mod.rs index 0e49d6d776..a1204c4c63 100644 --- a/sled-hardware/src/illumos/mod.rs +++ b/sled-hardware/src/illumos/mod.rs @@ -19,7 +19,6 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::sync::Mutex; use tokio::sync::broadcast; -use tokio::task::JoinHandle; use uuid::Uuid; mod gpt; diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 2b9e7cffae..0bdca5c19c 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -5,16 +5,15 @@ //! 
Discovered and usable disks and zpools use crate::dataset::M2_DEBUG_DATASET; -use crate::disk::{Disk, RawDisk}; +use crate::disk::Disk; use crate::error::Error; use crate::pool::Pool; use camino::Utf8PathBuf; use illumos_utils::zpool::ZpoolName; use omicron_common::disk::DiskIdentity; -use sled_hardware::{DiskVariant, UnparsedDisk}; +use sled_hardware::DiskVariant; use std::collections::BTreeMap; use std::sync::Arc; -use uuid::Uuid; // The directory within the debug dataset in which bundles are created. const BUNDLE_DIRECTORY: &str = "bundle"; @@ -113,8 +112,16 @@ impl StorageResources { .collect() } - /// Returns all zpools of a particular variant - pub fn all_zpools(&self, variant: DiskVariant) -> Vec { + pub fn get_all_zpools(&self) -> Vec<(ZpoolName, DiskVariant)> { + self.disks + .values() + .cloned() + .map(|(disk, _)| (disk.zpool_name().clone(), disk.variant())) + .collect() + } + + // Returns all zpools of a particular variant + fn all_zpools(&self, variant: DiskVariant) -> Vec { self.disks .values() .filter_map(|(disk, _)| { From e0b4b26e431d99538ec358f2f53d64dd6246a036 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 10 Oct 2023 03:50:16 +0000 Subject: [PATCH 29/66] wip --- nexus-client/Cargo.toml | 1 + nexus-client/src/lib.rs | 9 + sled-agent/src/{storage => }/dump_setup.rs | 0 sled-agent/src/hardware_monitor.rs | 2 +- sled-agent/src/lib.rs | 3 +- sled-agent/src/storage/mod.rs | 7 - sled-agent/src/storage_monitor.rs | 199 +++++++++++++++++++++ sled-hardware/Cargo.toml | 1 - 8 files changed, 212 insertions(+), 10 deletions(-) rename sled-agent/src/{storage => }/dump_setup.rs (100%) delete mode 100644 sled-agent/src/storage/mod.rs create mode 100644 sled-agent/src/storage_monitor.rs diff --git a/nexus-client/Cargo.toml b/nexus-client/Cargo.toml index 589562c930..7da76b418a 100644 --- a/nexus-client/Cargo.toml +++ b/nexus-client/Cargo.toml @@ -10,6 +10,7 @@ futures.workspace = true ipnetwork.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true +sled-hardware.workspace = true progenitor.workspace = true regress.workspace = true reqwest = { workspace = true, features = ["rustls-tls", "stream"] } diff --git a/nexus-client/src/lib.rs b/nexus-client/src/lib.rs index e5cec83f39..c32f8be2e1 100644 --- a/nexus-client/src/lib.rs +++ b/nexus-client/src/lib.rs @@ -375,3 +375,12 @@ impl From } } } + +impl From for types::PhysicalDiskKind { + fn from(value: sled_hardware::DiskVariant) -> Self { + match value { + sled_hardware::DiskVariant::U2 => types::PhysicalDiskKind::U2, + sled_hardware::DiskVariant::M2 => types::PhysicalDiskKind::M2, + } + } +} diff --git a/sled-agent/src/storage/dump_setup.rs b/sled-agent/src/dump_setup.rs similarity index 100% rename from sled-agent/src/storage/dump_setup.rs rename to sled-agent/src/dump_setup.rs diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs index 03677e4e6d..f3402cb6bd 100644 --- a/sled-agent/src/hardware_monitor.rs +++ b/sled-agent/src/hardware_monitor.rs @@ -144,7 +144,7 @@ impl HardwareMonitor { ) } - /// Run the main receive loop of the `StorageManager` + /// Run the main receive loop of the `HardwareMonitor` /// /// This should be spawned into a tokio task pub async fn run(&mut self) { diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 9a3b6d4f1b..4c1266df9b 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -19,6 +19,7 @@ pub mod common; // Modules for the non-simulated sled agent. 
pub mod bootstrap; pub mod config; +pub(crate) mod dump_setup; pub(crate) mod hardware_monitor; mod http_entrypoints; mod instance; @@ -32,7 +33,7 @@ pub mod server; mod services; mod sled_agent; mod smf_helper; -pub(crate) mod storage; +mod storage_monitor; mod swap_device; mod updates; mod zone_bundle; diff --git a/sled-agent/src/storage/mod.rs b/sled-agent/src/storage/mod.rs deleted file mode 100644 index 663ebe8274..0000000000 --- a/sled-agent/src/storage/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Management of local storage - -pub(crate) mod dump_setup; diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs new file mode 100644 index 0000000000..bd21664d19 --- /dev/null +++ b/sled-agent/src/storage_monitor.rs @@ -0,0 +1,199 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A task that listens for storage events from [`sled_storage::StorageMonitor`] +//! and dispatches them to other parst of the bootstrap agent and sled agent +//! code. + +use crate::nexus::NexusClientWithResolver; +use nexus_client::types::PhysicalDiskDeleteRequest; +use nexus_client::types::PhysicalDiskKind; +use nexus_client::types::PhysicalDiskPutRequest; +use nexus_client::types::ZpoolPutRequest; +use omicron_common::api::external::ByteCount; +use sled_storage::disk::Disk; +use sled_storage::manager::StorageHandle; +use sled_storage::resources::StorageResources; +use slog::Logger; +use std::fmt::Debug; +use tokio::sync::mpsc; +use uuid::Uuid; + +const QUEUE_SIZE: usize = 10; + +/// A message sent from the `StorageMonitorHandle` to the `StorageMonitor`. +#[derive(Debug)] +pub enum StorageMonitorMsg { + UnderlayAvailable(UnderlayAccess), +} + +/// Describes the access to the underlay used by the StorageManager. +pub struct UnderlayAccess { + pub nexus_client: NexusClientWithResolver, + pub sled_id: Uuid, +} + +impl Debug for UnderlayAccess { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("UnderlayAccess") + .field("sled_id", &self.sled_id) + .finish() + } +} + +/// A mechanism for interacting with the StorageMonitor +#[derive(Clone)] +pub struct StorageMonitorHandle { + tx: mpsc::Sender, +} + +pub struct StorageMonitor { + log: Logger, + storage_manager: StorageHandle, + handle_rx: mpsc::Receiver, + + // A cached copy of the `StorageResources` from the last update + storage_resources: StorageResources, + + // Ability to access the underlay network + underlay: Option, +} + +impl StorageMonitor { + pub fn new( + log: &Logger, + storage_manager: StorageHandle, + ) -> (StorageMonitor, StorageMonitorHandle) { + let (handle_tx, handle_rx) = mpsc::channel(QUEUE_SIZE); + let storage_resources = StorageResources::default(); + let log = log.new(o!("component" => "StorageMonitor")); + ( + StorageMonitor { + log, + storage_manager, + handle_rx, + storage_resources, + underlay: None, + }, + StorageMonitorHandle { tx: handle_tx }, + ) + } + + /// Run the main receive loop of the `StorageMonitor` + /// + /// This should be spawned into a tokio task + pub async fn run(&mut self) { + loop { + tokio::select! 
{ + resources = self.storage_manager.wait_for_changes() => { + info!( + self.log, + "Received storage manager update"; + "resources" => ?resources + ); + self.handle_resource_update(resources).await; + } + Some(msg) = self.handle_rx.recv() => { + info!( + self.log, + "Received storage monitor message"; + "msg" => ?msg + ); + self.handle_monitor_msg(msg).await; + } + } + } + } + + async fn handle_resource_update( + &mut self, + updated_resources: StorageResources, + ) { + // If the underlay isn't available, we only record the changes. Nexus + // isn't yet reachable to notify. + if self.underlay.is_some() { + let nexus_updates = compute_resource_diffs( + &self.log, + &self.underlay.as_ref().unwrap().sled_id, + &self.storage_resources, + &updated_resources, + ); + // TODO: Notify nexus about diffs + } + // Save the updated `StorageResources` + self.storage_resources = updated_resources; + } +} + +struct NexusUpdates { + disk_puts: Vec, + disk_deletes: Vec, + zpool_puts: Vec, +} + +async fn compute_resource_diffs( + log: &Logger, + sled_id: &Uuid, + current: &StorageResources, + updated: &StorageResources, +) -> NexusUpdates { + let mut disk_puts = vec![]; + let mut disk_deletes = vec![]; + let mut zpool_puts = vec![]; + + // Diff the existing resources with the update to see what has changed + // This loop finds disks and pools that were modified or deleted + for (disk_id, (disk, pool)) in current.disks.iter() { + match updated.disks.get(disk_id) { + Some((updated_disk, updated_pool)) => { + if disk != updated_disk { + disk_puts.push(PhysicalDiskPutRequest { + sled_id: *sled_id, + model: disk_id.model.clone(), + serial: disk_id.serial.clone(), + vendor: disk_id.vendor.clone(), + variant: updated_disk.variant().into(), + }); + } + if pool != updated_pool { + match ByteCount::try_from(pool.info.size()) { + Ok(size) => zpool_puts.push(ZpoolPutRequest { + size: size.into(), + disk_model: disk_id.model.clone(), + disk_serial: disk_id.serial.clone(), + disk_vendor: disk_id.vendor.clone(), + }), + Err(err) => error!( + log, + "Error parsing pool size"; + "name" => pool.name.to_string(), + "err" => ?err), + } + } + } + None => disk_deletes.push(PhysicalDiskDeleteRequest { + model: disk_id.model.clone(), + serial: disk_id.serial.clone(), + vendor: disk_id.vendor.clone(), + sled_id, + }), + } + } + + // Diff the existing resources with the update to see what has changed + // This loop finds new disks and pools + for (disk_id, (updated_disk, updated_pool)) in updated.disks.iter() { + if !current.disks.contains_key(disk_id) { + disk_puts.push(PhysicalDiskPutRequest { + sled_id: *sled_id, + model: disk_id.model.clone(), + serial: disk_id.serial.clone(), + vendor: disk_id.vendor.clone(), + variant: updated_disk.variant().into(), + }); + } + } + + NexusUpdates { disk_puts, disk_deletes, zpool_puts } +} diff --git a/sled-hardware/Cargo.toml b/sled-hardware/Cargo.toml index c6bc09f41e..5a6df575a0 100644 --- a/sled-hardware/Cargo.toml +++ b/sled-hardware/Cargo.toml @@ -11,7 +11,6 @@ camino.workspace = true cfg-if.workspace = true futures.workspace = true illumos-utils.workspace = true -key-manager.workspace = true libc.workspace = true macaddr.workspace = true nexus-client.workspace = true From 950a1d431a2421a6421ad584a1b0397a73e1dd2f Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Tue, 10 Oct 2023 18:51:03 +0000 Subject: [PATCH 30/66] wip --- Cargo.lock | 3 +- nexus-client/src/lib.rs | 10 ++ sled-agent/src/storage_monitor.rs | 158 ++++++++++++++++++++++++++++-- sled-hardware/Cargo.toml | 1 - sled-hardware/src/lib.rs | 10 -- sled-storage/src/lib.rs | 2 +- 6 files changed, 160 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6165b6963c..d8c01f0136 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4258,6 +4258,7 @@ dependencies = [ "schemars", "serde", "serde_json", + "sled-hardware", "slog", "uuid", ] @@ -7939,11 +7940,9 @@ dependencies = [ "futures", "illumos-devinfo", "illumos-utils", - "key-manager", "libc", "libefi-illumos", "macaddr", - "nexus-client 0.1.0", "omicron-common 0.1.0", "omicron-test-utils", "rand 0.8.5", diff --git a/nexus-client/src/lib.rs b/nexus-client/src/lib.rs index c32f8be2e1..e1980bcf6b 100644 --- a/nexus-client/src/lib.rs +++ b/nexus-client/src/lib.rs @@ -384,3 +384,13 @@ impl From for types::PhysicalDiskKind { } } } + +impl From for types::Baseboard { + fn from(b: sled_hardware::Baseboard) -> types::Baseboard { + types::Baseboard { + serial_number: b.identifier().to_string(), + part_number: b.model().to_string(), + revision: b.revision(), + } + } +} diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index bd21664d19..93233a8ed2 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -7,21 +7,33 @@ //! code. use crate::nexus::NexusClientWithResolver; +use derive_more::From; +use futures::stream::FuturesOrdered; +use futures::FutureExt; use nexus_client::types::PhysicalDiskDeleteRequest; use nexus_client::types::PhysicalDiskKind; use nexus_client::types::PhysicalDiskPutRequest; use nexus_client::types::ZpoolPutRequest; use omicron_common::api::external::ByteCount; +use omicron_common::backoff; use sled_storage::disk::Disk; use sled_storage::manager::StorageHandle; +use sled_storage::pool::Pool; use sled_storage::resources::StorageResources; use slog::Logger; use std::fmt::Debug; +use std::pin::Pin; use tokio::sync::mpsc; use uuid::Uuid; const QUEUE_SIZE: usize = 10; +#[derive(From, Clone, Debug)] +enum NexusDiskRequest { + Put(PhysicalDiskPutRequest), + Delete(PhysicalDiskDeleteRequest), +} + /// A message sent from the `StorageMonitorHandle` to the `StorageMonitor`. #[derive(Debug)] pub enum StorageMonitorMsg { @@ -29,6 +41,7 @@ pub enum StorageMonitorMsg { } /// Describes the access to the underlay used by the StorageManager. 
+#[derive(Clone)] pub struct UnderlayAccess { pub nexus_client: NexusClientWithResolver, pub sled_id: Uuid, @@ -58,6 +71,9 @@ pub struct StorageMonitor { // Ability to access the underlay network underlay: Option, + + // A queue for sending nexus notifications in order + nexus_notifications: FuturesOrdered, } impl StorageMonitor { @@ -75,6 +91,7 @@ impl StorageMonitor { handle_rx, storage_resources, underlay: None, + nexus_notifications: FuturesOrdered::new(), }, StorageMonitorHandle { tx: handle_tx }, ) @@ -106,6 +123,8 @@ impl StorageMonitor { } } + async fn handle_monitor_msg(&mut self, msg: StorageMonitorMsg) {} + async fn handle_resource_update( &mut self, updated_resources: StorageResources, @@ -119,20 +138,136 @@ impl StorageMonitor { &self.storage_resources, &updated_resources, ); - // TODO: Notify nexus about diffs + for put in nexus_updates.disk_puts { + self.physical_disk_notify(put.into()).await; + } + for del in nexus_updates.disk_deletes { + self.physical_disk_notify(del.into()).await; + } + for (pool, put) in nexus_updates.zpool_puts { + self.add_zpool_notify(pool, put).await; + } + + // TODO: Update Dump Setup if any diffs } // Save the updated `StorageResources` self.storage_resources = updated_resources; } + + // Adds a "notification to nexus" to `self.nexus_notifications`, informing it + // about the addition/removal of a physical disk to this sled. + async fn physical_disk_notify(&mut self, disk: NexusDiskRequest) { + let underlay = self.underlay.as_ref().unwrap().clone(); + let disk2 = disk.clone(); + let notify_nexus = move || { + let underlay = underlay.clone(); + let disk = disk.clone(); + async move { + let nexus_client = underlay.nexus_client.client().clone(); + + match &disk { + NexusDiskRequest::Put(request) => { + nexus_client + .physical_disk_put(&request) + .await + .map_err(|e| { + backoff::BackoffError::transient(e.to_string()) + })?; + } + NexusDiskRequest::Delete(request) => { + nexus_client + .physical_disk_delete(&request) + .await + .map_err(|e| { + backoff::BackoffError::transient(e.to_string()) + })?; + } + } + Ok(()) + } + }; + + let log = self.log.clone(); + // This notification is often invoked before Nexus has started + // running, so avoid flagging any errors as concerning until some + // time has passed. + let log_post_failure = move |_, call_count, total_duration| { + if call_count == 0 { + info!(log, "failed to notify nexus about {disk2:?}"); + } else if total_duration > std::time::Duration::from_secs(30) { + warn!(log, "failed to notify nexus about {disk2:?}"; + "total duration" => ?total_duration); + } + }; + self.nexus_notifications.push_back( + backoff::retry_notify_ext( + backoff::retry_policy_internal_service_aggressive(), + notify_nexus, + log_post_failure, + ) + .boxed(), + ); + } + + // Adds a "notification to nexus" to `nexus_notifications`, + // informing it about the addition of `pool_id` to this sled. 
+ async fn add_zpool_notify( + &mut self, + pool: Pool, + zpool_request: ZpoolPutRequest, + ) { + let pool_id = pool.name.id(); + let underlay = self.underlay.as_ref().unwrap().clone(); + + let notify_nexus = move || { + let underlay = underlay.clone(); + let zpool_request = zpool_request.clone(); + async move { + let sled_id = underlay.sled_id; + let nexus_client = underlay.nexus_client.client().clone(); + nexus_client + .zpool_put(&sled_id, &pool_id, &zpool_request) + .await + .map_err(|e| { + backoff::BackoffError::transient(e.to_string()) + })?; + Ok(()) + } + }; + + let log = self.log.clone(); + let name = pool.name.clone(); + let disk = pool.parent.clone(); + let log_post_failure = move |_, call_count, total_duration| { + if call_count == 0 { + info!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"); + } else if total_duration > std::time::Duration::from_secs(30) { + warn!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; + "total duration" => ?total_duration); + } + }; + self.nexus_notifications.push_back( + backoff::retry_notify_ext( + backoff::retry_policy_internal_service_aggressive(), + notify_nexus, + log_post_failure, + ) + .boxed(), + ); + } } +// The type of a future which is used to send a notification to Nexus. +type NotifyFut = + Pin> + Send>>; + struct NexusUpdates { disk_puts: Vec, disk_deletes: Vec, - zpool_puts: Vec, + zpool_puts: Vec<(Pool, ZpoolPutRequest)>, } -async fn compute_resource_diffs( +fn compute_resource_diffs( log: &Logger, sled_id: &Uuid, current: &StorageResources, @@ -158,12 +293,15 @@ async fn compute_resource_diffs( } if pool != updated_pool { match ByteCount::try_from(pool.info.size()) { - Ok(size) => zpool_puts.push(ZpoolPutRequest { - size: size.into(), - disk_model: disk_id.model.clone(), - disk_serial: disk_id.serial.clone(), - disk_vendor: disk_id.vendor.clone(), - }), + Ok(size) => zpool_puts.push(( + pool.clone(), + ZpoolPutRequest { + size: size.into(), + disk_model: disk_id.model.clone(), + disk_serial: disk_id.serial.clone(), + disk_vendor: disk_id.vendor.clone(), + }, + )), Err(err) => error!( log, "Error parsing pool size"; @@ -176,7 +314,7 @@ async fn compute_resource_diffs( model: disk_id.model.clone(), serial: disk_id.serial.clone(), vendor: disk_id.vendor.clone(), - sled_id, + sled_id: *sled_id, }), } } diff --git a/sled-hardware/Cargo.toml b/sled-hardware/Cargo.toml index 5a6df575a0..0b97c160f3 100644 --- a/sled-hardware/Cargo.toml +++ b/sled-hardware/Cargo.toml @@ -13,7 +13,6 @@ futures.workspace = true illumos-utils.workspace = true libc.workspace = true macaddr.workspace = true -nexus-client.workspace = true omicron-common.workspace = true rand.workspace = true schemars.workspace = true diff --git a/sled-hardware/src/lib.rs b/sled-hardware/src/lib.rs index c81bcddbfb..3ae745118b 100644 --- a/sled-hardware/src/lib.rs +++ b/sled-hardware/src/lib.rs @@ -160,13 +160,3 @@ impl std::fmt::Display for Baseboard { } } } - -impl From for nexus_client::types::Baseboard { - fn from(b: Baseboard) -> nexus_client::types::Baseboard { - nexus_client::types::Baseboard { - serial_number: b.identifier().to_string(), - part_number: b.model().to_string(), - revision: b.revision(), - } - } -} diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs index fc08579d77..d4b64c55a5 100644 --- a/sled-storage/src/lib.rs +++ b/sled-storage/src/lib.rs @@ -13,5 +13,5 @@ pub mod disk; pub mod error; pub(crate) mod keyfile; pub mod manager; -pub(crate) mod pool; +pub mod pool; pub mod resources; From 
1a67b04095ff2066c9ad2e0d76358f04b0e60885 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 10 Oct 2023 22:57:32 +0000 Subject: [PATCH 31/66] wip --- sled-agent/src/storage_monitor.rs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 93233a8ed2..debd0d5e95 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -123,7 +123,31 @@ impl StorageMonitor { } } - async fn handle_monitor_msg(&mut self, msg: StorageMonitorMsg) {} + async fn handle_monitor_msg(&mut self, msg: StorageMonitorMsg) { + match msg { + StorageMonitorMsg::UnderlayAvailable(underlay) => { + let sled_id = underlay.sled_id; + self.underlay = Some(underlay); + self.notify_nexus_about_existing_resources(sled_id).await; + } + } + } + + /// When the underlay becomes available, we need to notify nexus about any + /// discovered disks and pools, since we don't attempt to notify until there + /// is an underlay available. + async fn notify_nexus_about_existing_resources(&mut self, sled_id: Uuid) { + let current = StorageResources::default(); + let updated = &self.storage_resources; + let nexus_updates = + compute_resource_diffs(&self.log, &sled_id, ¤t, updated); + for put in nexus_updates.disk_puts { + self.physical_disk_notify(put.into()).await; + } + for (pool, put) in nexus_updates.zpool_puts { + self.add_zpool_notify(pool, put).await; + } + } async fn handle_resource_update( &mut self, From 1e61ea90e52a5f6b909b61ea8b650088971a109a Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 11 Oct 2023 22:37:36 +0000 Subject: [PATCH 32/66] wip --- sled-agent/src/bootstrap/secret_retriever.rs | 2 +- sled-agent/src/dump_setup.rs | 8 +- sled-agent/src/long_running_tasks.rs | 22 +++ sled-agent/src/services.rs | 80 ++++++----- sled-agent/src/storage_monitor.rs | 25 +++- sled-agent/src/zone_bundle.rs | 133 ++++++++----------- sled-storage/src/resources.rs | 4 +- 7 files changed, 147 insertions(+), 127 deletions(-) diff --git a/sled-agent/src/bootstrap/secret_retriever.rs b/sled-agent/src/bootstrap/secret_retriever.rs index 5cae06310c..d6b542378d 100644 --- a/sled-agent/src/bootstrap/secret_retriever.rs +++ b/sled-agent/src/bootstrap/secret_retriever.rs @@ -92,7 +92,7 @@ impl LrtqOrHardcodedSecretRetriever { /// /// The local retriever only returns keys for epoch 0 #[derive(Debug)] -struct HardcodedSecretRetriever {} +pub struct HardcodedSecretRetriever {} #[async_trait] impl SecretRetriever for HardcodedSecretRetriever { diff --git a/sled-agent/src/dump_setup.rs b/sled-agent/src/dump_setup.rs index ea60998955..50bbda44b4 100644 --- a/sled-agent/src/dump_setup.rs +++ b/sled-agent/src/dump_setup.rs @@ -7,13 +7,13 @@ use omicron_common::disk::DiskIdentity; use sled_hardware::DiskVariant; use sled_storage::dataset::{CRASH_DATASET, DUMP_DATASET}; use sled_storage::disk::Disk; +use sled_storage::pool::Pool; use slog::Logger; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashSet}; use std::ffi::OsString; use std::path::{Path, PathBuf}; use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH}; -use tokio::sync::MutexGuard; pub struct DumpSetup { worker: Arc>, @@ -100,13 +100,13 @@ const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); impl DumpSetup { pub(crate) async fn update_dumpdev_setup( &self, - disks: &mut MutexGuard<'_, HashMap>, + disks: &Arc>, ) { let log = &self.log; let mut m2_dump_slices = 
Vec::new(); let mut u2_debug_datasets = Vec::new(); let mut m2_core_datasets = Vec::new(); - for (_id, disk) in disks.iter() { + for (_id, (disk, _)) in disks.iter() { if disk.is_synthetic() { // We only setup dump devices on real disks continue; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index f322126714..714bd1e406 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -17,6 +17,7 @@ use crate::bootstrap::bootstore::{ }; use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; use crate::hardware_monitor::{HardwareMonitor, HardwareMonitorHandle}; +use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; @@ -39,6 +40,11 @@ pub struct LongRunningTaskHandles { /// for establishing zpools on disks and managing their datasets. pub storage_manager: StorageHandle, + /// A task which monitors for updates from the `StorageManager` and takes + /// actions based on those updates, such as informing Nexus and setting + /// up dump locations. + pub storage_monitor: StorageMonitorHandle, + /// A mechanism for interacting with the hardware device tree pub hardware_manager: HardwareManager, @@ -63,6 +69,8 @@ pub async fn spawn_all_longrunning_tasks( let mut storage_manager = spawn_storage_manager(log, storage_key_requester.clone()); + let storage_monitor = spawn_storage_monitor(log, storage_manager.clone()); + // TODO: Does this need to run inside tokio::task::spawn_blocking? let hardware_manager = spawn_hardware_manager(log, sled_mode); @@ -87,6 +95,7 @@ pub async fn spawn_all_longrunning_tasks( LongRunningTaskHandles { storage_key_requester, storage_manager, + storage_monitor, hardware_manager, hardware_monitor, bootstore, @@ -115,6 +124,19 @@ fn spawn_storage_manager( handle } +fn spawn_storage_monitor( + log: &Logger, + storage_handle: StorageHandle, +) -> StorageMonitorHandle { + info!(log, "Starting StorageMonitor"); + let (mut storage_monitor, handle) = + StorageMonitor::new(log, storage_handle); + tokio::spawn(async move { + storage_monitor.run().await; + }); + handle +} + fn spawn_hardware_manager( log: &Logger, sled_mode: SledMode, diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index c22eae6baa..6aaf69f198 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -5,7 +5,7 @@ //! Sled-local service management. //! //! For controlling zone-based storage services, refer to -//! [sled_hardware:manager::StorageManager]. +//! [sled_storage:manager::StorageManager]. //! //! For controlling virtual machine instances, refer to //! [crate::instance_manager::InstanceManager]. 
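
As an aside on the storage_monitor change above: notify_nexus_about_existing_resources reuses compute_resource_diffs by passing an empty StorageResources as the "current" state, so every disk and pool the sled already knows about is reported to Nexus as a put once the underlay becomes available. A minimal standalone sketch of that diff-against-empty idea, using simplified map types and hypothetical names (DiskId, puts_for) rather than the real StorageResources/NexusUpdates types:

    use std::collections::BTreeMap;

    // Hypothetical, simplified stand-ins for DiskIdentity and disk metadata.
    type DiskId = String;
    type DiskInfo = String;

    // Anything present in `updated` but missing from `current` becomes a "put".
    fn puts_for(
        current: &BTreeMap<DiskId, DiskInfo>,
        updated: &BTreeMap<DiskId, DiskInfo>,
    ) -> Vec<DiskId> {
        updated
            .keys()
            .filter(|id| !current.contains_key(*id))
            .cloned()
            .collect()
    }

    fn main() {
        let mut updated = BTreeMap::new();
        updated.insert("serial-1".to_string(), "u.2".to_string());
        updated.insert("serial-2".to_string(), "m.2".to_string());

        // Notifying about existing resources == diffing against an empty map:
        // everything already known is treated as newly added.
        let all_new = puts_for(&BTreeMap::new(), &updated);
        assert_eq!(all_new.len(), 2);
    }
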
@@ -2935,8 +2935,8 @@ impl ServiceManager { #[cfg(test)] mod test { use super::*; + use crate::bootstrap::secret_retriever::HardcodedSecretRetriever; use crate::params::{ServiceZoneService, ZoneType}; - use async_trait::async_trait; use illumos_utils::{ dladm::{ Etherstub, MockDladm, BOOTSTRAP_ETHERSTUB_NAME, @@ -2945,10 +2945,9 @@ mod test { svc, zone::MockZones, }; - use key_manager::{ - SecretRetriever, SecretRetrieverError, SecretState, VersionedIkm, - }; + use key_manager::KeyManager; use omicron_common::address::OXIMETER_PORT; + use sled_storage::manager::{StorageHandle, StorageManager}; use std::net::{Ipv6Addr, SocketAddrV6}; use std::os::unix::process::ExitStatusExt; use uuid::Uuid; @@ -3141,29 +3140,28 @@ mod test { } } - pub struct TestSecretRetriever {} - - #[async_trait] - impl SecretRetriever for TestSecretRetriever { - async fn get_latest( - &self, - ) -> Result { - let epoch = 0; - let salt = [0u8; 32]; - let secret = [0x1d; 32]; - - Ok(VersionedIkm::new(epoch, salt, &secret)) - } + // Spawn storage related tasks and return a handle to pass to both the `ServiceManager` + // and `ZoneBundler`. However, it is expected that this handle is not actually used + // as there are no provisioned zones or datasets. This is consistent with the use of + // `test_config.override_paths` below. + async fn setup_storage(log: &Logger) -> StorageHandle { + let (mut key_manager, key_requester) = + KeyManager::new(log, HardcodedSecretRetriever {}); + let (mut manager, handle) = StorageManager::new(log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); - async fn get( - &self, - epoch: u64, - ) -> Result { - if epoch != 0 { - return Err(SecretRetrieverError::NoSuchEpoch(epoch)); - } - Ok(SecretState::Current(self.get_latest().await?)) - } + // Inform the storage manager that the secret retriever is ready We + // are using the HardcodedSecretRetriever, so no need to wait for RSS + // or anything to setup the LRTQ + handle.key_manager_ready().await; + handle } #[tokio::test] @@ -3174,10 +3172,10 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3188,7 +3186,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources, + storage_handle, zone_bundler, ); test_config.override_paths(&mgr); @@ -3222,10 +3220,10 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3236,7 +3234,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources, + storage_handle, zone_bundler, ); test_config.override_paths(&mgr); @@ -3275,10 +3273,10 @@ mod test { // First, spin up a ServiceManager, create a new service, and tear it // down. 
- let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3289,7 +3287,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -3322,7 +3320,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -3358,10 +3356,10 @@ mod test { // First, spin up a ServiceManager, create a new service, and tear it // down. - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3372,7 +3370,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -3410,7 +3408,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle, zone_bundler.clone(), ); test_config.override_paths(&mgr); diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index debd0d5e95..c48fd5cbfa 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -6,17 +6,16 @@ //! and dispatches them to other parst of the bootstrap agent and sled agent //! code. +use crate::dump_setup::DumpSetup; use crate::nexus::NexusClientWithResolver; use derive_more::From; use futures::stream::FuturesOrdered; use futures::FutureExt; use nexus_client::types::PhysicalDiskDeleteRequest; -use nexus_client::types::PhysicalDiskKind; use nexus_client::types::PhysicalDiskPutRequest; use nexus_client::types::ZpoolPutRequest; use omicron_common::api::external::ByteCount; use omicron_common::backoff; -use sled_storage::disk::Disk; use sled_storage::manager::StorageHandle; use sled_storage::pool::Pool; use sled_storage::resources::StorageResources; @@ -74,6 +73,9 @@ pub struct StorageMonitor { // A queue for sending nexus notifications in order nexus_notifications: FuturesOrdered, + + // Invokes dumpadm(8) and savecore(8) when new disks are encountered + dump_setup: DumpSetup, } impl StorageMonitor { @@ -83,6 +85,7 @@ impl StorageMonitor { ) -> (StorageMonitor, StorageMonitorHandle) { let (handle_tx, handle_rx) = mpsc::channel(QUEUE_SIZE); let storage_resources = StorageResources::default(); + let dump_setup = DumpSetup::new(&log); let log = log.new(o!("component" => "StorageMonitor")); ( StorageMonitor { @@ -92,6 +95,7 @@ impl StorageMonitor { storage_resources, underlay: None, nexus_notifications: FuturesOrdered::new(), + dump_setup, }, StorageMonitorHandle { tx: handle_tx }, ) @@ -129,6 +133,9 @@ impl StorageMonitor { let sled_id = underlay.sled_id; self.underlay = Some(underlay); self.notify_nexus_about_existing_resources(sled_id).await; + self.dump_setup + .update_dumpdev_setup(&self.storage_resources.disks) + .await; } } } @@ -162,6 +169,12 @@ impl StorageMonitor { &self.storage_resources, &updated_resources, ); + if nexus_updates.has_disk_updates() { + self.dump_setup + .update_dumpdev_setup(&self.storage_resources.disks) + .await; + } + for 
put in nexus_updates.disk_puts { self.physical_disk_notify(put.into()).await; } @@ -171,8 +184,6 @@ impl StorageMonitor { for (pool, put) in nexus_updates.zpool_puts { self.add_zpool_notify(pool, put).await; } - - // TODO: Update Dump Setup if any diffs } // Save the updated `StorageResources` self.storage_resources = updated_resources; @@ -291,6 +302,12 @@ struct NexusUpdates { zpool_puts: Vec<(Pool, ZpoolPutRequest)>, } +impl NexusUpdates { + fn has_disk_updates(&self) -> bool { + !self.disk_puts.is_empty() || !self.disk_deletes.is_empty() + } +} + fn compute_resource_diffs( log: &Logger, sled_id: &Uuid, diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index ea7481bd6d..55058ee23a 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -1764,7 +1764,6 @@ mod illumos_tests { use super::CleanupPeriod; use super::PriorityOrder; use super::StorageLimit; - use super::StorageResources; use super::Utf8Path; use super::Utf8PathBuf; use super::Uuid; @@ -1774,9 +1773,15 @@ mod illumos_tests { use super::ZoneBundleMetadata; use super::ZoneBundler; use super::ZFS; + use crate::bootstrap::secret_retriever::HardcodedSecretRetriever; use anyhow::Context; use chrono::TimeZone; use chrono::Utc; + use illumos_utils::zpool::{Zpool, ZpoolName}; + use key_manager::KeyManager; + use sled_storage::disk::RawDisk; + use sled_storage::disk::SyntheticDisk; + use sled_storage::manager::{StorageHandle, StorageManager}; use slog::Drain; use slog::Logger; use tokio::process::Command; @@ -1818,31 +1823,62 @@ mod illumos_tests { // system, that creates the directories implied by the `StorageResources` // expected disk structure. struct ResourceWrapper { - resources: StorageResources, + storage_handle: StorageHandle, + zpool_names: Vec, dirs: Vec, } + async fn setup_storage(log: &Logger) -> (StorageHandle, Vec) { + let (mut key_manager, key_requester) = + KeyManager::new(log, HardcodedSecretRetriever {}); + let (mut manager, handle) = StorageManager::new(log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Inform the storage manager that the secret retriever is ready We + // are using the HardcodedSecretRetriever, so no need to wait for RSS + // or anything to setup the LRTQ + handle.key_manager_ready().await; + + // Put the zpools under /rpool + let dir = + camino::Utf8PathBuf::from(format!("/rpool/{}", Uuid::new_v4())); + + let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); + let internal_disk: RawDisk = + SyntheticDisk::create_zpool(&dir, &internal_zpool_name).into(); + let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let external_disk: RawDisk = + SyntheticDisk::create_zpool(&dir, &external_zpool_name).into(); + handle.upsert_disk(internal_disk).await; + handle.upsert_disk(external_disk).await; + + (handle, vec![internal_zpool_name, external_zpool_name]) + } + impl ResourceWrapper { // Create new storage resources, and mount fake datasets at the required // locations. 
- async fn new() -> Self { - let resources = StorageResources::new_for_test(); - let dirs = resources.all_zone_bundle_directories().await; - for d in dirs.iter() { - let id = - d.components().nth(3).unwrap().as_str().parse().unwrap(); - create_test_dataset(&id, d).await.unwrap(); - } - Self { resources, dirs } + async fn new(log: Logger) -> Self { + // Spawn the storage related tasks required for testing and insert + // synthetic disks. + let (storage_handle, zpool_names) = setup_storage(&log).await; + let resources = storage_handle.get_latest_resources().await; + let dirs = resources.all_zone_bundle_directories(); + Self { storage_handle, zpool_names, dirs } } } impl Drop for ResourceWrapper { fn drop(&mut self) { - for d in self.dirs.iter() { - let id = - d.components().nth(3).unwrap().as_str().parse().unwrap(); - remove_test_dataset(&id).unwrap(); + for name in &self.zpool_names { + Zpool::destroy(name).unwrap(); } } } @@ -1854,9 +1890,12 @@ mod illumos_tests { let log = Logger::root(drain, slog::o!("component" => "fake-cleanup-task")); let context = CleanupContext::default(); - let resource_wrapper = ResourceWrapper::new().await; - let bundler = - ZoneBundler::new(log, resource_wrapper.resources.clone(), context); + let resource_wrapper = ResourceWrapper::new(log.clone()).await; + let bundler = ZoneBundler::new( + log, + resource_wrapper.storage_handle.clone(), + context, + ); Ok(CleanupTestContext { resource_wrapper, context, bundler }) } @@ -1891,64 +1930,6 @@ mod illumos_tests { assert_eq!(context, new_context, "failed to update context"); } - // Quota applied to test datasets. - // - // This needs to be at least this big lest we get "out of space" errors when - // creating. Not sure where those come from, but could be ZFS overhead. - const TEST_QUOTA: u64 = 1024 * 32; - - async fn create_test_dataset( - id: &Uuid, - mountpoint: &Utf8PathBuf, - ) -> anyhow::Result<()> { - let output = Command::new("/usr/bin/pfexec") - .arg(ZFS) - .arg("create") - .arg("-o") - .arg(format!("quota={TEST_QUOTA}")) - .arg("-o") - .arg(format!("mountpoint={mountpoint}")) - .arg(format!("rpool/{id}")) - .output() - .await - .context("failed to spawn zfs create operation")?; - anyhow::ensure!( - output.status.success(), - "zfs create operation failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - // Make the path operable by the test code. 
- let output = Command::new("/usr/bin/pfexec") - .arg("chmod") - .arg("a+rw") - .arg(&mountpoint) - .output() - .await - .context("failed to spawn chmod operation")?; - anyhow::ensure!( - output.status.success(), - "chmod-ing the dataset failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - Ok(()) - } - - fn remove_test_dataset(id: &Uuid) -> anyhow::Result<()> { - let output = std::process::Command::new("/usr/bin/pfexec") - .arg(ZFS) - .arg("destroy") - .arg(format!("rpool/{id}")) - .output() - .context("failed to spawn zfs destroy operation")?; - anyhow::ensure!( - output.status.success(), - "zfs destroy operation failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - Ok(()) - } - async fn run_test_with_zfs_dataset(test: T) where T: FnOnce(CleanupTestContext) -> Fut, diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 0bdca5c19c..64136e756d 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -46,7 +46,9 @@ impl StorageResources { /// Insert a disk and its zpool /// /// Return true, if data was changed, false otherwise - pub(crate) fn insert_disk(&mut self, disk: Disk) -> Result { + /// + /// This really should not be used outside this crate, except for testing + pub fn insert_disk(&mut self, disk: Disk) -> Result { let disk_id = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); let zpool = Pool::new(zpool_name, disk_id.clone())?; From 6a81c2c70e1b70ab0ceac2704525210bc7938948 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 12 Oct 2023 04:45:56 +0000 Subject: [PATCH 33/66] wip --- Cargo.lock | 1 + sled-agent/Cargo.toml | 3 ++- sled-agent/src/services.rs | 20 ++++++++++++++++- sled-agent/src/sled_agent.rs | 11 +++++++++ sled-agent/src/storage_monitor.rs | 27 ++++++++++------------ sled-agent/src/zone_bundle.rs | 37 ++++++++++++++++++------------- sled-storage/Cargo.toml | 5 +++++ sled-storage/src/dataset.rs | 14 ++++++++++-- 8 files changed, 84 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d8c01f0136..9cc8995cb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7963,6 +7963,7 @@ dependencies = [ "async-trait", "camino", "camino-tempfile", + "cfg-if 1.0.0", "derive_more", "glob", "illumos-utils", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index e219461b9b..50af5209dd 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -96,7 +96,8 @@ subprocess.workspace = true slog-async.workspace = true slog-term.workspace = true -illumos-utils = { workspace = true, features = ["testing"] } +illumos-utils = { workspace = true, features = ["testing", "tmp_keypath"] } +sled-storage = { workspace = true, features = ["testing"] } # # Disable doc builds by default for our binaries to work around issue diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 6aaf69f198..63e8067b8d 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -2937,6 +2937,7 @@ mod test { use super::*; use crate::bootstrap::secret_retriever::HardcodedSecretRetriever; use crate::params::{ServiceZoneService, ZoneType}; + use illumos_utils::zpool::ZpoolName; use illumos_utils::{ dladm::{ Etherstub, MockDladm, BOOTSTRAP_ETHERSTUB_NAME, @@ -2947,6 +2948,8 @@ mod test { }; use key_manager::KeyManager; use omicron_common::address::OXIMETER_PORT; + use sled_storage::disk::{RawDisk, SyntheticDisk}; + use sled_storage::manager::{StorageHandle, StorageManager}; use std::net::{Ipv6Addr, SocketAddrV6}; use std::os::unix::process::ExitStatusExt; @@ -3017,7 
+3020,6 @@ mod test { let wait_ctx = svc::wait_for_service_context(); wait_ctx.expect().return_once(|_, _| Ok(())); - // Import the manifest, enable the service let execute_ctx = illumos_utils::execute_context(); execute_ctx.expect().times(..).returning(|_| { Ok(std::process::Output { @@ -3161,6 +3163,22 @@ mod test { // are using the HardcodedSecretRetriever, so no need to wait for RSS // or anything to setup the LRTQ handle.key_manager_ready().await; + + // Create some backing disks + let tempdir = camino_tempfile::Utf8TempDir::new().unwrap(); + + // These must be internal zpools + //let mut zpool_names = vec![]; + let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); + let internal_disk: RawDisk = + SyntheticDisk::new(internal_zpool_name).into(); + handle.upsert_disk(internal_disk).await; + let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let external_disk: RawDisk = + SyntheticDisk::new(external_zpool_name).into(); + handle.upsert_disk(external_disk).await; + + //zpool_names.push(internal_zpool_name); handle } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 571ea45499..e448f0719c 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -19,6 +19,7 @@ use crate::params::{ VpcFirewallRule, ZoneBundleMetadata, Zpool, }; use crate::services::{self, ServiceManager}; +use crate::storage_monitor::UnderlayAccess; use crate::updates::{ConfigUpdates, UpdateManager}; use crate::zone_bundle; use crate::zone_bundle::BundleError; @@ -329,6 +330,16 @@ impl SledAgent { *sled_address.ip(), ); + // Inform the `StorageMonitor` that the underlay is available so that + // it can try to contact nexus. + long_running_task_handles + .storage_monitor + .underlay_available(UnderlayAccess { + nexus_client: nexus_client.clone(), + sled_id: request.id, + }) + .await; + let instances = InstanceManager::new( parent_log.clone(), nexus_client.clone(), diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index c48fd5cbfa..da9f6b4897 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -60,6 +60,15 @@ pub struct StorageMonitorHandle { tx: mpsc::Sender, } +impl StorageMonitorHandle { + pub async fn underlay_available(&self, underlay_access: UnderlayAccess) { + self.tx + .send(StorageMonitorMsg::UnderlayAvailable(underlay_access)) + .await + .unwrap(); + } +} + pub struct StorageMonitor { log: Logger, storage_manager: StorageHandle, @@ -133,9 +142,6 @@ impl StorageMonitor { let sled_id = underlay.sled_id; self.underlay = Some(underlay); self.notify_nexus_about_existing_resources(sled_id).await; - self.dump_setup - .update_dumpdev_setup(&self.storage_resources.disks) - .await; } } } @@ -169,11 +175,6 @@ impl StorageMonitor { &self.storage_resources, &updated_resources, ); - if nexus_updates.has_disk_updates() { - self.dump_setup - .update_dumpdev_setup(&self.storage_resources.disks) - .await; - } for put in nexus_updates.disk_puts { self.physical_disk_notify(put.into()).await; @@ -185,6 +186,8 @@ impl StorageMonitor { self.add_zpool_notify(pool, put).await; } } + self.dump_setup.update_dumpdev_setup(&updated_resources.disks).await; + // Save the updated `StorageResources` self.storage_resources = updated_resources; } @@ -302,12 +305,6 @@ struct NexusUpdates { zpool_puts: Vec<(Pool, ZpoolPutRequest)>, } -impl NexusUpdates { - fn has_disk_updates(&self) -> bool { - !self.disk_puts.is_empty() || !self.disk_deletes.is_empty() - } -} - fn compute_resource_diffs( log: &Logger, 
sled_id: &Uuid, @@ -362,7 +359,7 @@ fn compute_resource_diffs( // Diff the existing resources with the update to see what has changed // This loop finds new disks and pools - for (disk_id, (updated_disk, updated_pool)) in updated.disks.iter() { + for (disk_id, (updated_disk, _)) in updated.disks.iter() { if !current.disks.contains_key(disk_id) { disk_puts.push(PhysicalDiskPutRequest { sled_id: *sled_id, diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 55058ee23a..c2f6fceadf 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -184,6 +184,7 @@ impl Inner { async fn bundle_directories(&self) -> Vec { let resources = self.storage_handle.get_latest_resources().await; let expected = resources.all_zone_bundle_directories(); + println!("dirs = {:?}", expected); let mut out = Vec::with_capacity(expected.len()); for each in expected.into_iter() { if tokio::fs::create_dir_all(&each).await.is_ok() { @@ -1772,7 +1773,6 @@ mod illumos_tests { use super::ZoneBundleInfo; use super::ZoneBundleMetadata; use super::ZoneBundler; - use super::ZFS; use crate::bootstrap::secret_retriever::HardcodedSecretRetriever; use anyhow::Context; use chrono::TimeZone; @@ -1784,7 +1784,6 @@ mod illumos_tests { use sled_storage::manager::{StorageHandle, StorageManager}; use slog::Drain; use slog::Logger; - use tokio::process::Command; #[tokio::test] async fn test_zfs_quota() { @@ -1846,20 +1845,21 @@ mod illumos_tests { // or anything to setup the LRTQ handle.key_manager_ready().await; - // Put the zpools under /rpool - let dir = - camino::Utf8PathBuf::from(format!("/rpool/{}", Uuid::new_v4())); + let tempdir = camino_tempfile::Utf8TempDir::new().unwrap(); - let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); - let internal_disk: RawDisk = - SyntheticDisk::create_zpool(&dir, &internal_zpool_name).into(); - let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let external_disk: RawDisk = - SyntheticDisk::create_zpool(&dir, &external_zpool_name).into(); - handle.upsert_disk(internal_disk).await; - handle.upsert_disk(external_disk).await; - - (handle, vec![internal_zpool_name, external_zpool_name]) + // These must be internal zpools + let mut zpool_names = vec![]; + for _ in 0..2 { + let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); + let internal_disk: RawDisk = SyntheticDisk::create_zpool( + tempdir.path(), + &internal_zpool_name, + ) + .into(); + handle.upsert_disk(internal_disk).await; + zpool_names.push(internal_zpool_name); + } + (handle, zpool_names) } impl ResourceWrapper { @@ -1871,6 +1871,7 @@ mod illumos_tests { let (storage_handle, zpool_names) = setup_storage(&log).await; let resources = storage_handle.get_latest_resources().await; let dirs = resources.all_zone_bundle_directories(); + info!(log, "Initial dirs = {:?}", dirs); Self { storage_handle, zpool_names, dirs } } } @@ -1930,6 +1931,12 @@ mod illumos_tests { assert_eq!(context, new_context, "failed to update context"); } + // Quota applied to test datasets. + // + // This needs to be at least this big lest we get "out of space" errors when + // creating. Not sure where those come from, but could be ZFS overhead. 
+ const TEST_QUOTA: u64 = sled_storage::dataset::DEBUG_DATASET_QUOTA as u64; + async fn run_test_with_zfs_dataset(test: T) where T: FnOnce(CleanupTestContext) -> Fut, diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index 617a0a0fd7..efb6afd6bc 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [dependencies] async-trait.workspace = true camino.workspace = true +cfg-if.workspace = true derive_more.workspace = true glob.workspace = true illumos-utils.workspace = true @@ -31,3 +32,7 @@ uuid.workspace = true illumos-utils = { workspace = true, features = ["tmp_keypath"] } omicron-test-utils.workspace = true camino-tempfile.workspace = true + +[features] +# Quotas and the like can be shrunk via this feature +testing = [] diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 71e04a6935..e56fde9aa4 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -6,6 +6,7 @@ use crate::keyfile::KeyFile; use camino::Utf8PathBuf; +use cfg_if::cfg_if; use illumos_utils::zfs::{ self, DestroyDatasetErrorVariant, EncryptionDetails, Keypath, Mountpoint, SizeDetails, Zfs, @@ -26,10 +27,19 @@ pub const CRASH_DATASET: &'static str = "crash"; pub const CLUSTER_DATASET: &'static str = "cluster"; pub const CONFIG_DATASET: &'static str = "config"; pub const M2_DEBUG_DATASET: &'static str = "debug"; + +cfg_if! { + if #[cfg(any(test, feature = "testing"))] { + // Tuned for zone_bundle tests + pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 10); + } else { + // TODO-correctness: This value of 100GiB is a pretty wild guess, and should be + // tuned as needed. + pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30); + } +} // TODO-correctness: This value of 100GiB is a pretty wild guess, and should be // tuned as needed. -pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30); -// ditto. pub const DUMP_DATASET_QUOTA: usize = 100 * (1 << 30); // passed to zfs create -o compression= pub const DUMP_DATASET_COMPRESSION: &'static str = "gzip-9"; From e3b77cfffa8bda3135d29ea5edb0960c2970856c Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Tue, 24 Oct 2023 01:01:18 +0000 Subject: [PATCH 34/66] wip - FakeStorageManager --- illumos-utils/src/zpool.rs | 11 +++++++ sled-agent/src/services.rs | 36 ++++----------------- sled-storage/src/manager.rs | 61 +++++++++++++++++++++++++++++++++++ sled-storage/src/pool.rs | 7 ++++ sled-storage/src/resources.rs | 18 +++++++++++ 5 files changed, 104 insertions(+), 29 deletions(-) diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index f0916b236a..f2c395e22b 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -128,6 +128,17 @@ impl ZpoolInfo { pub fn health(&self) -> ZpoolHealth { self.health } + + #[cfg(any(test, feature = "testing"))] + pub fn new_hardcoded(name: String) -> ZpoolInfo { + ZpoolInfo { + name, + size: 1024 * 1024 * 64, + allocated: 1024, + free: 1024 * 1023 * 64, + health: ZpoolHealth::Online, + } + } } impl FromStr for ZpoolInfo { diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 63e8067b8d..264325b6e5 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -2935,7 +2935,6 @@ impl ServiceManager { #[cfg(test)] mod test { use super::*; - use crate::bootstrap::secret_retriever::HardcodedSecretRetriever; use crate::params::{ServiceZoneService, ZoneType}; use illumos_utils::zpool::ZpoolName; use illumos_utils::{ @@ -2946,11 +2945,10 @@ mod test { svc, zone::MockZones, }; - use key_manager::KeyManager; use omicron_common::address::OXIMETER_PORT; use sled_storage::disk::{RawDisk, SyntheticDisk}; - use sled_storage::manager::{StorageHandle, StorageManager}; + use sled_storage::manager::{FakeStorageManager, StorageHandle}; use std::net::{Ipv6Addr, SocketAddrV6}; use std::os::unix::process::ExitStatusExt; use uuid::Uuid; @@ -3142,33 +3140,14 @@ mod test { } } - // Spawn storage related tasks and return a handle to pass to both the `ServiceManager` - // and `ZoneBundler`. However, it is expected that this handle is not actually used - // as there are no provisioned zones or datasets. This is consistent with the use of - // `test_config.override_paths` below. 
- async fn setup_storage(log: &Logger) -> StorageHandle { - let (mut key_manager, key_requester) = - KeyManager::new(log, HardcodedSecretRetriever {}); - let (mut manager, handle) = StorageManager::new(log, key_requester); - - // Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); + async fn setup_storage() -> StorageHandle { + let (mut manager, handle) = FakeStorageManager::new(); // Spawn the storage manager as done by sled-agent tokio::spawn(async move { manager.run().await; }); - // Inform the storage manager that the secret retriever is ready We - // are using the HardcodedSecretRetriever, so no need to wait for RSS - // or anything to setup the LRTQ - handle.key_manager_ready().await; - - // Create some backing disks - let tempdir = camino_tempfile::Utf8TempDir::new().unwrap(); - - // These must be internal zpools - //let mut zpool_names = vec![]; let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); let internal_disk: RawDisk = SyntheticDisk::new(internal_zpool_name).into(); @@ -3178,7 +3157,6 @@ mod test { SyntheticDisk::new(external_zpool_name).into(); handle.upsert_disk(external_disk).await; - //zpool_names.push(internal_zpool_name); handle } @@ -3190,7 +3168,7 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let storage_handle = setup_storage(&log).await; + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), storage_handle.clone(), @@ -3238,7 +3216,7 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let storage_handle = setup_storage(&log).await; + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), storage_handle.clone(), @@ -3291,7 +3269,7 @@ mod test { // First, spin up a ServiceManager, create a new service, and tear it // down. - let storage_handle = setup_storage(&log).await; + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), storage_handle.clone(), @@ -3374,7 +3352,7 @@ mod test { // First, spin up a ServiceManager, create a new service, and tear it // down. - let storage_handle = setup_storage(&log).await; + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), storage_handle.clone(), diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index e00c9ad4fa..e12fb337c8 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -159,6 +159,67 @@ impl StorageHandle { } } +// Some sled-agent tests cannot currently use the real StorageManager +// and want to fake the entire behavior, but still have access to the +// `StorageResources`. We allow this via use of the `FakeStorageManager` +// that will respond to real storage requests from a real `StorageHandle`. 
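
By way of illustration, the intended usage mirrors the setup_storage helpers later in this series: spawn the fake, then feed it synthetic disks through the ordinary StorageHandle. A sketch assuming the FakeStorageManager, SyntheticDisk, and StorageHandle APIs shown in these patches; the test name and final assertion are illustrative only:

    use illumos_utils::zpool::ZpoolName;
    use sled_storage::disk::{RawDisk, SyntheticDisk};
    use sled_storage::manager::FakeStorageManager;
    use uuid::Uuid;

    #[tokio::test]
    async fn fake_storage_example() {
        let (mut manager, handle) = FakeStorageManager::new();
        // Drive the fake's receive loop, just as sled-agent does for the real manager.
        tokio::spawn(async move { manager.run().await });

        // Only synthetic disks are accepted; a real disk panics the fake.
        let zpool = ZpoolName::new_internal(Uuid::new_v4());
        let disk: RawDisk = SyntheticDisk::new(zpool).into();
        handle.upsert_disk(disk).await;

        // Requests flow through the same StorageHandle used by production code.
        let resources = handle.get_latest_resources().await;
        assert!(!resources.all_zone_bundle_directories().is_empty());
    }
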
+#[cfg(feature = "testing")] +pub struct FakeStorageManager { + rx: mpsc::Receiver, + resources: StorageResources, + resource_updates: watch::Sender, +} + +impl FakeStorageManager { + pub fn new() -> (Self, StorageHandle) { + let (tx, rx) = mpsc::channel(QUEUE_SIZE); + let resources = StorageResources::default(); + let (update_tx, update_rx) = watch::channel(resources.clone()); + ( + Self { rx, resources, resource_updates: update_tx }, + StorageHandle { tx, resource_updates: update_rx }, + ) + } + + /// Run the main receive loop of the `StorageManager` + /// + /// This should be spawned into a tokio task + pub async fn run(&mut self) { + loop { + // The sending side should never disappear + match self.rx.recv().await.unwrap() { + StorageRequest::AddDisk(raw_disk) => { + if self.add_disk(raw_disk) { + self.resource_updates + .send_replace(self.resources.clone()); + } + } + StorageRequest::GetLatestResources(tx) => { + let _ = tx.send(self.resources.clone()); + } + _ => { + unreachable!(); + } + } + } + } + + // Add a disk to `StorageResources` if it is new and return Ok(true) if so + fn add_disk(&mut self, raw_disk: RawDisk) -> bool { + let disk = match raw_disk { + RawDisk::Real(_) => { + panic!( + "Only synthetic disks can be used with `FakeStorageManager`" + ); + } + RawDisk::Synthetic(synthetic_disk) => { + Disk::Synthetic(synthetic_disk) + } + }; + self.resources.insert_fake_disk(disk) + } +} + /// The storage manager responsible for the state of the storage /// on a sled. The storage manager runs in its own task and is interacted /// with via the [`StorageHandle`]. diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs index bac851df46..cc71aeb19d 100644 --- a/sled-storage/src/pool.rs +++ b/sled-storage/src/pool.rs @@ -25,4 +25,11 @@ impl Pool { let info = Zpool::get_info(&name.to_string())?; Ok(Pool { name, info, parent }) } + + /// Return a Pool consisting of fake info + #[cfg(feature = "testing")] + pub fn new_with_fake_info(name: ZpoolName, parent: DiskIdentity) -> Pool { + let info = ZpoolInfo::new_hardcoded(name.to_string()); + Pool { name, info, parent } + } } diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 64136e756d..ffb0e93ef2 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -65,6 +65,24 @@ impl StorageResources { Ok(true) } + /// Insert a disk while creating a fake pool + /// This is a workaround for current mock based testing strategies + /// in the sled-agent. + /// + /// Return true, if data was changed, false otherwise + #[cfg(feature = "testing")] + pub fn insert_fake_disk(&mut self, disk: Disk) -> bool { + let disk_id = disk.identity().clone(); + let zpool_name = disk.zpool_name().clone(); + let zpool = Pool::new_with_fake_info(zpool_name, disk_id.clone()); + if self.disks.contains_key(&disk_id) { + return false; + } + // Either the disk or zpool changed + Arc::make_mut(&mut self.disks).insert(disk_id, (disk, zpool)); + true + } + /// Delete a real disk and its zpool /// /// Return true, if data was changed, false otherwise From 79bd794ee4ecc2911d5138602a1b9a261a84ce3b Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Tue, 24 Oct 2023 16:03:12 +0000 Subject: [PATCH 35/66] fix zone bundle tests --- sled-agent/src/zone_bundle.rs | 103 ++++++++++++++++++++++++---------- sled-storage/src/manager.rs | 2 +- sled-storage/src/resources.rs | 4 +- 3 files changed, 76 insertions(+), 33 deletions(-) diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index c2f6fceadf..004a0ac646 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -1773,17 +1773,17 @@ mod illumos_tests { use super::ZoneBundleInfo; use super::ZoneBundleMetadata; use super::ZoneBundler; - use crate::bootstrap::secret_retriever::HardcodedSecretRetriever; + use super::ZFS; use anyhow::Context; use chrono::TimeZone; use chrono::Utc; - use illumos_utils::zpool::{Zpool, ZpoolName}; - use key_manager::KeyManager; + use illumos_utils::zpool::ZpoolName; use sled_storage::disk::RawDisk; use sled_storage::disk::SyntheticDisk; - use sled_storage::manager::{StorageHandle, StorageManager}; + use sled_storage::manager::{FakeStorageManager, StorageHandle}; use slog::Drain; use slog::Logger; + use tokio::process::Command; #[tokio::test] async fn test_zfs_quota() { @@ -1827,35 +1827,20 @@ mod illumos_tests { dirs: Vec, } - async fn setup_storage(log: &Logger) -> (StorageHandle, Vec) { - let (mut key_manager, key_requester) = - KeyManager::new(log, HardcodedSecretRetriever {}); - let (mut manager, handle) = StorageManager::new(log, key_requester); - - // Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); + async fn setup_storage() -> (StorageHandle, Vec) { + let (mut manager, handle) = FakeStorageManager::new(); // Spawn the storage manager as done by sled-agent tokio::spawn(async move { manager.run().await; }); - // Inform the storage manager that the secret retriever is ready We - // are using the HardcodedSecretRetriever, so no need to wait for RSS - // or anything to setup the LRTQ - handle.key_manager_ready().await; - - let tempdir = camino_tempfile::Utf8TempDir::new().unwrap(); - // These must be internal zpools let mut zpool_names = vec![]; for _ in 0..2 { let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); - let internal_disk: RawDisk = SyntheticDisk::create_zpool( - tempdir.path(), - &internal_zpool_name, - ) - .into(); + let internal_disk: RawDisk = + SyntheticDisk::new(internal_zpool_name.clone()).into(); handle.upsert_disk(internal_disk).await; zpool_names.push(internal_zpool_name); } @@ -1865,21 +1850,27 @@ mod illumos_tests { impl ResourceWrapper { // Create new storage resources, and mount fake datasets at the required // locations. - async fn new(log: Logger) -> Self { + async fn new() -> Self { // Spawn the storage related tasks required for testing and insert // synthetic disks. 
- let (storage_handle, zpool_names) = setup_storage(&log).await; + let (storage_handle, zpool_names) = setup_storage().await; let resources = storage_handle.get_latest_resources().await; let dirs = resources.all_zone_bundle_directories(); - info!(log, "Initial dirs = {:?}", dirs); + for d in dirs.iter() { + let id = + d.components().nth(3).unwrap().as_str().parse().unwrap(); + create_test_dataset(&id, d).await.unwrap(); + } Self { storage_handle, zpool_names, dirs } } } impl Drop for ResourceWrapper { fn drop(&mut self) { - for name in &self.zpool_names { - Zpool::destroy(name).unwrap(); + for d in self.dirs.iter() { + let id = + d.components().nth(3).unwrap().as_str().parse().unwrap(); + remove_test_dataset(&id).unwrap(); } } } @@ -1891,7 +1882,7 @@ mod illumos_tests { let log = Logger::root(drain, slog::o!("component" => "fake-cleanup-task")); let context = CleanupContext::default(); - let resource_wrapper = ResourceWrapper::new(log.clone()).await; + let resource_wrapper = ResourceWrapper::new().await; let bundler = ZoneBundler::new( log, resource_wrapper.storage_handle.clone(), @@ -1935,7 +1926,59 @@ mod illumos_tests { // // This needs to be at least this big lest we get "out of space" errors when // creating. Not sure where those come from, but could be ZFS overhead. - const TEST_QUOTA: u64 = sled_storage::dataset::DEBUG_DATASET_QUOTA as u64; + const TEST_QUOTA: u64 = 1024 * 32; + + async fn create_test_dataset( + id: &Uuid, + mountpoint: &Utf8PathBuf, + ) -> anyhow::Result<()> { + let output = Command::new("/usr/bin/pfexec") + .arg(ZFS) + .arg("create") + .arg("-o") + .arg(format!("quota={TEST_QUOTA}")) + .arg("-o") + .arg(format!("mountpoint={mountpoint}")) + .arg(format!("rpool/{id}")) + .output() + .await + .context("failed to spawn zfs create operation")?; + anyhow::ensure!( + output.status.success(), + "zfs create operation failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + // Make the path operable by the test code. 
+ let output = Command::new("/usr/bin/pfexec") + .arg("chmod") + .arg("a+rw") + .arg(&mountpoint) + .output() + .await + .context("failed to spawn chmod operation")?; + anyhow::ensure!( + output.status.success(), + "chmod-ing the dataset failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + Ok(()) + } + + fn remove_test_dataset(id: &Uuid) -> anyhow::Result<()> { + let output = std::process::Command::new("/usr/bin/pfexec") + .arg(ZFS) + .arg("destroy") + .arg(format!("rpool/{id}")) + .output() + .context("failed to spawn zfs destroy operation")?; + anyhow::ensure!( + output.status.success(), + "zfs destroy operation failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + Ok(()) + } async fn run_test_with_zfs_dataset(test: T) where diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index e12fb337c8..410dcc225d 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -204,7 +204,7 @@ impl FakeStorageManager { } } - // Add a disk to `StorageResources` if it is new and return Ok(true) if so + // Add a disk to `StorageResources` if it is new and return true if so fn add_disk(&mut self, raw_disk: RawDisk) -> bool { let disk = match raw_disk { RawDisk::Real(_) => { diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index ffb0e93ef2..51fd3ae222 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -45,7 +45,7 @@ pub struct StorageResources { impl StorageResources { /// Insert a disk and its zpool /// - /// Return true, if data was changed, false otherwise + /// Return true if data was changed, false otherwise /// /// This really should not be used outside this crate, except for testing pub fn insert_disk(&mut self, disk: Disk) -> Result { @@ -69,7 +69,7 @@ impl StorageResources { /// This is a workaround for current mock based testing strategies /// in the sled-agent. /// - /// Return true, if data was changed, false otherwise + /// Return true if data was changed, false otherwise #[cfg(feature = "testing")] pub fn insert_fake_disk(&mut self, disk: Disk) -> bool { let disk_id = disk.identity().clone(); From 91742c6e79c04fccd832a8f20734da324858c85f Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 24 Oct 2023 16:15:33 +0000 Subject: [PATCH 36/66] wip --- sled-agent/src/zone_bundle.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 004a0ac646..34c49a2f25 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -1823,11 +1823,10 @@ mod illumos_tests { // expected disk structure. struct ResourceWrapper { storage_handle: StorageHandle, - zpool_names: Vec, dirs: Vec, } - async fn setup_storage() -> (StorageHandle, Vec) { + async fn setup_storage() -> StorageHandle { let (mut manager, handle) = FakeStorageManager::new(); // Spawn the storage manager as done by sled-agent @@ -1836,15 +1835,13 @@ mod illumos_tests { }); // These must be internal zpools - let mut zpool_names = vec![]; for _ in 0..2 { let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); let internal_disk: RawDisk = SyntheticDisk::new(internal_zpool_name.clone()).into(); handle.upsert_disk(internal_disk).await; - zpool_names.push(internal_zpool_name); } - (handle, zpool_names) + handle } impl ResourceWrapper { @@ -1853,7 +1850,7 @@ mod illumos_tests { async fn new() -> Self { // Spawn the storage related tasks required for testing and insert // synthetic disks. 
- let (storage_handle, zpool_names) = setup_storage().await; + let storage_handle = setup_storage().await; let resources = storage_handle.get_latest_resources().await; let dirs = resources.all_zone_bundle_directories(); for d in dirs.iter() { @@ -1861,7 +1858,7 @@ mod illumos_tests { d.components().nth(3).unwrap().as_str().parse().unwrap(); create_test_dataset(&id, d).await.unwrap(); } - Self { storage_handle, zpool_names, dirs } + Self { storage_handle, dirs } } } From 34903bc2cb1c25d7d68adf93beb2a384a52004fa Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 24 Oct 2023 17:40:52 +0000 Subject: [PATCH 37/66] wip --- sled-agent/src/bootstrap/{bootstore.rs => bootstore_setup.rs} | 0 sled-agent/src/bootstrap/mod.rs | 2 +- sled-agent/src/long_running_tasks.rs | 2 +- sled-storage/src/manager.rs | 1 + 4 files changed, 3 insertions(+), 2 deletions(-) rename sled-agent/src/bootstrap/{bootstore.rs => bootstore_setup.rs} (100%) diff --git a/sled-agent/src/bootstrap/bootstore.rs b/sled-agent/src/bootstrap/bootstore_setup.rs similarity index 100% rename from sled-agent/src/bootstrap/bootstore.rs rename to sled-agent/src/bootstrap/bootstore_setup.rs diff --git a/sled-agent/src/bootstrap/mod.rs b/sled-agent/src/bootstrap/mod.rs index 5bf25b8521..590e13c891 100644 --- a/sled-agent/src/bootstrap/mod.rs +++ b/sled-agent/src/bootstrap/mod.rs @@ -4,7 +4,7 @@ //! Bootstrap-related utilities -pub(crate) mod bootstore; +pub(crate) mod bootstore_setup; pub mod client; pub mod config; pub mod early_networking; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 714bd1e406..e87c990175 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -12,7 +12,7 @@ //! these tasks are supposed to run forever, and they can shutdown if their //! handles are dropped. -use crate::bootstrap::bootstore::{ +use crate::bootstrap::bootstore_setup::{ new_bootstore_config, poll_ddmd_for_bootstore_peer_update, }; use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 410dcc225d..667264b4b7 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -170,6 +170,7 @@ pub struct FakeStorageManager { resource_updates: watch::Sender, } +#[cfg(feature = "testing")] impl FakeStorageManager { pub fn new() -> (Self, StorageHandle) { let (tx, rx) = mpsc::channel(QUEUE_SIZE); From bbddcb4df496c35eef59747b4b036217c9ebf359 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Tue, 24 Oct 2023 19:50:41 +0000 Subject: [PATCH 38/66] cargo check --all-targets works --- Cargo.lock | 1 + installinator/Cargo.toml | 1 + installinator/src/hardware.rs | 6 ++++-- installinator/src/write.rs | 5 +++-- sled-agent/src/bootstrap/secret_retriever.rs | 2 +- sled-storage/src/disk.rs | 7 +++++++ 6 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9cc8995cb6..380bcf8bf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3496,6 +3496,7 @@ dependencies = [ "serde", "sha2", "sled-hardware", + "sled-storage", "slog", "slog-async", "slog-envlogger", diff --git a/installinator/Cargo.toml b/installinator/Cargo.toml index c0e7625e6e..cda32a60d3 100644 --- a/installinator/Cargo.toml +++ b/installinator/Cargo.toml @@ -30,6 +30,7 @@ reqwest.workspace = true serde.workspace = true sha2.workspace = true sled-hardware.workspace = true +sled-storage.workspace = true slog.workspace = true slog-async.workspace = true slog-envlogger.workspace = true diff --git a/installinator/src/hardware.rs b/installinator/src/hardware.rs index ffa0b74739..b037384cbe 100644 --- a/installinator/src/hardware.rs +++ b/installinator/src/hardware.rs @@ -6,10 +6,11 @@ use anyhow::anyhow; use anyhow::ensure; use anyhow::Context; use anyhow::Result; -use sled_hardware::Disk; use sled_hardware::DiskVariant; use sled_hardware::HardwareManager; use sled_hardware::SledMode; +use sled_storage::disk::Disk; +use sled_storage::disk::RawDisk; use slog::info; use slog::Logger; @@ -28,7 +29,8 @@ impl Hardware { anyhow!("failed to create HardwareManager: {err}") })?; - let disks = hardware.disks(); + let disks: Vec = + hardware.disks().into_iter().map(|disk| disk.into()).collect(); info!( log, "found gimlet hardware"; diff --git a/installinator/src/write.rs b/installinator/src/write.rs index 6c0c1f63c7..22dd2adbf6 100644 --- a/installinator/src/write.rs +++ b/installinator/src/write.rs @@ -122,8 +122,9 @@ impl WriteDestination { ); let zpool_name = disk.zpool_name().clone(); - let control_plane_dir = zpool_name - .dataset_mountpoint(sled_hardware::INSTALL_DATASET); + let control_plane_dir = zpool_name.dataset_mountpoint( + sled_storage::dataset::INSTALL_DATASET, + ); match drives.entry(slot) { Entry::Vacant(entry) => { diff --git a/sled-agent/src/bootstrap/secret_retriever.rs b/sled-agent/src/bootstrap/secret_retriever.rs index d6b542378d..5cae06310c 100644 --- a/sled-agent/src/bootstrap/secret_retriever.rs +++ b/sled-agent/src/bootstrap/secret_retriever.rs @@ -92,7 +92,7 @@ impl LrtqOrHardcodedSecretRetriever { /// /// The local retriever only returns keys for epoch 0 #[derive(Debug)] -pub struct HardcodedSecretRetriever {} +struct HardcodedSecretRetriever {} #[async_trait] impl SecretRetriever for HardcodedSecretRetriever { diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs index f5a0e60c8f..f5209def77 100644 --- a/sled-storage/src/disk.rs +++ b/sled-storage/src/disk.rs @@ -116,6 +116,13 @@ impl RawDisk { pub fn is_real(&self) -> bool { !self.is_synthetic() } + + pub fn devfs_path(&self) -> &Utf8PathBuf { + match self { + Self::Real(disk) => disk.devfs_path(), + Self::Synthetic(_) => unreachable!(), + } + } } /// A physical [`PooledDisk`] or a [`SyntheticDisk`] that contains or is backed From b44defa8a9915a031dd6f3b89db56627a5c1fe2c Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Tue, 24 Oct 2023 22:56:46 +0000 Subject: [PATCH 39/66] feed hikari --- Cargo.lock | 1 + sled-agent/src/bootstrap/server.rs | 10 ---------- sled-storage/Cargo.toml | 1 + 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 524db338d6..85eb29e1a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8210,6 +8210,7 @@ dependencies = [ "nexus-client 0.1.0", "omicron-common 0.1.0", "omicron-test-utils", + "omicron-workspace-hack", "rand 0.8.5", "schemars", "serde", diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 17d988e749..635b33893d 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -25,7 +25,6 @@ use crate::config::ConfigError; use crate::long_running_tasks::LongRunningTaskHandles; use crate::server::Server as SledAgentServer; use crate::services::ServiceManager; -use crate::sled_agent::SledAgent; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; @@ -312,15 +311,6 @@ enum SledAgentState { ServerStarted(SledAgentServer), } -impl SledAgentState { - fn sled_agent(&self) -> Option<&SledAgent> { - match self { - SledAgentState::Bootstrapping => None, - SledAgentState::ServerStarted(server) => Some(server.sled_agent()), - } - } -} - #[derive(thiserror::Error, Debug)] pub enum SledAgentServerStartError { #[error("Failed to start sled-agent server: {0}")] diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index efb6afd6bc..38863e160b 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -27,6 +27,7 @@ slog.workspace = true thiserror.workspace = true tokio.workspace = true uuid.workspace = true +omicron-workspace-hack.workspace = true [dev-dependencies] illumos-utils = { workspace = true, features = ["tmp_keypath"] } From d60994cb76daebfbe18e74994c0a9085b7be1e6f Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 25 Oct 2023 19:50:06 +0000 Subject: [PATCH 40/66] tests pass --- illumos-utils/src/lib.rs | 27 ++++++++++++++++++++++++++- sled-agent/src/services.rs | 3 ++- sled-storage/src/manager.rs | 8 ++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/illumos-utils/src/lib.rs b/illumos-utils/src/lib.rs index 345f097ae2..e50b35849a 100644 --- a/illumos-utils/src/lib.rs +++ b/illumos-utils/src/lib.rs @@ -4,6 +4,8 @@ //! Wrappers around illumos-specific commands. +use std::sync::atomic::{AtomicBool, Ordering}; + use cfg_if::cfg_if; pub mod addrobj; @@ -93,7 +95,7 @@ mod inner { // Helper function for starting the process and checking the // exit code result. - pub fn execute( + pub fn execute_helper( command: &mut std::process::Command, ) -> Result { let output = command.output().map_err(|err| { @@ -108,6 +110,29 @@ mod inner { } } +// Due to feature unification, the `testing` feature is enabled when some tests +// don't actually want to use it. We allow them to opt out of the use of the +// free function here. We also explicitly opt-in where mocks are used. +// +// We can remove all this when we get rid of the mocks. +pub static USE_MOCKS: AtomicBool = AtomicBool::new(false); + +pub fn execute( + command: &mut std::process::Command, +) -> Result { + cfg_if! { + if #[cfg(any(test, feature = "testing"))] { + if USE_MOCKS.load(Ordering::SeqCst) { + mock_inner::execute_helper(command) + } else { + inner::execute_helper(command) + } + } else { + inner::execute_helper(command) + } + } +} + cfg_if! 
{ if #[cfg(any(test, feature = "testing"))] { pub use mock_inner::*; diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index b9a8cdada0..c692cca309 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -3037,6 +3037,7 @@ mod test { // Returns the expectations for a new service to be created. fn expect_new_service() -> Vec> { + illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst); // Create a VNIC let create_vnic_ctx = MockDladm::create_vnic_context(); create_vnic_ctx.expect().return_once( @@ -3079,7 +3080,7 @@ mod test { let wait_ctx = svc::wait_for_service_context(); wait_ctx.expect().return_once(|_, _| Ok(())); - let execute_ctx = illumos_utils::execute_context(); + let execute_ctx = illumos_utils::execute_helper_context(); execute_ctx.expect().times(..).returning(|_| { Ok(std::process::Output { status: std::process::ExitStatus::from_raw(0), diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index fd0e607348..46fb34a3bf 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -606,6 +606,7 @@ mod tests { #[tokio::test] async fn add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log( "add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued", ); @@ -630,6 +631,7 @@ mod tests { #[tokio::test] async fn ensure_u2_gets_added_to_resources() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log("ensure_u2_gets_added_to_resources"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); @@ -651,6 +653,7 @@ mod tests { #[tokio::test] async fn wait_for_bootdisk() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log("wait_for_bootdisk"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); @@ -677,6 +680,7 @@ mod tests { #[tokio::test] async fn queued_disks_get_added_as_resources() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log("queued_disks_get_added_as_resources"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); @@ -714,6 +718,7 @@ mod tests { /// This allows us to control timing precisely. 
#[tokio::test] async fn queued_disks_get_requeued_on_secret_retriever_error() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log( "queued_disks_get_requeued_on_secret_retriever_error", ); @@ -766,6 +771,7 @@ mod tests { #[tokio::test] async fn delete_disk_triggers_notification() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log("delete_disk_triggers_notification"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); @@ -806,6 +812,7 @@ mod tests { #[tokio::test] async fn ensure_using_exactly_these_disks() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log("ensure_using_exactly_these_disks"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); @@ -922,6 +929,7 @@ mod tests { #[tokio::test] async fn upsert_filesystem() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log("upsert_filesystem"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); From 564f44bbd1e01c91df20f14697eef240f84744cd Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 25 Oct 2023 19:54:12 +0000 Subject: [PATCH 41/66] addendum --- illumos-utils/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/illumos-utils/src/lib.rs b/illumos-utils/src/lib.rs index e50b35849a..c882f258c9 100644 --- a/illumos-utils/src/lib.rs +++ b/illumos-utils/src/lib.rs @@ -114,6 +114,10 @@ mod inner { // don't actually want to use it. We allow them to opt out of the use of the // free function here. We also explicitly opt-in where mocks are used. // +// Note that this only works if the tests that use mocks and those that don't +// are run sequentially. However, this is how we do things in CI with nextest, +// so there is no problem currently. +// // We can remove all this when we get rid of the mocks. pub static USE_MOCKS: AtomicBool = AtomicBool::new(false); From 396fddad47d4d4f456d77cdf30b3a06391a80bd9 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 25 Oct 2023 21:11:49 +0000 Subject: [PATCH 42/66] clippy clean --- sled-agent/src/bootstrap/bootstore_setup.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sled-agent/src/bootstrap/bootstore_setup.rs b/sled-agent/src/bootstrap/bootstore_setup.rs index 3c7e860b4a..9eb0a87c03 100644 --- a/sled-agent/src/bootstrap/bootstore_setup.rs +++ b/sled-agent/src/bootstrap/bootstore_setup.rs @@ -5,6 +5,8 @@ //! Helpers for configuring and starting the bootstore during bootstrap agent //! startup. +#![allow(clippy::result_large_err)] + use super::config::BOOTSTORE_PORT; use super::server::StartError; use bootstore::schemes::v0 as bootstore; From 56a614f11b66b660da05e34d7f6eefe72f2d0adf Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 25 Oct 2023 18:10:51 -0400 Subject: [PATCH 43/66] fix sim builds --- sled-hardware/src/non_illumos/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sled-hardware/src/non_illumos/mod.rs b/sled-hardware/src/non_illumos/mod.rs index 6e36330df0..d8372dd8aa 100644 --- a/sled-hardware/src/non_illumos/mod.rs +++ b/sled-hardware/src/non_illumos/mod.rs @@ -2,7 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-use crate::disk::{DiskError, DiskPaths, DiskVariant, Partition, UnparsedDisk}; +use crate::disk::{ + DiskPaths, DiskVariant, Partition, PooledDiskError, UnparsedDisk, +}; use crate::{Baseboard, SledMode}; use slog::Logger; use std::collections::HashSet; @@ -16,6 +18,7 @@ use tokio::sync::broadcast; /// /// If you're actually trying to run the Sled Agent on non-illumos platforms, /// use the simulated sled agent, which does not attempt to abstract hardware. +#[derive(Clone)] pub struct HardwareManager {} impl HardwareManager { @@ -56,7 +59,7 @@ pub fn ensure_partition_layout( _log: &Logger, _paths: &DiskPaths, _variant: DiskVariant, -) -> Result, DiskError> { +) -> Result, PooledDiskError> { unimplemented!("Accessing hardware unsupported on non-illumos"); } From 5e74730aba439e34c1c44ffbf2b5fa2d470154d8 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 25 Oct 2023 22:13:05 +0000 Subject: [PATCH 44/66] fix doc build --- illumos-utils/src/lib.rs | 1 + nexus/src/app/background/common.rs | 4 ++-- sled-agent/src/services.rs | 2 +- sled-agent/src/storage_monitor.rs | 2 +- sled-hardware/src/disk.rs | 6 +++--- sled-storage/src/resources.rs | 4 ++-- wicketd/src/artifacts/extracted_artifacts.rs | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/illumos-utils/src/lib.rs b/illumos-utils/src/lib.rs index c882f258c9..3b696d178b 100644 --- a/illumos-utils/src/lib.rs +++ b/illumos-utils/src/lib.rs @@ -4,6 +4,7 @@ //! Wrappers around illumos-specific commands. +#[allow(unused)] use std::sync::atomic::{AtomicBool, Ordering}; use cfg_if::cfg_if; diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/common.rs index 3fcf0483a5..b5291cf93c 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/common.rs @@ -177,7 +177,7 @@ pub struct Driver { /// /// This is returned by [`Driver::register()`] to identify the corresponding /// background task. It's then accepted by functions like -/// [`Driver::activate()`] and [`Driver::status()`] to identify the task. +/// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. #[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] pub struct TaskHandle(String); @@ -277,7 +277,7 @@ impl Driver { /// Enumerate all registered background tasks /// /// This is aimed at callers that want to get the status of all background - /// tasks. You'd call [`Driver::status()`] with each of the items produced + /// tasks. You'd call [`Driver::task_status()`] with each of the items produced /// by the iterator. pub fn tasks(&self) -> impl Iterator { self.tasks.keys() diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index c692cca309..384a745285 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -5,7 +5,7 @@ //! Sled-local service management. //! //! For controlling zone-based storage services, refer to -//! [sled_storage:manager::StorageManager]. +//! [sled_storage::manager::StorageManager]. //! //! For controlling virtual machine instances, refer to //! [crate::instance_manager::InstanceManager]. diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index da9f6b4897..1a98a54f25 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! A task that listens for storage events from [`sled_storage::StorageMonitor`] +//! 
A task that listens for storage events from [`sled_storage::manager::StorageManager`] //! and dispatches them to other parts of the bootstrap agent and sled agent //! code. diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index aeaca9dc31..44658658be 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -118,9 +118,9 @@ impl DiskPaths { /// A disk which has been observed by monitoring hardware. /// /// No guarantees are made about the partitions which exist within this disk. -/// This exists as a distinct entity from [Disk] because it may be desirable to -/// monitor for hardware in one context, and conform disks to partition layouts -/// in a different context. +/// This exists as a distinct entity from `Disk` in `sled-storage` because it +/// may be desirable to monitor for hardware in one context, and conform disks +/// to partition layouts in a different context. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct UnparsedDisk { paths: DiskPaths, diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 51fd3ae222..07c46e1265 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -23,9 +23,9 @@ const ZONE_BUNDLE_DIRECTORY: &str = "zone"; /// Storage related resources: disks and zpools /// -/// This state is internal to the [`crate::StorageManager`] task. Clones +/// This state is internal to the [`crate::manager::StorageManager`] task. Clones /// of this state can be retrieved by requests to the `StorageManager` task -/// from the [`crate::StorageManagerHandle`]. This state is not `Sync`, and +/// from the [`crate::manager::StorageHandle`]. This state is not `Sync`, and /// as such does not require any mutexes. However, we do expect to share it /// relatively frequently, and we want copies of it to be as cheaply made /// as possible. So any large state is stored inside `Arc`s. On the other diff --git a/wicketd/src/artifacts/extracted_artifacts.rs b/wicketd/src/artifacts/extracted_artifacts.rs index 352d8ad3d5..b796201936 100644 --- a/wicketd/src/artifacts/extracted_artifacts.rs +++ b/wicketd/src/artifacts/extracted_artifacts.rs @@ -169,7 +169,7 @@ impl ExtractedArtifacts { /// /// As the returned file is written to, the data will be hashed; once /// writing is complete, call [`ExtractedArtifacts::store_tempfile()`] to - /// persist the temporary file into an [`ExtractedArtifactDataHandle()`]. + /// persist the temporary file into an [`ExtractedArtifactDataHandle`]. pub(super) fn new_tempfile( &self, ) -> Result { From fa35e2839b853800994c5bd3f6a2f120ba5203eb Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 26 Oct 2023 00:30:43 +0000 Subject: [PATCH 45/66] deadlock fix + logging --- sled-agent/src/bootstrap/pre_server.rs | 27 ---------------------- sled-agent/src/long_running_tasks.rs | 32 +++++++++++++++++++++++++- sled-agent/src/services.rs | 1 + sled-storage/src/manager.rs | 1 + 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 8a25023adb..7af9bbbf68 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -34,8 +34,6 @@ use omicron_common::FileKv; use sled_hardware::underlay; use sled_hardware::DendriteAsic; use sled_hardware::SledMode; -use sled_storage::disk::SyntheticDisk; -use sled_storage::manager::StorageHandle; use slog::Drain; use slog::Logger; use std::net::IpAddr; @@ -111,13 +109,6 @@ impl BootstrapAgentStartup { &base_log, sled_mode, startup_networking.global_zone_bootstrap_ip, - ) - .await; - - // Add some synthetic disks if necessary. - upsert_synthetic_zpools_if_needed( - &log, - &long_running_task_handles.storage_manager, &config, ) .await; @@ -275,24 +266,6 @@ fn ensure_zfs_ramdisk_dataset() -> Result<(), StartError> { .map_err(StartError::EnsureZfsRamdiskDataset) } -async fn upsert_synthetic_zpools_if_needed( - log: &Logger, - storage_manager: &StorageHandle, - config: &Config, -) { - if let Some(pools) = &config.zpools { - for pool in pools { - info!( - log, - "Upserting synthetic zpool to Storage Manager: {}", - pool.to_string() - ); - let disk = SyntheticDisk::new(pool.clone()).into(); - storage_manager.upsert_disk(disk).await; - } - } -} - // Combine the `sled_mode` config with the build-time switch type to determine // the actual sled mode. fn sled_mode_from_config(config: &Config) -> Result { diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index e87c990175..cf74f54e7e 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -16,12 +16,14 @@ use crate::bootstrap::bootstore_setup::{ new_bootstore_config, poll_ddmd_for_bootstore_peer_update, }; use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; +use crate::config::Config; use crate::hardware_monitor::{HardwareMonitor, HardwareMonitorHandle}; use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; use sled_hardware::{HardwareManager, SledMode}; +use sled_storage::disk::SyntheticDisk; use sled_storage::manager::{StorageHandle, StorageManager}; use slog::{info, Logger}; use std::net::Ipv6Addr; @@ -64,6 +66,7 @@ pub async fn spawn_all_longrunning_tasks( log: &Logger, sled_mode: SledMode, global_zone_bootstrap_ip: Ipv6Addr, + config: &Config, ) -> LongRunningTaskHandles { let storage_key_requester = spawn_key_manager(log); let mut storage_manager = @@ -78,9 +81,14 @@ pub async fn spawn_all_longrunning_tasks( let hardware_monitor = spawn_hardware_monitor(log, &hardware_manager, &storage_manager); + // Add some synthetic disks if necessary. 
+ upsert_synthetic_zpools_if_needed(&log, &storage_manager, &config).await; + // Wait for the boot disk so that we can work with any ledgers, // such as those needed by the bootstore and sled-agent - let _ = storage_manager.wait_for_boot_disk().await; + info!(log, "Waiting for boot disk"); + let (disk_id, _) = storage_manager.wait_for_boot_disk().await; + info!(log, "Found boot disk {:?}", disk_id); let bootstore = spawn_bootstore_tasks( log, @@ -158,6 +166,7 @@ fn spawn_hardware_monitor( hardware_manager: &HardwareManager, storage_handle: &StorageHandle, ) -> HardwareMonitorHandle { + info!(log, "Starting HardwareMonitor"); let (mut monitor, handle) = HardwareMonitor::new(log, hardware_manager, storage_handle); tokio::spawn(async move { @@ -181,10 +190,12 @@ async fn spawn_bootstore_tasks( .unwrap(); // Create and spawn the bootstore + info!(log, "Starting Bootstore"); let (mut node, node_handle) = bootstore::Node::new(config, log).await; tokio::spawn(async move { node.run().await }); // Spawn a task for polling DDMD and updating bootstore with peer addresses + info!(log, "Starting Bootstore DDMD poller"); let log = log.new(o!("component" => "bootstore_ddmd_poller")); let node_handle2 = node_handle.clone(); tokio::spawn(async move { @@ -199,6 +210,25 @@ fn spawn_zone_bundler_tasks( log: &Logger, storage_handle: &mut StorageHandle, ) -> ZoneBundler { + info!(log, "Starting ZoneBundler related tasks"); let log = log.new(o!("component" => "ZoneBundler")); ZoneBundler::new(log, storage_handle.clone(), CleanupContext::default()) } + +async fn upsert_synthetic_zpools_if_needed( + log: &Logger, + storage_manager: &StorageHandle, + config: &Config, +) { + if let Some(pools) = &config.zpools { + for pool in pools { + info!( + log, + "Upserting synthetic zpool to Storage Manager: {}", + pool.to_string() + ); + let disk = SyntheticDisk::new(pool.clone()).into(); + storage_manager.upsert_disk(disk).await; + } + } +} diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 384a745285..33859465cb 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -422,6 +422,7 @@ impl ServiceManager { zone_bundler: ZoneBundler, ) -> Self { let log = log.new(o!("component" => "ServiceManager")); + info!(log, "Creating ServiceManager"); Self { inner: Arc::new(ServiceManagerInner { log: log.clone(), diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 46fb34a3bf..c58316ef1b 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -335,6 +335,7 @@ impl StorageManager { // // Return true if updates should be sent to watchers, false otherwise async fn add_queued_disks(&mut self) -> bool { + info!(self.log, "Attempting to add queued disks"); self.state = StorageManagerState::Normal; let mut send_updates = false; From 999714952f6dc0cd0bb5d818088403e50e4d0bbe Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 26 Oct 2023 01:45:44 +0000 Subject: [PATCH 46/66] synthetic disk related fixes --- sled-storage/src/manager.rs | 5 ++++- sled-storage/src/resources.rs | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index c58316ef1b..19cd86df64 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -39,6 +39,7 @@ struct NewFilesystemRequest { responder: oneshot::Sender>, } +#[derive(Debug)] enum StorageRequest { AddDisk(RawDisk), RemoveDisk(RawDisk), @@ -286,7 +287,9 @@ impl StorageManager { /// This is useful for testing/debugging pub async fn step(&mut self) -> Result<(), Error> { // The sending side should never disappear - let should_send_updates = match self.rx.recv().await.unwrap() { + let req = self.rx.recv().await.unwrap(); + info!(self.log, "Received {:?}", req); + let should_send_updates = match req { StorageRequest::AddDisk(raw_disk) => { self.add_disk(raw_disk).await? } diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 07c46e1265..f3444ac798 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -83,9 +83,33 @@ impl StorageResources { true } + /// Delete a disk and its zpool + /// + /// Return true, if data was changed, false otherwise + /// + /// Note: We never allow removal of synthetic disks as they are only added + /// once. + #[cfg(not(test))] + pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) -> bool { + if let Some((disk, _)) = self.disks.get(id) { + if disk.is_synthetic() { + return false; + } + } else { + return false; + } + // Safe to unwrap as we just checked the key existed above + Arc::make_mut(&mut self.disks).remove(id).unwrap(); + true + } + /// Delete a real disk and its zpool /// /// Return true, if data was changed, false otherwise + /// + /// Note: For testing purposes of this crate, we allow synthetic disks to + /// be deleted. + #[cfg(test)] pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) -> bool { if !self.disks.contains_key(id) { return false; From 82e25053ceba10ced74ac04b3b16b5c1b8de79bd Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 26 Oct 2023 18:51:40 +0000 Subject: [PATCH 47/66] Fix M2 expected datasets --- sled-storage/src/dataset.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 4f39a20bb0..164141e875 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -63,7 +63,7 @@ static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [ .compression(DUMP_DATASET_COMPRESSION), ]; -const M2_EXPECTED_DATASET_COUNT: usize = 5; +const M2_EXPECTED_DATASET_COUNT: usize = 6; static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ // Stores software images. // @@ -71,6 +71,10 @@ static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ ExpectedDataset::new(INSTALL_DATASET), // Stores crash dumps. ExpectedDataset::new(CRASH_DATASET), + // Backing store for OS data that should be persisted across reboots. + // Its children are selectively overlay mounted onto parts of the ramdisk + // root. + ExpectedDataset::new(M2_BACKING_DATASET), // Stores cluter configuration information. // // Should be duplicated to both M.2s. From a622fec45eb56d7371d36120b34472f66d951031 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 26 Oct 2023 22:18:22 +0000 Subject: [PATCH 48/66] actually poll nexus storage notifications --- sled-agent/src/storage_monitor.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 1a98a54f25..a8fd899f2d 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -11,6 +11,7 @@ use crate::nexus::NexusClientWithResolver; use derive_more::From; use futures::stream::FuturesOrdered; use futures::FutureExt; +use futures::StreamExt; use nexus_client::types::PhysicalDiskDeleteRequest; use nexus_client::types::PhysicalDiskPutRequest; use nexus_client::types::ZpoolPutRequest; @@ -116,6 +117,11 @@ impl StorageMonitor { pub async fn run(&mut self) { loop { tokio::select! { + _ = self.nexus_notifications.next(), + if !self.nexus_notifications.is_empty() => + { + debug!(self.log, "Processing nexus notification"); + } resources = self.storage_manager.wait_for_changes() => { info!( self.log, From 6de271b3f9fb075ca62240e6303944cb75d240d9 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 27 Oct 2023 07:02:17 +0000 Subject: [PATCH 49/66] better logging for nexus requests --- sled-agent/src/storage_monitor.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index a8fd899f2d..696335636e 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -116,11 +116,12 @@ impl StorageMonitor { /// This should be spawned into a tokio task pub async fn run(&mut self) { loop { + info!(self.log, "looping again"); tokio::select! { - _ = self.nexus_notifications.next(), + res = self.nexus_notifications.next(), if !self.nexus_notifications.is_empty() => { - debug!(self.log, "Processing nexus notification"); + info!(self.log, "Nexus notification complete: {:?}", res); } resources = self.storage_manager.wait_for_changes() => { info!( @@ -134,7 +135,7 @@ impl StorageMonitor { info!( self.log, "Received storage monitor message"; - "msg" => ?msg + "monitor_msg" => ?msg ); self.handle_monitor_msg(msg).await; } @@ -227,7 +228,8 @@ impl StorageMonitor { })?; } } - Ok(()) + let msg = format!("{:?}", disk); + Ok(msg) } }; @@ -235,11 +237,14 @@ impl StorageMonitor { // This notification is often invoked before Nexus has started // running, so avoid flagging any errors as concerning until some // time has passed. 
- let log_post_failure = move |_, call_count, total_duration| { + let log_post_failure = move |err, call_count, total_duration| { if call_count == 0 { - info!(log, "failed to notify nexus about {disk2:?}"); + info!(log, "failed to notify nexus about {disk2:?}"; + "err" => ?err + ); } else if total_duration > std::time::Duration::from_secs(30) { warn!(log, "failed to notify nexus about {disk2:?}"; + "err" => ?err, "total duration" => ?total_duration); } }; @@ -275,18 +280,21 @@ impl StorageMonitor { .map_err(|e| { backoff::BackoffError::transient(e.to_string()) })?; - Ok(()) + let msg = format!("{:?}", zpool_request); + Ok(msg) } }; let log = self.log.clone(); let name = pool.name.clone(); let disk = pool.parent.clone(); - let log_post_failure = move |_, call_count, total_duration| { + let log_post_failure = move |err, call_count, total_duration| { if call_count == 0 { - info!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"); + info!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; + "err" => ?err); } else if total_duration > std::time::Duration::from_secs(30) { warn!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; + "err" => ?err, "total duration" => ?total_duration); } }; @@ -303,7 +311,7 @@ impl StorageMonitor { // The type of a future which is used to send a notification to Nexus. type NotifyFut = - Pin> + Send>>; + Pin> + Send>>; struct NexusUpdates { disk_puts: Vec, From adedecc5adfdefd38f7d3a16d40f3cc1fc32b599 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 27 Oct 2023 08:15:33 +0000 Subject: [PATCH 50/66] actually add zpools to nexus --- sled-agent/src/storage_monitor.rs | 42 ++++++++++++++++++------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 696335636e..0b4475ce74 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -17,6 +17,7 @@ use nexus_client::types::PhysicalDiskPutRequest; use nexus_client::types::ZpoolPutRequest; use omicron_common::api::external::ByteCount; use omicron_common::backoff; +use omicron_common::disk::DiskIdentity; use sled_storage::manager::StorageHandle; use sled_storage::pool::Pool; use sled_storage::resources::StorageResources; @@ -329,6 +330,27 @@ fn compute_resource_diffs( let mut disk_deletes = vec![]; let mut zpool_puts = vec![]; + let mut put_pool = |disk_id: &DiskIdentity, updated_pool: &Pool| { + match ByteCount::try_from(updated_pool.info.size()) { + Ok(size) => zpool_puts.push(( + updated_pool.clone(), + ZpoolPutRequest { + size: size.into(), + disk_model: disk_id.model.clone(), + disk_serial: disk_id.serial.clone(), + disk_vendor: disk_id.vendor.clone(), + }, + )), + Err(err) => { + error!( + log, + "Error parsing pool size"; + "name" => updated_pool.name.to_string(), + "err" => ?err); + } + } + }; + // Diff the existing resources with the update to see what has changed // This loop finds disks and pools that were modified or deleted for (disk_id, (disk, pool)) in current.disks.iter() { @@ -344,22 +366,7 @@ fn compute_resource_diffs( }); } if pool != updated_pool { - match ByteCount::try_from(pool.info.size()) { - Ok(size) => zpool_puts.push(( - pool.clone(), - ZpoolPutRequest { - size: size.into(), - disk_model: disk_id.model.clone(), - disk_serial: disk_id.serial.clone(), - disk_vendor: disk_id.vendor.clone(), - }, - )), - Err(err) => error!( - log, - "Error parsing pool size"; - "name" => pool.name.to_string(), - "err" => ?err), 
- } + put_pool(disk_id, updated_pool); } } None => disk_deletes.push(PhysicalDiskDeleteRequest { @@ -373,7 +380,7 @@ fn compute_resource_diffs( // Diff the existing resources with the update to see what has changed // This loop finds new disks and pools - for (disk_id, (updated_disk, _)) in updated.disks.iter() { + for (disk_id, (updated_disk, updated_pool)) in updated.disks.iter() { if !current.disks.contains_key(disk_id) { disk_puts.push(PhysicalDiskPutRequest { sled_id: *sled_id, @@ -382,6 +389,7 @@ fn compute_resource_diffs( vendor: disk_id.vendor.clone(), variant: updated_disk.variant().into(), }); + put_pool(disk_id, updated_pool); } } From c8db7c88dec4030f29bea0520dea43e81187c8dd Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 27 Oct 2023 23:14:17 +0000 Subject: [PATCH 51/66] remove unnecessary prints --- sled-agent/src/storage_monitor.rs | 1 - sled-agent/src/zone_bundle.rs | 1 - 2 files changed, 2 deletions(-) diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 0b4475ce74..71e61e84c2 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -117,7 +117,6 @@ impl StorageMonitor { /// This should be spawned into a tokio task pub async fn run(&mut self) { loop { - info!(self.log, "looping again"); tokio::select! { res = self.nexus_notifications.next(), if !self.nexus_notifications.is_empty() => diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index d2615ffce6..91604b7099 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -257,7 +257,6 @@ impl Inner { async fn bundle_directories(&self) -> Vec { let resources = self.storage_handle.get_latest_resources().await; let expected = resources.all_zone_bundle_directories(); - println!("dirs = {:?}", expected); let mut out = Vec::with_capacity(expected.len()); for each in expected.into_iter() { if tokio::fs::create_dir_all(&each).await.is_ok() { From a7a1971e99ef8d58823c3bcf921c2c65694d9739 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Wed, 8 Nov 2023 20:38:57 +0000 Subject: [PATCH 52/66] Some review fixes --- Cargo.lock | 4 +-- clients/nexus-client/Cargo.toml | 1 + clients/nexus-client/src/lib.rs | 14 ++++++++++ clients/sled-agent-client/Cargo.toml | 1 + clients/sled-agent-client/src/lib.rs | 25 +++++++++++++++++ illumos-utils/src/zfs.rs | 4 +-- sled-storage/Cargo.toml | 6 ---- sled-storage/src/dataset.rs | 41 ---------------------------- sled-storage/src/error.rs | 3 -- sled-storage/src/keyfile.rs | 3 -- 10 files changed, 44 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 85eb29e1a1..c62148cdd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4341,6 +4341,7 @@ dependencies = [ "serde", "serde_json", "sled-hardware", + "sled-storage", "slog", "uuid", ] @@ -8164,6 +8165,7 @@ dependencies = [ "regress", "reqwest", "serde", + "sled-storage", "slog", "uuid", ] @@ -8207,7 +8209,6 @@ dependencies = [ "glob", "illumos-utils", "key-manager", - "nexus-client 0.1.0", "omicron-common 0.1.0", "omicron-test-utils", "omicron-workspace-hack", @@ -8215,7 +8216,6 @@ dependencies = [ "schemars", "serde", "serde_json", - "sled-agent-client", "sled-hardware", "slog", "thiserror", diff --git a/clients/nexus-client/Cargo.toml b/clients/nexus-client/Cargo.toml index 1085cf1ec9..239cb77789 100644 --- a/clients/nexus-client/Cargo.toml +++ b/clients/nexus-client/Cargo.toml @@ -11,6 +11,7 @@ ipnetwork.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true sled-hardware.workspace = true +sled-storage.workspace = true progenitor.workspace = true regress.workspace = true reqwest = { workspace = true, features = ["rustls-tls", "stream"] } diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 71a555476b..9f81492d10 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -407,3 +407,17 @@ impl From for types::Baseboard { } } } + +impl From for types::DatasetKind { + fn from(k: sled_storage::dataset::DatasetKind) -> Self { + use sled_storage::dataset::DatasetKind::*; + match k { + CockroachDb => Self::Cockroach, + Crucible => Self::Crucible, + Clickhouse => Self::Clickhouse, + ClickhouseKeeper => Self::ClickhouseKeeper, + ExternalDns => Self::ExternalDns, + InternalDns => Self::InternalDns, + } + } +} diff --git a/clients/sled-agent-client/Cargo.toml b/clients/sled-agent-client/Cargo.toml index b2ed07caba..e2cc737e70 100644 --- a/clients/sled-agent-client/Cargo.toml +++ b/clients/sled-agent-client/Cargo.toml @@ -14,5 +14,6 @@ regress.workspace = true reqwest = { workspace = true, features = [ "json", "rustls-tls", "stream" ] } serde.workspace = true slog.workspace = true +sled-storage.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 0df21d894e..30b554a021 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -6,6 +6,7 @@ use async_trait::async_trait; use std::convert::TryFrom; +use std::str::FromStr; use uuid::Uuid; progenitor::generate_api!( @@ -528,3 +529,27 @@ impl TestInterfaces for Client { .expect("disk_finish_transition() failed unexpectedly"); } } + +impl From for types::DatasetKind { + fn from(k: sled_storage::dataset::DatasetKind) -> Self { + use sled_storage::dataset::DatasetKind::*; + match k { + CockroachDb => Self::CockroachDb, + Crucible => Self::Crucible, + Clickhouse => Self::Clickhouse, + ClickhouseKeeper => Self::ClickhouseKeeper, + ExternalDns => 
Self::ExternalDns, + InternalDns => Self::InternalDns, + } + } +} + +impl From for types::DatasetName { + fn from(n: sled_storage::dataset::DatasetName) -> Self { + Self { + pool_name: types::ZpoolName::from_str(&n.pool().to_string()) + .unwrap(), + kind: n.dataset().clone().into(), + } + } +} diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index 0d8f468705..e9554100af 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -184,9 +184,7 @@ impl From<&DiskIdentity> for Keypath { fn build_keypath(id: &DiskIdentity, root: &str) -> Keypath { let filename = format!("{}-{}-{}-zfs-aes-256-gcm.key", id.vendor, id.serial, id.model); - let mut path = Utf8PathBuf::new(); - path.push(root); - path.push(filename); + let path: Utf8PathBuf = [root, &filename].iter().collect(); Keypath(path) } diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index 38863e160b..82ab206a8e 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -11,17 +11,11 @@ derive_more.workspace = true glob.workspace = true illumos-utils.workspace = true key-manager.workspace = true -# Needed strictly for parameter type conversion -# We could put this in the nexus-client instead -nexus-client.workspace = true omicron-common.workspace = true rand.workspace = true schemars = { workspace = true, features = [ "chrono", "uuid1" ] } serde.workspace = true serde_json.workspace = true -# Needed strictly for parameter type conversion -# We could put this in the sled-agent-client instead -sled-agent-client.workspace = true sled-hardware.workspace = true slog.workspace = true thiserror.workspace = true diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 164141e875..503ccb053a 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -19,7 +19,6 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_hardware::DiskVariant; use slog::{info, Logger}; -use std::str::FromStr; use std::sync::OnceLock; pub const INSTALL_DATASET: &'static str = "install"; @@ -139,34 +138,6 @@ pub enum DatasetKind { InternalDns, } -impl From for sled_agent_client::types::DatasetKind { - fn from(k: DatasetKind) -> Self { - use DatasetKind::*; - match k { - CockroachDb => Self::CockroachDb, - Crucible => Self::Crucible, - Clickhouse => Self::Clickhouse, - ClickhouseKeeper => Self::ClickhouseKeeper, - ExternalDns => Self::ExternalDns, - InternalDns => Self::InternalDns, - } - } -} - -impl From for nexus_client::types::DatasetKind { - fn from(k: DatasetKind) -> Self { - use DatasetKind::*; - match k { - CockroachDb => Self::Cockroach, - Crucible => Self::Crucible, - Clickhouse => Self::Clickhouse, - ClickhouseKeeper => Self::ClickhouseKeeper, - ExternalDns => Self::ExternalDns, - InternalDns => Self::InternalDns, - } - } -} - impl std::fmt::Display for DatasetKind { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use DatasetKind::*; @@ -210,18 +181,6 @@ impl DatasetName { } } -impl From for sled_agent_client::types::DatasetName { - fn from(n: DatasetName) -> Self { - Self { - pool_name: sled_agent_client::types::ZpoolName::from_str( - &n.pool().to_string(), - ) - .unwrap(), - kind: n.dataset().clone().into(), - } - } -} - #[derive(Debug, thiserror::Error)] pub enum DatasetError { #[error("Cannot open {path} due to {error}")] diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs index 70d7fe7c1e..b9f97ee428 100644 --- a/sled-storage/src/error.rs +++ b/sled-storage/src/error.rs @@ -78,7 +78,4 @@ pub enum Error { 
#[error("Zpool Not Found: {0}")] ZpoolNotFound(String), - - #[error("Underlay not yet initialized")] - UnderlayNotInitialized, } diff --git a/sled-storage/src/keyfile.rs b/sled-storage/src/keyfile.rs index fcdbf8b3bf..105092c99e 100644 --- a/sled-storage/src/keyfile.rs +++ b/sled-storage/src/keyfile.rs @@ -26,10 +26,7 @@ impl KeyFile { key: &[u8; 32], log: &Logger, ) -> std::io::Result { - // TODO: fix this to not truncate // We want to overwrite any existing contents. - // If we truncate we may leave dirty pages around - // containing secrets. let mut file = tokio::fs::OpenOptions::new() .create(true) .write(true) From 7d457cee41e5c428e8d1be9ca87945a78436c1bf Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 8 Nov 2023 23:08:27 +0000 Subject: [PATCH 53/66] Fix subtle bugs wrt watch channels --- sled-storage/src/manager.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 19cd86df64..078b9a19f8 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -115,21 +115,21 @@ impl StorageHandle { /// Wait for a boot disk to be initialized pub async fn wait_for_boot_disk(&mut self) -> (DiskIdentity, ZpoolName) { loop { - // We panic if the sender is dropped, as this means - // the StorageManager has gone away, which it should not do. - self.resource_updates.changed().await.unwrap(); - // Limit any RWLock related cancellation issues by immediately cloning - let resources = self.resource_updates.borrow().clone(); + let resources = self.resource_updates.borrow_and_update(); if let Some((disk_id, zpool_name)) = resources.boot_disk() { return (disk_id, zpool_name); } + drop(resources); + // We panic if the sender is dropped, as this means + // the StorageManager has gone away, which it should not do. + self.resource_updates.changed().await.unwrap(); } } /// Wait for any storage resource changes pub async fn wait_for_changes(&mut self) -> StorageResources { self.resource_updates.changed().await.unwrap(); - self.resource_updates.borrow().clone() + self.resource_updates.borrow_and_update().clone() } /// Retrieve the latest value of `StorageResources` from the From c895657e1d29e440f5de20d59689dbb870763ece Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 9 Nov 2023 00:25:11 +0000 Subject: [PATCH 54/66] some more review fixes --- sled-storage/src/manager.rs | 86 ++++++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 078b9a19f8..b60114a519 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -23,6 +23,31 @@ use uuid::Uuid; // The size of the mpsc bounded channel used to communicate // between the `StorageHandle` and `StorageManager`. +// +// How did we choose this bound, and why? +// +// Picking a bound can be tricky, but in general, you want the channel to act +// unbounded, such that sends never fail. This makes the channels reliable, +// such that we never drop messages inside the process, and the caller doesn't +// have to choose what to do when overloaded. This simplifies things drastically +// for developers. However, you also don't want to make the channel actually +// unbounded, because that can lead to run-away memory growth and pathological +// behaviors, such that requests get slower over time until the system crashes. 
+// +// Our team's chosen solution, and used elsewhere in the codebase, is to +// choose a large enough bound such that we should never hit it in practice +// unless we are truly overloaded. If we hit the bound it means that beyond that +// requests will start to build up and we will eventually topple over. So when +// we hit this bound, we just go ahead and panic. +// +// Picking a channel bound is hard to do empirically, but practically, if +// requests are mostly mutating task local state, a bound of 1024 or even 8192 +// should be plenty. Tasks that must perform longer running ops can spawn helper +// tasks as necessary or include their own handles for replies rather than +// synchronously waiting. Memory for the queue can be kept small with boxing of +// large messages. +// +// Here we start relatively small so that we can evaluate our choice over time. const QUEUE_SIZE: usize = 256; #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -186,22 +211,22 @@ impl FakeStorageManager { /// Run the main receive loop of the `StorageManager` /// /// This should be spawned into a tokio task - pub async fn run(&mut self) { + pub async fn run(mut self) { loop { - // The sending side should never disappear - match self.rx.recv().await.unwrap() { - StorageRequest::AddDisk(raw_disk) => { + match self.rx.recv().await { + Some(StorageRequest::AddDisk(raw_disk)) => { if self.add_disk(raw_disk) { self.resource_updates .send_replace(self.resources.clone()); } } - StorageRequest::GetLatestResources(tx) => { + Some(StorageRequest::GetLatestResources(tx)) => { let _ = tx.send(self.resources.clone()); } - _ => { + Some(_) => { unreachable!(); } + None => break, } } } @@ -260,15 +285,23 @@ impl StorageManager { /// Run the main receive loop of the `StorageManager` /// /// This should be spawned into a tokio task - pub async fn run(&mut self) { + pub async fn run(mut self) { loop { const QUEUED_DISK_RETRY_TIMEOUT: Duration = Duration::from_secs(10); let mut interval = interval(QUEUED_DISK_RETRY_TIMEOUT); interval.set_missed_tick_behavior(MissedTickBehavior::Delay); tokio::select! { res = self.step() => { - if let Err(e) = res { - warn!(self.log, "{e}"); + match res { + Some(Ok(())) => (), + Some(Err(e)) => warn!(self.log, "{e}"), + None => { + info!( + self.log, + "Shutting down StorageManager task: no handles." + ); + return; + } } } _ = interval.tick(), @@ -285,13 +318,20 @@ impl StorageManager { /// Process the next event /// /// This is useful for testing/debugging - pub async fn step(&mut self) -> Result<(), Error> { - // The sending side should never disappear - let req = self.rx.recv().await.unwrap(); + /// + /// Return `None` if the sender side has disappeared and the task should + /// shutdown. + pub async fn step(&mut self) -> Option> { + let Some(req) = self.rx.recv().await else { + return None; + }; info!(self.log, "Received {:?}", req); let should_send_updates = match req { StorageRequest::AddDisk(raw_disk) => { - self.add_disk(raw_disk).await? 
+ match self.add_disk(raw_disk).await { + Ok(is_new) => is_new, + Err(e) => return Some(Err(e)), + } } StorageRequest::RemoveDisk(raw_disk) => { self.remove_disk(raw_disk).await @@ -328,7 +368,7 @@ impl StorageManager { let _ = self.resource_updates.send_replace(self.resources.clone()); } - Ok(()) + Some(Ok(())) } // Loop through all queued disks inserting them into [`StorageResources`] @@ -624,7 +664,7 @@ mod tests { assert!(manager.resources.all_u2_zpools().is_empty()); assert_eq!(manager.queued_u2_drives, HashSet::from([raw_disk.clone()])); - // Check other non-normal stages and enusre disk gets queued + // Check other non-normal stages and ensure disk gets queued manager.queued_u2_drives.clear(); manager.state = StorageManagerState::QueuingDisks; manager.add_u2_disk(raw_disk.clone()).await.unwrap(); @@ -661,7 +701,7 @@ mod tests { let logctx = test_setup_log("wait_for_bootdisk"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (mut manager, mut handle) = + let (manager, mut handle) = StorageManager::new(&logctx.log, key_requester); // Spawn the key_manager so that it will respond to requests for encryption keys tokio::spawn(async move { key_manager.run().await }); @@ -688,8 +728,7 @@ mod tests { let logctx = test_setup_log("queued_disks_get_added_as_resources"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (mut manager, handle) = - StorageManager::new(&logctx.log, key_requester); + let (manager, handle) = StorageManager::new(&logctx.log, key_requester); // Spawn the key_manager so that it will respond to requests for encryption keys tokio::spawn(async move { key_manager.run().await }); @@ -743,7 +782,7 @@ mod tests { let dir = tempdir().unwrap(); let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); handle.upsert_disk(disk).await; - manager.step().await.unwrap(); + manager.step().await.unwrap().unwrap(); // We can't wait for a reply through the handle as the storage manager task // isn't actually running. We just check the resources directly. 
@@ -756,7 +795,7 @@ mod tests { // Now inform the storage manager that the key manager is ready // The queued disk should not be added due to the error handle.key_manager_ready().await; - manager.step().await.unwrap(); + manager.step().await.unwrap().unwrap(); assert!(manager.resources.all_u2_zpools().is_empty()); // Manually simulating a timer tick to add queued disks should also @@ -779,7 +818,7 @@ mod tests { let logctx = test_setup_log("delete_disk_triggers_notification"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (mut manager, mut handle) = + let (manager, mut handle) = StorageManager::new(&logctx.log, key_requester); // Spawn the key_manager so that it will respond to requests for encryption keys @@ -820,7 +859,7 @@ mod tests { let logctx = test_setup_log("ensure_using_exactly_these_disks"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (mut manager, mut handle) = + let (manager, mut handle) = StorageManager::new(&logctx.log, key_requester); // Spawn the key_manager so that it will respond to requests for encryption keys @@ -937,8 +976,7 @@ mod tests { let logctx = test_setup_log("upsert_filesystem"); let (mut key_manager, key_requester) = KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (mut manager, handle) = - StorageManager::new(&logctx.log, key_requester); + let (manager, handle) = StorageManager::new(&logctx.log, key_requester); // Spawn the key_manager so that it will respond to requests for encryption keys tokio::spawn(async move { key_manager.run().await }); From ed7f059d0b251954e026b5ef9839c10e0a3820f2 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 9 Nov 2023 03:37:43 +0000 Subject: [PATCH 55/66] Use oneshot channels for HardwareMonitor setup --- sled-agent/src/bootstrap/pre_server.rs | 20 +++-- sled-agent/src/bootstrap/server.rs | 30 ++++--- sled-agent/src/hardware_monitor.rs | 111 +++++++++---------------- sled-agent/src/long_running_tasks.rs | 46 +++++----- sled-agent/src/services.rs | 2 +- sled-agent/src/zone_bundle.rs | 2 +- sled-storage/src/manager.rs | 2 +- 7 files changed, 100 insertions(+), 113 deletions(-) diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 7af9bbbf68..ff87437001 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -17,6 +17,7 @@ use crate::long_running_tasks::{ spawn_all_longrunning_tasks, LongRunningTaskHandles, }; use crate::services::ServiceManager; +use crate::sled_agent::SledAgent; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; @@ -38,6 +39,7 @@ use slog::Drain; use slog::Logger; use std::net::IpAddr; use std::net::Ipv6Addr; +use tokio::sync::oneshot; pub(super) struct BootstrapAgentStartup { pub(super) config: Config, @@ -47,6 +49,7 @@ pub(super) struct BootstrapAgentStartup { pub(super) startup_log: Logger, pub(super) service_manager: ServiceManager, pub(super) long_running_task_handles: LongRunningTaskHandles, + pub(super) sled_agent_started_tx: oneshot::Sender, } impl BootstrapAgentStartup { @@ -105,7 +108,11 @@ impl BootstrapAgentStartup { // Spawn all important long running tasks that live for the lifetime of // the process and are used by both the bootstrap agent and sled agent - let long_running_task_handles = spawn_all_longrunning_tasks( + let ( + long_running_task_handles, + sled_agent_started_tx, + 
service_manager_ready_tx, + ) = spawn_all_longrunning_tasks( &base_log, sled_mode, startup_networking.global_zone_bootstrap_ip, @@ -128,10 +135,12 @@ impl BootstrapAgentStartup { long_running_task_handles.zone_bundler.clone(), ); - long_running_task_handles - .hardware_monitor - .service_manager_ready(service_manager.clone()) - .await; + // Inform the hardware monitor that the service manager is ready + // This is a onetime operation, and so we use a oneshot channel + service_manager_ready_tx + .send(service_manager.clone()) + .map_err(|_| ()) + .expect("Failed to send to StorageMonitor"); Ok(Self { config, @@ -141,6 +150,7 @@ impl BootstrapAgentStartup { startup_log: log, service_manager, long_running_task_handles, + sled_agent_started_tx, }) } } diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 635b33893d..90653873c4 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -25,6 +25,7 @@ use crate::config::ConfigError; use crate::long_running_tasks::LongRunningTaskHandles; use crate::server::Server as SledAgentServer; use crate::services::ServiceManager; +use crate::sled_agent::SledAgent; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; @@ -175,6 +176,7 @@ impl Server { startup_log, service_manager, long_running_task_handles, + sled_agent_started_tx, } = BootstrapAgentStartup::run(config).await?; // Do we have a StartSledAgentRequest stored in the ledger? @@ -251,10 +253,10 @@ impl Server { // switch zone, if we're a scrimlet, to give it our underlay network // information. let sled_agent = sled_agent_server.sled_agent(); - long_running_task_handles - .hardware_monitor - .sled_agent_started(sled_agent.clone()) - .await; + sled_agent_started_tx + .send(sled_agent.clone()) + .map_err(|_| ()) + .expect("Failed to send to StorageMonitor"); // For cold boot specifically, we now need to load the services // we're responsible for, while continuing to handle hardware @@ -263,7 +265,7 @@ impl Server { sled_agent.cold_boot_load_services().await; SledAgentState::ServerStarted(sled_agent_server) } else { - SledAgentState::Bootstrapping + SledAgentState::Bootstrapping(Some(sled_agent_started_tx)) }; // Spawn our inner task that handles any future hardware updates and any @@ -306,7 +308,7 @@ impl Server { // bootstrap server). enum SledAgentState { // We're still in the bootstrapping phase, waiting for a sled-agent request. - Bootstrapping, + Bootstrapping(Option>), // ... or the sled agent server is running. ServerStarted(SledAgentServer), } @@ -548,8 +550,11 @@ impl Inner { response_tx: oneshot::Sender>, log: &Logger, ) { - match &self.state { - SledAgentState::Bootstrapping => { + match &mut self.state { + SledAgentState::Bootstrapping(sled_agent_started_tx) => { + // Extract from an option to satisfy the borrow checker + let sled_agent_started_tx = + sled_agent_started_tx.take().unwrap(); let response = match start_sled_agent( &self.config, &request, @@ -565,11 +570,10 @@ impl Inner { // We've created sled-agent; we need to (possibly) // reconfigure the switch zone, if we're a scrimlet, to // give it our underlay network information. 
- self.long_running_task_handles - .hardware_monitor - .sled_agent_started(server.sled_agent().clone()) - .await; - + sled_agent_started_tx + .send(server.sled_agent().clone()) + .map_err(|_| ()) + .expect("Failed to send to StorageMonitor"); self.state = SledAgentState::ServerStarted(server); Ok(SledAgentResponse { id: request.id }) } diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs index f3402cb6bd..698d2d4608 100644 --- a/sled-agent/src/hardware_monitor.rs +++ b/sled-agent/src/hardware_monitor.rs @@ -12,30 +12,8 @@ use sled_hardware::{Baseboard, HardwareManager, HardwareUpdate}; use sled_storage::disk::RawDisk; use sled_storage::manager::StorageHandle; use slog::Logger; -use std::fmt::Debug; -use tokio::sync::broadcast; use tokio::sync::broadcast::error::RecvError; -use tokio::sync::mpsc; - -const QUEUE_SIZE: usize = 10; - -pub enum HardwareMonitorMsg { - SledAgentStarted(SledAgent), - ServiceManagerCreated(ServiceManager), -} - -impl Debug for HardwareMonitorMsg { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - HardwareMonitorMsg::SledAgentStarted(_) => { - f.debug_struct("SledAgentStarted").finish() - } - HardwareMonitorMsg::ServiceManagerCreated(_) => { - f.debug_struct("ServiceManagerCreated").finish() - } - } - } -} +use tokio::sync::{broadcast, oneshot}; // A thin wrapper around the the [`ServiceManager`] that caches the state // whether or not the tofino is loaded if the [`ServiceManager`] doesn't exist @@ -62,36 +40,26 @@ impl TofinoManager { *self = Self::Ready(service_manager); tofino_loaded } -} - -#[derive(Clone)] -pub struct HardwareMonitorHandle { - tx: mpsc::Sender, -} - -impl HardwareMonitorHandle { - pub async fn service_manager_ready(&self, service_manager: ServiceManager) { - self.tx - .send(HardwareMonitorMsg::ServiceManagerCreated(service_manager)) - .await - .unwrap(); - } - pub async fn sled_agent_started(&self, sled_agent: SledAgent) { - self.tx - .send(HardwareMonitorMsg::SledAgentStarted(sled_agent)) - .await - .unwrap(); + pub fn is_ready(&self) -> bool { + match self { + TofinoManager::Ready(_) => true, + _ => false, + } } } +// A monitor for hardware events pub struct HardwareMonitor { log: Logger, baseboard: Baseboard, - // Receive messages from the [`HardwareMonitorHandle`] - handle_rx: mpsc::Receiver, + // Receive a onetime notification that the SledAgent has started + sled_agent_started_rx: oneshot::Receiver, + + // Receive a onetime notification that the ServiceManager is ready + service_manager_ready_rx: oneshot::Receiver, // Receive messages from the [`HardwareManager`] hardware_rx: broadcast::Receiver, @@ -123,9 +91,15 @@ impl HardwareMonitor { log: &Logger, hardware_manager: &HardwareManager, storage_manager: &StorageHandle, - ) -> (HardwareMonitor, HardwareMonitorHandle) { + ) -> ( + HardwareMonitor, + oneshot::Sender, + oneshot::Sender, + ) { + let (sled_agent_started_tx, sled_agent_started_rx) = oneshot::channel(); + let (service_manager_ready_tx, service_manager_ready_rx) = + oneshot::channel(); let baseboard = hardware_manager.baseboard(); - let (handle_tx, handle_rx) = mpsc::channel(QUEUE_SIZE); let hardware_rx = hardware_manager.monitor(); let log = log.new(o!("component" => "HardwareMonitor")); let tofino_manager = TofinoManager::new(); @@ -133,14 +107,16 @@ impl HardwareMonitor { HardwareMonitor { log, baseboard, - handle_rx, + sled_agent_started_rx, + service_manager_ready_rx, hardware_rx, hardware_manager: hardware_manager.clone(), storage_manager: 
storage_manager.clone(), sled_agent: None, tofino_manager, }, - HardwareMonitorHandle { tx: handle_tx }, + sled_agent_started_tx, + service_manager_ready_tx, ) } @@ -155,13 +131,21 @@ impl HardwareMonitor { loop { tokio::select! { - Some(msg) = self.handle_rx.recv() => { - info!( - self.log, - "Received hardware monitor message"; - "msg" => ?msg - ); - self.handle_monitor_msg(msg).await; + Ok(sled_agent) = &mut self.sled_agent_started_rx, + if self.sled_agent.is_none() => + { + info!(self.log, "Sled Agent Started"); + self.sled_agent = Some(sled_agent); + self.check_latest_hardware_snapshot().await; + } + Ok(service_manager) = &mut self.service_manager_ready_rx, + if !self.tofino_manager.is_ready() => + { + let tofino_loaded = + self.tofino_manager.become_ready(service_manager); + if tofino_loaded { + self.activate_switch().await; + } } update = self.hardware_rx.recv() => { info!( @@ -175,23 +159,6 @@ impl HardwareMonitor { } } - // Handle a message from the [`HardwareMonitorHandle`] - async fn handle_monitor_msg(&mut self, msg: HardwareMonitorMsg) { - match msg { - HardwareMonitorMsg::SledAgentStarted(sled_agent) => { - self.sled_agent = Some(sled_agent); - self.check_latest_hardware_snapshot().await; - } - HardwareMonitorMsg::ServiceManagerCreated(service_manager) => { - let tofino_loaded = - self.tofino_manager.become_ready(service_manager); - if tofino_loaded { - self.activate_switch().await; - } - } - } - } - // Handle an update from the [`HardwareMonitor`] async fn handle_hardware_update( &mut self, diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index cf74f54e7e..9411e30c2d 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -17,7 +17,9 @@ use crate::bootstrap::bootstore_setup::{ }; use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; use crate::config::Config; -use crate::hardware_monitor::{HardwareMonitor, HardwareMonitorHandle}; +use crate::hardware_monitor::HardwareMonitor; +use crate::services::ServiceManager; +use crate::sled_agent::SledAgent; use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; @@ -27,6 +29,7 @@ use sled_storage::disk::SyntheticDisk; use sled_storage::manager::{StorageHandle, StorageManager}; use slog::{info, Logger}; use std::net::Ipv6Addr; +use tokio::sync::oneshot; /// A mechanism for interacting with all long running tasks that can be shared /// between the bootstrap-agent and sled-agent code. 
@@ -50,10 +53,6 @@ pub struct LongRunningTaskHandles { /// A mechanism for interacting with the hardware device tree pub hardware_manager: HardwareManager, - /// A mechanism for interacting with the task that monitors for hardware - /// updates from the [`HardwareManager`] - pub hardware_monitor: HardwareMonitorHandle, - // A handle for interacting with the bootstore pub bootstore: bootstore::NodeHandle, @@ -67,7 +66,11 @@ pub async fn spawn_all_longrunning_tasks( sled_mode: SledMode, global_zone_bootstrap_ip: Ipv6Addr, config: &Config, -) -> LongRunningTaskHandles { +) -> ( + LongRunningTaskHandles, + oneshot::Sender, + oneshot::Sender, +) { let storage_key_requester = spawn_key_manager(log); let mut storage_manager = spawn_storage_manager(log, storage_key_requester.clone()); @@ -78,7 +81,7 @@ pub async fn spawn_all_longrunning_tasks( let hardware_manager = spawn_hardware_manager(log, sled_mode); // Start monitoring for hardware changes - let hardware_monitor = + let (sled_agent_started_tx, service_manager_ready_tx) = spawn_hardware_monitor(log, &hardware_manager, &storage_manager); // Add some synthetic disks if necessary. @@ -100,15 +103,18 @@ pub async fn spawn_all_longrunning_tasks( let zone_bundler = spawn_zone_bundler_tasks(log, &mut storage_manager); - LongRunningTaskHandles { - storage_key_requester, - storage_manager, - storage_monitor, - hardware_manager, - hardware_monitor, - bootstore, - zone_bundler, - } + ( + LongRunningTaskHandles { + storage_key_requester, + storage_manager, + storage_monitor, + hardware_manager, + bootstore, + zone_bundler, + }, + sled_agent_started_tx, + service_manager_ready_tx, + ) } fn spawn_key_manager(log: &Logger) -> StorageKeyRequester { @@ -125,7 +131,7 @@ fn spawn_storage_manager( key_requester: StorageKeyRequester, ) -> StorageHandle { info!(log, "Starting StorageManager"); - let (mut manager, handle) = StorageManager::new(log, key_requester); + let (manager, handle) = StorageManager::new(log, key_requester); tokio::spawn(async move { manager.run().await; }); @@ -165,14 +171,14 @@ fn spawn_hardware_monitor( log: &Logger, hardware_manager: &HardwareManager, storage_handle: &StorageHandle, -) -> HardwareMonitorHandle { +) -> (oneshot::Sender, oneshot::Sender) { info!(log, "Starting HardwareMonitor"); - let (mut monitor, handle) = + let (mut monitor, sled_agent_started_tx, service_manager_ready_tx) = HardwareMonitor::new(log, hardware_manager, storage_handle); tokio::spawn(async move { monitor.run().await; }); - handle + (sled_agent_started_tx, service_manager_ready_tx) } async fn spawn_bootstore_tasks( diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 33859465cb..3d99ee9558 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -3204,7 +3204,7 @@ mod test { } async fn setup_storage() -> StorageHandle { - let (mut manager, handle) = FakeStorageManager::new(); + let (manager, handle) = FakeStorageManager::new(); // Spawn the storage manager as done by sled-agent tokio::spawn(async move { diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 91604b7099..b17baf533f 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -2222,7 +2222,7 @@ mod illumos_tests { } async fn setup_storage() -> StorageHandle { - let (mut manager, handle) = FakeStorageManager::new(); + let (manager, handle) = FakeStorageManager::new(); // Spawn the storage manager as done by sled-agent tokio::spawn(async move { diff --git a/sled-storage/src/manager.rs 
b/sled-storage/src/manager.rs index b60114a519..8b121bc467 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -392,7 +392,7 @@ impl StorageManager { // We hit a transient error in a prior iteration. saved.insert(disk); } else { - // Try ot add the disk. If there was a transient error the disk will + // Try to add the disk. If there was a transient error the disk will // have been requeued. If there was a permanent error, it will have been // dropped. If there is an another unexpected error, we will handle it and // requeue ourselves. From c0e3c710ac59c7e1f7384d14facd586c1f6a4067 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 9 Nov 2023 17:42:06 +0000 Subject: [PATCH 56/66] Use oneshot for UnderlayAccess --- sled-agent/src/bootstrap/pre_server.rs | 4 ++ sled-agent/src/bootstrap/server.rs | 25 ++++++++--- sled-agent/src/long_running_tasks.rs | 19 ++++---- sled-agent/src/server.rs | 4 ++ sled-agent/src/services.rs | 2 +- sled-agent/src/sled_agent.rs | 10 +++-- sled-agent/src/storage_monitor.rs | 60 +++++++------------------- 7 files changed, 58 insertions(+), 66 deletions(-) diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index ff87437001..61a8b09edf 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -18,6 +18,7 @@ use crate::long_running_tasks::{ }; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; +use crate::storage_monitor::UnderlayAccess; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; @@ -50,6 +51,7 @@ pub(super) struct BootstrapAgentStartup { pub(super) service_manager: ServiceManager, pub(super) long_running_task_handles: LongRunningTaskHandles, pub(super) sled_agent_started_tx: oneshot::Sender, + pub(super) underlay_available_tx: oneshot::Sender, } impl BootstrapAgentStartup { @@ -112,6 +114,7 @@ impl BootstrapAgentStartup { long_running_task_handles, sled_agent_started_tx, service_manager_ready_tx, + underlay_available_tx, ) = spawn_all_longrunning_tasks( &base_log, sled_mode, @@ -151,6 +154,7 @@ impl BootstrapAgentStartup { service_manager, long_running_task_handles, sled_agent_started_tx, + underlay_available_tx, }) } } diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 90653873c4..0a055d13cc 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -26,6 +26,7 @@ use crate::long_running_tasks::LongRunningTaskHandles; use crate::server::Server as SledAgentServer; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; +use crate::storage_monitor::UnderlayAccess; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; @@ -177,6 +178,7 @@ impl Server { service_manager, long_running_task_handles, sled_agent_started_tx, + underlay_available_tx, } = BootstrapAgentStartup::run(config).await?; // Do we have a StartSledAgentRequest stored in the ledger? @@ -242,6 +244,7 @@ impl Server { &config, &sled_request.request, long_running_task_handles.clone(), + underlay_available_tx, service_manager.clone(), &ddm_admin_localhost_client, &base_log, @@ -249,9 +252,7 @@ impl Server { ) .await?; - // We've created sled-agent; we need to (possibly) reconfigure the - // switch zone, if we're a scrimlet, to give it our underlay network - // information. 
+ // Give the HardwareMonitory access to the `SledAgent` let sled_agent = sled_agent_server.sled_agent(); sled_agent_started_tx .send(sled_agent.clone()) @@ -265,7 +266,10 @@ impl Server { sled_agent.cold_boot_load_services().await; SledAgentState::ServerStarted(sled_agent_server) } else { - SledAgentState::Bootstrapping(Some(sled_agent_started_tx)) + SledAgentState::Bootstrapping( + Some(sled_agent_started_tx), + Some(underlay_available_tx), + ) }; // Spawn our inner task that handles any future hardware updates and any @@ -308,7 +312,10 @@ impl Server { // bootstrap server). enum SledAgentState { // We're still in the bootstrapping phase, waiting for a sled-agent request. - Bootstrapping(Option>), + Bootstrapping( + Option>, + Option>, + ), // ... or the sled agent server is running. ServerStarted(SledAgentServer), } @@ -345,6 +352,7 @@ async fn start_sled_agent( config: &SledConfig, request: &StartSledAgentRequest, long_running_task_handles: LongRunningTaskHandles, + underlay_available_tx: oneshot::Sender, service_manager: ServiceManager, ddmd_client: &DdmAdminClient, base_log: &Logger, @@ -392,6 +400,7 @@ async fn start_sled_agent( request.clone(), long_running_task_handles.clone(), service_manager, + underlay_available_tx, ) .await .map_err(SledAgentServerStartError::FailedStartingServer)?; @@ -551,7 +560,10 @@ impl Inner { log: &Logger, ) { match &mut self.state { - SledAgentState::Bootstrapping(sled_agent_started_tx) => { + SledAgentState::Bootstrapping( + sled_agent_started_tx, + underlay_available_tx, + ) => { // Extract from an option to satisfy the borrow checker let sled_agent_started_tx = sled_agent_started_tx.take().unwrap(); @@ -559,6 +571,7 @@ impl Inner { &self.config, &request, self.long_running_task_handles.clone(), + underlay_available_tx.take().unwrap(), self.service_manager.clone(), &self.ddm_admin_localhost_client, &self.base_log, diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 9411e30c2d..e6736f6ea6 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -20,7 +20,7 @@ use crate::config::Config; use crate::hardware_monitor::HardwareMonitor; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; -use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; +use crate::storage_monitor::{StorageMonitor, UnderlayAccess}; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; @@ -45,11 +45,6 @@ pub struct LongRunningTaskHandles { /// for establishing zpools on disks and managing their datasets. pub storage_manager: StorageHandle, - /// A task which monitors for updates from the `StorageManager` and takes - /// actions based on those updates, such as informing Nexus and setting - /// up dump locations. - pub storage_monitor: StorageMonitorHandle, - /// A mechanism for interacting with the hardware device tree pub hardware_manager: HardwareManager, @@ -70,12 +65,14 @@ pub async fn spawn_all_longrunning_tasks( LongRunningTaskHandles, oneshot::Sender, oneshot::Sender, + oneshot::Sender, ) { let storage_key_requester = spawn_key_manager(log); let mut storage_manager = spawn_storage_manager(log, storage_key_requester.clone()); - let storage_monitor = spawn_storage_monitor(log, storage_manager.clone()); + let underlay_available_tx = + spawn_storage_monitor(log, storage_manager.clone()); // TODO: Does this need to run inside tokio::task::spawn_blocking? 
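The TODO above is resolved two patches later (PATCH 57) by moving the synchronous constructor onto tokio's blocking pool. A minimal sketch of that spawn_blocking pattern, using a stand-in scan_hardware function rather than the real HardwareManager::new:

    use std::time::Duration;

    // Stand-in for a constructor that does blocking work (device scans, etc.).
    fn scan_hardware() -> Vec<String> {
        std::thread::sleep(Duration::from_millis(50));
        vec!["disk0".to_string(), "disk1".to_string()]
    }

    #[tokio::main]
    async fn main() {
        // Run the blocking call on the dedicated blocking thread pool so it
        // cannot stall the async worker threads, then await the JoinHandle.
        let devices = tokio::task::spawn_blocking(scan_hardware)
            .await
            .expect("blocking task panicked");
        println!("found {} devices", devices.len());
    }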
let hardware_manager = spawn_hardware_manager(log, sled_mode); @@ -107,13 +104,13 @@ pub async fn spawn_all_longrunning_tasks( LongRunningTaskHandles { storage_key_requester, storage_manager, - storage_monitor, hardware_manager, bootstore, zone_bundler, }, sled_agent_started_tx, service_manager_ready_tx, + underlay_available_tx, ) } @@ -141,14 +138,14 @@ fn spawn_storage_manager( fn spawn_storage_monitor( log: &Logger, storage_handle: StorageHandle, -) -> StorageMonitorHandle { +) -> oneshot::Sender { info!(log, "Starting StorageMonitor"); - let (mut storage_monitor, handle) = + let (storage_monitor, underlay_available_tx) = StorageMonitor::new(log, storage_handle); tokio::spawn(async move { storage_monitor.run().await; }); - handle + underlay_available_tx } fn spawn_hardware_manager( diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index c9828e7542..903c8dabaa 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -11,10 +11,12 @@ use crate::bootstrap::params::StartSledAgentRequest; use crate::long_running_tasks::LongRunningTaskHandles; use crate::nexus::NexusClientWithResolver; use crate::services::ServiceManager; +use crate::storage_monitor::UnderlayAccess; use internal_dns::resolver::Resolver; use slog::Logger; use std::net::SocketAddr; use std::sync::Arc; +use tokio::sync::oneshot; use uuid::Uuid; /// Packages up a [`SledAgent`], running the sled agent API under a Dropshot @@ -40,6 +42,7 @@ impl Server { request: StartSledAgentRequest, long_running_tasks_handles: LongRunningTaskHandles, services: ServiceManager, + underlay_available_tx: oneshot::Sender, ) -> Result { info!(log, "setting up sled agent server"); @@ -62,6 +65,7 @@ impl Server { request, services, long_running_tasks_handles, + underlay_available_tx, ) .await .map_err(|e| e.to_string())?; diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 3d99ee9558..90466370fc 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -474,10 +474,10 @@ impl ServiceManager { } async fn all_service_ledgers(&self) -> Vec { - let resources = self.inner.storage.get_latest_resources().await; if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(SERVICES_LEDGER_FILENAME)]; } + let resources = self.inner.storage.get_latest_resources().await; resources .all_m2_mountpoints(CONFIG_DATASET) .into_iter() diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index fe6ff0a1c2..1f08177d21 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -57,6 +57,7 @@ use slog::Logger; use std::collections::BTreeMap; use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::sync::Arc; +use tokio::sync::oneshot; use uuid::Uuid; #[cfg(not(test))] @@ -271,6 +272,7 @@ impl SledAgent { request: StartSledAgentRequest, services: ServiceManager, long_running_task_handles: LongRunningTaskHandles, + underlay_available_tx: oneshot::Sender, ) -> Result { // Pass the "parent_log" to all subcomponents that want to set their own // "component" value. @@ -347,13 +349,13 @@ impl SledAgent { // Inform the `StorageMonitor` that the underlay is available so that // it can try to contact nexus. 
- long_running_task_handles - .storage_monitor - .underlay_available(UnderlayAccess { + underlay_available_tx + .send(UnderlayAccess { nexus_client: nexus_client.clone(), sled_id: request.id, }) - .await; + .map_err(|_| ()) + .expect("Failed to send to StorageMonitor"); let instances = InstanceManager::new( parent_log.clone(), diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 71e61e84c2..3500803164 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -24,23 +24,15 @@ use sled_storage::resources::StorageResources; use slog::Logger; use std::fmt::Debug; use std::pin::Pin; -use tokio::sync::mpsc; +use tokio::sync::oneshot; use uuid::Uuid; -const QUEUE_SIZE: usize = 10; - #[derive(From, Clone, Debug)] enum NexusDiskRequest { Put(PhysicalDiskPutRequest), Delete(PhysicalDiskDeleteRequest), } -/// A message sent from the `StorageMonitorHandle` to the `StorageMonitor`. -#[derive(Debug)] -pub enum StorageMonitorMsg { - UnderlayAvailable(UnderlayAccess), -} - /// Describes the access to the underlay used by the StorageManager. #[derive(Clone)] pub struct UnderlayAccess { @@ -56,25 +48,12 @@ impl Debug for UnderlayAccess { } } -/// A mechanism for interacting with the StorageMonitor -#[derive(Clone)] -pub struct StorageMonitorHandle { - tx: mpsc::Sender, -} - -impl StorageMonitorHandle { - pub async fn underlay_available(&self, underlay_access: UnderlayAccess) { - self.tx - .send(StorageMonitorMsg::UnderlayAvailable(underlay_access)) - .await - .unwrap(); - } -} - pub struct StorageMonitor { log: Logger, storage_manager: StorageHandle, - handle_rx: mpsc::Receiver, + + // Receive a onetime notification that the underlay is available + underlay_available_rx: oneshot::Receiver, // A cached copy of the `StorageResources` from the last update storage_resources: StorageResources, @@ -93,8 +72,8 @@ impl StorageMonitor { pub fn new( log: &Logger, storage_manager: StorageHandle, - ) -> (StorageMonitor, StorageMonitorHandle) { - let (handle_tx, handle_rx) = mpsc::channel(QUEUE_SIZE); + ) -> (StorageMonitor, oneshot::Sender) { + let (underlay_available_tx, underlay_available_rx) = oneshot::channel(); let storage_resources = StorageResources::default(); let dump_setup = DumpSetup::new(&log); let log = log.new(o!("component" => "StorageMonitor")); @@ -102,20 +81,20 @@ impl StorageMonitor { StorageMonitor { log, storage_manager, - handle_rx, + underlay_available_rx, storage_resources, underlay: None, nexus_notifications: FuturesOrdered::new(), dump_setup, }, - StorageMonitorHandle { tx: handle_tx }, + underlay_available_tx, ) } /// Run the main receive loop of the `StorageMonitor` /// /// This should be spawned into a tokio task - pub async fn run(&mut self) { + pub async fn run(mut self) { loop { tokio::select! 
{ res = self.nexus_notifications.next(), @@ -131,28 +110,21 @@ impl StorageMonitor { ); self.handle_resource_update(resources).await; } - Some(msg) = self.handle_rx.recv() => { + Ok(underlay) = &mut self.underlay_available_rx, + if self.underlay.is_none() => + { + let sled_id = underlay.sled_id; info!( self.log, - "Received storage monitor message"; - "monitor_msg" => ?msg + "Underlay Available"; "sled_id" => %sled_id ); - self.handle_monitor_msg(msg).await; + self.underlay = Some(underlay); + self.notify_nexus_about_existing_resources(sled_id).await; } } } } - async fn handle_monitor_msg(&mut self, msg: StorageMonitorMsg) { - match msg { - StorageMonitorMsg::UnderlayAvailable(underlay) => { - let sled_id = underlay.sled_id; - self.underlay = Some(underlay); - self.notify_nexus_about_existing_resources(sled_id).await; - } - } - } - /// When the underlay becomes available, we need to notify nexus about any /// discovered disks and pools, since we don't attempt to notify until there /// is an underlay available. From a7848f91ffbfd1208b6a0baf69c17b84915509ba Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 9 Nov 2023 18:17:16 +0000 Subject: [PATCH 57/66] more review fixes --- sled-agent/src/long_running_tasks.rs | 12 ++++++++---- sled-agent/src/storage_monitor.rs | 7 ++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index e6736f6ea6..f4a665c098 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -74,8 +74,7 @@ pub async fn spawn_all_longrunning_tasks( let underlay_available_tx = spawn_storage_monitor(log, storage_manager.clone()); - // TODO: Does this need to run inside tokio::task::spawn_blocking? - let hardware_manager = spawn_hardware_manager(log, sled_mode); + let hardware_manager = spawn_hardware_manager(log, sled_mode).await; // Start monitoring for hardware changes let (sled_agent_started_tx, service_manager_ready_tx) = @@ -148,7 +147,7 @@ fn spawn_storage_monitor( underlay_available_tx } -fn spawn_hardware_manager( +async fn spawn_hardware_manager( log: &Logger, sled_mode: SledMode, ) -> HardwareManager { @@ -161,7 +160,12 @@ fn spawn_hardware_manager( // There are pros and cons to both methods, but the reason to mention it here is that // the handle in this case is the `HardwareManager` itself. info!(log, "Starting HardwareManager"; "sled_mode" => ?sled_mode); - HardwareManager::new(log, sled_mode).unwrap() + let log = log.clone(); + tokio::task::spawn_blocking(move || { + HardwareManager::new(&log, sled_mode).unwrap() + }) + .await + .unwrap() } fn spawn_hardware_monitor( diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 3500803164..4f4af3b59e 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -100,7 +100,12 @@ impl StorageMonitor { res = self.nexus_notifications.next(), if !self.nexus_notifications.is_empty() => { - info!(self.log, "Nexus notification complete: {:?}", res); + match res { + Some(Ok(s)) => { + info!(self.log, "Nexus notification complete: {s}"); + } + e => error!(self.log, "Nexus notification error: {e:?}") + } } resources = self.storage_manager.wait_for_changes() => { info!( From 3998708ef6d3ec50c84c37edbaff0663f178ef23 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 9 Nov 2023 20:18:41 +0000 Subject: [PATCH 58/66] more review fixes --- sled-agent/src/bootstrap/server.rs | 1 + sled-storage/src/keyfile.rs | 23 ++++++++++- sled-storage/src/manager.rs | 62 +++++++++++++++++++----------- 3 files changed, 62 insertions(+), 24 deletions(-) diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 0a055d13cc..68d7aedf02 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -348,6 +348,7 @@ impl From for StartError { } } +#[allow(clippy::too_many_arguments)] async fn start_sled_agent( config: &SledConfig, request: &StartSledAgentRequest, diff --git a/sled-storage/src/keyfile.rs b/sled-storage/src/keyfile.rs index 105092c99e..48e5d9a528 100644 --- a/sled-storage/src/keyfile.rs +++ b/sled-storage/src/keyfile.rs @@ -5,7 +5,7 @@ //! Key file support for ZFS dataset encryption use illumos_utils::zfs::Keypath; -use slog::{info, Logger}; +use slog::{error, info, Logger}; use tokio::fs::{remove_file, File}; use tokio::io::{AsyncSeekExt, AsyncWriteExt, SeekFrom}; @@ -18,6 +18,7 @@ pub struct KeyFile { path: Keypath, file: File, log: Logger, + zero_and_unlink_called: bool, } impl KeyFile { @@ -34,7 +35,12 @@ impl KeyFile { .await?; file.write_all(key).await?; info!(log, "Created keyfile {}", path); - Ok(KeyFile { path, file, log: log.clone() }) + Ok(KeyFile { + path, + file, + log: log.clone(), + zero_and_unlink_called: false, + }) } /// These keyfiles live on a tmpfs and we zero the file so the data doesn't @@ -43,6 +49,7 @@ impl KeyFile { /// It'd be nice to `impl Drop for `KeyFile` and then call `zero` /// from within the drop handler, but async `Drop` isn't supported. pub async fn zero_and_unlink(&mut self) -> std::io::Result<()> { + self.zero_and_unlink_called = true; let zeroes = [0u8; 32]; let _ = self.file.seek(SeekFrom::Start(0)).await?; self.file.write_all(&zeroes).await?; @@ -55,3 +62,15 @@ impl KeyFile { &self.path } } + +impl Drop for KeyFile { + fn drop(&mut self) { + if !self.zero_and_unlink_called { + error!( + self.log, + "Failed to call zero_and_unlink for keyfile"; + "path" => %self.path + ); + } + } +} diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 8b121bc467..e69034e396 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -208,7 +208,7 @@ impl FakeStorageManager { ) } - /// Run the main receive loop of the `StorageManager` + /// Run the main receive loop of the `FakeStorageManager` /// /// This should be spawned into a tokio task pub async fn run(mut self) { @@ -253,11 +253,14 @@ impl FakeStorageManager { pub struct StorageManager { log: Logger, state: StorageManagerState, + // Used to find the capacity of the channel for tracking purposes + tx: mpsc::Sender, rx: mpsc::Receiver, resources: StorageResources, queued_u2_drives: HashSet, key_requester: StorageKeyRequester, resource_updates: watch::Sender, + last_logged_capacity: usize, } impl StorageManager { @@ -272,11 +275,13 @@ impl StorageManager { StorageManager { log: log.new(o!("component" => "StorageManager")), state: StorageManagerState::WaitingForKeyManager, + tx: tx.clone(), rx, resources, queued_u2_drives: HashSet::new(), key_requester, resource_updates: update_tx, + last_logged_capacity: 0, }, StorageHandle { tx, resource_updates: update_rx }, ) @@ -292,16 +297,8 @@ impl StorageManager { interval.set_missed_tick_behavior(MissedTickBehavior::Delay); tokio::select! 
{ res = self.step() => { - match res { - Some(Ok(())) => (), - Some(Err(e)) => warn!(self.log, "{e}"), - None => { - info!( - self.log, - "Shutting down StorageManager task: no handles." - ); - return; - } + if let Err(e) = res { + warn!(self.log, "{e}"); } } _ = interval.tick(), @@ -321,17 +318,38 @@ impl StorageManager { /// /// Return `None` if the sender side has disappeared and the task should /// shutdown. - pub async fn step(&mut self) -> Option> { - let Some(req) = self.rx.recv().await else { - return None; - }; + pub async fn step(&mut self) -> Result<(), Error> { + const CAPACITY_LOG_THRESHOLD: usize = 10; + // We check the capacity and log it every time it changes by at least 10 + // entries in either direction. + let current = self.tx.capacity(); + if self.last_logged_capacity.saturating_sub(current) + >= CAPACITY_LOG_THRESHOLD + { + info!( + self.log, + "Channel capacity decreased"; + "previous" => ?self.last_logged_capacity, + "current" => ?current + ); + self.last_logged_capacity = current; + } else if current.saturating_sub(self.last_logged_capacity) + >= CAPACITY_LOG_THRESHOLD + { + info!( + self.log, + "Channel capacity increased"; + "previous" => ?self.last_logged_capacity, + "current" => ?current + ); + self.last_logged_capacity = current; + } + // The sending side never disappears because we hold a copy + let req = self.rx.recv().await.unwrap(); info!(self.log, "Received {:?}", req); let should_send_updates = match req { StorageRequest::AddDisk(raw_disk) => { - match self.add_disk(raw_disk).await { - Ok(is_new) => is_new, - Err(e) => return Some(Err(e)), - } + self.add_disk(raw_disk).await? } StorageRequest::RemoveDisk(raw_disk) => { self.remove_disk(raw_disk).await @@ -368,7 +386,7 @@ impl StorageManager { let _ = self.resource_updates.send_replace(self.resources.clone()); } - Some(Ok(())) + Ok(()) } // Loop through all queued disks inserting them into [`StorageResources`] @@ -782,7 +800,7 @@ mod tests { let dir = tempdir().unwrap(); let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); handle.upsert_disk(disk).await; - manager.step().await.unwrap().unwrap(); + manager.step().await.unwrap(); // We can't wait for a reply through the handle as the storage manager task // isn't actually running. We just check the resources directly. @@ -795,7 +813,7 @@ mod tests { // Now inform the storage manager that the key manager is ready // The queued disk should not be added due to the error handle.key_manager_ready().await; - manager.step().await.unwrap().unwrap(); + manager.step().await.unwrap(); assert!(manager.resources.all_u2_zpools().is_empty()); // Manually simulating a timer tick to add queued disks should also From 9b5816afde41362b24d53e276df568af3532bf2f Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 9 Nov 2023 21:02:35 +0000 Subject: [PATCH 59/66] Some more review updates --- sled-agent/src/dump_setup.rs | 2 +- sled-agent/src/storage_monitor.rs | 10 +++++----- sled-storage/src/manager.rs | 22 +++++++++++++++------- sled-storage/src/resources.rs | 21 ++++++++++++++------- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/sled-agent/src/dump_setup.rs b/sled-agent/src/dump_setup.rs index 50bbda44b4..e675e6e12d 100644 --- a/sled-agent/src/dump_setup.rs +++ b/sled-agent/src/dump_setup.rs @@ -100,7 +100,7 @@ const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); impl DumpSetup { pub(crate) async fn update_dumpdev_setup( &self, - disks: &Arc>, + disks: &BTreeMap, ) { let log = &self.log; let mut m2_dump_slices = Vec::new(); diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 4f4af3b59e..3d49f44815 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -170,7 +170,7 @@ impl StorageMonitor { self.add_zpool_notify(pool, put).await; } } - self.dump_setup.update_dumpdev_setup(&updated_resources.disks).await; + self.dump_setup.update_dumpdev_setup(&updated_resources.disks()).await; // Save the updated `StorageResources` self.storage_resources = updated_resources; @@ -329,8 +329,8 @@ fn compute_resource_diffs( // Diff the existing resources with the update to see what has changed // This loop finds disks and pools that were modified or deleted - for (disk_id, (disk, pool)) in current.disks.iter() { - match updated.disks.get(disk_id) { + for (disk_id, (disk, pool)) in current.disks().iter() { + match updated.disks().get(disk_id) { Some((updated_disk, updated_pool)) => { if disk != updated_disk { disk_puts.push(PhysicalDiskPutRequest { @@ -356,8 +356,8 @@ fn compute_resource_diffs( // Diff the existing resources with the update to see what has changed // This loop finds new disks and pools - for (disk_id, (updated_disk, updated_pool)) in updated.disks.iter() { - if !current.disks.contains_key(disk_id) { + for (disk_id, (updated_disk, updated_pool)) in updated.disks().iter() { + if !current.disks().contains_key(disk_id) { disk_puts.push(PhysicalDiskPutRequest { sled_id: *sled_id, model: disk_id.model.clone(), diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index e69034e396..7b48610e15 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -351,9 +351,7 @@ impl StorageManager { StorageRequest::AddDisk(raw_disk) => { self.add_disk(raw_disk).await? } - StorageRequest::RemoveDisk(raw_disk) => { - self.remove_disk(raw_disk).await - } + StorageRequest::RemoveDisk(raw_disk) => self.remove_disk(raw_disk), StorageRequest::DisksChanged(raw_disks) => { self.ensure_using_exactly_these_disks(raw_disks).await } @@ -433,7 +431,17 @@ impl StorageManager { send_updates } - // Add a disk to `StorageResources` if it is new and return Ok(true) if so + // Add a disk to `StorageResources` if it is new, + // updated, or its pool has been updated as determined by + // [`$crate::resources::StorageResources::insert_disk`] and we decide not to + // queue the disk for later addition. If the disk was inserted to resources + // return `Ok(true)`. + // + // In case the disk is queued, it wasn't inserted into `StorageResources` + // for another reason, or we have already consumed and logged an error + // return `Ok(false). + // + // In all other cases return an Error. 
async fn add_disk(&mut self, raw_disk: RawDisk) -> Result { match raw_disk.variant() { DiskVariant::U2 => self.add_u2_disk(raw_disk).await, @@ -486,7 +494,7 @@ impl StorageManager { } // Delete a real disk and return `true` if the disk was actually removed - async fn remove_disk(&mut self, raw_disk: RawDisk) -> bool { + fn remove_disk(&mut self, raw_disk: RawDisk) -> bool { // If the disk is a U.2, we want to first delete it from any queued disks let _ = self.queued_u2_drives.remove(&raw_disk); self.resources.remove_disk(raw_disk.identity()) @@ -512,7 +520,7 @@ impl StorageManager { // Find all existing disks not in the current set let to_remove: Vec = self .resources - .disks + .disks() .keys() .filter_map(|id| { if !all_ids.contains(id) { @@ -555,7 +563,7 @@ impl StorageManager { info!(self.log, "add_dataset: {:?}", request); if !self .resources - .disks + .disks() .values() .any(|(_, pool)| &pool.name == request.dataset_name.pool()) { diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index f3444ac798..93f7f0793c 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -34,21 +34,28 @@ const ZONE_BUNDLE_DIRECTORY: &str = "zone"; /// inside the `StorageManager` task if there are any outstanding copies. /// Therefore, we only pay the cost to update infrequently, and no locks are /// required by callers when operating on cloned data. The only contention here -/// is for the refrence counters of the internal Arcs when `StorageResources` +/// is for the reference counters of the internal Arcs when `StorageResources` /// gets cloned or dropped. #[derive(Debug, Clone, Default, PartialEq, Eq)] pub struct StorageResources { // All disks, real and synthetic, being managed by this sled - pub disks: Arc>, + disks: Arc>, } impl StorageResources { + /// Return a reference to the current snapshot of disks + pub fn disks(&self) -> &BTreeMap { + &self.disks + } + /// Insert a disk and its zpool /// - /// Return true if data was changed, false otherwise - /// - /// This really should not be used outside this crate, except for testing - pub fn insert_disk(&mut self, disk: Disk) -> Result { + /// If the disk passed in is new or modified, or its pool size or pool name + /// changed, then insert the changed values and return `true`. Otherwise, + /// do not insert anything and return false. For instance, if only the pool + /// health changes, because it is not one of the checked values, we will not + /// insert the update and will return `false`. + pub(crate) fn insert_disk(&mut self, disk: Disk) -> Result { let disk_id = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); let zpool = Pool::new(zpool_name, disk_id.clone())?; @@ -130,6 +137,7 @@ impl StorageResources { } None } + /// Returns all M.2 zpools pub fn all_m2_zpools(&self) -> Vec { self.all_zpools(DiskVariant::M2) @@ -159,7 +167,6 @@ impl StorageResources { pub fn get_all_zpools(&self) -> Vec<(ZpoolName, DiskVariant)> { self.disks .values() - .cloned() .map(|(disk, _)| (disk.zpool_name().clone(), disk.variant())) .collect() } From 34fe6cce485b8c4765c2e2f410a1b15779eed0d0 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 9 Nov 2023 21:32:08 +0000 Subject: [PATCH 60/66] Review fixes --- sled-storage/src/manager.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 7b48610e15..243f154037 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -53,7 +53,7 @@ const QUEUE_SIZE: usize = 256; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum StorageManagerState { WaitingForKeyManager, - QueuingDisks, + QueueingDisks, Normal, } @@ -72,7 +72,7 @@ enum StorageRequest { NewFilesystem(NewFilesystemRequest), KeyManagerReady, /// This will always grab the latest state after any new updates, as it - /// serializes through the `StorageManager` task. + /// serializes through the `StorageManager` task after all prior requests. /// This serialization is particularly useful for tests. GetLatestResources(oneshot::Sender), @@ -302,7 +302,7 @@ impl StorageManager { } } _ = interval.tick(), - if self.state == StorageManagerState::QueuingDisks => + if self.state == StorageManagerState::QueueingDisks => { if self.add_queued_disks().await { let _ = self.resource_updates.send_replace(self.resources.clone()); @@ -404,7 +404,7 @@ impl StorageManager { let queued = std::mem::take(&mut self.queued_u2_drives); let mut iter = queued.into_iter(); while let Some(disk) = iter.next() { - if self.state == StorageManagerState::QueuingDisks { + if self.state == StorageManagerState::QueueingDisks { // We hit a transient error in a prior iteration. saved.insert(disk); } else { @@ -467,7 +467,7 @@ impl StorageManager { "disk_id" => ?raw_disk.identity() ); self.queued_u2_drives.insert(raw_disk); - self.state = StorageManagerState::QueuingDisks; + self.state = StorageManagerState::QueueingDisks; Ok(false) } Err(err) => { @@ -692,7 +692,7 @@ mod tests { // Check other non-normal stages and ensure disk gets queued manager.queued_u2_drives.clear(); - manager.state = StorageManagerState::QueuingDisks; + manager.state = StorageManagerState::QueueingDisks; manager.add_u2_disk(raw_disk.clone()).await.unwrap(); assert!(manager.resources.all_u2_zpools().is_empty()); assert_eq!(manager.queued_u2_drives, HashSet::from([raw_disk])); From 7884213840a27055b19cba46083d99f3ba42fdc0 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 9 Nov 2023 21:39:30 +0000 Subject: [PATCH 61/66] USE_MOCKS only exists during testing now --- illumos-utils/src/lib.rs | 3 ++- sled-storage/Cargo.toml | 2 +- sled-storage/src/manager.rs | 10 +++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/illumos-utils/src/lib.rs b/illumos-utils/src/lib.rs index 3b696d178b..1faa4c5c37 100644 --- a/illumos-utils/src/lib.rs +++ b/illumos-utils/src/lib.rs @@ -112,7 +112,7 @@ mod inner { } // Due to feature unification, the `testing` feature is enabled when some tests -// don't actually want to use it. We allow them to opt out of the use of the +// don't actually want to use it. We allow them to opt out of the use of the // free function here. We also explicitly opt-in where mocks are used. // // Note that this only works if the tests that use mocks and those that don't @@ -120,6 +120,7 @@ mod inner { // so there is no problem currently. // // We can remove all this when we get rid of the mocks. 
+#[cfg(any(test, feature = "testing"))] pub static USE_MOCKS: AtomicBool = AtomicBool::new(false); pub fn execute( diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index 82ab206a8e..cb3a790631 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -24,7 +24,7 @@ uuid.workspace = true omicron-workspace-hack.workspace = true [dev-dependencies] -illumos-utils = { workspace = true, features = ["tmp_keypath"] } +illumos-utils = { workspace = true, features = ["tmp_keypath", "testing"] } omicron-test-utils.workspace = true camino-tempfile.workspace = true diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 243f154037..f7f9238a56 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -922,7 +922,7 @@ mod tests { let resources = handle.wait_for_changes().await; let expected: HashSet<_> = disks.iter().take(3).map(|d| d.identity()).collect(); - let actual: HashSet<_> = resources.disks.keys().collect(); + let actual: HashSet<_> = resources.disks().keys().collect(); assert_eq!(expected, actual); // Add first three disks after the initial one. The returned resources @@ -935,7 +935,7 @@ mod tests { let resources = handle.wait_for_changes().await; let expected: HashSet<_> = disks.iter().skip(1).take(3).map(|d| d.identity()).collect(); - let actual: HashSet<_> = resources.disks.keys().collect(); + let actual: HashSet<_> = resources.disks().keys().collect(); assert_eq!(expected, actual); // Ensure the same set of disks and make sure no change occurs @@ -958,7 +958,7 @@ mod tests { let resources = handle.wait_for_changes().await; let expected: HashSet<_> = disks.iter().skip(4).take(5).map(|d| d.identity()).collect(); - let actual: HashSet<_> = resources.disks.keys().collect(); + let actual: HashSet<_> = resources.disks().keys().collect(); assert_eq!(expected, actual); // Finally, change the zpool backing of the 5th disk to be that of the 10th @@ -980,10 +980,10 @@ mod tests { let resources = handle.wait_for_changes().await; // Ensure the one modified disk changed as we expected - assert_eq!(5, resources.disks.len()); + assert_eq!(5, resources.disks().len()); for raw_disk in expected { let (disk, pool) = - resources.disks.get(raw_disk.identity()).unwrap(); + resources.disks().get(raw_disk.identity()).unwrap(); assert_eq!(disk.zpool_name(), raw_disk.zpool_name()); assert_eq!(&pool.name, disk.zpool_name()); assert_eq!(raw_disk.identity(), &pool.parent); From 49f26561fef0aba95571dccfc4ea52e023d57be2 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 9 Nov 2023 22:31:41 +0000 Subject: [PATCH 62/66] remove unnecessary ref --- sled-agent/src/storage_monitor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 3d49f44815..f552fdfd86 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -170,7 +170,7 @@ impl StorageMonitor { self.add_zpool_notify(pool, put).await; } } - self.dump_setup.update_dumpdev_setup(&updated_resources.disks()).await; + self.dump_setup.update_dumpdev_setup(updated_resources.disks()).await; // Save the updated `StorageResources` self.storage_resources = updated_resources; From 35e252bde0ddf79f98a44687e812ff64f1587452 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Thu, 9 Nov 2023 23:33:40 +0000 Subject: [PATCH 63/66] remove autogenerated file --- nexus/preprocessed_configs/config.xml | 41 --------------------------- 1 file changed, 41 deletions(-) delete mode 100644 nexus/preprocessed_configs/config.xml diff --git a/nexus/preprocessed_configs/config.xml b/nexus/preprocessed_configs/config.xml deleted file mode 100644 index 9b13f12aea..0000000000 --- a/nexus/preprocessed_configs/config.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - - trace - true - - - 8123 - 9000 - 9004 - - ./ - - true - - - - - - - ::/0 - - - default - default - 1 - - - - - - - - - - - \ No newline at end of file From 11f1b3695b4954e94ce40b9680e8e863b735c56f Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 10 Nov 2023 06:15:49 +0000 Subject: [PATCH 64/66] More review cleanup --- sled-storage/src/manager.rs | 113 ++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index f7f9238a56..d5df71c0b2 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -57,6 +57,31 @@ pub enum StorageManagerState { Normal, } +enum AddDiskResult { + DiskInserted, + DiskAlreadyInserted, + DiskQueued, +} + +impl AddDiskResult { + fn disk_inserted(&self) -> bool { + match self { + AddDiskResult::DiskInserted => true, + _ => false, + } + } +} + +impl From for AddDiskResult { + fn from(value: bool) -> Self { + if value { + AddDiskResult::DiskInserted + } else { + AddDiskResult::DiskAlreadyInserted + } + } +} + #[derive(Debug)] struct NewFilesystemRequest { dataset_id: Uuid, @@ -281,7 +306,7 @@ impl StorageManager { queued_u2_drives: HashSet::new(), key_requester, resource_updates: update_tx, - last_logged_capacity: 0, + last_logged_capacity: QUEUE_SIZE, }, StorageHandle { tx, resource_updates: update_rx }, ) @@ -349,7 +374,7 @@ impl StorageManager { info!(self.log, "Received {:?}", req); let should_send_updates = match req { StorageRequest::AddDisk(raw_disk) => { - self.add_disk(raw_disk).await? + self.add_disk(raw_disk).await?.disk_inserted() } StorageRequest::RemoveDisk(raw_disk) => self.remove_disk(raw_disk), StorageRequest::DisksChanged(raw_disks) => { @@ -394,55 +419,53 @@ impl StorageManager { // // Return true if updates should be sent to watchers, false otherwise async fn add_queued_disks(&mut self) -> bool { - info!(self.log, "Attempting to add queued disks"); + info!( + self.log, + "Attempting to add queued disks"; + "num_disks" => %self.queued_u2_drives.len() + ); self.state = StorageManagerState::Normal; let mut send_updates = false; // Disks that should be requeued. - let mut saved = HashSet::new(); - let queued = std::mem::take(&mut self.queued_u2_drives); - let mut iter = queued.into_iter(); - while let Some(disk) = iter.next() { + let queued = self.queued_u2_drives.clone(); + let mut to_dequeue = HashSet::new(); + for disk in queued.iter() { if self.state == StorageManagerState::QueueingDisks { // We hit a transient error in a prior iteration. - saved.insert(disk); + break; } else { - // Try to add the disk. If there was a transient error the disk will - // have been requeued. If there was a permanent error, it will have been - // dropped. If there is an another unexpected error, we will handle it and - // requeue ourselves. 
match self.add_u2_disk(disk.clone()).await { - Err(err) => { - warn!( - self.log, - "Potentially transient error: {err}: requeuing disk"; - "disk_id" => ?disk.identity() - ); - saved.insert(disk); + Err(_) => { + // This is an unrecoverable error, so we don't queue the + // disk again. + to_dequeue.insert(disk); + } + Ok(AddDiskResult::DiskInserted) => { + send_updates = true; + to_dequeue.insert(disk); } - Ok(true) => send_updates = true, - Ok(false) => (), + Ok(AddDiskResult::DiskAlreadyInserted) => { + to_dequeue.insert(disk); + } + Ok(AddDiskResult::DiskQueued) => (), } } } - // Merge any requeued disks from transient errors with saved disks here - self.queued_u2_drives.extend(saved); + // Dequeue any inserted disks + self.queued_u2_drives.retain(|k| !to_dequeue.contains(k)); send_updates } // Add a disk to `StorageResources` if it is new, // updated, or its pool has been updated as determined by // [`$crate::resources::StorageResources::insert_disk`] and we decide not to - // queue the disk for later addition. If the disk was inserted to resources - // return `Ok(true)`. - // - // In case the disk is queued, it wasn't inserted into `StorageResources` - // for another reason, or we have already consumed and logged an error - // return `Ok(false). - // - // In all other cases return an Error. - async fn add_disk(&mut self, raw_disk: RawDisk) -> Result { + // queue the disk for later addition. + async fn add_disk( + &mut self, + raw_disk: RawDisk, + ) -> Result { match raw_disk.variant() { DiskVariant::U2 => self.add_u2_disk(raw_disk).await, DiskVariant::M2 => self.add_m2_disk(raw_disk).await, @@ -450,16 +473,19 @@ impl StorageManager { } // Add a U.2 disk to [`StorageResources`] or queue it to be added later - async fn add_u2_disk(&mut self, raw_disk: RawDisk) -> Result { + async fn add_u2_disk( + &mut self, + raw_disk: RawDisk, + ) -> Result { if self.state != StorageManagerState::Normal { self.queued_u2_drives.insert(raw_disk); - return Ok(false); + return Ok(AddDiskResult::DiskQueued); } match Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) .await { - Ok(disk) => self.resources.insert_disk(disk), + Ok(disk) => self.resources.insert_disk(disk).map(Into::into), Err(err @ DiskError::Dataset(DatasetError::KeyManager(_))) => { warn!( self.log, @@ -468,7 +494,7 @@ impl StorageManager { ); self.queued_u2_drives.insert(raw_disk); self.state = StorageManagerState::QueueingDisks; - Ok(false) + Ok(AddDiskResult::DiskQueued) } Err(err) => { error!( @@ -476,7 +502,7 @@ impl StorageManager { "Persistent error: {err}: not queueing disk"; "disk_id" => ?raw_disk.identity() ); - Ok(false) + Err(err.into()) } } } @@ -486,11 +512,14 @@ impl StorageManager { // // We never queue M.2 drives, as they don't rely on [`KeyManager`] based // encryption - async fn add_m2_disk(&mut self, raw_disk: RawDisk) -> Result { + async fn add_m2_disk( + &mut self, + raw_disk: RawDisk, + ) -> Result { let disk = Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) .await?; - self.resources.insert_disk(disk) + self.resources.insert_disk(disk).map(Into::into) } // Delete a real disk and return `true` if the disk was actually removed @@ -500,7 +529,7 @@ impl StorageManager { self.resources.remove_disk(raw_disk.identity()) } - // Find all disks to remove that are not in raw_disks and remove them Then + // Find all disks to remove that are not in raw_disks and remove them. Then // take the remaining disks and try to add them all. 
`StorageResources` will // inform us if anything changed, and if so we return true, otherwise we // return false. @@ -540,8 +569,8 @@ impl StorageManager { for raw_disk in raw_disks { let disk_id = raw_disk.identity().clone(); match self.add_disk(raw_disk).await { - Ok(true) => should_update = true, - Ok(false) => (), + Ok(AddDiskResult::DiskInserted) => should_update = true, + Ok(_) => (), Err(err) => { warn!( self.log, From cfc3ef7c732afdfbf476ac4a62b64471c697bfbb Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 14 Nov 2023 18:36:28 +0000 Subject: [PATCH 65/66] review fixes --- sled-agent/src/bootstrap/server.rs | 20 +++++-- sled-storage/src/manager.rs | 38 ++------------ sled-storage/src/resources.rs | 84 +++++++++++++++++------------- 3 files changed, 70 insertions(+), 72 deletions(-) diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 604baea55b..f4948de83b 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -565,14 +565,23 @@ impl Inner { underlay_available_tx, ) => { let request_id = request.body.id; - // Extract from an option to satisfy the borrow checker + + // Extract from options to satisfy the borrow checker. + // It is not possible for `start_sled_agent` to be cancelled + // or fail in a safe, restartable manner. Therefore, for now, + // we explicitly unwrap here, and panic on error below. + // + // See https://github.com/oxidecomputer/omicron/issues/4494 let sled_agent_started_tx = sled_agent_started_tx.take().unwrap(); + let underlay_available_tx = + underlay_available_tx.take().unwrap(); + let response = match start_sled_agent( &self.config, request, self.long_running_task_handles.clone(), - underlay_available_tx.take().unwrap(), + underlay_available_tx, self.service_manager.clone(), &self.ddm_admin_localhost_client, &self.base_log, @@ -591,7 +600,12 @@ impl Inner { self.state = SledAgentState::ServerStarted(server); Ok(SledAgentResponse { id: request_id }) } - Err(err) => Err(format!("{err:#}")), + Err(err) => { + // This error is unrecoverable, and if returned we'd + // end up in maintenance mode anyway. 
+ error!(log, "Failed to start sled agent: {err:#}"); + panic!("Failed to start sled agent"); + } }; _ = response_tx.send(response); } diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index d5df71c0b2..50b1c44148 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -9,7 +9,7 @@ use std::collections::HashSet; use crate::dataset::{DatasetError, DatasetName}; use crate::disk::{Disk, DiskError, RawDisk}; use crate::error::Error; -use crate::resources::StorageResources; +use crate::resources::{AddDiskResult, StorageResources}; use camino::Utf8PathBuf; use illumos_utils::zfs::{Mountpoint, Zfs}; use illumos_utils::zpool::ZpoolName; @@ -57,31 +57,6 @@ pub enum StorageManagerState { Normal, } -enum AddDiskResult { - DiskInserted, - DiskAlreadyInserted, - DiskQueued, -} - -impl AddDiskResult { - fn disk_inserted(&self) -> bool { - match self { - AddDiskResult::DiskInserted => true, - _ => false, - } - } -} - -impl From for AddDiskResult { - fn from(value: bool) -> Self { - if value { - AddDiskResult::DiskInserted - } else { - AddDiskResult::DiskAlreadyInserted - } - } -} - #[derive(Debug)] struct NewFilesystemRequest { dataset_id: Uuid, @@ -240,7 +215,7 @@ impl FakeStorageManager { loop { match self.rx.recv().await { Some(StorageRequest::AddDisk(raw_disk)) => { - if self.add_disk(raw_disk) { + if self.add_disk(raw_disk).disk_inserted() { self.resource_updates .send_replace(self.resources.clone()); } @@ -257,7 +232,7 @@ impl FakeStorageManager { } // Add a disk to `StorageResources` if it is new and return true if so - fn add_disk(&mut self, raw_disk: RawDisk) -> bool { + fn add_disk(&mut self, raw_disk: RawDisk) -> AddDiskResult { let disk = match raw_disk { RawDisk::Real(_) => { panic!( @@ -340,9 +315,6 @@ impl StorageManager { /// Process the next event /// /// This is useful for testing/debugging - /// - /// Return `None` if the sender side has disappeared and the task should - /// shutdown. pub async fn step(&mut self) -> Result<(), Error> { const CAPACITY_LOG_THRESHOLD: usize = 10; // We check the capacity and log it every time it changes by at least 10 @@ -485,7 +457,7 @@ impl StorageManager { match Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) .await { - Ok(disk) => self.resources.insert_disk(disk).map(Into::into), + Ok(disk) => self.resources.insert_disk(disk), Err(err @ DiskError::Dataset(DatasetError::KeyManager(_))) => { warn!( self.log, @@ -519,7 +491,7 @@ impl StorageManager { let disk = Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) .await?; - self.resources.insert_disk(disk).map(Into::into) + self.resources.insert_disk(disk) } // Delete a real disk and return `true` if the disk was actually removed diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 93f7f0793c..c1f460dc92 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -9,6 +9,7 @@ use crate::disk::Disk; use crate::error::Error; use crate::pool::Pool; use camino::Utf8PathBuf; +use cfg_if::cfg_if; use illumos_utils::zpool::ZpoolName; use omicron_common::disk::DiskIdentity; use sled_hardware::DiskVariant; @@ -21,6 +22,21 @@ const BUNDLE_DIRECTORY: &str = "bundle"; // The directory for zone bundles. 
const ZONE_BUNDLE_DIRECTORY: &str = "zone"; +pub enum AddDiskResult { + DiskInserted, + DiskAlreadyInserted, + DiskQueued, +} + +impl AddDiskResult { + pub fn disk_inserted(&self) -> bool { + match self { + AddDiskResult::DiskInserted => true, + _ => false, + } + } +} + /// Storage related resources: disks and zpools /// /// This state is internal to the [`crate::manager::StorageManager`] task. Clones @@ -50,12 +66,16 @@ impl StorageResources { /// Insert a disk and its zpool /// - /// If the disk passed in is new or modified, or its pool size or pool name - /// changed, then insert the changed values and return `true`. Otherwise, - /// do not insert anything and return false. For instance, if only the pool - /// health changes, because it is not one of the checked values, we will not - /// insert the update and will return `false`. - pub(crate) fn insert_disk(&mut self, disk: Disk) -> Result { + /// If the disk passed in is new or modified, or its pool size or pool + /// name changed, then insert the changed values and return `DiskInserted`. + /// Otherwise, do not insert anything and return `DiskAlreadyInserted`. + /// For instance, if only the pool health changes, because it is not one + /// of the checked values, we will not insert the update and will return + /// `DiskAlreadyInserted`. + pub(crate) fn insert_disk( + &mut self, + disk: Disk, + ) -> Result { let disk_id = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); let zpool = Pool::new(zpool_name, disk_id.clone())?; @@ -64,63 +84,55 @@ impl StorageResources { && stored_pool.info.size() == zpool.info.size() && stored_pool.name == zpool.name { - return Ok(false); + return Ok(AddDiskResult::DiskAlreadyInserted); } } // Either the disk or zpool changed Arc::make_mut(&mut self.disks).insert(disk_id, (disk, zpool)); - Ok(true) + Ok(AddDiskResult::DiskInserted) } /// Insert a disk while creating a fake pool /// This is a workaround for current mock based testing strategies /// in the sled-agent. - /// - /// Return true if data was changed, false otherwise #[cfg(feature = "testing")] - pub fn insert_fake_disk(&mut self, disk: Disk) -> bool { + pub fn insert_fake_disk(&mut self, disk: Disk) -> AddDiskResult { let disk_id = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); let zpool = Pool::new_with_fake_info(zpool_name, disk_id.clone()); if self.disks.contains_key(&disk_id) { - return false; + return AddDiskResult::DiskAlreadyInserted; } // Either the disk or zpool changed Arc::make_mut(&mut self.disks).insert(disk_id, (disk, zpool)); - true + AddDiskResult::DiskInserted } /// Delete a disk and its zpool /// /// Return true, if data was changed, false otherwise /// - /// Note: We never allow removal of synthetic disks as they are only added - /// once. - #[cfg(not(test))] + /// Note: We never allow removal of synthetic disks in production as they + /// are only added once. pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) -> bool { - if let Some((disk, _)) = self.disks.get(id) { - if disk.is_synthetic() { - return false; - } - } else { + let Some((disk, _)) = self.disks.get(id) else { return false; + }; + + cfg_if! { + if #[cfg(test)] { + // For testing purposes, we allow synthetic disks to be deleted. + // Silence an unused variable warning. + _ = disk; + } else { + // In production, we disallow removal of synthetic disks as they + // are only added once. 
+                if disk.is_synthetic() {
+                    return false;
+                }
+            }
+        }
-        // Safe to unwrap as we just checked the key existed above
-        Arc::make_mut(&mut self.disks).remove(id).unwrap();
-        true
-    }
-    /// Delete a real disk and its zpool
-    ///
-    /// Return true, if data was changed, false otherwise
-    ///
-    /// Note: For testing purposes of this crate, we allow synthetic disks to
-    /// be deleted.
-    #[cfg(test)]
-    pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) -> bool {
-        if !self.disks.contains_key(id) {
-            return false;
-        }
         // Safe to unwrap as we just checked the key existed above
         Arc::make_mut(&mut self.disks).remove(id).unwrap();
         true

From 6b7845c825c5be415f9d4f36167419e614729cae Mon Sep 17 00:00:00 2001
From: "Andrew J. Stone"
Date: Tue, 14 Nov 2023 18:48:27 +0000
Subject: [PATCH 66/66] fix nit

---
 sled-storage/src/dataset.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs
index 503ccb053a..a2878af7f6 100644
--- a/sled-storage/src/dataset.rs
+++ b/sled-storage/src/dataset.rs
@@ -324,7 +324,10 @@ pub(crate) async fn ensure_zpool_has_datasets(
                 Zfs::destroy_dataset(name).or_else(|err| {
                     // If we can't find the dataset, that's fine -- it might
                     // not have been formatted yet.
-                    if let DestroyDatasetErrorVariant::NotFound = err.err {
+                    if matches!(
+                        err.err,
+                        DestroyDatasetErrorVariant::NotFound
+                    ) {
                         Ok(())
                     } else {
                         Err(err)
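The remove_disk rewrite in PATCH 65 above collapses the duplicated #[cfg(test)] / #[cfg(not(test))] functions into one body with the cfg-if crate. A minimal sketch of that macro, assuming cfg-if is declared as a dependency; the function and its policy are illustrative:

    // `cfg_if!` expands to ordinary #[cfg(...)]-gated statements, so only one
    // arm is ever compiled into the function.
    fn removal_allowed(synthetic: bool) -> bool {
        cfg_if::cfg_if! {
            if #[cfg(test)] {
                // Tests may remove synthetic disks; silence the unused binding.
                let _ = synthetic;
                return true;
            } else {
                // Outside of tests, a synthetic disk is never removed.
                return !synthetic;
            }
        }
    }

    fn main() {
        // In a normal (non-test) build only the production arm exists.
        println!("removal allowed: {}", removal_allowed(true));
    }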