From 1e61ea90e52a5f6b909b61ea8b650088971a109a Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 11 Oct 2023 22:37:36 +0000 Subject: [PATCH] wip --- sled-agent/src/bootstrap/secret_retriever.rs | 2 +- sled-agent/src/dump_setup.rs | 8 +- sled-agent/src/long_running_tasks.rs | 22 +++ sled-agent/src/services.rs | 80 ++++++----- sled-agent/src/storage_monitor.rs | 25 +++- sled-agent/src/zone_bundle.rs | 133 ++++++++----------- sled-storage/src/resources.rs | 4 +- 7 files changed, 147 insertions(+), 127 deletions(-) diff --git a/sled-agent/src/bootstrap/secret_retriever.rs b/sled-agent/src/bootstrap/secret_retriever.rs index 5cae06310c..d6b542378d 100644 --- a/sled-agent/src/bootstrap/secret_retriever.rs +++ b/sled-agent/src/bootstrap/secret_retriever.rs @@ -92,7 +92,7 @@ impl LrtqOrHardcodedSecretRetriever { /// /// The local retriever only returns keys for epoch 0 #[derive(Debug)] -struct HardcodedSecretRetriever {} +pub struct HardcodedSecretRetriever {} #[async_trait] impl SecretRetriever for HardcodedSecretRetriever { diff --git a/sled-agent/src/dump_setup.rs b/sled-agent/src/dump_setup.rs index ea60998955..50bbda44b4 100644 --- a/sled-agent/src/dump_setup.rs +++ b/sled-agent/src/dump_setup.rs @@ -7,13 +7,13 @@ use omicron_common::disk::DiskIdentity; use sled_hardware::DiskVariant; use sled_storage::dataset::{CRASH_DATASET, DUMP_DATASET}; use sled_storage::disk::Disk; +use sled_storage::pool::Pool; use slog::Logger; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashSet}; use std::ffi::OsString; use std::path::{Path, PathBuf}; use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH}; -use tokio::sync::MutexGuard; pub struct DumpSetup { worker: Arc>, @@ -100,13 +100,13 @@ const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); impl DumpSetup { pub(crate) async fn update_dumpdev_setup( &self, - disks: &mut MutexGuard<'_, HashMap>, + disks: &Arc>, ) { let log = &self.log; let mut m2_dump_slices = Vec::new(); let mut u2_debug_datasets = Vec::new(); let mut m2_core_datasets = Vec::new(); - for (_id, disk) in disks.iter() { + for (_id, (disk, _)) in disks.iter() { if disk.is_synthetic() { // We only setup dump devices on real disks continue; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index f322126714..714bd1e406 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -17,6 +17,7 @@ use crate::bootstrap::bootstore::{ }; use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; use crate::hardware_monitor::{HardwareMonitor, HardwareMonitorHandle}; +use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; @@ -39,6 +40,11 @@ pub struct LongRunningTaskHandles { /// for establishing zpools on disks and managing their datasets. pub storage_manager: StorageHandle, + /// A task which monitors for updates from the `StorageManager` and takes + /// actions based on those updates, such as informing Nexus and setting + /// up dump locations. 
+ pub storage_monitor: StorageMonitorHandle, + /// A mechanism for interacting with the hardware device tree pub hardware_manager: HardwareManager, @@ -63,6 +69,8 @@ pub async fn spawn_all_longrunning_tasks( let mut storage_manager = spawn_storage_manager(log, storage_key_requester.clone()); + let storage_monitor = spawn_storage_monitor(log, storage_manager.clone()); + // TODO: Does this need to run inside tokio::task::spawn_blocking? let hardware_manager = spawn_hardware_manager(log, sled_mode); @@ -87,6 +95,7 @@ pub async fn spawn_all_longrunning_tasks( LongRunningTaskHandles { storage_key_requester, storage_manager, + storage_monitor, hardware_manager, hardware_monitor, bootstore, @@ -115,6 +124,19 @@ fn spawn_storage_manager( handle } +fn spawn_storage_monitor( + log: &Logger, + storage_handle: StorageHandle, +) -> StorageMonitorHandle { + info!(log, "Starting StorageMonitor"); + let (mut storage_monitor, handle) = + StorageMonitor::new(log, storage_handle); + tokio::spawn(async move { + storage_monitor.run().await; + }); + handle +} + fn spawn_hardware_manager( log: &Logger, sled_mode: SledMode, diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index c22eae6baa..6aaf69f198 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -5,7 +5,7 @@ //! Sled-local service management. //! //! For controlling zone-based storage services, refer to -//! [sled_hardware:manager::StorageManager]. +//! [sled_storage:manager::StorageManager]. //! //! For controlling virtual machine instances, refer to //! [crate::instance_manager::InstanceManager]. @@ -2935,8 +2935,8 @@ impl ServiceManager { #[cfg(test)] mod test { use super::*; + use crate::bootstrap::secret_retriever::HardcodedSecretRetriever; use crate::params::{ServiceZoneService, ZoneType}; - use async_trait::async_trait; use illumos_utils::{ dladm::{ Etherstub, MockDladm, BOOTSTRAP_ETHERSTUB_NAME, @@ -2945,10 +2945,9 @@ mod test { svc, zone::MockZones, }; - use key_manager::{ - SecretRetriever, SecretRetrieverError, SecretState, VersionedIkm, - }; + use key_manager::KeyManager; use omicron_common::address::OXIMETER_PORT; + use sled_storage::manager::{StorageHandle, StorageManager}; use std::net::{Ipv6Addr, SocketAddrV6}; use std::os::unix::process::ExitStatusExt; use uuid::Uuid; @@ -3141,29 +3140,28 @@ mod test { } } - pub struct TestSecretRetriever {} - - #[async_trait] - impl SecretRetriever for TestSecretRetriever { - async fn get_latest( - &self, - ) -> Result { - let epoch = 0; - let salt = [0u8; 32]; - let secret = [0x1d; 32]; - - Ok(VersionedIkm::new(epoch, salt, &secret)) - } + // Spawn storage related tasks and return a handle to pass to both the `ServiceManager` + // and `ZoneBundler`. However, it is expected that this handle is not actually used + // as there are no provisioned zones or datasets. This is consistent with the use of + // `test_config.override_paths` below. 
+ async fn setup_storage(log: &Logger) -> StorageHandle { + let (mut key_manager, key_requester) = + KeyManager::new(log, HardcodedSecretRetriever {}); + let (mut manager, handle) = StorageManager::new(log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); - async fn get( - &self, - epoch: u64, - ) -> Result { - if epoch != 0 { - return Err(SecretRetrieverError::NoSuchEpoch(epoch)); - } - Ok(SecretState::Current(self.get_latest().await?)) - } + // Inform the storage manager that the secret retriever is ready We + // are using the HardcodedSecretRetriever, so no need to wait for RSS + // or anything to setup the LRTQ + handle.key_manager_ready().await; + handle } #[tokio::test] @@ -3174,10 +3172,10 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3188,7 +3186,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources, + storage_handle, zone_bundler, ); test_config.override_paths(&mgr); @@ -3222,10 +3220,10 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3236,7 +3234,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources, + storage_handle, zone_bundler, ); test_config.override_paths(&mgr); @@ -3275,10 +3273,10 @@ mod test { // First, spin up a ServiceManager, create a new service, and tear it // down. - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3289,7 +3287,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -3322,7 +3320,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -3358,10 +3356,10 @@ mod test { // First, spin up a ServiceManager, create a new service, and tear it // down. 
- let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3372,7 +3370,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -3410,7 +3408,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle, zone_bundler.clone(), ); test_config.override_paths(&mgr); diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index debd0d5e95..c48fd5cbfa 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -6,17 +6,16 @@ //! and dispatches them to other parst of the bootstrap agent and sled agent //! code. +use crate::dump_setup::DumpSetup; use crate::nexus::NexusClientWithResolver; use derive_more::From; use futures::stream::FuturesOrdered; use futures::FutureExt; use nexus_client::types::PhysicalDiskDeleteRequest; -use nexus_client::types::PhysicalDiskKind; use nexus_client::types::PhysicalDiskPutRequest; use nexus_client::types::ZpoolPutRequest; use omicron_common::api::external::ByteCount; use omicron_common::backoff; -use sled_storage::disk::Disk; use sled_storage::manager::StorageHandle; use sled_storage::pool::Pool; use sled_storage::resources::StorageResources; @@ -74,6 +73,9 @@ pub struct StorageMonitor { // A queue for sending nexus notifications in order nexus_notifications: FuturesOrdered, + + // Invokes dumpadm(8) and savecore(8) when new disks are encountered + dump_setup: DumpSetup, } impl StorageMonitor { @@ -83,6 +85,7 @@ impl StorageMonitor { ) -> (StorageMonitor, StorageMonitorHandle) { let (handle_tx, handle_rx) = mpsc::channel(QUEUE_SIZE); let storage_resources = StorageResources::default(); + let dump_setup = DumpSetup::new(&log); let log = log.new(o!("component" => "StorageMonitor")); ( StorageMonitor { @@ -92,6 +95,7 @@ impl StorageMonitor { storage_resources, underlay: None, nexus_notifications: FuturesOrdered::new(), + dump_setup, }, StorageMonitorHandle { tx: handle_tx }, ) @@ -129,6 +133,9 @@ impl StorageMonitor { let sled_id = underlay.sled_id; self.underlay = Some(underlay); self.notify_nexus_about_existing_resources(sled_id).await; + self.dump_setup + .update_dumpdev_setup(&self.storage_resources.disks) + .await; } } } @@ -162,6 +169,12 @@ impl StorageMonitor { &self.storage_resources, &updated_resources, ); + if nexus_updates.has_disk_updates() { + self.dump_setup + .update_dumpdev_setup(&self.storage_resources.disks) + .await; + } + for put in nexus_updates.disk_puts { self.physical_disk_notify(put.into()).await; } @@ -171,8 +184,6 @@ impl StorageMonitor { for (pool, put) in nexus_updates.zpool_puts { self.add_zpool_notify(pool, put).await; } - - // TODO: Update Dump Setup if any diffs } // Save the updated `StorageResources` self.storage_resources = updated_resources; @@ -291,6 +302,12 @@ struct NexusUpdates { zpool_puts: Vec<(Pool, ZpoolPutRequest)>, } +impl NexusUpdates { + fn has_disk_updates(&self) -> bool { + !self.disk_puts.is_empty() || !self.disk_deletes.is_empty() + } +} + fn compute_resource_diffs( log: &Logger, sled_id: &Uuid, diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index ea7481bd6d..55058ee23a 100644 --- a/sled-agent/src/zone_bundle.rs +++ 
b/sled-agent/src/zone_bundle.rs @@ -1764,7 +1764,6 @@ mod illumos_tests { use super::CleanupPeriod; use super::PriorityOrder; use super::StorageLimit; - use super::StorageResources; use super::Utf8Path; use super::Utf8PathBuf; use super::Uuid; @@ -1774,9 +1773,15 @@ mod illumos_tests { use super::ZoneBundleMetadata; use super::ZoneBundler; use super::ZFS; + use crate::bootstrap::secret_retriever::HardcodedSecretRetriever; use anyhow::Context; use chrono::TimeZone; use chrono::Utc; + use illumos_utils::zpool::{Zpool, ZpoolName}; + use key_manager::KeyManager; + use sled_storage::disk::RawDisk; + use sled_storage::disk::SyntheticDisk; + use sled_storage::manager::{StorageHandle, StorageManager}; use slog::Drain; use slog::Logger; use tokio::process::Command; @@ -1818,31 +1823,62 @@ mod illumos_tests { // system, that creates the directories implied by the `StorageResources` // expected disk structure. struct ResourceWrapper { - resources: StorageResources, + storage_handle: StorageHandle, + zpool_names: Vec, dirs: Vec, } + async fn setup_storage(log: &Logger) -> (StorageHandle, Vec) { + let (mut key_manager, key_requester) = + KeyManager::new(log, HardcodedSecretRetriever {}); + let (mut manager, handle) = StorageManager::new(log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Inform the storage manager that the secret retriever is ready We + // are using the HardcodedSecretRetriever, so no need to wait for RSS + // or anything to setup the LRTQ + handle.key_manager_ready().await; + + // Put the zpools under /rpool + let dir = + camino::Utf8PathBuf::from(format!("/rpool/{}", Uuid::new_v4())); + + let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); + let internal_disk: RawDisk = + SyntheticDisk::create_zpool(&dir, &internal_zpool_name).into(); + let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let external_disk: RawDisk = + SyntheticDisk::create_zpool(&dir, &external_zpool_name).into(); + handle.upsert_disk(internal_disk).await; + handle.upsert_disk(external_disk).await; + + (handle, vec![internal_zpool_name, external_zpool_name]) + } + impl ResourceWrapper { // Create new storage resources, and mount fake datasets at the required // locations. - async fn new() -> Self { - let resources = StorageResources::new_for_test(); - let dirs = resources.all_zone_bundle_directories().await; - for d in dirs.iter() { - let id = - d.components().nth(3).unwrap().as_str().parse().unwrap(); - create_test_dataset(&id, d).await.unwrap(); - } - Self { resources, dirs } + async fn new(log: Logger) -> Self { + // Spawn the storage related tasks required for testing and insert + // synthetic disks. 
+ let (storage_handle, zpool_names) = setup_storage(&log).await; + let resources = storage_handle.get_latest_resources().await; + let dirs = resources.all_zone_bundle_directories(); + Self { storage_handle, zpool_names, dirs } } } impl Drop for ResourceWrapper { fn drop(&mut self) { - for d in self.dirs.iter() { - let id = - d.components().nth(3).unwrap().as_str().parse().unwrap(); - remove_test_dataset(&id).unwrap(); + for name in &self.zpool_names { + Zpool::destroy(name).unwrap(); } } } @@ -1854,9 +1890,12 @@ mod illumos_tests { let log = Logger::root(drain, slog::o!("component" => "fake-cleanup-task")); let context = CleanupContext::default(); - let resource_wrapper = ResourceWrapper::new().await; - let bundler = - ZoneBundler::new(log, resource_wrapper.resources.clone(), context); + let resource_wrapper = ResourceWrapper::new(log.clone()).await; + let bundler = ZoneBundler::new( + log, + resource_wrapper.storage_handle.clone(), + context, + ); Ok(CleanupTestContext { resource_wrapper, context, bundler }) } @@ -1891,64 +1930,6 @@ mod illumos_tests { assert_eq!(context, new_context, "failed to update context"); } - // Quota applied to test datasets. - // - // This needs to be at least this big lest we get "out of space" errors when - // creating. Not sure where those come from, but could be ZFS overhead. - const TEST_QUOTA: u64 = 1024 * 32; - - async fn create_test_dataset( - id: &Uuid, - mountpoint: &Utf8PathBuf, - ) -> anyhow::Result<()> { - let output = Command::new("/usr/bin/pfexec") - .arg(ZFS) - .arg("create") - .arg("-o") - .arg(format!("quota={TEST_QUOTA}")) - .arg("-o") - .arg(format!("mountpoint={mountpoint}")) - .arg(format!("rpool/{id}")) - .output() - .await - .context("failed to spawn zfs create operation")?; - anyhow::ensure!( - output.status.success(), - "zfs create operation failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - // Make the path operable by the test code. - let output = Command::new("/usr/bin/pfexec") - .arg("chmod") - .arg("a+rw") - .arg(&mountpoint) - .output() - .await - .context("failed to spawn chmod operation")?; - anyhow::ensure!( - output.status.success(), - "chmod-ing the dataset failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - Ok(()) - } - - fn remove_test_dataset(id: &Uuid) -> anyhow::Result<()> { - let output = std::process::Command::new("/usr/bin/pfexec") - .arg(ZFS) - .arg("destroy") - .arg(format!("rpool/{id}")) - .output() - .context("failed to spawn zfs destroy operation")?; - anyhow::ensure!( - output.status.success(), - "zfs destroy operation failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - Ok(()) - } - async fn run_test_with_zfs_dataset(test: T) where T: FnOnce(CleanupTestContext) -> Fut, diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 0bdca5c19c..64136e756d 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -46,7 +46,9 @@ impl StorageResources { /// Insert a disk and its zpool /// /// Return true, if data was changed, false otherwise - pub(crate) fn insert_disk(&mut self, disk: Disk) -> Result { + /// + /// This really should not be used outside this crate, except for testing + pub fn insert_disk(&mut self, disk: Disk) -> Result { let disk_id = disk.identity().clone(); let zpool_name = disk.zpool_name().clone(); let zpool = Pool::new(zpool_name, disk_id.clone())?;
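The dump_setup.rs hunk replaces the locked HashMap argument of `update_dumpdev_setup` with a shared, tuple-valued BTreeMap keyed by disk identity (the generic parameters are stripped in the diff text above; the shape sketched here is inferred from the new `(_id, (disk, _))` iteration pattern and the added `Pool`/`BTreeMap` imports, so treat it as an assumption). A minimal sketch of the new call shape, with stand-in types in place of the real sled-storage ones:

```rust
use std::collections::BTreeMap;
use std::sync::Arc;

// Stand-ins for the real DiskIdentity, Disk, and Pool types from sled-storage.
type DiskIdentity = String;
#[derive(Debug)]
struct Disk {
    synthetic: bool,
}
struct Pool;

// The new signature takes a shared snapshot of the disk map instead of a
// `MutexGuard`, so the caller no longer holds a lock across the whole update.
fn update_dumpdev_setup(disks: &Arc<BTreeMap<DiskIdentity, (Disk, Pool)>>) {
    for (_id, (disk, _pool)) in disks.iter() {
        if disk.synthetic {
            // Dump devices are only set up on real disks.
            continue;
        }
        println!("would configure dump/core locations on {disk:?}");
    }
}

fn main() {
    let disks = Arc::new(BTreeMap::from([(
        "serial-0".to_string(),
        (Disk { synthetic: false }, Pool),
    )]));
    update_dumpdev_setup(&disks);
}
```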
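`spawn_storage_monitor` follows the same actor-and-handle pattern as the other long-running tasks: construction returns the task plus a cloneable handle wrapping an mpsc sender (`StorageMonitorHandle { tx: handle_tx }` in the patch), and the task loop is driven by `tokio::spawn`. A self-contained sketch of that pattern, with an illustrative request type and channel size rather than the real sled-agent ones:

```rust
use tokio::sync::mpsc;

// Illustrative stand-in for the updates the real monitor receives; the actual
// sled-agent types (storage resource updates, underlay info, etc.) differ.
#[derive(Debug)]
enum MonitorRequest {
    UnderlayAvailable { sled_id: String },
}

// Owns the receiving end and runs as a long-lived tokio task.
struct StorageMonitor {
    rx: mpsc::Receiver<MonitorRequest>,
}

// Cheap-to-clone handle handed to other subsystems.
#[derive(Clone)]
struct StorageMonitorHandle {
    tx: mpsc::Sender<MonitorRequest>,
}

impl StorageMonitor {
    fn new() -> (StorageMonitor, StorageMonitorHandle) {
        let (tx, rx) = mpsc::channel(256);
        (StorageMonitor { rx }, StorageMonitorHandle { tx })
    }

    async fn run(mut self) {
        while let Some(req) = self.rx.recv().await {
            // The real task would diff StorageResources, notify Nexus, and
            // call DumpSetup::update_dumpdev_setup from here.
            println!("handling {req:?}");
        }
    }
}

#[tokio::main]
async fn main() {
    let (monitor, handle) = StorageMonitor::new();
    let task = tokio::spawn(monitor.run());
    let request =
        MonitorRequest::UnderlayAvailable { sled_id: "sled-0".into() };
    handle.tx.send(request).await.unwrap();
    drop(handle); // dropping the last sender ends the run loop
    task.await.unwrap();
}
```

Handing out only the handle keeps the receiver, and therefore all state mutation, confined to the single monitor task.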
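In storage_monitor.rs, dump-device setup is now re-run only when `compute_resource_diffs` reports disk puts or deletes, which is what `NexusUpdates::has_disk_updates` gates on. A toy version of that diff-and-gate logic over two maps, assuming per-disk state can be compared by a generation-like value (the real `NexusUpdates` carries Nexus request types and zpool puts as well):

```rust
use std::collections::BTreeMap;

// Toy stand-in for the NexusUpdates produced by compute_resource_diffs.
struct Updates {
    disk_puts: Vec<String>,
    disk_deletes: Vec<String>,
}

impl Updates {
    fn has_disk_updates(&self) -> bool {
        !self.disk_puts.is_empty() || !self.disk_deletes.is_empty()
    }
}

// Compare old and new disk maps (keyed by identity, valued by a
// generation-like number) and report what changed.
fn compute_disk_diffs(
    old: &BTreeMap<String, u64>,
    new: &BTreeMap<String, u64>,
) -> Updates {
    let disk_puts = new
        .iter()
        .filter(|(id, g)| old.get(*id) != Some(*g))
        .map(|(id, _)| id.clone())
        .collect();
    let disk_deletes =
        old.keys().filter(|id| !new.contains_key(*id)).cloned().collect();
    Updates { disk_puts, disk_deletes }
}

fn main() {
    let old = BTreeMap::from([("disk-a".to_string(), 1)]);
    let new =
        BTreeMap::from([("disk-a".to_string(), 1), ("disk-b".to_string(), 1)]);
    let updates = compute_disk_diffs(&old, &new);
    // Only reconfigure dump devices when the disk set actually changed.
    if updates.has_disk_updates() {
        println!("re-running dump device setup for {:?}", updates.disk_puts);
    }
}
```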
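The illumos zone-bundle tests now tear down by destroying the synthetic zpools in `ResourceWrapper`'s `Drop` impl rather than removing per-test datasets by hand. A generic sketch of that RAII-style cleanup, shelling out where the real code calls `Zpool::destroy`:

```rust
// Illustrative RAII guard mirroring the new ResourceWrapper: zpools created
// for a test are torn down in Drop, so cleanup runs even if the test panics.
struct ZpoolGuard {
    zpool_names: Vec<String>,
}

impl Drop for ZpoolGuard {
    fn drop(&mut self) {
        for name in &self.zpool_names {
            // The real code calls illumos_utils' Zpool::destroy; a plain
            // `pfexec zpool destroy` invocation stands in for it here.
            let _ = std::process::Command::new("/usr/bin/pfexec")
                .arg("zpool")
                .arg("destroy")
                .arg(name)
                .status();
        }
    }
}

fn main() {
    let _guard = ZpoolGuard { zpool_names: vec!["oxp_test".to_string()] };
    // ... test body runs here; the pool is destroyed when `_guard` drops.
}
```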