From f540c086f5cbc1dda34bfa3f85203e0826aa07ca Mon Sep 17 00:00:00 2001
From: "Andrew J. Stone" <andrew@oxidecomputer.com>
Date: Fri, 6 Oct 2023 05:33:07 +0000
Subject: [PATCH] wip

---
 sled-agent/src/bootstrap/pre_server.rs | 79 +++++++++-----------------
 sled-agent/src/bootstrap/server.rs     |  1 -
 sled-agent/src/long_running_tasks.rs   | 30 ++++++++--
 sled-agent/src/services.rs             | 18 +++---
 sled-agent/src/sled_agent.rs           |  2 +-
 sled-agent/src/zone_bundle.rs          | 39 ++++++-------
 sled-storage/src/lib.rs                |  2 +-
 sled-storage/src/manager.rs            |  4 +-
 8 files changed, 83 insertions(+), 92 deletions(-)

diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs
index 0899bdd82f..d7c3e9d5d8 100644
--- a/sled-agent/src/bootstrap/pre_server.rs
+++ b/sled-agent/src/bootstrap/pre_server.rs
@@ -11,9 +11,11 @@
 #![allow(clippy::result_large_err)]
 
 use super::maghemite;
-use super::secret_retriever::LrtqOrHardcodedSecretRetriever;
 use super::server::StartError;
 use crate::config::Config;
+use crate::long_running_tasks::{
+    spawn_all_longrunning_tasks, LongRunningTaskHandles,
+};
 use crate::services::ServiceManager;
 use crate::sled_agent::SledAgent;
 use crate::storage_manager::StorageManager;
@@ -29,8 +31,6 @@ use illumos_utils::zfs;
 use illumos_utils::zfs::Zfs;
 use illumos_utils::zone;
 use illumos_utils::zone::Zones;
-use key_manager::KeyManager;
-use key_manager::StorageKeyRequester;
 use omicron_common::address::Ipv6Subnet;
 use omicron_common::FileKv;
 use sled_hardware::underlay;
@@ -38,6 +38,8 @@ use sled_hardware::DendriteAsic;
 use sled_hardware::HardwareManager;
 use sled_hardware::HardwareUpdate;
 use sled_hardware::SledMode;
+use sled_storage::disk::SyntheticDisk;
+use sled_storage::manager::StorageHandle;
 use slog::Drain;
 use slog::Logger;
 use std::net::IpAddr;
@@ -200,36 +202,24 @@ impl BootstrapAgentStartup {
         // This should be a no-op if already enabled.
         BootstrapNetworking::enable_ipv6_forwarding().await?;
 
-        // Spawn the `KeyManager` which is needed by the the StorageManager to
-        // retrieve encryption keys.
-        let (storage_key_requester, key_manager_handle) =
-            spawn_key_manager_task(&base_log);
-
         let sled_mode = sled_mode_from_config(&config)?;
 
-        // Start monitoring hardware. This is blocking so we use
-        // `spawn_blocking`; similar to above, we move some things in and (on
-        // success) it gives them back.
-        let (base_log, log, hardware_manager) = {
-            tokio::task::spawn_blocking(move || {
-                info!(
-                    log, "Starting hardware monitor";
-                    "sled_mode" => ?sled_mode,
-                );
-                let hardware_manager =
-                    HardwareManager::new(&base_log, sled_mode)
-                        .map_err(StartError::StartHardwareManager)?;
-                Ok::<_, StartError>((base_log, log, hardware_manager))
-            })
-            .await
-            .unwrap()?
-        };
+        // Spawn all important long running tasks that live for the lifetime of
+        // the process and are used by both the bootstrap agent and sled agent
+        let long_running_task_handles = spawn_all_longrunning_tasks(
+            &base_log,
+            sled_mode,
+            startup_networking.global_zone_bootstrap_ip,
+        )
+        .await;
 
-        // Create a `StorageManager` and (possibly) synthetic disks.
-        let storage_manager =
-            StorageManager::new(&base_log, storage_key_requester).await;
-        upsert_synthetic_zpools_if_needed(&log, &storage_manager, &config)
-            .await;
+        // Add some synthetic disks if necessary.
+        upsert_synthetic_zpools_if_needed(
+            &log,
+            &long_running_task_handles.storage_manager,
+            &config,
+        )
+        .await;
 
         let global_zone_bootstrap_ip =
             startup_networking.global_zone_bootstrap_ip;
@@ -242,7 +232,7 @@ impl BootstrapAgentStartup {
             config.skip_timesync,
             config.sidecar_revision.clone(),
             config.switch_zone_maghemite_links.clone(),
-            storage_manager.resources().clone(),
-            storage_manager.zone_bundler().clone(),
+            long_running_task_handles.storage_manager.clone(),
+            long_running_task_handles.zone_bundler.clone(),
         );
 
@@ -357,13 +347,10 @@ fn ensure_zfs_key_directory_exists(log: &Logger) -> Result<(), StartError> {
     // to create and mount encrypted datasets.
     info!(
         log, "Ensuring zfs key directory exists";
-        "path" => sled_hardware::disk::KEYPATH_ROOT,
+        "path" => zfs::KEYPATH_ROOT,
     );
-    std::fs::create_dir_all(sled_hardware::disk::KEYPATH_ROOT).map_err(|err| {
-        StartError::CreateZfsKeyDirectory {
-            dir: sled_hardware::disk::KEYPATH_ROOT,
-            err,
-        }
+    std::fs::create_dir_all(zfs::KEYPATH_ROOT).map_err(|err| {
+        StartError::CreateZfsKeyDirectory { dir: zfs::KEYPATH_ROOT, err }
     })
 }
 
@@ -387,7 +374,7 @@ fn ensure_zfs_ramdisk_dataset() -> Result<(), StartError> {
 
 async fn upsert_synthetic_zpools_if_needed(
     log: &Logger,
-    storage_manager: &StorageManager,
+    storage_manager: &StorageHandle,
     config: &Config,
 ) {
     if let Some(pools) = &config.zpools {
@@ -397,7 +384,8 @@ async fn upsert_synthetic_zpools_if_needed(
                 "Upserting synthetic zpool to Storage Manager: {}",
                 pool.to_string()
             );
-            storage_manager.upsert_synthetic_disk(pool.clone()).await;
+            let disk = SyntheticDisk::new(pool.clone()).into();
+            storage_manager.upsert_disk(disk).await;
         }
     }
 }
@@ -435,19 +423,6 @@ fn sled_mode_from_config(config: &Config) -> Result<SledMode, StartError> {
     Ok(sled_mode)
 }
 
-fn spawn_key_manager_task(
-    log: &Logger,
-) -> (StorageKeyRequester, JoinHandle<()>) {
-    let secret_retriever = LrtqOrHardcodedSecretRetriever::new();
-    let (mut key_manager, storage_key_requester) =
-        KeyManager::new(log, secret_retriever);
-
-    let key_manager_handle =
-        tokio::spawn(async move { key_manager.run().await });
-
-    (storage_key_requester, key_manager_handle)
-}
-
 #[derive(Debug, Clone)]
 pub(crate) struct BootstrapNetworking {
     pub(crate) bootstrap_etherstub: dladm::Etherstub,
diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs
index 0cbbf0678b..638aa51dee 100644
--- a/sled-agent/src/bootstrap/server.rs
+++ b/sled-agent/src/bootstrap/server.rs
@@ -13,7 +13,6 @@ use super::rack_ops::RackInitId;
 use super::views::SledAgentResponse;
 use super::BootstrapError;
 use super::RssAccessError;
-use crate::bootstrap::bootstore::BootstoreHandles;
 use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT;
 use crate::bootstrap::http_entrypoints::api as http_api;
 use crate::bootstrap::http_entrypoints::BootstrapServerContext;
diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs
index 54e8ed7e18..cb82648a8c 100644
--- a/sled-agent/src/long_running_tasks.rs
+++ b/sled-agent/src/long_running_tasks.rs
@@ -16,8 +16,10 @@ use crate::bootstrap::bootstore::{
     new_bootstore_config, poll_ddmd_for_bootstore_peer_update,
 };
 use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever;
+use crate::zone_bundle::{CleanupContext, ZoneBundler};
 use bootstore::schemes::v0 as bootstore;
 use key_manager::{KeyManager, StorageKeyRequester};
 use sled_hardware::{HardwareManager, SledMode};
 use sled_storage::manager::{StorageHandle, StorageManager};
 use slog::{info, Logger};
@@ -35,24 +37,29 @@ pub struct LongRunningTaskHandles {
     /// A mechanism for talking to the [`StorageManager`] which is responsible
     /// for establishing zpools on disks and managing their datasets.
-    pub storage_handle: StorageHandle,
+    pub storage_manager: StorageHandle,
 
     /// A mechanism for interacting with the hardware device tree
     pub hardware_manager: HardwareManager,
 
     // A handle for interacting with the bootstore
     pub bootstore: bootstore::NodeHandle,
+
+    // A reference to the object used to manage zone bundles
+    pub zone_bundler: ZoneBundler,
 }
 
 /// Spawn all long running tasks
-pub async fn spawn_all(
+pub async fn spawn_all_longrunning_tasks(
     log: &Logger,
     sled_mode: SledMode,
     global_zone_bootstrap_ip: Ipv6Addr,
 ) -> LongRunningTaskHandles {
     let storage_key_requester = spawn_key_manager(log);
     let mut storage_handle =
         spawn_storage_manager(log, storage_key_requester.clone());
+
+    // TODO: Does this need to run inside tokio::task::spawn_blocking?
     let hardware_manager = spawn_hardware_manager(log, sled_mode);
 
     // Wait for the boot disk so that we can work with any ledgers,
@@ -67,9 +74,11 @@
     )
     .await;
 
+    let zone_bundler = spawn_zone_bundler_tasks(log, &mut storage_handle);
+
     LongRunningTaskHandles {
         storage_key_requester,
-        storage_handle,
+        storage_manager: storage_handle,
         hardware_manager,
         bootstore,
+        zone_bundler,
     }
@@ -140,3 +149,16 @@ async fn spawn_bootstore_tasks(
 
     node_handle
 }
+
+// `ZoneBundler::new` spawns a periodic cleanup task that runs indefinitely.
+fn spawn_zone_bundler_tasks(
+    log: &Logger,
+    storage_handle: &mut StorageHandle,
+) -> ZoneBundler {
+    let log = log.new(slog::o!("component" => "ZoneBundler"));
+    ZoneBundler::new(log, storage_handle.clone(), CleanupContext::default())
+}
diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs
index 96cdf8222b..3fcbf717fa 100644
--- a/sled-agent/src/services.rs
+++ b/sled-agent/src/services.rs
@@ -5,7 +5,7 @@
 //! Sled-local service management.
 //!
 //! For controlling zone-based storage services, refer to
-//! [crate::storage_manager::StorageManager].
+//! [sled_storage::manager::StorageManager].
 //!
 //! For controlling virtual machine instances, refer to
 //! [crate::instance_manager::InstanceManager].
@@ -38,7 +38,6 @@ use crate::params::{
 use crate::profile::*;
 use crate::smf_helper::Service;
 use crate::smf_helper::SmfHelper;
-use crate::storage_manager::StorageResources;
 use crate::zone_bundle::BundleError;
 use crate::zone_bundle::ZoneBundler;
 use anyhow::anyhow;
@@ -88,12 +87,14 @@ use omicron_common::nexus_config::{
 use once_cell::sync::OnceCell;
 use rand::prelude::SliceRandom;
 use rand::SeedableRng;
-use sled_hardware::disk::ZONE_DATASET;
 use sled_hardware::is_gimlet;
 use sled_hardware::underlay;
 use sled_hardware::underlay::BOOTSTRAP_PREFIX;
 use sled_hardware::Baseboard;
 use sled_hardware::SledMode;
+use sled_storage::dataset::{CONFIG_DATASET, ZONE_DATASET};
+use sled_storage::manager::StorageHandle;
+use sled_storage::resources::StorageResources;
 use slog::Logger;
 use std::collections::HashSet;
 use std::collections::{BTreeMap, HashMap};
@@ -370,7 +371,7 @@ pub struct ServiceManagerInner {
     advertised_prefixes: Mutex<HashSet<Ipv6Subnet<SLED_PREFIX>>>,
     sled_info: OnceCell<SledAgentInfo>,
     switch_zone_bootstrap_address: Ipv6Addr,
-    storage: StorageResources,
+    storage: StorageHandle,
     zone_bundler: ZoneBundler,
     ledger_directory_override: OnceCell<Utf8PathBuf>,
     image_directory_override: OnceCell<Utf8PathBuf>,
@@ -415,7 +416,7 @@ impl ServiceManager {
         skip_timesync: Option<bool>,
         sidecar_revision: SidecarRevision,
         switch_zone_maghemite_links: Vec<PhysicalLink>,
-        storage: StorageResources,
+        storage: StorageHandle,
         zone_bundler: ZoneBundler,
     ) -> Self {
         let log = log.new(o!("component" => "ServiceManager"));
@@ -470,13 +471,12 @@ impl ServiceManager {
     }
 
     async fn all_service_ledgers(&self) -> Vec<Utf8PathBuf> {
+        let resources = self.inner.storage.get_latest_resources().await;
         if let Some(dir) = self.inner.ledger_directory_override.get() {
             return vec![dir.join(SERVICES_LEDGER_FILENAME)];
         }
-        self.inner
-            .storage
-            .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET)
-            .await
+        resources
+            .all_m2_mountpoints(CONFIG_DATASET)
             .into_iter()
             .map(|p| p.join(SERVICES_LEDGER_FILENAME))
             .collect()
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 7e62f6a8a7..dc130524f6 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -90,7 +90,7 @@ pub enum Error {
     Instance(#[from] crate::instance_manager::Error),
 
     #[error("Error managing storage: {0}")]
-    Storage(#[from] crate::storage_manager::Error),
+    Storage(#[from] sled_storage::error::Error),
 
     #[error("Error updating: {0}")]
     Download(#[from] crate::updates::Error),
diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs
index 2eeb8ebe7d..ea7481bd6d 100644
--- a/sled-agent/src/zone_bundle.rs
+++ b/sled-agent/src/zone_bundle.rs
@@ -6,7 +6,6 @@
 
 //! Tools for collecting and inspecting service bundles for zones.
 
-use crate::storage_manager::StorageResources;
 use anyhow::anyhow;
 use anyhow::Context;
 use camino::FromPathBufError;
@@ -22,6 +21,8 @@ use illumos_utils::zone::AdmError;
 use schemars::JsonSchema;
 use serde::Deserialize;
 use serde::Serialize;
+use sled_storage::dataset::U2_DEBUG_DATASET;
+use sled_storage::manager::StorageHandle;
 use slog::Logger;
 use std::cmp::Ord;
 use std::cmp::Ordering;
@@ -148,20 +149,12 @@ pub struct ZoneBundler {
     inner: Arc<Mutex<Inner>>,
     // Channel for notifying the cleanup task that it should reevaluate.
     notify_cleanup: Arc<Notify>,
-    // Tokio task handle running the period cleanup operation.
-    cleanup_task: Arc<tokio::task::JoinHandle<()>>,
-}
-
-impl Drop for ZoneBundler {
-    fn drop(&mut self) {
-        self.cleanup_task.abort();
-    }
 }
 
 // State shared between tasks, e.g., used when creating a bundle in different
 // tasks or between a creation and cleanup.
 struct Inner {
-    resources: StorageResources,
+    storage_handle: StorageHandle,
     cleanup_context: CleanupContext,
     last_cleanup_at: Instant,
 }
@@ -189,7 +182,8 @@ impl Inner {
     // that can exist but do not, i.e., those whose parent datasets already
     // exist; and returns those.
     async fn bundle_directories(&self) -> Vec<Utf8PathBuf> {
-        let expected = self.resources.all_zone_bundle_directories().await;
+        let resources = self.storage_handle.get_latest_resources().await;
+        let expected = resources.all_zone_bundle_directories();
         let mut out = Vec::with_capacity(expected.len());
         for each in expected.into_iter() {
             if tokio::fs::create_dir_all(&each).await.is_ok() {
@@ -249,26 +243,28 @@ impl ZoneBundler {
     /// Create a new zone bundler.
     ///
     /// This creates an object that manages zone bundles on the system. It can
-    /// be used to create bundles from running zones, and runs a period task to
-    /// clean them up to free up space.
+    /// be used to create bundles from running zones, and runs a periodic task
+    /// to clean them up to free up space.
     pub fn new(
         log: Logger,
-        resources: StorageResources,
+        storage_handle: StorageHandle,
         cleanup_context: CleanupContext,
     ) -> Self {
         let notify_cleanup = Arc::new(Notify::new());
         let inner = Arc::new(Mutex::new(Inner {
-            resources,
+            storage_handle,
            cleanup_context,
             last_cleanup_at: Instant::now(),
         }));
         let cleanup_log = log.new(slog::o!("component" => "auto-cleanup-task"));
         let notify_clone = notify_cleanup.clone();
         let inner_clone = inner.clone();
-        let cleanup_task = Arc::new(tokio::task::spawn(
-            Self::periodic_cleanup(cleanup_log, inner_clone, notify_clone),
+        tokio::task::spawn(Self::periodic_cleanup(
+            cleanup_log,
+            inner_clone,
+            notify_clone,
         ));
-        Self { log, inner, notify_cleanup, cleanup_task }
+        Self { log, inner, notify_cleanup }
     }
 
     /// Trigger an immediate cleanup of low-priority zone bundles.
@@ -353,10 +349,9 @@ impl ZoneBundler {
     ) -> Result<ZoneBundleMetadata, BundleError> {
         let inner = self.inner.lock().await;
         let storage_dirs = inner.bundle_directories().await;
-        let extra_log_dirs = inner
-            .resources
-            .all_u2_mountpoints(sled_hardware::disk::U2_DEBUG_DATASET)
-            .await
+        let resources = inner.storage_handle.get_latest_resources().await;
+        let extra_log_dirs = resources
+            .all_u2_mountpoints(U2_DEBUG_DATASET)
             .into_iter()
             .map(|p| p.join(zone.name()))
             .collect();
diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs
index 20f6442b9a..0c1b383d7f 100644
--- a/sled-storage/src/lib.rs
+++ b/sled-storage/src/lib.rs
@@ -9,7 +9,7 @@
 //! `illumos-utils` crate to actually perform ZFS related OS calls.
 
 pub mod dataset;
-pub(crate) mod disk;
+pub mod disk;
 pub(crate) mod dump_setup;
 pub mod error;
 pub(crate) mod keyfile;
diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs
index 7bcb8df192..7e2050084b 100644
--- a/sled-storage/src/manager.rs
+++ b/sled-storage/src/manager.rs
@@ -133,14 +133,14 @@ impl StorageHandle {
 
     /// Retrieve the latest value of `StorageResources` from the
     /// `StorageManager` task.
-    pub async fn get_latest_resources(&mut self) -> StorageResources {
+    pub async fn get_latest_resources(&self) -> StorageResources {
         let (tx, rx) = oneshot::channel();
         self.tx.send(StorageRequest::GetLatestResources(tx)).await.unwrap();
         rx.await.unwrap()
     }
 
     /// Return internal data useful for debugging and testing
-    pub async fn get_manager_state(&mut self) -> StorageManagerData {
+    pub async fn get_manager_state(&self) -> StorageManagerData {
         let (tx, rx) = oneshot::channel();
         self.tx.send(StorageRequest::GetManagerState(tx)).await.unwrap();
         rx.await.unwrap()
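
The pattern this patch converges on: long-running tasks are spawned once, and every consumer keeps a cheap `StorageHandle` clone, asking the `StorageManager` task for a point-in-time `StorageResources` snapshot instead of holding `StorageResources` directly. A minimal sketch of a consumer in that style follows; it uses only calls that appear in the diff above (`get_latest_resources`, `all_m2_mountpoints`, `CONFIG_DATASET`), while the `ledger_paths` function and its `filename` parameter are hypothetical, modeled on the `all_service_ledgers` change in services.rs:

    // Hypothetical consumer of the new sled-storage API, mirroring the
    // all_service_ledgers() change above; ledger_paths() itself is
    // illustrative and not part of this patch.
    use camino::Utf8PathBuf;
    use sled_storage::dataset::CONFIG_DATASET;
    use sled_storage::manager::StorageHandle;

    async fn ledger_paths(
        storage: &StorageHandle,
        filename: &str,
    ) -> Vec<Utf8PathBuf> {
        // Ask the long-running StorageManager task for its current view of
        // the disks; the returned StorageResources is an owned snapshot.
        let resources = storage.get_latest_resources().await;
        // Derive a path on every M.2 config dataset from that snapshot.
        resources
            .all_m2_mountpoints(CONFIG_DATASET)
            .into_iter()
            .map(|p| p.join(filename))
            .collect()
    }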