From 1cf9437f05a49d8801ab85c3f33b9bc5b1864e52 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Sat, 23 Nov 2024 13:16:15 -0800 Subject: [PATCH] Revert "[sled-agent] PUT zones doesn't create datasets (#7006)" (#7157) This reverts commit 81c808c31bfda62bb371a12a049bfb846f88624e. We cannot yet rely on `omicron-datasets.json` existing, because old systems need new blueprints to receive that configuration file. Unfortunately, systems exist (e.g. dogfood) which have not had blueprints created that contain necessary datasets - as a result, the original PR would prevent cold boot from functioning properly, as the lack of `omicron-datasets.json` would prevent zones from being initialized. By reverting this PR, zones should be able to launch during cold boot **without** an `omicron-datasets.json` file once more. --- common/src/ledger.rs | 1 - nexus-sled-agent-shared/src/inventory.rs | 18 +----- sled-agent/src/services.rs | 70 ------------------------ sled-agent/src/sled_agent.rs | 19 +++++++ sled-storage/src/manager.rs | 25 +++------ 5 files changed, 29 insertions(+), 104 deletions(-) diff --git a/common/src/ledger.rs b/common/src/ledger.rs index 289bf72081..a52c2441ca 100644 --- a/common/src/ledger.rs +++ b/common/src/ledger.rs @@ -130,7 +130,6 @@ impl Ledger { warn!(self.log, "Failed to write ledger"; "path" => ?path, "err" => ?e); failed_paths.push((path.to_path_buf(), e)); } else { - info!(self.log, "Successfully wrote to ledger"; "path" => ?path); one_successful_write = true; } } diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index da6edfda1f..ebc683a264 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -11,7 +11,7 @@ use omicron_common::{ external::{ByteCount, Generation}, internal::shared::{NetworkInterface, SourceNatConfig}, }, - disk::{DatasetKind, DatasetName, DiskVariant}, + disk::DiskVariant, zpool_name::ZpoolName, }; use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid}; @@ -168,22 +168,6 @@ impl OmicronZoneConfig { pub fn underlay_ip(&self) -> Ipv6Addr { self.zone_type.underlay_ip() } - - /// If this zone has a transient filesystem, return the dataset's name. - /// Otherwise, return `None`. - pub fn transient_zone_dataset_name(&self) -> Option { - self.filesystem_pool.as_ref().map(|pool| { - DatasetName::new( - pool.clone(), - DatasetKind::TransientZone { - name: illumos_utils::zone::zone_name( - self.zone_type.kind().zone_prefix(), - Some(self.id), - ), - }, - ) - }) - } } /// Describes a persistent ZFS dataset associated with an Omicron zone diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index f0a0da4675..e73773bcc8 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -104,7 +104,6 @@ use sled_storage::dataset::{CONFIG_DATASET, INSTALL_DATASET, ZONE_DATASET}; use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; -use std::collections::BTreeSet; use std::collections::HashSet; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::str::FromStr; @@ -276,12 +275,6 @@ pub enum Error { #[error("Error querying simnet devices")] Simnet(#[from] GetSimnetError), - #[error("Failed to access storage")] - Storage(#[from] sled_storage::error::Error), - - #[error("Missing datasets: {datasets:?}")] - MissingDatasets { datasets: BTreeSet }, - #[error( "Requested generation ({requested}) is older than current ({current})" )] @@ -3478,54 +3471,6 @@ impl ServiceManager { Ok(()) } - // Compares the incoming set of zones with the datasets this sled is - // configured to use. - // - // If the requested zones contain any datasets not configured on this sled, - // an error is returned. - // - // In this check the configured datasets are those intended to exist as - // written in a ledger. The datasets themselves may not actually physically - // exist yet, or they may have failed. - async fn check_requested_zone_datasets_configured( - &self, - request: &OmicronZonesConfig, - ) -> Result<(), Error> { - // Before we provision these zones, inspect the request, and - // check all the datasets we're trying to make. - let mut requested_datasets = BTreeSet::new(); - for zone in &request.zones { - if let Some(dataset) = zone.dataset_name() { - requested_datasets.insert(dataset); - } - if let Some(dataset) = zone.transient_zone_dataset_name() { - requested_datasets.insert(dataset); - } - } - - // These datasets are configured to be part of the control plane. - let datasets_config = self.inner.storage.datasets_config_list().await?; - let existing_datasets: BTreeSet<_> = datasets_config - .datasets - .into_values() - .map(|config| config.name) - .collect(); - - // Check that the request is no larger than the configured set. - // - // (It's fine to have "existing_datasets" not contained in the - // request set). - let missing_datasets: BTreeSet<_> = requested_datasets - .difference(&existing_datasets) - .cloned() - .collect(); - if !missing_datasets.is_empty() { - warn!(self.inner.log, "Missing datasets: {missing_datasets:?}"); - return Err(Error::MissingDatasets { datasets: missing_datasets }); - } - Ok(()) - } - // Ensures that only the following Omicron zones are running. // // This method strives to be idempotent. @@ -3549,8 +3494,6 @@ impl ServiceManager { new_request: OmicronZonesConfig, fake_install_dir: Option<&String>, ) -> Result<(), Error> { - self.check_requested_zone_datasets_configured(&new_request).await?; - // Do some data-normalization to ensure we can compare the "requested // set" vs the "existing set" as HashSets. let old_zone_configs: HashSet = existing_zones @@ -4752,7 +4695,6 @@ mod illumos_tests { zone::MockZones, }; - use omicron_common::disk::DatasetsConfig; use omicron_uuid_kinds::OmicronZoneUuid; use sled_storage::manager_test_harness::StorageManagerTestHarness; use std::os::unix::process::ExitStatusExt; @@ -5098,18 +5040,6 @@ mod illumos_tests { .await .expect("Failed to ensure disks"); assert!(!result.has_error(), "{:?}", result); - let result = harness - .handle() - .datasets_ensure(DatasetsConfig { - generation: Generation::new(), - datasets: BTreeMap::new(), - }) - .await - .unwrap(); - assert!( - !result.has_error(), - "Failed to ensure empty dataset ledger: {result:?}" - ); harness } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 80c0cdda21..f1b2f910e7 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -15,6 +15,7 @@ use crate::metrics::MetricsManager; use crate::nexus::{ NexusClient, NexusNotifierHandle, NexusNotifierInput, NexusNotifierTask, }; +use crate::params::OmicronZoneTypeExt; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; use crate::storage_monitor::StorageMonitorHandle; @@ -942,6 +943,24 @@ impl SledAgent { &self, requested_zones: OmicronZonesConfig, ) -> Result<(), Error> { + // TODO(https://github.com/oxidecomputer/omicron/issues/6043): + // - If these are the set of filesystems, we should also consider + // removing the ones which are not listed here. + // - It's probably worth sending a bulk request to the storage system, + // rather than requesting individual datasets. + for zone in &requested_zones.zones { + let Some(dataset_name) = zone.dataset_name() else { + continue; + }; + + // First, ensure the dataset exists + let dataset_id = zone.id.into_untyped_uuid(); + self.inner + .storage + .upsert_filesystem(dataset_id, dataset_name) + .await?; + } + self.inner .services .ensure_all_omicron_zones_persistent(requested_zones, None) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 62f5834209..a760285d3f 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -856,7 +856,6 @@ impl StorageManager { let ledger_paths = self.all_omicron_dataset_ledgers().await; let maybe_ledger = Ledger::::new(&log, ledger_paths.clone()).await; - let had_old_ledger = maybe_ledger.is_some(); let mut ledger = match maybe_ledger { Some(ledger) => { @@ -915,14 +914,12 @@ impl StorageManager { let result = self.datasets_ensure_internal(&log, &config).await; let ledger_data = ledger.data_mut(); - - // Commit the ledger to storage if either: - // - No ledger exists in storage, or - // - The ledger has changed - if !had_old_ledger || *ledger_data != config { - *ledger_data = config; - ledger.commit().await?; + if *ledger_data == config { + return Ok(result); } + *ledger_data = config; + ledger.commit().await?; + Ok(result) } @@ -1145,7 +1142,6 @@ impl StorageManager { ledger_paths.clone(), ) .await; - let had_old_ledger = maybe_ledger.is_some(); let mut ledger = match maybe_ledger { Some(ledger) => { @@ -1185,14 +1181,11 @@ impl StorageManager { self.omicron_physical_disks_ensure_internal(&log, &config).await?; let ledger_data = ledger.data_mut(); - - // Commit the ledger to storage if either: - // - No ledger exists in storage, or - // - The ledger has changed - if !had_old_ledger || *ledger_data != config { - *ledger_data = config; - ledger.commit().await?; + if *ledger_data == config { + return Ok(result); } + *ledger_data = config; + ledger.commit().await?; Ok(result) }