From f71e2f3a2409e378aa01586c94bc4e5ee420a65b Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Fri, 8 Mar 2024 14:43:15 -0500
Subject: [PATCH 01/11] RSS: build a blueprint describing the rack plan

---
 Cargo.lock                                    |   1 +
 common/src/api/external/mod.rs                |  30 +++-
 .../planning/src/blueprint_builder.rs         |   4 +-
 nexus/reconfigurator/planning/src/planner.rs  |   2 +-
 sled-agent/Cargo.toml                         |   1 +
 sled-agent/src/params.rs                      |   8 +-
 sled-agent/src/rack_setup/service.rs          | 156 +++++++++++++-----
 sled-agent/src/services_migration.rs          |   7 +-
 8 files changed, 154 insertions(+), 55 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5900cf3dec..dea83d1d03 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5406,6 +5406,7 @@ dependencies = [
  "mg-admin-client",
  "nexus-client",
  "nexus-config",
+ "nexus-types",
  "omicron-common",
  "omicron-test-utils",
  "omicron-workspace-hack",
diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs
index 58bc82c825..1a355f4929 100644
--- a/common/src/api/external/mod.rs
+++ b/common/src/api/external/mod.rs
@@ -650,16 +650,28 @@ impl From<Generation> for i64 {
 pub struct Generation(u64);
 
 impl Generation {
-    pub fn new() -> Generation {
+    pub const fn new() -> Generation {
         Generation(1)
     }
 
-    pub fn next(&self) -> Generation {
+    pub const fn from_u32(value: u32) -> Generation {
+        // `as` is a little distasteful because it allows lossy conversion, but
+        // (a) we know converting `u32` to `u64` will always succeed
+        // losslessly, and (b) it allows us to make this function `const`,
+        // unlike if we were to use `u64::from(value)`.
+        Generation(value as u64)
+    }
+
+    pub const fn next(&self) -> Generation {
         // It should technically be an operational error if this wraps or even
         // exceeds the value allowed by an i64. But it seems unlikely enough to
         // happen in practice that we can probably feel safe with this.
         let next_gen = self.0 + 1;
-        assert!(next_gen <= u64::try_from(i64::MAX).unwrap());
+        // `as` is a little distasteful because it allows lossy conversion, but
+        // (a) we know converting `i64::MAX` to `u64` will always succeed
+        // losslessly, and (b) it allows us to make this function `const`,
+        // unlike if we were to use `u64::try_from(i64::MAX).unwrap()`.
+        assert!(next_gen <= i64::MAX as u64);
         Generation(next_gen)
     }
 }
@@ -697,11 +709,21 @@ impl TryFrom<i64> for Generation {
     fn try_from(value: i64) -> Result<Self, Self::Error> {
         Ok(Generation(
             u64::try_from(value)
-                .map_err(|_| anyhow!("generation number too large"))?,
+                .map_err(|_| anyhow!("negative generation number"))?,
         ))
     }
 }
 
+impl TryFrom<u64> for Generation {
+    type Error = anyhow::Error;
+
+    fn try_from(value: u64) -> Result<Self, Self::Error> {
+        i64::try_from(value)
+            .map_err(|_| anyhow!("generation number too large"))?;
+        Ok(Generation(value))
+    }
+}
+
 /// An RFC-1035-compliant hostname.
 #[derive(
     Clone, Debug, Deserialize, Display, Eq, PartialEq, SerializeDisplay,
diff --git a/nexus/reconfigurator/planning/src/blueprint_builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder.rs
index 26d0d1e23f..bf8a7fc362 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder.rs
@@ -687,8 +687,8 @@ impl<'a> BlueprintZones<'a> {
         } else {
             // The first generation is reserved to mean the one
             // containing no zones. See
-            // OMICRON_ZONES_CONFIG_INITIAL_GENERATION. So we start
-            // with the next one.
+            // OmicronZonesConfig::INITIAL_GENERATION. So we start with the
+            // next one.
            OmicronZonesConfig {
                generation: Generation::new().next(),
                zones: vec![],
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs
index bae3dfd68a..074f088588 100644
--- a/nexus/reconfigurator/planning/src/planner.rs
+++ b/nexus/reconfigurator/planning/src/planner.rs
@@ -403,7 +403,7 @@ mod test {
         let (sled_id, sled_zones) = sleds[0];
         // We have defined elsewhere that the first generation contains no
         // zones. So the first one with zones must be newer. See
-        // OMICRON_ZONES_CONFIG_INITIAL_GENERATION.
+        // OmicronZonesConfig::INITIAL_GENERATION.
         assert!(sled_zones.generation > Generation::new());
         assert_eq!(sled_id, new_sled_id);
         assert_eq!(sled_zones.zones.len(), 1);
diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml
index 7f46a277ce..c6321040f4 100644
--- a/sled-agent/Cargo.toml
+++ b/sled-agent/Cargo.toml
@@ -46,6 +46,7 @@ macaddr.workspace = true
 mg-admin-client.workspace = true
 nexus-client.workspace = true
 nexus-config.workspace = true
+nexus-types.workspace = true
 omicron-common.workspace = true
 once_cell.workspace = true
 oximeter.workspace = true
diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs
index f30c910efc..5d5b71d3f5 100644
--- a/sled-agent/src/params.rs
+++ b/sled-agent/src/params.rs
@@ -288,9 +288,6 @@ impl std::fmt::Display for ZoneType {
     }
 }
 
-/// Generation 1 of `OmicronZonesConfig` is always the set of no zones.
-pub const OMICRON_ZONES_CONFIG_INITIAL_GENERATION: u32 = 1;
-
 /// Describes the set of Omicron-managed zones running on a sled
 #[derive(
     Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash,
@@ -310,6 +307,11 @@ pub struct OmicronZonesConfig {
     pub zones: Vec<OmicronZoneConfig>,
 }
 
+impl OmicronZonesConfig {
+    /// Generation 1 of `OmicronZonesConfig` is always the set of no zones.
+    pub const INITIAL_GENERATION: Generation = Generation::from_u32(1);
+}
+
 impl From<OmicronZonesConfig> for sled_agent_client::types::OmicronZonesConfig {
     fn from(local: OmicronZonesConfig) -> Self {
         Self {
diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs
index 2788e189cc..7be85b5447 100644
--- a/sled-agent/src/rack_setup/service.rs
+++ b/sled-agent/src/rack_setup/service.rs
@@ -74,24 +74,24 @@ use crate::bootstrap::params::BootstrapAddressDiscovery;
 use crate::bootstrap::params::StartSledAgentRequest;
 use crate::bootstrap::rss_handle::BootstrapAgentHandle;
 use crate::nexus::{d2n_params, ConvertInto};
-use crate::params::{
-    OmicronZoneType, OmicronZonesConfig, TimeSync,
-    OMICRON_ZONES_CONFIG_INITIAL_GENERATION,
-};
+use crate::params::{OmicronZoneType, OmicronZonesConfig, TimeSync};
 use crate::rack_setup::plan::service::{
     Plan as ServicePlan, PlanError as ServicePlanError,
 };
 use crate::rack_setup::plan::sled::{
     Plan as SledPlan, PlanError as SledPlanError,
 };
+use anyhow::{bail, Context};
 use bootstore::schemes::v0 as bootstore;
 use camino::Utf8PathBuf;
+use chrono::Utc;
 use ddm_admin_client::{Client as DdmAdminClient, DdmError};
 use internal_dns::resolver::{DnsError, Resolver as DnsResolver};
 use internal_dns::ServiceName;
 use nexus_client::{
     types as NexusTypes, Client as NexusClient, Error as NexusError,
 };
+use nexus_types::deployment::Blueprint;
 use omicron_common::address::get_sled_address;
 use omicron_common::api::external::Generation;
 use omicron_common::api::internal::shared::ExternalPortDiscovery;
@@ -107,11 +108,12 @@ use sled_hardware::underlay::BootstrapInterface;
 use sled_storage::dataset::CONFIG_DATASET;
 use sled_storage::manager::StorageHandle;
 use slog::Logger;
-use std::collections::BTreeSet;
+use std::collections::{BTreeMap, BTreeSet};
 use std::collections::{HashMap, HashSet};
 use std::iter;
 use std::net::{Ipv6Addr, SocketAddrV6};
 use thiserror::Error;
+use uuid::Uuid;
 
 /// Describes errors which may occur while operating the setup service.
 #[derive(Error, Debug)]
@@ -168,6 +169,9 @@ pub enum SetupServiceError {
     #[error("Bootstore error: {0}")]
     Bootstore(#[from] bootstore::NodeRequestError),
 
+    #[error("Failed to convert setup plan to blueprint: {0:#}")]
+    ConvertPlanToBlueprint(anyhow::Error),
+
     // We used transparent, because `EarlyNetworkSetupError` contains a subset
     // of error variants already in this type
     #[error(transparent)]
@@ -548,6 +552,12 @@
     ) -> Result<(), SetupServiceError> {
         info!(self.log, "Handing off control to Nexus");
 
+        // Build a Blueprint describing our service plan. This should never
+        // fail, unless we've set up an invalid plan.
+        let _blueprint =
+            build_initial_blueprint_from_plan(sled_plan, service_plan)
+                .map_err(SetupServiceError::ConvertPlanToBlueprint)?;
+
         info!(self.log, "Nexus address: {}", nexus_address.to_string());
 
         let nexus_client = NexusClient::new(
@@ -970,44 +980,14 @@
             .await?
         };
 
-        // The service plan describes all the zones that we will eventually
-        // deploy on each sled. But we cannot currently just deploy them all
-        // concurrently. We'll do it in a few stages, each corresponding to a
-        // version of each sled's configuration.
-        //
-        // - version 1: no services running
-        //              (We don't have to do anything for this. But we do
-        //              reserve this version number for "no services running" so
-        //              that sled agents can begin with an initial, valid
-        //              OmicronZonesConfig before they've got anything running.)
- // - version 2: internal DNS only - // - version 3: internal DNS + NTP servers - // - version 4: internal DNS + NTP servers + CockroachDB - // - version 5: everything - // - // At each stage, we're specifying a complete configuration of what - // should be running on the sled -- including this version number. - // And Sled Agents will reject requests for versions older than the - // one they're currently running. Thus, the version number is a piece - // of global, distributed state. - // - // For now, we hardcode the requests we make to use specific version - // numbers. - let version1_nothing = - Generation::from(OMICRON_ZONES_CONFIG_INITIAL_GENERATION); - let version2_dns_only = version1_nothing.next(); - let version3_dns_and_ntp = version2_dns_only.next(); - let version4_cockroachdb = version3_dns_and_ntp.next(); - let version5_everything = version4_cockroachdb.next(); - // Set up internal DNS services first and write the initial // DNS configuration to the internal DNS servers. let v1generator = OmicronZonesConfigGenerator::initial_version( &service_plan, - version1_nothing, + DeployStepVersion::V1_NOTHING, ); let v2generator = v1generator.new_version_with( - version2_dns_only, + DeployStepVersion::V2_DNS_ONLY, &|zone_type: &OmicronZoneType| { matches!(zone_type, OmicronZoneType::InternalDns { .. }) }, @@ -1022,7 +1002,7 @@ impl ServiceInner { // Next start up the NTP services. let v3generator = v2generator.new_version_with( - version3_dns_and_ntp, + DeployStepVersion::V3_DNS_AND_NTP, &|zone_type: &OmicronZoneType| { matches!( zone_type, @@ -1040,7 +1020,7 @@ impl ServiceInner { // Wait until Cockroach has been initialized before running Nexus. let v4generator = v3generator.new_version_with( - version4_cockroachdb, + DeployStepVersion::V4_COCKROACHDB, &|zone_type: &OmicronZoneType| { matches!(zone_type, OmicronZoneType::CockroachDb { .. }) }, @@ -1052,8 +1032,8 @@ impl ServiceInner { self.initialize_cockroach(&service_plan).await?; // Issue the rest of the zone initialization requests. - let v5generator = - v4generator.new_version_with(version5_everything, &|_| true); + let v5generator = v4generator + .new_version_with(DeployStepVersion::V5_EVERYTHING, &|_| true); self.ensure_zone_config_at_least(v5generator.sled_configs()).await?; info!(self.log, "Finished setting up services"); @@ -1084,6 +1064,100 @@ impl ServiceInner { } } +/// The service plan describes all the zones that we will eventually +/// deploy on each sled. But we cannot currently just deploy them all +/// concurrently. We'll do it in a few stages, each corresponding to a +/// version of each sled's configuration. +/// +/// - version 1: no services running +/// (We don't have to do anything for this. But we do +/// reserve this version number for "no services running" so +/// that sled agents can begin with an initial, valid +/// OmicronZonesConfig before they've got anything running.) +/// - version 2: internal DNS only +/// - version 3: internal DNS + NTP servers +/// - version 4: internal DNS + NTP servers + CockroachDB +/// - version 5: everything +/// +/// At each stage, we're specifying a complete configuration of what +/// should be running on the sled -- including this version number. +/// And Sled Agents will reject requests for versions older than the +/// one they're currently running. Thus, the version number is a piece +/// of global, distributed state. +/// +/// For now, we hardcode the requests we make to use specific version +/// numbers. 
+struct DeployStepVersion;
+
+impl DeployStepVersion {
+    const V1_NOTHING: Generation = OmicronZonesConfig::INITIAL_GENERATION;
+    const V2_DNS_ONLY: Generation = Self::V1_NOTHING.next();
+    const V3_DNS_AND_NTP: Generation = Self::V2_DNS_ONLY.next();
+    const V4_COCKROACHDB: Generation = Self::V3_DNS_AND_NTP.next();
+    const V5_EVERYTHING: Generation = Self::V4_COCKROACHDB.next();
+}
+
+fn build_initial_blueprint_from_plan(
+    sled_plan: &SledPlan,
+    service_plan: &ServicePlan,
+) -> anyhow::Result<Blueprint> {
+    let internal_dns_version =
+        Generation::try_from(service_plan.dns_config.generation)
+            .context("invalid internal dns version")?;
+
+    let mut sled_id_by_addr = BTreeMap::new();
+    for sled_request in sled_plan.sleds.values() {
+        let addr = get_sled_address(sled_request.body.subnet);
+        if sled_id_by_addr.insert(addr, sled_request.body.id).is_some() {
+            bail!("duplicate sled address deriving blueprint: {addr}");
+        }
+    }
+
+    let mut omicron_zones = BTreeMap::new();
+    let mut zones_in_service = BTreeSet::new();
+    for (sled_addr, sled_config) in &service_plan.services {
+        let sled_id = *sled_id_by_addr.get(&sled_addr).with_context(|| {
+            format!("could not find sled ID for sled with address {sled_addr}")
+        })?;
+        for zone in &sled_config.zones {
+            zones_in_service.insert(zone.id);
+        }
+        let zones_config = sled_agent_client::types::OmicronZonesConfig::from(
+            OmicronZonesConfig {
+                // This is a bit of a hack. We only construct a blueprint after
+                // completing RSS, so we need to know the final generation value
+                // sent to all sleds. Arguably, we should record this in the
+                // serialized RSS plan; however, we have already deployed
+                // systems that did not. We know that every such system used
+                // `V5_EVERYTHING` as the final generation count, so we can just
+                // use that value here. If we ever change this, in particular in
+                // a way where newly-deployed systems will have a different
+                // value, we will need to revisit storing this in the serialized
+                // RSS plan.
+                generation: DeployStepVersion::V5_EVERYTHING,
+                zones: sled_config.zones.clone(),
+            },
+        );
+        if omicron_zones.insert(sled_id, zones_config).is_some() {
+            bail!(
+                "duplicate sled ID deriving blueprint: \
+                {sled_id} (address={sled_addr})"
+            );
+        }
+    }
+
+    Ok(Blueprint {
+        id: Uuid::new_v4(),
+        omicron_zones,
+        zones_in_service,
+        parent_blueprint_id: None,
+        internal_dns_version,
+        time_created: Utc::now(),
+        creator: "RSS".to_string(),
+        comment: "initial blueprint from rack setup".to_string(),
+    })
+}
+
 /// Facilitates creating a sequence of OmicronZonesConfig objects for each sled
 /// in a service plan to enable phased rollout of services
 ///
diff --git a/sled-agent/src/services_migration.rs b/sled-agent/src/services_migration.rs
index bedd4759c8..511368e2f6 100644
--- a/sled-agent/src/services_migration.rs
+++ b/sled-agent/src/services_migration.rs
@@ -24,8 +24,8 @@
 //! past this change.
 
 use crate::params::{
-    OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, ZoneType,
-    OMICRON_ZONES_CONFIG_INITIAL_GENERATION,
+    OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, OmicronZonesConfig,
+    ZoneType,
 };
 use crate::services::{OmicronZoneConfigLocal, OmicronZonesConfigLocal};
 use anyhow::{anyhow, ensure, Context};
@@ -103,8 +103,7 @@ impl TryFrom<AllZoneRequests> for OmicronZonesConfigLocal {
         // some buffer.)
         //
         // In summary, 2 seems fine.
-        let omicron_generation =
-            Generation::from(OMICRON_ZONES_CONFIG_INITIAL_GENERATION).next();
+        let omicron_generation = OmicronZonesConfig::INITIAL_GENERATION.next();
 
         // The ledger generation doesn't really matter. In case it's useful, we
        // pick the generation from the ledger that we loaded.

From f76f990f31b6c58589b2633badf314c57c437497 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Mon, 11 Mar 2024 11:42:22 -0400
Subject: [PATCH 02/11] pass blueprint during RSS <-> Nexus handoff

---
 nexus/src/lib.rs                       |  3 +
 nexus/test-interface/src/lib.rs        |  2 +
 nexus/test-utils/src/lib.rs            | 16 ++++-
 nexus/types/src/internal_api/params.rs |  3 +
 openapi/nexus-internal.json            |  9 +++
 sled-agent/src/rack_setup/mod.rs       |  2 +
 sled-agent/src/rack_setup/service.rs   | 55 +++++++++------
 sled-agent/src/sim/server.rs           | 92 ++++++++++++++++++------
 8 files changed, 139 insertions(+), 43 deletions(-)

diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs
index c0fba31afb..03470a4f3a 100644
--- a/nexus/src/lib.rs
+++ b/nexus/src/lib.rs
@@ -27,6 +27,7 @@ use dropshot::ConfigDropshot;
 use external_api::http_entrypoints::external_api;
 use internal_api::http_entrypoints::internal_api;
 use nexus_config::NexusConfig;
+use nexus_types::deployment::Blueprint;
 use nexus_types::external_api::views::SledProvisionPolicy;
 use nexus_types::internal_api::params::ServiceKind;
 use omicron_common::address::IpRange;
@@ -232,6 +233,7 @@ impl nexus_test_interface::NexusServer for Server {
     async fn start(
         internal_server: InternalServer,
         config: &NexusConfig,
+        blueprint: Blueprint,
         services: Vec<nexus_types::internal_api::params::ServicePutRequest>,
         datasets: Vec<nexus_types::internal_api::params::DatasetCreateRequest>,
         internal_dns_zone_config: nexus_types::internal_api::params::DnsConfigParams,
@@ -276,6 +278,7 @@ impl nexus_test_interface::NexusServer for Server {
             &opctx,
             config.deployment.rack_id,
             internal_api::params::RackInitializationRequest {
+                blueprint,
                 services,
                 datasets,
                 internal_services_ip_pool_ranges,
diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs
index 10bc9e63f0..d9bb69276a 100644
--- a/nexus/test-interface/src/lib.rs
+++ b/nexus/test-interface/src/lib.rs
@@ -33,6 +33,7 @@
 use async_trait::async_trait;
 use nexus_config::NexusConfig;
+use nexus_types::deployment::Blueprint;
 use slog::Logger;
 use std::net::{SocketAddr, SocketAddrV6};
 use uuid::Uuid;
@@ -50,6 +51,7 @@ pub trait NexusServer: Send + Sync + 'static {
     async fn start(
         internal_server: Self::InternalServer,
         config: &NexusConfig,
+        blueprint: Blueprint,
         services: Vec<nexus_types::internal_api::params::ServicePutRequest>,
         datasets: Vec<nexus_types::internal_api::params::DatasetCreateRequest>,
         internal_dns_config: nexus_types::internal_api::params::DnsConfigParams,
diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs
index 9681d9ff97..a557cef3dc 100644
--- a/nexus/test-utils/src/lib.rs
+++ b/nexus/test-utils/src/lib.rs
@@ -7,6 +7,7 @@
 use anyhow::Context;
 use anyhow::Result;
 use camino::Utf8Path;
+use chrono::Utc;
 use dns_service_client::types::DnsConfigParams;
 use dropshot::test_util::ClientTestContext;
 use dropshot::test_util::LogContext;
@@ -24,6 +25,7 @@ use nexus_config::MgdConfig;
 use nexus_config::NexusConfig;
 use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES;
 use nexus_test_interface::NexusServer;
+use nexus_types::deployment::Blueprint;
 use nexus_types::external_api::params::UserId;
 use nexus_types::internal_api::params::Certificate;
 use nexus_types::internal_api::params::DatasetCreateRequest;
@@ -54,6 +56,8 @@ use oximeter_collector::Oximeter;
 use oximeter_producer::LogConfig;
 use oximeter_producer::Server as ProducerServer;
 use slog::{debug, error, o, Logger};
+use std::collections::BTreeMap;
+use std::collections::BTreeSet;
 use std::collections::HashMap;
 use std::fmt::Debug;
 use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6};
@@ -790,7 +794,17 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
         self.nexus_internal
                .take()
                .expect("Must launch internal nexus first"),
-            &self.config,
+            self.config,
+            Blueprint {
+                id: Uuid::new_v4(),
+                omicron_zones: BTreeMap::new(),
+                zones_in_service: BTreeSet::new(),
+                parent_blueprint_id: None,
+                internal_dns_version: Generation::new(),
+                time_created: Utc::now(),
+                creator: "fixme".to_string(),
+                comment: "fixme".to_string(),
+            },
             self.rack_init_builder.services.clone(),
             // NOTE: We should probably hand off
             // "self.rack_init_builder.datasets" here, but Nexus won't be happy
diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs
index e8c4703008..9f80d313fd 100644
--- a/nexus/types/src/internal_api/params.rs
+++ b/nexus/types/src/internal_api/params.rs
@@ -4,6 +4,7 @@
 
 //! Params define the request bodies of API endpoints for creating or updating resources.
 
+use crate::deployment::Blueprint;
 use crate::external_api::params::PhysicalDiskKind;
 use crate::external_api::params::UserId;
 use crate::external_api::shared::Baseboard;
@@ -248,6 +249,8 @@ impl std::fmt::Debug for Certificate {
 
 #[derive(Debug, Clone, Deserialize, JsonSchema)]
 pub struct RackInitializationRequest {
+    /// Blueprint describing services initialized by RSS.
+    pub blueprint: Blueprint,
     /// Services on the rack which have been created by RSS.
     pub services: Vec<ServicePutRequest>,
     /// Datasets on the rack which have been provisioned by RSS.
diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json
index 9e18c7d6bc..ecc90fa646 100644
--- a/openapi/nexus-internal.json
+++ b/openapi/nexus-internal.json
@@ -5879,6 +5879,14 @@
     "RackInitializationRequest": {
       "type": "object",
       "properties": {
+        "blueprint": {
+          "description": "Blueprint describing services initialized by RSS.",
+          "allOf": [
+            {
+              "$ref": "#/components/schemas/Blueprint"
+            }
+          ]
+        },
         "certs": {
           "description": "x.509 Certificates used to encrypt communication with the external API.",
           "type": "array",
@@ -5945,6 +5953,7 @@
           }
         },
         "required": [
+          "blueprint",
          "certs",
          "datasets",
          "external_dns_zone_name",
diff --git a/sled-agent/src/rack_setup/mod.rs b/sled-agent/src/rack_setup/mod.rs
index cfabca6209..0ad8e0ce71 100644
--- a/sled-agent/src/rack_setup/mod.rs
+++ b/sled-agent/src/rack_setup/mod.rs
@@ -9,3 +9,5 @@ pub mod config;
 mod plan;
 /// The main implementation of the RSS service.
 pub mod service;
+
+pub use plan::service::SledConfig;
diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs
index 7be85b5447..952240bd74 100644
--- a/sled-agent/src/rack_setup/service.rs
+++ b/sled-agent/src/rack_setup/service.rs
@@ -65,6 +65,7 @@
 //! thereafter.
 
 use super::config::SetupServiceConfig as Config;
+use super::plan::service::SledConfig;
 use crate::bootstrap::config::BOOTSTRAP_AGENT_HTTP_PORT;
 use crate::bootstrap::early_networking::{
     EarlyNetworkConfig, EarlyNetworkConfigBody, EarlyNetworkSetup,
@@ -107,7 +108,7 @@ use sled_hardware::underlay::BootstrapInterface;
 use sled_storage::dataset::CONFIG_DATASET;
 use sled_storage::manager::StorageHandle;
 use slog::Logger;
-use std::collections::{BTreeMap, BTreeSet};
+use std::collections::{btree_map, BTreeMap, BTreeSet};
 use std::collections::{HashMap, HashSet};
 use std::iter;
 use std::net::{Ipv6Addr, SocketAddrV6};
@@ -554,7 +555,7 @@ impl ServiceInner {
 
         // Build a Blueprint describing our service plan. This should never
         // fail, unless we've set up an invalid plan.
-        let _blueprint =
+        let blueprint =
             build_initial_blueprint_from_plan(sled_plan, service_plan)
                 .map_err(SetupServiceError::ConvertPlanToBlueprint)?;
 
@@ -665,6 +666,7 @@ impl ServiceInner {
         info!(self.log, "rack_network_config: {:#?}", rack_network_config);
 
         let request = NexusTypes::RackInitializationRequest {
+            blueprint,
             services,
             datasets,
             internal_services_ip_pool_ranges,
@@ -1105,20 +1107,38 @@ fn build_initial_blueprint_from_plan(
         Generation::try_from(service_plan.dns_config.generation)
             .context("invalid internal dns version")?;
 
-    let mut sled_id_by_addr = BTreeMap::new();
+    let mut sled_configs = BTreeMap::new();
     for sled_request in sled_plan.sleds.values() {
-        let addr = get_sled_address(sled_request.body.subnet);
-        if sled_id_by_addr.insert(addr, sled_request.body.id).is_some() {
-            bail!("duplicate sled address deriving blueprint: {addr}");
-        }
+        let sled_addr = get_sled_address(sled_request.body.subnet);
+        let sled_id = sled_request.body.id;
+        let entry = match sled_configs.entry(sled_id) {
+            btree_map::Entry::Vacant(entry) => entry,
+            btree_map::Entry::Occupied(_) => {
+                bail!("duplicate sled address deriving blueprint: {sled_addr}");
+            }
+        };
+        let sled_config =
+            service_plan.services.get(&sled_addr).with_context(|| {
+                format!(
+                    "missing services in plan for sled {sled_id} ({sled_addr})"
+                )
+            })?;
+        entry.insert(sled_config.clone());
     }
 
+    Ok(build_initial_blueprint_from_sled_configs(
+        sled_configs,
+        internal_dns_version,
+    ))
+}
+
+pub(crate) fn build_initial_blueprint_from_sled_configs(
+    sled_configs: BTreeMap<Uuid, SledConfig>,
+    internal_dns_version: Generation,
+) -> Blueprint {
     let mut omicron_zones = BTreeMap::new();
     let mut zones_in_service = BTreeSet::new();
-    for (sled_addr, sled_config) in &service_plan.services {
-        let sled_id = *sled_id_by_addr.get(&sled_addr).with_context(|| {
-            format!("could not find sled ID for sled with address {sled_addr}")
-        })?;
+    for (sled_id, sled_config) in sled_configs {
         for zone in &sled_config.zones {
             zones_in_service.insert(zone.id);
         }
         let zones_config = sled_agent_client::types::OmicronZonesConfig::from(
             OmicronZonesConfig {
                 // This is a bit of a hack. We only construct a blueprint after
                 // completing RSS, so we need to know the final generation value
                 // sent to all sleds. Arguably, we should record this in the
                 // serialized RSS plan; however, we have already deployed
                 // systems that did not. We know that every such system used
                 // `V5_EVERYTHING` as the final generation count, so we can just
                 // use that value here. If we ever change this, in particular in
                 // a way where newly-deployed systems will have a different
                 // value, we will need to revisit storing this in the serialized
                 // RSS plan.
                generation: DeployStepVersion::V5_EVERYTHING,
-                zones: sled_config.zones.clone(),
+                zones: sled_config.zones,
             },
         );
-        if omicron_zones.insert(sled_id, zones_config).is_some() {
-            bail!(
-                "duplicate sled ID deriving blueprint: \
-                {sled_id} (address={sled_addr})"
-            );
-        }
+        omicron_zones.insert(sled_id, zones_config);
     }
 
-    Ok(Blueprint {
+    Blueprint {
         id: Uuid::new_v4(),
         omicron_zones,
         zones_in_service,
@@ -1155,7 +1170,7 @@ fn build_initial_blueprint_from_plan(
         time_created: Utc::now(),
         creator: "RSS".to_string(),
         comment: "initial blueprint from rack setup".to_string(),
-    })
+    }
 }
 
 /// Facilitates creating a sequence of OmicronZonesConfig objects for each sled
diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs
index dd815775ff..b7577e6d02 100644
--- a/sled-agent/src/sim/server.rs
+++ b/sled-agent/src/sim/server.rs
@@ -10,21 +10,30 @@ use super::sled_agent::SledAgent;
 use super::storage::PantryServer;
 use crate::nexus::d2n_params;
 use crate::nexus::NexusClient;
+use crate::params::OmicronZoneConfig;
+use crate::params::OmicronZoneDataset;
+use crate::params::OmicronZoneType;
+use crate::rack_setup::service::build_initial_blueprint_from_sled_configs;
+use crate::rack_setup::SledConfig;
 use anyhow::anyhow;
 use crucible_agent_client::types::State as RegionState;
+use illumos_utils::zpool::ZpoolName;
 use internal_dns::ServiceName;
 use nexus_client::types as NexusTypes;
 use nexus_client::types::{IpRange, Ipv4Range, Ipv6Range};
 use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES;
+use nexus_types::inventory::NetworkInterfaceKind;
 use omicron_common::address::DNS_OPTE_IPV4_SUBNET;
 use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET;
 use omicron_common::api::external::Generation;
 use omicron_common::api::external::MacAddr;
+use omicron_common::api::external::Vni;
 use omicron_common::backoff::{
     retry_notify, retry_policy_internal_service_aggressive, BackoffError,
 };
 use omicron_common::FileKv;
 use slog::{info, Drain, Logger};
+use std::collections::BTreeMap;
 use std::collections::HashMap;
 use std::net::IpAddr;
 use std::net::Ipv4Addr;
@@ -334,6 +343,8 @@ pub async fn run_standalone_server(
     // Initialize the internal DNS entries
     let dns_config = dns_config_builder.build();
     dns.initialize_with_config(&log, &dns_config).await?;
+    let internal_dns_version = Generation::try_from(dns_config.generation)
+        .expect("invalid internal dns version");
 
     // Record the internal DNS server as though RSS had provisioned it so
     // that Nexus knows about it.
@@ -341,37 +352,58 @@ pub async fn run_standalone_server( SocketAddr::V4(_) => panic!("did not expect v4 address"), SocketAddr::V6(a) => a, }; - let mut services = vec![NexusTypes::ServicePutRequest { - address: http_bound.to_string(), - kind: NexusTypes::ServiceKind::InternalDns, - service_id: Uuid::new_v4(), - sled_id: config.id, - zone_id: Some(Uuid::new_v4()), + let mut zones = vec![OmicronZoneConfig { + id: Uuid::new_v4(), + underlay_address: *http_bound.ip(), + zone_type: OmicronZoneType::InternalDns { + dataset: OmicronZoneDataset { + pool_name: ZpoolName::new_external(Uuid::new_v4()), + }, + http_address: http_bound, + dns_address: match dns.dns_server.local_address() { + SocketAddr::V4(_) => panic!("did not expect v4 address"), + SocketAddr::V6(a) => a, + }, + gz_address: Ipv6Addr::LOCALHOST, + gz_address_index: 0, + }, }]; let mut internal_services_ip_pool_ranges = vec![]; let mut macs = MacAddr::iter_system(); if let Some(nexus_external_addr) = rss_args.nexus_external_addr { let ip = nexus_external_addr.ip(); + let id = Uuid::new_v4(); - services.push(NexusTypes::ServicePutRequest { - address: config.nexus_address.to_string(), - kind: NexusTypes::ServiceKind::Nexus { - external_address: ip, - nic: NexusTypes::ServiceNic { + zones.push(OmicronZoneConfig { + id, + underlay_address: match ip { + IpAddr::V4(_) => panic!("did not expect v4 address"), + IpAddr::V6(a) => a, + }, + zone_type: OmicronZoneType::Nexus { + internal_address: match config.nexus_address { + SocketAddr::V4(_) => panic!("did not expect v4 address"), + SocketAddr::V6(a) => a, + }, + external_ip: ip, + nic: nexus_types::inventory::NetworkInterface { id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service { id }, name: "nexus".parse().unwrap(), ip: NEXUS_OPTE_IPV4_SUBNET .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES as u32 + 1) .unwrap() .into(), mac: macs.next().unwrap(), + subnet: (*NEXUS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + primary: true, slot: 0, }, + external_tls: false, + external_dns_servers: vec![], }, - service_id: Uuid::new_v4(), - sled_id: config.id, - zone_id: Some(Uuid::new_v4()), }); internal_services_ip_pool_ranges.push(match ip { @@ -388,24 +420,31 @@ pub async fn run_standalone_server( rss_args.external_dns_internal_addr { let ip = *external_dns_internal_addr.ip(); - services.push(NexusTypes::ServicePutRequest { - address: external_dns_internal_addr.to_string(), - kind: NexusTypes::ServiceKind::ExternalDns { - external_address: ip.into(), - nic: NexusTypes::ServiceNic { + let id = Uuid::new_v4(); + zones.push(OmicronZoneConfig { + id, + underlay_address: ip, + zone_type: OmicronZoneType::ExternalDns { + dataset: OmicronZoneDataset { + pool_name: ZpoolName::new_external(Uuid::new_v4()), + }, + http_address: external_dns_internal_addr, + dns_address: SocketAddr::V6(external_dns_internal_addr), + nic: nexus_types::inventory::NetworkInterface { id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service { id }, name: "external-dns".parse().unwrap(), ip: DNS_OPTE_IPV4_SUBNET .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES as u32 + 1) .unwrap() .into(), mac: macs.next().unwrap(), + subnet: (*DNS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + primary: true, slot: 0, }, }, - service_id: Uuid::new_v4(), - sled_id: config.id, - zone_id: Some(Uuid::new_v4()), }); internal_services_ip_pool_ranges @@ -450,7 +489,16 @@ pub async fn run_standalone_server( None => vec![], }; + let services = + zones.iter().map(|z| z.to_nexus_service_req(config.id)).collect(); + let mut sled_configs = BTreeMap::new(); + 
sled_configs.insert(config.id, SledConfig { zones });
+
     let rack_init_request = NexusTypes::RackInitializationRequest {
+        blueprint: build_initial_blueprint_from_sled_configs(
+            sled_configs,
+            internal_dns_version,
+        ),
         services,
         datasets,
         internal_services_ip_pool_ranges,

From 7111cca1a3861db87885826a3478c015acbc4c11 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Mon, 11 Mar 2024 11:59:23 -0400
Subject: [PATCH 03/11] Nexus: insert RSS blueprint and make it the target
 during handoff

---
 .../db-queries/src/db/datastore/deployment.rs | 30 +++++++--
 nexus/db-queries/src/db/datastore/rack.rs     | 66 ++++++++++++++++++-
 nexus/src/app/rack.rs                         |  1 +
 3 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs
index 5fc17c88d0..501af6e244 100644
--- a/nexus/db-queries/src/db/datastore/deployment.rs
+++ b/nexus/db-queries/src/db/datastore/deployment.rs
@@ -79,6 +79,17 @@ impl DataStore {
         &self,
         opctx: &OpContext,
         blueprint: &Blueprint,
+    ) -> Result<(), Error> {
+        let conn = self.pool_connection_authorized(opctx).await?;
+        Self::blueprint_insert_on_connection(&conn, opctx, blueprint).await
+    }
+
+    /// Variant of [Self::blueprint_insert] which may be called from a
+    /// transaction context.
+    pub(crate) async fn blueprint_insert_on_connection(
+        conn: &async_bb8_diesel::Connection<DbConnection>,
+        opctx: &OpContext,
+        blueprint: &Blueprint,
     ) -> Result<(), Error> {
         opctx
             .authorize(authz::Action::Modify, &authz::BLUEPRINT_CONFIG)
@@ -152,8 +163,7 @@ impl DataStore {
         // batch rather than making a bunch of round-trips to the database.
         // We'd do that if we had an interface for doing that with bound
         // parameters, etc. See oxidecomputer/omicron#973.
-        let pool = self.pool_connection_authorized(opctx).await?;
-        pool.transaction_async(|conn| async move {
+        conn.transaction_async(|conn| async move {
             // Insert the row for the blueprint.
             {
                 use db::schema::blueprint::dsl;
@@ -620,6 +630,18 @@ impl DataStore {
         &self,
         opctx: &OpContext,
         target: BlueprintTarget,
+    ) -> Result<(), Error> {
+        let conn = self.pool_connection_authorized(opctx).await?;
+        Self::blueprint_target_set_current_on_connection(&conn, opctx, target)
+            .await
+    }
+
+    /// Variant of [Self::blueprint_target_set_current] which may be called from
+    /// a transaction context.
+    pub(crate) async fn blueprint_target_set_current_on_connection(
+        conn: &async_bb8_diesel::Connection<DbConnection>,
+        opctx: &OpContext,
+        target: BlueprintTarget,
     ) -> Result<(), Error> {
         opctx
             .authorize(authz::Action::Modify, &authz::BLUEPRINT_CONFIG)
@@ -631,10 +653,8 @@ impl DataStore {
             time_made_target: target.time_made_target,
         };
 
-        let conn = self.pool_connection_authorized(opctx).await?;
-
         query
-            .execute_async(&*conn)
+            .execute_async(conn)
             .await
             .map_err(|e| Error::from(query.decode_error(e)))?;
 
diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs
index 3224f36b8d..bc962e4b4d 100644
--- a/nexus/db-queries/src/db/datastore/rack.rs
+++ b/nexus/db-queries/src/db/datastore/rack.rs
@@ -44,6 +44,8 @@ use nexus_db_model::PasswordHashString;
 use nexus_db_model::SiloUser;
 use nexus_db_model::SiloUserPasswordHash;
 use nexus_db_model::SledUnderlaySubnetAllocation;
+use nexus_types::deployment::Blueprint;
+use nexus_types::deployment::BlueprintTarget;
 use nexus_types::external_api::params as external_params;
 use nexus_types::external_api::shared;
 use nexus_types::external_api::shared::IdentityType;
@@ -68,6 +70,7 @@ use uuid::Uuid;
 pub struct RackInit {
     pub rack_id: Uuid,
     pub rack_subnet: IpNetwork,
+    pub blueprint: Blueprint,
     pub services: Vec<internal_params::ServicePutRequest>,
     pub datasets: Vec<Dataset>,
     pub service_ip_pool_ranges: Vec<IpRange>,
@@ -85,6 +88,8 @@
 enum RackInitError {
     AddingIp(Error),
     AddingNic(Error),
+    BlueprintInsert(Error),
+    BlueprintTargetSet(Error),
     ServiceInsert(Error),
     DatasetInsert { err: AsyncInsertError, zpool_id: Uuid },
     RackUpdate { err: DieselError, rack_id: Uuid },
@@ -126,6 +131,12 @@
             RackInitError::ServiceInsert(err) => Error::internal_error(
                 &format!("failed to insert Service record: {:#}", err),
             ),
+            RackInitError::BlueprintInsert(err) => Error::internal_error(
+                &format!("failed to insert Blueprint: {:#}", err),
+            ),
+            RackInitError::BlueprintTargetSet(err) => Error::internal_error(
+                &format!("failed to set target Blueprint: {:#}", err),
+            ),
             RackInitError::RackUpdate { err, rack_id } => {
                 public_error_from_diesel(
                     err,
@@ -583,6 +594,7 @@
                 let service_pool = service_pool.clone();
                 async move {
                     let rack_id = rack_init.rack_id;
+                    let blueprint = rack_init.blueprint;
                     let services = rack_init.services;
                     let datasets = rack_init.datasets;
                     let service_ip_pool_ranges = rack_init.service_ip_pool_ranges;
                     let internal_dns = rack_init.internal_dns;
                     let external_dns = rack_init.external_dns;
@@ -608,7 +620,7 @@
                         return Ok::<_, DieselError>(rack);
                     }
 
-                    // Otherwise, insert services and datasets.
+                    // Otherwise, insert blueprint and datasets.
 
                     // Set up the IP pool for internal services.
                     for range in service_ip_pool_ranges {
@@ -629,6 +641,46 @@
                         })?;
                     }
 
+                    // Insert the RSS-generated blueprint.
+                    Self::blueprint_insert_on_connection(
+                        &conn,
+                        opctx,
+                        &blueprint,
+                    ).await
+                    .map_err(|e| {
+                        warn!(
+                            log,
+                            "Initializing Rack: Failed to insert blueprint"
+                        );
+                        err.set(RackInitError::BlueprintInsert(e)).unwrap();
+                        DieselError::RollbackTransaction
+                    })?;
+
+                    // Mark the RSS-generated blueprint as the current target,
+                    // DISABLED. We may change this to enabled in the future
+                    // when more of Reconfigurator is automated, but for now we
+                    // require a support operation to enable it.
+                    Self::blueprint_target_set_current_on_connection(
+                        &conn,
+                        opctx,
+                        BlueprintTarget {
+                            target_id: blueprint.id,
+                            enabled: false,
+                            time_made_target: Utc::now(),
+                        },
+                    )
+                    .await
+                    .map_err(|e| {
+                        warn!(
+                            log,
+                            "Initializing Rack: \
+                             Failed to set blueprint as target"
+                        );
+                        err.set(RackInitError::BlueprintTargetSet(e))
+                            .unwrap();
+                        DieselError::RollbackTransaction
+                    })?;
+
                     // Allocate records for all services.
                     for service in services {
                         self.rack_populate_service_records(
@@ -882,7 +934,7 @@ mod test {
     };
     use omicron_common::api::internal::shared::SourceNatConfig;
     use omicron_test_utils::dev;
-    use std::collections::HashMap;
+    use std::collections::{BTreeMap, BTreeSet, HashMap};
     use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddrV6};
     use std::num::NonZeroU32;
 
@@ -893,6 +945,16 @@
         RackInit {
             rack_id: Uuid::parse_str(nexus_test_utils::RACK_UUID).unwrap(),
             rack_subnet: nexus_test_utils::RACK_SUBNET.parse().unwrap(),
+            blueprint: Blueprint {
+                id: Uuid::new_v4(),
+                omicron_zones: BTreeMap::new(),
+                zones_in_service: BTreeSet::new(),
+                parent_blueprint_id: None,
+                internal_dns_version: Generation::new().next(),
+                time_created: Utc::now(),
+                creator: "test suite".to_string(),
+                comment: "test suite".to_string(),
+            },
             services: vec![],
             datasets: vec![],
             service_ip_pool_ranges: vec![],
diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs
index 4030fce31d..079578619a 100644
--- a/nexus/src/app/rack.rs
+++ b/nexus/src/app/rack.rs
@@ -221,6 +221,7 @@ impl super::Nexus {
             RackInit {
                 rack_subnet: rack_network_config.rack_subnet.into(),
                 rack_id,
+                blueprint: request.blueprint,
                 services: request.services,
                 datasets,
                 service_ip_pool_ranges,

From 325047cf3852069a3480619e1ae038c5367c2089 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Tue, 12 Mar 2024 11:04:47 -0400
Subject: [PATCH 04/11] nexus-test-utils: set reasonableish initial blueprint

---
 nexus/src/app/background/blueprint_load.rs | 29 ++++++++------
 nexus/test-utils/src/lib.rs                | 46 +++++++++++++++++-----
 2 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/nexus/src/app/background/blueprint_load.rs b/nexus/src/app/background/blueprint_load.rs
index 8886df81cd..8968621f45 100644
--- a/nexus/src/app/background/blueprint_load.rs
+++ b/nexus/src/app/background/blueprint_load.rs
@@ -193,7 +193,7 @@
         nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
 
     fn create_blueprint(
-        parent_blueprint_id: Option<Uuid>,
+        parent_blueprint_id: Uuid,
     ) -> (BlueprintTarget, Blueprint) {
         let id = Uuid::new_v4();
         (
@@ -206,7 +206,7 @@
                 id,
                 omicron_zones: BTreeMap::new(),
                 zones_in_service: BTreeSet::new(),
-                parent_blueprint_id,
+                parent_blueprint_id: Some(parent_blueprint_id),
                 internal_dns_version: Generation::new(),
                 time_created: now_db_precision(),
                 creator: "test".to_string(),
@@ -236,26 +236,31 @@
         let mut task = TargetBlueprintLoader::new(datastore.clone());
         let mut rx = task.watcher();
 
-        // We expect an appropriate status with no blueprint in the datastore
+        // We expect to see the initial blueprint set up by nexus-test-utils
+        // (emulating RSS).
        let value = task.activate(&opctx).await;
-        assert_eq!(json!({"status": "no target blueprint"}), value);
-        assert!(rx.borrow().is_none());
+        let initial_blueprint =
+            rx.borrow_and_update().clone().expect("no initial blueprint");
+        let update = serde_json::from_value::<TargetUpdate>(value).unwrap();
+        assert_eq!(update.target_id, initial_blueprint.1.id);
+        assert_eq!(update.status, "first target blueprint");
 
-        let (target, blueprint) = create_blueprint(None);
+        let (target, blueprint) = create_blueprint(update.target_id);
 
-        // Inserting a blueprint, but not making it the target returns the same
-        // status
+        // Inserting a blueprint, but not making it the target returns a status
+        // indicating that the target hasn't changed
         datastore.blueprint_insert(&opctx, &blueprint).await.unwrap();
         let value = task.activate(&opctx).await;
-        assert_eq!(json!({"status": "no target blueprint"}), value);
-        assert!(rx.borrow().is_none());
+        let update = serde_json::from_value::<TargetUpdate>(value).unwrap();
+        assert_eq!(update.target_id, initial_blueprint.1.id);
+        assert_eq!(update.status, "target blueprint unchanged");
 
         // Setting a target blueprint makes the loader see it and broadcast it
         datastore.blueprint_target_set_current(&opctx, target).await.unwrap();
         let value = task.activate(&opctx).await;
         let update = serde_json::from_value::<TargetUpdate>(value).unwrap();
         assert_eq!(update.target_id, blueprint.id);
-        assert_eq!(update.status, "first target blueprint");
+        assert_eq!(update.status, "target blueprint updated");
         let rx_update = rx.borrow_and_update().clone().unwrap();
         assert_eq!(rx_update.0, target);
         assert_eq!(rx_update.1, blueprint);
@@ -268,7 +273,7 @@
         assert_eq!(false, rx.has_changed().unwrap());
 
         // Adding a new blueprint and updating the target triggers a change
-        let (new_target, new_blueprint) = create_blueprint(Some(blueprint.id));
+        let (new_target, new_blueprint) = create_blueprint(blueprint.id);
         datastore.blueprint_insert(&opctx, &new_blueprint).await.unwrap();
         datastore
             .blueprint_target_set_current(&opctx, new_target)
diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs
index a557cef3dc..dfa06e8ce4 100644
--- a/nexus/test-utils/src/lib.rs
+++ b/nexus/test-utils/src/lib.rs
@@ -789,22 +789,48 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
             user_password_hash,
         };
 
+        let blueprint = {
+            let mut omicron_zones = BTreeMap::new();
+            let mut zones_in_service = BTreeSet::new();
+            for (maybe_sled_agent, zones) in [
+                (self.sled_agent.as_ref(), &self.omicron_zones),
+                (self.sled_agent2.as_ref(), &self.omicron_zones2),
+            ] {
+                if let Some(sa) = maybe_sled_agent {
+                    omicron_zones.insert(
+                        sa.sled_agent.id,
+                        OmicronZonesConfig {
+                            generation: Generation::new().next(),
+                            zones: zones.clone(),
+                        },
+                    );
+                    for z in zones {
+                        zones_in_service.insert(z.id);
+                    }
+                }
+            }
+            Blueprint {
+                id: Uuid::new_v4(),
+                omicron_zones,
+                zones_in_service,
+                parent_blueprint_id: None,
+                internal_dns_version: dns_config
+                    .generation
+                    .try_into()
+                    .expect("bad internal DNS generation"),
+                time_created: Utc::now(),
+                creator: "nexus-test-utils".to_string(),
+                comment: "initial test blueprint".to_string(),
+            }
+        };
+
+        // Handoff all known service information to Nexus
         let server = N::start(
             self.nexus_internal
                 .take()
                 .expect("Must launch internal nexus first"),
             self.config,
-            blueprint,
-            Blueprint {
-                id: Uuid::new_v4(),
-                omicron_zones: BTreeMap::new(),
-                zones_in_service: BTreeSet::new(),
-                parent_blueprint_id: None,
-                internal_dns_version: Generation::new(),
-                time_created: Utc::now(),
-                creator: "fixme".to_string(),
-                comment: "fixme".to_string(),
-            },
+            blueprint,
             self.rack_init_builder.services.clone(),
             // NOTE: We should probably hand off
             // "self.rack_init_builder.datasets" here, but Nexus won't be happy

From aac78b0b29986e6b65691c47f3e5cda96a2bf7f5 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Tue, 12 Mar 2024 14:37:57 -0400
Subject: [PATCH 05/11] unwedge rustfmt

---
 nexus/db-queries/src/db/datastore/rack.rs | 92 ++++++++++++-----------
 1 file changed, 49 insertions(+), 43 deletions(-)

diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs
index bc962e4b4d..325a199cc1 100644
--- a/nexus/db-queries/src/db/datastore/rack.rs
+++ b/nexus/db-queries/src/db/datastore/rack.rs
@@ -597,7 +597,8 @@ impl DataStore {
                     let blueprint = rack_init.blueprint;
                     let services = rack_init.services;
                     let datasets = rack_init.datasets;
-                    let service_ip_pool_ranges = rack_init.service_ip_pool_ranges;
+                    let service_ip_pool_ranges =
+                        rack_init.service_ip_pool_ranges;
                     let internal_dns = rack_init.internal_dns;
                     let external_dns = rack_init.external_dns;
 
@@ -608,11 +609,15 @@
                         .get_result_async(&conn)
                         .await
                         .map_err(|e| {
-                            warn!(log, "Initializing Rack: Rack UUID not found");
+                            warn!(
+                                log,
+                                "Initializing Rack: Rack UUID not found"
+                            );
                             err.set(RackInitError::RackUpdate {
                                 err: e,
                                 rack_id,
-                            }).unwrap();
+                            })
+                            .unwrap();
                             DieselError::RollbackTransaction
                         })?;
                     if rack.initialized {
@@ -643,18 +648,17 @@
                     // Insert the RSS-generated blueprint.
                     Self::blueprint_insert_on_connection(
-                        &conn,
-                        opctx,
-                        &blueprint,
-                    ).await
-                    .map_err(|e| {
-                        warn!(
-                            log,
-                            "Initializing Rack: Failed to insert blueprint"
-                        );
-                        err.set(RackInitError::BlueprintInsert(e)).unwrap();
-                        DieselError::RollbackTransaction
-                    })?;
+                        &conn, opctx, &blueprint,
+                    )
+                    .await
+                    .map_err(|e| {
+                        warn!(
+                            log,
+                            "Initializing Rack: Failed to insert blueprint"
+                        );
+                        err.set(RackInitError::BlueprintInsert(e)).unwrap();
+                        DieselError::RollbackTransaction
+                    })?;
 
                     // Mark the RSS-generated blueprint as the current target,
                     // DISABLED. We may change this to enabled in the future
                     // when more of Reconfigurator is automated, but for now we
                     // require a support operation to enable it.
                     Self::blueprint_target_set_current_on_connection(
                         &conn,
                         opctx,
                         BlueprintTarget {
                             target_id: blueprint.id,
                             enabled: false,
                             time_made_target: Utc::now(),
                         },
-                    )
-                    .await
-                    .map_err(|e| {
-                        warn!(
-                            log,
-                            "Initializing Rack: \
+                    )
+                    .await
+                    .map_err(|e| {
+                        warn!(
+                            log,
+                            "Initializing Rack: \
                              Failed to set blueprint as target"
-                        );
-                        err.set(RackInitError::BlueprintTargetSet(e))
-                            .unwrap();
-                        DieselError::RollbackTransaction
-                    })?;
+                        );
+                        err.set(RackInitError::BlueprintTargetSet(e)).unwrap();
+                        DieselError::RollbackTransaction
+                    })?;
 
                     // Allocate records for all services.
                     for service in services {
@@ -700,7 +703,7 @@
                     for dataset in datasets {
                         use db::schema::dataset::dsl;
                         let zpool_id = dataset.pool_id;
-                        <Zpool as DatastoreCollection<Dataset>>::insert_resource(
+                        Zpool::insert_resource(
                             zpool_id,
                             diesel::insert_into(dsl::dataset)
                                 .values(dataset.clone())
@@ -720,7 +723,8 @@
                                 err.set(RackInitError::DatasetInsert {
                                     err: e,
                                     zpool_id,
-                                }).unwrap();
+                                })
+                                .unwrap();
                                 DieselError::RollbackTransaction
                             })?;
                     }
 
                     // Insert the initial contents of the internal and external DNS
                     // zones.
- Self::load_dns_data(&conn, internal_dns) - .await - .map_err(|e| { - err.set(RackInitError::DnsSerialization(e)).unwrap(); + Self::load_dns_data(&conn, internal_dns).await.map_err( + |e| { + err.set(RackInitError::DnsSerialization(e)) + .unwrap(); DieselError::RollbackTransaction - })?; + }, + )?; info!(log, "Populated DNS tables for internal DNS"); - Self::load_dns_data(&conn, external_dns) - .await - .map_err(|e| { - err.set(RackInitError::DnsSerialization(e)).unwrap(); - DieselError::RollbackTransaction - })?; + Self::load_dns_data(&conn, external_dns).await.map_err( + |e| { + err.set(RackInitError::DnsSerialization(e)) + .unwrap(); + DieselError::RollbackTransaction + }, + )?; info!(log, "Populated DNS tables for external DNS"); // Create the initial Recovery Silo @@ -761,7 +767,7 @@ impl DataStore { _ => { err.set(e).unwrap(); DieselError::RollbackTransaction - }, + } })?; let rack = diesel::update(rack_dsl::rack) @@ -780,13 +786,13 @@ impl DataStore { err.set(RackInitError::RackUpdate { err: e, rack_id, - }).unwrap(); + }) + .unwrap(); DieselError::RollbackTransaction })?; Ok(rack) } - }, - ) + }) .await .map_err(|e| { if let Some(err) = Arc::try_unwrap(err).unwrap().take() { From 3cccdedaae18407b323238f8ec81ef391762f276 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 14 Mar 2024 09:25:21 -0400 Subject: [PATCH 06/11] fixups from merging main --- nexus/db-queries/src/db/datastore/rack.rs | 3 ++- nexus/test-utils/src/lib.rs | 1 + sled-agent/src/rack_setup/service.rs | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index 4efb264587..3d7669e2c0 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -946,7 +946,8 @@ mod test { omicron_zones: BTreeMap::new(), zones_in_service: BTreeSet::new(), parent_blueprint_id: None, - internal_dns_version: Generation::new().next(), + internal_dns_version: *Generation::new(), + external_dns_version: *Generation::new(), time_created: Utc::now(), creator: "test suite".to_string(), comment: "test suite".to_string(), diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index dfa06e8ce4..0358cf523c 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -818,6 +818,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .generation .try_into() .expect("bad internal DNS generation"), + external_dns_version: Generation::new(), time_created: Utc::now(), creator: "nexus-test-utils".to_string(), comment: "initial test blueprint".to_string(), diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index d14d04cc04..1e6f1b574d 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1167,6 +1167,13 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( zones_in_service, parent_blueprint_id: None, internal_dns_version, + // We don't actually configure external DNS during RSS. During handoff, + // Nexus creates an initial empty external DNS configuration, then + // creates the recovery silo, which bumps the generation. + // + // TODO-john How should we do this? Baking this knowledge of how Nexus + // sets up external DNS here seems bad. 
+ external_dns_version: Generation::new().next(), time_created: Utc::now(), creator: "RSS".to_string(), comment: "initial blueprint from rack setup".to_string(), From c5737515d179cc363439500bb245bcf5e00ec029 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 14 Mar 2024 09:25:41 -0400 Subject: [PATCH 07/11] derive SlogInlineError on omicron_common::api::external::Error --- Cargo.lock | 1 + common/Cargo.toml | 1 + common/src/api/external/error.rs | 11 ++++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index ba0f5d5b23..ec66987948 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5069,6 +5069,7 @@ dependencies = [ "serde_urlencoded", "serde_with", "slog", + "slog-error-chain", "strum 0.26.1", "test-strategy", "thiserror", diff --git a/common/Cargo.toml b/common/Cargo.toml index 4451d92bdb..c03d1b8f30 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -34,6 +34,7 @@ serde_human_bytes.workspace = true serde_json.workspace = true serde_with.workspace = true slog.workspace = true +slog-error-chain.workspace = true strum.workspace = true test-strategy = { workspace = true, optional = true } thiserror.workspace = true diff --git a/common/src/api/external/error.rs b/common/src/api/external/error.rs index ef64aa3bbb..6b3b93187f 100644 --- a/common/src/api/external/error.rs +++ b/common/src/api/external/error.rs @@ -12,6 +12,7 @@ use dropshot::HttpError; use omicron_uuid_kinds::GenericUuid; use serde::Deserialize; use serde::Serialize; +use slog_error_chain::SlogInlineError; use std::fmt::Display; use uuid::Uuid; @@ -26,7 +27,15 @@ use uuid::Uuid; /// General best practices for error design apply here. Where possible, we want /// to reuse existing variants rather than inventing new ones to distinguish /// cases that no programmatic consumer needs to distinguish. -#[derive(Clone, Debug, Deserialize, thiserror::Error, PartialEq, Serialize)] +#[derive( + Clone, + Debug, + Deserialize, + thiserror::Error, + PartialEq, + Serialize, + SlogInlineError, +)] pub enum Error { /// An object needed as part of this operation was not found. 
#[error("Object (of type {lookup_type:?}) not found: {type_name}")] From f1acada1266311c7ae518ff570d69c842dcecb01 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 14 Mar 2024 09:30:39 -0400 Subject: [PATCH 08/11] more verbose error logs for failed rack init --- Cargo.lock | 1 + nexus/db-queries/Cargo.toml | 1 + nexus/db-queries/src/db/datastore/rack.rs | 31 ++++++++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ec66987948..793b330977 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4507,6 +4507,7 @@ dependencies = [ "serde_with", "sled-agent-client", "slog", + "slog-error-chain", "static_assertions", "steno", "strum 0.26.1", diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index a33ca24c84..9bb7040af9 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -42,6 +42,7 @@ serde_urlencoded.workspace = true serde_with.workspace = true sled-agent-client.workspace = true slog.workspace = true +slog-error-chain.workspace = true static_assertions.workspace = true steno.workspace = true strum.workspace = true diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index 3d7669e2c0..e290343978 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -62,6 +62,7 @@ use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use omicron_common::bail_unless; +use slog_error_chain::InlineErrorChain; use std::net::IpAddr; use std::sync::{Arc, OnceLock}; use uuid::Uuid; @@ -530,11 +531,12 @@ impl DataStore { Self::allocate_external_ip_on_connection(conn, db_ip) .await .map_err(|err| { - warn!( + error!( log, "Initializing Rack: Failed to allocate \ - IP address for {}", - service.kind, + IP address for {}", + service.kind; + "err" => %err, ); match err.retryable() { Retryable(e) => RackInitError::Retryable(e), @@ -610,9 +612,10 @@ impl DataStore { .get_result_async(&conn) .await .map_err(|e| { - warn!( + error!( log, - "Initializing Rack: Rack UUID not found" + "Initializing Rack: Rack UUID not found"; + InlineErrorChain::new(&e), ); err.set(RackInitError::RackUpdate { err: e, @@ -638,9 +641,11 @@ impl DataStore { ) .await .map_err(|e| { - warn!( + error!( log, - "Initializing Rack: Failed to add IP pool range" + "Initializing Rack: Failed to add \ + IP pool range"; + &e, ); err.set(RackInitError::AddingIp(e)).unwrap(); DieselError::RollbackTransaction @@ -653,9 +658,10 @@ impl DataStore { ) .await .map_err(|e| { - warn!( + error!( log, - "Initializing Rack: Failed to insert blueprint" + "Initializing Rack: Failed to insert blueprint"; + &e, ); err.set(RackInitError::BlueprintInsert(e)).unwrap(); DieselError::RollbackTransaction @@ -676,10 +682,11 @@ impl DataStore { ) .await .map_err(|e| { - warn!( + error!( log, - "Initializing Rack: \ - Failed to set blueprint as target" + "Initializing Rack: Failed to set blueprint \ + as target"; + &e, ); err.set(RackInitError::BlueprintTargetSet(e)).unwrap(); DieselError::RollbackTransaction From c1ca898f97bf70799eaccab1450d0833ce4715e3 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 14 Mar 2024 09:31:25 -0400 Subject: [PATCH 09/11] error wordsmithing --- sled-agent/src/rack_setup/service.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 1e6f1b574d..b62555a899 100644 --- 
a/sled-agent/src/rack_setup/service.rs
+++ b/sled-agent/src/rack_setup/service.rs
@@ -1114,7 +1114,10 @@ fn build_initial_blueprint_from_plan(
         let entry = match sled_configs.entry(sled_id) {
             btree_map::Entry::Vacant(entry) => entry,
             btree_map::Entry::Occupied(_) => {
-                bail!("duplicate sled address deriving blueprint: {sled_addr}");
+                bail!(
+                    "duplicate sled address found while deriving blueprint: \
+                    {sled_addr}"
+                );
             }
         };
         let sled_config =

From 51b031bb316ccf203276a865e9fbcc143e0c8e0b Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Thu, 14 Mar 2024 10:54:22 -0400
Subject: [PATCH 10/11] fix failing test (use initial blueprint instead of
 collection)

---
 nexus/reconfigurator/execution/src/dns.rs | 63 +++--------------------
 1 file changed, 6 insertions(+), 57 deletions(-)

diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index 9e0434c59f..aa3ee956e2 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -490,13 +490,9 @@ mod test {
     use omicron_common::address::NEXUS_REDUNDANCY;
     use omicron_common::address::RACK_PREFIX;
     use omicron_common::address::SLED_PREFIX;
-    use omicron_common::api::external::Error;
     use omicron_common::api::external::Generation;
     use omicron_common::api::external::IdentityMetadataCreateParams;
-    use omicron_test_utils::dev::poll::wait_for_condition;
-    use omicron_test_utils::dev::poll::CondCheckError;
     use omicron_test_utils::dev::test_setup_log;
-    use slog::{debug, info};
     use std::collections::BTreeMap;
     use std::collections::BTreeSet;
     use std::collections::HashMap;
@@ -506,7 +502,6 @@
     use std::net::SocketAddrV6;
     use std::str::FromStr;
     use std::sync::Arc;
-    use std::time::Duration;
     use uuid::Uuid;
 
     type ControlPlaneTestContext =
        nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
@@ -1171,38 +1166,6 @@
             datastore.clone(),
         );
 
-        // First, wait until Nexus has successfully completed an inventory
-        // collection.
-        let collection = wait_for_condition(
-            || async {
-                let result =
-                    datastore.inventory_get_latest_collection(&opctx).await;
-                let log_result = match &result {
-                    Ok(Some(_)) => Ok("found"),
-                    Ok(None) => Ok("not found"),
-                    Err(error) => Err(error),
-                };
-                debug!(
-                    log,
-                    "attempt to fetch latest inventory collection";
-                    "result" => ?log_result,
-                );
-
-                match result {
-                    Ok(None) => Err(CondCheckError::NotYet),
-                    Ok(Some(c)) => Ok(c),
-                    Err(Error::ServiceUnavailable { .. }) => {
-                        Err(CondCheckError::NotYet)
-                    }
-                    Err(error) => Err(CondCheckError::Failed(error)),
-                }
-            },
-            &Duration::from_millis(50),
-            &Duration::from_secs(30),
-        )
-        .await
-        .expect("expected to find inventory collection");
-
         // Fetch the initial contents of internal and external DNS.
         let dns_initial_internal = datastore
             .dns_config_read(&opctx, DnsGroup::Internal)
@@ -1213,29 +1176,15 @@
             .await
             .expect("fetching initial external DNS");
 
-        // Now, use the collection to construct an initial blueprint.
-        // This stores it into the database, too.
-        info!(log, "using collection"; "collection_id" => %collection.id);
-        let blueprint = nexus
-            .blueprint_generate_from_collection(&opctx, collection.id)
-            .await
-            .expect("failed to generate initial blueprint");
+        // Fetch the initial blueprint installed during rack initialization.
+        let (_blueprint_target, blueprint) = datastore
+            .blueprint_target_get_current_full(&opctx)
+            .await
+            .expect("failed to read current target blueprint")
+            .expect("no target blueprint set");
         eprintln!("blueprint: {:?}", blueprint);
 
-        // Set it as the current target. We'll need this later.
- datastore - .blueprint_target_set_current( - &opctx, - BlueprintTarget { - target_id: blueprint.id, - enabled: false, - time_made_target: chrono::Utc::now(), - }, - ) - .await - .expect("failed to set blueprint as target"); - - // Now, execute the blueprint. + // Now, execute the initial blueprint. let overrides = Overridables::for_test(cptestctx); crate::realize_blueprint_with_overrides( &opctx, From 0b2829874a83445514e7124545f0866d796c8f3a Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 15 Mar 2024 08:36:06 -0400 Subject: [PATCH 11/11] cleaner initial blueprint external_dns_version --- nexus/src/app/rack.rs | 15 +++++++++++---- sled-agent/src/rack_setup/service.rs | 11 ++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 59182c3466..9905df04ff 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -190,6 +190,12 @@ impl super::Nexus { format!("{silo_dns_name}.{}", request.external_dns_zone_name); dns_update.add_name(silo_dns_name, dns_records)?; + // We're providing an update to the initial `external_dns` group we + // defined above; also bump RSS's blueprint's `external_dns_version` to + // match this update. + let mut blueprint = request.blueprint; + blueprint.external_dns_version = blueprint.external_dns_version.next(); + // Administrators of the Recovery Silo are automatically made // administrators of the Fleet. let mapped_fleet_roles = BTreeMap::from([( @@ -202,9 +208,10 @@ impl super::Nexus { name: request.recovery_silo.silo_name, description: "built-in recovery Silo".to_string(), }, - // The recovery silo is initialized with no allocated capacity given it's - // not intended to be used to deploy workloads. Operators can add capacity - // after the fact if they want to use it for that purpose. + // The recovery silo is initialized with no allocated capacity given + // it's not intended to be used to deploy workloads. Operators can + // add capacity after the fact if they want to use it for that + // purpose. quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: SiloIdentityMode::LocalOnly, @@ -221,7 +228,7 @@ impl super::Nexus { RackInit { rack_subnet: rack_network_config.rack_subnet.into(), rack_id, - blueprint: request.blueprint, + blueprint, services: request.services, datasets, service_ip_pool_ranges, diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index b62555a899..6b189f423c 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1170,13 +1170,10 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( zones_in_service, parent_blueprint_id: None, internal_dns_version, - // We don't actually configure external DNS during RSS. During handoff, - // Nexus creates an initial empty external DNS configuration, then - // creates the recovery silo, which bumps the generation. - // - // TODO-john How should we do this? Baking this knowledge of how Nexus - // sets up external DNS here seems bad. - external_dns_version: Generation::new().next(), + // We don't configure external DNS during RSS, so set it to an initial + // generation of 1. Nexus will bump this up when it updates external DNS + // (including creating the recovery silo). + external_dns_version: Generation::new(), time_created: Utc::now(), creator: "RSS".to_string(), comment: "initial blueprint from rack setup".to_string(),
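
The const-`Generation` changes in patch 01 are what allow `DeployStepVersion` (added in patch 01 to sled-agent/src/rack_setup/service.rs) to derive each deploy stage's generation number at compile time. Below is a minimal, self-contained sketch of that pattern, using simplified stand-in types rather than the actual omicron definitions:

// Sketch of the const-generation pattern from patch 01; simplified
// stand-ins, not the real omicron types.

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Generation(u64);

impl Generation {
    const fn new() -> Generation {
        Generation(1)
    }

    const fn from_u32(value: u32) -> Generation {
        // `u32 as u64` is always lossless, and `as` keeps this callable in
        // `const` contexts.
        Generation(value as u64)
    }

    const fn next(&self) -> Generation {
        let next_gen = self.0 + 1;
        // Panicking is allowed in const fns, so this overflow guard is also
        // enforced when `next` is evaluated at compile time.
        assert!(next_gen <= i64::MAX as u64);
        Generation(next_gen)
    }
}

// Mirrors `DeployStepVersion`: each stage's generation is derived from the
// previous one entirely at compile time.
struct DeployStepVersion;

impl DeployStepVersion {
    const V1_NOTHING: Generation = Generation::from_u32(1);
    const V2_DNS_ONLY: Generation = Self::V1_NOTHING.next();
    const V3_DNS_AND_NTP: Generation = Self::V2_DNS_ONLY.next();
    const V4_COCKROACHDB: Generation = Self::V3_DNS_AND_NTP.next();
    const V5_EVERYTHING: Generation = Self::V4_COCKROACHDB.next();
}

fn main() {
    // Generation 1 ("no zones") through generation 5 ("everything").
    assert_eq!(DeployStepVersion::V1_NOTHING, Generation::new());
    assert_eq!(DeployStepVersion::V5_EVERYTHING, Generation(5));
    println!("final deploy step: {:?}", DeployStepVersion::V5_EVERYTHING);
}

Because each constant is computed from its predecessor by a `const fn`, the overflow `assert!` inside `next` runs during compilation: a version chain that somehow exceeded `i64::MAX` would fail to build rather than panic at runtime.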