diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 9a2a39267c..eda6b00ef9 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -147,6 +147,10 @@ impl RunningZone { format!("{}/root", self.inner.zonepath.display()) } + pub fn control_interface(&self) -> AddrObject { + AddrObject::new(self.inner.get_control_vnic_name(), "omicron6").unwrap() + } + /// Runs a command within the Zone, return the output. pub fn run_cmd<I, S>(&self, args: I) -> Result<String, RunCommandError> where @@ -592,8 +596,8 @@ impl InstalledZone { /// /// This results in a zone name which is distinct across different zpools, /// but stable and predictable across reboots. - pub fn get_zone_name(zone_name: &str, unique_name: Option<&str>) -> String { - let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_name); + pub fn get_zone_name(zone_type: &str, unique_name: Option<&str>) -> String { + let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_type); if let Some(suffix) = unique_name { zone_name.push_str(&format!("_{}", suffix)); } @@ -618,7 +622,7 @@ impl InstalledZone { log: &Logger, underlay_vnic_allocator: &VnicAllocator<Etherstub>, zone_root_path: &Path, - zone_name: &str, + zone_type: &str, unique_name: Option<&str>, datasets: &[zone::Dataset], filesystems: &[zone::Fs], @@ -631,14 +635,14 @@ impl InstalledZone { let control_vnic = underlay_vnic_allocator.new_control(None).map_err(|err| { InstallZoneError::CreateVnic { - zone: zone_name.to_string(), + zone: zone_type.to_string(), err, } })?; - let full_zone_name = Self::get_zone_name(zone_name, unique_name); + let full_zone_name = Self::get_zone_name(zone_type, unique_name); let zone_image_path = - PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_name)); + PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_type)); let net_device_names: Vec<String> = opte_ports .iter() diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index cd0fa847c7..dc4507f0ea 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -249,6 +249,7 @@ impl Zpool { } #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, JsonSchema)] +#[serde(rename_all = "snake_case")] pub enum ZpoolKind { // This zpool is used for external storage (u.2) External, diff --git a/internal-dns/src/resolver.rs b/internal-dns/src/resolver.rs index c6e0ea1e19..484f31c7fa 100644 --- a/internal-dns/src/resolver.rs +++ b/internal-dns/src/resolver.rs @@ -121,6 +121,31 @@ impl Resolver { Ok(*address) } + pub async fn lookup_all_ipv6( + &self, + srv: crate::ServiceName, + ) -> Result<Vec<Ipv6Addr>, ResolveError> { + let name = format!("{}.{}", srv.dns_name(), DNS_ZONE); + debug!(self.log, "lookup_ipv6 srv"; "dns_name" => &name); + let response = self.inner.ipv6_lookup(&name).await?; + let addresses = response.iter().map(|a| *a).collect::<Vec<_>>(); + Ok(addresses) + } + + pub async fn lookup_ip( + &self, + srv: crate::ServiceName, + ) -> Result<IpAddr, ResolveError> { + let name = format!("{}.{}", srv.dns_name(), DNS_ZONE); + debug!(self.log, "lookup srv"; "dns_name" => &name); + let response = self.inner.lookup_ip(&name).await?; + let address = response + .iter() + .next() + .ok_or_else(|| ResolveError::NotFound(srv))?; + Ok(address) + } + /// Looks up a single [`SocketAddrV6`] based on the SRV name /// Returns an error if the record does not exist.
pub async fn lookup_socket_v6( @@ -156,20 +181,6 @@ impl Resolver { } }) } - - pub async fn lookup_ip( - &self, - srv: crate::ServiceName, - ) -> Result<IpAddr, ResolveError> { - let name = format!("{}.{}", srv.dns_name(), DNS_ZONE); - debug!(self.log, "lookup srv"; "dns_name" => &name); - let response = self.inner.lookup_ip(&name).await?; - let address = response - .iter() - .next() - .ok_or_else(|| ResolveError::NotFound(srv))?; - Ok(address) - } } #[cfg(test)] diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index a268eb0095..26914d2a88 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -10,6 +10,24 @@ "version": "0.0.1" }, "paths": { + "/cockroachdb": { + "post": { + "summary": "Initializes a CockroachDB cluster, calling:", + "description": "\nand also populating the contents of the filesystem with preliminary tables.", + "operationId": "cockroachdb_init", + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/disks/{disk_id}": { "put": { "operationId": "disk_put", @@ -640,13 +658,6 @@ { "type": "object", "properties": { - "all_addresses": { - "description": "The addresses of all nodes within the cluster.", - "type": "array", - "items": { - "type": "string" - } - }, "type": { "type": "string", "enum": [ @@ -655,7 +666,6 @@ } }, "required": [ - "all_addresses", "type" ] }, @@ -689,6 +699,21 @@ } ] }, + "DatasetName": { + "type": "object", + "properties": { + "kind": { + "$ref": "#/components/schemas/DatasetKind" + }, + "pool_name": { + "$ref": "#/components/schemas/ZpoolName" + } + }, + "required": [ + "kind", + "pool_name" + ] + }, "DendriteAsic": { "type": "string", "enum": [ @@ -1738,7 +1763,7 @@ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, "ServiceEnsureBody": { - "description": "Used to request that the Sled initialize certain services on initialization.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.", + "description": "Used to request that the Sled initialize multiple services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.", "type": "object", "properties": { "services": { @@ -2036,6 +2061,48 @@ "mode", "type" ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "clickhouse" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "cockroach_db" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "crucible" + ] + } + }, + "required": [ + "type" + ] } ] }, @@ -2050,6 +2117,15 @@ "format": "ipv6" } }, + "dataset": { + "nullable": true, + "default": null, + "allOf": [ + { + "$ref": "#/components/schemas/DatasetName" + } + ] + }, "gz_addresses": { "default": [], "type": "array", @@ -2080,6 +2156,7 @@ ] }, "ServiceZoneService": { + "description": "Used to request that the Sled initialize a single service.", "type": "object", "properties": { "details": { @@ -2481,13 +2558,16 @@ "description": "The type of zone which may be requested from Sled Agent", "type": "string", "enum": [
"clickhouse", + "cockroach_db", + "crucible_pantry", + "crucible", "external_dns", "internal_dns", "nexus", + "ntp", "oximeter", - "switch", - "crucible_pantry", - "ntp" + "switch" ] }, "Zpool": { @@ -2505,6 +2585,30 @@ "disk_type", "id" ] + }, + "ZpoolKind": { + "type": "string", + "enum": [ + "external", + "internal" + ] + }, + "ZpoolName": { + "description": "A wrapper around a zpool name.\n\nThis expects that the format will be: `ox{i,p}_` - we parse the prefix when reading the structure, and validate that the UUID can be utilized.", + "type": "object", + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "kind": { + "$ref": "#/components/schemas/ZpoolKind" + } + }, + "required": [ + "id", + "kind" + ] } } } diff --git a/sled-agent/src/bootstrap/hardware.rs b/sled-agent/src/bootstrap/hardware.rs index 56d0f9c937..4312f80457 100644 --- a/sled-agent/src/bootstrap/hardware.rs +++ b/sled-agent/src/bootstrap/hardware.rs @@ -178,10 +178,7 @@ impl HardwareMonitor { let hardware = HardwareManager::new(log, sled_mode) .map_err(|e| Error::Hardware(e))?; - // TODO: The coupling between the storage and service manager is growing - // pretty tight; we should consider merging them together. - let storage_manager = - StorageManager::new(&log, underlay_etherstub.clone()).await; + let storage_manager = StorageManager::new(&log).await; let service_manager = ServiceManager::new( log.clone(), diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 0fc17cb8e0..9c01c75af9 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -11,9 +11,19 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use serde_with::serde_as; use std::borrow::Cow; +use std::collections::HashSet; use std::net::{Ipv4Addr, Ipv6Addr, SocketAddrV6}; use uuid::Uuid; +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum BootstrapAddressDiscovery { + /// Ignore all bootstrap addresses except our own. + OnlyOurs, + /// Ignore all bootstrap addresses except the following. + OnlyThese(HashSet), +} + /// Configuration for the "rack setup service". /// /// The Rack Setup Service should be responsible for one-time setup actions, @@ -24,6 +34,9 @@ use uuid::Uuid; pub struct RackInitializeRequest { pub rack_subnet: Ipv6Addr, + /// Describes how bootstrap addresses should be collected during RSS. + pub bootstrap_discovery: BootstrapAddressDiscovery, + /// The minimum number of sleds required to unlock the rack secret. 
/// /// If this value is less than 2, no rack secret will be created on startup; diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 027cf7dc88..06757e2e1a 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -32,6 +32,7 @@ pub fn api() -> SledApiDescription { fn register_endpoints(api: &mut SledApiDescription) -> Result<(), String> { api.register(disk_put)?; api.register(filesystem_put)?; + api.register(cockroachdb_init)?; api.register(instance_issue_disk_snapshot_request)?; api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; @@ -103,6 +104,7 @@ async fn filesystem_put( let sa = rqctx.context(); let body_args = body.into_inner(); sa.filesystem_ensure( + body_args.id, body_args.zpool_id, body_args.dataset_kind, body_args.address, @@ -112,6 +114,24 @@ async fn filesystem_put( Ok(HttpResponseUpdatedNoContent()) } +/// Initializes a CockroachDB cluster, calling: /// /// /// /// and also populating the contents of the filesystem with preliminary /// tables. +#[endpoint { + method = POST, + path = "/cockroachdb", +}] +async fn cockroachdb_init( + rqctx: RequestContext<SledAgent>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let sa = rqctx.context(); + sa.cockroachdb_initialize().await?; + Ok(HttpResponseUpdatedNoContent()) +} + /// Path parameters for Instance requests (sled agent API) #[derive(Deserialize, JsonSchema)] struct InstancePathParam { diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 0944aa62bb..9682fa3cc8 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -31,6 +31,7 @@ mod services; mod sled_agent; mod smf_helper; pub mod sp; +pub(crate) mod storage; mod storage_manager; mod updates; diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 5f3107d19a..cefc5b3420 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -206,24 +206,45 @@ pub struct Zpool { /// The type of a dataset, and an auxiliary information necessary /// to successfully launch a zone managing the associated data. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, +)] #[serde(tag = "type", rename_all = "snake_case")] pub enum DatasetKind { - CockroachDb { - /// The addresses of all nodes within the cluster. - all_addresses: Vec<SocketAddr>, - }, + CockroachDb, Crucible, Clickhouse, } +impl DatasetKind { + /// Returns the type of the zone which manages this dataset. + pub fn zone_type(&self) -> ZoneType { + match *self { + DatasetKind::CockroachDb => ZoneType::CockroachDb, + DatasetKind::Crucible => ZoneType::Crucible, + DatasetKind::Clickhouse => ZoneType::Clickhouse, + } + } + + /// Returns the service type which runs in the zone managing this dataset. + /// + /// NOTE: This interface is only viable because datasets run a single + /// service in their zone. If that precondition is no longer true, this + /// interface should be re-visited.
+ pub fn service_type(&self) -> ServiceType { + match *self { + DatasetKind::CockroachDb => ServiceType::CockroachDb, + DatasetKind::Crucible => ServiceType::Crucible, + DatasetKind::Clickhouse => ServiceType::Clickhouse, + } + } +} + impl From<DatasetKind> for sled_agent_client::types::DatasetKind { fn from(k: DatasetKind) -> Self { use DatasetKind::*; match k { - CockroachDb { all_addresses } => Self::CockroachDb( - all_addresses.iter().map(|a| a.to_string()).collect(), - ), + CockroachDb => Self::CockroachDb, Crucible => Self::Crucible, Clickhouse => Self::Clickhouse, } @@ -333,6 +354,9 @@ pub enum ServiceType { Maghemite { mode: String, }, + Clickhouse, + CockroachDb, + Crucible, } impl std::fmt::Display for ServiceType { @@ -350,6 +374,9 @@ impl std::fmt::Display for ServiceType { ServiceType::BoundaryNtp { .. } | ServiceType::InternalNtp { .. } => write!(f, "ntp"), ServiceType::Maghemite { .. } => write!(f, "mg-ddm"), + ServiceType::Clickhouse => write!(f, "clickhouse"), + ServiceType::CockroachDb => write!(f, "cockroachdb"), + ServiceType::Crucible => write!(f, "crucible"), } } } @@ -427,6 +454,9 @@ impl From<ServiceType> for sled_agent_client::types::ServiceType { AutoSt::InternalNtp { ntp_servers, dns_servers, domain } } St::Maghemite { mode } => AutoSt::Maghemite { mode }, + St::Clickhouse => AutoSt::Clickhouse, + St::CockroachDb => AutoSt::CockroachDb, + St::Crucible => AutoSt::Crucible, } } } @@ -437,25 +467,31 @@ impl From<ServiceType> for sled_agent_client::types::ServiceType { )] #[serde(rename_all = "snake_case")] pub enum ZoneType { + Clickhouse, + CockroachDb, + CruciblePantry, + Crucible, ExternalDns, InternalDns, Nexus, + Ntp, Oximeter, Switch, - CruciblePantry, - Ntp, } impl From<ZoneType> for sled_agent_client::types::ZoneType { fn from(zt: ZoneType) -> Self { match zt { + ZoneType::Clickhouse => Self::Clickhouse, + ZoneType::CockroachDb => Self::CockroachDb, + ZoneType::Crucible => Self::Crucible, + ZoneType::CruciblePantry => Self::CruciblePantry, ZoneType::InternalDns => Self::InternalDns, ZoneType::ExternalDns => Self::ExternalDns, ZoneType::Nexus => Self::Nexus, + ZoneType::Ntp => Self::Ntp, ZoneType::Oximeter => Self::Oximeter, ZoneType::Switch => Self::Switch, - ZoneType::CruciblePantry => Self::CruciblePantry, - ZoneType::Ntp => Self::Ntp, } } } @@ -464,13 +500,16 @@ impl std::fmt::Display for ZoneType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use ZoneType::*; let name = match self { + Clickhouse => "clickhouse", + CockroachDb => "cockroachdb", + Crucible => "crucible", + CruciblePantry => "crucible_pantry", ExternalDns => "external_dns", InternalDns => "internal_dns", Nexus => "nexus", + Ntp => "ntp", Oximeter => "oximeter", Switch => "switch", - CruciblePantry => "crucible_pantry", - Ntp => "ntp", }; write!(f, "{name}") } @@ -482,11 +521,16 @@ impl std::fmt::Display for ZoneType { )] pub struct ServiceZoneRequest { // The UUID of the zone to be initialized. + // TODO: Should this be removed? If we have UUIDs on the services, what's + // the point of this? pub id: Uuid, // The type of the zone to be created. pub zone_type: ZoneType, // The addresses on which the service should listen for requests. pub addresses: Vec<Ipv6Addr>, + // Datasets which should be managed by this service. + #[serde(default)] + pub dataset: Option<DatasetName>, // The addresses in the global zone which should be created, if necessary // to route to the service.
// @@ -500,6 +544,21 @@ pub struct ServiceZoneRequest { pub services: Vec<ServiceZoneService>, } +impl ServiceZoneRequest { + // The full name of the zone, if it were to be created as a zone. + pub fn zone_name(&self) -> String { + illumos_utils::running_zone::InstalledZone::get_zone_name( + &self.zone_type.to_string(), + self.zone_name_unique_identifier().as_deref(), + ) + } + + // The name of a unique identifier for the zone, if one is necessary. + pub fn zone_name_unique_identifier(&self) -> Option<String> { + self.dataset.as_ref().map(|d| d.pool().to_string()) + } +} + impl From<ServiceZoneRequest> for sled_agent_client::types::ServiceZoneRequest { fn from(s: ServiceZoneRequest) -> Self { let mut services = Vec::new(); @@ -511,12 +570,14 @@ impl From<ServiceZoneRequest> for sled_agent_client::types::ServiceZoneRequest { id: s.id, zone_type: s.zone_type.into(), addresses: s.addresses, + dataset: s.dataset.map(|d| d.into()), gz_addresses: s.gz_addresses, services, } } } +/// Used to request that the Sled initialize a single service. #[derive( Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, )] @@ -531,7 +592,7 @@ impl From<ServiceZoneService> for sled_agent_client::types::ServiceZoneService { } } -/// Used to request that the Sled initialize certain services on initialization. +/// Used to request that the Sled initialize multiple services. /// /// This may be used to record that certain sleds are responsible for /// launching services which may not be associated with a dataset, such diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index cba9a9f44f..9957d9257f 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -39,6 +39,7 @@ impl SetupServiceConfig { #[cfg(test)] mod test { use super::*; + use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::Gateway; use omicron_common::address::IpRange; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; @@ -47,6 +48,7 @@ mod test { fn test_subnets() { let cfg = SetupServiceConfig { rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), + bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, rack_secret_threshold: 0, gateway: Some(Gateway { address: None, diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 008ae9d3f4..4c09b87d35 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -43,7 +43,7 @@ const BOUNDARY_NTP_COUNT: usize = 2; const NEXUS_COUNT: usize = 1; // The number of CRDB instances to create from RSS. -const CRDB_COUNT: usize = 1; +const CRDB_COUNT: usize = 3; // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove // when Nexus provisions Oximeter.
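// NOTE (editorial sketch, not part of this change): CRDB_COUNT now provisions
// three CockroachDB datasets, but the plan below appears to place each one on
// `u2_zpools[0]`. One hedged way to spread them across the available U.2
// pools is simple round-robin; `crdb_zpool_for` is a hypothetical helper
// name, not code from this diff.
fn crdb_zpool_for(instance: usize, u2_zpools: &[uuid::Uuid]) -> uuid::Uuid {
    // Wraps around if there are more CRDB instances than U.2 pools.
    u2_zpools[instance % u2_zpools.len()]
}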
@@ -297,6 +297,7 @@ impl Plan { id, zone_type: ZoneType::ExternalDns, addresses: vec![internal_ip], + dataset: None, gz_addresses: vec![], services: vec![ServiceZoneService { id, @@ -333,6 +334,7 @@ impl Plan { id, zone_type: ZoneType::Nexus, addresses: vec![address], + dataset: None, gz_addresses: vec![], services: vec![ServiceZoneService { id, @@ -361,6 +363,7 @@ impl Plan { id, zone_type: ZoneType::Oximeter, addresses: vec![address], + dataset: None, gz_addresses: vec![], services: vec![ServiceZoneService { id, @@ -383,9 +386,7 @@ impl Plan { request.datasets.push(DatasetEnsureBody { id, zpool_id: u2_zpools[0], - dataset_kind: crate::params::DatasetKind::CockroachDb { - all_addresses: vec![address], - }, + dataset_kind: crate::params::DatasetKind::CockroachDb, address, }); } @@ -454,6 +455,7 @@ impl Plan { id, zone_type: ZoneType::InternalDns, addresses: vec![dns_addr], + dataset: None, gz_addresses: vec![dns_subnet.gz_address().ip()], services: vec![ServiceZoneService { id, @@ -489,6 +491,7 @@ impl Plan { id, zone_type: ZoneType::CruciblePantry, addresses: vec![address], + dataset: None, gz_addresses: vec![], services: vec![ServiceZoneService { id, @@ -545,6 +548,7 @@ impl Plan { id, zone_type: ZoneType::Ntp, addresses: vec![address], + dataset: None, gz_addresses: vec![], services, }); diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index a754aac8e5..f78a4c2484 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -13,7 +13,7 @@ use crate::rack_setup::config::SetupServiceConfig as Config; use serde::{Deserialize, Serialize}; use slog::Logger; use sprockets_host::Ed25519Certificate; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::{Ipv6Addr, SocketAddrV6}; use std::path::{Path, PathBuf}; use thiserror::Error; @@ -119,7 +119,7 @@ impl Plan { pub async fn create( log: &Logger, config: &Config, - bootstrap_addrs: Vec<Ipv6Addr>, + bootstrap_addrs: HashSet<Ipv6Addr>, ) -> Result<Self, PlanError> { let rack_id = Uuid::new_v4(); diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 1586cfb5de..3c9dde29a1 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -56,6 +56,7 @@ use super::config::SetupServiceConfig as Config; use crate::bootstrap::config::BOOTSTRAP_AGENT_HTTP_PORT; +use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::SledAgentRequest; use crate::bootstrap::rss_handle::BootstrapAgentHandle; use crate::nexus::d2n_params; @@ -94,9 +95,6 @@ use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::path::PathBuf; use thiserror::Error; -// The minimum number of sleds to initialize the rack. -const MINIMUM_SLED_COUNT: usize = 1; - /// Describes errors which may occur while operating the setup service. #[derive(Error, Debug)] pub enum SetupServiceError { @@ -217,24 +215,6 @@ fn rss_completed_marker_path() -> PathBuf { .join("rss-plan-completed.marker") } -// Describes the options when awaiting for peers. -enum PeerExpectation { - // Await a set of peers that matches this group of IPv6 addresses exactly. - // - // TODO: We currently don't deal with the case where: - // - // - RSS boots, sees some sleds, comes up with a plan. - // - RSS reboots, sees a *different* set of sleds, and needs - // to adjust the plan. - // - // This case is fairly tricky because some sleds may have - // already received requests to initialize - modifying the - // allocated subnets would be non-trivial.
- LoadOldPlan(HashSet<Ipv6Addr>), - // Await any peers, as long as there are at least enough to make a new plan. - CreateNewPlan(usize), -} - /// The implementation of the Rack Setup Service. struct ServiceInner { log: Logger, @@ -251,7 +231,6 @@ impl ServiceInner { datasets: &Vec<DatasetEnsureBody>, ) -> Result<(), SetupServiceError> { let dur = std::time::Duration::from_secs(60); - let client = reqwest::ClientBuilder::new() .connect_timeout(dur) .timeout(dur) @@ -458,75 +437,6 @@ impl ServiceInner { Ok(()) } - /// Waits for sufficient neighbors to exist so the initial set of requests - /// can be sent out. - async fn wait_for_peers( - &self, - expectation: PeerExpectation, - our_bootstrap_address: Ipv6Addr, - ) -> Result<Vec<Ipv6Addr>, DdmError> { - let ddm_admin_client = DdmAdminClient::localhost(&self.log)?; - let addrs = retry_notify( - retry_policy_internal_service_aggressive(), - || async { - let peer_addrs = ddm_admin_client - .derive_bootstrap_addrs_from_prefixes(&[ - BootstrapInterface::GlobalZone, - ]) - .await - .map_err(|err| { - BackoffError::transient(format!( - "Failed getting peers from mg-ddm: {err}" - )) - })?; - - let all_addrs = peer_addrs - .chain(iter::once(our_bootstrap_address)) - .collect::<HashSet<_>>(); - - match expectation { - PeerExpectation::LoadOldPlan(ref expected) => { - if all_addrs.is_superset(expected) { - Ok(all_addrs.into_iter().collect()) - } else { - Err(BackoffError::transient( - concat!( - "Waiting for a LoadOldPlan set ", - "of peers not found yet." - ) - .to_string(), - )) - } - } - PeerExpectation::CreateNewPlan(wanted_peer_count) => { - if all_addrs.len() >= wanted_peer_count { - Ok(all_addrs.into_iter().collect()) - } else { - Err(BackoffError::transient(format!( - "Waiting for {} peers (currently have {})", - wanted_peer_count, - all_addrs.len() - ))) - } - } - } - }, - |message, duration| { - info!( - self.log, - "{} (will retry after {:?})", message, duration - ); - }, - ) - // `retry_policy_internal_service_aggressive()` retries indefinitely on - // transient errors (the only kind we produce), allowing us to - // `.unwrap()` without panicking - .await - .unwrap(); - - Ok(addrs) - } - async fn sled_timesync( &self, sled_address: &SocketAddrV6, @@ -875,6 +785,54 @@ impl ServiceInner { Ok(()) } + async fn initialize_cockroach( + &self, + service_plan: &ServicePlan, + ) -> Result<(), SetupServiceError> { + // Now that datasets and zones have started for CockroachDB, + // perform one-time initialization of the cluster.
+ let sled_address = service_plan + .services + .iter() + .find_map(|(sled_address, sled_request)| { + if sled_request.datasets.iter().any(|dataset| { + dataset.dataset_kind + == crate::params::DatasetKind::CockroachDb + }) { + Some(sled_address) + } else { + None + } + }) + .expect("Should not create service plans without CockroachDb"); + let dur = std::time::Duration::from_secs(60); + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build() + .map_err(SetupServiceError::HttpClient)?; + let client = SledAgentClient::new_with_client( + &format!("http://{}", sled_address), + client, + self.log.new(o!("SledAgentClient" => sled_address.to_string())), + ); + let initialize_db = || async { + client.cockroachdb_init().await.map_err(BackoffError::transient)?; + Ok::<(), BackoffError<sled_agent_client::Error<sled_agent_client::types::Error>>>(()) + }; + let log_failure = |error, _| { + warn!(self.log, "Failed to initialize CockroachDB"; "error" => ?error); + }; + retry_notify( + retry_policy_internal_service_aggressive(), + initialize_db, + log_failure, + ) + .await + .unwrap(); + Ok(()) + } + // This method has a few distinct phases, identified by files in durable // storage: // @@ -936,19 +894,25 @@ impl ServiceInner { // Wait for either: // - All the peers to re-load an old plan (if one exists) // - Enough peers to create a new plan (if one does not exist) - let maybe_sled_plan = SledPlan::load(&self.log).await?; - let expectation = if let Some(plan) = &maybe_sled_plan { - PeerExpectation::LoadOldPlan( - plan.sleds.keys().map(|a| *a.ip()).collect(), - ) - } else { - PeerExpectation::CreateNewPlan(MINIMUM_SLED_COUNT) + let bootstrap_addrs = match &config.bootstrap_discovery { + BootstrapAddressDiscovery::OnlyOurs => { + HashSet::from([local_bootstrap_agent.our_address()]) + } + BootstrapAddressDiscovery::OnlyThese(peers) => peers.clone(), }; - - let addrs = self - .wait_for_peers(expectation, local_bootstrap_agent.our_address()) - .await?; - info!(self.log, "Enough peers exist to enact RSS plan"); + let maybe_sled_plan = SledPlan::load(&self.log).await?; + if let Some(plan) = &maybe_sled_plan { + let stored_peers: HashSet<Ipv6Addr> = + plan.sleds.keys().map(|a| *a.ip()).collect(); + if stored_peers != bootstrap_addrs { + return Err(SetupServiceError::BadConfig("Set of sleds requested does not match those in existing sled plan".to_string())); + } + } + if bootstrap_addrs.is_empty() { + return Err(SetupServiceError::BadConfig( + "Must request at least one peer".to_string(), + )); + } // If we created a plan, reuse it. Otherwise, create a new plan. // @@ -961,7 +925,7 @@ impl ServiceInner { plan } else { info!(self.log, "Creating new allocation plan"); - SledPlan::create(&self.log, config, addrs).await? + SledPlan::create(&self.log, config, bootstrap_addrs).await? }; let config = &plan.config; @@ -1078,13 +1042,17 @@ impl ServiceInner { info!(self.log, "Finished setting up agents and datasets"); + // Now that datasets and zones have started for CockroachDB, + // perform one-time initialization of the cluster. + self.initialize_cockroach(&service_plan).await?; + // Issue service initialization requests. // // NOTE: This must happen *after* the dataset initialization, // to ensure that CockroachDB has been initialized before Nexus // starts. // - // If Nexus was more resilient to concurrent initialization + // TODO: If Nexus was more resilient to concurrent initialization // of CRDB, this requirement could be relaxed.
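// NOTE (editorial sketch, not part of this change): one way to make the
// ordering constraint above less brittle would be an idempotent init helper.
// `cockroach init` fails once a cluster is already initialized; the exact
// error text matched here ("already been initialized") is an assumption, as
// is the helper itself. RunningZone::run_cmd and RunCommandError are the
// APIs used elsewhere in this diff.
fn cockroach_init_once(
    zone: &illumos_utils::running_zone::RunningZone,
    host: &str,
) -> Result<(), illumos_utils::running_zone::RunCommandError> {
    match zone.run_cmd(&[
        "/opt/oxide/cockroachdb/bin/cockroach",
        "init",
        "--insecure",
        "--host",
        host,
    ]) {
        Ok(_) => Ok(()),
        // Treat "already initialized" as success so retries are harmless.
        Err(err) if err.to_string().contains("already been initialized") => {
            Ok(())
        }
        Err(err) => Err(err),
    }
}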
futures::future::join_all(service_plan.services.iter().map( |(sled_address, services_request)| async move { diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index e155872723..92e942a820 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -20,7 +20,7 @@ //! of what other services Nexus wants to have executing on the sled. //! //! To accomplish this, the following interfaces are exposed: -//! - [ServiceManager::ensure_persistent] exposes an API to request a set of +//! - [ServiceManager::ensure_all_services] exposes an API to request a set of //! services that should persist beyond reboot. //! - [ServiceManager::activate_switch] exposes an API to specifically enable //! or disable (via [ServiceManager::deactivate_switch]) the switch zone. @@ -40,7 +40,9 @@ use illumos_utils::addrobj::IPV6_LINK_LOCAL_NAME; use illumos_utils::dladm::{Dladm, Etherstub, EtherstubVnic, PhysicalLink}; use illumos_utils::link::{Link, VnicAllocator}; use illumos_utils::opte::{Port, PortManager, PortTicket}; -use illumos_utils::running_zone::{InstalledZone, RunningZone}; +use illumos_utils::running_zone::{ + InstalledZone, RunCommandError, RunningZone, +}; use illumos_utils::zfs::ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT; use illumos_utils::zone::AddressRequest; use illumos_utils::zone::Zones; @@ -50,7 +52,10 @@ use itertools::Itertools; use omicron_common::address::Ipv6Subnet; use omicron_common::address::AZ_PREFIX; use omicron_common::address::BOOTSTRAP_ARTIFACT_PORT; +use omicron_common::address::CLICKHOUSE_PORT; +use omicron_common::address::COCKROACH_PORT; use omicron_common::address::CRUCIBLE_PANTRY_PORT; +use omicron_common::address::CRUCIBLE_PORT; use omicron_common::address::DENDRITE_PORT; use omicron_common::address::MGS_PORT; use omicron_common::address::NEXUS_INTERNAL_PORT; @@ -85,17 +90,14 @@ use tokio::sync::Mutex; use tokio::task::JoinHandle; use uuid::Uuid; -// The filename of ServiceManager's internal storage. -const SERVICE_CONFIG_FILENAME: &str = "service.toml"; -// The filename of a half-completed config, in need of parameters supplied at -// runtime. -const PARTIAL_CONFIG_FILENAME: &str = "config-partial.toml"; -// The filename of a completed config, merging the partial config with -// additional appended parameters known at runtime. -const COMPLETE_CONFIG_FILENAME: &str = "config.toml"; - #[derive(thiserror::Error, Debug)] pub enum Error { + #[error("Failed to initialize CockroachDb: {err}")] + CockroachInit { + #[source] + err: RunCommandError, + }, + #[error("Cannot serialize TOML to file {path}: {err}")] TomlSerialize { path: PathBuf, err: toml::ser::Error }, @@ -167,6 +169,9 @@ pub enum Error { #[error("Services already configured for this Sled Agent")] ServicesAlreadyConfigured, + #[error("Failed to get address: {0}")] + GetAddressFailure(#[from] illumos_utils::zone::GetAddressError), + #[error("NTP zone not ready")] NtpZoneNotReady, @@ -194,12 +199,6 @@ impl From<Error> for omicron_common::api::external::Error { } } -/// The default path to service configuration, if one is not -/// explicitly provided. -pub fn default_services_config_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH).join(SERVICE_CONFIG_FILENAME) -} - /// Configuration parameters which modify the [`ServiceManager`]'s behavior. pub struct Config { /// Identifies the sled being configured pub sled_id: Uuid, /// Identifies the revision of the sidecar to be used. pub sidecar_revision: String, /// An optional internet gateway address for external services.
pub gateway_address: Option, - /// The path for the ServiceManager to store information about - /// all running services. - pub all_svcs_config_path: PathBuf, + // The path for the ServiceManager to store information about + // all running services. + all_svcs_ledger_path: PathBuf, + storage_svcs_ledger_path: PathBuf, } impl Config { @@ -226,11 +226,42 @@ impl Config { sled_id, sidecar_revision, gateway_address, - all_svcs_config_path: default_services_config_path(), + all_svcs_ledger_path: default_services_ledger_path(), + storage_svcs_ledger_path: default_storage_services_ledger_path(), } } } +// The filename of ServiceManager's internal storage. +const SERVICES_CONFIG_FILENAME: &str = "services.toml"; +const STORAGE_SERVICES_CONFIG_FILENAME: &str = "storage-services.toml"; + +// The default path to service configuration +fn default_services_ledger_path() -> PathBuf { + Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join(SERVICES_CONFIG_FILENAME) +} + +// The default path to storage service configuration +fn default_storage_services_ledger_path() -> PathBuf { + Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join(STORAGE_SERVICES_CONFIG_FILENAME) +} + +// TODO(ideas): +// - "ServiceLedger" +// - Manages the serializable "AllZoneRequests" object +// - Constructor which reads from config location (kinda like +// "read_from") +// - ... Writer which *knows the type* to be serialized, so can direct it to the +// appropriate output path. +// +// - TODO: later: Can also make the path writing safer, by... +// - ... TODO: Writing to both M.2s, basically using multiple output paths +// - ... TODO: Using a temporary file and renaming it to make the update atomic +// - ... TODO: Add a .json EXPECTORATE test for the format of "AllZoneRequests" +// - we need to be careful not to break compatibility in the future. // A wrapper around `ZoneRequest`, which allows it to be serialized // to a toml file. #[derive(Clone, serde::Serialize, serde::Deserialize)] struct AllZoneRequests { @@ -242,6 +273,44 @@ impl AllZoneRequests { fn new() -> Self { Self { requests: vec![] } } + + // Reads from `path` as a toml-serialized version of `Self`. + async fn read_from(log: &Logger, path: &Path) -> Result<Self, Error> { + if path.exists() { + debug!( + log, + "Reading old storage service requests from {}", + path.display() + ); + toml::from_str( + &tokio::fs::read_to_string(&path) + .await + .map_err(|err| Error::io_path(&path, err))?, + ) + .map_err(|err| Error::TomlDeserialize { + path: path.to_path_buf(), + err, + }) + } else { + debug!(log, "No old storage service requests"); + Ok(AllZoneRequests::new()) + } + } + + // Writes to `path` as a toml-serialized version of `Self`.
+ async fn write_to(&self, log: &Logger, path: &Path) -> Result<(), Error> { + debug!(log, "Writing zone request configuration to {}", path.display()); + let serialized_services = toml::Value::try_from(&self) + .expect("Cannot serialize service list"); + let services_str = + toml::to_string(&serialized_services).map_err(|err| { + Error::TomlSerialize { path: path.to_path_buf(), err } + })?; + tokio::fs::write(&path, services_str) + .await + .map_err(|err| Error::io_path(&path, err))?; + Ok(()) + } } // This struct represents the combo of "what zone did you ask for" + "where did @@ -249,6 +318,7 @@ impl AllZoneRequests { #[derive(Clone, serde::Serialize, serde::Deserialize)] struct ZoneRequest { zone: ServiceZoneRequest, + // TODO: Consider collapsing "root" into ServiceZoneRequest root: PathBuf, } @@ -304,7 +374,10 @@ pub struct ServiceManagerInner { time_synced: AtomicBool, sidecar_revision: String, switch_zone_maghemite_links: Vec<Link>, + // Zones representing running services zones: Mutex<Vec<RunningZone>>, + // Zones representing services which own datasets + dataset_zones: Mutex<Vec<RunningZone>>, underlay_vnic_allocator: VnicAllocator<Etherstub>, underlay_vnic: EtherstubVnic, bootstrap_vnic_allocator: VnicAllocator<Etherstub>, @@ -376,6 +449,7 @@ impl ServiceManager { sidecar_revision, switch_zone_maghemite_links, zones: Mutex::new(vec![]), + dataset_zones: Mutex::new(vec![]), underlay_vnic_allocator: VnicAllocator::new( "Service", underlay_etherstub, @@ -399,6 +473,95 @@ impl ServiceManager { self.inner.switch_zone_bootstrap_address } + pub async fn load_non_storage_services(&self) -> Result<(), Error> { + let log = &self.inner.log; + let ledger = self.services_ledger_path()?; + if !ledger.exists() { + return Ok(()); + } + let services = AllZoneRequests::read_from(log, &ledger).await?; + let mut existing_zones = self.inner.zones.lock().await; + + // Initialize the DNS and NTP services first as they are required + // for time synchronization, which is a pre-requisite for the other + // services. + self.initialize_services_locked( + &mut existing_zones, + &services + .requests + .clone() + .into_iter() + .filter(|svc| { + matches!( + svc.zone.zone_type, + ZoneType::InternalDns | ZoneType::Ntp + ) + }) + .collect(), + ) + .await?; + + drop(existing_zones); + + info!(&self.inner.log, "Waiting for sled time synchronization"); + + retry_notify( + retry_policy_local(), + || async { + match self.timesync_get().await { + Ok(TimeSync { sync: true, ..
}) => { + info!(&self.inner.log, "Time is synchronized"); + Ok(()) + } + Ok(ts) => Err(BackoffError::transient(format!( + "No sync {:?}", + ts + ))), + Err(e) => Err(BackoffError::transient(format!( + "Error checking for time synchronization: {}", + e + ))), + } + }, + |error, delay| { + warn!( + self.inner.log, + "Time not yet synchronised (retrying in {:?})", + delay; + "error" => ?error + ); + }, + ) + .await + .expect("Expected an infinite retry loop syncing time"); + + let mut existing_zones = self.inner.zones.lock().await; + + // Initialize all remaining services + self.initialize_services_locked( + &mut existing_zones, + &services.requests, + ) + .await?; + Ok(()) + } + + pub async fn load_storage_services(&self) -> Result<(), Error> { + let log = &self.inner.log; + let ledger = self.storage_services_ledger_path()?; + if !ledger.exists() { + return Ok(()); + } + let services = AllZoneRequests::read_from(log, &ledger).await?; + let mut existing_zones = self.inner.dataset_zones.lock().await; + self.initialize_services_locked( + &mut existing_zones, + &services.requests, + ) + .await?; + Ok(()) + } + /// Loads services from the services manager, and returns once all requested /// services have been started. pub async fn sled_agent_started( @@ -408,6 +571,7 @@ impl ServiceManager { underlay_address: Ipv6Addr, rack_id: Uuid, ) -> Result<(), Error> { + debug!(&self.inner.log, "sled agent started"; "underlay_address" => underlay_address.to_string()); self.inner .sled_info .set(SledAgentInfo { @@ -423,88 +587,14 @@ impl ServiceManager { .map_err(|_| "already set".to_string()) .expect("Sled Agent should only start once"); - let config_path = self.services_config_path()?; - if config_path.exists() { - info!( - &self.inner.log, - "Sled services found at {}; loading", - config_path.to_string_lossy() - ); - let cfg: AllZoneRequests = toml::from_str( - &tokio::fs::read_to_string(&config_path) - .await - .map_err(|err| Error::io_path(&config_path, err))?, - ) - .map_err(|err| Error::TomlDeserialize { - path: config_path.clone(), - err, - })?; - let mut existing_zones = self.inner.zones.lock().await; - - // Initialize and DNS and NTP services first as they are required - // for time synchronization, which is a pre-requisite for the other - // services. - self.initialize_services_locked( - &mut existing_zones, - &cfg.requests - .clone() - .into_iter() - .filter(|svc| { - matches!( - svc.zone.zone_type, - ZoneType::InternalDns | ZoneType::Ntp - ) - }) - .collect(), - ) - .await?; - - drop(existing_zones); - - info!(&self.inner.log, "Waiting for sled time synchronization"); - - retry_notify( - retry_policy_local(), - || async { - match self.timesync_get().await { - Ok(TimeSync { sync: true, ..
}) => { - info!(&self.inner.log, "Time is synchronized"); - Ok(()) - } - Ok(ts) => Err(BackoffError::transient(format!( - "No sync {:?}", - ts - ))), - Err(e) => Err(BackoffError::transient(format!( - "Error checking for time synchronization: {}", - e - ))), - } - }, - |error, delay| { - warn!( - self.inner.log, - "Time not yet synchronised (retrying in {:?})", - delay; - "error" => ?error - ); - }, - ) - .await - .expect("Expected an infinite retry loop syncing time"); - - let mut existing_zones = self.inner.zones.lock().await; - - // Initialize all remaining serivces - self.initialize_services_locked(&mut existing_zones, &cfg.requests) - .await?; - } else { - info!( - &self.inner.log, - "No sled services found at {}", - config_path.to_string_lossy() - ); - } + self.load_non_storage_services().await?; + // TODO: These will fail if the disks aren't attached. + // Should we have a retry loop here? Kinda like we have with the switch + // / NTP zone? + // + // NOTE: We could totally do the same thing with + // "load_non_storage_services". + self.load_storage_services().await?; Ok(()) } @@ -514,11 +604,21 @@ impl ServiceManager { self.inner.sled_mode } - // Returns either the path to the explicitly provided config path, or + // Returns either the path to the explicitly provided ledger path, or // chooses the default one. - fn services_config_path(&self) -> Result<PathBuf, Error> { + fn services_ledger_path(&self) -> Result<PathBuf, Error> { if let Some(info) = self.inner.sled_info.get() { - Ok(info.config.all_svcs_config_path.clone()) + Ok(info.config.all_svcs_ledger_path.clone()) + } else { + Err(Error::SledAgentNotReady) + } + } + + // Returns either the path to the explicitly provided ledger path, or + // chooses the default one. + fn storage_services_ledger_path(&self) -> Result<PathBuf, Error> { + if let Some(info) = self.inner.sled_info.get() { + Ok(info.config.storage_svcs_ledger_path.clone()) } else { Err(Error::SledAgentNotReady) } @@ -870,6 +970,17 @@ impl ServiceManager { let opte_ports = self.opte_ports_needed(&request.zone).await?; let limit_priv = Self::privs_needed(&request.zone); + // If the zone is managing a particular dataset, plumb that + // dataset into the zone. Additionally, construct a "unique enough" name + // so we can create multiple zones of this type without collision. + let unique_name = request.zone.zone_name_unique_identifier(); + let datasets = request + .zone + .dataset + .iter() + .map(|d| zone::Dataset { name: d.full() }) + .collect::<Vec<_>>(); + let devices: Vec<zone::Device> = device_names .iter() .map(|d| zone::Device { name: d.to_string() }) .collect(); @@ -880,11 +991,8 @@ impl ServiceManager { &self.inner.underlay_vnic_allocator, &request.root, &request.zone.zone_type.to_string(), - // unique_name= - None, - // dataset= - &[], - // filesystems= + unique_name.as_deref(), + datasets.as_slice(), &filesystems, &devices, opte_ports, @@ -899,6 +1007,125 @@ impl ServiceManager { // These zones are self-assembling -- after they boot, there should // be no "zlogin" necessary to initialize.
match request.zone.zone_type { + ZoneType::Clickhouse => { + let Some(info) = self.inner.sled_info.get() else { + return Err(Error::SledAgentNotReady); + }; + let datalink = installed_zone.get_control_vnic_name(); + let gateway = &info.underlay_address.to_string(); + assert_eq!(request.zone.addresses.len(), 1); + let listen_addr = &request.zone.addresses[0].to_string(); + let listen_port = &CLICKHOUSE_PORT.to_string(); + + let config = PropertyGroupBuilder::new("config") + .add_property("datalink", "astring", datalink) + .add_property("gateway", "astring", gateway) + .add_property("listen_addr", "astring", listen_addr) + .add_property("listen_port", "astring", listen_port) + .add_property("store", "astring", "/data"); + + let profile = ProfileBuilder::new("omicron").add_service( + ServiceBuilder::new("oxide/clickhouse").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(config), + ), + ); + profile + .add_to_zone(&self.inner.log, &installed_zone) + .await + .map_err(|err| { + Error::io("Failed to setup clickhouse profile", err) + })?; + return Ok(RunningZone::boot(installed_zone).await?); + } + ZoneType::CockroachDb => { + let Some(info) = self.inner.sled_info.get() else { + return Err(Error::SledAgentNotReady); + }; + let datalink = installed_zone.get_control_vnic_name(); + let gateway = &info.underlay_address.to_string(); + assert_eq!(request.zone.addresses.len(), 1); + let address = SocketAddr::new( + IpAddr::V6(request.zone.addresses[0]), + COCKROACH_PORT, + ); + let listen_addr = &address.ip().to_string(); + let listen_port = &address.port().to_string(); + + // Look up all cockroachdb addresses stored in internal DNS. + let all_crdb_addresses = info + .resolver + .lookup_all_ipv6(internal_dns::ServiceName::Cockroach) + .await? 
+ .into_iter() + .map(|addr| { + SocketAddr::new(IpAddr::V6(addr), COCKROACH_PORT) + .to_string() + }) + .collect::<Vec<_>>() + .join(","); + + let config = PropertyGroupBuilder::new("config") + .add_property("datalink", "astring", datalink) + .add_property("gateway", "astring", gateway) + .add_property("listen_addr", "astring", listen_addr) + .add_property("listen_port", "astring", listen_port) + .add_property("store", "astring", "/data") + .add_property("join_addrs", "astring", &all_crdb_addresses); + + let profile = ProfileBuilder::new("omicron").add_service( + ServiceBuilder::new("oxide/cockroachdb").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(config), + ), + ); + profile + .add_to_zone(&self.inner.log, &installed_zone) + .await + .map_err(|err| { + Error::io("Failed to setup CRDB profile", err) + })?; + return Ok(RunningZone::boot(installed_zone).await?); + } + ZoneType::Crucible => { + let Some(info) = self.inner.sled_info.get() else { + return Err(Error::SledAgentNotReady); + }; + let datalink = installed_zone.get_control_vnic_name(); + let gateway = &info.underlay_address.to_string(); + assert_eq!(request.zone.addresses.len(), 1); + let listen_addr = &request.zone.addresses[0].to_string(); + let listen_port = &CRUCIBLE_PORT.to_string(); + + let dataset = request + .zone + .dataset + .as_ref() + .expect("Crucible requires dataset"); + let uuid = &Uuid::new_v4().to_string(); + let config = PropertyGroupBuilder::new("config") + .add_property("datalink", "astring", datalink) + .add_property("gateway", "astring", gateway) + .add_property("dataset", "astring", &dataset.full()) + .add_property("listen_addr", "astring", listen_addr) + .add_property("listen_port", "astring", listen_port) + .add_property("uuid", "astring", uuid) + .add_property("store", "astring", "/data"); + + let profile = ProfileBuilder::new("omicron").add_service( + ServiceBuilder::new("oxide/crucible/agent").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(config), + ), + ); + profile + .add_to_zone(&self.inner.log, &installed_zone) + .await + .map_err(|err| { + Error::io("Failed to setup crucible profile", err) + })?; + return Ok(RunningZone::boot(installed_zone).await?); + } ZoneType::CruciblePantry => { let Some(info) = self.inner.sled_info.get() else { return Err(Error::SledAgentNotReady); @@ -1111,6 +1338,12 @@ impl ServiceManager { "{}/var/svc/manifest/site/nexus", running_zone.root() )); + // The filename of a half-completed config, in need of parameters supplied at + // runtime. + const PARTIAL_CONFIG_FILENAME: &str = "config-partial.toml"; + // The filename of a completed config, merging the partial config with + // additional appended parameters known at runtime.
+ const COMPLETE_CONFIG_FILENAME: &str = "config.toml"; let partial_config_path = config_dir.join(PARTIAL_CONFIG_FILENAME); let config_path = config_dir.join(COMPLETE_CONFIG_FILENAME); @@ -1350,9 +1583,6 @@ impl ServiceManager { smfh.setprop("config/port", &format!("{}", DENDRITE_PORT))?; smfh.refresh()?; } - ServiceType::CruciblePantry => { - panic!("CruciblePantry is self-assembling now") - } ServiceType::BoundaryNtp { ntp_servers, dns_servers, @@ -1478,6 +1708,15 @@ impl ServiceManager { smfh.refresh()?; } + ServiceType::Crucible + | ServiceType::CruciblePantry + | ServiceType::CockroachDb + | ServiceType::Clickhouse => { + panic!( + "{} is a service which exists as part of a self-assembling zone", + service.details, + ) + } } debug!(self.inner.log, "enabling service"); @@ -1508,10 +1747,7 @@ impl ServiceManager { ); // Before we bother allocating anything for this request, check if // this service has already been created. - let expected_zone_name = InstalledZone::get_zone_name( - &req.zone.zone_type.to_string(), - None, - ); + let expected_zone_name = req.zone.zone_name(); if existing_zones.iter().any(|z| z.name() == expected_zone_name) { info!( self.inner.log, @@ -1542,30 +1778,15 @@ impl ServiceManager { /// These services will be instantiated by this function, and will be /// recorded to a local file to ensure they start automatically on next /// boot. - pub async fn ensure_persistent( + pub async fn ensure_all_services( &self, request: ServiceEnsureBody, ) -> Result<(), Error> { let mut existing_zones = self.inner.zones.lock().await; - let config_path = self.services_config_path()?; + let ledger_path = self.services_ledger_path()?; - let old_zone_requests: AllZoneRequests = { - if config_path.exists() { - debug!(self.inner.log, "Reading old service requests"); - toml::from_str( - &tokio::fs::read_to_string(&config_path) - .await - .map_err(|err| Error::io_path(&config_path, err))?, - ) - .map_err(|err| Error::TomlDeserialize { - path: config_path.clone(), - err, - })? - } else { - debug!(self.inner.log, "No old service requests"); - AllZoneRequests::new() - } - }; + let old_zone_requests = + AllZoneRequests::read_from(&self.inner.log, &ledger_path).await?; let new_zone_requests: Vec<ServiceZoneRequest> = { let known_set: HashSet<&ServiceZoneRequest> = HashSet::from_iter( ); let requested_set = HashSet::from_iter(request.services.iter()); + // TODO: We probably want to handle this case. if !requested_set.is_superset(&known_set) { // The caller may only request services additively. // @@ -1605,15 +1827,109 @@ impl ServiceManager { .await?; zone_requests.requests.append(&mut old_zone_requests.requests.clone()); - let serialized_services = toml::Value::try_from(&zone_requests) - .expect("Cannot serialize service list"); - let services_str = - toml::to_string(&serialized_services).map_err(|err| { - Error::TomlSerialize { path: config_path.clone(), err } - })?; - tokio::fs::write(&config_path, services_str) - .await - .map_err(|err| Error::io_path(&config_path, err))?; + zone_requests.write_to(&self.inner.log, &ledger_path).await?; + + Ok(()) + } + + /// Ensures that a storage zone is initialized. + /// + /// These services will be instantiated by this function, and will be + /// recorded to a local file to ensure they start automatically on next + /// boot.
+ pub async fn ensure_storage_service( + &self, + request: ServiceZoneRequest, + ) -> Result<(), Error> { + let mut existing_zones = self.inner.dataset_zones.lock().await; + let ledger_path = self.storage_services_ledger_path()?; + + let mut zone_requests = + AllZoneRequests::read_from(&self.inner.log, &ledger_path).await?; + + if !zone_requests + .requests + .iter() + .any(|zone_request| zone_request.zone.id == request.id) + { + // If this is a new request, provision a zone filesystem on the same + // disk as the dataset. + let dataset = request + .dataset + .as_ref() + .expect("Storage services should have a dataset"); + let root = dataset + .pool() + .dataset_mountpoint(sled_hardware::disk::ZONE_DATASET); + zone_requests.requests.push(ZoneRequest { zone: request, root }); + } + + self.initialize_services_locked( + &mut existing_zones, + &zone_requests.requests, + ) + .await?; + + zone_requests.write_to(&self.inner.log, &ledger_path).await?; + + Ok(()) + } + + pub async fn cockroachdb_initialize(&self) -> Result<(), Error> { + let log = &self.inner.log; + let dataset_zones = self.inner.dataset_zones.lock().await; + for zone in dataset_zones.iter() { + // TODO: We could probably store the ZoneKind in the running zone to + // make this "comparison to existing zones by name" mechanism a bit + // safer. + if zone.name().contains(&ZoneType::CockroachDb.to_string()) { + let address = Zones::get_address( + Some(zone.name()), + &zone.control_interface(), + )? + .ip(); + let host = &format!("[{address}]:{COCKROACH_PORT}"); + info!( + log, + "Initializing CRDB Cluster - sending request to {host}" + ); + zone.run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "init", + "--insecure", + "--host", + host, + ]) + .map_err(|err| Error::CockroachInit { err })?; + info!(log, "Formatting CRDB"); + zone.run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "sql", + "--insecure", + "--host", + host, + "--file", + "/opt/oxide/cockroachdb/sql/dbwipe.sql", + ]) + .map_err(|err| Error::CockroachInit { err })?; + zone.run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "sql", + "--insecure", + "--host", + host, + "--file", + "/opt/oxide/cockroachdb/sql/dbinit.sql", + ]) + .map_err(|err| Error::CockroachInit { err })?; + info!(log, "Formatting CRDB - Completed"); + + // In the single-sled case, if there are multiple CRDB nodes on + // a single device, we'd still only want to send the + // initialization requests to a single dataset. + return Ok(()); + } + } Ok(()) } @@ -1773,6 +2089,7 @@ impl ServiceManager { id: Uuid::new_v4(), zone_type: ZoneType::Switch, addresses, + dataset: None, gz_addresses: vec![], services: services .into_iter() @@ -2131,11 +2448,12 @@ mod test { async fn ensure_new_service(mgr: &ServiceManager, id: Uuid) { let _expectations = expect_new_service(); - mgr.ensure_persistent(ServiceEnsureBody { + mgr.ensure_all_services(ServiceEnsureBody { services: vec![ServiceZoneRequest { id, zone_type: ZoneType::Oximeter, addresses: vec![Ipv6Addr::LOCALHOST], + dataset: None, gz_addresses: vec![], services: vec![ServiceZoneService { id, @@ -2150,11 +2468,12 @@ mod test { // Prepare to call "ensure" for a service which already exists. We should // return the service without actually installing a new zone. 
async fn ensure_existing_service(mgr: &ServiceManager, id: Uuid) { - mgr.ensure_persistent(ServiceEnsureBody { + mgr.ensure_all_services(ServiceEnsureBody { services: vec![ServiceZoneRequest { id, zone_type: ZoneType::Oximeter, addresses: vec![Ipv6Addr::LOCALHOST], + dataset: None, gz_addresses: vec![], services: vec![ServiceZoneService { id, @@ -2190,22 +2509,20 @@ mod test { impl TestConfig { async fn new() -> Self { let config_dir = tempfile::TempDir::new().unwrap(); - tokio::fs::File::create( - config_dir.path().join(PARTIAL_CONFIG_FILENAME), - ) - .await - .unwrap(); Self { config_dir } } fn make_config(&self) -> Config { - let all_svcs_config_path = - self.config_dir.path().join(SERVICE_CONFIG_FILENAME); + let all_svcs_ledger_path = + self.config_dir.path().join(SERVICES_CONFIG_FILENAME); + let storage_svcs_ledger_path = + self.config_dir.path().join(STORAGE_SERVICES_CONFIG_FILENAME); Config { sled_id: Uuid::new_v4(), sidecar_revision: "rev_whatever_its_a_test".to_string(), gateway_address: None, - all_svcs_config_path, + all_svcs_ledger_path, + storage_svcs_ledger_path, } } } @@ -2229,11 +2546,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2279,11 +2592,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2332,11 +2641,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2373,11 +2678,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2423,11 +2724,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2452,7 +2749,7 @@ mod test { // Next, delete the config. This means the service we just created will // not be remembered on the next initialization. let config = test_config.make_config(); - std::fs::remove_file(&config.all_svcs_config_path).unwrap(); + std::fs::remove_file(&config.all_svcs_ledger_path).unwrap(); // Observe that the old service is not re-initialized. 
let mgr = ServiceManager::new( @@ -2466,11 +2763,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 9ae0a2a2b9..73e56d4827 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -12,7 +12,7 @@ use crate::params::{ DatasetKind, DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, ServiceEnsureBody, - SledRole, TimeSync, VpcFirewallRule, Zpool, + ServiceZoneService, SledRole, TimeSync, VpcFirewallRule, Zpool, }; use crate::services::{self, ServiceManager}; use crate::storage_manager::{self, StorageManager}; @@ -237,7 +237,6 @@ impl SledAgent { storage .setup_underlay_access(storage_manager::UnderlayAccess { lazy_nexus_client: lazy_nexus_client.clone(), - underlay_address: *sled_address.ip(), sled_id: request.id, }) .await?; @@ -501,7 +500,12 @@ impl SledAgent { &self, requested_services: ServiceEnsureBody, ) -> Result<(), Error> { - self.inner.services.ensure_persistent(requested_services).await?; + self.inner.services.ensure_all_services(requested_services).await?; Ok(()) } + + pub async fn cockroachdb_initialize(&self) -> Result<(), Error> { + self.inner.services.cockroachdb_initialize().await?; + Ok(()) + } @@ -523,14 +527,41 @@ impl SledAgent { /// Ensures that a filesystem type exists within the zpool. pub async fn filesystem_ensure( &self, - zpool_uuid: Uuid, + dataset_id: Uuid, + zpool_id: Uuid, dataset_kind: DatasetKind, address: SocketAddrV6, ) -> Result<(), Error> { - self.inner + // First, ensure the dataset exists + let dataset = self + .inner .storage - .upsert_filesystem(zpool_uuid, dataset_kind, address) + .upsert_filesystem(dataset_id, zpool_id, dataset_kind.clone()) .await?; + + // NOTE: We use the "dataset_id" as the "service_id" here. + // + // Since datasets are tightly coupled with their own services - e.g., + // from the perspective of Nexus, provisioning a dataset implies the + // sled should start a service - this ID re-use is reasonable. + // + // If Nexus ever wants sleds to provision datasets independently of + // launching services, this ID type overlap should be reconsidered. + let service_type = dataset_kind.service_type(); + let services = + vec![ServiceZoneService { id: dataset_id, details: service_type }]; + + // Next, ensure a zone exists to manage storage for that dataset + let request = crate::params::ServiceZoneRequest { + id: dataset_id, + zone_type: dataset_kind.zone_type(), + addresses: vec![*address.ip()], + dataset: Some(dataset), + gz_addresses: vec![], + services, + }; + self.inner.services.ensure_storage_service(request).await?; + Ok(()) } diff --git a/sled-agent/src/storage/dataset.rs b/sled-agent/src/storage/dataset.rs new file mode 100644 index 0000000000..d4f0d63474 --- /dev/null +++ b/sled-agent/src/storage/dataset.rs @@ -0,0 +1,72 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ +use crate::params::DatasetKind; +use illumos_utils::zpool::ZpoolKind; +use illumos_utils::zpool::ZpoolName; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive( + Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Clone, JsonSchema, +)] +pub struct DatasetName { + // A unique identifier for the Zpool on which the dataset is stored. + pool_name: ZpoolName, + // A name for the dataset within the Zpool. + kind: DatasetKind, +} + +impl DatasetName { + pub fn new(pool_name: ZpoolName, kind: DatasetKind) -> Self { + Self { pool_name, kind } + } + + pub fn pool(&self) -> &ZpoolName { + &self.pool_name + } + + pub fn dataset(&self) -> &DatasetKind { + &self.kind + } + + pub fn full(&self) -> String { + format!("{}/{}", self.pool_name, self.kind) + } +} + +impl From<DatasetName> for sled_agent_client::types::DatasetName { + fn from(n: DatasetName) -> Self { + let id = n.pool().id(); + + // NOTE: Ideally, this translation would live alongside the definitions + // of ZpoolKind and ZpoolName, but they're currently in illumos-utils, + // which has no dependency on sled_agent_client. + let kind = match n.pool().kind() { + ZpoolKind::External => { + sled_agent_client::types::ZpoolKind::External + } + ZpoolKind::Internal => { + sled_agent_client::types::ZpoolKind::Internal + } + }; + let pool_name = sled_agent_client::types::ZpoolName { id, kind }; + + Self { pool_name, kind: n.dataset().clone().into() } + } +} + +#[cfg(test)] +mod test { + use super::*; + use uuid::Uuid; + + #[test] + fn serialize_dataset_name() { + let pool = ZpoolName::new_internal(Uuid::new_v4()); + let kind = DatasetKind::Crucible; + let name = DatasetName::new(pool, kind); + toml::to_string(&name).unwrap(); + } +} diff --git a/sled-agent/src/storage/mod.rs b/sled-agent/src/storage/mod.rs new file mode 100644 index 0000000000..8444ecace4 --- /dev/null +++ b/sled-agent/src/storage/mod.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Management of local storage + +pub(crate) mod dataset; diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 142fa9f568..8e23fc8929 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -6,35 +6,26 @@ use crate::nexus::LazyNexusClient; use crate::params::DatasetKind; -use crate::profile::*; +use crate::storage::dataset::DatasetName; use futures::stream::FuturesOrdered; use futures::FutureExt; use futures::StreamExt; -use illumos_utils::dladm::Etherstub; -use illumos_utils::link::VnicAllocator; -use illumos_utils::running_zone::{InstalledZone, RunningZone}; -use illumos_utils::zone::AddressRequest; use illumos_utils::zpool::{ZpoolKind, ZpoolName}; -use illumos_utils::{zfs::Mountpoint, zone::ZONE_PREFIX, zpool::ZpoolInfo}; +use illumos_utils::{zfs::Mountpoint, zpool::ZpoolInfo}; use nexus_client::types::PhysicalDiskDeleteRequest; use nexus_client::types::PhysicalDiskKind; use nexus_client::types::PhysicalDiskPutRequest; use nexus_client::types::ZpoolPutRequest; use omicron_common::api::external::{ByteCount, ByteCountRangeError}; use omicron_common::backoff; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; use sled_hardware::{Disk, DiskIdentity, DiskVariant, UnparsedDisk}; use slog::Logger; use std::collections::hash_map; use std::collections::HashMap; use std::convert::TryFrom; -use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; use std::path::PathBuf; use std::pin::Pin; use std::sync::Arc; -use tokio::fs::{create_dir_all, File}; -use tokio::io::AsyncWriteExt; use tokio::sync::{mpsc, oneshot, Mutex}; use tokio::task::JoinHandle; use uuid::Uuid; @@ -90,6 +81,9 @@ pub enum Error { err: uuid::Error, }, + #[error("Dataset {name:?} exists with a different uuid (has {old}, requested {new})")] + UuidMismatch { name: DatasetName, old: Uuid, new: Uuid }, + #[error("Error parsing pool {name}'s size: {err}")] BadPoolSize { name: String, @@ -132,18 +126,10 @@ pub enum Error { UnderlayNotInitialized, } -impl Error { - fn io(message: &str, err: std::io::Error) -> Self { - Self::Io { message: message.to_string(), err } - } -} - /// A ZFS storage pool. struct Pool { name: ZpoolName, info: ZpoolInfo, - // ZFS filesytem UUID -> Zone. - zones: HashMap, parent: DiskIdentity, } @@ -153,312 +139,12 @@ impl Pool { /// Returns Ok if the pool exists. fn new(name: ZpoolName, parent: DiskIdentity) -> Result { let info = Zpool::get_info(&name.to_string())?; - Ok(Pool { name, info, zones: HashMap::new(), parent }) - } - - /// Associate an already running zone with this pool object. - /// - /// Typically this is used when a dataset within the zone (identified - /// by ID) has a running zone (e.g. Crucible, Cockroach) operating on - /// behalf of that data. - fn add_zone(&mut self, id: Uuid, zone: RunningZone) { - self.zones.insert(id, zone); - } - - /// Access a zone managing data within this pool. - fn get_zone(&self, id: Uuid) -> Option<&RunningZone> { - self.zones.get(&id) - } - - /// Returns the ID of the pool itself. - fn id(&self) -> Uuid { - self.name.id() + Ok(Pool { name, info, parent }) } fn parent(&self) -> &DiskIdentity { &self.parent } - - /// Returns the path for the configuration of a particular - /// dataset within the pool. This configuration file provides - /// the necessary information for zones to "launch themselves" - /// after a reboot. 
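// With zones now relaunched from the service ledgers owned by
// ServiceManager, pools no longer persist a per-dataset TOML config for
// zones to "launch themselves" after a reboot; the
// `ensure_storage_service` path in sled_agent.rs above is what recreates
// them instead.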
- async fn dataset_config_path( - &self, - dataset_id: Uuid, - ) -> Result { - let path = std::path::Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join(self.id().to_string()); - create_dir_all(&path).await.map_err(|err| Error::Io { - message: format!("creating config dir {path:?}, which would contain config for {dataset_id}"), - err, - })?; - let mut path = path.join(dataset_id.to_string()); - path.set_extension("toml"); - Ok(path) - } -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema, Clone)] -struct DatasetName { - // A unique identifier for the Zpool on which the dataset is stored. - pool_name: ZpoolName, - // A name for the dataset within the Zpool. - dataset_name: String, -} - -impl DatasetName { - fn new(pool_name: ZpoolName, dataset_name: &str) -> Self { - Self { pool_name, dataset_name: dataset_name.to_string() } - } - - fn full(&self) -> String { - format!("{}/{}", self.pool_name, self.dataset_name) - } -} - -// Description of a dataset within a ZFS pool, which should be created -// by the Sled Agent. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -struct DatasetInfo { - address: SocketAddrV6, - kind: DatasetKind, - name: DatasetName, -} - -impl DatasetInfo { - fn new( - pool: ZpoolName, - kind: DatasetKind, - address: SocketAddrV6, - ) -> DatasetInfo { - match kind { - DatasetKind::CockroachDb { .. } => DatasetInfo { - name: DatasetName::new(pool, "cockroachdb"), - address, - kind, - }, - DatasetKind::Clickhouse { .. } => DatasetInfo { - name: DatasetName::new(pool, "clickhouse"), - address, - kind, - }, - DatasetKind::Crucible { .. } => DatasetInfo { - name: DatasetName::new(pool, "crucible"), - address, - kind, - }, - } - } - - fn zone_prefix(&self) -> String { - format!("{}{}_", ZONE_PREFIX, self.name.full()) - } -} - -// Ensures that a zone backing a particular dataset is running. -async fn ensure_running_zone( - log: &Logger, - vnic_allocator: &VnicAllocator, - dataset_info: &DatasetInfo, - dataset_name: &DatasetName, - do_format: bool, - underlay_address: Ipv6Addr, -) -> Result { - let address_request = AddressRequest::new_static( - IpAddr::V6(*dataset_info.address.ip()), - None, - ); - - let err = RunningZone::get( - log, - &vnic_allocator, - &dataset_info.zone_prefix(), - address_request, - ) - .await; - match err { - Ok(zone) => { - info!(log, "Zone for {} is already running", dataset_name.full()); - return Ok(zone); - } - Err(illumos_utils::running_zone::GetZoneError::NotFound { .. }) => { - info!(log, "Zone for {} was not found", dataset_name.full()); - - let zone_root_path = dataset_name - .pool_name - .dataset_mountpoint(sled_hardware::disk::ZONE_DATASET); - info!( - log, - "Installing zone {} to {}", - dataset_name.full(), - zone_root_path.display() - ); - let installed_zone = InstalledZone::install( - log, - vnic_allocator, - &zone_root_path, - &dataset_info.name.dataset_name, - Some(&dataset_name.pool_name.to_string()), - &[zone::Dataset { name: dataset_name.full() }], - &[], - &[], - vec![], - None, - vec![], - vec![], - ) - .await?; - - let datalink = installed_zone.get_control_vnic_name(); - let gateway = &underlay_address.to_string(); - let listen_addr = &dataset_info.address.ip().to_string(); - let listen_port = &dataset_info.address.port().to_string(); - - let zone = match dataset_info.kind { - DatasetKind::CockroachDb { .. 
} => { - let config = PropertyGroupBuilder::new("config") - .add_property("datalink", "astring", datalink) - .add_property("gateway", "astring", gateway) - .add_property("listen_addr", "astring", listen_addr) - .add_property("listen_port", "astring", listen_port) - .add_property("store", "astring", "/data"); - - let profile = ProfileBuilder::new("omicron").add_service( - ServiceBuilder::new("oxide/cockroachdb").add_instance( - ServiceInstanceBuilder::new("default") - .add_property_group(config), - ), - ); - profile.add_to_zone(log, &installed_zone).await.map_err( - |err| Error::io("Failed to setup CRDB profile", err), - )?; - let zone = RunningZone::boot(installed_zone).await?; - - // Await liveness of the cluster. - info!(log, "start_zone: awaiting liveness of CRDB"); - let check_health = || async { - let http_addr = SocketAddrV6::new( - *dataset_info.address.ip(), - 8080, - 0, - 0, - ); - reqwest::get(format!( - "http://{}/health?ready=1", - http_addr - )) - .await - .map_err(backoff::BackoffError::transient) - }; - let log_failure = |_, call_count, total_duration| { - if call_count == 0 { - info!(log, "cockroachdb not yet alive"); - } else if total_duration - > std::time::Duration::from_secs(5) - { - warn!(log, "cockroachdb not yet alive"; "total duration" => ?total_duration); - } - }; - backoff::retry_notify_ext( - backoff::retry_policy_internal_service(), - check_health, - log_failure, - ) - .await - .expect("expected an infinite retry loop waiting for crdb"); - - info!(log, "CRDB is online"); - // If requested, format the cluster with the initial tables. - if do_format { - info!(log, "Formatting CRDB"); - zone.run_cmd(&[ - "/opt/oxide/cockroachdb/bin/cockroach", - "sql", - "--insecure", - "--host", - &dataset_info.address.to_string(), - "--file", - "/opt/oxide/cockroachdb/sql/dbwipe.sql", - ])?; - zone.run_cmd(&[ - "/opt/oxide/cockroachdb/bin/cockroach", - "sql", - "--insecure", - "--host", - &dataset_info.address.to_string(), - "--file", - "/opt/oxide/cockroachdb/sql/dbinit.sql", - ])?; - info!(log, "Formatting CRDB - Completed"); - } - - zone - } - DatasetKind::Clickhouse { .. } => { - let config = PropertyGroupBuilder::new("config") - .add_property("datalink", "astring", datalink) - .add_property("gateway", "astring", gateway) - .add_property("listen_addr", "astring", listen_addr) - .add_property("listen_port", "astring", listen_port) - .add_property("store", "astring", "/data"); - - let profile = ProfileBuilder::new("omicron").add_service( - ServiceBuilder::new("oxide/clickhouse").add_instance( - ServiceInstanceBuilder::new("default") - .add_property_group(config), - ), - ); - profile.add_to_zone(log, &installed_zone).await.map_err( - |err| { - Error::io("Failed to setup clickhouse profile", err) - }, - )?; - RunningZone::boot(installed_zone).await? 
- } - DatasetKind::Crucible => { - let dataset = &dataset_info.name.full(); - let uuid = &Uuid::new_v4().to_string(); - let config = PropertyGroupBuilder::new("config") - .add_property("datalink", "astring", datalink) - .add_property("gateway", "astring", gateway) - .add_property("dataset", "astring", dataset) - .add_property("listen_addr", "astring", listen_addr) - .add_property("listen_port", "astring", listen_port) - .add_property("uuid", "astring", uuid) - .add_property("store", "astring", "/data"); - - let profile = ProfileBuilder::new("omicron").add_service( - ServiceBuilder::new("oxide/crucible/agent") - .add_instance( - ServiceInstanceBuilder::new("default") - .add_property_group(config), - ), - ); - profile.add_to_zone(log, &installed_zone).await.map_err( - |err| { - Error::io("Failed to setup crucible profile", err) - }, - )?; - RunningZone::boot(installed_zone).await? - } - }; - Ok(zone) - } - Err(illumos_utils::running_zone::GetZoneError::NotRunning { - name, - state, - }) => { - // TODO(https://github.com/oxidecomputer/omicron/issues/725): - unimplemented!("Handle a zone which exists, but is not running: {name}, in {state:?}"); - } - Err(err) => { - // TODO(https://github.com/oxidecomputer/omicron/issues/725): - unimplemented!( - "Handle a zone which exists, has some other problem: {err}" - ); - } - } } // The type of a future which is used to send a notification to Nexus. @@ -467,10 +153,10 @@ type NotifyFut = #[derive(Debug)] struct NewFilesystemRequest { + dataset_id: Uuid, zpool_id: Uuid, dataset_kind: DatasetKind, - address: SocketAddrV6, - responder: oneshot::Sender>, + responder: oneshot::Sender>, } struct UnderlayRequest { @@ -558,7 +244,6 @@ impl StorageResources { /// Describes the access to the underlay used by the StorageManager. pub struct UnderlayAccess { pub lazy_nexus_client: LazyNexusClient, - pub underlay_address: Ipv6Addr, pub sled_id: Uuid, } @@ -567,7 +252,6 @@ struct StorageWorker { log: Logger, nexus_notifications: FuturesOrdered, rx: mpsc::Receiver, - vnic_allocator: VnicAllocator, underlay: Arc>>, } @@ -582,17 +266,16 @@ impl StorageWorker { // creating it if `do_format` is true. // // Returns the UUID attached to the ZFS filesystem. - fn ensure_dataset_with_id( + fn ensure_dataset( + &mut self, + dataset_id: Uuid, dataset_name: &DatasetName, - do_format: bool, - ) -> Result { + ) -> Result<(), Error> { let zoned = true; let fs_name = &dataset_name.full(); + let do_format = true; Zfs::ensure_filesystem( - &format!( - "{}/{}", - dataset_name.pool_name, dataset_name.dataset_name - ), + &dataset_name.full(), Mountpoint::Path(PathBuf::from("/data")), zoned, do_format, @@ -600,70 +283,18 @@ impl StorageWorker { // Ensure the dataset has a usable UUID. if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { if let Ok(id) = id_str.parse::() { - return Ok(id); + if id != dataset_id { + return Err(Error::UuidMismatch { + name: dataset_name.clone(), + old: id, + new: dataset_id, + }); + } + return Ok(()); } } - let id = Uuid::new_v4(); - Zfs::set_oxide_value(&fs_name, "uuid", &id.to_string())?; - Ok(id) - } - - // Starts the zone for a dataset within a particular zpool. - // - // If requested via the `do_format` parameter, may also initialize - // these resources. - // - // Returns the UUID attached to the underlying ZFS dataset. - // Returns (was_inserted, Uuid). 
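// Illustrative contract of the new `ensure_dataset` above (IDs are
// hypothetical; the real ones arrive via NewFilesystemRequest):
//
//     worker.ensure_dataset(id_a, &name)?; // stamps the `uuid` ZFS value
//     worker.ensure_dataset(id_a, &name)?; // same ID again: Ok(())
//     worker.ensure_dataset(id_b, &name);  // Err(Error::UuidMismatch { old: id_a, new: id_b, .. })
//
// The caller now chooses the dataset ID up front, so an existing dataset
// carrying a different UUID is surfaced as a conflict rather than
// silently adopted.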
- async fn initialize_dataset_and_zone( - &mut self, - pool: &mut Pool, - dataset_info: &DatasetInfo, - do_format: bool, - ) -> Result<(bool, Uuid), Error> { - // Ensure the underlying dataset exists before trying to poke at zones. - let dataset_name = &dataset_info.name; - info!(&self.log, "Ensuring dataset {} exists", dataset_name.full()); - let id = - StorageWorker::ensure_dataset_with_id(&dataset_name, do_format)?; - - // If this zone has already been processed by us, return immediately. - if let Some(_) = pool.get_zone(id) { - return Ok((false, id)); - } - // Otherwise, the zone may or may not exist. - // We need to either look up or create the zone. - info!( - &self.log, - "Ensuring zone for {} is running", - dataset_name.full() - ); - - let underlay_guard = self.underlay.lock().await; - let Some(underlay) = underlay_guard.as_ref() else { - return Err(Error::UnderlayNotInitialized); - }; - let underlay_address = underlay.underlay_address; - drop(underlay_guard); - - let zone = ensure_running_zone( - &self.log, - &self.vnic_allocator, - dataset_info, - &dataset_name, - do_format, - underlay_address, - ) - .await?; - - info!( - &self.log, - "Zone {} with address {} is running", - zone.name(), - dataset_info.address, - ); - pool.add_zone(id, zone); - Ok((true, id)) + Zfs::set_oxide_value(&fs_name, "uuid", &dataset_id.to_string())?; + Ok(()) } // Adds a "notification to nexus" to `nexus_notifications`, @@ -1012,34 +643,6 @@ impl StorageWorker { })?; // Notify Nexus of the zpool. self.add_zpool_notify(&pool, size); - - // If we find filesystems within our datasets, ensure their - // zones are up-and-running. - let mut datasets = vec![]; - let existing_filesystems = Zfs::list_datasets(&pool_name.to_string())?; - for fs_name in existing_filesystems { - info!( - &self.log, - "StorageWorker loading fs {} on zpool {}", - fs_name, - pool_name.to_string() - ); - // We intentionally do not exit on error here - - // otherwise, the failure of a single dataset would - // stop the storage manager from processing all storage. - // - // Instead, we opt to log the failure. - let dataset_name = DatasetName::new(pool_name.clone(), &fs_name); - let result = self.load_dataset(pool, &dataset_name).await; - match result { - Ok(dataset) => datasets.push(dataset), - Err(e) => warn!( - &self.log, - "StorageWorker Failed to load dataset: {}", e - ), - } - } - Ok(()) } @@ -1048,7 +651,7 @@ impl StorageWorker { &mut self, resources: &StorageResources, request: &NewFilesystemRequest, - ) -> Result<(), Error> { + ) -> Result { info!(self.log, "add_dataset: {:?}", request); let mut pools = resources.pools.lock().await; let pool = pools.get_mut(&request.zpool_id).ok_or_else(|| { @@ -1057,80 +660,10 @@ impl StorageWorker { request.zpool_id )) })?; - - let dataset_info = DatasetInfo::new( - pool.name.clone(), - request.dataset_kind.clone(), - request.address, - ); - let (is_new_dataset, id) = self - .initialize_dataset_and_zone( - pool, - &dataset_info, - // do_format= - true, - ) - .await?; - - if !is_new_dataset { - return Ok(()); - } - - // Now that the dataset has been initialized, record the configuration - // so it can re-initialize itself after a reboot. 
- let path = pool.dataset_config_path(id).await?; - let info_str = toml::to_string(&dataset_info) - .map_err(|err| Error::Serialize { path: path.clone(), err })?; - let pool_name = &pool.name; - let mut file = File::create(&path).await.map_err(|err| Error::Io { - message: format!("Failed creating config file at {path:?} for pool {pool_name}, dataset: {id}"), - err, - })?; - file.write_all(info_str.as_bytes()).await.map_err(|err| Error::Io { - message: format!("Failed writing config to {path:?} for pool {pool_name}, dataset: {id}"), - err, - })?; - - Ok(()) - } - - async fn load_dataset( - &mut self, - pool: &mut Pool, - dataset_name: &DatasetName, - ) -> Result<(Uuid, SocketAddrV6, DatasetKind), Error> { - let name = dataset_name.full(); - let id = Zfs::get_oxide_value(&name, "uuid")? - .parse::() - .map_err(|err| Error::ParseDatasetUuid { name, err })?; - let config_path = pool.dataset_config_path(id).await?; - info!( - self.log, - "Loading Dataset from {}", - config_path.to_string_lossy() - ); - let pool_name = pool.info.name(); - let dataset_info: DatasetInfo = - toml::from_str( - &tokio::fs::read_to_string(&config_path).await.map_err(|err| Error::Io { - message: format!("read config for pool {pool_name}, dataset {dataset_name:?} from {config_path:?}"), - err, - })? - ).map_err(|err| { - Error::Deserialize { - path: config_path, - err, - } - })?; - self.initialize_dataset_and_zone( - pool, - &dataset_info, - // do_format= - false, - ) - .await?; - - Ok((id, dataset_info.address, dataset_info.kind)) + let dataset_name = + DatasetName::new(pool.name.clone(), request.dataset_kind.clone()); + self.ensure_dataset(request.dataset_id, &dataset_name)?; + Ok(dataset_name) } // Small wrapper around `Self::do_work_internal` that ensures we always @@ -1221,14 +754,13 @@ pub struct StorageManager { impl StorageManager { /// Creates a new [`StorageManager`] which should manage local storage. 
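// Storage no longer launches zones, so constructing a StorageManager
// needs no Etherstub or VnicAllocator; as the tests in services.rs show,
// callers now simply write:
//
//     let storage = StorageManager::new(&log).await;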
- pub async fn new(log: &Logger, etherstub: Etherstub) -> Self { + pub async fn new(log: &Logger) -> Self { let log = log.new(o!("component" => "StorageManager")); let resources = StorageResources { disks: Arc::new(Mutex::new(HashMap::new())), pools: Arc::new(Mutex::new(HashMap::new())), }; let (tx, rx) = mpsc::channel(30); - let vnic_allocator = VnicAllocator::new("Storage", etherstub); StorageManager { inner: Arc::new(StorageManagerInner { @@ -1240,7 +772,6 @@ impl StorageManager { log, nexus_notifications: FuturesOrdered::new(), rx, - vnic_allocator, underlay: Arc::new(Mutex::new(None)), }; @@ -1351,15 +882,15 @@ impl StorageManager { pub async fn upsert_filesystem( &self, + dataset_id: Uuid, zpool_id: Uuid, dataset_kind: DatasetKind, - address: SocketAddrV6, - ) -> Result<(), Error> { + ) -> Result { let (tx, rx) = oneshot::channel(); let request = NewFilesystemRequest { + dataset_id, zpool_id, dataset_kind, - address, responder: tx, }; @@ -1367,13 +898,13 @@ impl StorageManager { .tx .send(StorageWorkerRequest::NewFilesystem(request)) .await - .map_err(|_| ()) + .map_err(|e| e.to_string()) .expect("Storage worker bug (not alive)"); - rx.await.expect( + let dataset_name = rx.await.expect( "Storage worker bug (dropped responder without responding)", )?; - Ok(()) + Ok(dataset_name) } pub fn resources(&self) -> &StorageResources { @@ -1392,22 +923,3 @@ impl Drop for StorageManagerInner { self.task.abort(); } } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn serialize_dataset_info() { - let dataset_info = DatasetInfo { - address: "[::1]:8080".parse().unwrap(), - kind: DatasetKind::Crucible, - name: DatasetName::new( - ZpoolName::new_internal(Uuid::new_v4()), - "dataset", - ), - }; - - toml::to_string(&dataset_info).unwrap(); - } -} diff --git a/smf/cockroachdb/method_script.sh b/smf/cockroachdb/method_script.sh index fb9bacf854..5027aeb42b 100755 --- a/smf/cockroachdb/method_script.sh +++ b/smf/cockroachdb/method_script.sh @@ -11,12 +11,18 @@ LISTEN_PORT="$(svcprop -c -p config/listen_port "${SMF_FMRI}")" DATASTORE="$(svcprop -c -p config/store "${SMF_FMRI}")" DATALINK="$(svcprop -c -p config/datalink "${SMF_FMRI}")" GATEWAY="$(svcprop -c -p config/gateway "${SMF_FMRI}")" +JOIN_ADDRS="$(svcprop -c -p config/join_addrs "${SMF_FMRI}")" if [[ $DATALINK == unknown ]] || [[ $GATEWAY == unknown ]]; then printf 'ERROR: missing datalink or gateway\n' >&2 exit "$SMF_EXIT_ERR_CONFIG" fi +if [[ $JOIN_ADDRS == unknown ]]; then + printf 'ERROR: missing join_addrs\n' >&2 + exit "$SMF_EXIT_ERR_CONFIG" +fi + ipadm show-addr "$DATALINK/ll" || ipadm create-addr -t -T addrconf "$DATALINK/ll" ipadm show-addr "$DATALINK/omicron6" || ipadm create-addr -t -T static -a "$LISTEN_ADDR" "$DATALINK/omicron6" route get -inet6 default -inet6 "$GATEWAY" || route add -inet6 default -inet6 "$GATEWAY" @@ -25,6 +31,7 @@ args=( '--insecure' '--listen-addr' "[$LISTEN_ADDR]:$LISTEN_PORT" '--store' "$DATASTORE" + '--join' "$JOIN_ADDRS" ) -exec /opt/oxide/cockroachdb/bin/cockroach start-single-node "${args[@]}" & +exec /opt/oxide/cockroachdb/bin/cockroach start "${args[@]}" & diff --git a/smf/sled-agent/gimlet-standalone/config-rss.toml b/smf/sled-agent/gimlet-standalone/config-rss.toml index 05f2860502..246f57848d 100644 --- a/smf/sled-agent/gimlet-standalone/config-rss.toml +++ b/smf/sled-agent/gimlet-standalone/config-rss.toml @@ -6,6 +6,9 @@ # |...............| <- This /56 is the Rack Subnet rack_subnet = "fd00:1122:3344:0100::" +# Only include "our own sled" in the bootstrap network +bootstrap_discovery = 
"only_ours" + # The number of sleds required to unlock the rack secret. # # For values less than 2, no rack secret will be generated. diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index 05f2860502..246f57848d 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -6,6 +6,9 @@ # |...............| <- This /56 is the Rack Subnet rack_subnet = "fd00:1122:3344:0100::" +# Only include "our own sled" in the bootstrap network +bootstrap_discovery = "only_ours" + # The number of sleds required to unlock the rack secret. # # For values less than 2, no rack secret will be generated.