diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index cc67b91fce..d79c836fba 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@ccc14bdc8d34cddf54e4f9fb2da0c208427207a3 # v2 + uses: taiki-e/install-action@8f354f35e51028c902e8ab954045e37739acf562 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date diff --git a/Cargo.lock b/Cargo.lock index 5427bb3d0a..e0d619588c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -259,6 +259,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "async-recursion" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd55a5ba1179988837d24ab4c7cc8ed6efdeff578ede0416b4225a5fca35bd0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.32", +] + [[package]] name = "async-stream" version = "0.3.5" @@ -2280,9 +2291,9 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa" [[package]] name = "fs-err" -version = "2.10.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5fd9bcbe8b1087cbd395b51498c01bc997cef73e778a80b77a811af5e2d29f" +checksum = "88a41f105fe1d5b6b34b2055e3dc59bb79b46b48b2040b9e6c7b4b5de097aa41" dependencies = [ "autocfg", ] @@ -2490,7 +2501,7 @@ dependencies = [ "serde", "serde-big-array 0.5.1", "slog", - "socket2 0.5.4", + "socket2 0.5.5", "string_cache", "thiserror", "tlvc 0.3.1 (git+https://github.com/oxidecomputer/tlvc.git?branch=main)", @@ -3343,7 +3354,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b58db92f96b720de98181bbbe63c831e87005ab460c1bf306eb2622b4707997f" dependencies = [ - "socket2 0.5.4", + "socket2 0.5.5", "widestring", "windows-sys 0.48.0", "winreg", @@ -3824,9 +3835,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "3dce281c5e46beae905d4de1870d8b1509a9142b62eedf18b443b011ca8343d0" dependencies = [ "libc", "log", @@ -4578,6 +4589,7 @@ dependencies = [ "async-bb8-diesel", "async-trait", "base64 0.21.5", + "buf-list", "camino", "cancel-safe-futures", "chrono", @@ -4976,6 +4988,7 @@ dependencies = [ "signature 2.1.0", "similar", "slog", + "snafu", "spin 0.9.8", "string_cache", "subtle", @@ -4986,6 +4999,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-stream", + "tokio-util", "toml 0.7.8", "toml_datetime", "toml_edit 0.19.15", @@ -5550,24 +5564,6 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" -[[package]] -name = "path-absolutize" -version = "3.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43eb3595c63a214e1b37b44f44b0a84900ef7ae0b4c5efce59e123d246d7a0de" -dependencies = [ - "path-dedot", -] - -[[package]] -name = "path-dedot" -version = "3.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d55e486337acb9973cdea3ec5638c1b3bcb22e573b2b7b41969e0c744d5a15e" -dependencies = [ - "once_cell", -] - [[package]] name = "path-slash" version = "0.1.5" @@ -6652,13 +6648,13 @@ dependencies = [ [[package]] name = "rpassword" -version = "7.2.0" +version = "7.3.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6678cf63ab3491898c0d021b493c94c9b221d91295294a2a5746eacbe5928322" +checksum = "80472be3c897911d0137b2d2b9055faf6eeac5b14e324073d83bc17b191d7e3f" dependencies = [ "libc", "rtoolbox", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -7755,6 +7751,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" dependencies = [ "doc-comment", + "futures-core", + "pin-project", "snafu-derive", ] @@ -7782,9 +7780,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" dependencies = [ "libc", "windows-sys 0.48.0", @@ -8439,9 +8437,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.33.0" +version = "1.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" +checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" dependencies = [ "backtrace", "bytes", @@ -8451,16 +8449,16 @@ dependencies = [ "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", - "socket2 0.5.4", + "socket2 0.5.5", "tokio-macros", "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", @@ -8497,7 +8495,7 @@ dependencies = [ "postgres-protocol", "postgres-types", "rand 0.8.5", - "socket2 0.5.4", + "socket2 0.5.5", "tokio", "tokio-util", "whoami", @@ -8651,18 +8649,22 @@ checksum = "ea68304e134ecd095ac6c3574494fc62b909f416c4fca77e440530221e549d3d" [[package]] name = "tough" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eda3efa9005cf9c1966984c3b9a44c3f37b7ed2c95ba338d6ad51bba70e989a0" +checksum = "d16dc5f42fc7ce7cb51eebc7a6ef91f4d69a6d41bb13f34a09674ec47e454d9b" dependencies = [ + "async-recursion", + "async-trait", + "bytes", "chrono", "dyn-clone", + "futures", + "futures-core", "globset", "hex", "log", "olpc-cjson", - "path-absolutize", - "pem 1.1.1", + "pem 3.0.2", "percent-encoding", "reqwest", "ring 0.16.20", @@ -8671,6 +8673,9 @@ dependencies = [ "serde_plain", "snafu", "tempfile", + "tokio", + "tokio-util", + "typed-path", "untrusted 0.7.1", "url", "walkdir", @@ -8845,6 +8850,7 @@ dependencies = [ "slog-envlogger", "slog-term", "tempfile", + "tokio", "tufaceous-lib", ] @@ -8853,6 +8859,7 @@ name = "tufaceous-lib" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "buf-list", "bytes", "bytesize", @@ -8862,6 +8869,7 @@ dependencies = [ "debug-ignore", "flate2", "fs-err", + "futures", "hex", "hubtools", "itertools 0.12.0", @@ -8876,6 +8884,7 @@ dependencies = [ "sha2", "slog", "tar", + "tokio", "toml 0.8.8", "tough", "url", @@ -8930,6 +8939,12 @@ dependencies = [ "utf-8", ] +[[package]] +name = "typed-path" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb9d13b8242894ff21f9990082b90a6410a43dcc6029ac4227a1467853ba781" + [[package]] name = "typenum" version = 
"1.16.0" @@ -9193,9 +9208,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.5.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88ad59a7560b41a70d191093a945f0b87bc1deeda46fb237479708a1d6b6cdfc" +checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" dependencies = [ "getrandom 0.2.10", "serde", @@ -9513,6 +9528,7 @@ dependencies = [ "async-trait", "base64 0.21.5", "bootstrap-agent-client", + "buf-list", "bytes", "camino", "camino-tempfile", diff --git a/Cargo.toml b/Cargo.toml index b55c4fca6a..fb220ba53d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -193,7 +193,7 @@ filetime = "0.2.22" flate2 = "1.0.28" flume = "0.11.0" foreign-types = "0.3.2" -fs-err = "2.10.0" +fs-err = "2.11.0" futures = "0.3.29" gateway-client = { path = "clients/gateway-client" } gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "2739c18e80697aa6bc235c935176d14b4d757ee9", default-features = false, features = ["std"] } @@ -304,7 +304,7 @@ regex = "1.10.2" regress = "0.7.1" reqwest = { version = "0.11", default-features = false } ring = "0.16" -rpassword = "7.2.0" +rpassword = "7.3.1" rstest = "0.18.2" rustfmt-wrapper = "0.2" rustls = "0.21.9" @@ -363,15 +363,15 @@ textwrap = "0.16.0" test-strategy = "0.3.1" thiserror = "1.0" tofino = { git = "http://github.com/oxidecomputer/tofino", branch = "main" } -tokio = "1.33.0" +tokio = "1.34.0" tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] } tokio-stream = "0.1.14" tokio-tungstenite = "0.18" -tokio-util = "0.7.10" +tokio-util = { version = "0.7.10", features = ["io", "io-util"] } toml = "0.8.8" toml_edit = "0.21.0" topological-sort = "0.2.2" -tough = { version = "0.14", features = [ "http" ] } +tough = { version = "0.15", features = [ "http" ] } trust-dns-client = "0.22" trust-dns-proto = "0.22" trust-dns-resolver = "0.22" @@ -382,7 +382,7 @@ tufaceous-lib = { path = "tufaceous-lib" } unicode-width = "0.1.11" update-engine = { path = "update-engine" } usdt = "0.3" -uuid = { version = "1.5.0", features = ["serde", "v4"] } +uuid = { version = "1.6.1", features = ["serde", "v4"] } walkdir = "2.4" wicket = { path = "wicket" } wicket-common = { path = "wicket-common" } diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 23ceb114fc..6667f759e4 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -202,6 +202,19 @@ impl From<&types::InstanceState> } } +impl From + for types::ProducerKind +{ + fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { + use omicron_common::api::internal::nexus::ProducerKind; + match kind { + ProducerKind::SledAgent => Self::SledAgent, + ProducerKind::Service => Self::Service, + ProducerKind::Instance => Self::Instance, + } + } +} + impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> for types::ProducerEndpoint { @@ -212,6 +225,7 @@ impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> address: s.address.to_string(), base_route: s.base_route.clone(), id: s.id, + kind: s.kind.map(Into::into), interval: s.interval.into(), } } diff --git a/clients/oximeter-client/src/lib.rs b/clients/oximeter-client/src/lib.rs index 7bd17d7e76..8a03304e06 100644 --- a/clients/oximeter-client/src/lib.rs +++ b/clients/oximeter-client/src/lib.rs @@ -20,6 +20,19 @@ impl From for types::Duration { } } +impl From + for types::ProducerKind +{ + fn 
from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { + use omicron_common::api::internal::nexus; + match kind { + nexus::ProducerKind::Service => Self::Service, + nexus::ProducerKind::SledAgent => Self::SledAgent, + nexus::ProducerKind::Instance => Self::Instance, + } + } +} + impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> for types::ProducerEndpoint { @@ -30,6 +43,7 @@ impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> address: s.address.to_string(), base_route: s.base_route.clone(), id: s.id, + kind: s.kind.map(Into::into), interval: s.interval.into(), } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index a4a539ad9b..1daa85dbe7 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -84,13 +84,34 @@ pub struct SledInstanceState { // Oximeter producer/collector objects. +/// The kind of metric producer this is. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ProducerKind { + /// The producer is a sled-agent. + SledAgent, + /// The producer is an Omicron-managed service. + Service, + /// The producer is a Propolis VMM managing a guest instance. + Instance, +} + /// Information announced by a metric server, used so that clients can contact it and collect available metric data from it. #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize, PartialEq)] pub struct ProducerEndpoint { + /// A unique ID for this producer. pub id: Uuid, + /// The kind of producer. + pub kind: Option<ProducerKind>, + /// The IP address and port at which `oximeter` can collect metrics from the + /// producer. pub address: SocketAddr, + /// The API base route from which `oximeter` can collect metrics. + /// + /// The full route is `{base_route}/{id}`. pub base_route: String, + /// The interval on which `oximeter` should collect metrics. pub interval: Duration, } diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 4fc13a31d8..704a7ab7bd 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -12,6 +12,7 @@ anyhow.workspace = true assert_matches.workspace = true async-trait.workspace = true base64.workspace = true +buf-list.workspace = true cancel-safe-futures.workspace = true camino.workspace = true clap.workspace = true diff --git a/nexus/db-model/src/producer_endpoint.rs b/nexus/db-model/src/producer_endpoint.rs index 29e57b0877..52a69e0508 100644 --- a/nexus/db-model/src/producer_endpoint.rs +++ b/nexus/db-model/src/producer_endpoint.rs @@ -3,12 +3,47 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/.
use super::SqlU16; +use crate::impl_enum_type; use crate::schema::metric_producer; use db_macros::Asset; use nexus_types::identity::Asset; use omicron_common::api::internal; use uuid::Uuid; +impl_enum_type!( + #[derive(SqlType, Copy, Clone, Debug, QueryId)] + #[diesel(postgres_type(name = "producer_kind"))] + pub struct ProducerKindEnum; + + #[derive(AsExpression, Copy, Clone, Debug, FromSqlRow, PartialEq)] + #[diesel(sql_type = ProducerKindEnum)] + pub enum ProducerKind; + + SledAgent => b"sled_agent" + Service => b"service" + Instance => b"instance" +); + +impl From<internal::nexus::ProducerKind> for ProducerKind { + fn from(kind: internal::nexus::ProducerKind) -> Self { + match kind { + internal::nexus::ProducerKind::SledAgent => ProducerKind::SledAgent, + internal::nexus::ProducerKind::Service => ProducerKind::Service, + internal::nexus::ProducerKind::Instance => ProducerKind::Instance, + } + } +} + +impl From<ProducerKind> for internal::nexus::ProducerKind { + fn from(kind: ProducerKind) -> Self { + match kind { + ProducerKind::SledAgent => internal::nexus::ProducerKind::SledAgent, + ProducerKind::Service => internal::nexus::ProducerKind::Service, + ProducerKind::Instance => internal::nexus::ProducerKind::Instance, + } + } +} + /// Information announced by a metric server, used so that clients can contact it and collect available metric data from it. #[derive(Queryable, Insertable, Debug, Clone, Selectable, Asset)] @@ -17,6 +52,7 @@ pub struct ProducerEndpoint { #[diesel(embed)] identity: ProducerEndpointIdentity, + pub kind: Option<ProducerKind>, pub ip: ipnetwork::IpNetwork, pub port: SqlU16, pub interval: f64, @@ -33,6 +69,7 @@ impl ProducerEndpoint { ) -> Self { Self { identity: ProducerEndpointIdentity::new(endpoint.id), + kind: endpoint.kind.map(Into::into), ip: endpoint.address.ip().into(), port: endpoint.address.port().into(), base_route: endpoint.base_route.clone(), diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 9afc2a83bc..c940a684ca 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -399,6 +399,7 @@ table!
{ id -> Uuid, time_created -> Timestamptz, time_modified -> Timestamptz, + kind -> Nullable<ProducerKindEnum>, ip -> Inet, port -> Int4, interval -> Float8, @@ -1352,3 +1353,8 @@ allow_tables_to_appear_in_same_query!( switch_port, switch_port_settings_route_config ); + +allow_tables_to_appear_in_same_query!( + switch_port, + switch_port_settings_bgp_peer_config +); diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index 5e059946ff..ba572901c6 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -62,38 +62,6 @@ pub struct Sled { } impl Sled { - pub fn new( - id: Uuid, - addr: SocketAddrV6, - baseboard: SledBaseboard, - hardware: SledSystemHardware, - rack_id: Uuid, - ) -> Self { - let last_used_address = { - let mut segments = addr.ip().segments(); - segments[7] += omicron_common::address::RSS_RESERVED_ADDRESSES; - ipv6::Ipv6Addr::from(Ipv6Addr::from(segments)) - }; - Self { - identity: SledIdentity::new(id), - time_deleted: None, - rcgen: Generation::new(), - rack_id, - is_scrimlet: hardware.is_scrimlet, - serial_number: baseboard.serial_number, - part_number: baseboard.part_number, - revision: baseboard.revision, - usable_hardware_threads: SqlU32::new( - hardware.usable_hardware_threads, - ), - usable_physical_ram: hardware.usable_physical_ram, - reservoir_size: hardware.reservoir_size, - ip: ipv6::Ipv6Addr::from(addr.ip()), - port: addr.port().into(), - last_used_address, - } - } - pub fn is_scrimlet(&self) -> bool { self.is_scrimlet } @@ -153,6 +121,107 @@ impl DatastoreCollectionConfig<Service> for Sled { type CollectionIdColumn = service::dsl::sled_id; } +/// Form of `Sled` used for updates from sled-agent. This is missing some +/// columns that are present in `Sled` because sled-agent doesn't control them. +#[derive(Debug, Clone)] +pub struct SledUpdate { + id: Uuid, + + pub rack_id: Uuid, + + is_scrimlet: bool, + serial_number: String, + part_number: String, + revision: i64, + + pub usable_hardware_threads: SqlU32, + pub usable_physical_ram: ByteCount, + pub reservoir_size: ByteCount, + + // ServiceAddress (Sled Agent). + pub ip: ipv6::Ipv6Addr, + pub port: SqlU16, +} + +impl SledUpdate { + pub fn new( + id: Uuid, + addr: SocketAddrV6, + baseboard: SledBaseboard, + hardware: SledSystemHardware, + rack_id: Uuid, + ) -> Self { + Self { + id, + rack_id, + is_scrimlet: hardware.is_scrimlet, + serial_number: baseboard.serial_number, + part_number: baseboard.part_number, + revision: baseboard.revision, + usable_hardware_threads: SqlU32::new( + hardware.usable_hardware_threads, + ), + usable_physical_ram: hardware.usable_physical_ram, + reservoir_size: hardware.reservoir_size, + ip: addr.ip().into(), + port: addr.port().into(), + } + } + + /// Converts self into a form used for inserts of new sleds into the + /// database. + /// + /// This form adds default values for fields that are not present in + /// `SledUpdate`.
+ pub fn into_insertable(self) -> Sled { + let last_used_address = { + let mut segments = self.ip().segments(); + segments[7] += omicron_common::address::RSS_RESERVED_ADDRESSES; + ipv6::Ipv6Addr::from(Ipv6Addr::from(segments)) + }; + Sled { + identity: SledIdentity::new(self.id), + rcgen: Generation::new(), + time_deleted: None, + rack_id: self.rack_id, + is_scrimlet: self.is_scrimlet, + serial_number: self.serial_number, + part_number: self.part_number, + revision: self.revision, + usable_hardware_threads: self.usable_hardware_threads, + usable_physical_ram: self.usable_physical_ram, + reservoir_size: self.reservoir_size, + ip: self.ip, + port: self.port, + last_used_address, + } + } + + pub fn id(&self) -> Uuid { + self.id + } + + pub fn is_scrimlet(&self) -> bool { + self.is_scrimlet + } + + pub fn ip(&self) -> Ipv6Addr { + self.ip.into() + } + + pub fn address(&self) -> SocketAddrV6 { + self.address_with_port(self.port.into()) + } + + pub fn address_with_port(&self, port: u16) -> SocketAddrV6 { + SocketAddrV6::new(self.ip(), port, 0, 0) + } + + pub fn serial_number(&self) -> &str { + &self.serial_number + } +} + /// A set of constraints that can be placed on operations that select a sled. #[derive(Debug)] pub struct SledReservationConstraints { diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 8be3386183..0612b960c9 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -371,8 +371,8 @@ mod test { use crate::db::model::{ BlockSize, ComponentUpdate, ComponentUpdateIdentity, ConsoleSession, Dataset, DatasetKind, ExternalIp, PhysicalDisk, PhysicalDiskKind, - Project, Rack, Region, Service, ServiceKind, SiloUser, Sled, - SledBaseboard, SledSystemHardware, SshKey, SystemUpdate, + Project, Rack, Region, Service, ServiceKind, SiloUser, SledBaseboard, + SledSystemHardware, SledUpdate, SshKey, SystemUpdate, UpdateableComponentType, VpcSubnet, Zpool, }; use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; @@ -599,14 +599,14 @@ mod test { let rack_id = Uuid::new_v4(); let sled_id = Uuid::new_v4(); - let sled = Sled::new( + let sled_update = SledUpdate::new( sled_id, bogus_addr, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id, ); - datastore.sled_upsert(sled).await.unwrap(); + datastore.sled_upsert(sled_update).await.unwrap(); sled_id } @@ -1205,7 +1205,7 @@ mod test { let rack_id = Uuid::new_v4(); let addr1 = "[fd00:1de::1]:12345".parse().unwrap(); let sled1_id = "0de4b299-e0b4-46f0-d528-85de81a7095f".parse().unwrap(); - let sled1 = db::model::Sled::new( + let sled1 = db::model::SledUpdate::new( sled1_id, addr1, sled_baseboard_for_test(), @@ -1216,7 +1216,7 @@ mod test { let addr2 = "[fd00:1df::1]:12345".parse().unwrap(); let sled2_id = "66285c18-0c79-43e0-e54f-95271f271314".parse().unwrap(); - let sled2 = db::model::Sled::new( + let sled2 = db::model::SledUpdate::new( sled2_id, addr2, sled_baseboard_for_test(), diff --git a/nexus/db-queries/src/db/datastore/oximeter.rs b/nexus/db-queries/src/db/datastore/oximeter.rs index 55b650ea53..116e8586b0 100644 --- a/nexus/db-queries/src/db/datastore/oximeter.rs +++ b/nexus/db-queries/src/db/datastore/oximeter.rs @@ -96,6 +96,7 @@ impl DataStore { .do_update() .set(( dsl::time_modified.eq(Utc::now()), + dsl::kind.eq(producer.kind), dsl::ip.eq(producer.ip), dsl::port.eq(producer.port), dsl::interval.eq(producer.interval), diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs 
b/nexus/db-queries/src/db/datastore/physical_disk.rs index 3c83b91d21..ecb583ee29 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -141,7 +141,7 @@ mod test { use crate::db::datastore::test::{ sled_baseboard_for_test, sled_system_hardware_for_test, }; - use crate::db::model::{PhysicalDiskKind, Sled}; + use crate::db::model::{PhysicalDiskKind, Sled, SledUpdate}; use dropshot::PaginationOrder; use nexus_test_utils::db::test_setup_database; use nexus_types::identity::Asset; @@ -153,14 +153,14 @@ mod test { let sled_id = Uuid::new_v4(); let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); let rack_id = Uuid::new_v4(); - let sled = Sled::new( + let sled_update = SledUpdate::new( sled_id, addr, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id, ); - db.sled_upsert(sled) + db.sled_upsert(sled_update) .await .expect("Could not upsert sled during test prep") } diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index ae982d86f8..2cc5880470 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -680,7 +680,7 @@ mod test { use crate::db::model::Sled; use async_bb8_diesel::AsyncSimpleConnection; use internal_params::DnsRecord; - use nexus_db_model::{DnsGroup, InitialDnsGroup}; + use nexus_db_model::{DnsGroup, InitialDnsGroup, SledUpdate}; use nexus_test_utils::db::test_setup_database; use nexus_types::external_api::shared::SiloIdentityMode; use nexus_types::identity::Asset; @@ -870,14 +870,14 @@ mod test { async fn create_test_sled(db: &DataStore) -> Sled { let sled_id = Uuid::new_v4(); let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); - let sled = Sled::new( + let sled_update = SledUpdate::new( sled_id, addr, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id(), ); - db.sled_upsert(sled) + db.sled_upsert(sled_update) .await .expect("Could not upsert sled during test prep") } diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index f4f5188057..130c36b496 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -11,9 +11,9 @@ use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::error::TransactionError; -use crate::db::identity::Asset; use crate::db::model::Sled; use crate::db::model::SledResource; +use crate::db::model::SledUpdate; use crate::db::pagination::paginated; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; @@ -29,21 +29,25 @@ use uuid::Uuid; impl DataStore { /// Stores a new sled in the database. 
- pub async fn sled_upsert(&self, sled: Sled) -> CreateResult<Sled> { + pub async fn sled_upsert( + &self, + sled_update: SledUpdate, + ) -> CreateResult<Sled> { use db::schema::sled::dsl; diesel::insert_into(dsl::sled) - .values(sled.clone()) + .values(sled_update.clone().into_insertable()) .on_conflict(dsl::id) .do_update() .set(( dsl::time_modified.eq(Utc::now()), - dsl::ip.eq(sled.ip), - dsl::port.eq(sled.port), - dsl::rack_id.eq(sled.rack_id), - dsl::is_scrimlet.eq(sled.is_scrimlet()), - dsl::usable_hardware_threads.eq(sled.usable_hardware_threads), - dsl::usable_physical_ram.eq(sled.usable_physical_ram), - dsl::reservoir_size.eq(sled.reservoir_size), + dsl::ip.eq(sled_update.ip), + dsl::port.eq(sled_update.port), + dsl::rack_id.eq(sled_update.rack_id), + dsl::is_scrimlet.eq(sled_update.is_scrimlet()), + dsl::usable_hardware_threads + .eq(sled_update.usable_hardware_threads), + dsl::usable_physical_ram.eq(sled_update.usable_physical_ram), + dsl::reservoir_size.eq(sled_update.reservoir_size), )) .returning(Sled::as_returning()) .get_result_async(&*self.pool_connection_unauthorized().await?) @@ -53,7 +57,7 @@ impl DataStore { e, ErrorHandler::Conflict( ResourceType::Sled, - &sled.id().to_string(), + &sled_update.id().to_string(), ), ) }) @@ -241,7 +245,7 @@ mod test { let sled_id = Uuid::new_v4(); let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); - let mut sled = Sled::new( + let mut sled_update = SledUpdate::new( sled_id, addr, sled_baseboard_for_test(), @@ -249,44 +253,50 @@ mod test { rack_id(), ); let observed_sled = datastore - .sled_upsert(sled.clone()) + .sled_upsert(sled_update.clone()) .await .expect("Could not upsert sled during test prep"); assert_eq!( observed_sled.usable_hardware_threads, - sled.usable_hardware_threads + sled_update.usable_hardware_threads + ); + assert_eq!( + observed_sled.usable_physical_ram, + sled_update.usable_physical_ram ); - assert_eq!(observed_sled.usable_physical_ram, sled.usable_physical_ram); - assert_eq!(observed_sled.reservoir_size, sled.reservoir_size); // Modify the sizes of hardware - sled.usable_hardware_threads = - SqlU32::new(sled.usable_hardware_threads.0 + 1); + sled_update.usable_hardware_threads = + SqlU32::new(sled_update.usable_hardware_threads.0 + 1); const MIB: u64 = 1024 * 1024; - sled.usable_physical_ram = ByteCount::from( + sled_update.usable_physical_ram = ByteCount::from( external::ByteCount::try_from( - sled.usable_physical_ram.0.to_bytes() + MIB, + sled_update.usable_physical_ram.0.to_bytes() + MIB, ) .unwrap(), ); - sled.reservoir_size = ByteCount::from( + sled_update.reservoir_size = ByteCount::from( external::ByteCount::try_from( - sled.reservoir_size.0.to_bytes() + MIB, + sled_update.reservoir_size.0.to_bytes() + MIB, ) .unwrap(), ); // Test that upserting the sled propagates those changes to the DB.
let observed_sled = datastore - .sled_upsert(sled.clone()) + .sled_upsert(sled_update.clone()) .await .expect("Could not upsert sled during test prep"); assert_eq!( observed_sled.usable_hardware_threads, - sled.usable_hardware_threads + sled_update.usable_hardware_threads + ); + assert_eq!( + observed_sled.usable_physical_ram, + sled_update.usable_physical_ram ); - assert_eq!(observed_sled.usable_physical_ram, sled.usable_physical_ram); - assert_eq!(observed_sled.reservoir_size, sled.reservoir_size); + assert_eq!(observed_sled.reservoir_size, sled_update.reservoir_size); db.cleanup().await.unwrap(); logctx.cleanup_successful(); diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index f301750ee9..d7319347f0 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -23,8 +23,8 @@ use crate::db::pagination::paginated; use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; use diesel::result::Error as DieselError; use diesel::{ - ExpressionMethods, JoinOnDsl, NullableExpressionMethods, QueryDsl, - SelectableHelper, + CombineDsl, ExpressionMethods, JoinOnDsl, NullableExpressionMethods, + QueryDsl, SelectableHelper, }; use nexus_types::external_api::params; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -1110,6 +1110,7 @@ impl DataStore { ) -> ListResultVec<SwitchPort> { use db::schema::{ switch_port::dsl as switch_port_dsl, + switch_port_settings_bgp_peer_config::dsl as bgp_peer_config_dsl, switch_port_settings_route_config::dsl as route_config_dsl, }; @@ -1126,6 +1127,18 @@ impl DataStore { // pagination in the future, or maybe a way to constrain the query to // a rack? .limit(64) + .union( switch_port_dsl::switch_port .filter(switch_port_dsl::port_settings_id.is_not_null()) .inner_join( bgp_peer_config_dsl::switch_port_settings_bgp_peer_config .on(switch_port_dsl::port_settings_id .eq(bgp_peer_config_dsl::port_settings_id.nullable()), ), ) .select(SwitchPort::as_select()) .limit(64), + ) .load_async::<SwitchPort>( &*self.pool_connection_authorized(opctx).await?, ) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } } + +#[cfg(test)] +mod test { + use crate::db::datastore::{datastore_test, UpdatePrecondition}; + use nexus_test_utils::db::test_setup_database; + use nexus_types::external_api::params::{ + BgpAnnounceSetCreate, BgpConfigCreate, BgpPeerConfig, SwitchPortConfig, + SwitchPortGeometry, SwitchPortSettingsCreate, + }; + use omicron_common::api::external::{ + IdentityMetadataCreateParams, Name, NameOrId, + }; + use omicron_test_utils::dev; + use std::collections::HashMap; + use uuid::Uuid; + + #[tokio::test] + async fn test_bgp_boundary_switches() { + let logctx = dev::test_setup_log("test_bgp_boundary_switches"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let rack_id: Uuid = + nexus_test_utils::RACK_UUID.parse().expect("parse uuid"); + let switch0: Name = "switch0".parse().expect("parse switch location"); + let qsfp0: Name = "qsfp0".parse().expect("parse qsfp0"); + + let port_result = datastore + .switch_port_create(&opctx, rack_id, switch0.into(), qsfp0.into()) + .await + .expect("switch port create"); + + let announce_set = BgpAnnounceSetCreate { + identity: IdentityMetadataCreateParams { + name: "test-announce-set".parse().unwrap(), + description: "test bgp announce set".into(), + }, + announcement:
Vec::new(), + }; + + datastore.bgp_create_announce_set(&opctx, &announce_set).await.unwrap(); + + let bgp_config = BgpConfigCreate { + identity: IdentityMetadataCreateParams { + name: "test-bgp-config".parse().unwrap(), + description: "test bgp config".into(), + }, + asn: 47, + bgp_announce_set_id: NameOrId::Name( + "test-announce-set".parse().unwrap(), + ), + vrf: None, + }; + + datastore.bgp_config_set(&opctx, &bgp_config).await.unwrap(); + + let settings = SwitchPortSettingsCreate { + identity: IdentityMetadataCreateParams { + name: "test-settings".parse().unwrap(), + description: "test settings".into(), + }, + port_config: SwitchPortConfig { + geometry: SwitchPortGeometry::Qsfp28x1, + }, + groups: Vec::new(), + links: HashMap::new(), + interfaces: HashMap::new(), + routes: HashMap::new(), + bgp_peers: HashMap::from([( + "phy0".into(), + BgpPeerConfig { + bgp_announce_set: NameOrId::Name( + "test-announce-set".parse().unwrap(), + ), + bgp_config: NameOrId::Name( + "test-bgp-config".parse().unwrap(), + ), + interface_name: "qsfp0".into(), + addr: "192.168.1.1".parse().unwrap(), + hold_time: 0, + idle_hold_time: 0, + delay_open: 0, + connect_retry: 0, + keepalive: 0, + }, + )]), + addresses: HashMap::new(), + }; + + let settings_result = datastore + .switch_port_settings_create(&opctx, &settings, None) + .await + .unwrap(); + + datastore + .switch_port_set_settings_id( + &opctx, + port_result.id, + Some(settings_result.settings.identity.id), + UpdatePrecondition::DontCare, + ) + .await + .unwrap(); + + let uplink_ports = + datastore.switch_ports_with_uplinks(&opctx).await.unwrap(); + + assert_eq!(uplink_ports.len(), 1); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/inventory/tests/output/collector_basic.txt b/nexus/inventory/tests/output/collector_basic.txt index f1ec36476b..b9894ff184 100644 --- a/nexus/inventory/tests/output/collector_basic.txt +++ b/nexus/inventory/tests/output/collector_basic.txt @@ -5,9 +5,9 @@ baseboards: part "FAKE_SIM_SIDECAR" serial "SimSidecar1" cabooses: - board "SimGimletRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" board "SimGimletSp" name "SimGimlet" version "0.0.1" git_commit "ffffffff" - board "SimSidecarRot" name "SimSidecar" version "0.0.1" git_commit "eeeeeeee" + board "SimRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" + board "SimRot" name "SimSidecar" version "0.0.1" git_commit "eeeeeeee" board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" rot pages: @@ -41,14 +41,14 @@ cabooses found: SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" - RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" - RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" - RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" - RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" - RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" - RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" - RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" - RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + RotSlotA baseboard part 
"FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimRot" rot pages found: Cmpa baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": data_base64 "Z2ltbGV0LWNtcGEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" diff --git a/nexus/inventory/tests/output/collector_errors.txt b/nexus/inventory/tests/output/collector_errors.txt index 905e044d58..a50e24ca30 100644 --- a/nexus/inventory/tests/output/collector_errors.txt +++ b/nexus/inventory/tests/output/collector_errors.txt @@ -5,9 +5,9 @@ baseboards: part "FAKE_SIM_SIDECAR" serial "SimSidecar1" cabooses: - board "SimGimletRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" board "SimGimletSp" name "SimGimlet" version "0.0.1" git_commit "ffffffff" - board "SimSidecarRot" name "SimSidecar" version "0.0.1" git_commit "eeeeeeee" + board "SimRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" + board "SimRot" name "SimSidecar" version "0.0.1" git_commit "eeeeeeee" board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" rot pages: @@ -41,14 +41,14 @@ cabooses found: SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" - RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" - RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" - RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" - RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" - RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" - RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" - RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" - RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimRot" + RotSlotB baseboard 
part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimRot" rot pages found: Cmpa baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": data_base64 "Z2ltbGV0LWNtcGEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" diff --git a/nexus/src/app/oximeter.rs b/nexus/src/app/oximeter.rs index 7dfa2fb68b..66f39a32b6 100644 --- a/nexus/src/app/oximeter.rs +++ b/nexus/src/app/oximeter.rs @@ -127,6 +127,9 @@ impl super::Nexus { for producer in producers.into_iter() { let producer_info = oximeter_client::types::ProducerEndpoint { id: producer.id(), + kind: producer + .kind + .map(|kind| nexus::ProducerKind::from(kind).into()), address: SocketAddr::new( producer.ip.ip(), producer.port.try_into().unwrap(), @@ -149,6 +152,7 @@ impl super::Nexus { pub(crate) async fn register_as_producer(&self, address: SocketAddr) { let producer_endpoint = nexus::ProducerEndpoint { id: self.id, + kind: Some(nexus::ProducerKind::Service), address, base_route: String::from("/metrics/collect"), interval: Duration::from_secs(10), diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 5b1843be3d..89e1a10052 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -36,6 +36,7 @@ pub mod snapshot_create; pub mod snapshot_delete; pub mod switch_port_settings_apply; pub mod switch_port_settings_clear; +pub mod switch_port_settings_common; pub mod test_saga; pub mod volume_delete; pub mod volume_remove_rop; diff --git a/nexus/src/app/sagas/switch_port_settings_apply.rs b/nexus/src/app/sagas/switch_port_settings_apply.rs index 0c06d6ff83..aba62b6937 100644 --- a/nexus/src/app/sagas/switch_port_settings_apply.rs +++ b/nexus/src/app/sagas/switch_port_settings_apply.rs @@ -3,53 +3,32 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use super::{NexusActionContext, NEXUS_DPD_TAG}; -use crate::app::map_switch_zone_addrs; use crate::app::sagas::retry_until_known_result; +use crate::app::sagas::switch_port_settings_common::{ + api_to_dpd_port_settings, ensure_switch_port_bgp_settings, + ensure_switch_port_uplink, select_mg_client, switch_sled_agent, + write_bootstore_config, +}; use crate::app::sagas::{ declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError, }; -use crate::Nexus; use anyhow::Error; use db::datastore::SwitchPortSettingsCombinedResult; -use dpd_client::types::{ - LinkCreate, LinkId, LinkSettings, PortFec, PortId, PortSettings, PortSpeed, - RouteSettingsV4, RouteSettingsV6, -}; -use dpd_client::{Ipv4Cidr, Ipv6Cidr}; -use internal_dns::ServiceName; -use ipnetwork::IpNetwork; -use mg_admin_client::types::Prefix4; -use mg_admin_client::types::{ApplyRequest, BgpPeerConfig}; -use nexus_db_model::{SwitchLinkFec, SwitchLinkSpeed, NETWORK_KEY}; -use nexus_db_queries::context::OpContext; +use dpd_client::types::PortId; +use nexus_db_model::NETWORK_KEY; use nexus_db_queries::db::datastore::UpdatePrecondition; use nexus_db_queries::{authn, db}; -use nexus_types::external_api::params; -use omicron_common::address::SLED_AGENT_PORT; use omicron_common::api::external::{self, NameOrId}; use omicron_common::api::internal::shared::{ ParseSwitchLocationError, SwitchLocation, }; use serde::{Deserialize, Serialize}; -use sled_agent_client::types::PortConfigV1; -use sled_agent_client::types::RouteConfig; -use sled_agent_client::types::{BgpConfig, EarlyNetworkConfig}; -use sled_agent_client::types::{ - BgpPeerConfig as OmicronBgpPeerConfig, HostPortConfig, -}; -use std::collections::HashMap; -use std::net::SocketAddrV6; -use std::net::{IpAddr, Ipv6Addr}; +use std::net::IpAddr; use std::str::FromStr; use std::sync::Arc; use steno::ActionError; use uuid::Uuid; -// This is more of an implementation detail of the BGP implementation. It -// defines the maximum time the peering engine will wait for external messages -// before breaking to check for shutdown conditions. 
-const BGP_SESSION_RESOLUTION: u64 = 100; - // switch port settings apply saga: input parameters #[derive(Debug, Deserialize, Serialize)] @@ -176,91 +155,6 @@ async fn spa_get_switch_port_settings( Ok(port_settings) } -pub(crate) fn api_to_dpd_port_settings( - settings: &SwitchPortSettingsCombinedResult, -) -> Result<PortSettings, String> { - let mut dpd_port_settings = PortSettings { - links: HashMap::new(), - v4_routes: HashMap::new(), - v6_routes: HashMap::new(), - }; - - //TODO breakouts - let link_id = LinkId(0); - - for l in settings.links.iter() { - dpd_port_settings.links.insert( - link_id.to_string(), - LinkSettings { - params: LinkCreate { - autoneg: false, - lane: Some(LinkId(0)), - kr: false, - fec: match l.fec { - SwitchLinkFec::Firecode => PortFec::Firecode, - SwitchLinkFec::Rs => PortFec::Rs, - SwitchLinkFec::None => PortFec::None, - }, - speed: match l.speed { - SwitchLinkSpeed::Speed0G => PortSpeed::Speed0G, - SwitchLinkSpeed::Speed1G => PortSpeed::Speed1G, - SwitchLinkSpeed::Speed10G => PortSpeed::Speed10G, - SwitchLinkSpeed::Speed25G => PortSpeed::Speed25G, - SwitchLinkSpeed::Speed40G => PortSpeed::Speed40G, - SwitchLinkSpeed::Speed50G => PortSpeed::Speed50G, - SwitchLinkSpeed::Speed100G => PortSpeed::Speed100G, - SwitchLinkSpeed::Speed200G => PortSpeed::Speed200G, - SwitchLinkSpeed::Speed400G => PortSpeed::Speed400G, - }, - }, - //TODO won't work for breakouts - addrs: settings - .addresses - .iter() - .map(|a| a.address.ip()) - .collect(), - }, - ); - } - - for r in &settings.routes { - match &r.dst { - IpNetwork::V4(n) => { - let gw = match r.gw.ip() { - IpAddr::V4(gw) => gw, - IpAddr::V6(_) => { - return Err( - "IPv4 destination cannot have IPv6 nexthop".into() - ) - } - }; - dpd_port_settings.v4_routes.insert( - Ipv4Cidr { prefix: n.ip(), prefix_len: n.prefix() } - .to_string(), - vec![RouteSettingsV4 { link_id: link_id.0, nexthop: gw }], - ); - } - IpNetwork::V6(n) => { - let gw = match r.gw.ip() { - IpAddr::V6(gw) => gw, - IpAddr::V4(_) => { - return Err( - "IPv6 destination cannot have IPv4 nexthop".into() - ) - } - }; - dpd_port_settings.v6_routes.insert( - Ipv6Cidr { prefix: n.ip(), prefix_len: n.prefix() } - .to_string(), - vec![RouteSettingsV6 { link_id: link_id.0, nexthop: gw }], - ); - } - } - } - - Ok(dpd_port_settings) -} - async fn spa_ensure_switch_port_settings( sagactx: NexusActionContext, ) -> Result<(), ActionError> { @@ -380,101 +274,6 @@ async fn spa_undo_ensure_switch_port_settings( Ok(()) } -async fn spa_ensure_switch_port_bgp_settings( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let settings = sagactx - .lookup::<SwitchPortSettingsCombinedResult>("switch_port_settings") - .map_err(|e| { - ActionError::action_failed(format!( - "lookup switch port settings: {e}" - )) - })?; - - ensure_switch_port_bgp_settings(sagactx, settings).await -} - -pub(crate) async fn ensure_switch_port_bgp_settings( - sagactx: NexusActionContext, - settings: SwitchPortSettingsCombinedResult, -) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let nexus = osagactx.nexus(); - let params = sagactx.saga_params::<Params>()?; - - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - &params.serialized_authn, - ); - let mg_client: Arc<mg_admin_client::Client> = - select_mg_client(&sagactx).await.map_err(|e| { - ActionError::action_failed(format!("select mg client: {e}")) - })?; - - let mut bgp_peer_configs = Vec::new(); - - for peer in settings.bgp_peers { - let config = nexus - .bgp_config_get(&opctx, peer.bgp_config_id.into()) - .await - .map_err(|e| { - ActionError::action_failed(format!("get bgp config: 
{e}")) - })?; - - let announcements = nexus - .bgp_announce_list( - &opctx, - ¶ms::BgpAnnounceSetSelector { - name_or_id: NameOrId::Id(config.bgp_announce_set_id), - }, - ) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "get bgp announcements: {e}" - )) - })?; - - let mut prefixes = Vec::new(); - for a in &announcements { - let value = match a.network.ip() { - IpAddr::V4(value) => Ok(value), - IpAddr::V6(_) => Err(ActionError::action_failed( - "IPv6 announcement not yet supported".to_string(), - )), - }?; - prefixes.push(Prefix4 { value, length: a.network.prefix() }); - } - - let bpc = BgpPeerConfig { - asn: *config.asn, - name: format!("{}", peer.addr.ip()), //TODO user defined name? - host: format!("{}:179", peer.addr.ip()), - hold_time: peer.hold_time.0.into(), - idle_hold_time: peer.idle_hold_time.0.into(), - delay_open: peer.delay_open.0.into(), - connect_retry: peer.connect_retry.0.into(), - keepalive: peer.keepalive.0.into(), - resolution: BGP_SESSION_RESOLUTION, - originate: prefixes, - }; - - bgp_peer_configs.push(bpc); - } - - mg_client - .inner - .bgp_apply(&ApplyRequest { - peer_group: params.switch_port_name.clone(), - peers: bgp_peer_configs, - }) - .await - .map_err(|e| { - ActionError::action_failed(format!("apply bgp settings: {e}")) - })?; - - Ok(()) -} async fn spa_undo_ensure_switch_port_bgp_settings( sagactx: NexusActionContext, ) -> Result<(), Error> { @@ -497,9 +296,13 @@ async fn spa_undo_ensure_switch_port_bgp_settings( })?; let mg_client: Arc = - select_mg_client(&sagactx).await.map_err(|e| { - ActionError::action_failed(format!("select mg client (undo): {e}")) - })?; + select_mg_client(&sagactx, &opctx, params.switch_port_id) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "select mg client (undo): {e}" + )) + })?; for peer in settings.bgp_peers { let config = nexus @@ -592,96 +395,39 @@ async fn spa_undo_ensure_switch_port_bootstore_network_settings( async fn spa_ensure_switch_port_uplink( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - ensure_switch_port_uplink(sagactx, false, None).await + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + ensure_switch_port_uplink( + sagactx, + &opctx, + false, + None, + params.switch_port_id, + params.switch_port_name, + ) + .await } async fn spa_undo_ensure_switch_port_uplink( sagactx: NexusActionContext, ) -> Result<(), Error> { - Ok(ensure_switch_port_uplink(sagactx, true, None).await?) -} - -pub(crate) async fn ensure_switch_port_uplink( - sagactx: NexusActionContext, - skip_self: bool, - inject: Option, -) -> Result<(), ActionError> { let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( &sagactx, ¶ms.serialized_authn, ); - let osagactx = sagactx.user_data(); - let nexus = osagactx.nexus(); - - let switch_port = nexus - .get_switch_port(&opctx, params.switch_port_id) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "get switch port for uplink: {e}" - )) - })?; - - let switch_location: SwitchLocation = - switch_port.switch_location.parse().map_err(|e| { - ActionError::action_failed(format!( - "get switch location for uplink: {e:?}", - )) - })?; - - let mut uplinks: Vec = Vec::new(); - - // The sled agent uplinks interface is an all or nothing interface, so we - // need to get all the uplink configs for all the ports. 
- let active_ports = - nexus.active_port_settings(&opctx).await.map_err(|e| { - ActionError::action_failed(format!( - "get active switch port settings: {e}" - )) - })?; - - for (port, info) in &active_ports { - // Since we are undoing establishing uplinks for the settings - // associated with this port we skip adding this ports uplinks - // to the list - effectively removing them. - if skip_self && port.id == switch_port.id { - continue; - } - uplinks.push(HostPortConfig { - port: port.port_name.clone(), - addrs: info.addresses.iter().map(|a| a.address).collect(), - }) - } - - if let Some(id) = inject { - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - &params.serialized_authn, - ); - let settings = nexus - .switch_port_settings_get(&opctx, &id.into()) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "get switch port settings for injection: {e}" - )) - })?; - uplinks.push(HostPortConfig { - port: params.switch_port_name.clone(), - addrs: settings.addresses.iter().map(|a| a.address).collect(), - }) - } - - let sc = switch_sled_agent(switch_location, &sagactx).await?; - sc.uplink_ensure(&sled_agent_client::types::SwitchPorts { uplinks }) - .await - .map_err(|e| { - ActionError::action_failed(format!("ensure uplink: {e}")) - })?; - - Ok(()) + Ok(ensure_switch_port_uplink( + sagactx, + &opctx, + true, + None, + params.switch_port_id, + params.switch_port_name, + ) + .await?) } // a common route representation for dendrite and port settings @@ -767,307 +513,29 @@ pub(crate) async fn select_dendrite_client( Ok(dpd_client) } -pub(crate) async fn select_mg_client( - sagactx: &NexusActionContext, -) -> Result<Arc<mg_admin_client::Client>, ActionError> { - let osagactx = sagactx.user_data(); - let params = sagactx.saga_params::<Params>()?; - let nexus = osagactx.nexus(); - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - &params.serialized_authn, - ); - - let switch_port = nexus - .get_switch_port(&opctx, params.switch_port_id) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "get switch port for mg client selection: {e}" - )) - })?; - - let switch_location: SwitchLocation = - switch_port.switch_location.parse().map_err( - |e: ParseSwitchLocationError| { - ActionError::action_failed(format!( - "get switch location for uplink: {e:?}", - )) - }, - )?; - - let mg_client: Arc<mg_admin_client::Client> = osagactx - .nexus() - .mg_clients - .get(&switch_location) - .ok_or_else(|| { - ActionError::action_failed(format!( - "requested switch not available: {switch_location}" - )) - })?
- .clone(); - Ok(mg_client) -} - -pub(crate) async fn get_scrimlet_address( - location: SwitchLocation, - nexus: &Arc<Nexus>, -) -> Result<SocketAddrV6, ActionError> { - /* TODO this depends on DNS entries only coming from RSS, it's broken on the upgrade path - nexus - .resolver() - .await - .lookup_socket_v6(ServiceName::Scrimlet(location)) - .await - .map_err(|e| e.to_string()) - .map_err(|e| { - ActionError::action_failed(format!( - "scrimlet dns lookup failed {e}", - )) - }) - */ - let result = nexus - .resolver() - .await - .lookup_all_ipv6(ServiceName::Dendrite) - .await +async fn spa_ensure_switch_port_bgp_settings( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let settings = sagactx + .lookup::<SwitchPortSettingsCombinedResult>("switch_port_settings") .map_err(|e| { ActionError::action_failed(format!( - "scrimlet dns lookup failed {e}", - )) - }); - - let mappings = match result { - Ok(addrs) => map_switch_zone_addrs(&nexus.log, addrs).await, - Err(e) => { - warn!(nexus.log, "Failed to lookup Dendrite address: {e}"); - return Err(ActionError::action_failed(format!( - "switch mapping failed {e}", - ))); - } - }; - - let addr = match mappings.get(&location) { - Some(addr) => addr, - None => { - return Err(ActionError::action_failed(format!( - "address for switch at location: {location} not found", - ))); - } - }; - - let mut segments = addr.segments(); - segments[7] = 1; - let addr = Ipv6Addr::from(segments); - - Ok(SocketAddrV6::new(addr, SLED_AGENT_PORT, 0, 0)) -} - -#[derive(Clone, Debug)] -pub struct EarlyNetworkPortUpdate { - port: PortConfigV1, - bgp_configs: Vec<BgpConfig>, -} - -pub(crate) async fn bootstore_update( - nexus: &Arc<Nexus>, - opctx: &OpContext, - switch_port_id: Uuid, - switch_port_name: &str, - settings: &SwitchPortSettingsCombinedResult, -) -> Result<EarlyNetworkPortUpdate, ActionError> { - let switch_port = - nexus.get_switch_port(&opctx, switch_port_id).await.map_err(|e| { - ActionError::action_failed(format!( - "get switch port for uplink: {e}" + "lookup switch port settings: {e}" )) })?; - let switch_location: SwitchLocation = - switch_port.switch_location.parse().map_err( - |e: ParseSwitchLocationError| { - ActionError::action_failed(format!( - "get switch location for uplink: {e:?}", - )) - }, - )?; - - let mut peer_info = Vec::new(); - let mut bgp_configs = Vec::new(); - for p in &settings.bgp_peers { - let bgp_config = nexus - .bgp_config_get(&opctx, p.bgp_config_id.into()) - .await - .map_err(|e| { - ActionError::action_failed(format!("get bgp config: {e}")) - })?; - - let announcements = nexus - .bgp_announce_list( - &opctx, - &params::BgpAnnounceSetSelector { - name_or_id: NameOrId::Id(bgp_config.bgp_announce_set_id), - }, - ) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "get bgp announcements: {e}" - )) - })?; - - peer_info.push((p, bgp_config.asn.0)); - bgp_configs.push(BgpConfig { - asn: bgp_config.asn.0, - originate: announcements - .iter() - .filter_map(|a| match a.network { - IpNetwork::V4(net) => Some(net.into()), - //TODO v6 - _ => None, - }) - .collect(), - }); - } - - let update = EarlyNetworkPortUpdate { - port: PortConfigV1 { - routes: settings - .routes - .iter() - .map(|r| RouteConfig { destination: r.dst, nexthop: r.gw.ip() }) - .collect(), - addresses: settings.addresses.iter().map(|a| a.address).collect(), - switch: switch_location, - port: switch_port_name.into(), - uplink_port_fec: settings - .links - .get(0) - .map(|l| l.fec) - .unwrap_or(SwitchLinkFec::None) - .into(), - uplink_port_speed: settings - .links - .get(0) - .map(|l| l.speed) - .unwrap_or(SwitchLinkSpeed::Speed100G) - .into(), - bgp_peers: peer_info
.iter() - .filter_map(|(p, asn)| { - //TODO v6 - match p.addr.ip() { - IpAddr::V4(addr) => Some(OmicronBgpPeerConfig { - asn: *asn, - port: switch_port_name.into(), - addr, - hold_time: Some(p.hold_time.0.into()), - connect_retry: Some(p.connect_retry.0.into()), - delay_open: Some(p.delay_open.0.into()), - idle_hold_time: Some(p.idle_hold_time.0.into()), - keepalive: Some(p.keepalive.0.into()), - }), - IpAddr::V6(_) => { - warn!(opctx.log, "IPv6 peers not yet supported"); - None - } - } - }) - .collect(), - }, - bgp_configs, - }; - - Ok(update) -} - -pub(crate) async fn read_bootstore_config( - sa: &sled_agent_client::Client, -) -> Result<EarlyNetworkConfig, ActionError> { - Ok(sa - .read_network_bootstore_config_cache() - .await - .map_err(|e| { - ActionError::action_failed(format!( - "read bootstore network config: {e}" - )) - })? - .into_inner()) -} - -pub(crate) async fn write_bootstore_config( - sa: &sled_agent_client::Client, - config: &EarlyNetworkConfig, -) -> Result<(), ActionError> { - sa.write_network_bootstore_config(config).await.map_err(|e| { - ActionError::action_failed(format!( - "write bootstore network config: {e}" - )) - })?; - Ok(()) -} - -#[derive(Clone, Debug, Default)] -pub(crate) struct BootstoreNetworkPortChange { - previous_port_config: Option<PortConfigV1>, - changed_bgp_configs: Vec<BgpConfig>, - added_bgp_configs: Vec<BgpConfig>, -} - -pub(crate) fn apply_bootstore_update( - config: &mut EarlyNetworkConfig, - update: &EarlyNetworkPortUpdate, -) -> Result<BootstoreNetworkPortChange, ActionError> { - let mut change = BootstoreNetworkPortChange::default(); - - let rack_net_config = match &mut config.body.rack_network_config { - Some(cfg) => cfg, - None => { - return Err(ActionError::action_failed( - "rack network config not yet initialized".to_string(), - )) - } - }; - - for port in &mut rack_net_config.ports { - if port.port == update.port.port { - change.previous_port_config = Some(port.clone()); - *port = update.port.clone(); - break; - } - } - if change.previous_port_config.is_none() { - rack_net_config.ports.push(update.port.clone()); - } - - for updated_bgp in &update.bgp_configs { - let mut exists = false; - for resident_bgp in &mut rack_net_config.bgp { - if resident_bgp.asn == updated_bgp.asn { - change.changed_bgp_configs.push(resident_bgp.clone()); - *resident_bgp = updated_bgp.clone(); - exists = true; - break; - } - } - if !exists { - change.added_bgp_configs.push(updated_bgp.clone()); - } - } - rack_net_config.bgp.extend_from_slice(&change.added_bgp_configs); - - Ok(change) -} + let params = sagactx.saga_params::<Params>()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + &params.serialized_authn, + ); -pub(crate) async fn switch_sled_agent( - location: SwitchLocation, - sagactx: &NexusActionContext, -) -> Result<sled_agent_client::Client, ActionError> { - let nexus = sagactx.user_data().nexus(); - let sled_agent_addr = get_scrimlet_address(location, nexus).await?; - Ok(sled_agent_client::Client::new( - &format!("http://{}", sled_agent_addr), - sagactx.user_data().log().clone(), - )) + ensure_switch_port_bgp_settings( + sagactx, + &opctx, + settings, + params.switch_port_name.clone(), + params.switch_port_id, + ) + .await } diff --git a/nexus/src/app/sagas/switch_port_settings_clear.rs b/nexus/src/app/sagas/switch_port_settings_clear.rs index 1ab2f6be0c..bcbd5bf894 100644 --- a/nexus/src/app/sagas/switch_port_settings_clear.rs +++ b/nexus/src/app/sagas/switch_port_settings_clear.rs @@ -5,7 +5,7 @@ use super::switch_port_settings_apply::select_dendrite_client; use super::{NexusActionContext, NEXUS_DPD_TAG}; use crate::app::sagas::retry_until_known_result; -use
crate::app::sagas::switch_port_settings_apply::{ +use crate::app::sagas::switch_port_settings_common::{ api_to_dpd_port_settings, apply_bootstore_update, bootstore_update, ensure_switch_port_bgp_settings, ensure_switch_port_uplink, read_bootstore_config, select_mg_client, switch_sled_agent, @@ -214,7 +214,20 @@ async fn spa_undo_clear_switch_port_settings( async fn spa_clear_switch_port_uplink( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - ensure_switch_port_uplink(sagactx, true, None).await + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + ensure_switch_port_uplink( + sagactx, + &opctx, + true, + None, + params.switch_port_id, + params.port_name.clone(), + ) + .await } async fn spa_undo_clear_switch_port_uplink( @@ -223,8 +236,21 @@ async fn spa_undo_clear_switch_port_uplink( let id = sagactx .lookup::>("original_switch_port_settings_id") .map_err(|e| external::Error::internal_error(&e.to_string()))?; + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); - Ok(ensure_switch_port_uplink(sagactx, false, id).await?) + Ok(ensure_switch_port_uplink( + sagactx, + &opctx, + false, + id, + params.switch_port_id, + params.port_name.clone(), + ) + .await?) } async fn spa_clear_switch_port_bgp_settings( @@ -257,9 +283,13 @@ async fn spa_clear_switch_port_bgp_settings( .map_err(ActionError::action_failed)?; let mg_client: Arc = - select_mg_client(&sagactx).await.map_err(|e| { - ActionError::action_failed(format!("select mg client (undo): {e}")) - })?; + select_mg_client(&sagactx, &opctx, params.switch_port_id) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "select mg client (undo): {e}" + )) + })?; for peer in settings.bgp_peers { let config = nexus @@ -306,7 +336,14 @@ async fn spa_undo_clear_switch_port_bgp_settings( let settings = nexus.switch_port_settings_get(&opctx, &NameOrId::Id(id)).await?; - Ok(ensure_switch_port_bgp_settings(sagactx, settings).await?) + Ok(ensure_switch_port_bgp_settings( + sagactx, + &opctx, + settings, + params.port_name.clone(), + params.switch_port_id, + ) + .await?) 
} async fn spa_clear_switch_port_bootstore_network_settings( diff --git a/nexus/src/app/sagas/switch_port_settings_common.rs b/nexus/src/app/sagas/switch_port_settings_common.rs new file mode 100644 index 0000000000..8e66aa12f8 --- /dev/null +++ b/nexus/src/app/sagas/switch_port_settings_common.rs @@ -0,0 +1,577 @@ +use super::NexusActionContext; +use crate::app::map_switch_zone_addrs; +use crate::Nexus; +use db::datastore::SwitchPortSettingsCombinedResult; +use dpd_client::types::{ + LinkCreate, LinkId, LinkSettings, PortFec, PortSettings, PortSpeed, + RouteSettingsV4, RouteSettingsV6, +}; +use dpd_client::{Ipv4Cidr, Ipv6Cidr}; +use internal_dns::ServiceName; +use ipnetwork::IpNetwork; +use mg_admin_client::types::Prefix4; +use mg_admin_client::types::{ApplyRequest, BgpPeerConfig}; +use nexus_db_model::{SwitchLinkFec, SwitchLinkSpeed}; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db; +use nexus_types::external_api::params; +use omicron_common::address::SLED_AGENT_PORT; +use omicron_common::api::external::NameOrId; +use omicron_common::api::internal::shared::{ + ParseSwitchLocationError, SwitchLocation, +}; +use sled_agent_client::types::PortConfigV1; +use sled_agent_client::types::RouteConfig; +use sled_agent_client::types::{BgpConfig, EarlyNetworkConfig}; +use sled_agent_client::types::{ + BgpPeerConfig as OmicronBgpPeerConfig, HostPortConfig, +}; +use std::collections::HashMap; +use std::net::SocketAddrV6; +use std::net::{IpAddr, Ipv6Addr}; +use std::sync::Arc; +use steno::ActionError; +use uuid::Uuid; + +// This is more of an implementation detail of the BGP implementation. It +// defines the maximum time the peering engine will wait for external messages +// before breaking to check for shutdown conditions. +const BGP_SESSION_RESOLUTION: u64 = 100; + +pub(crate) fn api_to_dpd_port_settings( + settings: &SwitchPortSettingsCombinedResult, +) -> Result { + let mut dpd_port_settings = PortSettings { + links: HashMap::new(), + v4_routes: HashMap::new(), + v6_routes: HashMap::new(), + }; + + //TODO breakouts + let link_id = LinkId(0); + + for l in settings.links.iter() { + dpd_port_settings.links.insert( + link_id.to_string(), + LinkSettings { + params: LinkCreate { + autoneg: false, + lane: Some(LinkId(0)), + kr: false, + fec: match l.fec { + SwitchLinkFec::Firecode => PortFec::Firecode, + SwitchLinkFec::Rs => PortFec::Rs, + SwitchLinkFec::None => PortFec::None, + }, + speed: match l.speed { + SwitchLinkSpeed::Speed0G => PortSpeed::Speed0G, + SwitchLinkSpeed::Speed1G => PortSpeed::Speed1G, + SwitchLinkSpeed::Speed10G => PortSpeed::Speed10G, + SwitchLinkSpeed::Speed25G => PortSpeed::Speed25G, + SwitchLinkSpeed::Speed40G => PortSpeed::Speed40G, + SwitchLinkSpeed::Speed50G => PortSpeed::Speed50G, + SwitchLinkSpeed::Speed100G => PortSpeed::Speed100G, + SwitchLinkSpeed::Speed200G => PortSpeed::Speed200G, + SwitchLinkSpeed::Speed400G => PortSpeed::Speed400G, + }, + }, + //TODO won't work for breakouts + addrs: settings + .addresses + .iter() + .map(|a| a.address.ip()) + .collect(), + }, + ); + } + + for r in &settings.routes { + match &r.dst { + IpNetwork::V4(n) => { + let gw = match r.gw.ip() { + IpAddr::V4(gw) => gw, + IpAddr::V6(_) => { + return Err( + "IPv4 destination cannot have IPv6 nexthop".into() + ) + } + }; + dpd_port_settings.v4_routes.insert( + Ipv4Cidr { prefix: n.ip(), prefix_len: n.prefix() } + .to_string(), + vec![RouteSettingsV4 { link_id: link_id.0, nexthop: gw }], + ); + } + IpNetwork::V6(n) => { + let gw = match r.gw.ip() { + IpAddr::V6(gw) => gw, + 
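+                    // Mixed address families are rejected: a v6 destination
+                    // must have a v6 nexthop, mirroring the v4 case above.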
IpAddr::V4(_) => { + return Err( + "IPv6 destination cannot have IPv4 nexthop".into() + ) + } + }; + dpd_port_settings.v6_routes.insert( + Ipv6Cidr { prefix: n.ip(), prefix_len: n.prefix() } + .to_string(), + vec![RouteSettingsV6 { link_id: link_id.0, nexthop: gw }], + ); + } + } + } + + Ok(dpd_port_settings) +} + +pub(crate) fn apply_bootstore_update( + config: &mut EarlyNetworkConfig, + update: &EarlyNetworkPortUpdate, +) -> Result { + let mut change = BootstoreNetworkPortChange::default(); + + let rack_net_config = match &mut config.body.rack_network_config { + Some(cfg) => cfg, + None => { + return Err(ActionError::action_failed( + "rack network config not yet initialized".to_string(), + )) + } + }; + + for port in &mut rack_net_config.ports { + if port.port == update.port.port { + change.previous_port_config = Some(port.clone()); + *port = update.port.clone(); + break; + } + } + if change.previous_port_config.is_none() { + rack_net_config.ports.push(update.port.clone()); + } + + for updated_bgp in &update.bgp_configs { + let mut exists = false; + for resident_bgp in &mut rack_net_config.bgp { + if resident_bgp.asn == updated_bgp.asn { + change.changed_bgp_configs.push(resident_bgp.clone()); + *resident_bgp = updated_bgp.clone(); + exists = true; + break; + } + } + if !exists { + change.added_bgp_configs.push(updated_bgp.clone()); + } + } + rack_net_config.bgp.extend_from_slice(&change.added_bgp_configs); + + Ok(change) +} + +pub(crate) async fn bootstore_update( + nexus: &Arc, + opctx: &OpContext, + switch_port_id: Uuid, + switch_port_name: &str, + settings: &SwitchPortSettingsCombinedResult, +) -> Result { + let switch_port = + nexus.get_switch_port(&opctx, switch_port_id).await.map_err(|e| { + ActionError::action_failed(format!( + "get switch port for uplink: {e}" + )) + })?; + + let switch_location: SwitchLocation = + switch_port.switch_location.parse().map_err( + |e: ParseSwitchLocationError| { + ActionError::action_failed(format!( + "get switch location for uplink: {e:?}", + )) + }, + )?; + + let mut peer_info = Vec::new(); + let mut bgp_configs = Vec::new(); + for p in &settings.bgp_peers { + let bgp_config = nexus + .bgp_config_get(&opctx, p.bgp_config_id.into()) + .await + .map_err(|e| { + ActionError::action_failed(format!("get bgp config: {e}")) + })?; + + let announcements = nexus + .bgp_announce_list( + &opctx, + ¶ms::BgpAnnounceSetSelector { + name_or_id: NameOrId::Id(bgp_config.bgp_announce_set_id), + }, + ) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "get bgp announcements: {e}" + )) + })?; + + peer_info.push((p, bgp_config.asn.0)); + bgp_configs.push(BgpConfig { + asn: bgp_config.asn.0, + originate: announcements + .iter() + .filter_map(|a| match a.network { + IpNetwork::V4(net) => Some(net.into()), + //TODO v6 + _ => None, + }) + .collect(), + }); + } + + let update = EarlyNetworkPortUpdate { + port: PortConfigV1 { + routes: settings + .routes + .iter() + .map(|r| RouteConfig { destination: r.dst, nexthop: r.gw.ip() }) + .collect(), + addresses: settings.addresses.iter().map(|a| a.address).collect(), + switch: switch_location, + port: switch_port_name.into(), + uplink_port_fec: settings + .links + .get(0) + .map(|l| l.fec) + .unwrap_or(SwitchLinkFec::None) + .into(), + uplink_port_speed: settings + .links + .get(0) + .map(|l| l.speed) + .unwrap_or(SwitchLinkSpeed::Speed100G) + .into(), + bgp_peers: peer_info + .iter() + .filter_map(|(p, asn)| { + //TODO v6 + match p.addr.ip() { + IpAddr::V4(addr) => Some(OmicronBgpPeerConfig { + asn: *asn, + 
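+                        // Session timer values are copied through from the
+                        // stored peer settings.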
port: switch_port_name.into(), + addr, + hold_time: Some(p.hold_time.0.into()), + connect_retry: Some(p.connect_retry.0.into()), + delay_open: Some(p.delay_open.0.into()), + idle_hold_time: Some(p.idle_hold_time.0.into()), + keepalive: Some(p.keepalive.0.into()), + }), + IpAddr::V6(_) => { + warn!(opctx.log, "IPv6 peers not yet supported"); + None + } + } + }) + .collect(), + }, + bgp_configs, + }; + + Ok(update) +} + +pub(crate) async fn ensure_switch_port_uplink( + sagactx: NexusActionContext, + opctx: &OpContext, + skip_self: bool, + inject: Option, + switch_port_id: Uuid, + switch_port_name: String, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let nexus = osagactx.nexus(); + + let switch_port = + nexus.get_switch_port(&opctx, switch_port_id).await.map_err(|e| { + ActionError::action_failed(format!( + "get switch port for uplink: {e}" + )) + })?; + + let switch_location: SwitchLocation = + switch_port.switch_location.parse().map_err(|e| { + ActionError::action_failed(format!( + "get switch location for uplink: {e:?}", + )) + })?; + + let mut uplinks: Vec = Vec::new(); + + // The sled agent uplinks interface is an all or nothing interface, so we + // need to get all the uplink configs for all the ports. + let active_ports = + nexus.active_port_settings(&opctx).await.map_err(|e| { + ActionError::action_failed(format!( + "get active switch port settings: {e}" + )) + })?; + + for (port, info) in &active_ports { + // Since we are undoing establishing uplinks for the settings + // associated with this port we skip adding this ports uplinks + // to the list - effectively removing them. + if skip_self && port.id == switch_port.id { + continue; + } + uplinks.push(HostPortConfig { + port: port.port_name.clone(), + addrs: info.addresses.iter().map(|a| a.address).collect(), + }) + } + + if let Some(id) = inject { + let settings = nexus + .switch_port_settings_get(&opctx, &id.into()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "get switch port settings for injection: {e}" + )) + })?; + uplinks.push(HostPortConfig { + port: switch_port_name.clone(), + addrs: settings.addresses.iter().map(|a| a.address).collect(), + }) + } + + let sc = switch_sled_agent(switch_location, &sagactx).await?; + sc.uplink_ensure(&sled_agent_client::types::SwitchPorts { uplinks }) + .await + .map_err(|e| { + ActionError::action_failed(format!("ensure uplink: {e}")) + })?; + + Ok(()) +} + +pub(crate) async fn read_bootstore_config( + sa: &sled_agent_client::Client, +) -> Result { + Ok(sa + .read_network_bootstore_config_cache() + .await + .map_err(|e| { + ActionError::action_failed(format!( + "read bootstore network config: {e}" + )) + })? 
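+        // Unwrap the generated client's response wrapper to return the bare
+        // `EarlyNetworkConfig` value.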
+ .into_inner()) +} + +pub(crate) async fn write_bootstore_config( + sa: &sled_agent_client::Client, + config: &EarlyNetworkConfig, +) -> Result<(), ActionError> { + sa.write_network_bootstore_config(config).await.map_err(|e| { + ActionError::action_failed(format!( + "write bootstore network config: {e}" + )) + })?; + Ok(()) +} + +pub(crate) async fn select_mg_client( + sagactx: &NexusActionContext, + opctx: &OpContext, + switch_port_id: Uuid, +) -> Result, ActionError> { + let osagactx = sagactx.user_data(); + let nexus = osagactx.nexus(); + + let switch_port = + nexus.get_switch_port(&opctx, switch_port_id).await.map_err(|e| { + ActionError::action_failed(format!( + "get switch port for mg client selection: {e}" + )) + })?; + + let switch_location: SwitchLocation = + switch_port.switch_location.parse().map_err( + |e: ParseSwitchLocationError| { + ActionError::action_failed(format!( + "get switch location for uplink: {e:?}", + )) + }, + )?; + + let mg_client: Arc = osagactx + .nexus() + .mg_clients + .get(&switch_location) + .ok_or_else(|| { + ActionError::action_failed(format!( + "requested switch not available: {switch_location}" + )) + })? + .clone(); + Ok(mg_client) +} + +pub(crate) async fn switch_sled_agent( + location: SwitchLocation, + sagactx: &NexusActionContext, +) -> Result { + let nexus = sagactx.user_data().nexus(); + let sled_agent_addr = get_scrimlet_address(location, nexus).await?; + Ok(sled_agent_client::Client::new( + &format!("http://{}", sled_agent_addr), + sagactx.user_data().log().clone(), + )) +} + +pub(crate) async fn ensure_switch_port_bgp_settings( + sagactx: NexusActionContext, + opctx: &OpContext, + settings: SwitchPortSettingsCombinedResult, + switch_port_name: String, + switch_port_id: Uuid, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let nexus = osagactx.nexus(); + let mg_client: Arc = + select_mg_client(&sagactx, opctx, switch_port_id).await.map_err( + |e| ActionError::action_failed(format!("select mg client: {e}")), + )?; + + let mut bgp_peer_configs = Vec::new(); + + for peer in settings.bgp_peers { + let config = nexus + .bgp_config_get(&opctx, peer.bgp_config_id.into()) + .await + .map_err(|e| { + ActionError::action_failed(format!("get bgp config: {e}")) + })?; + + let announcements = nexus + .bgp_announce_list( + &opctx, + ¶ms::BgpAnnounceSetSelector { + name_or_id: NameOrId::Id(config.bgp_announce_set_id), + }, + ) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "get bgp announcements: {e}" + )) + })?; + + let mut prefixes = Vec::new(); + for a in &announcements { + let value = match a.network.ip() { + IpAddr::V4(value) => Ok(value), + IpAddr::V6(_) => Err(ActionError::action_failed( + "IPv6 announcement not yet supported".to_string(), + )), + }?; + prefixes.push(Prefix4 { value, length: a.network.prefix() }); + } + + let bpc = BgpPeerConfig { + asn: *config.asn, + name: format!("{}", peer.addr.ip()), //TODO user defined name? 
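+            // Peers are addressed as `ip:port`; 179 is the well-known BGP
+            // TCP port.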
+ host: format!("{}:179", peer.addr.ip()), + hold_time: peer.hold_time.0.into(), + idle_hold_time: peer.idle_hold_time.0.into(), + delay_open: peer.delay_open.0.into(), + connect_retry: peer.connect_retry.0.into(), + keepalive: peer.keepalive.0.into(), + resolution: BGP_SESSION_RESOLUTION, + originate: prefixes, + }; + + bgp_peer_configs.push(bpc); + } + + mg_client + .inner + .bgp_apply(&ApplyRequest { + peer_group: switch_port_name, + peers: bgp_peer_configs, + }) + .await + .map_err(|e| { + ActionError::action_failed(format!("apply bgp settings: {e}")) + })?; + + Ok(()) +} + +pub(crate) async fn get_scrimlet_address( + location: SwitchLocation, + nexus: &Arc, +) -> Result { + /* TODO this depends on DNS entries only coming from RSS, it's broken + on the upgrade path + nexus + .resolver() + .await + .lookup_socket_v6(ServiceName::Scrimlet(location)) + .await + .map_err(|e| e.to_string()) + .map_err(|e| { + ActionError::action_failed(format!( + "scrimlet dns lookup failed {e}", + )) + }) + */ + let result = nexus + .resolver() + .await + .lookup_all_ipv6(ServiceName::Dendrite) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "scrimlet dns lookup failed {e}", + )) + }); + + let mappings = match result { + Ok(addrs) => map_switch_zone_addrs(&nexus.log, addrs).await, + Err(e) => { + warn!(nexus.log, "Failed to lookup Dendrite address: {e}"); + return Err(ActionError::action_failed(format!( + "switch mapping failed {e}", + ))); + } + }; + + let addr = match mappings.get(&location) { + Some(addr) => addr, + None => { + return Err(ActionError::action_failed(format!( + "address for switch at location: {location} not found", + ))); + } + }; + + let mut segments = addr.segments(); + segments[7] = 1; + let addr = Ipv6Addr::from(segments); + + Ok(SocketAddrV6::new(addr, SLED_AGENT_PORT, 0, 0)) +} + +#[derive(Clone, Debug, Default)] +pub(crate) struct BootstoreNetworkPortChange { + previous_port_config: Option, + changed_bgp_configs: Vec, + added_bgp_configs: Vec, +} + +#[derive(Clone, Debug)] +pub struct EarlyNetworkPortUpdate { + port: PortConfigV1, + bgp_configs: Vec, +} diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index da89e7e25a..8189c0a93d 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -51,7 +51,7 @@ impl super::Nexus { SledRole::Scrimlet => true, }; - let sled = db::model::Sled::new( + let sled = db::model::SledUpdate::new( id, info.sa_address, db::model::SledBaseboard { diff --git a/nexus/src/app/test_interfaces.rs b/nexus/src/app/test_interfaces.rs index ad2ea50e07..6161a9a1c1 100644 --- a/nexus/src/app/test_interfaces.rs +++ b/nexus/src/app/test_interfaces.rs @@ -10,6 +10,9 @@ use sled_agent_client::Client as SledAgentClient; use std::sync::Arc; use uuid::Uuid; +pub use super::update::MgsClients; +pub use super::update::RotUpdateError; +pub use super::update::RotUpdater; pub use super::update::SpUpdateError; pub use super::update::SpUpdater; pub use super::update::UpdateProgress; diff --git a/nexus/src/app/update/mgs_clients.rs b/nexus/src/app/update/mgs_clients.rs new file mode 100644 index 0000000000..5915505829 --- /dev/null +++ b/nexus/src/app/update/mgs_clients.rs @@ -0,0 +1,240 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Module providing support for handling failover between multiple MGS clients
+
+use futures::Future;
+use gateway_client::types::SpType;
+use gateway_client::types::SpUpdateStatus;
+use gateway_client::Client;
+use slog::Logger;
+use std::collections::VecDeque;
+use std::sync::Arc;
+use uuid::Uuid;
+
+pub(super) type GatewayClientError =
+    gateway_client::Error<gateway_client::types::Error>;
+
+pub(super) enum PollUpdateStatus {
+    Preparing { progress: Option<f64> },
+    InProgress { progress: Option<f64> },
+    Complete,
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum UpdateStatusError {
+    #[error("different update is now preparing ({0})")]
+    DifferentUpdatePreparing(Uuid),
+    #[error("different update is now in progress ({0})")]
+    DifferentUpdateInProgress(Uuid),
+    #[error("different update is now complete ({0})")]
+    DifferentUpdateComplete(Uuid),
+    #[error("different update is now aborted ({0})")]
+    DifferentUpdateAborted(Uuid),
+    #[error("different update failed ({0})")]
+    DifferentUpdateFailed(Uuid),
+    #[error("update status lost (did the SP reset?)")]
+    UpdateStatusLost,
+    #[error("update was aborted")]
+    UpdateAborted,
+    #[error("update failed (error code {0})")]
+    UpdateFailedWithCode(u32),
+    #[error("update failed (error message {0})")]
+    UpdateFailedWithMessage(String),
+}
+
+#[derive(Debug, thiserror::Error)]
+pub(super) enum PollUpdateStatusError {
+    #[error(transparent)]
+    StatusError(#[from] UpdateStatusError),
+    #[error(transparent)]
+    ClientError(#[from] GatewayClientError),
+}
+
+#[derive(Debug, Clone)]
+pub struct MgsClients {
+    clients: VecDeque<Arc<Client>>,
+}
+
+impl MgsClients {
+    /// Create a new `MgsClients` with the given `clients`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `clients` is empty.
+    pub fn new<T: Into<VecDeque<Arc<Client>>>>(clients: T) -> Self {
+        let clients = clients.into();
+        assert!(!clients.is_empty());
+        Self { clients }
+    }
+
+    /// Create a new `MgsClients` with the given `clients`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `clients` is empty.
+    pub fn from_clients<I: IntoIterator<Item = Client>>(clients: I) -> Self {
+        let clients = clients
+            .into_iter()
+            .map(Arc::new)
+            .collect::<VecDeque<_>>();
+        Self::new(clients)
+    }
+
+    /// Run `op` against all clients in sequence until either one succeeds (in
+    /// which case the success value is returned), one fails with a
+    /// non-communication error (in which case that error is returned), or all
+    /// of them fail with communication errors (in which case the communication
+    /// error from the last-attempted client is returned).
+    ///
+    /// On a successful return, the internal client list will be reordered so
+    /// any future accesses will attempt the most-recently-successful client.
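+    ///
+    /// A sketch of the calling convention (illustrative only, not compiled
+    /// as a doctest; `mgs_clients`, `sp_type`, `sp_slot`, and `log` are
+    /// assumed to be in scope):
+    ///
+    /// ```ignore
+    /// let status = mgs_clients
+    ///     .try_all_serially(&log, |client| async move {
+    ///         // The closure owns its `Arc<Client>`, so it can be moved
+    ///         // into the returned future.
+    ///         client
+    ///             .sp_component_update_status(
+    ///                 sp_type,
+    ///                 sp_slot,
+    ///                 SpComponent::SP_ITSELF.const_as_str(),
+    ///             )
+    ///             .await
+    ///     })
+    ///     .await?;
+    /// ```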
+    pub(super) async fn try_all_serially<T, F, Fut>(
+        &mut self,
+        log: &Logger,
+        op: F,
+    ) -> Result<T, GatewayClientError>
+    where
+        // Seems like it would be nicer to take `&Client` here instead of
+        // needing to clone each `Arc`, but there's currently no decent way of
+        // doing that without boxing the returned future:
+        // https://users.rust-lang.org/t/how-to-express-that-the-future-returned-by-a-closure-lives-only-as-long-as-its-argument/90039/10
+        F: Fn(Arc<Client>) -> Fut,
+        Fut: Future<Output = Result<T, GatewayClientError>>,
+    {
+        let mut last_err = None;
+        for (i, client) in self.clients.iter().enumerate() {
+            match op(Arc::clone(client)).await {
+                Ok(value) => {
+                    self.clients.rotate_left(i);
+                    return Ok(value);
+                }
+                Err(GatewayClientError::CommunicationError(err)) => {
+                    // Only warn if there is in fact another client to try.
+                    if i + 1 < self.clients.len() {
+                        warn!(
+                            log, "communication error with MGS; \
+                                  will try next client";
+                            "mgs_addr" => client.baseurl(),
+                            "err" => %err,
+                        );
+                    }
+                    last_err = Some(err);
+                    continue;
+                }
+                Err(err) => return Err(err),
+            }
+        }
+
+        // The only way to get here is if all clients failed with communication
+        // errors. Return the error from the last MGS we tried.
+        Err(GatewayClientError::CommunicationError(last_err.unwrap()))
+    }
+
+    /// Poll for the status of an expected-to-be-in-progress update.
+    pub(super) async fn poll_update_status(
+        &mut self,
+        sp_type: SpType,
+        sp_slot: u32,
+        component: &'static str,
+        update_id: Uuid,
+        log: &Logger,
+    ) -> Result<PollUpdateStatus, PollUpdateStatusError> {
+        let update_status = self
+            .try_all_serially(log, |client| async move {
+                let update_status = client
+                    .sp_component_update_status(sp_type, sp_slot, component)
+                    .await?;
+
+                debug!(
+                    log, "got update status";
+                    "mgs_addr" => client.baseurl(),
+                    "status" => ?update_status,
+                );
+
+                Ok(update_status)
+            })
+            .await?
+            .into_inner();
+
+        match update_status {
+            SpUpdateStatus::Preparing { id, progress } => {
+                if id == update_id {
+                    let progress = progress.and_then(|progress| {
+                        if progress.current > progress.total {
+                            warn!(
+                                log, "nonsense preparing progress";
+                                "current" => progress.current,
+                                "total" => progress.total,
+                            );
+                            None
+                        } else if progress.total == 0 {
+                            None
+                        } else {
+                            Some(
+                                f64::from(progress.current)
+                                    / f64::from(progress.total),
+                            )
+                        }
+                    });
+                    Ok(PollUpdateStatus::Preparing { progress })
+                } else {
+                    Err(UpdateStatusError::DifferentUpdatePreparing(id).into())
+                }
+            }
+            SpUpdateStatus::InProgress { id, bytes_received, total_bytes } => {
+                if id == update_id {
+                    let progress = if bytes_received > total_bytes {
+                        warn!(
+                            log, "nonsense update progress";
+                            "bytes_received" => bytes_received,
+                            "total_bytes" => total_bytes,
+                        );
+                        None
+                    } else if total_bytes == 0 {
+                        None
+                    } else {
+                        Some(f64::from(bytes_received) / f64::from(total_bytes))
+                    };
+                    Ok(PollUpdateStatus::InProgress { progress })
+                } else {
+                    Err(UpdateStatusError::DifferentUpdateInProgress(id).into())
+                }
+            }
+            SpUpdateStatus::Complete { id } => {
+                if id == update_id {
+                    Ok(PollUpdateStatus::Complete)
+                } else {
+                    Err(UpdateStatusError::DifferentUpdateComplete(id).into())
+                }
+            }
+            SpUpdateStatus::None => {
+                Err(UpdateStatusError::UpdateStatusLost.into())
+            }
+            SpUpdateStatus::Aborted { id } => {
+                if id == update_id {
+                    Err(UpdateStatusError::UpdateAborted.into())
+                } else {
+                    Err(UpdateStatusError::DifferentUpdateAborted(id).into())
+                }
+            }
+            SpUpdateStatus::Failed { code, id } => {
+                if id == update_id {
+                    Err(UpdateStatusError::UpdateFailedWithCode(code).into())
+                } else {
+                    Err(UpdateStatusError::DifferentUpdateFailed(id).into())
+                }
+            }
+            SpUpdateStatus::RotError { id, message } => {
+                if id == update_id {
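+                    // The RoT reported an error for our own update id, so
+                    // surface the message as a failed-update status error.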
Err(UpdateStatusError::UpdateFailedWithMessage(format!( + "rot error: {message}" + )) + .into()) + } else { + Err(UpdateStatusError::DifferentUpdateFailed(id).into()) + } + } + } + } +} diff --git a/nexus/src/app/update/mod.rs b/nexus/src/app/update/mod.rs index 4196cd8a71..7d5c642822 100644 --- a/nexus/src/app/update/mod.rs +++ b/nexus/src/app/update/mod.rs @@ -26,9 +26,22 @@ use std::path::Path; use tokio::io::AsyncWriteExt; use uuid::Uuid; +mod mgs_clients; +mod rot_updater; mod sp_updater; -pub use sp_updater::{SpUpdateError, SpUpdater, UpdateProgress}; +pub use mgs_clients::{MgsClients, UpdateStatusError}; +pub use rot_updater::{RotUpdateError, RotUpdater}; +pub use sp_updater::{SpUpdateError, SpUpdater}; + +#[derive(Debug, PartialEq, Clone)] +pub enum UpdateProgress { + Started, + Preparing { progress: Option }, + InProgress { progress: Option }, + Complete, + Failed(String), +} static BASE_ARTIFACT_DIR: &str = "/var/tmp/oxide_artifacts"; @@ -69,14 +82,14 @@ impl super::Nexus { ), })?; - let artifacts = tokio::task::spawn_blocking(move || { - crate::updates::read_artifacts(&trusted_root, base_url) - }) - .await - .unwrap() - .map_err(|e| Error::InternalError { - internal_message: format!("error trying to refresh updates: {}", e), - })?; + let artifacts = crate::updates::read_artifacts(&trusted_root, base_url) + .await + .map_err(|e| Error::InternalError { + internal_message: format!( + "error trying to refresh updates: {}", + e + ), + })?; // FIXME: if we hit an error in any of these database calls, the // available artifact table will be out of sync with the current diff --git a/nexus/src/app/update/rot_updater.rs b/nexus/src/app/update/rot_updater.rs new file mode 100644 index 0000000000..d7d21e3b3a --- /dev/null +++ b/nexus/src/app/update/rot_updater.rs @@ -0,0 +1,272 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Module containing types for updating RoTs via MGS. + +use super::mgs_clients::PollUpdateStatusError; +use super::MgsClients; +use super::UpdateProgress; +use super::UpdateStatusError; +use crate::app::update::mgs_clients::PollUpdateStatus; +use gateway_client::types::RotSlot; +use gateway_client::types::SpComponentFirmwareSlot; +use gateway_client::types::SpType; +use gateway_client::SpComponent; +use slog::Logger; +use std::time::Duration; +use tokio::sync::watch; +use uuid::Uuid; + +type GatewayClientError = gateway_client::Error; + +#[derive(Debug, thiserror::Error)] +pub enum RotUpdateError { + #[error("error communicating with MGS")] + MgsCommunication(#[from] GatewayClientError), + + #[error("failed checking update status: {0}")] + PollUpdateStatus(#[from] UpdateStatusError), +} + +impl From for RotUpdateError { + fn from(err: PollUpdateStatusError) -> Self { + match err { + PollUpdateStatusError::StatusError(err) => err.into(), + PollUpdateStatusError::ClientError(err) => err.into(), + } + } +} + +pub struct RotUpdater { + log: Logger, + progress: watch::Sender>, + sp_type: SpType, + sp_slot: u32, + target_rot_slot: RotSlot, + update_id: Uuid, + // TODO-clarity maybe a newtype for this? TBD how we get this from + // wherever it's stored, which might give us a stronger type already. 
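+    // (For now this is simply the raw bytes of the hubris archive to
+    // deliver.)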
+ rot_hubris_archive: Vec, +} + +impl RotUpdater { + pub fn new( + sp_type: SpType, + sp_slot: u32, + target_rot_slot: RotSlot, + update_id: Uuid, + rot_hubris_archive: Vec, + log: &Logger, + ) -> Self { + let log = log.new(slog::o!( + "component" => "RotUpdater", + "sp_type" => format!("{sp_type:?}"), + "sp_slot" => sp_slot, + "target_rot_slot" => format!("{target_rot_slot:?}"), + "update_id" => format!("{update_id}"), + )); + let progress = watch::Sender::new(None); + Self { + log, + progress, + sp_type, + sp_slot, + target_rot_slot, + update_id, + rot_hubris_archive, + } + } + + pub fn progress_watcher(&self) -> watch::Receiver> { + self.progress.subscribe() + } + + /// Drive this RoT update to completion (or failure). + /// + /// Only one MGS instance is required to drive an update; however, if + /// multiple MGS instances are available and passed to this method and an + /// error occurs communicating with one instance, `RotUpdater` will try the + /// remaining instances before failing. + pub async fn update( + self, + mut mgs_clients: MgsClients, + ) -> Result<(), RotUpdateError> { + // The async blocks below want `&self` references, but we take `self` + // for API clarity (to start a new update, the caller should construct a + // new updater). Create a `&self` ref that we use through the remainder + // of this method. + let me = &self; + + mgs_clients + .try_all_serially(&self.log, |client| async move { + me.start_update_one_mgs(&client).await + }) + .await?; + + // `wait_for_update_completion` uses `try_all_mgs_clients` internally, + // so we don't wrap it here. + me.wait_for_update_completion(&mut mgs_clients).await?; + + mgs_clients + .try_all_serially(&self.log, |client| async move { + me.mark_target_slot_active_one_mgs(&client).await + }) + .await?; + + mgs_clients + .try_all_serially(&self.log, |client| async move { + me.finalize_update_via_reset_one_mgs(&client).await + }) + .await?; + + // wait for any progress watchers to be dropped before we return; + // otherwise, they'll get `RecvError`s when trying to check the current + // status + self.progress.closed().await; + + Ok(()) + } + + async fn start_update_one_mgs( + &self, + client: &gateway_client::Client, + ) -> Result<(), GatewayClientError> { + let firmware_slot = self.target_rot_slot.as_u16(); + + // Start the update. + client + .sp_component_update( + self.sp_type, + self.sp_slot, + SpComponent::ROT.const_as_str(), + firmware_slot, + &self.update_id, + reqwest::Body::from(self.rot_hubris_archive.clone()), + ) + .await?; + + self.progress.send_replace(Some(UpdateProgress::Started)); + + info!( + self.log, "RoT update started"; + "mgs_addr" => client.baseurl(), + ); + + Ok(()) + } + + async fn wait_for_update_completion( + &self, + mgs_clients: &mut MgsClients, + ) -> Result<(), RotUpdateError> { + // How frequently do we poll MGS for the update progress? + const STATUS_POLL_INTERVAL: Duration = Duration::from_secs(3); + + loop { + let status = mgs_clients + .poll_update_status( + self.sp_type, + self.sp_slot, + SpComponent::ROT.const_as_str(), + self.update_id, + &self.log, + ) + .await?; + + // For `Preparing` and `InProgress`, we could check the progress + // information returned by these steps and try to check that + // we're still _making_ progress, but every Nexus instance needs + // to do that anyway in case we (or the MGS instance delivering + // the update) crash, so we'll omit that check here. Instead, we + // just sleep and we'll poll again shortly. 
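+            // (Terminal states -- failed, aborted, or lost -- surface as
+            // `Err` from `poll_update_status` above and exit this loop via
+            // `?`.)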
+ match status { + PollUpdateStatus::Preparing { progress } => { + self.progress.send_replace(Some( + UpdateProgress::Preparing { progress }, + )); + } + PollUpdateStatus::InProgress { progress } => { + self.progress.send_replace(Some( + UpdateProgress::InProgress { progress }, + )); + } + PollUpdateStatus::Complete => { + self.progress.send_replace(Some( + UpdateProgress::InProgress { progress: Some(1.0) }, + )); + return Ok(()); + } + } + + tokio::time::sleep(STATUS_POLL_INTERVAL).await; + } + } + + async fn mark_target_slot_active_one_mgs( + &self, + client: &gateway_client::Client, + ) -> Result<(), GatewayClientError> { + // RoT currently doesn't support non-persistent slot swapping, so always + // tell it to persist our choice. + let persist = true; + + let slot = self.target_rot_slot.as_u16(); + + client + .sp_component_active_slot_set( + self.sp_type, + self.sp_slot, + SpComponent::ROT.const_as_str(), + persist, + &SpComponentFirmwareSlot { slot }, + ) + .await?; + + // TODO-correctness Should we send some kind of update to + // `self.progress`? We already sent `InProgress(1.0)` when the update + // finished delivering. Or perhaps we shouldn't even be doing this step + // and the reset, and let our caller handle the finalization? + + info!( + self.log, "RoT target slot marked active"; + "mgs_addr" => client.baseurl(), + ); + + Ok(()) + } + + async fn finalize_update_via_reset_one_mgs( + &self, + client: &gateway_client::Client, + ) -> Result<(), GatewayClientError> { + client + .sp_component_reset( + self.sp_type, + self.sp_slot, + SpComponent::ROT.const_as_str(), + ) + .await?; + + self.progress.send_replace(Some(UpdateProgress::Complete)); + info!( + self.log, "RoT update complete"; + "mgs_addr" => client.baseurl(), + ); + + Ok(()) + } +} + +trait RotSlotAsU16 { + fn as_u16(&self) -> u16; +} + +impl RotSlotAsU16 for RotSlot { + fn as_u16(&self) -> u16 { + match self { + RotSlot::A => 0, + RotSlot::B => 1, + } + } +} diff --git a/nexus/src/app/update/sp_updater.rs b/nexus/src/app/update/sp_updater.rs index 9abb2ad222..419a733441 100644 --- a/nexus/src/app/update/sp_updater.rs +++ b/nexus/src/app/update/sp_updater.rs @@ -4,13 +4,15 @@ //! Module containing types for updating SPs via MGS. -use futures::Future; +use crate::app::update::mgs_clients::PollUpdateStatus; + +use super::mgs_clients::PollUpdateStatusError; +use super::MgsClients; +use super::UpdateProgress; +use super::UpdateStatusError; use gateway_client::types::SpType; -use gateway_client::types::SpUpdateStatus; use gateway_client::SpComponent; use slog::Logger; -use std::collections::VecDeque; -use std::sync::Arc; use std::time::Duration; use tokio::sync::watch; use uuid::Uuid; @@ -22,20 +24,17 @@ pub enum SpUpdateError { #[error("error communicating with MGS")] MgsCommunication(#[from] GatewayClientError), - // Error returned when we successfully start an update but it fails to - // complete successfully. - #[error("update failed to complete: {0}")] - FailedToComplete(String), + #[error("failed checking update status: {0}")] + PollUpdateStatus(#[from] UpdateStatusError), } -// TODO-cleanup Probably share this with other update implementations? 
-#[derive(Debug, PartialEq, Clone)] -pub enum UpdateProgress { - Started, - Preparing { progress: Option }, - InProgress { progress: Option }, - Complete, - Failed(String), +impl From for SpUpdateError { + fn from(err: PollUpdateStatusError) -> Self { + match err { + PollUpdateStatusError::StatusError(err) => err.into(), + PollUpdateStatusError::ClientError(err) => err.into(), + } + } } pub struct SpUpdater { @@ -58,6 +57,7 @@ impl SpUpdater { log: &Logger, ) -> Self { let log = log.new(slog::o!( + "component" => "SpUpdater", "sp_type" => format!("{sp_type:?}"), "sp_slot" => sp_slot, "update_id" => format!("{update_id}"), @@ -76,78 +76,38 @@ impl SpUpdater { /// multiple MGS instances are available and passed to this method and an /// error occurs communicating with one instance, `SpUpdater` will try the /// remaining instances before failing. - /// - /// # Panics - /// - /// If `mgs_clients` is empty. - pub async fn update>>>( + pub async fn update( self, - mgs_clients: T, + mut mgs_clients: MgsClients, ) -> Result<(), SpUpdateError> { - let mut mgs_clients = mgs_clients.into(); - assert!(!mgs_clients.is_empty()); - // The async blocks below want `&self` references, but we take `self` // for API clarity (to start a new SP update, the caller should // construct a new `SpUpdater`). Create a `&self` ref that we use // through the remainder of this method. let me = &self; - me.try_all_mgs_clients(&mut mgs_clients, |client| async move { - me.start_update_one_mgs(&client).await - }) - .await?; + mgs_clients + .try_all_serially(&self.log, |client| async move { + me.start_update_one_mgs(&client).await + }) + .await?; // `wait_for_update_completion` uses `try_all_mgs_clients` internally, // so we don't wrap it here. me.wait_for_update_completion(&mut mgs_clients).await?; - me.try_all_mgs_clients(&mut mgs_clients, |client| async move { - me.finalize_update_via_reset_one_mgs(&client).await - }) - .await?; + mgs_clients + .try_all_serially(&self.log, |client| async move { + me.finalize_update_via_reset_one_mgs(&client).await + }) + .await?; - Ok(()) - } + // wait for any progress watchers to be dropped before we return; + // otherwise, they'll get `RecvError`s when trying to check the current + // status + self.progress.closed().await; - // Helper method to run `op` against all clients. If `op` returns - // successfully for one client, that client will be rotated to the front of - // the list (so any subsequent operations can start with the first client). - async fn try_all_mgs_clients( - &self, - mgs_clients: &mut VecDeque>, - op: F, - ) -> Result - where - F: Fn(Arc) -> Fut, - Fut: Future>, - { - let mut last_err = None; - for (i, client) in mgs_clients.iter().enumerate() { - match op(Arc::clone(client)).await { - Ok(val) => { - // Shift our list of MGS clients such that the one we just - // used is at the front for subsequent requests. - mgs_clients.rotate_left(i); - return Ok(val); - } - // If we have an error communicating with an MGS instance - // (timeout, unexpected connection close, etc.), we'll move on - // and try the next MGS client. If this was the last client, - // we'll stash the error in `last_err` and return it below the - // loop. - Err(GatewayClientError::CommunicationError(err)) => { - last_err = Some(err); - continue; - } - Err(err) => return Err(err), - } - } - - // We know we have at least one `mgs_client`, so the only way to get - // here is if all clients failed with connection errors. Return the - // error from the last MGS we tried. 
- Err(GatewayClientError::CommunicationError(last_err.unwrap())) + Ok(()) } async fn start_update_one_mgs( @@ -183,142 +143,48 @@ impl SpUpdater { async fn wait_for_update_completion( &self, - mgs_clients: &mut VecDeque>, + mgs_clients: &mut MgsClients, ) -> Result<(), SpUpdateError> { // How frequently do we poll MGS for the update progress? const STATUS_POLL_INTERVAL: Duration = Duration::from_secs(3); loop { - let update_status = self - .try_all_mgs_clients(mgs_clients, |client| async move { - let update_status = client - .sp_component_update_status( - self.sp_type, - self.sp_slot, - SpComponent::SP_ITSELF.const_as_str(), - ) - .await?; - - info!( - self.log, "got SP update status"; - "mgs_addr" => client.baseurl(), - "status" => ?update_status, - ); - - Ok(update_status) - }) - .await? - .into_inner(); - - // The majority of possible update statuses indicate failure; we'll - // handle the small number of non-failure cases by either - // `continue`ing or `return`ing; all other branches will give us an - // error string that we can report. - let error_message = match update_status { - // For `Preparing` and `InProgress`, we could check the progress - // information returned by these steps and try to check that - // we're still _making_ progress, but every Nexus instance needs - // to do that anyway in case we (or the MGS instance delivering - // the update) crash, so we'll omit that check here. Instead, we - // just sleep and we'll poll again shortly. - SpUpdateStatus::Preparing { id, progress } => { - if id == self.update_id { - let progress = progress.and_then(|progress| { - if progress.current > progress.total { - warn!( - self.log, "nonsense SP preparing progress"; - "current" => progress.current, - "total" => progress.total, - ); - None - } else if progress.total == 0 { - None - } else { - Some( - f64::from(progress.current) - / f64::from(progress.total), - ) - } - }); - self.progress.send_replace(Some( - UpdateProgress::Preparing { progress }, - )); - tokio::time::sleep(STATUS_POLL_INTERVAL).await; - continue; - } else { - format!("different update is now preparing ({id})") - } - } - SpUpdateStatus::InProgress { - id, - bytes_received, - total_bytes, - } => { - if id == self.update_id { - let progress = if bytes_received > total_bytes { - warn!( - self.log, "nonsense SP progress"; - "bytes_received" => bytes_received, - "total_bytes" => total_bytes, - ); - None - } else if total_bytes == 0 { - None - } else { - Some( - f64::from(bytes_received) - / f64::from(total_bytes), - ) - }; - self.progress.send_replace(Some( - UpdateProgress::InProgress { progress }, - )); - tokio::time::sleep(STATUS_POLL_INTERVAL).await; - continue; - } else { - format!("different update is now in progress ({id})") - } - } - SpUpdateStatus::Complete { id } => { - if id == self.update_id { - self.progress.send_replace(Some( - UpdateProgress::InProgress { progress: Some(1.0) }, - )); - return Ok(()); - } else { - format!("different update is now in complete ({id})") - } + let status = mgs_clients + .poll_update_status( + self.sp_type, + self.sp_slot, + SpComponent::SP_ITSELF.const_as_str(), + self.update_id, + &self.log, + ) + .await?; + + // For `Preparing` and `InProgress`, we could check the progress + // information returned by these steps and try to check that + // we're still _making_ progress, but every Nexus instance needs + // to do that anyway in case we (or the MGS instance delivering + // the update) crash, so we'll omit that check here. Instead, we + // just sleep and we'll poll again shortly. 
+ match status { + PollUpdateStatus::Preparing { progress } => { + self.progress.send_replace(Some( + UpdateProgress::Preparing { progress }, + )); } - SpUpdateStatus::None => { - "update status lost (did the SP reset?)".to_string() + PollUpdateStatus::InProgress { progress } => { + self.progress.send_replace(Some( + UpdateProgress::InProgress { progress }, + )); } - SpUpdateStatus::Aborted { id } => { - if id == self.update_id { - "update was aborted".to_string() - } else { - format!("different update is now in complete ({id})") - } + PollUpdateStatus::Complete => { + self.progress.send_replace(Some( + UpdateProgress::InProgress { progress: Some(1.0) }, + )); + return Ok(()); } - SpUpdateStatus::Failed { code, id } => { - if id == self.update_id { - format!("update failed (error code {code})") - } else { - format!("different update failed ({id})") - } - } - SpUpdateStatus::RotError { id, message } => { - if id == self.update_id { - format!("update failed (rot error: {message})") - } else { - format!("different update failed with rot error ({id})") - } - } - }; + } - self.progress.send_replace(Some(UpdateProgress::Failed( - error_message.clone(), - ))); - return Err(SpUpdateError::FailedToComplete(error_message)); + tokio::time::sleep(STATUS_POLL_INTERVAL).await; } } diff --git a/nexus/src/updates.rs b/nexus/src/updates.rs index c2265096dc..2f57868acc 100644 --- a/nexus/src/updates.rs +++ b/nexus/src/updates.rs @@ -2,38 +2,38 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use buf_list::BufList; +use futures::TryStreamExt; use nexus_db_queries::db; use omicron_common::update::ArtifactsDocument; use std::convert::TryInto; -// TODO(iliana): make async/.await. awslabs/tough#213 -pub(crate) fn read_artifacts( +pub(crate) async fn read_artifacts( trusted_root: &[u8], mut base_url: String, ) -> Result< Vec, Box, > { - use std::io::Read; - if !base_url.ends_with('/') { base_url.push('/'); } let repository = tough::RepositoryLoader::new( - trusted_root, + &trusted_root, format!("{}metadata/", base_url).parse()?, format!("{}targets/", base_url).parse()?, ) - .load()?; + .load() + .await?; - let mut artifact_document = Vec::new(); - match repository.read_target(&"artifacts.json".parse()?)? { - Some(mut target) => target.read_to_end(&mut artifact_document)?, - None => return Err("artifacts.json missing".into()), - }; + let artifact_document = + match repository.read_target(&"artifacts.json".parse()?).await? 
{ + Some(target) => target.try_collect::().await?, + None => return Err("artifacts.json missing".into()), + }; let artifacts: ArtifactsDocument = - serde_json::from_slice(&artifact_document)?; + serde_json::from_reader(buf_list::Cursor::new(&artifact_document))?; let valid_until = repository .root() diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 647232031d..1e7de6132b 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -30,6 +30,7 @@ use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; use omicron_common::api::external::MacAddr; use omicron_common::api::external::{IdentityMetadata, Name}; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use omicron_common::api::internal::shared::SwitchLocation; use omicron_common::nexus_config; use omicron_common::nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; @@ -1092,6 +1093,7 @@ pub async fn start_producer_server( let producer_address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); let server_info = ProducerEndpoint { id, + kind: Some(ProducerKind::Service), address: producer_address, base_route: "/collect".to_string(), interval: Duration::from_secs(1), diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs index e0bb09de4f..87c5c74f0f 100644 --- a/nexus/tests/integration_tests/mod.rs +++ b/nexus/tests/integration_tests/mod.rs @@ -25,6 +25,7 @@ mod projects; mod rack; mod role_assignments; mod roles_builtin; +mod rot_updater; mod router_routes; mod saml; mod schema; diff --git a/nexus/tests/integration_tests/oximeter.rs b/nexus/tests/integration_tests/oximeter.rs index 65aaa18642..e97f36daf4 100644 --- a/nexus/tests/integration_tests/oximeter.rs +++ b/nexus/tests/integration_tests/oximeter.rs @@ -9,6 +9,7 @@ use http::StatusCode; use nexus_test_interface::NexusServer; use nexus_test_utils_macros::nexus_test; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use oximeter_db::DbWrite; use std::collections::BTreeSet; @@ -360,6 +361,7 @@ async fn test_oximeter_collector_reregistration_gets_all_assignments() { ids.insert(id); let info = ProducerEndpoint { id, + kind: Some(ProducerKind::Service), address: SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 12345), base_route: String::from("/collect"), interval: Duration::from_secs(1), diff --git a/nexus/tests/integration_tests/rot_updater.rs b/nexus/tests/integration_tests/rot_updater.rs new file mode 100644 index 0000000000..750f9571d0 --- /dev/null +++ b/nexus/tests/integration_tests/rot_updater.rs @@ -0,0 +1,627 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Tests `RotUpdater`'s delivery of updates to RoTs via MGS + +use gateway_client::types::{RotSlot, SpType}; +use gateway_messages::{SpPort, UpdateInProgressStatus, UpdateStatus}; +use gateway_test_utils::setup as mgs_setup; +use hubtools::RawHubrisArchive; +use hubtools::{CabooseBuilder, HubrisArchiveBuilder}; +use omicron_nexus::app::test_interfaces::{ + MgsClients, RotUpdater, UpdateProgress, +}; +use sp_sim::SimulatedSp; +use sp_sim::SIM_ROT_BOARD; +use std::mem; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; +use tokio::io::AsyncWriteExt; +use tokio::net::TcpListener; +use tokio::net::TcpStream; +use tokio::sync::mpsc; +use uuid::Uuid; + +fn make_fake_rot_image() -> Vec { + let caboose = CabooseBuilder::default() + .git_commit("fake-git-commit") + .board(SIM_ROT_BOARD) + .version("0.0.0") + .name("fake-name") + .build(); + + let mut builder = HubrisArchiveBuilder::with_fake_image(); + builder.write_caboose(caboose.as_slice()).unwrap(); + builder.build_to_vec().unwrap() +} + +#[tokio::test] +async fn test_rot_updater_updates_sled() { + // Start MGS + Sim SP. + let mgstestctx = + mgs_setup::test_setup("test_rot_updater_updates_sled", SpPort::One) + .await; + + // Configure an MGS client. + let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); + + // Configure and instantiate an `RotUpdater`. + let sp_type = SpType::Sled; + let sp_slot = 0; + let update_id = Uuid::new_v4(); + let hubris_archive = make_fake_rot_image(); + let target_rot_slot = RotSlot::B; + + let rot_updater = RotUpdater::new( + sp_type, + sp_slot, + target_rot_slot, + update_id, + hubris_archive.clone(), + &mgstestctx.logctx.log, + ); + + // Run the update. + rot_updater.update(mgs_clients).await.expect("update failed"); + + // Ensure the RoT received the complete update. + let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] + .last_rot_update_data() + .await + .expect("simulated RoT did not receive an update"); + + let hubris_archive = RawHubrisArchive::from_vec(hubris_archive).unwrap(); + + assert_eq!( + hubris_archive.image.data.as_slice(), + &*last_update_image, + "simulated RoT update contents (len {}) \ + do not match test generated fake image (len {})", + last_update_image.len(), + hubris_archive.image.data.len() + ); + + mgstestctx.teardown().await; +} + +#[tokio::test] +async fn test_rot_updater_updates_switch() { + // Start MGS + Sim SP. + let mgstestctx = + mgs_setup::test_setup("test_rot_updater_updates_switch", SpPort::One) + .await; + + // Configure an MGS client. 
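+    // (`from_clients` wraps each client in an `Arc` and panics on an empty
+    // list, so a single client is the minimal valid configuration.)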
+ let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); + + let sp_type = SpType::Switch; + let sp_slot = 0; + let update_id = Uuid::new_v4(); + let hubris_archive = make_fake_rot_image(); + let target_rot_slot = RotSlot::B; + + let rot_updater = RotUpdater::new( + sp_type, + sp_slot, + target_rot_slot, + update_id, + hubris_archive.clone(), + &mgstestctx.logctx.log, + ); + + rot_updater.update(mgs_clients).await.expect("update failed"); + + let last_update_image = mgstestctx.simrack.sidecars[sp_slot as usize] + .last_rot_update_data() + .await + .expect("simulated RoT did not receive an update"); + + let hubris_archive = RawHubrisArchive::from_vec(hubris_archive).unwrap(); + + assert_eq!( + hubris_archive.image.data.as_slice(), + &*last_update_image, + "simulated RoT update contents (len {}) \ + do not match test generated fake image (len {})", + last_update_image.len(), + hubris_archive.image.data.len() + ); + + mgstestctx.teardown().await; +} + +#[tokio::test] +async fn test_rot_updater_remembers_successful_mgs_instance() { + // Start MGS + Sim SP. + let mgstestctx = mgs_setup::test_setup( + "test_rot_updater_remembers_successful_mgs_instance", + SpPort::One, + ) + .await; + + // Also start a local TCP server that we will claim is an MGS instance, but + // it will close connections immediately after accepting them. This will + // allow us to count how many connections it receives, while simultaneously + // causing errors in the RotUpdater when it attempts to use this "MGS". + let (failing_mgs_task, failing_mgs_addr, failing_mgs_conn_counter) = { + let socket = TcpListener::bind("[::1]:0").await.unwrap(); + let addr = socket.local_addr().unwrap(); + let conn_count = Arc::new(AtomicUsize::new(0)); + + let task = { + let conn_count = Arc::clone(&conn_count); + tokio::spawn(async move { + loop { + let (mut stream, _peer) = socket.accept().await.unwrap(); + conn_count.fetch_add(1, Ordering::SeqCst); + stream.shutdown().await.unwrap(); + } + }) + }; + + (task, addr, conn_count) + }; + + // Order the MGS clients such that the bogus MGS that immediately closes + // connections comes first. `RotUpdater` should remember that the second MGS + // instance succeeds, and only send subsequent requests to it: we should + // only see a single attempted connection to the bogus MGS, even though + // delivering an update requires a bare minimum of three requests (start the + // update, query the status, reset the RoT) and often more (if repeated + // queries are required to wait for completion). 
+ let mgs_clients = MgsClients::from_clients([ + gateway_client::Client::new( + &format!("http://{failing_mgs_addr}"), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient1")), + ), + gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + ), + ]); + + let sp_type = SpType::Sled; + let sp_slot = 0; + let update_id = Uuid::new_v4(); + let hubris_archive = make_fake_rot_image(); + let target_rot_slot = RotSlot::B; + + let rot_updater = RotUpdater::new( + sp_type, + sp_slot, + target_rot_slot, + update_id, + hubris_archive.clone(), + &mgstestctx.logctx.log, + ); + + rot_updater.update(mgs_clients).await.expect("update failed"); + + let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] + .last_rot_update_data() + .await + .expect("simulated RoT did not receive an update"); + + let hubris_archive = RawHubrisArchive::from_vec(hubris_archive).unwrap(); + + assert_eq!( + hubris_archive.image.data.as_slice(), + &*last_update_image, + "simulated RoT update contents (len {}) \ + do not match test generated fake image (len {})", + last_update_image.len(), + hubris_archive.image.data.len() + ); + + // Check that our bogus MGS only received a single connection attempt. + // (After RotUpdater failed to talk to this instance, it should have fallen + // back to the valid one for all further requests.) + assert_eq!( + failing_mgs_conn_counter.load(Ordering::SeqCst), + 1, + "bogus MGS instance didn't receive the expected number of connections" + ); + failing_mgs_task.abort(); + + mgstestctx.teardown().await; +} + +#[tokio::test] +async fn test_rot_updater_switches_mgs_instances_on_failure() { + enum MgsProxy { + One(TcpStream), + Two(TcpStream), + } + + // Start MGS + Sim SP. + let mgstestctx = mgs_setup::test_setup( + "test_rot_updater_switches_mgs_instances_on_failure", + SpPort::One, + ) + .await; + let mgs_bind_addr = mgstestctx.client.bind_address; + + let spawn_mgs_proxy_task = |mut stream: TcpStream| { + tokio::spawn(async move { + let mut mgs_stream = TcpStream::connect(mgs_bind_addr) + .await + .expect("failed to connect to MGS"); + tokio::io::copy_bidirectional(&mut stream, &mut mgs_stream) + .await + .expect("failed to proxy connection to MGS"); + }) + }; + + // Start two MGS proxy tasks; when each receives an incoming TCP connection, + // it forwards that `TcpStream` along the `mgs_proxy_connections` channel + // along with a tag of which proxy it is. We'll use this below to flip flop + // between MGS "instances" (really these two proxies). + let (mgs_proxy_connections_tx, mut mgs_proxy_connections_rx) = + mpsc::unbounded_channel(); + let (mgs_proxy_one_task, mgs_proxy_one_addr) = { + let socket = TcpListener::bind("[::1]:0").await.unwrap(); + let addr = socket.local_addr().unwrap(); + let mgs_proxy_connections_tx = mgs_proxy_connections_tx.clone(); + let task = tokio::spawn(async move { + loop { + let (stream, _peer) = socket.accept().await.unwrap(); + mgs_proxy_connections_tx.send(MgsProxy::One(stream)).unwrap(); + } + }); + (task, addr) + }; + let (mgs_proxy_two_task, mgs_proxy_two_addr) = { + let socket = TcpListener::bind("[::1]:0").await.unwrap(); + let addr = socket.local_addr().unwrap(); + let task = tokio::spawn(async move { + loop { + let (stream, _peer) = socket.accept().await.unwrap(); + mgs_proxy_connections_tx.send(MgsProxy::Two(stream)).unwrap(); + } + }); + (task, addr) + }; + + // Disable connection pooling so each request gets a new TCP connection. 
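+    // (With pooling enabled, a connection dropped by a proxy would not force
+    // the next request to re-dial, and the swap accounting below would be
+    // off.)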
+ let client = + reqwest::Client::builder().pool_max_idle_per_host(0).build().unwrap(); + + // Configure two MGS clients pointed at our two proxy tasks. + let mgs_clients = MgsClients::from_clients([ + gateway_client::Client::new_with_client( + &format!("http://{mgs_proxy_one_addr}"), + client.clone(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient1")), + ), + gateway_client::Client::new_with_client( + &format!("http://{mgs_proxy_two_addr}"), + client, + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient2")), + ), + ]); + + let sp_type = SpType::Sled; + let sp_slot = 0; + let update_id = Uuid::new_v4(); + let hubris_archive = make_fake_rot_image(); + let target_rot_slot = RotSlot::B; + + let rot_updater = RotUpdater::new( + sp_type, + sp_slot, + target_rot_slot, + update_id, + hubris_archive.clone(), + &mgstestctx.logctx.log, + ); + + // Spawn the actual update task. + let mut update_task = tokio::spawn(rot_updater.update(mgs_clients)); + + // Loop over incoming requests. We expect this sequence: + // + // 1. Connection arrives on the first proxy + // 2. We spawn a task to service that request, and set `should_swap` + // 3. Connection arrives on the first proxy + // 4. We drop that connection, flip `expected_proxy`, and clear + // `should_swap` + // 5. Connection arrives on the second proxy + // 6. We spawn a task to service that request, and set `should_swap` + // 7. Connection arrives on the second proxy + // 8. We drop that connection, flip `expected_proxy`, and clear + // `should_swap` + // + // ... repeat until the update is complete. + let mut expected_proxy = 0; + let mut proxy_one_count = 0; + let mut proxy_two_count = 0; + let mut total_requests_handled = 0; + let mut should_swap = false; + loop { + tokio::select! { + Some(proxy_stream) = mgs_proxy_connections_rx.recv() => { + let stream = match proxy_stream { + MgsProxy::One(stream) => { + assert_eq!(expected_proxy, 0); + proxy_one_count += 1; + stream + } + MgsProxy::Two(stream) => { + assert_eq!(expected_proxy, 1); + proxy_two_count += 1; + stream + } + }; + + // Should we trigger `RotUpdater` to swap to the other MGS + // (proxy)? If so, do that by dropping this connection (which + // will cause a client failure) and note that we expect the next + // incoming request to come on the other proxy. + if should_swap { + mem::drop(stream); + expected_proxy ^= 1; + should_swap = false; + } else { + // Otherwise, handle this connection. + total_requests_handled += 1; + spawn_mgs_proxy_task(stream); + should_swap = true; + } + } + + result = &mut update_task => { + match result { + Ok(Ok(())) => { + mgs_proxy_one_task.abort(); + mgs_proxy_two_task.abort(); + break; + } + Ok(Err(err)) => panic!("update failed: {err}"), + Err(err) => panic!("update task panicked: {err}"), + } + } + } + } + + // An RoT update requires a minimum of 4 requests to MGS: post the update, + // check the status, post to mark the new target slot active, and post an + // RoT reset. There may be more requests if the update is not yet complete + // when the status is checked, but we can just check that each of our + // proxies received at least 2 incoming requests; based on our outline + // above, if we got the minimum of 4 requests, it would look like this: + // + // 1. POST update -> first proxy (success) + // 2. GET status -> first proxy (fail) + // 3. GET status retry -> second proxy (success) + // 4. POST new target slot -> second proxy (fail) + // 5. POST new target slot -> first proxy (success) + // 6. 
POST reset -> first proxy (fail) + // 7. POST reset -> second proxy (success) + // + // This pattern would repeat if multiple status requests were required, so + // we always expect the first proxy to see exactly one more connection + // attempt than the second (because it went first before they started + // swapping), and the two together should see a total of one less than + // double the number of successful requests required. + assert!(total_requests_handled >= 4); + assert_eq!(proxy_one_count, proxy_two_count + 1); + assert_eq!( + (proxy_one_count + proxy_two_count + 1) / 2, + total_requests_handled + ); + + let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] + .last_rot_update_data() + .await + .expect("simulated RoT did not receive an update"); + + let hubris_archive = RawHubrisArchive::from_vec(hubris_archive).unwrap(); + + assert_eq!( + hubris_archive.image.data.as_slice(), + &*last_update_image, + "simulated RoT update contents (len {}) \ + do not match test generated fake image (len {})", + last_update_image.len(), + hubris_archive.image.data.len() + ); + + mgstestctx.teardown().await; +} + +#[tokio::test] +async fn test_rot_updater_delivers_progress() { + // Start MGS + Sim SP. + let mgstestctx = mgs_setup::test_setup( + "test_rot_updater_delivers_progress", + SpPort::One, + ) + .await; + + // Configure an MGS client. + let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); + + let sp_type = SpType::Sled; + let sp_slot = 0; + let update_id = Uuid::new_v4(); + let hubris_archive = make_fake_rot_image(); + let target_rot_slot = RotSlot::B; + + let rot_updater = RotUpdater::new( + sp_type, + sp_slot, + target_rot_slot, + update_id, + hubris_archive.clone(), + &mgstestctx.logctx.log, + ); + + let hubris_archive = RawHubrisArchive::from_vec(hubris_archive).unwrap(); + let rot_image_len = hubris_archive.image.data.len() as u32; + + // Subscribe to update progress, and check that there is no status yet; we + // haven't started the update. + let mut progress = rot_updater.progress_watcher(); + assert_eq!(*progress.borrow_and_update(), None); + + // Install a semaphore on the requests our target SP will receive so we can + // inspect progress messages without racing. + let target_sp = &mgstestctx.simrack.gimlets[sp_slot as usize]; + let sp_accept_sema = target_sp.install_udp_accept_semaphore().await; + let mut sp_responses = target_sp.responses_sent_count().unwrap(); + + // Spawn the update on a background task so we can watch `progress` as it is + // applied. + let do_update_task = tokio::spawn(rot_updater.update(mgs_clients)); + + // Allow the SP to respond to 1 message: the "prepare update" message that + // triggers the start of an update, then ensure we see the "started" + // progress. + sp_accept_sema.send(1).unwrap(); + progress.changed().await.unwrap(); + assert_eq!(*progress.borrow_and_update(), Some(UpdateProgress::Started)); + + // Ensure our simulated SP is in the state we expect: it's prepared for an + // update but has not yet received any data. + assert_eq!( + target_sp.current_update_status().await, + UpdateStatus::InProgress(UpdateInProgressStatus { + id: update_id.into(), + bytes_received: 0, + total_size: rot_image_len, + }) + ); + + // Record the number of responses the SP has sent; we'll use + // `sp_responses.changed()` in the loop below, and want to mark whatever + // value this watch channel currently has as seen.
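A brief aside on the watch-channel idiom the comment above describes, since the call on the next line depends on it: `borrow_and_update()` marks the receiver's current value as seen, so a later `changed().await` only completes once a genuinely new value arrives. A minimal sketch (the values are arbitrary):

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(0u32);
    tx.send(1).unwrap();

    // Mark the current value (1) as seen. Without this, the
    // `changed()` below would complete immediately.
    rx.borrow_and_update();

    let sender = tokio::spawn(async move { tx.send(2).unwrap() });

    // Completes only once a new value (2) is observed.
    rx.changed().await.unwrap();
    assert_eq!(*rx.borrow_and_update(), 2);
    sender.await.unwrap();
}
```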
+ sp_responses.borrow_and_update(); + + // At this point, there are two clients racing each other to talk to our + // simulated SP: + // + // 1. MGS is trying to deliver the update + // 2. `rot_updater` is trying to poll (via MGS) for update status + // + // and we want to ensure that we see any relevant progress reports from + // `rot_updater`. We'll let one MGS -> SP message through at a time (waiting + // until our SP has responded by waiting for a change to `sp_responses`) + // then check its update state: if it changed, the packet we let through was + // data from MGS; otherwise, it was a status request from `rot_updater`. + // + // This loop will continue until either: + // + // 1. We see an `UpdateStatus::InProgress` message indicating 100% delivery, + // at which point we break out of the loop + // 2. We time out waiting for the previous step (by timing out for either + // the SP to process a request or `rot_updater` to realize there's been + // progress), at which point we panic and fail this test. + let mut prev_bytes_received = 0; + let mut expect_progress_change = false; + loop { + // Allow the SP to accept and respond to a single UDP packet. + sp_accept_sema.send(1).unwrap(); + + // Wait until the SP has sent a response, with a safety rail that we + // haven't screwed up our untangle-the-race logic: if we don't see the + // SP process any new messages after several seconds, our test is + // broken, so fail. + tokio::time::timeout(Duration::from_secs(10), sp_responses.changed()) + .await + .expect("timeout waiting for SP response count to change") + .expect("sp response count sender dropped"); + + // Inspect the SP's in-memory update state; we expect only `InProgress` + // or `Complete`, and in either case we note whether we expect to see + // status changes from `rot_updater`. + match target_sp.current_update_status().await { + UpdateStatus::InProgress(rot_progress) => { + if rot_progress.bytes_received > prev_bytes_received { + prev_bytes_received = rot_progress.bytes_received; + expect_progress_change = true; + continue; + } + } + UpdateStatus::Complete(_) => { + if prev_bytes_received < rot_image_len { + prev_bytes_received = rot_image_len; + continue; + } + } + status @ (UpdateStatus::None + | UpdateStatus::Preparing(_) + | UpdateStatus::SpUpdateAuxFlashChckScan { .. } + | UpdateStatus::Aborted(_) + | UpdateStatus::Failed { .. } + | UpdateStatus::RotError { .. }) => { + panic!("unexpected status {status:?}"); + } + } + + // If we get here, the most recent packet did _not_ change the SP's + // internal update state, so it was a status request from `rot_updater`. + // If we expect the updater to see new progress, wait for that change + // here. + if expect_progress_change || prev_bytes_received == rot_image_len { + // Safety rail that we haven't screwed up our untangle-the-race + // logic: if we don't see new progress after several seconds, our + // test is broken, so fail. + tokio::time::timeout(Duration::from_secs(10), progress.changed()) + .await + .expect("progress timeout") + .expect("progress watch sender dropped"); + let status = progress.borrow_and_update().clone().unwrap(); + expect_progress_change = false; + + // We're done if we've observed the final progress message.
+ if let UpdateProgress::InProgress { progress: Some(value) } = status + { + if value == 1.0 { + break; + } + } else { + panic!("unexpected progress status {status:?}"); + } + } + } + + // The update has been fully delivered to the SP, but we don't see an + // `UpdateStatus::Complete` message until the RoT is reset. Release the SP + // semaphore since we're no longer racing to observe intermediate progress, + // and wait for the completion message. + sp_accept_sema.send(usize::MAX).unwrap(); + progress.changed().await.unwrap(); + assert_eq!(*progress.borrow_and_update(), Some(UpdateProgress::Complete)); + + // drop our progress receiver so `do_update_task` can complete + mem::drop(progress); + + do_update_task.await.expect("update task panicked").expect("update failed"); + + let last_update_image = target_sp + .last_rot_update_data() + .await + .expect("simulated RoT did not receive an update"); + + assert_eq!( + hubris_archive.image.data.as_slice(), + &*last_update_image, + "simulated RoT update contents (len {}) \ + do not match test generated fake image (len {})", + last_update_image.len(), + hubris_archive.image.data.len() + ); + + mgstestctx.teardown().await; +} diff --git a/nexus/tests/integration_tests/sp_updater.rs b/nexus/tests/integration_tests/sp_updater.rs index 351c28ad9c..89735ac3d9 100644 --- a/nexus/tests/integration_tests/sp_updater.rs +++ b/nexus/tests/integration_tests/sp_updater.rs @@ -9,7 +9,9 @@ use gateway_messages::{SpPort, UpdateInProgressStatus, UpdateStatus}; use gateway_test_utils::setup as mgs_setup; use hubtools::RawHubrisArchive; use hubtools::{CabooseBuilder, HubrisArchiveBuilder}; -use omicron_nexus::app::test_interfaces::{SpUpdater, UpdateProgress}; +use omicron_nexus::app::test_interfaces::{ + MgsClients, SpUpdater, UpdateProgress, +}; use sp_sim::SimulatedSp; use sp_sim::SIM_GIMLET_BOARD; use sp_sim::SIM_SIDECAR_BOARD; @@ -44,10 +46,10 @@ async fn test_sp_updater_updates_sled() { .await; // Configure an MGS client.
- let mgs_client = Arc::new(gateway_client::Client::new( + let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( &mgstestctx.client.url("/").to_string(), mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )); + )]); let sp_type = SpType::Switch; let sp_slot = 0; @@ -112,10 +114,10 @@ async fn test_sp_updater_updates_switch() { &mgstestctx.logctx.log, ); - sp_updater.update([mgs_client]).await.expect("update failed"); + sp_updater.update(mgs_clients).await.expect("update failed"); let last_update_image = mgstestctx.simrack.sidecars[sp_slot as usize] - .last_update_data() + .last_sp_update_data() .await .expect("simulated SP did not receive an update"); @@ -172,16 +174,16 @@ async fn test_sp_updater_remembers_successful_mgs_instance() { // delivering an update requires a bare minimum of three requests (start the // update, query the status, reset the SP) and often more (if repeated // queries are required to wait for completion). - let mgs_clients = [ - Arc::new(gateway_client::Client::new( + let mgs_clients = MgsClients::from_clients([ + gateway_client::Client::new( &format!("http://{failing_mgs_addr}"), mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient1")), - )), - Arc::new(gateway_client::Client::new( + ), + gateway_client::Client::new( &mgstestctx.client.url("/").to_string(), mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )), - ]; + ), + ]); let sp_type = SpType::Sled; let sp_slot = 0; @@ -199,7 +201,7 @@ async fn test_sp_updater_remembers_successful_mgs_instance() { sp_updater.update(mgs_clients).await.expect("update failed"); let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] - .last_update_data() + .last_sp_update_data() .await .expect("simulated SP did not receive an update"); @@ -288,18 +290,18 @@ async fn test_sp_updater_switches_mgs_instances_on_failure() { reqwest::Client::builder().pool_max_idle_per_host(0).build().unwrap(); // Configure two MGS clients pointed at our two proxy tasks. - let mgs_clients = [ - Arc::new(gateway_client::Client::new_with_client( + let mgs_clients = MgsClients::from_clients([ + gateway_client::Client::new_with_client( &format!("http://{mgs_proxy_one_addr}"), client.clone(), mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient1")), - )), - Arc::new(gateway_client::Client::new_with_client( + ), + gateway_client::Client::new_with_client( &format!("http://{mgs_proxy_two_addr}"), client, mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient2")), - )), - ]; + ), + ]); let sp_type = SpType::Sled; let sp_slot = 0; @@ -408,7 +410,7 @@ async fn test_sp_updater_switches_mgs_instances_on_failure() { ); let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] - .last_update_data() + .last_sp_update_data() .await .expect("simulated SP did not receive an update"); @@ -434,10 +436,10 @@ async fn test_sp_updater_delivers_progress() { .await; // Configure an MGS client. - let mgs_client = Arc::new(gateway_client::Client::new( + let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( &mgstestctx.client.url("/").to_string(), mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )); + )]); let sp_type = SpType::Sled; let sp_slot = 0; @@ -468,7 +470,7 @@ async fn test_sp_updater_delivers_progress() { // Spawn the update on a background task so we can watch `progress` as it is // applied. 
- let do_update_task = tokio::spawn(sp_updater.update([mgs_client])); + let do_update_task = tokio::spawn(sp_updater.update(mgs_clients)); // Allow the SP to respond to 2 messages: the caboose check and the "prepare // update" messages that trigger the start of an update, then ensure we see @@ -589,10 +591,13 @@ async fn test_sp_updater_delivers_progress() { progress.changed().await.unwrap(); assert_eq!(*progress.borrow_and_update(), Some(UpdateProgress::Complete)); + // drop our progress receiver so `do_update_task` can complete + mem::drop(progress); + do_update_task.await.expect("update task panicked").expect("update failed"); let last_update_image = target_sp - .last_update_data() + .last_sp_update_data() .await .expect("simulated SP did not receive an update"); diff --git a/nexus/tests/integration_tests/switch_port.rs b/nexus/tests/integration_tests/switch_port.rs index ccd0b50fbe..d163fc6b06 100644 --- a/nexus/tests/integration_tests/switch_port.rs +++ b/nexus/tests/integration_tests/switch_port.rs @@ -318,4 +318,19 @@ async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) { .execute() .await .unwrap(); + + // clear port settings + + NexusRequest::new( + RequestBuilder::new( + client, + Method::DELETE, + &format!("/v1/system/hardware/switch-port/qsfp0/settings?rack_id={rack_id}&switch_location=switch0"), + ) + .expect_status(Some(StatusCode::NO_CONTENT)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap(); } diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 918d5ac100..891166ed19 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -7,6 +7,7 @@ // - test that an unknown artifact returns 404, not 500 // - tests around target names and artifact names that contain dangerous paths like `../` +use async_trait::async_trait; use chrono::{Duration, Utc}; use dropshot::test_util::LogContext; use dropshot::{ @@ -45,17 +46,22 @@ const UPDATE_COMPONENT: &'static str = "omicron-test-component"; #[tokio::test] async fn test_update_end_to_end() { let mut config = load_test_config(); + let logctx = LogContext::new("test_update_end_to_end", &config.pkg.log); // build the TUF repo let rng = SystemRandom::new(); - let tuf_repo = new_tuf_repo(&rng); + let tuf_repo = new_tuf_repo(&rng).await; + slog::info!( + logctx.log, + "TUF repo created at {}", + tuf_repo.path().display() + ); // serve it over HTTP let dropshot_config = Default::default(); let mut api = ApiDescription::new(); api.register(static_content).unwrap(); let context = FileServerContext { base: tuf_repo.path().to_owned() }; - let logctx = LogContext::new("test_update_end_to_end", &config.pkg.log); let server = HttpServerStarter::new(&dropshot_config, api, context, &logctx.log) .unwrap() @@ -122,9 +128,14 @@ async fn static_content( for component in path.into_inner().path { fs_path.push(component); } - let body = tokio::fs::read(fs_path) - .await - .map_err(|e| HttpError::for_bad_request(None, e.to_string()))?; + let body = tokio::fs::read(fs_path).await.map_err(|e| { + // tough 0.15+ depend on ENOENT being translated into 404. + if e.kind() == std::io::ErrorKind::NotFound { + HttpError::for_not_found(None, e.to_string()) + } else { + HttpError::for_bad_request(None, e.to_string()) + } + })?; Ok(Response::builder().status(StatusCode::OK).body(body.into())?) 
} @@ -132,7 +143,7 @@ async fn static_content( const TARGET_CONTENTS: &[u8] = b"hello world".as_slice(); -fn new_tuf_repo(rng: &dyn SecureRandom) -> TempDir { +async fn new_tuf_repo(rng: &(dyn SecureRandom + Sync)) -> TempDir { let version = NonZeroU64::new(Utc::now().timestamp().try_into().unwrap()).unwrap(); let expires = Utc::now() + Duration::minutes(5); @@ -180,13 +191,14 @@ fn new_tuf_repo(rng: &dyn SecureRandom) -> TempDir { &signing_keys, rng, ) + .await .unwrap(); // TODO(iliana): there's no way to create a `RepositoryEditor` without having the root.json on // disk. this is really unergonomic. write and upstream a fix let mut root_tmp = NamedTempFile::new().unwrap(); root_tmp.as_file_mut().write_all(signed_root.buffer()).unwrap(); - let mut editor = RepositoryEditor::new(&root_tmp).unwrap(); + let mut editor = RepositoryEditor::new(&root_tmp).await.unwrap(); root_tmp.close().unwrap(); editor @@ -200,19 +212,20 @@ fn new_tuf_repo(rng: &dyn SecureRandom) -> TempDir { .timestamp_expires(expires); let (targets_dir, target_names) = generate_targets(); for target in target_names { - editor.add_target_path(targets_dir.path().join(target)).unwrap(); + editor.add_target_path(targets_dir.path().join(target)).await.unwrap(); } - let signed_repo = editor.sign(&signing_keys).unwrap(); + let signed_repo = editor.sign(&signing_keys).await.unwrap(); let repo = TempDir::new().unwrap(); - signed_repo.write(repo.path().join("metadata")).unwrap(); + signed_repo.write(repo.path().join("metadata")).await.unwrap(); signed_repo .copy_targets( targets_dir, repo.path().join("targets"), PathExists::Fail, ) + .await .unwrap(); repo @@ -257,8 +270,9 @@ impl Debug for KeyKeySource { } } +#[async_trait] impl KeySource for KeyKeySource { - fn as_sign( + async fn as_sign( &self, ) -> Result, Box> { @@ -267,7 +281,7 @@ impl KeySource for KeyKeySource { Ok(Box::new(Ed25519KeyPair::from_pkcs8(self.0.as_ref()).unwrap())) } - fn write( + async fn write( &self, _value: &str, _key_id_hex: &str, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index fcb285d9eb..c358b4109b 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4322,17 +4322,34 @@ "type": "object", "properties": { "address": { + "description": "The IP address and port at which `oximeter` can collect metrics from the producer.", "type": "string" }, "base_route": { + "description": "The API base route from which `oximeter` can collect metrics.\n\nThe full route is `{base_route}/{id}`.", "type": "string" }, "id": { + "description": "A unique ID for this producer.", "type": "string", "format": "uuid" }, "interval": { - "$ref": "#/components/schemas/Duration" + "description": "The interval on which `oximeter` should collect metrics.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "kind": { + "nullable": true, + "description": "The kind of producer.", + "allOf": [ + { + "$ref": "#/components/schemas/ProducerKind" + } + ] } }, "required": [ @@ -4342,6 +4359,32 @@ "interval" ] }, + "ProducerKind": { + "description": "The kind of metric producer this is.", + "oneOf": [ + { + "description": "The producer is a sled-agent.", + "type": "string", + "enum": [ + "sled_agent" + ] + }, + { + "description": "The producer is an Omicron-managed service.", + "type": "string", + "enum": [ + "service" + ] + }, + { + "description": "The producer is a Propolis VMM managing a guest instance.", + "type": "string", + "enum": [ + "instance" + ] + } + ] + }, "ProducerResultsItem": { "oneOf": [ { diff --git 
a/openapi/oximeter.json b/openapi/oximeter.json index 529d20e921..f7e534c95d 100644 --- a/openapi/oximeter.json +++ b/openapi/oximeter.json @@ -191,17 +191,34 @@ "type": "object", "properties": { "address": { + "description": "The IP address and port at which `oximeter` can collect metrics from the producer.", "type": "string" }, "base_route": { + "description": "The API base route from which `oximeter` can collect metrics.\n\nThe full route is `{base_route}/{id}`.", "type": "string" }, "id": { + "description": "A unique ID for this producer.", "type": "string", "format": "uuid" }, "interval": { - "$ref": "#/components/schemas/Duration" + "description": "The interval on which `oximeter` should collect metrics.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "kind": { + "nullable": true, + "description": "The kind of producer.", + "allOf": [ + { + "$ref": "#/components/schemas/ProducerKind" + } + ] } }, "required": [ @@ -231,6 +248,32 @@ "required": [ "items" ] + }, + "ProducerKind": { + "description": "The kind of metric producer this is.", + "oneOf": [ + { + "description": "The producer is a sled-agent.", + "type": "string", + "enum": [ + "sled_agent" + ] + }, + { + "description": "The producer is an Omicron-managed service.", + "type": "string", + "enum": [ + "service" + ] + }, + { + "description": "The producer is a Propolis VMM managing a guest instance.", + "type": "string", + "enum": [ + "instance" + ] + } + ] } }, "responses": { diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index 23ff32ed66..f6da172909 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -648,6 +648,7 @@ mod tests { use hyper::Response; use hyper::Server; use hyper::StatusCode; + use omicron_common::api::internal::nexus::ProducerKind; use omicron_test_utils::dev::test_setup_log; use std::convert::Infallible; use std::net::Ipv6Addr; @@ -694,6 +695,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: Some(ProducerKind::Service), address, base_route: String::from("/"), interval, @@ -752,6 +754,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: Some(ProducerKind::Service), address: SocketAddr::V6(SocketAddrV6::new( Ipv6Addr::LOCALHOST, 0, @@ -840,6 +843,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: Some(ProducerKind::Service), address, base_route: String::from("/"), interval, diff --git a/oximeter/producer/examples/producer.rs b/oximeter/producer/examples/producer.rs index dd9722c80a..baa4f57bf7 100644 --- a/oximeter/producer/examples/producer.rs +++ b/oximeter/producer/examples/producer.rs @@ -15,6 +15,7 @@ use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use dropshot::HandlerTaskMode; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use oximeter::types::Cumulative; use oximeter::types::ProducerRegistry; use oximeter::types::Sample; @@ -124,6 +125,7 @@ async fn main() -> anyhow::Result<()> { registry.register_producer(producer).unwrap(); let server_info = ProducerEndpoint { id: registry.producer_id(), + kind: Some(ProducerKind::Service), address: args.address, base_route: "/collect".to_string(), interval: Duration::from_secs(10), diff --git a/schema/crdb/11.0.0/README.md b/schema/crdb/10.0.0/README.md similarity index 100% rename from 
schema/crdb/11.0.0/README.md rename to schema/crdb/10.0.0/README.md diff --git a/schema/crdb/11.0.0/up01.sql b/schema/crdb/10.0.0/up01.sql similarity index 100% rename from schema/crdb/11.0.0/up01.sql rename to schema/crdb/10.0.0/up01.sql diff --git a/schema/crdb/11.0.0/up02.sql b/schema/crdb/10.0.0/up02.sql similarity index 100% rename from schema/crdb/11.0.0/up02.sql rename to schema/crdb/10.0.0/up02.sql diff --git a/schema/crdb/11.0.0/up03.sql b/schema/crdb/10.0.0/up03.sql similarity index 100% rename from schema/crdb/11.0.0/up03.sql rename to schema/crdb/10.0.0/up03.sql diff --git a/schema/crdb/11.0.0/up04.sql b/schema/crdb/10.0.0/up04.sql similarity index 100% rename from schema/crdb/11.0.0/up04.sql rename to schema/crdb/10.0.0/up04.sql diff --git a/schema/crdb/11.0.0/up05.sql b/schema/crdb/10.0.0/up05.sql similarity index 100% rename from schema/crdb/11.0.0/up05.sql rename to schema/crdb/10.0.0/up05.sql diff --git a/schema/crdb/11.0.0/up06.sql b/schema/crdb/10.0.0/up06.sql similarity index 100% rename from schema/crdb/11.0.0/up06.sql rename to schema/crdb/10.0.0/up06.sql diff --git a/schema/crdb/11.0.0/up07.sql b/schema/crdb/10.0.0/up07.sql similarity index 100% rename from schema/crdb/11.0.0/up07.sql rename to schema/crdb/10.0.0/up07.sql diff --git a/schema/crdb/11.0.0/up08.sql b/schema/crdb/10.0.0/up08.sql similarity index 100% rename from schema/crdb/11.0.0/up08.sql rename to schema/crdb/10.0.0/up08.sql diff --git a/schema/crdb/11.0.0/up09.sql b/schema/crdb/10.0.0/up09.sql similarity index 100% rename from schema/crdb/11.0.0/up09.sql rename to schema/crdb/10.0.0/up09.sql diff --git a/schema/crdb/11.0.0/up10.sql b/schema/crdb/10.0.0/up10.sql similarity index 100% rename from schema/crdb/11.0.0/up10.sql rename to schema/crdb/10.0.0/up10.sql diff --git a/schema/crdb/11.0.0/up11.sql b/schema/crdb/10.0.0/up11.sql similarity index 100% rename from schema/crdb/11.0.0/up11.sql rename to schema/crdb/10.0.0/up11.sql diff --git a/schema/crdb/11.0.0/up12.sql b/schema/crdb/10.0.0/up12.sql similarity index 100% rename from schema/crdb/11.0.0/up12.sql rename to schema/crdb/10.0.0/up12.sql diff --git a/schema/crdb/12.0.0/up01.sql b/schema/crdb/12.0.0/up01.sql new file mode 100644 index 0000000000..36f2f810ca --- /dev/null +++ b/schema/crdb/12.0.0/up01.sql @@ -0,0 +1,27 @@ +/* + * Drop the entire metric producer assignment table. + * + * Programs wishing to produce metrics need to register with Nexus. That creates + * an assignment of the producer to a collector, which is recorded in this + * table. That registration is idempotent, and every _current_ producer will + * register when it restarts. For example, `dpd` includes a task that registers + * with Nexus, so each time it (re)starts, that registration will happen. + * + * With that in mind, dropping this table is safe, because as of today, all + * software updates require that the whole control plane be offline. We know + * that these entries will be recreated shortly, as the services registering + * producers are restarted. + * + * The current metric producers are: + * + * - `dpd` + * - Each `nexus` instance + * - Each `sled-agent` instance + * - The Propolis server for each guest Instance + * + * Another reason we're dropping the table is that we will add a new column, + * `kind`, in a following update file, but we don't have a good way to backfill + * that value for existing rows. We also don't need to, because these services + * will soon reregister, and provide us with a value.
+ */ +DROP TABLE IF EXISTS omicron.public.metric_producer; diff --git a/schema/crdb/12.0.0/up02.sql b/schema/crdb/12.0.0/up02.sql new file mode 100644 index 0000000000..96c4c5d6b4 --- /dev/null +++ b/schema/crdb/12.0.0/up02.sql @@ -0,0 +1,11 @@ +/* + * The kind of metric producer each record corresponds to. + */ +CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( + -- A sled agent for an entry in the sled table. + 'sled_agent', + -- A service in the omicron.public.service table + 'service', + -- A Propolis VMM for an instance in the omicron.public.instance table + 'instance' +); diff --git a/schema/crdb/12.0.0/up03.sql b/schema/crdb/12.0.0/up03.sql new file mode 100644 index 0000000000..fc57667541 --- /dev/null +++ b/schema/crdb/12.0.0/up03.sql @@ -0,0 +1,17 @@ +/* + * Recreate the metric producer assignment table. + * + * Note that we're adding the `kind` column here, using the new enum in the + * previous update SQL file. + */ +CREATE TABLE IF NOT EXISTS omicron.public.metric_producer ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + time_modified TIMESTAMPTZ NOT NULL, + kind omicron.public.producer_kind, + ip INET NOT NULL, + port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, + interval FLOAT NOT NULL, + base_route STRING(512) NOT NULL, + oximeter_id UUID NOT NULL +); diff --git a/schema/crdb/12.0.0/up04.sql b/schema/crdb/12.0.0/up04.sql new file mode 100644 index 0000000000..cad33ddcf2 --- /dev/null +++ b/schema/crdb/12.0.0/up04.sql @@ -0,0 +1,8 @@ +/* + * Recreate index to support looking up a producer by its assigned oximeter + * collector ID. + */ +CREATE UNIQUE INDEX IF NOT EXISTS lookup_producer_by_oximeter ON omicron.public.metric_producer ( + oximeter_id, + id +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 104261afac..321f30138a 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1108,6 +1108,18 @@ CREATE TABLE IF NOT EXISTS omicron.public.oximeter ( port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL ); +/* + * The kind of metric producer each record corresponds to. + */ +CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( + -- A sled agent for an entry in the sled table. + 'sled_agent', + -- A service in the omicron.public.service table + 'service', + -- A Propolis VMM for an instance in the omicron.public.instance table + 'instance' +); + /* * Information about registered metric producers. 
*/ @@ -1115,6 +1127,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.metric_producer ( id UUID PRIMARY KEY, time_created TIMESTAMPTZ NOT NULL, time_modified TIMESTAMPTZ NOT NULL, + kind omicron.public.producer_kind, ip INET NOT NULL, port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, interval FLOAT NOT NULL, diff --git a/sled-agent/src/sim/disk.rs b/sled-agent/src/sim/disk.rs index 0f08289b74..f131fd2bff 100644 --- a/sled-agent/src/sim/disk.rs +++ b/sled-agent/src/sim/disk.rs @@ -17,6 +17,7 @@ use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use oximeter_producer::LogConfig; use oximeter_producer::Server as ProducerServer; use propolis_client::types::DiskAttachmentState as PropolisDiskState; @@ -168,6 +169,7 @@ impl SimDisk { let producer_address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); let server_info = ProducerEndpoint { id, + kind: Some(ProducerKind::SledAgent), address: producer_address, base_route: "/collect".to_string(), interval: Duration::from_millis(200), diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 9826a987d4..cfa8c5d7ca 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -43,6 +43,7 @@ use omicron_common::address::{ }; use omicron_common::api::external::Vni; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use omicron_common::api::internal::nexus::{ SledInstanceState, VmmRuntimeState, }; @@ -504,6 +505,7 @@ impl SledAgent { // Nexus. This should not block progress here. let endpoint = ProducerEndpoint { id: request.body.id, + kind: Some(ProducerKind::SledAgent), address: sled_address.into(), base_route: String::from("/metrics/collect"), interval: crate::metrics::METRIC_COLLECTION_INTERVAL, diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs index b3c4637f5d..635e8fde6b 100644 --- a/sp-sim/src/gimlet.rs +++ b/sp-sim/src/gimlet.rs @@ -14,6 +14,7 @@ use crate::server::UdpServer; use crate::update::SimSpUpdate; use crate::Responsiveness; use crate::SimulatedSp; +use crate::SIM_ROT_BOARD; use anyhow::{anyhow, bail, Context, Result}; use async_trait::async_trait; use futures::future; @@ -110,10 +111,16 @@ impl SimulatedSp for Gimlet { self.rot.lock().unwrap().handle_deserialized(request) } - async fn last_update_data(&self) -> Option> { + async fn last_sp_update_data(&self) -> Option> { let handler = self.handler.as_ref()?; let handler = handler.lock().await; - handler.update_state.last_update_data() + handler.update_state.last_sp_update_data() + } + + async fn last_rot_update_data(&self) -> Option> { + let handler = self.handler.as_ref()?; + let handler = handler.lock().await; + handler.update_state.last_rot_update_data() } async fn current_update_status(&self) -> gateway_messages::UpdateStatus { @@ -576,7 +583,7 @@ struct Handler { power_state: PowerState, startup_options: StartupOptions, update_state: SimSpUpdate, - reset_pending: bool, + reset_pending: Option, // To simulate an SP reset, we should (after doing whatever housekeeping we // need to track the reset) intentionally _fail_ to respond to the request, @@ -618,7 +625,7 @@ impl Handler { power_state: PowerState::A2, startup_options: StartupOptions::empty(), update_state: SimSpUpdate::default(), - reset_pending: false, + reset_pending: None, 
should_fail_to_respond_signal: None, } } @@ -1068,8 +1075,9 @@ impl SpHandler for Handler { "port" => ?port, "component" => ?component, ); - if component == SpComponent::SP_ITSELF { - self.reset_pending = true; + if component == SpComponent::SP_ITSELF || component == SpComponent::ROT + { + self.reset_pending = Some(component); Ok(()) } else { Err(SpError::RequestUnsupportedForComponent) } @@ -1089,9 +1097,9 @@ "component" => ?component, ); if component == SpComponent::SP_ITSELF { - if self.reset_pending { + if self.reset_pending == Some(SpComponent::SP_ITSELF) { self.update_state.sp_reset(); - self.reset_pending = false; + self.reset_pending = None; if let Some(signal) = self.should_fail_to_respond_signal.take() { // Instruct `server::handle_request()` to _not_ respond to @@ -1102,6 +1110,14 @@ } else { Err(SpError::ResetComponentTriggerWithoutPrepare) } + } else if component == SpComponent::ROT { + if self.reset_pending == Some(SpComponent::ROT) { + self.update_state.rot_reset(); + self.reset_pending = None; + Ok(()) + } else { + Err(SpError::ResetComponentTriggerWithoutPrepare) + } } else { Err(SpError::RequestUnsupportedForComponent) } @@ -1325,7 +1341,7 @@ static SP_VERS: &[u8] = b"0.0.1"; static ROT_GITC: &[u8] = b"eeeeeeee"; - static ROT_BORD: &[u8] = b"SimGimletRot"; + static ROT_BORD: &[u8] = SIM_ROT_BOARD.as_bytes(); static ROT_NAME: &[u8] = b"SimGimlet"; static ROT_VERS: &[u8] = b"0.0.1"; diff --git a/sp-sim/src/lib.rs b/sp-sim/src/lib.rs index 668c7c3311..0958e8a177 100644 --- a/sp-sim/src/lib.rs +++ b/sp-sim/src/lib.rs @@ -28,6 +28,8 @@ use std::net::SocketAddrV6; use tokio::sync::mpsc; use tokio::sync::watch; +pub const SIM_ROT_BOARD: &str = "SimRot"; + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Responsiveness { Responsive, @@ -58,8 +60,13 @@ pub trait SimulatedSp { /// Get the last completed update delivered to this simulated SP. /// - /// Only returns data after a simulated reset. - async fn last_update_data(&self) -> Option<Vec<u8>>; + /// Only returns data after a simulated reset of the SP. + async fn last_sp_update_data(&self) -> Option<Vec<u8>>; + + /// Get the last completed update delivered to this simulated RoT. + /// + /// Only returns data after a simulated reset of the RoT. + async fn last_rot_update_data(&self) -> Option<Vec<u8>>; /// Get the current update status, just as would be returned by an MGS /// request to get the update status.
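The sidecar.rs hunks below repeat the same prepare/trigger change just made to gimlet.rs: `reset_pending` goes from `bool` to an `Option` recording which component was prepared, so a reset trigger only fires when it matches the pending prepare. The pattern is easier to see in isolation; here is a minimal sketch with a stand-in `Component` enum rather than the real `gateway_messages::SpComponent`:

```rust
// Stand-in for gateway_messages::SpComponent.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Component {
    SpItself,
    Rot,
}

#[derive(Debug, PartialEq)]
struct TriggerWithoutPrepare;

#[derive(Default)]
struct ResetState {
    // `None` means no reset has been prepared.
    reset_pending: Option<Component>,
}

impl ResetState {
    fn prepare(&mut self, component: Component) {
        // Remember which component the prepare targeted.
        self.reset_pending = Some(component);
    }

    fn trigger(&mut self, component: Component) -> Result<(), TriggerWithoutPrepare> {
        // Only fire if the pending prepare matches this component.
        if self.reset_pending == Some(component) {
            self.reset_pending = None;
            Ok(())
        } else {
            Err(TriggerWithoutPrepare)
        }
    }
}

fn main() {
    let mut state = ResetState::default();
    state.prepare(Component::Rot);
    // Triggering the SP fails: the prepare targeted the RoT.
    assert_eq!(state.trigger(Component::SpItself), Err(TriggerWithoutPrepare));
    state.trigger(Component::Rot).unwrap();
}
```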
diff --git a/sp-sim/src/sidecar.rs b/sp-sim/src/sidecar.rs index 5433a8bb40..19e84ffc64 100644 --- a/sp-sim/src/sidecar.rs +++ b/sp-sim/src/sidecar.rs @@ -16,6 +16,7 @@ use crate::server::UdpServer; use crate::update::SimSpUpdate; use crate::Responsiveness; use crate::SimulatedSp; +use crate::SIM_ROT_BOARD; use anyhow::Result; use async_trait::async_trait; use futures::future; @@ -121,10 +122,16 @@ impl SimulatedSp for Sidecar { self.rot.lock().unwrap().handle_deserialized(request) } - async fn last_update_data(&self) -> Option> { + async fn last_sp_update_data(&self) -> Option> { let handler = self.handler.as_ref()?; let handler = handler.lock().await; - handler.update_state.last_update_data() + handler.update_state.last_sp_update_data() + } + + async fn last_rot_update_data(&self) -> Option> { + let handler = self.handler.as_ref()?; + let handler = handler.lock().await; + handler.update_state.last_rot_update_data() } async fn current_update_status(&self) -> gateway_messages::UpdateStatus { @@ -383,7 +390,7 @@ struct Handler { power_state: PowerState, update_state: SimSpUpdate, - reset_pending: bool, + reset_pending: Option, // To simulate an SP reset, we should (after doing whatever housekeeping we // need to track the reset) intentionally _fail_ to respond to the request, @@ -422,7 +429,7 @@ impl Handler { rot_active_slot: RotSlotId::A, power_state: PowerState::A2, update_state: SimSpUpdate::default(), - reset_pending: false, + reset_pending: None, should_fail_to_respond_signal: None, } } @@ -849,8 +856,9 @@ impl SpHandler for Handler { "port" => ?port, "component" => ?component, ); - if component == SpComponent::SP_ITSELF { - self.reset_pending = true; + if component == SpComponent::SP_ITSELF || component == SpComponent::ROT + { + self.reset_pending = Some(component); Ok(()) } else { Err(SpError::RequestUnsupportedForComponent) @@ -870,9 +878,9 @@ impl SpHandler for Handler { "component" => ?component, ); if component == SpComponent::SP_ITSELF { - if self.reset_pending { + if self.reset_pending == Some(SpComponent::SP_ITSELF) { self.update_state.sp_reset(); - self.reset_pending = false; + self.reset_pending = None; if let Some(signal) = self.should_fail_to_respond_signal.take() { // Instruct `server::handle_request()` to _not_ respond to @@ -883,6 +891,14 @@ impl SpHandler for Handler { } else { Err(SpError::ResetComponentTriggerWithoutPrepare) } + } else if component == SpComponent::ROT { + if self.reset_pending == Some(SpComponent::ROT) { + self.update_state.rot_reset(); + self.reset_pending = None; + Ok(()) + } else { + Err(SpError::ResetComponentTriggerWithoutPrepare) + } } else { Err(SpError::RequestUnsupportedForComponent) } @@ -1104,7 +1120,7 @@ impl SpHandler for Handler { static SP_VERS: &[u8] = b"0.0.1"; static ROT_GITC: &[u8] = b"eeeeeeee"; - static ROT_BORD: &[u8] = b"SimSidecarRot"; + static ROT_BORD: &[u8] = SIM_ROT_BOARD.as_bytes(); static ROT_NAME: &[u8] = b"SimSidecar"; static ROT_VERS: &[u8] = b"0.0.1"; diff --git a/sp-sim/src/update.rs b/sp-sim/src/update.rs index e57659ca1a..9879a3ecde 100644 --- a/sp-sim/src/update.rs +++ b/sp-sim/src/update.rs @@ -13,12 +13,17 @@ use gateway_messages::UpdateInProgressStatus; pub(crate) struct SimSpUpdate { state: UpdateState, - last_update_data: Option>, + last_sp_update_data: Option>, + last_rot_update_data: Option>, } impl Default for SimSpUpdate { fn default() -> Self { - Self { state: UpdateState::NotPrepared, last_update_data: None } + Self { + state: UpdateState::NotPrepared, + last_sp_update_data: None, + 
last_rot_update_data: None, + } } } @@ -80,6 +85,7 @@ impl SimSpUpdate { let mut stolen = Cursor::new(Box::default()); mem::swap(data, &mut stolen); self.state = UpdateState::Completed { + component: *component, id: *id, data: stolen.into_inner(), }; @@ -112,16 +118,37 @@ impl SimSpUpdate { } pub(crate) fn sp_reset(&mut self) { - self.last_update_data = match &self.state { - UpdateState::Completed { data, .. } => Some(data.clone()), + match &self.state { + UpdateState::Completed { data, component, .. } => { + if *component == SpComponent::SP_ITSELF { + self.last_sp_update_data = Some(data.clone()); + } + } + UpdateState::NotPrepared + | UpdateState::Prepared { .. } + | UpdateState::Aborted(_) => (), + } + } + + pub(crate) fn rot_reset(&mut self) { + match &self.state { + UpdateState::Completed { data, component, .. } => { + if *component == SpComponent::ROT { + self.last_rot_update_data = Some(data.clone()); + } + } UpdateState::NotPrepared | UpdateState::Prepared { .. } - | UpdateState::Aborted(_) => None, - }; + | UpdateState::Aborted(_) => (), + } + } + + pub(crate) fn last_sp_update_data(&self) -> Option> { + self.last_sp_update_data.clone() } - pub(crate) fn last_update_data(&self) -> Option> { - self.last_update_data.clone() + pub(crate) fn last_rot_update_data(&self) -> Option> { + self.last_rot_update_data.clone() } } @@ -138,6 +165,7 @@ enum UpdateState { }, Aborted(UpdateId), Completed { + component: SpComponent, id: UpdateId, data: Box<[u8]>, }, diff --git a/tufaceous-lib/Cargo.toml b/tufaceous-lib/Cargo.toml index bcfcee6b9c..0df3a33f98 100644 --- a/tufaceous-lib/Cargo.toml +++ b/tufaceous-lib/Cargo.toml @@ -7,6 +7,7 @@ publish = false [dependencies] anyhow = { workspace = true, features = ["backtrace"] } +async-trait.workspace = true buf-list.workspace = true bytes.workspace = true bytesize = { workspace = true, features = ["serde"] } @@ -16,6 +17,7 @@ chrono.workspace = true debug-ignore.workspace = true flate2.workspace = true fs-err.workspace = true +futures.workspace = true hex.workspace = true hubtools.workspace = true itertools.workspace = true @@ -28,6 +30,7 @@ serde_path_to_error.workspace = true sha2.workspace = true slog.workspace = true tar.workspace = true +tokio.workspace = true toml.workspace = true tough.workspace = true url = "2.4.1" @@ -36,3 +39,4 @@ omicron-workspace-hack.workspace = true [dev-dependencies] omicron-test-utils.workspace = true +tokio = { workspace = true, features = ["test-util"] } diff --git a/tufaceous-lib/src/artifact.rs b/tufaceous-lib/src/artifact.rs index 56f3e34ecb..23cf31e8c3 100644 --- a/tufaceous-lib/src/artifact.rs +++ b/tufaceous-lib/src/artifact.rs @@ -127,7 +127,7 @@ pub struct HostPhaseImages { } impl HostPhaseImages { - pub fn extract(reader: R) -> Result { + pub fn extract(reader: R) -> Result { let mut phase_1 = Vec::new(); let mut phase_2 = Vec::new(); Self::extract_into( @@ -138,13 +138,12 @@ impl HostPhaseImages { Ok(Self { phase_1: phase_1.into(), phase_2: phase_2.into() }) } - pub fn extract_into( + pub fn extract_into( reader: R, phase_1: W, phase_2: W, ) -> Result<()> { - let uncompressed = - flate2::bufread::GzDecoder::new(BufReader::new(reader)); + let uncompressed = flate2::bufread::GzDecoder::new(reader); let mut archive = tar::Archive::new(uncompressed); let mut oxide_json_found = false; @@ -248,7 +247,7 @@ pub struct RotArchives { } impl RotArchives { - pub fn extract(reader: R) -> Result { + pub fn extract(reader: R) -> Result { let mut archive_a = Vec::new(); let mut archive_b = Vec::new(); Self::extract_into( 
@@ -259,13 +258,12 @@ impl RotArchives { Ok(Self { archive_a: archive_a.into(), archive_b: archive_b.into() }) } - pub fn extract_into( + pub fn extract_into( reader: R, archive_a: W, archive_b: W, ) -> Result<()> { - let uncompressed = - flate2::bufread::GzDecoder::new(BufReader::new(reader)); + let uncompressed = flate2::bufread::GzDecoder::new(reader); let mut archive = tar::Archive::new(uncompressed); let mut oxide_json_found = false; diff --git a/tufaceous-lib/src/assemble/build.rs b/tufaceous-lib/src/assemble/build.rs index 081e2e10d5..4cb636c9d3 100644 --- a/tufaceous-lib/src/assemble/build.rs +++ b/tufaceous-lib/src/assemble/build.rs @@ -44,7 +44,7 @@ impl OmicronRepoAssembler { self } - pub fn build(&self) -> Result<()> { + pub async fn build(&self) -> Result<()> { let (build_dir, is_temp) = match &self.build_dir { Some(dir) => (dir.clone(), false), None => { @@ -61,7 +61,7 @@ impl OmicronRepoAssembler { slog::info!(self.log, "assembling repository in `{build_dir}`"); - match self.build_impl(&build_dir) { + match self.build_impl(&build_dir).await { Ok(()) => { if is_temp { slog::debug!(self.log, "assembly successful, cleaning up"); @@ -92,15 +92,17 @@ impl OmicronRepoAssembler { Ok(()) } - fn build_impl(&self, build_dir: &Utf8Path) -> Result<()> { + async fn build_impl(&self, build_dir: &Utf8Path) -> Result<()> { let mut repository = OmicronRepo::initialize( &self.log, build_dir, self.manifest.system_version.clone(), self.keys.clone(), self.expiry, - )? - .into_editor()?; + ) + .await? + .into_editor() + .await?; // Add all the artifacts. for (kind, entries) in &self.manifest.artifacts { @@ -118,10 +120,11 @@ impl OmicronRepoAssembler { } // Write out the repository. - repository.sign_and_finish(self.keys.clone(), self.expiry)?; + repository.sign_and_finish(self.keys.clone(), self.expiry).await?; // Now reopen the repository to archive it into a zip file. 
let repo2 = OmicronRepo::load_untrusted(&self.log, build_dir) + .await .context("error reopening repository to archive")?; repo2 .archive(&self.output_path) diff --git a/tufaceous-lib/src/key.rs b/tufaceous-lib/src/key.rs index 8a5054b331..96282ee377 100644 --- a/tufaceous-lib/src/key.rs +++ b/tufaceous-lib/src/key.rs @@ -5,6 +5,7 @@ use ring::rand::SecureRandom; use ring::signature::Ed25519KeyPair; use std::fmt::Display; use std::str::FromStr; +use tough::async_trait; use tough::key_source::KeySource; use tough::sign::{Sign, SignKeyPair}; @@ -38,30 +39,32 @@ impl Key { } } +#[async_trait] impl Sign for Key { fn tuf_key(&self) -> tough::schema::key::Key { self.as_sign().tuf_key() } - fn sign( + async fn sign( &self, msg: &[u8], - rng: &dyn SecureRandom, + rng: &(dyn SecureRandom + Sync), ) -> Result, Box> { - self.as_sign().sign(msg, rng) + self.as_sign().sign(msg, rng).await } } +#[async_trait] impl KeySource for Key { - fn as_sign( + async fn as_sign( &self, ) -> Result, Box> { Ok(Box::new(self.clone())) } - fn write( + async fn write( &self, _value: &str, _key_id_hex: &str, diff --git a/tufaceous-lib/src/repository.rs b/tufaceous-lib/src/repository.rs index 11a6064602..416d5c9990 100644 --- a/tufaceous-lib/src/repository.rs +++ b/tufaceous-lib/src/repository.rs @@ -4,9 +4,11 @@ use crate::{key::Key, target::TargetWriter, AddArtifact, ArchiveBuilder}; use anyhow::{anyhow, bail, Context, Result}; +use buf_list::BufList; use camino::{Utf8Path, Utf8PathBuf}; use chrono::{DateTime, Utc}; -use fs_err::{self as fs, File}; +use fs_err::{self as fs}; +use futures::TryStreamExt; use omicron_common::{ api::external::SemverVersion, update::{Artifact, ArtifactsDocument}, @@ -28,38 +30,41 @@ pub struct OmicronRepo { impl OmicronRepo { /// Initializes a new repository at the given path, writing it to disk. - pub fn initialize( + pub async fn initialize( log: &slog::Logger, repo_path: &Utf8Path, system_version: SemverVersion, keys: Vec, expiry: DateTime, ) -> Result { - let root = crate::root::new_root(keys.clone(), expiry)?; + let root = crate::root::new_root(keys.clone(), expiry).await?; let editor = OmicronRepoEditor::initialize( repo_path.to_owned(), root, system_version, - )?; + ) + .await?; editor .sign_and_finish(keys, expiry) + .await .context("error signing new repository")?; // In theory we "trust" the key we just used to sign this repository, // but the code path is equivalent to `load_untrusted`. - Self::load_untrusted(log, repo_path) + Self::load_untrusted(log, repo_path).await } /// Loads a repository from the given path. /// /// This method enforces expirations. To load without expiration enforcement, use /// [`Self::load_untrusted_ignore_expiration`]. - pub fn load_untrusted( + pub async fn load_untrusted( log: &slog::Logger, repo_path: &Utf8Path, ) -> Result { Self::load_untrusted_impl(log, repo_path, ExpirationEnforcement::Safe) + .await } /// Loads a repository from the given path, ignoring expiration. @@ -68,30 +73,36 @@ impl OmicronRepo { /// /// 1. When you're editing an existing repository and will re-sign it afterwards. /// 2. In an environment in which time isn't available. 
- pub fn load_untrusted_ignore_expiration( + pub async fn load_untrusted_ignore_expiration( log: &slog::Logger, repo_path: &Utf8Path, ) -> Result { Self::load_untrusted_impl(log, repo_path, ExpirationEnforcement::Unsafe) + .await } - fn load_untrusted_impl( + async fn load_untrusted_impl( log: &slog::Logger, repo_path: &Utf8Path, exp: ExpirationEnforcement, ) -> Result { let log = log.new(slog::o!("component" => "OmicronRepo")); let repo_path = repo_path.canonicalize_utf8()?; + let root_json = repo_path.join("metadata").join("1.root.json"); + let root = tokio::fs::read(&root_json) + .await + .with_context(|| format!("error reading from {root_json}"))?; let repo = RepositoryLoader::new( - File::open(repo_path.join("metadata").join("1.root.json"))?, + &root, Url::from_file_path(repo_path.join("metadata")) .expect("the canonical path is not absolute?"), Url::from_file_path(repo_path.join("targets")) .expect("the canonical path is not absolute?"), ) .expiration_enforcement(exp) - .load()?; + .load() + .await?; Ok(Self { log, repo, repo_path }) } @@ -107,12 +118,17 @@ impl OmicronRepo { } /// Reads the artifacts document from the repo. - pub fn read_artifacts(&self) -> Result { + pub async fn read_artifacts(&self) -> Result { let reader = self .repo - .read_target(&"artifacts.json".try_into()?)? + .read_target(&"artifacts.json".try_into()?) + .await? .ok_or_else(|| anyhow!("artifacts.json should be present"))?; - serde_json::from_reader(reader) + let buf_list = reader + .try_collect::() + .await + .context("error reading from artifacts.json")?; + serde_json::from_reader(buf_list::Cursor::new(&buf_list)) .context("error deserializing artifacts.json") } @@ -177,8 +193,8 @@ impl OmicronRepo { /// Converts `self` into an `OmicronRepoEditor`, which can be used to perform /// modifications to the repository. - pub fn into_editor(self) -> Result { - OmicronRepoEditor::new(self) + pub async fn into_editor(self) -> Result { + OmicronRepoEditor::new(self).await } /// Prepends the target digest to the name if using consistent snapshots. Returns both the @@ -210,8 +226,8 @@ pub struct OmicronRepoEditor { } impl OmicronRepoEditor { - fn new(repo: OmicronRepo) -> Result { - let artifacts = repo.read_artifacts()?; + async fn new(repo: OmicronRepo) -> Result { + let artifacts = repo.read_artifacts().await?; let existing_target_names = repo .repo @@ -226,7 +242,8 @@ impl OmicronRepoEditor { .join("metadata") .join(format!("{}.root.json", repo.repo.root().signed.version)), repo.repo, - )?; + ) + .await?; Ok(Self { editor, @@ -236,7 +253,7 @@ impl OmicronRepoEditor { }) } - fn initialize( + async fn initialize( repo_path: Utf8PathBuf, root: SignedRole, system_version: SemverVersion, @@ -250,7 +267,7 @@ impl OmicronRepoEditor { fs::create_dir_all(&targets_dir)?; fs::write(&root_path, root.buffer())?; - let editor = RepositoryEditor::new(&root_path)?; + let editor = RepositoryEditor::new(&root_path).await?; Ok(Self { editor, @@ -297,7 +314,7 @@ impl OmicronRepoEditor { } /// Consumes self, signing the repository and writing out this repository to disk. 
- pub fn sign_and_finish( + pub async fn sign_and_finish( mut self, keys: Vec, expiry: DateTime, @@ -313,9 +330,11 @@ impl OmicronRepoEditor { let signed = self .editor .sign(&crate::key::boxed_keys(keys)) + .await .context("error signing keys")?; signed .write(self.repo_path.join("metadata")) + .await .context("error writing repository")?; Ok(()) } @@ -346,8 +365,8 @@ mod tests { use chrono::Days; use omicron_test_utils::dev::test_setup_log; - #[test] - fn reject_artifacts_with_the_same_filename() { + #[tokio::test] + async fn reject_artifacts_with_the_same_filename() { let logctx = test_setup_log("reject_artifacts_with_the_same_filename"); let tempdir = Utf8TempDir::new().unwrap(); let mut repo = OmicronRepo::initialize( @@ -357,8 +376,10 @@ mod tests { vec![Key::generate_ed25519()], Utc::now() + Days::new(1), ) + .await .unwrap() .into_editor() + .await .unwrap(); // Targets are uniquely identified by their kind/name/version triple; diff --git a/tufaceous-lib/src/root.rs b/tufaceous-lib/src/root.rs index 8ecd1cdf9d..cf5f7129c5 100644 --- a/tufaceous-lib/src/root.rs +++ b/tufaceous-lib/src/root.rs @@ -8,7 +8,7 @@ use tough::editor::signed::SignedRole; use tough::schema::{KeyHolder, RoleKeys, RoleType, Root}; use tough::sign::Sign; -pub(crate) fn new_root( +pub(crate) async fn new_root( keys: Vec, expires: DateTime, ) -> Result> { @@ -47,5 +47,6 @@ pub(crate) fn new_root( &KeyHolder::Root(root), &keys, &SystemRandom::new(), - )?) + ) + .await?) } diff --git a/tufaceous/Cargo.toml b/tufaceous/Cargo.toml index e48513e24c..81248af57d 100644 --- a/tufaceous/Cargo.toml +++ b/tufaceous/Cargo.toml @@ -17,6 +17,7 @@ slog.workspace = true slog-async.workspace = true slog-envlogger.workspace = true slog-term.workspace = true +tokio.workspace = true tufaceous-lib.workspace = true omicron-workspace-hack.workspace = true @@ -27,6 +28,7 @@ fs-err.workspace = true omicron-test-utils.workspace = true predicates.workspace = true tempfile.workspace = true +tokio = { workspace = true, features = ["test-util"] } [[test]] name = "manifest-tests" diff --git a/tufaceous/src/dispatch.rs b/tufaceous/src/dispatch.rs index ea0db63fce..fc86c948df 100644 --- a/tufaceous/src/dispatch.rs +++ b/tufaceous/src/dispatch.rs @@ -36,7 +36,7 @@ pub struct Args { impl Args { /// Executes these arguments. - pub fn exec(self, log: &slog::Logger) -> Result<()> { + pub async fn exec(self, log: &slog::Logger) -> Result<()> { let repo_path = match self.repo { Some(repo) => repo, None => std::env::current_dir()?.try_into()?, @@ -52,7 +52,8 @@ impl Args { system_version, keys, self.expiry, - )?; + ) + .await?; slog::info!( log, "Initialized TUF repository in {}", @@ -87,8 +88,9 @@ impl Args { let repo = OmicronRepo::load_untrusted_ignore_expiration( &log, &repo_path, - )?; - let mut editor = repo.into_editor()?; + ) + .await?; + let mut editor = repo.into_editor().await?; let new_artifact = AddArtifact::from_path(kind, name, version, path)?; @@ -96,7 +98,7 @@ impl Args { editor .add_artifact(&new_artifact) .context("error adding artifact")?; - editor.sign_and_finish(self.keys, self.expiry)?; + editor.sign_and_finish(self.keys, self.expiry).await?; println!( "added {} {}, version {}", new_artifact.kind(), @@ -113,7 +115,8 @@ impl Args { let repo = OmicronRepo::load_untrusted_ignore_expiration( &log, &repo_path, - )?; + ) + .await?; repo.archive(&output_path)?; Ok(()) @@ -124,13 +127,14 @@ impl Args { // Now load the repository and ensure it's valid. 
let repo = OmicronRepo::load_untrusted(&log, &dest) + .await .with_context(|| { format!( "error loading extracted repository at `{dest}` \ (extracted files are still available)" ) })?; - repo.read_artifacts().with_context(|| { + repo.read_artifacts().await.with_context(|| { format!( "error loading artifacts.json from extracted archive \ at `{dest}`" @@ -169,7 +173,7 @@ impl Args { assembler.set_build_dir(dir); } - assembler.build()?; + assembler.build().await?; Ok(()) } diff --git a/tufaceous/src/main.rs b/tufaceous/src/main.rs index 30832cffbf..014817ee53 100644 --- a/tufaceous/src/main.rs +++ b/tufaceous/src/main.rs @@ -7,10 +7,11 @@ use clap::Parser; use slog::Drain; use tufaceous::Args; -fn main() -> Result<()> { +#[tokio::main] +async fn main() -> Result<()> { let log = setup_log(); let args = Args::parse(); - args.exec(&log) + args.exec(&log).await } fn setup_log() -> slog::Logger { diff --git a/tufaceous/tests/integration-tests/command_tests.rs b/tufaceous/tests/integration-tests/command_tests.rs index 73c94572eb..72c3a1a13a 100644 --- a/tufaceous/tests/integration-tests/command_tests.rs +++ b/tufaceous/tests/integration-tests/command_tests.rs @@ -14,8 +14,8 @@ use omicron_test_utils::dev::test_setup_log; use predicates::prelude::*; use tufaceous_lib::{Key, OmicronRepo}; -#[test] -fn test_init_and_add() -> Result<()> { +#[tokio::test] +async fn test_init_and_add() -> Result<()> { let logctx = test_setup_log("test_init_and_add"); let tempdir = tempfile::tempdir().unwrap(); let key = Key::generate_ed25519(); @@ -54,9 +54,9 @@ fn test_init_and_add() -> Result<()> { // Now read the repository and ensure the list of expected artifacts. let repo_path: Utf8PathBuf = tempdir.path().join("repo").try_into()?; - let repo = OmicronRepo::load_untrusted(&logctx.log, &repo_path)?; + let repo = OmicronRepo::load_untrusted(&logctx.log, &repo_path).await?; - let artifacts = repo.read_artifacts()?; + let artifacts = repo.read_artifacts().await?; assert_eq!( artifacts.artifacts.len(), 2, diff --git a/wicket-common/src/update_events.rs b/wicket-common/src/update_events.rs index e0f9d4b228..fe92887646 100644 --- a/wicket-common/src/update_events.rs +++ b/wicket-common/src/update_events.rs @@ -10,6 +10,7 @@ use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; use std::fmt; +use std::sync::Arc; use thiserror::Error; use update_engine::errors::NestedEngineError; use update_engine::StepSpec; @@ -197,12 +198,13 @@ pub enum UpdateTerminalError { #[source] error: gateway_client::Error, }, - #[error("failed to upload trampoline phase 2 to MGS (was a new TUF repo uploaded?)")] - // Currently, the only way this error variant can be produced is if the - // upload task died or was replaced because a new TUF repository was - // uploaded. In the future, we may want to produce errors here if the upload - // to MGS fails too many times, for example. 
- TrampolinePhase2UploadFailed, + #[error("uploading trampoline phase 2 to MGS failed")] + TrampolinePhase2UploadFailed { + #[source] + error: Arc<anyhow::Error>, + }, + #[error("uploading trampoline phase 2 to MGS cancelled (was a new TUF repo uploaded?)")] + TrampolinePhase2UploadCancelled, #[error("downloading installinator failed")] DownloadingInstallinatorFailed { #[source] diff --git a/wicketd/Cargo.toml b/wicketd/Cargo.toml index db1ac9c04a..1360c28b19 100644 --- a/wicketd/Cargo.toml +++ b/wicketd/Cargo.toml @@ -8,6 +8,7 @@ license = "MPL-2.0" anyhow.workspace = true async-trait.workspace = true base64.workspace = true +buf-list.workspace = true bytes.workspace = true camino.workspace = true camino-tempfile.workspace = true diff --git a/wicketd/src/artifacts/artifacts_with_plan.rs b/wicketd/src/artifacts/artifacts_with_plan.rs index 331aecfc70..d3319d7f6b 100644 --- a/wicketd/src/artifacts/artifacts_with_plan.rs +++ b/wicketd/src/artifacts/artifacts_with_plan.rs @@ -50,7 +50,7 @@ pub(super) struct ArtifactsWithPlan { } impl ArtifactsWithPlan { - pub(super) fn from_zip<T>( + pub(super) async fn from_zip<T>( zip_data: T, log: &Logger, ) -> Result<Self, RepositoryError> @@ -68,10 +68,12 @@ impl ArtifactsWithPlan { // anyone can sign the repositories and this code will accept that. let repository = OmicronRepo::load_untrusted_ignore_expiration(log, dir.path()) + .await .map_err(RepositoryError::LoadRepository)?; let artifacts = repository .read_artifacts() + .await .map_err(RepositoryError::ReadArtifactsDocument)?; // Create another temporary directory where we'll "permanently" (as long @@ -132,9 +134,10 @@ impl ArtifactsWithPlan { .map_err(RepositoryError::TargetHashLength)?, ); - let reader = repository + let stream = repository .repo() .read_target(&target_name) + .await .map_err(|error| RepositoryError::LocateTarget { target: artifact.target.clone(), error: Box::new(error), @@ -143,13 +146,15 @@ RepositoryError::MissingTarget(artifact.target.clone()) })?; - plan_builder.add_artifact( - artifact.into_id(), - artifact_hash, - io::BufReader::new(reader), - &mut by_id, - &mut by_hash, - )?; + plan_builder + .add_artifact( + artifact.into_id(), + artifact_hash, + stream, + &mut by_id, + &mut by_hash, + ) + .await?; } // Ensure we know how to apply updates from this set of artifacts; we'll @@ -218,8 +223,11 @@ mod tests { /// Test that `ArtifactsWithPlan` can extract the fake repository generated /// by tufaceous. - #[test] - fn test_extract_fake() -> Result<()> { + /// + /// See documentation for extract_nested_artifact_pair in update_plan.rs + /// for why multi_thread is required. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_extract_fake() -> Result<()> { let logctx = test_setup_log("test_extract_fake"); let temp_dir = Utf8TempDir::new()?; let archive_path = temp_dir.path().join("archive.zip"); @@ -233,12 +241,15 @@ mod tests { ]) .context("error parsing args")?; - args.exec(&logctx.log).context("error executing assemble command")?; + args.exec(&logctx.log) + .await + .context("error executing assemble command")?; // Now check that it can be read by the archive extractor. let zip_bytes = std::fs::File::open(&archive_path) .context("error opening archive.zip")?; let plan = ArtifactsWithPlan::from_zip(zip_bytes, &logctx.log) + .await .context("error reading archive.zip")?; // Check that all known artifact kinds are present in the map.
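(Aside: the reworked UpdateTerminalError above carries its cause as Arc<anyhow::Error>. A minimal sketch of that shape with hypothetical names, not the wicket-common types: anyhow::Error is not itself std::error::Error, but it derefs to dyn Error + Send + Sync, so thiserror's #[source] can reach it through the Arc, and the Arc lets one underlying failure be shared by every caller that observes it.)

use std::sync::Arc;
use thiserror::Error;

#[derive(Debug, Error)]
enum SketchTerminalError {
    #[error("uploading image failed")]
    UploadFailed {
        // anyhow::Error derefs to dyn Error + Send + Sync, which is what
        // lets #[source] traverse the Arc here.
        #[source]
        error: Arc<anyhow::Error>,
    },
}

fn terminal_error_example() -> SketchTerminalError {
    SketchTerminalError::UploadFailed {
        error: Arc::new(anyhow::anyhow!("connection reset")),
    }
}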
let by_id_kinds: BTreeSet<_> = diff --git a/wicketd/src/artifacts/error.rs b/wicketd/src/artifacts/error.rs index ef81ec66f3..ada8fbe011 100644 --- a/wicketd/src/artifacts/error.rs +++ b/wicketd/src/artifacts/error.rs @@ -57,6 +57,13 @@ pub(super) enum RepositoryError { )] MissingTarget(String), + #[error("error reading artifact of kind `{kind}` from repository")] + ReadArtifact { + kind: ArtifactKind, + #[source] + error: Box<tough::error::Error>, + }, + #[error("error copying artifact of kind `{kind}` from repository")] CopyExtractedArtifact { kind: ArtifactKind, @@ -160,6 +167,7 @@ impl RepositoryError { | RepositoryError::LoadRepository(_) | RepositoryError::ReadArtifactsDocument(_) | RepositoryError::TargetHashRead { .. } + | RepositoryError::ReadArtifact { .. } | RepositoryError::CopyExtractedArtifact { .. } => { HttpError::for_bad_request(None, message) } diff --git a/wicketd/src/artifacts/extracted_artifacts.rs b/wicketd/src/artifacts/extracted_artifacts.rs index b796201936..5683cd1c13 100644 --- a/wicketd/src/artifacts/extracted_artifacts.rs +++ b/wicketd/src/artifacts/extracted_artifacts.rs @@ -7,6 +7,8 @@ use anyhow::Context; use camino::Utf8PathBuf; use camino_tempfile::NamedUtf8TempFile; use camino_tempfile::Utf8TempDir; +use futures::Stream; +use futures::StreamExt; use omicron_common::update::ArtifactHash; use omicron_common::update::ArtifactHashId; use omicron_common::update::ArtifactKind; @@ -14,13 +16,11 @@ use sha2::Digest; use sha2::Sha256; use slog::info; use slog::Logger; -use std::fs::File; use std::io; -use std::io::BufWriter; -use std::io::Read; use std::io::Write; use std::sync::Arc; use tokio::io::AsyncRead; +use tokio::io::AsyncWriteExt; use tokio_util::io::ReaderStream; /// Handle to the data of an extracted artifact. @@ -123,17 +123,18 @@ impl ExtractedArtifacts { self.tempdir.path().join(format!("{}", artifact_hash_id.hash)) } - /// Copy from `reader` into our temp directory, returning a handle to the + /// Copy from `stream` into our temp directory, returning a handle to the /// extracted artifact on success. - pub(super) fn store( + pub(super) async fn store( &mut self, artifact_hash_id: ArtifactHashId, - mut reader: impl Read, + stream: impl Stream<Item = Result<Bytes, tough::error::Error>>, ) -> Result<ExtractedArtifactDataHandle, RepositoryError> { let output_path = self.path_for_artifact(&artifact_hash_id); - let mut writer = BufWriter::new( - File::create(&output_path) + let mut writer = tokio::io::BufWriter::new( + tokio::fs::File::create(&output_path) + .await .with_context(|| { format!("failed to create temp file {output_path}") }) .map_err(|error| { RepositoryError::CopyExtractedArtifact { kind: artifact_hash_id.kind.clone(), error, } })?, ); - let file_size = io::copy(&mut reader, &mut writer) - .with_context(|| format!("failed writing to {output_path}")) - .map_err(|error| RepositoryError::CopyExtractedArtifact { + let mut stream = std::pin::pin!(stream); + + let mut file_size = 0; + + while let Some(res) = stream.next().await { + let chunk = res.map_err(|error| RepositoryError::ReadArtifact { kind: artifact_hash_id.kind.clone(), - error, - })?
as usize; + error: Box::new(error), + })?; + file_size += chunk.len(); + writer + .write_all(&chunk) + .await + .with_context(|| format!("failed writing to {output_path}")) + .map_err(|error| RepositoryError::CopyExtractedArtifact { + kind: artifact_hash_id.kind.clone(), + error, + })?; + } writer .flush() + .await .with_context(|| format!("failed flushing {output_path}")) .map_err(|error| RepositoryError::CopyExtractedArtifact { kind: artifact_hash_id.kind.clone(), diff --git a/wicketd/src/artifacts/store.rs b/wicketd/src/artifacts/store.rs index 29e1ecef0a..2a7b4a646b 100644 --- a/wicketd/src/artifacts/store.rs +++ b/wicketd/src/artifacts/store.rs @@ -42,12 +42,9 @@ impl WicketdArtifactStore { slog::debug!(self.log, "adding repository"); let log = self.log.clone(); - let new_artifacts = tokio::task::spawn_blocking(move || { - ArtifactsWithPlan::from_zip(data, &log) - .map_err(|error| error.to_http_error()) - }) - .await - .unwrap()?; + let new_artifacts = ArtifactsWithPlan::from_zip(data, &log) + .await + .map_err(|error| error.to_http_error())?; self.replace(new_artifacts); Ok(()) diff --git a/wicketd/src/artifacts/update_plan.rs b/wicketd/src/artifacts/update_plan.rs index 5d7bee629a..c6db7c1b65 100644 --- a/wicketd/src/artifacts/update_plan.rs +++ b/wicketd/src/artifacts/update_plan.rs @@ -14,7 +14,10 @@ use super::extracted_artifacts::HashingNamedUtf8TempFile; use super::ArtifactIdData; use super::Board; use super::ExtractedArtifactDataHandle; -use anyhow::anyhow; +use bytes::Bytes; +use futures::Stream; +use futures::StreamExt; +use futures::TryStreamExt; use hubtools::RawHubrisArchive; use omicron_common::api::external::SemverVersion; use omicron_common::api::internal::nexus::KnownArtifactKind; @@ -28,7 +31,6 @@ use std::collections::btree_map; use std::collections::BTreeMap; use std::collections::HashMap; use std::io; -use std::io::Read; use tufaceous_lib::HostPhaseImages; use tufaceous_lib::RotArchives; @@ -143,24 +145,26 @@ impl<'a> UpdatePlanBuilder<'a> { }) } - pub(super) fn add_artifact( + pub(super) async fn add_artifact( &mut self, artifact_id: ArtifactId, artifact_hash: ArtifactHash, - reader: io::BufReader<impl io::Read>, + stream: impl Stream<Item = Result<Bytes, tough::error::Error>> + Send, by_id: &mut BTreeMap<ArtifactId, Vec<ArtifactHashId>>, by_hash: &mut HashMap<ArtifactHashId, ExtractedArtifactDataHandle>, ) -> Result<(), RepositoryError> { // If we don't know this artifact kind, we'll still serve it up by hash, // but we don't do any further processing on it.
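(Aside: the new store above drains the artifact stream chunk by chunk instead of using io::copy. A standalone sketch of the same pattern, with std::io::Error standing in for tough's error type and a plain path instead of the temp-dir handling:)

use bytes::Bytes;
use futures::{Stream, StreamExt};
use tokio::io::AsyncWriteExt;

// Drain a fallible byte stream into a buffered async file writer,
// tracking how many bytes were written.
async fn copy_stream_to_file(
    stream: impl Stream<Item = Result<Bytes, std::io::Error>>,
    path: &std::path::Path,
) -> std::io::Result<u64> {
    let file = tokio::fs::File::create(path).await?;
    let mut writer = tokio::io::BufWriter::new(file);
    let mut stream = std::pin::pin!(stream);
    let mut file_size = 0u64;
    while let Some(chunk) = stream.next().await {
        let chunk = chunk?;
        file_size += chunk.len() as u64;
        writer.write_all(&chunk).await?;
    }
    // BufWriter buffers in memory; flush so every byte reaches the file.
    writer.flush().await?;
    Ok(file_size)
}

Note the explicit flush: dropping a tokio BufWriter does not flush it, which is why the code above flushes before returning.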
let Some(artifact_kind) = artifact_id.kind.to_known() else { - return self.add_unknown_artifact( - artifact_id, - artifact_hash, - reader, - by_id, - by_hash, - ); + return self + .add_unknown_artifact( + artifact_id, + artifact_hash, + stream, + by_id, + by_hash, + ) + .await; }; // If we do know the artifact kind, we may have additional work to do, @@ -170,48 +174,57 @@ match artifact_kind { KnownArtifactKind::GimletSp | KnownArtifactKind::PscSp - | KnownArtifactKind::SwitchSp => self.add_sp_artifact( - artifact_id, - artifact_kind, - artifact_hash, - reader, - by_id, - by_hash, - ), + | KnownArtifactKind::SwitchSp => { + self.add_sp_artifact( + artifact_id, + artifact_kind, + artifact_hash, + stream, + by_id, + by_hash, + ) + .await + } KnownArtifactKind::GimletRot | KnownArtifactKind::PscRot - | KnownArtifactKind::SwitchRot => self.add_rot_artifact( - artifact_id, - artifact_kind, - reader, - by_id, - by_hash, - ), + | KnownArtifactKind::SwitchRot => { + self.add_rot_artifact( + artifact_id, + artifact_kind, + stream, + by_id, + by_hash, + ) + .await + } KnownArtifactKind::Host => { - self.add_host_artifact(artifact_id, reader, by_id, by_hash) + self.add_host_artifact(artifact_id, stream, by_id, by_hash) } KnownArtifactKind::Trampoline => self.add_trampoline_artifact( artifact_id, - reader, - by_id, - by_hash, - ), - KnownArtifactKind::ControlPlane => self.add_control_plane_artifact( - artifact_id, - artifact_hash, - reader, + stream, by_id, by_hash, ), + KnownArtifactKind::ControlPlane => { + self.add_control_plane_artifact( + artifact_id, + artifact_hash, + stream, + by_id, + by_hash, + ) + .await + } } } - fn add_sp_artifact( + async fn add_sp_artifact( &mut self, artifact_id: ArtifactId, artifact_kind: KnownArtifactKind, artifact_hash: ArtifactHash, - mut reader: io::BufReader<impl io::Read>, + stream: impl Stream<Item = Result<Bytes, tough::error::Error>> + Send, by_id: &mut BTreeMap<ArtifactId, Vec<ArtifactHashId>>, by_hash: &mut HashMap<ArtifactHashId, ExtractedArtifactDataHandle>, ) -> Result<(), RepositoryError> { @@ -228,15 +241,18 @@ | KnownArtifactKind::SwitchRot => unreachable!(), }; + let mut stream = std::pin::pin!(stream); + // SP images are small, and hubtools wants a `&[u8]` to parse, so we'll // read the whole thing into memory.
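(Aside: a hedged sketch of the buffer-then-restream pattern used below for SP images: collect the small artifact into memory for hubtools, then hand it back to the stream-based store as a one-element stream. The std::io::Error error type is an assumption made for the sketch:)

use bytes::Bytes;
use futures::{Stream, StreamExt};

async fn buffer_then_restream(
    stream: impl Stream<Item = Result<Bytes, std::io::Error>>,
) -> std::io::Result<impl Stream<Item = Result<Bytes, std::io::Error>>> {
    let mut stream = std::pin::pin!(stream);
    let mut data = Vec::new();
    while let Some(chunk) = stream.next().await {
        // Accumulate every chunk; fine for small images like SP firmware.
        data.extend_from_slice(&chunk?);
    }
    // A one-element iterator of Ok(bytes) is itself a valid stream, which
    // is exactly the futures::stream::iter([Ok(...)]) trick used below.
    Ok(futures::stream::iter([Ok(Bytes::from(data))]))
}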
let mut data = Vec::new(); - reader.read_to_end(&mut data).map_err(|error| { - RepositoryError::CopyExtractedArtifact { + while let Some(res) = stream.next().await { + let chunk = res.map_err(|error| RepositoryError::ReadArtifact { kind: artifact_kind.into(), - error: anyhow!(error), - } - })?; + error: Box::new(error), + })?; + data.extend_from_slice(&chunk); + } let (artifact_id, board) = read_hubris_board_from_archive(artifact_id, data.clone())?; @@ -255,7 +271,11 @@ ArtifactHashId { kind: artifact_kind.into(), hash: artifact_hash }; let data = self .extracted_artifacts - .store(artifact_hash_id, io::Cursor::new(&data))?; + .store( + artifact_hash_id, + futures::stream::iter([Ok(Bytes::from(data))]), + ) + .await?; slot.insert(ArtifactIdData { id: artifact_id.clone(), data: data.clone(), @@ -273,11 +293,11 @@ Ok(()) } - fn add_rot_artifact( + async fn add_rot_artifact( &mut self, artifact_id: ArtifactId, artifact_kind: KnownArtifactKind, - reader: io::BufReader<impl io::Read>, + stream: impl Stream<Item = Result<Bytes, tough::error::Error>> + Send, by_id: &mut BTreeMap<ArtifactId, Vec<ArtifactHashId>>, by_hash: &mut HashMap<ArtifactHashId, ExtractedArtifactDataHandle>, ) -> Result<(), RepositoryError> { @@ -310,9 +330,12 @@ }; let (rot_a_data, rot_b_data) = Self::extract_nested_artifact_pair( + stream, &mut self.extracted_artifacts, artifact_kind, - |out_a, out_b| RotArchives::extract_into(reader, out_a, out_b), + |reader, out_a, out_b| { + RotArchives::extract_into(reader, out_a, out_b) + }, )?; // Technically we've done all we _need_ to do with the RoT images. We @@ -358,7 +381,7 @@ fn add_host_artifact( &mut self, artifact_id: ArtifactId, - reader: io::BufReader<impl io::Read>, + stream: impl Stream<Item = Result<Bytes, tough::error::Error>> + Send, by_id: &mut BTreeMap<ArtifactId, Vec<ArtifactHashId>>, by_hash: &mut HashMap<ArtifactHashId, ExtractedArtifactDataHandle>, ) -> Result<(), RepositoryError> { @@ -369,9 +392,12 @@ } let (phase_1_data, phase_2_data) = Self::extract_nested_artifact_pair( + stream, &mut self.extracted_artifacts, KnownArtifactKind::Host, - |out_1, out_2| HostPhaseImages::extract_into(reader, out_1, out_2), + |reader, out_1, out_2| { + HostPhaseImages::extract_into(reader, out_1, out_2) + }, )?; // Similarly to the RoT, we need to create new, non-conflicting artifact @@ -409,7 +435,7 @@ fn add_trampoline_artifact( &mut self, artifact_id: ArtifactId, - reader: io::BufReader<impl io::Read>, + stream: impl Stream<Item = Result<Bytes, tough::error::Error>> + Send, by_id: &mut BTreeMap<ArtifactId, Vec<ArtifactHashId>>, by_hash: &mut HashMap<ArtifactHashId, ExtractedArtifactDataHandle>, ) -> Result<(), RepositoryError> { @@ -422,9 +448,12 @@ } let (phase_1_data, phase_2_data) = Self::extract_nested_artifact_pair( + stream, &mut self.extracted_artifacts, KnownArtifactKind::Trampoline, - |out_1, out_2| HostPhaseImages::extract_into(reader, out_1, out_2), + |reader, out_1, out_2| { + HostPhaseImages::extract_into(reader, out_1, out_2) + }, )?; // Similarly to the RoT, we need to create new, non-conflicting artifact @@ -466,11 +495,11 @@ Ok(()) } - fn add_control_plane_artifact( + async fn add_control_plane_artifact( &mut self, artifact_id: ArtifactId, artifact_hash: ArtifactHash, - reader: io::BufReader<impl io::Read>, + stream: impl Stream<Item = Result<Bytes, tough::error::Error>> + Send, by_id: &mut BTreeMap<ArtifactId, Vec<ArtifactHashId>>, by_hash: &mut HashMap<ArtifactHashId, ExtractedArtifactDataHandle>, ) -> Result<(), RepositoryError> { @@ -487,7 +516,8 @@ hash: artifact_hash, }; - let data = self.extracted_artifacts.store(artifact_hash_id, reader)?; + let data = + self.extracted_artifacts.store(artifact_hash_id, stream).await?; self.control_plane_hash = Some(data.hash()); @@ -503,11 +533,11 @@
impl<'a> UpdatePlanBuilder<'a> { Ok(()) } - fn add_unknown_artifact( + async fn add_unknown_artifact( &mut self, artifact_id: ArtifactId, artifact_hash: ArtifactHash, - reader: io::BufReader<impl io::Read>, + stream: impl Stream<Item = Result<Bytes, tough::error::Error>> + Send, by_id: &mut BTreeMap<ArtifactId, Vec<ArtifactHashId>>, by_hash: &mut HashMap<ArtifactHashId, ExtractedArtifactDataHandle>, ) -> Result<(), RepositoryError> { @@ -515,7 +545,8 @@ impl<'a> UpdatePlanBuilder<'a> { let artifact_hash_id = ArtifactHashId { kind: artifact_kind.clone(), hash: artifact_hash }; - let data = self.extracted_artifacts.store(artifact_hash_id, reader)?; + let data = + self.extracted_artifacts.store(artifact_hash_id, stream).await?; record_extracted_artifact( artifact_id, @@ -529,11 +560,80 @@ impl<'a> UpdatePlanBuilder<'a> { Ok(()) } - // RoT, host OS, and trampoline OS artifacts all contain a pair of artifacts - // we actually care about (RoT: A/B images; host/trampoline: phase1/phase2 - // images). This method is a helper that converts a single artifact `reader` - // into a pair of extracted artifacts. + /// A helper that converts a single artifact `stream` into a pair of + /// extracted artifacts. + /// + /// RoT, host OS, and trampoline OS artifacts all contain a pair of + /// artifacts we actually care about (RoT: A/B images; host/trampoline: + /// phase1/phase2 images). This method is a helper to extract that. + /// + /// This method uses a `block_in_place` call into synchronous code, because + /// the value of changing tufaceous to do async tarball extraction is + /// honestly pretty dubious. + /// + /// The main costs of this are that: + /// 1. This code can only be used with multithreaded Tokio executors. (This + /// is OK for production, but does require that our tests use `flavor = + /// "multi_thread"`.) + /// 2. Parallelizing extraction is harder if we ever want to do that in the + /// future. (It can be done using the async-scoped crate, though.) + /// + /// Depending on how things shake out, we may want to revisit this in the + /// future. fn extract_nested_artifact_pair<F>( + stream: impl Stream<Item = Result<Bytes, tough::error::Error>> + Send, + extracted_artifacts: &mut ExtractedArtifacts, + kind: KnownArtifactKind, + extract: F, + ) -> Result< + (ExtractedArtifactDataHandle, ExtractedArtifactDataHandle), + RepositoryError, + > + where + F: FnOnce( + &mut dyn io::BufRead, + &mut HashingNamedUtf8TempFile, + &mut HashingNamedUtf8TempFile, + ) -> anyhow::Result<()> + + Send, + { + // Since stream isn't guaranteed to be 'static, we have to use + // block_in_place here, not spawn_blocking. This does mean that the + // current task is taken over, and that this function can only be used + // from a multithreaded Tokio runtime. + // + // An alternative would be to use the `async-scoped` crate. However: + // + // - We would only spawn one task there. + // - The only safe use of async-scoped is with the `scope_and_block` + // call, which uses `tokio::task::block_in_place` anyway. + // - async-scoped also requires a multithreaded Tokio runtime. + // + // If we ever want to parallelize extraction across all the different + // artifacts, `async-scoped` would be a good fit. + tokio::task::block_in_place(|| { + let stream = std::pin::pin!(stream); + let reader = + tokio_util::io::StreamReader::new(stream.map_err(|error| { + // StreamReader requires a conversion from tough's errors to + // std::io::Error. + std::io::Error::new(io::ErrorKind::Other, error) + })); + + // RotArchives::extract_into takes a synchronous reader, so we need + // to use this bridge. The bridge can only be used from a blocking + // context.
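(Aside: the whole bridge reduces to a small reusable pattern. A hedged sketch assuming a multithreaded runtime, with read_to_end standing in for the synchronous extraction function:)

use bytes::Bytes;
use futures::Stream;
use std::io::Read;
use tokio_util::io::{StreamReader, SyncIoBridge};

// Convert an async byte stream into a blocking Read without first copying
// it into memory. block_in_place panics on a current_thread runtime, which
// is why the callers above require the multi_thread flavor.
fn read_stream_blocking(
    stream: impl Stream<Item = Result<Bytes, std::io::Error>> + Send,
) -> std::io::Result<Vec<u8>> {
    tokio::task::block_in_place(|| {
        let stream = std::pin::pin!(stream);
        // StreamReader: Stream of byte chunks -> AsyncRead.
        let reader = StreamReader::new(stream);
        // SyncIoBridge: AsyncRead -> blocking Read; it may only be driven
        // from a blocking context like this one.
        let mut reader = SyncIoBridge::new(reader);
        let mut out = Vec::new();
        reader.read_to_end(&mut out)?;
        Ok(out)
    })
}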
+ let mut reader = tokio_util::io::SyncIoBridge::new(reader); + + Self::extract_nested_artifact_pair_impl( + extracted_artifacts, + kind, + |out_a, out_b| extract(&mut reader, out_a, out_b), + ) + }) + } + + fn extract_nested_artifact_pair_impl<F>( extracted_artifacts: &mut ExtractedArtifacts, kind: KnownArtifactKind, extract: F, @@ -838,7 +938,9 @@ mod tests { builder.build_to_vec().unwrap() } - #[tokio::test] + // See documentation for extract_nested_artifact_pair for why multi_thread + // is required. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_update_plan_from_artifacts() { const VERSION_0: SemverVersion = SemverVersion::new(0, 0, 0); @@ -867,10 +969,11 @@ mod tests { .add_artifact( id, hash, - io::BufReader::new(io::Cursor::new(&data)), + futures::stream::iter([Ok(Bytes::from(data))]), &mut by_id, &mut by_hash, ) + .await .unwrap(); } @@ -889,10 +992,11 @@ mod tests { .add_artifact( id, hash, - io::BufReader::new(io::Cursor::new(&data)), + futures::stream::iter([Ok(Bytes::from(data))]), &mut by_id, &mut by_hash, ) + .await .unwrap(); } @@ -917,10 +1021,11 @@ mod tests { .add_artifact( id, hash, - io::BufReader::new(io::Cursor::new(&data)), + futures::stream::iter([Ok(Bytes::from(data))]), &mut by_id, &mut by_hash, ) + .await .unwrap(); } } @@ -945,10 +1050,11 @@ mod tests { .add_artifact( id, hash, - io::BufReader::new(io::Cursor::new(data)), + futures::stream::iter([Ok(data.clone())]), &mut by_id, &mut by_hash, ) + .await .unwrap(); } @@ -972,10 +1078,11 @@ mod tests { .add_artifact( id, hash, - io::BufReader::new(io::Cursor::new(data)), + futures::stream::iter([Ok(data.clone())]), &mut by_id, &mut by_hash, ) + .await .unwrap(); } diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index f4b5db2476..a86ea35cc3 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -41,7 +41,6 @@ use installinator_common::InstallinatorSpec; use installinator_common::M2Slot; use installinator_common::WriteOutput; use omicron_common::api::external::SemverVersion; -use omicron_common::backoff; use omicron_common::update::ArtifactHash; use slog::error; use slog::info; @@ -103,12 +102,22 @@ struct SpUpdateData { } #[derive(Debug)] -struct UploadTrampolinePhase2ToMgsStatus { - hash: ArtifactHash, - // The upload task retries forever until it succeeds, so we don't need to - // keep a "tried but failed" variant here; we just need to know the ID of - // the uploaded image once it's done. - uploaded_image_id: Option<HostPhase2RecoveryImageId>, +enum UploadTrampolinePhase2ToMgsStatus { + Running { hash: ArtifactHash }, + Done { hash: ArtifactHash, uploaded_image_id: HostPhase2RecoveryImageId }, + Failed(Arc<anyhow::Error>), +} + +impl UploadTrampolinePhase2ToMgsStatus { + fn hash(&self) -> Option<ArtifactHash> { + match self { + UploadTrampolinePhase2ToMgsStatus::Running { hash } + | UploadTrampolinePhase2ToMgsStatus::Done { hash, .. } => { + Some(*hash) + } + UploadTrampolinePhase2ToMgsStatus::Failed(_) => None, + } + } } #[derive(Debug)] @@ -308,9 +317,8 @@ impl UpdateTracker { ) -> UploadTrampolinePhase2ToMgs { let artifact = plan.trampoline_phase_2.clone(); let (status_tx, status_rx) = - watch::channel(UploadTrampolinePhase2ToMgsStatus { + watch::channel(UploadTrampolinePhase2ToMgsStatus::Running { hash: artifact.data.hash(), - uploaded_image_id: None, }); let task = tokio::spawn(upload_trampoline_phase_2_to_mgs( self.mgs_client.clone(), @@ -426,8 +434,8 @@ impl<'tr> SpawnUpdateDriver for RealSpawnUpdateDriver<'tr> { // this artifact?
If not, cancel the old task (which // might still be trying to upload) and start a new one // with our current image. - if prev.status.borrow().hash - != plan.trampoline_phase_2.data.hash() + if prev.status.borrow().hash() + != Some(plan.trampoline_phase_2.data.hash()) { // It does _not_ match - we have a new plan with a // different trampoline image. If the old task is @@ -1147,19 +1155,38 @@ impl UpdateDriver { // We expect this loop to run just once, but iterate just in // case the image ID doesn't get populated the first time. loop { + match &*upload_trampoline_phase_2_to_mgs.borrow_and_update() + { + UploadTrampolinePhase2ToMgsStatus::Running { .. } => { + // fall through to `.changed()` below + }, + UploadTrampolinePhase2ToMgsStatus::Done { + uploaded_image_id, + .. + } => { + return StepSuccess::new( + uploaded_image_id.clone(), + ).into(); + } + UploadTrampolinePhase2ToMgsStatus::Failed(error) => { + let error = Arc::clone(error); + return Err(UpdateTerminalError::TrampolinePhase2UploadFailed { + error, + }); + } + } + + // `upload_trampoline_phase_2_to_mgs` holds onto the sending + // half of this channel until all receivers are gone, so the + // only way we can fail to receive here is if that task + // panicked (which would abort our process) or was cancelled + // (because a new TUF repo has been uploaded), in which case + // we should fail the current update. upload_trampoline_phase_2_to_mgs.changed().await.map_err( |_recv_err| { - UpdateTerminalError::TrampolinePhase2UploadFailed + UpdateTerminalError::TrampolinePhase2UploadCancelled } )?; - - if let Some(image_id) = upload_trampoline_phase_2_to_mgs - .borrow() - .uploaded_image_id - .as_ref() - { - return StepSuccess::new(image_id.clone()).into(); - } } }, ).register(); @@ -2149,59 +2176,68 @@ async fn upload_trampoline_phase_2_to_mgs( status: watch::Sender<UploadTrampolinePhase2ToMgsStatus>, log: Logger, ) { - let data = artifact.data; - let hash = data.hash(); - let upload_task = move || { - let mgs_client = mgs_client.clone(); - let data = data.clone(); - - async move { - let image_stream = data.reader_stream().await.map_err(|e| { - // TODO-correctness If we get an I/O error opening the file - // associated with `data`, is it actually a transient error? If - // we change this to `permanent` we'll have to do some different - // error handling below and at our call site to retry. We - // _shouldn't_ get errors from `reader_stream()` in general, so - // it's probably okay either way? - backoff::BackoffError::transient(format!("{e:#}")) - })?; - mgs_client - .recovery_host_phase2_upload(reqwest::Body::wrap_stream( - image_stream, - )) - .await - .map_err(|e| backoff::BackoffError::transient(e.to_string())) - } - }; + // We make at most 3 attempts to upload the trampoline to our local MGS, + // sleeping briefly between attempts if we fail.
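(Aside: the task/driver handshake here reduces to a generic sketch: a producer that retries a bounded number of times, publishes a terminal status on a watch channel, and waits for receivers to go away; and a consumer that inspects the current value before waiting for changes, so a terminal status sent earlier is never missed. The String error and u32 image id are illustrative stand-ins, not the wicketd types:)

use std::time::Duration;
use tokio::sync::watch;

#[derive(Clone, Debug)]
enum UploadStatus {
    Running,
    Done(u32),
    Failed(String),
}

// Producer: bounded retries, then publish a terminal status and wait for
// all receivers to go away before exiting.
async fn upload_task(status: watch::Sender<UploadStatus>, max_attempts: usize) {
    let mut attempt = 1;
    let final_status = loop {
        match try_upload().await {
            Ok(id) => break UploadStatus::Done(id),
            Err(_) if attempt < max_attempts => {
                attempt += 1;
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
            Err(err) => break UploadStatus::Failed(err),
        }
    };
    status.send_replace(final_status);
    status.closed().await;
}

// Consumer: borrow_and_update marks the current value as seen, so the
// changed() call below only wakes for a strictly newer value.
async fn wait_for_upload(
    mut rx: watch::Receiver<UploadStatus>,
) -> Result<u32, String> {
    loop {
        match &*rx.borrow_and_update() {
            UploadStatus::Running => {}
            UploadStatus::Done(id) => return Ok(*id),
            UploadStatus::Failed(err) => return Err(err.clone()),
        }
        // The sender only drops after publishing a terminal status, so an
        // error here means the producer was cancelled.
        rx.changed().await.map_err(|_| "upload task cancelled".to_string())?;
    }
}

// Hypothetical stand-in for the MGS upload call.
async fn try_upload() -> Result<u32, String> {
    Ok(42)
}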
+ const MAX_ATTEMPTS: usize = 3; + const SLEEP_BETWEEN_ATTEMPTS: Duration = Duration::from_secs(1); + + let mut attempt = 1; + let final_status = loop { + let image_stream = match artifact.data.reader_stream().await { + Ok(stream) => stream, + Err(err) => { + error!( + log, "failed to read trampoline phase 2"; + "err" => #%err, + ); + break UploadTrampolinePhase2ToMgsStatus::Failed(Arc::new( + err.context("failed to read trampoline phase 2"), + )); + } + }; - let log_failure = move |err, delay| { - warn!( - log, - "failed to upload trampoline phase 2 to MGS, will retry in {:?}", - delay; - "err" => %err, - ); + match mgs_client + .recovery_host_phase2_upload(reqwest::Body::wrap_stream( + image_stream, + )) + .await + { + Ok(response) => { + break UploadTrampolinePhase2ToMgsStatus::Done { + hash: artifact.data.hash(), + uploaded_image_id: response.into_inner(), + }; + } + Err(err) => { + if attempt < MAX_ATTEMPTS { + error!( + log, "failed to upload trampoline phase 2 to MGS; \ + will retry after {SLEEP_BETWEEN_ATTEMPTS:?}"; + "attempt" => attempt, + "err" => %DisplayErrorChain::new(&err), + ); + tokio::time::sleep(SLEEP_BETWEEN_ATTEMPTS).await; + attempt += 1; + continue; + } else { + error!( + log, "failed to upload trampoline phase 2 to MGS; \ + giving up"; + "attempt" => attempt, + "err" => %DisplayErrorChain::new(&err), + ); + break UploadTrampolinePhase2ToMgsStatus::Failed(Arc::new( + anyhow::Error::new(err) + .context("failed to upload trampoline phase 2"), + )); + } + } + } }; - // retry_policy_internal_service_aggressive() retries forever, so we can - // unwrap this call to retry_notify - let uploaded_image_id = backoff::retry_notify( - backoff::retry_policy_internal_service_aggressive(), - upload_task, - log_failure, - ) - .await - .unwrap() - .into_inner(); - - // Notify all receivers that we've uploaded the image. - _ = status.send(UploadTrampolinePhase2ToMgsStatus { - hash, - uploaded_image_id: Some(uploaded_image_id), - }); - - // Wait for all receivers to be gone before we exit, so they don't get recv - // errors unless we're cancelled. + // Send our final status, then wait for all receivers to be gone before we + // exit, so they don't get recv errors unless we're cancelled. + status.send_replace(final_status); status.closed().await; } diff --git a/wicketd/tests/integration_tests/updates.rs b/wicketd/tests/integration_tests/updates.rs index fb1637f44e..52bf1d1283 100644 --- a/wicketd/tests/integration_tests/updates.rs +++ b/wicketd/tests/integration_tests/updates.rs @@ -31,7 +31,9 @@ use wicketd_client::types::{ StartUpdateParams, }; -#[tokio::test] +// See documentation for extract_nested_artifact_pair in update_plan.rs for why +// multi_thread is required. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_updates() { let gateway = gateway_setup::test_setup("test_updates", SpPort::One).await; let wicketd_testctx = WicketdTestContext::setup(gateway).await; @@ -48,7 +50,7 @@ async fn test_updates() { ]) .expect("args parsed correctly"); - args.exec(log).expect("assemble command completed successfully"); + args.exec(log).await.expect("assemble command completed successfully"); // Read the archive and upload it to the server. let zip_bytes = @@ -175,15 +177,15 @@ async fn test_updates() { StepEventKind::ExecutionFailed { failed_step, .. } => { // TODO: obviously we shouldn't stop here, get past more of the // update process in this test. 
- assert_eq!(failed_step.info.component, UpdateComponent::Rot); + assert_eq!(failed_step.info.component, UpdateComponent::Host); } other => { panic!("unexpected terminal event kind: {other:?}"); } } - // Try starting the update again -- this should fail because we require that update state is - // cleared before starting a new one. + // Try starting the update again -- this should fail because we require that + // update state is cleared before starting a new one. { let error = wicketd_testctx .wicketd_client @@ -195,8 +197,8 @@ async fn test_updates() { ); let error_str = error.to_string(); assert!( - // Errors lose type information across the OpenAPI boundary, so sadly we have to match on - // the error string. + // Errors lose type information across the OpenAPI boundary, so + // sadly we have to match on the error string. error_str.contains("existing update data found"), "unexpected error: {error_str}" ); @@ -258,7 +260,9 @@ async fn test_updates() { wicketd_testctx.teardown().await; } -#[tokio::test] +// See documentation for extract_nested_artifact_pair in update_plan.rs for why +// multi_thread is required. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_installinator_fetch() { let gateway = gateway_setup::test_setup("test_updates", SpPort::One).await; let wicketd_testctx = WicketdTestContext::setup(gateway).await; @@ -275,7 +279,7 @@ async fn test_installinator_fetch() { ]) .expect("args parsed correctly"); - args.exec(log).expect("assemble command completed successfully"); + args.exec(log).await.expect("assemble command completed successfully"); // Read the archive and upload it to the server. let zip_bytes = @@ -391,7 +395,9 @@ async fn test_installinator_fetch() { wicketd_testctx.teardown().await; } -#[tokio::test] +// See documentation for extract_nested_artifact_pair in update_plan.rs for why +// multi_thread is required. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_update_races() { let gateway = gateway_setup::test_setup( "test_artifact_upload_while_updating", @@ -412,7 +418,7 @@ async fn test_update_races() { ]) .expect("args parsed correctly"); - args.exec(log).expect("assemble command completed successfully"); + args.exec(log).await.expect("assemble command completed successfully"); // Read the archive and upload it to the server. 
let zip_bytes = diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 4d416eca02..1a289bd0cb 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -85,15 +85,17 @@ sha2 = { version = "0.10.8", features = ["oid"] } signature = { version = "2.1.0", default-features = false, features = ["digest", "rand_core", "std"] } similar = { version = "2.2.1", features = ["inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } +snafu = { version = "0.7.5", features = ["futures"] } spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extra-traits", "fold", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.32", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.27", features = ["formatting", "local-offset", "macros", "parsing"] } -tokio = { version = "1.33.0", features = ["full", "test-util"] } +tokio = { version = "1.34.0", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.14", features = ["net"] } +tokio-util = { version = "0.7.10", features = ["codec", "io-util"] } toml = { version = "0.7.8" } toml_edit-647d43efb71741da = { package = "toml_edit", version = "0.21.0", features = ["serde"] } tracing = { version = "0.1.37", features = ["log"] } @@ -101,7 +103,7 @@ trust-dns-proto = { version = "0.22.0" } unicode-bidi = { version = "0.3.13" } unicode-normalization = { version = "0.1.22" } usdt = { version = "0.3.5" } -uuid = { version = "1.5.0", features = ["serde", "v4"] } +uuid = { version = "1.6.1", features = ["serde", "v4"] } yasna = { version = "0.5.2", features = ["bit-vec", "num-bigint", "std", "time"] } zeroize = { version = "1.6.0", features = ["std", "zeroize_derive"] } zip = { version = "0.6.6", default-features = false, features = ["bzip2", "deflate"] } @@ -178,6 +180,7 @@ sha2 = { version = "0.10.8", features = ["oid"] } signature = { version = "2.1.0", default-features = false, features = ["digest", "rand_core", "std"] } similar = { version = "2.2.1", features = ["inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } +snafu = { version = "0.7.5", features = ["futures"] } spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } @@ -185,9 +188,10 @@ syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extr syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.32", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.27", features = ["formatting", "local-offset", "macros", "parsing"] } time-macros = { version = "0.2.13", default-features = false, features = ["formatting", "parsing"] } -tokio = { version = "1.33.0", features = ["full", "test-util"] } +tokio = { version = "1.34.0", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.14", features = ["net"] } +tokio-util = { version = "0.7.10", features = ["codec", "io-util"] } toml = { version = "0.7.8" } toml_edit-647d43efb71741da = { package = "toml_edit", version = "0.21.0", features 
= ["serde"] } tracing = { version = "0.1.37", features = ["log"] } @@ -195,7 +199,7 @@ trust-dns-proto = { version = "0.22.0" } unicode-bidi = { version = "0.3.13" } unicode-normalization = { version = "0.1.22" } usdt = { version = "0.3.5" } -uuid = { version = "1.5.0", features = ["serde", "v4"] } +uuid = { version = "1.6.1", features = ["serde", "v4"] } yasna = { version = "0.5.2", features = ["bit-vec", "num-bigint", "std", "time"] } zeroize = { version = "1.6.0", features = ["std", "zeroize_derive"] } zip = { version = "0.6.6", default-features = false, features = ["bzip2", "deflate"] } @@ -203,49 +207,49 @@ zip = { version = "0.6.6", default-features = false, features = ["bzip2", "defla [target.x86_64-unknown-linux-gnu.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.2" } -mio = { version = "0.8.8", features = ["net", "os-ext"] } +mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } [target.x86_64-unknown-linux-gnu.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.2" } -mio = { version = "0.8.8", features = ["net", "os-ext"] } +mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } [target.x86_64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.2" } -mio = { version = "0.8.8", features = ["net", "os-ext"] } +mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } [target.x86_64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.2" } -mio = { version = "0.8.8", features = ["net", "os-ext"] } +mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } [target.aarch64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.2" } -mio = { version = "0.8.8", features = ["net", "os-ext"] } +mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } [target.aarch64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.2" } -mio = { version = "0.8.8", features = ["net", "os-ext"] } +mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } [target.x86_64-unknown-illumos.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = 
"0.24.2" } -mio = { version = "0.8.8", features = ["net", "os-ext"] } +mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } @@ -254,7 +258,7 @@ toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", featu [target.x86_64-unknown-illumos.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.2" } -mio = { version = "0.8.8", features = ["net", "os-ext"] } +mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] }