From abfca0b50430593f9aca90902c75f3a3297130c2 Mon Sep 17 00:00:00 2001 From: Jordan Hendricks Date: Thu, 25 May 2023 11:10:32 -0700 Subject: [PATCH 01/14] Allow VMM reservoir to be configured by sled-agent (#3200) Co-authored-by: Sean Klein --- Cargo.lock | 5 +- Cargo.toml | 1 + common/src/sql/dbinit.sql | 1 + illumos-utils/Cargo.toml | 1 + illumos-utils/src/lib.rs | 1 + illumos-utils/src/vmm_reservoir.rs | 66 ++++++++++++++++ nexus/db-model/src/schema.rs | 1 + nexus/db-model/src/sled.rs | 9 ++- nexus/db-queries/src/db/datastore/mod.rs | 2 + nexus/db-queries/src/db/datastore/sled.rs | 81 ++++++++++++++++++++ nexus/src/app/sled.rs | 1 + nexus/test-utils/src/lib.rs | 3 + nexus/types/src/internal_api/params.rs | 5 ++ openapi/nexus-internal.json | 9 +++ sled-agent/src/bin/sled-agent-sim.rs | 1 + sled-agent/src/config.rs | 3 + sled-agent/src/instance_manager.rs | 62 +++++++++++++++ sled-agent/src/sim/config.rs | 1 + sled-agent/src/sim/server.rs | 4 + sled-agent/src/sled_agent.rs | 26 ++++++- smf/sled-agent/gimlet-standalone/config.toml | 4 + smf/sled-agent/gimlet/config.toml | 4 + smf/sled-agent/non-gimlet/config.toml | 4 + 23 files changed, 289 insertions(+), 6 deletions(-) create mode 100644 illumos-utils/src/vmm_reservoir.rs diff --git a/Cargo.lock b/Cargo.lock index 54005f2493..7101e6c1cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1990,7 +1990,7 @@ dependencies = [ [[package]] name = "dropshot" version = "0.9.1-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#616e9fbda73ca90ee40dfea8283fe62437af99ff" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#61c035b1f70d7b2c2e2ee2481e850105bf15ff31" dependencies = [ "async-stream", "async-trait", @@ -2033,7 +2033,7 @@ dependencies = [ [[package]] name = "dropshot_endpoint" version = "0.9.1-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#616e9fbda73ca90ee40dfea8283fe62437af99ff" +source = 
"git+https://github.com/oxidecomputer/dropshot?branch=main#61c035b1f70d7b2c2e2ee2481e850105bf15ff31" dependencies = [ "proc-macro2", "quote", @@ -3168,6 +3168,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "bhyve_api", "camino", "cfg-if 1.0.0", "futures", diff --git a/Cargo.toml b/Cargo.toml index 6afa748019..9836accda4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -250,6 +250,7 @@ pretty-hex = "0.3.0" proc-macro2 = "1.0" progenitor = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" } progenitor-client = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" } +bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "5f694f4833a4368e05764a3b4b0fcaa6feed54df" } propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "5f694f4833a4368e05764a3b4b0fcaa6feed54df", features = [ "generated-migration" ] } propolis-server = { git = "https://github.com/oxidecomputer/propolis", rev = "5f694f4833a4368e05764a3b4b0fcaa6feed54df", default-features = false, features = ["mock-only"] } proptest = "1.1.0" diff --git a/common/src/sql/dbinit.sql b/common/src/sql/dbinit.sql index dfa27d2d7b..d5eef59247 100644 --- a/common/src/sql/dbinit.sql +++ b/common/src/sql/dbinit.sql @@ -84,6 +84,7 @@ CREATE TABLE omicron.public.sled ( /* CPU & RAM summary for the sled */ usable_hardware_threads INT8 CHECK (usable_hardware_threads BETWEEN 0 AND 4294967295) NOT NULL, usable_physical_ram INT8 NOT NULL, + reservoir_size INT8 CHECK (reservoir_size < usable_physical_ram) NOT NULL, /* The IP address and bound port of the sled agent server. 
*/ ip INET NOT NULL, diff --git a/illumos-utils/Cargo.toml b/illumos-utils/Cargo.toml index b3c619d01c..2ad76d579f 100644 --- a/illumos-utils/Cargo.toml +++ b/illumos-utils/Cargo.toml @@ -8,6 +8,7 @@ license = "MPL-2.0" [dependencies] anyhow.workspace = true async-trait.workspace = true +bhyve_api.workspace = true camino.workspace = true cfg-if.workspace = true futures.workspace = true diff --git a/illumos-utils/src/lib.rs b/illumos-utils/src/lib.rs index a988c2e88a..ed6ab8e13d 100644 --- a/illumos-utils/src/lib.rs +++ b/illumos-utils/src/lib.rs @@ -15,6 +15,7 @@ pub mod link; pub mod opte; pub mod running_zone; pub mod svc; +pub mod vmm_reservoir; pub mod zfs; pub mod zone; pub mod zpool; diff --git a/illumos-utils/src/vmm_reservoir.rs b/illumos-utils/src/vmm_reservoir.rs new file mode 100644 index 0000000000..2ff2925fcc --- /dev/null +++ b/illumos-utils/src/vmm_reservoir.rs @@ -0,0 +1,66 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Wrappers around VMM Reservoir controls + +use omicron_common::api::external::ByteCount; + +#[allow(non_upper_case_globals)] +const MiB: u64 = 1024 * 1024; +#[allow(non_upper_case_globals)] +const GiB: u64 = 1024 * 1024 * 1024; + +/// The control plane's required alignment for a reservoir size request. +// +// The ioctl interface for resizing the reservoir only requires rounding the +// requested size to PAGESIZE. But we choose a larger value here of 2MiB to +// better enable large pages for guests in the future: Currently, the reservoir +// does not have support for large pages, but choosing a size that aligns with +// the minimum large page size on x86 (2 MiB) will simplify things later. +/// +pub const RESERVOIR_SZ_ALIGN: u64 = 2 * MiB; + +// Chunk size to request when resizing the reservoir. It's okay if this value is +// greater than the requested size. 
+const RESERVOIR_CHUNK_SZ: usize = GiB as usize; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Reservoir size must be aligned to 2 MiB: {0}")] + InvalidSize(ByteCount), + + #[error("Failed to resize reservoir: {0}")] + ReservoirError(#[from] std::io::Error), +} + +/// Controls the size of the memory reservoir +pub struct ReservoirControl {} + +impl ReservoirControl { + /// Sets the reservoir to a particular size. + /// + /// The size must be aligned on RESERVOIR_SZ_ALIGN. + pub fn set(size: ByteCount) -> Result<(), Error> { + if !reservoir_size_is_aligned(size.to_bytes()) { + return Err(Error::InvalidSize(size)); + } + + let ctl = bhyve_api::VmmCtlFd::open()?; + ctl.reservoir_resize( + size.to_bytes().try_into().map_err(|_| Error::InvalidSize(size))?, + RESERVOIR_CHUNK_SZ, + ) + .map_err(std::io::Error::from)?; + + Ok(()) + } +} + +pub fn align_reservoir_size(size_bytes: u64) -> u64 { + size_bytes - (size_bytes % RESERVOIR_SZ_ALIGN) +} + +pub fn reservoir_size_is_aligned(size_bytes: u64) -> bool { + (size_bytes % RESERVOIR_SZ_ALIGN) == 0 +} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 0ff6c258bf..6f518b55a0 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -680,6 +680,7 @@ table! { usable_hardware_threads -> Int8, usable_physical_ram -> Int8, + reservoir_size -> Int8, ip -> Inet, port -> Int4, diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index 22d1b1ddd3..1f1fd78631 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -28,6 +28,9 @@ pub struct SledSystemHardware { pub is_scrimlet: bool, pub usable_hardware_threads: u32, pub usable_physical_ram: ByteCount, + + // current VMM reservoir size + pub reservoir_size: ByteCount, } /// Database representation of a Sled. 
@@ -46,8 +49,9 @@ pub struct Sled { part_number: String, revision: i64, - usable_hardware_threads: SqlU32, - usable_physical_ram: ByteCount, + pub usable_hardware_threads: SqlU32, + pub usable_physical_ram: ByteCount, + pub reservoir_size: ByteCount, // ServiceAddress (Sled Agent). pub ip: ipv6::Ipv6Addr, @@ -83,6 +87,7 @@ impl Sled { hardware.usable_hardware_threads, ), usable_physical_ram: hardware.usable_physical_ram, + reservoir_size: hardware.reservoir_size, ip: ipv6::Ipv6Addr::from(addr.ip()), port: addr.port().into(), last_used_address, diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 2fc4d377fe..22c3f2c161 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -325,6 +325,8 @@ mod test { usable_hardware_threads: 4, usable_physical_ram: crate::db::model::ByteCount::try_from(1 << 40) .unwrap(), + reservoir_size: crate::db::model::ByteCount::try_from(1 << 39) + .unwrap(), } } diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 47a95441ad..a70ec26d8c 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -41,6 +41,9 @@ impl DataStore { dsl::port.eq(sled.port), dsl::rack_id.eq(sled.rack_id), dsl::is_scrimlet.eq(sled.is_scrimlet()), + dsl::usable_hardware_threads.eq(sled.usable_hardware_threads), + dsl::usable_physical_ram.eq(sled.usable_physical_ram), + dsl::reservoir_size.eq(sled.reservoir_size), )) .returning(Sled::as_returning()) .get_result_async(self.pool()) @@ -202,3 +205,81 @@ impl DataStore { Ok(()) } } + +#[cfg(test)] +mod test { + use super::*; + use crate::db::datastore::datastore_test; + use crate::db::datastore::test::{ + sled_baseboard_for_test, sled_system_hardware_for_test, + }; + use crate::db::model::ByteCount; + use crate::db::model::SqlU32; + use nexus_test_utils::db::test_setup_database; + use omicron_common::api::external; + use 
omicron_test_utils::dev; + use std::net::{Ipv6Addr, SocketAddrV6}; + + fn rack_id() -> Uuid { + Uuid::parse_str(nexus_test_utils::RACK_UUID).unwrap() + } + + #[tokio::test] + async fn upsert_sled_updates_hardware() { + let logctx = dev::test_setup_log("upsert_sled"); + let mut db = test_setup_database(&logctx.log).await; + let (_opctx, datastore) = datastore_test(&logctx, &db).await; + + let sled_id = Uuid::new_v4(); + let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); + let mut sled = Sled::new( + sled_id, + addr, + sled_baseboard_for_test(), + sled_system_hardware_for_test(), + rack_id(), + ); + let observed_sled = datastore + .sled_upsert(sled.clone()) + .await + .expect("Could not upsert sled during test prep"); + assert_eq!( + observed_sled.usable_hardware_threads, + sled.usable_hardware_threads + ); + assert_eq!(observed_sled.usable_physical_ram, sled.usable_physical_ram); + assert_eq!(observed_sled.reservoir_size, sled.reservoir_size); + + // Modify the sizes of hardware + sled.usable_hardware_threads = + SqlU32::new(sled.usable_hardware_threads.0 + 1); + const MIB: u64 = 1024 * 1024; + sled.usable_physical_ram = ByteCount::from( + external::ByteCount::try_from( + sled.usable_physical_ram.0.to_bytes() + MIB, + ) + .unwrap(), + ); + sled.reservoir_size = ByteCount::from( + external::ByteCount::try_from( + sled.reservoir_size.0.to_bytes() + MIB, + ) + .unwrap(), + ); + + // Test that upserting the sled propagates those changes to the DB. 
+ let observed_sled = datastore + .sled_upsert(sled.clone()) + .await + .expect("Could not upsert sled during test prep"); + assert_eq!( + observed_sled.usable_hardware_threads, + sled.usable_hardware_threads + ); + assert_eq!(observed_sled.usable_physical_ram, sled.usable_physical_ram); + assert_eq!(observed_sled.reservoir_size, sled.reservoir_size); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 1e2da91f65..37d07a6d67 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -63,6 +63,7 @@ impl super::Nexus { is_scrimlet, usable_hardware_threads: info.usable_hardware_threads, usable_physical_ram: info.usable_physical_ram.into(), + reservoir_size: info.reservoir_size.into(), }, self.rack_id, ); diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 07a268726b..20b283517f 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -50,6 +50,8 @@ pub const PRODUCER_UUID: &str = "a6458b7d-87c3-4483-be96-854d814c20de"; pub const TEST_HARDWARE_THREADS: u32 = 16; /// The reported amount of physical RAM for an emulated sled agent. pub const TEST_PHYSICAL_RAM: u64 = 32 * (1 << 30); +/// The reported amount of VMM reservoir RAM for an emulated sled agent. 
+pub const TEST_RESERVOIR_RAM: u64 = 16 * (1 << 30); /// Password for the user created by the test suite /// @@ -402,6 +404,7 @@ pub async fn start_sled_agent( hardware: sim::ConfigHardware { hardware_threads: TEST_HARDWARE_THREADS, physical_ram: TEST_PHYSICAL_RAM, + reservoir_ram: TEST_RESERVOIR_RAM, }, }; diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index 7d28194503..68623778ed 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -59,6 +59,11 @@ pub struct SledAgentStartupInfo { /// Amount of RAM which may be used by the Sled's OS pub usable_physical_ram: ByteCount, + + /// Amount of RAM dedicated to the VMM reservoir + /// + /// Must be smaller than "usable_physical_ram" + pub reservoir_size: ByteCount, } /// Describes the type of physical disk. diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index cf594ec9e9..4643d832cd 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2894,6 +2894,14 @@ } ] }, + "reservoir_size": { + "description": "Amount of RAM dedicated to the VMM reservoir\n\nMust be smaller than \"usable_physical_ram\"", + "allOf": [ + { + "$ref": "#/components/schemas/ByteCount" + } + ] + }, "role": { "description": "Describes the responsibilities of the sled", "allOf": [ @@ -2923,6 +2931,7 @@ }, "required": [ "baseboard", + "reservoir_size", "role", "sa_address", "usable_hardware_threads", diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index a9078f84fb..baff3fd85a 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -95,6 +95,7 @@ async fn do_run() -> Result<(), CmdError> { hardware: ConfigHardware { hardware_threads: 32, physical_ram: 64 * (1 << 30), + reservoir_ram: 32 * (1 << 30), }, }; diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index 34618a202f..461b1bdcb7 100644 --- a/sled-agent/src/config.rs +++ 
b/sled-agent/src/config.rs @@ -41,6 +41,7 @@ pub struct SoftPortConfig { /// Configuration for a sled agent #[derive(Clone, Debug, Deserialize)] +#[serde(deny_unknown_fields)] pub struct Config { /// Configuration for the sled agent debug log pub log: ConfigLogging, @@ -48,6 +49,8 @@ pub struct Config { pub sled_mode: SledMode, // TODO: Remove once this can be auto-detected. pub sidecar_revision: SidecarRevision, + /// Optional percentage of DRAM to reserve for guest memory + pub vmm_reservoir_percentage: Option, /// Optional VLAN ID to be used for tagging guest VNICs. pub vlan: Option, /// Optional list of zpools to be used as "discovered disks". diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 7ff2212692..e4d082a05f 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -13,6 +13,8 @@ use illumos_utils::dladm::Etherstub; use illumos_utils::link::VnicAllocator; use illumos_utils::opte::params::SetVirtualNetworkInterfaceHost; use illumos_utils::opte::PortManager; +use illumos_utils::vmm_reservoir; +use omicron_common::api::external::ByteCount; use omicron_common::api::internal::nexus::InstanceRuntimeState; use slog::Logger; use std::collections::BTreeMap; @@ -35,6 +37,9 @@ pub enum Error { #[error("OPTE port management error: {0}")] Opte(#[from] illumos_utils::opte::Error), + #[error("Failed to create reservoir: {0}")] + Reservoir(#[from] vmm_reservoir::Error), + #[error("Cannot find data link: {0}")] Underlay(#[from] sled_hardware::underlay::Error), } @@ -43,6 +48,9 @@ struct InstanceManagerInternal { log: Logger, lazy_nexus_client: LazyNexusClient, + /// Last set size of the VMM reservoir (in bytes) + reservoir_size: Mutex, + // TODO: If we held an object representing an enum of "Created OR Running" // instance, we could avoid the methods within "instance.rs" that panic // if the Propolis client hasn't been initialized. 
@@ -70,6 +78,9 @@ impl InstanceManager { inner: Arc::new(InstanceManagerInternal { log: log.new(o!("component" => "InstanceManager")), lazy_nexus_client, + + // no reservoir size set on startup + reservoir_size: Mutex::new(ByteCount::from_kibibytes_u32(0)), instances: Mutex::new(BTreeMap::new()), vnic_allocator: VnicAllocator::new("Instance", etherstub), port_manager, @@ -77,6 +88,57 @@ impl InstanceManager { }) } + /// Sets the VMM reservoir size to the requested (nonzero) percentage of + /// usable physical RAM, rounded down to nearest aligned size required by + /// the control plane. + pub fn set_reservoir_size( + &self, + hardware: &sled_hardware::HardwareManager, + target_percent: u8, + ) -> Result<(), Error> { + assert!( + target_percent > 0 && target_percent < 100, + "target_percent {} must be nonzero and < 100", + target_percent + ); + + let req_bytes = (hardware.usable_physical_ram_bytes() as f64 + * (target_percent as f64 / 100.0)) + .floor() as u64; + let req_bytes_aligned = vmm_reservoir::align_reservoir_size(req_bytes); + + if req_bytes_aligned == 0 { + warn!( + self.inner.log, + "Requested reservoir size of {} bytes < minimum aligned size of {} bytes", + req_bytes, vmm_reservoir::RESERVOIR_SZ_ALIGN); + return Ok(()); + } + + // The max ByteCount value is i64::MAX, which is ~8 million TiB. As this + // value is a percentage of DRAM, constructing this should always work. 
+ let reservoir_size = ByteCount::try_from(req_bytes_aligned).unwrap(); + info!( + self.inner.log, + "Setting reservoir size to {} bytes \ + ({}% of {} total = {} bytes requested)", + reservoir_size, + target_percent, + hardware.usable_physical_ram_bytes(), + req_bytes, + ); + vmm_reservoir::ReservoirControl::set(reservoir_size)?; + + *self.inner.reservoir_size.lock().unwrap() = reservoir_size; + + Ok(()) + } + + /// Returns the last-set size of the reservoir + pub fn reservoir_size(&self) -> ByteCount { + *self.inner.reservoir_size.lock().unwrap() + } + /// Ensures that the instance manager contains a registered instance with /// the supplied instance ID and the Propolis ID specified in /// `initial_hardware`. diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index 52e3950919..62012a7109 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -46,6 +46,7 @@ pub struct ConfigStorage { pub struct ConfigHardware { pub hardware_threads: u32, pub physical_ram: u64, + pub reservoir_ram: u64, } /// Configuration for a sled agent diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index ac95ae63cb..14414a995c 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -110,6 +110,10 @@ impl Server { config.hardware.physical_ram, ) .unwrap(), + reservoir_size: NexusTypes::ByteCount::try_from( + config.hardware.reservoir_ram, + ) + .unwrap(), }, ) .await) diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index be21f6c3e8..47c8fc6c7f 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -239,6 +239,9 @@ impl SledAgent { }) .await?; + let hardware = HardwareManager::new(&parent_log, services.sled_mode()) + .map_err(|e| Error::Hardware(e))?; + let instances = InstanceManager::new( parent_log.clone(), lazy_nexus_client.clone(), @@ -246,8 +249,23 @@ impl SledAgent { port_manager.clone(), )?; - let hardware = HardwareManager::new(&parent_log, 
services.sled_mode()) - .map_err(|e| Error::Hardware(e))?; + match config.vmm_reservoir_percentage { + Some(sz) if sz > 0 && sz < 100 => { + instances.set_reservoir_size(&hardware, sz).map_err(|e| { + warn!(log, "Failed to set VMM reservoir size: {e}"); + e + })?; + } + Some(sz) if sz == 0 => { + warn!(log, "Not using VMM reservoir (size 0 bytes requested)"); + } + None => { + warn!(log, "Not using VMM reservoir"); + } + Some(sz) => { + panic!("invalid requested VMM reservoir percentage: {}", sz); + } + } let update_config = ConfigUpdates { zone_artifact_path: Utf8PathBuf::from("/opt/oxide"), @@ -408,6 +426,7 @@ impl SledAgent { self.inner.hardware.online_processor_count(); let usable_physical_ram = self.inner.hardware.usable_physical_ram_bytes(); + let reservoir_size = self.inner.instances.reservoir_size(); let log = log.clone(); let fut = async move { @@ -445,6 +464,9 @@ impl SledAgent { usable_physical_ram: nexus_client::types::ByteCount( usable_physical_ram, ), + reservoir_size: nexus_client::types::ByteCount( + reservoir_size.to_bytes(), + ), }, ) .await diff --git a/smf/sled-agent/gimlet-standalone/config.toml b/smf/sled-agent/gimlet-standalone/config.toml index b635fece89..2f3769fc89 100644 --- a/smf/sled-agent/gimlet-standalone/config.toml +++ b/smf/sled-agent/gimlet-standalone/config.toml @@ -18,6 +18,10 @@ sidecar_revision.physical = "b" # in-sync, rather than querying its NTP zone. skip_timesync = true +# Percentage of usable physical DRAM to use for the VMM reservoir, which +# guest memory is pulled from. +vmm_reservoir_percentage = 0 + # An optional data link from which we extract a MAC address. # This is used as a unique identifier for the bootstrap address. 
# diff --git a/smf/sled-agent/gimlet/config.toml b/smf/sled-agent/gimlet/config.toml index 6a76e15bbb..fe7c4f1f54 100644 --- a/smf/sled-agent/gimlet/config.toml +++ b/smf/sled-agent/gimlet/config.toml @@ -21,6 +21,10 @@ sidecar_revision.physical = "b" # $ dladm show-phys -p -o LINK data_link = "cxgbe0" +# Percentage of usable physical DRAM to use for the VMM reservoir, which +# guest memory is pulled from. +vmm_reservoir_percentage = 0 + [log] level = "info" mode = "file" diff --git a/smf/sled-agent/non-gimlet/config.toml b/smf/sled-agent/non-gimlet/config.toml index 9dbff8f194..22ef5120ee 100644 --- a/smf/sled-agent/non-gimlet/config.toml +++ b/smf/sled-agent/non-gimlet/config.toml @@ -32,6 +32,10 @@ zpools = [ "oxp_f4b4dc87-ab46-49fb-a4b4-d361ae214c03", ] +# Percentage of usable physical DRAM to use for the VMM reservoir, which +# guest memory is pulled from. +vmm_reservoir_percentage = 0 + # An optional data link from which we extract a MAC address. # This is used as a unique identifier for the bootstrap address. # From d16dc4296cf01d0399dda328684156229d273007 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 25 May 2023 14:25:28 -0700 Subject: [PATCH 02/14] Bump reedline from 0.19.0 to 0.19.1 (#3179) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [reedline](https://github.com/nushell/reedline) from 0.19.0 to 0.19.1.
Release notes

Sourced from reedline's releases.

0.19.1

Bugfix release for nushell 0.80.0

This release fixes the bracketed paste mode but doesn't include any new features.

What's Changed

Full Changelog: https://github.com/nushell/reedline/compare/v0.19.0...v0.19.1

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=reedline&package-manager=cargo&previous-version=0.19.0&new-version=0.19.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- wicket-dbg/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7101e6c1cc..4a433c39e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6267,9 +6267,9 @@ dependencies = [ [[package]] name = "reedline" -version = "0.19.0" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b8684e0f5d6fb8529156a19c637645dd7fd5018efbdeaa306f53f54e3b404de" +checksum = "31d763be5efcabcd27d6e804d126661215aa0a2d898c13c0aa1c953c6652fcec" dependencies = [ "chrono", "crossterm 0.26.1", diff --git a/wicket-dbg/Cargo.toml b/wicket-dbg/Cargo.toml index aad8a46cc1..1989049c02 100644 --- a/wicket-dbg/Cargo.toml +++ b/wicket-dbg/Cargo.toml @@ -21,7 +21,7 @@ tui.workspace = true wicket.workspace = true # used only by wicket-dbg binary -reedline = "0.19.0" +reedline = "0.19.1" [[bin]] name = "wicket-dbg" From 490946c60b7c6690f0383ecd386eed82e7216c7a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 25 May 2023 14:25:40 -0700 Subject: [PATCH 03/14] Bump zip from 0.6.5 to 0.6.6 (#3178) Bumps [zip](https://github.com/zip-rs/zip) from 0.6.5 to 0.6.6.
Changelog

Sourced from zip's changelog.

[0.6.6]

Changed

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=zip&package-manager=cargo&previous-version=0.6.5&new-version=0.6.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4a433c39e6..cced40facb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9610,9 +9610,9 @@ dependencies = [ [[package]] name = "zip" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e92305c174683d78035cbf1b70e18db6329cc0f1b9cae0a52ca90bf5bfe7125" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" dependencies = [ "byteorder", "bzip2", diff --git a/Cargo.toml b/Cargo.toml index 9836accda4..ac5b666b9a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -339,7 +339,7 @@ wicket = { path = "wicket" } wicket-common = { path = "wicket-common" } wicketd-client = { path = "wicketd-client" } zeroize = { version = "1.6.0", features = ["zeroize_derive", "std"] } -zip = { version = "0.6.5", default-features = false, features = ["deflate","bzip2"] } +zip = { version = "0.6.6", default-features = false, features = ["deflate","bzip2"] } zone = { version = "0.2", default-features = false, features = ["async"] } [profile.dev] From 35a5ed78a836b43c86d563025f20f6c9accd0041 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 25 May 2023 15:59:50 -0700 Subject: [PATCH 04/14] update steno dep to handle undo actions failing (#3211) --- Cargo.lock | 34 ++++++-- Cargo.toml | 4 +- common/src/sql/dbinit.sql | 3 - nexus/src/app/saga.rs | 20 ++++- nexus/src/app/sagas/mod.rs | 4 + nexus/src/app/sagas/test_saga.rs | 107 ++++++++++++++++++++++++++ nexus/types/src/internal_api/views.rs | 91 +++++++++++++++++----- openapi/nexus-internal.json | 32 ++++++++ 8 files changed, 262 insertions(+), 33 deletions(-) create mode 100644 nexus/src/app/sagas/test_saga.rs diff --git a/Cargo.lock b/Cargo.lock index cced40facb..ea3e196752 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ 
-3977,7 +3977,7 @@ dependencies = [ "serde", "serde_json", "sled-agent-client", - "steno", + "steno 0.4.0", "thiserror", "uuid", ] @@ -4050,7 +4050,7 @@ dependencies = [ "serde_with", "sled-agent-client", "slog", - "steno", + "steno 0.4.0", "strum", "subprocess", "tempfile", @@ -4154,7 +4154,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "steno", + "steno 0.4.0", "uuid", ] @@ -4394,7 +4394,7 @@ dependencies = [ "serde_urlencoded", "serde_with", "slog", - "steno", + "steno 0.4.0", "strum", "test-strategy", "thiserror", @@ -4433,7 +4433,7 @@ dependencies = [ "serde_json", "serde_with", "slog", - "steno", + "steno 0.3.1", "strum", "thiserror", "tokio", @@ -4617,7 +4617,7 @@ dependencies = [ "sled-agent-client", "slog", "slog-dtrace", - "steno", + "steno 0.4.0", "strum", "subprocess", "tempfile", @@ -7716,6 +7716,28 @@ dependencies = [ "uuid", ] +[[package]] +name = "steno" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a1e7ccea133c197729abfd16dccf91a3c4d0da1e94bb0c0aa164c2b8a227481" +dependencies = [ + "anyhow", + "async-trait", + "chrono", + "futures", + "lazy_static", + "newtype_derive", + "petgraph", + "schemars", + "serde", + "serde_json", + "slog", + "thiserror", + "tokio", + "uuid", +] + [[package]] name = "string_cache" version = "0.8.7" diff --git a/Cargo.toml b/Cargo.toml index ac5b666b9a..47cca34bbe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -300,7 +300,7 @@ sprockets-rot = { git = "http://github.com/oxidecomputer/sprockets", rev = "77df # Please do not change the Steno version to a Git dependency. It makes it # harder than expected to make breaking changes (even if you specify a specific # SHA). Cut a new Steno release instead. See omicron#2117. 
-steno = "0.3.1" +steno = "0.4.0" strum = { version = "0.24", features = [ "derive" ] } subprocess = "0.2.9" libsw = { version = "3.2.4", features = ["tokio"] } @@ -380,7 +380,7 @@ panic = "abort" # #[patch."https://github.com/oxidecomputer/dropshot"] #dropshot = { path = "../dropshot/dropshot" } -#[patch."https://github.com/oxidecomputer/steno"] +#[patch.crates-io] #steno = { path = "../steno" } #[patch."https://github.com/oxidecomputer/propolis"] #propolis-client = { path = "../propolis/lib/propolis-client" } diff --git a/common/src/sql/dbinit.sql b/common/src/sql/dbinit.sql index d5eef59247..fa7ec0b1c9 100644 --- a/common/src/sql/dbinit.sql +++ b/common/src/sql/dbinit.sql @@ -1620,9 +1620,6 @@ CREATE UNIQUE INDEX ON omicron.public.external_ip ( * Sagas */ -/* - * TODO This may eventually have 'paused', 'needs-operator', and 'needs-support' - */ CREATE TYPE omicron.public.saga_state AS ENUM ( 'running', 'unwinding', diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs index 46865cc63b..7cfb8ff0bb 100644 --- a/nexus/src/app/saga.rs +++ b/nexus/src/app/saga.rs @@ -141,6 +141,7 @@ impl super::Nexus { &self, runnable_saga: RunnableSaga, ) -> Result { + let log = &self.log; self.sec_client .saga_start(runnable_saga.id) .await @@ -149,14 +150,27 @@ impl super::Nexus { let result = runnable_saga.fut.await; result.kind.map_err(|saga_error| { - saga_error + let mut error = saga_error .error_source .convert::() .unwrap_or_else(|e| Error::internal_error(&e.to_string())) .internal_context(format!( - "saga error at node {:?}", + "saga ACTION error at node {:?}", saga_error.error_node_name - )) + )); + if let Some((undo_node, undo_error)) = saga_error.undo_failure { + error = error.internal_context(format!( + "UNDO ACTION failed (node {:?}, error {:#}) after", + undo_node, undo_error + )); + + error!(log, "saga stuck"; + "saga_id" => runnable_saga.id.to_string(), + "error" => #%error, + ); + } + + error }) } diff --git a/nexus/src/app/sagas/mod.rs 
b/nexus/src/app/sagas/mod.rs index 84c20e5196..cd5cda8c27 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -34,6 +34,7 @@ pub mod snapshot_delete; pub mod switch_port_settings_apply; pub mod switch_port_settings_clear; pub mod switch_port_settings_update; +pub mod test_saga; pub mod volume_delete; pub mod volume_remove_rop; pub mod vpc_create; @@ -159,6 +160,9 @@ fn make_action_registry() -> ActionRegistry { ); ::register_actions(&mut registry); + #[cfg(test)] + ::register_actions(&mut registry); + registry } diff --git a/nexus/src/app/sagas/test_saga.rs b/nexus/src/app/sagas/test_saga.rs new file mode 100644 index 0000000000..26e77d494c --- /dev/null +++ b/nexus/src/app/sagas/test_saga.rs @@ -0,0 +1,107 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Saga only used for testing +#![cfg(test)] + +use super::{NexusActionContext, NexusSaga, SagaInitError}; +use crate::app::saga::create_saga_dag; +use crate::app::sagas::declare_saga_actions; +use nexus_test_utils_macros::nexus_test; +use omicron_common::api::external::Error; +use serde::Deserialize; +use serde::Serialize; +use std::fmt::Debug; +use steno::ActionError; + +// test saga: input parameters + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct Params {} + +// test saga: actions + +declare_saga_actions! 
{ + test_saga; + TEST_ACTION1 -> "n1" { + + ts_test_action + - ts_test_undo + } + TEST_ACTION2 -> "n2" { + + ts_test_action + - ts_test_undo + } +} + +// test saga: definition + +#[derive(Debug)] +pub struct SagaTest; +impl NexusSaga for SagaTest { + const NAME: &'static str = "test-saga"; + type Params = Params; + + fn register_actions(registry: &mut super::ActionRegistry) { + test_saga_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(test_action1_action()); + builder.append(test_action2_action()); + Ok(builder.build()?) + } +} + +async fn ts_test_action( + _sagactx: NexusActionContext, +) -> Result<(), ActionError> { + Ok(()) +} + +async fn ts_test_undo( + _sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + Ok(()) +} + +// tests + +type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + +#[nexus_test(server = crate::Server)] +async fn test_saga_stuck(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.apictx().nexus; + let params = Params {}; + let dag = create_saga_dag::(params).unwrap(); + let runnable_saga = nexus.create_runnable_saga(dag.clone()).await.unwrap(); + let saga_id = runnable_saga.id(); + + // Inject an error into the second node's action and the first node's undo + // action. This should cause the saga to become stuck. 
+ let n1 = dag.get_index("n1").unwrap(); + let n2 = dag.get_index("n2").unwrap(); + nexus.sec().saga_inject_error(saga_id, n2).await.unwrap(); + nexus.sec().saga_inject_error_undo(saga_id, n1).await.unwrap(); + let result = nexus + .run_saga(runnable_saga) + .await + .expect_err("expected saga to finish stuck"); + + match result { + Error::InternalError { internal_message } => { + assert_eq!( + internal_message, + "UNDO ACTION failed (node \"n1\", error undo action failed \ + permanently: {\n \"message\": \"undo action attempt 1: \ + error injected\"\n}) after: saga ACTION error at node \ + \"n2\": error injected" + ); + } + error => panic!("unexpected error from saga: {:?}", error), + } +} diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index c6990d701d..e1e8dae453 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -7,6 +7,8 @@ use futures::stream::StreamExt; use omicron_common::api::external::ObjectStream; use schemars::JsonSchema; use serde::Serialize; +use steno::SagaResultErr; +use steno::UndoActionError; use uuid::Uuid; pub async fn to_list(object_stream: ObjectStream) -> Vec @@ -40,9 +42,24 @@ impl From for Saga { #[derive(Clone, Debug, Serialize, JsonSchema)] #[serde(tag = "state", rename_all = "snake_case")] pub enum SagaState { + /// Saga is currently executing Running, + /// Saga completed successfully Succeeded, + /// One or more saga actions failed and the saga was successfully unwound + /// (i.e., undo actions were executed for any actions that were completed). + /// The saga is no longer running. Failed { error_node_name: steno::NodeName, error_info: SagaErrorInfo }, + /// One or more saga actions failed, *and* one or more undo actions failed + /// during unwinding. State managed by the saga may now be inconsistent. + /// Support may be required to repair the state. The saga is no longer + /// running. 
+ Stuck { + error_node_name: steno::NodeName, + error_info: SagaErrorInfo, + undo_error_node_name: steno::NodeName, + undo_source_error: serde_json::Value, + }, } #[derive(Clone, Debug, Serialize, JsonSchema)] @@ -55,6 +72,26 @@ pub enum SagaErrorInfo { SubsagaCreateFailed { message: String }, } +impl From for SagaErrorInfo { + fn from(error_source: steno::ActionError) -> Self { + match error_source { + steno::ActionError::ActionFailed { source_error } => { + SagaErrorInfo::ActionFailed { source_error } + } + steno::ActionError::DeserializeFailed { message } => { + SagaErrorInfo::DeserializeFailed { message } + } + steno::ActionError::InjectedError => SagaErrorInfo::InjectedError, + steno::ActionError::SerializeFailed { message } => { + SagaErrorInfo::SerializeFailed { message } + } + steno::ActionError::SubsagaCreateFailed { message } => { + SagaErrorInfo::SubsagaCreateFailed { message } + } + } + } +} + impl From for SagaState { fn from(st: steno::SagaStateView) -> Self { match st { @@ -65,27 +102,43 @@ impl From for SagaState { .. } => SagaState::Succeeded, steno::SagaStateView::Done { - result: steno::SagaResult { kind: Err(e), .. }, + result: + steno::SagaResult { + kind: + Err(SagaResultErr { + error_node_name, + error_source, + undo_failure: Some((undo_node_name, undo_error)), + }), + .. + }, + .. + } => { + let UndoActionError::PermanentFailure { + source_error: undo_source_error, + } = undo_error; + SagaState::Stuck { + error_node_name, + error_info: SagaErrorInfo::from(error_source), + undo_error_node_name: undo_node_name, + undo_source_error, + } + } + steno::SagaStateView::Done { + result: + steno::SagaResult { + kind: + Err(SagaResultErr { + error_node_name, + error_source, + undo_failure: None, + }), + .. + }, .. 
} => SagaState::Failed { - error_node_name: e.error_node_name, - error_info: match e.error_source { - steno::ActionError::ActionFailed { source_error } => { - SagaErrorInfo::ActionFailed { source_error } - } - steno::ActionError::DeserializeFailed { message } => { - SagaErrorInfo::DeserializeFailed { message } - } - steno::ActionError::InjectedError => { - SagaErrorInfo::InjectedError - } - steno::ActionError::SerializeFailed { message } => { - SagaErrorInfo::SerializeFailed { message } - } - steno::ActionError::SubsagaCreateFailed { message } => { - SagaErrorInfo::SubsagaCreateFailed { message } - } - }, + error_node_name: error_node_name, + error_info: SagaErrorInfo::from(error_source), }, } } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 4643d832cd..69227e4406 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2607,6 +2607,7 @@ "SagaState": { "oneOf": [ { + "description": "Saga is currently executing", "type": "object", "properties": { "state": { @@ -2621,6 +2622,7 @@ ] }, { + "description": "Saga completed successfully", "type": "object", "properties": { "state": { @@ -2635,6 +2637,7 @@ ] }, { + "description": "One or more saga actions failed and the saga was successfully unwound (i.e., undo actions were executed for any actions that were completed). The saga is no longer running.", "type": "object", "properties": { "error_info": { @@ -2655,6 +2658,35 @@ "error_node_name", "state" ] + }, + { + "description": "One or more saga actions failed, *and* one or more undo actions failed during unwinding. State managed by the saga may now be inconsistent. Support may be required to repair the state. 
The saga is no longer running.", + "type": "object", + "properties": { + "error_info": { + "$ref": "#/components/schemas/SagaErrorInfo" + }, + "error_node_name": { + "$ref": "#/components/schemas/NodeName" + }, + "state": { + "type": "string", + "enum": [ + "stuck" + ] + }, + "undo_error_node_name": { + "$ref": "#/components/schemas/NodeName" + }, + "undo_source_error": {} + }, + "required": [ + "error_info", + "error_node_name", + "state", + "undo_error_node_name", + "undo_source_error" + ] } ] }, From dc488bbd2c9af70e11f4438651a4e1a83c7754ff Mon Sep 17 00:00:00 2001 From: Rain Date: Thu, 25 May 2023 18:02:55 -0700 Subject: [PATCH 05/14] [wicket] add scrolling to popups (#3218) Now that we're displaying error messages in popups, we can have a pretty large amount of text here. Add the ability to scroll down in a popup, building on #3194 which wraps all popup text. This is a bit tricky because we want to cap the offset to prevent scrolling down once we've reached the bottom of the text, but a lot of values are computed at layout time. To handle this, introduce the notion of an "actual scroll offset", which is the scroll offset as computed after text is laid out. Also compute the layout parameters in the constructor for `Popup` so we can return the actual scroll offset. Display scroll status in the button bar (I didn't figure out a better place to show it; we could potentially show an actual scroll bar to the right but that's a lot more work than this.) See https://github.com/oxidecomputer/omicron/pull/3218 for examples. 
--- wicket/src/ui/defaults/style.rs | 2 +- wicket/src/ui/panes/mod.rs | 119 ++++++--- wicket/src/ui/panes/overview.rs | 12 +- wicket/src/ui/panes/update.rs | 316 +++++++++++++++++------- wicket/src/ui/widgets/ignition.rs | 2 +- wicket/src/ui/widgets/mod.rs | 2 +- wicket/src/ui/widgets/popup.rs | 389 ++++++++++++++++++++++-------- 7 files changed, 607 insertions(+), 235 deletions(-) diff --git a/wicket/src/ui/defaults/style.rs b/wicket/src/ui/defaults/style.rs index 79eafe5672..10549383a8 100644 --- a/wicket/src/ui/defaults/style.rs +++ b/wicket/src/ui/defaults/style.rs @@ -30,7 +30,7 @@ pub fn help_function() -> Style { } pub fn help_keys() -> Style { - Style::default().fg(TUI_PURPLE_DIM) + selected_line() } pub fn divider() -> Style { diff --git a/wicket/src/ui/panes/mod.rs b/wicket/src/ui/panes/mod.rs index 00da10c2e1..cf8ee040eb 100644 --- a/wicket/src/ui/panes/mod.rs +++ b/wicket/src/ui/panes/mod.rs @@ -73,46 +73,93 @@ pub fn align_by( Text::from(Spans::from(text)) } -/// Compute the scroll offset for a `Paragraph` widget +/// A computed scroll offset. /// -/// This takes the text size and number of visible lines of the terminal `Rect` -/// account such that it will render the whole paragraph and not scroll if -/// there is enough room. Otherwise it will scroll as expected with the top of -/// the terminal `Rect` set to current_offset. We don't allow scrolling beyond -/// the bottom of the text content. -/// -/// -/// XXX: This whole paragraph scroll only allows length of less than -/// 64k rows even though it shouldn't be limited. We can do our own -/// scrolling instead to obviate this limit. we may want to anyway, as -/// it's less data to be formatted for the Paragraph. -/// -/// XXX: This algorithm doesn't work properly with line wraps, so we should -/// ensure that the data we populate doesn't wrap or we have truncation turned -/// on. 
-pub fn compute_scroll_offset( - current_offset: usize, - text_height: usize, - num_lines: usize, -) -> u16 { - let mut offset: usize = current_offset; +/// This scroll offset is computed by [`Self::new`], and is capped so that we +/// don't allow scrolling past the bottom of content. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ComputedScrollOffset { + Top, + Middle(u16), + Bottom(u16), +} - if offset > text_height { - offset = text_height; +impl ComputedScrollOffset { + /// Compute the scroll offset for a `Paragraph` widget + /// + /// This takes the text size and number of visible lines of the terminal + /// `Rect` account such that it will render the whole paragraph and not + /// scroll if there is enough room. Otherwise it will scroll as expected + /// with the top of the terminal `Rect` set to current_offset. We don't + /// allow scrolling beyond the bottom of the text content. + /// + /// XXX: This whole paragraph scroll only allows length of less than 64k + /// rows even though it shouldn't be limited. We can do our own scrolling + /// instead to obviate this limit. we may want to anyway, as it's less data + /// to be formatted for the Paragraph. + /// + /// XXX: This algorithm doesn't work properly with line wraps, so we should + /// ensure that the data we populate doesn't wrap or we have truncation + /// turned on. + pub fn new( + current_offset: usize, + text_height: usize, + num_lines: usize, + ) -> Self { + let mut offset: usize = current_offset; + + if offset > text_height { + offset = text_height; + } + + if text_height <= num_lines { + offset = 0; + } else { + if text_height - offset < num_lines { + // Don't allow scrolling past bottom of content + // + // Reset the scroll offset, so that an up arrow will scroll up + // on the next try. + offset = text_height - num_lines; + } + } + // This doesn't allow data more than 64k rows. We shouldn't need more + // than that for wicket, but who knows! 
+ if offset == 0 { + Self::Top + } else if offset < text_height - num_lines { + Self::Middle(u16::try_from(offset).unwrap()) + } else if offset == text_height - num_lines { + Self::Bottom(u16::try_from(offset).unwrap()) + } else { + panic!( + "offset ({offset}) not capped to \ + text_height ({text_height}) - num_lines ({num_lines})" + ) + } + } + + /// Returns true if scrolling up is possible. + pub fn can_scroll_up(&self) -> bool { + match self { + Self::Top => false, + Self::Middle(_) | Self::Bottom(_) => true, + } + } + + /// Returns true if scrolling down is possible. + pub fn can_scroll_down(&self) -> bool { + match self { + Self::Top | Self::Middle(_) => true, + Self::Bottom(_) => false, + } } - if text_height <= num_lines { - offset = 0; - } else { - if text_height - offset < num_lines { - // Don't allow scrolling past bottom of content - // - // Reset the scroll_offset, so that an up arrow - // will scroll up on the next try. - offset = text_height - num_lines; + /// Returns the numerical offset. + pub fn into_offset(self) -> u16 { + match self { + Self::Top => 0, + Self::Middle(offset) | Self::Bottom(offset) => offset, } } - // This doesn't allow data more than 64k rows. We shouldn't need - // more than that for wicket, but who knows! 
- u16::try_from(offset).unwrap() } diff --git a/wicket/src/ui/panes/overview.rs b/wicket/src/ui/panes/overview.rs index 3023e694a7..b875fe101f 100644 --- a/wicket/src/ui/panes/overview.rs +++ b/wicket/src/ui/panes/overview.rs @@ -5,12 +5,13 @@ use std::collections::BTreeMap; use super::help_text; +use super::ComputedScrollOffset; use super::Control; use crate::state::{ComponentId, ALL_COMPONENT_IDS}; use crate::ui::defaults::colors::*; use crate::ui::defaults::style; -use crate::ui::panes::compute_scroll_offset; use crate::ui::widgets::IgnitionPopup; +use crate::ui::widgets::PopupScrollKind; use crate::ui::widgets::{BoxConnector, BoxConnectorKind, Rack}; use crate::{Action, Cmd, Frame, State}; use tui::layout::{Constraint, Direction, Layout, Rect}; @@ -238,9 +239,11 @@ impl InventoryView { x: 0, y: 0, }; + let popup_builder = self.ignition.to_popup_builder(state.rack_state.selected); - let popup = popup_builder.build(full_screen); + // Scrolling in the ignition popup is always disabled. 
+ let popup = popup_builder.build(full_screen, PopupScrollKind::Disabled); frame.render_widget(popup, full_screen); } @@ -334,11 +337,12 @@ impl Control for InventoryView { }; let scroll_offset = self.scroll_offsets.get_mut(&component_id).unwrap(); - let y_offset = compute_scroll_offset( + let y_offset = ComputedScrollOffset::new( *scroll_offset, text.height(), chunks[1].height as usize, - ); + ) + .into_offset(); *scroll_offset = y_offset as usize; let inventory = Paragraph::new(text) diff --git a/wicket/src/ui/panes/update.rs b/wicket/src/ui/panes/update.rs index 8a7895a5d8..cbc2698cac 100644 --- a/wicket/src/ui/panes/update.rs +++ b/wicket/src/ui/panes/update.rs @@ -13,7 +13,7 @@ use crate::state::{ use crate::ui::defaults::style; use crate::ui::widgets::{ BoxConnector, BoxConnectorKind, ButtonText, IgnitionPopup, PopupBuilder, - StatusView, + PopupScrollKind, StatusView, }; use crate::ui::wrap::wrap_text; use crate::{Action, Cmd, Frame, State}; @@ -35,6 +35,24 @@ use wicketd_client::types::SemverVersion; const MAX_COLUMN_WIDTH: u16 = 25; +#[derive(Debug)] +struct UpdatePanePopup { + kind: PopupKind, + scroll_kind: PopupScrollKind, +} + +impl UpdatePanePopup { + fn new(kind: PopupKind) -> Self { + let scroll_kind = if kind.is_scrollable() { + PopupScrollKind::enabled() + } else { + PopupScrollKind::Disabled + }; + + Self { kind, scroll_kind } + } +} + #[derive(Debug)] enum PopupKind { StartUpdate { popup_state: StartUpdatePopupState }, @@ -43,6 +61,19 @@ enum PopupKind { ClearUpdateState { popup_state: ClearUpdateStatePopupState }, } +impl PopupKind { + fn is_scrollable(&self) -> bool { + match self { + Self::StartUpdate { popup_state } => popup_state.is_scrollable(), + Self::StepLogs => true, + Self::Ignition => false, + Self::ClearUpdateState { popup_state } => { + popup_state.is_scrollable() + } + } + } +} + #[derive(Debug)] enum StartUpdatePopupState { Prompting, @@ -50,12 +81,30 @@ enum StartUpdatePopupState { Failed { message: String }, } +impl 
StartUpdatePopupState { + fn is_scrollable(&self) -> bool { + match self { + Self::Prompting | Self::Waiting => false, + Self::Failed { .. } => true, + } + } +} + #[derive(Debug)] enum ClearUpdateStatePopupState { Waiting, Failed { message: String }, } +impl ClearUpdateStatePopupState { + fn is_scrollable(&self) -> bool { + match self { + Self::Waiting => false, + Self::Failed { .. } => true, + } + } +} + /// Overview of update status and ability to install updates /// from a single TUF repo uploaded to wicketd via wicket. pub struct UpdatePane { @@ -85,7 +134,7 @@ pub struct UpdatePane { status_view_version_rect: Rect, status_view_main_rect: Rect, - popup: Option, + popup: Option, ignition: IgnitionPopup, } @@ -132,7 +181,8 @@ impl UpdatePane { &mut self, state: &State, frame: &mut Frame<'_>, - ) { + scroll_kind: PopupScrollKind, + ) -> PopupScrollKind { let selected = state.rack_state.selected; let id_state = self.component_state.get(&selected).unwrap(); // We only open the popup if id_state.selected is not None, but in some cases @@ -142,7 +192,7 @@ impl UpdatePane { Some(key) => key, None => { self.popup = None; - return; + return PopupScrollKind::Disabled; } }; let value = id_state @@ -151,10 +201,10 @@ impl UpdatePane { .expect("selected_key is always valid"); let step_info = value.step_info(); - let header = Spans::from(vec![Span::styled( - step_info.description.clone(), - style::header(true), - )]); + let header = Spans::from(vec![ + Span::styled("Step: ", style::header(true)), + Span::styled(step_info.description.clone(), style::header(true)), + ]); let mut body = Text::default(); @@ -358,8 +408,7 @@ impl UpdatePane { } } - let buttons = - vec![ButtonText { instruction: "NAVIGATE", key: "LEFT/RIGHT" }]; + let buttons = vec![ButtonText::new("Navigate", "Left/Right")]; let full_screen = Rect { width: state.screen_width, @@ -367,16 +416,20 @@ impl UpdatePane { x: 0, y: 0, }; + let popup_builder = PopupBuilder { header, body, buttons }; - let popup = 
popup_builder.build(full_screen); + let popup = popup_builder.build(full_screen, scroll_kind); + let actual_scroll_kind = popup.actual_scroll_kind(); frame.render_widget(popup, full_screen); + actual_scroll_kind } pub fn draw_start_update_prompting_popup( &mut self, state: &State, frame: &mut Frame<'_>, - ) { + scroll_kind: PopupScrollKind, + ) -> PopupScrollKind { let popup_builder = PopupBuilder { header: Spans::from(vec![Span::styled( format!("START UPDATE: {}", state.rack_state.selected), @@ -387,8 +440,8 @@ impl UpdatePane { style::plain_text(), )])]), buttons: vec![ - ButtonText { instruction: "YES", key: "Y" }, - ButtonText { instruction: "NO", key: "N" }, + ButtonText::new("Yes", "Y"), + ButtonText::new("No", "N"), ], }; let full_screen = Rect { @@ -398,15 +451,18 @@ impl UpdatePane { y: 0, }; - let popup = popup_builder.build(full_screen); + let popup = popup_builder.build(full_screen, scroll_kind); + let actual_scroll_kind = popup.actual_scroll_kind(); frame.render_widget(popup, full_screen); + actual_scroll_kind } fn draw_start_update_waiting_popup( &self, state: &State, frame: &mut Frame<'_>, - ) { + scroll_kind: PopupScrollKind, + ) -> PopupScrollKind { let popup_builder = PopupBuilder { header: Spans::from(vec![Span::styled( format!("START UPDATE: {}", state.rack_state.selected), @@ -425,8 +481,10 @@ impl UpdatePane { y: 0, }; - let popup = popup_builder.build(full_screen); + let popup = popup_builder.build(full_screen, scroll_kind); + let actual_scroll_kind = popup.actual_scroll_kind(); frame.render_widget(popup, full_screen); + actual_scroll_kind } fn draw_start_update_failed_popup( @@ -434,7 +492,8 @@ impl UpdatePane { state: &State, message: &str, frame: &mut Frame<'_>, - ) { + scroll_kind: PopupScrollKind, + ) -> PopupScrollKind { let mut body = Text::default(); let prefix = vec![Span::styled("Message: ", style::selected())]; push_text_lines(message, prefix, &mut body.lines); @@ -445,7 +504,7 @@ impl UpdatePane { style::failed_update(), )]), 
body, - buttons: vec![ButtonText { instruction: "CLOSE", key: "ESC" }], + buttons: vec![ButtonText::new("Close", "Esc")], }; let full_screen = Rect { width: state.screen_width, @@ -454,15 +513,18 @@ impl UpdatePane { y: 0, }; - let popup = popup_builder.build(full_screen); + let popup = popup_builder.build(full_screen, scroll_kind); + let actual_scroll_kind = popup.actual_scroll_kind(); frame.render_widget(popup, full_screen); + actual_scroll_kind } fn draw_clear_update_state_waiting_popup( &self, state: &State, frame: &mut Frame<'_>, - ) { + scroll_kind: PopupScrollKind, + ) -> PopupScrollKind { let popup_builder = PopupBuilder { header: Spans::from(vec![Span::styled( format!("CLEAR UPDATE STATE: {}", state.rack_state.selected), @@ -481,8 +543,10 @@ impl UpdatePane { y: 0, }; - let popup = popup_builder.build(full_screen); + let popup = popup_builder.build(full_screen, scroll_kind); + let actual_scroll_kind = popup.actual_scroll_kind(); frame.render_widget(popup, full_screen); + actual_scroll_kind } fn draw_clear_update_state_failed_popup( @@ -490,7 +554,8 @@ impl UpdatePane { state: &State, message: &str, frame: &mut Frame<'_>, - ) { + scroll_kind: PopupScrollKind, + ) -> PopupScrollKind { let mut body = Text::default(); let prefix = vec![Span::styled("Message: ", style::selected())]; push_text_lines(message, prefix, &mut body.lines); @@ -504,7 +569,7 @@ impl UpdatePane { style::failed_update(), )]), body, - buttons: vec![ButtonText { instruction: "CLOSE", key: "ESC" }], + buttons: vec![ButtonText::new("Close", "Esc")], }; let full_screen = Rect { width: state.screen_width, @@ -513,15 +578,18 @@ impl UpdatePane { y: 0, }; - let popup = popup_builder.build(full_screen); + let popup = popup_builder.build(full_screen, scroll_kind); + let actual_scroll_kind = popup.actual_scroll_kind(); frame.render_widget(popup, full_screen); + actual_scroll_kind } pub fn draw_ignition_popup( &mut self, state: &State, frame: &mut Frame<'_>, - ) { + scroll_kind: PopupScrollKind, + ) 
-> PopupScrollKind { let full_screen = Rect { width: state.screen_width, height: state.screen_height, @@ -530,8 +598,18 @@ impl UpdatePane { }; let popup_builder = self.ignition.to_popup_builder(state.rack_state.selected); - let popup = popup_builder.build(full_screen); + + // The scroll offset here should always be disabled, but make it a + // parameter for uniformity with the other popups. + debug_assert_eq!( + scroll_kind, + PopupScrollKind::Disabled, + "scrolling in ignition popup is always disabled" + ); + let popup = popup_builder.build(full_screen, scroll_kind); + let actual_scroll_kind = popup.actual_scroll_kind(); frame.render_widget(popup, full_screen); + actual_scroll_kind } fn update_items(&mut self, state: &State) { @@ -652,7 +730,8 @@ impl UpdatePane { .get(&state.rack_state.selected) .unwrap(); if id_state.selected.is_some() { - self.popup = Some(PopupKind::StepLogs); + self.popup = + Some(UpdatePanePopup::new(PopupKind::StepLogs)); Some(Action::Redraw) } else { None @@ -668,9 +747,11 @@ impl UpdatePane { UpdateItemState::NotStarted => { // If an update hasn't been started or has failed to // start, "Press ... to start" is displayed. - self.popup = Some(PopupKind::StartUpdate { - popup_state: StartUpdatePopupState::Prompting, - }); + self.popup = Some(UpdatePanePopup::new( + PopupKind::StartUpdate { + popup_state: StartUpdatePopupState::Prompting, + }, + )); Some(Action::Redraw) } UpdateItemState::AwaitingRepository @@ -678,40 +759,7 @@ impl UpdatePane { | UpdateItemState::RunningOrCompleted { .. } => None, } } - Cmd::ClearUpdateState => { - let selected = state.rack_state.selected; - match state.update_state.item_state(selected) { - UpdateItemState::RunningOrCompleted { .. 
} => { - let id_state = - self.component_state.get(&selected).unwrap(); - let event_buffer = &id_state.event_buffer; - if let Some(root_execution_id) = - event_buffer.root_execution_id() - { - let summary = event_buffer.steps().summarize(); - let summary = summary.get(&root_execution_id).expect( - "root execution ID should have a summary associated with it", - ); - match summary.execution_status { - ExecutionStatus::Completed { .. } - | ExecutionStatus::Failed { .. } => { - // If execution has reached a terminal - // state, we can clear it. - self.popup = Some(PopupKind::ClearUpdateState { popup_state: ClearUpdateStatePopupState::Waiting }); - Some(Action::ClearUpdateState(selected)) - } - ExecutionStatus::NotStarted - | ExecutionStatus::Running { .. } => None, - } - } else { - None - } - } - UpdateItemState::AwaitingRepository - | UpdateItemState::NotStarted - | UpdateItemState::UpdateStarted => None, - } - } + Cmd::ClearUpdateState => self.handle_clear_update_state(state), Cmd::GotoTop => { let id_state = self .component_state @@ -732,6 +780,49 @@ impl UpdatePane { } } + fn handle_clear_update_state( + &mut self, + state: &mut State, + ) -> Option { + let selected = state.rack_state.selected; + match state.update_state.item_state(selected) { + UpdateItemState::RunningOrCompleted { .. } => { + let id_state = self.component_state.get(&selected).unwrap(); + let event_buffer = &id_state.event_buffer; + if let Some(root_execution_id) = + event_buffer.root_execution_id() + { + let summary = event_buffer.steps().summarize(); + let summary = summary.get(&root_execution_id).expect( + "root execution ID should have a summary \ + associated with it", + ); + match summary.execution_status { + ExecutionStatus::Completed { .. } + | ExecutionStatus::Failed { .. } => { + // If execution has reached a terminal + // state, we can clear it. 
+ self.popup = Some(UpdatePanePopup::new( + PopupKind::ClearUpdateState { + popup_state: + ClearUpdateStatePopupState::Waiting, + }, + )); + Some(Action::ClearUpdateState(selected)) + } + ExecutionStatus::NotStarted + | ExecutionStatus::Running { .. } => None, + } + } else { + None + } + } + UpdateItemState::AwaitingRepository + | UpdateItemState::NotStarted + | UpdateItemState::UpdateStarted => None, + } + } + fn is_force_update_visible(&self, state: &State) -> bool { // We only show the toggle spans for force updating the SP/RoT when the // user could potentially start an update. @@ -752,7 +843,24 @@ impl UpdatePane { self.popup = None; return Some(Action::Redraw); } - match self.popup.as_mut().unwrap() { + let popup = self.popup.as_mut().unwrap(); + + if popup.scroll_kind.is_scrollable() { + // Handle up or down commands here. + match cmd { + Cmd::Up => { + popup.scroll_kind.scroll_up(); + return Some(Action::Redraw); + } + Cmd::Down => { + popup.scroll_kind.scroll_down(); + return Some(Action::Redraw); + } + _ => {} + } + } + + match &mut popup.kind { PopupKind::StepLogs => match cmd { // TODO: up/down for scrolling popup data Cmd::Left => { @@ -1796,7 +1904,7 @@ impl Control for UpdatePane { } Cmd::Ignition => { self.ignition.reset(); - self.popup = Some(PopupKind::Ignition); + self.popup = Some(UpdatePanePopup::new(PopupKind::Ignition)); Some(Action::Redraw) } Cmd::GotoTop => { @@ -1828,32 +1936,60 @@ impl Control for UpdatePane { self.draw_tree_view(state, frame, active); } - match &self.popup { - Some(PopupKind::StepLogs) => self.draw_step_log_popup(state, frame), - Some(PopupKind::StartUpdate { popup_state }) => match popup_state { - StartUpdatePopupState::Prompting => { - self.draw_start_update_prompting_popup(state, frame) - } - StartUpdatePopupState::Waiting => { - self.draw_start_update_waiting_popup(state, frame) - } - StartUpdatePopupState::Failed { message } => { - self.draw_start_update_failed_popup(state, &message, frame) + let mut 
actual_scroll_kind = None; + + if let Some(popup) = &self.popup { + // The functions called return the effective scroll offset. + actual_scroll_kind = Some(match &popup.kind { + PopupKind::StepLogs => { + self.draw_step_log_popup(state, frame, popup.scroll_kind) } - }, - Some(PopupKind::ClearUpdateState { popup_state }) => { - match popup_state { - ClearUpdateStatePopupState::Waiting => { - self.draw_clear_update_state_waiting_popup(state, frame) - } - ClearUpdateStatePopupState::Failed { message } => self - .draw_clear_update_state_failed_popup( - state, message, frame, + PopupKind::StartUpdate { popup_state } => match popup_state { + StartUpdatePopupState::Prompting => self + .draw_start_update_prompting_popup( + state, + frame, + popup.scroll_kind, ), + StartUpdatePopupState::Waiting => self + .draw_start_update_waiting_popup( + state, + frame, + popup.scroll_kind, + ), + StartUpdatePopupState::Failed { message } => self + .draw_start_update_failed_popup( + state, + &message, + frame, + popup.scroll_kind, + ), + }, + PopupKind::ClearUpdateState { popup_state } => { + match popup_state { + ClearUpdateStatePopupState::Waiting => self + .draw_clear_update_state_waiting_popup( + state, + frame, + popup.scroll_kind, + ), + ClearUpdateStatePopupState::Failed { message } => self + .draw_clear_update_state_failed_popup( + state, + message, + frame, + popup.scroll_kind, + ), + } } - } - Some(PopupKind::Ignition) => self.draw_ignition_popup(state, frame), - None => (), + PopupKind::Ignition => { + self.draw_ignition_popup(state, frame, popup.scroll_kind) + } + }); + } + + if let Some(popup) = &mut self.popup { + popup.scroll_kind = actual_scroll_kind.unwrap(); } } } diff --git a/wicket/src/ui/widgets/ignition.rs b/wicket/src/ui/widgets/ignition.rs index 2fc1c821e3..079f9cb39c 100644 --- a/wicket/src/ui/widgets/ignition.rs +++ b/wicket/src/ui/widgets/ignition.rs @@ -84,7 +84,7 @@ impl IgnitionPopup { )]), ], }, - buttons: vec![ButtonText { instruction: "CLOSE", key: "ESC" 
}], + buttons: vec![ButtonText::new("Close", "Esc")], } } } diff --git a/wicket/src/ui/widgets/mod.rs b/wicket/src/ui/widgets/mod.rs index 8cf2cfbf85..01ded9724c 100644 --- a/wicket/src/ui/widgets/mod.rs +++ b/wicket/src/ui/widgets/mod.rs @@ -16,6 +16,6 @@ pub use animated_logo::{Logo, LogoState, LOGO_HEIGHT, LOGO_WIDTH}; pub use box_connector::{BoxConnector, BoxConnectorKind}; pub use fade::Fade; pub use ignition::IgnitionPopup; -pub use popup::{ButtonText, Popup, PopupBuilder}; +pub use popup::{ButtonText, Popup, PopupBuilder, PopupScrollKind}; pub use rack::Rack; pub use status_view::StatusView; diff --git a/wicket/src/ui/widgets/popup.rs b/wicket/src/ui/widgets/popup.rs index ed3cc88f18..d563b75954 100644 --- a/wicket/src/ui/widgets/popup.rs +++ b/wicket/src/ui/widgets/popup.rs @@ -13,7 +13,9 @@ use tui::{ }; use crate::ui::widgets::{BoxConnector, BoxConnectorKind, Fade}; -use crate::ui::{defaults::dimensions::RectExt, wrap::wrap_text}; +use crate::ui::{ + defaults::dimensions::RectExt, panes::ComputedScrollOffset, wrap::wrap_text, +}; use crate::ui::{ defaults::{colors::*, style}, wrap::wrap_line, @@ -24,8 +26,28 @@ const BUTTON_HEIGHT: u16 = 3; #[derive(Clone, Debug)] pub struct ButtonText<'a> { - pub instruction: &'a str, - pub key: &'a str, + pub instruction: Spans<'a>, + pub key: Spans<'a>, +} + +impl<'a> ButtonText<'a> { + pub fn new(instruction: &'a str, key: &'a str) -> Self { + Self { + instruction: Spans::from(Span::styled( + instruction, + Self::default_instruction_style(), + )), + key: Spans::from(Span::styled(key, Self::default_key_style())), + } + } + + pub fn default_instruction_style() -> Style { + style::help_function() + } + + pub fn default_key_style() -> Style { + style::help_keys() + } } #[derive(Default)] @@ -36,23 +58,28 @@ pub struct PopupBuilder<'a> { } impl<'a> PopupBuilder<'a> { - pub fn build(&self, full_screen: Rect) -> Popup<'_> { - Popup::new(full_screen, &self.header, &self.body, self.buttons.clone()) + pub fn build( + &self, + 
full_screen: Rect, + scroll_kind: PopupScrollKind, + ) -> Popup<'_> { + Popup::new( + full_screen, + &self.header, + &self.body, + self.buttons.clone(), + scroll_kind, + ) } } #[derive(Default)] pub struct Popup<'a> { - // Fields are private because we always want users to go through the - // constructor. - - // This is the header as passed in, except wrapped. - wrapped_header: Text<'a>, - - // We store the *wrapped body* rather than the unwrapped body to make - // `self.height()` and `self.width()` be computed correctly. - wrapped_body: Text<'a>, - buttons: Vec>, + data: PopupData<'a>, + rect: Rect, + chunks: Vec, + body_rect: Rect, + actual_scroll_kind: PopupScrollKind, } impl<'a> Popup<'a> { @@ -61,53 +88,125 @@ impl<'a> Popup<'a> { header: &'a Spans<'_>, body: &'a Text<'_>, buttons: Vec>, + scroll_kind: PopupScrollKind, ) -> Self { let wrapped_header = wrap_line(header, Self::default_wrap_options(full_screen.width)); let wrapped_body = wrap_text(body, Self::default_wrap_options(full_screen.width)); - Self { wrapped_header, wrapped_body, buttons } - } - pub fn height(&self) -> u16 { - let button_height: u16 = - if self.buttons.is_empty() { 0 } else { BUTTON_HEIGHT }; - let bottom_margin: u16 = 1; - let borders: u16 = 3; - u16::try_from(self.wrapped_header.height()).unwrap() - + u16::try_from(self.wrapped_body.height()).unwrap() - + button_height - + bottom_margin - + borders - } + let mut data = PopupData { wrapped_header, wrapped_body, buttons }; - pub fn width(&self) -> u16 { - let borders: u16 = 2; - // Left padding is taken care of by prepending spaces to the body. Right - // padding is added here. 
- let right_padding: u16 = 1; - let body_width = u16::try_from(self.wrapped_body.width()).unwrap() - + borders - + right_padding; - let header_width = u16::try_from(self.wrapped_header.width()).unwrap() - + borders - + right_padding; - let width = u16::max(body_width, header_width); - u16::max(width, self.button_width()) + // Compute the dimensions here so we can compute scroll positions more + // effectively. + let width = u16::min(data.width(), Self::max_width(full_screen.width)); + let height = + u16::min(data.height(), Self::max_height(full_screen.height)); + + let rect = + full_screen.center_horizontally(width).center_vertically(height); + + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints( + [ + // Top titlebar + Constraint::Length(data.wrapped_header.height() as u16 + 2), + Constraint::Min(0), + // Buttons at the bottom will be accounted for while + // rendering the body + ] + .as_ref(), + ) + .split(rect); + + let mut body_rect = chunks[1]; + // Ensure we're inside the outer border. + body_rect.x += 1; + body_rect.width = body_rect.width.saturating_sub(2); + body_rect.height = body_rect.height.saturating_sub(1); + + if !data.buttons.is_empty() { + body_rect.height = body_rect.height.saturating_sub(BUTTON_HEIGHT); + } + + enum ScrollKind { + Disabled, + NotRequired, + Scrolling(u16), + } + + let scroll_kind = match scroll_kind { + PopupScrollKind::Disabled => ScrollKind::Disabled, + PopupScrollKind::Enabled { offset } => { + let height_exceeded = + data.wrapped_body.height() > body_rect.height as usize; + match (data.buttons.is_empty(), height_exceeded) { + (true, true) => { + // Need to add scroll buttons, which necessitates reducing + // the size. 
+ body_rect.height = + body_rect.height.saturating_sub(BUTTON_HEIGHT); + ScrollKind::Scrolling(offset) + } + (false, true) => ScrollKind::Scrolling(offset), + (_, false) => ScrollKind::NotRequired, + } + } + }; + + let actual_scroll_kind = match scroll_kind { + ScrollKind::Disabled => PopupScrollKind::Disabled, + ScrollKind::NotRequired => PopupScrollKind::Enabled { offset: 0 }, + ScrollKind::Scrolling(offset) => { + // Add scroll buttons. + let offset = ComputedScrollOffset::new( + offset as usize, + data.wrapped_body.height(), + body_rect.height as usize, + ); + + let up_style = if offset.can_scroll_up() { + style::selected() + } else { + ButtonText::default_key_style() + }; + let down_style = if offset.can_scroll_down() { + style::selected() + } else { + ButtonText::default_key_style() + }; + + data.buttons.insert( + 0, + ButtonText { + instruction: Span::styled( + "Scroll", + ButtonText::default_instruction_style(), + ) + .into(), + key: Spans::from(vec![ + Span::styled("Up", up_style), + Span::styled("/", ButtonText::default_key_style()), + Span::styled("Down", down_style), + ]), + }, + ); + + PopupScrollKind::Enabled { offset: offset.into_offset() } + } + }; + + Self { data, rect, chunks, body_rect, actual_scroll_kind } } - pub fn button_width(&self) -> u16 { - let space_between_buttons = 1; - let margins = 4; - // Margin + space + angle brackets - let button_extras = 6; - let width = self.buttons.iter().fold(margins, |acc, text| { - acc + text.instruction.len() - + text.key.len() - + button_extras - + space_between_buttons - }); - u16::try_from(width).unwrap() + /// Returns the effective, or actual, scroll kind after the text is laid + /// out. + /// + /// If this is a `PopupScrollKind::Enabled` popup, the offset is is capped + /// to the maximum degree to which text can be scrolled. 
+ pub fn actual_scroll_kind(&self) -> PopupScrollKind { + self.actual_scroll_kind } /// Returns the maximum width that this popup can have, including outer @@ -157,64 +256,149 @@ impl Widget for Popup<'_> { .border_type(BorderType::Rounded) .style(style::selected_line()); - let width = u16::min(self.width(), Self::max_width(full_screen.width)); - let height = - u16::min(self.height(), Self::max_height(full_screen.height)); - - let rect = - full_screen.center_horizontally(width).center_vertically(height); - // Clear the popup - Clear.render(rect, buf); + Clear.render(self.rect, buf); Block::default() .style(Style::default().bg(TUI_BLACK).fg(TUI_BLACK)) - .render(rect, buf); + .render(self.rect, buf); - let chunks = Layout::default() - .direction(Direction::Vertical) - .constraints( - [ - // Top titlebar - Constraint::Length(self.wrapped_header.height() as u16 + 2), - Constraint::Min(0), - // Buttons at the bottom will be accounted for while - // rendering the body - ] - .as_ref(), - ) - .split(rect); - - let header = Paragraph::new(self.wrapped_header).block(block.clone()); - header.render(chunks[0], buf); + let header = + Paragraph::new(self.data.wrapped_header).block(block.clone()); + header.render(self.chunks[0], buf); block .borders(Borders::BOTTOM | Borders::LEFT | Borders::RIGHT) - .render(chunks[1], buf); + .render(self.chunks[1], buf); + + let mut body = Paragraph::new(self.data.wrapped_body); + match self.actual_scroll_kind { + PopupScrollKind::Disabled => {} + PopupScrollKind::Enabled { offset } => { + body = body.scroll((offset, 0)); + } + } - // NOTE: wrapping should be performed externally, by e.g. wrap_text. - let body = Paragraph::new(self.wrapped_body); + body.render(self.body_rect, buf); - let mut body_rect = chunks[1]; - // Ensure we're inside the outer border. 
- body_rect.x += 1; - body_rect.width = body_rect.width.saturating_sub(2); - body_rect.height = body_rect.height.saturating_sub(1); + let connector = BoxConnector::new(BoxConnectorKind::Top); - if !self.buttons.is_empty() { - // Reduce the height so that the body text doesn't overflow into the - // button area. - body_rect.height = body_rect.height.saturating_sub(BUTTON_HEIGHT); + connector.render(self.chunks[1], buf); + + draw_buttons(self.data.buttons, self.chunks[1], buf); + } +} + +/// Scroll kind for a popup. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum PopupScrollKind { + /// Scrolling is disabled. + #[default] + Disabled, + + /// Scrolling is enabled. + Enabled { + /// The offset. + offset: u16, + }, +} + +impl PopupScrollKind { + /// Returns a new enabled scroll kind. + pub fn enabled() -> Self { + Self::Enabled { offset: 0 } + } + + /// Returns true if scrolling is enabled. + pub fn is_scrollable(&self) -> bool { + match self { + Self::Disabled => false, + Self::Enabled { .. } => true, } + } - body.render(body_rect, buf); + /// Scrolls up. + pub fn scroll_up(&mut self) { + match self { + Self::Disabled => {} + Self::Enabled { offset } => { + *offset = offset.saturating_sub(1); + } + } + } - let connector = BoxConnector::new(BoxConnectorKind::Top); + /// Scrolls down. + /// + /// This method doesn't account for scrolling past the bottom of the text. + /// That will be considered and fixed up while creating the `Popup`, and the + /// fixed value returned in [`Popup::actual_offset`]. + pub fn scroll_down(&mut self) { + match self { + PopupScrollKind::Disabled => {} + PopupScrollKind::Enabled { offset } => { + *offset = offset.saturating_add(1); + } + } + } +} + +#[derive(Default)] +struct PopupData<'a> { + // Fields are private because we always want users to go through the + // constructor. + + // This is the header as passed in, except wrapped. 
+ wrapped_header: Text<'a>, - connector.render(chunks[1], buf); + // We store the *wrapped body* rather than the unwrapped body to make + // `self.height()` and `self.width()` be computed correctly. + wrapped_body: Text<'a>, + + buttons: Vec>, +} - draw_buttons(self.buttons, chunks[1], buf); +impl<'a> PopupData<'a> { + pub fn height(&self) -> u16 { + let button_height: u16 = + if self.buttons.is_empty() { 0 } else { BUTTON_HEIGHT }; + let bottom_margin: u16 = 1; + let borders: u16 = 3; + u16::try_from(self.wrapped_header.height()).unwrap() + + u16::try_from(self.wrapped_body.height()).unwrap() + + button_height + + bottom_margin + + borders + } + + pub fn width(&self) -> u16 { + let borders: u16 = 2; + // Left padding is taken care of by prepending spaces to the body. Right + // padding is added here. + let right_padding: u16 = 1; + let body_width = u16::try_from(self.wrapped_body.width()).unwrap() + + borders + + right_padding; + let header_width = u16::try_from(self.wrapped_header.width()).unwrap() + + borders + + right_padding; + let width = u16::max(body_width, header_width); + u16::max(width, self.button_width()) + } + + pub fn button_width(&self) -> u16 { + let space_between_buttons = 1; + let margins = 4; + // Margin + space + angle brackets + let button_extras = 6; + let width = self.buttons.iter().fold(margins, |acc, text| { + acc + text.instruction.width() + + text.key.width() + + button_extras + + space_between_buttons + }); + u16::try_from(width).unwrap() } } + pub fn draw_buttons( buttons: Vec>, body_rect: Rect, @@ -233,8 +417,8 @@ pub fn draw_buttons( .chain(buttons.iter().map(|b| { Constraint::Length( u16::try_from( - b.instruction.len() - + b.key.len() + b.instruction.width() + + b.key.width() + brackets + margin + borders @@ -257,12 +441,13 @@ pub fn draw_buttons( .style(style::selected_line()); for (i, button) in buttons.into_iter().enumerate() { - let b = Paragraph::new(Spans::from(vec![ - Span::raw(" "), - Span::styled(button.instruction, 
style::selected()), - Span::styled(format!(" <{}> ", button.key), style::selected_line()), - ])) - .block(block.clone()); + let mut spans = vec![Span::raw(" ")]; + spans.extend(button.instruction.0); + spans.push(Span::styled(" <", ButtonText::default_key_style())); + spans.extend(button.key.0); + spans.push(Span::styled(">", ButtonText::default_key_style())); + + let b = Paragraph::new(Spans(spans)).block(block.clone()); b.render(button_rects[i + 1], buf); } } From d280f4ef642ecac91194ed531ce872c507f29e40 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 25 May 2023 19:08:46 -0700 Subject: [PATCH 06/14] Nexus needs to use SNI to present the right TLS certificates (#3164) --- .github/buildomat/jobs/deploy.sh | 2 +- Cargo.lock | 15 +- common/src/api/external/mod.rs | 2 +- common/src/nexus_config.rs | 65 +- dev-tools/Cargo.toml | 5 + dev-tools/src/bin/omicron-dev.rs | 70 +- .../output/cmd-omicron-dev-noargs-stderr | 1 + dev-tools/tests/test_omicron_dev.rs | 37 + dns-server/tests/basic_test.rs | 1 - docs/how-to-run-simulated.adoc | 24 + end-to-end-tests/Cargo.toml | 1 - end-to-end-tests/src/helpers/ctx.rs | 216 ++-- gateway/src/lib.rs | 1 - installinator-artifactd/src/server.rs | 1 - internal-dns/src/resolver.rs | 1 - nexus/Cargo.toml | 2 + nexus/db-model/src/certificate.rs | 15 +- nexus/db-queries/src/db/datastore/mod.rs | 1 + nexus/db-queries/src/db/datastore/rack.rs | 2 + nexus/db-queries/src/db/datastore/silo.rs | 28 +- nexus/examples/config.toml | 11 + .../src/app/background/external_endpoints.rs | 202 ++++ nexus/src/app/background/init.rs | 28 +- nexus/src/app/background/mod.rs | 1 + nexus/src/app/certificate.rs | 147 +-- nexus/src/app/external_endpoints.rs | 1048 +++++++++++++++++ nexus/src/app/mod.rs | 105 +- nexus/src/app/rack.rs | 10 +- nexus/src/app/silo.rs | 39 +- nexus/src/lib.rs | 52 +- nexus/test-interface/src/lib.rs | 4 +- nexus/test-utils/src/http_testing.rs | 70 +- nexus/test-utils/src/lib.rs | 50 +- nexus/tests/config.test.toml | 8 +- 
nexus/tests/integration_tests/authn_http.rs | 2 +- nexus/tests/integration_tests/certificates.rs | 572 ++++++--- nexus/tests/integration_tests/console_api.rs | 1 + nexus/tests/integration_tests/updates.rs | 1 + openapi/sled-agent.json | 5 + oxide-client/Cargo.toml | 5 + oxide-client/src/lib.rs | 139 +++ oximeter/producer/examples/producer.rs | 7 +- sled-agent/src/bin/sled-agent-sim.rs | 35 +- sled-agent/src/params.rs | 11 +- sled-agent/src/rack_setup/plan/service.rs | 9 + sled-agent/src/services.rs | 22 +- sled-agent/src/sim/server.rs | 10 +- sled-agent/src/sim/storage.rs | 1 - smf/nexus/config-partial.toml | 5 + wicketd/src/lib.rs | 1 - 50 files changed, 2380 insertions(+), 711 deletions(-) create mode 100644 nexus/src/app/background/external_endpoints.rs create mode 100644 nexus/src/app/external_endpoints.rs diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index 4387ae7aa2..66719067d0 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -253,10 +253,10 @@ pfexec ./out/softnpu/scadm \ dump-state export RUST_BACKTRACE=1 +export E2E_TLS_CERT ./tests/bootstrap rm ./tests/bootstrap -export E2E_TLS_CERT for test_bin in tests/*; do ./"$test_bin" done diff --git a/Cargo.lock b/Cargo.lock index ea3e196752..c70565566f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1990,7 +1990,7 @@ dependencies = [ [[package]] name = "dropshot" version = "0.9.1-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#61c035b1f70d7b2c2e2ee2481e850105bf15ff31" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#62ee51d5494699c9d12271275010c5433a665a71" dependencies = [ "async-stream", "async-trait", @@ -2033,7 +2033,7 @@ dependencies = [ [[package]] name = "dropshot_endpoint" version = "0.9.1-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#61c035b1f70d7b2c2e2ee2481e850105bf15ff31" +source = 
"git+https://github.com/oxidecomputer/dropshot?branch=main#62ee51d5494699c9d12271275010c5433a665a71" dependencies = [ "proc-macro2", "quote", @@ -2156,7 +2156,6 @@ dependencies = [ "chrono", "futures", "http", - "hyper", "omicron-sled-agent", "omicron-test-utils", "oxide-client", @@ -4464,6 +4463,8 @@ name = "omicron-dev-tools" version = "0.1.0" dependencies = [ "anyhow", + "camino", + "camino-tempfile", "clap 4.3.0", "dropshot", "expectorate", @@ -4476,8 +4477,10 @@ dependencies = [ "omicron-rpaths", "omicron-sled-agent", "omicron-test-utils", + "openssl", "oxide-client", "pq-sys", + "rcgen", "signal-hook", "signal-hook-tokio", "subprocess", @@ -4590,6 +4593,7 @@ dependencies = [ "openssl-probe", "openssl-sys", "oso", + "oxide-client", "oximeter 0.1.0", "oximeter-client", "oximeter-db", @@ -4984,16 +4988,21 @@ checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f" name = "oxide-client" version = "0.1.0" dependencies = [ + "anyhow", "base64 0.21.1", "chrono", "futures", + "http", + "hyper", "progenitor", "rand 0.8.5", "regress 0.6.0", "reqwest", "serde", "serde_json", + "thiserror", "tokio", + "trust-dns-resolver", "uuid", ] diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index ec4c595f43..2c9ab80de5 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -82,7 +82,7 @@ pub trait ObjectIdentity { /// /// `NameType` is the type of the field used to sort the returned values and it's /// usually `Name`. -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct DataPageParams<'a, NameType> { /// If present, this is the value of the sort field for the last object seen pub marker: Option<&'a NameType>, diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index dbc0b6fccf..b5e87fdbf6 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -112,11 +112,7 @@ pub struct DeploymentConfig { /// Uuid of the Rack where Nexus is executing. 
pub rack_id: Uuid, /// Dropshot configuration for the external API server. - /// - /// If certificate information is available to Nexus, these - /// settings will also be used to launch an HTTPS server - /// on [PackageConfig::nexus_https_port]. - pub dropshot_external: ConfigDropshot, + pub dropshot_external: ConfigDropshotWithTls, /// Dropshot configuration for internal API server. pub dropshot_internal: ConfigDropshot, /// Portion of the IP space to be managed by the Rack. @@ -140,6 +136,22 @@ impl DeploymentConfig { } } +/// Thin wrapper around `ConfigDropshot` that adds a boolean for enabling TLS +/// +/// The configuration for TLS consists of the list of TLS certificates used. +/// This is dynamic, driven by what's in CockroachDB. That's why we only need a +/// boolean here. (If in the future we want to configure other things about +/// TLS, this could be extended.) +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct ConfigDropshotWithTls { + /// Regular Dropshot configuration parameters + #[serde(flatten)] + pub dropshot: ConfigDropshot, + /// Whether TLS is enabled (default: false) + #[serde(default)] + pub tls: bool, +} + // By design, we require that all config properties be specified (i.e., we don't // use `serde(default)`). 
@@ -259,10 +271,6 @@ impl Default for Tunables { } } -fn default_https_port() -> u16 { - 443 -} - /// Background task configuration #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct BackgroundTaskConfig { @@ -270,6 +278,8 @@ pub struct BackgroundTaskConfig { pub dns_internal: DnsTasksConfig, /// configuration for external DNS background tasks pub dns_external: DnsTasksConfig, + /// configuration for external endpoint list watcher + pub external_endpoints: ExternalEndpointsConfig, } #[serde_as] @@ -294,6 +304,16 @@ pub struct DnsTasksConfig { pub max_concurrent_server_updates: usize, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct ExternalEndpointsConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, + // Other policy around the TLS certificates could go here (e.g., + // allow/disallow wildcard certs, don't serve expired certs, etc.) +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -303,9 +323,6 @@ pub struct PackageConfig { pub log: ConfigLogging, /// Authentication-related configuration pub authn: AuthnConfig, - /// Port Nexus should use for launching HTTPS servers - #[serde(default = "default_https_port")] - pub nexus_https_port: u16, /// Timeseries database configuration. 
#[serde(default)] pub timeseries_db: TimeseriesDbConfig, @@ -394,8 +411,9 @@ mod test { }; use crate::address::{Ipv6Subnet, RACK_PREFIX}; use crate::nexus_config::{ - BackgroundTaskConfig, Database, DeploymentConfig, DnsTasksConfig, - DpdConfig, LoadErrorKind, + BackgroundTaskConfig, ConfigDropshotWithTls, Database, + DeploymentConfig, DnsTasksConfig, DpdConfig, ExternalEndpointsConfig, + LoadErrorKind, }; use dropshot::ConfigDropshot; use dropshot::ConfigLogging; @@ -537,6 +555,7 @@ mod test { dns_external.period_secs_servers = 6 dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 + external_endpoints.period_secs = 9 "##, ) .unwrap(); @@ -549,11 +568,14 @@ mod test { rack_id: "38b90dc4-c22a-65ba-f49a-f051fe01208f" .parse() .unwrap(), - dropshot_external: ConfigDropshot { - bind_address: "10.1.2.3:4567" - .parse::() - .unwrap(), - ..Default::default() + dropshot_external: ConfigDropshotWithTls { + tls: false, + dropshot: ConfigDropshot { + bind_address: "10.1.2.3:4567" + .parse::() + .unwrap(), + ..Default::default() + } }, dropshot_internal: ConfigDropshot { bind_address: "10.1.2.3:4568" @@ -571,7 +593,6 @@ mod test { session_absolute_timeout_minutes: 480 }, authn: AuthnConfig { schemes_external: Vec::new() }, - nexus_https_port: 443, log: ConfigLogging::File { level: ConfigLoggingLevel::Debug, if_exists: ConfigLoggingIfExists::Fail, @@ -603,6 +624,9 @@ mod test { period_secs_propagation: Duration::from_secs(7), max_concurrent_server_updates: 8, }, + external_endpoints: ExternalEndpointsConfig { + period_secs: Duration::from_secs(9), + } }, }, } @@ -648,6 +672,7 @@ mod test { dns_external.period_secs_servers = 6 dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 + external_endpoints.period_secs = 9 "##, ) .unwrap(); diff --git a/dev-tools/Cargo.toml b/dev-tools/Cargo.toml index dd63651646..b9e249195b 100644 --- a/dev-tools/Cargo.toml +++ b/dev-tools/Cargo.toml @@ -9,9 +9,11 @@ path = 
"../rpaths" [dependencies] anyhow.workspace = true +camino.workspace = true clap.workspace = true dropshot.workspace = true futures.workspace = true +libc.workspace = true nexus-test-utils.workspace = true nexus-test-interface.workspace = true omicron-common.workspace = true @@ -20,6 +22,7 @@ omicron-test-utils.workspace = true omicron-sled-agent.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. pq-sys = "*" +rcgen.workspace = true signal-hook.workspace = true signal-hook-tokio.workspace = true tokio = { workspace = true, features = [ "full" ] } @@ -27,9 +30,11 @@ tokio-postgres.workspace = true toml.workspace = true [dev-dependencies] +camino-tempfile.workspace = true expectorate.workspace = true libc.workspace = true omicron-test-utils.workspace = true +openssl.workspace = true oxide-client.workspace = true subprocess.workspace = true diff --git a/dev-tools/src/bin/omicron-dev.rs b/dev-tools/src/bin/omicron-dev.rs index 903e71d5a7..66695f4795 100644 --- a/dev-tools/src/bin/omicron-dev.rs +++ b/dev-tools/src/bin/omicron-dev.rs @@ -4,9 +4,10 @@ //! 
Developer tool for easily running bits of Omicron -use anyhow::anyhow; use anyhow::bail; use anyhow::Context; +use camino::Utf8Path; +use camino::Utf8PathBuf; use clap::Args; use clap::Parser; use futures::stream::StreamExt; @@ -17,6 +18,8 @@ use omicron_sled_agent::sim; use omicron_test_utils::dev; use signal_hook::consts::signal::SIGINT; use signal_hook_tokio::Signals; +use std::io::Write; +use std::os::unix::prelude::OpenOptionsExt; use std::path::PathBuf; #[tokio::main] @@ -28,6 +31,7 @@ async fn main() -> Result<(), anyhow::Error> { OmicronDb::DbWipe { ref args } => cmd_db_wipe(args).await, OmicronDb::ChRun { ref args } => cmd_clickhouse_run(args).await, OmicronDb::RunAll { ref args } => cmd_run_all(args).await, + OmicronDb::CertCreate { ref args } => cmd_cert_create(args).await, }; if let Err(error) = result { fatal(CmdError::Failure(format!("{:#}", error))); @@ -69,6 +73,12 @@ enum OmicronDb { #[clap(flatten)] args: RunAllArgs, }, + + /// Create a self-signed certificate for use with Omicron + CertCreate { + #[clap(flatten)] + args: CertCreateArgs, + }, } #[derive(Clone, Debug, Args)] @@ -321,7 +331,7 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> { }; if let Some(p) = args.nexus_listen_port { - config.deployment.dropshot_external.bind_address.set_port(p); + config.deployment.dropshot_external.dropshot.bind_address.set_port(p); } // Start up a ControlPlaneTestContext, which tautologically sets up @@ -329,17 +339,14 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> { println!("omicron-dev: setting up all services ... "); let cptestctx = nexus_test_utils::test_setup_with_config::< omicron_nexus::Server, - >("omicron-dev", &mut config, sim::SimMode::Auto) + >("omicron-dev", &mut config, sim::SimMode::Auto, None) .await; println!("omicron-dev: services are running."); // Print out basic information about what was started. 
// NOTE: The stdout strings here are not intended to be stable, but they are // used by the test suite. - let addr = - cptestctx.server.get_http_server_external_address().await.ok_or_else( - || anyhow!("Nexus unexpectedly had no external HTTP address"), - )?; + let addr = cptestctx.external_client.bind_address; println!("omicron-dev: nexus external API: {:?}", addr); println!( "omicron-dev: nexus internal API: {:?}", @@ -393,3 +400,52 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> { cptestctx.teardown().await; Ok(()) } + +#[derive(Clone, Debug, Args)] +struct CertCreateArgs { + /// path to where the generated certificate and key files should go + /// (e.g., "out/initial-" would cause the files to be called + /// "out/initial-cert.pem" and "out/initial-key.pem") + #[clap(action)] + output_base: Utf8PathBuf, + + /// DNS names that the certificate claims to be valid for (subject + /// alternative names) + #[clap(action, required = true)] + server_names: Vec, +} + +async fn cmd_cert_create(args: &CertCreateArgs) -> Result<(), anyhow::Error> { + let cert = rcgen::generate_simple_self_signed(args.server_names.clone()) + .context("generating certificate")?; + let cert_pem = + cert.serialize_pem().context("serializing certificate as PEM")?; + let key_pem = cert.serialize_private_key_pem(); + + let cert_path = Utf8PathBuf::from(format!("{}cert.pem", args.output_base)); + write_private_file(&cert_path, cert_pem.as_bytes()) + .context("writing certificate file")?; + println!("wrote certificate to {}", cert_path); + + let key_path = Utf8PathBuf::from(format!("{}key.pem", args.output_base)); + write_private_file(&key_path, key_pem.as_bytes()) + .context("writing private key file")?; + println!("wrote private key to {}", key_path); + + Ok(()) +} + +fn write_private_file( + path: &Utf8Path, + contents: &[u8], +) -> Result<(), anyhow::Error> { + // The file should be readable and writable by the user only. 
+ let perms = libc::S_IRUSR | libc::S_IWUSR; + let mut file = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .mode(perms) + .open(path) + .with_context(|| format!("open {:?} for writing", path))?; + file.write_all(contents).with_context(|| format!("write to {:?}", path)) +} diff --git a/dev-tools/tests/output/cmd-omicron-dev-noargs-stderr b/dev-tools/tests/output/cmd-omicron-dev-noargs-stderr index c8c7de5af3..10da31f6c2 100644 --- a/dev-tools/tests/output/cmd-omicron-dev-noargs-stderr +++ b/dev-tools/tests/output/cmd-omicron-dev-noargs-stderr @@ -8,6 +8,7 @@ Commands: db-wipe Wipe the Omicron schema (and all data) from an existing CockroachDB cluster ch-run Run a ClickHouse database server for development run-all Run a full simulated control plane + cert-create Create a self-signed certificate for use with Omicron help Print this message or the help of the given subcommand(s) Options: diff --git a/dev-tools/tests/test_omicron_dev.rs b/dev-tools/tests/test_omicron_dev.rs index 59729ac364..07c604f86b 100644 --- a/dev-tools/tests/test_omicron_dev.rs +++ b/dev-tools/tests/test_omicron_dev.rs @@ -4,12 +4,14 @@ //! 
Smoke tests for the omicron-dev command-line tool +use anyhow::Context; use expectorate::assert_contents; use omicron_test_utils::dev::db::has_omicron_schema; use omicron_test_utils::dev::process_running; use omicron_test_utils::dev::test_cmds::assert_exit_code; use omicron_test_utils::dev::test_cmds::path_to_executable; use omicron_test_utils::dev::test_cmds::run_command; +use omicron_test_utils::dev::test_cmds::EXIT_SUCCESS; use omicron_test_utils::dev::test_cmds::EXIT_USAGE; use oxide_client::ClientHiddenExt; use std::io::BufRead; @@ -512,3 +514,38 @@ fn test_omicron_dev_db_wipe_no_args() { &stderr_text, ); } + +#[test] +fn test_cert_create() { + let tmpdir = camino_tempfile::tempdir().unwrap(); + println!("tmpdir: {}", tmpdir.path()); + let output_base = format!("{}/test-", tmpdir.path()); + let exec = Exec::cmd(path_to_omicron_dev()) + .arg("cert-create") + .arg(output_base) + .arg("foo.example") + .arg("bar.example"); + let (exit_status, _, stderr_text) = run_command(exec); + assert_exit_code(exit_status, EXIT_SUCCESS, &stderr_text); + let cert_path = tmpdir.path().join("test-cert.pem"); + let key_path = tmpdir.path().join("test-key.pem"); + let cert_contents = std::fs::read(&cert_path) + .with_context(|| format!("reading certificate path {:?}", cert_path)) + .unwrap(); + let key_contents = std::fs::read(&key_path) + .with_context(|| format!("reading private key path: {:?}", key_path)) + .unwrap(); + let certs_pem = openssl::x509::X509::stack_from_pem(&cert_contents) + .context("parsing certificate") + .unwrap(); + let private_key = openssl::pkey::PKey::private_key_from_pem(&key_contents) + .context("parsing private key") + .unwrap(); + assert!(certs_pem + .iter() + .last() + .unwrap() + .public_key() + .unwrap() + .public_eq(&private_key)); +} diff --git a/dns-server/tests/basic_test.rs b/dns-server/tests/basic_test.rs index 297a7176fc..64a854e3d2 100644 --- a/dns-server/tests/basic_test.rs +++ b/dns-server/tests/basic_test.rs @@ -397,7 +397,6 @@ fn 
test_config( let config_dropshot = dropshot::ConfigDropshot { bind_address: "[::1]:0".to_string().parse().unwrap(), request_body_max_bytes: 1024, - ..Default::default() }; Ok((tmp_dir, config_storage, config_dropshot, logctx)) diff --git a/docs/how-to-run-simulated.adoc b/docs/how-to-run-simulated.adoc index 560b1d0b0c..5af145bec1 100644 --- a/docs/how-to-run-simulated.adoc +++ b/docs/how-to-run-simulated.adoc @@ -194,3 +194,27 @@ Once everything is up and running, you can use the system in a few ways: * Use the browser-based console. The Nexus log output will show what IP address and port it's listening on. This is also configured in the config file. If you're using the defaults, you can reach the console at `http://127.0.0.1:12220/projects`. Depending on the environment where you're running this, you may need an ssh tunnel or the like to reach this from your browser. * Use the xref:cli.adoc[`oxide` CLI]. + +== Running with TLS + +When you run the above, you will wind up with Nexus listening on HTTP (with no TLS) on its external address. This is convenient for debugging, but not representative of a real system. If you want to run it with TLS, you need to tweak the above procedure slightly: + +1. You'll need to use the "Running the pieces by hand" section. `omicron-dev run-all` does not currently provide a way to do this (because it doesn't have a way to specify a certificate to be used during rack initialization). +2. Acquire a TLS certificate. The easiest approach is to use `omicron-dev cert-create` to create a self-signed certificate. However you get one, it should be valid for the domain corresponding to your recovery Silo. When you run the pieces by hand, this would be `demo-silo.sys.oxide-dev.test`. If you want a certificate you can use for multiple Silos, make it a wildcard certificate. 
Here's an example: ++ +[source,text] +---- +$ cargo run --bin=omicron-dev -- cert-create demo- '*.sys.oxide-dev.test' +wrote certificate to demo-cert.pem +wrote private key to demo-key.pem +---- +3. Modify your Nexus configuration file to include `tls = true`. See `./nexus/examples/config.toml` for an example. This property is present but commented-out in that file. If you're running on standard port 80 (which is not usually the case in development), you may also want to change the `deployment.dropshot_external.bind_address` port to 443. +4. When you run `sled-agent-sim`, pass the `--rss-tls-cert` and `--rss-tls-key` options as well. These should refer to the files created by `omicron-dev cert-create` above. (They can be any PEM-formatted x509 certificate and associated private key.) +5. Usually at this point you'll be using a self-signed certificate for a domain that's not publicly resolvable with DNS. This makes it hard to use standard clients. Fortunately, `curl` does have flags to make this easy. Continuing with this example, assuming your Nexus HTTPS server is listening on 127.0.0.1:12220 and your Silo's DNS name is `demo-silo.sys.oxide-dev.test`: ++ +[source,text] +---- +$ curl -i --resolve test-suite-silo.sys.oxide-dev.test:12220:127.0.0.1 --cacert /path/to/your/certificate.pem https://test-suite-silo.sys.oxide-dev.test:12220 +---- ++ +The Oxide CLI does not https://github.com/oxidecomputer/oxide.rs/issues/193[yet] support this so it's a little unwieldy to use a local TLS server this way. 
diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index 6c0312e204..d443276b17 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -12,7 +12,6 @@ camino.workspace = true chrono.workspace = true futures.workspace = true http.workspace = true -hyper.workspace = true omicron-sled-agent.workspace = true omicron-test-utils.workspace = true oxide-client.workspace = true diff --git a/end-to-end-tests/src/helpers/ctx.rs b/end-to-end-tests/src/helpers/ctx.rs index b5e3e0346d..120aa50e4a 100644 --- a/end-to-end-tests/src/helpers/ctx.rs +++ b/end-to-end-tests/src/helpers/ctx.rs @@ -1,10 +1,10 @@ use crate::helpers::generate_name; use anyhow::{anyhow, Context as _, Result}; use chrono::Utc; -use futures::future::FutureExt; use omicron_sled_agent::rack_setup::config::SetupServiceConfig; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; -use oxide_client::types::{Name, ProjectCreate, UsernamePasswordCredentials}; +use oxide_client::types::{Name, ProjectCreate}; +use oxide_client::CustomDnsResolver; use oxide_client::{Client, ClientProjectsExt, ClientVpcsExt}; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::Url; @@ -12,11 +12,7 @@ use std::net::IpAddr; use std::net::SocketAddr; use std::sync::Arc; use std::time::Duration; -use trust_dns_resolver::config::{ - NameServerConfig, Protocol, ResolverConfig, ResolverOpts, -}; use trust_dns_resolver::error::ResolveErrorKind; -use trust_dns_resolver::TokioAsyncResolver; const RSS_CONFIG_PATH: &str = concat!( env!("CARGO_MANIFEST_DIR"), @@ -127,13 +123,13 @@ pub async fn nexus_addr() -> Result { let dns_addr = external_dns_addr(&config)?; let dns_name = nexus_external_dns_name(&config); let resolver = CustomDnsResolver::new(dns_addr)?; - resolver - .wait_for_records( - &dns_name, - Duration::from_secs(1), - Duration::from_secs(300), - ) - .await + wait_for_records( + &resolver, + &dns_name, + Duration::from_secs(1), + Duration::from_secs(300), + ) + .await } 
pub async fn build_client() -> Result { @@ -145,15 +141,6 @@ pub async fn build_client() -> Result { let dns_name = nexus_external_dns_name(&config); let resolver = Arc::new(CustomDnsResolver::new(dns_addr)?); - // Do not have reqwest follow redirects. That's because our login response - // includes both a redirect and the session cookie header. If reqwest - // follows the redirect, we won't have a chance to get the cookie. - let mut builder = reqwest::ClientBuilder::new() - .connect_timeout(Duration::from_secs(15)) - .redirect(reqwest::redirect::Policy::none()) - .dns_resolver(resolver.clone()) - .timeout(Duration::from_secs(60)); - // If we were provided with a path to a certificate in the environment, add // it as a trusted one. let (proto, extra_root_cert) = match std::env::var(E2E_TLS_CERT_ENV) { @@ -170,12 +157,6 @@ pub async fn build_client() -> Result { } }; - if let Some(cert) = &extra_root_cert { - builder = builder.add_root_certificate(cert.clone()); - } - - let reqwest_login_client = builder.build()?; - // Prepare to make a login request. let base_url = format!("{}://{}", proto, dns_name); let silo_name = config.recovery_silo.silo_name.as_str(); @@ -186,38 +167,44 @@ pub async fn build_client() -> Result { })?; // See the comment in the config file about this password. let password: oxide_client::types::Password = "oxide".parse().unwrap(); - let login_request_body = - serde_json::to_string(&UsernamePasswordCredentials { - username, - password, - }) - .context("serializing login request body")?; // By the time we get here, Nexus might not be up yet. It may not have // published its names to external DNS, and even if it has, it may not have // opened its external listening socket. So we have to retry a bit until we // succeed. 
- let response = wait_for_condition( + let session_token = wait_for_condition( || async { // Use a raw reqwest client because it's not clear that Progenitor // is intended to support endpoints that return 300-level response // codes. See progenitor#451. eprintln!("{}: attempting to log into API", Utc::now()); - reqwest_login_client - .post(&login_url) - .body(login_request_body.clone()) - .send() - .await - .map_err(|e| { - eprintln!("{}: login failed: {:#}", Utc::now(), e); + + let mut builder = reqwest::ClientBuilder::new() + .connect_timeout(Duration::from_secs(15)) + .dns_resolver(resolver.clone()) + .timeout(Duration::from_secs(60)); + + if let Some(cert) = &extra_root_cert { + builder = builder.add_root_certificate(cert.clone()); + } + + oxide_client::login( + builder, + &login_url, + username.clone(), + password.clone(), + ) + .await + .map_err(|e| { + eprintln!("{}: login failed: {:#}", Utc::now(), e); + if let oxide_client::LoginError::RequestError(e) = &e { if e.is_connect() { - CondCheckError::NotYet - } else { - CondCheckError::Failed( - anyhow::Error::new(e).context("logging in"), - ) + return CondCheckError::NotYet; } - }) + } + + CondCheckError::Failed(e) + }) }, &Duration::from_secs(1), &Duration::from_secs(300), @@ -226,20 +213,11 @@ pub async fn build_client() -> Result { .context("logging in")?; eprintln!("{}: login succeeded", Utc::now()); - let session_cookie = response - .headers() - .get(http::header::SET_COOKIE) - .ok_or_else(|| anyhow!("expected session cookie after login"))? 
- .to_str() - .context("expected session cookie token to be a string")?; - let (session_token, rest) = session_cookie.split_once("; ").unwrap(); - assert!(session_token.starts_with("session=")); - assert!(rest.contains("Path=/; HttpOnly; SameSite=Lax; Max-Age=")); let mut headers = HeaderMap::new(); headers.insert( http::header::COOKIE, - HeaderValue::from_str(session_token).unwrap(), + HeaderValue::from_str(&format!("session={}", session_token)).unwrap(), ); let mut builder = reqwest::ClientBuilder::new() @@ -256,91 +234,43 @@ pub async fn build_client() -> Result { Ok(Client::new_with_client(&base_url, reqwest_client)) } -/// Wrapper around a `TokioAsyncResolver` so that we can impl -/// `reqwest::dns::Resolve` for it. -struct CustomDnsResolver { - dns_addr: SocketAddr, - // The lifetime constraints on the `Resolve` trait make it hard to avoid an - // Arc here. - resolver: Arc, -} - -impl CustomDnsResolver { - fn new(dns_addr: SocketAddr) -> Result { - let mut resolver_config = ResolverConfig::new(); - resolver_config.add_name_server(NameServerConfig { - socket_addr: dns_addr, - protocol: Protocol::Udp, - tls_dns_name: None, - trust_nx_responses: false, - bind_addr: None, - }); - - let resolver = Arc::new( - TokioAsyncResolver::tokio(resolver_config, ResolverOpts::default()) - .context("failed to create resolver")?, - ); - Ok(CustomDnsResolver { dns_addr, resolver }) - } - - async fn wait_for_records( - &self, - dns_name: &str, - check_period: Duration, - max: Duration, - ) -> Result { - wait_for_condition::<_, anyhow::Error, _, _>( - || async { - self.resolver - .lookup_ip(dns_name) - .await - .map_err(|e| match e.kind() { - ResolveErrorKind::NoRecordsFound { .. 
} - | ResolveErrorKind::Timeout => CondCheckError::NotYet, - _ => CondCheckError::Failed( - anyhow::Error::new(e).context(format!( - "resolving {:?} from {}", - dns_name, self.dns_addr - )), +async fn wait_for_records( + resolver: &CustomDnsResolver, + dns_name: &str, + check_period: Duration, + max: Duration, +) -> Result { + wait_for_condition::<_, anyhow::Error, _, _>( + || async { + resolver + .resolver() + .lookup_ip(dns_name) + .await + .map_err(|e| match e.kind() { + ResolveErrorKind::NoRecordsFound { .. } + | ResolveErrorKind::Timeout => CondCheckError::NotYet, + _ => CondCheckError::Failed(anyhow::Error::new(e).context( + format!( + "resolving {:?} from {}", + dns_name, + resolver.dns_addr() ), - })? - .iter() - .next() - .ok_or(CondCheckError::NotYet) - }, - &check_period, - &max, + )), + })? + .iter() + .next() + .ok_or(CondCheckError::NotYet) + }, + &check_period, + &max, + ) + .await + .with_context(|| { + format!( + "failed to resolve {:?} from {:?} within {:?}", + dns_name, + resolver.dns_addr(), + max ) - .await - .with_context(|| { - format!( - "failed to resolve {:?} from {:?} within {:?}", - dns_name, self.dns_addr, max - ) - }) - } -} - -pub fn gateway_ip() -> String { - std::env::var("GATEWAY_IP") - .expect("GATEWAY_IP environment variable required") -} - -impl reqwest::dns::Resolve for CustomDnsResolver { - fn resolve( - &self, - name: hyper::client::connect::dns::Name, - ) -> reqwest::dns::Resolving { - let resolver = self.resolver.clone(); - async move { - let list = resolver.lookup_ip(name.as_str()).await?; - Ok(Box::new(list.into_iter().map(|s| { - // reqwest does not appear to use the port number here. - // (See the docs for `ClientBuilder::resolve()`, which isn't - // the same thing, but is related.) 
- SocketAddr::from((s, 0)) - })) as Box + Send>) - } - .boxed() - } + }) } diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index a9c5e5a383..d65536901c 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -93,7 +93,6 @@ fn start_dropshot_server( let dropshot = ConfigDropshot { bind_address: SocketAddr::V6(addr), request_body_max_bytes, - ..Default::default() }; let http_server_starter = dropshot::HttpServerStarter::new( &dropshot, diff --git a/installinator-artifactd/src/server.rs b/installinator-artifactd/src/server.rs index 1054953a2c..feb7dd75d8 100644 --- a/installinator-artifactd/src/server.rs +++ b/installinator-artifactd/src/server.rs @@ -54,7 +54,6 @@ impl ArtifactServer { // https://github.com/oxidecomputer/dropshot/pull/618 lands and is // available in omicron. request_body_max_bytes: 4 * 1024 * 1024, - ..Default::default() }; let api = crate::http_entrypoints::api(); diff --git a/internal-dns/src/resolver.rs b/internal-dns/src/resolver.rs index c6e0ea1e19..1df2f6fc5c 100644 --- a/internal-dns/src/resolver.rs +++ b/internal-dns/src/resolver.rs @@ -232,7 +232,6 @@ mod test { &dropshot::ConfigDropshot { bind_address: "[::1]:0".parse().unwrap(), request_body_max_bytes: 8 * 1024, - ..Default::default() }, ) .await diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 49de7cea8a..dfaadf10d6 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -87,6 +87,7 @@ omicron-passwords.workspace = true oximeter.workspace = true oximeter-instruments = { workspace = true, features = ["http-instruments"] } oximeter-producer.workspace = true +rustls = { workspace = true } [dev-dependencies] assert_matches.workspace = true @@ -101,6 +102,7 @@ omicron-sled-agent.workspace = true omicron-test-utils.workspace = true openapi-lint.workspace = true openapiv3.workspace = true +oxide-client.workspace = true pem.workspace = true petgraph.workspace = true rcgen.workspace = true diff --git a/nexus/db-model/src/certificate.rs b/nexus/db-model/src/certificate.rs index 
bede4dfeac..2d7e90f959 100644 --- a/nexus/db-model/src/certificate.rs +++ b/nexus/db-model/src/certificate.rs @@ -128,13 +128,24 @@ impl Certificate { let cert = validate_certs(¶ms.cert)?; validate_private_key(cert, ¶ms.key)?; - Ok(Self { + Ok(Self::new_unvalidated(silo_id, id, service, params)) + } + + // For testing only: allow creation of certificates that would otherwise not + // be allowed (e.g., expired) + pub fn new_unvalidated( + silo_id: Uuid, + id: Uuid, + service: ServiceKind, + params: params::CertificateCreate, + ) -> Self { + Self { identity: CertificateIdentity::new(id, params.identity), silo_id, service, cert: params.cert, key: params.key, - }) + } } } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 22c3f2c161..cf06f23aef 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -82,6 +82,7 @@ mod zpool; pub use address_lot::AddressLotCreateResult; pub use dns::DnsVersionUpdateBuilder; pub use rack::RackInit; +pub use silo::Discoverability; pub use switch_port::SwitchPortSettingsCombinedResult; pub use virtual_provisioning_collection::StorageType; pub use volume::CrucibleResources; diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index d45ee89377..0292fafec1 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -548,6 +548,7 @@ mod test { use crate::db::datastore::test::{ sled_baseboard_for_test, sled_system_hardware_for_test, }; + use crate::db::datastore::Discoverability; use crate::db::lookup::LookupPath; use crate::db::model::ExternalIp; use crate::db::model::IpKind; @@ -668,6 +669,7 @@ mod test { limit: NonZeroU32::new(2).unwrap(), direction: dropshot::PaginationOrder::Ascending, }), + Discoverability::DiscoverableOnly, ) .await .expect("Failed to list Silos"); diff --git a/nexus/db-queries/src/db/datastore/silo.rs 
b/nexus/db-queries/src/db/datastore/silo.rs index 73ba986d07..75b8ccaa6e 100644 --- a/nexus/db-queries/src/db/datastore/silo.rs +++ b/nexus/db-queries/src/db/datastore/silo.rs @@ -42,6 +42,14 @@ use omicron_common::api::external::ResourceType; use ref_cast::RefCast; use uuid::Uuid; +/// Filter a "silo_list" query based on silos' discoverability +pub enum Discoverability { + /// Show all Silos + All, + /// Show only discoverable Silos + DiscoverableOnly, +} + impl DataStore { /// Load built-in silos into the database pub async fn load_builtin_silos( @@ -284,11 +292,12 @@ impl DataStore { &self, opctx: &OpContext, pagparams: &PaginatedBy<'_>, + discoverability: Discoverability, ) -> ListResultVec { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; use db::schema::silo::dsl; - match pagparams { + let mut query = match pagparams { PaginatedBy::Id(params) => paginated(dsl::silo, dsl::id, ¶ms), PaginatedBy::Name(params) => paginated( dsl::silo, @@ -296,12 +305,17 @@ impl DataStore { ¶ms.map_name(|n| Name::ref_cast(n)), ), } - .filter(dsl::time_deleted.is_null()) - .filter(dsl::discoverable.eq(true)) - .select(Silo::as_select()) - .load_async::(self.pool_authorized(opctx).await?) - .await - .map_err(|e| public_error_from_diesel_pool(e, ErrorHandler::Server)) + .filter(dsl::time_deleted.is_null()); + + if let Discoverability::DiscoverableOnly = discoverability { + query = query.filter(dsl::discoverable.eq(true)); + } + + query + .select(Silo::as_select()) + .load_async::(self.pool_authorized(opctx).await?) 
+ .await + .map_err(|e| public_error_from_diesel_pool(e, ErrorHandler::Server)) } pub async fn silo_delete( diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index c97f8ae089..3ca3eaa921 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -43,6 +43,12 @@ bind_address = "127.0.0.1:12220" # Allow larger request bodies (1MiB) to accomodate firewall endpoints (one # rule is ~500 bytes) request_body_max_bytes = 1048576 +# To have Nexus's external HTTP endpoint use TLS, uncomment the line below. You +# will also need to provide an initial TLS certificate during rack +# initialization. If you're using this config file, you're probably running a +# simulated system. In that case, the initial certificate is provided to the +# simulated sled agent (acting as RSS) via command-line arguments. +#tls = true [deployment.dropshot_internal] # IP Address and TCP port on which to listen for the internal API @@ -77,3 +83,8 @@ dns_external.period_secs_config = 60 dns_external.period_secs_servers = 60 dns_external.period_secs_propagation = 60 dns_external.max_concurrent_server_updates = 5 +# How frequently we check the list of stored TLS certificates. This is +# approximately an upper bound on how soon after updating the list of +# certificates it will take _other_ Nexus instances to notice and stop serving +# them (on a sunny day). +external_endpoints.period_secs = 60 diff --git a/nexus/src/app/background/external_endpoints.rs b/nexus/src/app/background/external_endpoints.rs new file mode 100644 index 0000000000..53401c16de --- /dev/null +++ b/nexus/src/app/background/external_endpoints.rs @@ -0,0 +1,202 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for keeping track of all externally-visible endpoints: +//! 
all Silos, their externally-visible DNS names, and the TLS certificates +//! associated with those names + +use super::common::BackgroundTask; +use crate::app::external_endpoints::read_all_endpoints; +pub use crate::app::external_endpoints::ExternalEndpoints; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use serde_json::json; +use std::sync::Arc; +use tokio::sync::watch; + +/// Background task that keeps track of the latest list of TLS certificates for +/// Nexus's external endpoint +pub struct ExternalEndpointsWatcher { + datastore: Arc, + last: Option, + tx: watch::Sender>, + rx: watch::Receiver>, +} + +impl ExternalEndpointsWatcher { + pub fn new(datastore: Arc) -> ExternalEndpointsWatcher { + let (tx, rx) = watch::channel(None); + ExternalEndpointsWatcher { datastore, last: None, tx, rx } + } + + /// Exposes the latest set of TLS certificates + /// + /// You can use the returned [`watch::Receiver`] to look at the latest + /// configuration or to be notified when it changes. 
+ pub fn watcher(&self) -> watch::Receiver> { + self.rx.clone() + } +} + +impl BackgroundTask for ExternalEndpointsWatcher { + fn activate<'a, 'b, 'c>( + &'a mut self, + opctx: &'b OpContext, + ) -> BoxFuture<'c, serde_json::Value> + where + 'a: 'c, + 'b: 'c, + { + async { + let log = &opctx.log; + + let result = read_all_endpoints(&self.datastore, opctx).await; + + if let Err(error) = result { + warn!( + &log, + "failed to read Silo/DNS/TLS configuration"; + "error" => format!("{:#}", error) + ); + return json!({ + "error": + format!( + "failed to read Silo/DNS/TLS configuration: \ + {:#}", + error + ) + }); + } + + let new_config = result.unwrap(); + let rv = + serde_json::to_value(&new_config).unwrap_or_else(|error| { + json!({ + "error": + format!( + "failed to serialize final value: {:#}", + error + ) + }) + }); + + match &self.last { + None => { + info!( + &log, + "found Silo/DNS/TLS config (initial)"; + "config" => ?new_config, + ); + self.last = Some(new_config.clone()); + self.tx.send_replace(Some(new_config)); + } + + Some(old) => { + if *old == new_config { + debug!( + &log, + "found Silo/DNS/TLS config (no change)"; + "config" => ?new_config, + ); + } else { + info!( + &log, + "found Silo/DNS/TLS config (changed)"; + "config" => ?new_config, + ); + self.last = Some(new_config.clone()); + self.tx.send_replace(Some(new_config)); + } + } + }; + + rv + } + .boxed() + } +} + +#[cfg(test)] +mod test { + use crate::app::background::common::BackgroundTask; + use crate::app::background::external_endpoints::ExternalEndpointsWatcher; + use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; + use nexus_test_utils::resource_helpers::create_silo; + use nexus_test_utils_macros::nexus_test; + use nexus_types::external_api::shared::SiloIdentityMode; + use nexus_types::identity::Resource; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn 
test_basic(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.apictx().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Verify the initial state. + let mut task = ExternalEndpointsWatcher::new(datastore.clone()); + let watcher = task.watcher(); + assert!(watcher.borrow().is_none()); + + // The datastore from the ControlPlaneTestContext is initialized with + // two Silos: the built-in Silo and the recovery Silo. + let recovery_silo_dns_name = format!( + "{}.sys.{}", + cptestctx.silo_name.as_str(), + cptestctx.external_dns_zone_name + ); + let builtin_silo_dns_name = format!( + "{}.sys.{}", + DEFAULT_SILO.identity().name.as_str(), + cptestctx.external_dns_zone_name, + ); + let _ = task.activate(&opctx).await; + let initial_state_raw = watcher.borrow(); + let initial_state = initial_state_raw.as_ref().unwrap(); + assert!(initial_state.has_domain(recovery_silo_dns_name.as_str())); + assert!(initial_state.has_domain(builtin_silo_dns_name.as_str())); + // There are no other Silos. + assert_eq!(initial_state.ndomains(), 2); + // Neither of these will have a valid certificate in this configuration. + assert_eq!(initial_state.nwarnings(), 2); + drop(initial_state_raw); + + // If we create another Silo, we should see that one, too. + let new_silo_name = "test-silo"; + create_silo( + &cptestctx.external_client, + new_silo_name, + false, + SiloIdentityMode::LocalOnly, + ) + .await; + let new_silo_dns_name = format!( + "{}.sys.{}", + new_silo_name, cptestctx.external_dns_zone_name + ); + let _ = task.activate(&opctx).await; + let new_state_raw = watcher.borrow(); + let new_state = new_state_raw.as_ref().unwrap(); + assert!(new_state.has_domain(recovery_silo_dns_name.as_str())); + assert!(new_state.has_domain(builtin_silo_dns_name.as_str())); + assert!(new_state.has_domain(new_silo_dns_name.as_str())); + // There are no other Silos. 
+ assert_eq!(new_state.ndomains(), 3); + // None of these will have a valid certificate in this configuration. + assert_eq!(new_state.nwarnings(), 3); + + // That's it. We're not testing all possible cases. That's done with + // unit tests for the underlying function. We're just testing that the + // background task reports updated state when the underlying state + // changes. + } +} diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 24c713e030..79de660083 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -8,6 +8,7 @@ use super::common; use super::dns_config; use super::dns_propagation; use super::dns_servers; +use super::external_endpoints; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -34,6 +35,13 @@ pub struct BackgroundTasks { pub task_external_dns_config: common::TaskHandle, /// task handle for the external DNS servers background task pub task_external_dns_servers: common::TaskHandle, + + /// task handle for the task that keeps track of external endpoints + pub task_external_endpoints: common::TaskHandle, + /// external endpoints read by the background task + pub external_endpoints: tokio::sync::watch::Receiver< + Option, + >, } impl BackgroundTasks { @@ -55,17 +63,34 @@ impl BackgroundTasks { let (task_external_dns_config, task_external_dns_servers) = init_dns( &mut driver, opctx, - datastore, + datastore.clone(), DnsGroup::External, &config.dns_external, ); + // Background task: External endpoints list watcher + let (task_external_endpoints, external_endpoints) = { + let watcher = + external_endpoints::ExternalEndpointsWatcher::new(datastore); + let watcher_channel = watcher.watcher(); + let task = driver.register( + "external_endpoints".to_string(), + config.external_endpoints.period_secs, + Box::new(watcher), + opctx.child(BTreeMap::new()), + vec![], + ); + (task, watcher_channel) + }; + BackgroundTasks { driver, 
task_internal_dns_config, task_internal_dns_servers, task_external_dns_config, task_external_dns_servers, + task_external_endpoints, + external_endpoints, } } @@ -223,7 +248,6 @@ pub mod test { &dropshot::ConfigDropshot { bind_address: "[::1]:0".parse().unwrap(), request_body_max_bytes: 8 * 1024, - ..Default::default() }, ) .await diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index f77a0bb001..a3bf8efb29 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -8,6 +8,7 @@ mod common; mod dns_config; mod dns_propagation; mod dns_servers; +mod external_endpoints; mod init; pub use common::Driver; diff --git a/nexus/src/app/certificate.rs b/nexus/src/app/certificate.rs index 323701eda4..a84a71ebef 100644 --- a/nexus/src/app/certificate.rs +++ b/nexus/src/app/certificate.rs @@ -13,12 +13,9 @@ use crate::external_api::params; use crate::external_api::shared; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; -use nexus_types::identity::Resource; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; -use omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; -use omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::NameOrId; @@ -63,13 +60,12 @@ impl super::Nexus { match kind { shared::ServiceUsingCertificate::ExternalApi => { - // TODO: If we make this operation "add a certificate, and try to update - // nearby Nexus servers to use it", that means it'll be combining a DB - // operation with a service update request. If we want both to reliably - // complete together, we should consider making this a saga. - // TODO: Refresh other nexus servers? 
- self.refresh_tls_config(&opctx).await?; - info!(self.log, "TLS refreshed successfully"); + // TODO We could improve the latency of other Nexus instances + // noticing this certificate change with an explicit request to + // them. Today, Nexus instances generally don't talk to each + // other. That's a very valuable simplifying assumption. + self.background_tasks + .activate(&self.background_tasks.task_external_endpoints); Ok(cert) } } @@ -95,137 +91,12 @@ impl super::Nexus { self.db_datastore.certificate_delete(opctx, &authz_cert).await?; match db_cert.service { ServiceKind::Nexus => { - // TODO: Refresh other nexus servers? - self.refresh_tls_config(&opctx).await?; + // See the comment in certificate_create() above. + self.background_tasks + .activate(&self.background_tasks.task_external_endpoints); } _ => (), }; Ok(()) } - - // Helper functions used by Nexus when managing its own server. - - /// Returns the dropshot TLS configuration to run the Nexus external server. - pub async fn get_nexus_tls_config( - &self, - opctx: &OpContext, - ) -> Result, Error> { - // Lookup x509 certificates which might be stored in CRDB, specifically - // for launching the Nexus service. - // - // We only grab one certificate (see: the "limit" argument) because - // we're currently fine just using whatever certificate happens to be - // available (as long as it's for Nexus). 
- let certs = self - .datastore() - .certificate_list_for( - &opctx, - Some(db::model::ServiceKind::Nexus), - &PaginatedBy::Name(DataPageParams { - marker: None, - direction: dropshot::PaginationOrder::Ascending, - limit: std::num::NonZeroU32::new(2).unwrap(), - }), - false, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "Failed to list certificates: {e}" - )) - })?; - - let certificate = if let Some(certificate) = certs.get(0) { - certificate - } else { - return Ok(None); - }; - - if certs.len() > 1 { - warn!( - self.log, - "Multiple x.509 certificates available for nexus, but choosing: {}", - certificate.name() - ); - } - - Ok(Some(dropshot::ConfigTls::AsBytes { - certs: certificate.cert.clone(), - key: certificate.key.clone(), - })) - } - - /// Refreshes the TLS configuration for the currently-running Nexus external - /// server. This involves either: - /// - Creating a new HTTPS server if one does not exist - /// - Refreshing an existing HTTPS server if it already exists - /// - Tearing down an HTTPS server if no certificates exist - pub async fn refresh_tls_config( - &self, - opctx: &OpContext, - ) -> Result<(), Error> { - let tls_config = self.get_nexus_tls_config(&opctx).await?; - - let mut external_servers = self.external_servers.lock().await; - - match (tls_config, external_servers.https.take()) { - // Create a new server, using server context from an existing HTTP - // server. 
- (Some(tls_config), None) => { - info!(self.log, "Refresh TLS: Creating HTTPS server"); - let mut cfg = external_servers.config.clone(); - cfg.bind_address.set_port(external_servers.https_port()); - cfg.tls = Some(tls_config); - - let context = - external_servers.get_context().ok_or_else(|| { - Error::internal_error("No server context available") - })?; - - let log = - context.log.new(o!("component" => "dropshot_external")); - let server_starter_external = dropshot::HttpServerStarter::new( - &cfg, - crate::external_api::http_entrypoints::external_api(), - context, - &log, - ) - .map_err(|e| { - Error::internal_error(&format!( - "Initializing HTTPS server: {e}" - )) - })?; - external_servers.set_https(server_starter_external.start()); - } - // Refresh an existing server. - (Some(tls_config), Some(https)) => { - info!( - self.log, - "Refresh TLS: Refreshing HTTPS server at {}", - https.local_addr() - ); - https.refresh_tls(&tls_config) - .await - .expect("HTTPS server should be using HTTPS already, to enable refresh"); - external_servers.set_https(https); - } - // Tear down an existing server. - (None, Some(https)) => { - info!( - self.log, - "Refresh TLS: Stopping HTTPS server at {}", - https.local_addr() - ); - https.close().await.map_err(|e| { - Error::internal_error(&format!( - "Failed to stop server: {e}" - )) - })?; - } - // No config, no server. - (None, None) => (), - } - - Ok(()) - } } diff --git a/nexus/src/app/external_endpoints.rs b/nexus/src/app/external_endpoints.rs new file mode 100644 index 0000000000..a55fb96caf --- /dev/null +++ b/nexus/src/app/external_endpoints.rs @@ -0,0 +1,1048 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Management of external HTTPS endpoints +//! +//! Whenever a client connects to one of our external endpoints and attempts to +//! 
establish a TLS session, we must provide a TLS certificate to authenticate +//! ourselves to the client. But each Silo has a separate external DNS name and +//! may have its own TLS certificate for that DNS name. These all resolve to +//! the same set of IPs, so we cannot tell from the IP address alone which +//! Silo's endpoint the client is trying to reach nor which certificate to +//! present. TLS provides a mechanism called Server Name Indication (SNI) for +//! clients to specify the name of the server they're trying to reach _before_ +//! the TLS session is established. We use this to determine which Silo +//! endpoint the client is trying to reach and so which TLS certificate to +//! present. +//! +//! To achieve this, we first need to know what DNS names, Silos, and TLS +//! certificates are available at any given time. This is summarized in +//! [`ExternalEndpoints`]. A background task is responsible for maintaining +//! this, providing the latest version to whoever needs it via a `watch` +//! channel. How do we tell the TLS stack what certificate to use? When +//! setting up the Dropshot server in the first place, we provide a +//! [`rustls::ServerConfig`] that describes various TLS settings, including a +//! "certificate resolver" object that impls +//! [`rustls::server::ResolvesServerCert`]. See [`NexusCertResolver`]. 
+ +use super::silo::silo_dns_name; +use crate::db::model::ServiceKind; +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; +use nexus_db_model::Certificate; +use nexus_db_model::DnsGroup; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::Discoverability; +use nexus_db_queries::db::DataStore; +use nexus_types::identity::Resource; +use omicron_common::api::external::http_pagination::PaginatedBy; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::Error; +use omicron_common::api::external::Name as ExternalName; +use omicron_common::bail_unless; +use openssl::pkey::PKey; +use openssl::x509::X509; +use rustls::sign::CertifiedKey; +use serde::Serialize; +use serde_with::SerializeDisplay; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::fmt; +use std::num::NonZeroU32; +use std::sync::Arc; +use thiserror::Error; +use tokio::sync::watch; +use uuid::Uuid; + +/// Describes the set of external endpoints, organized by DNS name +/// +/// This data structure provides a quick way to determine which Silo and TLS +/// certificate(s) make sense for an incoming request, based on the TLS +/// session's SNI (DNS name). See module-level docs for details. +/// +/// This object provides no interfaces outside this module. It's only used by +/// the `NexusCertResolver` that's also in this module. 
+/// +/// This structure impls `Serialize` only so that background tasks can easily +/// present the latest configuration that they've found (e.g., via a debug API) +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct ExternalEndpoints { + by_dns_name: BTreeMap>, + warnings: Vec, +} + +impl ExternalEndpoints { + /// Assemble a list of Silos, TLS certificates, and external DNS zones into + /// a structure that we can use for quickly figuring out which Silo and TLS + /// certificates are associated with each incoming DNS name + fn new( + silos: Vec, + certs: Vec, + external_dns_zones: Vec, + ) -> ExternalEndpoints { + // We want to avoid failing this operation even if we encounter problems + // because we want to serve as many DNS certificates as we can (so that + // an operator has a chance of fixing any problems that do exist). + // Instead of returning any errors, keep track of any issues as + // warnings. + let mut warnings = vec![]; + + // Compute a mapping from external DNS name to Silo id. Detect any + // duplicates and leave them out (but report them). There should not + // be any duplicates since the DNS names are constructed from the + // (unique) Silo names. Even if we support aliases in the future, they + // will presumably need to be unique, too. 
+ let silo_names: BTreeMap = silos + .iter() + .map(|db_silo| (db_silo.id(), db_silo.name().clone())) + .collect(); + let mut dns_names: BTreeMap = BTreeMap::new(); + for z in external_dns_zones { + for s in &silos { + let dns_name = + format!("{}.{}", silo_dns_name(s.name()), z.zone_name); + match dns_names.entry(dns_name.clone()) { + Entry::Vacant(vac) => { + vac.insert(s.id()); + } + Entry::Occupied(occ) => { + let first_silo_id = *occ.get(); + let first_silo_name = + silo_names.get(&first_silo_id).unwrap().to_string(); + warnings.push(ExternalEndpointError::DupDnsName { + dup_silo_id: s.id(), + dup_silo_name: s.name().to_string(), + first_silo_id, + first_silo_name, + dns_name, + }) + } + }; + } + } + + // Compute a mapping from silo id to a list of usable TLS certificates + // for the Silo. By "usable" here, we just mean that we are capable of + // providing it to the client. This basically means that we can parse + // it. A certificate might be invalid for some other reason (e.g., does + // not match the right DNS name or it's expired). We may later choose + // to prefer some certificates over others, but that'll be decided later + // (see best_certificate()). And in the end it'll be better to provide + // an expired certificate than none at all. + let (silo_tls_certs, cert_warnings): (Vec<_>, Vec<_>) = certs + .into_iter() + .map(|db_cert| (db_cert.silo_id, TlsCertificate::try_from(db_cert))) + .partition(|(_, e)| e.is_ok()); + warnings.extend(cert_warnings.into_iter().map(|(silo_id, e)| { + let reason = match e { + // We partitioned above by whether this is an error not, so we + // shouldn't find a non-error here. (We cannot use unwrap_err() + // because the `Ok` type doesn't impl `Debug`.) 
+ Ok(_) => unreachable!("found certificate in list of errors"), + Err(e) => Arc::new(e), + }; + + ExternalEndpointError::BadCert { silo_id, reason } + })); + let mut certs_by_silo_id = BTreeMap::new(); + for (silo_id, tls_cert) in silo_tls_certs.into_iter() { + // This was partitioned above so we should only have the non-errors + // here. + let tls_cert = tls_cert.unwrap(); + let silo_entry = + certs_by_silo_id.entry(silo_id).or_insert_with(|| { + ExternalEndpoint { silo_id, tls_certs: Vec::new() } + }); + silo_entry.tls_certs.push(tls_cert) + } + + let certs_by_silo_id: BTreeMap<_, _> = certs_by_silo_id + .into_iter() + .map(|(k, v)| (k, Arc::new(v))) + .collect(); + + let by_dns_name: BTreeMap<_, _> = dns_names + .into_iter() + .map(|(dns_name, silo_id)| { + let silo_info = certs_by_silo_id + .get(&silo_id) + .cloned() + .unwrap_or_else(|| { + Arc::new(ExternalEndpoint { + silo_id, + tls_certs: vec![], + }) + }); + + if silo_info.tls_certs.is_empty() { + warnings.push(ExternalEndpointError::NoSiloCerts { + silo_id, + dns_name: dns_name.clone(), + }) + } + + (dns_name, silo_info) + }) + .collect(); + + if by_dns_name.is_empty() { + warnings.push(ExternalEndpointError::NoEndpoints); + } + + ExternalEndpoints { by_dns_name, warnings } + } + + #[cfg(test)] + pub fn has_domain(&self, dns_name: &str) -> bool { + self.by_dns_name.contains_key(dns_name) + } + + #[cfg(test)] + pub fn ndomains(&self) -> usize { + self.by_dns_name.len() + } + + #[cfg(test)] + pub fn nwarnings(&self) -> usize { + self.warnings.len() + } +} + +/// Describes a single external "endpoint", by which we mean an external DNS +/// name that's associated with a particular Silo +#[derive(Debug, Eq, PartialEq, Serialize)] +struct ExternalEndpoint { + /// the Silo associated with this endpoint + silo_id: Uuid, + /// the set of TLS certificate chains that could be appropriate for this + /// endpoint + tls_certs: Vec, +} + +impl ExternalEndpoint { + /// Chooses a TLS certificate (chain) to use when 
handling connections to + /// this endpoint + fn best_certificate(&self) -> Result<&TlsCertificate, anyhow::Error> { + // We expect the most common case to be that there's only one + // certificate chain here. The next most common case is that there are + // two because the administrator is in the process of rotating + // certificates, usually due to upcoming expiration. In principle, it + // would be useful to allow operators to control which certificate chain + // gets used, and maybe even do something like a canary to mitigate the + // risk of a botched certificate update. Absent that, we're going to do + // our best to pick the best chain automatically. + // + // This could be a lot more sophisticated than it is. We could try to + // avoid using certificates that are clearly not valid based on the + // "not_after" and "not_before" bounds. We could check each certificate + // in the chain, not just the last one. We could use a margin of error + // when doing this to account for small variations in the wall clock + // between us and the client. We could try to avoid using a certificate + // that doesn't appear to be compatible with the SNI value (DNS domain) + // that this request came in on. + // + // IMPORTANT: If we ever decide to do those things, they should only be + // used to decide which of several certificates is preferred. We should + // always pick a certificate if we possibly can, even if it seems to be + // invalid. A client can always choose not to trust it. But in the + // unfortunate case where there are no good certificates, a customer's + // only option may be to instruct their client to trust an invalid + // certificate _so that they can log in and fix the certificate + // problem_. If we provide no certificate at all here, a customer may + // have no way to fix the problem. + // + // Anyway, we don't yet do anything of these things. For now, pick the + // certificate chain whose leaf certificate has the latest expiration + // time. 
+ + // This would be cleaner if Asn1Time impl'd Ord or even just a way to + // convert it to a Unix timestamp or any other comparable timestamp. + let mut latest_expiration: Option<&TlsCertificate> = None; + for t in &self.tls_certs { + // We'll choose this certificate (so far) if we find that it's + // anything other than "earlier" than the best we've seen so far. + // That includes the case where we haven't seen any so far, where + // this one is greater than or equal to the best so far, as well as + // the case where they're incomparable for whatever reason. (This + // ensures that we always pick at least one.) + if latest_expiration.is_none() + || !matches!( + t.parsed.not_after().partial_cmp( + latest_expiration.unwrap().parsed.not_after() + ), + Some(std::cmp::Ordering::Less) + ) + { + latest_expiration = Some(t); + } + } + + latest_expiration.ok_or_else(|| { + anyhow!("silo {} has no usable certificates", self.silo_id) + }) + } +} + +/// Describes a problem encountered while assembling an [`ExternalEndpoints`] +/// object +#[derive(Clone, Debug, Error, SerializeDisplay)] +enum ExternalEndpointError { + #[error( + "ignoring silo {dup_silo_id} ({dup_silo_name:?}): has the same DNS \ + name ({dns_name:?}) as previously-found silo {first_silo_id} \ + ({first_silo_name:?})" + )] + DupDnsName { + dup_silo_id: Uuid, + dup_silo_name: String, + first_silo_id: Uuid, + first_silo_name: String, + dns_name: String, + }, + + #[error("ignoring certificate for silo {silo_id}: {reason:#}")] + BadCert { + silo_id: Uuid, + #[source] + reason: Arc, + }, + + #[error( + "silo {silo_id} with DNS name {dns_name:?} has no usable certificates" + )] + NoSiloCerts { silo_id: Uuid, dns_name: String }, + + #[error("no external endpoints were found")] + NoEndpoints, +} + +impl Eq for ExternalEndpointError {} +impl PartialEq for ExternalEndpointError { + fn eq(&self, other: &Self) -> bool { + self.to_string() == other.to_string() + } +} + +/// A parsed, validated TLS certificate ready to 
use with an external TLS server +#[derive(Serialize)] +#[serde(transparent)] +struct TlsCertificate { + /// This is what we need to provide to the TLS stack when we decide to use + /// this certificate for an incoming TLS connection + // NOTE: It's important that we do not serialize the private key! + #[serde(skip)] + certified_key: Arc, + + /// Parsed representation of the whole certificate chain + /// + /// This is used to extract metadata like the expiration time. + // NOTE: It's important that we do not serialize the private key! + #[serde(skip)] + parsed: X509, + + /// certificate digest (historically sometimes called a "fingerprint") + // This is the only field that appears in the serialized output or debug + // output. + digest: String, +} + +impl fmt::Debug for TlsCertificate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // It's important that only the digest appear in the debug output. We + // definitely don't want to leak the private key this way. Really, + // we don't want even the public parts adding noise to debug output. + f.debug_struct("TlsCertificate").field("digest", &self.digest).finish() + } +} + +impl Eq for TlsCertificate {} +impl PartialEq for TlsCertificate { + fn eq(&self, other: &Self) -> bool { + self.digest == other.digest + } +} + +impl TryFrom for TlsCertificate { + type Error = anyhow::Error; + + fn try_from(db_cert: Certificate) -> Result { + // Parse and validate what we've got. + let certs_pem = openssl::x509::X509::stack_from_pem(&db_cert.cert) + .context("parsing PEM stack")?; + let private_key = PKey::private_key_from_pem(&db_cert.key) + .context("parsing private key PEM")?; + + // Assemble a rustls CertifiedKey with both the certificate and the key. 
+ let certified_key = { + let private_key_der = private_key + .private_key_to_der() + .context("serializing private key to DER")?; + let rustls_private_key = rustls::PrivateKey(private_key_der); + let rustls_signing_key = + rustls::sign::any_supported_type(&rustls_private_key) + .context("parsing DER private key")?; + let rustls_certs = certs_pem + .iter() + .map(|x509| { + x509.to_der() + .context("serializing cert to DER") + .map(rustls::Certificate) + }) + .collect::>()?; + Arc::new(CertifiedKey::new(rustls_certs, rustls_signing_key)) + }; + + let end_cert = certs_pem + .into_iter() + .next() + .ok_or_else(|| anyhow!("no certificates in PEM stack"))?; + anyhow::ensure!( + end_cert + .public_key() + .context("certificate publickey")? + .public_eq(&private_key), + "certificate public key does not match stored private key" + ); + + // Compute a digest (fingerprint) that we can use for debugging. + let digest = { + let digest_bytes = end_cert + .digest(openssl::hash::MessageDigest::sha256()) + .context("computing fingerprint")?; + hex::encode(&digest_bytes) + }; + + Ok(TlsCertificate { certified_key, digest, parsed: end_cert }) + } +} + +/// Read the lists of all Silos, external DNS zones, and external TLS +/// certificates from the database and assemble an `ExternalEndpoints` structure +/// that describes what DNS names exist, which Silos they correspond to, and +/// what TLS certificates can be used for them +// This structure is used to determine what TLS certificates are used for +// incoming connections to the external console/API endpoints. As such, it's +// critical that we produce a usable result if at all possible, even if it's +// incomplete. Otherwise, we won't be able to serve _any_ incoming connections +// to _any_ of our external endpoints! If data from the database is invalid or +// inconsistent, that data is discarded and a warning is produced, but we'll +// still return a usable object. 
+pub async fn read_all_endpoints( + datastore: &DataStore, + opctx: &OpContext, +) -> Result { + // We will not look for more than this number of external DNS zones, Silos, + // or certificates. We do not expect very many of any of these objects. + const MAX: u32 = 200; + let pagparams_id = DataPageParams { + marker: None, + limit: NonZeroU32::new(MAX).unwrap(), + direction: dropshot::PaginationOrder::Ascending, + }; + let pagbyid = PaginatedBy::Id(pagparams_id); + let pagparams_name = DataPageParams { + marker: None, + limit: NonZeroU32::new(MAX).unwrap(), + direction: dropshot::PaginationOrder::Ascending, + }; + + let silos = + datastore.silos_list(opctx, &pagbyid, Discoverability::All).await?; + let external_dns_zones = datastore + .dns_zones_list(opctx, DnsGroup::External, &pagparams_name) + .await?; + bail_unless!( + !external_dns_zones.is_empty(), + "expected at least one external DNS zone" + ); + let certs = datastore + .certificate_list_for(opctx, Some(ServiceKind::Nexus), &pagbyid, false) + .await?; + + // If we found too many of any of these things, complain as loudly as we + // can. Our results will be wrong. But we still don't want to fail if we + // can avoid it because we want to be able to serve as many endpoints as we + // can. + // TODO-reliability we should prevent people from creating more than this + // maximum number of Silos and certificates. + let max = usize::try_from(MAX).unwrap(); + if silos.len() >= max { + error!( + &opctx.log, + "reading endpoints: expected at most {} silos, but found at \ + least {}. TLS may not work on some Silos' external endpoints.", + MAX, + silos.len(), + ); + } + if external_dns_zones.len() >= max { + error!( + &opctx.log, + "reading endpoints: expected at most {} external DNS zones, but \ + found at least {}. 
TLS may not work on some Silos' external \ + endpoints.", + MAX, + external_dns_zones.len(), + ); + } + if certs.len() >= max { + error!( + &opctx.log, + "reading endpoints: expected at most {} certificates, but \ + found at least {}. TLS may not work on some Silos' external \ + endpoints.", + MAX, + certs.len(), + ); + } + + Ok(ExternalEndpoints::new(silos, certs, external_dns_zones)) +} + +/// TLS SNI certificate resolver for use with rustls/Dropshot +/// +/// This object exists to impl `rustls::server::ResolvesServerCert`. This +/// object looks at an incoming TLS session's SNI field, matches it against the +/// latest `ExternalEndpoints` configuration (available via a watch channel), +/// and then determines which certificate (if any) to provide for the new +/// session. +/// +/// See the module-level comment for more details. +pub struct NexusCertResolver { + log: slog::Logger, + config_rx: watch::Receiver>, +} + +impl NexusCertResolver { + pub fn new( + log: slog::Logger, + config_rx: watch::Receiver>, + ) -> NexusCertResolver { + NexusCertResolver { log, config_rx } + } + + fn do_resolve_endpoint( + &self, + server_name: Option<&str>, + ) -> Result, anyhow::Error> { + let Some(server_name) = server_name else { + bail!("TLS session had no server name") + }; + + let config_ref = self.config_rx.borrow(); + let config = match &*config_ref { + Some(c) => c, + None => bail!("no TLS config found"), + }; + + config + .by_dns_name + .get(server_name) + .ok_or_else(|| anyhow!("unrecognized server name: {}", server_name)) + .cloned() + } + + fn do_resolve( + &self, + server_name: Option<&str>, + ) -> Option> { + let log = + self.log.new(o!("server_name" => server_name.map(String::from))); + + trace!(&log, "resolving TLS certificate"); + let resolved = self.do_resolve_endpoint(server_name); + let result = match resolved { + Ok(ref endpoint) => match endpoint.best_certificate() { + Ok(certificate) => Ok((endpoint.silo_id, certificate)), + Err(error) => Err(error), + }, + 
Err(error) => Err(error), + }; + match result { + Ok((silo_id, certificate)) => { + debug!(log, "resolved TLS certificate"; + "silo_id" => silo_id.to_string(), + "certificate" => ?certificate + ); + Some(certificate.certified_key.clone()) + } + Err(error) => { + // TODO-security There is a (limited) DoS risk here, in that the + // client controls the request made to this endpoint and we're + // going to emit something to the log every time this happens. + // But at this stage it's pretty valuable to be able to debug + // this problem. + warn!( + log, + "failed to resolve TLS certificate"; + "error" => format!("{:#}", error), + ); + None + } + } + } +} + +impl rustls::server::ResolvesServerCert for NexusCertResolver { + fn resolve( + &self, + client_hello: rustls::server::ClientHello, + ) -> Option> { + let server_name = client_hello.server_name(); + self.do_resolve(server_name) + } +} + +#[cfg(test)] +mod test { + use super::ExternalEndpoints; + use super::TlsCertificate; + use crate::app::external_endpoints::ExternalEndpointError; + use crate::app::external_endpoints::NexusCertResolver; + use chrono::Utc; + use dropshot::test_util::LogContext; + use dropshot::ConfigLogging; + use dropshot::ConfigLoggingIfExists; + use dropshot::ConfigLoggingLevel; + use nexus_db_model::Certificate; + use nexus_db_model::DnsGroup; + use nexus_db_model::DnsZone; + use nexus_db_model::ServiceKind; + use nexus_db_model::Silo; + use nexus_types::external_api::params; + use nexus_types::external_api::shared; + use nexus_types::identity::Resource; + use omicron_common::api::external::IdentityMetadataCreateParams; + use uuid::Uuid; + + fn create_silo(silo_id: Option, name: &str) -> Silo { + let params = params::SiloCreate { + identity: IdentityMetadataCreateParams { + name: name.parse().unwrap(), + description: String::new(), + }, + discoverable: false, + identity_mode: shared::SiloIdentityMode::LocalOnly, + admin_group_name: None, + tls_certificates: vec![], + }; + + if let Some(silo_id) 
= silo_id { + Silo::new_with_id(silo_id, params) + } else { + Silo::new(params) + } + } + + fn create_certificate( + domain: &str, + expired: bool, + ) -> params::CertificateCreate { + let mut cert_params = + rcgen::CertificateParams::new(vec![domain.to_string()]); + if expired { + cert_params.not_after = std::time::UNIX_EPOCH.into(); + } + let cert = rcgen::Certificate::from_params(cert_params).unwrap(); + let cert_pem = + cert.serialize_pem().expect("serializing certificate as PEM"); + let key_pem = cert.serialize_private_key_pem(); + let namestr = format!("cert-for-{}", domain.replace('.', "-")); + params::CertificateCreate { + identity: IdentityMetadataCreateParams { + name: namestr.parse().unwrap(), + description: String::new(), + }, + cert: cert_pem.into_bytes(), + key: key_pem.into_bytes(), + service: shared::ServiceUsingCertificate::ExternalApi, + } + } + + fn create_dns_zone(domain: &str) -> DnsZone { + DnsZone { + id: Uuid::new_v4(), + time_created: Utc::now(), + dns_group: DnsGroup::External, + zone_name: format!("{}.test", domain), + } + } + + fn cert_matches(tls_cert: &TlsCertificate, cert: &Certificate) -> bool { + let parse_right = openssl::x509::X509::from_pem(&cert.cert).unwrap(); + tls_cert.parsed == parse_right + } + + #[test] + fn test_external_endpoints_empty() { + // Truly trivial case: no endpoints at all. + let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); + assert_eq!(ee1.ndomains(), 0); + assert_eq!(ee1.nwarnings(), 1); + assert_eq!( + ee1.warnings[0].to_string(), + "no external endpoints were found" + ); + + // There are also no endpoints if there's a Silo but no external DNS + // zones. 
+ let silo_id: Uuid = + "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); + let silo = create_silo(Some(silo_id), "dummy"); + let ee2 = ExternalEndpoints::new(vec![silo], vec![], vec![]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 1); + assert_eq!( + ee2.warnings[0].to_string(), + "no external endpoints were found" + ); + // Test PartialEq impl. + assert_eq!(ee1, ee2); + + // There are also no endpoints if there's an external DNS zone but no + // Silo. + let dns_zone1 = create_dns_zone("oxide1"); + let ee2 = ExternalEndpoints::new(vec![], vec![], vec![dns_zone1]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 1); + assert_eq!( + ee2.warnings[0].to_string(), + "no external endpoints were found" + ); + // Test PartialEq impl. + assert_eq!(ee1, ee2); + + // Finally, there are no endpoints if there's a certificate and nothing + // else. This isn't really valid. But it's useful to verify that we + // won't crash or otherwise fail if we get a certificate with an invalid + // silo_id. + let cert_create = create_certificate("dummy.sys.oxide1.test", false); + let cert = Certificate::new( + silo_id, + Uuid::new_v4(), + ServiceKind::Nexus, + cert_create, + ) + .unwrap(); + let ee2 = ExternalEndpoints::new(vec![], vec![cert], vec![]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 1); + assert_eq!( + ee2.warnings[0].to_string(), + "no external endpoints were found" + ); + // Test PartialEq impl. + assert_eq!(ee1, ee2); + } + + #[test] + fn test_external_endpoints_basic() { + // Empty case for comparison. 
+ let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); + + // Sample data + let silo_id: Uuid = + "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); + let silo = create_silo(Some(silo_id), "dummy"); + let dns_zone1 = create_dns_zone("oxide1"); + let cert_create = create_certificate("dummy.sys.oxide1.test", false); + let cert = Certificate::new( + silo_id, + Uuid::new_v4(), + ServiceKind::Nexus, + cert_create, + ) + .unwrap(); + + // Simple case: one silo, one DNS zone. We should see an endpoint for + // the Silo. Since it has no certificates, we'll get a warning. + let ee3 = ExternalEndpoints::new( + vec![silo.clone()], + vec![], + vec![dns_zone1.clone()], + ); + // Test PartialEq impl. + assert_ne!(ee1, ee3); + assert_eq!(ee3.ndomains(), 1); + assert!(ee3.has_domain("dummy.sys.oxide1.test")); + assert_eq!(ee3.nwarnings(), 1); + assert_eq!( + ee3.warnings[0].to_string(), + "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 with DNS name \ + \"dummy.sys.oxide1.test\" has no usable certificates" + ); + // This also exercises best_certificate() with zero certificates. + assert_eq!( + ee3.by_dns_name["dummy.sys.oxide1.test"] + .best_certificate() + .unwrap_err() + .to_string(), + "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 has no usable \ + certificates" + ); + + // Now try with a certificate. + let ee4 = ExternalEndpoints::new( + vec![silo.clone()], + vec![cert.clone()], + vec![dns_zone1.clone()], + ); + assert_ne!(ee3, ee4); + assert_eq!(ee4.ndomains(), 1); + assert!(ee4.has_domain("dummy.sys.oxide1.test")); + assert_eq!(ee4.nwarnings(), 0); + let endpoint = &ee4.by_dns_name["dummy.sys.oxide1.test"]; + assert_eq!(endpoint.silo_id, silo_id); + assert_eq!(endpoint.tls_certs.len(), 1); + assert!(cert_matches(&endpoint.tls_certs[0], &cert)); + // This also exercises best_certificate() with one certificate. + assert_eq!( + *endpoint.best_certificate().unwrap(), + endpoint.tls_certs[0] + ); + + // Add a second external DNS zone. 
There should now be two endpoints, + // both pointing to the same Silo. + let dns_zone2 = DnsZone { + id: Uuid::new_v4(), + time_created: Utc::now(), + dns_group: DnsGroup::External, + zone_name: String::from("oxide2.test"), + }; + let ee5 = ExternalEndpoints::new( + vec![silo.clone()], + vec![cert.clone()], + vec![dns_zone1.clone(), dns_zone2], + ); + assert_ne!(ee4, ee5); + assert_eq!(ee5.ndomains(), 2); + assert!(ee5.has_domain("dummy.sys.oxide1.test")); + assert!(ee5.has_domain("dummy.sys.oxide2.test")); + assert_eq!(ee5.nwarnings(), 0); + let endpoint1 = &ee5.by_dns_name["dummy.sys.oxide1.test"]; + let endpoint2 = &ee5.by_dns_name["dummy.sys.oxide2.test"]; + assert_eq!(endpoint1, endpoint2); + assert_eq!(endpoint1.silo_id, silo_id); + assert_eq!(endpoint1.tls_certs.len(), 1); + assert_eq!(endpoint2.silo_id, silo_id); + assert_eq!(endpoint2.tls_certs.len(), 1); + + // Add a second Silo with the same name as the first one. This should + // not be possible in practice. In the future, we expect other features + // (e.g., DNS aliases) to make it possible for silos' DNS names to + // overlap like this. 
+ let silo2_same_name_id = + "e3f36f20-56c3-c545-8320-c19d98b82c1d".parse().unwrap(); + let silo2_same_name = create_silo(Some(silo2_same_name_id), "dummy"); + let ee6 = ExternalEndpoints::new( + vec![silo, silo2_same_name], + vec![cert], + vec![dns_zone1], + ); + assert_ne!(ee5, ee6); + assert_eq!(ee6.ndomains(), 1); + assert!(ee6.has_domain("dummy.sys.oxide1.test")); + let endpoint = &ee6.by_dns_name["dummy.sys.oxide1.test"]; + assert_eq!(endpoint.silo_id, silo_id); + assert_eq!(endpoint.tls_certs.len(), 1); + assert_eq!(ee6.nwarnings(), 1); + assert_eq!( + ee6.warnings[0].to_string(), + "ignoring silo e3f36f20-56c3-c545-8320-c19d98b82c1d (\"dummy\"): \ + has the same DNS name (\"dummy.sys.oxide1.test\") as \ + previously-found silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 \ + (\"dummy\")" + ); + } + + #[test] + fn test_external_endpoints_complex() { + // Set up a somewhat complex scenario: + // + // - four Silos + // - silo1: two certificates, one of which is expired + // - silo2: two certificates, one of which is expired + // (in the other order to make sure it's not working by accident) + // - silo3: one certificate that is invalid + // - silo4: one certificate that is expired + // - two DNS zones + // + // We should wind up with eight endpoints and one warning. 
+ let silo1 = create_silo(None, "silo1"); + let silo2 = create_silo(None, "silo2"); + let silo3 = create_silo(None, "silo3"); + let silo4 = create_silo(None, "silo4"); + let silo1_cert1_params = + create_certificate("silo1.sys.oxide1.test", false); + let silo1_cert1 = Certificate::new( + silo1.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo1_cert1_params, + ) + .unwrap(); + let silo1_cert2_params = + create_certificate("silo1.sys.oxide1.test", true); + let silo1_cert2 = Certificate::new_unvalidated( + silo1.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo1_cert2_params, + ); + let silo2_cert1_params = + create_certificate("silo2.sys.oxide1.test", true); + let silo2_cert1 = Certificate::new_unvalidated( + silo2.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo2_cert1_params, + ); + let silo2_cert2_params = + create_certificate("silo2.sys.oxide1.test", false); + let silo2_cert2 = Certificate::new( + silo2.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo2_cert2_params, + ) + .unwrap(); + let silo3_cert_params = + create_certificate("silo3.sys.oxide1.test", false); + let mut silo3_cert = Certificate::new( + silo3.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo3_cert_params, + ) + .unwrap(); + // Corrupt a byte of this last certificate. (This has to be done after + // constructing it or we would fail validation.) 
+ silo3_cert.cert[0] ^= 1; + let silo4_cert_params = + create_certificate("silo4.sys.oxide1.test", true); + let silo4_cert = Certificate::new_unvalidated( + silo4.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo4_cert_params, + ); + let dns_zone1 = create_dns_zone("oxide1"); + let dns_zone2 = create_dns_zone("oxide2"); + + let ee = ExternalEndpoints::new( + vec![silo1.clone(), silo2.clone(), silo3.clone(), silo4.clone()], + vec![ + silo1_cert1.clone(), + silo1_cert2.clone(), + silo2_cert1, + silo2_cert2.clone(), + silo3_cert.clone(), + silo4_cert.clone(), + ], + vec![dns_zone1, dns_zone2], + ); + println!("{:?}", ee); + assert_eq!(ee.ndomains(), 8); + assert_eq!(ee.nwarnings(), 3); + assert_eq!( + 2, + ee.warnings + .iter() + .filter(|warning| matches!(warning, + ExternalEndpointError::NoSiloCerts { silo_id, .. } + if *silo_id == silo3.id() + )) + .count() + ); + assert_eq!( + 1, + ee.warnings + .iter() + .filter(|warning| matches!(warning, + ExternalEndpointError::BadCert { silo_id, .. } + if *silo_id == silo3.id() + )) + .count() + ); + + assert_eq!( + ee.by_dns_name["silo1.sys.oxide1.test"], + ee.by_dns_name["silo1.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo2.sys.oxide1.test"], + ee.by_dns_name["silo2.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo3.sys.oxide1.test"], + ee.by_dns_name["silo3.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo4.sys.oxide1.test"], + ee.by_dns_name["silo4.sys.oxide2.test"] + ); + + let e1 = &ee.by_dns_name["silo1.sys.oxide1.test"]; + assert_eq!(e1.silo_id, silo1.id()); + let c1 = e1.best_certificate().unwrap(); + // It must be cert1 because cert2 is expired. + assert!(cert_matches(c1, &silo1_cert1)); + + let e2 = &ee.by_dns_name["silo2.sys.oxide1.test"]; + assert_eq!(e2.silo_id, silo2.id()); + let c2 = e2.best_certificate().unwrap(); + // It must be cert2 because cert1 is expired. 
+ assert!(cert_matches(c2, &silo2_cert2)); + assert!(!cert_matches(c2, &silo1_cert1)); + assert!(!cert_matches(c2, &silo1_cert2)); + + let e3 = &ee.by_dns_name["silo3.sys.oxide1.test"]; + assert_eq!(e3.silo_id, silo3.id()); + assert!(e3.best_certificate().is_err()); + + // We should get an expired cert if it's the only option. + let e4 = &ee.by_dns_name["silo4.sys.oxide1.test"]; + assert_eq!(e4.silo_id, silo4.id()); + let c4 = e4.best_certificate().unwrap(); + assert!(cert_matches(c4, &silo4_cert)); + + // Now test the NexusCertResolver. + let logctx = LogContext::new( + "test_external_endpoints_complex", + &ConfigLogging::File { + level: ConfigLoggingLevel::Trace, + path: "UNUSED".into(), + if_exists: ConfigLoggingIfExists::Append, + }, + ); + let (watch_tx, watch_rx) = tokio::sync::watch::channel(None); + let cert_resolver = + NexusCertResolver::new(logctx.log.clone(), watch_rx); + + // At this point we haven't filled in the configuration so any attempt + // to resolve anything should fail. + assert!(cert_resolver + .do_resolve(Some("silo1.sys.oxide1.test")) + .is_none()); + + // Now pass along the configuration and try again. + watch_tx.send(Some(ee.clone())).unwrap(); + let resolved_c1 = + cert_resolver.do_resolve(Some("silo1.sys.oxide1.test")).unwrap(); + assert_eq!(resolved_c1.cert, c1.certified_key.cert); + let resolved_c2 = + cert_resolver.do_resolve(Some("silo2.sys.oxide1.test")).unwrap(); + assert_eq!(resolved_c2.cert, c2.certified_key.cert); + assert!(cert_resolver + .do_resolve(Some("silo3.sys.oxide1.test")) + .is_none()); + // We should get an expired cert if it's the only option. + let resolved_c4 = + cert_resolver.do_resolve(Some("silo4.sys.oxide1.test")).unwrap(); + assert_eq!(resolved_c4.cert, c4.certified_key.cert); + + logctx.cleanup_successful(); + } +} diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index bc2bfac8bf..dcba38aaee 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -4,6 +4,7 @@ //! 
Nexus, the service that operates much of the control plane in an Oxide fleet +use self::external_endpoints::NexusCertResolver; use crate::app::oximeter::LazyTimeseriesClient; use crate::authn; use crate::authz; @@ -31,6 +32,7 @@ pub mod background; mod certificate; mod device_auth; mod disk; +mod external_endpoints; mod external_ip; mod iam; mod image; @@ -71,48 +73,6 @@ pub(crate) const MAX_NICS_PER_INSTANCE: usize = 8; // TODO-completeness: Support multiple external IPs pub(crate) const MAX_EXTERNAL_IPS_PER_INSTANCE: usize = 1; -pub(crate) struct ExternalServers { - config: dropshot::ConfigDropshot, - https_port: u16, - http: Option, - https: Option, -} - -impl ExternalServers { - pub fn new(config: dropshot::ConfigDropshot, https_port: u16) -> Self { - Self { config, https_port, http: None, https: None } - } - - pub fn set_http(&mut self, http: DropshotServer) { - self.http = Some(http); - } - - pub fn set_https(&mut self, https: DropshotServer) { - self.https = Some(https); - } - - pub fn https_port(&self) -> u16 { - self.https_port - } - - /// Returns a context object, if one exists. - pub fn get_context(&self) -> Option> { - if let Some(context) = - self.https.as_ref().map(|server| server.app_private()) - { - // If an HTTPS server is already running, use that server context. - Some(context.clone()) - } else if let Some(context) = - self.http.as_ref().map(|server| server.app_private()) - { - // If an HTTP server is already running, use that server context. - Some(context.clone()) - } else { - None - } - } -} - /// Manages an Oxide fleet -- the heart of the control plane pub struct Nexus { /// uuid for this nexus instance. 
@@ -137,7 +97,7 @@ pub struct Nexus { recovery_task: std::sync::Mutex>, /// External dropshot servers - external_servers: Mutex, + external_server: std::sync::Mutex>, /// Internal dropshot server internal_server: std::sync::Mutex>, @@ -278,10 +238,7 @@ impl Nexus { authz: Arc::clone(&authz), sec_client: Arc::clone(&sec_client), recovery_task: std::sync::Mutex::new(None), - external_servers: Mutex::new(ExternalServers::new( - config.deployment.dropshot_external.clone(), - config.pkg.nexus_https_port, - )), + external_server: std::sync::Mutex::new(None), internal_server: std::sync::Mutex::new(None), populate_status, timeseries_client, @@ -378,26 +335,50 @@ impl Nexus { } } + pub async fn external_tls_config( + &self, + tls_enabled: bool, + ) -> Option { + if !tls_enabled { + return None; + } + + // Wait for the background task to complete at least once. We don't + // care about its value. To do this, we need our own copy of the + // channel. + let mut rx = self.background_tasks.external_endpoints.clone(); + let _ = rx.wait_for(|s| s.is_some()).await; + let mut rustls_cfg = rustls::ServerConfig::builder() + .with_safe_default_cipher_suites() + .with_safe_default_kx_groups() + .with_safe_default_protocol_versions() + .unwrap() + .with_no_client_auth() + .with_cert_resolver(Arc::new(NexusCertResolver::new( + self.log.new(o!("component" => "NexusCertResolver")), + self.background_tasks.external_endpoints.clone(), + ))); + rustls_cfg.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + Some(rustls_cfg) + } + // Called to hand off management of external servers to Nexus. pub(crate) async fn set_servers( &self, - external_servers: ExternalServers, + external_server: DropshotServer, internal_server: DropshotServer, ) { // If any servers already exist, close them. let _ = self.close_servers().await; // Insert the new servers. 
- *self.external_servers.lock().await = external_servers; + self.external_server.lock().unwrap().replace(external_server); self.internal_server.lock().unwrap().replace(internal_server); } pub async fn close_servers(&self) -> Result<(), String> { - let mut external_servers = self.external_servers.lock().await; - if let Some(server) = external_servers.http.take() { - server.close().await?; - } - if let Some(server) = external_servers.https.take() { + let external_server = self.external_server.lock().unwrap().take(); + if let Some(server) = external_server { server.close().await?; } let internal_server = self.internal_server.lock().unwrap().take(); @@ -424,18 +405,14 @@ impl Nexus { Ok(()) } - pub async fn get_http_external_server_address( + pub async fn get_external_server_address( &self, ) -> Option { - let external_servers = self.external_servers.lock().await; - external_servers.http.as_ref().map(|server| server.local_addr()) - } - - pub async fn get_https_external_server_address( - &self, - ) -> Option { - let external_servers = self.external_servers.lock().await; - external_servers.https.as_ref().map(|server| server.local_addr()) + self.external_server + .lock() + .unwrap() + .as_ref() + .map(|server| server.local_addr()) } pub async fn get_internal_server_address( diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index d65f3728bb..9c8fac3428 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -4,6 +4,7 @@ //! 
Rack management +use super::silo::silo_dns_name; use crate::authz; use crate::db; use crate::db::lookup::LookupPath; @@ -184,7 +185,7 @@ impl super::Nexus { format!("create silo: {:?}", silo_name), self.id.to_string(), ); - dns_update.add_name(Self::silo_dns_name(silo_name), dns_records)?; + dns_update.add_name(silo_dns_name(silo_name), dns_records)?; let recovery_silo = SiloCreate { identity: IdentityMetadataCreateParams { @@ -218,14 +219,15 @@ impl super::Nexus { ) .await?; - // We've potentially updated both the list of DNS servers and the DNS - // configuration. Activate both background tasks, for both internal and - // external DNS. + // We've potentially updated the list of DNS servers and the DNS + // configuration for both internal and external DNS, plus the Silo + // certificates. Activate the relevant background tasks. for task in &[ &self.background_tasks.task_internal_dns_config, &self.background_tasks.task_internal_dns_servers, &self.background_tasks.task_external_dns_config, &self.background_tasks.task_external_dns_servers, + &self.background_tasks.task_external_endpoints, ] { self.background_tasks.activate(task); } diff --git a/nexus/src/app/silo.rs b/nexus/src/app/silo.rs index 6efb8e6b66..cc8e5eb19a 100644 --- a/nexus/src/app/silo.rs +++ b/nexus/src/app/silo.rs @@ -16,6 +16,7 @@ use crate::{authn, authz}; use anyhow::Context; use nexus_db_model::{DnsGroup, UserProvisionType}; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::Discoverability; use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; use nexus_types::internal_api::params::DnsRecord; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -64,19 +65,6 @@ impl super::Nexus { } } - /// Returns the (relative) DNS name for this Silo's API and console endpoints - /// _within_ the control plane DNS zone (i.e., without that zone's suffix) - /// - /// This specific naming scheme is determined under RFD 357. 
- pub(crate) fn silo_dns_name( - name: &omicron_common::api::external::Name, - ) -> String { - // RFD 4 constrains resource names (including Silo names) to DNS-safe - // strings, which is why it's safe to directly put the name of the - // resource into the DNS name rather than doing any kind of escaping. - format!("{}.sys", name) - } - pub async fn silo_create( &self, opctx: &OpContext, @@ -107,13 +95,15 @@ impl super::Nexus { format!("create silo: {:?}", silo_name), self.id.to_string(), ); - dns_update.add_name(Self::silo_dns_name(silo_name), dns_records)?; + dns_update.add_name(silo_dns_name(silo_name), dns_records)?; let silo = datastore .silo_create(&opctx, &nexus_opctx, new_silo_params, dns_update) .await?; self.background_tasks .activate(&self.background_tasks.task_external_dns_config); + self.background_tasks + .activate(&self.background_tasks.task_external_endpoints); Ok(silo) } @@ -122,7 +112,9 @@ impl super::Nexus { opctx: &OpContext, pagparams: &PaginatedBy<'_>, ) -> ListResultVec { - self.db_datastore.silos_list(opctx, pagparams).await + self.db_datastore + .silos_list(opctx, pagparams, Discoverability::DiscoverableOnly) + .await } pub async fn silo_delete( @@ -139,12 +131,14 @@ impl super::Nexus { format!("delete silo: {:?}", db_silo.name()), self.id.to_string(), ); - dns_update.remove_name(Self::silo_dns_name(&db_silo.name()))?; + dns_update.remove_name(silo_dns_name(&db_silo.name()))?; datastore .silo_delete(opctx, &authz_silo, &db_silo, dns_opctx, dns_update) .await?; self.background_tasks .activate(&self.background_tasks.task_external_dns_config); + self.background_tasks + .activate(&self.background_tasks.task_external_endpoints); Ok(()) } @@ -880,3 +874,16 @@ impl super::Nexus { LookupPath::new(opctx, &self.db_datastore).silo_group_id(*group_id) } } + +/// Returns the (relative) DNS name for this Silo's API and console endpoints +/// _within_ the external DNS zone (i.e., without that zone's suffix) +/// +/// This specific naming scheme is 
determined under RFD 357. +pub(crate) fn silo_dns_name( + name: &omicron_common::api::external::Name, +) -> String { + // RFD 4 constrains resource names (including Silo names) to DNS-safe + // strings, which is why it's safe to directly put the name of the + // resource into the DNS name rather than doing any kind of escaping. + format!("{}.sys", name) +} diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index a1ffb61f19..2aa2b9a392 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -128,37 +128,28 @@ impl Server { apictx.nexus.await_rack_initialization(&opctx).await; // Launch the external server. + let tls_config = apictx + .nexus + .external_tls_config(config.deployment.dropshot_external.tls) + .await; let http_server_external = { - let server_starter_external = dropshot::HttpServerStarter::new( - &config.deployment.dropshot_external, - external_api(), - Arc::clone(&apictx), - &log.new(o!("component" => "dropshot_external")), - ) - .map_err(|error| { - format!("initializing external server: {}", error) - })?; + let server_starter_external = + dropshot::HttpServerStarter::new_with_tls( + &config.deployment.dropshot_external.dropshot, + external_api(), + Arc::clone(&apictx), + &log.new(o!("component" => "dropshot_external")), + tls_config.map(dropshot::ConfigTls::Dynamic), + ) + .map_err(|error| { + format!("initializing external server: {}", error) + })?; server_starter_external.start() }; - - // Transfer control of the external server to Nexus - let mut http_servers_external = crate::app::ExternalServers::new( - config.deployment.dropshot_external.clone(), - config.pkg.nexus_https_port, - ); - http_servers_external.set_http(http_server_external); apictx .nexus - .set_servers(http_servers_external, http_server_internal) + .set_servers(http_server_external, http_server_internal) .await; - - // If Nexus has TLS certificates, launch the HTTPS server. 
- apictx - .nexus - .refresh_tls_config(&opctx) - .await - .map_err(|e| e.to_string())?; - let server = Server { apictx: apictx.clone() }; Ok(server) } @@ -209,6 +200,7 @@ impl nexus_test_interface::NexusServer for Server { services: Vec, external_dns_zone_name: &str, recovery_silo: nexus_types::internal_api::params::RecoverySiloConfig, + certs: Vec, ) -> Self { // Perform the "handoff from RSS". // @@ -249,7 +241,7 @@ impl nexus_test_interface::NexusServer for Server { services, datasets: vec![], internal_services_ip_pool_ranges, - certs: vec![], + certs, internal_dns_zone_config: DnsConfigBuilder::new().build(), external_dns_zone_name: external_dns_zone_name.to_owned(), recovery_silo, @@ -264,12 +256,8 @@ impl nexus_test_interface::NexusServer for Server { Server::start(internal_server).await.unwrap() } - async fn get_http_server_external_address(&self) -> Option { - self.apictx.nexus.get_http_external_server_address().await - } - - async fn get_https_server_external_address(&self) -> Option { - self.apictx.nexus.get_https_external_server_address().await + async fn get_http_server_external_address(&self) -> SocketAddr { + self.apictx.nexus.get_external_server_address().await.unwrap() } async fn get_http_server_internal_address(&self) -> SocketAddr { diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 9019f1d9ce..599c8baba6 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -52,10 +52,10 @@ pub trait NexusServer { services: Vec, external_dns_zone_name: &str, recovery_silo: nexus_types::internal_api::params::RecoverySiloConfig, + tls_certificates: Vec, ) -> Self; - async fn get_http_server_external_address(&self) -> Option; - async fn get_https_server_external_address(&self) -> Option; + async fn get_http_server_external_address(&self) -> SocketAddr; async fn get_http_server_internal_address(&self) -> SocketAddr; async fn set_resolver(&self, resolver: internal_dns::resolver::Resolver); diff --git 
a/nexus/test-utils/src/http_testing.rs b/nexus/test-utils/src/http_testing.rs index cd33cb38ff..bf5370a925 100644 --- a/nexus/test-utils/src/http_testing.rs +++ b/nexus/test-utils/src/http_testing.rs @@ -481,6 +481,38 @@ pub enum AuthnMode { Session(String), } +impl AuthnMode { + pub fn authn_header( + &self, + ) -> Result< + (http::header::HeaderName, http::header::HeaderValue), + anyhow::Error, + > { + use nexus_db_queries::authn; + match self { + AuthnMode::UnprivilegedUser => { + let header_value = spoof::make_header_value( + authn::USER_TEST_UNPRIVILEGED.id(), + ); + Ok((http::header::AUTHORIZATION, header_value.0.encode())) + } + AuthnMode::PrivilegedUser => { + let header_value = + spoof::make_header_value(authn::USER_TEST_PRIVILEGED.id()); + Ok((http::header::AUTHORIZATION, header_value.0.encode())) + } + AuthnMode::SiloUser(silo_user_id) => { + let header_value = spoof::make_header_value(*silo_user_id); + Ok((http::header::AUTHORIZATION, header_value.0.encode())) + } + AuthnMode::Session(session_token) => { + let header_value = format!("session={}", session_token); + parse_header_pair(http::header::COOKIE, header_value) + } + } + } +} + /// Helper for constructing requests to Nexus's external API /// /// This is a thin wrapper around [`RequestBuilder`] that exists to allow @@ -506,41 +538,15 @@ impl<'a> NexusRequest<'a> { /// Causes the request to authenticate to Nexus as a user specified by /// `mode` pub fn authn_as(mut self, mode: AuthnMode) -> Self { - use nexus_db_queries::authn; - - match mode { - AuthnMode::UnprivilegedUser => { - let header_value = spoof::make_header_value( - authn::USER_TEST_UNPRIVILEGED.id(), - ); - self.request_builder = self.request_builder.header( - &http::header::AUTHORIZATION, - header_value.0.encode(), - ); + match mode.authn_header() { + Ok((header_name, header_value)) => { + self.request_builder = + self.request_builder.header(header_name, header_value); } - AuthnMode::PrivilegedUser => { - let header_value = - 
spoof::make_header_value(authn::USER_TEST_PRIVILEGED.id()); - self.request_builder = self.request_builder.header( - &http::header::AUTHORIZATION, - header_value.0.encode(), - ); - } - AuthnMode::SiloUser(silo_user_id) => { - let header_value = spoof::make_header_value(silo_user_id); - self.request_builder = self.request_builder.header( - &http::header::AUTHORIZATION, - header_value.0.encode(), - ); - } - AuthnMode::Session(session_token) => { - self.request_builder = self.request_builder.header( - &http::header::COOKIE, - format!("session={}", session_token), - ); + Err(error) => { + self.request_builder.error = Some(error); } } - self } diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 20b283517f..470e864f8c 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -13,6 +13,7 @@ use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use nexus_test_interface::NexusServer; use nexus_types::external_api::params::UserId; +use nexus_types::internal_api::params::Certificate; use nexus_types::internal_api::params::RecoverySiloConfig; use nexus_types::internal_api::params::ServiceKind; use nexus_types::internal_api::params::ServicePutRequest; @@ -93,36 +94,6 @@ impl ControlPlaneTestContext { self.dendrite.cleanup().await.unwrap(); self.logctx.cleanup_successful(); } - - pub async fn external_http_client(&self) -> ClientTestContext { - self.server - .get_http_server_external_address() - .await - .map(|addr| { - ClientTestContext::new( - addr, - self.logctx.log.new( - o!("component" => "external http client test context"), - ), - ) - }) - .unwrap() - } - - pub async fn external_https_client(&self) -> ClientTestContext { - self.server - .get_https_server_external_address() - .await - .map(|addr| { - ClientTestContext::new( - addr, - self.logctx.log.new( - o!("component" => "external https client test context"), - ), - ) - }) - .unwrap() - } } pub fn load_test_config() -> omicron_common::nexus_config::Config { @@ -153,14 
+124,20 @@ pub async fn test_setup( test_name: &str, ) -> ControlPlaneTestContext { let mut config = load_test_config(); - test_setup_with_config::(test_name, &mut config, sim::SimMode::Explicit) - .await + test_setup_with_config::( + test_name, + &mut config, + sim::SimMode::Explicit, + None, + ) + .await } pub async fn test_setup_with_config( test_name: &str, config: &mut omicron_common::nexus_config::Config, sim_mode: sim::SimMode, + initial_cert: Option, ) -> ControlPlaneTestContext { let start_time = chrono::Utc::now(); let logctx = LogContext::new(test_name, &config.pkg.log); @@ -281,6 +258,7 @@ pub async fn test_setup_with_config( external_address: config .deployment .dropshot_external + .dropshot .bind_address .ip(), }, @@ -303,17 +281,19 @@ pub async fn test_setup_with_config( user_password_hash, }; + let tls_certificates = initial_cert.into_iter().collect(); + let server = N::start( nexus_internal, &config, vec![dns_service_internal, dns_service_external, nexus_service], &external_dns_zone_name, recovery_silo, + tls_certificates, ) .await; - let external_server_addr = - server.get_http_server_external_address().await.unwrap(); + let external_server_addr = server.get_http_server_external_address().await; let internal_server_addr = server.get_http_server_internal_address().await; let testctx_external = ClientTestContext::new( @@ -390,7 +370,6 @@ pub async fn start_sled_agent( dropshot: ConfigDropshot { bind_address: SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0), request_body_max_bytes: 1024 * 1024, - ..Default::default() }, // TODO-cleanup this is unused log: ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, @@ -574,7 +553,6 @@ pub async fn start_dns_server( &dropshot::ConfigDropshot { bind_address: "[::1]:0".parse().unwrap(), request_body_max_bytes: 8 * 1024, - ..Default::default() }, ) .await diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 49818f945c..cd17bdc175 100644 --- a/nexus/tests/config.test.toml 
+++ b/nexus/tests/config.test.toml @@ -2,9 +2,6 @@ # Oxide API: configuration file for test suite # -# Avoid conflicting with other ports during HTTPS tests -nexus_https_port = 0 - [console] # Directory for static assets. Absolute path or relative to CWD. static_dir = "tests/static" @@ -78,3 +75,8 @@ dns_external.period_secs_config = 60 dns_external.period_secs_servers = 60 dns_external.period_secs_propagation = 60 dns_external.max_concurrent_server_updates = 5 +# How frequently we check the list of stored TLS certificates. This is +# approximately an upper bound on how soon after updating the list of +# certificates it will take _other_ Nexus instances to notice and stop serving +# them (on a sunny day). +external_endpoints.period_secs = 60 diff --git a/nexus/tests/integration_tests/authn_http.rs b/nexus/tests/integration_tests/authn_http.rs index af2cdb300c..9c1707f64d 100644 --- a/nexus/tests/integration_tests/authn_http.rs +++ b/nexus/tests/integration_tests/authn_http.rs @@ -300,7 +300,7 @@ async fn start_whoami_server( TestContext::new( whoami_api, server_state, - &config.deployment.dropshot_external, + &config.deployment.dropshot_external.dropshot, Some(logctx), log, ) diff --git a/nexus/tests/integration_tests/certificates.rs b/nexus/tests/integration_tests/certificates.rs index 3ec16aa513..102fb54810 100644 --- a/nexus/tests/integration_tests/certificates.rs +++ b/nexus/tests/integration_tests/certificates.rs @@ -6,24 +6,30 @@ use dropshot::test_util::ClientTestContext; use dropshot::HttpErrorResponseBody; +use futures::TryStreamExt; use http::method::Method; use http::StatusCode; +use internal_dns::names::DNS_ZONE_EXTERNAL_TESTING; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; +use nexus_test_utils::load_test_config; use nexus_test_utils::resource_helpers::create_certificate; -use nexus_test_utils::resource_helpers::create_local_user; use nexus_test_utils::resource_helpers::delete_certificate; -use 
nexus_test_utils::resource_helpers::grant_iam; -use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::views::Certificate; +use nexus_types::internal_api::params as internal_params; use omicron_common::api::external::IdentityMetadataCreateParams; -use omicron_common::api::external::Name; -use omicron_nexus::authz::SiloRole; use omicron_nexus::external_api::params; use omicron_nexus::external_api::shared; -use omicron_nexus::external_api::views::Silo; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; +use oxide_client::ClientSessionExt; +use oxide_client::ClientSilosExt; +use oxide_client::ClientSystemExt; +use oxide_client::CustomDnsResolver; use std::io::Write; +use std::sync::Arc; +use std::time::Duration; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -97,48 +103,6 @@ impl CertificateChain { pub fn cert_chain_as_pem(&self) -> Vec { tls_cert_to_pem(&self.cert_chain()) } - - // Issues a GET request using the certificate chain. 
- async fn do_request( - &self, - client: &dropshot::test_util::ClientTestContext, - scheme: http::uri::Scheme, - ) -> Result<(), hyper::Error> { - let address = client.bind_address; - let port = address.port(); - let uri: hyper::Uri = - format!("{scheme}://localhost:{port}/").parse().unwrap(); - let request = hyper::Request::builder() - .method(http::method::Method::GET) - .uri(&uri) - .body(hyper::Body::empty()) - .unwrap(); - - match scheme.as_str() { - "http" => { - let http_client = hyper::Client::builder().build_http(); - http_client.request(request).await.map(|_| ()) - } - "https" => { - let mut root_store = rustls::RootCertStore { roots: vec![] }; - root_store.add(&self.root_cert).expect("adding root cert"); - let tls_config = rustls::ClientConfig::builder() - .with_safe_defaults() - .with_root_certificates(root_store) - .with_no_client_auth(); - let https_connector = - hyper_rustls::HttpsConnectorBuilder::new() - .with_tls_config(tls_config) - .https_only() - .enable_http1() - .build(); - let https_client = - hyper::Client::builder().build(https_connector); - https_client.request(request).await.map(|_| ()) - } - _ => panic!("Unsupported scheme"), - } - } } fn tls_cert_to_pem(certs: &Vec) -> Vec { @@ -307,65 +271,6 @@ async fn test_crud(cptestctx: &ControlPlaneTestContext) { cert_delete_expect_not_found(&client).await; } -#[nexus_test] -async fn test_refresh(cptestctx: &ControlPlaneTestContext) { - let chain = CertificateChain::new(); - let (cert, key) = - (chain.cert_chain_as_pem(), chain.end_cert_private_key_as_pem()); - - create_certificate( - &cptestctx.external_client, - CERT_NAME, - cert.clone(), - key.clone(), - ) - .await; - - let http_client = &cptestctx.external_http_client().await; - let https_client = &cptestctx.external_https_client().await; - - let mut root_certs = rustls::RootCertStore::empty(); - root_certs.add(&chain.root_cert).expect("Failed to add certificate"); - - // We can use HTTP on the HTTP interface, and HTTPS on the HTTPS - // 
interface... - chain.do_request(&http_client, http::uri::Scheme::HTTP).await.unwrap(); - chain.do_request(&https_client, http::uri::Scheme::HTTPS).await.unwrap(); - - // ... but not vice-versa. - chain.do_request(&http_client, http::uri::Scheme::HTTPS).await.unwrap_err(); - chain.do_request(&https_client, http::uri::Scheme::HTTP).await.unwrap_err(); - - // Remove the default certificate, add a new one. - // - // NOTE: We're doing this on the "HTTP client" interface only because dropshot - // makes a hard-coded assumption that the test client is not using HTTPS: - // https://docs.rs/dropshot/0.8.0/src/dropshot/test_util.rs.html#106 - let chain2 = CertificateChain::new(); - let (cert, key) = - (chain2.cert_chain_as_pem(), chain2.end_cert_private_key_as_pem()); - create_certificate( - &http_client, - "my-other-certificate", - cert.clone(), - key.clone(), - ) - .await; - delete_certificate(&http_client, CERT_NAME).await; - - // (Test config) Refresh the clients -- the port for the HTTPS interface - // probably changed. - let https_client = &cptestctx.external_https_client().await; - - // Requests through the old certificate chain fail -- it was removed. - chain - .do_request(&https_client, http::uri::Scheme::HTTPS) - .await - .unwrap_err(); - // Requests through the new certificate chain succeed. - chain2.do_request(&https_client, http::uri::Scheme::HTTPS).await.unwrap(); -} - #[nexus_test] async fn test_cannot_create_certificate_with_bad_key( cptestctx: &ControlPlaneTestContext, @@ -428,76 +333,399 @@ async fn test_cannot_create_certificate_with_expired_cert( cert_create_expect_error(&client, CERT_NAME, cert, key).await; } -#[nexus_test] -async fn test_silo_with_certificates(cptestctx: &ControlPlaneTestContext) { - let client = &cptestctx.external_client; - - // Create a Silo with TLS certificates. Make sure that the certificates - // show up in the new Silo. 
- let chain = CertificateChain::new(); - let (cert, key) = - (chain.cert_chain_as_pem(), chain.end_cert_private_key_as_pem()); - let cert_name: Name = "a-cert".parse().unwrap(); - let api_cert = params::CertificateCreate { - identity: IdentityMetadataCreateParams { - name: cert_name.clone(), - description: String::from("certifies stuff"), - }, - cert, - key, - service: shared::ServiceUsingCertificate::ExternalApi, +#[tokio::test] +async fn test_silo_certificates() { + // The goal of this test is to verify that Nexus serves the correct TLS + // certificate for each incoming connection. The correct TLS certificate + // for a connection depends on the certificates configured for the Silo + // whose endpoint the client used to reach Nexus. So in order to test this, + // we need to do a few things: + // + // 1. Stand up Nexus in a mode that will run its HTTP server with TLS. + // Easy enough: this just involves a slight tweak to the configuration. + // + // Except that this means Nexus _won't_ be listening over HTTP. That + // means we need to connect to it using a TLS-capable client. Our usual + // test client is not TLS-capable. Fortunately, the oxide-client is. + // + // We also need to generate a TLS certificate, have Nexus use that, and + // then have our client trust it. Again, because Nexus is not listening + // on HTTP, we can't use the usual APIs to add the certificate after + // Nexus is up. We need to provide the certificate when initializing the + // rack. + // + // All of this is required just to be able to have our test be able to + // talk to Nexus using HTTP/TLS. + // + // 2. Having done all that, we'll create two more certificates for two more + // Silos, create users in those Silos so that we can make requests, and + // then make requests as those users to list certificates and verify that + // the correct certificate (and only that certificate) appears in each + // Silo. + // + // 3. 
We'll do all this using separate reqwest clients for each Silo + // endpoint. Each client will trust only its own Silo's certificate. We + // will verify that these clients _can't_ make requests to the other Silo + // because it doesn't trust the certificate. + + // Create the certificates and metadata for the three Silos that we're going + // to test with. + let silo1 = SiloCert::new("test-suite-silo".parse().unwrap()); + let silo2 = SiloCert::new("silo2".parse().unwrap()); + let silo3 = SiloCert::new("silo3".parse().unwrap()); + + // Start Nexus with a TLS server instead of its usual HTTP server. + let cptestctx = { + let mut config = load_test_config(); + config.deployment.dropshot_external.tls = true; + nexus_test_utils::test_setup_with_config::( + "test_silo_certificates", + &mut config, + omicron_sled_agent::sim::SimMode::Explicit, + Some(silo1.cert.clone()), + ) + .await }; - let silo: Silo = object_create( - client, - "/v1/system/silos", - ¶ms::SiloCreate { - identity: IdentityMetadataCreateParams { - name: "silo-name".parse().unwrap(), - description: "a silo".to_string(), - }, - discoverable: false, - identity_mode: shared::SiloIdentityMode::LocalOnly, - admin_group_name: None, - tls_certificates: vec![api_cert], - }, - ) - .await; - // We should *not* see this certificate if we list certs in the usual way - // because it's in a different Silo. - let certs = certs_list(&client).await; - assert!(certs.is_empty()); + let nexus_port = cptestctx.external_client.bind_address.port(); - // Create a new user in the silo. - let new_silo_user_id = create_local_user( - client, - &silo, - &"some-silo-user".parse().unwrap(), - params::UserPassword::InvalidPassword, + // Log into silo1 (the Silo created during rack initialization) as the user + // that was created when that Silo was created. We'll use this session to + // create the other Silos and their users. 
+ let resolver = Arc::new( + CustomDnsResolver::new(*cptestctx.external_dns_server.local_address()) + .unwrap(), + ); + let session_token = oxide_client::login( + silo1.reqwest_client().dns_resolver(resolver.clone()), + &silo1.login_url(nexus_port), + cptestctx.user_name.as_ref().parse().unwrap(), + "oxide".parse().unwrap(), ) .await - .id; + .expect("failed to log into recovery silo"); - // Grant the user "admin" privileges on that Silo. - let silo_url = "/v1/system/silos/silo-name"; - grant_iam( - client, - silo_url, - SiloRole::Admin, - new_silo_user_id, - AuthnMode::PrivilegedUser, + let silo1_client = silo1.oxide_client( + silo1.reqwest_client(), + resolver.clone(), + AuthnMode::Session(session_token), + nexus_port, + ); + + // Using that user, create a second Silo. + let silo2_cert = oxide_client::types::CertificateCreate::builder() + .name(silo2.cert_name.clone()) + .description("") + .cert(silo2.cert.cert.clone()) + .key(silo2.cert.key.clone()) + .service(oxide_client::types::ServiceUsingCertificate::ExternalApi); + silo1_client + .silo_create() + .body( + oxide_client::types::SiloCreate::builder() + .name(silo2.silo_name.clone()) + .description("") + .discoverable(false) + .identity_mode(oxide_client::types::SiloIdentityMode::LocalOnly) + .tls_certificates(vec![silo2_cert.try_into().unwrap()]), + ) + .send() + .await + .expect("failed to create Silo"); + + // Create a local user in that Silo. + let silo2_user = silo1_client + .local_idp_user_create() + .silo(silo2.silo_name.clone()) + .body( + oxide_client::types::UserCreate::builder() + .external_id("testuser-silo2") + .password(oxide_client::types::UserPassword::InvalidPassword), + ) + .send() + .await + .expect("failed to create user") + .into_inner() + .id; + + // Grant that user admin privileges on that Silo. 
+ let mut silo2_policy = silo1_client + .silo_policy_view() + .silo(silo2.silo_name.clone()) + .send() + .await + .unwrap() + .into_inner(); + silo2_policy.role_assignments.push( + oxide_client::types::SiloRoleRoleAssignment::builder() + .identity_id(silo2_user) + .identity_type(oxide_client::types::IdentityType::SiloUser) + .role_name(oxide_client::types::SiloRole::Admin) + .try_into() + .unwrap(), + ); + silo1_client + .silo_policy_update() + .silo(silo2.silo_name.clone()) + .body(silo2_policy) + .send() + .await + .expect("failed to update Silo policy"); + + // Create another Silo with an admin user in it. + let silo3_cert = oxide_client::types::CertificateCreate::builder() + .name(silo3.cert_name.clone()) + .description("") + .cert(silo3.cert.cert.clone()) + .key(silo3.cert.key.clone()) + .service(oxide_client::types::ServiceUsingCertificate::ExternalApi); + silo1_client + .silo_create() + .body( + oxide_client::types::SiloCreate::builder() + .name(silo3.silo_name.clone()) + .description("") + .discoverable(false) + .identity_mode(oxide_client::types::SiloIdentityMode::LocalOnly) + .tls_certificates(vec![silo3_cert.try_into().unwrap()]), + ) + .send() + .await + .expect("failed to create Silo"); + let silo3_user = silo1_client + .local_idp_user_create() + .silo(silo3.silo_name.clone()) + .body( + oxide_client::types::UserCreate::builder() + .external_id("testuser-silo3") + .password(oxide_client::types::UserPassword::InvalidPassword), + ) + .send() + .await + .expect("failed to create user") + .into_inner() + .id; + + // Grant that user admin privileges on that Silo. 
+ let mut silo3_policy = silo1_client + .silo_policy_view() + .silo(silo3.silo_name.clone()) + .send() + .await + .unwrap() + .into_inner(); + silo3_policy.role_assignments.push( + oxide_client::types::SiloRoleRoleAssignment::builder() + .identity_id(silo3_user) + .identity_type(oxide_client::types::IdentityType::SiloUser) + .role_name(oxide_client::types::SiloRole::Admin) + .try_into() + .unwrap(), + ); + silo1_client + .silo_policy_update() + .silo(silo3.silo_name.clone()) + .body(silo3_policy) + .send() + .await + .expect("failed to update Silo policy"); + + // Verify that the user in silo 1 (the one created by the test suite) cannot + // see any of the three certificates that we created in the other Silos. + let certs = silo1_client + .certificate_list() + .stream() + .try_collect::>() + .await + .expect("failed to list certificates"); + assert_eq!(certs.len(), 1); + // This is the name created during rack initialization. + assert_eq!(certs[0].name, "default-1".parse().unwrap()); + + // Similarly, create clients for the users in the other two Silos. Note + // that loading of the right certificates is asynchronous. Wait for them to + // show up. (We only have to wait for silo3's because they will be + // propagated in sequential order.) + let silo2_client = silo2.oxide_client( + silo2.reqwest_client(), + resolver.clone(), + AuthnMode::SiloUser(silo2_user), + nexus_port, + ); + let silo3_client = silo3.oxide_client( + silo3.reqwest_client(), + resolver.clone(), + AuthnMode::SiloUser(silo3_user), + nexus_port, + ); + wait_for_condition( + || async { + match silo3_client.current_user_view().send().await { + Ok(result) => { + assert_eq!(result.into_inner().id, silo3_user); + Ok(()) + } + Err(oxide_client::Error::CommunicationError(error)) + if error.is_connect() => + { + Err(CondCheckError::NotYet) + } + Err(e) => Err(CondCheckError::Failed(e)), + } + }, + &Duration::from_millis(50), + &Duration::from_secs(30), ) - .await; - - // As that new user, list certificates. 
We should see the one we created. - let certs: dropshot::ResultsPage = - NexusRequest::object_get(client, CERTS_URL) - .authn_as(AuthnMode::SiloUser(new_silo_user_id)) - .execute() - .await - .expect("failed to list certificates") - .parsed_body() - .expect("failed to parse list of certificates"); - assert_eq!(certs.items.len(), 1); - assert_eq!(certs.items[0].identity.name, cert_name); + .await + .unwrap_or_else(|error| { + panic!( + "failed to connect to silo3's endpoint within timeout: {:#}", + error + ) + }); + + // Now make sure each client can see only the certificate for its own Silo. + // This also exercises that Nexus is _serving_ the right certificate. + // That's because each client is configured to trust only its own client's + // certificate. + let certs = silo2_client + .certificate_list() + .stream() + .try_collect::>() + .await + .expect("failed to list certificates"); + assert_eq!(certs.len(), 1); + assert_eq!(certs[0].name, silo2.cert_name); + + let certs = silo3_client + .certificate_list() + .stream() + .try_collect::>() + .await + .expect("failed to list certificates"); + assert_eq!(certs.len(), 1); + assert_eq!(certs[0].name, silo3.cert_name); + + // For good measure, to make sure we got the certificate stuff right, let's + // try to use the wrong certificate to reach each endpoint and confirm that + // we get a TLS error. 
+ let silo2_client_wrong_cert = silo2.oxide_client( + silo3.reqwest_client(), + resolver.clone(), + AuthnMode::SiloUser(silo2_user), + nexus_port, + ); + let error = + silo2_client_wrong_cert.current_user_view().send().await.expect_err( + "unexpectedly connected with wrong certificate trusted", + ); + if let oxide_client::Error::CommunicationError(error) = error { + assert!(error.is_connect()); + assert!(error.to_string().contains("invalid peer certificate")); + } else { + panic!( + "unexpected error connecting with wrong certificate: {:#}", + error + ); + } + let silo3_client_wrong_cert = silo3.oxide_client( + silo2.reqwest_client(), + resolver.clone(), + AuthnMode::SiloUser(silo2_user), + nexus_port, + ); + let error = + silo3_client_wrong_cert.current_user_view().send().await.expect_err( + "unexpectedly connected with wrong certificate trusted", + ); + if let oxide_client::Error::CommunicationError(error) = error { + assert!(error.is_connect()); + assert!(error.to_string().contains("invalid peer certificate")); + } else { + panic!( + "unexpected error connecting with wrong certificate: {:#}", + error + ); + } + + cptestctx.teardown().await; +} + +/// Helper for the certificate test +/// +/// This structure keeps track of various metadata about a Silo to ensure +/// that we operate on it consistently. +struct SiloCert { + silo_name: oxide_client::types::Name, + dns_name: String, + cert_name: oxide_client::types::Name, + cert: internal_params::Certificate, +} + +impl SiloCert { + /// Given just the silo name, construct the DNS name, a new certificate + /// chain for that DNS name, and a name for that certificate. 
+ fn new(silo_name: oxide_client::types::Name) -> SiloCert { + let dns_name = + format!("{}.sys.{}", silo_name.as_str(), DNS_ZONE_EXTERNAL_TESTING); + let chain = + CertificateChain::with_params(rcgen::CertificateParams::new(vec![ + dns_name.clone(), + ])); + let cert_name = format!("cert-{}", silo_name.as_str()).parse().unwrap(); + let cert = internal_params::Certificate { + cert: chain.cert_chain_as_pem(), + key: chain.end_cert_private_key_as_pem(), + }; + SiloCert { silo_name, dns_name, cert_name, cert } + } + + /// Returns the base URL of the HTTPS endpoint for this Silo + fn base_url(&self, port: u16) -> String { + format!("https://{}:{}", &self.dns_name, port) + } + + /// Returns the full URL to the login endpoint for this Silo + fn login_url(&self, port: u16) -> String { + format!( + "{}/login/{}/local", + &self.base_url(port), + &self.silo_name.as_str() + ) + } + + /// Returns a new `ReqwestClientBuilder` that's configured to trust this + /// client's certificate + fn reqwest_client(&self) -> reqwest::ClientBuilder { + let rustls_cert = + reqwest::tls::Certificate::from_pem(&self.cert.cert).unwrap(); + reqwest::ClientBuilder::new().add_root_certificate(rustls_cert) + } + + /// Returns an `oxide_client::Client` that's configured to talk to this + /// Silo's endpoint + /// + /// You provide the underlying `reqwest_client`. To actually use this + /// Silo's endpoint, use `self.reqwest_client()` to make sure the client + /// will trust this Silo's certificate. These methods are separated so + /// that we can test what happens when we _don't_ trust this client's + /// certificate. 
+ fn oxide_client( + &self, + reqwest_client: reqwest::ClientBuilder, + resolver: Arc, + authn_mode: AuthnMode, + port: u16, + ) -> oxide_client::Client { + let (header_name, header_value) = authn_mode.authn_header().unwrap(); + let mut headers = http::header::HeaderMap::new(); + headers.append(header_name, header_value); + let reqwest_client = reqwest_client + .default_headers(headers) + .dns_resolver(resolver) + .build() + .unwrap(); + let base_url = self.base_url(port); + oxide_client::Client::new_with_client(&base_url, reqwest_client) + } } diff --git a/nexus/tests/integration_tests/console_api.rs b/nexus/tests/integration_tests/console_api.rs index 46db361dc1..679930dbc9 100644 --- a/nexus/tests/integration_tests/console_api.rs +++ b/nexus/tests/integration_tests/console_api.rs @@ -328,6 +328,7 @@ async fn test_absolute_static_dir() { "test_absolute_static_dir", &mut config, sim::SimMode::Explicit, + None, ) .await; let testctx = &cptestctx.external_client; diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 254e3703f7..041fd6e2ff 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -71,6 +71,7 @@ async fn test_update_end_to_end() { "test_update_end_to_end", &mut config, sim::SimMode::Explicit, + None, ) .await; let client = &cptestctx.external_client; diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index c41c3513a0..a0276c5ed8 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -1762,6 +1762,10 @@ "type": "string", "format": "ip" }, + "external_tls": { + "description": "Whether Nexus's external endpoint should use TLS", + "type": "boolean" + }, "internal_ip": { "description": "The address at which the internal nexus server is reachable.", "type": "string", @@ -1784,6 +1788,7 @@ }, "required": [ "external_ip", + "external_tls", "internal_ip", "nic", "type" diff --git a/oxide-client/Cargo.toml b/oxide-client/Cargo.toml index 
692ee4a7a3..0602066e6d 100644 --- a/oxide-client/Cargo.toml +++ b/oxide-client/Cargo.toml @@ -5,14 +5,19 @@ edition = "2021" license = "MPL-2.0" [dependencies] +anyhow.workspace = true base64.workspace = true chrono.workspace = true futures.workspace = true +http.workspace = true +hyper.workspace = true progenitor.workspace = true rand.workspace = true regress.workspace = true reqwest = { workspace = true, features = [ "json", "rustls-tls", "stream" ] } serde.workspace = true serde_json.workspace = true +thiserror.workspace = true tokio = { workspace = true, features = [ "net" ] } +trust-dns-resolver.workspace = true uuid.workspace = true diff --git a/oxide-client/src/lib.rs b/oxide-client/src/lib.rs index 7a6237759c..16f81d4855 100644 --- a/oxide-client/src/lib.rs +++ b/oxide-client/src/lib.rs @@ -4,8 +4,147 @@ //! Interface for making API requests to the Oxide control plane. +use anyhow::anyhow; +use anyhow::Context; +use futures::FutureExt; +use std::net::SocketAddr; +use std::sync::Arc; +use thiserror::Error; +use trust_dns_resolver::config::{ + NameServerConfig, Protocol, ResolverConfig, ResolverOpts, +}; +use trust_dns_resolver::TokioAsyncResolver; + progenitor::generate_api!( spec = "../openapi/nexus.json", interface = Builder, tags = Separate, ); + +/// Custom reqwest DNS resolver intended for use with the Oxide client +/// +/// In development, the Oxide client is often used against a deployment with its +/// own DNS server that can resolve DNS names for Nexus. This impl lets +/// consumers use that DNS server directly with reqwest to resolve IP addresses +/// for Nexus. This is often useful when trying to connect with Nexus using +/// TLS, since you need to come in via the DNS name to do that. +/// +/// This is a thin wrapper around `TokioAsyncResolver` +pub struct CustomDnsResolver { + dns_addr: SocketAddr, + // The lifetime constraints on the `Resolve` trait make it hard to avoid an + // Arc here. 
+ resolver: Arc, +} + +impl CustomDnsResolver { + /// Make a new custom resolver that uses the DNS server at the specified + /// address + pub fn new(dns_addr: SocketAddr) -> anyhow::Result { + let mut resolver_config = ResolverConfig::new(); + resolver_config.add_name_server(NameServerConfig { + socket_addr: dns_addr, + protocol: Protocol::Udp, + tls_dns_name: None, + trust_nx_responses: false, + bind_addr: None, + }); + + let resolver = Arc::new( + TokioAsyncResolver::tokio(resolver_config, ResolverOpts::default()) + .context("failed to create resolver")?, + ); + Ok(CustomDnsResolver { dns_addr, resolver }) + } + + /// Returns the address of the DNS server that we're using to resolve names + pub fn dns_addr(&self) -> SocketAddr { + self.dns_addr + } + + /// Returns the underlying `TokioAsyncResolver + pub fn resolver(&self) -> &TokioAsyncResolver { + &self.resolver + } +} + +impl reqwest::dns::Resolve for CustomDnsResolver { + fn resolve( + &self, + name: hyper::client::connect::dns::Name, + ) -> reqwest::dns::Resolving { + let resolver = self.resolver.clone(); + async move { + let list = resolver.lookup_ip(name.as_str()).await?; + Ok(Box::new(list.into_iter().map(|s| { + // reqwest does not appear to use the port number here. + // (See the docs for `ClientBuilder::resolve()`, which isn't + // the same thing, but is related.) + SocketAddr::from((s, 0)) + })) as Box + Send>) + } + .boxed() + } +} + +#[derive(Debug, Error)] +pub enum LoginError { + #[error("logging in: {0:#}")] + RequestError(#[from] reqwest::Error), + #[error("logging in: {0:#}")] + CatchAll(#[from] anyhow::Error), +} + +/// Logs into the specified Oxide API endpoint and returns a session token. +/// +/// This is intended for test suites. 
+pub async fn login( + reqwest_builder: reqwest::ClientBuilder, + silo_login_url: &str, + username: crate::types::UserId, + password: crate::types::Password, +) -> Result { + let login_request_body = + serde_json::to_string(&crate::types::UsernamePasswordCredentials { + username, + password, + }) + .context("serializing login request body")?; + + // Do not have reqwest follow redirects. That's because our login response + // includes both a redirect and the session cookie header. If reqwest + // follows the redirect, we won't have a chance to get the cookie. + let reqwest_client = reqwest_builder + .redirect(reqwest::redirect::Policy::none()) + .build() + .context("creating reqwest client for login")?; + + let response = reqwest_client + .post(silo_login_url) + .body(login_request_body) + .send() + .await?; + let session_cookie = response + .headers() + .get(http::header::SET_COOKIE) + .ok_or_else(|| anyhow!("expected session cookie after login"))? + .to_str() + .context("expected session cookie token to be a string")?; + let (session_token, rest) = + session_cookie.split_once("; ").context("parsing session cookie")?; + let (key, value) = + session_token.split_once('=').context("parsing session token")?; + if key != "session" { + return Err( + anyhow!("unexpected key parsing session token: {:?}", key).into() + ); + } + if !rest.contains("Path=/; HttpOnly; SameSite=Lax; Max-Age=") { + return Err(anyhow!( + "unexpected cookie header format: {:?}", + session_cookie + ) + .into()); + } + Ok(value.to_string()) +} diff --git a/oximeter/producer/examples/producer.rs b/oximeter/producer/examples/producer.rs index d80b814149..4f1427fa2b 100644 --- a/oximeter/producer/examples/producer.rs +++ b/oximeter/producer/examples/producer.rs @@ -94,11 +94,8 @@ impl Producer for CpuBusyProducer { #[tokio::main] async fn main() { let address = "[::1]:0".parse().unwrap(); - let dropshot = ConfigDropshot { - bind_address: address, - request_body_max_bytes: 2048, - tls: None, - }; + let 
dropshot = + ConfigDropshot { bind_address: address, request_body_max_bytes: 2048 }; let log = LogConfig::Config(ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug, }); diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index baff3fd85a..20418eb703 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -6,10 +6,13 @@ // TODO see the TODO for nexus. +use anyhow::Context; +use camino::Utf8PathBuf; use clap::Parser; use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; +use nexus_client::types as NexusTypes; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; use omicron_sled_agent::sim::RssArgs; @@ -62,6 +65,18 @@ struct Args { /// IP address. When combined with `NEXUS_EXTERNAL_IP:PORT`, this will cause /// Nexus to publish DNS names to external DNS. rss_external_dns_internal_addr: Option, + + #[clap(long, name = "TLS_CERT_PEM_FILE", action)] + /// If this flag and TLS_KEY_PEM_FILE are specified, when the simulated sled + /// agent initializes the rack, the specified certificate and private keys + /// will be provided for the initial TLS certificates for the recovery silo. + rss_tls_cert: Option, + + #[clap(long, name = "TLS_KEY_PEM_FILE", action)] + /// If this flag and TLS_CERT_PEM_FILE are specified, when the simulated sled + /// agent initializes the rack, the specified certificate and private keys + /// will be provided for the initial TLS certificates for the recovery silo. 
+ rss_tls_key: Option, } #[tokio::main] @@ -83,7 +98,6 @@ async fn do_run() -> Result<(), CmdError> { dropshot: ConfigDropshot { bind_address: args.sled_agent_addr.into(), request_body_max_bytes: 1024 * 1024, - ..Default::default() }, log: ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Info }, storage: ConfigStorage { @@ -99,9 +113,28 @@ async fn do_run() -> Result<(), CmdError> { }, }; + let tls_certificate = match (args.rss_tls_cert, args.rss_tls_key) { + (None, None) => None, + (Some(cert_path), Some(key_path)) => { + let cert_bytes = std::fs::read(&cert_path) + .with_context(|| format!("read {:?}", &cert_path)) + .map_err(|e| CmdError::Failure(e.to_string()))?; + let key_bytes = std::fs::read(&key_path) + .with_context(|| format!("read {:?}", &key_path)) + .map_err(|e| CmdError::Failure(e.to_string()))?; + Some(NexusTypes::Certificate { cert: cert_bytes, key: key_bytes }) + } + _ => { + return Err(CmdError::Usage(String::from( + "--rss-tls-key and --rss-tls-cert must be specified together", + ))) + } + }; + let rss_args = RssArgs { nexus_external_addr: args.rss_nexus_external_addr, external_dns_internal_addr: args.rss_external_dns_internal_addr, + tls_certificate, }; run_server(&config, &rss_args).await.map_err(CmdError::Failure) diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 89b5698d96..cfffd1fe82 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -316,6 +316,8 @@ pub enum ServiceType { external_ip: IpAddr, /// The service vNIC providing external connectivity using OPTE. nic: NetworkInterface, + /// Whether Nexus's external endpoint should use TLS + external_tls: bool, }, ExternalDns { /// The address at which the external DNS server API is reachable. 
@@ -431,8 +433,13 @@ impl TryFrom for sled_agent_client::types::ServiceType { use ServiceType as St; match s { - St::Nexus { internal_ip, external_ip, nic } => { - Ok(AutoSt::Nexus { internal_ip, external_ip, nic: nic.into() }) + St::Nexus { internal_ip, external_ip, nic, external_tls } => { + Ok(AutoSt::Nexus { + internal_ip, + external_ip, + nic: nic.into(), + external_tls, + }) } St::ExternalDns { http_address, dns_address, nic } => { Ok(AutoSt::ExternalDns { diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 84b90d5b38..7323d9f2d8 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -345,6 +345,15 @@ impl Plan { internal_ip: address, external_ip, nic, + // Tell Nexus to use TLS if and only if the caller + // provided TLS certificates. This effectively + // determines the status of TLS for the lifetime of + // the rack. In production-like deployments, we'd + // always expect TLS to be enabled. It's only in + // development that it might not be. + external_tls: !config + .external_certificates + .is_empty(), }, }], }) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 735d152e89..0194497ad5 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -70,7 +70,7 @@ use omicron_common::backoff::{ BackoffError, }; use omicron_common::nexus_config::{ - self, DeploymentConfig as NexusDeploymentConfig, + self, ConfigDropshotWithTls, DeploymentConfig as NexusDeploymentConfig, }; use once_cell::sync::OnceCell; use sled_hardware::is_gimlet; @@ -1294,7 +1294,7 @@ impl ServiceManager { smfh.import_manifest()?; match &service.details { - ServiceType::Nexus { internal_ip, .. } => { + ServiceType::Nexus { internal_ip, external_tls, .. 
} => { info!(self.inner.log, "Setting up Nexus service"); let sled_info = self @@ -1313,16 +1313,21 @@ impl ServiceManager { // Nexus takes a separate config file for parameters which // cannot be known at packaging time. + let nexus_port = if *external_tls { 443 } else { 80 }; let deployment_config = NexusDeploymentConfig { id: request.zone.id, rack_id: sled_info.rack_id, - dropshot_external: dropshot::ConfigDropshot { - bind_address: SocketAddr::new(port_ip, 80), - // This has to be large enough to support: - // - bulk writes to disks - request_body_max_bytes: 8192 * 1024, - ..Default::default() + dropshot_external: ConfigDropshotWithTls { + tls: *external_tls, + dropshot: dropshot::ConfigDropshot { + bind_address: SocketAddr::new( + port_ip, nexus_port, + ), + // This has to be large enough to support: + // - bulk writes to disks + request_body_max_bytes: 8192 * 1024, + }, }, dropshot_internal: dropshot::ConfigDropshot { bind_address: SocketAddr::new( @@ -1334,7 +1339,6 @@ impl ServiceManager { // certificates provided by the customer during rack // setup. 
request_body_max_bytes: 10 * 1024 * 1024, - ..Default::default() }, subnet: Ipv6Subnet::::new( sled_info.underlay_address, diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 14414a995c..46aa287c54 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -322,11 +322,16 @@ impl Server { .unwrap(), }; + let certs = match &rss_args.tls_certificate { + Some(c) => vec![c.clone()], + None => vec![], + }; + let rack_init_request = NexusTypes::RackInitializationRequest { services, datasets, internal_services_ip_pool_ranges, - certs: vec![], + certs, internal_dns_zone_config: d2n_params(&dns_config), external_dns_zone_name: internal_dns::names::DNS_ZONE_EXTERNAL_TESTING.to_owned(), @@ -397,6 +402,9 @@ pub struct RssArgs { /// Specify the (internal) address of an external DNS server so that Nexus /// will know about it and keep it up to date pub external_dns_internal_addr: Option, + /// Specify a certificate and associated private key for the initial Silo's + /// initial TLS certificates + pub tls_certificate: Option, } /// Run an instance of the `Server` diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 69d476576b..e8a668881a 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -800,7 +800,6 @@ impl PantryServer { // This has to be large enough to support: // - bulk writes into disks request_body_max_bytes: 8192 * 1024, - ..Default::default() }, super::http_entrypoints_pantry::api(), pantry.clone(), diff --git a/smf/nexus/config-partial.toml b/smf/nexus/config-partial.toml index d2ec931904..c22dc61d55 100644 --- a/smf/nexus/config-partial.toml +++ b/smf/nexus/config-partial.toml @@ -28,3 +28,8 @@ dns_external.period_secs_config = 60 dns_external.period_secs_servers = 60 dns_external.period_secs_propagation = 60 dns_external.max_concurrent_server_updates = 5 +# How frequently we check the list of stored TLS certificates. 
This is +# approximately an upper bound on how soon after updating the list of +# certificates it will take _other_ Nexus instances to notice and stop serving +# them (on a sunny day). +external_endpoints.period_secs = 60 diff --git a/wicketd/src/lib.rs b/wicketd/src/lib.rs index d2c823851a..78d55c84cf 100644 --- a/wicketd/src/lib.rs +++ b/wicketd/src/lib.rs @@ -76,7 +76,6 @@ impl Server { // The maximum request size is set to 4 GB -- artifacts can be large and there's currently // no way to set a larger request size for some endpoints. request_body_max_bytes: 4 << 30, - ..Default::default() }; let mgs_manager = MgsManager::new(&log, args.mgs_address); From 0cf36137f4b6c9acc7af2da12f7c5685be45d0aa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 25 May 2023 21:01:22 -0700 Subject: [PATCH 07/14] Bump console from 0.15.5 to 0.15.7 (#3177) Bumps [console](https://github.com/console-rs/console) from 0.15.5 to 0.15.7.
Changelog

Sourced from console's changelog.

0.15.7

Enhancements

  • Set an appropriate lower version of libc for macos changes.
  • Improved behavior of read_single_key so it does not disturb other threads quite as much. (#165)
  • More reliably reset raw mode in terminal. (#171)

0.15.6

Enhancements

  • Switch to select() on macOS for polling on TTYs to work around a macOS bug. (#169)
  • Added blink fast and strikethrough attributes. (#159)
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=console&package-manager=cargo&previous-version=0.15.5&new-version=0.15.7)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 6 +++--- tufaceous/Cargo.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c70565566f..2346cdd5a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1053,15 +1053,15 @@ dependencies = [ [[package]] name = "console" -version = "0.15.5" +version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d79fbe8970a77e3e34151cc13d3b3e248aa0faaecb9f6091fa07ebefe5ad60" +checksum = "c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8" dependencies = [ "encode_unicode", "lazy_static", "libc", "unicode-width", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] diff --git a/tufaceous/Cargo.toml b/tufaceous/Cargo.toml index e49de3fc6f..8e8d9ae4e3 100644 --- a/tufaceous/Cargo.toml +++ b/tufaceous/Cargo.toml @@ -9,7 +9,7 @@ anyhow = { workspace = true, features = ["backtrace"] } camino.workspace = true clap = { workspace = true, features = ["derive", "env"] } chrono.workspace = true -console = { version = "0.15.5", default-features = false } +console = { version = "0.15.7", default-features = false } humantime.workspace = true omicron-common.workspace = true slog.workspace = true From bd62d069b8fe5629a31abb6f2cd3b1104b108d20 Mon Sep 17 00:00:00 2001 From: liffy <629075+lifning@users.noreply.github.com> Date: Fri, 26 May 2023 00:42:16 -0700 Subject: [PATCH 08/14] Report instance lookup errors as websocket close codes & Follow migrated propolis instance in nexus' serial console proxy (#2871) Implements the seamless following of migrated instances' serial console streams within nexus such that nexus' own clients just see a continuous stream of bytes. 
--- nexus/src/app/instance.rs | 182 +++++++++++++-------- nexus/src/external_api/http_entrypoints.rs | 38 ++++- 2 files changed, 150 insertions(+), 70 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 79f973b10e..7673538e3a 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -35,21 +35,21 @@ use omicron_common::api::external::NameOrId; use omicron_common::api::external::UpdateResult; use omicron_common::api::external::Vni; use omicron_common::api::internal::nexus; +use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode; +use propolis_client::support::tungstenite::protocol::CloseFrame; +use propolis_client::support::tungstenite::Message as WebSocketMessage; +use propolis_client::support::WebSocketStream; use sled_agent_client::types::InstanceMigrationSourceParams; use sled_agent_client::types::InstancePutMigrationIdsBody; use sled_agent_client::types::InstancePutStateBody; use sled_agent_client::types::InstanceStateRequested; use sled_agent_client::types::SourceNatConfig; use sled_agent_client::Client as SledAgentClient; +use slog::Logger; use std::net::SocketAddr; use std::str::FromStr; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode; -use tokio_tungstenite::tungstenite::protocol::CloseFrame; -use tokio_tungstenite::tungstenite::protocol::Role as WebSocketRole; -use tokio_tungstenite::tungstenite::Message as WebSocketMessage; -use tokio_tungstenite::WebSocketStream; use uuid::Uuid; const MAX_KEYS_PER_INSTANCE: u32 = 8; @@ -1286,7 +1286,7 @@ impl super::Nexus { .await .map_err(|_| { Error::internal_error( - "failed to connect to instance's propolis server", + "websocket connection to instance's serial port failed", ) })? 
.into_inner(); @@ -1298,33 +1298,61 @@ impl super::Nexus { pub(crate) async fn instance_serial_console_stream( &self, - conn: dropshot::WebsocketConnection, + mut client_stream: WebSocketStream, instance_lookup: &lookup::Instance<'_>, params: ¶ms::InstanceSerialConsoleStreamRequest, ) -> Result<(), Error> { - let client = self + let client = match self .propolis_client_for_instance( instance_lookup, authz::Action::Modify, ) - .await?; + .await + { + Ok(x) => x, + Err(e) => { + let _ = client_stream + .close(Some(CloseFrame { + code: CloseCode::Error, + reason: e.to_string().into(), + })) + .await + .is_ok(); + return Err(e); + } + }; + let mut req = client.instance_serial(); if let Some(most_recent) = params.most_recent { req = req.most_recent(most_recent); } - let propolis_upgraded = req - .send() - .await - .map_err(|_| { - Error::internal_error( - "failed to connect to instance's propolis server", + match req.send().await { + Ok(response) => { + let propolis_upgraded = response.into_inner(); + let log = self.log.clone(); + Self::proxy_instance_serial_ws( + client_stream, + propolis_upgraded, + Some(log), ) - })? - .into_inner(); - - Self::proxy_instance_serial_ws(conn.into_inner(), propolis_upgraded) - .await - .map_err(|e| Error::internal_error(&format!("{}", e))) + .await + .map_err(|e| Error::internal_error(&format!("{}", e))) + } + Err(e) => { + let message = format!( + "websocket connection to instance's serial port failed: {}", + e + ); + let _ = client_stream + .close(Some(CloseFrame { + code: CloseCode::Error, + reason: message.clone().into(), + })) + .await + .is_ok(); + Err(Error::internal_error(&message)) + } + } } async fn propolis_client_for_instance( @@ -1338,7 +1366,7 @@ impl super::Nexus { .propolis_ip .ok_or_else(|| { Error::internal_error( - "instance's propolis server ip address not found", + "instance's hypervisor IP address not found", ) })? 
.ip(); @@ -1347,44 +1375,63 @@ impl super::Nexus { } async fn proxy_instance_serial_ws( - client_upgraded: impl AsyncRead + AsyncWrite + Unpin, - propolis_upgraded: impl AsyncRead + AsyncWrite + Unpin, - ) -> Result<(), tokio_tungstenite::tungstenite::Error> { - let (mut propolis_sink, mut propolis_stream) = - WebSocketStream::from_raw_socket( + client_stream: WebSocketStream, + propolis_upgraded: impl AsyncRead + AsyncWrite + Unpin + Send + 'static, + log: Option, + ) -> Result<(), propolis_client::support::tungstenite::Error> { + let mut propolis_conn = + propolis_client::support::InstanceSerialConsoleHelper::new( propolis_upgraded, - WebSocketRole::Client, - None, - ) - .await - .split(); - let (mut nexus_sink, mut nexus_stream) = - WebSocketStream::from_raw_socket( - client_upgraded, - WebSocketRole::Server, - None, + log, ) - .await - .split(); + .await; + + let (mut nexus_sink, mut nexus_stream) = client_stream.split(); let mut buffered_output = None; let mut buffered_input = None; loop { - let (nexus_read, propolis_write) = match buffered_input.take() { - None => (nexus_stream.next().fuse(), Fuse::terminated()), + let nexus_read; + let nexus_write; + let propolis_read; + let propolis_write; + match buffered_input.take() { + None => { + nexus_read = nexus_stream.next().fuse(); + propolis_write = Fuse::terminated(); + match buffered_output.take() { + None => { + nexus_write = Fuse::terminated(); + propolis_read = propolis_conn.recv().fuse(); + } + Some(msg) => { + nexus_write = nexus_sink.send(msg).fuse(); + propolis_read = Fuse::terminated(); + } + } + } Some(msg) => { - (Fuse::terminated(), propolis_sink.send(msg).fuse()) + nexus_read = Fuse::terminated(); + propolis_write = propolis_conn.send(msg).fuse(); + match buffered_output.take() { + // can't propolis_read simultaneously due to a + // &mut propolis_conn being taken above + None => { + nexus_write = Fuse::terminated(); + propolis_read = Fuse::terminated(); + } + Some(msg) => { + nexus_write = 
nexus_sink.send(msg).fuse(); + propolis_read = Fuse::terminated(); + } + } } }; - let (nexus_write, propolis_read) = match buffered_output.take() { - None => (Fuse::terminated(), propolis_stream.next().fuse()), - Some(msg) => (nexus_sink.send(msg).fuse(), Fuse::terminated()), - }; tokio::select! { msg = nexus_read => { match msg { None => { - propolis_sink.send(WebSocketMessage::Close(Some(CloseFrame { + propolis_conn.send(WebSocketMessage::Close(Some(CloseFrame { code: CloseCode::Abnormal, reason: std::borrow::Cow::from( "nexus: websocket connection to client closed unexpectedly" @@ -1393,7 +1440,7 @@ impl super::Nexus { break; } Some(Err(e)) => { - propolis_sink.send(WebSocketMessage::Close(Some(CloseFrame { + propolis_conn.send(WebSocketMessage::Close(Some(CloseFrame { code: CloseCode::Error, reason: std::borrow::Cow::from( format!("nexus: error in websocket connection to client: {}", e) @@ -1402,7 +1449,7 @@ impl super::Nexus { return Err(e); } Some(Ok(WebSocketMessage::Close(details))) => { - propolis_sink.send(WebSocketMessage::Close(details)).await?; + propolis_conn.send(WebSocketMessage::Close(details)).await?; break; } Some(Ok(WebSocketMessage::Text(_text))) => { @@ -1424,7 +1471,7 @@ impl super::Nexus { nexus_sink.send(WebSocketMessage::Close(Some(CloseFrame { code: CloseCode::Abnormal, reason: std::borrow::Cow::from( - "nexus: websocket connection to propolis closed unexpectedly" + "nexus: websocket connection to serial port closed unexpectedly" ), }))).await?; break; @@ -1433,7 +1480,7 @@ impl super::Nexus { nexus_sink.send(WebSocketMessage::Close(Some(CloseFrame { code: CloseCode::Error, reason: std::borrow::Cow::from( - format!("nexus: error in websocket connection to propolis: {}", e) + format!("nexus: error in websocket connection to serial port: {}", e) ), }))).await?; return Err(e); @@ -1442,11 +1489,11 @@ impl super::Nexus { nexus_sink.send(WebSocketMessage::Close(details)).await?; break; } - Some(Ok(WebSocketMessage::Text(_text))) => { - // 
TODO: deserialize a json payload, specifying: - // - event: "migration" - // - address: the address of the new propolis-server - // - offset: what byte offset to start from (the last one sent from old propolis) + Some(Ok(WebSocketMessage::Text(_json))) => { + // connecting to new propolis-server is handled + // within InstanceSerialConsoleHelper already. + // we might consider sending the nexus client + // an informational event for UX polish. } Some(Ok(WebSocketMessage::Binary(data))) => { buffered_output = Some(WebSocketMessage::Binary(data)) @@ -1467,23 +1514,28 @@ impl super::Nexus { #[cfg(test)] mod tests { use super::super::Nexus; + use super::{CloseCode, CloseFrame, WebSocketMessage, WebSocketStream}; use core::time::Duration; use futures::{SinkExt, StreamExt}; - use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode; - use tokio_tungstenite::tungstenite::protocol::CloseFrame; - use tokio_tungstenite::tungstenite::protocol::Role; - use tokio_tungstenite::tungstenite::Message; - use tokio_tungstenite::WebSocketStream; + use propolis_client::support::tungstenite::protocol::Role; #[tokio::test] async fn test_serial_console_stream_proxying() { let (nexus_client_conn, nexus_server_conn) = tokio::io::duplex(1024); let (propolis_client_conn, propolis_server_conn) = tokio::io::duplex(1024); + let jh = tokio::spawn(async move { - Nexus::proxy_instance_serial_ws( + let nexus_client_stream = WebSocketStream::from_raw_socket( nexus_server_conn, + Role::Server, + None, + ) + .await; + Nexus::proxy_instance_serial_ws( + nexus_client_stream, propolis_client_conn, + None, ) .await }); @@ -1500,17 +1552,17 @@ mod tests { ) .await; - let sent = Message::Binary(vec![1, 2, 3, 42, 5]); + let sent = WebSocketMessage::Binary(vec![1, 2, 3, 42, 5]); nexus_client_ws.send(sent.clone()).await.unwrap(); let received = propolis_server_ws.next().await.unwrap().unwrap(); assert_eq!(sent, received); - let sent = Message::Binary(vec![6, 7, 8, 90]); + let sent = 
WebSocketMessage::Binary(vec![6, 7, 8, 90]); propolis_server_ws.send(sent.clone()).await.unwrap(); let received = nexus_client_ws.next().await.unwrap().unwrap(); assert_eq!(sent, received); - let sent = Message::Close(Some(CloseFrame { + let sent = WebSocketMessage::Close(Some(CloseFrame { code: CloseCode::Normal, reason: std::borrow::Cow::from("test done"), })); diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index fc0d0ee64f..930ece5ae3 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -76,6 +76,11 @@ use omicron_common::api::external::VpcFirewallRuleUpdateParams; use omicron_common::api::external::VpcFirewallRules; use omicron_common::bail_unless; use parse_display::Display; +use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode; +use propolis_client::support::tungstenite::protocol::{ + CloseFrame, Role as WebSocketRole, +}; +use propolis_client::support::WebSocketStream; use ref_cast::RefCast; use schemars::JsonSchema; use serde::Deserialize; @@ -2033,11 +2038,34 @@ async fn instance_serial_console_stream( project: query.project.clone(), instance: path.instance, }; - let instance_lookup = nexus.instance_lookup(&opctx, instance_selector)?; - nexus - .instance_serial_console_stream(conn, &instance_lookup, &query) - .await?; - Ok(()) + let mut client_stream = WebSocketStream::from_raw_socket( + conn.into_inner(), + WebSocketRole::Server, + None, + ) + .await; + match nexus.instance_lookup(&opctx, instance_selector) { + Ok(instance_lookup) => { + nexus + .instance_serial_console_stream( + client_stream, + &instance_lookup, + &query, + ) + .await?; + Ok(()) + } + Err(e) => { + let _ = client_stream + .close(Some(CloseFrame { + code: CloseCode::Error, + reason: e.to_string().into(), + })) + .await + .is_ok(); + Err(e.into()) + } + } } /// List an instance's disks From c27796faf380dd5fc89b87342bf1bdc0b994b7b8 Mon Sep 17 00:00:00 2001 
From: John Gallagher Date: Fri, 26 May 2023 10:11:51 -0400 Subject: [PATCH 09/14] [wicketd] Skip updating the SP if it already has the version we're about to update to (#3220) Running a mupdate where the version specified in the TUF repo matches the version reported by the SP now skips updating the SP: ![sp-skip](https://github.com/oxidecomputer/omicron/assets/1435635/0c33e123-da0b-43f0-ae89-227aa0766f78) The user can override this via the functionality added in #3150, in which case we do the update and mark the step as a "Warning": ![sp-force](https://github.com/oxidecomputer/omicron/assets/1435635/76083c63-6b65-413f-a395-d56152c617b4) --- openapi/wicketd.json | 16 +-- wicket-common/src/update_events.rs | 10 +- wicketd/src/http_entrypoints.rs | 1 - wicketd/src/update_tracker.rs | 194 ++++++++++++++++++++++------- 4 files changed, 155 insertions(+), 66 deletions(-) diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 5838df189f..1e86c95024 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -3020,21 +3020,7 @@ "id": { "type": "string", "enum": [ - "reset_rot" - ] - } - }, - "required": [ - "id" - ] - }, - { - "type": "object", - "properties": { - "id": { - "type": "string", - "enum": [ - "reset_sp" + "interrogate_sp" ] } }, diff --git a/wicket-common/src/update_events.rs b/wicket-common/src/update_events.rs index 073cba9126..924ff4cd06 100644 --- a/wicket-common/src/update_events.rs +++ b/wicket-common/src/update_events.rs @@ -41,8 +41,7 @@ pub enum UpdateStepId { TestStep, SetHostPowerState { state: PowerState }, InterrogateRot, - ResetRot, - ResetSp, + InterrogateSp, SpComponentUpdate, SettingInstallinatorImageId, ClearingInstallinatorImageId, @@ -73,6 +72,8 @@ pub enum SpComponentUpdateStepId { Sending, Preparing, Writing, + SettingActiveBootSlot, + Resetting, } impl StepSpec for SpComponentUpdateSpec { @@ -114,6 +115,11 @@ pub enum UpdateTerminalError { #[source] error: anyhow::Error, }, + #[error("getting SP caboose failed")] + 
GetSpCabooseFailed { + #[source] + error: gateway_client::Error, + }, #[error("SP reset failed")] SpResetFailed { #[source] diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs index 2088ba15df..ec83170755 100644 --- a/wicketd/src/http_entrypoints.rs +++ b/wicketd/src/http_entrypoints.rs @@ -160,7 +160,6 @@ pub(crate) struct StartUpdateOptions { /// If true, skip the check on the current SP version and always update it /// regardless of whether the update appears to be neeeded. - #[allow(dead_code)] // TODO actually use this pub(crate) skip_sp_version_check: bool, } diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index adf70c12df..f82987b2b6 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -36,6 +36,7 @@ use installinator_common::InstallinatorCompletionMetadata; use installinator_common::InstallinatorSpec; use installinator_common::M2Slot; use installinator_common::WriteOutput; +use omicron_common::api::external::SemverVersion; use omicron_common::backoff; use omicron_common::update::ArtifactId; use slog::error; @@ -69,7 +70,9 @@ use wicket_common::update_events::StepContext; use wicket_common::update_events::StepHandle; use wicket_common::update_events::StepProgress; use wicket_common::update_events::StepResult; +use wicket_common::update_events::StepSkipped; use wicket_common::update_events::StepSuccess; +use wicket_common::update_events::StepWarning; use wicket_common::update_events::UpdateComponent; use wicket_common::update_events::UpdateEngine; use wicket_common::update_events::UpdateStepId; @@ -635,49 +638,47 @@ impl UpdateDriver { .register(); } - // Reset the RoT into the updated build we just sent. 
- rot_registrar + let sp_registrar = engine.for_component(UpdateComponent::Sp); + + let sp_current_version = sp_registrar .new_step( - UpdateStepId::ResetRot, - "Resetting RoT", - |cx| async move { - let (rot_firmware_slot, _) = rot_firmware_slot_and_artifact - .into_value(cx.token()) - .await; - - // Mark the slot we just updated as the slot to use, - // persistently. - update_cx - .set_component_active_slot( - SpComponent::ROT.const_as_str(), - rot_firmware_slot, - true, + UpdateStepId::InterrogateSp, + "Checking current SP version", + move |_cx| async move { + let caboose = update_cx + .mgs_client + .sp_component_caboose_get( + update_cx.sp.type_, + update_cx.sp.slot, + SpComponent::SP_ITSELF.const_as_str(), ) .await .map_err(|error| { - UpdateTerminalError::SetRotActiveSlotFailed { - error, - } - })?; - - // Reset the RoT. - update_cx - .reset_sp_component(SpComponent::ROT.const_as_str()) - .await - .map_err(|error| { - UpdateTerminalError::RotResetFailed { error } - })?; - - StepSuccess::new(()).into() + UpdateTerminalError::GetSpCabooseFailed { error } + })? + .into_inner(); + + let message = format!( + "SP version {} (git commit {})", + caboose.version.as_deref().unwrap_or("unknown"), + caboose.git_commit + ); + match caboose.version.map(|v| v.parse::()) { + Some(Ok(version)) => StepSuccess::new(Some(version)) + .with_message(message) + .into(), + Some(Err(err)) => StepWarning::new( + None, + format!( + "{message} (failed to parse SP version: {err})" + ), + ) + .into(), + None => StepWarning::new(None, message).into(), + } }, ) .register(); - - // The SP only has one updateable firmware slot ("the inactive bank") - - // we always pass 0. 
- let sp_firmware_slot = 0; - - let sp_registrar = engine.for_component(UpdateComponent::Sp); let inner_cx = SpComponentUpdateContext::new(update_cx, UpdateComponent::Sp); sp_registrar @@ -685,6 +686,29 @@ impl UpdateDriver { UpdateStepId::SpComponentUpdate, "Updating SP", move |cx| async move { + let sp_current_version = + sp_current_version.into_value(cx.token()).await; + + let sp_has_this_version = Some(&sp_artifact.id.version) + == sp_current_version.as_ref(); + + // If this SP already has this version, skip the rest of + // this step, UNLESS we've been told to skip this version + // check. + if sp_has_this_version && !opts.skip_sp_version_check { + return StepSkipped::new( + (), + format!( + "SP already at version {}", + sp_artifact.id.version + ), + ) + .into(); + } + + // The SP only has one updateable firmware slot ("the + // inactive bank") - we always pass 0. + let sp_firmware_slot = 0; cx.with_nested_engine(|engine| { inner_cx.register_steps( engine, @@ -694,21 +718,25 @@ impl UpdateDriver { Ok(()) }) .await?; - StepSuccess::new(()).into() + + // If we updated despite the SP already having the version + // we updated to, make this step return a warning with that + // message; otherwise, this is a normal success. + if sp_has_this_version { + StepWarning::new( + (), + format!( + "SP updated despite already having version {}", + sp_artifact.id.version + ), + ) + .into() + } else { + StepSuccess::new(()).into() + } }, ) .register(); - sp_registrar - .new_step(UpdateStepId::ResetSp, "Resetting SP", |_cx| async move { - update_cx - .reset_sp_component(SpComponent::SP_ITSELF.const_as_str()) - .await - .map_err(|error| UpdateTerminalError::SpResetFailed { - error, - })?; - StepSuccess::new(()).into() - }) - .register(); if update_cx.sp.type_ == SpType::Sled { self.register_sled_steps( @@ -1623,5 +1651,75 @@ impl<'a> SpComponentUpdateContext<'a> { }, ) .register(); + + // If we just updated the RoT or SP, immediately reboot it into the new + // update. 
(One can imagine an update process _not_ wanting to do this, + // to stage updates for example, but for wicketd-driven recovery it's + // fine to do this immediately.) + match component { + UpdateComponent::Rot => { + // Prior to rebooting the RoT, we have to tell it to boot into + // the firmware slot we just updated. + registrar + .new_step( + SpComponentUpdateStepId::SettingActiveBootSlot, + format!("Setting RoT active slot to {firmware_slot}"), + move |_cx| async move { + update_cx + .set_component_active_slot( + component_name, + firmware_slot, + true, + ) + .await + .map_err(|error| { + UpdateTerminalError::SetRotActiveSlotFailed { + error, + } + })?; + StepSuccess::new(()).into() + }, + ) + .register(); + + // Reset the RoT. + registrar + .new_step( + SpComponentUpdateStepId::Resetting, + "Resetting RoT", + move |_cx| async move { + update_cx + .reset_sp_component(component_name) + .await + .map_err(|error| { + UpdateTerminalError::RotResetFailed { + error, + } + })?; + StepSuccess::new(()).into() + }, + ) + .register(); + } + UpdateComponent::Sp => { + // Nothing special to do on the SP - just reset it. 
+ registrar + .new_step( + SpComponentUpdateStepId::Resetting, + "Resetting SP", + move |_cx| async move { + update_cx + .reset_sp_component(component_name) + .await + .map_err(|error| { + UpdateTerminalError::SpResetFailed { error } + })?; + StepSuccess::new(()).into() + }, + ) + .register(); + } + UpdateComponent::Host => (), + } } } From cab09253e6e6e27761d1ef99f8087f86ccad2e8b Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 26 May 2023 09:20:26 -0700 Subject: [PATCH 10/14] document tips for Omicron development (#3221) --- README.adoc | 132 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 118 insertions(+), 14 deletions(-) diff --git a/README.adoc b/README.adoc index e83f893988..14f39738a0 100644 --- a/README.adoc +++ b/README.adoc @@ -51,7 +51,49 @@ You can **run the https://github.com/rust-lang/rust-clippy[Clippy linter]** usin == Working in Omicron -Omicron is a pretty large repo containing a bunch of related components. The repo is structured as one large Cargo workspace. When you run `cargo build` or `cargo test`, you'll wind up building and testing everything. In practice, people are usually only working on one or two components at a time. You can usually iterate faster by specifying a specific package, like `cargo build -p omicron-nexus`. +Omicron is a pretty large repo containing a bunch of related components. If you just build the whole thing with `cargo build` or `cargo test`, it can take a while, even for incremental builds. Since most people are only working on a few of these components at a time, it's helpful to be know about Cargo's tools for working with individual packages in a workspace. + +NOTE: This section assumes you're already familiar with the prerequisites and environment setup needed to do _any_ work on Omicron. See xref:docs/how-to-run-simulated.adoc[] or xref:docs/how-to-run.adoc[] for more on that. + +=== Key tips + +* Use `cargo check` when you just want to know if your code compiles. 
It's _much_ faster than `cargo build` or `cargo test`. +* When using Cargo's check/build/test/clippy commands, you can use the `-p PACKAGE` flag to only operate on a specific package. This often saves a lot of time for incremental builds. +* When using Cargo's check/build/clippy commands, use `--all-targets` to make sure you're checking or building the test code, too. + +These are explained a bit more below, along with some common pitfalls. + +Here's an example workflow. Suppose you're working on some changes to the Nexus database model (`nexus-db-model` package, located at `nexus/db-model` from the root). While you're actively writing and checking code, you might run: + +``` +cargo check --all-targets +``` + +_without_ any `-p` flag. Running this incrementally is pretty fast even on the whole workspace. This also uncovers places where your changes have broken code that uses this package. (If you're making big changes, you might not want that right away. In that case, you might choose to use `-p nexus-db-model` here.) + +When you're ready to test the changes you've made, start with building and running tests for the most specific package you've changed: + +``` +cargo test -p nexus-db-model +``` + +Once that works, check the tests for the next package up: + +``` +cargo test -p omicron-nexus +``` + +When you're happy with things and want to make sure you haven't missed something, test everything: + +``` +cargo test +``` + +=== Rust packages in Omicron + +NOTE: The term "package" is overloaded: most programming languages and operating systems have their own definitions of a package. On top of that, Omicron bundles up components into our own kind of "package" that gets delivered via the install and update systems. These are described in the `package-manifest.toml` file in the root of the repo. In this section, we're just concerned with Rust packages. 
+ +NOTE: There's also confusion in the Rust world about the terms https://doc.rust-lang.org/book/ch07-01-packages-and-crates.html["packages" and "crates"]. _Packages_ are the things that have a Cargo.toml file. (Workspaces like Omicron itself have Cargo.toml files, too.) Packages are also the things that you publish to crates.io (confusingly). One package might have a library, a standalone executable binary, several examples, integration tests, etc. that are all compiled individually and produce separate artifacts. These are what Rust calls _crates_. We're generally just concerned with packages here, not crates. Here are some of the big components in the control plane that live in this repo: @@ -89,7 +131,40 @@ Here are some of the big components in the control plane that live in this repo: For those with access to Oxide RFDs, RFD 61 discusses the organization principles and key components in more detail. -Many of these components themselves are made up of subpackages (e.g., `nexus-db-queries` is under `omicron-nexus`). There are also many more top-level packages than what's mentioned above. These are used for common code, clients, tools, etc. For more, see the Rustdoc for each module. (Where docs are missing or incomplete, please contribute!) +Many of these components themselves are made up of other packages (e.g., `nexus-db-model` is under `omicron-nexus`). There are also many more top-level packages than what's mentioned above. These are used for common code, clients, tools, etc. For more, see the Rustdoc for each module. (Where docs are missing or incomplete, please contribute!) + +Use Cargo's `-p PACKAGE` to check/build/test only the package you're working on. Since people are usually only working on one or two components at a time, you can usually iterate faster this way. + +=== Why is Cargo rebuilding stuff all the time? 
+
+People are often surprised to find Cargo rebuilding stuff that it seems like it's just built, even when the relevant source files haven't changed.
+
+* Say you're iterating on code, running `cargo build -p nexus-db-model` to build just that package. Great, it works. Let's run tests: `cargo test -p nexus-db-model`. Now it's rebuilding some _dependency_ of `nexus-db-model` again?!
+* Say you've just run `cargo test -p nexus-db-model`. Now you go run `cargo test -p omicron-nexus`, which uses `nexus-db-model`. You see Cargo building `nexus-db-model` again?!
+
+This usually has to do with the way Cargo selects package https://doc.rust-lang.org/cargo/reference/features.html[features]. These are essentially tags that are used at build time to include specific code or dependencies. For example, the https://docs.rs/serde/latest/serde/[serde] crate defines a feature called https://docs.rs/crate/serde/latest/features["derive"] that controls whether the `Serialize`/`Deserialize` derive macros will be included. Let's look at how this affects builds.
+
+TIP: You can use `cargo tree` to inspect a package's dependencies, including features. This is useful for debugging feature-related build issues.
+
+==== Feature selection differs when building tests
+
+When you run `cargo build -p nexus-db-model`, Cargo looks at all the packages in the dependency tree of `nexus-db-model` and figures out what features it needs for each one. Let's take the `uuid` package as an example. Cargo takes https://doc.rust-lang.org/cargo/reference/features.html#feature-unification[union of the features required by any of the packages that depend on `uuid` in the whole dependency tree of `nexus-db-model`]. Let's say that's just the "v4" feature. Simple enough.
+
+When you then run `cargo test -p nexus-db-model`, it does the same thing. Only this time, it's looking at the `dev-dependencies` tree.
`nexus-db-model` 's dev-dependencies might include some other package that depends on `uuid` and requires the "v5" feature. Now, Cargo has to rebuild `uuid` -- and anything else that depends on it. + +This is why when using Cargo's check/build/clippy commands, we suggest using `--all-targets`. When you use `cargo build --all-targets`, it builds the tests as well. It's usually not much more time and avoids extra rebuilds when switching back and forth between the default targets and the targets with tests included. + +==== Feature selection differs when building different packages + +People run into a similar problem when switching packages within Omicron. Once you've got `cargo test -p nexus-db-model` working, you may run `cargo test -p omicron-nexus`, which uses `nexus-db-model`. And you may be surprised to see Cargo rebuilding some common dependency like `uuid`. It's the same as above: we're building a different package now. It has a different (larger) dependency tree. That may result in some crate deep in the dependency tree needing some new feature, causing it and all of its dependents to be rebuilt. + +NOTE: https://github.com/rust-lang/cargo/issues/4463[There is interest in changing the way feature selection works in workspaces like Omicron for exactly this reason.] It's been suggested to have an option for Cargo to always look at the features required for all packages in the workspace, rather than just the one you've selected. This could eliminate this particular problem. In the meantime, we mitigate this with heavy use of https://doc.rust-lang.org/cargo/reference/workspaces.html#the-dependencies-table[workspace dependencies], which helps make sure that different packages _within_ Omicron depend on the same set of features for a given dependency. + +=== Why am I getting compile errors after I thought I'd already built everything? + +Say you're iterating on code, running `cargo build -p nexus-db-model` to build just that package. 
You work through lots of compiler errors until finally it works. Now you run tests: `cargo test -p nexus-db-model`. Now you see a bunch of compiler errors again! What gives? + +By default, Cargo does not operate on the tests. Cargo's check/build/clippy commands ignore them. This is another reason we suggest using `--all-targets` most of the time. === Generated Service Clients and Updating @@ -118,24 +193,53 @@ documents being checked in. In general, changes any service API **require the following set of build steps**: -* Make changes to the service API -* Build the package for the modified service alone. This can be done by changing - directories there, or `cargo build -p `. This is step is important, - to avoid the circular dependency at this point. One needs to update this one - OpenAPI document, without rebuilding the other components that depend on a - now-outdated spec. -* Update the OpenAPI document by running the relevant test with overwrite set: - `EXPECTORATE=overwrite cargo test test_nexus_openapi_internal` (changing the - test name as necessary) -* This will cause the generated client to be updated which may break the build - for dependent consumers -* Modify any dependent services to fix calls to the generated client +. Make changes to the service API. +. Update the OpenAPI document by running the relevant test with overwrite set: + `EXPECTORATE=overwrite cargo test -p -- test_nexus_openapi_internal` + (changing the package name and test name as necessary). It's important to do + this _before_ the next step. +. This will cause the generated client to be updated which may break the build + for dependent consumers. +. Modify any dependent services to fix calls to the generated client. Note that if you make changes to both Nexus and Sled Agent simultaneously, you may end up in a spot where neither can build and therefore neither OpenAPI document can be generated. 
In this case, revert or comment out changes in one so that the OpenAPI document can be generated. +This is a particular problem if you find yourself resolving merge conflicts in the generated files. You have basically two options for this: + +* Resolve the merge conflicts by hand. This is usually not too bad in practice. +* Take the upstream copy of the file, back out your client side changes (`git stash` and its `-p` option can be helpful for this), follow the steps above to regenerate the file using the automated test, and finally re-apply your changes to the client side. This is essentially getting yourself back to step 1 above and then following the procedure above. + +=== Resolving merge conflicts in Cargo.lock + +When pulling in new changes from upstream "main", you may find conflicts in Cargo.lock. The easiest way to deal with these is usually to take the upstream changes as-is, then trigger any Cargo operation that updates the lockfile. `cargo metadata` is a quick one. Here's an example: + +``` +# Pull in changes from upstream "main" +$ git fetch +$ git merge origin/main + +# Oh no! We've got conflicts in Cargo.lock. First, let's just take what's upstream: +$ git show origin/main:Cargo.lock > Cargo.lock + +# Now, run any command that causes Cargo to update the lock file as needed. +$ cargo metadata > /dev/null +``` + +When you do this, Cargo makes only changes to Cargo.lock that are necessary based on the various Cargo.toml files in the workspace and dependencies. + +Here are things you _don't_ want to do to resolve this conflict: + +* Run `cargo generate-lockfile` to generate a new lock file from scratch. +* Remove `Cargo.lock` and let Cargo regenerate it from scratch. + +Both of these will cause Cargo to make many more changes (relative to "main") than necessary because it's choosing the latest version of all dependencies in the whole tree. You'll be inadvertently updating all of Omicron's transitive dependencies. (You might conceivably want that. 
But usually we update dependencies either as-needed for a particular change or via individual PRs via dependabot, not all at once because someone had to merge Cargo.lock.) + +You can also resolve conflicts by hand. It's tedious and error-prone. + + == Configuration reference `nexus` requires a TOML configuration file. There's an example in From 20ade16dc6f86540bcc8afeff19ad06689bc8a97 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 26 May 2023 10:02:12 -0700 Subject: [PATCH 11/14] use bb8 error sink to log more errors (#3232) --- nexus/db-queries/src/db/collection_attach.rs | 20 +++--- nexus/db-queries/src/db/collection_detach.rs | 12 ++-- .../src/db/collection_detach_many.rs | 16 ++--- nexus/db-queries/src/db/collection_insert.rs | 4 +- nexus/db-queries/src/db/datastore/mod.rs | 6 +- nexus/db-queries/src/db/explain.rs | 4 +- nexus/db-queries/src/db/pagination.rs | 8 +-- nexus/db-queries/src/db/pool.rs | 66 +++++++++++++++---- .../db-queries/src/db/queries/external_ip.rs | 2 +- nexus/db-queries/src/db/queries/next_item.rs | 2 +- nexus/db-queries/src/db/queries/vpc_subnet.rs | 2 +- nexus/db-queries/src/db/saga_recovery.rs | 2 +- nexus/src/context.rs | 2 +- nexus/src/populate.rs | 5 +- 14 files changed, 97 insertions(+), 54 deletions(-) diff --git a/nexus/db-queries/src/db/collection_attach.rs b/nexus/db-queries/src/db/collection_attach.rs index 84a4073df0..c88054795d 100644 --- a/nexus/db-queries/src/db/collection_attach.rs +++ b/nexus/db-queries/src/db/collection_attach.rs @@ -854,7 +854,7 @@ mod test { dev::test_setup_log("test_attach_missing_collection_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -883,7 +883,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_missing_resource_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: 
db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -920,7 +920,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_once"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -968,7 +968,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_once_synchronous"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -1028,7 +1028,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_multiple_times"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -1084,7 +1084,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_beyond_capacity_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -1148,7 +1148,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_while_already_attached"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -1255,7 +1255,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_once"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -1310,7 +1310,7 @@ mod 
test { let logctx = dev::test_setup_log("test_attach_deleted_resource_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -1355,7 +1355,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_without_update_filter"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; diff --git a/nexus/db-queries/src/db/collection_detach.rs b/nexus/db-queries/src/db/collection_detach.rs index c4ae237062..04894ecb21 100644 --- a/nexus/db-queries/src/db/collection_detach.rs +++ b/nexus/db-queries/src/db/collection_detach.rs @@ -775,7 +775,7 @@ mod test { dev::test_setup_log("test_detach_missing_collection_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -803,7 +803,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_missing_resource_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -839,7 +839,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_once"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -879,7 +879,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_while_already_detached"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = 
db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -943,7 +943,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_deleted_resource_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -987,7 +987,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_without_update_filter"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; diff --git a/nexus/db-queries/src/db/collection_detach_many.rs b/nexus/db-queries/src/db/collection_detach_many.rs index caef0eaa53..3418296568 100644 --- a/nexus/db-queries/src/db/collection_detach_many.rs +++ b/nexus/db-queries/src/db/collection_detach_many.rs @@ -773,7 +773,7 @@ mod test { dev::test_setup_log("test_detach_missing_collection_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -803,7 +803,7 @@ mod test { dev::test_setup_log("test_detach_missing_resource_succeeds"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -844,7 +844,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_once"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -887,7 +887,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_once_synchronous"); let mut db = 
test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -942,7 +942,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_while_already_detached"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -998,7 +998,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_filter_collection"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -1049,7 +1049,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_deleted_resource"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -1107,7 +1107,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_many"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; diff --git a/nexus/db-queries/src/db/collection_insert.rs b/nexus/db-queries/src/db/collection_insert.rs index 647e3e443d..1076f8eb29 100644 --- a/nexus/db-queries/src/db/collection_insert.rs +++ b/nexus/db-queries/src/db/collection_insert.rs @@ -548,7 +548,7 @@ mod test { let logctx = dev::test_setup_log("test_collection_not_present"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; @@ -578,7 +578,7 @@ 
mod test { let logctx = dev::test_setup_log("test_collection_present"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); setup_db(&pool).await; diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index cf06f23aef..56d27d8c9d 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -244,7 +244,7 @@ pub async fn datastore_test( use crate::authn; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&cfg)); + let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); let datastore = Arc::new(DataStore::new(pool)); // Create an OpContext with the credentials of "db-init" just for the @@ -895,7 +895,7 @@ mod test { dev::test_setup_log("test_queries_do_not_require_full_table_scan"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); let datastore = DataStore::new(Arc::new(pool)); let explanation = DataStore::get_allocated_regions_query(Uuid::nil()) @@ -944,7 +944,7 @@ mod test { let logctx = dev::test_setup_log("test_sled_ipv6_address_allocation"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&cfg)); + let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); let datastore = Arc::new(DataStore::new(Arc::clone(&pool))); let opctx = OpContext::for_tests(logctx.log.new(o!()), datastore.clone()); diff --git a/nexus/db-queries/src/db/explain.rs b/nexus/db-queries/src/db/explain.rs index 6bb1b31d6b..de834eb301 100644 --- a/nexus/db-queries/src/db/explain.rs +++ b/nexus/db-queries/src/db/explain.rs @@ -166,7 +166,7 @@ mod test { let logctx = dev::test_setup_log("test_explain_async"); let mut 
db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); create_schema(&pool).await; @@ -189,7 +189,7 @@ mod test { let logctx = dev::test_setup_log("test_explain_full_table_scan"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); create_schema(&pool).await; diff --git a/nexus/db-queries/src/db/pagination.rs b/nexus/db-queries/src/db/pagination.rs index 86b2466cf6..50da36c156 100644 --- a/nexus/db-queries/src/db/pagination.rs +++ b/nexus/db-queries/src/db/pagination.rs @@ -263,7 +263,7 @@ mod test { dev::test_setup_log("test_paginated_single_column_ascending"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); use schema::test_users::dsl; @@ -298,7 +298,7 @@ mod test { dev::test_setup_log("test_paginated_single_column_descending"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); use schema::test_users::dsl; @@ -333,7 +333,7 @@ mod test { dev::test_setup_log("test_paginated_multicolumn_ascending"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); use schema::test_users::dsl; @@ -387,7 +387,7 @@ mod test { dev::test_setup_log("test_paginated_multicolumn_descending"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&cfg); + let pool = db::Pool::new(&logctx.log, &cfg); use schema::test_users::dsl; diff --git 
a/nexus/db-queries/src/db/pool.rs b/nexus/db-queries/src/db/pool.rs index b8ceb26768..6311121bd1 100644 --- a/nexus/db-queries/src/db/pool.rs +++ b/nexus/db-queries/src/db/pool.rs @@ -44,21 +44,38 @@ pub struct Pool { } impl Pool { - pub fn new(db_config: &DbConfig) -> Self { - let manager = - ConnectionManager::::new(&db_config.url.url()); - let pool = bb8::Builder::new() - .connection_customizer(Box::new(DisallowFullTableScans {})) - .build_unchecked(manager); - Pool { pool } + pub fn new(log: &slog::Logger, db_config: &DbConfig) -> Self { + Self::new_builder(log, db_config, bb8::Builder::new()) + } + + pub fn new_failfast_for_tests( + log: &slog::Logger, + db_config: &DbConfig, + ) -> Self { + Self::new_builder( + log, + db_config, + bb8::Builder::new() + .connection_timeout(std::time::Duration::from_millis(1)), + ) } - pub fn new_failfast_for_tests(db_config: &DbConfig) -> Self { - let manager = - ConnectionManager::::new(&db_config.url.url()); - let pool = bb8::Builder::new() + fn new_builder( + log: &slog::Logger, + db_config: &DbConfig, + builder: bb8::Builder>, + ) -> Self { + let url = db_config.url.url(); + let log = log.new(o!( + "database_url" => url.clone(), + "component" => "db::Pool" + )); + info!(&log, "database connection pool"); + let error_sink = LoggingErrorSink::new(log); + let manager = ConnectionManager::::new(url); + let pool = builder .connection_customizer(Box::new(DisallowFullTableScans {})) - .connection_timeout(std::time::Duration::from_millis(1)) + .error_sink(Box::new(error_sink)) .build_unchecked(manager); Pool { pool } } @@ -85,3 +102,28 @@ impl CustomizeConnection, ConnectionError> conn.batch_execute_async(DISALLOW_FULL_TABLE_SCAN_SQL).await } } + +#[derive(Clone, Debug)] +struct LoggingErrorSink { + log: slog::Logger, +} + +impl LoggingErrorSink { + fn new(log: slog::Logger) -> LoggingErrorSink { + LoggingErrorSink { log } + } +} + +impl bb8::ErrorSink for LoggingErrorSink { + fn sink(&self, error: ConnectionError) { + error!( + 
&self.log, + "database connection error"; + "error_message" => #%error + ); + } + + fn boxed_clone(&self) -> Box> { + Box::new(self.clone()) + } +} diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index caba23caae..be13775d46 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -850,7 +850,7 @@ mod tests { let db = test_setup_database(&log).await; crate::db::datastore::datastore_test(&logctx, &db).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&cfg)); + let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); let db_datastore = Arc::new(crate::db::DataStore::new(Arc::clone(&pool))); let opctx = diff --git a/nexus/db-queries/src/db/queries/next_item.rs b/nexus/db-queries/src/db/queries/next_item.rs index d7486e369d..3ba09788a0 100644 --- a/nexus/db-queries/src/db/queries/next_item.rs +++ b/nexus/db-queries/src/db/queries/next_item.rs @@ -592,7 +592,7 @@ mod tests { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&cfg)); + let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); // We're going to operate on a separate table, for simplicity. 
setup_test_schema(&pool).await; diff --git a/nexus/db-queries/src/db/queries/vpc_subnet.rs b/nexus/db-queries/src/db/queries/vpc_subnet.rs index b0ed18d58a..c76d8cac91 100644 --- a/nexus/db-queries/src/db/queries/vpc_subnet.rs +++ b/nexus/db-queries/src/db/queries/vpc_subnet.rs @@ -443,7 +443,7 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&cfg)); + let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); let db_datastore = Arc::new(crate::db::DataStore::new(Arc::clone(&pool))); diff --git a/nexus/db-queries/src/db/saga_recovery.rs b/nexus/db-queries/src/db/saga_recovery.rs index ba5928e0d0..6c4a539be7 100644 --- a/nexus/db-queries/src/db/saga_recovery.rs +++ b/nexus/db-queries/src/db/saga_recovery.rs @@ -328,7 +328,7 @@ mod test { ) -> (dev::db::CockroachInstance, Arc) { let db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&cfg)); + let pool = Arc::new(db::Pool::new(log, &cfg)); let db_datastore = Arc::new(db::DataStore::new(Arc::clone(&pool))); (db, db_datastore) } diff --git a/nexus/src/context.rs b/nexus/src/context.rs index 3232e0f00e..871f7ed2bb 100644 --- a/nexus/src/context.rs +++ b/nexus/src/context.rs @@ -160,7 +160,7 @@ impl ServerContext { .map_err(|e| format!("Cannot parse Postgres URL: {}", e))? 
} }; - let pool = db::Pool::new(&db::Config { url }); + let pool = db::Pool::new(&log, &db::Config { url }); let nexus = Nexus::new_with_id( rack_id, log.new(o!("component" => "nexus")), diff --git a/nexus/src/populate.rs b/nexus/src/populate.rs index c077e14c4f..be2ab47d6f 100644 --- a/nexus/src/populate.rs +++ b/nexus/src/populate.rs @@ -341,7 +341,7 @@ mod test { let logctx = dev::test_setup_log("test_populator"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&cfg)); + let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); let datastore = Arc::new(db::DataStore::new(pool)); let opctx = OpContext::for_background( logctx.log.clone(), @@ -387,7 +387,8 @@ mod test { // // Anyway, if we try again with a broken database, we should get a // ServiceUnavailable error, which indicates a transient failure. - let pool = Arc::new(db::Pool::new_failfast_for_tests(&cfg)); + let pool = + Arc::new(db::Pool::new_failfast_for_tests(&logctx.log, &cfg)); let datastore = Arc::new(db::DataStore::new(pool)); let opctx = OpContext::for_background( logctx.log.clone(), From 9d9c1dff77396b138533afe6cadb000f31c94535 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 26 May 2023 11:57:25 -0700 Subject: [PATCH 12/14] update async-bb8-diesel for error message update (#3233) --- Cargo.lock | 2 +- Cargo.toml | 2 +- nexus/db-queries/src/db/error.rs | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2346cdd5a6..38f633e4e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" [[package]] name = "async-bb8-diesel" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/async-bb8-diesel?rev=7944dafc8a36dc6e20a1405eca59d04662de2bb7#7944dafc8a36dc6e20a1405eca59d04662de2bb7" +source = 
"git+https://github.com/oxidecomputer/async-bb8-diesel?rev=be3d9bce50051d8c0e0c06078e8066cc27db3001#be3d9bce50051d8c0e0c06078e8066cc27db3001" dependencies = [ "async-trait", "bb8", diff --git a/Cargo.toml b/Cargo.toml index 47cca34bbe..89ed572ee8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -121,7 +121,7 @@ anyhow = "1.0" api_identity = { path = "api_identity" } assert_matches = "1.5.0" assert_cmd = "2.0.11" -async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "7944dafc8a36dc6e20a1405eca59d04662de2bb7" } +async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "be3d9bce50051d8c0e0c06078e8066cc27db3001" } async-trait = "0.1.68" authz-macros = { path = "nexus/authz-macros" } backoff = { version = "0.4.0", features = [ "tokio" ] } diff --git a/nexus/db-queries/src/db/error.rs b/nexus/db-queries/src/db/error.rs index 348daf2737..82a4703b2f 100644 --- a/nexus/db-queries/src/db/error.rs +++ b/nexus/db-queries/src/db/error.rs @@ -168,10 +168,9 @@ where { match error { PoolError::Connection(error) => match error { - ConnectionError::Checkout(error) => PublicError::unavail(&format!( - "Failed to access connection pool: {}", - error - )), + ConnectionError::Connection(error) => PublicError::unavail( + &format!("Failed to access connection pool: {}", error), + ), ConnectionError::Query(error) => make_query_error(error), }, PoolError::Timeout => { From 54e53a5ef52eab7ff781cfec83e4057df886ad75 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 26 May 2023 15:16:58 -0400 Subject: [PATCH 13/14] [sled-agent] Delay nexus hw notifications (#3234) Before underlay access has been allocated to the `StorageWorker`, there is no chance of successfully talking to nexus. Delay any notifications until after underlay access is available. This occurs when sled-agent starts, after disks and zpools may have already been allocated/recorded locally. 
By waiting we reduce unnecessary backoff timer increases that are not due to any actual faults, and delay normal sled startup. This is especially important during rack setup since those backoff timers wait indefinitely until RSS is run. This is a partial fix to dependency order, but just a small one. We should take a more systematic approach ala https://github.com/oxidecomputer/omicron/issues/1917 in the future. --- sled-agent/src/storage_manager.rs | 91 +++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 6 deletions(-) diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index ce7d3cbdc0..34763288d8 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -333,7 +333,20 @@ impl StorageWorker { // Adds a "notification to nexus" to `nexus_notifications`, // informing it about the addition of `pool_id` to this sled. - fn add_zpool_notify(&mut self, pool: &Pool, size: ByteCount) { + async fn add_zpool_notify(&mut self, pool: &Pool, size: ByteCount) { + // The underlay network is set up once at sled-agent startup. Before + // there is an underlay we want to avoid sending notifications to nexus for + // two reasons: + // 1. They can't possibly succeed + // 2. They increase the backoff time exponentially, so that once + // sled-agent does start it may take much longer to notify nexus + // than it would if we avoid this. This goes especially so for rack + // setup, when bootstrap agent is waiting an arbitrary time for RSS + // initialization.
+ if self.underlay.lock().await.is_none() { + return; + } + let pool_id = pool.name.id(); let DiskIdentity { vendor, serial, model } = pool.parent.clone(); let underlay = self.underlay.clone(); @@ -540,7 +553,8 @@ impl StorageWorker { self.physical_disk_notify(NotifyDiskRequest::Add { identity: disk.identity(), variant: disk.variant(), - }); + }) + .await; self.upsert_zpool(&resources, disk.identity(), disk.zpool_name()) .await?; @@ -574,14 +588,70 @@ impl StorageWorker { ) -> Result<(), Error> { if let Some(parsed_disk) = disks.remove(key) { resources.pools.lock().await.remove(&parsed_disk.zpool_name().id()); - self.physical_disk_notify(NotifyDiskRequest::Remove(key.clone())); + self.physical_disk_notify(NotifyDiskRequest::Remove(key.clone())) + .await; } Ok(()) } + /// When the underlay becomes available, we need to notify nexus about any + /// discovered disks and pools, since we don't attempt to notify until there + /// is an underlay available. + async fn notify_nexus_about_existing_resources( + &mut self, + resources: &StorageResources, + ) -> Result<(), Error> { + let disks = resources.disks.lock().await; + for disk in disks.values() { + self.physical_disk_notify(NotifyDiskRequest::Add { + identity: disk.identity(), + variant: disk.variant(), + }) + .await; + } + + // We may encounter errors while processing any of the pools; keep track of + // any errors that occur and return any of them if something goes wrong. + // + // That being said, we should not prevent notification to nexus of the + // other pools if only one failure occurs. 
+ let mut err: Option = None; + + let pools = resources.pools.lock().await; + for pool in pools.values() { + match ByteCount::try_from(pool.info.size()).map_err(|err| { + Error::BadPoolSize { name: pool.name.to_string(), err } + }) { + Ok(size) => self.add_zpool_notify(pool, size).await, + Err(e) => { + warn!(self.log, "Failed to notify nexus about pool: {e}"); + err = Some(e) + } + } + } + + if let Some(err) = err { + Err(err) + } else { + Ok(()) + } + } + // Adds a "notification to nexus" to `self.nexus_notifications`, informing it // about the addition/removal of a physical disk to this sled. - fn physical_disk_notify(&mut self, disk: NotifyDiskRequest) { + async fn physical_disk_notify(&mut self, disk: NotifyDiskRequest) { + // The underlay network is set up once at sled-agent startup. Before + // there is an underlay we want to avoid sending notifications to nexus for + // two reasons: + // 1. They can't possibly succeed + // 2. They increase the backoff time exponentially, so that once + // sled-agent does start it may take much longer to notify nexus + // than it would if we avoid this. This goes especially so for rack + // setup, when bootstrap agent is waiting an arbitrary time for RSS + // initialization. + if self.underlay.lock().await.is_none() { + return; + } let underlay = self.underlay.clone(); let disk2 = disk.clone(); let notify_nexus = move || { @@ -676,7 +746,7 @@ impl StorageWorker { Error::BadPoolSize { name: pool_name.to_string(), err } })?; // Notify Nexus of the zpool. - self.add_zpool_notify(&pool, size); + self.add_zpool_notify(&pool, size).await; Ok(()) } @@ -750,7 +820,16 @@ impl StorageWorker { self.ensure_using_exactly_these_disks(&resources, disks).await?; }, SetupUnderlayAccess(UnderlayRequest { underlay, responder }) => { - self.underlay.lock().await.replace(underlay); + // If this is the first time establishing an + // underlay we should notify nexus of all existing + // disks and zpools.
+ // + // Instead of individual notifications, we should + // send a bulk notification as described in https:// + // github.com/oxidecomputer/omicron/issues/1917 + if self.underlay.lock().await.replace(underlay).is_none() { + self.notify_nexus_about_existing_resources(&resources).await?; + } let _ = responder.send(Ok(())); } } From bcec33fc90542f0e8557a5bf14056719ab4dc728 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 26 May 2023 15:43:46 -0400 Subject: [PATCH 14/14] `create-if` may fail for self-assembling zones (#3222) Part of the work to set MTUs involved setting the IP interface MTU for control vnics inside the zone they're used in. The problem is `create-if` will fail if the vnic being used inside a zone already has an interface in that zone. This occurs when the zone is self-assembling, meaning some software inside the zone is responsible for bringing up a listen address for its service(s). What would happen is sled-agent's `create-if` would race with something inside the zone running `create-addr`: these zones would eventually come up after racing for a long time but were delayed. Fix this by detecting if a zone is self-assembling, and eating `create-if` error if it is. `create-if` still needs to be run so that the `set-ifprop` commands also run as part of setting MTU don't fail due to a missing interface. 
--- illumos-utils/src/running_zone.rs | 45 ++++++++++++++++++++++++++++--- sled-agent/src/profile.rs | 5 +--- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 722ec1d0e6..266719d0b5 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -199,18 +199,51 @@ impl RunningZone { } })?; + let site_profile_xml_exists = + std::path::Path::new(&zone.site_profile_xml_path()).exists(); + let running_zone = RunningZone { running: true, inner: zone }; // Make sure the control vnic has an IP MTU of 9000 inside the zone const CONTROL_VNIC_MTU: usize = 9000; let vnic = running_zone.inner.control_vnic.name().to_string(); - let commands = vec![ - vec![ + + // If the zone is self-assembling, then SMF service(s) inside the zone + // will be creating the listen address for the zone's service(s). This + // will create IP interfaces, and means that `create-if` here will fail + // due to the interface already existing. Checking the output of + // `show-if` is also problematic due to TOCTOU. Use the check for the + // existence of site.xml, which means the zone is performing this + // self-assembly, and skip create-if if so. + + if !site_profile_xml_exists { + let args = vec![ IPADM.to_string(), "create-if".to_string(), "-t".to_string(), vnic.clone(), - ], + ]; + + running_zone.run_cmd(args)?; + } else { + // If the zone is self-assembling, then it's possible that the IP + // interface does not exist yet because it has not been brought up + // by the software in the zone. Run `create-if` here, but eat the + // error if there is one: this is safe unless the software that's + // part of self-assembly inside the zone is also trying to run + // `create-if` (instead of `create-addr`), and required for the + // `set-ifprop` commands below to pass. 
+ let args = vec![ + IPADM.to_string(), + "create-if".to_string(), + "-t".to_string(), + vnic.clone(), + ]; + + let _result = running_zone.run_cmd(args); + } + + let commands = vec![ vec![ IPADM.to_string(), "set-ifprop".to_string(), @@ -747,4 +780,10 @@ impl InstalledZone { links, }) } + + pub fn site_profile_xml_path(&self) -> Utf8PathBuf { + let mut path: Utf8PathBuf = self.zonepath().into(); + path.push("root/var/svc/profile/site.xml"); + path + } } diff --git a/sled-agent/src/profile.rs b/sled-agent/src/profile.rs index 264072aa75..eaf93bc97e 100644 --- a/sled-agent/src/profile.rs +++ b/sled-agent/src/profile.rs @@ -30,10 +30,7 @@ impl ProfileBuilder { ) -> Result<(), std::io::Error> { info!(log, "Profile for {}:\n{}", installed_zone.name(), self); - let profile_path = format!( - "{zonepath}/root/var/svc/profile/site.xml", - zonepath = installed_zone.zonepath() - ); + let profile_path = installed_zone.site_profile_xml_path(); tokio::fs::write(&profile_path, format!("{self}").as_bytes()).await?; Ok(())