From 7dca6fc9edf52fbf6846deec9346217c495c235d Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 5 Dec 2023 06:59:04 -0800 Subject: [PATCH] [wicketd] Add refresh-config subcommand instead of using curl (#4606) This allows us to add some time for refreshing a wicketd that hasn't fully started (and therefore isn't reachable on its dropshot server yet). Fixes https://github.com/oxidecomputer/omicron/issues/4604. Testing this is a little awkward because config refresh is only support on illumos via SMF, so I tested this by hand on `madrid`: 0. Disable wicketd (`svcadm disable wicketd`) 1. Install the new wicketd binary and `manifest.xml` in the switch zone 2. Import the new manifest.xml (`svccfg import /var/svc/manifest/site/wicketd/manifest.xml`) 3. (I expected to need to restore the config properties that sled-agent had set, but they persisted) 4. Enable wicketd and immediately try to refresh it (`svcadm enable wicketd && svcadm refresh wicketd`) In the wicketd logs, we see that the first refresh attempt failed because it fired before the dropshot server was up: ``` [ Dec 4 18:34:22 Executing start method ("ctrun -l child -o noorphan,regent /opt/oxide/wicketd/bin/wicketd run /var/svc/manifest/site/wicketd/config.toml --address [::1]:12226 --artifact-address [fdb0:a840:2504:355::2]:12227 --mgs-address [::1]:12225 --nex us-proxy-address [::]:12229 --baseboard-file /opt/oxide/baseboard.json --read-smf-config &"). ] [ Dec 4 18:34:22 Method "start" exited with status 0. ] [ Dec 4 18:34:22 Rereading configuration. ] note: configured to log to "/dev/stdout" [ Dec 4 18:34:22 Executing refresh method ("/opt/oxide/wicketd/bin/wicketd refresh-config /var/svc/manifest/site/wicketd/config.toml --address [::1]:12226"). ] note: configured to log to "/dev/stdout" 18:34:22.330Z WARN wicketd: failed to refresh wicketd config (attempt 1 of 3); will retry after 5s err = Communication Error: error sending request for url (http://[::1]:12226/reload-config): error trying to connect: tcp connect error: Connection refused (os error 146)\nCaused by:\n -> error sending request for url (http://[::1]:12226/reload-config) : error trying to connect: tcp connect error: Connection refused (os error 146)\n -> error trying to connect: tcp connect error: Connection refused (os error 146)\n -> tcp connect error: Connection refused (os error 146)\n -> Connection refused (os error 146) 18:34:22.396Z INFO wicketd (dropshot (wicketd)): listening file = /home/john/.cargo/git/checkouts/dropshot-a4a923d29dccc492/ff87a01/dropshot/src/server.rs:195 local_addr = [::1]:12226 ``` 10 seconds later, we see the successful connection, POST, and exit of the SMF `refresh`: ``` 18:34:32.332Z INFO wicketd (dropshot (wicketd)): accepted connection file = /home/john/.cargo/git/checkouts/dropshot-a4a923d29dccc492/ff87a01/dropshot/src/server.rs:769 local_addr = [::1]:12226 remote_addr = [::1]:32976 18:34:32.388Z INFO wicketd (dropshot (wicketd)): request completed file = /home/john/.cargo/git/checkouts/dropshot-a4a923d29dccc492/ff87a01/dropshot/src/server.rs:853 latency_us = 30475 local_addr = [::1]:12226 method = POST remote_addr = [::1]:32976 req_id = e0c2034a-0a99-45c1-a651-57249ca258f0 response_code = 204 uri = /reload-config [ Dec 4 18:34:32 Method "refresh" exited with status 0. ] ``` --- smf/wicketd/manifest.xml | 2 +- wicketd/Cargo.toml | 2 +- wicketd/src/bin/wicketd.rs | 41 ++++++++++++++++++++--- wicketd/src/lib.rs | 68 +++++++++++++++++++++++++++++++++++--- 4 files changed, 102 insertions(+), 11 deletions(-) diff --git a/smf/wicketd/manifest.xml b/smf/wicketd/manifest.xml index 778a7abf2d..b45ff1544b 100644 --- a/smf/wicketd/manifest.xml +++ b/smf/wicketd/manifest.xml @@ -32,7 +32,7 @@ it expected https). --> diff --git a/wicketd/Cargo.toml b/wicketd/Cargo.toml index 1360c28b19..97550342d0 100644 --- a/wicketd/Cargo.toml +++ b/wicketd/Cargo.toml @@ -58,6 +58,7 @@ sled-hardware.workspace = true tufaceous-lib.workspace = true update-engine.workspace = true wicket-common.workspace = true +wicketd-client.workspace = true omicron-workspace-hack.workspace = true [[bin]] @@ -83,4 +84,3 @@ tar.workspace = true tokio = { workspace = true, features = ["test-util"] } tufaceous.workspace = true wicket.workspace = true -wicketd-client.workspace = true diff --git a/wicketd/src/bin/wicketd.rs b/wicketd/src/bin/wicketd.rs index 887ac496e0..24fa802c79 100644 --- a/wicketd/src/bin/wicketd.rs +++ b/wicketd/src/bin/wicketd.rs @@ -5,6 +5,7 @@ //! Executable for wicketd: technician port based management service use anyhow::{anyhow, Context}; +use camino::Utf8PathBuf; use clap::Parser; use omicron_common::{ address::Ipv6Subnet, @@ -24,9 +25,9 @@ enum Args { /// Start a wicketd server Run { #[clap(name = "CONFIG_FILE_PATH", action)] - config_file_path: PathBuf, + config_file_path: Utf8PathBuf, - /// The address for the technician port + /// The address on which the main wicketd dropshot server should listen #[clap(short, long, action)] address: SocketAddrV6, @@ -57,6 +58,19 @@ enum Args { #[clap(long, action, conflicts_with("read_smf_config"))] rack_subnet: Option, }, + + /// Instruct a running wicketd server to refresh its config + /// + /// Mechanically, this hits a specific endpoint served by wicketd's dropshot + /// server + RefreshConfig { + #[clap(name = "CONFIG_FILE_PATH", action)] + config_file_path: Utf8PathBuf, + + /// The address of the server to refresh + #[clap(short, long, action)] + address: SocketAddrV6, + }, } #[tokio::main] @@ -104,9 +118,7 @@ async fn do_run() -> Result<(), CmdError> { }; let config = Config::from_file(&config_file_path) - .with_context(|| { - format!("failed to parse {}", config_file_path.display()) - }) + .with_context(|| format!("failed to parse {config_file_path}")) .map_err(CmdError::Failure)?; let rack_subnet = match rack_subnet { @@ -140,5 +152,24 @@ async fn do_run() -> Result<(), CmdError> { .await .map_err(|err| CmdError::Failure(anyhow!(err))) } + Args::RefreshConfig { config_file_path, address } => { + let config = Config::from_file(&config_file_path) + .with_context(|| format!("failed to parse {config_file_path}")) + .map_err(CmdError::Failure)?; + + let log = config + .log + .to_logger("wicketd") + .context("failed to initialize logger") + .map_err(CmdError::Failure)?; + + // When run via `svcadm refresh ...`, we need to respect the special + // [SMF exit codes](https://illumos.org/man/7/smf_method). Returning + // an error from main exits with code 1 (from libc::EXIT_FAILURE), + // which does not collide with any special SMF codes. + Server::refresh_config(log, address) + .await + .map_err(CmdError::Failure) + } } } diff --git a/wicketd/src/lib.rs b/wicketd/src/lib.rs index ada1902654..32188d77de 100644 --- a/wicketd/src/lib.rs +++ b/wicketd/src/lib.rs @@ -16,11 +16,12 @@ mod preflight_check; mod rss_config; mod update_tracker; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; use artifacts::{WicketdArtifactServer, WicketdArtifactStore}; use bootstrap_addrs::BootstrapPeers; pub use config::Config; pub(crate) use context::ServerContext; +use display_error_chain::DisplayErrorChain; use dropshot::{ConfigDropshot, HandlerTaskMode, HttpServer}; pub use installinator_progress::{IprUpdateTracker, RunningUpdateState}; use internal_dns::resolver::Resolver; @@ -34,6 +35,7 @@ use preflight_check::PreflightCheckerHandler; use sled_hardware::Baseboard; use slog::{debug, error, o, Drain}; use std::sync::{Mutex, OnceLock}; +use std::time::Duration; use std::{ net::{SocketAddr, SocketAddrV6}, sync::Arc, @@ -70,7 +72,6 @@ pub struct SmfConfigValues { impl SmfConfigValues { #[cfg(target_os = "illumos")] pub fn read_current() -> Result { - use anyhow::Context; use illumos_utils::scf::ScfHandle; const CONFIG_PG: &str = "config"; @@ -259,11 +260,70 @@ impl Server { res = self.artifact_server => { match res { Ok(()) => Err("artifact server exited unexpectedly".to_owned()), - // The artifact server returns an anyhow::Error, which has a `Debug` impl that - // prints out the chain of errors. + // The artifact server returns an anyhow::Error, which has a + // `Debug` impl that prints out the chain of errors. Err(err) => Err(format!("running artifact server: {err:?}")), } } } } + + /// Instruct a running server at the specified address to reload its config + /// parameters + pub async fn refresh_config( + log: slog::Logger, + address: SocketAddrV6, + ) -> Result<()> { + // It's possible we're being told to refresh a server's config before + // it's ready to receive such a request, so we'll give it a healthy + // amount of time before we give up: we'll set a client timeout and also + // retry a few times. See + // https://github.com/oxidecomputer/omicron/issues/4604. + const CLIENT_TIMEOUT: Duration = Duration::from_secs(5); + const SLEEP_BETWEEN_RETRIES: Duration = Duration::from_secs(10); + const NUM_RETRIES: usize = 3; + + let client = reqwest::Client::builder() + .connect_timeout(CLIENT_TIMEOUT) + .timeout(CLIENT_TIMEOUT) + .build() + .context("failed to construct reqwest Client")?; + + let client = wicketd_client::Client::new_with_client( + &format!("http://{address}"), + client, + log, + ); + let log = client.inner(); + + let mut attempt = 0; + loop { + attempt += 1; + + // If we succeed, we're done. + let Err(err) = client.post_reload_config().await else { + return Ok(()); + }; + + // If we failed, either warn+sleep and try again, or fail. + if attempt < NUM_RETRIES { + slog::warn!( + log, + "failed to refresh wicketd config \ + (attempt {attempt} of {NUM_RETRIES}); \ + will retry after {CLIENT_TIMEOUT:?}"; + "err" => %DisplayErrorChain::new(&err), + ); + tokio::time::sleep(SLEEP_BETWEEN_RETRIES).await; + } else { + slog::error!( + log, + "failed to refresh wicketd config \ + (tried {NUM_RETRIES} times)"; + "err" => %DisplayErrorChain::new(&err), + ); + return Err(err).context("failed to contact wicketd"); + } + } + } }