From f55b44f65adcd02a8cd59e55e270cc1f3d86d6e2 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 5 Jul 2022 17:46:33 +0000 Subject: [PATCH] Better cleanup of networking resources - Remove IP addresses, VNICs, and etherstub during `omicron-package uninstall` - Fail `destroy_virtual_hardware.sh` if Omicron zones are still installed - In `destroy_virtual_hardware.sh`, also unload xde driver and delete interfaces over the simulated Chelsio VNICs - Update how-to-run.adoc. --- Cargo.lock | 3 + docs/how-to-run.adoc | 14 ++-- package/Cargo.toml | 3 + package/src/bin/omicron-package.rs | 67 +++++++++++------ sled-agent/src/common/underlay.rs | 2 +- sled-agent/src/illumos/dladm.rs | 45 +++++++++++ sled-agent/src/illumos/zone.rs | 2 +- sled-agent/src/lib.rs | 1 + sled-agent/src/opte/opte.rs | 1 + sled-agent/src/sled_agent.rs | 115 ++++++++++++++++++++++++----- tools/create_virtual_hardware.sh | 2 + tools/destroy_virtual_hardware.sh | 51 ++++++++++++- 12 files changed, 252 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0dafce1bc3..e9b7c6c99e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3151,6 +3151,9 @@ dependencies = [ "ring", "serde", "serde_derive", + "slog", + "slog-async", + "slog-term", "smf", "tar", "tempfile", diff --git a/docs/how-to-run.adoc b/docs/how-to-run.adoc index 58d3d3a07f..3c8647259e 100644 --- a/docs/how-to-run.adoc +++ b/docs/how-to-run.adoc @@ -25,7 +25,7 @@ Any additional prerequisite software may be installed with the following script: $ ./tools/install_prerequisites.sh ---- -=== Make me a Gimlet! +=== Make (or unmake) me a Gimlet! The sled agent expects to manage a real Gimlet. However, until those are built, developers generally make do with something else, usually a commodity machine. @@ -35,13 +35,11 @@ file-based ZFS vdevs and ZFS zpools on top of those, and a couple of VNICs. The vdevs model the actual U.2s that will be in a Gimlet, and the VNICs model the two Chelsio NIC ports. -You can clean up these resources with `./tools/destroy_virtual_hardware.sh`. Be -aware that this is a best effort script. If the sled agent or other Omicron -zones are still running, the zpools cannot be deleted. The script will warn you -about the things it cannot remove, so you can do so yourself. Also note that all -the networking bits are temporary, so a reboot should always clear them. - -Both scripts must be run as root, e.g, `pfexec ./tools/create_virtual_hardware.sh`. +You can clean up these resources with `./tools/destroy_virtual_hardware.sh`. +This script requires Omicron be uninstalled, e.g., with `pfexec +./target/release/omicron-package uninstall`, and a warning will be printed if +that is not the case. The script will then remove the file-based vdevs and the +VNICs created by `create_virtual_hardware.sh`. == Deploying Omicron diff --git a/package/Cargo.toml b/package/Cargo.toml index 62f3e945c0..770347ac2d 100644 --- a/package/Cargo.toml +++ b/package/Cargo.toml @@ -19,6 +19,9 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls" ring = "0.16" serde = { version = "1.0", features = [ "derive" ] } serde_derive = "1.0" +slog = { version = "2.5", features = [ "max_level_trace", "release_max_level_debug"] } +slog-async = "2.7" +slog-term = "2.9" smf = "0.2" tar = "0.4" tempfile = "3.3" diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index c794dd2fdb..a2edba2b71 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -9,6 +9,7 @@ use clap::{Parser, Subcommand}; use futures::stream::{self, StreamExt, TryStreamExt}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use omicron_package::{parse, BuildCommand, DeployCommand}; +use omicron_sled_agent::cleanup_networking_resources; use omicron_sled_agent::zone; use omicron_zone_package::config::Config; use omicron_zone_package::config::ExternalPackage; @@ -17,6 +18,10 @@ use omicron_zone_package::package::Package; use omicron_zone_package::package::Progress; use rayon::prelude::*; use ring::digest::{Context as DigestContext, Digest, SHA256}; +use slog::info; +use slog::o; +use slog::Drain; +use slog::Logger; use std::env; use std::fs::create_dir_all; use std::path::{Path, PathBuf}; @@ -298,6 +303,7 @@ fn do_install( config: &Config, artifact_dir: &Path, install_dir: &Path, + log: &Logger, ) -> Result<()> { create_dir_all(&install_dir).map_err(|err| { anyhow!("Cannot create installation directory: {}", err) @@ -323,10 +329,11 @@ fn do_install( let src = tarfile.as_path(); let dst = install_dir.join(src.strip_prefix(artifact_dir)?); - println!( - "Installing Service: {} -> {}", - src.to_string_lossy(), - dst.to_string_lossy() + info!( + log, + "Installing service"; + "src" => %src.to_string_lossy(), + "dst" => %dst.to_string_lossy(), ); std::fs::copy(&src, &dst)?; Ok(()) @@ -346,10 +353,11 @@ fn do_install( for service_name in global_zone_service_names { let tar_path = install_dir.join(format!("{}.tar", service_name)); let service_path = install_dir.join(service_name); - println!( - "Unpacking {} to {}", - tar_path.to_string_lossy(), - service_path.to_string_lossy() + info!( + log, + "Unpacking service tarball"; + "tar_path" => %tar_path.to_string_lossy(), + "service_path" => %service_path.to_string_lossy(), ); let tar_file = std::fs::File::open(&tar_path)?; @@ -366,8 +374,9 @@ fn do_install( .join(&package.service_name) .join("pkg") .join("manifest.xml"); - println!( - "Installing bootstrap service from {}", + info!( + log, + "Installing boostrap service from {}", manifest_path.to_string_lossy() ); smf::Config::import().run(&manifest_path)?; @@ -425,7 +434,11 @@ fn remove_all_unless_already_removed>(path: P) -> Result<()> { Ok(()) } -fn remove_all_except>(path: P, to_keep: &[&str]) -> Result<()> { +fn remove_all_except>( + path: P, + to_keep: &[&str], + log: &Logger, +) -> Result<()> { let dir = match path.as_ref().read_dir() { Ok(dir) => dir, Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(()), @@ -434,9 +447,9 @@ fn remove_all_except>(path: P, to_keep: &[&str]) -> Result<()> { for entry in dir { let entry = entry?; if to_keep.contains(&&*(entry.file_name().to_string_lossy())) { - println!(" Keeping: '{}'", entry.path().to_string_lossy()); + info!(log, "Keeping: '{}'", entry.path().to_string_lossy()); } else { - println!(" Removing: '{}'", entry.path().to_string_lossy()); + info!(log, "Removing: '{}'", entry.path().to_string_lossy()); if entry.metadata()?.is_dir() { remove_all_unless_already_removed(entry.path())?; } else { @@ -447,27 +460,32 @@ fn remove_all_except>(path: P, to_keep: &[&str]) -> Result<()> { Ok(()) } -fn do_uninstall( +async fn do_uninstall( config: &Config, artifact_dir: &Path, install_dir: &Path, + log: &Logger, ) -> Result<()> { - println!("Removing all Omicron zones"); + info!(log, "Removing all Omicron zones"); uninstall_all_omicron_zones()?; - println!("Uninstalling all packages"); + info!(log, "Uninstalling all packages"); uninstall_all_packages(config); - println!("Removing artifacts in: {}", artifact_dir.to_string_lossy()); + info!(log, "Removing artifacts in: {}", artifact_dir.to_string_lossy()); const ARTIFACTS_TO_KEEP: &[&str] = &["clickhouse", "cockroachdb", "xde", "console-assets", "downloads"]; - remove_all_except(artifact_dir, ARTIFACTS_TO_KEEP)?; + remove_all_except(artifact_dir, ARTIFACTS_TO_KEEP, log)?; - println!( + info!( + log, "Removing installed objects in: {}", install_dir.to_string_lossy() ); const INSTALLED_OBJECTS_TO_KEEP: &[&str] = &["opte"]; - remove_all_except(install_dir, INSTALLED_OBJECTS_TO_KEEP)?; + remove_all_except(install_dir, INSTALLED_OBJECTS_TO_KEEP, log)?; + + cleanup_networking_resources(log).await?; + Ok(()) } @@ -545,6 +563,11 @@ async fn main() -> Result<()> { let args = Args::try_parse()?; let config = parse::<_, Config>(&args.manifest)?; + let decorator = slog_term::TermDecorator::new().build(); + let drain = slog_term::CompactFormat::new(decorator).build().fuse(); + let drain = slog_async::Async::new(drain).build().fuse(); + let log = slog::Logger::root(drain, o!()); + // Use a CWD that is the root of the Omicron repository. if let Ok(manifest) = env::var("CARGO_MANIFEST_DIR") { let manifest_dir = PathBuf::from(manifest); @@ -561,13 +584,13 @@ async fn main() -> Result<()> { artifact_dir, install_dir, }) => { - do_install(&config, &artifact_dir, &install_dir)?; + do_install(&config, &artifact_dir, &install_dir, &log)?; } SubCommand::Deploy(DeployCommand::Uninstall { artifact_dir, install_dir, }) => { - do_uninstall(&config, &artifact_dir, &install_dir)?; + do_uninstall(&config, &artifact_dir, &install_dir, &log).await?; } } diff --git a/sled-agent/src/common/underlay.rs b/sled-agent/src/common/underlay.rs index 42e22f6691..2eaa1dce86 100644 --- a/sled-agent/src/common/underlay.rs +++ b/sled-agent/src/common/underlay.rs @@ -36,7 +36,7 @@ pub fn find_nics() -> Result, Error> { Ok(addr_objs) } -fn find_chelsio_links() -> Result, Error> { +pub fn find_chelsio_links() -> Result, Error> { // TODO-correctness: This should eventually be determined by a call to // `Dladm` to get the real Chelsio links on a Gimlet. These will likely be // called `cxgbeN`, but we explicitly call them `netN` to be clear that diff --git a/sled-agent/src/illumos/dladm.rs b/sled-agent/src/illumos/dladm.rs index 2df5edc958..c2e19d7297 100644 --- a/sled-agent/src/illumos/dladm.rs +++ b/sled-agent/src/illumos/dladm.rs @@ -6,6 +6,7 @@ use crate::common::vlan::VlanID; use crate::illumos::vnic::VnicKind; +use crate::illumos::zone::IPADM; use crate::illumos::{execute, ExecutionError, PFEXEC}; use omicron_common::api::external::MacAddr; use serde::{Deserialize, Serialize}; @@ -153,6 +154,50 @@ impl Dladm { Ok(EtherstubVnic(ETHERSTUB_VNIC_NAME.to_string())) } + // Return the name of the IP interface over the etherstub VNIC, if it + // exists. + fn get_etherstub_vnic_interface() -> Result { + let mut cmd = std::process::Command::new(PFEXEC); + let cmd = cmd.args(&[ + IPADM, + "show-if", + "-p", + "-o", + "IFNAME", + ETHERSTUB_VNIC_NAME, + ]); + execute(cmd)?; + Ok(ETHERSTUB_VNIC_NAME.to_string()) + } + + /// Delete the VNIC over the inter-zone comms etherstub. + pub(crate) fn delete_etherstub_vnic() -> Result<(), ExecutionError> { + // It's not clear why, but this requires deleting the _interface_ that's + // over the VNIC first. Other VNICs don't require this for some reason. + if Self::get_etherstub_vnic_interface().is_ok() { + let mut cmd = std::process::Command::new(PFEXEC); + let cmd = cmd.args(&[IPADM, "delete-if", ETHERSTUB_VNIC_NAME]); + execute(cmd)?; + } + + if Self::get_etherstub_vnic().is_ok() { + let mut cmd = std::process::Command::new(PFEXEC); + let cmd = cmd.args(&[DLADM, "delete-vnic", ETHERSTUB_VNIC_NAME]); + execute(cmd)?; + } + Ok(()) + } + + /// Delete the inter-zone comms etherstub. + pub(crate) fn delete_etherstub() -> Result<(), ExecutionError> { + if Self::get_etherstub().is_ok() { + let mut cmd = std::process::Command::new(PFEXEC); + let cmd = cmd.args(&[DLADM, "delete-etherstub", ETHERSTUB_NAME]); + execute(cmd)?; + } + Ok(()) + } + /// Returns the name of the first observed physical data link. pub fn find_physical() -> Result { let mut command = std::process::Command::new(PFEXEC); diff --git a/sled-agent/src/illumos/zone.rs b/sled-agent/src/illumos/zone.rs index 4c7ba3baaa..4fb0561d86 100644 --- a/sled-agent/src/illumos/zone.rs +++ b/sled-agent/src/illumos/zone.rs @@ -16,7 +16,7 @@ use crate::illumos::{execute, PFEXEC}; use omicron_common::address::SLED_PREFIX; const DLADM: &str = "/usr/sbin/dladm"; -const IPADM: &str = "/usr/sbin/ipadm"; +pub const IPADM: &str = "/usr/sbin/ipadm"; pub const SVCADM: &str = "/usr/sbin/svcadm"; pub const SVCCFG: &str = "/usr/sbin/svccfg"; pub const ZLOGIN: &str = "/usr/sbin/zlogin"; diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 462a621b90..602293cab2 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -38,6 +38,7 @@ mod storage_manager; mod updates; pub use illumos::zone; +pub use sled_agent::cleanup_networking_resources; #[cfg(test)] mod mocks; diff --git a/sled-agent/src/opte/opte.rs b/sled-agent/src/opte/opte.rs index db91c9377a..8ca00be1b7 100644 --- a/sled-agent/src/opte/opte.rs +++ b/sled-agent/src/opte/opte.rs @@ -320,6 +320,7 @@ impl Drop for OptePort { /// Delete all xde devices on the system. pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { + // Delete the VNICs as well. let hdl = OpteHdl::open(OpteHdl::DLD_CTL)?; for port_info in hdl.list_ports()?.ports.into_iter() { let name = &port_info.name; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index e385d2271a..903a10369b 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -9,6 +9,7 @@ use crate::illumos::vnic::VnicKind; use crate::illumos::zfs::{ Mountpoint, ZONE_ZFS_DATASET, ZONE_ZFS_DATASET_MOUNTPOINT, }; +use crate::illumos::zone::IPADM; use crate::illumos::{execute, PFEXEC}; use crate::instance_manager::InstanceManager; use crate::nexus::NexusClient; @@ -26,6 +27,7 @@ use omicron_common::api::{ }; use slog::Logger; use std::net::SocketAddrV6; +use std::process::Command; use std::sync::Arc; use uuid::Uuid; @@ -57,6 +59,12 @@ pub enum Error { #[error("Failed to delete VNIC on boot: {0}")] DeleteVnic(#[from] crate::illumos::dladm::DeleteVnicError), + #[error("Failed to remove Omicron address: {0}")] + DeleteAddress(#[from] crate::illumos::ExecutionError), + + #[error("Failed to operate on underlay device: {0}")] + Underlay(#[from] crate::common::underlay::Error), + #[error(transparent)] Services(#[from] crate::services::Error), @@ -197,24 +205,7 @@ impl SledAgent { // Note that we don't currently delete the VNICs in any particular // order. That should be OK, since we're definitely deleting the guest // VNICs before the xde devices, which is the main constraint. - let vnics = Dladm::get_vnics()?; - stream::iter(vnics) - .zip(stream::iter(std::iter::repeat(log.clone()))) - .map(Ok::<_, crate::illumos::dladm::DeleteVnicError>) - .try_for_each_concurrent(None, |(vnic, log)| async { - tokio::task::spawn_blocking(move || { - warn!( - log, - "Deleting existing VNIC"; - "vnic_name" => &vnic, - "vnic_kind" => ?VnicKind::from_name(&vnic).unwrap(), - ); - Dladm::delete_vnic(&vnic) - }) - .await - .unwrap() - }) - .await?; + delete_omicron_vnics(&log).await?; // Also delete any extant xde devices. These should also eventually be // recovered / tracked, to avoid interruption of any guests that are @@ -362,3 +353,91 @@ impl SledAgent { .map_err(Error::from) } } + +// Delete all underlay addresses created directly over the etherstub VNIC used +// for inter-zone communications. +fn delete_etherstub_addresses(log: &Logger) -> Result<(), Error> { + let prefix = format!("{}/", crate::illumos::dladm::ETHERSTUB_VNIC_NAME); + delete_addresses_matching_prefixes(log, &[prefix]) +} + +fn delete_underlay_addresses(log: &Logger) -> Result<(), Error> { + use crate::illumos::dladm::VnicSource; + let prefixes = crate::common::underlay::find_chelsio_links()? + .into_iter() + .map(|link| format!("{}/", link.name())) + .collect::>(); + delete_addresses_matching_prefixes(log, &prefixes) +} + +fn delete_addresses_matching_prefixes( + log: &Logger, + prefixes: &[String], +) -> Result<(), Error> { + use std::io::BufRead; + let mut cmd = Command::new(PFEXEC); + let cmd = cmd.args(&[IPADM, "show-addr", "-p", "-o", "ADDROBJ"]); + let output = execute(cmd)?; + for line in output.stdout.lines() { + if let Ok(ref addrobj) = line { + warn!( + log, + "Deleting existing Omicron IP address"; + "addrobj" => addrobj, + ); + if prefixes.iter().any(|prefix| addrobj.starts_with(prefix)) { + let mut cmd = Command::new(PFEXEC); + let cmd = cmd.args(&[IPADM, "delete-addr", addrobj.as_str()]); + execute(cmd)?; + } + } + } + Ok(()) +} + +// Delete all VNICs that can be managed by the control plane. +// +// These are currently those that match the prefix `ox` or `vopte`. +async fn delete_omicron_vnics(log: &Logger) -> Result<(), Error> { + let vnics = Dladm::get_vnics()?; + stream::iter(vnics) + .zip(stream::iter(std::iter::repeat(log.clone()))) + .map(Ok::<_, crate::illumos::dladm::DeleteVnicError>) + .try_for_each_concurrent(None, |(vnic, log)| async { + tokio::task::spawn_blocking(move || { + warn!( + log, + "Deleting existing VNIC"; + "vnic_name" => &vnic, + "vnic_kind" => ?VnicKind::from_name(&vnic).unwrap(), + ); + Dladm::delete_vnic(&vnic) + }) + .await + .unwrap() + }) + .await?; + Ok(()) +} + +// Delete the etherstub and underlay VNIC used for interzone communication +fn delete_etherstub(log: &Logger) -> Result<(), Error> { + use crate::illumos::dladm::ETHERSTUB_NAME; + use crate::illumos::dladm::ETHERSTUB_VNIC_NAME; + warn!(log, "Deleting Omicron underlay VNIC"; "vnic_name" => ETHERSTUB_VNIC_NAME); + Dladm::delete_etherstub_vnic()?; + warn!(log, "Deleting Omicron etherstub"; "stub_name" => ETHERSTUB_NAME); + Dladm::delete_etherstub()?; + Ok(()) +} + +/// Delete all networking resources installed by the sled agent, in the global +/// zone. +pub async fn cleanup_networking_resources(log: &Logger) -> Result<(), Error> { + delete_etherstub_addresses(log)?; + delete_underlay_addresses(log)?; + delete_omicron_vnics(log).await?; + delete_etherstub(log)?; + crate::opte::delete_all_xde_devices(log)?; + Ok(()) +} diff --git a/tools/create_virtual_hardware.sh b/tools/create_virtual_hardware.sh index f3cb160b60..8dc44ffdd6 100755 --- a/tools/create_virtual_hardware.sh +++ b/tools/create_virtual_hardware.sh @@ -32,7 +32,9 @@ fi echo "Using $PHYSICAL_LINK as physical link" function success { + set +x echo -e "\e[1;36m$1\e[0m" + set -x } # Create the ZFS zpools required for the sled agent, backed by file-based vdevs. diff --git a/tools/destroy_virtual_hardware.sh b/tools/destroy_virtual_hardware.sh index 3dd9a154af..75746fb2b7 100755 --- a/tools/destroy_virtual_hardware.sh +++ b/tools/destroy_virtual_hardware.sh @@ -26,11 +26,38 @@ if [[ "$(id -u)" -ne 0 ]]; then fi function warn { + set +x echo -e "\e[1;31m$1\e[0m" + set -x } function success { + set +x echo -e "\e[1;36m$1\e[0m" + set -x +} + +function verify_omicron_uninstalled { + svcs "svc:/system/illumos/sled-agent:default" 2>&1 > /dev/null + if [[ $? -eq 0 ]]; then + set +x + warn "Omicron is still installed, please run \`omicron-package uninstall\`, and then re-run this script" + exit 1 + fi +} + +function unload_xde_driver { + local ID="$(modinfo | grep xde | cut -d ' ' -f 1)" + if [[ "$ID" ]]; then + local RC=0 + modunload -i "$ID" + RC=$? + if [[ $RC -ne 0 ]]; then + warn "Failed to unload xde driver" + exit 1 + fi + fi + success "Verified the xde kernel driver is unloaded" } function try_remove_address { @@ -41,12 +68,26 @@ function try_remove_address { RC=$? fi if [[ $RC -eq 0 ]]; then - success "Address $ADDRESS destroyed" + success "Verified address $ADDRESS does not exist" else warn "Failed to delete address $ADDRESS" fi } +function try_remove_interface { + local IFACE="$1" + RC=0 + if [[ "$(ipadm show-if -p -o IFNAME "$IFACE")" ]]; then + ipadm delete-if "$IFACE" + RC=$? + fi + if [[ $RC -eq 0 ]]; then + success "Verified IP interface $IFACE does not exist" + else + warn "Failed to delete interface $IFACE" + fi +} + function try_remove_vnic { local LINK="$1" RC=0 @@ -55,7 +96,7 @@ function try_remove_vnic { RC=$? fi if [[ $RC -eq 0 ]]; then - success "VNIC link $LINK destroyed" + success "Verified VNIC link $LINK does not exist" else warn "Failed to delete VNIC link $LINK" fi @@ -65,7 +106,7 @@ function try_remove_vnics { try_remove_address "lo0/underlay" VNIC_LINKS=("net0" "net1") for LINK in "${VNIC_LINKS[@]}"; do - try_remove_vnic "$LINK" + try_remove_interface "$LINK" && try_remove_vnic "$LINK" done } @@ -78,12 +119,14 @@ function try_destroy_zpools { RC=$? if [[ $RC -eq 0 ]]; then - success "Removed ZFS pool and vdev: $ZPOOL" + success "Verified ZFS pool and vdev $ZPOOL does not exist" else warn "Failed to remove ZFS pool and vdev: $ZPOOL" fi done } +verify_omicron_uninstalled +unload_xde_driver try_remove_vnics try_destroy_zpools