From 637dbdefec50c166109d70f5e7085710ca372849 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Fri, 8 Jul 2022 21:51:02 +0000 Subject: [PATCH 1/7] Reorganize OPTE port management - Simplify platform deps and module structure - Add new Port and PortManager types. The port manager is a centralized object to manage all OPTE ports. This is currently required in order to support correctly implementing the external IP workaround, which requires keeping track of all current MAC addresses for guest interfaces. This is all modeled after the instance / instance manager relationship, where ports remove themselves from the manager on drop. - Add better logging - Add better handling of the overlay VNIC, currently also required for OPTE to work with Viona. --- common/src/api/external/mod.rs | 6 + sled-agent/src/illumos/dladm.rs | 34 ++ sled-agent/src/illumos/running_zone.rs | 14 +- sled-agent/src/instance.rs | 122 +++--- sled-agent/src/instance_manager.rs | 26 +- sled-agent/src/opte/mock_opte.rs | 194 --------- sled-agent/src/opte/mod.rs | 260 +++++++++++- sled-agent/src/opte/opte.rs | 473 ---------------------- sled-agent/src/opte/port.rs | 158 ++++++++ sled-agent/src/opte/port_manager.rs | 537 +++++++++++++++++++++++++ tools/populate/populate-images.sh | 4 + 11 files changed, 1075 insertions(+), 753 deletions(-) delete mode 100644 sled-agent/src/opte/mock_opte.rs delete mode 100644 sled-agent/src/opte/opte.rs create mode 100644 sled-agent/src/opte/port.rs create mode 100644 sled-agent/src/opte/port_manager.rs diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 83bb373b2a..c689ba4945 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -1716,6 +1716,12 @@ impl JsonSchema for L4PortRange { )] pub struct MacAddr(pub macaddr::MacAddr6); +impl From for MacAddr { + fn from(mac: macaddr::MacAddr6) -> Self { + Self(mac) + } +} + impl FromStr for MacAddr { type Err = macaddr::ParseError; diff --git 
a/sled-agent/src/illumos/dladm.rs b/sled-agent/src/illumos/dladm.rs index d9f7975d0d..af74298c0a 100644 --- a/sled-agent/src/illumos/dladm.rs +++ b/sled-agent/src/illumos/dladm.rs @@ -90,6 +90,18 @@ pub struct SetLinkpropError { err: ExecutionError, } +/// Errors returned from [`Dladm::reset_linkprop`]. +#[derive(thiserror::Error, Debug)] +#[error( + "Failed to reset link property \"{prop_name}\" on vnic {link_name}: {err}" +)] +pub struct ResetLinkpropError { + link_name: String, + prop_name: String, + #[source] + err: ExecutionError, +} + /// The name of a physical datalink. #[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub struct PhysicalLink(pub String); @@ -346,4 +358,26 @@ impl Dladm { })?; Ok(()) } + + /// Reset a link property on a VNIC + pub fn reset_linkprop( + vnic: &str, + prop_name: &str, + ) -> Result<(), ResetLinkpropError> { + let mut command = std::process::Command::new(PFEXEC); + let cmd = command.args(&[ + DLADM, + "reset-linkprop", + "-t", + "-p", + prop_name, + vnic, + ]); + execute(cmd).map_err(|err| ResetLinkpropError { + link_name: vnic.to_string(), + prop_name: prop_name.to_string(), + err, + })?; + Ok(()) + } } diff --git a/sled-agent/src/illumos/running_zone.rs b/sled-agent/src/illumos/running_zone.rs index 75316eecfb..b03bb42be3 100644 --- a/sled-agent/src/illumos/running_zone.rs +++ b/sled-agent/src/illumos/running_zone.rs @@ -9,7 +9,7 @@ use crate::illumos::dladm::Etherstub; use crate::illumos::svc::wait_for_service; use crate::illumos::vnic::{Vnic, VnicAllocator}; use crate::illumos::zone::{AddressRequest, ZONE_PREFIX}; -use crate::opte::OptePort; +use crate::opte::Port; use ipnetwork::IpNetwork; use slog::Logger; use std::net::{Ipv4Addr, Ipv6Addr}; @@ -292,16 +292,14 @@ impl RunningZone { log: log.new(o!("zone" => zone_name.to_string())), name: zone_name.to_string(), control_vnic, - // TODO(https://github.com/oxidecomputer/omicron/issues/725) - // - // Re-initialize guest_vnic state by inspecting the zone. 
opte_ports: vec![], physical_nic: None, }, }) } - pub fn get_opte_ports(&self) -> &Vec { + /// Return references to the OPTE ports for this zone. + pub fn opte_ports(&self) -> &[Port] { &self.inner.opte_ports } } @@ -348,7 +346,7 @@ pub struct InstalledZone { control_vnic: Vnic, // OPTE devices for the guest network interfaces - opte_ports: Vec, + opte_ports: Vec, // Physical NIC possibly provisioned to the zone. // TODO: Remove once Nexus traffic is transmitted over OPTE. @@ -385,7 +383,7 @@ impl InstalledZone { unique_name: Option<&str>, datasets: &[zone::Dataset], devices: &[zone::Device], - opte_ports: Vec, + opte_ports: Vec, physical_nic: Option, ) -> Result { let control_vnic = vnic_allocator.new_control(None).map_err(|err| { @@ -401,7 +399,7 @@ impl InstalledZone { let net_device_names: Vec = opte_ports .iter() - .map(|port| port.vnic().name().to_string()) + .map(|port| port.vnic_name().to_string()) .chain(std::iter::once(control_vnic.name().to_string())) .chain(physical_nic.as_ref().map(|vnic| vnic.name().to_string())) .collect(); diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 43ab4dc14c..82215735ad 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -14,8 +14,8 @@ use crate::illumos::vnic::VnicAllocator; use crate::illumos::zone::{AddressRequest, PROPOLIS_ZONE_PREFIX}; use crate::instance_manager::InstanceTicket; use crate::nexus::NexusClient; -use crate::opte::OptePort; -use crate::opte::OptePortAllocator; +use crate::opte::PortManager; +use crate::opte::PortTicket; use crate::params::ExternalIp; use crate::params::NetworkInterface; use crate::params::{ @@ -32,7 +32,6 @@ use propolis_client::api::DiskRequest; use propolis_client::Client as PropolisClient; use slog::Logger; use std::net::IpAddr; -use std::net::Ipv6Addr; use std::net::SocketAddr; use std::sync::Arc; use tokio::task::JoinHandle; @@ -157,7 +156,9 @@ struct RunningState { // Connection to Propolis. 
client: Arc, // Object representing membership in the "instance manager". - ticket: InstanceTicket, + instance_ticket: InstanceTicket, + // Object representing the instance's OPTE ports in the port manager + port_ticket: Option, // Handle to task monitoring for Propolis state changes. monitor_task: Option>, // Handle to the zone. @@ -190,6 +191,7 @@ impl Drop for RunningState { struct PropolisSetup { client: Arc, running_zone: RunningZone, + port_ticket: Option, } struct InstanceInner { @@ -207,9 +209,11 @@ struct InstanceInner { // NIC-related properties vnic_allocator: VnicAllocator, - // OPTE port related properties - underlay_addr: Ipv6Addr, - port_allocator: OptePortAllocator, + // Reference to the port manager for creating OPTE ports when starting the + // instance + port_manager: PortManager, + + // Guest NIC and OPTE port information requested_nics: Vec, external_ip: ExternalIp, @@ -288,21 +292,20 @@ impl InstanceInner { async fn ensure( &mut self, instance: Instance, - ticket: InstanceTicket, + instance_ticket: InstanceTicket, setup: PropolisSetup, migrate: Option, ) -> Result<(), Error> { - let PropolisSetup { client, running_zone } = setup; + let PropolisSetup { client, running_zone, port_ticket } = setup; - let nics = self - .requested_nics + let nics = running_zone + .opte_ports() .iter() - .zip(running_zone.get_opte_ports().iter()) - .map(|(nic, port)| propolis_client::api::NetworkInterfaceRequest { + .map(|port| propolis_client::api::NetworkInterfaceRequest { // TODO-correctness: Remove `.vnic()` call when we use the port // directly. - name: port.vnic().name().to_string(), - slot: propolis_client::api::Slot(nic.slot), + name: port.vnic_name().to_string(), + slot: propolis_client::api::Slot(port.slot()), }) .collect(); @@ -350,7 +353,8 @@ impl InstanceInner { self.running_state = Some(RunningState { client, - ticket, + instance_ticket, + port_ticket, monitor_task, _running_zone: running_zone, }); @@ -401,15 +405,14 @@ mockall::mock! 
{ pub fn new( log: Logger, id: Uuid, - vnic_allocator: VnicAllocator, - underlay_addr: Ipv6Addr, - port_allocator: OptePortAllocator, initial: InstanceHardware, + vnic_allocator: VnicAllocator, + port_manager: PortManager, nexus_client: Arc, ) -> Result; pub async fn start( &self, - ticket: InstanceTicket, + instance_ticket: InstanceTicket, migrate: Option, ) -> Result<(), Error>; pub async fn transition( @@ -434,23 +437,20 @@ impl Instance { /// Arguments: /// * `log`: Logger for dumping debug information. /// * `id`: UUID of the instance to be created. + /// * `initial`: State of the instance at initialization time. /// * `vnic_allocator`: A unique (to the sled) ID generator to /// refer to a VNIC. (This exists because of a restriction on VNIC name /// lengths, otherwise the UUID would be used instead). - /// * `underlay_addr`: The IPv6 underlay address for the sled hosting this - /// instance. - /// * `port_allocator`: A unique (to the sled) ID generator to - /// refer to an OPTE port for the guest network interfaces. - /// * `initial`: State of the instance at initialization time. + /// * `port_manager`: Handle to the object responsible for managing OPTE + /// ports. /// * `nexus_client`: Connection to Nexus, used for sending notifications. // TODO: This arg list is getting a little long; can we clean this up? 
pub fn new( log: Logger, id: Uuid, - vnic_allocator: VnicAllocator, - underlay_addr: Ipv6Addr, - port_allocator: OptePortAllocator, initial: InstanceHardware, + vnic_allocator: VnicAllocator, + port_manager: PortManager, nexus_client: Arc, ) -> Result { info!(log, "Instance::new w/initial HW: {:?}", initial); @@ -472,8 +472,7 @@ impl Instance { propolis_id: initial.runtime.propolis_id, propolis_ip: initial.runtime.propolis_addr.unwrap().ip(), vnic_allocator, - underlay_addr, - port_allocator, + port_manager, requested_nics: initial.nics, external_ip: initial.external_ip, requested_disks: initial.disks, @@ -489,34 +488,23 @@ impl Instance { Ok(Instance { inner }) } - fn create_opte_ports( + async fn setup_propolis_locked( &self, inner: &mut MutexGuard<'_, InstanceInner>, - ) -> Result, Error> { - let mut ports = Vec::with_capacity(inner.requested_nics.len()); + ) -> Result { + // Create OPTE ports for the instance + let mut opte_ports = Vec::with_capacity(inner.requested_nics.len()); for nic in inner.requested_nics.iter() { - let vni = crate::opte::Vni::new(nic.vni).expect("Invalid VNI"); let external_ip = if nic.primary { Some(inner.external_ip) } else { None }; - let port = inner.port_allocator.new_port( - nic.ip, - *nic.mac, - ipnetwork::IpNetwork::from(nic.subnet), - vni, - inner.underlay_addr, + let port = inner.port_manager.create_port( + *inner.id(), + nic, external_ip, )?; - info!(inner.log, "created OPTE port for guest"; "port_info" => ?port); - ports.push(port); + opte_ports.push(port); } - Ok(ports) - } - - async fn setup_propolis_locked( - &self, - inner: &mut MutexGuard<'_, InstanceInner>, - ) -> Result { - let opte_ports = self.create_opte_ports(inner)?; + let port_ticket = opte_ports.first().map(|port| port.ticket()); // Create a zone for the propolis instance, using the previously // configured VNICs. 
@@ -545,8 +533,8 @@ impl Instance { let network = running_zone.ensure_address(addr_request).await?; info!(inner.log, "Created address {} for zone: {}", network, zname); - let gateway = inner.underlay_addr; - running_zone.add_default_route(gateway).await?; + let gateway = inner.port_manager.underlay_ip(); + running_zone.add_default_route(*gateway).await?; // Run Propolis in the Zone. let smf_service_name = "svc:/system/illumos/propolis-server"; @@ -646,13 +634,13 @@ impl Instance { // don't need to worry about initialization races. wait_for_http_server(&inner.log, &client).await?; - Ok(PropolisSetup { client, running_zone }) + Ok(PropolisSetup { client, running_zone, port_ticket }) } /// Begins the execution of the instance's service (Propolis). pub async fn start( &self, - ticket: InstanceTicket, + instance_ticket: InstanceTicket, migrate: Option, ) -> Result<(), Error> { let mut inner = self.inner.lock().await; @@ -662,7 +650,7 @@ impl Instance { // Ensure the instance exists in the Propolis Server before we start // using it. - inner.ensure(self.clone(), ticket, setup, migrate).await?; + inner.ensure(self.clone(), instance_ticket, setup, migrate).await?; Ok(()) } @@ -675,7 +663,14 @@ impl Instance { warn!(inner.log, "Halting and removing zone: {}", zname); Zones::halt_and_remove_logged(&inner.log, &zname).unwrap(); - inner.running_state.as_mut().unwrap().ticket.terminate(); + // Remove ourselves from the instance manager's map of instances. 
+ let running_state = inner.running_state.as_mut().unwrap(); + running_state.instance_ticket.terminate(); + + // And remove the OPTE ports from the port manager + if let Some(ticket) = running_state.port_ticket.as_mut() { + ticket.release(); + } Ok(()) } @@ -766,7 +761,7 @@ mod test { use super::*; use crate::illumos::dladm::Etherstub; use crate::mocks::MockNexusClient; - use crate::opte::OptePortAllocator; + use crate::opte::PortManager; use crate::params::ExternalIp; use crate::params::InstanceStateRequested; use chrono::Utc; @@ -846,19 +841,20 @@ mod test { "Test".to_string(), Etherstub("mylink".to_string()), ); + let underlay_ip = std::net::Ipv6Addr::new( + 0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + ); let mac = MacAddr6::from([0u8; 6]); - let port_allocator = OptePortAllocator::new(mac); + let port_manager = + PortManager::new(log.new(slog::o!()), underlay_ip, mac); let nexus_client = MockNexusClient::default(); let inst = Instance::new( log.clone(), test_uuid(), - vnic_allocator, - std::net::Ipv6Addr::new( - 0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, - ), - port_allocator, new_initial_instance(), + vnic_allocator, + port_manager, Arc::new(nexus_client), ) .unwrap(); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 910951a69d..d0fc5f738a 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -7,7 +7,7 @@ use crate::illumos::dladm::Etherstub; use crate::illumos::vnic::VnicAllocator; use crate::nexus::NexusClient; -use crate::opte::OptePortAllocator; +use crate::opte::PortManager; use crate::params::{ InstanceHardware, InstanceMigrateParams, InstanceRuntimeStateRequested, InstanceSerialConsoleData, @@ -33,6 +33,9 @@ pub enum Error { #[error("No such instance ID: {0}")] NoSuchInstance(Uuid), + + #[error("OPTE port management error: {0}")] + Opte(#[from] crate::opte::Error), } struct InstanceManagerInternal { @@ -46,8 +49,7 @@ struct InstanceManagerInternal { 
instances: Mutex>, vnic_allocator: VnicAllocator, - underlay_addr: Ipv6Addr, - port_allocator: OptePortAllocator, + port_manager: PortManager, } /// All instances currently running on the sled. @@ -61,7 +63,7 @@ impl InstanceManager { log: Logger, nexus_client: Arc, etherstub: Etherstub, - underlay_addr: Ipv6Addr, + underlay_ip: Ipv6Addr, gateway_mac: MacAddr6, ) -> InstanceManager { InstanceManager { @@ -70,8 +72,11 @@ impl InstanceManager { nexus_client, instances: Mutex::new(BTreeMap::new()), vnic_allocator: VnicAllocator::new("Instance", etherstub), - underlay_addr, - port_allocator: OptePortAllocator::new(gateway_mac), + port_manager: PortManager::new( + log.new(o!("component" => "PortManager")), + underlay_ip, + gateway_mac, + ), }), } } @@ -122,10 +127,9 @@ impl InstanceManager { let instance = Instance::new( instance_log, instance_id, - self.inner.vnic_allocator.clone(), - self.inner.underlay_addr, - self.inner.port_allocator.clone(), initial_hardware, + self.inner.vnic_allocator.clone(), + self.inner.port_manager.clone(), self.inner.nexus_client.clone(), )?; let instance_clone = instance.clone(); @@ -317,7 +321,7 @@ mod test { let ticket = Arc::new(std::sync::Mutex::new(None)); let ticket_clone = ticket.clone(); let instance_new_ctx = MockInstance::new_context(); - instance_new_ctx.expect().return_once(move |_, _, _, _, _, _, _| { + instance_new_ctx.expect().return_once(move |_, _, _, _, _, _| { let mut inst = MockInstance::default(); inst.expect_clone().return_once(move || { let mut inst = MockInstance::default(); @@ -389,7 +393,7 @@ mod test { let ticket_clone = ticket.clone(); let instance_new_ctx = MockInstance::new_context(); let mut seq = mockall::Sequence::new(); - instance_new_ctx.expect().return_once(move |_, _, _, _, _, _, _| { + instance_new_ctx.expect().return_once(move |_, _, _, _, _, _| { let mut inst = MockInstance::default(); // First call to ensure (start + transition). 
inst.expect_clone().times(1).in_sequence(&mut seq).return_once( diff --git a/sled-agent/src/opte/mock_opte.rs b/sled-agent/src/opte/mock_opte.rs deleted file mode 100644 index 37d3614bfa..0000000000 --- a/sled-agent/src/opte/mock_opte.rs +++ /dev/null @@ -1,194 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Mock / empty interface to the Oxide Packet Transformation Engine (OPTE), for -//! building the sled agent on non-illumos systems. - -use crate::illumos::vnic::Vnic; -use crate::params::ExternalIp; -use ipnetwork::IpNetwork; -use macaddr::MacAddr6; -use slog::Logger; -use std::net::IpAddr; -use std::net::Ipv6Addr; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; -use std::sync::Arc; - -#[derive(Debug, Clone, Copy)] -pub struct Vni(u32); - -impl Vni { - pub fn new(n: N) -> Result - where - N: Into, - { - let x = n.into(); - if x <= 0x00_FF_FF_FF { - Ok(Self(x)) - } else { - Err(Error::InvalidArgument(format!("invalid VNI: {}", x))) - } - } -} - -#[derive(thiserror::Error, Debug)] -pub enum Error { - #[error("Invalid argument: {0}")] - InvalidArgument(String), -} - -#[derive(Debug, Clone)] -pub struct OptePortAllocator { - value: Arc, -} - -impl OptePortAllocator { - pub fn new(_gateway_mac: MacAddr6) -> Self { - Self { value: Arc::new(AtomicU64::new(0)) } - } - - fn next(&self) -> String { - format!("opte{}", self.next_id()) - } - - fn next_id(&self) -> u64 { - self.value.fetch_add(1, Ordering::SeqCst) - } - - pub fn new_port( - &self, - ip: IpAddr, - mac: MacAddr6, - subnet: IpNetwork, - vni: Vni, - underlay_ip: Ipv6Addr, - external_ip: Option, - ) -> Result { - // TODO-completess: Remove IPv4 restrictions once OPTE supports virtual - // IPv6 networks. 
- if matches!(ip, IpAddr::V6(_)) { - return Err(Error::InvalidArgument(String::from( - "IPv6 not yet supported", - ))); - } - let gateway = Gateway::from_subnet(&subnet); - if matches!(gateway.ip, IpAddr::V6(_)) { - return Err(Error::InvalidArgument(String::from( - "IPv6 not yet supported", - ))); - } - let boundary_services = BoundaryServices::default(); - let name = self.next(); - if matches!(subnet.network(), IpAddr::V6(_)) { - return Err(Error::InvalidArgument(String::from( - "IPv6 not yet supported", - ))); - } - Ok(OptePort { - name, - ip, - subnet, - mac, - vni, - underlay_ip, - external_ip, - gateway, - boundary_services, - vnic: None, - }) - } -} - -#[derive(Debug, Clone, Copy)] -pub struct BoundaryServices { - pub ip: Ipv6Addr, - pub vni: Vni, -} - -impl Default for BoundaryServices { - fn default() -> Self { - // TODO-completeness: Don't hardcode this. - // - // Boundary Services will be started on several Sidecars during rack - // setup, and those addresses will need to be propagated here. - const BOUNDARY_SERVICES_ADDR: Ipv6Addr = - Ipv6Addr::new(0xfd00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01); - let boundary_services_vni = Vni::new(99_u32).unwrap(); - - Self { ip: BOUNDARY_SERVICES_ADDR, vni: boundary_services_vni } - } -} - -/// Information about the gateway for an OPTE port -#[derive(Debug, Clone, Copy)] -#[allow(dead_code)] -pub struct Gateway { - mac: MacAddr6, - ip: IpAddr, -} - -// The MAC address that OPTE exposes to guest NICs, i.e., the MAC of the virtual -// gateway OPTE operates as for each guest. See -// https://github.com/oxidecomputer/omicron/pull/955#discussion_r856432498 for -// more context on the genesis of this, but this is just a reserved address -// within the "system" portion of the virtual MAC address space. 
-const OPTE_VIRTUAL_GATEWAY_MAC: MacAddr6 = - MacAddr6::new(0xa8, 0x40, 0x25, 0xff, 0x77, 0x77); - -impl Gateway { - pub fn from_subnet(subnet: &IpNetwork) -> Self { - Self { - mac: OPTE_VIRTUAL_GATEWAY_MAC, - - // See RFD 21, section 2.2 table 1 - ip: subnet - .iter() - .nth(1) - .expect("IP subnet must have at least 1 address"), - } - } -} - -/// A port on the OPTE "virtual switch", which corresponds to one guest network -/// interface. -#[derive(Debug)] -#[allow(dead_code)] -pub struct OptePort { - name: String, - ip: IpAddr, - subnet: IpNetwork, - mac: MacAddr6, - vni: Vni, - underlay_ip: Ipv6Addr, - external_ip: Option, - gateway: Gateway, - boundary_services: BoundaryServices, - vnic: Option, -} - -impl OptePort { - /// Return the VNIC used to link OPTE and Viona. - // TODO-correctness: Remove this once we can put Viona directly on top of an - // OPTE port device. - pub fn vnic(&self) -> &Vnic { - self.vnic.as_ref().unwrap() - } -} - -impl Drop for OptePort { - fn drop(&mut self) { - self.vnic.take(); - } -} - -pub fn initialize_xde_driver(log: &Logger) -> Result<(), Error> { - slog::warn!(log, "`xde` driver is a fiction on non-illumos systems"); - Ok(()) -} - -pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { - slog::warn!(log, "`xde` driver is a fiction on non-illumos systems"); - Ok(()) -} diff --git a/sled-agent/src/opte/mod.rs b/sled-agent/src/opte/mod.rs index 007ec6f22d..b537b2e336 100644 --- a/sled-agent/src/opte/mod.rs +++ b/sled-agent/src/opte/mod.rs @@ -4,8 +4,260 @@ //! 
Interactions with the Oxide Packet Transformation Engine (OPTE) -#[cfg_attr(target_os = "illumos", path = "opte.rs")] -#[cfg_attr(not(target_os = "illumos"), path = "mock_opte.rs")] -mod inner; +use ipnetwork::IpNetwork; +use macaddr::MacAddr6; +use slog::Logger; +use std::net::IpAddr; +use std::net::Ipv6Addr; -pub use inner::*; +mod port; +mod port_manager; + +pub use port::Port; +pub use port_manager::PortManager; +pub use port_manager::PortTicket; + +cfg_if::cfg_if! { + if #[cfg(target_os = "illumos")] { + use crate::illumos::dladm; + use crate::common::underlay; + use opte_ioctl::OpteHdl; + use std::fs; + use std::path::Path; + + pub use opte::api::Vni; + + #[derive(thiserror::Error, Debug)] + pub enum Error { + #[error("Failure interacting with the OPTE ioctl(2) interface: {0}")] + Opte(#[from] opte_ioctl::Error), + + #[error("Failed to wrap OPTE port in a VNIC: {0}")] + CreateVnic(#[from] dladm::CreateVnicError), + + #[error("Failed to get VNICs for xde underlay devices: {0}")] + GetVnic(#[from] underlay::Error), + + #[error( + "No xde driver configuration file exists at '/kernel/drv/xde.conf'" + )] + NoXdeConf, + + #[error( + "The OS kernel does not support the xde driver. Please update the OS \ + using `./tools/install_opte.sh` to provide kernel bits and the xde \ + driver which are compatible." + )] + IncompatibleKernel, + + #[error(transparent)] + BadAddrObj(#[from] crate::illumos::addrobj::ParseError), + + #[error(transparent)] + SetLinkpropError(#[from] crate::illumos::dladm::SetLinkpropError), + + #[error(transparent)] + ResetLinkpropError(#[from] crate::illumos::dladm::ResetLinkpropError), + } + + /// Delete all xde devices on the system. 
+ pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { + let hdl = OpteHdl::open(OpteHdl::DLD_CTL)?; + for port_info in hdl.list_ports()?.ports.into_iter() { + let name = &port_info.name; + info!( + log, + "deleting existing OPTE port and xde device"; + "device_name" => name + ); + hdl.delete_xde(name)?; + } + Ok(()) + } + + /// Initialize the underlay devices required for the xde kernel module. + /// + /// The xde driver needs information about the physical devices out which it can + /// send traffic from the guests. + pub fn initialize_xde_driver(log: &Logger) -> Result<(), Error> { + const XDE_CONF: &str = "/kernel/drv/xde.conf"; + let xde_conf = Path::new(XDE_CONF); + if !xde_conf.exists() { + return Err(Error::NoXdeConf); + } + + // TODO-remove + // + // An additional part of the workaround to connect into instances. This is + // required to tell OPTE to actually act as a 1-1 NAT when an instance is + // provided with an external IP address, rather than do its normal job of + // encapsulating the traffic onto the underlay (such as for delivery to + // boundary services). + use_external_ip_workaround(&log, &xde_conf); + + let underlay_nics = underlay::find_nics()?; + info!(log, "using '{:?}' as data links for xde driver", underlay_nics); + if underlay_nics.len() < 2 { + const MESSAGE: &str = concat!( + "There must be at least two underlay NICs for the xde ", + "driver to operate. These are currently created by ", + "`./tools/create_virtual_hardware.sh`. Please ensure that ", + "script has been run, and that two VNICs named `net{0,1}` ", + "exist on the system." + ); + return Err(Error::Opte(opte_ioctl::Error::InvalidArgument( + String::from(MESSAGE), + ))); + } + match OpteHdl::open(OpteHdl::DLD_CTL)?.set_xde_underlay( + underlay_nics[0].interface(), + underlay_nics[1].interface(), + ) { + Ok(_) => Ok(()), + // Handle the specific case where the kernel appears to be unaware of + // xde at all. 
This implies the developer has not installed the correct + // helios-netdev kernel bits. + // + // TODO-correctness: This error should never occur in the product. Both + // xde the kernel driver and the kernel bits needed to recognize it will + // be packaged as part of our OS ramdisk, meaning it should not be + // possible to get out of sync. + Err(opte_ioctl::Error::IoctlFailed(_, ref message)) + if message.contains("unexpected errno: 48") => + { + Err(Error::IncompatibleKernel) + } + // TODO-correctness: xde provides no way to get the current underlay + // devices we're using, but we'd probably like the further check that + // those are exactly what we're giving it now. + Err(opte_ioctl::Error::CommandError( + _, + opte::api::OpteError::System { errno: libc::EEXIST, .. }, + )) => Ok(()), + Err(e) => Err(e.into()), + } + } + + fn use_external_ip_workaround(log: &Logger, xde_conf: &Path) { + const NEEDLE: &str = "ext_ip_hack = 0;"; + const NEW_NEEDLE: &str = "ext_ip_hack = 1;"; + + // NOTE: This only works in the real sled agent, which is run as root. The + // file is not world-readable. + let contents = fs::read_to_string(xde_conf) + .expect("Failed to read xde configuration file"); + let new = contents.replace(NEEDLE, NEW_NEEDLE); + if contents == new { + info!( + log, + "xde driver configuration file appears to already use external IP workaround"; + "conf_file" => ?xde_conf, + ); + } else { + info!( + log, + "updating xde driver configuration file for external IP workaround"; + "conf_file" => ?xde_conf, + ); + fs::write(xde_conf, &new) + .expect("Failed to modify xde configuration file"); + } + + // Ensure the driver picks up the updated configuration file, if it's been + // loaded previously without the workaround. 
+ std::process::Command::new(crate::illumos::PFEXEC) + .args(&["update_drv", "xde"]) + .output() + .expect("Failed to reload xde driver configuration file"); + } + + } else { + + #[derive(Debug, Clone, Copy)] + pub struct Vni(u32); + + impl Vni { + pub fn new(n: N) -> Result + where + N: Into, + { + let x = n.into(); + if x <= 0x00_FF_FF_FF { + Ok(Self(x)) + } else { + Err(Error::InvalidArgument(format!("invalid VNI: {}", x))) + } + } + } + + #[derive(thiserror::Error, Debug)] + pub enum Error { + #[error("Invalid argument: {0}")] + InvalidArgument(String), + } + + pub fn initialize_xde_driver(log: &Logger) -> Result<(), Error> { + slog::warn!(log, "`xde` driver is a fiction on non-illumos systems"); + Ok(()) + } + + pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { + slog::warn!(log, "`xde` driver is a fiction on non-illumos systems"); + Ok(()) + } + } +} + +/// Location information for reaching Boundary Services, for directing +/// inter-sled or off-rack traffic from guests. +#[derive(Debug, Clone, Copy)] +pub struct BoundaryServices { + pub ip: Ipv6Addr, + pub vni: Vni, +} + +impl Default for BoundaryServices { + fn default() -> Self { + // TODO-completeness: Don't hardcode this. + // + // Boundary Services will be started on several Sidecars during rack + // setup, and those addresses will need to be propagated here. + // See https://github.com/oxidecomputer/omicron/issues/1382 + const BOUNDARY_SERVICES_ADDR: Ipv6Addr = + Ipv6Addr::new(0xfd00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01); + let boundary_services_vni = Vni::new(99_u32).unwrap(); + + Self { ip: BOUNDARY_SERVICES_ADDR, vni: boundary_services_vni } + } +} + +/// Information about the gateway for an OPTE port +#[derive(Debug, Clone, Copy)] +#[allow(dead_code)] +pub struct Gateway { + mac: MacAddr6, + ip: IpAddr, +} + +// The MAC address that OPTE exposes to guest NICs, i.e., the MAC of the virtual +// gateway OPTE operates as for each guest. 
See +// https://github.com/oxidecomputer/omicron/pull/955#discussion_r856432498 for +// more context on the genesis of this, but this is just a reserved address +// within the "system" portion of the virtual MAC address space. +// See https://github.com/oxidecomputer/omicron/issues/1381 +const OPTE_VIRTUAL_GATEWAY_MAC: MacAddr6 = + MacAddr6::new(0xa8, 0x40, 0x25, 0xff, 0x77, 0x77); + +impl Gateway { + pub fn from_subnet(subnet: &IpNetwork) -> Self { + Self { + mac: OPTE_VIRTUAL_GATEWAY_MAC, + + // See RFD 21, section 2.2 table 1 + ip: subnet + .iter() + .nth(1) + .expect("IP subnet must have at least 1 address"), + } + } +} diff --git a/sled-agent/src/opte/opte.rs b/sled-agent/src/opte/opte.rs deleted file mode 100644 index b4406e9354..0000000000 --- a/sled-agent/src/opte/opte.rs +++ /dev/null @@ -1,473 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Interactions with the Oxide Packet Transformation Engine (OPTE) - -use crate::common::underlay; -use crate::illumos::dladm; -use crate::illumos::dladm::Dladm; -use crate::illumos::dladm::PhysicalLink; -use crate::illumos::vnic::Vnic; -use crate::params::ExternalIp; -use ipnetwork::IpNetwork; -use macaddr::MacAddr6; -use opte::api::IpCidr; -use opte::api::Ipv4Cidr; -use opte::api::Ipv4PrefixLen; -use opte::api::MacAddr; -pub use opte::api::Vni; -use opte::oxide_vpc::api::AddRouterEntryIpv4Req; -use opte::oxide_vpc::api::RouterTarget; -use opte::oxide_vpc::api::SNatCfg; -use opte_ioctl::OpteHdl; -use slog::Logger; -use std::fs; -use std::net::IpAddr; -use std::net::Ipv4Addr; -use std::net::Ipv6Addr; -use std::path::Path; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; -use std::sync::Arc; - -// Prefix used to identify xde data links. 
-const XDE_LINK_PREFIX: &str = "opte"; - -#[derive(thiserror::Error, Debug)] -pub enum Error { - #[error("Failure interacting with the OPTE ioctl(2) interface: {0}")] - Opte(#[from] opte_ioctl::Error), - - #[error("Failed to wrap OPTE port in a VNIC: {0}")] - CreateVnic(#[from] dladm::CreateVnicError), - - #[error("Failed to get VNICs for xde underlay devices: {0}")] - GetVnic(#[from] underlay::Error), - - #[error( - "No xde driver configuration file exists at '/kernel/drv/xde.conf'" - )] - NoXdeConf, - - #[error( - "The OS kernel does not support the xde driver. Please update the OS \ - using `./tools/install_opte.sh` to provide kernel bits and the xde \ - driver which are compatible." - )] - IncompatibleKernel, - - #[error(transparent)] - BadAddrObj(#[from] crate::illumos::addrobj::ParseError), - - #[error(transparent)] - SetLinkpropError(#[from] crate::illumos::dladm::SetLinkpropError), -} - -#[derive(Debug, Clone)] -pub struct OptePortAllocator { - gateway_mac: MacAddr, - value: Arc, -} - -impl OptePortAllocator { - pub fn new(gateway_mac: MacAddr6) -> Self { - let gateway_mac = MacAddr::from(gateway_mac.into_array()); - Self { gateway_mac, value: Arc::new(AtomicU64::new(0)) } - } - - fn next(&self) -> String { - format!("{}{}", XDE_LINK_PREFIX, self.next_id()) - } - - fn next_id(&self) -> u64 { - self.value.fetch_add(1, Ordering::SeqCst) - } - - pub fn new_port( - &self, - ip: IpAddr, - mac: MacAddr6, - subnet: IpNetwork, - vni: Vni, - underlay_ip: Ipv6Addr, - external_ip: Option, - ) -> Result { - // TODO-completess: Remove IPv4 restrictions once OPTE supports virtual - // IPv6 networks. 
- let private_ip = match ip { - IpAddr::V4(ip) => Ok(ip), - IpAddr::V6(_) => Err(opte_ioctl::Error::InvalidArgument( - String::from("IPv6 is not yet supported for guest interfaces"), - )), - }?; - let gateway = Gateway::from_subnet(&subnet); - let gateway_ip = match gateway.ip { - IpAddr::V4(ip) => Ok(ip), - IpAddr::V6(_) => Err(opte_ioctl::Error::InvalidArgument( - String::from("IPv6 is not yet supported for guest interfaces"), - )), - }?; - let boundary_services = BoundaryServices::default(); - let name = self.next(); - let vpcsub = match subnet { - IpNetwork::V4(ip4net) => { - // We assume that IpNetwork does not allow an invalid prefix. - Ok(Ipv4Cidr::new( - ip4net.ip().into(), - Ipv4PrefixLen::new(ip4net.prefix()).unwrap(), - )) - } - IpNetwork::V6(_) => Err(opte_ioctl::Error::InvalidArgument( - String::from("IPv6 is not yet supported for guest interfaces"), - )), - }?; - - // Describe the source NAT for this instance. - let snat = match external_ip { - Some(ip) => { - let public_ip = match ip.ip { - IpAddr::V4(ip) => ip.into(), - IpAddr::V6(_) => { - return Err(opte_ioctl::Error::InvalidArgument( - String::from("IPv6 is not yet supported for external addresses") - ).into()); - } - }; - let ports = ip.first_port..=ip.last_port; - Some(SNatCfg { - public_ip, - ports, - phys_gw_mac: self.gateway_mac, - }) - } - None => None, - }; - - let hdl = OpteHdl::open(OpteHdl::DLD_CTL)?; - hdl.create_xde( - &name, - MacAddr::from(mac.into_array()), - private_ip, - vpcsub, - MacAddr::from(gateway.mac.into_array()), - gateway_ip, - boundary_services.ip, - boundary_services.vni, - vni, - underlay_ip, - snat, - /* passthru = */ false, - )?; - - // Add a router entry for this interface's subnet, directing traffic to the - // VPC subnet. 
- match subnet.network() { - IpAddr::V4(ip) => { - let prefix = - Ipv4PrefixLen::new(subnet.prefix()).map_err(|e| { - opte_ioctl::Error::InvalidArgument(format!( - "Invalid IPv4 subnet prefix: {}", - e - )) - })?; - let cidr = Ipv4Cidr::new(opte::api::Ipv4Addr::from(ip), prefix); - let route = AddRouterEntryIpv4Req { - port_name: name.clone(), - dest: cidr, - target: RouterTarget::VpcSubnet(IpCidr::Ip4(cidr)), - }; - hdl.add_router_entry_ip4(&route)?; - } - IpAddr::V6(_) => { - return Err(opte_ioctl::Error::InvalidArgument(String::from( - "IPv6 not yet supported", - )) - .into()); - } - } - - // Create a VNIC on top of this device, to hook Viona into. - // - // Viona is the illumos MAC provider that implements the VIRTIO - // specification.It sits on top of a MAC provider, which is responsible - // for delivering frames to the underlying data link. The guest includes - // a driver that handles the virtio-net specification on their side, - // which talks to Viona. - // - // In theory, Viona work with any MAC provider. However, there are - // implicit assumptions, in both Viona _and_ MAC, that require Viona to - // be built on top of a VNIC specifically. There is probably a good deal - // of work required to relax that assumption, so in the meantime, we - // create a superfluous VNIC on the OPTE device, solely so Viona can use - // it. - let vnic = { - let phys = PhysicalLink(name.clone()); - let vnic_name = format!("v{}", name); - Dladm::create_vnic( - &phys, - &vnic_name, - Some(omicron_common::api::external::MacAddr(mac)), - None, - )?; - // Safety: We're explicitly creating the VNIC with the prefix - // `VNIC_PREFIX_GUEST`, so this call must return Some(_). - Some(Vnic::wrap_existing(vnic_name).unwrap()) - }; - - // TODO-remove - // - // This is part of the workaround to get external connectivity into - // instances, without setting up all of boundary services. 
Rather than - // encap/decap the guest traffic, OPTE just performs 1-1 NAT between the - // private IP address of the guest and the external address provided by - // the control plane. This call here allows the underlay nic, `net0` to - // advertise as having the guest's MAC address. - Dladm::set_linkprop( - underlay::find_chelsio_links()?[0].0.as_str(), - "secondary-macs", - &mac.to_string().to_lowercase(), - )?; - - // TODO-remove - // - // This is another part of the workaround, allowing reply traffic from - // the guest back out. Normally, OPTE would drop such traffic at the - // router layer, as it has no route for that external IP address. This - // allows such traffic through. - // - // Note that this exact rule will eventually be included, since it's one - // of the default routing rules in the VPC System Router. However, that - // will likely be communicated in a different way, or could be modified, - // and this specific call should be removed in favor of sending the - // routing rules the control plane provides. - // - // This rule sends all traffic that has no better match to the gateway. - let prefix = Ipv4PrefixLen::new(0).unwrap(); - let dest = Ipv4Cidr::new(Ipv4Addr::UNSPECIFIED.into(), prefix); - let target = RouterTarget::InternetGateway; - hdl.add_router_entry_ip4(&AddRouterEntryIpv4Req { - port_name: name.clone(), - dest, - target, - })?; - - Ok(OptePort { - name, - ip, - subnet, - mac, - vni, - underlay_ip, - external_ip, - gateway, - boundary_services, - vnic, - }) - } -} - -#[derive(Debug, Clone, Copy)] -pub struct BoundaryServices { - pub ip: Ipv6Addr, - pub vni: Vni, -} - -impl Default for BoundaryServices { - fn default() -> Self { - // TODO-completeness: Don't hardcode this. - // - // Boundary Services will be started on several Sidecars during rack - // setup, and those addresses will need to be propagated here. 
- const BOUNDARY_SERVICES_ADDR: Ipv6Addr = - Ipv6Addr::new(0xfd00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01); - let boundary_services_vni = Vni::new(99_u32).unwrap(); - - Self { ip: BOUNDARY_SERVICES_ADDR, vni: boundary_services_vni } - } -} - -/// Information about the gateway for an OPTE port -#[derive(Debug, Clone, Copy)] -pub struct Gateway { - mac: MacAddr6, - ip: IpAddr, -} - -// The MAC address that OPTE exposes to guest NICs, i.e., the MAC of the virtual -// gateway OPTE operates as for each guest. See -// https://github.com/oxidecomputer/omicron/pull/955#discussion_r856432498 for -// more context on the genesis of this, but this is just a reserved address -// within the "system" portion of the virtual MAC address space. -const OPTE_VIRTUAL_GATEWAY_MAC: MacAddr6 = - MacAddr6::new(0xa8, 0x40, 0x25, 0xff, 0x77, 0x77); - -impl Gateway { - pub fn from_subnet(subnet: &IpNetwork) -> Self { - Self { - mac: OPTE_VIRTUAL_GATEWAY_MAC, - - // See RFD 21, section 2.2 table 1 - ip: subnet - .iter() - .nth(1) - .expect("IP subnet must have at least 1 address"), - } - } -} - -/// A port on the OPTE "virtual switch", which corresponds to one guest network -/// interface. -#[derive(Debug)] -#[allow(dead_code)] -pub struct OptePort { - name: String, - ip: IpAddr, - subnet: IpNetwork, - mac: MacAddr6, - vni: Vni, - underlay_ip: Ipv6Addr, - // The external IP information for this port, or None if it has no external - // connectivity. Only the primary interface has Some(_) here. - external_ip: Option, - gateway: Gateway, - boundary_services: BoundaryServices, - // TODO-correctness: Remove this once we can put Viona directly on top of an - // OPTE port device. - // - // Note that this will always be `Some(_)`. It is wrapped in an optional to - // ensure we can drop the VNIC before we drop the OPTE port itself. - vnic: Option, -} - -impl OptePort { - /// Return the VNIC used to link OPTE and Viona. 
- // TODO-correctness: Remove this once we can put Viona directly on top of an - // OPTE port device. - pub fn vnic(&self) -> &Vnic { - self.vnic.as_ref().unwrap() - } -} - -impl Drop for OptePort { - fn drop(&mut self) { - self.vnic.take(); - if let Ok(hdl) = OpteHdl::open(OpteHdl::DLD_CTL) { - if hdl.delete_xde(&self.name).is_ok() { - return; - } - } - eprintln!("WARNING: Failed to delete OPTE port '{}'", self.name); - } -} - -/// Delete all xde devices on the system. -pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { - let hdl = OpteHdl::open(OpteHdl::DLD_CTL)?; - for port_info in hdl.list_ports()?.ports.into_iter() { - let name = &port_info.name; - info!( - log, - "deleting existing OPTE port and xde device"; - "device_name" => name - ); - hdl.delete_xde(name)?; - } - Ok(()) -} - -/// Initialize the underlay devices required for the xde kernel module. -/// -/// The xde driver needs information about the physical devices out which it can -/// send traffic from the guests. -pub fn initialize_xde_driver(log: &Logger) -> Result<(), Error> { - const XDE_CONF: &str = "/kernel/drv/xde.conf"; - let xde_conf = Path::new(XDE_CONF); - if !xde_conf.exists() { - return Err(Error::NoXdeConf); - } - - // TODO-remove - // - // An additional part of the workaround to connect into instances. This is - // required to tell OPTE to actually act as a 1-1 NAT when an instance is - // provided with an external IP address, rather than do its normal job of - // encapsulating the traffic onto the underlay (such as for delivery to - // boundary services). - use_external_ip_workaround(&log, &xde_conf); - - let underlay_nics = underlay::find_nics()?; - info!(log, "using '{:?}' as data links for xde driver", underlay_nics); - if underlay_nics.len() < 2 { - const MESSAGE: &str = concat!( - "There must be at least two underlay NICs for the xde ", - "driver to operate. These are currently created by ", - "`./tools/create_virtual_hardware.sh`. 
Please ensure that ", - "script has been run, and that two VNICs named `net{0,1}` ", - "exist on the system." - ); - return Err(Error::Opte(opte_ioctl::Error::InvalidArgument( - String::from(MESSAGE), - ))); - } - match OpteHdl::open(OpteHdl::DLD_CTL)?.set_xde_underlay( - underlay_nics[0].interface(), - underlay_nics[1].interface(), - ) { - Ok(_) => Ok(()), - // Handle the specific case where the kernel appears to be unaware of - // xde at all. This implies the developer has not installed the correct - // helios-netdev kernel bits. - // - // TODO-correctness: This error should never occur in the product. Both - // xde the kernel driver and the kernel bits needed to recognize it will - // be packaged as part of our OS ramdisk, meaning it should not be - // possible to get out of sync. - Err(opte_ioctl::Error::IoctlFailed(_, ref message)) - if message.contains("unexpected errno: 48") => - { - Err(Error::IncompatibleKernel) - } - // TODO-correctness: xde provides no way to get the current underlay - // devices we're using, but we'd probably like the further check that - // those are exactly what we're giving it now. - Err(opte_ioctl::Error::CommandError( - _, - opte::api::OpteError::System { errno: libc::EEXIST, .. }, - )) => Ok(()), - Err(e) => Err(e.into()), - } -} - -fn use_external_ip_workaround(log: &Logger, xde_conf: &Path) { - const NEEDLE: &str = "ext_ip_hack = 0;"; - const NEW_NEEDLE: &str = "ext_ip_hack = 1;"; - - // NOTE: This only works in the real sled agent, which is run as root. The - // file is not world-readable. 
- let contents = fs::read_to_string(xde_conf) - .expect("Failed to read xde configuration file"); - let new = contents.replace(NEEDLE, NEW_NEEDLE); - if contents == new { - info!( - log, - "xde driver configuration file appears to already use external IP workaround"; - "conf_file" => ?xde_conf, - ); - } else { - info!( - log, - "updating xde driver configuration file for external IP workaround"; - "conf_file" => ?xde_conf, - ); - fs::write(xde_conf, &new) - .expect("Failed to modify xde configuration file"); - } - - // Ensure the driver picks up the updated configuration file, if it's been - // loaded previously without the workaround. - std::process::Command::new(crate::illumos::PFEXEC) - .args(&["update_drv", "xde"]) - .output() - .expect("Failed to reload xde driver configuration file"); -} diff --git a/sled-agent/src/opte/port.rs b/sled-agent/src/opte/port.rs new file mode 100644 index 0000000000..396a6743f8 --- /dev/null +++ b/sled-agent/src/opte/port.rs @@ -0,0 +1,158 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A single port on the OPTE virtual switch. 
+ +use super::port_manager::PortTicket; +use super::BoundaryServices; +use super::Gateway; +use super::Vni; +#[cfg(target_os = "illumos")] +use crate::illumos::dladm::Dladm; +use crate::params::ExternalIp; +use ipnetwork::IpNetwork; +use macaddr::MacAddr6; +use std::net::IpAddr; +use std::net::Ipv6Addr; +use std::sync::Arc; + +#[derive(Debug)] +#[cfg_attr(not(target_os = "illumos"), allow(dead_code))] +struct PortInner { + // Contains instance ID and a pointer to the parent manager + ticket: PortTicket, + // Name of the port as identified by OPTE + name: String, + // IP address within the VPC Subnet + _ip: IpAddr, + // VPC Subnet + _subnet: IpNetwork, + // VPC-private MAC address + mac: MacAddr6, + // Emulated PCI slot for the guest NIC, passed to Propolis + slot: u8, + // Geneve VNI for the VPC + _vni: Vni, + // IP address of the hosting sled + _underlay_ip: Ipv6Addr, + // The external IP information for this port, or None if it has no external + // connectivity. Only the primary interface has Some(_) here. + external_ip: Option, + // Information about the virtual gateway, aka OPTE + _gateway: Gateway, + // Information about Boundary Services, for forwarding traffic between sleds + // or off the rack. + _boundary_services: BoundaryServices, + // TODO-correctness: Remove this once we can put Viona directly on top of an + // OPTE port device. + // + // NOTE: This is intentionally not an actual `Vnic` object. We'd like to + // delete the VNIC manually in `PortInner::drop`, because we _can't_ delete + // the xde device if we fail to delete the VNIC. See + // https://github.com/oxidecomputer/opte/issues/178 for more details. This + // can be changed back to a real VNIC when that is resolved, and the Drop + // impl below can simplify to just call `drop(self.vnic)`. 
+ vnic: String, +} + +#[cfg(target_os = "illumos")] +impl Drop for PortInner { + fn drop(&mut self) { + if let Err(e) = Dladm::delete_vnic(&self.vnic) { + eprintln!( + "WARNING: Failed to delete OPTE port overlay VNIC \ + while dropping port. The VNIC will not be cleaned up \ + properly, and the xde device itself will not be deleted. \ + Both the VNIC and the xde device must be deleted out \ + of band, and it will not be possible to recreate the xde \ + device until then. Error: {:?}", + e + ); + return; + } + let err = match opte_ioctl::OpteHdl::open(opte_ioctl::OpteHdl::DLD_CTL) + { + Ok(hdl) => { + if let Err(e) = hdl.delete_xde(&self.name) { + e + } else { + return; + } + } + Err(e) => e, + }; + eprintln!( + "WARNING: OPTE port overlay VNIC deleted, but failed \ + to delete the xde device. It must be deleted out \ + of band, and it will not be possible to recreate the xde \ + device until then. Error: {:?}", + err, + ); + } +} + +/// A port on the OPTE virtual switch, providing the virtual networking +/// abstractions for guest instances. +/// +/// Note that the type is clonable and refers to the same underlying port on the +/// system. 
+#[derive(Debug, Clone)] +pub struct Port { + inner: Arc, +} + +impl Port { + #[allow(clippy::too_many_arguments)] + pub fn new( + ticket: PortTicket, + name: String, + ip: IpAddr, + subnet: IpNetwork, + mac: MacAddr6, + slot: u8, + vni: Vni, + underlay_ip: Ipv6Addr, + external_ip: Option, + gateway: Gateway, + boundary_services: BoundaryServices, + vnic: String, + ) -> Self { + Self { + inner: Arc::new(PortInner { + ticket, + name, + _ip: ip, + _subnet: subnet, + mac, + slot, + _vni: vni, + _underlay_ip: underlay_ip, + external_ip, + _gateway: gateway, + _boundary_services: boundary_services, + vnic, + }), + } + } + + pub fn external_ip(&self) -> &Option { + &self.inner.external_ip + } + + pub fn mac(&self) -> &MacAddr6 { + &self.inner.mac + } + + pub fn vnic_name(&self) -> &str { + &self.inner.vnic + } + + pub fn slot(&self) -> u8 { + self.inner.slot + } + + pub fn ticket(&self) -> PortTicket { + self.inner.ticket.clone() + } +} diff --git a/sled-agent/src/opte/port_manager.rs b/sled-agent/src/opte/port_manager.rs new file mode 100644 index 0000000000..39d24e106c --- /dev/null +++ b/sled-agent/src/opte/port_manager.rs @@ -0,0 +1,537 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Manager for all OPTE ports on a Helios system + +use super::BoundaryServices; +use super::Error; +use super::Gateway; +use super::Port; +use super::Vni; +use crate::illumos::dladm::PhysicalLink; +use crate::params::ExternalIp; +use crate::params::NetworkInterface; +use ipnetwork::IpNetwork; +use macaddr::MacAddr6; +use slog::debug; +use slog::info; +use slog::Logger; +use std::collections::BTreeMap; +use std::net::IpAddr; +use std::net::Ipv6Addr; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::sync::Mutex; +use uuid::Uuid; + +cfg_if::cfg_if! 
{ + if #[cfg(target_os = "illumos")] { + use crate::illumos::dladm::Dladm; + use crate::illumos::dladm::VnicSource; + use std::sync::MutexGuard; + use opte::api::IpCidr; + use opte::api::Ipv4Cidr; + use opte::api::Ipv4PrefixLen; + use opte::api::MacAddr; + use opte::oxide_vpc::api::AddRouterEntryIpv4Req; + use opte::oxide_vpc::api::RouterTarget; + use opte::oxide_vpc::api::SNatCfg; + use opte_ioctl::OpteHdl; + use slog::warn; + } +} + +// Prefix used to identify xde data links. +const XDE_LINK_PREFIX: &str = "opte"; + +#[derive(Debug)] +#[cfg_attr(not(target_os = "illumos"), allow(dead_code))] +struct PortManagerInner { + log: Logger, + + // Sequential identifier for each port on the system. + next_port_id: AtomicU64, + + // TODO-remove: This is part of the external IP address workaround + // + // We only need to know this while we're setting the secondary MACs of the + // link to support OPTE's proxy ARP for the guest's IP. + data_link: PhysicalLink, + + // TODO-remove: This is part of the external IP address workaround. + // + // We only need this while OPTE needs to forward traffic to the local + // gateway. This will be replaced by boundary services. + gateway_mac: MacAddr6, + + // IP address of the hosting sled on the underlay. + underlay_ip: Ipv6Addr, + + // Map of instance ID to list of ports. + // + // NOTE: By storing all ports in a vector, the ticket mechanism makes the + // first dropped port cause the whole vector to be dropped. The remaining + // ports' drop impls will still call remove on this map, but there will no + // longer be a value with that key. + ports: Mutex>>, +} + +impl PortManagerInner { + fn next_port_name(&self) -> String { + format!( + "{}{}", + XDE_LINK_PREFIX, + self.next_port_id.fetch_add(1, Ordering::SeqCst) + ) + } + + // TODO-remove + // + // This is part of the workaround to get external connectivity into + // instances, without setting up all of boundary services. 
Rather than + // encap/decap the guest traffic, OPTE just performs 1-1 NAT between the + // private IP address of the guest and the external address provided by + // the control plane. This call here allows the underlay nic, `net0` to + // advertise as having the guest's MAC address. + #[cfg(target_os = "illumos")] + fn update_secondary_macs( + &self, + ports: &mut MutexGuard<'_, BTreeMap>>, + ) -> Result<(), Error> { + let secondary_macs = ports + .values() + .flatten() + .filter_map(|port| { + // Only advertise Ports with an external address (primary + // interface for an instance). + if port.external_ip().is_some() { + Some(port.mac().to_string()) + } else { + None + } + }) + .collect::>() + .join(","); + if secondary_macs.is_empty() { + Dladm::reset_linkprop(self.data_link.name(), "secondary-macs")?; + debug!( + self.log, + "Reset data link secondary MACs link prop for OPTE proxy ARP"; + "link_name" => self.data_link.name(), + ); + } else { + Dladm::set_linkprop( + self.data_link.name(), + "secondary-macs", + &secondary_macs, + )?; + debug!( + self.log, + "Updated data link secondary MACs link prop for OPTE proxy ARP"; + "data_link" => &self.data_link.0, + "secondary_macs" => ?secondary_macs, + ); + } + Ok(()) + } +} + +/// The port manager controls all OPTE ports on a single host. 
+#[derive(Debug, Clone)] +pub struct PortManager { + inner: Arc, +} + +impl PortManager { + /// Create a new manager, for creating OPTE ports for guest network + /// interfaces + pub fn new( + log: Logger, + underlay_ip: Ipv6Addr, + gateway_mac: MacAddr6, + ) -> Self { + let data_link = crate::common::underlay::find_chelsio_links() + .unwrap() + .into_iter() + .next() + .unwrap(); + let inner = Arc::new(PortManagerInner { + log, + next_port_id: AtomicU64::new(0), + data_link, + gateway_mac, + underlay_ip, + ports: Mutex::new(BTreeMap::new()), + }); + + Self { inner } + } + + pub fn underlay_ip(&self) -> &Ipv6Addr { + &self.inner.underlay_ip + } + + /// Create an OPTE port for the given guest instance. + #[cfg(target_os = "illumos")] + pub fn create_port( + &self, + instance_id: Uuid, + nic: &NetworkInterface, + external_ip: Option, + ) -> Result { + // TODO-completess: Remove IPv4 restrictions once OPTE supports virtual + // IPv6 networks. + let private_ip = match nic.ip { + IpAddr::V4(ip) => Ok(ip), + IpAddr::V6(_) => Err(opte_ioctl::Error::InvalidArgument( + String::from("IPv6 is not yet supported for guest interfaces"), + )), + }?; + + // Argument checking and conversions into OPTE data types. + let subnet = IpNetwork::from(nic.subnet); + let mac = *nic.mac; + let vni = Vni::new(nic.vni).unwrap(); + let (opte_subnet, gateway) = match subnet { + IpNetwork::V4(net) => ( + Ipv4Cidr::new( + net.ip().into(), + Ipv4PrefixLen::new(net.prefix()).unwrap(), + ), + Gateway::from_subnet(&subnet), + ), + IpNetwork::V6(_) => { + return Err(opte_ioctl::Error::InvalidArgument(String::from( + "IPv6 is not yet supported for guest interfaces", + )) + .into()); + } + }; + let gateway_ip = match gateway.ip { + IpAddr::V4(ip) => Ok(ip), + IpAddr::V6(_) => Err(opte_ioctl::Error::InvalidArgument( + String::from("IPv6 is not yet supported for guest interfaces"), + )), + }?; + let boundary_services = BoundaryServices::default(); + + // Describe the source NAT for this instance. 
+ let snat = match external_ip { + Some(ip) => { + let public_ip = match ip.ip { + IpAddr::V4(ip) => ip.into(), + IpAddr::V6(_) => { + return Err(opte_ioctl::Error::InvalidArgument( + String::from("IPv6 is not yet supported for external addresses") + ).into()); + } + }; + let ports = ip.first_port..=ip.last_port; + Some(SNatCfg { + public_ip, + ports, + phys_gw_mac: MacAddr::from( + self.inner.gateway_mac.into_array(), + ), + }) + } + None => None, + }; + + // Create the xde device. + // + // The sequencing here is important. We'd like to make sure things are + // cleaned up properly, while having a sequence of fallible operations. + // So we: + // + // - create the xde device + // - create the vnic, cleaning up the xde device if that fails + // - add both to the Port + // + // The Port object's drop implementation will clean up both of those, if + // any of the remaining fallible operations fail. + let port_name = self.inner.next_port_name(); + let hdl = OpteHdl::open(OpteHdl::DLD_CTL)?; + hdl.create_xde( + &port_name, + MacAddr::from(mac.into_array()), + private_ip, + opte_subnet, + MacAddr::from(gateway.mac.into_array()), + gateway_ip, + boundary_services.ip, + boundary_services.vni, + vni, + self.inner.underlay_ip, + snat, + /* passthru = */ false, + )?; + debug!( + self.inner.log, + "Created xde device for guest port"; + "port_name" => &port_name, + ); + + // Create a VNIC on top of this device, to hook Viona into. + // + // Viona is the illumos MAC provider that implements the VIRTIO + // specification. It sits on top of a MAC provider, which is responsible + // for delivering frames to the underlying data link. The guest includes + // a driver that handles the virtio-net specification on their side, + // which talks to Viona. + // + // In theory, Viona works with any MAC provider. However, there are + // implicit assumptions, in both Viona _and_ MAC, that require Viona to + // be built on top of a VNIC specifically. 
There is probably a good deal + // of work required to relax that assumption, so in the meantime, we + // create a superfluous VNIC on the OPTE device, solely so Viona can use + // it. + let vnic = { + let phys = PhysicalLink(port_name.clone()); + let vnic_name = format!("v{}", port_name); + if let Err(e) = + Dladm::create_vnic(&phys, &vnic_name, Some(nic.mac), None) + { + warn!( + self.inner.log, + "Failed to create overlay VNIC for xde device"; + "port_name" => port_name.as_str(), + "err" => ?e + ); + if let Err(e) = hdl.delete_xde(&port_name) { + warn!( + self.inner.log, + "Failed to clean up xde device after failure to create overlay VNIC"; + "err" => ?e + ); + } + return Err(e.into()); + } + debug!( + self.inner.log, + "Created overlay VNIC for xde device"; + "port_name" => &port_name, + "vnic_name" => &vnic_name, + ); + + // NOTE: We intentionally use a string rather than the Vnic type + // here. See the notes on the `opte::PortInner::vnic` field. + vnic_name + }; + + let port = { + let mut ports = self.inner.ports.lock().unwrap(); + let ticket = PortTicket::new(instance_id, self.inner.clone()); + let port = Port::new( + ticket, + port_name.clone(), + nic.ip, + subnet, + mac, + nic.slot, + vni, + self.inner.underlay_ip, + external_ip, + gateway, + boundary_services, + vnic, + ); + ports + .entry(instance_id) + .or_insert_with(Vec::new) + .push(port.clone()); + self.inner.update_secondary_macs(&mut ports)?; + port + }; + + // Add a router entry for this interface's subnet, directing traffic to the + // VPC subnet. 
+ match subnet.network() { + IpAddr::V4(ip) => { + let prefix = + Ipv4PrefixLen::new(subnet.prefix()).map_err(|e| { + opte_ioctl::Error::InvalidArgument(format!( + "Invalid IPv4 subnet prefix: {}", + e + )) + })?; + let cidr = Ipv4Cidr::new(opte::api::Ipv4Addr::from(ip), prefix); + let route = AddRouterEntryIpv4Req { + port_name: port_name.clone(), + dest: cidr, + target: RouterTarget::VpcSubnet(IpCidr::Ip4(cidr)), + }; + hdl.add_router_entry_ip4(&route)?; + debug!( + self.inner.log, + "Added IPv4 VPC Subnet router entry for OPTE port"; + "port_name" => &port_name, + "entry" => ?route, + ); + } + IpAddr::V6(_) => { + return Err(opte_ioctl::Error::InvalidArgument(String::from( + "IPv6 not yet supported", + )) + .into()); + } + } + + // TODO-remove + // + // This is another part of the workaround, allowing reply traffic from + // the guest back out. Normally, OPTE would drop such traffic at the + // router layer, as it has no route for that external IP address. This + // allows such traffic through. + // + // Note that this exact rule will eventually be included, since it's one + // of the default routing rules in the VPC System Router. However, that + // will likely be communicated in a different way, or could be modified, + // and this specific call should be removed in favor of sending the + // routing rules the control plane provides. + // + // This rule sends all traffic that has no better match to the gateway. 
+ let prefix = Ipv4PrefixLen::new(0).unwrap(); + let dest = + Ipv4Cidr::new(std::net::Ipv4Addr::UNSPECIFIED.into(), prefix); + let target = RouterTarget::InternetGateway; + hdl.add_router_entry_ip4(&AddRouterEntryIpv4Req { + port_name: port_name.clone(), + dest, + target, + })?; + + info!( + self.inner.log, + "Created OPTE port for guest"; + "port" => ?&port, + ); + Ok(port) + } + + #[cfg(not(target_os = "illumos"))] + pub fn create_port( + &self, + instance_id: Uuid, + nic: &NetworkInterface, + external_ip: Option, + ) -> Result { + // TODO-completess: Remove IPv4 restrictions once OPTE supports virtual + // IPv6 networks. + let _ = match nic.ip { + IpAddr::V4(ip) => Ok(ip), + IpAddr::V6(_) => Err(Error::InvalidArgument(String::from( + "IPv6 is not yet supported for guest interfaces", + ))), + }?; + + // Argument checking and conversions into OPTE data types. + let subnet = IpNetwork::from(nic.subnet); + let mac = *nic.mac; + let vni = Vni::new(nic.vni).unwrap(); + let gateway = match subnet { + IpNetwork::V4(_) => Gateway::from_subnet(&subnet), + IpNetwork::V6(_) => { + return Err(Error::InvalidArgument(String::from( + "IPv6 is not yet supported for guest interfaces", + ))); + } + }; + let _ = match gateway.ip { + IpAddr::V4(ip) => Ok(ip), + IpAddr::V6(_) => Err(Error::InvalidArgument(String::from( + "IPv6 is not yet supported for guest interfaces", + ))), + }?; + let boundary_services = BoundaryServices::default(); + let port_name = self.inner.next_port_name(); + let vnic = format!("v{}", port_name); + let port = { + let mut ports = self.inner.ports.lock().unwrap(); + let ticket = PortTicket::new(instance_id, self.inner.clone()); + let port = Port::new( + ticket, + port_name.clone(), + nic.ip, + subnet, + mac, + nic.slot, + vni, + self.inner.underlay_ip, + external_ip, + gateway, + boundary_services, + vnic, + ); + ports + .entry(instance_id) + .or_insert_with(Vec::new) + .push(port.clone()); + port + }; + + info!( + self.inner.log, + "Created OPTE port for 
guest"; + "port" => ?&port, + ); + Ok(port) + } +} + +#[derive(Clone)] +pub struct PortTicket { + id: Uuid, + manager: Option>, +} + +impl std::fmt::Debug for PortTicket { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + const SOME: &str = "Some(_)"; + const NONE: &str = "None"; + f.debug_struct("PortTicket") + .field("id", &self.id) + .field( + "manager", + if self.manager.is_some() { &SOME } else { &NONE }, + ) + .finish() + } +} + +impl PortTicket { + fn new(id: Uuid, manager: Arc) -> Self { + Self { id, manager: Some(manager) } + } + + pub fn release(&mut self) { + if let Some(manager) = self.manager.take() { + let mut ports = manager.ports.lock().unwrap(); + let n_ports = ports.remove(&self.id).map(|p| p.len()).unwrap_or(0); + debug!( + manager.log, + "Removing OPTE ports from manager"; + "instance_id" => ?self.id, + "n_ports" => n_ports, + ); + #[cfg(target_os = "illumos")] + if let Err(e) = manager.update_secondary_macs(&mut ports) { + warn!( + manager.log, + "Failed to update secondary-macs linkprop when \ + releasing OPTE ports for instance"; + "instance_id" => ?self.id, + "err" => ?e, + ); + } + } + } +} + +impl Drop for PortTicket { + fn drop(&mut self) { + self.release(); + } +} diff --git a/tools/populate/populate-images.sh b/tools/populate/populate-images.sh index da87e1afba..19fd100255 100755 --- a/tools/populate/populate-images.sh +++ b/tools/populate/populate-images.sh @@ -1,5 +1,9 @@ #!/bin/bash # Populate an Oxide host running Omicron with images from server catacomb. +# +# Note that the default tunnel IP of `fd00:...` will only be available _after_ +# launching the control plane with `omicron-package install`, since Omicron +# creates that address. 
set -eu CATACOMB_TUNNEL="${CATACOMB_TUNNEL:-"[fd00:1122:3344:101::1]:54321"}" From 15668b153f6d376a3a0a21d45fa81c54998659ab Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 12 Jul 2022 03:27:55 +0000 Subject: [PATCH 2/7] Review feedback - Adds some better documentation around the semantics of dropping a port ticket - Adds links to issues in some todos - Splits illumos/other platform implementations at module level. --- sled-agent/src/illumos/running_zone.rs | 3 + sled-agent/src/instance.rs | 7 +- sled-agent/src/opte/illumos/mod.rs | 167 +++++++++++++ sled-agent/src/opte/{ => illumos}/port.rs | 11 +- .../src/opte/{ => illumos}/port_manager.rs | 132 +++------- sled-agent/src/opte/mod.rs | 209 +--------------- sled-agent/src/opte/non_illumos/mod.rs | 47 ++++ sled-agent/src/opte/non_illumos/port.rs | 120 +++++++++ .../src/opte/non_illumos/port_manager.rs | 228 ++++++++++++++++++ 9 files changed, 618 insertions(+), 306 deletions(-) create mode 100644 sled-agent/src/opte/illumos/mod.rs rename sled-agent/src/opte/{ => illumos}/port.rs (95%) rename sled-agent/src/opte/{ => illumos}/port_manager.rs (82%) create mode 100644 sled-agent/src/opte/non_illumos/mod.rs create mode 100644 sled-agent/src/opte/non_illumos/port.rs create mode 100644 sled-agent/src/opte/non_illumos/port_manager.rs diff --git a/sled-agent/src/illumos/running_zone.rs b/sled-agent/src/illumos/running_zone.rs index b03bb42be3..5a07c17c15 100644 --- a/sled-agent/src/illumos/running_zone.rs +++ b/sled-agent/src/illumos/running_zone.rs @@ -292,6 +292,9 @@ impl RunningZone { log: log.new(o!("zone" => zone_name.to_string())), name: zone_name.to_string(), control_vnic, + // TODO(https://github.com/oxidecomputer/omicron/issues/725) + // + // Re-initialize guest_vnic state by inspecting the zone. 
opte_ports: vec![], physical_nic: None, }, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 82215735ad..799c60c2b6 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -504,6 +504,11 @@ impl Instance { )?; opte_ports.push(port); } + // We only acquire and store the first port ticket in the zone. + // + // The Ports are stored in the manager as a list for each instance. The + // tickets point to that entire list, so calling `PortTicket::release` + // on any one ticket actually releases every port for the instance. let port_ticket = opte_ports.first().map(|port| port.ticket()); // Create a zone for the propolis instance, using the previously @@ -669,7 +674,7 @@ impl Instance { // And remove the OPTE ports from the port manager if let Some(ticket) = running_state.port_ticket.as_mut() { - ticket.release(); + ticket.release()?; } Ok(()) diff --git a/sled-agent/src/opte/illumos/mod.rs b/sled-agent/src/opte/illumos/mod.rs new file mode 100644 index 0000000000..6df9eef118 --- /dev/null +++ b/sled-agent/src/opte/illumos/mod.rs @@ -0,0 +1,167 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Interactions with the Oxide Packet Transformation Engine (OPTE) + +use crate::common::underlay; +use crate::illumos::dladm; +use opte_ioctl::OpteHdl; +use slog::Logger; +use std::fs; +use std::path::Path; + +pub use opte::api::Vni; + +mod port; +mod port_manager; + +pub use port::Port; +pub use port_manager::PortManager; +pub use port_manager::PortTicket; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Failure interacting with the OPTE ioctl(2) interface: {0}")] + Opte(#[from] opte_ioctl::Error), + + #[error("Failed to wrap OPTE port in a VNIC: {0}")] + CreateVnic(#[from] dladm::CreateVnicError), + + #[error("Failed to get VNICs for xde underlay devices: {0}")] + GetVnic(#[from] underlay::Error), + + #[error( + "No xde driver configuration file exists at '/kernel/drv/xde.conf'" + )] + NoXdeConf, + + #[error( + "The OS kernel does not support the xde driver. Please update the OS \ + using `./tools/install_opte.sh` to provide kernel bits and the xde \ + driver which are compatible." + )] + IncompatibleKernel, + + #[error(transparent)] + BadAddrObj(#[from] crate::illumos::addrobj::ParseError), + + #[error(transparent)] + SetLinkpropError(#[from] crate::illumos::dladm::SetLinkpropError), + + #[error(transparent)] + ResetLinkpropError(#[from] crate::illumos::dladm::ResetLinkpropError), +} + +/// Delete all xde devices on the system. +pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { + let hdl = OpteHdl::open(OpteHdl::DLD_CTL)?; + for port_info in hdl.list_ports()?.ports.into_iter() { + let name = &port_info.name; + info!( + log, + "deleting existing OPTE port and xde device"; + "device_name" => name + ); + hdl.delete_xde(name)?; + } + Ok(()) +} + +/// Initialize the underlay devices required for the xde kernel module. +/// +/// The xde driver needs information about the physical devices out which it can +/// send traffic from the guests. 
+pub fn initialize_xde_driver(log: &Logger) -> Result<(), Error> { + const XDE_CONF: &str = "/kernel/drv/xde.conf"; + let xde_conf = Path::new(XDE_CONF); + if !xde_conf.exists() { + return Err(Error::NoXdeConf); + } + + // TODO-remove + // + // See https://github.com/oxidecomputer/omicron/issues/1337 + // + // An additional part of the workaround to connect into instances. This is + // required to tell OPTE to actually act as a 1-1 NAT when an instance is + // provided with an external IP address, rather than do its normal job of + // encapsulating the traffic onto the underlay (such as for delivery to + // boundary services). + use_external_ip_workaround(&log, &xde_conf); + + let underlay_nics = underlay::find_nics()?; + info!(log, "using '{:?}' as data links for xde driver", underlay_nics); + if underlay_nics.len() < 2 { + const MESSAGE: &str = concat!( + "There must be at least two underlay NICs for the xde ", + "driver to operate. These are currently created by ", + "`./tools/create_virtual_hardware.sh`. Please ensure that ", + "script has been run, and that two VNICs named `net{0,1}` ", + "exist on the system." + ); + return Err(Error::Opte(opte_ioctl::Error::InvalidArgument( + String::from(MESSAGE), + ))); + } + match OpteHdl::open(OpteHdl::DLD_CTL)?.set_xde_underlay( + underlay_nics[0].interface(), + underlay_nics[1].interface(), + ) { + Ok(_) => Ok(()), + // Handle the specific case where the kernel appears to be unaware of + // xde at all. This implies the developer has not installed the correct + // helios-netdev kernel bits. + // + // TODO-correctness: This error should never occur in the product. Both + // xde the kernel driver and the kernel bits needed to recognize it will + // be packaged as part of our OS ramdisk, meaning it should not be + // possible to get out of sync. 
+ Err(opte_ioctl::Error::IoctlFailed(_, ref message)) + if message.contains("unexpected errno: 48") => + { + Err(Error::IncompatibleKernel) + } + // TODO-correctness: xde provides no way to get the current underlay + // devices we're using, but we'd probably like the further check that + // those are exactly what we're giving it now. + Err(opte_ioctl::Error::CommandError( + _, + opte::api::OpteError::System { errno: libc::EEXIST, .. }, + )) => Ok(()), + Err(e) => Err(e.into()), + } +} + +fn use_external_ip_workaround(log: &Logger, xde_conf: &Path) { + const NEEDLE: &str = "ext_ip_hack = 0;"; + const NEW_NEEDLE: &str = "ext_ip_hack = 1;"; + + // NOTE: This only works in the real sled agent, which is run as root. The + // file is not world-readable. + let contents = fs::read_to_string(xde_conf) + .expect("Failed to read xde configuration file"); + let new = contents.replace(NEEDLE, NEW_NEEDLE); + if contents == new { + info!( + log, + "xde driver configuration file appears to already use external IP workaround"; + "conf_file" => ?xde_conf, + ); + } else { + info!( + log, + "updating xde driver configuration file for external IP workaround"; + "conf_file" => ?xde_conf, + ); + fs::write(xde_conf, &new) + .expect("Failed to modify xde configuration file"); + } + + // Ensure the driver picks up the updated configuration file, if it's been + // loaded previously without the workaround. + std::process::Command::new(crate::illumos::PFEXEC) + .args(&["update_drv", "xde"]) + .output() + .expect("Failed to reload xde driver configuration file"); +} diff --git a/sled-agent/src/opte/port.rs b/sled-agent/src/opte/illumos/port.rs similarity index 95% rename from sled-agent/src/opte/port.rs rename to sled-agent/src/opte/illumos/port.rs index 396a6743f8..93fbe57373 100644 --- a/sled-agent/src/opte/port.rs +++ b/sled-agent/src/opte/illumos/port.rs @@ -4,12 +4,11 @@ //! A single port on the OPTE virtual switch. 
-use super::port_manager::PortTicket; -use super::BoundaryServices; -use super::Gateway; -use super::Vni; -#[cfg(target_os = "illumos")] use crate::illumos::dladm::Dladm; +use crate::opte::BoundaryServices; +use crate::opte::Gateway; +use crate::opte::PortTicket; +use crate::opte::Vni; use crate::params::ExternalIp; use ipnetwork::IpNetwork; use macaddr::MacAddr6; @@ -18,7 +17,6 @@ use std::net::Ipv6Addr; use std::sync::Arc; #[derive(Debug)] -#[cfg_attr(not(target_os = "illumos"), allow(dead_code))] struct PortInner { // Contains instance ID and a pointer to the parent manager ticket: PortTicket, @@ -56,7 +54,6 @@ struct PortInner { vnic: String, } -#[cfg(target_os = "illumos")] impl Drop for PortInner { fn drop(&mut self) { if let Err(e) = Dladm::delete_vnic(&self.vnic) { diff --git a/sled-agent/src/opte/port_manager.rs b/sled-agent/src/opte/illumos/port_manager.rs similarity index 82% rename from sled-agent/src/opte/port_manager.rs rename to sled-agent/src/opte/illumos/port_manager.rs index 39d24e106c..b45f1378b3 100644 --- a/sled-agent/src/opte/port_manager.rs +++ b/sled-agent/src/opte/illumos/port_manager.rs @@ -4,18 +4,29 @@ //! 
Manager for all OPTE ports on a Helios system -use super::BoundaryServices; -use super::Error; -use super::Gateway; -use super::Port; -use super::Vni; +use crate::illumos::dladm::Dladm; use crate::illumos::dladm::PhysicalLink; +use crate::illumos::dladm::VnicSource; +use crate::opte::BoundaryServices; +use crate::opte::Error; +use crate::opte::Gateway; +use crate::opte::Port; +use crate::opte::Vni; use crate::params::ExternalIp; use crate::params::NetworkInterface; use ipnetwork::IpNetwork; use macaddr::MacAddr6; +use opte::api::IpCidr; +use opte::api::Ipv4Cidr; +use opte::api::Ipv4PrefixLen; +use opte::api::MacAddr; +use opte::oxide_vpc::api::AddRouterEntryIpv4Req; +use opte::oxide_vpc::api::RouterTarget; +use opte::oxide_vpc::api::SNatCfg; +use opte_ioctl::OpteHdl; use slog::debug; use slog::info; +use slog::warn; use slog::Logger; use std::collections::BTreeMap; use std::net::IpAddr; @@ -24,30 +35,13 @@ use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; +use std::sync::MutexGuard; use uuid::Uuid; -cfg_if::cfg_if! { - if #[cfg(target_os = "illumos")] { - use crate::illumos::dladm::Dladm; - use crate::illumos::dladm::VnicSource; - use std::sync::MutexGuard; - use opte::api::IpCidr; - use opte::api::Ipv4Cidr; - use opte::api::Ipv4PrefixLen; - use opte::api::MacAddr; - use opte::oxide_vpc::api::AddRouterEntryIpv4Req; - use opte::oxide_vpc::api::RouterTarget; - use opte::oxide_vpc::api::SNatCfg; - use opte_ioctl::OpteHdl; - use slog::warn; - } -} - // Prefix used to identify xde data links. 
const XDE_LINK_PREFIX: &str = "opte"; #[derive(Debug)] -#[cfg_attr(not(target_os = "illumos"), allow(dead_code))] struct PortManagerInner { log: Logger, @@ -56,12 +50,16 @@ struct PortManagerInner { // TODO-remove: This is part of the external IP address workaround // + // See https://github.com/oxidecomputer/omicron/issues/1335 + // // We only need to know this while we're setting the secondary MACs of the // link to support OPTE's proxy ARP for the guest's IP. data_link: PhysicalLink, // TODO-remove: This is part of the external IP address workaround. // + // See https://github.com/oxidecomputer/omicron/issues/1335 + // // We only need this while OPTE needs to forward traffic to the local // gateway. This will be replaced by boundary services. gateway_mac: MacAddr6, @@ -89,13 +87,14 @@ impl PortManagerInner { // TODO-remove // + // See https://github.com/oxidecomputer/omicron/issues/1335 + // // This is part of the workaround to get external connectivity into // instances, without setting up all of boundary services. Rather than // encap/decap the guest traffic, OPTE just performs 1-1 NAT between the // private IP address of the guest and the external address provided by // the control plane. This call here allows the underlay nic, `net0` to // advertise as having the guest's MAC address. - #[cfg(target_os = "illumos")] fn update_secondary_macs( &self, ports: &mut MutexGuard<'_, BTreeMap>>, @@ -174,7 +173,6 @@ impl PortManager { } /// Create an OPTE port for the given guest instance. - #[cfg(target_os = "illumos")] pub fn create_port( &self, instance_id: Uuid, @@ -381,6 +379,8 @@ impl PortManager { // TODO-remove // + // See https://github.com/oxidecomputer/omicron/issues/1336 + // // This is another part of the workaround, allowing reply traffic from // the guest back out. Normally, OPTE would drop such traffic at the // router layer, as it has no route for that external IP address. 
This @@ -398,7 +398,7 @@ impl PortManager { Ipv4Cidr::new(std::net::Ipv4Addr::UNSPECIFIED.into(), prefix); let target = RouterTarget::InternetGateway; hdl.add_router_entry_ip4(&AddRouterEntryIpv4Req { - port_name: port_name.clone(), + port_name, dest, target, })?; @@ -410,75 +410,6 @@ impl PortManager { ); Ok(port) } - - #[cfg(not(target_os = "illumos"))] - pub fn create_port( - &self, - instance_id: Uuid, - nic: &NetworkInterface, - external_ip: Option, - ) -> Result { - // TODO-completess: Remove IPv4 restrictions once OPTE supports virtual - // IPv6 networks. - let _ = match nic.ip { - IpAddr::V4(ip) => Ok(ip), - IpAddr::V6(_) => Err(Error::InvalidArgument(String::from( - "IPv6 is not yet supported for guest interfaces", - ))), - }?; - - // Argument checking and conversions into OPTE data types. - let subnet = IpNetwork::from(nic.subnet); - let mac = *nic.mac; - let vni = Vni::new(nic.vni).unwrap(); - let gateway = match subnet { - IpNetwork::V4(_) => Gateway::from_subnet(&subnet), - IpNetwork::V6(_) => { - return Err(Error::InvalidArgument(String::from( - "IPv6 is not yet supported for guest interfaces", - ))); - } - }; - let _ = match gateway.ip { - IpAddr::V4(ip) => Ok(ip), - IpAddr::V6(_) => Err(Error::InvalidArgument(String::from( - "IPv6 is not yet supported for guest interfaces", - ))), - }?; - let boundary_services = BoundaryServices::default(); - let port_name = self.inner.next_port_name(); - let vnic = format!("v{}", port_name); - let port = { - let mut ports = self.inner.ports.lock().unwrap(); - let ticket = PortTicket::new(instance_id, self.inner.clone()); - let port = Port::new( - ticket, - port_name.clone(), - nic.ip, - subnet, - mac, - nic.slot, - vni, - self.inner.underlay_ip, - external_ip, - gateway, - boundary_services, - vnic, - ); - ports - .entry(instance_id) - .or_insert_with(Vec::new) - .push(port.clone()); - port - }; - - info!( - self.inner.log, - "Created OPTE port for guest"; - "port" => ?&port, - ); - Ok(port) - } } #[derive(Clone)] 
@@ -506,7 +437,7 @@ impl PortTicket { Self { id, manager: Some(manager) } } - pub fn release(&mut self) { + pub fn release(&mut self) -> Result<(), Error> { if let Some(manager) = self.manager.take() { let mut ports = manager.ports.lock().unwrap(); let n_ports = ports.remove(&self.id).map(|p| p.len()).unwrap_or(0); @@ -516,7 +447,6 @@ impl PortTicket { "instance_id" => ?self.id, "n_ports" => n_ports, ); - #[cfg(target_os = "illumos")] if let Err(e) = manager.update_secondary_macs(&mut ports) { warn!( manager.log, @@ -525,13 +455,19 @@ impl PortTicket { "instance_id" => ?self.id, "err" => ?e, ); + return Err(e); + } else { + return Ok(()); } } + Ok(()) } } impl Drop for PortTicket { fn drop(&mut self) { - self.release(); + // We're ignoring the value since (1) it's already logged and (2) we + // can't do anything with it anyway. + let _ = self.release(); } } diff --git a/sled-agent/src/opte/mod.rs b/sled-agent/src/opte/mod.rs index b537b2e336..d11e95b9b4 100644 --- a/sled-agent/src/opte/mod.rs +++ b/sled-agent/src/opte/mod.rs @@ -2,212 +2,21 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Interactions with the Oxide Packet Transformation Engine (OPTE) - -use ipnetwork::IpNetwork; -use macaddr::MacAddr6; -use slog::Logger; -use std::net::IpAddr; -use std::net::Ipv6Addr; - -mod port; -mod port_manager; - -pub use port::Port; -pub use port_manager::PortManager; -pub use port_manager::PortTicket; - cfg_if::cfg_if! 
{ if #[cfg(target_os = "illumos")] { - use crate::illumos::dladm; - use crate::common::underlay; - use opte_ioctl::OpteHdl; - use std::fs; - use std::path::Path; - - pub use opte::api::Vni; - - #[derive(thiserror::Error, Debug)] - pub enum Error { - #[error("Failure interacting with the OPTE ioctl(2) interface: {0}")] - Opte(#[from] opte_ioctl::Error), - - #[error("Failed to wrap OPTE port in a VNIC: {0}")] - CreateVnic(#[from] dladm::CreateVnicError), - - #[error("Failed to get VNICs for xde underlay devices: {0}")] - GetVnic(#[from] underlay::Error), - - #[error( - "No xde driver configuration file exists at '/kernel/drv/xde.conf'" - )] - NoXdeConf, - - #[error( - "The OS kernel does not support the xde driver. Please update the OS \ - using `./tools/install_opte.sh` to provide kernel bits and the xde \ - driver which are compatible." - )] - IncompatibleKernel, - - #[error(transparent)] - BadAddrObj(#[from] crate::illumos::addrobj::ParseError), - - #[error(transparent)] - SetLinkpropError(#[from] crate::illumos::dladm::SetLinkpropError), - - #[error(transparent)] - ResetLinkpropError(#[from] crate::illumos::dladm::ResetLinkpropError), - } - - /// Delete all xde devices on the system. - pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { - let hdl = OpteHdl::open(OpteHdl::DLD_CTL)?; - for port_info in hdl.list_ports()?.ports.into_iter() { - let name = &port_info.name; - info!( - log, - "deleting existing OPTE port and xde device"; - "device_name" => name - ); - hdl.delete_xde(name)?; - } - Ok(()) - } - - /// Initialize the underlay devices required for the xde kernel module. - /// - /// The xde driver needs information about the physical devices out which it can - /// send traffic from the guests. 
- pub fn initialize_xde_driver(log: &Logger) -> Result<(), Error> { - const XDE_CONF: &str = "/kernel/drv/xde.conf"; - let xde_conf = Path::new(XDE_CONF); - if !xde_conf.exists() { - return Err(Error::NoXdeConf); - } - - // TODO-remove - // - // An additional part of the workaround to connect into instances. This is - // required to tell OPTE to actually act as a 1-1 NAT when an instance is - // provided with an external IP address, rather than do its normal job of - // encapsulating the traffic onto the underlay (such as for delivery to - // boundary services). - use_external_ip_workaround(&log, &xde_conf); - - let underlay_nics = underlay::find_nics()?; - info!(log, "using '{:?}' as data links for xde driver", underlay_nics); - if underlay_nics.len() < 2 { - const MESSAGE: &str = concat!( - "There must be at least two underlay NICs for the xde ", - "driver to operate. These are currently created by ", - "`./tools/create_virtual_hardware.sh`. Please ensure that ", - "script has been run, and that two VNICs named `net{0,1}` ", - "exist on the system." - ); - return Err(Error::Opte(opte_ioctl::Error::InvalidArgument( - String::from(MESSAGE), - ))); - } - match OpteHdl::open(OpteHdl::DLD_CTL)?.set_xde_underlay( - underlay_nics[0].interface(), - underlay_nics[1].interface(), - ) { - Ok(_) => Ok(()), - // Handle the specific case where the kernel appears to be unaware of - // xde at all. This implies the developer has not installed the correct - // helios-netdev kernel bits. - // - // TODO-correctness: This error should never occur in the product. Both - // xde the kernel driver and the kernel bits needed to recognize it will - // be packaged as part of our OS ramdisk, meaning it should not be - // possible to get out of sync. 
- Err(opte_ioctl::Error::IoctlFailed(_, ref message)) - if message.contains("unexpected errno: 48") => - { - Err(Error::IncompatibleKernel) - } - // TODO-correctness: xde provides no way to get the current underlay - // devices we're using, but we'd probably like the further check that - // those are exactly what we're giving it now. - Err(opte_ioctl::Error::CommandError( - _, - opte::api::OpteError::System { errno: libc::EEXIST, .. }, - )) => Ok(()), - Err(e) => Err(e.into()), - } - } - - fn use_external_ip_workaround(log: &Logger, xde_conf: &Path) { - const NEEDLE: &str = "ext_ip_hack = 0;"; - const NEW_NEEDLE: &str = "ext_ip_hack = 1;"; - - // NOTE: This only works in the real sled agent, which is run as root. The - // file is not world-readable. - let contents = fs::read_to_string(xde_conf) - .expect("Failed to read xde configuration file"); - let new = contents.replace(NEEDLE, NEW_NEEDLE); - if contents == new { - info!( - log, - "xde driver configuration file appears to already use external IP workaround"; - "conf_file" => ?xde_conf, - ); - } else { - info!( - log, - "updating xde driver configuration file for external IP workaround"; - "conf_file" => ?xde_conf, - ); - fs::write(xde_conf, &new) - .expect("Failed to modify xde configuration file"); - } - - // Ensure the driver picks up the updated configuration file, if it's been - // loaded previously without the workaround. 
- std::process::Command::new(crate::illumos::PFEXEC) - .args(&["update_drv", "xde"]) - .output() - .expect("Failed to reload xde driver configuration file"); - } - + mod illumos; + pub use illumos::*; } else { - - #[derive(Debug, Clone, Copy)] - pub struct Vni(u32); - - impl Vni { - pub fn new(n: N) -> Result - where - N: Into, - { - let x = n.into(); - if x <= 0x00_FF_FF_FF { - Ok(Self(x)) - } else { - Err(Error::InvalidArgument(format!("invalid VNI: {}", x))) - } - } - } - - #[derive(thiserror::Error, Debug)] - pub enum Error { - #[error("Invalid argument: {0}")] - InvalidArgument(String), - } - - pub fn initialize_xde_driver(log: &Logger) -> Result<(), Error> { - slog::warn!(log, "`xde` driver is a fiction on non-illumos systems"); - Ok(()) - } - - pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { - slog::warn!(log, "`xde` driver is a fiction on non-illumos systems"); - Ok(()) - } + mod non_illumos; + pub use non_illumos::*; } } +use ipnetwork::IpNetwork; +use macaddr::MacAddr6; +use std::net::IpAddr; +use std::net::Ipv6Addr; + /// Location information for reaching Boundary Services, for directing /// inter-sled or off-rack traffic from guests. #[derive(Debug, Clone, Copy)] diff --git a/sled-agent/src/opte/non_illumos/mod.rs b/sled-agent/src/opte/non_illumos/mod.rs new file mode 100644 index 0000000000..c7bcd2dfe6 --- /dev/null +++ b/sled-agent/src/opte/non_illumos/mod.rs @@ -0,0 +1,47 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Mock / dummy versions of the OPTE module, for non-illumos platforms + +use slog::Logger; + +mod port; +mod port_manager; + +pub use port::Port; +pub use port_manager::PortManager; +pub use port_manager::PortTicket; + +#[derive(Debug, Clone, Copy)] +pub struct Vni(u32); + +impl Vni { + pub fn new(n: N) -> Result + where + N: Into, + { + let x = n.into(); + if x <= 0x00_FF_FF_FF { + Ok(Self(x)) + } else { + Err(Error::InvalidArgument(format!("invalid VNI: {}", x))) + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Invalid argument: {0}")] + InvalidArgument(String), +} + +pub fn initialize_xde_driver(log: &Logger) -> Result<(), Error> { + slog::warn!(log, "`xde` driver is a fiction on non-illumos systems"); + Ok(()) +} + +pub fn delete_all_xde_devices(log: &Logger) -> Result<(), Error> { + slog::warn!(log, "`xde` driver is a fiction on non-illumos systems"); + Ok(()) +} diff --git a/sled-agent/src/opte/non_illumos/port.rs b/sled-agent/src/opte/non_illumos/port.rs new file mode 100644 index 0000000000..99da12b709 --- /dev/null +++ b/sled-agent/src/opte/non_illumos/port.rs @@ -0,0 +1,120 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A single port on the OPTE virtual switch. 
+ +use super::port_manager::PortTicket; +use super::BoundaryServices; +use super::Gateway; +use super::Vni; +use crate::params::ExternalIp; +use ipnetwork::IpNetwork; +use macaddr::MacAddr6; +use std::net::IpAddr; +use std::net::Ipv6Addr; +use std::sync::Arc; + +#[derive(Debug)] +#[allow(dead_code)] +struct PortInner { + // Contains instance ID and a pointer to the parent manager + ticket: PortTicket, + // Name of the port as identified by OPTE + name: String, + // IP address within the VPC Subnet + _ip: IpAddr, + // VPC Subnet + _subnet: IpNetwork, + // VPC-private MAC address + mac: MacAddr6, + // Emulated PCI slot for the guest NIC, passed to Propolis + slot: u8, + // Geneve VNI for the VPC + _vni: Vni, + // IP address of the hosting sled + _underlay_ip: Ipv6Addr, + // The external IP information for this port, or None if it has no external + // connectivity. Only the primary interface has Some(_) here. + external_ip: Option, + // Information about the virtual gateway, aka OPTE + _gateway: Gateway, + // Information about Boundary Services, for forwarding traffic between sleds + // or off the rack. + _boundary_services: BoundaryServices, + // TODO-correctness: Remove this once we can put Viona directly on top of an + // OPTE port device. + // + // NOTE: This is intentionally not an actual `Vnic` object. We'd like to + // delete the VNIC manually in `PortInner::drop`, because we _can't_ delete + // the xde device if we fail to delete the VNIC. See + // https://github.com/oxidecomputer/opte/issues/178 for more details. This + // can be changed back to a real VNIC when that is resolved, and the Drop + // impl below can simplify to just call `drop(self.vnic)`. + vnic: String, +} + +/// A port on the OPTE virtual switch, providing the virtual networking +/// abstractions for guest instances. +/// +/// Note that the type is clonable and refers to the same underlying port on the +/// system. 
+#[derive(Debug, Clone)] +pub struct Port { + inner: Arc, +} + +impl Port { + #[allow(clippy::too_many_arguments)] + pub fn new( + ticket: PortTicket, + name: String, + ip: IpAddr, + subnet: IpNetwork, + mac: MacAddr6, + slot: u8, + vni: Vni, + underlay_ip: Ipv6Addr, + external_ip: Option, + gateway: Gateway, + boundary_services: BoundaryServices, + vnic: String, + ) -> Self { + Self { + inner: Arc::new(PortInner { + ticket, + name, + _ip: ip, + _subnet: subnet, + mac, + slot, + _vni: vni, + _underlay_ip: underlay_ip, + external_ip, + _gateway: gateway, + _boundary_services: boundary_services, + vnic, + }), + } + } + + pub fn external_ip(&self) -> &Option { + &self.inner.external_ip + } + + pub fn mac(&self) -> &MacAddr6 { + &self.inner.mac + } + + pub fn vnic_name(&self) -> &str { + &self.inner.vnic + } + + pub fn slot(&self) -> u8 { + self.inner.slot + } + + pub fn ticket(&self) -> PortTicket { + self.inner.ticket.clone() + } +} diff --git a/sled-agent/src/opte/non_illumos/port_manager.rs b/sled-agent/src/opte/non_illumos/port_manager.rs new file mode 100644 index 0000000000..1b1b43343d --- /dev/null +++ b/sled-agent/src/opte/non_illumos/port_manager.rs @@ -0,0 +1,228 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Manager for all OPTE ports on a Helios system + +use super::BoundaryServices; +use super::Error; +use super::Gateway; +use super::Port; +use super::Vni; +use crate::illumos::dladm::PhysicalLink; +use crate::params::ExternalIp; +use crate::params::NetworkInterface; +use ipnetwork::IpNetwork; +use macaddr::MacAddr6; +use slog::debug; +use slog::info; +use slog::Logger; +use std::collections::BTreeMap; +use std::net::IpAddr; +use std::net::Ipv6Addr; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::sync::Mutex; +use uuid::Uuid; + +// Prefix used to identify xde data links. +const XDE_LINK_PREFIX: &str = "opte"; + +#[derive(Debug)] +#[allow(dead_code)] +struct PortManagerInner { + log: Logger, + + // Sequential identifier for each port on the system. + next_port_id: AtomicU64, + + // TODO-remove: This is part of the external IP address workaround + // + // See https://github.com/oxidecomputer/omicron/issues/1335 + // + // We only need to know this while we're setting the secondary MACs of the + // link to support OPTE's proxy ARP for the guest's IP. + data_link: PhysicalLink, + + // TODO-remove: This is part of the external IP address workaround. + // + // See https://github.com/oxidecomputer/omicron/issues/1335 + // + // We only need this while OPTE needs to forward traffic to the local + // gateway. This will be replaced by boundary services. + gateway_mac: MacAddr6, + + // IP address of the hosting sled on the underlay. + underlay_ip: Ipv6Addr, + + // Map of instance ID to list of ports. + // + // NOTE: By storing all ports in a vector, the ticket mechanism makes the + // first dropped port cause the whole vector to be dropped. The remaining + // ports' drop impls will still call remove on this map, but there will no + // longer be a value with that key. 
+ ports: Mutex>>, +} + +impl PortManagerInner { + fn next_port_name(&self) -> String { + format!( + "{}{}", + XDE_LINK_PREFIX, + self.next_port_id.fetch_add(1, Ordering::SeqCst) + ) + } +} + +/// The port manager controls all OPTE ports on a single host. +#[derive(Debug, Clone)] +pub struct PortManager { + inner: Arc, +} + +impl PortManager { + /// Create a new manager, for creating OPTE ports for guest network + /// interfaces + pub fn new( + log: Logger, + underlay_ip: Ipv6Addr, + gateway_mac: MacAddr6, + ) -> Self { + let data_link = crate::common::underlay::find_chelsio_links() + .unwrap() + .into_iter() + .next() + .unwrap(); + let inner = Arc::new(PortManagerInner { + log, + next_port_id: AtomicU64::new(0), + data_link, + gateway_mac, + underlay_ip, + ports: Mutex::new(BTreeMap::new()), + }); + + Self { inner } + } + + pub fn underlay_ip(&self) -> &Ipv6Addr { + &self.inner.underlay_ip + } + + pub fn create_port( + &self, + instance_id: Uuid, + nic: &NetworkInterface, + external_ip: Option, + ) -> Result { + // TODO-completess: Remove IPv4 restrictions once OPTE supports virtual + // IPv6 networks. + let _ = match nic.ip { + IpAddr::V4(ip) => Ok(ip), + IpAddr::V6(_) => Err(Error::InvalidArgument(String::from( + "IPv6 is not yet supported for guest interfaces", + ))), + }?; + + // Argument checking and conversions into OPTE data types. 
+ let subnet = IpNetwork::from(nic.subnet); + let mac = *nic.mac; + let vni = Vni::new(nic.vni).unwrap(); + let gateway = match subnet { + IpNetwork::V4(_) => Gateway::from_subnet(&subnet), + IpNetwork::V6(_) => { + return Err(Error::InvalidArgument(String::from( + "IPv6 is not yet supported for guest interfaces", + ))); + } + }; + let _ = match gateway.ip { + IpAddr::V4(ip) => Ok(ip), + IpAddr::V6(_) => Err(Error::InvalidArgument(String::from( + "IPv6 is not yet supported for guest interfaces", + ))), + }?; + let boundary_services = BoundaryServices::default(); + let port_name = self.inner.next_port_name(); + let vnic = format!("v{}", port_name); + let port = { + let mut ports = self.inner.ports.lock().unwrap(); + let ticket = PortTicket::new(instance_id, self.inner.clone()); + let port = Port::new( + ticket, + port_name.clone(), + nic.ip, + subnet, + mac, + nic.slot, + vni, + self.inner.underlay_ip, + external_ip, + gateway, + boundary_services, + vnic, + ); + ports + .entry(instance_id) + .or_insert_with(Vec::new) + .push(port.clone()); + port + }; + + info!( + self.inner.log, + "Created OPTE port for guest"; + "port" => ?&port, + ); + Ok(port) + } +} + +#[derive(Clone)] +pub struct PortTicket { + id: Uuid, + manager: Option>, +} + +impl std::fmt::Debug for PortTicket { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + const SOME: &str = "Some(_)"; + const NONE: &str = "None"; + f.debug_struct("PortTicket") + .field("id", &self.id) + .field( + "manager", + if self.manager.is_some() { &SOME } else { &NONE }, + ) + .finish() + } +} + +impl PortTicket { + fn new(id: Uuid, manager: Arc) -> Self { + Self { id, manager: Some(manager) } + } + + pub fn release(&mut self) -> Result<(), Error> { + if let Some(manager) = self.manager.take() { + let mut ports = manager.ports.lock().unwrap(); + let n_ports = ports.remove(&self.id).map(|p| p.len()).unwrap_or(0); + debug!( + manager.log, + "Removing OPTE ports from manager"; + "instance_id" => ?self.id, + 
"n_ports" => n_ports, + ); + } + Ok(()) + } +} + +impl Drop for PortTicket { + fn drop(&mut self) { + // We're ignoring the value since (1) it's already logged and (2) we + // can't do anything with it anyway. + let _ = self.release(); + } +} From 4edec92758c3a02473e58d5f4faed07b3a3cb3b5 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 11 Jul 2022 20:52:04 -0700 Subject: [PATCH 3/7] Fixup non-illumos OPTE module --- sled-agent/src/opte/non_illumos/port.rs | 8 ++++---- sled-agent/src/opte/non_illumos/port_manager.rs | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sled-agent/src/opte/non_illumos/port.rs b/sled-agent/src/opte/non_illumos/port.rs index 99da12b709..3d3745329a 100644 --- a/sled-agent/src/opte/non_illumos/port.rs +++ b/sled-agent/src/opte/non_illumos/port.rs @@ -4,10 +4,10 @@ //! A single port on the OPTE virtual switch. -use super::port_manager::PortTicket; -use super::BoundaryServices; -use super::Gateway; -use super::Vni; +use crate::opte::BoundaryServices; +use crate::opte::Gateway; +use crate::opte::PortTicket; +use crate::opte::Vni; use crate::params::ExternalIp; use ipnetwork::IpNetwork; use macaddr::MacAddr6; diff --git a/sled-agent/src/opte/non_illumos/port_manager.rs b/sled-agent/src/opte/non_illumos/port_manager.rs index 1b1b43343d..816ba9cf5f 100644 --- a/sled-agent/src/opte/non_illumos/port_manager.rs +++ b/sled-agent/src/opte/non_illumos/port_manager.rs @@ -4,12 +4,12 @@ //! 
Manager for all OPTE ports on a Helios system -use super::BoundaryServices; -use super::Error; -use super::Gateway; -use super::Port; -use super::Vni; use crate::illumos::dladm::PhysicalLink; +use crate::opte::BoundaryServices; +use crate::opte::Error; +use crate::opte::Gateway; +use crate::opte::Port; +use crate::opte::Vni; use crate::params::ExternalIp; use crate::params::NetworkInterface; use ipnetwork::IpNetwork; @@ -151,7 +151,7 @@ impl PortManager { let ticket = PortTicket::new(instance_id, self.inner.clone()); let port = Port::new( ticket, - port_name.clone(), + port_name, nic.ip, subnet, mac, From a98be984a63b3bc36441ef7c2c17f9897b6bc16b Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 12 Jul 2022 15:47:02 +0000 Subject: [PATCH 4/7] Store OPTE ports individually in the manager - Store ports keyed on the combination of instance ID and port name, allowing each port to be stored as a separate kv-pair - Collect a ticket for each port in the running zone, and release each port explicitly and separately --- sled-agent/src/instance.rs | 30 +++++++------ sled-agent/src/opte/illumos/port_manager.rs | 45 +++++++++++-------- .../src/opte/non_illumos/port_manager.rs | 40 ++++++++++------- 3 files changed, 66 insertions(+), 49 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 4768e536da..08394669af 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -160,8 +160,8 @@ struct RunningState { client: Arc, // Object representing membership in the "instance manager". instance_ticket: InstanceTicket, - // Object representing the instance's OPTE ports in the port manager - port_ticket: Option, + // Objects representing the instance's OPTE ports in the port manager + port_tickets: Option>, // Handle to task monitoring for Propolis state changes. monitor_task: Option>, // Handle to the zone. 
@@ -194,7 +194,7 @@ impl Drop for RunningState { struct PropolisSetup { client: Arc, running_zone: RunningZone, - port_ticket: Option, + port_tickets: Option>, } struct InstanceInner { @@ -301,7 +301,7 @@ impl InstanceInner { setup: PropolisSetup, migrate: Option, ) -> Result<(), Error> { - let PropolisSetup { client, running_zone, port_ticket } = setup; + let PropolisSetup { client, running_zone, port_tickets } = setup; let nics = running_zone .opte_ports() @@ -359,7 +359,7 @@ impl InstanceInner { self.running_state = Some(RunningState { client, instance_ticket, - port_ticket, + port_tickets, monitor_task, _running_zone: running_zone, }); @@ -499,6 +499,7 @@ impl Instance { ) -> Result { // Create OPTE ports for the instance let mut opte_ports = Vec::with_capacity(inner.requested_nics.len()); + let mut port_tickets = Vec::with_capacity(inner.requested_nics.len()); for nic in inner.requested_nics.iter() { let external_ip = if nic.primary { Some(inner.external_ip) } else { None }; @@ -507,14 +508,9 @@ impl Instance { nic, external_ip, )?; + port_tickets.push(port.ticket()); opte_ports.push(port); } - // We only acquire and store the first port ticket in the zone. - // - // The Ports are stored in the manager as a list for each instance. The - // tickets point to that entire list, so calling `PortTicket::release` - // on any one ticket actually releases every port for the instance. - let port_ticket = opte_ports.first().map(|port| port.ticket()); // Create a zone for the propolis instance, using the previously // configured VNICs. @@ -644,7 +640,11 @@ impl Instance { // don't need to worry about initialization races. wait_for_http_server(&inner.log, &client).await?; - Ok(PropolisSetup { client, running_zone, port_ticket }) + Ok(PropolisSetup { + client, + running_zone, + port_tickets: Some(port_tickets), + }) } /// Begins the execution of the instance's service (Propolis). 
@@ -678,8 +678,10 @@ impl Instance { running_state.instance_ticket.terminate(); // And remove the OPTE ports from the port manager - if let Some(ticket) = running_state.port_ticket.as_mut() { - ticket.release()?; + if let Some(tickets) = running_state.port_tickets.as_mut() { + for ticket in tickets.iter_mut() { + ticket.release()?; + } } Ok(()) diff --git a/sled-agent/src/opte/illumos/port_manager.rs b/sled-agent/src/opte/illumos/port_manager.rs index b45f1378b3..cd6eb20593 100644 --- a/sled-agent/src/opte/illumos/port_manager.rs +++ b/sled-agent/src/opte/illumos/port_manager.rs @@ -67,13 +67,8 @@ struct PortManagerInner { // IP address of the hosting sled on the underlay. underlay_ip: Ipv6Addr, - // Map of instance ID to list of ports. - // - // NOTE: By storing all ports in a vector, the ticket mechanism makes the - // first dropped port cause the whole vector to be dropped. The remaining - // ports' drop impls will still call remove on this map, but there will no - // longer be a value with that key. - ports: Mutex>>, + // Map of all ports, keyed on the instance Uuid and the port name. + ports: Mutex>, } impl PortManagerInner { @@ -97,11 +92,10 @@ impl PortManagerInner { // advertise as having the guest's MAC address. fn update_secondary_macs( &self, - ports: &mut MutexGuard<'_, BTreeMap>>, + ports: &mut MutexGuard<'_, BTreeMap<(Uuid, String), Port>>, ) -> Result<(), Error> { let secondary_macs = ports .values() - .flatten() .filter_map(|port| { // Only advertise Ports with an external address (primary // interface for an instance). 
@@ -321,7 +315,11 @@ impl PortManager { let port = { let mut ports = self.inner.ports.lock().unwrap(); - let ticket = PortTicket::new(instance_id, self.inner.clone()); + let ticket = PortTicket::new( + instance_id, + port_name.clone(), + self.inner.clone(), + ); let port = Port::new( ticket, port_name.clone(), @@ -336,10 +334,14 @@ impl PortManager { boundary_services, vnic, ); - ports - .entry(instance_id) - .or_insert_with(Vec::new) - .push(port.clone()); + let old = + ports.insert((instance_id, port_name.clone()), port.clone()); + assert!( + old.is_none(), + "Duplicate OPTE port detected: instance_id = {}, port_name = {}", + instance_id, + &port_name, + ); self.inner.update_secondary_macs(&mut ports)?; port }; @@ -415,6 +417,7 @@ impl PortManager { #[derive(Clone)] pub struct PortTicket { id: Uuid, + port_name: String, manager: Option>, } @@ -433,19 +436,23 @@ impl std::fmt::Debug for PortTicket { } impl PortTicket { - fn new(id: Uuid, manager: Arc) -> Self { - Self { id, manager: Some(manager) } + fn new( + id: Uuid, + port_name: String, + manager: Arc, + ) -> Self { + Self { id, port_name, manager: Some(manager) } } pub fn release(&mut self) -> Result<(), Error> { if let Some(manager) = self.manager.take() { let mut ports = manager.ports.lock().unwrap(); - let n_ports = ports.remove(&self.id).map(|p| p.len()).unwrap_or(0); + ports.remove(&(self.id, self.port_name.clone())); debug!( manager.log, - "Removing OPTE ports from manager"; + "Removing OPTE port from manager"; "instance_id" => ?self.id, - "n_ports" => n_ports, + "port_name" => &self.port_name, ); if let Err(e) = manager.update_secondary_macs(&mut ports) { warn!( diff --git a/sled-agent/src/opte/non_illumos/port_manager.rs b/sled-agent/src/opte/non_illumos/port_manager.rs index 816ba9cf5f..cd927c0f6c 100644 --- a/sled-agent/src/opte/non_illumos/port_manager.rs +++ b/sled-agent/src/opte/non_illumos/port_manager.rs @@ -56,13 +56,8 @@ struct PortManagerInner { // IP address of the hosting sled on the 
underlay. underlay_ip: Ipv6Addr, - // Map of instance ID to list of ports. - // - // NOTE: By storing all ports in a vector, the ticket mechanism makes the - // first dropped port cause the whole vector to be dropped. The remaining - // ports' drop impls will still call remove on this map, but there will no - // longer be a value with that key. - ports: Mutex>>, + // Map of all ports, keyed on the instance Uuid and the port name. + ports: Mutex>, } impl PortManagerInner { @@ -148,7 +143,11 @@ impl PortManager { let vnic = format!("v{}", port_name); let port = { let mut ports = self.inner.ports.lock().unwrap(); - let ticket = PortTicket::new(instance_id, self.inner.clone()); + let ticket = PortTicket::new( + instance_id, + port_name.clone(), + self.inner.clone(), + ); let port = Port::new( ticket, port_name, @@ -163,10 +162,14 @@ impl PortManager { boundary_services, vnic, ); - ports - .entry(instance_id) - .or_insert_with(Vec::new) - .push(port.clone()); + let old = + ports.insert((instance_id, port_name.clone()), port.clone()); + assert!( + old.is_none(), + "Duplicate OPTE port detected: instance_id = {}, port_name = {}", + instance_id, + &port_name, + ); port }; @@ -182,6 +185,7 @@ impl PortManager { #[derive(Clone)] pub struct PortTicket { id: Uuid, + name: String, manager: Option>, } @@ -200,19 +204,23 @@ impl std::fmt::Debug for PortTicket { } impl PortTicket { - fn new(id: Uuid, manager: Arc) -> Self { - Self { id, manager: Some(manager) } + fn new( + id: Uuid, + port_name: String, + manager: Arc, + ) -> Self { + Self { id, port_name, manager: Some(manager) } } pub fn release(&mut self) -> Result<(), Error> { if let Some(manager) = self.manager.take() { let mut ports = manager.ports.lock().unwrap(); - let n_ports = ports.remove(&self.id).map(|p| p.len()).unwrap_or(0); + ports.remove(&(self.id, self.port_name.clone())); debug!( manager.log, "Removing OPTE ports from manager"; "instance_id" => ?self.id, - "n_ports" => n_ports, + "port_name" => &self.port_name, 
); } Ok(()) From 189efc01d6b655c9026231baf7c44b547d383f3d Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 12 Jul 2022 08:53:00 -0700 Subject: [PATCH 5/7] fixup non-illumos --- sled-agent/src/opte/non_illumos/port_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sled-agent/src/opte/non_illumos/port_manager.rs b/sled-agent/src/opte/non_illumos/port_manager.rs index cd927c0f6c..5918cabfcf 100644 --- a/sled-agent/src/opte/non_illumos/port_manager.rs +++ b/sled-agent/src/opte/non_illumos/port_manager.rs @@ -185,7 +185,7 @@ impl PortManager { #[derive(Clone)] pub struct PortTicket { id: Uuid, - name: String, + port_name: String, manager: Option>, } From 61c5fad18fb27361700491c80cdbd17964e81aa1 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 12 Jul 2022 09:00:52 -0700 Subject: [PATCH 6/7] more non-illumos fixup --- sled-agent/src/opte/non_illumos/port_manager.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sled-agent/src/opte/non_illumos/port_manager.rs b/sled-agent/src/opte/non_illumos/port_manager.rs index 5918cabfcf..76ed8fc3a7 100644 --- a/sled-agent/src/opte/non_illumos/port_manager.rs +++ b/sled-agent/src/opte/non_illumos/port_manager.rs @@ -150,7 +150,7 @@ impl PortManager { ); let port = Port::new( ticket, - port_name, + port_name.clone(), nic.ip, subnet, mac, @@ -162,8 +162,7 @@ impl PortManager { boundary_services, vnic, ); - let old = - ports.insert((instance_id, port_name.clone()), port.clone()); + let old = ports.insert((instance_id, port_name), port.clone()); assert!( old.is_none(), "Duplicate OPTE port detected: instance_id = {}, port_name = {}", From 8a4f6411755b3ebdf59b582dc0ad3f4f579b52ad Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 12 Jul 2022 09:34:49 -0700 Subject: [PATCH 7/7] more non-illumos patching --- sled-agent/src/opte/non_illumos/port_manager.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/sled-agent/src/opte/non_illumos/port_manager.rs b/sled-agent/src/opte/non_illumos/port_manager.rs index 76ed8fc3a7..f263b023b4 100644 --- a/sled-agent/src/opte/non_illumos/port_manager.rs +++ b/sled-agent/src/opte/non_illumos/port_manager.rs @@ -162,7 +162,8 @@ impl PortManager { boundary_services, vnic, ); - let old = ports.insert((instance_id, port_name), port.clone()); + let old = + ports.insert((instance_id, port_name.clone()), port.clone()); assert!( old.is_none(), "Duplicate OPTE port detected: instance_id = {}, port_name = {}",