From a69c52b0abb5dcac31d25872021905693f64cb3b Mon Sep 17 00:00:00 2001
From: Greg Colombo
Date: Wed, 19 Jun 2024 22:42:10 +0000
Subject: [PATCH 01/55] [WIP] refactor to make VM creation async

---
 bin/propolis-server/src/lib/lib.rs        |   1 +
 .../src/lib/vm2/guest_event.rs            |  62 ++
 bin/propolis-server/src/lib/vm2/mod.rs    | 208 ++++
 .../src/lib/vm2/request_queue.rs          | 604 ++++++++++++++++++
 .../src/lib/vm2/state_driver.rs           | 151 +++++
 crates/propolis-server-config/src/lib.rs  |   2 +-
 6 files changed, 1027 insertions(+), 1 deletion(-)
 create mode 100644 bin/propolis-server/src/lib/vm2/guest_event.rs
 create mode 100644 bin/propolis-server/src/lib/vm2/mod.rs
 create mode 100644 bin/propolis-server/src/lib/vm2/request_queue.rs
 create mode 100644 bin/propolis-server/src/lib/vm2/state_driver.rs

diff --git a/bin/propolis-server/src/lib/lib.rs b/bin/propolis-server/src/lib/lib.rs
index cf8e79b15..3467e6b29 100644
--- a/bin/propolis-server/src/lib/lib.rs
+++ b/bin/propolis-server/src/lib/lib.rs
@@ -11,4 +11,5 @@ mod spec;
 mod stats;
 mod vcpu_tasks;
 mod vm;
+mod vm2;
 pub mod vnc;
diff --git a/bin/propolis-server/src/lib/vm2/guest_event.rs b/bin/propolis-server/src/lib/vm2/guest_event.rs
new file mode 100644
index 000000000..7a5b29f7b
--- /dev/null
+++ b/bin/propolis-server/src/lib/vm2/guest_event.rs
@@ -0,0 +1,62 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use std::{collections::VecDeque, time::Duration};
+
+/// An event raised by some component in the instance (e.g. a vCPU or the
+/// chipset) that the state worker must handle.
+///
+/// The vCPU-sourced events carry a time element (duration since VM boot) as
+/// emitted by the kernel vmm. This is used to deduplicate events when all
+/// vCPUs running in-kernel are kicked out for the suspend state.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(super) enum GuestEvent {
+    /// VM entered halt state
+    VcpuSuspendHalt(Duration),
+    /// VM entered reboot state
+    VcpuSuspendReset(Duration),
+    /// vCPU encountered a triple fault
+    VcpuSuspendTripleFault(i32, Duration),
+    /// Chipset signaled halt condition
+    ChipsetHalt,
+    /// Chipset signaled reboot condition
+    ChipsetReset,
+}
+
+#[derive(Debug, Default)]
+pub(super) struct GuestEventQueue {
+    queue: VecDeque<GuestEvent>,
+}
+
+pub(crate) trait GuestEventHandler: Send + Sync {
+    fn suspend_halt_event(&self, when: Duration);
+    fn suspend_reset_event(&self, when: Duration);
+    fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration);
+    fn unhandled_vm_exit(
+        &self,
+        vcpu_id: i32,
+        exit: propolis::exits::VmExitKind,
+    );
+    fn io_error_event(&self, vcpu_id: i32, error: std::io::Error);
+}
+
+pub(crate) trait ChipsetEventHandler: Send + Sync {
+    fn chipset_halt(&self);
+    fn chipset_reset(&self);
+}
+
+impl GuestEventQueue {
+    pub(super) fn enqueue(&mut self, event: GuestEvent) -> bool {
+        if !self.queue.iter().any(|ev| *ev == event) {
+            self.queue.push_back(event);
+            true
+        } else {
+            false
+        }
+    }
+
+    pub(super) fn dequeue(&mut self) -> Option<GuestEvent> {
+        self.queue.pop_front()
+    }
+}
diff --git a/bin/propolis-server/src/lib/vm2/mod.rs b/bin/propolis-server/src/lib/vm2/mod.rs
new file mode 100644
index 000000000..00b5c07ea
--- /dev/null
+++ b/bin/propolis-server/src/lib/vm2/mod.rs
@@ -0,0 +1,208 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! This module implements the `Vm` wrapper type that encapsulates a single
+//! instance on behalf of a Propolis server.
+
+use std::{
+    collections::BTreeMap,
+    sync::{Arc, RwLock, RwLockReadGuard, Weak},
+};
+
+use propolis::{
+    hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart},
+    vmm::Machine,
+};
+use propolis_api_types::{
+    instance_spec::v0::InstanceSpecV0, InstanceProperties,
+    InstanceStateMonitorResponse,
+};
+use uuid::Uuid;
+
+use crate::{serial::Serial, vm::VmControllerError};
+
+mod guest_event;
+mod request_queue;
+mod state_driver;
+
+pub(crate) type LifecycleMap =
+    BTreeMap<String, Arc<dyn propolis::common::Lifecycle>>;
+pub(crate) type BlockBackendMap =
+    BTreeMap<String, Arc<dyn propolis::block::Backend>>;
+pub(crate) type CrucibleBackendMap =
+    BTreeMap<Uuid, Arc<propolis::block::CrucibleBackend>>;
+
+type InstanceStateTx = tokio::sync::watch::Sender<
+    propolis_api_types::InstanceStateMonitorResponse,
+>;
+type InstanceStateRx = tokio::sync::watch::Receiver<
+    propolis_api_types::InstanceStateMonitorResponse,
+>;
+
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum VmError {
+    #[error("VM already initialized")]
+    AlreadyInitialized,
+}
+
+/// The top-level VM wrapper type. Callers are expected to wrap this in an
+/// `Arc`.
+pub(crate) struct Vm {
+    /// A reference to the VM state machine.
+    state: RwLock<VmState>,
+}
+
+/// The state stored in a [`Vm`] when there is an actual underlying virtual
+/// machine.
+pub(super) struct ActiveVm {
+    parent: Arc<Vm>,
+
+    state_driver_queue: Arc<state_driver::InputQueue>,
+    external_state_rx: InstanceStateRx,
+
+    properties: InstanceProperties,
+    spec: InstanceSpecV0,
+
+    machine: Machine,
+    lifecycle_components: LifecycleMap,
+    block_backends: BlockBackendMap,
+    crucible_backends: CrucibleBackendMap,
+    com1: Arc<Serial<LpcUart>>,
+    framebuffer: Option<Arc<RamFb>>,
+    ps2ctrl: Arc<PS2Ctrl>,
+    migration_src_state:
+        tokio::sync::Mutex<crate::migrate::source::PersistentState>,
+}
+
+impl Drop for ActiveVm {
+    fn drop(&mut self) {
+        let guard = self.parent.state.write().unwrap();
+        let VmState::ActiveVm(active) =
+            std::mem::replace(&mut *guard, VmState::NoVm)
+        else {
+            panic!("oops");
+        };
+
+        std::mem::replace(
+            &mut *guard,
+            VmState::Defunct(DefunctVm {
+                external_state_rx: active.external_state_rx,
+                properties: active.properties,
+                spec: active.spec,
+            }),
+        );
+    }
+}
+
+pub struct DefunctVm {
+    external_state_rx: InstanceStateRx,
+    properties: InstanceProperties,
+    spec: InstanceSpecV0,
+}
+
+#[allow(clippy::large_enum_variant)]
+pub enum VmState {
+    NoVm,
+    WaitingToStart,
+    Active(Weak<ActiveVm>),
+    Defunct(DefunctVm),
+}
+
+impl Vm {
+    pub fn new() -> Arc<Self> {
+        Arc::new(Self { state: RwLock::new(VmState::NoVm) })
+    }
+
+    fn vm_state(&self) -> RwLockReadGuard<'_, VmState> {
+        self.state.read().unwrap()
+    }
+
+    pub(super) fn active_vm(&self) -> Option<Arc<ActiveVm>> {
+        let guard = self.vm_state();
+        if let VmState::Active(weak) = guard {
+            weak.upgrade()
+        }
+    }
+
+    fn start_failed(&self) {
+        let mut guard = self.state.write().unwrap();
+        match *guard {
+            VmState::WaitingToStart => *guard = VmState::NoVm,
+            _ => unreachable!(
+                "only a starting VM's state worker calls start_failed"
+            ),
+        }
+    }
+
+    fn make_active(&self, active: ActiveVm) {
+        let mut guard = self.state.write().unwrap();
+        let old = std::mem::replace(&mut *guard, VmState::NoVm);
+        match old {
+            VmState::WaitingToStart => {
+                std::mem::replace(&mut *guard, VmState::Active(active));
+            }
+            _ => unreachable!(
+                "only a starting VM's state worker calls make_active"
+            ),
+        }
+    }
+
+    fn make_defunct(&self) {
+        let mut guard = self.state.write().unwrap();
+        let old = std::mem::replace(&mut *guard, VmState::NoVm);
+        match old {
+            VmState::Active(vm) => {
+                let ActiveVm { external_state_rx, properties, spec, .. } = vm;
+                let defunct = DefunctVm { external_state_rx, properties, spec };
+                std::mem::replace(&mut *guard, VmState::Defunct(defunct));
+            }
+            _ => unreachable!(
+                "only an active VM's state worker calls make_defunct"
+            ),
+        }
+    }
+
+    pub async fn ensure(
+        self: &Arc<Self>,
+        log: slog::Logger,
+        ensure_request: propolis_api_types::InstanceSpecEnsureRequest,
+    ) -> anyhow::Result<(), VmError> {
+        // Take the lock for writing, since in the common case this call will be
+        // creating a new VM and there's no easy way to upgrade from a reader
+        // lock to a writer lock.
+        let guard = self.state.write().unwrap();
+
+        //
+        if matches!(*guard, VmState::WaitingToStart | VmState::Active(_)) {
+            return Err(VmError::AlreadyInitialized);
+        }
+
+        let (external_tx, external_rx) =
+            tokio::sync::watch::channel(InstanceStateMonitorResponse {
+                gen: 1,
+                state: propolis_api_types::InstanceState::Starting,
+                migration: propolis_api_types::InstanceMigrateStatusResponse {
+                    migration_in: None,
+                    migration_out: None,
+                },
+            });
+
+        let input_queue = state_driver::InputQueue::new(
+            log.new(slog::o!("component" => "vmm_request_queue")),
+        );
+
+        let state_driver = state_driver::StateDriver::new(
+            log,
+            self.clone(),
+            Arc::new(input_queue),
+            external_tx,
+        );
+
+        let _ = tokio::spawn(async move {
+            state_driver.run(ensure_request, external_rx).await
+        });
+
+        Ok(())
+    }
+}
diff --git a/bin/propolis-server/src/lib/vm2/request_queue.rs b/bin/propolis-server/src/lib/vm2/request_queue.rs
new file mode 100644
index 000000000..c5c1fe3cd
--- /dev/null
+++ b/bin/propolis-server/src/lib/vm2/request_queue.rs
@@ -0,0 +1,604 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Handles requests to change a Propolis server's state via the external API.
+//!
+//! An instance accepts or rejects requests to change state based on a
+//! combination of its current state and its knowledge of the requests it has
+//! previously queued but not processed yet. The latter knowledge is used to
+//! reject requests that will never be fulfilled (because they're preceded by an
+//! action that will forbid them; consider rebooting after stopping) or that may
+//! need to be redirected to a migration target.
+//!
+//! The queue maintains a disposition for each kind of request that can be sent
+//! to it, which allows that request to be enqueued, denied, or silently ignored
+//! (for idempotency purposes). These dispositions can change as new requests
+//! are queued. The queue also provides callbacks to the VM state driver that
+//! allow the driver to advise the queue of state changes that further affect
+//! what requests should be accepted.
+//!
+//! Users who want to share a queue must wrap it in the synchronization objects
+//! of their choice.
+
+use std::collections::VecDeque;
+
+use slog::{debug, info, Logger};
+use thiserror::Error;
+use uuid::Uuid;
+
+use crate::migrate::MigrateError;
+
+use super::{
+    MigrateSourceCommand, MigrateSourceResponse, MigrateTargetCommand,
+};
+
+/// An external request made of a VM controller via the server API. Handled by
+/// the controller's state driver thread.
+#[derive(Debug)]
+pub enum ExternalRequest {
+    /// Initializes the VM through live migration by running a
+    /// migration-destination task.
+    MigrateAsTarget {
+        /// The ID of the live migration to use when initializing.
+        migration_id: Uuid,
+
+        /// A handle to the task that will execute the migration procedure.
+        task: tokio::task::JoinHandle<Result<(), MigrateError>>,
+
+        /// The sender side of a one-shot channel that, when signaled, tells the
+        /// migration task to start its work.
+        start_tx: tokio::sync::oneshot::Sender<()>,
+
+        /// A channel that receives commands from the migration task.
+        command_rx: tokio::sync::mpsc::Receiver<MigrateTargetCommand>,
+    },
+
+    /// Resets all the VM's devices and CPUs, then starts the VM.
+    Start { properties: InstanceProperties, spec: InstanceSpecV0 },
+
+    /// Asks the state worker to start a migration-source task.
+    MigrateAsSource {
+        /// The ID of the live migration for which this VM will be the source.
+        migration_id: Uuid,
+
+        /// A handle to the task that will execute the migration procedure.
+        task: tokio::task::JoinHandle<Result<(), MigrateError>>,
+
+        /// The sender side of a one-shot channel that, when signaled, tells the
+        /// migration task to start its work.
+        start_tx: tokio::sync::oneshot::Sender<()>,
+
+        /// A channel that receives commands from the migration task.
+        command_rx: tokio::sync::mpsc::Receiver<MigrateSourceCommand>,
+
+        /// A channel used to send responses to migration commands.
+        response_tx: tokio::sync::mpsc::Sender<MigrateSourceResponse>,
+    },
+
+    /// Resets the guest by pausing all devices, resetting them to their
+    /// cold-boot states, and resuming the devices. Note that this is not a
+    /// graceful reboot and does not coordinate with guest software.
+    Reboot,
+
+    /// Halts the VM. Note that this is not a graceful shutdown and does not
+    /// coordinate with guest software.
+    Stop,
+}
+
+/// A set of reasons why a request to queue an external state transition can
+/// fail.
+#[derive(Copy, Clone, Debug, Error)]
+pub enum RequestDeniedReason {
+    #[error("Operation requires an active instance")]
+    InstanceNotActive,
+
+    #[error("Already migrating into this instance")]
+    MigrationTargetInProgress,
+
+    #[error("Instance is currently starting")]
+    StartInProgress,
+
+    #[error("Instance is already a migration source")]
+    AlreadyMigrationSource,
+
+    #[error("Operation cannot be performed on a migration source")]
+    InvalidRequestForMigrationSource,
+
+    #[error("Instance is preparing to stop")]
+    HaltPending,
+
+    #[error("Instance failed to start or halted due to a failure")]
+    InstanceFailed,
+}
+
+/// The set of instance state changes that should change the dispositions of
+/// future requests to the queue.
+#[derive(Copy, Clone, Debug)]
+pub enum InstanceStateChange {
+    StartedRunning,
+    Rebooted,
+    Stopped,
+    Failed,
+}
+
+/// A reason for a change in the queue's request dispositions.
+#[derive(Debug)]
+enum DispositionChangeReason<'a> {
+    ApiRequest(&'a ExternalRequest),
+    StateChange(InstanceStateChange),
+}
+
+/// The possible methods of handling a request to queue a state change.
+#[derive(Copy, Clone, Debug)]
+enum RequestDisposition {
+    /// Put the state change on the queue.
+    Enqueue,
+
+    /// Drop the state change silently. This is used to make requests appear
+    /// idempotent to callers without making the state driver deal with the
+    /// consequences of queuing the same state change request twice.
+    Ignore,
+
+    /// Deny the request to change state.
+    Deny(RequestDeniedReason),
+}
+
+/// The current disposition for each kind of incoming request.
+#[derive(Copy, Clone, Debug)]
+struct AllowedRequests {
+    migrate_as_target: RequestDisposition,
+    start: RequestDisposition,
+    migrate_as_source: RequestDisposition,
+    reboot: RequestDisposition,
+    stop: RequestDisposition,
+}
+
+#[derive(Debug)]
+pub struct ExternalRequestQueue {
+    queue: VecDeque<ExternalRequest>,
+    allowed: AllowedRequests,
+    log: Logger,
+}
+
+impl ExternalRequestQueue {
+    /// Creates a new queue that logs to the supplied logger.
+    pub fn new(log: Logger) -> Self {
+        Self {
+            queue: VecDeque::new(),
+            allowed: AllowedRequests {
+                migrate_as_target: RequestDisposition::Enqueue,
+                start: RequestDisposition::Enqueue,
+                migrate_as_source: RequestDisposition::Deny(
+                    RequestDeniedReason::InstanceNotActive,
+                ),
+                reboot: RequestDisposition::Deny(
+                    RequestDeniedReason::InstanceNotActive,
+                ),
+                stop: RequestDisposition::Enqueue,
+            },
+            log,
+        }
+    }
+
+    /// Pops the request at the front of the queue.
+    pub fn pop_front(&mut self) -> Option<ExternalRequest> {
+        self.queue.pop_front()
+    }
+
+    /// Indicates whether the queue is empty.
+    pub fn is_empty(&self) -> bool {
+        self.queue.is_empty()
+    }
+
+    /// Asks to place the supplied request on the queue. If the request is
+    /// enqueued, updates the dispositions to use for future requests.
+    pub fn try_queue(
+        &mut self,
+        request: ExternalRequest,
+    ) -> Result<(), RequestDeniedReason> {
+        let disposition = match request {
+            ExternalRequest::MigrateAsTarget { .. } => {
+                self.allowed.migrate_as_target
+            }
+            ExternalRequest::Start => self.allowed.start,
+            ExternalRequest::MigrateAsSource { .. } => {
+                self.allowed.migrate_as_source
+            }
+            ExternalRequest::Reboot => self.allowed.reboot,
+
+            // Requests to stop always succeed. Note that a request to stop a VM
+            // that hasn't started should still be queued to the state worker so
+            // that the worker can exit and drop its references to the instance.
+            ExternalRequest::Stop => self.allowed.stop,
+        };
+
+        info!(&self.log, "Queuing external request";
+              "request" => ?request,
+              "disposition" => ?disposition);
+
+        match disposition {
+            RequestDisposition::Enqueue => {}
+            RequestDisposition::Ignore => return Ok(()),
+            RequestDisposition::Deny(reason) => return Err(reason),
+        };
+
+        self.allowed = self.get_new_dispositions(
+            DispositionChangeReason::ApiRequest(&request),
+        );
+        self.queue.push_back(request);
+        Ok(())
+    }
+
+    /// Notifies the queue that the instance's state has changed and that its
+    /// disposition should be updated accordingly.
+    pub fn notify_instance_state_change(&mut self, state: InstanceStateChange) {
+        self.allowed = self
+            .get_new_dispositions(DispositionChangeReason::StateChange(state));
+    }
+
+    /// Indicates whether the queue would allow a request to migrate into this
+    /// instance. This can be used to avoid setting up migration tasks for
+    /// requests that will ultimately be denied.
+    ///
+    /// # Return value
+    ///
+    /// - `Ok(true)` if the request will be queued.
+    /// - `Ok(false)` if the request is allowed for idempotency reasons but will
+    ///   not be queued.
+    /// - `Err` if the request is forbidden.
+    pub fn migrate_as_target_will_enqueue(
+        &self,
+    ) -> Result<bool, RequestDeniedReason> {
+        match self.allowed.migrate_as_target {
+            RequestDisposition::Enqueue => Ok(true),
+            RequestDisposition::Ignore => Ok(false),
+            RequestDisposition::Deny(reason) => Err(reason),
+        }
+    }
+
+    /// Indicates whether the queue would allow a request to migrate out of this
+    /// instance. This can be used to avoid setting up migration tasks for
+    /// requests that will ultimately be denied.
+    ///
+    /// # Return value
+    ///
+    /// - `Ok(true)` if the request will be queued.
+    /// - `Ok(false)` if the request is allowed for idempotency reasons but will
+    ///   not be queued.
+    /// - `Err` if the request is forbidden.
+    pub fn migrate_as_source_will_enqueue(
+        &self,
+    ) -> Result<bool, RequestDeniedReason> {
+        assert!(!matches!(
+            self.allowed.migrate_as_source,
+            RequestDisposition::Ignore
+        ));
+
+        match self.allowed.migrate_as_source {
+            RequestDisposition::Enqueue => Ok(true),
+            RequestDisposition::Ignore => unreachable!(),
+            RequestDisposition::Deny(reason) => Err(reason),
+        }
+    }
+
+    /// Computes a new set of queue dispositions given the current state of the
+    /// queue and the event that is changing those dispositions.
+    fn get_new_dispositions(
+        &self,
+        reason: DispositionChangeReason,
+    ) -> AllowedRequests {
+        debug!(self.log, "Computing new queue dispositions";
+               "reason" => ?reason);
+
+        use DispositionChangeReason as ChangeReason;
+        use RequestDeniedReason as DenyReason;
+        use RequestDisposition as Disposition;
+        match reason {
+            // Starting the instance, whether via migration or cold boot,
+            // forecloses on further attempts to migrate in. For idempotency,
+            // further requests to start are allowed when an instance-starting
+            // transition is enqueued.
+            ChangeReason::ApiRequest(ExternalRequest::MigrateAsTarget {
+                ..
+            })
+            | ChangeReason::ApiRequest(ExternalRequest::Start) => {
+                let (migrate_as_target_disposition, deny_reason) = match reason
+                {
+                    // If this is a request to migrate in, make sure future
+                    // requests to migrate in are handled idempotently.
+                    ChangeReason::ApiRequest(
+                        ExternalRequest::MigrateAsTarget { .. },
+                    ) => (
+                        Disposition::Ignore,
+                        DenyReason::MigrationTargetInProgress,
+                    ),
+                    ChangeReason::ApiRequest(ExternalRequest::Start) => (
+                        Disposition::Deny(DenyReason::StartInProgress),
+                        DenyReason::StartInProgress,
+                    ),
+                    _ => unreachable!(),
+                };
+
+                AllowedRequests {
+                    migrate_as_target: migrate_as_target_disposition,
+                    start: Disposition::Ignore,
+                    migrate_as_source: Disposition::Deny(deny_reason),
+                    reboot: Disposition::Deny(deny_reason),
+                    stop: self.allowed.stop,
+                }
+            }
+            ChangeReason::ApiRequest(ExternalRequest::MigrateAsSource {
+                ..
+            }) => {
+                assert!(matches!(self.allowed.start, Disposition::Ignore));
+
+                // Requests to migrate into the instance should not be enqueued
+                // from this point, but whether they're dropped or ignored
+                // depends on how the instance was originally initialized.
+                assert!(!matches!(
+                    self.allowed.migrate_as_target,
+                    Disposition::Enqueue
+                ));
+
+                AllowedRequests {
+                    migrate_as_target: self.allowed.migrate_as_target,
+                    start: self.allowed.start,
+                    migrate_as_source: Disposition::Deny(
+                        DenyReason::AlreadyMigrationSource,
+                    ),
+                    reboot: Disposition::Deny(
+                        DenyReason::InvalidRequestForMigrationSource,
+                    ),
+                    stop: self.allowed.stop,
+                }
+            }
+
+            // Requests to reboot prevent additional reboot requests from being
+            // queued, but do not affect other operations.
+            ChangeReason::ApiRequest(ExternalRequest::Reboot) => {
+                assert!(matches!(self.allowed.start, Disposition::Ignore));
+                assert!(!matches!(
+                    self.allowed.migrate_as_target,
+                    Disposition::Enqueue
+                ));
+
+                AllowedRequests { reboot: Disposition::Ignore, ..self.allowed }
+            }
+
+            // Requests to stop the instance block other requests from being
+            // queued. Additional requests to stop are ignored for idempotency.
+ ChangeReason::ApiRequest(ExternalRequest::Stop) => { + AllowedRequests { + migrate_as_target: Disposition::Deny( + DenyReason::HaltPending, + ), + start: Disposition::Deny(DenyReason::HaltPending), + migrate_as_source: Disposition::Deny( + DenyReason::HaltPending, + ), + reboot: Disposition::Deny(DenyReason::HaltPending), + stop: Disposition::Ignore, + } + } + + // When an instance begins running, requests to migrate out of it or + // to reboot it become valid. + ChangeReason::StateChange(InstanceStateChange::StartedRunning) => { + AllowedRequests { + migrate_as_target: self.allowed.migrate_as_target, + start: self.allowed.start, + migrate_as_source: Disposition::Enqueue, + reboot: Disposition::Enqueue, + stop: self.allowed.stop, + } + } + + // When an instance finishes rebooting, allow new reboot requests to + // be queued again, unless reboot requests began to be denied in the + // meantime. + ChangeReason::StateChange(InstanceStateChange::Rebooted) => { + let new_reboot = + if let Disposition::Ignore = self.allowed.reboot { + Disposition::Enqueue + } else { + self.allowed.reboot + }; + + AllowedRequests { reboot: new_reboot, ..self.allowed } + } + + // When an instance stops or fails, requests to do anything other + // than stop it are denied with an appropriate deny reason. Note + // that an instance may stop or fail due to guest activity, so the + // previous dispositions for migrate and reboot requests may not be + // "deny". + ChangeReason::StateChange(InstanceStateChange::Stopped) => { + AllowedRequests { + migrate_as_target: Disposition::Deny( + DenyReason::InstanceNotActive, + ), + start: Disposition::Deny(DenyReason::InstanceNotActive), + migrate_as_source: Disposition::Deny( + DenyReason::InstanceNotActive, + ), + reboot: Disposition::Deny(DenyReason::InstanceNotActive), + stop: Disposition::Ignore, + } + } + ChangeReason::StateChange(InstanceStateChange::Failed) => { + AllowedRequests { + migrate_as_target: Disposition::Deny( + DenyReason::InstanceFailed, + ), + start: Disposition::Deny(DenyReason::InstanceFailed), + migrate_as_source: Disposition::Deny( + DenyReason::InstanceFailed, + ), + reboot: Disposition::Deny(DenyReason::InstanceFailed), + stop: self.allowed.stop, + } + } + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + use uuid::Uuid; + + fn test_logger() -> slog::Logger { + slog::Logger::root(slog::Discard, slog::o!()) + } + + fn make_migrate_as_target_request() -> ExternalRequest { + let task = tokio::task::spawn(async { Ok(()) }); + let (start_tx, _) = tokio::sync::oneshot::channel(); + let (_, command_rx) = tokio::sync::mpsc::channel(1); + ExternalRequest::MigrateAsTarget { + migration_id: Uuid::new_v4(), + task, + start_tx, + command_rx, + } + } + + fn make_migrate_as_source_request() -> ExternalRequest { + let task = tokio::task::spawn(async { Ok(()) }); + let (start_tx, _) = tokio::sync::oneshot::channel(); + let (_, command_rx) = tokio::sync::mpsc::channel(1); + let (response_tx, _) = tokio::sync::mpsc::channel(1); + ExternalRequest::MigrateAsSource { + migration_id: Uuid::new_v4(), + task, + start_tx, + command_rx, + response_tx, + } + } + + #[tokio::test] + async fn migrate_as_target_is_idempotent() { + let mut queue = ExternalRequestQueue::new(test_logger()); + + // Requests to migrate as a target should queue normally at first. + assert!(queue.migrate_as_target_will_enqueue().unwrap()); + + // After queuing such a request, subsequent requests should be allowed + // without enqueuing anything. 
+ assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); + assert!(!queue.migrate_as_target_will_enqueue().unwrap()); + + // Pop the request and tell the queue the instance is running. + assert!(matches!( + queue.pop_front(), + Some(ExternalRequest::MigrateAsTarget { .. }) + )); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + + // Because the instance was started via migration in, future requests + // to migrate in should be allowed. + assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); + assert!(!queue.migrate_as_target_will_enqueue().unwrap()); + } + + #[tokio::test] + async fn migrate_as_target_is_forbidden_after_cold_boot() { + let mut queue = ExternalRequestQueue::new(test_logger()); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + + assert!(queue.migrate_as_target_will_enqueue().is_err()); + assert!(queue.try_queue(make_migrate_as_target_request()).is_err()); + } + + #[tokio::test] + async fn migrate_as_source_is_not_idempotent() { + // Simulate a running instance. + let mut queue = ExternalRequestQueue::new(test_logger()); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + + // Requests to migrate out should be allowed. + assert!(queue.migrate_as_source_will_enqueue().unwrap()); + assert!(queue.try_queue(make_migrate_as_source_request()).is_ok()); + + // Once the request is queued, other requests to migrate out are + // disallowed until the queued request is disposed of. + // + // This differs from the migration-in case in that requests to migrate + // in are issued by the sled agent as part of a saga (where idempotency + // is assumed), but requests to migrate out are issued by the target + // Propolis (which does not assume idempotency and issues only one + // request per migration attempt). + assert!(queue.migrate_as_source_will_enqueue().is_err()); + assert!(queue.try_queue(make_migrate_as_source_request()).is_err()); + + // If migration fails, the instance resumes running, and then another + // request to migrate out should be allowed. + assert!(matches!( + queue.pop_front(), + Some(ExternalRequest::MigrateAsSource { .. }) + )); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + assert!(queue.migrate_as_source_will_enqueue().unwrap()); + assert!(queue.try_queue(make_migrate_as_source_request()).is_ok()); + + // A successful migration stops the instance, which forecloses on future + // requests to migrate out. 
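+        // (Popping the queued request and then signaling "Stopped" below
+        // stands in for a migration task that ran to completion and halted
+        // the VM.)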
+ queue.pop_front(); + queue.notify_instance_state_change(InstanceStateChange::Stopped); + assert!(queue.migrate_as_source_will_enqueue().is_err()); + assert!(queue.try_queue(make_migrate_as_source_request()).is_err()); + } + + #[tokio::test] + async fn stop_requests_enqueue_after_vm_failure() { + let mut queue = ExternalRequestQueue::new(test_logger()); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); + queue.notify_instance_state_change(InstanceStateChange::Failed); + + assert!(queue.try_queue(ExternalRequest::Stop).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Stop))); + } + + #[tokio::test] + async fn reboot_requests_are_idempotent_except_when_stopping() { + let mut queue = ExternalRequestQueue::new(test_logger()); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + + // Once the instance is started, reboot requests should be allowed, but + // after the first, subsequent requests should be dropped for + // idempotency. + assert!(queue.is_empty()); + for _ in 0..5 { + assert!(queue.try_queue(ExternalRequest::Reboot).is_ok()); + } + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Reboot))); + assert!(queue.is_empty()); + + // Once the instance has rebooted, new requests can be queued. + queue.notify_instance_state_change(InstanceStateChange::Rebooted); + assert!(queue.try_queue(ExternalRequest::Reboot).is_ok()); + assert!(!queue.is_empty()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Reboot))); + queue.notify_instance_state_change(InstanceStateChange::Rebooted); + + // If a request to reboot is queued, and then a request to stop is + // queued, new requests to reboot should always fail, even after the + // instance finishes rebooting. + assert!(queue.try_queue(ExternalRequest::Reboot).is_ok()); + assert!(!queue.is_empty()); + assert!(queue.try_queue(ExternalRequest::Stop).is_ok()); + assert!(queue.try_queue(ExternalRequest::Reboot).is_err()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Reboot))); + queue.notify_instance_state_change(InstanceStateChange::Rebooted); + assert!(queue.try_queue(ExternalRequest::Reboot).is_err()); + } +} diff --git a/bin/propolis-server/src/lib/vm2/state_driver.rs b/bin/propolis-server/src/lib/vm2/state_driver.rs new file mode 100644 index 000000000..76cec6c5f --- /dev/null +++ b/bin/propolis-server/src/lib/vm2/state_driver.rs @@ -0,0 +1,151 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Drives a VM's state machine in response to API requests and guest events.
+
+use std::{
+    sync::{Arc, Condvar, Mutex},
+    time::Duration,
+};
+
+use propolis_server_config::Config;
+
+use super::guest_event;
+
+struct InputQueueInner {
+    external_requests: super::request_queue::ExternalRequestQueue,
+    guest_events: super::guest_event::GuestEventQueue,
+}
+
+impl InputQueueInner {
+    fn new(log: slog::Logger) -> Self {
+        Self {
+            external_requests: super::request_queue::ExternalRequestQueue::new(
+                log,
+            ),
+            guest_events: super::guest_event::GuestEventQueue::default(),
+        }
+    }
+}
+
+pub(super) struct InputQueue {
+    inner: Mutex<InputQueueInner>,
+    cv: Condvar,
+}
+
+impl InputQueue {
+    pub(super) fn new(log: slog::Logger) -> Self {
+        Self {
+            inner: Mutex::new(InputQueueInner::new(log)),
+            cv: Condvar::new(),
+        }
+    }
+}
+
+impl guest_event::GuestEventHandler for InputQueue {
+    fn suspend_halt_event(&self, when: Duration) {
+        let mut guard = self.inner.lock().unwrap();
+        if guard
+            .guest_events
+            .enqueue(guest_event::GuestEvent::VcpuSuspendHalt(when))
+        {
+            self.cv.notify_all();
+        }
+    }
+
+    fn suspend_reset_event(&self, when: Duration) {
+        let mut guard = self.inner.lock().unwrap();
+        if guard
+            .guest_events
+            .enqueue(guest_event::GuestEvent::VcpuSuspendReset(when))
+        {
+            self.cv.notify_all();
+        }
+    }
+
+    fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration) {
+        let mut guard = self.inner.lock().unwrap();
+        if guard.guest_events.enqueue(
+            guest_event::GuestEvent::VcpuSuspendTripleFault(vcpu_id, when),
+        ) {
+            self.cv.notify_all();
+        }
+    }
+
+    fn unhandled_vm_exit(
+        &self,
+        vcpu_id: i32,
+        exit: propolis::exits::VmExitKind,
+    ) {
+        panic!("vCPU {}: Unhandled VM exit: {:?}", vcpu_id, exit);
+    }
+
+    fn io_error_event(&self, vcpu_id: i32, error: std::io::Error) {
+        panic!("vCPU {}: Unhandled vCPU error: {}", vcpu_id, error);
+    }
+}
+
+impl guest_event::ChipsetEventHandler for InputQueue {
+    fn chipset_halt(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        if guard.guest_events.enqueue(guest_event::GuestEvent::ChipsetHalt) {
+            self.cv.notify_all();
+        }
+    }
+
+    fn chipset_reset(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        if guard.guest_events.enqueue(guest_event::GuestEvent::ChipsetReset) {
+            self.cv.notify_all();
+        }
+    }
+}
+
+/// The context for a VM state driver task.
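+///
+/// One driver task is spawned per successful `Vm::ensure` call. It is the
+/// sole consumer of its VM's `InputQueue` and the sole writer to the external
+/// state channel, so external observers see instance-state updates in the
+/// order the driver applied them.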
+pub(super) struct StateDriver {
+    log: slog::Logger,
+    parent_vm: Arc<super::Vm>,
+    input_queue: Arc<InputQueue>,
+    external_state_tx: super::InstanceStateTx,
+    state_gen: u64,
+    paused: bool,
+}
+
+impl StateDriver {
+    pub(super) fn new(
+        log: slog::Logger,
+        vm: Arc<super::Vm>,
+        input_queue: Arc<InputQueue>,
+        external_state_tx: super::InstanceStateTx,
+    ) -> Self {
+        let log = log.new(slog::o!("component" => "state_driver"));
+        Self {
+            log,
+            parent_vm: vm,
+            input_queue,
+            external_state_tx,
+            state_gen: 0,
+            paused: false,
+        }
+    }
+
+    pub(super) async fn run(
+        self,
+        ensure_request: propolis_api_types::InstanceSpecEnsureRequest,
+        external_state_rx: super::InstanceStateRx,
+    ) {
+        if self.initialize_vm(ensure_request, external_state_rx).is_err() {
+            self.parent_vm.start_failed();
+            return;
+        }
+    }
+
+    fn initialize_vm(
+        &self,
+        ensure_request: propolis_api_types::InstanceSpecEnsureRequest,
+        external_state_rx: super::InstanceStateRx,
+    ) -> anyhow::Result<()> {
+        todo!("gjc");
+    }
+}
diff --git a/crates/propolis-server-config/src/lib.rs b/crates/propolis-server-config/src/lib.rs
index 796683485..303b08f24 100644
--- a/crates/propolis-server-config/src/lib.rs
+++ b/crates/propolis-server-config/src/lib.rs
@@ -14,7 +14,7 @@ pub use cpuid_profile_config::CpuidProfile;
 /// Configuration for the Propolis server.
 // NOTE: This is expected to change over time; portions of the hard-coded
 // configuration will likely become more dynamic.
-#[derive(Serialize, Deserialize, Debug, PartialEq)]
+#[derive(Clone, Serialize, Deserialize, Debug, PartialEq)]
 pub struct Config {
     pub bootrom: PathBuf,
 
From 21869eb384a9beb0ffc7e5fc4f69af2c21e34342 Mon Sep 17 00:00:00 2001
From: Greg Colombo
Date: Fri, 21 Jun 2024 16:03:38 +0000
Subject: [PATCH 02/55] [WIP] get rid of some request queue stuff we don't need

---
 .../src/lib/vm2/migrate_commands.rs       |  46 +++++
 bin/propolis-server/src/lib/vm2/mod.rs    |  41 ++--
 .../src/lib/vm2/request_queue.rs          | 190 +-----------------
 crates/propolis-server-config/src/lib.rs  |   2 +-
 4 files changed, 70 insertions(+), 209 deletions(-)
 create mode 100644 bin/propolis-server/src/lib/vm2/migrate_commands.rs

diff --git a/bin/propolis-server/src/lib/vm2/migrate_commands.rs b/bin/propolis-server/src/lib/vm2/migrate_commands.rs
new file mode 100644
index 000000000..f448364b9
--- /dev/null
+++ b/bin/propolis-server/src/lib/vm2/migrate_commands.rs
@@ -0,0 +1,46 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Requests and responses between the VM state driver and the live migration
+//! protocol.
+
+use crate::migrate::MigrateError;
+
+/// A message sent from a live migration destination task to update the
+/// externally visible state of the migration attempt.
+#[derive(Clone, Copy, Debug)]
+pub enum MigrateTargetCommand {
+    /// Update the externally-visible migration state.
+    UpdateState(propolis_api_types::MigrationState),
+}
+
+/// A message sent from a live migration driver to the state worker, asking it
+/// to act on source instance components on the task's behalf.
+#[derive(Clone, Copy, Debug)]
+pub enum MigrateSourceCommand {
+    /// Update the externally-visible migration state.
+    UpdateState(propolis_api_types::MigrationState),
+
+    /// Pause the instance's devices and CPUs.
+    Pause,
+}
+
+/// A message sent from the state worker to the live migration driver in
+/// response to a previous command.
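+///
+/// A pause round-trip is expected to look roughly like this (hypothetical
+/// task-side code, for illustration only; the channel names are assumptions):
+///
+/// ```ignore
+/// command_tx.send(MigrateSourceCommand::Pause).await?;
+/// match response_rx.recv().await {
+///     Some(MigrateSourceResponse::Pause(result)) => { /* act on result */ }
+///     None => { /* state worker went away; abandon the migration */ }
+/// }
+/// ```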
+#[derive(Debug)]
+pub enum MigrateSourceResponse {
+    /// A request to pause completed with the attached result.
+    Pause(Result<(), std::io::Error>),
+}
+
+/// An event raised by a migration task that must be handled by the state
+/// worker.
+#[derive(Debug)]
+enum MigrateTaskEvent<T> {
+    /// The task completed with the associated result.
+    TaskExited(Result<(), MigrateError>),
+
+    /// The task sent a command requesting work.
+    Command(T),
+}
diff --git a/bin/propolis-server/src/lib/vm2/mod.rs b/bin/propolis-server/src/lib/vm2/mod.rs
index 00b5c07ea..95fd32ce6 100644
--- a/bin/propolis-server/src/lib/vm2/mod.rs
+++ b/bin/propolis-server/src/lib/vm2/mod.rs
@@ -23,6 +23,7 @@ use uuid::Uuid;
 use crate::{serial::Serial, vm::VmControllerError};
 
 mod guest_event;
+mod migrate_commands;
 mod request_queue;
 mod state_driver;
 
@@ -77,19 +78,13 @@ pub(super) struct ActiveVm {
 
 impl Drop for ActiveVm {
     fn drop(&mut self) {
-        let guard = self.parent.state.write().unwrap();
-        let VmState::ActiveVm(active) =
-            std::mem::replace(&mut *guard, VmState::NoVm)
-        else {
-            panic!("oops");
-        };
-
+        let mut guard = self.parent.state.write().unwrap();
         std::mem::replace(
            &mut *guard,
            VmState::Defunct(DefunctVm {
-                external_state_rx: active.external_state_rx,
-                properties: active.properties,
-                spec: active.spec,
+                external_state_rx: self.external_state_rx.clone(),
+                properties: self.properties.clone(),
+                spec: self.spec.clone(),
            }),
        );
    }
@@ -120,8 +115,10 @@ impl Vm {
 
     pub(super) fn active_vm(&self) -> Option<Arc<ActiveVm>> {
         let guard = self.vm_state();
-        if let VmState::Active(weak) = guard {
+        if let VmState::Active(weak) = &*guard {
             weak.upgrade()
+        } else {
+            None
         }
     }
 
@@ -135,12 +132,15 @@ impl Vm {
         }
     }
 
-    fn make_active(&self, active: ActiveVm) {
+    fn make_active(&self, active: Arc<ActiveVm>) {
         let mut guard = self.state.write().unwrap();
         let old = std::mem::replace(&mut *guard, VmState::NoVm);
         match old {
             VmState::WaitingToStart => {
-                std::mem::replace(&mut *guard, VmState::Active(active));
+                std::mem::replace(
+                    &mut *guard,
+                    VmState::Active(Arc::downgrade(&active)),
+                );
             }
             _ => unreachable!(
                 "only a starting VM's state worker calls make_active"
@@ -148,21 +148,6 @@ impl Vm {
         }
     }
 
-    fn make_defunct(&self) {
-        let mut guard = self.state.write().unwrap();
-        let old = std::mem::replace(&mut *guard, VmState::NoVm);
-        match old {
-            VmState::Active(vm) => {
-                let ActiveVm { external_state_rx, properties, spec, .. } = vm;
-                let defunct = DefunctVm { external_state_rx, properties, spec };
-                std::mem::replace(&mut *guard, VmState::Defunct(defunct));
-            }
-            _ => unreachable!(
-                "only an active VM's state worker calls make_defunct"
-            ),
-        }
-    }
-
     pub async fn ensure(
         self: &Arc<Self>,
         log: slog::Logger,
diff --git a/bin/propolis-server/src/lib/vm2/request_queue.rs b/bin/propolis-server/src/lib/vm2/request_queue.rs
index c5c1fe3cd..fe52d2135 100644
--- a/bin/propolis-server/src/lib/vm2/request_queue.rs
+++ b/bin/propolis-server/src/lib/vm2/request_queue.rs
@@ -29,34 +29,12 @@ use uuid::Uuid;
 
 use crate::migrate::MigrateError;
 
-use super::{
-    MigrateSourceCommand, MigrateSourceResponse, MigrateTargetCommand,
-};
+use super::migrate_commands::{MigrateSourceCommand, MigrateSourceResponse};
 
 /// An external request made of a VM controller via the server API. Handled by
 /// the controller's state driver thread.
 #[derive(Debug)]
 pub enum ExternalRequest {
-    /// Initializes the VM through live migration by running a
-    /// migration-destination task.
-    MigrateAsTarget {
-        /// The ID of the live migration to use when initializing.
-        migration_id: Uuid,
-
-        /// A handle to the task that will execute the migration procedure.
-        task: tokio::task::JoinHandle<Result<(), MigrateError>>,
-
-        /// The sender side of a one-shot channel that, when signaled, tells the
-        /// migration task to start its work.
-        start_tx: tokio::sync::oneshot::Sender<()>,
-
-        /// A channel that receives commands from the migration task.
-        command_rx: tokio::sync::mpsc::Receiver<MigrateTargetCommand>,
-    },
-
-    /// Resets all the VM's devices and CPUs, then starts the VM.
-    Start { properties: InstanceProperties, spec: InstanceSpecV0 },
-
     /// Asks the state worker to start a migration-source task.
     MigrateAsSource {
         /// The ID of the live migration for which this VM will be the source.
@@ -147,8 +125,6 @@ enum RequestDisposition {
 /// The current disposition for each kind of incoming request.
 #[derive(Copy, Clone, Debug)]
 struct AllowedRequests {
-    migrate_as_target: RequestDisposition,
-    start: RequestDisposition,
     migrate_as_source: RequestDisposition,
     reboot: RequestDisposition,
     stop: RequestDisposition,
@@ -167,8 +143,6 @@ impl ExternalRequestQueue {
         Self {
             queue: VecDeque::new(),
             allowed: AllowedRequests {
-                migrate_as_target: RequestDisposition::Enqueue,
-                start: RequestDisposition::Enqueue,
                 migrate_as_source: RequestDisposition::Deny(
                     RequestDeniedReason::InstanceNotActive,
                 ),
@@ -198,10 +172,6 @@ impl ExternalRequestQueue {
         request: ExternalRequest,
     ) -> Result<(), RequestDeniedReason> {
         let disposition = match request {
-            ExternalRequest::MigrateAsTarget { .. } => {
-                self.allowed.migrate_as_target
-            }
-            ExternalRequest::Start => self.allowed.start,
             ExternalRequest::MigrateAsSource { .. } => {
                 self.allowed.migrate_as_source
             }
@@ -237,26 +207,6 @@ impl ExternalRequestQueue {
             .get_new_dispositions(DispositionChangeReason::StateChange(state));
     }
 
-    /// Indicates whether the queue would allow a request to migrate into this
-    /// instance. This can be used to avoid setting up migration tasks for
-    /// requests that will ultimately be denied.
-    ///
-    /// # Return value
-    ///
-    /// - `Ok(true)` if the request will be queued.
-    /// - `Ok(false)` if the request is allowed for idempotency reasons but will
-    ///   not be queued.
-    /// - `Err` if the request is forbidden.
-    pub fn migrate_as_target_will_enqueue(
-        &self,
-    ) -> Result<bool, RequestDeniedReason> {
-        match self.allowed.migrate_as_target {
-            RequestDisposition::Enqueue => Ok(true),
-            RequestDisposition::Ignore => Ok(false),
-            RequestDisposition::Deny(reason) => Err(reason),
-        }
-    }
-
     /// Indicates whether the queue would allow a request to migrate out of this
     /// instance. This can be used to avoid setting up migration tasks for
     /// requests that will ultimately be denied.
@@ -295,74 +245,21 @@ impl ExternalRequestQueue {
         use RequestDeniedReason as DenyReason;
         use RequestDisposition as Disposition;
         match reason {
-            // Starting the instance, whether via migration or cold boot,
-            // forecloses on further attempts to migrate in. For idempotency,
-            // further requests to start are allowed when an instance-starting
-            // transition is enqueued.
-            ChangeReason::ApiRequest(ExternalRequest::MigrateAsTarget {
-                ..
-            })
-            | ChangeReason::ApiRequest(ExternalRequest::Start) => {
-                let (migrate_as_target_disposition, deny_reason) = match reason
-                {
-                    // If this is a request to migrate in, make sure future
-                    // requests to migrate in are handled idempotently.
-                    ChangeReason::ApiRequest(
-                        ExternalRequest::MigrateAsTarget { ..
}, - ) => ( - Disposition::Ignore, - DenyReason::MigrationTargetInProgress, - ), - ChangeReason::ApiRequest(ExternalRequest::Start) => ( - Disposition::Deny(DenyReason::StartInProgress), - DenyReason::StartInProgress, - ), - _ => unreachable!(), - }; - - AllowedRequests { - migrate_as_target: migrate_as_target_disposition, - start: Disposition::Ignore, - migrate_as_source: Disposition::Deny(deny_reason), - reboot: Disposition::Deny(deny_reason), - stop: self.allowed.stop, - } - } ChangeReason::ApiRequest(ExternalRequest::MigrateAsSource { .. - }) => { - assert!(matches!(self.allowed.start, Disposition::Ignore)); - - // Requests to migrate into the instance should not be enqueued - // from this point, but whether they're dropped or ignored - // depends on how the instance was originally initialized. - assert!(!matches!( - self.allowed.migrate_as_target, - Disposition::Enqueue - )); - - AllowedRequests { - migrate_as_target: self.allowed.migrate_as_target, - start: self.allowed.start, - migrate_as_source: Disposition::Deny( - DenyReason::AlreadyMigrationSource, - ), - reboot: Disposition::Deny( - DenyReason::InvalidRequestForMigrationSource, - ), - stop: self.allowed.stop, - } - } + }) => AllowedRequests { + migrate_as_source: Disposition::Deny( + DenyReason::AlreadyMigrationSource, + ), + reboot: Disposition::Deny( + DenyReason::InvalidRequestForMigrationSource, + ), + stop: self.allowed.stop, + }, // Requests to reboot prevent additional reboot requests from being // queued, but do not affect other operations. ChangeReason::ApiRequest(ExternalRequest::Reboot) => { - assert!(matches!(self.allowed.start, Disposition::Ignore)); - assert!(!matches!( - self.allowed.migrate_as_target, - Disposition::Enqueue - )); - AllowedRequests { reboot: Disposition::Ignore, ..self.allowed } } @@ -370,10 +267,6 @@ impl ExternalRequestQueue { // queued. Additional requests to stop are ignored for idempotency. ChangeReason::ApiRequest(ExternalRequest::Stop) => { AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::HaltPending, - ), - start: Disposition::Deny(DenyReason::HaltPending), migrate_as_source: Disposition::Deny( DenyReason::HaltPending, ), @@ -386,8 +279,6 @@ impl ExternalRequestQueue { // to reboot it become valid. ChangeReason::StateChange(InstanceStateChange::StartedRunning) => { AllowedRequests { - migrate_as_target: self.allowed.migrate_as_target, - start: self.allowed.start, migrate_as_source: Disposition::Enqueue, reboot: Disposition::Enqueue, stop: self.allowed.stop, @@ -415,10 +306,6 @@ impl ExternalRequestQueue { // "deny". 
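+            // (Note the difference between the two arms below: a clean stop
+            // makes further stop requests idempotent no-ops, while a failure
+            // leaves the stop disposition untouched so a stop can still be
+            // queued to let the state worker clean up.)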
ChangeReason::StateChange(InstanceStateChange::Stopped) => { AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::InstanceNotActive, - ), - start: Disposition::Deny(DenyReason::InstanceNotActive), migrate_as_source: Disposition::Deny( DenyReason::InstanceNotActive, ), @@ -428,10 +315,6 @@ impl ExternalRequestQueue { } ChangeReason::StateChange(InstanceStateChange::Failed) => { AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::InstanceFailed, - ), - start: Disposition::Deny(DenyReason::InstanceFailed), migrate_as_source: Disposition::Deny( DenyReason::InstanceFailed, ), @@ -453,18 +336,6 @@ mod test { slog::Logger::root(slog::Discard, slog::o!()) } - fn make_migrate_as_target_request() -> ExternalRequest { - let task = tokio::task::spawn(async { Ok(()) }); - let (start_tx, _) = tokio::sync::oneshot::channel(); - let (_, command_rx) = tokio::sync::mpsc::channel(1); - ExternalRequest::MigrateAsTarget { - migration_id: Uuid::new_v4(), - task, - start_tx, - command_rx, - } - } - fn make_migrate_as_source_request() -> ExternalRequest { let task = tokio::task::spawn(async { Ok(()) }); let (start_tx, _) = tokio::sync::oneshot::channel(); @@ -479,47 +350,10 @@ mod test { } } - #[tokio::test] - async fn migrate_as_target_is_idempotent() { - let mut queue = ExternalRequestQueue::new(test_logger()); - - // Requests to migrate as a target should queue normally at first. - assert!(queue.migrate_as_target_will_enqueue().unwrap()); - - // After queuing such a request, subsequent requests should be allowed - // without enqueuing anything. - assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); - assert!(!queue.migrate_as_target_will_enqueue().unwrap()); - - // Pop the request and tell the queue the instance is running. - assert!(matches!( - queue.pop_front(), - Some(ExternalRequest::MigrateAsTarget { .. }) - )); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - - // Because the instance was started via migration in, future requests - // to migrate in should be allowed. - assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); - assert!(!queue.migrate_as_target_will_enqueue().unwrap()); - } - - #[tokio::test] - async fn migrate_as_target_is_forbidden_after_cold_boot() { - let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - - assert!(queue.migrate_as_target_will_enqueue().is_err()); - assert!(queue.try_queue(make_migrate_as_target_request()).is_err()); - } - #[tokio::test] async fn migrate_as_source_is_not_idempotent() { // Simulate a running instance. let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Requests to migrate out should be allowed. 
@@ -558,8 +392,6 @@ mod test {
     #[tokio::test]
     async fn stop_requests_enqueue_after_vm_failure() {
         let mut queue = ExternalRequestQueue::new(test_logger());
-        assert!(queue.try_queue(ExternalRequest::Start).is_ok());
-        assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start)));
         queue.notify_instance_state_change(InstanceStateChange::Failed);
 
         assert!(queue.try_queue(ExternalRequest::Stop).is_ok());
@@ -569,8 +401,6 @@ mod test {
     #[tokio::test]
     async fn reboot_requests_are_idempotent_except_when_stopping() {
         let mut queue = ExternalRequestQueue::new(test_logger());
-        assert!(queue.try_queue(ExternalRequest::Start).is_ok());
-        assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start)));
         queue.notify_instance_state_change(InstanceStateChange::StartedRunning);
 
         // Once the instance is started, reboot requests should be allowed, but
diff --git a/crates/propolis-server-config/src/lib.rs b/crates/propolis-server-config/src/lib.rs
index 303b08f24..796683485 100644
--- a/crates/propolis-server-config/src/lib.rs
+++ b/crates/propolis-server-config/src/lib.rs
@@ -14,7 +14,7 @@ pub use cpuid_profile_config::CpuidProfile;
 /// Configuration for the Propolis server.
 // NOTE: This is expected to change over time; portions of the hard-coded
 // configuration will likely become more dynamic.
-#[derive(Clone, Serialize, Deserialize, Debug, PartialEq)]
+#[derive(Serialize, Deserialize, Debug, PartialEq)]
 pub struct Config {
     pub bootrom: PathBuf,
 
From 301ad33d569e67eb4f96cba75f10150c1178ce46 Mon Sep 17 00:00:00 2001
From: Greg Colombo
Date: Fri, 21 Jun 2024 16:27:31 +0000
Subject: [PATCH 03/55] [WIP] checkpoint: move some event handling logic

---
 bin/propolis-server/src/lib/initializer.rs  |   1 +
 .../src/lib/vm2/guest_event.rs              |   6 +-
 .../src/lib/vm2/lifecycle_ops.rs            |  82 ++++
 bin/propolis-server/src/lib/vm2/mod.rs      |  75 ++--
 .../src/lib/vm2/state_driver.rs             | 359 +++++++++++++++++-
 lib/propolis/src/block/crucible.rs          |   2 +
 6 files changed, 486 insertions(+), 39 deletions(-)
 create mode 100644 bin/propolis-server/src/lib/vm2/lifecycle_ops.rs

diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs
index e738d6337..ddd6e8043 100644
--- a/bin/propolis-server/src/lib/initializer.rs
+++ b/bin/propolis-server/src/lib/initializer.rs
@@ -110,6 +110,7 @@ pub struct MachineInitializerState {
 pub struct MachineInitializer<'a> {
     pub(crate) log: slog::Logger,
     pub(crate) machine: &'a Machine,
+    // TODO(gjc) clean up types here
     pub(crate) devices: DeviceMap,
     pub(crate) block_backends: BlockBackendMap,
     pub(crate) crucible_backends: CrucibleBackendMap,
diff --git a/bin/propolis-server/src/lib/vm2/guest_event.rs b/bin/propolis-server/src/lib/vm2/guest_event.rs
index 7a5b29f7b..05e0f4998 100644
--- a/bin/propolis-server/src/lib/vm2/guest_event.rs
+++ b/bin/propolis-server/src/lib/vm2/guest_event.rs
@@ -56,7 +56,11 @@ impl GuestEventQueue {
         }
     }
 
-    pub(super) fn dequeue(&mut self) -> Option<GuestEvent> {
+    pub(super) fn pop_front(&mut self) -> Option<GuestEvent> {
         self.queue.pop_front()
     }
+
+    pub(super) fn is_empty(&self) -> bool {
+        self.queue.is_empty()
+    }
 }
diff --git a/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs b/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs
new file mode 100644
index 000000000..5e6d25d26
--- /dev/null
+++ b/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs
@@ -0,0 +1,82 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+/// Commands that the VM state driver can invoke on its active VM to pause,
+/// resume, and reset the devices under its care.
+///
+/// These functions are abstracted into a trait to allow them to be mocked out
+/// while testing the rest of the state driver.
+#[cfg_attr(test, mockall::automock)]
+pub(super) trait VmLifecycle {
+    /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated
+    /// devices and vCPUs are brought to a consistent state.
+    ///
+    /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl)
+    /// will fail. A corresponding `resume_vm()` call must be made prior to
+    /// allowing vCPU tasks to run.
+    fn pause_vm(&self);
+
+    /// Resume a previously-paused VM at the kernel VMM level. This will resume
+    /// any timers driving in-kernel-emulated devices, and allow the vCPU to run
+    /// again.
+    fn resume_vm(&self);
+
+    /// Sends a reset request to each device in the instance, then sends a
+    /// reset command to the instance's bhyve VM.
+    fn reset_devices_and_machine(&self);
+
+    /// Sends each device (and backend) a start request.
+    fn start_devices(&self) -> anyhow::Result<()>;
+
+    /// Sends each device a pause request, then waits for all these requests to
+    /// complete.
+    fn pause_devices(&self);
+
+    /// Sends each device a resume request.
+    fn resume_devices(&self);
+
+    /// Sends each device (and backend) a halt request.
+    fn halt_devices(&self);
+
+    /// Resets the state of each vCPU in the instance to its on-reboot state.
+    fn reset_vcpu_state(&self);
+}
+
+impl VmLifecycle for super::ActiveVm {
+    fn pause_vm(&self) {
+        self.objects.machine.hdl.pause().expect("VM_PAUSE should succeed");
+    }
+
+    fn resume_vm(&self) {
+        self.objects.machine.hdl.resume().expect("VM_RESUME should succeed");
+    }
+
+    fn reset_devices_and_machine(&self) {
+        self.objects.for_each_device(|name, dev| {
+            dev.reset();
+        });
+
+        self.objects.machine.reinitialize().unwrap();
+    }
+
+    fn start_devices(&self) -> anyhow::Result<()> {
+        todo!()
+    }
+
+    fn pause_devices(&self) {
+        todo!()
+    }
+
+    fn resume_devices(&self) {
+        todo!()
+    }
+
+    fn halt_devices(&self) {
+        todo!()
+    }
+
+    fn reset_vcpu_state(&self) {
+        todo!()
+    }
+}
diff --git a/bin/propolis-server/src/lib/vm2/mod.rs b/bin/propolis-server/src/lib/vm2/mod.rs
index 95fd32ce6..cedc41315 100644
--- a/bin/propolis-server/src/lib/vm2/mod.rs
+++ b/bin/propolis-server/src/lib/vm2/mod.rs
@@ -7,9 +7,11 @@
 
 use std::{
     collections::BTreeMap,
+    path::PathBuf,
     sync::{Arc, Mutex, RwLock, RwLockReadGuard, Weak},
 };
 
+use oximeter::types::ProducerRegistry;
 use propolis::{
     hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart},
     vmm::Machine,
 };
@@ -23,6 +25,7 @@ use uuid::Uuid;
 use crate::{serial::Serial, vm::VmControllerError};
 
 mod guest_event;
+mod lifecycle_ops;
 mod migrate_commands;
 mod request_queue;
 mod state_driver;
@@ -54,6 +57,27 @@ pub(crate) struct Vm {
     state: RwLock<VmState>,
 }
 
+struct VmObjects {
+    machine: Machine,
+    lifecycle_components: LifecycleMap,
+    block_backends: BlockBackendMap,
+    crucible_backends: CrucibleBackendMap,
+    com1: Arc<Serial<LpcUart>>,
+    framebuffer: Option<Arc<RamFb>>,
+    ps2ctrl: Arc<PS2Ctrl>,
+}
+
+impl VmObjects {
+    fn for_each_device(
+        &self,
+        mut func: impl FnMut(&str, &Arc<dyn propolis::common::Lifecycle>),
+    ) {
+        for (name, dev) in self.lifecycle_components.iter() {
+            func(name, dev);
+        }
+    }
+}
+
 /// The state stored in a [`Vm`] when there is an actual underlying virtual
 /// machine.
pub(super) struct ActiveVm { @@ -65,45 +89,41 @@ pub(super) struct ActiveVm { properties: InstanceProperties, spec: InstanceSpecV0, - machine: Machine, - lifecycle_components: LifecycleMap, - block_backends: BlockBackendMap, - crucible_backends: CrucibleBackendMap, - com1: Arc>, - framebuffer: Option>, - ps2ctrl: Arc, - migration_src_state: - tokio::sync::Mutex, + objects: VmObjects, } impl Drop for ActiveVm { fn drop(&mut self) { let mut guard = self.parent.state.write().unwrap(); - std::mem::replace( - &mut *guard, - VmState::Defunct(DefunctVm { - external_state_rx: self.external_state_rx.clone(), - properties: self.properties.clone(), - spec: self.spec.clone(), - }), - ); + *guard = VmState::Defunct(DefunctVm { + external_state_rx: self.external_state_rx.clone(), + properties: self.properties.clone(), + spec: self.spec.clone(), + }); } } -pub struct DefunctVm { +struct DefunctVm { external_state_rx: InstanceStateRx, properties: InstanceProperties, spec: InstanceSpecV0, } #[allow(clippy::large_enum_variant)] -pub enum VmState { +enum VmState { NoVm, WaitingToStart, Active(Weak), Defunct(DefunctVm), } +pub(super) struct EnsureOptions { + pub toml_config: Arc, + pub use_reservoir: bool, + pub oximeter_registry: Option, + pub nexus_client: Option, +} + impl Vm { pub fn new() -> Arc { Arc::new(Self { state: RwLock::new(VmState::NoVm) }) @@ -137,10 +157,7 @@ impl Vm { let old = std::mem::replace(&mut *guard, VmState::NoVm); match old { VmState::WaitingToStart => { - std::mem::replace( - &mut *guard, - VmState::Active(Arc::downgrade(&active)), - ); + *guard = VmState::Active(Arc::downgrade(&active)) } _ => unreachable!( "only a starting VM's state worker calls make_active" @@ -152,17 +169,19 @@ impl Vm { self: &Arc, log: slog::Logger, ensure_request: propolis_api_types::InstanceSpecEnsureRequest, + options: EnsureOptions, ) -> anyhow::Result<(), VmError> { // Take the lock for writing, since in the common case this call will be // creating a new VM and there's no easy way to upgrade from a reader // lock to a writer lock. 
- let guard = self.state.write().unwrap(); + let mut guard = self.state.write().unwrap(); - // if matches!(*guard, VmState::WaitingToStart | VmState::Active(_)) { return Err(VmError::AlreadyInitialized); } + *guard = VmState::WaitingToStart; + let (external_tx, external_rx) = tokio::sync::watch::channel(InstanceStateMonitorResponse { gen: 1, @@ -184,8 +203,8 @@ impl Vm { external_tx, ); - let _ = tokio::spawn(async move { - state_driver.run(ensure_request, external_rx).await + tokio::spawn(async move { + state_driver.run(ensure_request, options, external_rx).await }); Ok(()) diff --git a/bin/propolis-server/src/lib/vm2/state_driver.rs b/bin/propolis-server/src/lib/vm2/state_driver.rs index 76cec6c5f..1230c1356 100644 --- a/bin/propolis-server/src/lib/vm2/state_driver.rs +++ b/bin/propolis-server/src/lib/vm2/state_driver.rs @@ -9,9 +9,74 @@ use std::{ time::Duration, }; -use propolis_server_config::Config; +use propolis_api_types::{ + instance_spec::VersionedInstanceSpec, InstanceProperties, InstanceState, +}; +use slog::info; +use uuid::Uuid; + +use crate::{ + initializer::{ + build_instance, MachineInitializer, MachineInitializerState, + }, + migrate::MigrateRole, + vcpu_tasks::{VcpuTaskController, VcpuTasks}, +}; + +use super::{ + guest_event::{self, GuestEvent}, + lifecycle_ops, +}; -use super::guest_event; +struct MigrationStateUpdate { + state: propolis_api_types::MigrationState, + id: Uuid, + role: MigrateRole, +} + +impl MigrationStateUpdate { + fn apply_to( + self, + old: propolis_api_types::InstanceMigrateStatusResponse, + ) -> propolis_api_types::InstanceMigrateStatusResponse { + let new = propolis_api_types::InstanceMigrationStatus { + id: self.id, + state: self.state, + }; + match self.role { + MigrateRole::Destination => { + propolis_api_types::InstanceMigrateStatusResponse { + migration_in: Some(new), + migration_out: old.migration_out, + } + } + MigrateRole::Source => { + propolis_api_types::InstanceMigrateStatusResponse { + migration_in: old.migration_in, + migration_out: Some(new), + } + } + } + } +} + +enum ExternalStateUpdate { + Instance(InstanceState), + Migration(MigrationStateUpdate), + Complete(InstanceState, MigrationStateUpdate), +} + +#[derive(Debug, PartialEq, Eq)] +enum HandleEventOutcome { + Continue, + Exit, +} + +#[derive(Debug)] +enum InputQueueEvent { + ExternalRequest(super::request_queue::ExternalRequest), + GuestEvent(GuestEvent), +} struct InputQueueInner { external_requests: super::request_queue::ExternalRequestQueue, @@ -41,6 +106,24 @@ impl InputQueue { cv: Condvar::new(), } } + + fn wait_for_next_event(&self) -> InputQueueEvent { + let guard = self.inner.lock().unwrap(); + let mut guard = self + .cv + .wait_while(guard, |i| { + i.external_requests.is_empty() && i.guest_events.is_empty() + }) + .unwrap(); + + if let Some(guest_event) = guard.guest_events.pop_front() { + InputQueueEvent::GuestEvent(guest_event) + } else { + InputQueueEvent::ExternalRequest( + guard.external_requests.pop_front().unwrap(), + ) + } + } } impl guest_event::GuestEventHandler for InputQueue { @@ -108,8 +191,10 @@ pub(super) struct StateDriver { parent_vm: Arc, input_queue: Arc, external_state_tx: super::InstanceStateTx, - state_gen: u64, paused: bool, + vcpu_tasks: Option, + vm_lifecycle: Option>, + migration_src_state: crate::migrate::source::PersistentState, } impl StateDriver { @@ -125,27 +210,281 @@ impl StateDriver { parent_vm: vm, input_queue, external_state_tx, - state_gen: 0, paused: false, + vcpu_tasks: None, + vm_lifecycle: None, + migration_src_state: 
Default::default(), } } pub(super) async fn run( - self, + mut self, ensure_request: propolis_api_types::InstanceSpecEnsureRequest, + ensure_options: super::EnsureOptions, external_state_rx: super::InstanceStateRx, ) { - if self.initialize_vm(ensure_request, external_state_rx).is_err() { + if let Ok(active) = self + .initialize_vm(ensure_request, ensure_options, external_state_rx) + .await + { + self.parent_vm.make_active(active.clone()); + self.vm_lifecycle = + Some(active as Arc); + } else { + // TODO(gjc) also publish that it failed. we're the only thing that + // has the external tx so need to do that from here self.parent_vm.start_failed(); return; } + + self.run_loop(); } - fn initialize_vm( - &self, - ensure_request: propolis_api_types::InstanceSpecEnsureRequest, + fn update_external_state(&mut self, state: ExternalStateUpdate) { + let (instance_state, migration_state) = match state { + ExternalStateUpdate::Instance(i) => (Some(i), None), + ExternalStateUpdate::Migration(m) => (None, Some(m)), + ExternalStateUpdate::Complete(i, m) => (Some(i), Some(m)), + }; + + let propolis_api_types::InstanceStateMonitorResponse { + state: old_instance, + migration: old_migration, + gen: old_gen, + } = self.external_state_tx.borrow().clone(); + + let state = instance_state.unwrap_or(old_instance); + let migration = if let Some(migration_state) = migration_state { + migration_state.apply_to(old_migration) + } else { + old_migration + }; + + let gen = old_gen + 1; + info!(self.log, "publishing new instance state"; + "gen" => gen, + "state" => ?state, + "migration" => ?migration); + + let _ = self.external_state_tx.send( + propolis_api_types::InstanceStateMonitorResponse { + gen, + state, + migration, + }, + ); + } + + async fn initialize_vm( + &mut self, + request: propolis_api_types::InstanceSpecEnsureRequest, + options: super::EnsureOptions, external_state_rx: super::InstanceStateRx, - ) -> anyhow::Result<()> { + ) -> anyhow::Result> { + let active_vm = match request.migrate { + None => { + let vm_objects = self + .initialize_vm_from_spec( + &request.properties, + &request.instance_spec, + options, + ) + .await?; + let VersionedInstanceSpec::V0(v0_spec) = request.instance_spec; + let active_vm = Arc::new(super::ActiveVm { + parent: self.parent_vm.clone(), + state_driver_queue: self.input_queue.clone(), + external_state_rx, + properties: request.properties, + spec: v0_spec, + objects: vm_objects, + }); + + active_vm + } + Some(_migrate_request) => todo!("gjc"), + }; + + Ok(active_vm) + } + + /// Initializes all of the components of a VM from the supplied + /// specification. + async fn initialize_vm_from_spec( + &mut self, + properties: &InstanceProperties, + spec: &VersionedInstanceSpec, + options: super::EnsureOptions, + ) -> anyhow::Result { + info!(self.log, "initializing new VM"; + "spec" => #?spec, + "properties" => #?properties, + "use_reservoir" => options.use_reservoir, + "bootrom" => %options.toml_config.bootrom.display()); + + let vmm_log = self.log.new(slog::o!("component" => "vmm")); + + // Set up the 'shell' instance into which the rest of this routine will + // add components. 
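+        // (build_instance is expected to create the kernel VMM and its
+        // guest memory; the `Machine` it returns is what the initializer
+        // below decorates with devices.)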
+ let VersionedInstanceSpec::V0(v0_spec) = &spec; + let machine = build_instance( + &properties.vm_name(), + v0_spec, + options.use_reservoir, + vmm_log, + )?; + + let mut init = MachineInitializer { + log: self.log.clone(), + machine: &machine, + devices: Default::default(), + block_backends: Default::default(), + crucible_backends: Default::default(), + spec: &v0_spec, + properties: &properties, + toml_config: &options.toml_config, + producer_registry: options.oximeter_registry, + state: MachineInitializerState::default(), + }; + + init.initialize_rom(options.toml_config.bootrom.as_path())?; + let chipset = init.initialize_chipset( + &(self.input_queue.clone() + as Arc), + )?; + + init.initialize_rtc(&chipset)?; + init.initialize_hpet()?; + + let com1 = Arc::new(init.initialize_uart(&chipset)?); + let ps2ctrl = init.initialize_ps2(&chipset)?; + init.initialize_qemu_debug_port()?; + init.initialize_qemu_pvpanic(properties.into())?; + init.initialize_network_devices(&chipset)?; + + #[cfg(not(feature = "omicron-build"))] + init.initialize_test_devices(&options.toml_config.devices)?; + #[cfg(feature = "omicron-build")] + info!( + self.log, + "`omicron-build` feature enabled, ignoring any test devices" + ); + + #[cfg(feature = "falcon")] + init.initialize_softnpu_ports(&chipset)?; + #[cfg(feature = "falcon")] + init.initialize_9pfs(&chipset)?; + + init.initialize_storage_devices(&chipset, options.nexus_client)?; + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; + init.initialize_cpus()?; + let vcpu_tasks = crate::vcpu_tasks::VcpuTasks::new( + &machine, + &(self.input_queue.clone() + as Arc), + self.log.new(slog::o!("component" => "vcpu_tasks")), + )?; + + let MachineInitializer { + devices, + block_backends, + crucible_backends, + .. + } = init; + + self.vcpu_tasks = Some(vcpu_tasks); + Ok(super::VmObjects { + machine, + lifecycle_components: devices, + block_backends, + crucible_backends, + com1, + framebuffer: Some(ramfb), + ps2ctrl, + }) + } + + fn run_loop(mut self) { + info!(self.log, "state driver launched"); + + loop { + let event = self.input_queue.wait_for_next_event(); + info!(self.log, "state driver handling event"; "event" => ?event); + + let outcome = match event { + InputQueueEvent::ExternalRequest(req) => { + self.handle_external_request(req).await + } + InputQueueEvent::GuestEvent(event) => { + self.handle_guest_event(event).await + } + }; + + let outcome = self.handle_event(event).await; + info!(self.log, "state driver handled event"; "outcome" => ?outcome); + if outcome == HandleEventOutcome::Exit { + break; + } + } + + info!(self.log, "state driver exiting"); + } + + async fn handle_guest_event( + &mut self, + event: GuestEvent, + ) -> HandleEventOutcome { + match event { + GuestEvent::VcpuSuspendHalt(_when) => { + info!(self.log, "Halting due to VM suspend event",); + self.do_halt(); + HandleEventOutcome::Exit + } + GuestEvent::VcpuSuspendReset(_when) => { + info!(self.log, "Resetting due to VM suspend event"); + self.do_reboot().await; + HandleEventOutcome::Continue + } + GuestEvent::VcpuSuspendTripleFault(vcpu_id, _when) => { + info!( + self.log, + "Resetting due to triple fault on vCPU {}", vcpu_id + ); + self.do_reboot().await; + HandleEventOutcome::Continue + } + GuestEvent::ChipsetHalt => { + info!(self.log, "Halting due to chipset-driven halt"); + self.do_halt(); + HandleEventOutcome::Exit + } + GuestEvent::ChipsetReset => { + info!(self.log, "Resetting due to chipset-driven reset"); + self.do_reboot().await; + HandleEventOutcome::Continue + } + } + 
} + + fn handle_external_request( + &mut self, + request: super::request_queue::ExternalRequest, + ) -> HandleEventOutcome { + todo!("gjc"); + } + + async fn do_reboot(&mut self) { + info!(self.log, "resetting instance"); + + self.update_external_state(ExternalStateUpdate::Instance( + InstanceState::Rebooting, + )); + + self.vcpu_tasks + .as_mut() + .expect("running instance has vCPUs") + .pause_all(); + todo!("gjc"); } } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index d662f01af..cf7feac65 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -87,6 +87,8 @@ impl CrucibleBackend { nexus_client: Option, log: slog::Logger, ) -> io::Result> { + // TODO(gjc) don't call `block_on` anymore, this is going to get called + // from an async context now let rt = tokio::runtime::Handle::current(); rt.block_on(async move { CrucibleBackend::_create( From df4658d9ebbbb896f5446fa8c6e009f7a6ea2360 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 21 Jun 2024 22:48:39 +0000 Subject: [PATCH 04/55] [WIP] more vm lifecycle routines --- .../src/lib/vm2/lifecycle_ops.rs | 131 ++++++++++++++++-- bin/propolis-server/src/lib/vm2/mod.rs | 17 ++- 2 files changed, 137 insertions(+), 11 deletions(-) diff --git a/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs b/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs index 5e6d25d26..906184295 100644 --- a/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs +++ b/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs @@ -2,13 +2,21 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use std::{ + pin::Pin, + task::{Context, Poll}, +}; + +use futures::{future::BoxFuture, stream::FuturesUnordered, Future, StreamExt}; +use slog::{error, info}; + /// Commands that the VM state driver can invoke on its active VM to pause, /// resume, and reset the devices under its care. /// /// These functions are abstracted into a trait to allow them to be mocked out /// while testing the rest of the state driver. #[cfg_attr(test, mockall::automock)] -pub(super) trait VmLifecycle { +pub(super) trait VmLifecycle: Send + Sync { /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated /// devices and vCPUs are brought to a consistent state. /// @@ -29,9 +37,11 @@ pub(super) trait VmLifecycle { /// Sends each device (and backend) a start request. fn start_devices(&self) -> anyhow::Result<()>; - /// Sends each device a pause request, then waits for all these requests to - /// complete. - fn pause_devices(&self); + /// Sends each device a pause request. Returns a future that can be awaited + /// to wait for all pause requests to complete. + fn pause_devices( + &self, + ) -> Pin + Send + 'static>>; /// Sends each device a resume request. 
fn resume_devices(&self); @@ -45,15 +55,18 @@ pub(super) trait VmLifecycle { impl VmLifecycle for super::ActiveVm { fn pause_vm(&self) { + info!(self.log, "pausing kernel VMM resources"); self.objects.machine.hdl.pause().expect("VM_PAUSE should succeed"); } fn resume_vm(&self) { + info!(self.log, "resuming kernel VMM resources"); self.objects.machine.hdl.resume().expect("VM_RESUME should succeed"); } fn reset_devices_and_machine(&self) { self.objects.for_each_device(|name, dev| { + info!(self.log, "sending reset request to {}", name); dev.reset(); }); @@ -61,22 +74,120 @@ impl VmLifecycle for super::ActiveVm { } fn start_devices(&self) -> anyhow::Result<()> { - todo!() + self.objects.for_each_device_fallible(|name, dev| { + info!(self.log, "sending startup complete to {}", name); + let res = dev.start(); + if let Err(e) = &res { + error!(self.log, "startup failed for {}: {:?}", name, e); + } + res + })?; + + for (name, backend) in self.objects.block_backends.iter() { + info!(self.log, "starting block backend {}", name); + let res = backend.start(); + if let Err(e) = &res { + error!(self.log, "Startup failed for {}: {:?}", name, e); + return res; + } + } + Ok(()) } - fn pause_devices(&self) { - todo!() + fn pause_devices( + &self, + ) -> Pin + Send + 'static>> { + self.objects.for_each_device(|name, dev| { + info!(self.log, "sending pause request to {}", name); + dev.pause(); + }); + + struct NamedFuture { + name: String, + future: BoxFuture<'static, ()>, + } + + impl std::future::Future for NamedFuture { + type Output = String; + + fn poll( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll { + let mut_self = self.get_mut(); + match Pin::new(&mut mut_self.future).poll(cx) { + Poll::Pending => Poll::Pending, + Poll::Ready(()) => Poll::Ready(mut_self.name.clone()), + } + } + } + + info!(self.log, "waiting for devices to pause"); + let mut stream: FuturesUnordered<_> = self + .objects + .lifecycle_components + .iter() + .map(|(name, dev)| { + info!(self.log, "got paused future from dev {}", name); + NamedFuture { name: name.clone(), future: dev.paused() } + }) + .collect(); + + let log_fut = self.log.clone(); + Box::pin(async move { + loop { + match stream.next().await { + Some(name) => { + info!(log_fut, "dev {} completed pause", name); + } + + None => { + info!(log_fut, "all devices paused"); + break; + } + } + } + }) } fn resume_devices(&self) { - todo!() + self.objects.for_each_device(|name, dev| { + info!(self.log, "sending resume request to {}", name); + dev.resume(); + }) } fn halt_devices(&self) { - todo!() + self.objects.for_each_device(|name, dev| { + info!(self.log, "sending halt request to {}", name); + dev.halt(); + }); + + for (name, backend) in self.objects.block_backends.iter() { + info!(self.log, "stopping and detaching block backend {}", name); + backend.stop(); + if let Err(err) = backend.detach() { + error!(self.log, "error detaching block backend"; + "name" => name, + "error" => ?err); + } + } } fn reset_vcpu_state(&self) { - todo!() + for vcpu in self.objects.machine.vcpus.iter() { + info!(self.log, "resetting vCPU {}", vcpu.id); + vcpu.activate().unwrap(); + vcpu.reboot_state().unwrap(); + if vcpu.is_bsp() { + info!(self.log, "Resetting BSP vCPU {}", vcpu.id); + vcpu.set_run_state(propolis::bhyve_api::VRS_RUN, None).unwrap(); + vcpu.set_reg( + propolis::bhyve_api::vm_reg_name::VM_REG_GUEST_RIP, + 0xfff0, + ) + .unwrap(); + } + } } } diff --git a/bin/propolis-server/src/lib/vm2/mod.rs b/bin/propolis-server/src/lib/vm2/mod.rs index cedc41315..f0170d168 100644 --- 
a/bin/propolis-server/src/lib/vm2/mod.rs +++ b/bin/propolis-server/src/lib/vm2/mod.rs @@ -72,16 +72,31 @@ impl VmObjects { &self, mut func: impl FnMut(&str, &Arc), ) { - for (name, dev) in self.vm_objects.devices.iter() { + for (name, dev) in self.lifecycle_components.iter() { func(name, dev); } } + + fn for_each_device_fallible( + &self, + mut func: impl FnMut( + &str, + &Arc, + ) -> std::result::Result<(), E>, + ) -> std::result::Result<(), E> { + for (name, dev) in self.lifecycle_components.iter() { + func(name, dev)?; + } + + Ok(()) + } } /// The state stored in a [`Vm`] when there is an actual underlying virtual /// machine. pub(super) struct ActiveVm { parent: Arc, + log: slog::Logger, state_driver_queue: Arc, external_state_rx: InstanceStateRx, From 356f84ff8d739ff84d0080a4d987d3bc7f073c4e Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 21 Jun 2024 23:36:57 +0000 Subject: [PATCH 05/55] [WIP] do_reboot --- .../src/lib/vm2/state_driver.rs | 55 ++++++++++++++++--- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/bin/propolis-server/src/lib/vm2/state_driver.rs b/bin/propolis-server/src/lib/vm2/state_driver.rs index 1230c1356..14a665c5f 100644 --- a/bin/propolis-server/src/lib/vm2/state_driver.rs +++ b/bin/propolis-server/src/lib/vm2/state_driver.rs @@ -124,6 +124,14 @@ impl InputQueue { ) } } + + fn notify_instance_state_change( + &self, + state: super::request_queue::InstanceStateChange, + ) { + let guard = self.inner.lock().unwrap(); + guard.external_requests.notify_instance_state_change(state); + } } impl guest_event::GuestEventHandler for InputQueue { @@ -192,7 +200,7 @@ pub(super) struct StateDriver { input_queue: Arc, external_state_tx: super::InstanceStateTx, paused: bool, - vcpu_tasks: Option, + vcpu_tasks: Option>, vm_lifecycle: Option>, migration_src_state: crate::migrate::source::PersistentState, } @@ -237,7 +245,7 @@ impl StateDriver { return; } - self.run_loop(); + self.run_loop().await; } fn update_external_state(&mut self, state: ExternalStateUpdate) { @@ -404,7 +412,7 @@ impl StateDriver { }) } - fn run_loop(mut self) { + async fn run_loop(mut self) { info!(self.log, "state driver launched"); loop { @@ -480,11 +488,42 @@ impl StateDriver { InstanceState::Rebooting, )); - self.vcpu_tasks - .as_mut() - .expect("running instance has vCPUs") - .pause_all(); + // Reboot is implemented as a pause -> reset -> resume transition. + // + // First, pause the vCPUs and all devices so no partially-completed + // work is present. + self.vcpu_tasks().pause_all(); + self.vm_lifecycle().pause_devices().await; + + // Reset all entities and the VM's bhyve state, then reset the vCPUs. + // The vCPU reset must come after the bhyve reset. + self.vm_lifecycle().reset_devices_and_machine(); + self.reset_vcpus(); + + // Resume devices so they're ready to do more work, then resume vCPUs. + self.vm_lifecycle().resume_devices(); + self.vcpu_tasks().resume_all(); + + // Notify other consumers that the instance successfully rebooted and is + // now back to Running. 
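+        // (The request queue uses these notifications to decide which
+        // external requests remain valid to accept.)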
+ self.input_queue.notify_instance_state_change( + super::request_queue::InstanceStateChange::Rebooted, + ); + self.update_external_state(ExternalStateUpdate::Instance( + InstanceState::Running, + )); + } + + fn reset_vcpus(&mut self) { + self.vcpu_tasks().new_generation(); + self.vm_lifecycle.as_ref().unwrap().reset_vcpu_state(); + } - todo!("gjc"); + fn vcpu_tasks(&mut self) -> &mut dyn VcpuTaskController { + self.vcpu_tasks.as_mut().unwrap().as_mut() + } + + fn vm_lifecycle(&self) -> &dyn lifecycle_ops::VmLifecycle { + self.vm_lifecycle.as_ref().unwrap().as_ref() } } From a7af98f13c534a50a71e7dd32fcc632115510be6 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 21 Jun 2024 23:53:01 +0000 Subject: [PATCH 06/55] [WIP] migration's not there but it builds at least --- bin/propolis-server/src/lib/initializer.rs | 2 +- bin/propolis-server/src/lib/vcpu_tasks.rs | 6 +- bin/propolis-server/src/lib/vm2/mod.rs | 8 +- .../src/lib/vm2/state_driver.rs | 91 ++++++++++++++++--- 4 files changed, 86 insertions(+), 21 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index ddd6e8043..a8b5d09cb 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -186,7 +186,7 @@ impl<'a> MachineInitializer<'a> { pub fn initialize_chipset( &mut self, - event_handler: &Arc, + event_handler: &Arc, ) -> Result { let mut pci_builder = pci::topology::Builder::new(); for (name, bridge) in &self.spec.devices.pci_pci_bridges { diff --git a/bin/propolis-server/src/lib/vcpu_tasks.rs b/bin/propolis-server/src/lib/vcpu_tasks.rs index cdbb20412..d9af3adda 100644 --- a/bin/propolis-server/src/lib/vcpu_tasks.rs +++ b/bin/propolis-server/src/lib/vcpu_tasks.rs @@ -30,7 +30,7 @@ pub struct VcpuTasks { } #[cfg_attr(test, mockall::automock)] -pub(crate) trait VcpuTaskController { +pub(crate) trait VcpuTaskController: Send { fn new_generation(&self); fn pause_all(&mut self); fn resume_all(&mut self); @@ -40,7 +40,7 @@ pub(crate) trait VcpuTaskController { impl VcpuTasks { pub(crate) fn new( machine: &propolis::Machine, - event_handler: Arc, + event_handler: Arc, log: slog::Logger, ) -> Result { let generation = Arc::new(AtomicUsize::new(0)); @@ -72,7 +72,7 @@ impl VcpuTasks { fn vcpu_loop( vcpu: &Vcpu, task: propolis::tasks::TaskHdl, - event_handler: Arc, + event_handler: Arc, generation: Arc, log: slog::Logger, ) { diff --git a/bin/propolis-server/src/lib/vm2/mod.rs b/bin/propolis-server/src/lib/vm2/mod.rs index f0170d168..93f50de06 100644 --- a/bin/propolis-server/src/lib/vm2/mod.rs +++ b/bin/propolis-server/src/lib/vm2/mod.rs @@ -7,8 +7,7 @@ use std::{ collections::BTreeMap, - path::PathBuf, - sync::{Arc, Mutex, RwLock, RwLockReadGuard, Weak}, + sync::{Arc, RwLock, RwLockReadGuard, Weak}, }; use oximeter::types::ProducerRegistry; @@ -20,11 +19,10 @@ use propolis_api_types::{ instance_spec::v0::InstanceSpecV0, InstanceProperties, InstanceStateMonitorResponse, }; -use uuid::Uuid; -use crate::{serial::Serial, vm::VmControllerError}; +use crate::serial::Serial; -mod guest_event; +pub(crate) mod guest_event; mod lifecycle_ops; mod migrate_commands; mod request_queue; diff --git a/bin/propolis-server/src/lib/vm2/state_driver.rs b/bin/propolis-server/src/lib/vm2/state_driver.rs index 14a665c5f..05fd6fb3f 100644 --- a/bin/propolis-server/src/lib/vm2/state_driver.rs +++ b/bin/propolis-server/src/lib/vm2/state_driver.rs @@ -20,7 +20,7 @@ use crate::{ build_instance, MachineInitializer, MachineInitializerState, }, 
     migrate::MigrateRole,
-    vcpu_tasks::{VcpuTaskController, VcpuTasks},
+    vcpu_tasks::VcpuTaskController,
 };
 
 use super::{
@@ -129,7 +129,7 @@ impl InputQueue {
         &self,
         state: super::request_queue::InstanceStateChange,
     ) {
-        let guard = self.inner.lock().unwrap();
+        let mut guard = self.inner.lock().unwrap();
         guard.external_requests.notify_instance_state_change(state);
     }
 }
@@ -301,6 +301,7 @@ impl StateDriver {
             let VersionedInstanceSpec::V0(v0_spec) = request.instance_spec;
             let active_vm = Arc::new(super::ActiveVm {
                 parent: self.parent_vm.clone(),
+                log: self.log.clone(),
                 state_driver_queue: self.input_queue.clone(),
                 external_state_rx,
                 properties: request.properties,
@@ -386,12 +387,12 @@ impl StateDriver {
         init.initialize_storage_devices(&chipset, options.nexus_client)?;
         let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?;
         init.initialize_cpus()?;
-        let vcpu_tasks = crate::vcpu_tasks::VcpuTasks::new(
+        let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new(
             &machine,
-            &(self.input_queue.clone()
-                as Arc<dyn super::guest_event::VcpuEventHandler>),
+            self.input_queue.clone()
+                as Arc<dyn super::guest_event::VcpuEventHandler>,
             self.log.new(slog::o!("component" => "vcpu_tasks")),
-        )?;
+        )?);
 
         let MachineInitializer {
             devices,
@@ -400,7 +401,7 @@ impl StateDriver {
             ..
         } = init;
 
-        self.vcpu_tasks = Some(vcpu_tasks);
+        self.vcpu_tasks = Some(vcpu_tasks as Box<dyn VcpuTaskController>);
         Ok(super::VmObjects {
             machine,
             lifecycle_components: devices,
@@ -428,7 +429,6 @@ impl StateDriver {
                 }
             };
 
-            let outcome = self.handle_event(event).await;
             info!(self.log, "state driver handled event"; "outcome" => ?outcome);
             if outcome == HandleEventOutcome::Exit {
                 break;
@@ -445,7 +445,7 @@
         match event {
             GuestEvent::VcpuSuspendHalt(_when) => {
                 info!(self.log, "Halting due to VM suspend event",);
-                self.do_halt();
+                self.do_halt().await;
                 HandleEventOutcome::Exit
             }
             GuestEvent::VcpuSuspendReset(_when) => {
@@ -463,7 +463,7 @@
             }
             GuestEvent::ChipsetHalt => {
                 info!(self.log, "Halting due to chipset-driven halt");
-                self.do_halt();
+                self.do_halt().await;
                 HandleEventOutcome::Exit
             }
             GuestEvent::ChipsetReset => {
@@ -474,11 +474,23 @@
         }
     }
 
-    fn handle_external_request(
+    async fn handle_external_request(
        &mut self,
         request: super::request_queue::ExternalRequest,
     ) -> HandleEventOutcome {
-        todo!("gjc");
+        match request {
+            super::request_queue::ExternalRequest::MigrateAsSource {
+                ..
+            } => todo!("gjc"),
+            super::request_queue::ExternalRequest::Reboot => {
+                self.do_reboot().await;
+                HandleEventOutcome::Continue
+            }
+            super::request_queue::ExternalRequest::Stop => {
+                self.do_halt().await;
+                HandleEventOutcome::Exit
+            }
+        }
     }
 
     async fn do_reboot(&mut self) {
@@ -514,11 +526,66 @@
         ));
     }
 
+    async fn do_halt(&mut self) {
+        info!(self.log, "stopping instance");
+        self.update_external_state(ExternalStateUpdate::Instance(
+            InstanceState::Stopping,
+        ));
+
+        // Entities expect to be paused before being halted. Note that the VM
+        // may be paused already if it is being torn down after a successful
+        // migration out.
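+        // (A VM that has migrated out stays paused after the source
+        // protocol completes, so `self.paused` is already true on that
+        // path.)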
+ if !self.paused { + self.pause().await; + } + + self.vcpu_tasks().exit_all(); + self.vm_lifecycle().halt_devices(); + self.publish_steady_state(InstanceState::Stopped); + } + + async fn pause(&mut self) { + assert!(!self.paused); + self.vcpu_tasks().pause_all(); + self.vm_lifecycle().pause_devices().await; + self.vm_lifecycle().pause_vm(); + self.paused = true; + } + + fn resume(&mut self) { + assert!(self.paused); + self.vm_lifecycle().resume_vm(); + self.vm_lifecycle().resume_devices(); + self.vcpu_tasks().resume_all(); + self.paused = false; + } + fn reset_vcpus(&mut self) { self.vcpu_tasks().new_generation(); self.vm_lifecycle.as_ref().unwrap().reset_vcpu_state(); } + fn publish_steady_state(&mut self, state: InstanceState) { + let change = match state { + InstanceState::Running => { + super::request_queue::InstanceStateChange::StartedRunning + } + InstanceState::Stopped => { + super::request_queue::InstanceStateChange::Stopped + } + InstanceState::Failed => { + super::request_queue::InstanceStateChange::Failed + } + _ => panic!( + "Called publish_steady_state on non-terminal state {:?}", + state + ), + }; + + self.input_queue.notify_instance_state_change(change); + self.update_external_state(ExternalStateUpdate::Instance(state)); + } + fn vcpu_tasks(&mut self) -> &mut dyn VcpuTaskController { self.vcpu_tasks.as_mut().unwrap().as_mut() } From 52d6eb80684ca6329239774b2a6ef7b853e71875 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Sat, 22 Jun 2024 02:19:11 +0000 Subject: [PATCH 07/55] [WIP] rip out block_on --- bin/propolis-server/src/lib/initializer.rs | 10 +- .../src/lib/vm2/lifecycle_ops.rs | 50 +++---- .../src/lib/vm2/state_driver.rs | 2 +- bin/propolis-standalone/src/main.rs | 30 ++-- lib/propolis/src/block/crucible.rs | 137 +++++++++--------- lib/propolis/src/block/file.rs | 29 ++-- lib/propolis/src/block/in_memory.rs | 30 ++-- lib/propolis/src/block/mem_async.rs | 50 ++++--- lib/propolis/src/block/mod.rs | 5 +- lib/propolis/src/hw/chipset/mod.rs | 2 +- lib/propolis/src/tasks.rs | 24 ++- 11 files changed, 198 insertions(+), 171 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index a8b5d09cb..fd95c58cb 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -372,7 +372,7 @@ impl<'a> MachineInitializer<'a> { Ok(()) } - fn create_storage_backend_from_spec( + async fn create_storage_backend_from_spec( &self, backend_spec: &instance_spec::v0::StorageBackendV0, backend_name: &str, @@ -410,7 +410,8 @@ impl<'a> MachineInitializer<'a> { self.log.new( slog::o!("component" => format!("crucible-{cru_id}")), ), - )?; + ) + .await?; let crucible = Some((be.get_uuid()?, be.clone())); Ok(StorageBackendInstance { be, crucible }) @@ -481,7 +482,7 @@ impl<'a> MachineInitializer<'a> { /// /// On success, returns a map from Crucible backend IDs to Crucible /// backends. 
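     /// (Async because storage backend creation may now await Crucible
     /// volume construction instead of calling `block_on`.)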
- pub fn initialize_storage_devices( + pub async fn initialize_storage_devices( &mut self, chipset: &RegisteredChipset, nexus_client: Option, @@ -538,7 +539,8 @@ impl<'a> MachineInitializer<'a> { backend_spec, backend_name, &nexus_client, - )?; + ) + .await?; self.block_backends.insert(backend_name.clone(), backend.clone()); match device_interface { diff --git a/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs b/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs index 906184295..4e16c5ece 100644 --- a/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs +++ b/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs @@ -7,7 +7,7 @@ use std::{ task::{Context, Poll}, }; -use futures::{future::BoxFuture, stream::FuturesUnordered, Future, StreamExt}; +use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt}; use slog::{error, info}; /// Commands that the VM state driver can invoke on its active VM to pause, @@ -35,13 +35,11 @@ pub(super) trait VmLifecycle: Send + Sync { fn reset_devices_and_machine(&self); /// Sends each device (and backend) a start request. - fn start_devices(&self) -> anyhow::Result<()>; + fn start_devices(&self) -> BoxFuture<'_, anyhow::Result<()>>; /// Sends each device a pause request. Returns a future that can be awaited /// to wait for all pause requests to complete. - fn pause_devices( - &self, - ) -> Pin + Send + 'static>>; + fn pause_devices(&self) -> BoxFuture<'_, ()>; /// Sends each device a resume request. fn resume_devices(&self); @@ -73,30 +71,30 @@ impl VmLifecycle for super::ActiveVm { self.objects.machine.reinitialize().unwrap(); } - fn start_devices(&self) -> anyhow::Result<()> { - self.objects.for_each_device_fallible(|name, dev| { - info!(self.log, "sending startup complete to {}", name); - let res = dev.start(); - if let Err(e) = &res { - error!(self.log, "startup failed for {}: {:?}", name, e); - } - res - })?; - - for (name, backend) in self.objects.block_backends.iter() { - info!(self.log, "starting block backend {}", name); - let res = backend.start(); - if let Err(e) = &res { - error!(self.log, "Startup failed for {}: {:?}", name, e); - return res; + fn start_devices(&self) -> BoxFuture<'_, anyhow::Result<()>> { + Box::pin(async { + self.objects.for_each_device_fallible(|name, dev| { + info!(self.log, "sending startup complete to {}", name); + let res = dev.start(); + if let Err(e) = &res { + error!(self.log, "startup failed for {}: {:?}", name, e); + } + res + })?; + + for (name, backend) in self.objects.block_backends.iter() { + info!(self.log, "starting block backend {}", name); + let res = backend.start().await; + if let Err(e) = &res { + error!(self.log, "Startup failed for {}: {:?}", name, e); + return res; + } } - } - Ok(()) + Ok(()) + }) } - fn pause_devices( - &self, - ) -> Pin + Send + 'static>> { + fn pause_devices(&self) -> BoxFuture<'_, ()> { self.objects.for_each_device(|name, dev| { info!(self.log, "sending pause request to {}", name); dev.pause(); diff --git a/bin/propolis-server/src/lib/vm2/state_driver.rs b/bin/propolis-server/src/lib/vm2/state_driver.rs index 05fd6fb3f..8e5c6d9dd 100644 --- a/bin/propolis-server/src/lib/vm2/state_driver.rs +++ b/bin/propolis-server/src/lib/vm2/state_driver.rs @@ -384,7 +384,7 @@ impl StateDriver { #[cfg(feature = "falcon")] init.initialize_9pfs(&chipset)?; - init.initialize_storage_devices(&chipset, options.nexus_client)?; + init.initialize_storage_devices(&chipset, options.nexus_client).await?; let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; init.initialize_cpus()?; let vcpu_tasks = 
Box::new(crate::vcpu_tasks::VcpuTasks::new( diff --git a/bin/propolis-standalone/src/main.rs b/bin/propolis-standalone/src/main.rs index bdd4a3f97..1f210b661 100644 --- a/bin/propolis-standalone/src/main.rs +++ b/bin/propolis-standalone/src/main.rs @@ -359,22 +359,26 @@ impl Instance { // Drive block backends through their necessary states too match state { State::Run if first_boot => { - for (_name, be) in guard.inventory.block.iter() { - be.start().expect("blockdev start succeeds"); - } + tokio::runtime::Handle::current().block_on(async { + for (_name, be) in guard.inventory.block.iter() { + be.start().await.expect("blockdev start succeeds"); + } + }); } State::Halt => { - for (name, be) in guard.inventory.block.iter() { - be.stop(); - if let Err(err) = be.detach() { - slog::error!( - log, - "Error during detach of block backend {}: {:?}", - name, - err - ); + tokio::runtime::Handle::current().block_on(async { + for (name, be) in guard.inventory.block.iter() { + be.stop().await; + if let Err(err) = be.detach() { + slog::error!( + log, + "Error during detach of block backend {}: {:?}", + name, + err + ); + } } - } + }); } _ => {} } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index cf7feac65..9505ce2f7 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -16,6 +16,7 @@ use crucible::{ BlockIO, Buffer, CrucibleError, ReplaceResult, SnapshotDetails, Volume, }; use crucible_client_types::VolumeConstructionRequest; +use futures::future::BoxFuture; use oximeter::types::ProducerRegistry; use slog::{error, info}; use thiserror::Error; @@ -80,26 +81,21 @@ impl WorkerState { } impl CrucibleBackend { - pub fn create( + pub async fn create( request: VolumeConstructionRequest, opts: block::BackendOpts, producer_registry: Option, nexus_client: Option, log: slog::Logger, ) -> io::Result> { - // TODO(gjc) don't call `block_on` anymore, this is going to get called - // from an async context now - let rt = tokio::runtime::Handle::current(); - rt.block_on(async move { - CrucibleBackend::_create( - request, - opts, - producer_registry, - nexus_client, - log, - ) - .await - }) + CrucibleBackend::_create( + request, + opts, + producer_registry, + nexus_client, + log, + ) + .await .map_err(CrucibleError::into) } @@ -178,42 +174,41 @@ impl CrucibleBackend { /// Create Crucible backend using the in-memory volume backend, rather than /// "real" Crucible downstairs instances. - pub fn create_mem( + pub async fn create_mem( size: u64, opts: block::BackendOpts, log: slog::Logger, ) -> io::Result> { - let rt = tokio::runtime::Handle::current(); - rt.block_on(async move { - let block_size = u64::from(opts.block_size.ok_or_else(|| { - CrucibleError::GenericError( - "block_size is required parameter".into(), - ) - })?); - // Allocate and construct the volume. 
- let mem_disk = Arc::new(crucible::InMemoryBlockIO::new( - Uuid::new_v4(), - block_size, - size as usize, - )); - let mut volume = Volume::new(block_size, log); - volume.add_subvolume(mem_disk).await?; - - Ok(Arc::new(CrucibleBackend { - state: Arc::new(WorkerState { - attachment: block::BackendAttachment::new(), - volume, - info: block::DeviceInfo { - block_size: block_size as u32, - total_size: size / block_size, - read_only: opts.read_only.unwrap_or(false), - }, - skip_flush: opts.skip_flush.unwrap_or(false), - }), - workers: TaskGroup::new(), - })) - }) - .map_err(CrucibleError::into) + let block_size = u64::from(opts.block_size.ok_or_else(|| { + CrucibleError::GenericError( + "block_size is required parameter".into(), + ) + })?); + // Allocate and construct the volume. + let mem_disk = Arc::new(crucible::InMemoryBlockIO::new( + Uuid::new_v4(), + block_size, + size as usize, + )); + let mut volume = Volume::new(block_size, log); + volume + .add_subvolume(mem_disk) + .await + .map_err(|e| std::io::Error::from(e))?; + + Ok(Arc::new(CrucibleBackend { + state: Arc::new(WorkerState { + attachment: block::BackendAttachment::new(), + volume, + info: block::DeviceInfo { + block_size: block_size as u32, + total_size: size / block_size, + read_only: opts.read_only.unwrap_or(false), + }, + skip_flush: opts.skip_flush.unwrap_or(false), + }), + workers: TaskGroup::new(), + })) } // Communicate to Nexus that we can remove the read only parent for @@ -276,22 +271,24 @@ impl CrucibleBackend { .map_err(CrucibleError::into) } - fn spawn_workers(&self) { + async fn spawn_workers(&self) { // TODO: make this tunable? let worker_count = 8; - self.workers.extend((0..worker_count).map(|n| { - let worker_state = self.state.clone(); - let worker_acc = self - .state - .attachment - .accessor_mem(|acc_mem| { - acc_mem.child(Some(format!("crucible worker {n}"))) + self.workers + .extend((0..worker_count).map(|n| { + let worker_state = self.state.clone(); + let worker_acc = self + .state + .attachment + .accessor_mem(|acc_mem| { + acc_mem.child(Some(format!("crucible worker {n}"))) + }) + .expect("backend is attached"); + tokio::spawn(async move { + worker_state.process_loop(worker_acc).await }) - .expect("backend is attached"); - tokio::spawn( - async move { worker_state.process_loop(worker_acc).await }, - ) - })) + })) + .await; } pub async fn volume_is_active(&self) -> Result { @@ -306,17 +303,19 @@ impl block::Backend for CrucibleBackend { fn info(&self) -> DeviceInfo { self.state.info } - fn start(&self) -> anyhow::Result<()> { - let rt = tokio::runtime::Handle::current(); - rt.block_on(async move { self.state.volume.activate().await })?; - - self.state.attachment.start(); - self.spawn_workers(); - Ok(()) + fn start(&self) -> BoxFuture<'_, anyhow::Result<()>> { + Box::pin(async { + self.state.volume.activate().await?; + self.state.attachment.start(); + self.spawn_workers().await; + Ok(()) + }) } - fn stop(&self) { - self.state.attachment.stop(); - self.workers.block_until_joined(); + fn stop(&self) -> BoxFuture<'_, ()> { + Box::pin(async { + self.state.attachment.stop(); + self.workers.block_until_joined().await; + }) } } diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs index 1d1e91d52..8f7265df2 100644 --- a/lib/propolis/src/block/file.rs +++ b/lib/propolis/src/block/file.rs @@ -18,6 +18,7 @@ use crate::util::ioctl; use crate::vmm::{MappingExt, MemCtx}; use anyhow::Context; +use futures::future::BoxFuture; // XXX: completely arb for now const MAX_WORKERS: usize = 32; @@ -219,19 
+220,25 @@ impl block::Backend for FileBackend { fn info(&self) -> DeviceInfo { self.state.info } - fn start(&self) -> anyhow::Result<()> { - self.state.attachment.start(); - if let Err(e) = self.spawn_workers() { + + fn start(&self) -> BoxFuture<'_, anyhow::Result<()>> { + Box::pin(async { + self.state.attachment.start(); + if let Err(e) = self.spawn_workers() { + self.state.attachment.stop(); + self.workers.block_until_joined(); + Err(e).context("failure while spawning workers") + } else { + Ok(()) + } + }) + } + + fn stop(&self) -> BoxFuture<'_, ()> { + Box::pin(async { self.state.attachment.stop(); self.workers.block_until_joined(); - Err(e).context("failure while spawning workers") - } else { - Ok(()) - } - } - fn stop(&self) { - self.state.attachment.stop(); - self.workers.block_until_joined(); + }) } } diff --git a/lib/propolis/src/block/in_memory.rs b/lib/propolis/src/block/in_memory.rs index 17ea47f44..1eb967a20 100644 --- a/lib/propolis/src/block/in_memory.rs +++ b/lib/propolis/src/block/in_memory.rs @@ -12,6 +12,7 @@ use crate::tasks::ThreadGroup; use crate::vmm::{MemCtx, SubMapping}; use anyhow::Context; +use futures::future::BoxFuture; pub struct InMemoryBackend { state: Arc, @@ -144,22 +145,29 @@ impl block::Backend for InMemoryBackend { fn attachment(&self) -> &block::BackendAttachment { &self.state.attachment } + fn info(&self) -> block::DeviceInfo { self.state.info } - fn start(&self) -> anyhow::Result<()> { - self.state.attachment.start(); - if let Err(e) = self.spawn_workers() { + + fn start(&self) -> BoxFuture<'_, anyhow::Result<()>> { + Box::pin(async { + self.state.attachment.start(); + if let Err(e) = self.spawn_workers() { + self.state.attachment.stop(); + self.workers.block_until_joined(); + Err(e).context("failure while spawning workers") + } else { + Ok(()) + } + }) + } + + fn stop(&self) -> BoxFuture<'_, ()> { + Box::pin(async { self.state.attachment.stop(); self.workers.block_until_joined(); - Err(e).context("failure while spawning workers") - } else { - Ok(()) - } - } - fn stop(&self) { - self.state.attachment.stop(); - self.workers.block_until_joined(); + }) } } diff --git a/lib/propolis/src/block/mem_async.rs b/lib/propolis/src/block/mem_async.rs index 86cff9a1b..1feaf7e0e 100644 --- a/lib/propolis/src/block/mem_async.rs +++ b/lib/propolis/src/block/mem_async.rs @@ -7,6 +7,8 @@ use std::num::NonZeroUsize; use std::ptr::NonNull; use std::sync::Arc; +use futures::future::BoxFuture; + use crate::accessors::MemAccessor; use crate::block; use crate::tasks::TaskGroup; @@ -141,20 +143,22 @@ impl MemAsyncBackend { })) } - fn spawn_workers(&self) { - self.workers.extend((0..self.worker_count.get()).map(|n| { - let worker_state = self.work_state.clone(); - let worker_acc = self - .work_state - .attachment - .accessor_mem(|acc_mem| { - acc_mem.child(Some(format!("worker {n}"))) + async fn spawn_workers(&self) { + self.workers + .extend((0..self.worker_count.get()).map(|n| { + let worker_state = self.work_state.clone(); + let worker_acc = self + .work_state + .attachment + .accessor_mem(|acc_mem| { + acc_mem.child(Some(format!("worker {n}"))) + }) + .expect("backend is attached"); + tokio::spawn(async move { + worker_state.processing_loop(worker_acc).await }) - .expect("backend is attached"); - tokio::spawn(async move { - worker_state.processing_loop(worker_acc).await - }) - })) + })) + .await; } } @@ -209,17 +213,23 @@ impl block::Backend for MemAsyncBackend { fn info(&self) -> block::DeviceInfo { self.work_state.info } + fn attachment(&self) -> 
&block::BackendAttachment { &self.work_state.attachment } - fn start(&self) -> anyhow::Result<()> { - self.work_state.attachment.start(); - self.spawn_workers(); - Ok(()) + + fn start(&self) -> BoxFuture<'_, anyhow::Result<()>> { + Box::pin(async { + self.work_state.attachment.start(); + self.spawn_workers().await; + Ok(()) + }) } - fn stop(&self) { - self.work_state.attachment.stop(); - self.workers.block_until_joined(); + fn stop(&self) -> BoxFuture<'_, ()> { + Box::pin(async { + self.work_state.attachment.stop(); + self.workers.block_until_joined().await; + }) } } diff --git a/lib/propolis/src/block/mod.rs b/lib/propolis/src/block/mod.rs index 722076efd..6e47701b7 100644 --- a/lib/propolis/src/block/mod.rs +++ b/lib/propolis/src/block/mod.rs @@ -13,6 +13,7 @@ use crate::vmm::{MemCtx, SubMapping}; mod file; pub use file::FileBackend; +use futures::future::BoxFuture; #[cfg(feature = "crucible")] mod crucible; @@ -244,7 +245,7 @@ pub trait Backend: Send + Sync + 'static { /// /// Spawning of any tasks required to do such request processing can be done /// as part of this start-up. - fn start(&self) -> anyhow::Result<()>; + fn start(&self) -> BoxFuture<'_, anyhow::Result<()>>; /// Stop attempting to process new [Request]s from [Device] (if attached) /// @@ -253,7 +254,7 @@ pub trait Backend: Send + Sync + 'static { /// /// If any tasks were spawned as part of [Backend::start()], they should be /// brought to rest as part of this call. - fn stop(&self); + fn stop(&self) -> BoxFuture<'_, ()>; /// Attempt to detach from associated [Device] /// diff --git a/lib/propolis/src/hw/chipset/mod.rs b/lib/propolis/src/hw/chipset/mod.rs index 47f930d72..40177f674 100644 --- a/lib/propolis/src/hw/chipset/mod.rs +++ b/lib/propolis/src/hw/chipset/mod.rs @@ -9,7 +9,7 @@ use crate::intr_pins::IntrPin; pub mod i440fx; -pub trait Chipset { +pub trait Chipset: Send + Sync { fn pci_attach( &self, bdf: Bdf, diff --git a/lib/propolis/src/tasks.rs b/lib/propolis/src/tasks.rs index efcc45404..3920b82d5 100644 --- a/lib/propolis/src/tasks.rs +++ b/lib/propolis/src/tasks.rs @@ -419,37 +419,35 @@ impl Drop for CtrlHeld<'_> { /// Holds a group of tokio task [task::JoinHandle]s to be later joined as a /// group when they have all concluded. -pub struct TaskGroup(Mutex>>); +pub struct TaskGroup(tokio::sync::Mutex>>); impl TaskGroup { pub fn new() -> Self { - Self(Mutex::new(Vec::new())) + Self(tokio::sync::Mutex::new(Vec::new())) } /// Add to the group of contained tasks - pub fn extend(&self, tasks: I) + pub async fn extend(&self, tasks: I) where I: Iterator>, { - let mut guard = self.0.lock().unwrap(); + let mut guard = self.0.lock().await; guard.extend(tasks); } /// Block until all held tasks have been joined, returning any resulting /// [task::JoinError]s after doing so. 
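     /// (Despite the name, this now awaits the join handles on the calling
     /// task rather than blocking a thread on the runtime.)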
- pub fn block_until_joined(&self) -> Option> { - let mut guard = self.0.lock().unwrap(); + pub async fn block_until_joined(&self) -> Option> { + let mut guard = self.0.lock().await; let workers = std::mem::replace(&mut *guard, Vec::new()); if workers.is_empty() { return None; } - let rt = tokio::runtime::Handle::current(); - let errors = rt.block_on(async { - FuturesUnordered::from_iter(workers) - .filter_map(|res| futures::future::ready(res.err())) - .collect::>() - .await - }); + let errors = FuturesUnordered::from_iter(workers) + .filter_map(|res| futures::future::ready(res.err())) + .collect::>() + .await; + if errors.is_empty() { None } else { From e16af9ba2fe985b97e73ea08f79dead808ce83e7 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Sat, 22 Jun 2024 04:26:41 +0000 Subject: [PATCH 08/55] [WIP] cut the server over to the new VM module (semi-disaster) Getting rid of all the `block_on` calls and making everything async turns out to have created a somewhat substantial hassle: now certain locks have to be tokio locks (to avoid holding a lock over an await point), which means new routines have to be async, and some of those are in traits, so now everything is returning a `BoxFuture` and arghhh. Maybe this would be less of a hassle if we weren't using trait objects? --- bin/propolis-server/src/lib/initializer.rs | 2 +- bin/propolis-server/src/lib/lib.rs | 1 - .../src/lib/migrate/destination.rs | 38 +- bin/propolis-server/src/lib/migrate/mod.rs | 56 +- .../src/lib/migrate/preamble.rs | 5 +- bin/propolis-server/src/lib/migrate/source.rs | 93 +- bin/propolis-server/src/lib/vcpu_tasks.rs | 4 +- .../src/lib/{vm2 => vm}/guest_event.rs | 0 .../src/lib/{vm2 => vm}/lifecycle_ops.rs | 29 +- .../src/lib/{vm2 => vm}/migrate_commands.rs | 12 + bin/propolis-server/src/lib/vm/mod.rs | 1261 ++----------- .../src/lib/vm/request_queue.rs | 190 +- bin/propolis-server/src/lib/vm/services.rs | 49 + .../src/lib/vm/state_driver.rs | 1667 +++++------------ bin/propolis-server/src/lib/vm2/mod.rs | 225 --- .../src/lib/vm2/state_driver.rs | 596 ------ bin/propolis-server/src/lib/vm_old/mod.rs | 1138 +++++++++++ .../src/lib/{vm2 => vm_old}/request_queue.rs | 190 +- .../src/lib/vm_old/state_driver.rs | 1384 ++++++++++++++ 19 files changed, 3519 insertions(+), 3421 deletions(-) rename bin/propolis-server/src/lib/{vm2 => vm}/guest_event.rs (100%) rename bin/propolis-server/src/lib/{vm2 => vm}/lifecycle_ops.rs (88%) rename bin/propolis-server/src/lib/{vm2 => vm}/migrate_commands.rs (79%) create mode 100644 bin/propolis-server/src/lib/vm/services.rs delete mode 100644 bin/propolis-server/src/lib/vm2/mod.rs delete mode 100644 bin/propolis-server/src/lib/vm2/state_driver.rs create mode 100644 bin/propolis-server/src/lib/vm_old/mod.rs rename bin/propolis-server/src/lib/{vm2 => vm_old}/request_queue.rs (67%) create mode 100644 bin/propolis-server/src/lib/vm_old/state_driver.rs diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index fd95c58cb..971862562 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -186,7 +186,7 @@ impl<'a> MachineInitializer<'a> { pub fn initialize_chipset( &mut self, - event_handler: &Arc, + event_handler: &Arc, ) -> Result { let mut pci_builder = pci::topology::Builder::new(); for (name, bridge) in &self.spec.devices.pci_pci_bridges { diff --git a/bin/propolis-server/src/lib/lib.rs b/bin/propolis-server/src/lib/lib.rs index 3467e6b29..cf8e79b15 100644 --- a/bin/propolis-server/src/lib/lib.rs 
+++ b/bin/propolis-server/src/lib/lib.rs @@ -11,5 +11,4 @@ mod spec; mod stats; mod vcpu_tasks; mod vm; -mod vm2; pub mod vnc; diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index 18ba3f1ef..f77f19104 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -24,14 +24,14 @@ use crate::migrate::probes; use crate::migrate::{ Device, MigrateError, MigratePhase, MigrateRole, MigrationState, PageIter, }; -use crate::vm::{MigrateTargetCommand, VmController}; +use crate::vm::{migrate_commands::MigrateTargetCommand, ActiveVm}; use super::protocol::Protocol; /// Launches an attempt to migrate into a supplied instance using the supplied /// source connection. pub async fn migrate( - vm_controller: Arc, + vm: Arc, command_tx: tokio::sync::mpsc::Sender, conn: WebSocketStream, local_addr: SocketAddr, @@ -39,12 +39,9 @@ pub async fn migrate( ) -> Result<(), MigrateError> { let err_tx = command_tx.clone(); let mut proto = match protocol { - Protocol::RonV0 => DestinationProtocol::new( - vm_controller, - command_tx, - conn, - local_addr, - ), + Protocol::RonV0 => { + DestinationProtocol::new(vm, command_tx, conn, local_addr) + } }; if let Err(err) = proto.run().await { @@ -68,7 +65,7 @@ pub async fn migrate( struct DestinationProtocol { /// The VM controller for the instance of interest. - vm_controller: Arc, + vm: Arc, /// The channel to use to send messages to the state worker coordinating /// this migration. @@ -84,16 +81,16 @@ struct DestinationProtocol { impl DestinationProtocol { fn new( - vm_controller: Arc, + vm: Arc, command_tx: tokio::sync::mpsc::Sender, conn: WebSocketStream, local_addr: SocketAddr, ) -> Self { - Self { vm_controller, command_tx, conn, local_addr } + Self { vm, command_tx, conn, local_addr } } fn log(&self) -> &slog::Logger { - self.vm_controller.log() + self.vm.log() } async fn update_state(&mut self, state: MigrationState) { @@ -174,7 +171,7 @@ impl DestinationProtocol { }?; info!(self.log(), "Destination read Preamble: {:?}", preamble); if let Err(e) = preamble - .is_migration_compatible(self.vm_controller.instance_spec().await) + .is_migration_compatible(&*self.vm.objects().instance_spec()) { error!( self.log(), @@ -319,7 +316,8 @@ impl DestinationProtocol { info!(self.log(), "Devices: {devices:#?}"); { - let machine = self.vm_controller.machine(); + let objects = self.vm.objects(); + let machine = objects.machine(); let migrate_ctx = MigrateCtx { mem: &machine.acc_mem.access().unwrap() }; for device in devices { @@ -328,8 +326,7 @@ impl DestinationProtocol { "Applying state to device {}", device.instance_name ); - let target = self - .vm_controller + let target = objects .device_by_name(&device.instance_name) .ok_or_else(|| { MigrateError::UnknownDevice( @@ -371,7 +368,7 @@ impl DestinationProtocol { // Take a snapshot of the host hrtime/wall clock time, then adjust // time data appropriately. 
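         // (The adjustment rebases guest time data from the source host's
         // snapshot onto this host's, so guest timekeeping stays continuous
         // across the migration.)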
- let vmm_hdl = &self.vm_controller.machine().hdl.clone(); + let vmm_hdl = &self.vm.objects().machine().hdl.clone(); let (dst_hrt, dst_wc) = vmm::time::host_time_snapshot(vmm_hdl) .map_err(|e| { MigrateError::TimeData(format!( @@ -564,7 +561,8 @@ impl DestinationProtocol { } }; - self.vm_controller + self.vm + .objects() .com1() .import(&com1_history) .await @@ -639,8 +637,8 @@ impl DestinationProtocol { addr: GuestAddr, buf: &[u8], ) -> Result<(), MigrateError> { - let machine = self.vm_controller.machine(); - let memctx = machine.acc_mem.access().unwrap(); + let objects = self.vm.objects(); + let memctx = objects.machine().acc_mem.access().unwrap(); let len = buf.len(); memctx.write_from(addr, buf, len); Ok(()) diff --git a/bin/propolis-server/src/lib/migrate/mod.rs b/bin/propolis-server/src/lib/migrate/mod.rs index 5c246d0c0..d2afba1fe 100644 --- a/bin/propolis-server/src/lib/migrate/mod.rs +++ b/bin/propolis-server/src/lib/migrate/mod.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use std::net::SocketAddr; use std::sync::Arc; use bit_field::BitField; @@ -18,10 +19,8 @@ use tokio_tungstenite::tungstenite::protocol::CloseFrame; use tokio_tungstenite::{tungstenite, WebSocketStream}; use uuid::Uuid; -use crate::{ - server::{DropshotEndpointContext, VmControllerState}, - vm::{VmController, VmControllerError}, -}; +use crate::server::{DropshotEndpointContext, VmControllerState}; +use crate::vm::ActiveVm; mod codec; pub mod destination; @@ -160,16 +159,6 @@ impl From for MigrateError { } } -impl From for MigrateError { - fn from(err: VmControllerError) -> Self { - match err { - VmControllerError::AlreadyMigrationSource => { - MigrateError::MigrationAlreadyInProgress - } - _ => MigrateError::StateMachine(err.to_string()), - } - } -} impl From for MigrateError { fn from(value: MigrateStateError) -> Self { Self::DeviceState(value.to_string()) @@ -307,6 +296,15 @@ pub async fn source_start< Ok(()) } +pub(crate) struct DestinationContext< + T: AsyncRead + AsyncWrite + Unpin + Send + 'static, +> { + pub migration_id: Uuid, + pub conn: WebSocketStream, + pub local_addr: SocketAddr, + pub protocol: crate::migrate::protocol::Protocol, +} + /// Initiate a migration to the given source instance. /// /// This will attempt to open a websocket to the given source instance and @@ -315,9 +313,13 @@ pub async fn source_start< /// migration process (destination-side). 
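 /// On success, returns a [`DestinationContext`] holding the upgraded
 /// connection; the caller is expected to hand it to the VM's state driver
 /// to run the actual migration.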
pub(crate) async fn dest_initiate( rqctx: &RequestContext>, - controller: Arc, migrate_info: api::InstanceMigrateInitiateRequest, -) -> Result { +) -> Result< + DestinationContext< + tokio_tungstenite::MaybeTlsStream, + >, + MigrateError, +> { let migration_id = migrate_info.migration_id; // Create a new log context for the migration @@ -383,21 +385,13 @@ pub(crate) async fn dest_initiate( } }; let local_addr = rqctx.server.local_addr; - tokio::runtime::Handle::current() - .spawn_blocking(move || -> Result<(), MigrateError> { - // Now start using the websocket for the migration protocol - controller.request_migration_into( - migration_id, - conn, - local_addr, - selected, - )?; - Ok(()) - }) - .await - .unwrap()?; - - Ok(api::InstanceMigrateInitiateResponse { migration_id }) + + Ok(DestinationContext { + migration_id, + conn, + local_addr, + protocol: selected, + }) } // We should probably turn this into some kind of ValidatedBitmap diff --git a/bin/propolis-server/src/lib/migrate/preamble.rs b/bin/propolis-server/src/lib/migrate/preamble.rs index 8618b0449..b45a0d9ac 100644 --- a/bin/propolis-server/src/lib/migrate/preamble.rs +++ b/bin/propolis-server/src/lib/migrate/preamble.rs @@ -10,7 +10,6 @@ use propolis_api_types::instance_spec::{ VersionedInstanceSpec, }; use serde::{Deserialize, Serialize}; -use tokio::sync::MutexGuard; #[derive(Deserialize, Serialize, Debug)] pub(crate) struct Preamble { @@ -40,10 +39,8 @@ impl Preamble { pub fn is_migration_compatible( &self, - other_spec: MutexGuard<'_, VersionedInstanceSpec>, + other_spec: &InstanceSpecV0, ) -> Result<(), MigrationCompatibilityError> { - let VersionedInstanceSpec::V0(other_spec) = &*other_spec; - self.device_spec.can_migrate_devices_from(&other_spec.devices)?; let other_keys = get_spec_backend_keys(other_spec); if self.backend_keys.len() != other_keys.len() { diff --git a/bin/propolis-server/src/lib/migrate/source.rs b/bin/propolis-server/src/lib/migrate/source.rs index 2ffd12a62..99b2f38a0 100644 --- a/bin/propolis-server/src/lib/migrate/source.rs +++ b/bin/propolis-server/src/lib/migrate/source.rs @@ -9,6 +9,7 @@ use propolis::migrate::{ MigrateCtx, MigrateStateError, Migrator, PayloadOutputs, }; use propolis::vmm; +use propolis_api_types::instance_spec::VersionedInstanceSpec; use slog::{debug, error, info, trace}; use std::collections::HashMap; use std::convert::TryInto; @@ -28,7 +29,11 @@ use crate::migrate::{ Device, DevicePayload, MigrateError, MigratePhase, MigrateRole, MigrationState, PageIter, }; -use crate::vm::{MigrateSourceCommand, MigrateSourceResponse, VmController}; + +use crate::vm::migrate_commands::{ + MigrateSourceCommand, MigrateSourceResponse, +}; +use crate::vm::ActiveVm; /// Specifies which pages should be offered during a RAM transfer phase. /// @@ -110,7 +115,7 @@ enum RamOfferDiscipline { } pub async fn migrate( - vm_controller: Arc, + vm: Arc, command_tx: tokio::sync::mpsc::Sender, response_rx: tokio::sync::mpsc::Receiver, conn: WebSocketStream, @@ -119,7 +124,7 @@ pub async fn migrate( let err_tx = command_tx.clone(); let mut proto = match protocol { Protocol::RonV0 => { - SourceProtocol::new(vm_controller, command_tx, response_rx, conn) + SourceProtocol::new(vm, command_tx, response_rx, conn).await } }; @@ -143,11 +148,8 @@ pub async fn migrate( // See the lengthy comment on `RamOfferDiscipline` above for more // details about what's going on here. 
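     // (`dirt` pairs each tracked guest address with a page bitmap, one bit
     // per page, so a failed attempt can re-mark whole spans of guest memory
     // at once.)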
     for (&GuestAddr(gpa), dirtiness) in proto.dirt.iter().flatten() {
-        if let Err(e) = proto
-            .vm_controller
-            .machine()
-            .hdl
-            .set_dirty_pages(gpa, dirtiness)
+        if let Err(e) =
+            proto.vm.objects().machine().hdl.set_dirty_pages(gpa, dirtiness)
         {
             // Bad news! Our attempt to re-set the dirty bit on these
             // pages has failed! Thus, subsequent migration attempts
@@ -156,10 +158,10 @@
             // phase.
             //
             // Record that now so we never try to do this again.
-            proto
-                .vm_controller
-                .migration_src_state()
-                .has_redirtying_ever_failed = true;
+            //
+            // TODO(gjc)
+            // proto.vm.migration_src_state().has_redirtying_ever_failed =
+            // true;
             error!(
                 proto.log(),
                 "failed to restore dirty bits: {e}";
@@ -187,15 +189,15 @@
 #[derive(Default)]
 pub(crate) struct PersistentState {
     /// Set if we were unable to re-set dirty bits on guest pages after a failed
-    /// migration attempt. If this occurs, we can no longer offer only dirty pages
-    /// in a subsequent migration attempt, as some pages which should be marked as
-    /// dirty may not be.
+    /// migration attempt. If this occurs, we can no longer offer only dirty
+    /// pages in a subsequent migration attempt, as some pages which should be
+    /// marked as dirty may not be.
     has_redirtying_ever_failed: bool,
 }
 
 struct SourceProtocol<T: AsyncRead + AsyncWrite + Unpin + Send> {
     /// The VM controller for the instance of interest.
-    vm_controller: Arc<VmController>,
+    vm: Arc<ActiveVm>,
 
     /// The channel to use to send messages to the state worker coordinating
     /// this migration.
@@ -234,33 +236,37 @@ const PAGE_BITMAP_SIZE: usize = 4096;
 type PageBitmap = [u8; PAGE_BITMAP_SIZE];
 
 impl<T: AsyncRead + AsyncWrite + Unpin + Send> SourceProtocol<T> {
-    fn new(
-        vm_controller: Arc<VmController>,
+    async fn new(
+        vm: Arc<ActiveVm>,
         command_tx: tokio::sync::mpsc::Sender<MigrateSourceCommand>,
         response_rx: tokio::sync::mpsc::Receiver<MigrateSourceResponse>,
         conn: WebSocketStream<T>,
     ) -> Self {
         let dirt = {
-            let can_npt_operate = vm_controller.machine().hdl.can_npt_operate();
-            let has_redirtying_ever_failed =
-                vm_controller.migration_src_state().has_redirtying_ever_failed;
-            if can_npt_operate && !has_redirtying_ever_failed {
+            let can_npt_operate = vm.objects().machine().hdl.can_npt_operate();
+
+            // TODO(gjc) the pre-pause offer phase needs to look at whether
+            // redirtying has previously failed.
This is done over the command + // channel (command_tx/response_rx) but that can't be used here + // because the state driver isn't actually coordinating with + // anything yet (the point of this function is to create the objects + // that need to be stuffed into a message to send to the state + // driver) + if can_npt_operate { Some(Default::default()) } else { info!( - vm_controller.log(), - "guest pages are not redirtyable; will offer all pages in pre-pause RAM push"; - "can_npt_operate" => can_npt_operate, - "has_redirtying_ever_failed" => has_redirtying_ever_failed + vm.log(), + "NPT operations not supported, will offer all pages pre-push"; ); None } }; - Self { vm_controller, command_tx, response_rx, conn, dirt } + Self { vm, command_tx, response_rx, conn, dirt } } fn log(&self) -> &slog::Logger { - self.vm_controller.log() + self.vm.log() } async fn update_state(&mut self, state: MigrationState) { @@ -316,8 +322,9 @@ impl SourceProtocol { async fn sync(&mut self) -> Result<(), MigrateError> { self.update_state(MigrationState::Sync).await; - let preamble = - Preamble::new(self.vm_controller.instance_spec().await.clone()); + let preamble = Preamble::new(VersionedInstanceSpec::V0( + self.vm.objects().instance_spec().clone(), + )); let s = ron::ser::to_string(&preamble) .map_err(codec::ProtocolError::from)?; self.send_msg(codec::Message::Serialized(s)).await?; @@ -446,7 +453,7 @@ impl SourceProtocol { // says to offer all pages. This ensures that pages that are // transferred now and not touched again will not be offered again // by a subsequent phase. - self.track_dirty(GuestAddr(gpa), &mut bits)?; + self.track_dirty(GuestAddr(gpa), &mut bits).await?; match offer_discipline { RamOfferDiscipline::OfferAll => { @@ -536,12 +543,13 @@ impl SourceProtocol { self.update_state(MigrationState::Device).await; let mut device_states = vec![]; { - let machine = self.vm_controller.machine(); + let objects = self.vm.objects(); + let machine = objects.machine(); let migrate_ctx = MigrateCtx { mem: &machine.acc_mem.access().unwrap() }; // Collect together the serialized state for all the devices - self.vm_controller.for_each_device_fallible(|name, devop| { + self.vm.for_each_device_fallible(|name, devop| { let mut dev = Device { instance_name: name.to_string(), payload: Vec::new(), @@ -582,7 +590,7 @@ impl SourceProtocol { } } Ok(()) - })?; + }).await?; } info!(self.log(), "Device States: {device_states:#?}"); @@ -599,7 +607,7 @@ impl SourceProtocol { // Read and send over the time data async fn time_data(&mut self) -> Result<(), MigrateError> { - let vmm_hdl = &self.vm_controller.machine().hdl.clone(); + let vmm_hdl = &self.vm.objects().machine().hdl.clone(); let vm_time_data = vmm::time::export_time_data(vmm_hdl).map_err(|e| { MigrateError::TimeData(format!( @@ -638,7 +646,7 @@ impl SourceProtocol { _ => return Err(MigrateError::UnexpectedMessage), }; let com1_history = - self.vm_controller.com1().export_history(remote_addr).await?; + self.vm.objects().com1().export_history(remote_addr).await?; self.send_msg(codec::Message::Serialized(com1_history)).await?; self.read_ok().await } @@ -671,7 +679,7 @@ impl SourceProtocol { let mut bits = [0u8; PAGE_BITMAP_SIZE]; let step = bits.len() * 8 * PAGE_SIZE; for gpa in (vmm_range.start().0..vmm_range.end().0).step_by(step) { - self.track_dirty(GuestAddr(gpa), &mut bits).unwrap(); + self.track_dirty(GuestAddr(gpa), &mut bits).await.unwrap(); let pages_left_behind = BitSlice::<_, Lsb0>::from_slice(&bits).count_ones() as u64; assert_eq!( @@ -748,17 +756,19 @@ 
impl SourceProtocol {
     async fn vmm_ram_bounds(
         &mut self,
     ) -> Result<RangeInclusive<GuestAddr>, MigrateError> {
-        let machine = self.vm_controller.machine();
+        let objects = self.vm.objects();
+        let machine = objects.machine();
         let memctx = machine.acc_mem.access().unwrap();
         memctx.mem_bounds().ok_or(MigrateError::InvalidInstanceState)
     }
 
-    fn track_dirty(
+    async fn track_dirty(
         &mut self,
         start_gpa: GuestAddr,
         bits: &mut [u8],
     ) -> Result<(), MigrateError> {
-        self.vm_controller
+        self.vm
+            .objects()
             .machine()
             .hdl
             .track_dirty_pages(start_gpa.0, bits)
@@ -770,7 +780,8 @@ impl SourceProtocol {
         addr: GuestAddr,
         buf: &mut [u8],
     ) -> Result<(), MigrateError> {
-        let machine = self.vm_controller.machine();
+        let objects = self.vm.objects();
+        let machine = objects.machine();
         let memctx = machine.acc_mem.access().unwrap();
         let len = buf.len();
         memctx.direct_read_into(addr, buf, len);
diff --git a/bin/propolis-server/src/lib/vcpu_tasks.rs b/bin/propolis-server/src/lib/vcpu_tasks.rs
index d9af3adda..fb4917ec6 100644
--- a/bin/propolis-server/src/lib/vcpu_tasks.rs
+++ b/bin/propolis-server/src/lib/vcpu_tasks.rs
@@ -40,7 +40,7 @@ pub(crate) trait VcpuTaskController: Send {
 impl VcpuTasks {
     pub(crate) fn new(
         machine: &propolis::Machine,
-        event_handler: Arc<super::vm::SharedVmState>,
+        event_handler: Arc<dyn super::vm::guest_event::GuestEventHandler>,
         log: slog::Logger,
     ) -> Result<Self, VcpuTaskError> {
         let generation = Arc::new(AtomicUsize::new(0));
@@ -72,7 +72,7 @@ impl VcpuTasks {
     fn vcpu_loop(
         vcpu: &Vcpu,
         task: propolis::tasks::TaskHdl,
-        event_handler: Arc<super::vm::SharedVmState>,
+        event_handler: Arc<dyn super::vm::guest_event::GuestEventHandler>,
         generation: Arc<AtomicUsize>,
         log: slog::Logger,
     ) {
diff --git a/bin/propolis-server/src/lib/vm2/guest_event.rs b/bin/propolis-server/src/lib/vm/guest_event.rs
similarity index 100%
rename from bin/propolis-server/src/lib/vm2/guest_event.rs
rename to bin/propolis-server/src/lib/vm/guest_event.rs
diff --git a/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs
similarity index 88%
rename from bin/propolis-server/src/lib/vm2/lifecycle_ops.rs
rename to bin/propolis-server/src/lib/vm/lifecycle_ops.rs
index 4e16c5ece..88245be4f 100644
--- a/bin/propolis-server/src/lib/vm2/lifecycle_ops.rs
+++ b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs
@@ -54,26 +54,30 @@ pub(super) trait VmLifecycle: Send + Sync {
 impl VmLifecycle for super::ActiveVm {
     fn pause_vm(&self) {
         info!(self.log, "pausing kernel VMM resources");
-        self.objects.machine.hdl.pause().expect("VM_PAUSE should succeed");
+        self.objects().machine().hdl.pause().expect("VM_PAUSE should succeed");
     }
 
     fn resume_vm(&self) {
         info!(self.log, "resuming kernel VMM resources");
-        self.objects.machine.hdl.resume().expect("VM_RESUME should succeed");
+        self.objects()
+            .machine()
+            .hdl
+            .resume()
+            .expect("VM_RESUME should succeed");
     }
 
     fn reset_devices_and_machine(&self) {
-        self.objects.for_each_device(|name, dev| {
+        self.for_each_device(|name, dev| {
             info!(self.log, "sending reset request to {}", name);
             dev.reset();
         });
 
-        self.objects.machine.reinitialize().unwrap();
+        self.objects().machine().reinitialize().unwrap();
     }
 
     fn start_devices(&self) -> BoxFuture<'_, anyhow::Result<()>> {
         Box::pin(async {
-            self.objects.for_each_device_fallible(|name, dev| {
+            self.objects().for_each_device_fallible(|name, dev| {
                 info!(self.log, "sending startup complete to {}", name);
                 let res = dev.start();
                 if let Err(e) = &res {
@@ -95,7 +99,8 @@ impl VmLifecycle for super::ActiveVm {
     }
 
     fn pause_devices(&self) -> BoxFuture<'_, ()> {
-        self.objects.for_each_device(|name, dev| {
+        let objects = self.objects();
+        objects.for_each_device(|name, dev| {
             info!(self.log,
"sending pause request to {}", name); dev.pause(); }); @@ -121,8 +126,7 @@ impl VmLifecycle for super::ActiveVm { } info!(self.log, "waiting for devices to pause"); - let mut stream: FuturesUnordered<_> = self - .objects + let mut stream: FuturesUnordered<_> = objects .lifecycle_components .iter() .map(|(name, dev)| { @@ -149,19 +153,20 @@ impl VmLifecycle for super::ActiveVm { } fn resume_devices(&self) { - self.objects.for_each_device(|name, dev| { + self.objects().for_each_device(|name, dev| { info!(self.log, "sending resume request to {}", name); dev.resume(); }) } fn halt_devices(&self) { - self.objects.for_each_device(|name, dev| { + let objects = self.objects(); + objects.for_each_device(|name, dev| { info!(self.log, "sending halt request to {}", name); dev.halt(); }); - for (name, backend) in self.objects.block_backends.iter() { + for (name, backend) in objects.block_backends.iter() { info!(self.log, "stopping and detaching block backend {}", name); backend.stop(); if let Err(err) = backend.detach() { @@ -173,7 +178,7 @@ impl VmLifecycle for super::ActiveVm { } fn reset_vcpu_state(&self) { - for vcpu in self.objects.machine.vcpus.iter() { + for vcpu in self.objects().machine().vcpus.iter() { info!(self.log, "resetting vCPU {}", vcpu.id); vcpu.activate().unwrap(); vcpu.reboot_state().unwrap(); diff --git a/bin/propolis-server/src/lib/vm2/migrate_commands.rs b/bin/propolis-server/src/lib/vm/migrate_commands.rs similarity index 79% rename from bin/propolis-server/src/lib/vm2/migrate_commands.rs rename to bin/propolis-server/src/lib/vm/migrate_commands.rs index f448364b9..08e564690 100644 --- a/bin/propolis-server/src/lib/vm2/migrate_commands.rs +++ b/bin/propolis-server/src/lib/vm/migrate_commands.rs @@ -22,6 +22,14 @@ pub enum MigrateSourceCommand { /// Update the externally-visible migration state. UpdateState(propolis_api_types::MigrationState), + /// Determine whether a previous attempt to restore the VM's dirty bitmap + /// has failed. + QueryRedirtyingFailed, + + /// Record that the guest's dirty page bitmap may be inconsistent so that + /// future attempts to migrate out transmit all pages. + RedirtyingFailed, + /// Pause the instance's devices and CPUs. Pause, } @@ -30,6 +38,10 @@ pub enum MigrateSourceCommand { /// response to a previous command. #[derive(Debug)] pub enum MigrateSourceResponse { + /// A previous migration out has (or has not) failed to restore the VM's + /// dirty bitmap. + RedirtyingFailed(bool), + /// A request to pause completed with the attached result. Pause(Result<(), std::io::Error>), } diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 8d47392db..a8680b0ac 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -2,1181 +2,292 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Implements the VM controller: the public interface to a single Propolis -//! instance. -//! -//! The VM controller serves two purposes. First, it collects all of the objects -//! describing a single Propolis VM (the Propolis `Instance` itself, the -//! instance's spec, direct references to components in the instance, etc.). -//! Second, it records requests and events that affect how a VM moves through -//! the stages of its lifecycle, i.e. how and when it boots, reboots, migrates, -//! and stops. -//! -//! Each VM controller has a single "state driver" thread that processes -//! 
requests and events recorded by its controller and acts on the underlying -//! Propolis instance to move the VM into the appropriate states. Doing this -//! work on a single thread ensures that a VM can only undergo one state change -//! at a time, that there are no races to start/pause/resume/halt a VM's -//! components, and that there is a single source of truth as to a VM's current -//! state (and as to the steps that are required to move it to a different -//! state). Operations like live migration that require components to pause and -//! resume coordinate directly with the state driver thread. -//! -//! The VM controller's public API allows a Propolis Dropshot server to query a -//! VM's current state, to ask to change that state, and to obtain references to -//! objects in a VM as needed to handle other requests made of the server (e.g. -//! requests to connect to an instance's serial console or to take a disk -//! snapshot). The controller also implements traits that allow a VM's -//! components to raise events for the state driver to process (e.g. a request -//! from a VM's chipset to reboot or halt the VM). +//! This module implements the `Vm` wrapper type that encapsulates a single +//! instance on behalf of a Propolis server. -use crate::migrate; - -use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt}; use std::{ - collections::{BTreeMap, VecDeque}, - fmt::Debug, - net::SocketAddr, - pin::Pin, - sync::{Arc, Condvar, Mutex, MutexGuard, Weak}, - task::{Context, Poll}, - thread::JoinHandle, - time::Duration, + collections::BTreeMap, + sync::{Arc, RwLock, RwLockReadGuard, Weak}, }; -use anyhow::Context as AnyhowContext; use oximeter::types::ProducerRegistry; use propolis::{ hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart}, vmm::Machine, }; use propolis_api_types::{ - instance_spec::VersionedInstanceSpec, - InstanceMigrateStatusResponse as ApiMigrateStatusResponse, - InstanceMigrationStatus as ApiMigrationStatus, InstanceProperties, - InstanceState as ApiInstanceState, - InstanceStateMonitorResponse as ApiMonitoredState, - InstanceStateRequested as ApiInstanceStateRequested, - MigrationState as ApiMigrationState, -}; -use slog::{debug, error, info, Logger}; -use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::sync::oneshot; -use tokio_tungstenite::WebSocketStream; -use uuid::Uuid; - -use crate::{ - initializer::{ - build_instance, MachineInitializer, MachineInitializerState, - }, - migrate::{MigrateError, MigrateRole}, - serial::Serial, - server::{BlockBackendMap, CrucibleBackendMap, DeviceMap, StaticConfig}, - vm::request_queue::ExternalRequest, + instance_spec::v0::InstanceSpecV0, InstanceProperties, + InstanceStateMonitorResponse, }; -use self::request_queue::{ExternalRequestQueue, RequestDeniedReason}; -pub use nexus_client::Client as NexusClient; +use crate::serial::Serial; +pub(crate) mod guest_event; +mod lifecycle_ops; +pub(crate) mod migrate_commands; mod request_queue; +mod services; mod state_driver; -/// Minimum thread count for the Tokio runtime driving the VMM tasks -const VMM_MIN_RT_THREADS: usize = 8; -const VMM_BASE_RT_THREADS: usize = 4; - -#[derive(Debug, Error)] -pub enum VmControllerError { - #[error("The requested operation requires an active instance")] - InstanceNotActive, - - #[error("The instance has a pending request to halt")] - InstanceHaltPending, - - #[error("Instance is already a migration source")] - AlreadyMigrationSource, - - #[error("Cannot request state {0:?} while migration is in progress")] - 
InvalidRequestForMigrationSource(ApiInstanceStateRequested), - - #[error("A migration into this instance is in progress")] - MigrationTargetInProgress, - - #[error("Another live migration into this instance already occurred")] - MigrationTargetPreviouslyCompleted, - - #[error("The most recent attempt to migrate into this instance failed")] - MigrationTargetFailed, - - #[error("Can't migrate into a running instance")] - TooLateToBeMigrationTarget, - - #[error("Failed to queue requested state change: {0}")] - StateChangeRequestDenied(#[from] request_queue::RequestDeniedReason), - - #[error("Migration protocol error: {0:?}")] - MigrationProtocolError(#[from] MigrateError), - - #[error("Failed to start vCPU workers")] - VcpuWorkerCreationFailed(#[from] super::vcpu_tasks::VcpuTaskError), - - #[error("Failed to create state worker: {0}")] - StateWorkerCreationFailed(std::io::Error), +pub(crate) type LifecycleMap = + BTreeMap>; +pub(crate) type BlockBackendMap = + BTreeMap>; +pub(crate) type CrucibleBackendMap = + BTreeMap>; + +type InstanceStateTx = tokio::sync::watch::Sender< + propolis_api_types::InstanceStateMonitorResponse, +>; +type InstanceStateRx = tokio::sync::watch::Receiver< + propolis_api_types::InstanceStateMonitorResponse, +>; + +#[derive(Debug, thiserror::Error)] +pub(crate) enum VmError { + #[error("VM already initialized")] + AlreadyInitialized, + + #[error("VM initialization failed")] + InitializationFailed(#[source] anyhow::Error), } -impl From for dropshot::HttpError { - fn from(vm_error: VmControllerError) -> Self { - use dropshot::HttpError; - match vm_error { - VmControllerError::AlreadyMigrationSource - | VmControllerError::InvalidRequestForMigrationSource(_) - | VmControllerError::MigrationTargetInProgress - | VmControllerError::MigrationTargetFailed - | VmControllerError::TooLateToBeMigrationTarget - | VmControllerError::StateChangeRequestDenied(_) - | VmControllerError::InstanceNotActive - | VmControllerError::InstanceHaltPending - | VmControllerError::MigrationTargetPreviouslyCompleted => { - HttpError::for_status( - Some(format!("Instance operation failed: {}", vm_error)), - http::status::StatusCode::FORBIDDEN, - ) - } - VmControllerError::MigrationProtocolError(_) - | VmControllerError::VcpuWorkerCreationFailed(_) - | VmControllerError::StateWorkerCreationFailed(_) => { - HttpError::for_internal_error(format!( - "Instance operation failed: {}", - vm_error - )) - } - } - } +/// The top-level VM wrapper type. Callers are expected to wrap this in an +/// `Arc`. +pub(crate) struct Vm { + /// A reference to the VM state machine. + state: RwLock, } -/// A collection of objects that describe an instance and references to that -/// instance and its components. -pub(crate) struct VmObjects { - /// The underlying Propolis `Machine` this controller is managing. - machine: Option, - - /// The instance properties supplied when this controller was created. - properties: InstanceProperties, - - /// The instance spec used to create this controller's VM. - spec: tokio::sync::Mutex, - - /// Map of the emulated devices associated with the VM - devices: DeviceMap, - - /// Map of the instance's active block backends. +struct VmObjects { + instance_spec: InstanceSpecV0, + machine: Machine, + lifecycle_components: LifecycleMap, block_backends: BlockBackendMap, - - /// Map of the instance's active Crucible backends. crucible_backends: CrucibleBackendMap, - - /// A wrapper around the instance's first COM port, suitable for providing a - /// connection to a guest's serial console. 
com1: Arc>, - - /// An optional reference to the guest's framebuffer. framebuffer: Option>, - - /// A reference to the guest's PS/2 controller. ps2ctrl: Arc, - - /// A notification receiver to which the state worker publishes the most - /// recent instance state information. - monitor_rx: tokio::sync::watch::Receiver, -} - -/// A message sent from a live migration destination task to update the -/// externally visible state of the migration attempt. -#[derive(Clone, Copy, Debug)] -pub enum MigrateTargetCommand { - /// Update the externally-visible migration state. - UpdateState(ApiMigrationState), } -/// A message sent from a live migration driver to the state worker, asking it -/// to act on source instance components on the task's behalf. -#[derive(Clone, Copy, Debug)] -pub enum MigrateSourceCommand { - /// Update the externally-visible migration state. - UpdateState(ApiMigrationState), - - /// Pause the instance's devices and CPUs. - Pause, -} - -/// A message sent from the state worker to the live migration driver in -/// response to a previous command. -#[derive(Debug)] -pub enum MigrateSourceResponse { - /// A request to pause completed with the attached result. - Pause(Result<(), std::io::Error>), -} - -/// An event raised by a migration task that must be handled by the state -/// worker. -#[derive(Debug)] -enum MigrateTaskEvent { - /// The task completed with the associated result. - TaskExited(Result<(), MigrateError>), - - /// The task sent a command requesting work. - Command(T), -} - -/// An event raised by some component in the instance (e.g. a vCPU or the -/// chipset) that the state worker must handle. -/// -/// The vCPU-sourced events carry a time element (duration since VM boot) as -/// emitted by the kernel vmm. This is used to deduplicate events when all -/// vCPUs running in-kernel are kicked out for the suspend state. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -enum GuestEvent { - /// VM entered halt state - VcpuSuspendHalt(Duration), - /// VM entered reboot state - VcpuSuspendReset(Duration), - /// vCPU encounted triple-fault - VcpuSuspendTripleFault(i32, Duration), - /// Chipset signaled halt condition - ChipsetHalt, - /// Chipset signaled reboot condition - ChipsetReset, -} - -/// Shared instance state guarded by the controller's state mutex. This state is -/// accessed from the controller API and the VM's state worker. -#[derive(Debug)] -struct SharedVmStateInner { - external_request_queue: ExternalRequestQueue, - - /// The state worker's queue of unprocessed events from guest devices. - guest_event_queue: VecDeque, - - /// The expected ID of the next live migration this instance will - /// participate in (either in or out). If this is `Some`, external callers - /// who query migration state will observe that a live migration is in - /// progress even if the state driver has yet to pick up the live migration - /// tasks from its queue. 
- pending_migration_id: Option<(Uuid, MigrateRole)>, -} - -impl SharedVmStateInner { - fn new(parent_log: &Logger) -> Self { - let queue_log = - parent_log.new(slog::o!("component" => "external_request_queue")); - Self { - external_request_queue: ExternalRequestQueue::new(queue_log), - guest_event_queue: VecDeque::new(), - pending_migration_id: None, - } +impl VmObjects { + pub(crate) fn instance_spec(&self) -> &InstanceSpecV0 { + &self.instance_spec } -} -#[derive(Debug)] -pub(crate) struct SharedVmState { - inner: Mutex, - cv: Condvar, -} - -/// A VM controller: a wrapper around a Propolis instance that supplies the -/// functions needed for the Propolis server to implement its own API. -pub struct VmController { - /// A collection of objects that don't change once an instance is ensured: - /// the instance itself, a description of it, and convenience references to - /// some of its members (used to avoid rummaging through the instance's - /// inventory). - vm_objects: VmObjects, - - /// A wrapper for the runtime state of this instance, managed by the state - /// worker thread. This also serves as a sink for hardware events (e.g. from - /// vCPUs and the chipset), so it is wrapped in an Arc so that it can be - /// shared with those events' sources. - worker_state: Arc, - - /// A handle to the state worker thread for this instance. - worker_thread: Mutex< - Option>>, - >, - - /// This controller's logger. - log: Logger, - - /// The Tokio runtime in which VMM-related processing is to be handled. - /// - /// This includes things such as device emulation, (block) backend - /// processing, and migration workloads. It is held in an [Option] only to - /// facilitate runtime shutdown when the [VmController] is dropped. - vmm_runtime: Option, - - /// Migration source state persisted across multiple migration attempts. - migration_src_state: Mutex, - - /// A weak reference to this controller, suitable for upgrading and passing - /// to tasks the controller spawns. - this: Weak, -} - -impl SharedVmState { - fn new(parent_log: &Logger) -> Self { - Self { - inner: Mutex::new(SharedVmStateInner::new(parent_log)), - cv: Condvar::new(), - } + pub(crate) fn machine(&self) -> &Machine { + &self.machine } - fn queue_external_request( + pub(crate) fn device_by_name( &self, - request: ExternalRequest, - ) -> Result<(), RequestDeniedReason> { - let mut inner = self.inner.lock().unwrap(); - let result = inner.external_request_queue.try_queue(request); - if result.is_ok() { - self.cv.notify_one(); - } - result - } - - fn wait_for_next_event(&self) -> StateDriverEvent { - let guard = self.inner.lock().unwrap(); - let mut guard = self - .cv - .wait_while(guard, |i| { - i.external_request_queue.is_empty() - && i.guest_event_queue.is_empty() - }) - .unwrap(); - - if let Some(guest_event) = guard.guest_event_queue.pop_front() { - StateDriverEvent::Guest(guest_event) - } else { - StateDriverEvent::External( - guard.external_request_queue.pop_front().unwrap(), - ) - } - } - - /// Add a guest event to the queue, so long as it does not appear to be a - /// duplicate of an existing event. 
- fn enqueue_guest_event(&self, event: GuestEvent) { - let mut inner = self.inner.lock().unwrap(); - if !inner.guest_event_queue.iter().any(|ev| *ev == event) { - // Only queue event if nothing else in the queue is a direct match - inner.guest_event_queue.push_back(event); - self.cv.notify_one(); - } - } - - pub fn suspend_halt_event(&self, when: Duration) { - self.enqueue_guest_event(GuestEvent::VcpuSuspendHalt(when)); - } - - pub fn suspend_reset_event(&self, when: Duration) { - self.enqueue_guest_event(GuestEvent::VcpuSuspendReset(when)); + name: &str, + ) -> Option> { + self.lifecycle_components.get(name).cloned() } - pub fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration) { - self.enqueue_guest_event(GuestEvent::VcpuSuspendTripleFault( - vcpu_id, when, - )); + pub(crate) fn com1(&self) -> &Arc> { + &self.com1 } - pub fn unhandled_vm_exit( + fn for_each_device( &self, - vcpu_id: i32, - exit: propolis::exits::VmExitKind, + mut func: impl FnMut(&str, &Arc), ) { - panic!("vCPU {}: Unhandled VM exit: {:?}", vcpu_id, exit); - } - - pub fn io_error_event(&self, vcpu_id: i32, error: std::io::Error) { - panic!("vCPU {}: Unhandled vCPU error: {}", vcpu_id, error); - } - - pub fn clear_pending_migration(&self) { - let mut inner = self.inner.lock().unwrap(); - inner.pending_migration_id = None; - } -} - -/// Functions called by a Propolis chipset to notify another component that an -/// event occurred. -pub trait ChipsetEventHandler: Send + Sync { - fn chipset_halt(&self); - fn chipset_reset(&self); -} - -impl ChipsetEventHandler for SharedVmState { - fn chipset_halt(&self) { - self.enqueue_guest_event(GuestEvent::ChipsetHalt); - } - - fn chipset_reset(&self) { - self.enqueue_guest_event(GuestEvent::ChipsetReset); - } -} - -impl VmController { - #[allow(clippy::too_many_arguments)] - pub fn new( - instance_spec: VersionedInstanceSpec, - properties: InstanceProperties, - &StaticConfig { vm: ref toml_config, use_reservoir, .. }: &StaticConfig, - producer_registry: Option, - nexus_client: Option, - log: Logger, - stop_ch: oneshot::Sender<()>, - ) -> anyhow::Result> { - let vmm_rt = Self::spawn_runtime(&properties)?; - - // All subsequent work should be run under our VMM runtime - let _rt_guard = vmm_rt.enter(); - - let bootrom = &toml_config.bootrom; - info!(log, "initializing new VM"; - "spec" => #?instance_spec, - "properties" => #?properties, - "use_reservoir" => use_reservoir, - "bootrom" => %bootrom.display()); - - let vmm_log = log.new(slog::o!("component" => "vmm")); - - // Set up the 'shell' instance into which the rest of this routine will - // add components. - let VersionedInstanceSpec::V0(v0_spec) = &instance_spec; - let machine = build_instance( - &properties.vm_name(), - v0_spec, - use_reservoir, - vmm_log, - )?; - - // Create the state monitor channel and the worker state struct that - // depends on it. The state struct can then be passed to device - // initialization as an event sink. - let (monitor_tx, monitor_rx) = - tokio::sync::watch::channel(ApiMonitoredState { - gen: 0, - state: ApiInstanceState::Creating, - migration: ApiMigrateStatusResponse { - migration_in: None, - migration_out: None, - }, - }); - - let worker_state = Arc::new(SharedVmState::new(&log)); - - // Create and initialize devices in the new instance. 
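For orientation: in the new architecture none of this inline construction survives in place; the server is expected to package its knobs into the EnsureOptions struct added in the new vm/mod.rs below and call Vm::ensure, which hands the build work to the state driver. A hypothetical call site, with variable names that are illustrative rather than taken from this patch:

// Hypothetical caller-side sketch; `toml_config`, `use_reservoir`,
// `producer_registry`, `nexus_client`, and `ensure_request` stand in for
// the server's actual state.
let vm = crate::vm::Vm::new();
let options = crate::vm::EnsureOptions {
    toml_config,
    use_reservoir,
    oximeter_registry: producer_registry,
    nexus_client,
};
let ensure_response = vm.ensure(log.clone(), ensure_request, options).await?;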
- let mut init = MachineInitializer { - log: log.clone(), - machine: &machine, - devices: DeviceMap::new(), - block_backends: BlockBackendMap::new(), - crucible_backends: CrucibleBackendMap::new(), - spec: v0_spec, - properties: &properties, - toml_config, - producer_registry, - state: MachineInitializerState::default(), - }; - - init.initialize_rom(bootrom.as_path())?; - let chipset = init.initialize_chipset( - &(worker_state.clone() as Arc), - )?; - init.initialize_rtc(&chipset)?; - init.initialize_hpet()?; - - let com1 = Arc::new(init.initialize_uart(&chipset)?); - let ps2ctrl = init.initialize_ps2(&chipset)?; - init.initialize_qemu_debug_port()?; - init.initialize_qemu_pvpanic((&properties).into())?; - init.initialize_network_devices(&chipset)?; - - #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&toml_config.devices)?; - #[cfg(feature = "omicron-build")] - info!( - log, - "`omicron-build` feature enabled, ignoring any test devices" - ); - - #[cfg(feature = "falcon")] - init.initialize_softnpu_ports(&chipset)?; - #[cfg(feature = "falcon")] - init.initialize_9pfs(&chipset)?; - init.initialize_storage_devices(&chipset, nexus_client)?; - let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; - init.initialize_cpus()?; - let vcpu_tasks = super::vcpu_tasks::VcpuTasks::new( - &machine, - worker_state.clone(), - log.new(slog::o!("component" => "vcpu_tasks")), - )?; - - let MachineInitializer { - devices, - block_backends, - crucible_backends, - .. - } = init; - - // The instance is fully set up; pass it to the new controller. - let shared_state_for_worker = worker_state.clone(); - let rt_hdl = vmm_rt.handle().clone(); - let controller = Arc::new_cyclic(|this| Self { - vm_objects: VmObjects { - machine: Some(machine), - properties, - spec: tokio::sync::Mutex::new(instance_spec), - devices, - block_backends, - crucible_backends, - com1, - framebuffer: Some(ramfb), - ps2ctrl, - monitor_rx, - }, - worker_state, - worker_thread: Mutex::new(None), - migration_src_state: Default::default(), - log: log.new(slog::o!("component" => "vm_controller")), - vmm_runtime: Some(vmm_rt), - this: this.clone(), - }); - - // Now that the controller exists, launch the state worker that will - // drive state transitions for this instance. When the VM halts, the - // worker will exit and drop its reference to the controller. - let ctrl_for_worker = controller.clone(); - let log_for_worker = - log.new(slog::o!("component" => "vm_state_worker")); - let worker_thread = std::thread::Builder::new() - .name("vm_state_worker".to_string()) - .spawn(move || { - let driver = state_driver::StateDriver::new( - rt_hdl, - ctrl_for_worker, - shared_state_for_worker, - vcpu_tasks, - log_for_worker, - monitor_tx, - ); - - let monitor_tx = driver.run_state_worker(); - - // Signal back to the server state once the worker has exited. - let _ = stop_ch.send(()); - monitor_tx - }) - .map_err(VmControllerError::StateWorkerCreationFailed)?; - - *controller.worker_thread.lock().unwrap() = Some(worker_thread); - Ok(controller) - } - - pub fn properties(&self) -> &InstanceProperties { - &self.vm_objects.properties - } - - pub fn machine(&self) -> &Machine { - // Unwrap safety: The machine is created when the controller is created - // and removed only when the controller is dropped. 
- self.vm_objects - .machine - .as_ref() - .expect("VM controller always has a valid machine") - } - - pub(crate) fn migration_src_state( - &self, - ) -> MutexGuard<'_, migrate::source::PersistentState> { - self.migration_src_state.lock().unwrap() - } - - pub async fn instance_spec( - &self, - ) -> tokio::sync::MutexGuard<'_, VersionedInstanceSpec> { - self.vm_objects.spec.lock().await - } - - pub fn com1(&self) -> &Arc> { - &self.vm_objects.com1 - } - - pub fn framebuffer(&self) -> Option<&Arc> { - self.vm_objects.framebuffer.as_ref() - } - - pub fn ps2ctrl(&self) -> &Arc { - &self.vm_objects.ps2ctrl - } - - pub fn crucible_backends( - &self, - ) -> &BTreeMap> { - &self.vm_objects.crucible_backends - } - - pub fn log(&self) -> &Logger { - &self.log - } - pub fn rt_hdl(&self) -> &tokio::runtime::Handle { - self.vmm_runtime - .as_ref() - .expect("vmm_runtime is populated until VmController is dropped") - .handle() - } - - pub fn external_instance_state(&self) -> ApiInstanceState { - self.vm_objects.monitor_rx.borrow().state - } - - pub fn inject_nmi(&self) { - if let Some(machine) = &self.vm_objects.machine { - match machine.inject_nmi() { - Ok(_) => { - info!(self.log, "Sending NMI to instance"); - } - Err(e) => { - error!(self.log, "Could not send NMI to instance: {}", e); - } - }; + for (name, dev) in self.lifecycle_components.iter() { + func(name, dev); } } - pub fn state_watcher( + fn for_each_device_fallible( &self, - ) -> &tokio::sync::watch::Receiver { - &self.vm_objects.monitor_rx - } - - /// Asks to queue a request to start a source migration task for this VM. - /// The migration will have the supplied `migration_id` and will obtain its - /// connection to the target by calling `upgrade_fn` to obtain a future that - /// yields the necessary connection. - /// - /// This routine fails if the VM was not marked as a migration source or if - /// it has another pending request that precludes migration. Note that this - /// routine does not fail if the future returned from `upgrade_fn` fails to - /// produce a connection to the destination. - /// - /// On success, clients may query the instance's migration status to - /// determine how the migration has progressed. - pub fn request_migration_from< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, - >( - &self, - migration_id: Uuid, - conn: WebSocketStream, - protocol: crate::migrate::protocol::Protocol, - ) -> Result<(), VmControllerError> { - let mut inner = self.worker_state.inner.lock().unwrap(); - - // Check that the request can be enqueued before setting up the - // migration task. - if !inner.external_request_queue.migrate_as_source_will_enqueue()? { - return Ok(()); + mut func: impl FnMut( + &str, + &Arc, + ) -> std::result::Result<(), E>, + ) -> std::result::Result<(), E> { + for (name, dev) in self.lifecycle_components.iter() { + func(name, dev)?; } - let migration_request = - self.launch_source_migration_task(migration_id, conn, protocol); - - // Unwrap is safe because the queue state was checked under the lock. - inner.external_request_queue.try_queue(migration_request).unwrap(); - self.worker_state.cv.notify_one(); Ok(()) } - /// Launches a task that will execute a live migration out of this VM. - /// Returns a state change request message to queue to the state driver, - /// which will coordinate with this task to run the migration. 
- fn launch_source_migration_task< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, - >( - &self, - migration_id: Uuid, - conn: WebSocketStream, - protocol: crate::migrate::protocol::Protocol, - ) -> ExternalRequest { - let log_for_task = - self.log.new(slog::o!("component" => "migrate_source_task")); - let ctrl_for_task = self.this.upgrade().unwrap(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, response_rx) = tokio::sync::mpsc::channel(1); - - // The migration process uses async operations when communicating with - // the migration target. Run that work on the async runtime. - info!(self.log, "Launching migration source task"); - let task = self.rt_hdl().spawn(async move { - info!(log_for_task, "Waiting to be told to start"); - start_rx.await.unwrap(); - - info!(log_for_task, "Starting migration procedure"); - if let Err(e) = crate::migrate::source::migrate( - ctrl_for_task, - command_tx, - response_rx, - conn, - protocol, - ) - .await - { - error!(log_for_task, "Migration task failed: {}", e); - return Err(e); - } - - Ok(()) - }); - - ExternalRequest::MigrateAsSource { - migration_id, - task, - start_tx, - command_rx, - response_tx, - } - } - - /// Asks to queue a request to start a destination migration task for this - /// VM. The migration will have the supplied `migration_id` and will obtain - /// its connection to the source by calling `upgrade_fn` to obtain a future - /// that yields the necessary connection. - /// - /// This routine fails if the VM has already begun to run or if a previous - /// migration in was attempted (regardless of its outcome). Note that this - /// routine does not fail if the future returned from `upgrade_fn` - /// subsequently fails to produce a connection to the destination (though - /// the migration attempt will then fail). - /// - /// On success, clients may query the instance's migration status to - /// determine how the migration has progressed. - pub fn request_migration_into< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, - >( + pub(crate) fn device_by_name( &self, - migration_id: Uuid, - conn: WebSocketStream, - local_addr: SocketAddr, - protocol: crate::migrate::protocol::Protocol, - ) -> Result<(), VmControllerError> { - let mut inner = self.worker_state.inner.lock().unwrap(); - if !inner.external_request_queue.migrate_as_target_will_enqueue()? { - return Ok(()); - } - - // Check that the request can be enqueued before setting up the - // migration task. - let migration_request = self.launch_target_migration_task( - migration_id, - conn, - local_addr, - protocol, - ); - - // Unwrap is safe because the queue state was checked under the lock. - inner.external_request_queue.try_queue(migration_request).unwrap(); - self.worker_state.cv.notify_one(); - Ok(()) + name: &String, + ) -> Option> { + self.vm_objects.devices.get(name).cloned() } +} - /// Launches a task that will execute a live migration into this VM. - /// Returns a state change request message to queue to the state driver, - /// which will coordinate with this task to run the migration. 
- fn launch_target_migration_task< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, - >( - &self, - migration_id: Uuid, - conn: WebSocketStream, - local_addr: SocketAddr, - protocol: crate::migrate::protocol::Protocol, - ) -> ExternalRequest { - let log_for_task = - self.log.new(slog::o!("component" => "migrate_source_task")); - let ctrl_for_task = self.this.upgrade().unwrap(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - - // The migration process uses async operations when communicating with - // the migration target. Run that work on the async runtime. - info!(self.log, "Launching migration target task"); - let task = self.rt_hdl().spawn(async move { - info!(log_for_task, "Waiting to be told to start"); - start_rx.await.unwrap(); +/// The state stored in a [`Vm`] when there is an actual underlying virtual +/// machine. +pub(super) struct ActiveVm { + parent: Arc, + log: slog::Logger, - info!(log_for_task, "Starting migration procedure"); - if let Err(e) = crate::migrate::destination::migrate( - ctrl_for_task, - command_tx, - conn, - local_addr, - protocol, - ) - .await - { - error!(log_for_task, "Migration task failed: {}", e); - return Err(e); - } + state_driver_queue: Arc, + external_state_rx: InstanceStateRx, - Ok(()) - }); - - ExternalRequest::MigrateAsTarget { - migration_id, - task, - start_tx, - command_rx, - } - } + properties: InstanceProperties, - /// Handles a request to change the wrapped instance's state. - pub fn put_state( - &self, - requested: ApiInstanceStateRequested, - ) -> Result<(), VmControllerError> { - info!(self.log(), "Requested state {:?} via API", requested); + objects: RwLock, + services: tokio::sync::Mutex>, +} - self.worker_state - .queue_external_request(match requested { - ApiInstanceStateRequested::Run => ExternalRequest::Start, - ApiInstanceStateRequested::Stop => ExternalRequest::Stop, - ApiInstanceStateRequested::Reboot => ExternalRequest::Reboot, - }) - .map_err(Into::into) +impl ActiveVm { + pub(crate) fn log(&self) -> &slog::Logger { + &self.log } - pub fn migrate_status(&self) -> ApiMigrateStatusResponse { - let mut published = - self.vm_objects.monitor_rx.borrow().migration.clone(); - - // There's a window between the point where a request to migrate returns - // and the point where the state worker actually picks up the migration - // and publishes its state. To ensure that migrations are visible as - // soon as they're queued, pick up the queued migration (if there is - // one) and insert it into the output in the appropriate position. The - // state driver will consume the pending migration before actually - // executing it. 
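In the replacement design this visibility window is handled differently: the state driver owns the external watch channel outright and can publish a queued migration before executing it. A rough sketch of such a publication, assuming propolis_api_types::{InstanceMigrationStatus, MigrationState} and uuid::Uuid are in scope (the helper itself is illustrative, not from this patch):

// Sketch only: publish a queued inbound migration so external observers
// see it immediately, before the state driver actually runs it.
fn publish_pending_migration_in(
    external_tx: &tokio::sync::watch::Sender<InstanceStateMonitorResponse>,
    migration_id: Uuid,
) {
    external_tx.send_modify(|state| {
        state.gen += 1;
        state.migration.migration_in = Some(InstanceMigrationStatus {
            id: migration_id,
            state: MigrationState::Sync,
        });
    });
}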
- let inner = self.worker_state.inner.lock().unwrap(); - if let Some((id, role)) = inner.pending_migration_id { - match role { - MigrateRole::Destination => { - published.migration_in = Some(ApiMigrationStatus { - id, - state: ApiMigrationState::Sync, - }); - } - MigrateRole::Source => { - published.migration_out = Some(ApiMigrationStatus { - id, - state: ApiMigrationState::Sync, - }); - } - } - } - - published + pub(crate) fn objects(&self) -> RwLockReadGuard<'_, VmObjects> { + self.objects.read().unwrap() } - pub(crate) fn for_each_device( + pub(crate) async fn for_each_device( &self, - mut func: impl FnMut(&str, &Arc), + func: impl FnMut(&str, &Arc), ) { - for (name, dev) in self.vm_objects.devices.iter() { - func(name, dev); - } + self.objects().for_each_device(func); } - pub(crate) fn for_each_device_fallible( + pub(crate) async fn for_each_device_fallible( &self, - mut func: F, - ) -> std::result::Result<(), E> - where - F: FnMut( + func: impl FnMut( &str, &Arc, ) -> std::result::Result<(), E>, - { - for (name, dev) in self.vm_objects.devices.iter() { - func(name, dev)?; - } - Ok(()) - } - - pub(crate) fn device_by_name( - &self, - name: &String, - ) -> Option> { - self.vm_objects.devices.get(name).cloned() + ) -> std::result::Result<(), E> { + self.objects().for_each_device_fallible(func) } - /// Spawn a Tokio runtime in which to run the VMM-related (device emulation, - /// block backends, etc) tasks for an instance. - pub(crate) fn spawn_runtime( - properties: &InstanceProperties, - ) -> anyhow::Result { - // For now, just base the runtime size on vCPU count - let thread_count = usize::max( - VMM_MIN_RT_THREADS, - VMM_BASE_RT_THREADS + properties.vcpus as usize, - ); - - tokio::runtime::Builder::new_multi_thread() - .thread_name("tokio-rt-vmm") - .worker_threads(thread_count) - .enable_all() - .build() - .context("spawning tokio runtime for VMM") + async fn stop_services(&self) { + let services = self.services.lock().await.take().unwrap(); + services.stop(&self.log).await; } } -impl Drop for VmController { +impl Drop for ActiveVm { fn drop(&mut self) { - info!(self.log, "Dropping VM controller"); - let machine = self - .vm_objects - .machine - .take() - .expect("VM controller should have an instance at drop"); - - // Destroy the underlying kernel VMM resource - let hdl = machine.destroy(); - let _ = hdl.destroy(); - - // Detach block backends so they can do any final clean-up - debug!(self.log, "Detaching block backends"); - for backend in self.vm_objects.block_backends.values() { - let _ = backend.attachment().detach(); - } - - // A fully-initialized controller is kept alive in part by its worker - // thread, which owns the sender side of the controller's state-change - // notification channel. Since the controller is being dropped, the - // worker is gone, so reclaim the sender from it and use it to publish - // that the controller is being destroyed. - if let Some(thread) = self.worker_thread.lock().unwrap().take() { - let api_state = thread.join().unwrap(); - let old_state = api_state.borrow().clone(); - - // Preserve the instance's state if it failed so that clients can - // distinguish gracefully-stopped instances from failed instances. - if matches!(old_state.state, ApiInstanceState::Failed) { - return; - } - - let gen = old_state.gen + 1; - let _ = api_state.send(ApiMonitoredState { - gen, - state: ApiInstanceState::Destroyed, - ..old_state - }); - } - - // Tokio will be upset if the VMM runtime is implicitly shutdown (via - // drop) in blocking context. 
We avoid such troubles by doing an - // explicit background shutdown. - let rt = self.vmm_runtime.take().expect("vmm_runtime is populated"); - rt.shutdown_background(); + let mut guard = self.parent.state.write().unwrap(); + *guard = VmState::Defunct(DefunctVm { + external_state_rx: self.external_state_rx.clone(), + properties: self.properties.clone(), + spec: self.objects().instance_spec.clone(), + }); } } -/// An event that a VM's state driver must process. -#[derive(Debug)] -enum StateDriverEvent { - /// An event that was raised from within the guest. - Guest(GuestEvent), - - /// An event that was raised by an external entity (e.g. an API call to the - /// server). - External(ExternalRequest), +struct DefunctVm { + external_state_rx: InstanceStateRx, + properties: InstanceProperties, + spec: InstanceSpecV0, } -/// Commands issued by the state driver back to its VM controller. These are -/// abstracted into a trait to allow them to be mocked out for testing without -/// having to supply mock implementations of the rest of the VM controller's -/// functionality. -#[cfg_attr(test, mockall::automock)] -trait StateDriverVmController { - /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated - /// devices and vCPUs are brought to a consistent state. - /// - /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl) - /// will fail. A corresponding `resume_vm()` call must be made prior to - /// allowing vCPU tasks to run. - fn pause_vm(&self); - - /// Resume a previously-paused VM at the kernel VMM level. This will resume - /// any timers driving in-kernel-emulated devices, and allow the vCPU to run - /// again. - fn resume_vm(&self); - - /// Sends a reset request to each device in the instance, then sends a - /// reset command to the instance's bhyve VM. - fn reset_devices_and_machine(&self); - - /// Sends each device (and backend) a start request. - fn start_devices(&self) -> anyhow::Result<()>; - - /// Sends each device a pause request, then waits for all these requests to - /// complete. - fn pause_devices(&self); - - /// Sends each device a resume request. - fn resume_devices(&self); - - /// Sends each device (and backend) a halt request. - fn halt_devices(&self); +#[allow(clippy::large_enum_variant)] +enum VmState { + NoVm, + WaitingToStart, + Active(Weak), + Defunct(DefunctVm), +} - /// Resets the state of each vCPU in the instance to its on-reboot state. 
-    fn reset_vcpu_state(&self);
-}
-
-impl StateDriverVmController for VmController {
-    fn pause_vm(&self) {
-        info!(self.log, "Pausing kernel VMM resources");
-        self.machine().hdl.pause().expect("VM_PAUSE should succeed")
+pub(super) struct EnsureOptions {
+    pub toml_config: Arc<VmTomlConfig>,
+    pub use_reservoir: bool,
+    pub oximeter_registry: Option<ProducerRegistry>,
+    pub nexus_client: Option<NexusClient>,
+}
+
+impl Vm {
+    pub fn new() -> Arc<Self> {
+        Arc::new(Self { state: RwLock::new(VmState::NoVm) })
     }
 
-    fn resume_vm(&self) {
-        info!(self.log, "Resuming kernel VMM resources");
-        self.machine().hdl.resume().expect("VM_RESUME should succeed")
+    fn vm_state(&self) -> RwLockReadGuard<'_, VmState> {
+        self.state.read().unwrap()
     }
 
-    fn reset_devices_and_machine(&self) {
-        let _rtguard = self.rt_hdl().enter();
-        self.for_each_device(|name, dev| {
-            info!(self.log, "Sending reset request to {}", name);
-            dev.reset();
-        });
+    pub(super) fn active_vm(&self) -> Option<Arc<ActiveVm>> {
+        let guard = self.vm_state();
+        if let VmState::Active(weak) = &*guard {
+            weak.upgrade()
+        } else {
+            None
+        }
+    }
 
-        self.machine().reinitialize().unwrap();
+    fn start_failed(&self) {
+        let mut guard = self.state.write().unwrap();
+        match *guard {
+            VmState::WaitingToStart => *guard = VmState::NoVm,
+            _ => unreachable!(
+                "only a starting VM's state worker calls start_failed"
+            ),
+        }
     }
 
-    fn start_devices(&self) -> anyhow::Result<()> {
-        let _rtguard = self.rt_hdl().enter();
-        self.for_each_device_fallible(|name, dev| {
-            info!(self.log, "Sending startup complete to {}", name);
-            let res = dev.start();
-            if let Err(e) = &res {
-                error!(self.log, "Startup failed for {}: {:?}", name, e);
-            }
-            res
-        })?;
-        for (name, backend) in self.vm_objects.block_backends.iter() {
-            debug!(self.log, "Starting block backend {}", name);
-            let res = backend.start();
-            if let Err(e) = &res {
-                error!(self.log, "Startup failed for {}: {:?}", name, e);
-                return res;
+    fn make_active(&self, active: Arc<ActiveVm>) {
+        let mut guard = self.state.write().unwrap();
+        let old = std::mem::replace(&mut *guard, VmState::NoVm);
+        match old {
+            VmState::WaitingToStart => {
+                *guard = VmState::Active(Arc::downgrade(&active))
             }
+            _ => unreachable!(
+                "only a starting VM's state worker calls make_active"
+            ),
         }
-        Ok(())
     }
 
-    fn pause_devices(&self) {
-        let _rtguard = self.rt_hdl().enter();
-        self.for_each_device(|name, dev| {
-            info!(self.log, "Sending pause request to {}", name);
-            dev.pause();
-        });
+    pub async fn ensure(
+        self: &Arc<Self>,
+        log: slog::Logger,
+        ensure_request: propolis_api_types::InstanceSpecEnsureRequest,
+        options: EnsureOptions,
+    ) -> anyhow::Result<propolis_api_types::InstanceEnsureResponse, VmError>
+    {
+        // Take the lock for writing, since in the common case this call will be
+        // creating a new VM and there's no easy way to upgrade from a reader
+        // lock to a writer lock.
+        let mut guard = self.state.write().unwrap();
 
-        // Create a Future that returns the name of the device that has finished
-        // pausing: this allows keeping track of which devices have and haven't
-        // completed pausing yet.
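Taken together, the methods added above fix the VmState lifecycle: ensure() moves NoVm to WaitingToStart, the state driver then calls make_active() or start_failed(), and dropping the ActiveVm leaves Defunct behind. A small illustrative helper summarizing that reading (not part of the patch):

// Illustrative only: a one-line reading of each state in the new machine.
fn describe(state: &VmState) -> &'static str {
    match state {
        VmState::NoVm => "no VM yet; ensure() may create one",
        VmState::WaitingToStart => "ensure() accepted; state driver starting",
        VmState::Active(_) => "running; active_vm() upgrades the weak ref",
        VmState::Defunct(_) => "shut down; final state still observable",
    }
}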
- struct NamedFuture { - name: String, - future: BoxFuture<'static, ()>, + if matches!(*guard, VmState::WaitingToStart | VmState::Active(_)) { + return Err(VmError::AlreadyInitialized); } - impl std::future::Future for NamedFuture { - type Output = String; - - fn poll( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll { - let mut_self = self.get_mut(); - match Pin::new(&mut mut_self.future).poll(cx) { - Poll::Pending => Poll::Pending, - Poll::Ready(()) => Poll::Ready(mut_self.name.clone()), - } - } - } + *guard = VmState::WaitingToStart; - info!(self.log, "Waiting for devices to pause"); - self.rt_hdl().block_on(async { - let mut stream: FuturesUnordered<_> = self - .vm_objects - .devices - .iter() - .map(|(name, dev)| { - info!(self.log, "Got paused future from dev {}", name); - NamedFuture { name: name.to_string(), future: dev.paused() } - }) - .collect(); + let (external_tx, external_rx) = + tokio::sync::watch::channel(InstanceStateMonitorResponse { + gen: 1, + state: propolis_api_types::InstanceState::Starting, + migration: propolis_api_types::InstanceMigrateStatusResponse { + migration_in: None, + migration_out: None, + }, + }); - loop { - match stream.next().await { - Some(name) => { - info!(self.log, "dev {} completed pause", name); - } + let input_queue = state_driver::InputQueue::new( + log.new(slog::o!("component" => "vmm_request_queue")), + ); - None => { - // done - info!(self.log, "all devices paused"); - break; - } - } - } - }); - } + let state_driver = state_driver::StateDriver::new( + log, + self.clone(), + Arc::new(input_queue), + external_tx, + ); - fn resume_devices(&self) { - let _rtguard = self.rt_hdl().enter(); - self.for_each_device(|name, dev| { - info!(self.log, "Sending resume request to {}", name); - dev.resume(); - }); - } + let (ensure_reply_tx, ensure_rx) = tokio::sync::oneshot::channel(); - fn halt_devices(&self) { - let _rtguard = self.rt_hdl().enter(); - self.for_each_device(|name, dev| { - info!(self.log, "Sending halt request to {}", name); - dev.halt(); + tokio::spawn(async move { + state_driver + .run(ensure_request, ensure_reply_tx, options, external_rx) + .await }); - for (name, backend) in self.vm_objects.block_backends.iter() { - debug!(self.log, "Stopping and detaching block backend {}", name); - backend.stop(); - if let Err(err) = backend.detach() { - error!( - self.log, - "Error while detaching block backend {name}: {err:?}", - ); - } - } - } - fn reset_vcpu_state(&self) { - for vcpu in self.machine().vcpus.iter() { - info!(self.log, "Resetting vCPU {}", vcpu.id); - vcpu.activate().unwrap(); - vcpu.reboot_state().unwrap(); - if vcpu.is_bsp() { - info!(self.log, "Resetting BSP vCPU {}", vcpu.id); - vcpu.set_run_state(propolis::bhyve_api::VRS_RUN, None).unwrap(); - vcpu.set_reg( - propolis::bhyve_api::vm_reg_name::VM_REG_GUEST_RIP, - 0xfff0, - ) - .unwrap(); - } - } + ensure_rx.await } } diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs index 9d23faa26..fe52d2135 100644 --- a/bin/propolis-server/src/lib/vm/request_queue.rs +++ b/bin/propolis-server/src/lib/vm/request_queue.rs @@ -29,34 +29,12 @@ use uuid::Uuid; use crate::migrate::MigrateError; -use super::{ - MigrateSourceCommand, MigrateSourceResponse, MigrateTargetCommand, -}; +use super::migrate_commands::{MigrateSourceCommand, MigrateSourceResponse}; /// An external request made of a VM controller via the server API. Handled by /// the controller's state driver thread. 
#[derive(Debug)] pub enum ExternalRequest { - /// Initializes the VM through live migration by running a - /// migration-destination task. - MigrateAsTarget { - /// The ID of the live migration to use when initializing. - migration_id: Uuid, - - /// A handle to the task that will execute the migration procedure. - task: tokio::task::JoinHandle>, - - /// The sender side of a one-shot channel that, when signaled, tells the - /// migration task to start its work. - start_tx: tokio::sync::oneshot::Sender<()>, - - /// A channel that receives commands from the migration task. - command_rx: tokio::sync::mpsc::Receiver, - }, - - /// Resets all the VM's devices and CPUs, then starts the VM. - Start, - /// Asks the state worker to start a migration-source task. MigrateAsSource { /// The ID of the live migration for which this VM will be the source. @@ -147,8 +125,6 @@ enum RequestDisposition { /// The current disposition for each kind of incoming request. #[derive(Copy, Clone, Debug)] struct AllowedRequests { - migrate_as_target: RequestDisposition, - start: RequestDisposition, migrate_as_source: RequestDisposition, reboot: RequestDisposition, stop: RequestDisposition, @@ -167,8 +143,6 @@ impl ExternalRequestQueue { Self { queue: VecDeque::new(), allowed: AllowedRequests { - migrate_as_target: RequestDisposition::Enqueue, - start: RequestDisposition::Enqueue, migrate_as_source: RequestDisposition::Deny( RequestDeniedReason::InstanceNotActive, ), @@ -198,10 +172,6 @@ impl ExternalRequestQueue { request: ExternalRequest, ) -> Result<(), RequestDeniedReason> { let disposition = match request { - ExternalRequest::MigrateAsTarget { .. } => { - self.allowed.migrate_as_target - } - ExternalRequest::Start => self.allowed.start, ExternalRequest::MigrateAsSource { .. } => { self.allowed.migrate_as_source } @@ -237,26 +207,6 @@ impl ExternalRequestQueue { .get_new_dispositions(DispositionChangeReason::StateChange(state)); } - /// Indicates whether the queue would allow a request to migrate into this - /// instance. This can be used to avoid setting up migration tasks for - /// requests that will ultimately be denied. - /// - /// # Return value - /// - /// - `Ok(true)` if the request will be queued. - /// - `Ok(false)` if the request is allowed for idempotency reasons but will - /// not be queued. - /// - `Err` if the request is forbidden. - pub fn migrate_as_target_will_enqueue( - &self, - ) -> Result { - match self.allowed.migrate_as_target { - RequestDisposition::Enqueue => Ok(true), - RequestDisposition::Ignore => Ok(false), - RequestDisposition::Deny(reason) => Err(reason), - } - } - /// Indicates whether the queue would allow a request to migrate out of this /// instance. This can be used to avoid setting up migration tasks for /// requests that will ultimately be denied. @@ -295,74 +245,21 @@ impl ExternalRequestQueue { use RequestDeniedReason as DenyReason; use RequestDisposition as Disposition; match reason { - // Starting the instance, whether via migration or cold boot, - // forecloses on further attempts to migrate in. For idempotency, - // further requests to start are allowed when an instance-starting - // transition is enqueued. - ChangeReason::ApiRequest(ExternalRequest::MigrateAsTarget { - .. - }) - | ChangeReason::ApiRequest(ExternalRequest::Start) => { - let (migrate_as_target_disposition, deny_reason) = match reason - { - // If this is a request to migrate in, make sure future - // requests to migrate in are handled idempotently. 
- ChangeReason::ApiRequest( - ExternalRequest::MigrateAsTarget { .. }, - ) => ( - Disposition::Ignore, - DenyReason::MigrationTargetInProgress, - ), - ChangeReason::ApiRequest(ExternalRequest::Start) => ( - Disposition::Deny(DenyReason::StartInProgress), - DenyReason::StartInProgress, - ), - _ => unreachable!(), - }; - - AllowedRequests { - migrate_as_target: migrate_as_target_disposition, - start: Disposition::Ignore, - migrate_as_source: Disposition::Deny(deny_reason), - reboot: Disposition::Deny(deny_reason), - stop: self.allowed.stop, - } - } ChangeReason::ApiRequest(ExternalRequest::MigrateAsSource { .. - }) => { - assert!(matches!(self.allowed.start, Disposition::Ignore)); - - // Requests to migrate into the instance should not be enqueued - // from this point, but whether they're dropped or ignored - // depends on how the instance was originally initialized. - assert!(!matches!( - self.allowed.migrate_as_target, - Disposition::Enqueue - )); - - AllowedRequests { - migrate_as_target: self.allowed.migrate_as_target, - start: self.allowed.start, - migrate_as_source: Disposition::Deny( - DenyReason::AlreadyMigrationSource, - ), - reboot: Disposition::Deny( - DenyReason::InvalidRequestForMigrationSource, - ), - stop: self.allowed.stop, - } - } + }) => AllowedRequests { + migrate_as_source: Disposition::Deny( + DenyReason::AlreadyMigrationSource, + ), + reboot: Disposition::Deny( + DenyReason::InvalidRequestForMigrationSource, + ), + stop: self.allowed.stop, + }, // Requests to reboot prevent additional reboot requests from being // queued, but do not affect other operations. ChangeReason::ApiRequest(ExternalRequest::Reboot) => { - assert!(matches!(self.allowed.start, Disposition::Ignore)); - assert!(!matches!( - self.allowed.migrate_as_target, - Disposition::Enqueue - )); - AllowedRequests { reboot: Disposition::Ignore, ..self.allowed } } @@ -370,10 +267,6 @@ impl ExternalRequestQueue { // queued. Additional requests to stop are ignored for idempotency. ChangeReason::ApiRequest(ExternalRequest::Stop) => { AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::HaltPending, - ), - start: Disposition::Deny(DenyReason::HaltPending), migrate_as_source: Disposition::Deny( DenyReason::HaltPending, ), @@ -386,8 +279,6 @@ impl ExternalRequestQueue { // to reboot it become valid. ChangeReason::StateChange(InstanceStateChange::StartedRunning) => { AllowedRequests { - migrate_as_target: self.allowed.migrate_as_target, - start: self.allowed.start, migrate_as_source: Disposition::Enqueue, reboot: Disposition::Enqueue, stop: self.allowed.stop, @@ -415,10 +306,6 @@ impl ExternalRequestQueue { // "deny". 
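// A note on the `..self.allowed` arm above: Rust's struct-update syntax
// copies every field not explicitly listed from the existing value, so each
// transition spells out only the dispositions it actually changes.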
ChangeReason::StateChange(InstanceStateChange::Stopped) => { AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::InstanceNotActive, - ), - start: Disposition::Deny(DenyReason::InstanceNotActive), migrate_as_source: Disposition::Deny( DenyReason::InstanceNotActive, ), @@ -428,10 +315,6 @@ impl ExternalRequestQueue { } ChangeReason::StateChange(InstanceStateChange::Failed) => { AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::InstanceFailed, - ), - start: Disposition::Deny(DenyReason::InstanceFailed), migrate_as_source: Disposition::Deny( DenyReason::InstanceFailed, ), @@ -453,18 +336,6 @@ mod test { slog::Logger::root(slog::Discard, slog::o!()) } - fn make_migrate_as_target_request() -> ExternalRequest { - let task = tokio::task::spawn(async { Ok(()) }); - let (start_tx, _) = tokio::sync::oneshot::channel(); - let (_, command_rx) = tokio::sync::mpsc::channel(1); - ExternalRequest::MigrateAsTarget { - migration_id: Uuid::new_v4(), - task, - start_tx, - command_rx, - } - } - fn make_migrate_as_source_request() -> ExternalRequest { let task = tokio::task::spawn(async { Ok(()) }); let (start_tx, _) = tokio::sync::oneshot::channel(); @@ -479,47 +350,10 @@ mod test { } } - #[tokio::test] - async fn migrate_as_target_is_idempotent() { - let mut queue = ExternalRequestQueue::new(test_logger()); - - // Requests to migrate as a target should queue normally at first. - assert!(queue.migrate_as_target_will_enqueue().unwrap()); - - // After queuing such a request, subsequent requests should be allowed - // without enqueuing anything. - assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); - assert!(!queue.migrate_as_target_will_enqueue().unwrap()); - - // Pop the request and tell the queue the instance is running. - assert!(matches!( - queue.pop_front(), - Some(ExternalRequest::MigrateAsTarget { .. }) - )); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - - // Because the instance was started via migration in, future requests - // to migrate in should be allowed. - assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); - assert!(!queue.migrate_as_target_will_enqueue().unwrap()); - } - - #[tokio::test] - async fn migrate_as_target_is_forbidden_after_cold_boot() { - let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - - assert!(queue.migrate_as_target_will_enqueue().is_err()); - assert!(queue.try_queue(make_migrate_as_target_request()).is_err()); - } - #[tokio::test] async fn migrate_as_source_is_not_idempotent() { // Simulate a running instance. let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Requests to migrate out should be allowed. 
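// With the Start variant gone, tests that need a running instance now assert
// the state change directly, as in the test above. The pattern in miniature
// (a sketch reusing this module's helpers, assuming its existing imports):
#[tokio::test]
async fn sketch_running_queue_accepts_stop() {
    let mut queue = ExternalRequestQueue::new(test_logger());
    queue.notify_instance_state_change(InstanceStateChange::StartedRunning);
    assert!(queue.try_queue(ExternalRequest::Stop).is_ok());
}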
@@ -558,8 +392,6 @@ mod test { #[tokio::test] async fn stop_requests_enqueue_after_vm_failure() { let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); queue.notify_instance_state_change(InstanceStateChange::Failed); assert!(queue.try_queue(ExternalRequest::Stop).is_ok()); @@ -569,8 +401,6 @@ mod test { #[tokio::test] async fn reboot_requests_are_idempotent_except_when_stopping() { let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Once the instance is started, reboot requests should be allowed, but diff --git a/bin/propolis-server/src/lib/vm/services.rs b/bin/propolis-server/src/lib/vm/services.rs new file mode 100644 index 000000000..538973a40 --- /dev/null +++ b/bin/propolis-server/src/lib/vm/services.rs @@ -0,0 +1,49 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Services visible to consumers outside this Propolis that depend on +//! functionality supplied by an extant VM. + +use std::sync::Arc; + +use rfb::server::VncServer; +use slog::{error, Logger}; + +use crate::{serial::SerialTaskControlMessage, vnc::PropolisVncServer}; + +#[derive(Default)] +struct OximeterState { + server: Option, + stats: Option, +} + +pub(super) struct VmServices { + serial_task: tokio::sync::Mutex>, + oximeter: tokio::sync::Mutex, + vnc_server: Arc>, +} + +impl VmServices { + pub(super) async fn stop(&self, log: &Logger) { + self.vnc_server.stop().await; + + if let Some(serial_task) = self.serial_task.lock().await.take() { + let _ = serial_task + .control_ch + .send(SerialTaskControlMessage::Stopping) + .await; + let _ = serial_task.task.await; + } + + let mut oximeter_state = self.oximeter.lock().await; + if let Some(server) = oximeter_state.server.take() { + if let Err(e) = server.close().await { + error!(log, "failed to close oximeter producer server"; + "error" => ?e); + } + } + + let _ = oximeter_state.stats.take(); + } +} diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 4a2832f64..652434400 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -2,324 +2,464 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::sync::Arc; +//! 
It drives the state vroom vroom -use crate::migrate::{MigrateError, MigrateRole}; -use crate::vcpu_tasks::VcpuTaskController; - -use super::{ - request_queue, ExternalRequest, GuestEvent, MigrateSourceCommand, - MigrateSourceResponse, MigrateTargetCommand, MigrateTaskEvent, - SharedVmState, StateDriverEvent, +use std::{ + sync::{Arc, Condvar, Mutex}, + time::Duration, }; use propolis_api_types::{ - InstanceMigrateStatusResponse as ApiMigrateStatusResponse, - InstanceMigrationStatus as ApiMigrationStatus, - InstanceState as ApiInstanceState, - InstanceStateMonitorResponse as ApiMonitoredState, - MigrationState as ApiMigrationState, + instance_spec::VersionedInstanceSpec, InstanceProperties, InstanceState, }; -use slog::{error, info, Logger}; +use slog::info; use uuid::Uuid; -#[usdt::provider(provider = "propolis")] -mod probes { - fn state_driver_pause() {} - fn state_driver_resume() {} +use crate::{ + initializer::{ + build_instance, MachineInitializer, MachineInitializerState, + }, + migrate::MigrateRole, + vcpu_tasks::VcpuTaskController, +}; + +use super::{ + guest_event::{self, GuestEvent}, + lifecycle_ops, VmError, +}; + +struct MigrationStateUpdate { + state: propolis_api_types::MigrationState, + id: Uuid, + role: MigrateRole, +} + +impl MigrationStateUpdate { + fn apply_to( + self, + old: propolis_api_types::InstanceMigrateStatusResponse, + ) -> propolis_api_types::InstanceMigrateStatusResponse { + let new = propolis_api_types::InstanceMigrationStatus { + id: self.id, + state: self.state, + }; + match self.role { + MigrateRole::Destination => { + propolis_api_types::InstanceMigrateStatusResponse { + migration_in: Some(new), + migration_out: old.migration_out, + } + } + MigrateRole::Source => { + propolis_api_types::InstanceMigrateStatusResponse { + migration_in: old.migration_in, + migration_out: Some(new), + } + } + } + } +} + +enum ExternalStateUpdate { + Instance(InstanceState), + Migration(MigrationStateUpdate), + Complete(InstanceState, MigrationStateUpdate), } -/// Tells the state driver whether or not to continue running after responding -/// to an event. #[derive(Debug, PartialEq, Eq)] enum HandleEventOutcome { Continue, Exit, } -/// A reason for starting a VM. -#[derive(Debug, PartialEq, Eq)] -enum VmStartReason { - MigratedIn, - ExplicitRequest, +#[derive(Debug)] +enum InputQueueEvent { + ExternalRequest(super::request_queue::ExternalRequest), + GuestEvent(GuestEvent), } -/// A wrapper around all the data needed to describe the status of a live -/// migration. -struct PublishedMigrationState { - state: ApiMigrationState, - id: Uuid, - role: MigrateRole, +struct InputQueueInner { + external_requests: super::request_queue::ExternalRequestQueue, + guest_events: super::guest_event::GuestEventQueue, } -impl PublishedMigrationState { - /// Updates an `old` migration status response to contain information about - /// the migration described by `self`. 
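// A small usage sketch of `apply_to` as defined above: updating the source
// half of the migration status leaves the destination half untouched (the
// values are illustrative).
fn apply_to_sketch() {
    let old = propolis_api_types::InstanceMigrateStatusResponse {
        migration_in: None,
        migration_out: None,
    };
    let update = MigrationStateUpdate {
        state: propolis_api_types::MigrationState::Sync,
        id: Uuid::new_v4(),
        role: MigrateRole::Source,
    };
    let merged = update.apply_to(old);
    assert!(merged.migration_out.is_some());
    assert!(merged.migration_in.is_none());
}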
- fn apply_to( - self, - old: ApiMigrateStatusResponse, - ) -> ApiMigrateStatusResponse { - let new = ApiMigrationStatus { id: self.id, state: self.state }; - match self.role { - MigrateRole::Destination => ApiMigrateStatusResponse { - migration_in: Some(new), - migration_out: old.migration_out, - }, - MigrateRole::Source => ApiMigrateStatusResponse { - migration_in: old.migration_in, - migration_out: Some(new), - }, +impl InputQueueInner { + fn new(log: slog::Logger) -> Self { + Self { + external_requests: super::request_queue::ExternalRequestQueue::new( + log, + ), + guest_events: super::guest_event::GuestEventQueue::default(), } } } -enum PublishedState { - Instance(ApiInstanceState), - Migration(PublishedMigrationState), - Complete(ApiInstanceState, PublishedMigrationState), +pub(super) struct InputQueue { + inner: Mutex, + cv: Condvar, +} + +impl InputQueue { + pub(super) fn new(log: slog::Logger) -> Self { + Self { + inner: Mutex::new(InputQueueInner::new(log)), + cv: Condvar::new(), + } + } + + fn wait_for_next_event(&self) -> InputQueueEvent { + let guard = self.inner.lock().unwrap(); + let mut guard = self + .cv + .wait_while(guard, |i| { + i.external_requests.is_empty() && i.guest_events.is_empty() + }) + .unwrap(); + + if let Some(guest_event) = guard.guest_events.pop_front() { + InputQueueEvent::GuestEvent(guest_event) + } else { + InputQueueEvent::ExternalRequest( + guard.external_requests.pop_front().unwrap(), + ) + } + } + + fn notify_instance_state_change( + &self, + state: super::request_queue::InstanceStateChange, + ) { + let mut guard = self.inner.lock().unwrap(); + guard.external_requests.notify_instance_state_change(state); + } } -pub(super) struct StateDriver< - V: super::StateDriverVmController, - C: VcpuTaskController, -> { - /// A handle to the host server's tokio runtime, useful for spawning tasks - /// that need to interact with async code (e.g. spinning up migration - /// tasks). - runtime_hdl: tokio::runtime::Handle, +impl guest_event::GuestEventHandler for InputQueue { + fn suspend_halt_event(&self, when: Duration) { + let mut guard = self.inner.lock().unwrap(); + if guard + .guest_events + .enqueue(guest_event::GuestEvent::VcpuSuspendHalt(when)) + { + self.cv.notify_all(); + } + } - /// A reference to the command sink to which this driver should send its - /// requests to send messages to devices or update other VM controller - /// state. - controller: Arc, + fn suspend_reset_event(&self, when: Duration) { + let mut guard = self.inner.lock().unwrap(); + if guard + .guest_events + .enqueue(guest_event::GuestEvent::VcpuSuspendReset(when)) + { + self.cv.notify_all(); + } + } - /// A reference to the state this driver shares with its VM controller. - shared_state: Arc, + fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration) { + let mut guard = self.inner.lock().unwrap(); + if guard.guest_events.enqueue( + guest_event::GuestEvent::VcpuSuspendTripleFault(vcpu_id, when), + ) { + self.cv.notify_all(); + } + } - /// The controller for this instance's vCPU tasks. - vcpu_tasks: C, + fn unhandled_vm_exit( + &self, + vcpu_id: i32, + exit: propolis::exits::VmExitKind, + ) { + panic!("vCPU {}: Unhandled VM exit: {:?}", vcpu_id, exit); + } - /// The state worker's logger. - log: Logger, + fn io_error_event(&self, vcpu_id: i32, error: std::io::Error) { + panic!("vCPU {}: Unhandled vCPU error: {}", vcpu_id, error); + } +} - /// The generation number to use when publishing externally-visible state - /// updates. 
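// The blocking queue above in miniature: a Mutex-protected buffer paired with
// a Condvar. Producers push under the lock and notify; the consumer sleeps in
// wait_while until the predicate fails. (A self-contained sketch, not the
// real queue.)
use std::collections::VecDeque;
use std::sync::{Condvar, Mutex};

struct MiniQueue {
    inner: Mutex<VecDeque<u32>>,
    cv: Condvar,
}

impl MiniQueue {
    fn push(&self, v: u32) {
        self.inner.lock().unwrap().push_back(v);
        // Wake every waiter; spurious wakeups are harmless because wait_while
        // re-checks the predicate before returning.
        self.cv.notify_all();
    }

    fn pop_blocking(&self) -> u32 {
        let guard = self.inner.lock().unwrap();
        let mut guard =
            self.cv.wait_while(guard, |q| q.is_empty()).unwrap();
        guard.pop_front().unwrap()
    }
}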
- state_gen: u64, +impl guest_event::ChipsetEventHandler for InputQueue { + fn chipset_halt(&self) { + let mut guard = self.inner.lock().unwrap(); + if guard.guest_events.enqueue(guest_event::GuestEvent::ChipsetHalt) { + self.cv.notify_all(); + } + } - /// Whether the worker's VM's devices are paused. - paused: bool, + fn chipset_reset(&self) { + let mut guard = self.inner.lock().unwrap(); + if guard.guest_events.enqueue(guest_event::GuestEvent::ChipsetReset) { + self.cv.notify_all(); + } + } +} - /// The sender side of the monitor that reflects the instance's current - /// externally-visible state (including migration state). - api_state_tx: tokio::sync::watch::Sender, +/// The context for a VM state driver task. +pub(super) struct StateDriver { + log: slog::Logger, + parent_vm: Arc, + input_queue: Arc, + external_state_tx: super::InstanceStateTx, + paused: bool, + vcpu_tasks: Option>, + vm_lifecycle: Option>, + migration_src_state: crate::migrate::source::PersistentState, } -impl StateDriver -where - V: super::StateDriverVmController, - C: VcpuTaskController, -{ - /// Constructs a new state driver context. +impl StateDriver { pub(super) fn new( - runtime_hdl: tokio::runtime::Handle, - controller: Arc, - shared_controller_state: Arc, - vcpu_tasks: C, - log: Logger, - api_state_tx: tokio::sync::watch::Sender, + log: slog::Logger, + vm: Arc, + input_queue: Arc, + external_state_tx: super::InstanceStateTx, ) -> Self { + let log = log.new(slog::o!("component" => "state_driver")); Self { - runtime_hdl, - controller, - shared_state: shared_controller_state, - vcpu_tasks, log, - state_gen: 0, + parent_vm: vm, + input_queue, + external_state_tx, paused: false, - api_state_tx, + vcpu_tasks: None, + vm_lifecycle: None, + migration_src_state: Default::default(), } } - /// Yields the current externally-visible instance state. - fn get_instance_state(&self) -> ApiInstanceState { - self.api_state_tx.borrow().state - } - - /// Retrieves the most recently published migration state from the external - /// migration state channel. - /// - /// This function does not return the borrowed monitor, so the state may - /// change again as soon as this function returns. - fn get_migration_status(&self) -> ApiMigrateStatusResponse { - self.api_state_tx.borrow().migration.clone() + pub(super) async fn run( + mut self, + ensure_request: propolis_api_types::InstanceSpecEnsureRequest, + reply_tx: tokio::sync::oneshot::Sender< + Result, + >, + ensure_options: super::EnsureOptions, + external_state_rx: super::InstanceStateRx, + ) { + match self + .initialize_vm(ensure_request, ensure_options, external_state_rx) + .await + { + Ok(active) => { + self.parent_vm.make_active(active.clone()); + self.vm_lifecycle = + Some(active.clone() as Arc); + + // TODO(gjc) deal with migration + reply_tx.send(Ok(propolis_api_types::InstanceEnsureResponse { + migrate: None, + })); + self.run_loop().await; + active.stop_services().await; + } + Err(e) => { + self.parent_vm.start_failed(); + reply_tx.send(Err(VmError::InitializationFailed(e))); + } + } } - /// Sets the published instance and/or migration state and increases the - /// state generation number. 
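// One caveat in `run` above: tokio's oneshot `send` returns Err(value) when
// the receiver has already been dropped, and silently discarding that Result
// trips the unused-result lint. The replies likely want a `let _ =` or
// explicit handling, sketched here:
fn reply_best_effort<T>(tx: tokio::sync::oneshot::Sender<T>, value: T) {
    if tx.send(value).is_err() {
        // The requester stopped waiting; there is no one left to notify.
    }
}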
- fn set_published_state(&mut self, state: PublishedState) { + fn update_external_state(&mut self, state: ExternalStateUpdate) { let (instance_state, migration_state) = match state { - PublishedState::Instance(i) => (Some(i), None), - PublishedState::Migration(m) => (None, Some(m)), - PublishedState::Complete(i, m) => (Some(i), Some(m)), + ExternalStateUpdate::Instance(i) => (Some(i), None), + ExternalStateUpdate::Migration(m) => (None, Some(m)), + ExternalStateUpdate::Complete(i, m) => (Some(i), Some(m)), }; - let ApiMonitoredState { - state: old_state, + let propolis_api_types::InstanceStateMonitorResponse { + state: old_instance, migration: old_migration, - .. - } = self.api_state_tx.borrow().clone(); + gen: old_gen, + } = self.external_state_tx.borrow().clone(); - let state = instance_state.unwrap_or(old_state); + let state = instance_state.unwrap_or(old_instance); let migration = if let Some(migration_state) = migration_state { migration_state.apply_to(old_migration) } else { old_migration }; + let gen = old_gen + 1; info!(self.log, "publishing new instance state"; - "gen" => self.state_gen, + "gen" => gen, "state" => ?state, "migration" => ?migration); - self.state_gen += 1; - let _ = self.api_state_tx.send(ApiMonitoredState { - gen: self.state_gen, - state, - migration, - }); - } - - /// Publishes the supplied externally-visible instance state to the external - /// instance state channel. - fn set_instance_state(&mut self, state: ApiInstanceState) { - self.set_published_state(PublishedState::Instance(state)); + let _ = self.external_state_tx.send( + propolis_api_types::InstanceStateMonitorResponse { + gen, + state, + migration, + }, + ); } - /// Publishes the supplied externally-visible migration status to the - /// instance state channel. - fn set_migration_state( + async fn initialize_vm( &mut self, - role: MigrateRole, - migration_id: Uuid, - state: ApiMigrationState, - ) { - self.set_published_state(PublishedState::Migration( - PublishedMigrationState { state, id: migration_id, role }, - )); + request: propolis_api_types::InstanceSpecEnsureRequest, + options: super::EnsureOptions, + external_state_rx: super::InstanceStateRx, + ) -> anyhow::Result> { + let active_vm = match request.migrate { + None => { + let vm_objects = self + .initialize_vm_from_spec( + &request.properties, + &request.instance_spec, + options, + ) + .await?; + let VersionedInstanceSpec::V0(v0_spec) = request.instance_spec; + let active_vm = Arc::new(super::ActiveVm { + parent: self.parent_vm.clone(), + log: self.log.clone(), + state_driver_queue: self.input_queue.clone(), + external_state_rx, + properties: request.properties, + spec: v0_spec, + objects: vm_objects, + }); + + active_vm + } + Some(_migrate_request) => todo!("gjc"), + }; + + Ok(active_vm) } - /// Publishes that an instance is migrating and sets its migration state in - /// a single transaction, then consumes the pending migration information - /// from the shared VM state block. - fn publish_migration_start( + /// Initializes all of the components of a VM from the supplied + /// specification. + async fn initialize_vm_from_spec( &mut self, - migration_id: Uuid, - role: MigrateRole, - ) { - // Order matters here. The 'pending migration' field exists so that - // migration status is available through the external API as soon as an - // external request to migrate returns, even if the migration hasn't yet - // been picked up off the queue. 
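// A sketch of the consumer side of the generation counter bumped above: a
// monitor waits until the published generation passes the last one it saw.
// (The helper name is illustrative; the response type is the real one.)
async fn wait_for_gen_after(
    mut rx: tokio::sync::watch::Receiver<
        propolis_api_types::InstanceStateMonitorResponse,
    >,
    last_seen: u64,
) -> Option<propolis_api_types::InstanceStateMonitorResponse> {
    loop {
        let current = rx.borrow().clone();
        if current.gen > last_seen {
            return Some(current);
        }
        // changed() resolves on the next send; Err means the sender (the
        // state driver) is gone and no further updates will arrive.
        if rx.changed().await.is_err() {
            return None;
        }
    }
}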
To ensure the migration is continuously - // visible, publish the "actual" migration before consuming the pending - // one. - self.set_published_state(PublishedState::Complete( - ApiInstanceState::Migrating, - PublishedMigrationState { - state: ApiMigrationState::Sync, - id: migration_id, - role, - }, - )); + properties: &InstanceProperties, + spec: &VersionedInstanceSpec, + options: super::EnsureOptions, + ) -> anyhow::Result { + info!(self.log, "initializing new VM"; + "spec" => #?spec, + "properties" => #?properties, + "use_reservoir" => options.use_reservoir, + "bootrom" => %options.toml_config.bootrom.display()); + + let vmm_log = self.log.new(slog::o!("component" => "vmm")); + + // Set up the 'shell' instance into which the rest of this routine will + // add components. + let VersionedInstanceSpec::V0(v0_spec) = &spec; + let machine = build_instance( + &properties.vm_name(), + v0_spec, + options.use_reservoir, + vmm_log, + )?; + + let mut init = MachineInitializer { + log: self.log.clone(), + machine: &machine, + devices: Default::default(), + block_backends: Default::default(), + crucible_backends: Default::default(), + spec: &v0_spec, + properties: &properties, + toml_config: &options.toml_config, + producer_registry: options.oximeter_registry, + state: MachineInitializerState::default(), + }; + + init.initialize_rom(options.toml_config.bootrom.as_path())?; + let chipset = init.initialize_chipset( + &(self.input_queue.clone() + as Arc), + )?; + + init.initialize_rtc(&chipset)?; + init.initialize_hpet()?; + + let com1 = Arc::new(init.initialize_uart(&chipset)?); + let ps2ctrl = init.initialize_ps2(&chipset)?; + init.initialize_qemu_debug_port()?; + init.initialize_qemu_pvpanic(properties.into())?; + init.initialize_network_devices(&chipset)?; + + #[cfg(not(feature = "omicron-build"))] + init.initialize_test_devices(&options.toml_config.devices)?; + #[cfg(feature = "omicron-build")] + info!( + self.log, + "`omicron-build` feature enabled, ignoring any test devices" + ); + + #[cfg(feature = "falcon")] + init.initialize_softnpu_ports(&chipset)?; + #[cfg(feature = "falcon")] + init.initialize_9pfs(&chipset)?; + + init.initialize_storage_devices(&chipset, options.nexus_client).await?; + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; + init.initialize_cpus()?; + let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( + &machine, + self.input_queue.clone() + as Arc, + self.log.new(slog::o!("component" => "vcpu_tasks")), + )?); + + let MachineInitializer { + devices, + block_backends, + crucible_backends, + .. + } = init; - self.shared_state.clear_pending_migration(); + self.vcpu_tasks = Some(vcpu_tasks as Box); + Ok(super::VmObjects { + machine, + lifecycle_components: devices, + block_backends, + crucible_backends, + com1, + framebuffer: Some(ramfb), + ps2ctrl, + }) } - /// Manages an instance's lifecycle once it has moved to the Running state. 
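// For reference, the build order in initialize_vm_from_spec above:
// 1. build_instance creates the bare Machine (the "shell" VM).
// 2. ROM and chipset come first; later devices hang off the chipset handle.
// 3. RTC, HPET, UART (com1), PS/2, QEMU debug port/pvpanic, then NICs.
// 4. Storage initializes asynchronously, since backends may need to reach
//    external services through the supplied Nexus client.
// 5. fwcfg and the framebuffer, then vCPUs, and last the vCPU task
//    controller that feeds guest events back into the input queue.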
- pub(super) fn run_state_worker( - mut self, - ) -> tokio::sync::watch::Sender { - info!(self.log, "State worker launched"); + async fn run_loop(mut self) { + info!(self.log, "state driver launched"); loop { - let event = self.shared_state.wait_for_next_event(); - info!(self.log, "State worker handling event"; "event" => ?event); + let event = self.input_queue.wait_for_next_event(); + info!(self.log, "state driver handling event"; "event" => ?event); + + let outcome = match event { + InputQueueEvent::ExternalRequest(req) => { + self.handle_external_request(req).await + } + InputQueueEvent::GuestEvent(event) => { + self.handle_guest_event(event).await + } + }; - let outcome = self.handle_event(event); - info!(self.log, "State worker handled event"; "outcome" => ?outcome); - if matches!(outcome, HandleEventOutcome::Exit) { + info!(self.log, "state driver handled event"; "outcome" => ?outcome); + if outcome == HandleEventOutcome::Exit { break; } } - info!(self.log, "State worker exiting"); - - self.api_state_tx - } - - fn handle_event(&mut self, event: StateDriverEvent) -> HandleEventOutcome { - let next_action = match event { - StateDriverEvent::Guest(guest_event) => { - return self.handle_guest_event(guest_event); - } - StateDriverEvent::External(external_event) => external_event, - }; - - match next_action { - ExternalRequest::MigrateAsTarget { - migration_id, - task, - start_tx, - command_rx, - } => { - self.migrate_as_target( - migration_id, - task, - start_tx, - command_rx, - ); - HandleEventOutcome::Continue - } - ExternalRequest::Start => { - self.start_vm(VmStartReason::ExplicitRequest); - HandleEventOutcome::Continue - } - ExternalRequest::Reboot => { - self.do_reboot(); - HandleEventOutcome::Continue - } - ExternalRequest::MigrateAsSource { - migration_id, - task, - start_tx, - command_rx, - response_tx, - } => { - self.migrate_as_source( - migration_id, - task, - start_tx, - command_rx, - response_tx, - ); - HandleEventOutcome::Continue - } - ExternalRequest::Stop => { - self.do_halt(); - HandleEventOutcome::Exit - } - } + info!(self.log, "state driver exiting"); } - fn handle_guest_event(&mut self, event: GuestEvent) -> HandleEventOutcome { + async fn handle_guest_event( + &mut self, + event: GuestEvent, + ) -> HandleEventOutcome { match event { GuestEvent::VcpuSuspendHalt(_when) => { info!(self.log, "Halting due to VM suspend event",); - self.do_halt(); + self.do_halt().await; HandleEventOutcome::Exit } GuestEvent::VcpuSuspendReset(_when) => { info!(self.log, "Resetting due to VM suspend event"); - self.do_reboot(); + self.do_reboot().await; HandleEventOutcome::Continue } GuestEvent::VcpuSuspendTripleFault(vcpu_id, _when) => { @@ -327,328 +467,123 @@ where self.log, "Resetting due to triple fault on vCPU {}", vcpu_id ); - self.do_reboot(); + self.do_reboot().await; HandleEventOutcome::Continue } GuestEvent::ChipsetHalt => { info!(self.log, "Halting due to chipset-driven halt"); - self.do_halt(); + self.do_halt().await; HandleEventOutcome::Exit } GuestEvent::ChipsetReset => { info!(self.log, "Resetting due to chipset-driven reset"); - self.do_reboot(); + self.do_reboot().await; HandleEventOutcome::Continue } } } - fn start_vm(&mut self, start_reason: VmStartReason) { - info!(self.log, "Starting instance"; "reason" => ?start_reason); - - // Only move to the Starting state if this VM is starting by explicit - // request (as opposed to the implicit start that happens after a - // migration in). 
In this case, no one has initialized vCPU state yet, - // so explicitly initialize it here. - // - // In the migration-in case, remain in the Migrating state until the - // VM is actually running. Note that this is contractual behavior--sled - // agent relies on this to represent that a migrating instance is - // continuously running through a successful migration. - match start_reason { - VmStartReason::ExplicitRequest => { - self.set_instance_state(ApiInstanceState::Starting); - self.reset_vcpus(); - } - VmStartReason::MigratedIn => { - assert_eq!( - self.get_instance_state(), - ApiInstanceState::Migrating - ); - // Signal the kernel VMM to resume devices which are handled by - // the in-kernel emulation. They were kept paused for - // consistency while migration state was loaded. - self.controller.resume_vm(); - } - } - - match self.controller.start_devices() { - Ok(()) => { - self.vcpu_tasks.resume_all(); - self.publish_steady_state(ApiInstanceState::Running); + async fn handle_external_request( + &mut self, + request: super::request_queue::ExternalRequest, + ) -> HandleEventOutcome { + match request { + super::request_queue::ExternalRequest::MigrateAsSource { + .. + } => todo!("gjc"), + super::request_queue::ExternalRequest::Reboot => { + self.do_reboot(); + HandleEventOutcome::Continue } - Err(e) => { - error!(&self.log, "Failed to start devices: {:?}", e); - self.publish_steady_state(ApiInstanceState::Failed); + super::request_queue::ExternalRequest::Stop => { + self.do_halt(); + HandleEventOutcome::Exit } } } - fn do_reboot(&mut self) { - info!(self.log, "Resetting instance"); + async fn do_reboot(&mut self) { + info!(self.log, "resetting instance"); - self.set_instance_state(ApiInstanceState::Rebooting); + self.update_external_state(ExternalStateUpdate::Instance( + InstanceState::Rebooting, + )); // Reboot is implemented as a pause -> reset -> resume transition. // // First, pause the vCPUs and all devices so no partially-completed // work is present. - self.vcpu_tasks.pause_all(); - self.controller.pause_devices(); + self.vcpu_tasks().pause_all(); + self.vm_lifecycle().pause_devices().await; - // Reset all the entities and the VM's bhyve state, then reset the - // vCPUs. The vCPU reset must come after the bhyve reset. - self.controller.reset_devices_and_machine(); + // Reset all entities and the VM's bhyve state, then reset the vCPUs. + // The vCPU reset must come after the bhyve reset. + self.vm_lifecycle().reset_devices_and_machine(); self.reset_vcpus(); // Resume devices so they're ready to do more work, then resume vCPUs. - self.controller.resume_devices(); - self.vcpu_tasks.resume_all(); - - // Notify the request queue that this reboot request was processed. - // This does not use the `publish_steady_state` path because the queue - // treats an instance's initial transition to "Running" as a one-time - // event that's different from a return to the running state from a - // transient intermediate state. - self.notify_request_queue(request_queue::InstanceStateChange::Rebooted); - self.set_instance_state(ApiInstanceState::Running); + self.vm_lifecycle().resume_devices(); + self.vcpu_tasks().resume_all(); + + // Notify other consumers that the instance successfully rebooted and is + // now back to Running. 
+ self.input_queue.notify_instance_state_change( + super::request_queue::InstanceStateChange::Rebooted, + ); + self.update_external_state(ExternalStateUpdate::Instance( + InstanceState::Running, + )); } - fn do_halt(&mut self) { - info!(self.log, "Stopping instance"); - self.set_instance_state(ApiInstanceState::Stopping); + async fn do_halt(&mut self) { + info!(self.log, "stopping instance"); + self.update_external_state(ExternalStateUpdate::Instance( + InstanceState::Stopping, + )); // Entities expect to be paused before being halted. Note that the VM // may be paused already if it is being torn down after a successful // migration out. if !self.paused { - self.pause(); - } - - self.vcpu_tasks.exit_all(); - self.controller.halt_devices(); - self.publish_steady_state(ApiInstanceState::Stopped); - } - - fn migrate_as_target( - &mut self, - migration_id: Uuid, - mut task: tokio::task::JoinHandle>, - start_tx: tokio::sync::oneshot::Sender<()>, - mut command_rx: tokio::sync::mpsc::Receiver, - ) { - self.publish_migration_start(migration_id, MigrateRole::Destination); - - // Ensure the VM's vCPUs are activated properly so that they can enter - // the guest after migration. Do this before allowing the migration task - // to start so that reset doesn't overwrite any state written by - // migration. - self.reset_vcpus(); - - // Place the VM in a paused state so we can load emulated device state - // in a consistent manner - self.controller.pause_vm(); - - start_tx.send(()).unwrap(); - loop { - let action = self.runtime_hdl.block_on(async { - Self::next_migrate_task_event( - &mut task, - &mut command_rx, - &self.log, - ) - .await - }); - - match action { - MigrateTaskEvent::TaskExited(res) => { - if res.is_ok() { - // Clients that observe that migration has finished - // need to observe that the instance is running before - // they are guaranteed to be able to do anything else - // that requires a running instance. - assert!(matches!( - self.get_migration_status() - .migration_in - .unwrap() - .state, - ApiMigrationState::Finish - )); - - self.start_vm(VmStartReason::MigratedIn); - } else { - assert!(matches!( - self.get_migration_status() - .migration_in - .unwrap() - .state, - ApiMigrationState::Error - )); - - // Resume the kernel VM so that if this state driver is - // asked to halt, the pause resulting therefrom won't - // observe that the VM is already paused. - self.controller.resume_vm(); - self.publish_steady_state(ApiInstanceState::Failed); - }; - - break; - } - MigrateTaskEvent::Command( - MigrateTargetCommand::UpdateState(state), - ) => { - self.set_migration_state( - MigrateRole::Destination, - migration_id, - state, - ); - } - } - } - } - - fn migrate_as_source( - &mut self, - migration_id: Uuid, - mut task: tokio::task::JoinHandle>, - start_tx: tokio::sync::oneshot::Sender<()>, - mut command_rx: tokio::sync::mpsc::Receiver, - response_tx: tokio::sync::mpsc::Sender, - ) { - self.publish_migration_start(migration_id, MigrateRole::Source); - start_tx.send(()).unwrap(); - - // Wait either for the migration task to exit or for it to ask the - // worker to pause or resume the instance's devices. - loop { - let action = self.runtime_hdl.block_on(async { - Self::next_migrate_task_event( - &mut task, - &mut command_rx, - &self.log, - ) - .await - }); - - match action { - // If the task exited, bubble its result back up to the main - // state worker loop to decide on the instance's next state. 
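// Two notes on the paths above. First, the pause/resume helpers that follow
// encode a strict ordering: pausing goes vCPUs -> devices -> kernel VMM, and
// resuming is the exact reverse, so devices never see vCPU activity while the
// VMM beneath them is paused. Second, handle_external_request calls the async
// do_reboot/do_halt without .await, which constructs futures that are never
// polled; those calls presumably want `.await`, as the guest-event path
// already does.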
- // - // If migration failed while devices were paused, this instance - // is allowed to resume, so resume its components here. - MigrateTaskEvent::TaskExited(res) => { - if res.is_ok() { - assert!(matches!( - self.get_migration_status() - .migration_out - .unwrap() - .state, - ApiMigrationState::Finish - )); - - self.shared_state - .queue_external_request(ExternalRequest::Stop) - .expect("can always queue a request to stop"); - } else { - assert!(matches!( - self.get_migration_status() - .migration_out - .unwrap() - .state, - ApiMigrationState::Error - )); - - if self.paused { - self.resume(); - self.publish_steady_state( - ApiInstanceState::Running, - ); - } - } - - break; - } - MigrateTaskEvent::Command(cmd) => match cmd { - MigrateSourceCommand::UpdateState(state) => { - self.set_migration_state( - MigrateRole::Source, - migration_id, - state, - ); - } - MigrateSourceCommand::Pause => { - self.pause(); - response_tx - .blocking_send(MigrateSourceResponse::Pause(Ok(()))) - .unwrap(); - } - }, - } - } - } - - async fn next_migrate_task_event( - task: &mut tokio::task::JoinHandle>, - command_rx: &mut tokio::sync::mpsc::Receiver, - log: &Logger, - ) -> MigrateTaskEvent { - if let Some(cmd) = command_rx.recv().await { - return MigrateTaskEvent::Command(cmd); + self.pause().await; } - // The sender side of the command channel is dropped, which means the - // migration task is exiting. Wait for it to finish and snag its result. - match task.await { - Ok(res) => { - info!(log, "Migration source task exited: {:?}", res); - MigrateTaskEvent::TaskExited(res) - } - Err(join_err) => { - if join_err.is_cancelled() { - panic!("Migration task canceled"); - } else { - panic!( - "Migration task panicked: {:?}", - join_err.into_panic() - ); - } - } - } + self.vcpu_tasks().exit_all(); + self.vm_lifecycle().halt_devices(); + self.publish_steady_state(InstanceState::Stopped); } - fn pause(&mut self) { + async fn pause(&mut self) { assert!(!self.paused); - probes::state_driver_pause!(|| ()); - self.vcpu_tasks.pause_all(); - self.controller.pause_devices(); - self.controller.pause_vm(); + self.vcpu_tasks().pause_all(); + self.vm_lifecycle().pause_devices().await; + self.vm_lifecycle().pause_vm(); self.paused = true; } fn resume(&mut self) { assert!(self.paused); - probes::state_driver_resume!(|| ()); - self.controller.resume_vm(); - self.controller.resume_devices(); - self.vcpu_tasks.resume_all(); + self.vm_lifecycle().resume_vm(); + self.vm_lifecycle().resume_devices(); + self.vcpu_tasks().resume_all(); self.paused = false; } - fn reset_vcpus(&self) { - self.vcpu_tasks.new_generation(); - self.controller.reset_vcpu_state(); + fn reset_vcpus(&mut self) { + self.vcpu_tasks().new_generation(); + self.vm_lifecycle.as_ref().unwrap().reset_vcpu_state(); } - fn publish_steady_state(&mut self, state: ApiInstanceState) { + fn publish_steady_state(&mut self, state: InstanceState) { let change = match state { - ApiInstanceState::Running => { - request_queue::InstanceStateChange::StartedRunning + InstanceState::Running => { + super::request_queue::InstanceStateChange::StartedRunning } - ApiInstanceState::Stopped => { - request_queue::InstanceStateChange::Stopped + InstanceState::Stopped => { + super::request_queue::InstanceStateChange::Stopped } - ApiInstanceState::Failed => { - request_queue::InstanceStateChange::Failed + InstanceState::Failed => { + super::request_queue::InstanceStateChange::Failed } _ => panic!( "Called publish_steady_state on non-terminal state {:?}", @@ -656,729 +591,15 @@ where ), }; - 
self.notify_request_queue(change); - self.set_instance_state(state); - } - - fn notify_request_queue( - &self, - queue_change: request_queue::InstanceStateChange, - ) { - self.shared_state - .inner - .lock() - .unwrap() - .external_request_queue - .notify_instance_state_change(queue_change); - } -} - -#[cfg(test)] -mod test { - use anyhow::bail; - use mockall::Sequence; - - use super::*; - use crate::vcpu_tasks::MockVcpuTaskController; - use crate::vm::MockStateDriverVmController; - - struct TestStateDriver { - driver: - StateDriver, - state_rx: tokio::sync::watch::Receiver, - } - - impl TestStateDriver { - fn api_state(&self) -> ApiInstanceState { - self.state_rx.borrow().state - } - } - - struct TestObjects { - vm_ctrl: MockStateDriverVmController, - vcpu_ctrl: MockVcpuTaskController, - shared_state: Arc, - } - - fn make_state_driver(objects: TestObjects) -> TestStateDriver { - let logger = slog::Logger::root(slog::Discard, slog::o!()); - let (state_tx, state_rx) = - tokio::sync::watch::channel(ApiMonitoredState { - gen: 0, - state: ApiInstanceState::Creating, - migration: ApiMigrateStatusResponse { - migration_in: None, - migration_out: None, - }, - }); - - TestStateDriver { - driver: StateDriver::new( - tokio::runtime::Handle::current(), - Arc::new(objects.vm_ctrl), - objects.shared_state.clone(), - objects.vcpu_ctrl, - logger, - state_tx, - ), - state_rx, - } - } - - /// Generates default mocks for the VM controller and vCPU task controller - /// that accept unlimited requests to read state. - fn make_default_mocks() -> TestObjects { - let logger = slog::Logger::root(slog::Discard, slog::o!()); - let vm_ctrl = MockStateDriverVmController::new(); - let vcpu_ctrl = MockVcpuTaskController::new(); - TestObjects { - vm_ctrl, - vcpu_ctrl, - shared_state: Arc::new(SharedVmState::new(&logger)), - } - } - - fn add_reboot_expectations( - vm_ctrl: &mut MockStateDriverVmController, - vcpu_ctrl: &mut MockVcpuTaskController, - ) { - // The reboot process requires careful ordering of steps to make sure - // the VM's vCPUs are put into the correct state when the machine starts - // up. - let mut seq = Sequence::new(); - - // First, reboot has to pause everything. It doesn't actually matter - // whether vCPUs or devices pause first, but there's no way to specify - // that these events must be sequenced before other expectations but - // have no ordering with respect to each other. - vcpu_ctrl - .expect_pause_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_pause_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - - // The devices and--importantly--the bhyve VM itself must be reset - // before resetting any vCPU state (so that bhyve will accept the ioctls - // sent to the vCPUs during the reset process). - vm_ctrl - .expect_reset_devices_and_machine() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vcpu_ctrl - .expect_new_generation() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_reset_vcpu_state() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - - // Entities and vCPUs can technically be resumed in either order, but - // resuming devices first allows them to be ready when the vCPUs start - // creating work for them to do. 
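// The mockall Sequence pattern used throughout these (removed) tests, in
// miniature: expectations registered against one Sequence must be satisfied
// in that order, which is how the tests pin pause-before-reset-before-resume.
// (MockThing below is generated from an illustrative mock!, not a type from
// this codebase.)
use mockall::{mock, Sequence};

mock! {
    pub Thing {
        fn pause(&self);
        fn resume(&self);
    }
}

fn expect_pause_then_resume(thing: &mut MockThing) {
    let mut seq = Sequence::new();
    thing
        .expect_pause()
        .times(1)
        .in_sequence(&mut seq)
        .returning(|| ());
    thing
        .expect_resume()
        .times(1)
        .in_sequence(&mut seq)
        .returning(|| ());
}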
- vm_ctrl - .expect_resume_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vcpu_ctrl - .expect_resume_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - } - - #[tokio::test] - async fn guest_triple_fault_reboots() { - let mut test_objects = make_default_mocks(); - - add_reboot_expectations( - &mut test_objects.vm_ctrl, - &mut test_objects.vcpu_ctrl, - ); - let mut driver = make_state_driver(test_objects); - driver.driver.handle_event(StateDriverEvent::Guest( - GuestEvent::VcpuSuspendTripleFault( - 0, - std::time::Duration::default(), - ), - )); - - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - } - - #[tokio::test] - async fn guest_chipset_reset_reboots() { - let mut test_objects = make_default_mocks(); - - add_reboot_expectations( - &mut test_objects.vm_ctrl, - &mut test_objects.vcpu_ctrl, - ); - let mut driver = make_state_driver(test_objects); - driver - .driver - .handle_event(StateDriverEvent::Guest(GuestEvent::ChipsetReset)); - - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - } - - #[tokio::test] - async fn start_from_cold_boot() { - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - let mut seq = Sequence::new(); - vcpu_ctrl - .expect_new_generation() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_reset_vcpu_state() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_start_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| Ok(())); - vcpu_ctrl - .expect_resume_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - - let mut driver = make_state_driver(test_objects); - driver - .driver - .handle_event(StateDriverEvent::External(ExternalRequest::Start)); - - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - } - - #[tokio::test] - async fn device_start_failure_causes_instance_failure() { - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - let mut seq = Sequence::new(); - vcpu_ctrl - .expect_new_generation() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_reset_vcpu_state() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_start_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| bail!("injected failure into start_devices!")); - - let mut driver = make_state_driver(test_objects); - - // Failure allows the instance to be preserved for debugging. 
- assert_eq!( - driver.driver.handle_event(StateDriverEvent::External( - ExternalRequest::Start - )), - HandleEventOutcome::Continue - ); - - assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); - } - - #[tokio::test] - async fn devices_pause_before_halting() { - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - let mut seq = Sequence::new(); - vcpu_ctrl - .expect_pause_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_pause_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_pause_vm() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vcpu_ctrl - .expect_exit_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_halt_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - - let mut driver = make_state_driver(test_objects); - driver - .driver - .handle_event(StateDriverEvent::External(ExternalRequest::Stop)); - - assert!(matches!(driver.api_state(), ApiInstanceState::Stopped)); - } - - #[tokio::test] - async fn devices_pause_once_when_halting_after_migration_out() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, mut response_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - // This test will simulate a migration out (with a pause command), then - // order the state driver to halt. This should produce exactly one set - // of pause commands and one set of halt commands with no resume - // commands. - vm_ctrl.expect_pause_devices().times(1).returning(|| ()); - vcpu_ctrl.expect_pause_all().times(1).returning(|| ()); - vcpu_ctrl.expect_exit_all().times(1).returning(|| ()); - vm_ctrl.expect_halt_devices().times(1).returning(|| ()); - vm_ctrl.expect_resume_devices().never(); - vcpu_ctrl.expect_resume_all().never(); - vm_ctrl.expect_pause_vm().times(1).returning(|| ()); - vm_ctrl.expect_resume_vm().never(); - - let mut driver = make_state_driver(test_objects); - - // The state driver expects to run on an OS thread outside the async - // runtime so that it can call `block_on` to wait for messages from the - // migration task. - let hdl = std::thread::spawn(move || { - driver.driver.handle_event(StateDriverEvent::External( - ExternalRequest::MigrateAsSource { - migration_id, - task: migrate_task, - start_tx, - command_rx, - response_tx, - }, - )); - - // Return the driver (which has the mocks attached) when the thread - // is joined so the test can continue using it. - driver - }); - - // Simulate a pause and the successful completion of migration. - command_tx.send(MigrateSourceCommand::Pause).await.unwrap(); - let resp = response_rx.recv().await.unwrap(); - assert!(matches!(resp, MigrateSourceResponse::Pause(Ok(())))); - command_tx - .send(MigrateSourceCommand::UpdateState(ApiMigrationState::Finish)) - .await - .unwrap(); - - drop(command_tx); - task_exit_tx.send(Ok(())).unwrap(); - - // Wait for the call to `handle_event` to return before tearing anything - // else down. 
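// The join dance used by these tests, factored into a sketch: the driver
// runs on a plain OS thread (it calls block_on internally), so an async test
// joins it via spawn_blocking to avoid stalling the runtime, exactly as the
// line below does inline. (The helper name is illustrative.)
async fn join_driver_thread<T: Send + 'static>(
    hdl: std::thread::JoinHandle<T>,
) -> T {
    tokio::task::spawn_blocking(move || hdl.join().unwrap())
        .await
        .unwrap()
}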
- driver = tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - // The migration should appear to have finished. The state driver will - // queue a "stop" command to itself in this case, but because the driver - // is not directly processing the queue here, the test has to issue this - // call itself. - assert_eq!( - driver.driver.get_migration_status().migration_out.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Finish - } - ); - - driver - .driver - .handle_event(StateDriverEvent::External(ExternalRequest::Stop)); - - assert!(matches!(driver.api_state(), ApiInstanceState::Stopped)); - } - - #[tokio::test] - async fn paused_vm_resumes_after_failed_migration_out() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, mut response_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - // This test will simulate a migration out up through pausing the - // source, then fail migration. This should pause and resume all the - // devices and the vCPUs. - vm_ctrl.expect_pause_devices().times(1).returning(|| ()); - vm_ctrl.expect_resume_devices().times(1).returning(|| ()); - vcpu_ctrl.expect_pause_all().times(1).returning(|| ()); - vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); - - // VMM will be paused once prior to exporting state, and then resumed - // afterwards when the migration fails. - let mut pause_seq = Sequence::new(); - vm_ctrl - .expect_pause_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - vm_ctrl - .expect_resume_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - - let mut driver = make_state_driver(test_objects); - let hdl = std::thread::spawn(move || { - let outcome = driver.driver.handle_event( - StateDriverEvent::External(ExternalRequest::MigrateAsSource { - migration_id, - task: migrate_task, - start_tx, - command_rx, - response_tx, - }), - ); - - (driver, outcome) - }); - - // Simulate a successful pause. - command_tx.send(MigrateSourceCommand::Pause).await.unwrap(); - let resp = response_rx.recv().await.unwrap(); - assert!(matches!(resp, MigrateSourceResponse::Pause(Ok(())))); - - // Simulate failure. The migration protocol must both update the state - // to Error and make the task return `Err`. - command_tx - .send(MigrateSourceCommand::UpdateState(ApiMigrationState::Error)) - .await - .unwrap(); - drop(command_tx); - task_exit_tx.send(Err(MigrateError::UnexpectedMessage)).unwrap(); - - // Wait for the call to `handle_event` to return. - let (driver, outcome) = - tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - // The VM should be running and the state driver should continue - // operating normally. 
- assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - assert_eq!(outcome, HandleEventOutcome::Continue); - assert_eq!( - driver.driver.get_migration_status().migration_out.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Error - } - ); - } - - #[tokio::test] - async fn vm_starts_after_migration_in() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); - vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); - vm_ctrl.expect_start_devices().times(1).returning(|| Ok(())); - vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); - - let mut pause_seq = Sequence::new(); - vm_ctrl - .expect_pause_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - vm_ctrl - .expect_resume_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - - let mut driver = make_state_driver(test_objects); - - // The state driver expects to run on an OS thread outside the async - // runtime so that it can call `block_on` to wait for messages from the - // migration task. - let hdl = std::thread::spawn(move || { - driver.driver.handle_event(StateDriverEvent::External( - ExternalRequest::MigrateAsTarget { - migration_id, - task: migrate_task, - start_tx, - command_rx, - }, - )); - - driver - }); - - // Explicitly drop the command channel to signal to the driver that - // the migration task is completing. - command_tx - .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Finish)) - .await - .unwrap(); - drop(command_tx); - task_exit_tx.send(Ok(())).unwrap(); - - // Wait for the call to `handle_event` to return before tearing anything - // else down. - let driver = tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - assert_eq!( - driver.driver.get_migration_status().migration_in.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Finish - } - ); - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - } - - #[tokio::test] - async fn failed_migration_in_fails_instance() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); - vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); - vm_ctrl.expect_pause_vm().times(1).returning(|| ()); - vm_ctrl.expect_resume_vm().times(1).returning(|| ()); - let mut driver = make_state_driver(test_objects); - - // The state driver expects to run on an OS thread outside the async - // runtime so that it can call `block_on` to wait for messages from the - // migration task. 
- let hdl = std::thread::spawn(move || { - let outcome = driver.driver.handle_event( - StateDriverEvent::External(ExternalRequest::MigrateAsTarget { - migration_id, - task: migrate_task, - start_tx, - command_rx, - }), - ); - - (driver, outcome) - }); - - // The migration task is required to update the migration state to - // "Error" before exiting when migration fails. - command_tx - .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Error)) - .await - .unwrap(); - drop(command_tx); - task_exit_tx.send(Err(MigrateError::UnexpectedMessage)).unwrap(); - - // Wait for the call to `handle_event` to return. - let (driver, outcome) = - tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - // The migration should appear to have failed, but the VM should be - // preserved for debugging. - assert_eq!(outcome, HandleEventOutcome::Continue); - assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); - assert_eq!( - driver.driver.get_migration_status().migration_in.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Error - } - ); + self.input_queue.notify_instance_state_change(change); + self.update_external_state(ExternalStateUpdate::Instance(state)); } - #[tokio::test] - async fn failed_vm_start_after_migration_in_fails_instance() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); - vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); - - let mut pause_seq = Sequence::new(); - vm_ctrl - .expect_pause_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - vm_ctrl - .expect_resume_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - - vm_ctrl - .expect_start_devices() - .times(1) - .returning(|| bail!("injected failure into start_devices!")); - - let mut driver = make_state_driver(test_objects); - - // The state driver expects to run on an OS thread outside the async - // runtime so that it can call `block_on` to wait for messages from the - // migration task. - let hdl = std::thread::spawn(move || { - let outcome = driver.driver.handle_event( - StateDriverEvent::External(ExternalRequest::MigrateAsTarget { - migration_id, - task: migrate_task, - start_tx, - command_rx, - }), - ); - - (driver, outcome) - }); - - // Explicitly drop the command channel to signal to the driver that - // the migration task is completing. - command_tx - .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Finish)) - .await - .unwrap(); - drop(command_tx); - task_exit_tx.send(Ok(())).unwrap(); - - // Wait for the call to `handle_event` to return. - let (driver, outcome) = - tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - // The instance should have failed, but should also be preserved for - // debugging. - assert_eq!(outcome, HandleEventOutcome::Continue); - assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); - - // The migration has still succeeded in this case. 
- assert_eq!( - driver.driver.get_migration_status().migration_in.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Finish - } - ); + fn vcpu_tasks(&mut self) -> &mut dyn VcpuTaskController { + self.vcpu_tasks.as_mut().unwrap().as_mut() } - #[tokio::test] - async fn start_vm_after_migration_in_does_not_publish_starting_state() { - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - // A call to start a VM after a successful migration should start vCPUs - // and devices without resetting anything. - vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); - vm_ctrl.expect_start_devices().times(1).returning(|| Ok(())); - - // As noted below, the instance state is being magicked directly into a - // `Migrating` state, rather than executing the logic which would - // typically carry it there. As such, `pause_vm()` will not be called - // as part of setup. Since instance start _is_ being tested here, the - // `resume_vm()` call is expected. - vm_ctrl.expect_pause_vm().never(); - vm_ctrl.expect_resume_vm().times(1).returning(|| ()); - - // Skip the rigmarole of standing up a fake migration. Instead, just - // push the driver into the state it would have after a successful - // migration to appease the assertions in `start_vm`. - // - // Faking an entire migration, as in the previous tests, requires the - // state driver to run on its own worker thread. This is fine for tests - // that only want to examine state after the driver has finished an - // operation, but this test wants to test side effects of a specific - // part of the state driver's actions, which are tough to synchronize - // with when the driver is running on another thread. - let mut driver = make_state_driver(test_objects); - driver.driver.set_instance_state(ApiInstanceState::Migrating); - - // The driver starts in the Migrating state and should go directly to - // the Running state without passing through Starting. Because there's - // no way to guarantee that the test will see all intermediate states - // that `start_vm` publishes, instead assert that the final state of - // Running is correct and that the state generation only went up by 1 - // (implying that there were no intervening transitions). - let migrating_gen = driver.driver.api_state_tx.borrow().gen; - driver.driver.start_vm(VmStartReason::MigratedIn); - let new_state = driver.driver.api_state_tx.borrow().clone(); - assert!(matches!(new_state.state, ApiInstanceState::Running)); - assert_eq!(new_state.gen, migrating_gen + 1); + fn vm_lifecycle(&self) -> &dyn lifecycle_ops::VmLifecycle { + self.vm_lifecycle.as_ref().unwrap().as_ref() } } diff --git a/bin/propolis-server/src/lib/vm2/mod.rs b/bin/propolis-server/src/lib/vm2/mod.rs deleted file mode 100644 index 93f50de06..000000000 --- a/bin/propolis-server/src/lib/vm2/mod.rs +++ /dev/null @@ -1,225 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! This module implements the `Vm` wrapper type that encapsulates a single -//! instance on behalf of a Propolis server. 
- -use std::{ - collections::BTreeMap, - sync::{Arc, RwLock, RwLockReadGuard, Weak}, -}; - -use oximeter::types::ProducerRegistry; -use propolis::{ - hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart}, - vmm::Machine, -}; -use propolis_api_types::{ - instance_spec::v0::InstanceSpecV0, InstanceProperties, - InstanceStateMonitorResponse, -}; - -use crate::serial::Serial; - -pub(crate) mod guest_event; -mod lifecycle_ops; -mod migrate_commands; -mod request_queue; -mod state_driver; - -pub(crate) type LifecycleMap = - BTreeMap>; -pub(crate) type BlockBackendMap = - BTreeMap>; -pub(crate) type CrucibleBackendMap = - BTreeMap>; - -type InstanceStateTx = tokio::sync::watch::Sender< - propolis_api_types::InstanceStateMonitorResponse, ->; -type InstanceStateRx = tokio::sync::watch::Receiver< - propolis_api_types::InstanceStateMonitorResponse, ->; - -#[derive(Debug, thiserror::Error)] -pub(crate) enum VmError { - #[error("VM already initialized")] - AlreadyInitialized, -} - -/// The top-level VM wrapper type. Callers are expected to wrap this in an -/// `Arc`. -pub(crate) struct Vm { - /// A reference to the VM state machine. - state: RwLock, -} - -struct VmObjects { - machine: Machine, - lifecycle_components: LifecycleMap, - block_backends: BlockBackendMap, - crucible_backends: CrucibleBackendMap, - com1: Arc>, - framebuffer: Option>, - ps2ctrl: Arc, -} - -impl VmObjects { - fn for_each_device( - &self, - mut func: impl FnMut(&str, &Arc), - ) { - for (name, dev) in self.lifecycle_components.iter() { - func(name, dev); - } - } - - fn for_each_device_fallible( - &self, - mut func: impl FnMut( - &str, - &Arc, - ) -> std::result::Result<(), E>, - ) -> std::result::Result<(), E> { - for (name, dev) in self.lifecycle_components.iter() { - func(name, dev)?; - } - - Ok(()) - } -} - -/// The state stored in a [`Vm`] when there is an actual underlying virtual -/// machine. 
-pub(super) struct ActiveVm { - parent: Arc, - log: slog::Logger, - - state_driver_queue: Arc, - external_state_rx: InstanceStateRx, - - properties: InstanceProperties, - spec: InstanceSpecV0, - - objects: VmObjects, -} - -impl Drop for ActiveVm { - fn drop(&mut self) { - let mut guard = self.parent.state.write().unwrap(); - *guard = VmState::Defunct(DefunctVm { - external_state_rx: self.external_state_rx.clone(), - properties: self.properties.clone(), - spec: self.spec.clone(), - }); - } -} - -struct DefunctVm { - external_state_rx: InstanceStateRx, - properties: InstanceProperties, - spec: InstanceSpecV0, -} - -#[allow(clippy::large_enum_variant)] -enum VmState { - NoVm, - WaitingToStart, - Active(Weak), - Defunct(DefunctVm), -} - -pub(super) struct EnsureOptions { - pub toml_config: Arc, - pub use_reservoir: bool, - pub oximeter_registry: Option, - pub nexus_client: Option, -} - -impl Vm { - pub fn new() -> Arc { - Arc::new(Self { state: RwLock::new(VmState::NoVm) }) - } - - fn vm_state(&self) -> RwLockReadGuard<'_, VmState> { - self.state.read().unwrap() - } - - pub(super) fn active_vm(&self) -> Option> { - let guard = self.vm_state(); - if let VmState::Active(weak) = &*guard { - weak.upgrade() - } else { - None - } - } - - fn start_failed(&self) { - let mut guard = self.state.write().unwrap(); - match *guard { - VmState::WaitingToStart => *guard = VmState::NoVm, - _ => unreachable!( - "only a starting VM's state worker calls start_failed" - ), - } - } - - fn make_active(&self, active: Arc) { - let mut guard = self.state.write().unwrap(); - let old = std::mem::replace(&mut *guard, VmState::NoVm); - match old { - VmState::WaitingToStart => { - *guard = VmState::Active(Arc::downgrade(&active)) - } - _ => unreachable!( - "only a starting VM's state worker calls make_active" - ), - } - } - - pub async fn ensure( - self: &Arc, - log: slog::Logger, - ensure_request: propolis_api_types::InstanceSpecEnsureRequest, - options: EnsureOptions, - ) -> anyhow::Result<(), VmError> { - // Take the lock for writing, since in the common case this call will be - // creating a new VM and there's no easy way to upgrade from a reader - // lock to a writer lock. - let mut guard = self.state.write().unwrap(); - - if matches!(*guard, VmState::WaitingToStart | VmState::Active(_)) { - return Err(VmError::AlreadyInitialized); - } - - *guard = VmState::WaitingToStart; - - let (external_tx, external_rx) = - tokio::sync::watch::channel(InstanceStateMonitorResponse { - gen: 1, - state: propolis_api_types::InstanceState::Starting, - migration: propolis_api_types::InstanceMigrateStatusResponse { - migration_in: None, - migration_out: None, - }, - }); - - let input_queue = state_driver::InputQueue::new( - log.new(slog::o!("component" => "vmm_request_queue")), - ); - - let state_driver = state_driver::StateDriver::new( - log, - self.clone(), - Arc::new(input_queue), - external_tx, - ); - - tokio::spawn(async move { - state_driver.run(ensure_request, options, external_rx).await - }); - - Ok(()) - } -} diff --git a/bin/propolis-server/src/lib/vm2/state_driver.rs b/bin/propolis-server/src/lib/vm2/state_driver.rs deleted file mode 100644 index 8e5c6d9dd..000000000 --- a/bin/propolis-server/src/lib/vm2/state_driver.rs +++ /dev/null @@ -1,596 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! 
It drives the state vroom vroom - -use std::{ - sync::{Arc, Condvar, Mutex}, - time::Duration, -}; - -use propolis_api_types::{ - instance_spec::VersionedInstanceSpec, InstanceProperties, InstanceState, -}; -use slog::info; -use uuid::Uuid; - -use crate::{ - initializer::{ - build_instance, MachineInitializer, MachineInitializerState, - }, - migrate::MigrateRole, - vcpu_tasks::VcpuTaskController, -}; - -use super::{ - guest_event::{self, GuestEvent}, - lifecycle_ops, -}; - -struct MigrationStateUpdate { - state: propolis_api_types::MigrationState, - id: Uuid, - role: MigrateRole, -} - -impl MigrationStateUpdate { - fn apply_to( - self, - old: propolis_api_types::InstanceMigrateStatusResponse, - ) -> propolis_api_types::InstanceMigrateStatusResponse { - let new = propolis_api_types::InstanceMigrationStatus { - id: self.id, - state: self.state, - }; - match self.role { - MigrateRole::Destination => { - propolis_api_types::InstanceMigrateStatusResponse { - migration_in: Some(new), - migration_out: old.migration_out, - } - } - MigrateRole::Source => { - propolis_api_types::InstanceMigrateStatusResponse { - migration_in: old.migration_in, - migration_out: Some(new), - } - } - } - } -} - -enum ExternalStateUpdate { - Instance(InstanceState), - Migration(MigrationStateUpdate), - Complete(InstanceState, MigrationStateUpdate), -} - -#[derive(Debug, PartialEq, Eq)] -enum HandleEventOutcome { - Continue, - Exit, -} - -#[derive(Debug)] -enum InputQueueEvent { - ExternalRequest(super::request_queue::ExternalRequest), - GuestEvent(GuestEvent), -} - -struct InputQueueInner { - external_requests: super::request_queue::ExternalRequestQueue, - guest_events: super::guest_event::GuestEventQueue, -} - -impl InputQueueInner { - fn new(log: slog::Logger) -> Self { - Self { - external_requests: super::request_queue::ExternalRequestQueue::new( - log, - ), - guest_events: super::guest_event::GuestEventQueue::default(), - } - } -} - -pub(super) struct InputQueue { - inner: Mutex, - cv: Condvar, -} - -impl InputQueue { - pub(super) fn new(log: slog::Logger) -> Self { - Self { - inner: Mutex::new(InputQueueInner::new(log)), - cv: Condvar::new(), - } - } - - fn wait_for_next_event(&self) -> InputQueueEvent { - let guard = self.inner.lock().unwrap(); - let mut guard = self - .cv - .wait_while(guard, |i| { - i.external_requests.is_empty() && i.guest_events.is_empty() - }) - .unwrap(); - - if let Some(guest_event) = guard.guest_events.pop_front() { - InputQueueEvent::GuestEvent(guest_event) - } else { - InputQueueEvent::ExternalRequest( - guard.external_requests.pop_front().unwrap(), - ) - } - } - - fn notify_instance_state_change( - &self, - state: super::request_queue::InstanceStateChange, - ) { - let mut guard = self.inner.lock().unwrap(); - guard.external_requests.notify_instance_state_change(state); - } -} - -impl guest_event::GuestEventHandler for InputQueue { - fn suspend_halt_event(&self, when: Duration) { - let mut guard = self.inner.lock().unwrap(); - if guard - .guest_events - .enqueue(guest_event::GuestEvent::VcpuSuspendHalt(when)) - { - self.cv.notify_all(); - } - } - - fn suspend_reset_event(&self, when: Duration) { - let mut guard = self.inner.lock().unwrap(); - if guard - .guest_events - .enqueue(guest_event::GuestEvent::VcpuSuspendReset(when)) - { - self.cv.notify_all(); - } - } - - fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration) { - let mut guard = self.inner.lock().unwrap(); - if guard.guest_events.enqueue( - guest_event::GuestEvent::VcpuSuspendTripleFault(vcpu_id, when), - ) { - 
self.cv.notify_all(); - } - } - - fn unhandled_vm_exit( - &self, - vcpu_id: i32, - exit: propolis::exits::VmExitKind, - ) { - panic!("vCPU {}: Unhandled VM exit: {:?}", vcpu_id, exit); - } - - fn io_error_event(&self, vcpu_id: i32, error: std::io::Error) { - panic!("vCPU {}: Unhandled vCPU error: {}", vcpu_id, error); - } -} - -impl guest_event::ChipsetEventHandler for InputQueue { - fn chipset_halt(&self) { - let mut guard = self.inner.lock().unwrap(); - if guard.guest_events.enqueue(guest_event::GuestEvent::ChipsetHalt) { - self.cv.notify_all(); - } - } - - fn chipset_reset(&self) { - let mut guard = self.inner.lock().unwrap(); - if guard.guest_events.enqueue(guest_event::GuestEvent::ChipsetReset) { - self.cv.notify_all(); - } - } -} - -/// The context for a VM state driver task. -pub(super) struct StateDriver { - log: slog::Logger, - parent_vm: Arc, - input_queue: Arc, - external_state_tx: super::InstanceStateTx, - paused: bool, - vcpu_tasks: Option>, - vm_lifecycle: Option>, - migration_src_state: crate::migrate::source::PersistentState, -} - -impl StateDriver { - pub(super) fn new( - log: slog::Logger, - vm: Arc, - input_queue: Arc, - external_state_tx: super::InstanceStateTx, - ) -> Self { - let log = log.new(slog::o!("component" => "state_driver")); - Self { - log, - parent_vm: vm, - input_queue, - external_state_tx, - paused: false, - vcpu_tasks: None, - vm_lifecycle: None, - migration_src_state: Default::default(), - } - } - - pub(super) async fn run( - mut self, - ensure_request: propolis_api_types::InstanceSpecEnsureRequest, - ensure_options: super::EnsureOptions, - external_state_rx: super::InstanceStateRx, - ) { - if let Ok(active) = self - .initialize_vm(ensure_request, ensure_options, external_state_rx) - .await - { - self.parent_vm.make_active(active.clone()); - self.vm_lifecycle = - Some(active as Arc); - } else { - // TODO(gjc) also publish that it failed. 
we're the only thing that - // has the external tx so need to do that from here - self.parent_vm.start_failed(); - return; - } - - self.run_loop().await; - } - - fn update_external_state(&mut self, state: ExternalStateUpdate) { - let (instance_state, migration_state) = match state { - ExternalStateUpdate::Instance(i) => (Some(i), None), - ExternalStateUpdate::Migration(m) => (None, Some(m)), - ExternalStateUpdate::Complete(i, m) => (Some(i), Some(m)), - }; - - let propolis_api_types::InstanceStateMonitorResponse { - state: old_instance, - migration: old_migration, - gen: old_gen, - } = self.external_state_tx.borrow().clone(); - - let state = instance_state.unwrap_or(old_instance); - let migration = if let Some(migration_state) = migration_state { - migration_state.apply_to(old_migration) - } else { - old_migration - }; - - let gen = old_gen + 1; - info!(self.log, "publishing new instance state"; - "gen" => gen, - "state" => ?state, - "migration" => ?migration); - - let _ = self.external_state_tx.send( - propolis_api_types::InstanceStateMonitorResponse { - gen, - state, - migration, - }, - ); - } - - async fn initialize_vm( - &mut self, - request: propolis_api_types::InstanceSpecEnsureRequest, - options: super::EnsureOptions, - external_state_rx: super::InstanceStateRx, - ) -> anyhow::Result> { - let active_vm = match request.migrate { - None => { - let vm_objects = self - .initialize_vm_from_spec( - &request.properties, - &request.instance_spec, - options, - ) - .await?; - let VersionedInstanceSpec::V0(v0_spec) = request.instance_spec; - let active_vm = Arc::new(super::ActiveVm { - parent: self.parent_vm.clone(), - log: self.log.clone(), - state_driver_queue: self.input_queue.clone(), - external_state_rx, - properties: request.properties, - spec: v0_spec, - objects: vm_objects, - }); - - active_vm - } - Some(_migrate_request) => todo!("gjc"), - }; - - Ok(active_vm) - } - - /// Initializes all of the components of a VM from the supplied - /// specification. - async fn initialize_vm_from_spec( - &mut self, - properties: &InstanceProperties, - spec: &VersionedInstanceSpec, - options: super::EnsureOptions, - ) -> anyhow::Result { - info!(self.log, "initializing new VM"; - "spec" => #?spec, - "properties" => #?properties, - "use_reservoir" => options.use_reservoir, - "bootrom" => %options.toml_config.bootrom.display()); - - let vmm_log = self.log.new(slog::o!("component" => "vmm")); - - // Set up the 'shell' instance into which the rest of this routine will - // add components. 
- let VersionedInstanceSpec::V0(v0_spec) = &spec; - let machine = build_instance( - &properties.vm_name(), - v0_spec, - options.use_reservoir, - vmm_log, - )?; - - let mut init = MachineInitializer { - log: self.log.clone(), - machine: &machine, - devices: Default::default(), - block_backends: Default::default(), - crucible_backends: Default::default(), - spec: &v0_spec, - properties: &properties, - toml_config: &options.toml_config, - producer_registry: options.oximeter_registry, - state: MachineInitializerState::default(), - }; - - init.initialize_rom(options.toml_config.bootrom.as_path())?; - let chipset = init.initialize_chipset( - &(self.input_queue.clone() - as Arc), - )?; - - init.initialize_rtc(&chipset)?; - init.initialize_hpet()?; - - let com1 = Arc::new(init.initialize_uart(&chipset)?); - let ps2ctrl = init.initialize_ps2(&chipset)?; - init.initialize_qemu_debug_port()?; - init.initialize_qemu_pvpanic(properties.into())?; - init.initialize_network_devices(&chipset)?; - - #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&options.toml_config.devices)?; - #[cfg(feature = "omicron-build")] - info!( - self.log, - "`omicron-build` feature enabled, ignoring any test devices" - ); - - #[cfg(feature = "falcon")] - init.initialize_softnpu_ports(&chipset)?; - #[cfg(feature = "falcon")] - init.initialize_9pfs(&chipset)?; - - init.initialize_storage_devices(&chipset, options.nexus_client).await?; - let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; - init.initialize_cpus()?; - let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( - &machine, - self.input_queue.clone() - as Arc, - self.log.new(slog::o!("component" => "vcpu_tasks")), - )?); - - let MachineInitializer { - devices, - block_backends, - crucible_backends, - .. 
- } = init; - - self.vcpu_tasks = Some(vcpu_tasks as Box); - Ok(super::VmObjects { - machine, - lifecycle_components: devices, - block_backends, - crucible_backends, - com1, - framebuffer: Some(ramfb), - ps2ctrl, - }) - } - - async fn run_loop(mut self) { - info!(self.log, "state driver launched"); - - loop { - let event = self.input_queue.wait_for_next_event(); - info!(self.log, "state driver handling event"; "event" => ?event); - - let outcome = match event { - InputQueueEvent::ExternalRequest(req) => { - self.handle_external_request(req).await - } - InputQueueEvent::GuestEvent(event) => { - self.handle_guest_event(event).await - } - }; - - info!(self.log, "state driver handled event"; "outcome" => ?outcome); - if outcome == HandleEventOutcome::Exit { - break; - } - } - - info!(self.log, "state driver exiting"); - } - - async fn handle_guest_event( - &mut self, - event: GuestEvent, - ) -> HandleEventOutcome { - match event { - GuestEvent::VcpuSuspendHalt(_when) => { - info!(self.log, "Halting due to VM suspend event",); - self.do_halt().await; - HandleEventOutcome::Exit - } - GuestEvent::VcpuSuspendReset(_when) => { - info!(self.log, "Resetting due to VM suspend event"); - self.do_reboot().await; - HandleEventOutcome::Continue - } - GuestEvent::VcpuSuspendTripleFault(vcpu_id, _when) => { - info!( - self.log, - "Resetting due to triple fault on vCPU {}", vcpu_id - ); - self.do_reboot().await; - HandleEventOutcome::Continue - } - GuestEvent::ChipsetHalt => { - info!(self.log, "Halting due to chipset-driven halt"); - self.do_halt().await; - HandleEventOutcome::Exit - } - GuestEvent::ChipsetReset => { - info!(self.log, "Resetting due to chipset-driven reset"); - self.do_reboot().await; - HandleEventOutcome::Continue - } - } - } - - async fn handle_external_request( - &mut self, - request: super::request_queue::ExternalRequest, - ) -> HandleEventOutcome { - match request { - super::request_queue::ExternalRequest::MigrateAsSource { - .. - } => todo!("gjc"), - super::request_queue::ExternalRequest::Reboot => { - self.do_reboot(); - HandleEventOutcome::Continue - } - super::request_queue::ExternalRequest::Stop => { - self.do_halt(); - HandleEventOutcome::Exit - } - } - } - - async fn do_reboot(&mut self) { - info!(self.log, "resetting instance"); - - self.update_external_state(ExternalStateUpdate::Instance( - InstanceState::Rebooting, - )); - - // Reboot is implemented as a pause -> reset -> resume transition. - // - // First, pause the vCPUs and all devices so no partially-completed - // work is present. - self.vcpu_tasks().pause_all(); - self.vm_lifecycle().pause_devices().await; - - // Reset all entities and the VM's bhyve state, then reset the vCPUs. - // The vCPU reset must come after the bhyve reset. - self.vm_lifecycle().reset_devices_and_machine(); - self.reset_vcpus(); - - // Resume devices so they're ready to do more work, then resume vCPUs. - self.vm_lifecycle().resume_devices(); - self.vcpu_tasks().resume_all(); - - // Notify other consumers that the instance successfully rebooted and is - // now back to Running. - self.input_queue.notify_instance_state_change( - super::request_queue::InstanceStateChange::Rebooted, - ); - self.update_external_state(ExternalStateUpdate::Instance( - InstanceState::Running, - )); - } - - async fn do_halt(&mut self) { - info!(self.log, "stopping instance"); - self.update_external_state(ExternalStateUpdate::Instance( - InstanceState::Stopping, - )); - - // Entities expect to be paused before being halted. 
Note that the VM - // may be paused already if it is being torn down after a successful - // migration out. - if !self.paused { - self.pause().await; - } - - self.vcpu_tasks().exit_all(); - self.vm_lifecycle().halt_devices(); - self.publish_steady_state(InstanceState::Stopped); - } - - async fn pause(&mut self) { - assert!(!self.paused); - self.vcpu_tasks().pause_all(); - self.vm_lifecycle().pause_devices().await; - self.vm_lifecycle().pause_vm(); - self.paused = true; - } - - fn resume(&mut self) { - assert!(self.paused); - self.vm_lifecycle().resume_vm(); - self.vm_lifecycle().resume_devices(); - self.vcpu_tasks().resume_all(); - self.paused = false; - } - - fn reset_vcpus(&mut self) { - self.vcpu_tasks().new_generation(); - self.vm_lifecycle.as_ref().unwrap().reset_vcpu_state(); - } - - fn publish_steady_state(&mut self, state: InstanceState) { - let change = match state { - InstanceState::Running => { - super::request_queue::InstanceStateChange::StartedRunning - } - InstanceState::Stopped => { - super::request_queue::InstanceStateChange::Stopped - } - InstanceState::Failed => { - super::request_queue::InstanceStateChange::Failed - } - _ => panic!( - "Called publish_steady_state on non-terminal state {:?}", - state - ), - }; - - self.input_queue.notify_instance_state_change(change); - self.update_external_state(ExternalStateUpdate::Instance(state)); - } - - fn vcpu_tasks(&mut self) -> &mut dyn VcpuTaskController { - self.vcpu_tasks.as_mut().unwrap().as_mut() - } - - fn vm_lifecycle(&self) -> &dyn lifecycle_ops::VmLifecycle { - self.vm_lifecycle.as_ref().unwrap().as_ref() - } -} diff --git a/bin/propolis-server/src/lib/vm_old/mod.rs b/bin/propolis-server/src/lib/vm_old/mod.rs new file mode 100644 index 000000000..76363fa2a --- /dev/null +++ b/bin/propolis-server/src/lib/vm_old/mod.rs @@ -0,0 +1,1138 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Implements the VM controller: the public interface to a single Propolis +//! instance. +//! +//! The VM controller serves two purposes. First, it collects all of the objects +//! describing a single Propolis VM (the Propolis `Instance` itself, the +//! instance's spec, direct references to components in the instance, etc.). +//! Second, it records requests and events that affect how a VM moves through +//! the stages of its lifecycle, i.e. how and when it boots, reboots, migrates, +//! and stops. +//! +//! Each VM controller has a single "state driver" thread that processes +//! requests and events recorded by its controller and acts on the underlying +//! Propolis instance to move the VM into the appropriate states. Doing this +//! work on a single thread ensures that a VM can only undergo one state change +//! at a time, that there are no races to start/pause/resume/halt a VM's +//! components, and that there is a single source of truth as to a VM's current +//! state (and as to the steps that are required to move it to a different +//! state). Operations like live migration that require components to pause and +//! resume coordinate directly with the state driver thread. +//! +//! The VM controller's public API allows a Propolis Dropshot server to query a +//! VM's current state, to ask to change that state, and to obtain references to +//! objects in a VM as needed to handle other requests made of the server (e.g. +//! 
requests to connect to an instance's serial console or to take a disk +//! snapshot). The controller also implements traits that allow a VM's +//! components to raise events for the state driver to process (e.g. a request +//! from a VM's chipset to reboot or halt the VM). + +use crate::migrate; + +use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt}; +use std::{ + collections::{BTreeMap, VecDeque}, + fmt::Debug, + net::SocketAddr, + pin::Pin, + sync::{Arc, Condvar, Mutex, MutexGuard, Weak}, + task::{Context, Poll}, + thread::JoinHandle, + time::Duration, +}; + +use oximeter::types::ProducerRegistry; +use propolis::{ + hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart}, + vmm::Machine, +}; +use propolis_api_types::{ + instance_spec::VersionedInstanceSpec, + InstanceMigrateStatusResponse as ApiMigrateStatusResponse, + InstanceMigrationStatus as ApiMigrationStatus, InstanceProperties, + InstanceState as ApiInstanceState, + InstanceStateMonitorResponse as ApiMonitoredState, + InstanceStateRequested as ApiInstanceStateRequested, + MigrationState as ApiMigrationState, +}; +use slog::{debug, error, info, Logger}; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::oneshot; +use tokio_tungstenite::WebSocketStream; +use uuid::Uuid; + +use crate::{ + initializer::{ + build_instance, MachineInitializer, MachineInitializerState, + }, + migrate::{MigrateError, MigrateRole}, + serial::Serial, + server::{BlockBackendMap, CrucibleBackendMap, DeviceMap, StaticConfig}, + vm::request_queue::ExternalRequest, +}; + +use self::request_queue::{ExternalRequestQueue, RequestDeniedReason}; +pub use nexus_client::Client as NexusClient; + +mod request_queue; +mod state_driver; + +#[derive(Debug, Error)] +pub enum VmControllerError { + #[error("The requested operation requires an active instance")] + InstanceNotActive, + + #[error("The instance has a pending request to halt")] + InstanceHaltPending, + + #[error("Instance is already a migration source")] + AlreadyMigrationSource, + + #[error("Cannot request state {0:?} while migration is in progress")] + InvalidRequestForMigrationSource(ApiInstanceStateRequested), + + #[error("A migration into this instance is in progress")] + MigrationTargetInProgress, + + #[error("Another live migration into this instance already occurred")] + MigrationTargetPreviouslyCompleted, + + #[error("The most recent attempt to migrate into this instance failed")] + MigrationTargetFailed, + + #[error("Can't migrate into a running instance")] + TooLateToBeMigrationTarget, + + #[error("Failed to queue requested state change: {0}")] + StateChangeRequestDenied(#[from] request_queue::RequestDeniedReason), + + #[error("Migration protocol error: {0:?}")] + MigrationProtocolError(#[from] MigrateError), + + #[error("Failed to start vCPU workers")] + VcpuWorkerCreationFailed(#[from] super::vcpu_tasks::VcpuTaskError), + + #[error("Failed to create state worker: {0}")] + StateWorkerCreationFailed(std::io::Error), +} + +impl From for dropshot::HttpError { + fn from(vm_error: VmControllerError) -> Self { + use dropshot::HttpError; + match vm_error { + VmControllerError::AlreadyMigrationSource + | VmControllerError::InvalidRequestForMigrationSource(_) + | VmControllerError::MigrationTargetInProgress + | VmControllerError::MigrationTargetFailed + | VmControllerError::TooLateToBeMigrationTarget + | VmControllerError::StateChangeRequestDenied(_) + | VmControllerError::InstanceNotActive + | VmControllerError::InstanceHaltPending + | 
VmControllerError::MigrationTargetPreviouslyCompleted => {
+                HttpError::for_status(
+                    Some(format!("Instance operation failed: {}", vm_error)),
+                    http::status::StatusCode::FORBIDDEN,
+                )
+            }
+            VmControllerError::MigrationProtocolError(_)
+            | VmControllerError::VcpuWorkerCreationFailed(_)
+            | VmControllerError::StateWorkerCreationFailed(_) => {
+                HttpError::for_internal_error(format!(
+                    "Instance operation failed: {}",
+                    vm_error
+                ))
+            }
+        }
+    }
+}
+
+/// A collection of objects that describe an instance and references to that
+/// instance and its components.
+pub(crate) struct VmObjects {
+    /// The underlying Propolis `Machine` this controller is managing.
+    machine: Option<Machine>,
+
+    /// The instance properties supplied when this controller was created.
+    properties: InstanceProperties,
+
+    /// The instance spec used to create this controller's VM.
+    spec: tokio::sync::Mutex<VersionedInstanceSpec>,
+
+    /// Map of the emulated devices associated with the VM
+    devices: DeviceMap,
+
+    /// Map of the instance's active block backends.
+    block_backends: BlockBackendMap,
+
+    /// Map of the instance's active Crucible backends.
+    crucible_backends: CrucibleBackendMap,
+
+    /// A wrapper around the instance's first COM port, suitable for providing a
+    /// connection to a guest's serial console.
+    com1: Arc<Serial<LpcUart>>,
+
+    /// An optional reference to the guest's framebuffer.
+    framebuffer: Option<Arc<RamFb>>,
+
+    /// A reference to the guest's PS/2 controller.
+    ps2ctrl: Arc<PS2Ctrl>,
+
+    /// A notification receiver to which the state worker publishes the most
+    /// recent instance state information.
+    monitor_rx: tokio::sync::watch::Receiver<ApiMonitoredState>,
+}
+
+/// A message sent from a live migration destination task to update the
+/// externally visible state of the migration attempt.
+#[derive(Clone, Copy, Debug)]
+pub enum MigrateTargetCommand {
+    /// Update the externally-visible migration state.
+    UpdateState(ApiMigrationState),
+}
+
+/// A message sent from a live migration driver to the state worker, asking it
+/// to act on source instance components on the task's behalf.
+#[derive(Clone, Copy, Debug)]
+pub enum MigrateSourceCommand {
+    /// Update the externally-visible migration state.
+    UpdateState(ApiMigrationState),
+
+    /// Pause the instance's devices and CPUs.
+    Pause,
+}
+
+/// A message sent from the state worker to the live migration driver in
+/// response to a previous command.
+#[derive(Debug)]
+pub enum MigrateSourceResponse {
+    /// A request to pause completed with the attached result.
+    Pause(Result<(), std::io::Error>),
+}
+
+/// An event raised by a migration task that must be handled by the state
+/// worker.
+#[derive(Debug)]
+enum MigrateTaskEvent<T> {
+    /// The task completed with the associated result.
+    TaskExited(Result<(), MigrateError>),
+
+    /// The task sent a command requesting work.
+    Command(T),
+}
+
+/// An event raised by some component in the instance (e.g. a vCPU or the
+/// chipset) that the state worker must handle.
+///
+/// The vCPU-sourced events carry a time element (duration since VM boot) as
+/// emitted by the kernel vmm. This is used to deduplicate events when all
+/// vCPUs running in-kernel are kicked out for the suspend state.
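+///
+/// As a sketch of the deduplication this enables (using the
+/// `enqueue_guest_event` helper defined on `SharedVmState` below; the guest
+/// setup here is hypothetical): if every vCPU reports the same suspend
+/// timestamp, only one event survives.
+///
+/// ```ignore
+/// // Four vCPUs kicked out for the same halt at the same boot offset.
+/// let when = Duration::from_secs(42);
+/// for _ in 0..4 {
+///     shared_state.suspend_halt_event(when);
+/// }
+/// // The queue now holds exactly one VcpuSuspendHalt(when) entry, because
+/// // enqueue_guest_event drops events that are exact duplicates.
+/// ```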
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum GuestEvent {
+    /// VM entered halt state
+    VcpuSuspendHalt(Duration),
+    /// VM entered reboot state
+    VcpuSuspendReset(Duration),
+    /// vCPU encountered triple-fault
+    VcpuSuspendTripleFault(i32, Duration),
+    /// Chipset signaled halt condition
+    ChipsetHalt,
+    /// Chipset signaled reboot condition
+    ChipsetReset,
+}
+
+/// Shared instance state guarded by the controller's state mutex. This state is
+/// accessed from the controller API and the VM's state worker.
+#[derive(Debug)]
+struct SharedVmStateInner {
+    external_request_queue: ExternalRequestQueue,
+
+    /// The state worker's queue of unprocessed events from guest devices.
+    guest_event_queue: VecDeque<GuestEvent>,
+
+    /// The expected ID of the next live migration this instance will
+    /// participate in (either in or out). If this is `Some`, external callers
+    /// who query migration state will observe that a live migration is in
+    /// progress even if the state driver has yet to pick up the live migration
+    /// tasks from its queue.
+    pending_migration_id: Option<(Uuid, MigrateRole)>,
+}
+
+impl SharedVmStateInner {
+    fn new(parent_log: &Logger) -> Self {
+        let queue_log =
+            parent_log.new(slog::o!("component" => "external_request_queue"));
+        Self {
+            external_request_queue: ExternalRequestQueue::new(queue_log),
+            guest_event_queue: VecDeque::new(),
+            pending_migration_id: None,
+        }
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct SharedVmState {
+    inner: Mutex<SharedVmStateInner>,
+    cv: Condvar,
+}
+
+/// A VM controller: a wrapper around a Propolis instance that supplies the
+/// functions needed for the Propolis server to implement its own API.
+pub struct VmController {
+    /// A collection of objects that don't change once an instance is ensured:
+    /// the instance itself, a description of it, and convenience references to
+    /// some of its members (used to avoid rummaging through the instance's
+    /// inventory).
+    vm_objects: VmObjects,
+
+    /// A wrapper for the runtime state of this instance, managed by the state
+    /// worker thread. This also serves as a sink for hardware events (e.g. from
+    /// vCPUs and the chipset), so it is wrapped in an Arc so that it can be
+    /// shared with those events' sources.
+    worker_state: Arc<SharedVmState>,
+
+    /// A handle to the state worker thread for this instance.
+    worker_thread: Mutex<
+        Option<JoinHandle<tokio::sync::watch::Sender<ApiMonitoredState>>>,
+    >,
+
+    /// This controller's logger.
+    log: Logger,
+
+    /// A handle to a tokio runtime onto which this controller can spawn tasks
+    /// (e.g. migration tasks).
+    runtime_hdl: tokio::runtime::Handle,
+
+    /// Migration source state persisted across multiple migration attempts.
+    migration_src_state: Mutex<migrate::source::PersistentState>,
+
+    /// A weak reference to this controller, suitable for upgrading and passing
+    /// to tasks the controller spawns.
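+    ///
+    /// A sketch of the intended use (the task body is hypothetical): the
+    /// controller is built with `Arc::new_cyclic`, so a spawned task upgrades
+    /// this weak reference and holds a strong one only while it runs.
+    ///
+    /// ```ignore
+    /// let ctrl_for_task = self.this.upgrade().unwrap();
+    /// self.runtime_hdl.spawn(async move {
+    ///     // `ctrl_for_task: Arc<VmController>` keeps the controller
+    ///     // alive for the duration of this task.
+    /// });
+    /// ```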
+ this: Weak, +} + +impl SharedVmState { + fn new(parent_log: &Logger) -> Self { + Self { + inner: Mutex::new(SharedVmStateInner::new(parent_log)), + cv: Condvar::new(), + } + } + + fn queue_external_request( + &self, + request: ExternalRequest, + ) -> Result<(), RequestDeniedReason> { + let mut inner = self.inner.lock().unwrap(); + let result = inner.external_request_queue.try_queue(request); + if result.is_ok() { + self.cv.notify_one(); + } + result + } + + fn wait_for_next_event(&self) -> StateDriverEvent { + let guard = self.inner.lock().unwrap(); + let mut guard = self + .cv + .wait_while(guard, |i| { + i.external_request_queue.is_empty() + && i.guest_event_queue.is_empty() + }) + .unwrap(); + + if let Some(guest_event) = guard.guest_event_queue.pop_front() { + StateDriverEvent::Guest(guest_event) + } else { + StateDriverEvent::External( + guard.external_request_queue.pop_front().unwrap(), + ) + } + } + + /// Add a guest event to the queue, so long as it does not appear to be a + /// duplicate of an existing event. + fn enqueue_guest_event(&self, event: GuestEvent) { + let mut inner = self.inner.lock().unwrap(); + if !inner.guest_event_queue.iter().any(|ev| *ev == event) { + // Only queue event if nothing else in the queue is a direct match + inner.guest_event_queue.push_back(event); + self.cv.notify_one(); + } + } + + pub fn suspend_halt_event(&self, when: Duration) { + self.enqueue_guest_event(GuestEvent::VcpuSuspendHalt(when)); + } + + pub fn suspend_reset_event(&self, when: Duration) { + self.enqueue_guest_event(GuestEvent::VcpuSuspendReset(when)); + } + + pub fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration) { + self.enqueue_guest_event(GuestEvent::VcpuSuspendTripleFault( + vcpu_id, when, + )); + } + + pub fn unhandled_vm_exit( + &self, + vcpu_id: i32, + exit: propolis::exits::VmExitKind, + ) { + panic!("vCPU {}: Unhandled VM exit: {:?}", vcpu_id, exit); + } + + pub fn io_error_event(&self, vcpu_id: i32, error: std::io::Error) { + panic!("vCPU {}: Unhandled vCPU error: {}", vcpu_id, error); + } + + pub fn clear_pending_migration(&self) { + let mut inner = self.inner.lock().unwrap(); + inner.pending_migration_id = None; + } +} + +/// Functions called by a Propolis chipset to notify another component that an +/// event occurred. +pub trait ChipsetEventHandler: Send + Sync { + fn chipset_halt(&self); + fn chipset_reset(&self); +} + +impl ChipsetEventHandler for SharedVmState { + fn chipset_halt(&self) { + self.enqueue_guest_event(GuestEvent::ChipsetHalt); + } + + fn chipset_reset(&self) { + self.enqueue_guest_event(GuestEvent::ChipsetReset); + } +} + +impl VmController { + #[allow(clippy::too_many_arguments)] + pub fn new( + instance_spec: VersionedInstanceSpec, + properties: InstanceProperties, + &StaticConfig { vm: ref toml_config, use_reservoir, .. }: &StaticConfig, + producer_registry: Option, + nexus_client: Option, + log: Logger, + runtime_hdl: tokio::runtime::Handle, + stop_ch: oneshot::Sender<()>, + ) -> anyhow::Result> { + let bootrom = &toml_config.bootrom; + info!(log, "initializing new VM"; + "spec" => #?instance_spec, + "properties" => #?properties, + "use_reservoir" => use_reservoir, + "bootrom" => %bootrom.display()); + + let vmm_log = log.new(slog::o!("component" => "vmm")); + + // Set up the 'shell' instance into which the rest of this routine will + // add components. 
+ let VersionedInstanceSpec::V0(v0_spec) = &instance_spec; + let machine = build_instance( + &properties.vm_name(), + v0_spec, + use_reservoir, + vmm_log, + )?; + + // Create the state monitor channel and the worker state struct that + // depends on it. The state struct can then be passed to device + // initialization as an event sink. + let (monitor_tx, monitor_rx) = + tokio::sync::watch::channel(ApiMonitoredState { + gen: 0, + state: ApiInstanceState::Creating, + migration: ApiMigrateStatusResponse { + migration_in: None, + migration_out: None, + }, + }); + + let worker_state = Arc::new(SharedVmState::new(&log)); + + // Create and initialize devices in the new instance. + let mut init = MachineInitializer { + log: log.clone(), + machine: &machine, + devices: DeviceMap::new(), + block_backends: BlockBackendMap::new(), + crucible_backends: CrucibleBackendMap::new(), + spec: v0_spec, + properties: &properties, + toml_config, + producer_registry, + state: MachineInitializerState::default(), + }; + + init.initialize_rom(bootrom.as_path())?; + let chipset = init.initialize_chipset( + &(worker_state.clone() as Arc), + )?; + init.initialize_rtc(&chipset)?; + init.initialize_hpet()?; + + let com1 = Arc::new(init.initialize_uart(&chipset)?); + let ps2ctrl = init.initialize_ps2(&chipset)?; + init.initialize_qemu_debug_port()?; + init.initialize_qemu_pvpanic((&properties).into())?; + init.initialize_network_devices(&chipset)?; + + #[cfg(not(feature = "omicron-build"))] + init.initialize_test_devices(&toml_config.devices)?; + #[cfg(feature = "omicron-build")] + info!( + log, + "`omicron-build` feature enabled, ignoring any test devices" + ); + + #[cfg(feature = "falcon")] + init.initialize_softnpu_ports(&chipset)?; + #[cfg(feature = "falcon")] + init.initialize_9pfs(&chipset)?; + init.initialize_storage_devices(&chipset, nexus_client)?; + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; + init.initialize_cpus()?; + let vcpu_tasks = super::vcpu_tasks::VcpuTasks::new( + &machine, + worker_state.clone(), + log.new(slog::o!("component" => "vcpu_tasks")), + )?; + + let MachineInitializer { + devices, + block_backends, + crucible_backends, + .. + } = init; + + // The instance is fully set up; pass it to the new controller. + let shared_state_for_worker = worker_state.clone(); + let controller = Arc::new_cyclic(|this| Self { + vm_objects: VmObjects { + machine: Some(machine), + properties, + spec: tokio::sync::Mutex::new(instance_spec), + devices, + block_backends, + crucible_backends, + com1, + framebuffer: Some(ramfb), + ps2ctrl, + monitor_rx, + }, + worker_state, + worker_thread: Mutex::new(None), + migration_src_state: Default::default(), + log: log.new(slog::o!("component" => "vm_controller")), + runtime_hdl: runtime_hdl.clone(), + this: this.clone(), + }); + + // Now that the controller exists, launch the state worker that will + // drive state transitions for this instance. When the VM halts, the + // worker will exit and drop its reference to the controller. + let ctrl_for_worker = controller.clone(); + let log_for_worker = + log.new(slog::o!("component" => "vm_state_worker")); + let worker_thread = std::thread::Builder::new() + .name("vm_state_worker".to_string()) + .spawn(move || { + let driver = state_driver::StateDriver::new( + runtime_hdl, + ctrl_for_worker, + shared_state_for_worker, + vcpu_tasks, + log_for_worker, + monitor_tx, + ); + + let monitor_tx = driver.run_state_worker(); + + // Signal back to the server state once the worker has exited. 
+ let _ = stop_ch.send(()); + monitor_tx + }) + .map_err(VmControllerError::StateWorkerCreationFailed)?; + + *controller.worker_thread.lock().unwrap() = Some(worker_thread); + Ok(controller) + } + + pub fn properties(&self) -> &InstanceProperties { + &self.vm_objects.properties + } + + pub fn machine(&self) -> &Machine { + // Unwrap safety: The machine is created when the controller is created + // and removed only when the controller is dropped. + self.vm_objects + .machine + .as_ref() + .expect("VM controller always has a valid machine") + } + + pub(crate) fn migration_src_state( + &self, + ) -> MutexGuard<'_, migrate::source::PersistentState> { + self.migration_src_state.lock().unwrap() + } + + pub async fn instance_spec( + &self, + ) -> tokio::sync::MutexGuard<'_, VersionedInstanceSpec> { + self.vm_objects.spec.lock().await + } + + pub fn com1(&self) -> &Arc> { + &self.vm_objects.com1 + } + + pub fn framebuffer(&self) -> Option<&Arc> { + self.vm_objects.framebuffer.as_ref() + } + + pub fn ps2ctrl(&self) -> &Arc { + &self.vm_objects.ps2ctrl + } + + pub fn crucible_backends( + &self, + ) -> &BTreeMap> { + &self.vm_objects.crucible_backends + } + + pub fn log(&self) -> &Logger { + &self.log + } + + pub fn external_instance_state(&self) -> ApiInstanceState { + self.vm_objects.monitor_rx.borrow().state + } + + pub fn inject_nmi(&self) { + if let Some(machine) = &self.vm_objects.machine { + match machine.inject_nmi() { + Ok(_) => { + info!(self.log, "Sending NMI to instance"); + } + Err(e) => { + error!(self.log, "Could not send NMI to instance: {}", e); + } + }; + } + } + + pub fn state_watcher( + &self, + ) -> &tokio::sync::watch::Receiver { + &self.vm_objects.monitor_rx + } + + /// Asks to queue a request to start a source migration task for this VM. + /// The migration will have the supplied `migration_id` and will obtain its + /// connection to the target by calling `upgrade_fn` to obtain a future that + /// yields the necessary connection. + /// + /// This routine fails if the VM was not marked as a migration source or if + /// it has another pending request that precludes migration. Note that this + /// routine does not fail if the future returned from `upgrade_fn` fails to + /// produce a connection to the destination. + /// + /// On success, clients may query the instance's migration status to + /// determine how the migration has progressed. + pub fn request_migration_from< + T: AsyncRead + AsyncWrite + Unpin + Send + 'static, + >( + &self, + migration_id: Uuid, + conn: WebSocketStream, + protocol: crate::migrate::protocol::Protocol, + ) -> Result<(), VmControllerError> { + let mut inner = self.worker_state.inner.lock().unwrap(); + + // Check that the request can be enqueued before setting up the + // migration task. + if !inner.external_request_queue.migrate_as_source_will_enqueue()? { + return Ok(()); + } + + let migration_request = + self.launch_source_migration_task(migration_id, conn, protocol); + + // Unwrap is safe because the queue state was checked under the lock. + inner.external_request_queue.try_queue(migration_request).unwrap(); + self.worker_state.cv.notify_one(); + Ok(()) + } + + /// Launches a task that will execute a live migration out of this VM. + /// Returns a state change request message to queue to the state driver, + /// which will coordinate with this task to run the migration. 
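+    ///
+    /// A minimal sketch of the expected call pattern, mirroring
+    /// `request_migration_from` above (local names are illustrative):
+    ///
+    /// ```ignore
+    /// let request =
+    ///     self.launch_source_migration_task(migration_id, conn, protocol);
+    /// inner.external_request_queue.try_queue(request).unwrap();
+    /// self.worker_state.cv.notify_one();
+    /// ```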
+ fn launch_source_migration_task< + T: AsyncRead + AsyncWrite + Unpin + Send + 'static, + >( + &self, + migration_id: Uuid, + conn: WebSocketStream, + protocol: crate::migrate::protocol::Protocol, + ) -> ExternalRequest { + let log_for_task = + self.log.new(slog::o!("component" => "migrate_source_task")); + let ctrl_for_task = self.this.upgrade().unwrap(); + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); + let (response_tx, response_rx) = tokio::sync::mpsc::channel(1); + + // The migration process uses async operations when communicating with + // the migration target. Run that work on the async runtime. + info!(self.log, "Launching migration source task"); + let task = self.runtime_hdl.spawn(async move { + info!(log_for_task, "Waiting to be told to start"); + start_rx.await.unwrap(); + + info!(log_for_task, "Starting migration procedure"); + if let Err(e) = crate::migrate::source::migrate( + ctrl_for_task, + command_tx, + response_rx, + conn, + protocol, + ) + .await + { + error!(log_for_task, "Migration task failed: {}", e); + return Err(e); + } + + Ok(()) + }); + + ExternalRequest::MigrateAsSource { + migration_id, + task, + start_tx, + command_rx, + response_tx, + } + } + + /// Asks to queue a request to start a destination migration task for this + /// VM. The migration will have the supplied `migration_id` and will obtain + /// its connection to the source by calling `upgrade_fn` to obtain a future + /// that yields the necessary connection. + /// + /// This routine fails if the VM has already begun to run or if a previous + /// migration in was attempted (regardless of its outcome). Note that this + /// routine does not fail if the future returned from `upgrade_fn` + /// subsequently fails to produce a connection to the destination (though + /// the migration attempt will then fail). + /// + /// On success, clients may query the instance's migration status to + /// determine how the migration has progressed. + pub fn request_migration_into< + T: AsyncRead + AsyncWrite + Unpin + Send + 'static, + >( + &self, + migration_id: Uuid, + conn: WebSocketStream, + local_addr: SocketAddr, + protocol: crate::migrate::protocol::Protocol, + ) -> Result<(), VmControllerError> { + let mut inner = self.worker_state.inner.lock().unwrap(); + if !inner.external_request_queue.migrate_as_target_will_enqueue()? { + return Ok(()); + } + + // Check that the request can be enqueued before setting up the + // migration task. + let migration_request = self.launch_target_migration_task( + migration_id, + conn, + local_addr, + protocol, + ); + + // Unwrap is safe because the queue state was checked under the lock. + inner.external_request_queue.try_queue(migration_request).unwrap(); + self.worker_state.cv.notify_one(); + Ok(()) + } + + /// Launches a task that will execute a live migration into this VM. + /// Returns a state change request message to queue to the state driver, + /// which will coordinate with this task to run the migration. 
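+    ///
+    /// A sketch of how the state driver is expected to drain the returned
+    /// request's command channel (assuming only the `MigrateTargetCommand`
+    /// variants defined above):
+    ///
+    /// ```ignore
+    /// while let Some(cmd) = command_rx.recv().await {
+    ///     match cmd {
+    ///         MigrateTargetCommand::UpdateState(state) => {
+    ///             // Publish the new externally-visible migration state.
+    ///         }
+    ///     }
+    /// }
+    /// ```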
+    fn launch_target_migration_task<
+        T: AsyncRead + AsyncWrite + Unpin + Send + 'static,
+    >(
+        &self,
+        migration_id: Uuid,
+        conn: WebSocketStream<T>,
+        local_addr: SocketAddr,
+        protocol: crate::migrate::protocol::Protocol,
+    ) -> ExternalRequest {
+        let log_for_task =
+            self.log.new(slog::o!("component" => "migrate_target_task"));
+        let ctrl_for_task = self.this.upgrade().unwrap();
+        let (start_tx, start_rx) = tokio::sync::oneshot::channel();
+        let (command_tx, command_rx) = tokio::sync::mpsc::channel(1);
+
+        // The migration process uses async operations when communicating with
+        // the migration source. Run that work on the async runtime.
+        info!(self.log, "Launching migration target task");
+        let task = self.runtime_hdl.spawn(async move {
+            info!(log_for_task, "Waiting to be told to start");
+            start_rx.await.unwrap();
+
+            info!(log_for_task, "Starting migration procedure");
+            if let Err(e) = crate::migrate::destination::migrate(
+                ctrl_for_task,
+                command_tx,
+                conn,
+                local_addr,
+                protocol,
+            )
+            .await
+            {
+                error!(log_for_task, "Migration task failed: {}", e);
+                return Err(e);
+            }
+
+            Ok(())
+        });
+
+        ExternalRequest::MigrateAsTarget {
+            migration_id,
+            task,
+            start_tx,
+            command_rx,
+        }
+    }
+
+    /// Handles a request to change the wrapped instance's state.
+    pub fn put_state(
+        &self,
+        requested: ApiInstanceStateRequested,
+    ) -> Result<(), VmControllerError> {
+        info!(self.log(), "Requested state {:?} via API", requested);
+
+        self.worker_state
+            .queue_external_request(match requested {
+                ApiInstanceStateRequested::Run => ExternalRequest::Start,
+                ApiInstanceStateRequested::Stop => ExternalRequest::Stop,
+                ApiInstanceStateRequested::Reboot => ExternalRequest::Reboot,
+            })
+            .map_err(Into::into)
+    }
+
+    pub fn migrate_status(&self) -> ApiMigrateStatusResponse {
+        let mut published =
+            self.vm_objects.monitor_rx.borrow().migration.clone();
+
+        // There's a window between the point where a request to migrate returns
+        // and the point where the state worker actually picks up the migration
+        // and publishes its state. To ensure that migrations are visible as
+        // soon as they're queued, pick up the queued migration (if there is
+        // one) and insert it into the output in the appropriate position. The
+        // state driver will consume the pending migration before actually
+        // executing it.
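+        //
+        // Illustrative timeline (hypothetical): request_migration_into() has
+        // queued a migration but the state driver is still idle. A caller of
+        // migrate_status() should nonetheless already observe
+        // migration_in = Some(Sync) for the pending ID instead of an empty
+        // status.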
+        let inner = self.worker_state.inner.lock().unwrap();
+        if let Some((id, role)) = inner.pending_migration_id {
+            match role {
+                MigrateRole::Destination => {
+                    published.migration_in = Some(ApiMigrationStatus {
+                        id,
+                        state: ApiMigrationState::Sync,
+                    });
+                }
+                MigrateRole::Source => {
+                    published.migration_out = Some(ApiMigrationStatus {
+                        id,
+                        state: ApiMigrationState::Sync,
+                    });
+                }
+            }
+        }
+
+        published
+    }
+
+    pub(crate) fn for_each_device(
+        &self,
+        mut func: impl FnMut(&str, &Arc<dyn propolis::common::Lifecycle>),
+    ) {
+        for (name, dev) in self.vm_objects.devices.iter() {
+            func(name, dev);
+        }
+    }
+
+    pub(crate) fn for_each_device_fallible<F, E>(
+        &self,
+        mut func: F,
+    ) -> std::result::Result<(), E>
+    where
+        F: FnMut(
+            &str,
+            &Arc<dyn propolis::common::Lifecycle>,
+        ) -> std::result::Result<(), E>,
+    {
+        for (name, dev) in self.vm_objects.devices.iter() {
+            func(name, dev)?;
+        }
+        Ok(())
+    }
+
+    pub(crate) fn device_by_name(
+        &self,
+        name: &String,
+    ) -> Option<Arc<dyn propolis::common::Lifecycle>> {
+        self.vm_objects.devices.get(name).cloned()
+    }
+}
+
+impl Drop for VmController {
+    fn drop(&mut self) {
+        info!(self.log, "Dropping VM controller");
+        let machine = self
+            .vm_objects
+            .machine
+            .take()
+            .expect("VM controller should have an instance at drop");
+
+        // Destroy the underlying kernel VMM resource
+        let hdl = machine.destroy();
+        let _ = hdl.destroy();
+
+        // Detach block backends so they can do any final clean-up
+        debug!(self.log, "Detaching block backends");
+        for backend in self.vm_objects.block_backends.values() {
+            let _ = backend.attachment().detach();
+        }
+
+        // A fully-initialized controller is kept alive in part by its worker
+        // thread, which owns the sender side of the controller's state-change
+        // notification channel. Since the controller is being dropped, the
+        // worker is gone, so reclaim the sender from it and use it to publish
+        // that the controller is being destroyed.
+        if let Some(thread) = self.worker_thread.lock().unwrap().take() {
+            let api_state = thread.join().unwrap();
+            let old_state = api_state.borrow().clone();
+
+            // Preserve the instance's state if it failed so that clients can
+            // distinguish gracefully-stopped instances from failed instances.
+            if matches!(old_state.state, ApiInstanceState::Failed) {
+                return;
+            }
+
+            let gen = old_state.gen + 1;
+            let _ = api_state.send(ApiMonitoredState {
+                gen,
+                state: ApiInstanceState::Destroyed,
+                ..old_state
+            });
+        }
+    }
+}
+
+/// An event that a VM's state driver must process.
+#[derive(Debug)]
+enum StateDriverEvent {
+    /// An event that was raised from within the guest.
+    Guest(GuestEvent),
+
+    /// An event that was raised by an external entity (e.g. an API call to the
+    /// server).
+    External(ExternalRequest),
+}
+
+/// Commands issued by the state driver back to its VM controller. These are
+/// abstracted into a trait to allow them to be mocked out for testing without
+/// having to supply mock implementations of the rest of the VM controller's
+/// functionality.
+#[cfg_attr(test, mockall::automock)]
+trait StateDriverVmController {
+    /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated
+    /// devices and vCPUs are brought to a consistent state.
+    ///
+    /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl)
+    /// will fail. A corresponding `resume_vm()` call must be made prior to
+    /// allowing vCPU tasks to run.
+    fn pause_vm(&self);
+
+    /// Resume a previously-paused VM at the kernel VMM level. This will resume
+    /// any timers driving in-kernel-emulated devices, and allow the vCPU to run
+    /// again.
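+    ///
+    /// A sketch of the required pairing, following the pause/resume order the
+    /// state driver uses (receiver names are illustrative):
+    ///
+    /// ```ignore
+    /// vcpu_tasks.pause_all();      // take vCPUs out of VM_RUN first
+    /// controller.pause_devices();  // then quiesce emulated devices
+    /// controller.pause_vm();       // finally freeze the kernel VMM
+    /// /* ...save or reset state... */
+    /// controller.resume_vm();      // unfreeze the kernel VMM first
+    /// controller.resume_devices();
+    /// vcpu_tasks.resume_all();     // let vCPUs re-enter VM_RUN last
+    /// ```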
+ fn resume_vm(&self); + + /// Sends a reset request to each device in the instance, then sends a + /// reset command to the instance's bhyve VM. + fn reset_devices_and_machine(&self); + + /// Sends each device (and backend) a start request. + fn start_devices(&self) -> anyhow::Result<()>; + + /// Sends each device a pause request, then waits for all these requests to + /// complete. + fn pause_devices(&self); + + /// Sends each device a resume request. + fn resume_devices(&self); + + /// Sends each device (and backend) a halt request. + fn halt_devices(&self); + + /// Resets the state of each vCPU in the instance to its on-reboot state. + fn reset_vcpu_state(&self); +} + +impl StateDriverVmController for VmController { + fn pause_vm(&self) { + info!(self.log, "Pausing kernel VMM resources"); + self.machine().hdl.pause().expect("VM_PAUSE should succeed") + } + + fn resume_vm(&self) { + info!(self.log, "Resuming kernel VMM resources"); + self.machine().hdl.resume().expect("VM_RESUME should succeed") + } + + fn reset_devices_and_machine(&self) { + let _rtguard = self.runtime_hdl.enter(); + self.for_each_device(|name, dev| { + info!(self.log, "Sending reset request to {}", name); + dev.reset(); + }); + + self.machine().reinitialize().unwrap(); + } + + fn start_devices(&self) -> anyhow::Result<()> { + let _rtguard = self.runtime_hdl.enter(); + self.for_each_device_fallible(|name, dev| { + info!(self.log, "Sending startup complete to {}", name); + let res = dev.start(); + if let Err(e) = &res { + error!(self.log, "Startup failed for {}: {:?}", name, e); + } + res + })?; + for (name, backend) in self.vm_objects.block_backends.iter() { + debug!(self.log, "Starting block backend {}", name); + let res = backend.start(); + if let Err(e) = &res { + error!(self.log, "Startup failed for {}: {:?}", name, e); + return res; + } + } + Ok(()) + } + + fn pause_devices(&self) { + let _rtguard = self.runtime_hdl.enter(); + self.for_each_device(|name, dev| { + info!(self.log, "Sending pause request to {}", name); + dev.pause(); + }); + + // Create a Future that returns the name of the device that has finished + // pausing: this allows keeping track of which devices have and haven't + // completed pausing yet. 
+ struct NamedFuture { + name: String, + future: BoxFuture<'static, ()>, + } + + impl std::future::Future for NamedFuture { + type Output = String; + + fn poll( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll { + let mut_self = self.get_mut(); + match Pin::new(&mut mut_self.future).poll(cx) { + Poll::Pending => Poll::Pending, + Poll::Ready(()) => Poll::Ready(mut_self.name.clone()), + } + } + } + + info!(self.log, "Waiting for devices to pause"); + self.runtime_hdl.block_on(async { + let mut stream: FuturesUnordered<_> = self + .vm_objects + .devices + .iter() + .map(|(name, dev)| { + info!(self.log, "Got paused future from dev {}", name); + NamedFuture { name: name.to_string(), future: dev.paused() } + }) + .collect(); + + loop { + match stream.next().await { + Some(name) => { + info!(self.log, "dev {} completed pause", name); + } + + None => { + // done + info!(self.log, "all devices paused"); + break; + } + } + } + }); + } + + fn resume_devices(&self) { + let _rtguard = self.runtime_hdl.enter(); + self.for_each_device(|name, dev| { + info!(self.log, "Sending resume request to {}", name); + dev.resume(); + }); + } + + fn halt_devices(&self) { + let _rtguard = self.runtime_hdl.enter(); + self.for_each_device(|name, dev| { + info!(self.log, "Sending halt request to {}", name); + dev.halt(); + }); + for (name, backend) in self.vm_objects.block_backends.iter() { + debug!(self.log, "Stopping and detaching block backend {}", name); + backend.stop(); + if let Err(err) = backend.detach() { + error!( + self.log, + "Error while detaching block backend {name}: {err:?}", + ); + } + } + } + + fn reset_vcpu_state(&self) { + for vcpu in self.machine().vcpus.iter() { + info!(self.log, "Resetting vCPU {}", vcpu.id); + vcpu.activate().unwrap(); + vcpu.reboot_state().unwrap(); + if vcpu.is_bsp() { + info!(self.log, "Resetting BSP vCPU {}", vcpu.id); + vcpu.set_run_state(propolis::bhyve_api::VRS_RUN, None).unwrap(); + vcpu.set_reg( + propolis::bhyve_api::vm_reg_name::VM_REG_GUEST_RIP, + 0xfff0, + ) + .unwrap(); + } + } + } +} diff --git a/bin/propolis-server/src/lib/vm2/request_queue.rs b/bin/propolis-server/src/lib/vm_old/request_queue.rs similarity index 67% rename from bin/propolis-server/src/lib/vm2/request_queue.rs rename to bin/propolis-server/src/lib/vm_old/request_queue.rs index fe52d2135..9d23faa26 100644 --- a/bin/propolis-server/src/lib/vm2/request_queue.rs +++ b/bin/propolis-server/src/lib/vm_old/request_queue.rs @@ -29,12 +29,34 @@ use uuid::Uuid; use crate::migrate::MigrateError; -use super::migrate_commands::{MigrateSourceCommand, MigrateSourceResponse}; +use super::{ + MigrateSourceCommand, MigrateSourceResponse, MigrateTargetCommand, +}; /// An external request made of a VM controller via the server API. Handled by /// the controller's state driver thread. #[derive(Debug)] pub enum ExternalRequest { + /// Initializes the VM through live migration by running a + /// migration-destination task. + MigrateAsTarget { + /// The ID of the live migration to use when initializing. + migration_id: Uuid, + + /// A handle to the task that will execute the migration procedure. + task: tokio::task::JoinHandle>, + + /// The sender side of a one-shot channel that, when signaled, tells the + /// migration task to start its work. + start_tx: tokio::sync::oneshot::Sender<()>, + + /// A channel that receives commands from the migration task. + command_rx: tokio::sync::mpsc::Receiver, + }, + + /// Resets all the VM's devices and CPUs, then starts the VM. 
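+    ///
+    /// Queueing is idempotent per the dispositions below (a sketch; `queue` is
+    /// an `ExternalRequestQueue`): a second `Start` while one is pending
+    /// returns `Ok` but enqueues nothing.
+    ///
+    /// ```ignore
+    /// queue.try_queue(ExternalRequest::Start)?;                  // queued
+    /// assert!(queue.try_queue(ExternalRequest::Start).is_ok());  // ignored
+    /// ```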
+ Start, + /// Asks the state worker to start a migration-source task. MigrateAsSource { /// The ID of the live migration for which this VM will be the source. @@ -125,6 +147,8 @@ enum RequestDisposition { /// The current disposition for each kind of incoming request. #[derive(Copy, Clone, Debug)] struct AllowedRequests { + migrate_as_target: RequestDisposition, + start: RequestDisposition, migrate_as_source: RequestDisposition, reboot: RequestDisposition, stop: RequestDisposition, @@ -143,6 +167,8 @@ impl ExternalRequestQueue { Self { queue: VecDeque::new(), allowed: AllowedRequests { + migrate_as_target: RequestDisposition::Enqueue, + start: RequestDisposition::Enqueue, migrate_as_source: RequestDisposition::Deny( RequestDeniedReason::InstanceNotActive, ), @@ -172,6 +198,10 @@ impl ExternalRequestQueue { request: ExternalRequest, ) -> Result<(), RequestDeniedReason> { let disposition = match request { + ExternalRequest::MigrateAsTarget { .. } => { + self.allowed.migrate_as_target + } + ExternalRequest::Start => self.allowed.start, ExternalRequest::MigrateAsSource { .. } => { self.allowed.migrate_as_source } @@ -207,6 +237,26 @@ impl ExternalRequestQueue { .get_new_dispositions(DispositionChangeReason::StateChange(state)); } + /// Indicates whether the queue would allow a request to migrate into this + /// instance. This can be used to avoid setting up migration tasks for + /// requests that will ultimately be denied. + /// + /// # Return value + /// + /// - `Ok(true)` if the request will be queued. + /// - `Ok(false)` if the request is allowed for idempotency reasons but will + /// not be queued. + /// - `Err` if the request is forbidden. + pub fn migrate_as_target_will_enqueue( + &self, + ) -> Result { + match self.allowed.migrate_as_target { + RequestDisposition::Enqueue => Ok(true), + RequestDisposition::Ignore => Ok(false), + RequestDisposition::Deny(reason) => Err(reason), + } + } + /// Indicates whether the queue would allow a request to migrate out of this /// instance. This can be used to avoid setting up migration tasks for /// requests that will ultimately be denied. @@ -245,21 +295,74 @@ impl ExternalRequestQueue { use RequestDeniedReason as DenyReason; use RequestDisposition as Disposition; match reason { + // Starting the instance, whether via migration or cold boot, + // forecloses on further attempts to migrate in. For idempotency, + // further requests to start are allowed when an instance-starting + // transition is enqueued. + ChangeReason::ApiRequest(ExternalRequest::MigrateAsTarget { + .. + }) + | ChangeReason::ApiRequest(ExternalRequest::Start) => { + let (migrate_as_target_disposition, deny_reason) = match reason + { + // If this is a request to migrate in, make sure future + // requests to migrate in are handled idempotently. + ChangeReason::ApiRequest( + ExternalRequest::MigrateAsTarget { .. }, + ) => ( + Disposition::Ignore, + DenyReason::MigrationTargetInProgress, + ), + ChangeReason::ApiRequest(ExternalRequest::Start) => ( + Disposition::Deny(DenyReason::StartInProgress), + DenyReason::StartInProgress, + ), + _ => unreachable!(), + }; + + AllowedRequests { + migrate_as_target: migrate_as_target_disposition, + start: Disposition::Ignore, + migrate_as_source: Disposition::Deny(deny_reason), + reboot: Disposition::Deny(deny_reason), + stop: self.allowed.stop, + } + } ChangeReason::ApiRequest(ExternalRequest::MigrateAsSource { .. 
- }) => AllowedRequests { - migrate_as_source: Disposition::Deny( - DenyReason::AlreadyMigrationSource, - ), - reboot: Disposition::Deny( - DenyReason::InvalidRequestForMigrationSource, - ), - stop: self.allowed.stop, - }, + }) => { + assert!(matches!(self.allowed.start, Disposition::Ignore)); + + // Requests to migrate into the instance should not be enqueued + // from this point, but whether they're dropped or ignored + // depends on how the instance was originally initialized. + assert!(!matches!( + self.allowed.migrate_as_target, + Disposition::Enqueue + )); + + AllowedRequests { + migrate_as_target: self.allowed.migrate_as_target, + start: self.allowed.start, + migrate_as_source: Disposition::Deny( + DenyReason::AlreadyMigrationSource, + ), + reboot: Disposition::Deny( + DenyReason::InvalidRequestForMigrationSource, + ), + stop: self.allowed.stop, + } + } // Requests to reboot prevent additional reboot requests from being // queued, but do not affect other operations. ChangeReason::ApiRequest(ExternalRequest::Reboot) => { + assert!(matches!(self.allowed.start, Disposition::Ignore)); + assert!(!matches!( + self.allowed.migrate_as_target, + Disposition::Enqueue + )); + AllowedRequests { reboot: Disposition::Ignore, ..self.allowed } } @@ -267,6 +370,10 @@ impl ExternalRequestQueue { // queued. Additional requests to stop are ignored for idempotency. ChangeReason::ApiRequest(ExternalRequest::Stop) => { AllowedRequests { + migrate_as_target: Disposition::Deny( + DenyReason::HaltPending, + ), + start: Disposition::Deny(DenyReason::HaltPending), migrate_as_source: Disposition::Deny( DenyReason::HaltPending, ), @@ -279,6 +386,8 @@ impl ExternalRequestQueue { // to reboot it become valid. ChangeReason::StateChange(InstanceStateChange::StartedRunning) => { AllowedRequests { + migrate_as_target: self.allowed.migrate_as_target, + start: self.allowed.start, migrate_as_source: Disposition::Enqueue, reboot: Disposition::Enqueue, stop: self.allowed.stop, @@ -306,6 +415,10 @@ impl ExternalRequestQueue { // "deny". ChangeReason::StateChange(InstanceStateChange::Stopped) => { AllowedRequests { + migrate_as_target: Disposition::Deny( + DenyReason::InstanceNotActive, + ), + start: Disposition::Deny(DenyReason::InstanceNotActive), migrate_as_source: Disposition::Deny( DenyReason::InstanceNotActive, ), @@ -315,6 +428,10 @@ impl ExternalRequestQueue { } ChangeReason::StateChange(InstanceStateChange::Failed) => { AllowedRequests { + migrate_as_target: Disposition::Deny( + DenyReason::InstanceFailed, + ), + start: Disposition::Deny(DenyReason::InstanceFailed), migrate_as_source: Disposition::Deny( DenyReason::InstanceFailed, ), @@ -336,6 +453,18 @@ mod test { slog::Logger::root(slog::Discard, slog::o!()) } + fn make_migrate_as_target_request() -> ExternalRequest { + let task = tokio::task::spawn(async { Ok(()) }); + let (start_tx, _) = tokio::sync::oneshot::channel(); + let (_, command_rx) = tokio::sync::mpsc::channel(1); + ExternalRequest::MigrateAsTarget { + migration_id: Uuid::new_v4(), + task, + start_tx, + command_rx, + } + } + fn make_migrate_as_source_request() -> ExternalRequest { let task = tokio::task::spawn(async { Ok(()) }); let (start_tx, _) = tokio::sync::oneshot::channel(); @@ -350,10 +479,47 @@ mod test { } } + #[tokio::test] + async fn migrate_as_target_is_idempotent() { + let mut queue = ExternalRequestQueue::new(test_logger()); + + // Requests to migrate as a target should queue normally at first. 
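+        // (Aside, not part of the original change: the probe is a tri-state
+        // read of the queue's disposition table, mirroring the match inside
+        // `migrate_as_target_will_enqueue`:
+        //
+        //     Disposition::Enqueue      => Ok(true)    // would be queued
+        //     Disposition::Ignore       => Ok(false)   // idempotent no-op
+        //     Disposition::Deny(reason) => Err(reason) // forbidden
+        // )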
+ assert!(queue.migrate_as_target_will_enqueue().unwrap()); + + // After queuing such a request, subsequent requests should be allowed + // without enqueuing anything. + assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); + assert!(!queue.migrate_as_target_will_enqueue().unwrap()); + + // Pop the request and tell the queue the instance is running. + assert!(matches!( + queue.pop_front(), + Some(ExternalRequest::MigrateAsTarget { .. }) + )); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + + // Because the instance was started via migration in, future requests + // to migrate in should be allowed. + assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); + assert!(!queue.migrate_as_target_will_enqueue().unwrap()); + } + + #[tokio::test] + async fn migrate_as_target_is_forbidden_after_cold_boot() { + let mut queue = ExternalRequestQueue::new(test_logger()); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + + assert!(queue.migrate_as_target_will_enqueue().is_err()); + assert!(queue.try_queue(make_migrate_as_target_request()).is_err()); + } + #[tokio::test] async fn migrate_as_source_is_not_idempotent() { // Simulate a running instance. let mut queue = ExternalRequestQueue::new(test_logger()); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Requests to migrate out should be allowed. @@ -392,6 +558,8 @@ mod test { #[tokio::test] async fn stop_requests_enqueue_after_vm_failure() { let mut queue = ExternalRequestQueue::new(test_logger()); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); queue.notify_instance_state_change(InstanceStateChange::Failed); assert!(queue.try_queue(ExternalRequest::Stop).is_ok()); @@ -401,6 +569,8 @@ mod test { #[tokio::test] async fn reboot_requests_are_idempotent_except_when_stopping() { let mut queue = ExternalRequestQueue::new(test_logger()); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Once the instance is started, reboot requests should be allowed, but diff --git a/bin/propolis-server/src/lib/vm_old/state_driver.rs b/bin/propolis-server/src/lib/vm_old/state_driver.rs new file mode 100644 index 000000000..4a2832f64 --- /dev/null +++ b/bin/propolis-server/src/lib/vm_old/state_driver.rs @@ -0,0 +1,1384 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
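+//
+// (Illustrative sketch, not part of the original change: at its core, the
+// state driver below is an event loop of the shape
+//
+//     loop {
+//         let event = shared_state.wait_for_next_event();
+//         match driver.handle_event(event) {
+//             HandleEventOutcome::Continue => continue,
+//             HandleEventOutcome::Exit => break,
+//         }
+//     }
+//
+// where handling an event may also publish externally-visible state; see
+// `run_state_worker` for the real loop.)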
+ +use std::sync::Arc; + +use crate::migrate::{MigrateError, MigrateRole}; +use crate::vcpu_tasks::VcpuTaskController; + +use super::{ + request_queue, ExternalRequest, GuestEvent, MigrateSourceCommand, + MigrateSourceResponse, MigrateTargetCommand, MigrateTaskEvent, + SharedVmState, StateDriverEvent, +}; + +use propolis_api_types::{ + InstanceMigrateStatusResponse as ApiMigrateStatusResponse, + InstanceMigrationStatus as ApiMigrationStatus, + InstanceState as ApiInstanceState, + InstanceStateMonitorResponse as ApiMonitoredState, + MigrationState as ApiMigrationState, +}; +use slog::{error, info, Logger}; +use uuid::Uuid; + +#[usdt::provider(provider = "propolis")] +mod probes { + fn state_driver_pause() {} + fn state_driver_resume() {} +} + +/// Tells the state driver whether or not to continue running after responding +/// to an event. +#[derive(Debug, PartialEq, Eq)] +enum HandleEventOutcome { + Continue, + Exit, +} + +/// A reason for starting a VM. +#[derive(Debug, PartialEq, Eq)] +enum VmStartReason { + MigratedIn, + ExplicitRequest, +} + +/// A wrapper around all the data needed to describe the status of a live +/// migration. +struct PublishedMigrationState { + state: ApiMigrationState, + id: Uuid, + role: MigrateRole, +} + +impl PublishedMigrationState { + /// Updates an `old` migration status response to contain information about + /// the migration described by `self`. + fn apply_to( + self, + old: ApiMigrateStatusResponse, + ) -> ApiMigrateStatusResponse { + let new = ApiMigrationStatus { id: self.id, state: self.state }; + match self.role { + MigrateRole::Destination => ApiMigrateStatusResponse { + migration_in: Some(new), + migration_out: old.migration_out, + }, + MigrateRole::Source => ApiMigrateStatusResponse { + migration_in: old.migration_in, + migration_out: Some(new), + }, + } + } +} + +enum PublishedState { + Instance(ApiInstanceState), + Migration(PublishedMigrationState), + Complete(ApiInstanceState, PublishedMigrationState), +} + +pub(super) struct StateDriver< + V: super::StateDriverVmController, + C: VcpuTaskController, +> { + /// A handle to the host server's tokio runtime, useful for spawning tasks + /// that need to interact with async code (e.g. spinning up migration + /// tasks). + runtime_hdl: tokio::runtime::Handle, + + /// A reference to the command sink to which this driver should send its + /// requests to send messages to devices or update other VM controller + /// state. + controller: Arc, + + /// A reference to the state this driver shares with its VM controller. + shared_state: Arc, + + /// The controller for this instance's vCPU tasks. + vcpu_tasks: C, + + /// The state worker's logger. + log: Logger, + + /// The generation number to use when publishing externally-visible state + /// updates. + state_gen: u64, + + /// Whether the worker's VM's devices are paused. + paused: bool, + + /// The sender side of the monitor that reflects the instance's current + /// externally-visible state (including migration state). + api_state_tx: tokio::sync::watch::Sender, +} + +impl StateDriver +where + V: super::StateDriverVmController, + C: VcpuTaskController, +{ + /// Constructs a new state driver context. 
+ pub(super) fn new( + runtime_hdl: tokio::runtime::Handle, + controller: Arc, + shared_controller_state: Arc, + vcpu_tasks: C, + log: Logger, + api_state_tx: tokio::sync::watch::Sender, + ) -> Self { + Self { + runtime_hdl, + controller, + shared_state: shared_controller_state, + vcpu_tasks, + log, + state_gen: 0, + paused: false, + api_state_tx, + } + } + + /// Yields the current externally-visible instance state. + fn get_instance_state(&self) -> ApiInstanceState { + self.api_state_tx.borrow().state + } + + /// Retrieves the most recently published migration state from the external + /// migration state channel. + /// + /// This function does not return the borrowed monitor, so the state may + /// change again as soon as this function returns. + fn get_migration_status(&self) -> ApiMigrateStatusResponse { + self.api_state_tx.borrow().migration.clone() + } + + /// Sets the published instance and/or migration state and increases the + /// state generation number. + fn set_published_state(&mut self, state: PublishedState) { + let (instance_state, migration_state) = match state { + PublishedState::Instance(i) => (Some(i), None), + PublishedState::Migration(m) => (None, Some(m)), + PublishedState::Complete(i, m) => (Some(i), Some(m)), + }; + + let ApiMonitoredState { + state: old_state, + migration: old_migration, + .. + } = self.api_state_tx.borrow().clone(); + + let state = instance_state.unwrap_or(old_state); + let migration = if let Some(migration_state) = migration_state { + migration_state.apply_to(old_migration) + } else { + old_migration + }; + + info!(self.log, "publishing new instance state"; + "gen" => self.state_gen, + "state" => ?state, + "migration" => ?migration); + + self.state_gen += 1; + let _ = self.api_state_tx.send(ApiMonitoredState { + gen: self.state_gen, + state, + migration, + }); + } + + /// Publishes the supplied externally-visible instance state to the external + /// instance state channel. + fn set_instance_state(&mut self, state: ApiInstanceState) { + self.set_published_state(PublishedState::Instance(state)); + } + + /// Publishes the supplied externally-visible migration status to the + /// instance state channel. + fn set_migration_state( + &mut self, + role: MigrateRole, + migration_id: Uuid, + state: ApiMigrationState, + ) { + self.set_published_state(PublishedState::Migration( + PublishedMigrationState { state, id: migration_id, role }, + )); + } + + /// Publishes that an instance is migrating and sets its migration state in + /// a single transaction, then consumes the pending migration information + /// from the shared VM state block. + fn publish_migration_start( + &mut self, + migration_id: Uuid, + role: MigrateRole, + ) { + // Order matters here. The 'pending migration' field exists so that + // migration status is available through the external API as soon as an + // external request to migrate returns, even if the migration hasn't yet + // been picked up off the queue. To ensure the migration is continuously + // visible, publish the "actual" migration before consuming the pending + // one. + self.set_published_state(PublishedState::Complete( + ApiInstanceState::Migrating, + PublishedMigrationState { + state: ApiMigrationState::Sync, + id: migration_id, + role, + }, + )); + + self.shared_state.clear_pending_migration(); + } + + /// Manages an instance's lifecycle once it has moved to the Running state. 
+ pub(super) fn run_state_worker( + mut self, + ) -> tokio::sync::watch::Sender { + info!(self.log, "State worker launched"); + + loop { + let event = self.shared_state.wait_for_next_event(); + info!(self.log, "State worker handling event"; "event" => ?event); + + let outcome = self.handle_event(event); + info!(self.log, "State worker handled event"; "outcome" => ?outcome); + if matches!(outcome, HandleEventOutcome::Exit) { + break; + } + } + + info!(self.log, "State worker exiting"); + + self.api_state_tx + } + + fn handle_event(&mut self, event: StateDriverEvent) -> HandleEventOutcome { + let next_action = match event { + StateDriverEvent::Guest(guest_event) => { + return self.handle_guest_event(guest_event); + } + StateDriverEvent::External(external_event) => external_event, + }; + + match next_action { + ExternalRequest::MigrateAsTarget { + migration_id, + task, + start_tx, + command_rx, + } => { + self.migrate_as_target( + migration_id, + task, + start_tx, + command_rx, + ); + HandleEventOutcome::Continue + } + ExternalRequest::Start => { + self.start_vm(VmStartReason::ExplicitRequest); + HandleEventOutcome::Continue + } + ExternalRequest::Reboot => { + self.do_reboot(); + HandleEventOutcome::Continue + } + ExternalRequest::MigrateAsSource { + migration_id, + task, + start_tx, + command_rx, + response_tx, + } => { + self.migrate_as_source( + migration_id, + task, + start_tx, + command_rx, + response_tx, + ); + HandleEventOutcome::Continue + } + ExternalRequest::Stop => { + self.do_halt(); + HandleEventOutcome::Exit + } + } + } + + fn handle_guest_event(&mut self, event: GuestEvent) -> HandleEventOutcome { + match event { + GuestEvent::VcpuSuspendHalt(_when) => { + info!(self.log, "Halting due to VM suspend event",); + self.do_halt(); + HandleEventOutcome::Exit + } + GuestEvent::VcpuSuspendReset(_when) => { + info!(self.log, "Resetting due to VM suspend event"); + self.do_reboot(); + HandleEventOutcome::Continue + } + GuestEvent::VcpuSuspendTripleFault(vcpu_id, _when) => { + info!( + self.log, + "Resetting due to triple fault on vCPU {}", vcpu_id + ); + self.do_reboot(); + HandleEventOutcome::Continue + } + GuestEvent::ChipsetHalt => { + info!(self.log, "Halting due to chipset-driven halt"); + self.do_halt(); + HandleEventOutcome::Exit + } + GuestEvent::ChipsetReset => { + info!(self.log, "Resetting due to chipset-driven reset"); + self.do_reboot(); + HandleEventOutcome::Continue + } + } + } + + fn start_vm(&mut self, start_reason: VmStartReason) { + info!(self.log, "Starting instance"; "reason" => ?start_reason); + + // Only move to the Starting state if this VM is starting by explicit + // request (as opposed to the implicit start that happens after a + // migration in). In this case, no one has initialized vCPU state yet, + // so explicitly initialize it here. + // + // In the migration-in case, remain in the Migrating state until the + // VM is actually running. Note that this is contractual behavior--sled + // agent relies on this to represent that a migrating instance is + // continuously running through a successful migration. + match start_reason { + VmStartReason::ExplicitRequest => { + self.set_instance_state(ApiInstanceState::Starting); + self.reset_vcpus(); + } + VmStartReason::MigratedIn => { + assert_eq!( + self.get_instance_state(), + ApiInstanceState::Migrating + ); + // Signal the kernel VMM to resume devices which are handled by + // the in-kernel emulation. They were kept paused for + // consistency while migration state was loaded. 
+ self.controller.resume_vm(); + } + } + + match self.controller.start_devices() { + Ok(()) => { + self.vcpu_tasks.resume_all(); + self.publish_steady_state(ApiInstanceState::Running); + } + Err(e) => { + error!(&self.log, "Failed to start devices: {:?}", e); + self.publish_steady_state(ApiInstanceState::Failed); + } + } + } + + fn do_reboot(&mut self) { + info!(self.log, "Resetting instance"); + + self.set_instance_state(ApiInstanceState::Rebooting); + + // Reboot is implemented as a pause -> reset -> resume transition. + // + // First, pause the vCPUs and all devices so no partially-completed + // work is present. + self.vcpu_tasks.pause_all(); + self.controller.pause_devices(); + + // Reset all the entities and the VM's bhyve state, then reset the + // vCPUs. The vCPU reset must come after the bhyve reset. + self.controller.reset_devices_and_machine(); + self.reset_vcpus(); + + // Resume devices so they're ready to do more work, then resume vCPUs. + self.controller.resume_devices(); + self.vcpu_tasks.resume_all(); + + // Notify the request queue that this reboot request was processed. + // This does not use the `publish_steady_state` path because the queue + // treats an instance's initial transition to "Running" as a one-time + // event that's different from a return to the running state from a + // transient intermediate state. + self.notify_request_queue(request_queue::InstanceStateChange::Rebooted); + self.set_instance_state(ApiInstanceState::Running); + } + + fn do_halt(&mut self) { + info!(self.log, "Stopping instance"); + self.set_instance_state(ApiInstanceState::Stopping); + + // Entities expect to be paused before being halted. Note that the VM + // may be paused already if it is being torn down after a successful + // migration out. + if !self.paused { + self.pause(); + } + + self.vcpu_tasks.exit_all(); + self.controller.halt_devices(); + self.publish_steady_state(ApiInstanceState::Stopped); + } + + fn migrate_as_target( + &mut self, + migration_id: Uuid, + mut task: tokio::task::JoinHandle>, + start_tx: tokio::sync::oneshot::Sender<()>, + mut command_rx: tokio::sync::mpsc::Receiver, + ) { + self.publish_migration_start(migration_id, MigrateRole::Destination); + + // Ensure the VM's vCPUs are activated properly so that they can enter + // the guest after migration. Do this before allowing the migration task + // to start so that reset doesn't overwrite any state written by + // migration. + self.reset_vcpus(); + + // Place the VM in a paused state so we can load emulated device state + // in a consistent manner + self.controller.pause_vm(); + + start_tx.send(()).unwrap(); + loop { + let action = self.runtime_hdl.block_on(async { + Self::next_migrate_task_event( + &mut task, + &mut command_rx, + &self.log, + ) + .await + }); + + match action { + MigrateTaskEvent::TaskExited(res) => { + if res.is_ok() { + // Clients that observe that migration has finished + // need to observe that the instance is running before + // they are guaranteed to be able to do anything else + // that requires a running instance. + assert!(matches!( + self.get_migration_status() + .migration_in + .unwrap() + .state, + ApiMigrationState::Finish + )); + + self.start_vm(VmStartReason::MigratedIn); + } else { + assert!(matches!( + self.get_migration_status() + .migration_in + .unwrap() + .state, + ApiMigrationState::Error + )); + + // Resume the kernel VM so that if this state driver is + // asked to halt, the pause resulting therefrom won't + // observe that the VM is already paused. 
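+                    // (Aside, not part of the original change: the kernel-VMM
+                    // pause is not reference counted. `migrate_as_target`
+                    // calls `pause_vm()` directly without setting
+                    // `self.paused`, so a later `do_halt` would try to pause
+                    // again; resuming here keeps the VMM's actual pause state
+                    // consistent with that flag.)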
+ self.controller.resume_vm(); + self.publish_steady_state(ApiInstanceState::Failed); + }; + + break; + } + MigrateTaskEvent::Command( + MigrateTargetCommand::UpdateState(state), + ) => { + self.set_migration_state( + MigrateRole::Destination, + migration_id, + state, + ); + } + } + } + } + + fn migrate_as_source( + &mut self, + migration_id: Uuid, + mut task: tokio::task::JoinHandle>, + start_tx: tokio::sync::oneshot::Sender<()>, + mut command_rx: tokio::sync::mpsc::Receiver, + response_tx: tokio::sync::mpsc::Sender, + ) { + self.publish_migration_start(migration_id, MigrateRole::Source); + start_tx.send(()).unwrap(); + + // Wait either for the migration task to exit or for it to ask the + // worker to pause or resume the instance's devices. + loop { + let action = self.runtime_hdl.block_on(async { + Self::next_migrate_task_event( + &mut task, + &mut command_rx, + &self.log, + ) + .await + }); + + match action { + // If the task exited, bubble its result back up to the main + // state worker loop to decide on the instance's next state. + // + // If migration failed while devices were paused, this instance + // is allowed to resume, so resume its components here. + MigrateTaskEvent::TaskExited(res) => { + if res.is_ok() { + assert!(matches!( + self.get_migration_status() + .migration_out + .unwrap() + .state, + ApiMigrationState::Finish + )); + + self.shared_state + .queue_external_request(ExternalRequest::Stop) + .expect("can always queue a request to stop"); + } else { + assert!(matches!( + self.get_migration_status() + .migration_out + .unwrap() + .state, + ApiMigrationState::Error + )); + + if self.paused { + self.resume(); + self.publish_steady_state( + ApiInstanceState::Running, + ); + } + } + + break; + } + MigrateTaskEvent::Command(cmd) => match cmd { + MigrateSourceCommand::UpdateState(state) => { + self.set_migration_state( + MigrateRole::Source, + migration_id, + state, + ); + } + MigrateSourceCommand::Pause => { + self.pause(); + response_tx + .blocking_send(MigrateSourceResponse::Pause(Ok(()))) + .unwrap(); + } + }, + } + } + } + + async fn next_migrate_task_event( + task: &mut tokio::task::JoinHandle>, + command_rx: &mut tokio::sync::mpsc::Receiver, + log: &Logger, + ) -> MigrateTaskEvent { + if let Some(cmd) = command_rx.recv().await { + return MigrateTaskEvent::Command(cmd); + } + + // The sender side of the command channel is dropped, which means the + // migration task is exiting. Wait for it to finish and snag its result. 
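+        // (Aside, not part of the original change: this relies on the
+        // documented tokio behavior that `Receiver::recv` resolves to `None`
+        // once every corresponding `Sender` has been dropped, e.g.:
+        //
+        //     let (tx, mut rx) = tokio::sync::mpsc::channel::<u32>(1);
+        //     drop(tx);
+        //     assert!(rx.recv().await.is_none());
+        //
+        // so reaching this point means the task holds no more command
+        // senders.)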
+ match task.await { + Ok(res) => { + info!(log, "Migration source task exited: {:?}", res); + MigrateTaskEvent::TaskExited(res) + } + Err(join_err) => { + if join_err.is_cancelled() { + panic!("Migration task canceled"); + } else { + panic!( + "Migration task panicked: {:?}", + join_err.into_panic() + ); + } + } + } + } + + fn pause(&mut self) { + assert!(!self.paused); + probes::state_driver_pause!(|| ()); + self.vcpu_tasks.pause_all(); + self.controller.pause_devices(); + self.controller.pause_vm(); + self.paused = true; + } + + fn resume(&mut self) { + assert!(self.paused); + probes::state_driver_resume!(|| ()); + self.controller.resume_vm(); + self.controller.resume_devices(); + self.vcpu_tasks.resume_all(); + self.paused = false; + } + + fn reset_vcpus(&self) { + self.vcpu_tasks.new_generation(); + self.controller.reset_vcpu_state(); + } + + fn publish_steady_state(&mut self, state: ApiInstanceState) { + let change = match state { + ApiInstanceState::Running => { + request_queue::InstanceStateChange::StartedRunning + } + ApiInstanceState::Stopped => { + request_queue::InstanceStateChange::Stopped + } + ApiInstanceState::Failed => { + request_queue::InstanceStateChange::Failed + } + _ => panic!( + "Called publish_steady_state on non-terminal state {:?}", + state + ), + }; + + self.notify_request_queue(change); + self.set_instance_state(state); + } + + fn notify_request_queue( + &self, + queue_change: request_queue::InstanceStateChange, + ) { + self.shared_state + .inner + .lock() + .unwrap() + .external_request_queue + .notify_instance_state_change(queue_change); + } +} + +#[cfg(test)] +mod test { + use anyhow::bail; + use mockall::Sequence; + + use super::*; + use crate::vcpu_tasks::MockVcpuTaskController; + use crate::vm::MockStateDriverVmController; + + struct TestStateDriver { + driver: + StateDriver, + state_rx: tokio::sync::watch::Receiver, + } + + impl TestStateDriver { + fn api_state(&self) -> ApiInstanceState { + self.state_rx.borrow().state + } + } + + struct TestObjects { + vm_ctrl: MockStateDriverVmController, + vcpu_ctrl: MockVcpuTaskController, + shared_state: Arc, + } + + fn make_state_driver(objects: TestObjects) -> TestStateDriver { + let logger = slog::Logger::root(slog::Discard, slog::o!()); + let (state_tx, state_rx) = + tokio::sync::watch::channel(ApiMonitoredState { + gen: 0, + state: ApiInstanceState::Creating, + migration: ApiMigrateStatusResponse { + migration_in: None, + migration_out: None, + }, + }); + + TestStateDriver { + driver: StateDriver::new( + tokio::runtime::Handle::current(), + Arc::new(objects.vm_ctrl), + objects.shared_state.clone(), + objects.vcpu_ctrl, + logger, + state_tx, + ), + state_rx, + } + } + + /// Generates default mocks for the VM controller and vCPU task controller + /// that accept unlimited requests to read state. + fn make_default_mocks() -> TestObjects { + let logger = slog::Logger::root(slog::Discard, slog::o!()); + let vm_ctrl = MockStateDriverVmController::new(); + let vcpu_ctrl = MockVcpuTaskController::new(); + TestObjects { + vm_ctrl, + vcpu_ctrl, + shared_state: Arc::new(SharedVmState::new(&logger)), + } + } + + fn add_reboot_expectations( + vm_ctrl: &mut MockStateDriverVmController, + vcpu_ctrl: &mut MockVcpuTaskController, + ) { + // The reboot process requires careful ordering of steps to make sure + // the VM's vCPUs are put into the correct state when the machine starts + // up. + let mut seq = Sequence::new(); + + // First, reboot has to pause everything. 
It doesn't actually matter + // whether vCPUs or devices pause first, but there's no way to specify + // that these events must be sequenced before other expectations but + // have no ordering with respect to each other. + vcpu_ctrl + .expect_pause_all() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_pause_devices() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + + // The devices and--importantly--the bhyve VM itself must be reset + // before resetting any vCPU state (so that bhyve will accept the ioctls + // sent to the vCPUs during the reset process). + vm_ctrl + .expect_reset_devices_and_machine() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vcpu_ctrl + .expect_new_generation() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_reset_vcpu_state() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + + // Entities and vCPUs can technically be resumed in either order, but + // resuming devices first allows them to be ready when the vCPUs start + // creating work for them to do. + vm_ctrl + .expect_resume_devices() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vcpu_ctrl + .expect_resume_all() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + } + + #[tokio::test] + async fn guest_triple_fault_reboots() { + let mut test_objects = make_default_mocks(); + + add_reboot_expectations( + &mut test_objects.vm_ctrl, + &mut test_objects.vcpu_ctrl, + ); + let mut driver = make_state_driver(test_objects); + driver.driver.handle_event(StateDriverEvent::Guest( + GuestEvent::VcpuSuspendTripleFault( + 0, + std::time::Duration::default(), + ), + )); + + assert!(matches!(driver.api_state(), ApiInstanceState::Running)); + } + + #[tokio::test] + async fn guest_chipset_reset_reboots() { + let mut test_objects = make_default_mocks(); + + add_reboot_expectations( + &mut test_objects.vm_ctrl, + &mut test_objects.vcpu_ctrl, + ); + let mut driver = make_state_driver(test_objects); + driver + .driver + .handle_event(StateDriverEvent::Guest(GuestEvent::ChipsetReset)); + + assert!(matches!(driver.api_state(), ApiInstanceState::Running)); + } + + #[tokio::test] + async fn start_from_cold_boot() { + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + let mut seq = Sequence::new(); + vcpu_ctrl + .expect_new_generation() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_reset_vcpu_state() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_start_devices() + .times(1) + .in_sequence(&mut seq) + .returning(|| Ok(())); + vcpu_ctrl + .expect_resume_all() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + + let mut driver = make_state_driver(test_objects); + driver + .driver + .handle_event(StateDriverEvent::External(ExternalRequest::Start)); + + assert!(matches!(driver.api_state(), ApiInstanceState::Running)); + } + + #[tokio::test] + async fn device_start_failure_causes_instance_failure() { + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + let mut seq = Sequence::new(); + vcpu_ctrl + .expect_new_generation() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_reset_vcpu_state() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_start_devices() + .times(1) + .in_sequence(&mut seq) + .returning(|| bail!("injected failure into 
start_devices!")); + + let mut driver = make_state_driver(test_objects); + + // Failure allows the instance to be preserved for debugging. + assert_eq!( + driver.driver.handle_event(StateDriverEvent::External( + ExternalRequest::Start + )), + HandleEventOutcome::Continue + ); + + assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); + } + + #[tokio::test] + async fn devices_pause_before_halting() { + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + let mut seq = Sequence::new(); + vcpu_ctrl + .expect_pause_all() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_pause_devices() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_pause_vm() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vcpu_ctrl + .expect_exit_all() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + vm_ctrl + .expect_halt_devices() + .times(1) + .in_sequence(&mut seq) + .returning(|| ()); + + let mut driver = make_state_driver(test_objects); + driver + .driver + .handle_event(StateDriverEvent::External(ExternalRequest::Stop)); + + assert!(matches!(driver.api_state(), ApiInstanceState::Stopped)); + } + + #[tokio::test] + async fn devices_pause_once_when_halting_after_migration_out() { + let migration_id = Uuid::new_v4(); + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); + let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); + let (response_tx, mut response_rx) = tokio::sync::mpsc::channel(1); + let migrate_task = tokio::spawn(async move { + start_rx.await.unwrap(); + task_exit_rx.await.unwrap() + }); + + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + + // This test will simulate a migration out (with a pause command), then + // order the state driver to halt. This should produce exactly one set + // of pause commands and one set of halt commands with no resume + // commands. + vm_ctrl.expect_pause_devices().times(1).returning(|| ()); + vcpu_ctrl.expect_pause_all().times(1).returning(|| ()); + vcpu_ctrl.expect_exit_all().times(1).returning(|| ()); + vm_ctrl.expect_halt_devices().times(1).returning(|| ()); + vm_ctrl.expect_resume_devices().never(); + vcpu_ctrl.expect_resume_all().never(); + vm_ctrl.expect_pause_vm().times(1).returning(|| ()); + vm_ctrl.expect_resume_vm().never(); + + let mut driver = make_state_driver(test_objects); + + // The state driver expects to run on an OS thread outside the async + // runtime so that it can call `block_on` to wait for messages from the + // migration task. + let hdl = std::thread::spawn(move || { + driver.driver.handle_event(StateDriverEvent::External( + ExternalRequest::MigrateAsSource { + migration_id, + task: migrate_task, + start_tx, + command_rx, + response_tx, + }, + )); + + // Return the driver (which has the mocks attached) when the thread + // is joined so the test can continue using it. + driver + }); + + // Simulate a pause and the successful completion of migration. 
+ command_tx.send(MigrateSourceCommand::Pause).await.unwrap(); + let resp = response_rx.recv().await.unwrap(); + assert!(matches!(resp, MigrateSourceResponse::Pause(Ok(())))); + command_tx + .send(MigrateSourceCommand::UpdateState(ApiMigrationState::Finish)) + .await + .unwrap(); + + drop(command_tx); + task_exit_tx.send(Ok(())).unwrap(); + + // Wait for the call to `handle_event` to return before tearing anything + // else down. + driver = tokio::task::spawn_blocking(move || hdl.join().unwrap()) + .await + .unwrap(); + + // The migration should appear to have finished. The state driver will + // queue a "stop" command to itself in this case, but because the driver + // is not directly processing the queue here, the test has to issue this + // call itself. + assert_eq!( + driver.driver.get_migration_status().migration_out.unwrap(), + ApiMigrationStatus { + id: migration_id, + state: ApiMigrationState::Finish + } + ); + + driver + .driver + .handle_event(StateDriverEvent::External(ExternalRequest::Stop)); + + assert!(matches!(driver.api_state(), ApiInstanceState::Stopped)); + } + + #[tokio::test] + async fn paused_vm_resumes_after_failed_migration_out() { + let migration_id = Uuid::new_v4(); + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); + let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); + let (response_tx, mut response_rx) = tokio::sync::mpsc::channel(1); + let migrate_task = tokio::spawn(async move { + start_rx.await.unwrap(); + task_exit_rx.await.unwrap() + }); + + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + + // This test will simulate a migration out up through pausing the + // source, then fail migration. This should pause and resume all the + // devices and the vCPUs. + vm_ctrl.expect_pause_devices().times(1).returning(|| ()); + vm_ctrl.expect_resume_devices().times(1).returning(|| ()); + vcpu_ctrl.expect_pause_all().times(1).returning(|| ()); + vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); + + // VMM will be paused once prior to exporting state, and then resumed + // afterwards when the migration fails. + let mut pause_seq = Sequence::new(); + vm_ctrl + .expect_pause_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + vm_ctrl + .expect_resume_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + + let mut driver = make_state_driver(test_objects); + let hdl = std::thread::spawn(move || { + let outcome = driver.driver.handle_event( + StateDriverEvent::External(ExternalRequest::MigrateAsSource { + migration_id, + task: migrate_task, + start_tx, + command_rx, + response_tx, + }), + ); + + (driver, outcome) + }); + + // Simulate a successful pause. + command_tx.send(MigrateSourceCommand::Pause).await.unwrap(); + let resp = response_rx.recv().await.unwrap(); + assert!(matches!(resp, MigrateSourceResponse::Pause(Ok(())))); + + // Simulate failure. The migration protocol must both update the state + // to Error and make the task return `Err`. + command_tx + .send(MigrateSourceCommand::UpdateState(ApiMigrationState::Error)) + .await + .unwrap(); + drop(command_tx); + task_exit_tx.send(Err(MigrateError::UnexpectedMessage)).unwrap(); + + // Wait for the call to `handle_event` to return. 
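+        // (Aside, not part of the original change: `spawn_blocking` is used
+        // because `std::thread::JoinHandle::join` blocks the calling thread;
+        // running it on tokio's blocking pool keeps the async test runtime's
+        // worker threads free while the state driver thread winds down.)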
+ let (driver, outcome) = + tokio::task::spawn_blocking(move || hdl.join().unwrap()) + .await + .unwrap(); + + // The VM should be running and the state driver should continue + // operating normally. + assert!(matches!(driver.api_state(), ApiInstanceState::Running)); + assert_eq!(outcome, HandleEventOutcome::Continue); + assert_eq!( + driver.driver.get_migration_status().migration_out.unwrap(), + ApiMigrationStatus { + id: migration_id, + state: ApiMigrationState::Error + } + ); + } + + #[tokio::test] + async fn vm_starts_after_migration_in() { + let migration_id = Uuid::new_v4(); + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); + let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); + let migrate_task = tokio::spawn(async move { + start_rx.await.unwrap(); + task_exit_rx.await.unwrap() + }); + + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + + vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); + vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); + vm_ctrl.expect_start_devices().times(1).returning(|| Ok(())); + vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); + + let mut pause_seq = Sequence::new(); + vm_ctrl + .expect_pause_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + vm_ctrl + .expect_resume_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + + let mut driver = make_state_driver(test_objects); + + // The state driver expects to run on an OS thread outside the async + // runtime so that it can call `block_on` to wait for messages from the + // migration task. + let hdl = std::thread::spawn(move || { + driver.driver.handle_event(StateDriverEvent::External( + ExternalRequest::MigrateAsTarget { + migration_id, + task: migrate_task, + start_tx, + command_rx, + }, + )); + + driver + }); + + // Explicitly drop the command channel to signal to the driver that + // the migration task is completing. + command_tx + .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Finish)) + .await + .unwrap(); + drop(command_tx); + task_exit_tx.send(Ok(())).unwrap(); + + // Wait for the call to `handle_event` to return before tearing anything + // else down. 
+ let driver = tokio::task::spawn_blocking(move || hdl.join().unwrap()) + .await + .unwrap(); + + assert_eq!( + driver.driver.get_migration_status().migration_in.unwrap(), + ApiMigrationStatus { + id: migration_id, + state: ApiMigrationState::Finish + } + ); + assert!(matches!(driver.api_state(), ApiInstanceState::Running)); + } + + #[tokio::test] + async fn failed_migration_in_fails_instance() { + let migration_id = Uuid::new_v4(); + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); + let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); + let migrate_task = tokio::spawn(async move { + start_rx.await.unwrap(); + task_exit_rx.await.unwrap() + }); + + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + + vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); + vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); + vm_ctrl.expect_pause_vm().times(1).returning(|| ()); + vm_ctrl.expect_resume_vm().times(1).returning(|| ()); + let mut driver = make_state_driver(test_objects); + + // The state driver expects to run on an OS thread outside the async + // runtime so that it can call `block_on` to wait for messages from the + // migration task. + let hdl = std::thread::spawn(move || { + let outcome = driver.driver.handle_event( + StateDriverEvent::External(ExternalRequest::MigrateAsTarget { + migration_id, + task: migrate_task, + start_tx, + command_rx, + }), + ); + + (driver, outcome) + }); + + // The migration task is required to update the migration state to + // "Error" before exiting when migration fails. + command_tx + .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Error)) + .await + .unwrap(); + drop(command_tx); + task_exit_tx.send(Err(MigrateError::UnexpectedMessage)).unwrap(); + + // Wait for the call to `handle_event` to return. + let (driver, outcome) = + tokio::task::spawn_blocking(move || hdl.join().unwrap()) + .await + .unwrap(); + + // The migration should appear to have failed, but the VM should be + // preserved for debugging. 
+ assert_eq!(outcome, HandleEventOutcome::Continue); + assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); + assert_eq!( + driver.driver.get_migration_status().migration_in.unwrap(), + ApiMigrationStatus { + id: migration_id, + state: ApiMigrationState::Error + } + ); + } + + #[tokio::test] + async fn failed_vm_start_after_migration_in_fails_instance() { + let migration_id = Uuid::new_v4(); + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); + let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); + let migrate_task = tokio::spawn(async move { + start_rx.await.unwrap(); + task_exit_rx.await.unwrap() + }); + + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + + vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); + vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); + + let mut pause_seq = Sequence::new(); + vm_ctrl + .expect_pause_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + vm_ctrl + .expect_resume_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + + vm_ctrl + .expect_start_devices() + .times(1) + .returning(|| bail!("injected failure into start_devices!")); + + let mut driver = make_state_driver(test_objects); + + // The state driver expects to run on an OS thread outside the async + // runtime so that it can call `block_on` to wait for messages from the + // migration task. + let hdl = std::thread::spawn(move || { + let outcome = driver.driver.handle_event( + StateDriverEvent::External(ExternalRequest::MigrateAsTarget { + migration_id, + task: migrate_task, + start_tx, + command_rx, + }), + ); + + (driver, outcome) + }); + + // Explicitly drop the command channel to signal to the driver that + // the migration task is completing. + command_tx + .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Finish)) + .await + .unwrap(); + drop(command_tx); + task_exit_tx.send(Ok(())).unwrap(); + + // Wait for the call to `handle_event` to return. + let (driver, outcome) = + tokio::task::spawn_blocking(move || hdl.join().unwrap()) + .await + .unwrap(); + + // The instance should have failed, but should also be preserved for + // debugging. + assert_eq!(outcome, HandleEventOutcome::Continue); + assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); + + // The migration has still succeeded in this case. + assert_eq!( + driver.driver.get_migration_status().migration_in.unwrap(), + ApiMigrationStatus { + id: migration_id, + state: ApiMigrationState::Finish + } + ); + } + + #[tokio::test] + async fn start_vm_after_migration_in_does_not_publish_starting_state() { + let mut test_objects = make_default_mocks(); + let vm_ctrl = &mut test_objects.vm_ctrl; + let vcpu_ctrl = &mut test_objects.vcpu_ctrl; + + // A call to start a VM after a successful migration should start vCPUs + // and devices without resetting anything. + vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); + vm_ctrl.expect_start_devices().times(1).returning(|| Ok(())); + + // As noted below, the instance state is being magicked directly into a + // `Migrating` state, rather than executing the logic which would + // typically carry it there. As such, `pause_vm()` will not be called + // as part of setup. Since instance start _is_ being tested here, the + // `resume_vm()` call is expected. 
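+        // (Aside, not part of the original change: in mockall, `.never()`
+        // makes any call to the method fail the test, so the pair below
+        // asserts "resume exactly once, pause not at all":
+        //
+        //     mock.expect_pause_vm().never();
+        //     mock.expect_resume_vm().times(1).returning(|| ());
+        // )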
+ vm_ctrl.expect_pause_vm().never(); + vm_ctrl.expect_resume_vm().times(1).returning(|| ()); + + // Skip the rigmarole of standing up a fake migration. Instead, just + // push the driver into the state it would have after a successful + // migration to appease the assertions in `start_vm`. + // + // Faking an entire migration, as in the previous tests, requires the + // state driver to run on its own worker thread. This is fine for tests + // that only want to examine state after the driver has finished an + // operation, but this test wants to test side effects of a specific + // part of the state driver's actions, which are tough to synchronize + // with when the driver is running on another thread. + let mut driver = make_state_driver(test_objects); + driver.driver.set_instance_state(ApiInstanceState::Migrating); + + // The driver starts in the Migrating state and should go directly to + // the Running state without passing through Starting. Because there's + // no way to guarantee that the test will see all intermediate states + // that `start_vm` publishes, instead assert that the final state of + // Running is correct and that the state generation only went up by 1 + // (implying that there were no intervening transitions). + let migrating_gen = driver.driver.api_state_tx.borrow().gen; + driver.driver.start_vm(VmStartReason::MigratedIn); + let new_state = driver.driver.api_state_tx.borrow().clone(); + assert!(matches!(new_state.state, ApiInstanceState::Running)); + assert_eq!(new_state.gen, migrating_gen + 1); + } +} From 0c98b8096b9be5ed8d70a45cafa1f8555fc2d721 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 24 Jun 2024 21:59:19 +0000 Subject: [PATCH 09/55] [WIP] refactor state driver to resolve disaster Lose the `dyn VmLifecycle` in favor of a fake-based testing scheme to be created later. Reorganize structures and state driver startup to clean things up some more. This makes the changes feel like much less of a hacky mess than before. --- .../src/lib/migrate/destination.rs | 9 +- bin/propolis-server/src/lib/migrate/source.rs | 29 +- .../src/lib/vm/lifecycle_ops.rs | 122 +++-- bin/propolis-server/src/lib/vm/mod.rs | 75 ++- .../src/lib/vm/state_driver.rs | 437 +++++++++--------- 5 files changed, 329 insertions(+), 343 deletions(-) diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index f77f19104..f966f3916 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -171,7 +171,7 @@ impl DestinationProtocol { }?; info!(self.log(), "Destination read Preamble: {:?}", preamble); if let Err(e) = preamble - .is_migration_compatible(&*self.vm.objects().instance_spec()) + .is_migration_compatible(&*self.vm.objects().await.instance_spec()) { error!( self.log(), @@ -316,7 +316,7 @@ impl DestinationProtocol { info!(self.log(), "Devices: {devices:#?}"); { - let objects = self.vm.objects(); + let objects = self.vm.objects().await; let machine = objects.machine(); let migrate_ctx = MigrateCtx { mem: &machine.acc_mem.access().unwrap() }; @@ -368,7 +368,7 @@ impl DestinationProtocol { // Take a snapshot of the host hrtime/wall clock time, then adjust // time data appropriately. 
- let vmm_hdl = &self.vm.objects().machine().hdl.clone(); + let vmm_hdl = &self.vm.objects().await.machine().hdl.clone(); let (dst_hrt, dst_wc) = vmm::time::host_time_snapshot(vmm_hdl) .map_err(|e| { MigrateError::TimeData(format!( @@ -563,6 +563,7 @@ impl DestinationProtocol { self.vm .objects() + .await .com1() .import(&com1_history) .await @@ -637,7 +638,7 @@ impl DestinationProtocol { addr: GuestAddr, buf: &[u8], ) -> Result<(), MigrateError> { - let objects = self.vm.objects(); + let objects = self.vm.objects().await; let memctx = objects.machine().acc_mem.access().unwrap(); let len = buf.len(); memctx.write_from(addr, buf, len); diff --git a/bin/propolis-server/src/lib/migrate/source.rs b/bin/propolis-server/src/lib/migrate/source.rs index 99b2f38a0..c0435a704 100644 --- a/bin/propolis-server/src/lib/migrate/source.rs +++ b/bin/propolis-server/src/lib/migrate/source.rs @@ -148,8 +148,13 @@ pub async fn migrate( // See the lengthy comment on `RamOfferDiscipline` above for more // details about what's going on here. for (&GuestAddr(gpa), dirtiness) in proto.dirt.iter().flatten() { - if let Err(e) = - proto.vm.objects().machine().hdl.set_dirty_pages(gpa, dirtiness) + if let Err(e) = proto + .vm + .objects() + .await + .machine() + .hdl + .set_dirty_pages(gpa, dirtiness) { // Bad news! Our attempt to re-set the dirty bit on these // pages has failed! Thus, subsequent migration attempts @@ -243,7 +248,8 @@ impl SourceProtocol { conn: WebSocketStream, ) -> Self { let dirt = { - let can_npt_operate = vm.objects().machine().hdl.can_npt_operate(); + let can_npt_operate = + vm.objects().await.machine().hdl.can_npt_operate(); // TODO(gjc) the pre-pause offer phase needs to look at whether // redirtying has previously failed. This is done over the command @@ -323,7 +329,7 @@ impl SourceProtocol { async fn sync(&mut self) -> Result<(), MigrateError> { self.update_state(MigrationState::Sync).await; let preamble = Preamble::new(VersionedInstanceSpec::V0( - self.vm.objects().instance_spec().clone(), + self.vm.objects().await.instance_spec().clone(), )); let s = ron::ser::to_string(&preamble) .map_err(codec::ProtocolError::from)?; @@ -543,13 +549,13 @@ impl SourceProtocol { self.update_state(MigrationState::Device).await; let mut device_states = vec![]; { - let objects = self.vm.objects(); + let objects = self.vm.objects().await; let machine = objects.machine(); let migrate_ctx = MigrateCtx { mem: &machine.acc_mem.access().unwrap() }; // Collect together the serialized state for all the devices - self.vm.for_each_device_fallible(|name, devop| { + objects.for_each_device_fallible(|name, devop| { let mut dev = Device { instance_name: name.to_string(), payload: Vec::new(), @@ -590,7 +596,7 @@ impl SourceProtocol { } } Ok(()) - }).await?; + })?; } info!(self.log(), "Device States: {device_states:#?}"); @@ -607,7 +613,7 @@ impl SourceProtocol { // Read and send over the time data async fn time_data(&mut self) -> Result<(), MigrateError> { - let vmm_hdl = &self.vm.objects().machine().hdl.clone(); + let vmm_hdl = &self.vm.objects().await.machine().hdl.clone(); let vm_time_data = vmm::time::export_time_data(vmm_hdl).map_err(|e| { MigrateError::TimeData(format!( @@ -646,7 +652,7 @@ impl SourceProtocol { _ => return Err(MigrateError::UnexpectedMessage), }; let com1_history = - self.vm.objects().com1().export_history(remote_addr).await?; + self.vm.objects().await.com1().export_history(remote_addr).await?; self.send_msg(codec::Message::Serialized(com1_history)).await?; self.read_ok().await } @@ -756,7 +762,7 @@ 
impl SourceProtocol { async fn vmm_ram_bounds( &mut self, ) -> Result, MigrateError> { - let objects = self.vm.objects(); + let objects = self.vm.objects().await; let machine = objects.machine(); let memctx = machine.acc_mem.access().unwrap(); memctx.mem_bounds().ok_or(MigrateError::InvalidInstanceState) @@ -769,6 +775,7 @@ impl SourceProtocol { ) -> Result<(), MigrateError> { self.vm .objects() + .await .machine() .hdl .track_dirty_pages(start_gpa.0, bits) @@ -780,7 +787,7 @@ impl SourceProtocol { addr: GuestAddr, buf: &mut [u8], ) -> Result<(), MigrateError> { - let objects = self.vm.objects(); + let objects = self.vm.objects().await; let machine = objects.machine(); let memctx = machine.acc_mem.access().unwrap(); let len = buf.len(); diff --git a/bin/propolis-server/src/lib/vm/lifecycle_ops.rs b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs index 88245be4f..aae45c785 100644 --- a/bin/propolis-server/src/lib/vm/lifecycle_ops.rs +++ b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs @@ -10,21 +10,7 @@ use std::{ use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt}; use slog::{error, info}; -/// Commands that the VM state driver can invoke on its active VM to pause, -/// resume, and reset the devices under its care. -/// -/// These functions are abstracted into a trait to allow them to be mocked out -/// while testing the rest of the state driver. -#[cfg_attr(test, mockall::automock)] pub(super) trait VmLifecycle: Send + Sync { - /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated - /// devices and vCPUs are brought to a consistent state. - /// - /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl) - /// will fail. A corresponding `resume_vm()` call must be made prior to - /// allowing vCPU tasks to run. - fn pause_vm(&self); - /// Resume a previously-paused VM at the kernel VMM level. This will resume /// any timers driving in-kernel-emulated devices, and allow the vCPU to run /// again. @@ -51,56 +37,56 @@ pub(super) trait VmLifecycle: Send + Sync { fn reset_vcpu_state(&self); } -impl VmLifecycle for super::ActiveVm { - fn pause_vm(&self) { +impl super::VmObjects { + /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated + /// devices and vCPUs are brought to a consistent state. + /// + /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl) + /// will fail. A corresponding `resume_vm()` call must be made prior to + /// allowing vCPU tasks to run. 
+ pub(super) fn pause_vm(&self) { info!(self.log, "pausing kernel VMM resources"); - self.objects().machine().hdl.pause().expect("VM_PAUSE should succeed"); + self.machine.hdl.pause().expect("VM_PAUSE should succeed"); } - fn resume_vm(&self) { + pub(super) fn resume_vm(&self) { info!(self.log, "resuming kernel VMM resources"); - self.objects() - .machine() - .hdl - .resume() - .expect("VM_RESUME should succeed"); + self.machine.hdl.resume().expect("VM_RESUME should succeed"); } - fn reset_devices_and_machine(&self) { + pub(super) fn reset_devices_and_machine(&self) { self.for_each_device(|name, dev| { info!(self.log, "sending reset request to {}", name); dev.reset(); }); - self.objects().machine().reinitialize().unwrap(); + self.machine.reinitialize().unwrap(); } - fn start_devices(&self) -> BoxFuture<'_, anyhow::Result<()>> { - Box::pin(async { - self.objects().for_each_device_fallible(|name, dev| { - info!(self.log, "sending startup complete to {}", name); - let res = dev.start(); - if let Err(e) = &res { - error!(self.log, "startup failed for {}: {:?}", name, e); - } - res - })?; - - for (name, backend) in self.objects.block_backends.iter() { - info!(self.log, "starting block backend {}", name); - let res = backend.start().await; - if let Err(e) = &res { - error!(self.log, "Startup failed for {}: {:?}", name, e); - return res; - } + pub(super) async fn start_devices(&self) -> anyhow::Result<()> { + self.for_each_device_fallible(|name, dev| { + info!(self.log, "sending startup complete to {}", name); + let res = dev.start(); + if let Err(e) = &res { + error!(self.log, "startup failed for {}: {:?}", name, e); } - Ok(()) - }) + res + })?; + + for (name, backend) in self.block_backends.iter() { + info!(self.log, "starting block backend {}", name); + let res = backend.start().await; + if let Err(e) = &res { + error!(self.log, "Startup failed for {}: {:?}", name, e); + return res; + } + } + + Ok(()) } - fn pause_devices(&self) -> BoxFuture<'_, ()> { - let objects = self.objects(); - objects.for_each_device(|name, dev| { + pub(super) async fn pause_devices(&self) { + self.for_each_device(|name, dev| { info!(self.log, "sending pause request to {}", name); dev.pause(); }); @@ -126,7 +112,7 @@ impl VmLifecycle for super::ActiveVm { } info!(self.log, "waiting for devices to pause"); - let mut stream: FuturesUnordered<_> = objects + let mut stream: FuturesUnordered<_> = self .lifecycle_components .iter() .map(|(name, dev)| { @@ -135,38 +121,34 @@ impl VmLifecycle for super::ActiveVm { }) .collect(); - let log_fut = self.log.clone(); - Box::pin(async move { - loop { - match stream.next().await { - Some(name) => { - info!(log_fut, "dev {} completed pause", name); - } - - None => { - info!(log_fut, "all devices paused"); - break; - } + loop { + match stream.next().await { + Some(name) => { + info!(self.log, "dev {} completed pause", name); + } + + None => { + info!(self.log, "all devices paused"); + break; } } - }) + } } - fn resume_devices(&self) { - self.objects().for_each_device(|name, dev| { + pub(super) fn resume_devices(&self) { + self.for_each_device(|name, dev| { info!(self.log, "sending resume request to {}", name); dev.resume(); }) } - fn halt_devices(&self) { - let objects = self.objects(); - objects.for_each_device(|name, dev| { + pub(super) fn halt_devices(&self) { + self.for_each_device(|name, dev| { info!(self.log, "sending halt request to {}", name); dev.halt(); }); - for (name, backend) in objects.block_backends.iter() { + for (name, backend) in self.block_backends.iter() { 
info!(self.log, "stopping and detaching block backend {}", name); backend.stop(); if let Err(err) = backend.detach() { @@ -177,8 +159,8 @@ impl VmLifecycle for super::ActiveVm { } } - fn reset_vcpu_state(&self) { - for vcpu in self.objects().machine().vcpus.iter() { + pub(super) fn reset_vcpu_state(&self) { + for vcpu in self.machine.vcpus.iter() { info!(self.log, "resetting vCPU {}", vcpu.id); vcpu.activate().unwrap(); vcpu.reboot_state().unwrap(); diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index a8680b0ac..12db8ec66 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -60,6 +60,7 @@ pub(crate) struct Vm { } struct VmObjects { + log: slog::Logger, instance_spec: InstanceSpecV0, machine: Machine, lifecycle_components: LifecycleMap, @@ -86,11 +87,15 @@ impl VmObjects { self.lifecycle_components.get(name).cloned() } + pub(crate) fn block_backends(&self) -> &BlockBackendMap { + &self.block_backends + } + pub(crate) fn com1(&self) -> &Arc> { &self.com1 } - fn for_each_device( + pub(crate) fn for_each_device( &self, mut func: impl FnMut(&str, &Arc), ) { @@ -99,7 +104,7 @@ impl VmObjects { } } - fn for_each_device_fallible( + pub(crate) fn for_each_device_fallible( &self, mut func: impl FnMut( &str, @@ -132,7 +137,7 @@ pub(super) struct ActiveVm { properties: InstanceProperties, - objects: RwLock, + objects: tokio::sync::RwLock, services: tokio::sync::Mutex>, } @@ -141,25 +146,10 @@ impl ActiveVm { &self.log } - pub(crate) fn objects(&self) -> RwLockReadGuard<'_, VmObjects> { - self.objects.read().unwrap() - } - - pub(crate) async fn for_each_device( - &self, - func: impl FnMut(&str, &Arc), - ) { - self.objects().for_each_device(func); - } - - pub(crate) async fn for_each_device_fallible( + pub(crate) async fn objects( &self, - func: impl FnMut( - &str, - &Arc, - ) -> std::result::Result<(), E>, - ) -> std::result::Result<(), E> { - self.objects().for_each_device_fallible(func) + ) -> tokio::sync::RwLockReadGuard<'_, VmObjects> { + self.objects.read().await } async fn stop_services(&self) { @@ -168,17 +158,6 @@ impl ActiveVm { } } -impl Drop for ActiveVm { - fn drop(&mut self) { - let mut guard = self.parent.state.write().unwrap(); - *guard = VmState::Defunct(DefunctVm { - external_state_rx: self.external_state_rx.clone(), - properties: self.properties.clone(), - spec: self.objects().instance_spec.clone(), - }); - } -} - struct DefunctVm { external_state_rx: InstanceStateRx, properties: InstanceProperties, @@ -241,6 +220,24 @@ impl Vm { } } + async fn make_defunct(&self) { + let mut guard = self.state.write().unwrap(); + let old = std::mem::replace(&mut *guard, VmState::NoVm); + match old { + VmState::Active(vm) => { + let active = vm.upgrade().expect("state driver holds a ref"); + *guard = VmState::Defunct(DefunctVm { + external_state_rx: active.external_state_rx.clone(), + properties: active.properties.clone(), + spec: active.objects().await.instance_spec.clone(), + }); + } + _ => unreachable!( + "only an active VM's state worker calls make_defunct" + ), + } + } + pub async fn ensure( self: &Arc, log: slog::Logger, @@ -259,20 +256,6 @@ impl Vm { *guard = VmState::WaitingToStart; - let (external_tx, external_rx) = - tokio::sync::watch::channel(InstanceStateMonitorResponse { - gen: 1, - state: propolis_api_types::InstanceState::Starting, - migration: propolis_api_types::InstanceMigrateStatusResponse { - migration_in: None, - migration_out: None, - }, - }); - - let input_queue = 
state_driver::InputQueue::new( - log.new(slog::o!("component" => "vmm_request_queue")), - ); - let state_driver = state_driver::StateDriver::new( log, self.clone(), diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 652434400..a10561567 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -25,7 +25,7 @@ use crate::{ use super::{ guest_event::{self, GuestEvent}, - lifecycle_ops, VmError, + VmError, VmObjects, }; struct MigrationStateUpdate { @@ -194,67 +194,191 @@ impl guest_event::ChipsetEventHandler for InputQueue { } /// The context for a VM state driver task. -pub(super) struct StateDriver { +struct StateDriver { log: slog::Logger, - parent_vm: Arc, + parent: Arc, + active_vm: Arc, input_queue: Arc, external_state_tx: super::InstanceStateTx, paused: bool, - vcpu_tasks: Option>, - vm_lifecycle: Option>, + vcpu_tasks: Box, migration_src_state: crate::migrate::source::PersistentState, } -impl StateDriver { - pub(super) fn new( - log: slog::Logger, - vm: Arc, - input_queue: Arc, - external_state_tx: super::InstanceStateTx, - ) -> Self { - let log = log.new(slog::o!("component" => "state_driver")); - Self { - log, - parent_vm: vm, - input_queue, - external_state_tx, - paused: false, - vcpu_tasks: None, - vm_lifecycle: None, - migration_src_state: Default::default(), +pub(super) async fn run_state_driver( + log: slog::Logger, + vm: &Arc, + ensure_request: propolis_api_types::InstanceSpecEnsureRequest, + ensure_result_tx: tokio::sync::oneshot::Sender< + Result, + >, + ensure_options: super::EnsureOptions, +) { + let (external_tx, external_rx) = tokio::sync::watch::channel( + propolis_api_types::InstanceStateMonitorResponse { + gen: 1, + state: propolis_api_types::InstanceState::Starting, + migration: propolis_api_types::InstanceMigrateStatusResponse { + migration_in: None, + migration_out: None, + }, + }, + ); + let input_queue = Arc::new(InputQueue::new( + log.new(slog::o!("component" => "vmm_request_queue")), + )); + + let (vm_objects, vcpu_tasks) = match initialize_vm_from_spec( + &log, + &input_queue, + &ensure_request.properties, + &ensure_request.instance_spec, + ensure_options, + ) + .await + { + Ok(objects) => objects, + Err(e) => { + let _ = + ensure_result_tx.send(Err(VmError::InitializationFailed(e))); + return; } - } + }; + + let active_vm = Arc::new(super::ActiveVm { + parent: vm.clone(), + log: log.clone(), + state_driver_queue: input_queue, + external_state_rx: external_rx, + properties: ensure_request.properties, + objects: tokio::sync::RwLock::new(vm_objects), + services: tokio::sync::Mutex::new(todo!("gjc")), + }); + + let state_driver = StateDriver { + log: log.new(slog::o!("component" => "vmm_state_driver")), + parent: vm.clone(), + active_vm, + input_queue, + external_state_tx: external_tx, + paused: false, + vcpu_tasks, + migration_src_state: Default::default(), + }; + + state_driver.run(ensure_result_tx).await +} + +async fn initialize_vm_from_spec( + log: &slog::Logger, + event_queue: &Arc, + properties: &InstanceProperties, + spec: &VersionedInstanceSpec, + options: super::EnsureOptions, +) -> anyhow::Result<(VmObjects, Box)> { + info!(log, "initializing new VM"; + "spec" => #?spec, + "properties" => #?properties, + "use_reservoir" => options.use_reservoir, + "bootrom" => %options.toml_config.bootrom.display()); + let vmm_log = log.new(slog::o!("component" => "vmm")); + + // Set up the 'shell' instance into which the rest of this routine will + 
// add components. + let VersionedInstanceSpec::V0(v0_spec) = &spec; + let machine = build_instance( + &properties.vm_name(), + v0_spec, + options.use_reservoir, + vmm_log, + )?; + + let mut init = MachineInitializer { + log: log.clone(), + machine: &machine, + devices: Default::default(), + block_backends: Default::default(), + crucible_backends: Default::default(), + spec: &v0_spec, + properties: &properties, + toml_config: &options.toml_config, + producer_registry: options.oximeter_registry, + state: MachineInitializerState::default(), + }; + + init.initialize_rom(options.toml_config.bootrom.as_path())?; + let chipset = init.initialize_chipset( + &(event_queue.clone() + as Arc), + )?; + + init.initialize_rtc(&chipset)?; + init.initialize_hpet()?; + + let com1 = Arc::new(init.initialize_uart(&chipset)?); + let ps2ctrl = init.initialize_ps2(&chipset)?; + init.initialize_qemu_debug_port()?; + init.initialize_qemu_pvpanic(properties.into())?; + init.initialize_network_devices(&chipset)?; + + #[cfg(not(feature = "omicron-build"))] + init.initialize_test_devices(&options.toml_config.devices)?; + #[cfg(feature = "omicron-build")] + info!(log, "`omicron-build` feature enabled, ignoring any test devices"); + + #[cfg(feature = "falcon")] + init.initialize_softnpu_ports(&chipset)?; + #[cfg(feature = "falcon")] + init.initialize_9pfs(&chipset)?; + + init.initialize_storage_devices(&chipset, options.nexus_client).await?; + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; + init.initialize_cpus()?; + let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( + &machine, + event_queue.clone() as Arc, + log.new(slog::o!("component" => "vcpu_tasks")), + )?); + + let MachineInitializer { + devices, block_backends, crucible_backends, .. + } = init; + + Ok(( + VmObjects { + log: log.clone(), + instance_spec: v0_spec.clone(), + machine, + lifecycle_components: devices, + block_backends, + crucible_backends, + com1, + framebuffer: Some(ramfb), + ps2ctrl, + }, + vcpu_tasks as Box, + )) +} + +impl StateDriver { pub(super) async fn run( mut self, - ensure_request: propolis_api_types::InstanceSpecEnsureRequest, - reply_tx: tokio::sync::oneshot::Sender< + ensure_result_tx: tokio::sync::oneshot::Sender< Result, >, - ensure_options: super::EnsureOptions, - external_state_rx: super::InstanceStateRx, ) { - match self - .initialize_vm(ensure_request, ensure_options, external_state_rx) - .await - { - Ok(active) => { - self.parent_vm.make_active(active.clone()); - self.vm_lifecycle = - Some(active.clone() as Arc); - - // TODO(gjc) deal with migration - reply_tx.send(Ok(propolis_api_types::InstanceEnsureResponse { - migrate: None, - })); - self.run_loop().await; - active.stop_services().await; - } - Err(e) => { - self.parent_vm.start_failed(); - reply_tx.send(Err(VmError::InitializationFailed(e))); - } - } + self.parent.make_active(self.active_vm.clone()); + self.update_external_state(ExternalStateUpdate::Instance( + InstanceState::Starting, + )); + ensure_result_tx.send(Ok(propolis_api_types::InstanceEnsureResponse { + migrate: None, + })); + + self.run_loop().await; + self.active_vm.stop_services().await; + self.parent.make_defunct(); } fn update_external_state(&mut self, state: ExternalStateUpdate) { @@ -292,137 +416,7 @@ impl StateDriver { ); } - async fn initialize_vm( - &mut self, - request: propolis_api_types::InstanceSpecEnsureRequest, - options: super::EnsureOptions, - external_state_rx: super::InstanceStateRx, - ) -> anyhow::Result> { - let active_vm = match request.migrate { - None => { - let 
vm_objects = self - .initialize_vm_from_spec( - &request.properties, - &request.instance_spec, - options, - ) - .await?; - let VersionedInstanceSpec::V0(v0_spec) = request.instance_spec; - let active_vm = Arc::new(super::ActiveVm { - parent: self.parent_vm.clone(), - log: self.log.clone(), - state_driver_queue: self.input_queue.clone(), - external_state_rx, - properties: request.properties, - spec: v0_spec, - objects: vm_objects, - }); - - active_vm - } - Some(_migrate_request) => todo!("gjc"), - }; - - Ok(active_vm) - } - - /// Initializes all of the components of a VM from the supplied - /// specification. - async fn initialize_vm_from_spec( - &mut self, - properties: &InstanceProperties, - spec: &VersionedInstanceSpec, - options: super::EnsureOptions, - ) -> anyhow::Result { - info!(self.log, "initializing new VM"; - "spec" => #?spec, - "properties" => #?properties, - "use_reservoir" => options.use_reservoir, - "bootrom" => %options.toml_config.bootrom.display()); - - let vmm_log = self.log.new(slog::o!("component" => "vmm")); - - // Set up the 'shell' instance into which the rest of this routine will - // add components. - let VersionedInstanceSpec::V0(v0_spec) = &spec; - let machine = build_instance( - &properties.vm_name(), - v0_spec, - options.use_reservoir, - vmm_log, - )?; - - let mut init = MachineInitializer { - log: self.log.clone(), - machine: &machine, - devices: Default::default(), - block_backends: Default::default(), - crucible_backends: Default::default(), - spec: &v0_spec, - properties: &properties, - toml_config: &options.toml_config, - producer_registry: options.oximeter_registry, - state: MachineInitializerState::default(), - }; - - init.initialize_rom(options.toml_config.bootrom.as_path())?; - let chipset = init.initialize_chipset( - &(self.input_queue.clone() - as Arc), - )?; - - init.initialize_rtc(&chipset)?; - init.initialize_hpet()?; - - let com1 = Arc::new(init.initialize_uart(&chipset)?); - let ps2ctrl = init.initialize_ps2(&chipset)?; - init.initialize_qemu_debug_port()?; - init.initialize_qemu_pvpanic(properties.into())?; - init.initialize_network_devices(&chipset)?; - - #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&options.toml_config.devices)?; - #[cfg(feature = "omicron-build")] - info!( - self.log, - "`omicron-build` feature enabled, ignoring any test devices" - ); - - #[cfg(feature = "falcon")] - init.initialize_softnpu_ports(&chipset)?; - #[cfg(feature = "falcon")] - init.initialize_9pfs(&chipset)?; - - init.initialize_storage_devices(&chipset, options.nexus_client).await?; - let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; - init.initialize_cpus()?; - let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( - &machine, - self.input_queue.clone() - as Arc, - self.log.new(slog::o!("component" => "vcpu_tasks")), - )?); - - let MachineInitializer { - devices, - block_backends, - crucible_backends, - .. - } = init; - - self.vcpu_tasks = Some(vcpu_tasks as Box); - Ok(super::VmObjects { - machine, - lifecycle_components: devices, - block_backends, - crucible_backends, - com1, - framebuffer: Some(ramfb), - ps2ctrl, - }) - } - - async fn run_loop(mut self) { + async fn run_loop(&mut self) { info!(self.log, "state driver launched"); loop { @@ -509,21 +503,26 @@ impl StateDriver { InstanceState::Rebooting, )); - // Reboot is implemented as a pause -> reset -> resume transition. - // - // First, pause the vCPUs and all devices so no partially-completed - // work is present. 
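(The sequence being removed here survives, reshuffled, in the replacement code
later in this hunk. As a minimal sketch, not part of the patch and assuming the
`VmObjects` methods and `VcpuTaskController` trait introduced elsewhere in this
series, the whole reboot path reduces to the following, with `reset_vcpus`
inlined as its two constituent calls:)

    // Sketch only: the reboot sequence and its required ordering, written
    // against this series' types.
    async fn reboot_sequence(
        objects: &VmObjects,
        vcpus: &mut dyn VcpuTaskController,
    ) {
        // Pause vCPUs and then devices so no partially-completed work
        // remains outstanding.
        vcpus.pause_all();
        objects.pause_devices().await;

        // Reset devices and the VM's bhyve state, then the vCPUs; the vCPU
        // reset must come after the bhyve reset.
        objects.reset_devices_and_machine();
        vcpus.new_generation();
        objects.reset_vcpu_state();

        // Resume devices first so they are ready to do work before the
        // vCPUs run again.
        objects.resume_devices();
        vcpus.resume_all();
    }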
- self.vcpu_tasks().pause_all(); - self.vm_lifecycle().pause_devices().await; - - // Reset all entities and the VM's bhyve state, then reset the vCPUs. - // The vCPU reset must come after the bhyve reset. - self.vm_lifecycle().reset_devices_and_machine(); - self.reset_vcpus(); - - // Resume devices so they're ready to do more work, then resume vCPUs. - self.vm_lifecycle().resume_devices(); - self.vcpu_tasks().resume_all(); + { + let (vm_objects, vcpu_tasks) = self.vm_objects_and_cpus().await; + + // Reboot is implemented as a pause -> reset -> resume transition. + // + // First, pause the vCPUs and all devices so no partially-completed + // work is present. + vcpu_tasks.pause_all(); + vm_objects.pause_devices().await; + + // Reset all entities and the VM's bhyve state, then reset the + // vCPUs. The vCPU reset must come after the bhyve reset. + vm_objects.reset_devices_and_machine(); + reset_vcpus(&*vm_objects, vcpu_tasks); + + // Resume devices so they're ready to do more work, then resume + // vCPUs. + vm_objects.resume_devices(); + vcpu_tasks.resume_all(); + } // Notify other consumers that the instance successfully rebooted and is // now back to Running. @@ -548,32 +547,33 @@ impl StateDriver { self.pause().await; } - self.vcpu_tasks().exit_all(); - self.vm_lifecycle().halt_devices(); + self.vcpu_tasks.exit_all(); + self.vm_objects().await.halt_devices(); self.publish_steady_state(InstanceState::Stopped); } async fn pause(&mut self) { assert!(!self.paused); - self.vcpu_tasks().pause_all(); - self.vm_lifecycle().pause_devices().await; - self.vm_lifecycle().pause_vm(); + self.vcpu_tasks.pause_all(); + { + let objects = self.vm_objects().await; + objects.pause_devices().await; + objects.pause_vm(); + } self.paused = true; } - fn resume(&mut self) { + async fn resume(&mut self) { assert!(self.paused); - self.vm_lifecycle().resume_vm(); - self.vm_lifecycle().resume_devices(); - self.vcpu_tasks().resume_all(); + { + let objects = self.vm_objects().await; + objects.resume_vm(); + objects.resume_devices(); + } + self.vcpu_tasks.resume_all(); self.paused = false; } - fn reset_vcpus(&mut self) { - self.vcpu_tasks().new_generation(); - self.vm_lifecycle.as_ref().unwrap().reset_vcpu_state(); - } - fn publish_steady_state(&mut self, state: InstanceState) { let change = match state { InstanceState::Running => { @@ -595,11 +595,24 @@ impl StateDriver { self.update_external_state(ExternalStateUpdate::Instance(state)); } - fn vcpu_tasks(&mut self) -> &mut dyn VcpuTaskController { - self.vcpu_tasks.as_mut().unwrap().as_mut() + async fn vm_objects(&self) -> tokio::sync::RwLockReadGuard<'_, VmObjects> { + self.active_vm.objects().await } - fn vm_lifecycle(&self) -> &dyn lifecycle_ops::VmLifecycle { - self.vm_lifecycle.as_ref().unwrap().as_ref() + async fn vm_objects_and_cpus( + &mut self, + ) -> ( + tokio::sync::RwLockReadGuard<'_, VmObjects>, + &mut dyn VcpuTaskController, + ) { + (self.active_vm.objects().await, self.vcpu_tasks.as_mut()) } } + +fn reset_vcpus( + vm_objects: &VmObjects, + vcpu_tasks: &mut dyn VcpuTaskController, +) { + vcpu_tasks.new_generation(); + vm_objects.reset_vcpu_state(); +} From c23065c4fc0856bdc0a2bc1480cce3ecfd7e9e7c Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 24 Jun 2024 23:34:15 +0000 Subject: [PATCH 10/55] [WIP] figure out VM services --- bin/propolis-server/src/lib/migrate/mod.rs | 1 - bin/propolis-server/src/lib/vm/mod.rs | 17 +-- bin/propolis-server/src/lib/vm/services.rs | 143 +++++++++++++++++- .../src/lib/vm/state_driver.rs | 23 ++- 
bin/propolis-server/src/lib/vnc.rs | 34 +++-- 5 files changed, 185 insertions(+), 33 deletions(-) diff --git a/bin/propolis-server/src/lib/migrate/mod.rs b/bin/propolis-server/src/lib/migrate/mod.rs index d2afba1fe..d16979f10 100644 --- a/bin/propolis-server/src/lib/migrate/mod.rs +++ b/bin/propolis-server/src/lib/migrate/mod.rs @@ -20,7 +20,6 @@ use tokio_tungstenite::{tungstenite, WebSocketStream}; use uuid::Uuid; use crate::server::{DropshotEndpointContext, VmControllerState}; -use crate::vm::ActiveVm; mod codec; pub mod destination; diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 12db8ec66..a3b5a0232 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -17,10 +17,12 @@ use propolis::{ }; use propolis_api_types::{ instance_spec::v0::InstanceSpecV0, InstanceProperties, - InstanceStateMonitorResponse, }; +use rfb::server::VncServer; -use crate::serial::Serial; +use crate::{ + serial::Serial, server::MetricsEndpointConfig, vnc::PropolisVncServer, +}; pub(crate) mod guest_event; mod lifecycle_ops; @@ -117,13 +119,6 @@ impl VmObjects { Ok(()) } - - pub(crate) fn device_by_name( - &self, - name: &String, - ) -> Option> { - self.vm_objects.devices.get(name).cloned() - } } /// The state stored in a [`Vm`] when there is an actual underlying virtual @@ -138,7 +133,7 @@ pub(super) struct ActiveVm { properties: InstanceProperties, objects: tokio::sync::RwLock, - services: tokio::sync::Mutex>, + services: services::VmServices, } impl ActiveVm { @@ -175,8 +170,10 @@ enum VmState { pub(super) struct EnsureOptions { pub toml_config: Arc, pub use_reservoir: bool, + pub metrics_config: Option, pub oximeter_registry: Option, pub nexus_client: Option, + pub vnc_server: Arc>, } impl Vm { diff --git a/bin/propolis-server/src/lib/vm/services.rs b/bin/propolis-server/src/lib/vm/services.rs index 538973a40..93edada29 100644 --- a/bin/propolis-server/src/lib/vm/services.rs +++ b/bin/propolis-server/src/lib/vm/services.rs @@ -7,10 +7,16 @@ use std::sync::Arc; +use oximeter::types::ProducerRegistry; use rfb::server::VncServer; -use slog::{error, Logger}; +use slog::{error, info, Logger}; -use crate::{serial::SerialTaskControlMessage, vnc::PropolisVncServer}; +use crate::{ + serial::SerialTaskControlMessage, server::MetricsEndpointConfig, + stats::virtual_machine::VirtualMachine, vnc::PropolisVncServer, +}; + +use super::VmObjects; #[derive(Default)] struct OximeterState { @@ -25,6 +31,52 @@ pub(super) struct VmServices { } impl VmServices { + pub(super) async fn new( + log: &slog::Logger, + vm: &Arc, + vm_objects: &VmObjects, + vm_properties: &propolis_api_types::InstanceProperties, + ensure_options: &super::EnsureOptions, + ) -> Self { + let oximeter_state = if let Some(cfg) = &ensure_options.metrics_config { + let registry = ensure_options.oximeter_registry.as_ref().expect( + "should have a producer registry if metrics are configured", + ); + + let state = + register_oximeter_producer(log, cfg, registry, vm_properties) + .await; + + state + } else { + OximeterState::default() + }; + + let vnc_server = ensure_options.vnc_server.clone(); + if let Some(ramfb) = &vm_objects.framebuffer { + vnc_server.server.initialize( + crate::vnc::RamFb::new(ramfb.get_framebuffer_spec()), + vm_objects.ps2ctrl.clone(), + vm.clone(), + ); + + let notifier_server_ref = vnc_server.clone(); + let rt = tokio::runtime::Handle::current(); + ramfb.set_notifier(Box::new(move |config, is_valid| { + let vnc = notifier_server_ref.clone(); + 
rt.block_on(vnc.server.update(config, is_valid, &vnc)); + })); + } + + let serial_task = start_serial_task(log, vm_objects).await; + + Self { + serial_task: tokio::sync::Mutex::new(Some(serial_task)), + oximeter: tokio::sync::Mutex::new(oximeter_state), + vnc_server, + } + } + pub(super) async fn stop(&self, log: &Logger) { self.vnc_server.stop().await; @@ -47,3 +99,90 @@ impl VmServices { let _ = oximeter_state.stats.take(); } } + +async fn register_oximeter_producer( + log: &slog::Logger, + cfg: &MetricsEndpointConfig, + registry: &ProducerRegistry, + vm_properties: &propolis_api_types::InstanceProperties, +) -> OximeterState { + let mut oximeter_state = OximeterState::default(); + let virtual_machine = VirtualMachine::from(vm_properties); + + // Create the server itself. + // + // The server manages all details of the registration with Nexus, so we + // don't need our own task for that or way to shut it down. + oximeter_state.server = match crate::stats::start_oximeter_server( + virtual_machine.instance_id, + &cfg, + &log, + registry, + ) { + Ok(server) => { + info!(log, "created metric producer server"); + Some(server) + } + Err(err) => { + error!( + log, + "failed to construct metric producer server, \ + no metrics will be available for this instance."; + "error" => ?err, + ); + None + } + }; + + // Assign our own metrics production for this VM instance to the + // registry, letting the server actually return them to oximeter when + // polled. + oximeter_state.stats = match crate::stats::register_server_metrics( + registry, + virtual_machine, + &log, + ) + .await + { + Ok(stats) => Some(stats), + Err(e) => { + error!( + log, + "failed to register our server metrics with \ + the ProducerRegistry, no server stats will \ + be produced"; + "error" => ?e, + ); + + None + } + }; + + oximeter_state +} + +async fn start_serial_task( + log: &slog::Logger, + vm_objects: &VmObjects, +) -> crate::serial::SerialTask { + let (websocks_ch, websocks_recv) = tokio::sync::mpsc::channel(1); + let (control_ch, control_recv) = tokio::sync::mpsc::channel(1); + + let serial = vm_objects.com1().clone(); + serial.set_task_control_sender(control_ch.clone()).await; + let err_log = log.new(slog::o!("component" => "serial task")); + let task = tokio::spawn(async move { + if let Err(e) = crate::serial::instance_serial_task( + websocks_recv, + control_recv, + serial, + err_log.clone(), + ) + .await + { + error!(err_log, "Failure in serial task: {}", e); + } + }); + + crate::serial::SerialTask { task, control_ch, websocks_ch } +} diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index a10561567..9d83b43c5 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -233,7 +233,7 @@ pub(super) async fn run_state_driver( &input_queue, &ensure_request.properties, &ensure_request.instance_spec, - ensure_options, + &ensure_options, ) .await { @@ -245,14 +245,23 @@ pub(super) async fn run_state_driver( } }; + let services = super::services::VmServices::new( + &log, + vm, + &vm_objects, + &ensure_request.properties, + &ensure_options, + ) + .await; + let active_vm = Arc::new(super::ActiveVm { parent: vm.clone(), log: log.clone(), - state_driver_queue: input_queue, + state_driver_queue: input_queue.clone(), external_state_rx: external_rx, properties: ensure_request.properties, objects: tokio::sync::RwLock::new(vm_objects), - services: tokio::sync::Mutex::new(todo!("gjc")), + services, }); let state_driver = 
StateDriver { @@ -274,7 +283,7 @@ async fn initialize_vm_from_spec( event_queue: &Arc, properties: &InstanceProperties, spec: &VersionedInstanceSpec, - options: super::EnsureOptions, + options: &super::EnsureOptions, ) -> anyhow::Result<(VmObjects, Box)> { info!(log, "initializing new VM"; "spec" => #?spec, @@ -303,7 +312,7 @@ async fn initialize_vm_from_spec( spec: &v0_spec, properties: &properties, toml_config: &options.toml_config, - producer_registry: options.oximeter_registry, + producer_registry: options.oximeter_registry.clone(), state: MachineInitializerState::default(), }; @@ -332,7 +341,9 @@ async fn initialize_vm_from_spec( #[cfg(feature = "falcon")] init.initialize_9pfs(&chipset)?; - init.initialize_storage_devices(&chipset, options.nexus_client).await?; + init.initialize_storage_devices(&chipset, options.nexus_client.clone()) + .await?; + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; init.initialize_cpus()?; let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( diff --git a/bin/propolis-server/src/lib/vnc.rs b/bin/propolis-server/src/lib/vnc.rs index 7f2b402e5..8f92dbbf4 100644 --- a/bin/propolis-server/src/lib/vnc.rs +++ b/bin/propolis-server/src/lib/vnc.rs @@ -13,13 +13,11 @@ use rfb::rfb::{ SecurityTypes, }; use rfb::server::{Server, VncServer, VncServerConfig, VncServerData}; -use slog::{debug, error, info, o, trace, Logger}; +use slog::{debug, error, info, o, trace, warn, Logger}; use std::net::SocketAddr; use std::sync::Arc; use tokio::sync::Mutex; -use crate::vm::VmController; - const INITIAL_WIDTH: u16 = 1024; const INITIAL_HEIGHT: u16 = 768; @@ -55,7 +53,7 @@ enum Framebuffer { struct PropolisVncServerInner { framebuffer: Framebuffer, ps2ctrl: Option>, - vm: Option>, + vm: Option>, } #[derive(Clone)] @@ -83,7 +81,7 @@ impl PropolisVncServer { &self, fb: RamFb, ps2ctrl: Arc, - vm: Arc, + vm: Arc, ) { let mut inner = self.inner.lock().await; inner.framebuffer = Framebuffer::Initialized(fb); @@ -154,15 +152,23 @@ impl Server for PropolisVncServer { let len = fb.height as usize * fb.width as usize * 4; let mut buf = vec![0u8; len]; - - let read = tokio::task::block_in_place(|| { - let machine = inner.vm.as_ref().unwrap().machine(); - let memctx = machine.acc_mem.access().unwrap(); - memctx.read_into(GuestAddr(fb.addr), &mut buf, len) - }); - - assert!(read.is_some()); - debug!(self.log, "read {} bytes from guest", read.unwrap()); + if let Some(vm) = inner.vm.as_ref().unwrap().active_vm() { + let vm_objects = vm.objects().await; + let read = tokio::task::block_in_place(|| { + let machine = vm_objects.machine(); + let memctx = machine.acc_mem.access().unwrap(); + memctx.read_into(GuestAddr(fb.addr), &mut buf, len) + }); + + assert!(read.is_some()); + debug!(self.log, "read {} bytes from guest", read.unwrap()); + } else { + warn!( + self.log, + "got framebuffer init message but VM is gone" + ); + buf = vec![0xffu8; len]; + } let r = Rectangle::new( 0, From a2dbd78af8a2c883d957c9223cade4d5932740f3 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 25 Jun 2024 00:23:11 +0000 Subject: [PATCH 11/55] [WIP] partially replace server's VmController --- bin/propolis-server/src/lib/server.rs | 422 ++---------------- bin/propolis-server/src/lib/vm/mod.rs | 13 +- .../src/lib/vm/state_driver.rs | 2 +- 3 files changed, 37 insertions(+), 400 deletions(-) diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index ecaae8019..4cf94966d 100644 --- a/bin/propolis-server/src/lib/server.rs +++ 
b/bin/propolis-server/src/lib/server.rs @@ -43,7 +43,6 @@ use tokio_tungstenite::WebSocketStream; use crate::spec::{ServerSpecBuilder, ServerSpecBuilderError}; use crate::stats::virtual_machine::VirtualMachine; -use crate::vm::VmController; use crate::vnc::PropolisVncServer; pub(crate) type DeviceMap = @@ -76,7 +75,7 @@ impl MetricsEndpointConfig { /// objects. pub struct StaticConfig { /// The TOML-driven configuration for this server's instances. - pub vm: VmTomlConfig, + pub vm: Arc, /// Whether to use the host's guest memory reservoir to back guest memory. pub use_reservoir: bool, @@ -86,93 +85,6 @@ pub struct StaticConfig { metrics: Option, } -/// The state of the current VM controller in this server, if there is one, or -/// the most recently created one, if one ever existed. -pub enum VmControllerState { - /// No VM controller has ever been constructed in this server. - NotCreated, - - /// A VM controller exists. - Created(Arc), - - /// No VM controller exists. - /// - /// Distinguishing this state from `NotCreated` allows the server to discard - /// the active `VmController` on instance stop while still being able to - /// service get requests for the instance. (If this were not needed, or the - /// server were willing to preserve the `VmController` after halt, this enum - /// could be replaced with an `Option`.) - Destroyed { - /// A copy of the instance properties recorded at the time the instance - /// was destroyed, used to serve subsequent `instance_get` requests. - last_instance: Box, - - /// A copy of the destroyed instance's spec, used to serve subsequent - /// `instance_spec_get` requests. - // - // TODO: Merge this into `api::Instance` when the migration to generated - // types is complete. - last_instance_spec: Box, - - /// A clone of the receiver side of the server's state watcher, used to - /// serve subsequent `instance_state_monitor` requests. Note that an - /// outgoing controller can publish new state changes even after the - /// server has dropped its reference to it (its state worker may - /// continue running for a time). - state_watcher: - tokio::sync::watch::Receiver, - }, -} - -impl VmControllerState { - /// Maps this `VmControllerState` into a mutable reference to its internal - /// `VmController` if a controller is active. - pub fn as_controller(&mut self) -> Option<&mut Arc> { - match self { - VmControllerState::NotCreated => None, - VmControllerState::Created(c) => Some(c), - VmControllerState::Destroyed { .. } => None, - } - } - - /// Takes the active `VmController` if one is present and replaces it with - /// `VmControllerState::Destroyed`. - pub async fn take_controller(&mut self) -> Option> { - if let VmControllerState::Created(vm) = self { - let state = vm.state_watcher().borrow().state; - let last_instance = api::Instance { - properties: vm.properties().clone(), - state, - disks: vec![], - nics: vec![], - }; - let last_instance_spec = vm.instance_spec().await.clone(); - - // Preserve the state watcher so that subsequent updates to the VM's - // state are visible to calls to query/monitor that state. Note that - // the VM's state will change at least once more after this point: - // the final transition to the "destroyed" state happens only when - // all references to the VM have been dropped, including the one - // this routine just exchanged and will return. 
- let state_watcher = vm.state_watcher().clone(); - if let VmControllerState::Created(vm) = std::mem::replace( - self, - VmControllerState::Destroyed { - last_instance: Box::new(last_instance), - last_instance_spec: Box::new(last_instance_spec), - state_watcher, - }, - ) { - Some(vm) - } else { - unreachable!() - } - } else { - None - } - } -} - /// Objects related to Propolis's Oximeter metric production. pub struct OximeterState { /// The metric producer server. @@ -184,66 +96,11 @@ pub struct OximeterState { stats: Option, } -/// Objects that this server creates, owns, and manipulates in response to API -/// calls. -pub struct ServiceProviders { - /// The VM controller that manages this server's Propolis instance. This is - /// `None` until a guest is created via `instance_ensure`. - pub vm: Mutex, - - /// The currently active serial console handling task, if present. - serial_task: Mutex>, - - /// State related to the Propolis Oximeter server and actual statistics. - oximeter_state: Mutex, - - /// The VNC server hosted within this process. Note that this server always - /// exists irrespective of whether there is an instance. Creating an - /// instance hooks this server up to the instance's framebuffer. - vnc_server: Arc>, -} - -impl ServiceProviders { - /// Directs the current set of per-instance service providers to stop in an - /// orderly fashion, then drops them all. - async fn stop(&self, log: &Logger) { - // Stop the VNC server - self.vnc_server.stop().await; - - if let Some(vm) = self.vm.lock().await.take_controller().await { - slog::info!(log, "Dropping server's VM controller reference"; - "strong_refs" => Arc::strong_count(&vm), - "weak_refs" => Arc::weak_count(&vm), - ); - } - if let Some(serial_task) = self.serial_task.lock().await.take() { - let _ = serial_task - .control_ch - .send(SerialTaskControlMessage::Stopping) - .await; - // Wait for the serial task to exit - let _ = serial_task.task.await; - } - - // Clean up oximeter tasks and statistic state. - let mut oximeter_state = self.oximeter_state.lock().await; - if let Some(server) = oximeter_state.server.take() { - if let Err(e) = server.close().await { - error!( - log, - "failed to close oximeter producer server"; - "error" => ?e, - ); - }; - } - let _ = oximeter_state.stats.take(); - } -} - /// Context accessible from HTTP callbacks. pub struct DropshotEndpointContext { static_config: StaticConfig, - pub services: Arc, + vnc_server: Arc>, + pub vm: Arc, log: Logger, } @@ -258,34 +115,15 @@ impl DropshotEndpointContext { ) -> Self { Self { static_config: StaticConfig { - vm: config, + vm: Arc::new(config), use_reservoir, metrics: metric_config, }, - services: Arc::new(ServiceProviders { - vm: Mutex::new(VmControllerState::NotCreated), - serial_task: Mutex::new(None), - oximeter_state: Mutex::new(OximeterState { - server: None, - stats: None, - }), - vnc_server, - }), + vnc_server, + vm: crate::vm::Vm::new(), log, } } - - /// Get access to the VM controller for this context, emitting a consistent - /// error if it is absent. - pub(crate) async fn vm( - &self, - ) -> Result>, HttpError> { - MutexGuard::try_map( - self.services.vm.lock().await, - VmControllerState::as_controller, - ) - .map_err(|_| not_created_error()) - } } #[derive(Debug, Error)] @@ -330,69 +168,6 @@ fn instance_spec_from_request( Ok(VersionedInstanceSpec::V0(spec_builder.finish())) } -/// Register an Oximeter server reporting metrics from a new instance. 
-async fn register_oximeter_producer( - services: Arc, - cfg: MetricsEndpointConfig, - registry: &ProducerRegistry, - virtual_machine: VirtualMachine, - log: Logger, -) { - let mut oximeter_state = services.oximeter_state.lock().await; - assert!(oximeter_state.stats.is_none()); - assert!(oximeter_state.server.is_none()); - - // Create the server itself. - // - // The server manages all details of the registration with Nexus, so we - // don't need our own task for that or way to shut it down. - match crate::stats::start_oximeter_server( - virtual_machine.instance_id, - &cfg, - &log, - registry, - ) { - Ok(server) => { - info!(log, "created metric producer server"); - let old = oximeter_state.server.replace(server); - assert!(old.is_none()); - } - Err(err) => { - error!( - log, - "failed to construct metric producer server, \ - no metrics will be available for this instance."; - "error" => ?err, - ); - } - } - - // Assign our own metrics production for this VM instance to the - // registry, letting the server actually return them to oximeter when - // polled. - let stats = match crate::stats::register_server_metrics( - registry, - virtual_machine, - &log, - ) - .await - { - Ok(stats) => stats, - Err(e) => { - error!( - log, - "failed to register our server metrics with \ - the ProducerRegistry, no server stats will \ - be produced"; - "error" => ?e, - ); - return; - } - }; - let old = oximeter_state.stats.replace(stats); - assert!(old.is_none()); -} - /// Wrapper around a [`NexusClient`] object, which allows deferring /// the DNS lookup until accessed. /// @@ -474,177 +249,34 @@ async fn instance_ensure_common( request: api::InstanceSpecEnsureRequest, ) -> Result, HttpError> { let server_context = rqctx.context(); - let api::InstanceSpecEnsureRequest { properties, instance_spec, migrate } = - request; - - // Handle requests to an instance that has already been initialized. Treat - // the instances as compatible (and return Ok) if they have the same - // properties and return an appropriate error otherwise. - // - // TODO(#205): Consider whether to use this interface to change an - // instance's devices and backends at runtime. - if let VmControllerState::Created(existing) = - &*server_context.services.vm.lock().await - { - let existing_properties = existing.properties(); - if existing_properties.id != properties.id { - return Err(HttpError::for_client_error( - Some(api::ErrorCode::AlreadyInitialized.to_string()), - http::status::StatusCode::CONFLICT, - format!( - "Server already initialized with ID {}", - existing_properties.id - ), - )); - } - - if *existing_properties != properties { - return Err(HttpError::for_client_error( - Some(api::ErrorCode::AlreadyRunning.to_string()), - http::status::StatusCode::CONFLICT, - "Cannot update running server".to_string(), - )); - } - - return Ok(HttpResponseCreated(api::InstanceEnsureResponse { - migrate: None, - })); - } - - let producer_registry = - if let Some(cfg) = server_context.static_config.metrics.as_ref() { - // Create a registry and spawn tasks to register with Nexus as an - // oximeter metric producer. - // - // We create a registry here so that we can pass it through to Crucible - // below. We also spawn a task for the actual registration process - // (which may spin indefinitely) so that we can continue to initialize - // the VM instance without blocking for that to succeed. 
- let registry = ProducerRegistry::with_id(properties.id); - let virtual_machine = VirtualMachine::from(&properties); - register_oximeter_producer( - server_context.services.clone(), - cfg.clone(), - ®istry, - virtual_machine, - rqctx.log.clone(), - ) - .await; - Some(registry) - } else { - None - }; - - let (stop_ch, stop_recv) = oneshot::channel(); + let oximeter_registry = server_context + .static_config + .metrics + .as_ref() + .map(|_| ProducerRegistry::with_id(request.properties.id)); - // Use our current address to generate the expected Nexus client endpoint - // address. let nexus_client = find_local_nexus_client(rqctx.server.local_addr, rqctx.log.clone()) .await; - // Parts of VM initialization (namely Crucible volume attachment) make use - // of async processing, which itself is turned synchronous with `block_on` - // calls to the Tokio runtime. - // - // Since `block_on` will panic if called from an async context, as we are in - // now, the whole process is wrapped up in `spawn_blocking`. It is - // admittedly a big kludge until this can be better refactored. - let vm = { - let properties = properties.clone(); - let server_context = server_context.clone(); - let log = server_context.log.clone(); - - // Block for VM controller setup under the current (API) runtime - let cur_rt_hdl = tokio::runtime::Handle::current(); - let vm_hdl = cur_rt_hdl.spawn_blocking(move || { - VmController::new( - instance_spec, - properties, - &server_context.static_config, - producer_registry, - nexus_client, - log, - stop_ch, - ) - }); - - vm_hdl.await.unwrap() - } - .map_err(|e| { - HttpError::for_internal_error(format!("failed to create instance: {e}")) - })?; - - if let Some(ramfb) = vm.framebuffer() { - // Get a framebuffer description from the wrapped instance. - let fb_spec = ramfb.get_framebuffer_spec(); - let vnc_fb = crate::vnc::RamFb::new(fb_spec); - - // Get a reference to the PS2 controller so that we can pass keyboard input. - let ps2ctrl = vm.ps2ctrl().clone(); - - // Get a reference to the outward-facing VNC server in this process. - let vnc_server = server_context.services.vnc_server.clone(); - - // Initialize the Propolis VNC adapter with references to the VM's Instance, - // framebuffer, and PS2 controller. 
- vnc_server.server.initialize(vnc_fb, ps2ctrl, vm.clone()).await; - - // Hook up the framebuffer notifier to update the Propolis VNC adapter - let notifier_server_ref = vnc_server.clone(); - let rt = tokio::runtime::Handle::current(); - ramfb.set_notifier(Box::new(move |config, is_valid| { - let vnc = notifier_server_ref.clone(); - rt.block_on(vnc.server.update(config, is_valid, &vnc)); - })); - } - - let mut serial_task = server_context.services.serial_task.lock().await; - if serial_task.is_none() { - let (websocks_ch, websocks_recv) = mpsc::channel(1); - let (control_ch, control_recv) = mpsc::channel(1); - - let serial = vm.com1().clone(); - serial.set_task_control_sender(control_ch.clone()).await; - let err_log = rqctx.log.new(o!("component" => "serial task")); - let task = tokio::spawn(async move { - if let Err(e) = super::serial::instance_serial_task( - websocks_recv, - control_recv, - serial, - err_log.clone(), - ) - .await - { - error!(err_log, "Failure in serial task: {}", e); - } - }); - *serial_task = - Some(super::serial::SerialTask { task, control_ch, websocks_ch }); - } - - let log = server_context.log.clone(); - let services = Arc::clone(&server_context.services); - tokio::task::spawn(async move { - // Once the VmController has signaled that it is shutting down, - // we'll clean up the per-instance service providers as well. - let _ = stop_recv.await; - services.stop(&log).await; - }); - - *server_context.services.vm.lock().await = - VmControllerState::Created(vm.clone()); - - let migrate = if let Some(migrate_request) = migrate { - let res = crate::migrate::dest_initiate(&rqctx, vm, migrate_request) - .await - .map_err(<_ as Into>::into)?; - Some(res) - } else { - None + let ensure_options = crate::vm::EnsureOptions { + toml_config: server_context.static_config.vm.clone(), + use_reservoir: server_context.static_config.use_reservoir, + metrics_config: server_context.static_config.metrics.clone(), + oximeter_registry, + nexus_client, + vnc_server: server_context.vnc_server.clone(), }; - Ok(HttpResponseCreated(api::InstanceEnsureResponse { migrate })) + server_context + .vm + .ensure(rqctx.log.clone(), request, ensure_options) + .await + .expect("gjc"); + + Ok(HttpResponseCreated(api::InstanceEnsureResponse { + migrate: todo!("gjc"), + })) } #[endpoint { diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index a3b5a0232..9098ff8de 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -133,7 +133,7 @@ pub(super) struct ActiveVm { properties: InstanceProperties, objects: tokio::sync::RwLock, - services: services::VmServices, + services: Option, } impl ActiveVm { @@ -148,8 +148,7 @@ impl ActiveVm { } async fn stop_services(&self) { - let services = self.services.lock().await.take().unwrap(); - services.stop(&self.log).await; + self.services.stop(&self.log).await; } } @@ -159,6 +158,12 @@ struct DefunctVm { spec: InstanceSpecV0, } +// TODO(gjc) the shutdown process is not quite right yet, is it? it's possible +// for a VM to go to "Defunct" before actually being completely destroyed... the +// "destroyed" transition used to happen when the VM controller was fully +// dropped. what we might want is to have distinct "defunct" and "destroyed" +// states and only get to the latter when the active VM is dropped? need to +// think about this more. 
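(One reason the "defunct" versus "destroyed" distinction in the TODO above
matters: external observers only learn that an instance is fully gone when a
`Destroyed` response is published on the state watch channel. A minimal sketch
of such an observer, as a hypothetical helper that is not part of the patch,
assuming the `InstanceStateRx` alias and the `propolis_api_types` types used in
this series:)

    // Sketch only: wait until the generation-numbered watch channel
    // reports that the instance has been destroyed.
    async fn wait_for_destroyed(mut rx: InstanceStateRx) {
        loop {
            if rx.borrow_and_update().state
                == propolis_api_types::InstanceState::Destroyed
            {
                return;
            }

            // `changed` wakes when a response with a higher generation
            // number is published; an error means the sender was dropped
            // before a `Destroyed` state was ever sent.
            if rx.changed().await.is_err() {
                return;
            }
        }
    }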
#[allow(clippy::large_enum_variant)] enum VmState { NoVm, @@ -235,7 +240,7 @@ impl Vm { } } - pub async fn ensure( + pub(crate) async fn ensure( self: &Arc, log: slog::Logger, ensure_request: propolis_api_types::InstanceSpecEnsureRequest, diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 9d83b43c5..113bedbda 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -261,7 +261,7 @@ pub(super) async fn run_state_driver( external_state_rx: external_rx, properties: ensure_request.properties, objects: tokio::sync::RwLock::new(vm_objects), - services, + services: Some(services), }); let state_driver = StateDriver { From a5f1ec2a60a2548350851eba940e63fce4f91a2e Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 25 Jun 2024 14:19:47 +0000 Subject: [PATCH 12/55] [WIP] improve ActiveVm cleanup (maybe) --- bin/propolis-server/src/lib/migrate/mod.rs | 16 +- bin/propolis-server/src/lib/vcpu_tasks.rs | 2 +- bin/propolis-server/src/lib/vm/mod.rs | 178 ++++++++++++------ .../src/lib/vm/state_driver.rs | 23 ++- 4 files changed, 149 insertions(+), 70 deletions(-) diff --git a/bin/propolis-server/src/lib/migrate/mod.rs b/bin/propolis-server/src/lib/migrate/mod.rs index d16979f10..c0f2cbd53 100644 --- a/bin/propolis-server/src/lib/migrate/mod.rs +++ b/bin/propolis-server/src/lib/migrate/mod.rs @@ -19,7 +19,7 @@ use tokio_tungstenite::tungstenite::protocol::CloseFrame; use tokio_tungstenite::{tungstenite, WebSocketStream}; use uuid::Uuid; -use crate::server::{DropshotEndpointContext, VmControllerState}; +use crate::server::DropshotEndpointContext; mod codec; pub mod destination; @@ -233,11 +233,12 @@ pub async fn source_start< )); info!(log, "Migration Source"); - let controller = tokio::sync::MutexGuard::try_map( - rqctx.context().services.vm.lock().await, - VmControllerState::as_controller, - ) - .map_err(|_| MigrateError::InstanceNotInitialized)?; + let active_vm = rqctx + .context() + .vm + .active_vm() + .ok_or_else(|| MigrateError::InstanceNotInitialized)? 
+ .clone(); let selected = match conn.next().await { Some(Ok(tungstenite::Message::Text(dst_protocols))) => { @@ -291,7 +292,8 @@ pub async fn source_start< } }; - controller.request_migration_from(migration_id, conn, selected)?; + todo!("gjc"); // need a method on ActiveVm for this + // controller.request_migration_from(migration_id, conn, selected)?; Ok(()) } diff --git a/bin/propolis-server/src/lib/vcpu_tasks.rs b/bin/propolis-server/src/lib/vcpu_tasks.rs index fb4917ec6..ba79dedce 100644 --- a/bin/propolis-server/src/lib/vcpu_tasks.rs +++ b/bin/propolis-server/src/lib/vcpu_tasks.rs @@ -30,7 +30,7 @@ pub struct VcpuTasks { } #[cfg_attr(test, mockall::automock)] -pub(crate) trait VcpuTaskController: Send { +pub(crate) trait VcpuTaskController: Send + Sync + 'static { fn new_generation(&self); fn pause_all(&mut self); fn resume_all(&mut self); diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 9098ff8de..092cd02cf 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -7,7 +7,7 @@ use std::{ collections::BTreeMap, - sync::{Arc, RwLock, RwLockReadGuard, Weak}, + sync::{Arc, RwLock, Weak}, }; use oximeter::types::ProducerRegistry; @@ -47,6 +47,9 @@ type InstanceStateRx = tokio::sync::watch::Receiver< #[derive(Debug, thiserror::Error)] pub(crate) enum VmError { + #[error("VM ensure result channel unexpectedly closed")] + EnsureResultClosed, + #[error("VM already initialized")] AlreadyInitialized, @@ -57,8 +60,12 @@ pub(crate) enum VmError { /// The top-level VM wrapper type. Callers are expected to wrap this in an /// `Arc`. pub(crate) struct Vm { - /// A reference to the VM state machine. - state: RwLock, + inner: RwLock, +} + +struct VmInner { + state: VmState, + driver: Option>, } struct VmObjects { @@ -132,7 +139,7 @@ pub(super) struct ActiveVm { properties: InstanceProperties, - objects: tokio::sync::RwLock, + objects: Option>, services: Option, } @@ -144,32 +151,86 @@ impl ActiveVm { pub(crate) async fn objects( &self, ) -> tokio::sync::RwLockReadGuard<'_, VmObjects> { - self.objects.read().await + self.objects.as_ref().unwrap().read().await } +} + +impl Drop for ActiveVm { + fn drop(&mut self) { + let driver = self + .parent + .inner + .write() + .unwrap() + .driver + .take() + .expect("active VMs always have a driver"); + + let objects = + self.objects.take().expect("active VMs should always have objects"); + + let services = self + .services + .take() + .expect("active VMs should always have services"); + + let parent = self.parent.clone(); + let log = self.log.clone(); + tokio::spawn(async move { + drop(objects); + services.stop(&log).await; + + let tx = driver.await.expect("state driver shouldn't panic"); + let old_state = tx.borrow(); + let new_state = InstanceStateMonitorResponse { + gen: old_state.gen + 1, + state: propolis_api_types::InstanceState::Destroyed, + migration: old_state.migration.clone(), + }; + + tx.send(new_state).expect("VM in rundown should hold a receiver"); - async fn stop_services(&self) { - self.services.stop(&self.log).await; + parent.complete_rundown(); + }); } } -struct DefunctVm { +struct RundownVm { external_state_rx: InstanceStateRx, properties: InstanceProperties, spec: InstanceSpecV0, } -// TODO(gjc) the shutdown process is not quite right yet, is it? it's possible -// for a VM to go to "Defunct" before actually being completely destroyed... the -// "destroyed" transition used to happen when the VM controller was fully -// dropped. 
what we might want is to have distinct "defunct" and "destroyed" -// states and only get to the latter when the active VM is dropped? need to -// think about this more. +/// An enum representing the VM state machine. The API layer's Dropshot context +/// holds a reference to this state machine via the [`Vm`] wrapper struct. +/// +/// When an instance is running, its components and services are stored in an +/// [`ActiveVm`] whose lifecycle is managed by a "state driver" task. The VM is +/// kept alive by this task's strong reference. API calls that need to access +/// the active VM try to upgrade the state machine's weak reference to the VM. +/// +/// When an active VM halts, the state driver moves the state machine to the +/// `Rundown` state, preventing new API calls from obtaining new strong +/// references to the underlying VM while allowing existing calls to finish. +/// Eventually (barring a leak), the active VM will be dropped. This launches a +/// task that finishes cleaning up the VM and then moves to the +/// `RundownComplete` state, which allows a new VM to start. #[allow(clippy::large_enum_variant)] enum VmState { + /// This state machine has never held a VM. NoVm, + + /// There is an active state driver task, but it is currently creating VM + /// components and/or starting VM services. WaitingToStart, + + /// There is an active virtual machine. Callers may try to upgrade the + /// contained weak reference to access its objects and services. Active(Weak), - Defunct(DefunctVm), + + /// The active VM's state driver has exited, but the + Rundown(RundownVm), + RundownComplete(RundownVm), } pub(super) struct EnsureOptions { @@ -183,16 +244,13 @@ pub(super) struct EnsureOptions { impl Vm { pub fn new() -> Arc { - Arc::new(Self { state: RwLock::new(VmState::NoVm) }) - } - - fn vm_state(&self) -> RwLockReadGuard<'_, VmState> { - self.state.read().unwrap() + let inner = VmInner { state: VmState::NoVm, driver: None }; + Arc::new(Self { inner: RwLock::new(inner) }) } pub(super) fn active_vm(&self) -> Option> { - let guard = self.vm_state(); - if let VmState::Active(weak) = &*guard { + let guard = self.inner.read().unwrap(); + if let VmState::Active(weak) = &guard.state { weak.upgrade() } else { None @@ -200,9 +258,9 @@ impl Vm { } fn start_failed(&self) { - let mut guard = self.state.write().unwrap(); - match *guard { - VmState::WaitingToStart => *guard = VmState::NoVm, + let mut guard = self.inner.write().unwrap(); + match guard.state { + VmState::WaitingToStart => guard.state = VmState::NoVm, _ => unreachable!( "only a starting VM's state worker calls start_failed" ), @@ -210,11 +268,11 @@ impl Vm { } fn make_active(&self, active: Arc) { - let mut guard = self.state.write().unwrap(); - let old = std::mem::replace(&mut *guard, VmState::NoVm); + let mut guard = self.inner.write().unwrap(); + let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { VmState::WaitingToStart => { - *guard = VmState::Active(Arc::downgrade(&active)) + guard.state = VmState::Active(Arc::downgrade(&active)) } _ => unreachable!( "only a starting VM's state worker calls make_active" @@ -222,13 +280,13 @@ impl Vm { } } - async fn make_defunct(&self) { - let mut guard = self.state.write().unwrap(); - let old = std::mem::replace(&mut *guard, VmState::NoVm); + async fn set_rundown(&self) { + let mut guard = self.inner.write().unwrap(); + let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { VmState::Active(vm) => { let active = vm.upgrade().expect("state driver holds a ref"); - 
*guard = VmState::Defunct(DefunctVm { + guard.state = VmState::Rundown(RundownVm { external_state_rx: active.external_state_rx.clone(), properties: active.properties.clone(), spec: active.objects().await.instance_spec.clone(), @@ -240,6 +298,15 @@ impl Vm { } } + async fn complete_rundown(&self) { + let mut guard = self.inner.write().unwrap(); + let old = std::mem::replace(&mut guard.state, VmState::NoVm); + match old { + VmState::Rundown(vm) => guard.state = VmState::RundownComplete(vm), + _ => unreachable!("VM rundown completed from invalid prior state"), + } + } + pub(crate) async fn ensure( self: &Arc, log: slog::Logger, @@ -247,32 +314,37 @@ impl Vm { options: EnsureOptions, ) -> anyhow::Result { + let (ensure_reply_tx, ensure_rx) = tokio::sync::oneshot::channel(); + // Take the lock for writing, since in the common case this call will be // creating a new VM and there's no easy way to upgrade from a reader // lock to a writer lock. - let mut guard = self.state.write().unwrap(); - - if matches!(*guard, VmState::WaitingToStart | VmState::Active(_)) { - return Err(VmError::AlreadyInitialized); - } - - *guard = VmState::WaitingToStart; - - let state_driver = state_driver::StateDriver::new( - log, - self.clone(), - Arc::new(input_queue), - external_tx, - ); - - let (ensure_reply_tx, ensure_rx) = tokio::sync::oneshot::channel(); + { + let mut guard = self.inner.write().unwrap(); + if matches!( + guard.state, + VmState::WaitingToStart + | VmState::Active(_) + | VmState::Rundown(_) + ) { + return Err(VmError::AlreadyInitialized); + } - tokio::spawn(async move { - state_driver - .run(ensure_request, ensure_reply_tx, options, external_rx) + guard.state = VmState::WaitingToStart; + + let vm_for_driver = self.clone(); + guard.driver = Some(tokio::spawn(async move { + state_driver::run_state_driver( + log, + vm_for_driver, + ensure_request, + ensure_reply_tx, + options, + ) .await - }); + })); + } - ensure_rx.await + ensure_rx.await.map_err(|_| VmError::EnsureResultClosed)? 
} } diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 113bedbda..6dbe97363 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -25,7 +25,7 @@ use crate::{ use super::{ guest_event::{self, GuestEvent}, - VmError, VmObjects, + InstanceStateTx, VmError, VmObjects, }; struct MigrationStateUpdate { @@ -207,13 +207,13 @@ struct StateDriver { pub(super) async fn run_state_driver( log: slog::Logger, - vm: &Arc, + vm: Arc, ensure_request: propolis_api_types::InstanceSpecEnsureRequest, ensure_result_tx: tokio::sync::oneshot::Sender< Result, >, ensure_options: super::EnsureOptions, -) { +) -> InstanceStateTx { let (external_tx, external_rx) = tokio::sync::watch::channel( propolis_api_types::InstanceStateMonitorResponse { gen: 1, @@ -241,13 +241,13 @@ pub(super) async fn run_state_driver( Err(e) => { let _ = ensure_result_tx.send(Err(VmError::InitializationFailed(e))); - return; + return external_tx; } }; let services = super::services::VmServices::new( &log, - vm, + &vm, &vm_objects, &ensure_request.properties, &ensure_options, @@ -260,7 +260,7 @@ pub(super) async fn run_state_driver( state_driver_queue: input_queue.clone(), external_state_rx: external_rx, properties: ensure_request.properties, - objects: tokio::sync::RwLock::new(vm_objects), + objects: Some(tokio::sync::RwLock::new(vm_objects)), services: Some(services), }); @@ -378,7 +378,7 @@ impl StateDriver { ensure_result_tx: tokio::sync::oneshot::Sender< Result, >, - ) { + ) -> super::InstanceStateTx { self.parent.make_active(self.active_vm.clone()); self.update_external_state(ExternalStateUpdate::Instance( InstanceState::Starting, @@ -387,9 +387,14 @@ impl StateDriver { migrate: None, })); + // TODO(gjc) actually start the VM + self.run_loop().await; - self.active_vm.stop_services().await; - self.parent.make_defunct(); + + // TODO(gjc) get rid of these + self.parent.set_rundown(); + + self.external_state_tx } fn update_external_state(&mut self, state: ExternalStateUpdate) { From 3156df698138961f353db75a38c6336cd9f10744 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 25 Jun 2024 17:16:29 +0000 Subject: [PATCH 13/55] [WIP] fix VM start This doesn't happen automatically on ensure unless the target is migrating in, so fix that up. --- bin/propolis-server/src/lib/vm/mod.rs | 29 +- .../src/lib/vm/request_queue.rs | 65 ++- .../src/lib/vm/state_driver.rs | 429 ++++++++++-------- 3 files changed, 305 insertions(+), 218 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 092cd02cf..956ed3630 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -50,9 +50,15 @@ pub(crate) enum VmError { #[error("VM ensure result channel unexpectedly closed")] EnsureResultClosed, + #[error("VM is currently initializing")] + WaitingToInitialize, + #[error("VM already initialized")] AlreadyInitialized, + #[error("VM is currently shutting down")] + RundownInProgress, + #[error("VM initialization failed")] InitializationFailed(#[source] anyhow::Error), } @@ -222,7 +228,7 @@ enum VmState { /// There is an active state driver task, but it is currently creating VM /// components and/or starting VM services. - WaitingToStart, + WaitingForInit, /// There is an active virtual machine. Callers may try to upgrade the /// contained weak reference to access its objects and services. 
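(The weak-reference discipline described above keeps API handlers from
prolonging the life of a halted VM. A minimal sketch of the resulting access
pattern from a hypothetical handler, not part of the patch; the function name
and the error mapping are assumptions, built on the `active_vm` and `objects`
accessors added in this series:)

    // Sketch only: reach the running VM's serial console, failing cleanly
    // if there is no active VM.
    async fn com1_for_request(
        vm: &Arc<Vm>,
    ) -> Result<Arc<Serial<LpcUart>>, VmError> {
        // Upgrading the weak reference fails once rundown begins, so a
        // caller racing with shutdown simply observes "no active VM".
        let active = vm.active_vm().ok_or(VmError::RundownInProgress)?;

        // The objects lock is a tokio RwLock, so taking the read guard is
        // an await point rather than a potentially blocking acquire.
        let objects = active.objects().await;
        Ok(objects.com1().clone())
    }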
@@ -260,7 +266,7 @@ impl Vm { fn start_failed(&self) { let mut guard = self.inner.write().unwrap(); match guard.state { - VmState::WaitingToStart => guard.state = VmState::NoVm, + VmState::WaitingForInit => guard.state = VmState::NoVm, _ => unreachable!( "only a starting VM's state worker calls start_failed" ), @@ -271,7 +277,7 @@ impl Vm { let mut guard = self.inner.write().unwrap(); let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { - VmState::WaitingToStart => { + VmState::WaitingForInit => { guard.state = VmState::Active(Arc::downgrade(&active)) } _ => unreachable!( @@ -321,17 +327,16 @@ impl Vm { // lock to a writer lock. { let mut guard = self.inner.write().unwrap(); - if matches!( - guard.state, - VmState::WaitingToStart - | VmState::Active(_) - | VmState::Rundown(_) - ) { - return Err(VmError::AlreadyInitialized); + match guard.state { + VmState::WaitingForInit => { + return Err(VmError::WaitingToInitialize) + } + VmState::Active(_) => return Err(VmError::AlreadyInitialized), + VmState::Rundown(_) => return Err(VmError::RundownInProgress), + _ => {} } - guard.state = VmState::WaitingToStart; - + guard.state = VmState::WaitingForInit; let vm_for_driver = self.clone(); guard.driver = Some(tokio::spawn(async move { state_driver::run_state_driver( diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs index fe52d2135..55648bf54 100644 --- a/bin/propolis-server/src/lib/vm/request_queue.rs +++ b/bin/propolis-server/src/lib/vm/request_queue.rs @@ -35,6 +35,8 @@ use super::migrate_commands::{MigrateSourceCommand, MigrateSourceResponse}; /// the controller's state driver thread. #[derive(Debug)] pub enum ExternalRequest { + Start, + /// Asks the state worker to start a migration-source task. MigrateAsSource { /// The ID of the live migration for which this VM will be the source. @@ -125,6 +127,7 @@ enum RequestDisposition { /// The current disposition for each kind of incoming request. #[derive(Copy, Clone, Debug)] struct AllowedRequests { + start: RequestDisposition, migrate_as_source: RequestDisposition, reboot: RequestDisposition, stop: RequestDisposition, @@ -143,6 +146,7 @@ impl ExternalRequestQueue { Self { queue: VecDeque::new(), allowed: AllowedRequests { + start: RequestDisposition::Enqueue, migrate_as_source: RequestDisposition::Deny( RequestDeniedReason::InstanceNotActive, ), @@ -172,6 +176,7 @@ impl ExternalRequestQueue { request: ExternalRequest, ) -> Result<(), RequestDeniedReason> { let disposition = match request { + ExternalRequest::Start => self.allowed.start, ExternalRequest::MigrateAsSource { .. } => { self.allowed.migrate_as_source } @@ -245,32 +250,49 @@ impl ExternalRequestQueue { use RequestDeniedReason as DenyReason; use RequestDisposition as Disposition; match reason { + ChangeReason::ApiRequest(ExternalRequest::Start) => { + let reason = DenyReason::StartInProgress; + AllowedRequests { + start: Disposition::Ignore, + migrate_as_source: Disposition::Deny(reason), + reboot: Disposition::Deny(reason), + stop: self.allowed.stop, + } + } ChangeReason::ApiRequest(ExternalRequest::MigrateAsSource { .. 
- }) => AllowedRequests { - migrate_as_source: Disposition::Deny( - DenyReason::AlreadyMigrationSource, - ), - reboot: Disposition::Deny( - DenyReason::InvalidRequestForMigrationSource, - ), - stop: self.allowed.stop, - }, + }) => { + assert!(matches!(self.allowed.start, Disposition::Ignore)); + + AllowedRequests { + start: self.allowed.start, + migrate_as_source: Disposition::Deny( + DenyReason::AlreadyMigrationSource, + ), + reboot: Disposition::Deny( + DenyReason::InvalidRequestForMigrationSource, + ), + stop: self.allowed.stop, + } + } // Requests to reboot prevent additional reboot requests from being // queued, but do not affect other operations. ChangeReason::ApiRequest(ExternalRequest::Reboot) => { + assert!(matches!(self.allowed.start, Disposition::Ignore)); AllowedRequests { reboot: Disposition::Ignore, ..self.allowed } } // Requests to stop the instance block other requests from being // queued. Additional requests to stop are ignored for idempotency. ChangeReason::ApiRequest(ExternalRequest::Stop) => { + assert!(matches!(self.allowed.start, Disposition::Ignore)); + + let reason = DenyReason::HaltPending; AllowedRequests { - migrate_as_source: Disposition::Deny( - DenyReason::HaltPending, - ), - reboot: Disposition::Deny(DenyReason::HaltPending), + start: Disposition::Deny(reason), + migrate_as_source: Disposition::Deny(reason), + reboot: Disposition::Deny(reason), stop: Disposition::Ignore, } } @@ -279,6 +301,7 @@ impl ExternalRequestQueue { // to reboot it become valid. ChangeReason::StateChange(InstanceStateChange::StartedRunning) => { AllowedRequests { + start: self.allowed.start, migrate_as_source: Disposition::Enqueue, reboot: Disposition::Enqueue, stop: self.allowed.stop, @@ -305,20 +328,20 @@ impl ExternalRequestQueue { // previous dispositions for migrate and reboot requests may not be // "deny". ChangeReason::StateChange(InstanceStateChange::Stopped) => { + let reason = DenyReason::InstanceNotActive; AllowedRequests { - migrate_as_source: Disposition::Deny( - DenyReason::InstanceNotActive, - ), - reboot: Disposition::Deny(DenyReason::InstanceNotActive), + start: Disposition::Deny(reason), + migrate_as_source: Disposition::Deny(reason), + reboot: Disposition::Deny(reason), stop: Disposition::Ignore, } } ChangeReason::StateChange(InstanceStateChange::Failed) => { + let reason = DenyReason::InstanceFailed; AllowedRequests { - migrate_as_source: Disposition::Deny( - DenyReason::InstanceFailed, - ), - reboot: Disposition::Deny(DenyReason::InstanceFailed), + start: Disposition::Deny(reason), + migrate_as_source: Disposition::Deny(reason), + reboot: Disposition::Deny(reason), stop: self.allowed.stop, } } diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 6dbe97363..a88ecd261 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -12,7 +12,7 @@ use std::{ use propolis_api_types::{ instance_spec::VersionedInstanceSpec, InstanceProperties, InstanceState, }; -use slog::info; +use slog::{error, info}; use uuid::Uuid; use crate::{ @@ -72,6 +72,13 @@ enum HandleEventOutcome { Exit, } +/// A reason for starting a VM. 
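The queue changes above all follow one pattern: admission control lives in a table of per-request dispositions, and each accepted request or instance state change swaps in a whole new table instead of toggling scattered flags. A reduced sketch of that idea with stand-in request types (not the propolis enums):

    use std::collections::VecDeque;

    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Disposition {
        Enqueue,
        Deny(&'static str),
        Ignore,
    }

    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Request {
        Start,
        Reboot,
        Stop,
    }

    struct Queue {
        queue: VecDeque<Request>,
        start: Disposition,
        reboot: Disposition,
        stop: Disposition,
    }

    impl Queue {
        fn try_queue(&mut self, req: Request) -> Result<(), &'static str> {
            let disposition = match req {
                Request::Start => self.start,
                Request::Reboot => self.reboot,
                Request::Stop => self.stop,
            };
            match disposition {
                Disposition::Enqueue => {
                    self.queue.push_back(req);
                    if req == Request::Start {
                        // Further starts are redundant, and reboots are
                        // premature until the VM is actually running.
                        self.start = Disposition::Ignore;
                        self.reboot = Disposition::Deny("start in progress");
                    }
                    Ok(())
                }
                Disposition::Ignore => Ok(()),
                Disposition::Deny(reason) => Err(reason),
            }
        }
    }

    fn main() {
        let mut q = Queue {
            queue: VecDeque::new(),
            start: Disposition::Enqueue,
            reboot: Disposition::Deny("instance not active"),
            stop: Disposition::Enqueue,
        };
        assert!(q.try_queue(Request::Start).is_ok());
        assert!(q.try_queue(Request::Start).is_ok()); // ignored, idempotent
        assert_eq!(q.try_queue(Request::Reboot), Err("start in progress"));
    }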
+#[derive(Debug, PartialEq, Eq)] +enum VmStartReason { + MigratedIn, + ExplicitRequest, +} + #[derive(Debug)] enum InputQueueEvent { ExternalRequest(super::request_queue::ExternalRequest), @@ -217,53 +224,80 @@ pub(super) async fn run_state_driver( let (external_tx, external_rx) = tokio::sync::watch::channel( propolis_api_types::InstanceStateMonitorResponse { gen: 1, - state: propolis_api_types::InstanceState::Starting, + state: if ensure_request.migrate.is_some() { + propolis_api_types::InstanceState::Migrating + } else { + propolis_api_types::InstanceState::Starting + }, migration: propolis_api_types::InstanceMigrateStatusResponse { - migration_in: None, + migration_in: ensure_request.migrate.as_ref().map(|req| { + propolis_api_types::InstanceMigrationStatus { + id: req.migration_id, + state: propolis_api_types::MigrationState::Sync, + } + }), migration_out: None, }, }, ); + let input_queue = Arc::new(InputQueue::new( log.new(slog::o!("component" => "vmm_request_queue")), )); - let (vm_objects, vcpu_tasks) = match initialize_vm_from_spec( - &log, - &input_queue, - &ensure_request.properties, - &ensure_request.instance_spec, - &ensure_options, - ) - .await - { - Ok(objects) => objects, - Err(e) => { - let _ = - ensure_result_tx.send(Err(VmError::InitializationFailed(e))); - return external_tx; + let (vcpu_tasks, active_vm) = match ensure_request.migrate { + None => { + let (vm_objects, vcpu_tasks) = match initialize_vm_from_spec( + &log, + &input_queue, + &ensure_request.properties, + &ensure_request.instance_spec, + &ensure_options, + ) + .await + { + Ok(objects) => objects, + Err(e) => { + let _ = ensure_result_tx + .send(Err(VmError::InitializationFailed(e))); + return external_tx; + } + }; + + let services = super::services::VmServices::new( + &log, + &vm, + &vm_objects, + &ensure_request.properties, + &ensure_options, + ) + .await; + + let active_vm = Arc::new(super::ActiveVm { + parent: vm.clone(), + log: log.clone(), + state_driver_queue: input_queue.clone(), + external_state_rx: external_rx, + properties: ensure_request.properties, + objects: Some(tokio::sync::RwLock::new(vm_objects)), + services: Some(services), + }); + + // All the VM components now exist, so allow external callers to + // interact with the VM. + // + // Order matters here: once the ensure result is sent, an external + // caller needs to observe that an active VM is present. 
+ vm.make_active(active_vm.clone()); + ensure_result_tx.send(Ok( + propolis_api_types::InstanceEnsureResponse { migrate: None }, + )); + + (vcpu_tasks, active_vm) } + Some(_migrate) => todo!("gjc"), }; - let services = super::services::VmServices::new( - &log, - &vm, - &vm_objects, - &ensure_request.properties, - &ensure_options, - ) - .await; - - let active_vm = Arc::new(super::ActiveVm { - parent: vm.clone(), - log: log.clone(), - state_driver_queue: input_queue.clone(), - external_state_rx: external_rx, - properties: ensure_request.properties, - objects: Some(tokio::sync::RwLock::new(vm_objects)), - services: Some(services), - }); - let state_driver = StateDriver { log: log.new(slog::o!("component" => "vmm_state_driver")), parent: vm.clone(), @@ -275,163 +309,16 @@ pub(super) async fn run_state_driver( migration_src_state: Default::default(), }; - state_driver.run(ensure_result_tx).await -} - -async fn initialize_vm_from_spec( - log: &slog::Logger, - event_queue: &Arc, - properties: &InstanceProperties, - spec: &VersionedInstanceSpec, - options: &super::EnsureOptions, -) -> anyhow::Result<(VmObjects, Box)> { - info!(log, "initializing new VM"; - "spec" => #?spec, - "properties" => #?properties, - "use_reservoir" => options.use_reservoir, - "bootrom" => %options.toml_config.bootrom.display()); - - let vmm_log = log.new(slog::o!("component" => "vmm")); - - // Set up the 'shell' instance into which the rest of this routine will - // add components. - let VersionedInstanceSpec::V0(v0_spec) = &spec; - let machine = build_instance( - &properties.vm_name(), - v0_spec, - options.use_reservoir, - vmm_log, - )?; - - let mut init = MachineInitializer { - log: log.clone(), - machine: &machine, - devices: Default::default(), - block_backends: Default::default(), - crucible_backends: Default::default(), - spec: &v0_spec, - properties: &properties, - toml_config: &options.toml_config, - producer_registry: options.oximeter_registry.clone(), - state: MachineInitializerState::default(), - }; - - init.initialize_rom(options.toml_config.bootrom.as_path())?; - let chipset = init.initialize_chipset( - &(event_queue.clone() - as Arc), - )?; - - init.initialize_rtc(&chipset)?; - init.initialize_hpet()?; - - let com1 = Arc::new(init.initialize_uart(&chipset)?); - let ps2ctrl = init.initialize_ps2(&chipset)?; - init.initialize_qemu_debug_port()?; - init.initialize_qemu_pvpanic(properties.into())?; - init.initialize_network_devices(&chipset)?; - - #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&options.toml_config.devices)?; - #[cfg(feature = "omicron-build")] - info!(log, "`omicron-build` feature enabled, ignoring any test devices"); - - #[cfg(feature = "falcon")] - init.initialize_softnpu_ports(&chipset)?; - #[cfg(feature = "falcon")] - init.initialize_9pfs(&chipset)?; - - init.initialize_storage_devices(&chipset, options.nexus_client.clone()) - .await?; - - let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; - init.initialize_cpus()?; - let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( - &machine, - event_queue.clone() as Arc, - log.new(slog::o!("component" => "vcpu_tasks")), - )?); - - let MachineInitializer { - devices, block_backends, crucible_backends, .. 
- } = init; - - Ok(( - VmObjects { - log: log.clone(), - instance_spec: v0_spec.clone(), - machine, - lifecycle_components: devices, - block_backends, - crucible_backends, - com1, - framebuffer: Some(ramfb), - ps2ctrl, - }, - vcpu_tasks as Box, - )) + state_driver.run().await } impl StateDriver { - pub(super) async fn run( - mut self, - ensure_result_tx: tokio::sync::oneshot::Sender< - Result, - >, - ) -> super::InstanceStateTx { - self.parent.make_active(self.active_vm.clone()); - self.update_external_state(ExternalStateUpdate::Instance( - InstanceState::Starting, - )); - ensure_result_tx.send(Ok(propolis_api_types::InstanceEnsureResponse { - migrate: None, - })); - - // TODO(gjc) actually start the VM - + pub(super) async fn run(mut self) -> super::InstanceStateTx { self.run_loop().await; - - // TODO(gjc) get rid of these self.parent.set_rundown(); - self.external_state_tx } - fn update_external_state(&mut self, state: ExternalStateUpdate) { - let (instance_state, migration_state) = match state { - ExternalStateUpdate::Instance(i) => (Some(i), None), - ExternalStateUpdate::Migration(m) => (None, Some(m)), - ExternalStateUpdate::Complete(i, m) => (Some(i), Some(m)), - }; - - let propolis_api_types::InstanceStateMonitorResponse { - state: old_instance, - migration: old_migration, - gen: old_gen, - } = self.external_state_tx.borrow().clone(); - - let state = instance_state.unwrap_or(old_instance); - let migration = if let Some(migration_state) = migration_state { - migration_state.apply_to(old_migration) - } else { - old_migration - }; - - let gen = old_gen + 1; - info!(self.log, "publishing new instance state"; - "gen" => gen, - "state" => ?state, - "migration" => ?migration); - - let _ = self.external_state_tx.send( - propolis_api_types::InstanceStateMonitorResponse { - gen, - state, - migration, - }, - ); - } - async fn run_loop(&mut self) { info!(self.log, "state driver launched"); @@ -457,6 +344,43 @@ impl StateDriver { info!(self.log, "state driver exiting"); } + async fn start_vm( + &mut self, + start_reason: VmStartReason, + ) -> anyhow::Result<()> { + info!(self.log, "starting instance"; "reason" => ?start_reason); + + let start_result = { + let (vm_objects, vcpu_tasks) = self.vm_objects_and_cpus().await; + match start_reason { + VmStartReason::ExplicitRequest => { + reset_vcpus(&*vm_objects, vcpu_tasks); + } + VmStartReason::MigratedIn => { + vm_objects.resume_vm(); + } + } + + let result = vm_objects.start_devices().await; + if result.is_ok() { + vcpu_tasks.resume_all(); + } + + result + }; + + match &start_result { + Ok(()) => self.publish_steady_state(InstanceState::Running), + Err(e) => { + error!(&self.log, "failed to start devices"; + "error" => ?e); + self.publish_steady_state(InstanceState::Failed); + } + } + + start_result + } + async fn handle_guest_event( &mut self, event: GuestEvent, @@ -498,6 +422,12 @@ impl StateDriver { request: super::request_queue::ExternalRequest, ) -> HandleEventOutcome { match request { + super::request_queue::ExternalRequest::Start => { + match self.start_vm(VmStartReason::ExplicitRequest).await { + Ok(_) => HandleEventOutcome::Continue, + Err(_) => HandleEventOutcome::Exit, + } + } super::request_queue::ExternalRequest::MigrateAsSource { .. 
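start_vm above encodes two rules worth calling out: a cold boot resets vCPU state while a migrated-in VM merely resumes the already-initialized kernel VM, and vCPU tasks are only resumed once every device has started successfully. A small sketch of just that ordering, with stand-in types rather than the real objects:

    #[derive(Debug, PartialEq)]
    enum StartReason {
        MigratedIn,
        ExplicitRequest,
    }

    struct Objects;

    impl Objects {
        fn reset_vcpus(&self) { println!("vCPU state reset"); }
        fn resume_vm(&self) { println!("kernel VM resumed"); }
        fn start_devices(&self) -> Result<(), String> {
            println!("devices started");
            Ok(())
        }
        fn resume_vcpu_tasks(&self) { println!("vCPU tasks running"); }
    }

    fn start_vm(objects: &Objects, reason: StartReason) -> Result<(), String> {
        match reason {
            // A cold boot needs pristine vCPU state.
            StartReason::ExplicitRequest => objects.reset_vcpus(),
            // A migrated-in VM was paused by the source; just unpause it.
            StartReason::MigratedIn => objects.resume_vm(),
        }

        // vCPUs must not run until every device has started.
        let result = objects.start_devices();
        if result.is_ok() {
            objects.resume_vcpu_tasks();
        }
        result
    }

    fn main() {
        start_vm(&Objects, StartReason::ExplicitRequest).unwrap();
    }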
} => todo!("gjc"), @@ -611,6 +541,41 @@ impl StateDriver { self.update_external_state(ExternalStateUpdate::Instance(state)); } + fn update_external_state(&mut self, state: ExternalStateUpdate) { + let (instance_state, migration_state) = match state { + ExternalStateUpdate::Instance(i) => (Some(i), None), + ExternalStateUpdate::Migration(m) => (None, Some(m)), + ExternalStateUpdate::Complete(i, m) => (Some(i), Some(m)), + }; + + let propolis_api_types::InstanceStateMonitorResponse { + state: old_instance, + migration: old_migration, + gen: old_gen, + } = self.external_state_tx.borrow().clone(); + + let state = instance_state.unwrap_or(old_instance); + let migration = if let Some(migration_state) = migration_state { + migration_state.apply_to(old_migration) + } else { + old_migration + }; + + let gen = old_gen + 1; + info!(self.log, "publishing new instance state"; + "gen" => gen, + "state" => ?state, + "migration" => ?migration); + + let _ = self.external_state_tx.send( + propolis_api_types::InstanceStateMonitorResponse { + gen, + state, + migration, + }, + ); + } + async fn vm_objects(&self) -> tokio::sync::RwLockReadGuard<'_, VmObjects> { self.active_vm.objects().await } @@ -632,3 +597,97 @@ fn reset_vcpus( vcpu_tasks.new_generation(); vm_objects.reset_vcpu_state(); } + +async fn initialize_vm_from_spec( + log: &slog::Logger, + event_queue: &Arc, + properties: &InstanceProperties, + spec: &VersionedInstanceSpec, + options: &super::EnsureOptions, +) -> anyhow::Result<(VmObjects, Box)> { + info!(log, "initializing new VM"; + "spec" => #?spec, + "properties" => #?properties, + "use_reservoir" => options.use_reservoir, + "bootrom" => %options.toml_config.bootrom.display()); + + let vmm_log = log.new(slog::o!("component" => "vmm")); + + // Set up the 'shell' instance into which the rest of this routine will + // add components. 
+ let VersionedInstanceSpec::V0(v0_spec) = &spec; + let machine = build_instance( + &properties.vm_name(), + v0_spec, + options.use_reservoir, + vmm_log, + )?; + + let mut init = MachineInitializer { + log: log.clone(), + machine: &machine, + devices: Default::default(), + block_backends: Default::default(), + crucible_backends: Default::default(), + spec: &v0_spec, + properties: &properties, + toml_config: &options.toml_config, + producer_registry: options.oximeter_registry.clone(), + state: MachineInitializerState::default(), + }; + + init.initialize_rom(options.toml_config.bootrom.as_path())?; + let chipset = init.initialize_chipset( + &(event_queue.clone() + as Arc), + )?; + + init.initialize_rtc(&chipset)?; + init.initialize_hpet()?; + + let com1 = Arc::new(init.initialize_uart(&chipset)?); + let ps2ctrl = init.initialize_ps2(&chipset)?; + init.initialize_qemu_debug_port()?; + init.initialize_qemu_pvpanic(properties.into())?; + init.initialize_network_devices(&chipset)?; + + #[cfg(not(feature = "omicron-build"))] + init.initialize_test_devices(&options.toml_config.devices)?; + #[cfg(feature = "omicron-build")] + info!(log, "`omicron-build` feature enabled, ignoring any test devices"); + + #[cfg(feature = "falcon")] + init.initialize_softnpu_ports(&chipset)?; + #[cfg(feature = "falcon")] + init.initialize_9pfs(&chipset)?; + + init.initialize_storage_devices(&chipset, options.nexus_client.clone()) + .await?; + + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; + init.initialize_cpus()?; + let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( + &machine, + event_queue.clone() as Arc, + log.new(slog::o!("component" => "vcpu_tasks")), + )?); + + let MachineInitializer { + devices, block_backends, crucible_backends, .. + } = init; + + Ok(( + VmObjects { + log: log.clone(), + instance_spec: v0_spec.clone(), + machine, + lifecycle_components: devices, + block_backends, + crucible_backends, + com1, + framebuffer: Some(ramfb), + ps2ctrl, + }, + vcpu_tasks as Box, + )) +} From e54ec7cac52f159a07d56ff16214c24eb5bd0f0a Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 25 Jun 2024 18:03:28 +0000 Subject: [PATCH 14/55] [WIP] resume fixing server build errors --- bin/propolis-server/src/lib/server.rs | 157 ++++++++---------- bin/propolis-server/src/lib/vm/mod.rs | 102 +++++++++--- bin/propolis-server/src/lib/vm/services.rs | 12 +- .../src/lib/vm/state_driver.rs | 12 ++ 4 files changed, 173 insertions(+), 110 deletions(-) diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index 4cf94966d..202cdcd57 100644 --- a/bin/propolis-server/src/lib/server.rs +++ b/bin/propolis-server/src/lib/server.rs @@ -17,6 +17,7 @@ use std::{collections::BTreeMap, net::SocketAddr}; use crate::serial::history_buffer::SerialHistoryOffset; use crate::serial::SerialTaskControlMessage; +use crate::vm::VmError; use dropshot::{ channel, endpoint, ApiDescription, HttpError, HttpResponseCreated, HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, @@ -325,40 +326,16 @@ async fn instance_spec_ensure( async fn instance_get_common( rqctx: &RequestContext>, -) -> Result<(api::Instance, VersionedInstanceSpec), HttpError> { +) -> Result { let ctx = rqctx.context(); - match &*ctx.services.vm.lock().await { - VmControllerState::NotCreated => Err(not_created_error()), - VmControllerState::Created(vm) => { - Ok(( - api::Instance { - properties: vm.properties().clone(), - state: vm.external_instance_state(), - disks: vec![], - // TODO: Fix this; we need a 
way to enumerate attached NICs. - // Possibly using the inventory of the instance? - // - // We *could* record whatever information about the NIC we want - // when they're requested (adding fields to the server), but that - // would make it difficult for Propolis to update any dynamic info - // (i.e., has the device faulted, etc). - nics: vec![], - }, - vm.instance_spec().await.clone(), - )) + ctx.vm.get().await.map_err(|e| match e { + VmError::NotCreated | VmError::WaitingToInitialize => { + not_created_error() } - VmControllerState::Destroyed { - last_instance, - last_instance_spec, - state_watcher, - .. - } => { - let watcher = state_watcher.borrow(); - let mut last_instance = last_instance.clone(); - last_instance.state = watcher.state; - Ok((*last_instance, *last_instance_spec.clone())) - } - } + _ => HttpError::for_internal_error(format!( + "error from VM controller: {e}" + )), + }) } #[endpoint { @@ -368,12 +345,7 @@ async fn instance_get_common( async fn instance_spec_get( rqctx: RequestContext>, ) -> Result, HttpError> { - let (instance, spec) = instance_get_common(&rqctx).await?; - Ok(HttpResponseOk(api::InstanceSpecGetResponse { - properties: instance.properties, - state: instance.state, - spec, - })) + Ok(HttpResponseOk(instance_get_common(&rqctx).await?)) } #[endpoint { @@ -383,8 +355,16 @@ async fn instance_spec_get( async fn instance_get( rqctx: RequestContext>, ) -> Result, HttpError> { - let (instance, _) = instance_get_common(&rqctx).await?; - Ok(HttpResponseOk(api::InstanceGetResponse { instance })) + Ok(instance_get_common(&rqctx).await.map(|full| { + HttpResponseOk(api::InstanceGetResponse { + instance: api::Instance { + properties: full.properties, + state: full.state, + disks: vec![], + nics: vec![], + }, + }) + })?) } #[endpoint { @@ -397,19 +377,14 @@ async fn instance_state_monitor( ) -> Result, HttpError> { let ctx = rqctx.context(); let gen = request.into_inner().gen; - let mut state_watcher = { - // N.B. This lock must be dropped before entering the loop below. - let vm_state = ctx.services.vm.lock().await; - match &*vm_state { - VmControllerState::NotCreated => { - return Err(not_created_error()); - } - VmControllerState::Created(vm) => vm.state_watcher().clone(), - VmControllerState::Destroyed { state_watcher, .. 
            } => {
                state_watcher.clone()
            }
+    let mut state_watcher = ctx.vm.state_watcher().map_err(|e| match e {
+        VmError::NotCreated | VmError::WaitingToInitialize => {
+            not_created_error()
         }
-    };
+        _ => HttpError::for_internal_error(format!(
+            "error from VM controller: {e}"
+        )),
+    })?;

     loop {
         let last = state_watcher.borrow().clone();
@@ -444,19 +419,29 @@ async fn instance_state_put(
 ) -> Result {
     let ctx = rqctx.context();
     let requested_state = request.into_inner();
-    let vm = ctx.vm().await?;
+    let vm = ctx.vm.active_vm().ok_or_else(|| not_created_error())?;
     let result = vm
         .put_state(requested_state)
         .map(|_| HttpResponseUpdatedNoContent {})
-        .map_err(|e| e.into());
+        .map_err(|e| match e {
+            VmError::NotCreated | VmError::WaitingToInitialize => {
+                not_created_error()
+            }
+            VmError::ForbiddenStateChange(reason) => HttpError::for_status(
+                Some(format!("instance state change not allowed: {}", reason)),
+                http::status::StatusCode::FORBIDDEN,
+            ),
+            _ => HttpError::for_internal_error(format!(
+                "error from VM controller: {e}"
+            )),
+        });

-    drop(vm);
     if result.is_ok() {
         if let api::InstanceStateRequested::Reboot = requested_state {
-            let stats = MutexGuard::map(
-                ctx.services.oximeter_state.lock().await,
-                |state| &mut state.stats,
-            );
+            let stats =
+                MutexGuard::map(vm.services().oximeter.lock().await, |state| {
+                    &mut state.stats
+                });
             if let Some(stats) = stats.as_ref() {
                 stats.count_reset();
             }
@@ -476,8 +461,8 @@ async fn instance_serial_history_get(
 ) -> Result, HttpError> {
     let ctx = rqctx.context();

-    let vm = ctx.vm().await?;
-    let serial = vm.com1().clone();
+    let vm = ctx.vm.active_vm().ok_or_else(|| not_created_error())?;
+    let serial = vm.objects().await.com1().clone();
     let query_params = query.into_inner();

     let byte_offset = SerialHistoryOffset::try_from(&query_params)?;
@@ -504,8 +489,8 @@ async fn instance_serial(
     websock: WebsocketConnection,
 ) -> dropshot::WebsocketChannelResult {
     let ctx = rqctx.context();
-    let vm = ctx.vm().await?;
-    let serial = vm.com1().clone();
+    let vm = ctx.vm.active_vm().ok_or_else(|| not_created_error())?;
+    let serial = vm.objects().await.com1().clone();

     // Use the default buffering parameters for the websocket configuration
     //
@@ -536,10 +521,8 @@ async fn instance_serial(
     }

     // Get serial task's handle and send it the websocket stream
-    ctx.services
-        .serial_task
-        .lock()
-        .await
+    let serial_task = vm.services().serial_task.lock().await;
+    serial_task
         .as_ref()
         .ok_or("Instance has no serial task")?
         .websocks_ch
@@ -581,15 +564,17 @@ async fn instance_migrate_status(
     rqctx: RequestContext>,
 ) -> Result, HttpError> {
     let ctx = rqctx.context();
-    match &*ctx.services.vm.lock().await {
-        VmControllerState::NotCreated => Err(not_created_error()),
-        VmControllerState::Created(vm) => {
-            Ok(HttpResponseOk(vm.migrate_status()))
-        }
-        VmControllerState::Destroyed { state_watcher, .. } => {
-            Ok(HttpResponseOk(state_watcher.borrow().migration.clone()))
-        }
-    }
+    ctx.vm
+        .state_watcher()
+        .map(|rx| HttpResponseOk(rx.borrow().migration.clone()))
+        .map_err(|e| match e {
+            VmError::NotCreated | VmError::WaitingToInitialize => {
+                not_created_error()
+            }
+            _ => HttpError::for_internal_error(format!(
+                "error from VM controller: {e}"
+            )),
+        })
 }

 /// Issues a snapshot request to a crucible backend.
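Both instance_state_monitor and instance_migrate_status above read from the same tokio watch channel, whose payload carries a monotonically increasing gen so a poller can ask to be woken only when something newer than generation N has been published. An illustrative sketch of that protocol with a simplified payload (not the propolis response types):

    use tokio::sync::watch;

    #[derive(Clone, Debug)]
    struct Monitor {
        gen: u64,
        state: &'static str,
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = watch::channel(Monitor { gen: 1, state: "Starting" });

        // Publisher side: every update bumps the generation number.
        let publisher = tokio::spawn(async move {
            let next_gen = tx.borrow().gen + 1;
            let _ = tx.send(Monitor { gen: next_gen, state: "Running" });
        });

        // Poller side: wait until the published state is newer than gen 1.
        let requested_gen = 1;
        loop {
            let last = rx.borrow().clone();
            if last.gen > requested_gen {
                println!("observed {:?}", last);
                break;
            }
            // Err means the sender is gone and nothing newer will arrive.
            if rx.changed().await.is_err() {
                break;
            }
        }

        publisher.await.unwrap();
    }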
@@ -601,8 +586,10 @@ async fn instance_issue_crucible_snapshot_request( rqctx: RequestContext>, path_params: Path, ) -> Result, HttpError> { - let inst = rqctx.context().vm().await?; - let crucible_backends = inst.crucible_backends(); + let vm = + rqctx.context().vm.active_vm().ok_or_else(|| not_created_error())?; + let objects = vm.objects().await; + let crucible_backends = objects.crucible_backends(); let path_params = path_params.into_inner(); let backend = crucible_backends.get(&path_params.id).ok_or_else(|| { @@ -626,10 +613,10 @@ async fn disk_volume_status( path_params: Path, ) -> Result, HttpError> { let path_params = path_params.into_inner(); - - let vm_controller = rqctx.context().vm().await?; - - let crucible_backends = vm_controller.crucible_backends(); + let vm = + rqctx.context().vm.active_vm().ok_or_else(|| not_created_error())?; + let objects = vm.objects().await; + let crucible_backends = objects.crucible_backends(); let backend = crucible_backends.get(&path_params.id).ok_or_else(|| { let s = format!("No crucible backend for id {}", path_params.id); HttpError::for_not_found(Some(s.clone()), s) @@ -726,8 +713,10 @@ async fn instance_issue_crucible_vcr_request( async fn instance_issue_nmi( rqctx: RequestContext>, ) -> Result, HttpError> { - let vm = rqctx.context().vm().await?; - vm.inject_nmi(); + let vm = + rqctx.context().vm.active_vm().ok_or_else(|| not_created_error())?; + let objects = vm.objects().await; + objects.machine().inject_nmi(); Ok(HttpResponseOk(())) } diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 956ed3630..d3e2fc952 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -7,7 +7,7 @@ use std::{ collections::BTreeMap, - sync::{Arc, RwLock, Weak}, + sync::{Arc, RwLock}, }; use oximeter::types::ProducerRegistry; @@ -16,9 +16,12 @@ use propolis::{ vmm::Machine, }; use propolis_api_types::{ - instance_spec::v0::InstanceSpecV0, InstanceProperties, + instance_spec::{v0::InstanceSpecV0, VersionedInstanceSpec}, + InstanceProperties, InstanceStateMonitorResponse, InstanceStateRequested, }; +use request_queue::ExternalRequest; use rfb::server::VncServer; +use slog::info; use crate::{ serial::Serial, server::MetricsEndpointConfig, vnc::PropolisVncServer, @@ -50,6 +53,9 @@ pub(crate) enum VmError { #[error("VM ensure result channel unexpectedly closed")] EnsureResultClosed, + #[error("VM not created")] + NotCreated, + #[error("VM is currently initializing")] WaitingToInitialize, @@ -61,6 +67,9 @@ pub(crate) enum VmError { #[error("VM initialization failed")] InitializationFailed(#[source] anyhow::Error), + + #[error("Forbidden state change")] + ForbiddenStateChange(#[from] request_queue::RequestDeniedReason), } /// The top-level VM wrapper type. 
Callers are expected to wrap this in an
@@ -106,6 +115,10 @@ impl VmObjects {
         &self.block_backends
     }

+    pub(crate) fn crucible_backends(&self) -> &CrucibleBackendMap {
+        &self.crucible_backends
+    }
+
     pub(crate) fn com1(&self) -> &Arc> {
         &self.com1
     }
@@ -159,6 +172,26 @@ impl ActiveVm {
     ) -> tokio::sync::RwLockReadGuard<'_, VmObjects> {
         self.objects.as_ref().unwrap().read().await
     }
+
+    pub(crate) fn put_state(
+        &self,
+        requested: InstanceStateRequested,
+    ) -> Result<(), VmError> {
+        info!(self.log, "requested state via API";
+              "state" => ?requested);
+
+        self.state_driver_queue
+            .queue_external_request(match requested {
+                InstanceStateRequested::Run => ExternalRequest::Start,
+                InstanceStateRequested::Stop => ExternalRequest::Stop,
+                InstanceStateRequested::Reboot => ExternalRequest::Reboot,
+            })
+            .map_err(Into::into)
+    }
+
+    pub(crate) fn services(&self) -> &services::VmServices {
+        self.services.as_ref().expect("active VMs always have services")
+    }
 }

 impl Drop for ActiveVm {
@@ -230,9 +263,8 @@ enum VmState {
     /// components and/or starting VM services.
     WaitingForInit,

-    /// There is an active virtual machine. Callers may try to upgrade the
-    /// contained weak reference to access its objects and services.
-    Active(Weak),
+    /// There is an active virtual machine.
+    Active(Arc),

     /// The active VM's state driver has exited, but the VM's objects and
     /// services have not yet been cleaned up.
     Rundown(RundownVm),

@@ -256,20 +288,51 @@ impl Vm {

     pub(super) fn active_vm(&self) -> Option> {
         let guard = self.inner.read().unwrap();
-        if let VmState::Active(weak) = &guard.state {
-            weak.upgrade()
+        if let VmState::Active(vm) = &guard.state {
+            Some(vm.clone())
         } else {
             None
         }
     }

-    fn start_failed(&self) {
-        let mut guard = self.inner.write().unwrap();
-        match guard.state {
-            VmState::WaitingForInit => guard.state = VmState::NoVm,
-            _ => unreachable!(
-                "only a starting VM's state worker calls start_failed"
-            ),
+    pub(super) async fn get(
+        &self,
+    ) -> Result {
+        let vm = match &self.inner.read().unwrap().state {
+            VmState::NoVm => {
+                return Err(VmError::NotCreated);
+            }
+            VmState::WaitingForInit => {
+                return Err(VmError::WaitingToInitialize);
+            }
+            VmState::Active(vm) => vm.clone(),
+            VmState::Rundown(vm) | VmState::RundownComplete(vm) => {
+                return Ok(propolis_api_types::InstanceSpecGetResponse {
+                    properties: vm.properties.clone(),
+                    state: vm.external_state_rx.borrow().state,
+                    spec: VersionedInstanceSpec::V0(vm.spec.clone()),
+                });
+            }
+        };
+
+        let spec = vm.objects().await.instance_spec().clone();
+        let state = vm.external_state_rx.borrow().clone();
+        Ok(propolis_api_types::InstanceSpecGetResponse {
+            properties: vm.properties.clone(),
+            spec: VersionedInstanceSpec::V0(spec),
+            state: state.state,
+        })
+    }
+
+    pub(super) fn state_watcher(&self) -> Result {
+        let guard = self.inner.read().unwrap();
+        match &guard.state {
+            VmState::NoVm => Err(VmError::NotCreated),
+            VmState::WaitingForInit => Err(VmError::WaitingToInitialize),
+            VmState::Active(vm) => Ok(vm.external_state_rx.clone()),
+            VmState::Rundown(vm) | VmState::RundownComplete(vm) => {
+                Ok(vm.external_state_rx.clone())
+            }
         }
     }

@@ -278,7 +341,7 @@ impl Vm {
         let old = std::mem::replace(&mut guard.state, VmState::NoVm);
         match old {
             VmState::WaitingForInit => {
-                guard.state = VmState::Active(Arc::downgrade(&active))
+                guard.state = VmState::Active(active.clone());
             }
             _ => unreachable!(
                 "only a starting VM's state worker calls make_active"
@@ -291,15 +354,14 @@ impl Vm {
         let old = std::mem::replace(&mut guard.state, VmState::NoVm);
         match old {
             VmState::Active(vm) => {
-                let active =
vm.upgrade().expect("state driver holds a ref"); guard.state = VmState::Rundown(RundownVm { - external_state_rx: active.external_state_rx.clone(), - properties: active.properties.clone(), - spec: active.objects().await.instance_spec.clone(), + external_state_rx: vm.external_state_rx.clone(), + properties: vm.properties.clone(), + spec: vm.objects().await.instance_spec.clone(), }); } _ => unreachable!( - "only an active VM's state worker calls make_defunct" + "only an active VM's state worker calls set_rundown" ), } } diff --git a/bin/propolis-server/src/lib/vm/services.rs b/bin/propolis-server/src/lib/vm/services.rs index 93edada29..586b38db1 100644 --- a/bin/propolis-server/src/lib/vm/services.rs +++ b/bin/propolis-server/src/lib/vm/services.rs @@ -19,15 +19,15 @@ use crate::{ use super::VmObjects; #[derive(Default)] -struct OximeterState { +pub(crate) struct OximeterState { server: Option, - stats: Option, + pub stats: Option, } -pub(super) struct VmServices { - serial_task: tokio::sync::Mutex>, - oximeter: tokio::sync::Mutex, - vnc_server: Arc>, +pub(crate) struct VmServices { + pub serial_task: tokio::sync::Mutex>, + pub oximeter: tokio::sync::Mutex, + pub vnc_server: Arc>, } impl VmServices { diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index a88ecd261..a9581056c 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -139,6 +139,18 @@ impl InputQueue { let mut guard = self.inner.lock().unwrap(); guard.external_requests.notify_instance_state_change(state); } + + pub(super) fn queue_external_request( + &self, + request: super::request_queue::ExternalRequest, + ) -> Result<(), super::request_queue::RequestDeniedReason> { + let mut inner = self.inner.lock().unwrap(); + let result = inner.external_requests.try_queue(request); + if result.is_ok() { + self.cv.notify_one(); + } + result + } } impl guest_event::GuestEventHandler for InputQueue { From eb33b4250939da71b92d09b859ccd7d5fca57e7a Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 25 Jun 2024 19:21:06 +0000 Subject: [PATCH 15/55] [WIP] shunt Crucible mutation to worker thread --- bin/propolis-server/src/lib/server.rs | 82 ++++--------- .../src/lib/vm/lifecycle_ops.rs | 4 +- .../src/lib/vm/migrate_commands.rs | 2 +- bin/propolis-server/src/lib/vm/mod.rs | 33 +++++- .../src/lib/vm/request_queue.rs | 42 +++++++ .../src/lib/vm/state_driver.rs | 110 ++++++++++++++++-- 6 files changed, 199 insertions(+), 74 deletions(-) diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index 202cdcd57..cd003c8c1 100644 --- a/bin/propolis-server/src/lib/server.rs +++ b/bin/propolis-server/src/lib/server.rs @@ -333,7 +333,7 @@ async fn instance_get_common( not_created_error() } _ => HttpError::for_internal_error(format!( - "error from VM controller: {e}" + "unexpected error from VM controller: {e}" )), }) } @@ -382,7 +382,7 @@ async fn instance_state_monitor( not_created_error() } _ => HttpError::for_internal_error(format!( - "error from VM controller: {e}" + "unexpected error from VM controller: {e}" )), })?; @@ -432,7 +432,7 @@ async fn instance_state_put( http::status::StatusCode::FORBIDDEN, ), _ => HttpError::for_internal_error(format!( - "error from VM controller: {e}" + "unexpected error from VM controller: {e}" )), }); @@ -572,7 +572,7 @@ async fn instance_migrate_status( not_created_error() } _ => HttpError::for_internal_error(format!( - "error from VM controller: {e}" + "unexpected 
error from VM controller: {e}" )), }) } @@ -643,66 +643,28 @@ async fn instance_issue_crucible_vcr_request( let request = request.into_inner(); let new_vcr_json = request.vcr_json; let disk_name = request.name; - let log = rqctx.log.clone(); - - // Get the instance spec for storage backend from the disk name. We use - // the VCR stored there to send to crucible along with the new VCR we want - // to replace it. - let vm_controller = rqctx.context().vm().await?; - - // TODO(#205): Mutating a VM's configuration should be a first-class - // operation in the VM controller that synchronizes with ongoing migrations - // and other attempts to mutate the VM. For the time being, use the instance - // spec lock to exclude other concurrent attempts to reconfigure this - // backend. - let mut spec = vm_controller.instance_spec().await; - let VersionedInstanceSpec::V0(v0_spec) = &mut *spec; - - let (readonly, old_vcr_json) = { - let bes = &v0_spec.backends.storage_backends.get(&disk_name); - if let Some(StorageBackendV0::Crucible(bes)) = bes { - (bes.readonly, &bes.request_json) - } else { - let s = format!("Crucible backend for {:?} not found", disk_name); - return Err(HttpError::for_not_found(Some(s.clone()), s)); - } - }; - // Get the crucible backend so we can call the replacement method on it. - let crucible_backends = vm_controller.crucible_backends(); - let backend = crucible_backends.get(&path_params.id).ok_or_else(|| { - let s = format!("No crucible backend for id {}", path_params.id); - HttpError::for_not_found(Some(s.clone()), s) - })?; + let (tx, rx) = tokio::sync::oneshot::channel(); + let vm = rqctx.context().vm.active_vm().ok_or_else(not_created_error)?; - slog::info!( - log, - "{:?} {:?} vcr replace requested", - disk_name, - path_params.id, - ); - - // Try the replacement. - // Crucible does the heavy lifting here to verify that the old/new - // VCRs are different in just the correct way and will return error - // if there is any mismatch. - let replace_result = - backend.vcr_replace(old_vcr_json, &new_vcr_json).await.map_err( - |e| HttpError::for_bad_request(Some(e.to_string()), e.to_string()), - )?; - - // Our replacement request was accepted. We now need to update the - // spec stored in propolis so it matches what the downstairs now has. - let new_storage_backend: StorageBackendV0 = - StorageBackendV0::Crucible(CrucibleStorageBackend { - readonly, - request_json: new_vcr_json, - }); - v0_spec.backends.storage_backends.insert(disk_name, new_storage_backend); + vm.reconfigure_crucible_volume(disk_name, path_params.id, new_vcr_json, tx) + .map_err(|e| match e { + VmError::ForbiddenStateChange(reason) => HttpError::for_status( + Some(format!("instance state change not allowed: {}", reason)), + http::status::StatusCode::FORBIDDEN, + ), + _ => HttpError::for_internal_error(format!( + "unexpected error from VM controller: {e}" + )), + })?; - slog::info!(log, "Replaced the VCR in backend of {:?}", path_params.id); + let result = rx.await.map_err(|_| { + HttpError::for_internal_error( + "VM worker task unexpectedly dropped result channel".to_string(), + ) + })?; - Ok(HttpResponseOk(replace_result)) + result.map(HttpResponseOk) } /// Issues an NMI to the instance. 
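The VCR endpoint above is the first user of the new "shunt mutation to the worker" convention: the HTTP handler no longer touches the instance spec itself, it queues a request carrying a oneshot sender and waits for the state driver to reply. A trimmed sketch of that request/reply plumbing with hypothetical names (not the propolis types):

    use tokio::sync::{mpsc, oneshot};

    enum WorkerRequest {
        ReconfigureVolume {
            disk_name: String,
            reply_tx: oneshot::Sender<Result<(), String>>,
        },
    }

    #[tokio::main]
    async fn main() {
        let (req_tx, mut req_rx) = mpsc::channel::<WorkerRequest>(8);

        // The worker owns the mutable VM configuration, so reconfiguration
        // is serialized with everything else it does.
        let worker = tokio::spawn(async move {
            while let Some(req) = req_rx.recv().await {
                match req {
                    WorkerRequest::ReconfigureVolume { disk_name, reply_tx } => {
                        // ... the VCR replacement would happen here ...
                        println!("reconfigured {disk_name}");
                        let _ = reply_tx.send(Ok(()));
                    }
                }
            }
        });

        // The "handler": enqueue the request, then await the result.
        let (reply_tx, reply_rx) = oneshot::channel();
        req_tx
            .send(WorkerRequest::ReconfigureVolume {
                disk_name: "disk0".to_string(),
                reply_tx,
            })
            .await
            .unwrap();
        let result = reply_rx.await.expect("worker dropped reply channel");
        println!("handler observed {result:?}");

        drop(req_tx);
        worker.await.unwrap();
    }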
diff --git a/bin/propolis-server/src/lib/vm/lifecycle_ops.rs b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs index aae45c785..a5d9a80bc 100644 --- a/bin/propolis-server/src/lib/vm/lifecycle_ops.rs +++ b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs @@ -142,7 +142,7 @@ impl super::VmObjects { }) } - pub(super) fn halt_devices(&self) { + pub(super) async fn halt_devices(&self) { self.for_each_device(|name, dev| { info!(self.log, "sending halt request to {}", name); dev.halt(); @@ -150,7 +150,7 @@ impl super::VmObjects { for (name, backend) in self.block_backends.iter() { info!(self.log, "stopping and detaching block backend {}", name); - backend.stop(); + backend.stop().await; if let Err(err) = backend.detach() { error!(self.log, "error detaching block backend"; "name" => name, diff --git a/bin/propolis-server/src/lib/vm/migrate_commands.rs b/bin/propolis-server/src/lib/vm/migrate_commands.rs index 08e564690..280ad4036 100644 --- a/bin/propolis-server/src/lib/vm/migrate_commands.rs +++ b/bin/propolis-server/src/lib/vm/migrate_commands.rs @@ -40,7 +40,7 @@ pub enum MigrateSourceCommand { pub enum MigrateSourceResponse { /// A previous migration out has (or has not) failed to restore the VM's /// dirty bitmap. - RedirtyingFailed(bool), + RedirtyingFailed(bool), /// A request to pause completed with the attached result. Pause(Result<(), std::io::Error>), diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index d3e2fc952..b038dcd6a 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -22,6 +22,7 @@ use propolis_api_types::{ use request_queue::ExternalRequest; use rfb::server::VncServer; use slog::info; +use uuid::Uuid; use crate::{ serial::Serial, server::MetricsEndpointConfig, vnc::PropolisVncServer, @@ -48,6 +49,11 @@ type InstanceStateRx = tokio::sync::watch::Receiver< propolis_api_types::InstanceStateMonitorResponse, >; +pub(crate) type CrucibleReplaceResult = + Result; +pub(crate) type CrucibleReplaceResultTx = + tokio::sync::oneshot::Sender; + #[derive(Debug, thiserror::Error)] pub(crate) enum VmError { #[error("VM ensure result channel unexpectedly closed")] @@ -83,7 +89,7 @@ struct VmInner { driver: Option>, } -struct VmObjects { +pub(crate) struct VmObjects { log: slog::Logger, instance_spec: InstanceSpecV0, machine: Machine, @@ -173,6 +179,12 @@ impl ActiveVm { self.objects.as_ref().unwrap().read().await } + async fn objects_mut( + &self, + ) -> tokio::sync::RwLockWriteGuard<'_, VmObjects> { + self.objects.as_ref().unwrap().write().await + } + pub(crate) fn put_state( &self, requested: InstanceStateRequested, @@ -189,6 +201,25 @@ impl ActiveVm { .map_err(Into::into) } + pub(crate) fn reconfigure_crucible_volume( + &self, + disk_name: String, + backend_id: Uuid, + new_vcr_json: String, + result_tx: CrucibleReplaceResultTx, + ) -> Result<(), VmError> { + self.state_driver_queue + .queue_external_request( + ExternalRequest::ReconfigureCrucibleVolume { + disk_name, + backend_id, + new_vcr_json, + result_tx, + }, + ) + .map_err(Into::into) + } + pub(crate) fn services(&self) -> &services::VmServices { self.services.as_ref().expect("active VMs always have services") } diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs index 55648bf54..4fa538519 100644 --- a/bin/propolis-server/src/lib/vm/request_queue.rs +++ b/bin/propolis-server/src/lib/vm/request_queue.rs @@ -64,6 +64,27 @@ pub enum ExternalRequest { /// Halts the VM. 
Note that this is not a graceful shutdown and does not /// coordinate with guest software. Stop, + + /// Attempts to update the volume construction request for the supplied + /// Crucible volume. + /// + /// TODO: Due to https://github.com/oxidecomputer/crucible/issues/871, this + /// is only allowed once the VM is started and the volume has activated, but + /// it should be allowed even before the VM has started. + ReconfigureCrucibleVolume { + /// The name of the Crucible backend component in the instance spec. + disk_name: String, + + /// The ID of the Crucible backend in the VM's Crucible backend map. + backend_id: Uuid, + + /// The new volume construction request to supply to the Crucible + /// upstairs. + new_vcr_json: String, + + /// The sink for the result of this operation. + result_tx: super::CrucibleReplaceResultTx, + }, } /// A set of reasons why a request to queue an external state transition can @@ -130,6 +151,7 @@ struct AllowedRequests { start: RequestDisposition, migrate_as_source: RequestDisposition, reboot: RequestDisposition, + mutate: RequestDisposition, stop: RequestDisposition, } @@ -153,6 +175,9 @@ impl ExternalRequestQueue { reboot: RequestDisposition::Deny( RequestDeniedReason::InstanceNotActive, ), + mutate: RequestDisposition::Deny( + RequestDeniedReason::InstanceNotActive, + ), stop: RequestDisposition::Enqueue, }, log, @@ -181,6 +206,9 @@ impl ExternalRequestQueue { self.allowed.migrate_as_source } ExternalRequest::Reboot => self.allowed.reboot, + ExternalRequest::ReconfigureCrucibleVolume { .. } => { + self.allowed.mutate + } // Requests to stop always succeed. Note that a request to stop a VM // that hasn't started should still be queued to the state worker so @@ -256,6 +284,7 @@ impl ExternalRequestQueue { start: Disposition::Ignore, migrate_as_source: Disposition::Deny(reason), reboot: Disposition::Deny(reason), + mutate: Disposition::Deny(reason), stop: self.allowed.stop, } } @@ -272,6 +301,9 @@ impl ExternalRequestQueue { reboot: Disposition::Deny( DenyReason::InvalidRequestForMigrationSource, ), + mutate: Disposition::Deny( + DenyReason::InvalidRequestForMigrationSource, + ), stop: self.allowed.stop, } } @@ -293,10 +325,17 @@ impl ExternalRequestQueue { start: Disposition::Deny(reason), migrate_as_source: Disposition::Deny(reason), reboot: Disposition::Deny(reason), + mutate: Disposition::Deny(reason), stop: Disposition::Ignore, } } + // Requests to mutate VM configuration don't move the VM state + // machine and don't change any request dispositions. + ChangeReason::ApiRequest( + ExternalRequest::ReconfigureCrucibleVolume { .. }, + ) => self.allowed, + // When an instance begins running, requests to migrate out of it or // to reboot it become valid. 
ChangeReason::StateChange(InstanceStateChange::StartedRunning) => { @@ -304,6 +343,7 @@ impl ExternalRequestQueue { start: self.allowed.start, migrate_as_source: Disposition::Enqueue, reboot: Disposition::Enqueue, + mutate: Disposition::Enqueue, stop: self.allowed.stop, } } @@ -333,6 +373,7 @@ impl ExternalRequestQueue { start: Disposition::Deny(reason), migrate_as_source: Disposition::Deny(reason), reboot: Disposition::Deny(reason), + mutate: Disposition::Deny(reason), stop: Disposition::Ignore, } } @@ -342,6 +383,7 @@ impl ExternalRequestQueue { start: Disposition::Deny(reason), migrate_as_source: Disposition::Deny(reason), reboot: Disposition::Deny(reason), + mutate: Disposition::Deny(reason), stop: self.allowed.stop, } } diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index a9581056c..80cad3c29 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -10,7 +10,11 @@ use std::{ }; use propolis_api_types::{ - instance_spec::VersionedInstanceSpec, InstanceProperties, InstanceState, + instance_spec::{ + components::backends::CrucibleStorageBackend, v0::StorageBackendV0, + VersionedInstanceSpec, + }, + InstanceProperties, InstanceState, }; use slog::{error, info}; use uuid::Uuid; @@ -25,6 +29,7 @@ use crate::{ use super::{ guest_event::{self, GuestEvent}, + request_queue::ExternalRequest, InstanceStateTx, VmError, VmObjects, }; @@ -81,7 +86,7 @@ enum VmStartReason { #[derive(Debug)] enum InputQueueEvent { - ExternalRequest(super::request_queue::ExternalRequest), + ExternalRequest(ExternalRequest), GuestEvent(GuestEvent), } @@ -142,7 +147,7 @@ impl InputQueue { pub(super) fn queue_external_request( &self, - request: super::request_queue::ExternalRequest, + request: ExternalRequest, ) -> Result<(), super::request_queue::RequestDeniedReason> { let mut inner = self.inner.lock().unwrap(); let result = inner.external_requests.try_queue(request); @@ -431,26 +436,40 @@ impl StateDriver { async fn handle_external_request( &mut self, - request: super::request_queue::ExternalRequest, + request: ExternalRequest, ) -> HandleEventOutcome { match request { - super::request_queue::ExternalRequest::Start => { + ExternalRequest::Start => { match self.start_vm(VmStartReason::ExplicitRequest).await { Ok(_) => HandleEventOutcome::Continue, Err(_) => HandleEventOutcome::Exit, } } - super::request_queue::ExternalRequest::MigrateAsSource { - .. - } => todo!("gjc"), - super::request_queue::ExternalRequest::Reboot => { + ExternalRequest::MigrateAsSource { .. 
} => todo!("gjc"), + ExternalRequest::Reboot => { self.do_reboot(); HandleEventOutcome::Continue } - super::request_queue::ExternalRequest::Stop => { + ExternalRequest::Stop => { self.do_halt(); HandleEventOutcome::Exit } + ExternalRequest::ReconfigureCrucibleVolume { + disk_name, + backend_id, + new_vcr_json, + result_tx, + } => { + result_tx.send( + self.reconfigure_crucible_volume( + disk_name, + &backend_id, + new_vcr_json, + ) + .await, + ); + HandleEventOutcome::Continue + } } } @@ -592,6 +611,12 @@ impl StateDriver { self.active_vm.objects().await } + async fn vm_objects_mut( + &self, + ) -> tokio::sync::RwLockWriteGuard<'_, VmObjects> { + self.active_vm.objects_mut().await + } + async fn vm_objects_and_cpus( &mut self, ) -> ( @@ -600,6 +625,71 @@ impl StateDriver { ) { (self.active_vm.objects().await, self.vcpu_tasks.as_mut()) } + + async fn reconfigure_crucible_volume( + &self, + disk_name: String, + backend_id: &Uuid, + new_vcr_json: String, + ) -> super::CrucibleReplaceResult { + info!(self.log, "request to replace Crucible VCR"; + "disk_name" => %disk_name, + "backend_id" => %backend_id); + + let mut objects = self.vm_objects_mut().await; + + fn spec_element_not_found(disk_name: &str) -> dropshot::HttpError { + let msg = format!("Crucible backend for {:?} not found", disk_name); + dropshot::HttpError::for_not_found(Some(msg.clone()), msg) + } + + let (readonly, old_vcr_json) = { + let StorageBackendV0::Crucible(bes) = objects + .instance_spec + .backends + .storage_backends + .get(&disk_name) + .ok_or_else(|| spec_element_not_found(&disk_name))? + else { + return Err(spec_element_not_found(&disk_name)); + }; + + (bes.readonly, &bes.request_json) + }; + + let replace_result = { + let backend = + objects.crucible_backends.get(backend_id).ok_or_else(|| { + let msg = + format!("No crucible backend for id {backend_id}"); + dropshot::HttpError::for_not_found(Some(msg.clone()), msg) + })?; + + backend.vcr_replace(old_vcr_json, &new_vcr_json).await.map_err( + |e| { + dropshot::HttpError::for_bad_request( + Some(e.to_string()), + e.to_string(), + ) + }, + ) + }?; + + let new_bes = StorageBackendV0::Crucible(CrucibleStorageBackend { + readonly, + request_json: new_vcr_json, + }); + + objects + .instance_spec + .backends + .storage_backends + .insert(disk_name, new_bes); + + info!(self.log, "replaced Crucible VCR"; "backend_id" => %backend_id); + + Ok(replace_result) + } } fn reset_vcpus( From 1a3430d41056367b977679ae5cd0e34c9f2fe92b Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 25 Jun 2024 20:22:28 +0000 Subject: [PATCH 16/55] [WIP] clean up warnings --- bin/propolis-server/src/lib/server.rs | 71 +++++++++---------- .../src/lib/vm/lifecycle_ops.rs | 27 ------- bin/propolis-server/src/lib/vm/mod.rs | 39 +++++----- bin/propolis-server/src/lib/vm/services.rs | 25 ++++--- .../src/lib/vm/state_driver.rs | 20 +++--- bin/propolis-server/src/lib/vnc.rs | 2 +- 6 files changed, 76 insertions(+), 108 deletions(-) diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index cd003c8c1..e70fa30c4 100644 --- a/bin/propolis-server/src/lib/server.rs +++ b/bin/propolis-server/src/lib/server.rs @@ -16,7 +16,6 @@ use std::sync::Arc; use std::{collections::BTreeMap, net::SocketAddr}; use crate::serial::history_buffer::SerialHistoryOffset; -use crate::serial::SerialTaskControlMessage; use crate::vm::VmError; use dropshot::{ channel, endpoint, ApiDescription, HttpError, HttpResponseCreated, @@ -29,21 +28,17 @@ use internal_dns::ServiceName; pub use 
nexus_client::Client as NexusClient; use oximeter::types::ProducerRegistry; use propolis_api_types as api; -use propolis_api_types::instance_spec::{ - self, components::backends::CrucibleStorageBackend, v0::StorageBackendV0, - VersionedInstanceSpec, -}; +use propolis_api_types::instance_spec::{self, VersionedInstanceSpec}; pub use propolis_server_config::Config as VmTomlConfig; use rfb::server::VncServer; -use slog::{error, info, o, warn, Logger}; +use slog::{error, warn, Logger}; use thiserror::Error; -use tokio::sync::{mpsc, oneshot, MappedMutexGuard, Mutex, MutexGuard}; +use tokio::sync::MutexGuard; use tokio_tungstenite::tungstenite::protocol::{Role, WebSocketConfig}; use tokio_tungstenite::WebSocketStream; use crate::spec::{ServerSpecBuilder, ServerSpecBuilderError}; -use crate::stats::virtual_machine::VirtualMachine; use crate::vnc::PropolisVncServer; pub(crate) type DeviceMap = @@ -86,22 +81,13 @@ pub struct StaticConfig { metrics: Option, } -/// Objects related to Propolis's Oximeter metric production. -pub struct OximeterState { - /// The metric producer server. - server: Option, - - /// The metrics wrapper for "server-level" metrics, i.e., metrics that are - /// tracked by the server itself (as opposed to being tracked by a component - /// within an instance). - stats: Option, -} - /// Context accessible from HTTP callbacks. pub struct DropshotEndpointContext { static_config: StaticConfig, vnc_server: Arc>, - pub vm: Arc, + pub(crate) vm: Arc, + + #[allow(dead_code)] log: Logger, } @@ -273,11 +259,25 @@ async fn instance_ensure_common( .vm .ensure(rqctx.log.clone(), request, ensure_options) .await - .expect("gjc"); - - Ok(HttpResponseCreated(api::InstanceEnsureResponse { - migrate: todo!("gjc"), - })) + .map(HttpResponseCreated) + .map_err(|e| match e { + VmError::EnsureResultClosed => HttpError::for_internal_error( + "state driver unexpectedly dropped result channel".to_string(), + ), + VmError::WaitingToInitialize + | VmError::AlreadyInitialized + | VmError::RundownInProgress => HttpError::for_client_error( + Some(api::ErrorCode::AlreadyInitialized.to_string()), + http::StatusCode::CONFLICT, + "instance already initialized".to_string(), + ), + VmError::InitializationFailed(e) => HttpError::for_internal_error( + format!("VM initialization failed: {e}"), + ), + _ => HttpError::for_internal_error(format!( + "unexpected error from VM controller: {e}" + )), + }) } #[endpoint { @@ -355,7 +355,7 @@ async fn instance_spec_get( async fn instance_get( rqctx: RequestContext>, ) -> Result, HttpError> { - Ok(instance_get_common(&rqctx).await.map(|full| { + instance_get_common(&rqctx).await.map(|full| { HttpResponseOk(api::InstanceGetResponse { instance: api::Instance { properties: full.properties, @@ -364,7 +364,7 @@ async fn instance_get( nics: vec![], }, }) - })?) 
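The handlers in this commit converge on one translation policy from VmError to HTTP responses: "no VM yet" maps to not-found, initialization and rundown races map to conflicts on the ensure path, forbidden state changes map to 403, and anything else becomes a 500. A stand-alone sketch of the state-change mapping with a stand-in error type and bare status codes (the real code goes through dropshot's HttpError constructors):

    #[derive(Debug)]
    enum VmError {
        NotCreated,
        WaitingToInitialize,
        ForbiddenStateChange(String),
        Internal(String),
    }

    // Mapping used by the state-change endpoints; note that the ensure path
    // instead treats WaitingToInitialize as a 409 conflict.
    fn to_http_status(e: &VmError) -> (u16, String) {
        match e {
            VmError::NotCreated | VmError::WaitingToInitialize => {
                (404, "instance not created".to_string())
            }
            VmError::ForbiddenStateChange(reason) => {
                (403, format!("instance state change not allowed: {reason}"))
            }
            VmError::Internal(msg) => {
                (500, format!("unexpected error from VM controller: {msg}"))
            }
        }
    }

    fn main() {
        let err = VmError::ForbiddenStateChange("start in progress".into());
        println!("{:?}", to_http_status(&err));
    }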
+    })
 }

 #[endpoint {
@@ -419,7 +419,7 @@ async fn instance_state_put(
 ) -> Result {
     let ctx = rqctx.context();
     let requested_state = request.into_inner();
-    let vm = ctx.vm.active_vm().ok_or_else(|| not_created_error())?;
+    let vm = ctx.vm.active_vm().ok_or_else(not_created_error)?;
     let result = vm
         .put_state(requested_state)
         .map(|_| HttpResponseUpdatedNoContent {})
@@ -461,7 +461,7 @@ async fn instance_serial_history_get(
 ) -> Result, HttpError> {
     let ctx = rqctx.context();

-    let vm = ctx.vm.active_vm().ok_or_else(|| not_created_error())?;
+    let vm = ctx.vm.active_vm().ok_or_else(not_created_error)?;
     let serial = vm.objects().await.com1().clone();
     let query_params = query.into_inner();

@@ -489,7 +489,7 @@ async fn instance_serial(
     websock: WebsocketConnection,
 ) -> dropshot::WebsocketChannelResult {
     let ctx = rqctx.context();
-    let vm = ctx.vm.active_vm().ok_or_else(|| not_created_error())?;
+    let vm = ctx.vm.active_vm().ok_or_else(not_created_error)?;
     let serial = vm.objects().await.com1().clone();

     // Use the default buffering parameters for the websocket configuration
@@ -586,8 +586,7 @@ async fn instance_issue_crucible_snapshot_request(
     rqctx: RequestContext>,
     path_params: Path,
 ) -> Result, HttpError> {
-    let vm =
-        rqctx.context().vm.active_vm().ok_or_else(|| not_created_error())?;
+    let vm = rqctx.context().vm.active_vm().ok_or_else(not_created_error)?;
     let objects = vm.objects().await;
     let crucible_backends = objects.crucible_backends();
     let path_params = path_params.into_inner();
@@ -613,8 +612,7 @@ async fn disk_volume_status(
     path_params: Path,
 ) -> Result, HttpError> {
     let path_params = path_params.into_inner();
-    let vm =
-        rqctx.context().vm.active_vm().ok_or_else(|| not_created_error())?;
+    let vm = rqctx.context().vm.active_vm().ok_or_else(not_created_error)?;
     let objects = vm.objects().await;
     let crucible_backends = objects.crucible_backends();
     let backend = crucible_backends.get(&path_params.id).ok_or_else(|| {
         let s = format!("No crucible backend for id {}", path_params.id);
         HttpError::for_not_found(Some(s.clone()), s)
@@ -675,10 +673,9 @@ async fn instance_issue_nmi(
     rqctx: RequestContext>,
 ) -> Result, HttpError> {
-    let vm =
-        rqctx.context().vm.active_vm().ok_or_else(|| not_created_error())?;
+    let vm = rqctx.context().vm.active_vm().ok_or_else(not_created_error)?;
     let objects = vm.objects().await;
-    objects.machine().inject_nmi();
+    let _ = objects.machine().inject_nmi();

     Ok(HttpResponseOk(()))
 }

diff --git a/bin/propolis-server/src/lib/vm/lifecycle_ops.rs b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs
index a5d9a80bc..4843969d2 100644
--- a/bin/propolis-server/src/lib/vm/lifecycle_ops.rs
+++ b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs
@@ -10,33 +10,6 @@ use std::{

 use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt};
 use slog::{error, info};

-pub(super) trait VmLifecycle: Send + Sync {
-    /// Resume a previously-paused VM at the kernel VMM level. This will resume
-    /// any timers driving in-kernel-emulated devices, and allow the vCPU to run
-    /// again.
-    fn resume_vm(&self);
-
-    /// Sends a reset request to each device in the instance, then sends a
-    /// reset command to the instance's bhyve VM.
-    fn reset_devices_and_machine(&self);
-
-    /// Sends each device (and backend) a start request.
-    fn start_devices(&self) -> BoxFuture<'_, anyhow::Result<()>>;
-
-    /// Sends each device a pause request. Returns a future that can be awaited
-    /// to wait for all pause requests to complete.
-    fn pause_devices(&self) -> BoxFuture<'_, ()>;
-
-    /// Sends each device a resume request.
- fn resume_devices(&self); - - /// Sends each device (and backend) a halt request. - fn halt_devices(&self); - - /// Resets the state of each vCPU in the instance to its on-reboot state. - fn reset_vcpu_state(&self); -} - impl super::VmObjects { /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated /// devices and vCPUs are brought to a consistent state. diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index b038dcd6a..fc3f17b41 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -251,16 +251,18 @@ impl Drop for ActiveVm { services.stop(&log).await; let tx = driver.await.expect("state driver shouldn't panic"); - let old_state = tx.borrow(); - let new_state = InstanceStateMonitorResponse { - gen: old_state.gen + 1, - state: propolis_api_types::InstanceState::Destroyed, - migration: old_state.migration.clone(), + let new_state = { + let old_state = tx.borrow(); + InstanceStateMonitorResponse { + gen: old_state.gen + 1, + state: propolis_api_types::InstanceState::Destroyed, + migration: old_state.migration.clone(), + } }; tx.send(new_state).expect("VM in rundown should hold a receiver"); - parent.complete_rundown(); + parent.complete_rundown().await; }); } } @@ -381,20 +383,17 @@ impl Vm { } async fn set_rundown(&self) { - let mut guard = self.inner.write().unwrap(); - let old = std::mem::replace(&mut guard.state, VmState::NoVm); - match old { - VmState::Active(vm) => { - guard.state = VmState::Rundown(RundownVm { - external_state_rx: vm.external_state_rx.clone(), - properties: vm.properties.clone(), - spec: vm.objects().await.instance_spec.clone(), - }); - } - _ => unreachable!( - "only an active VM's state worker calls set_rundown" - ), - } + let vm = self + .active_vm() + .expect("VM should be active before being run down"); + + let new_state = VmState::Rundown(RundownVm { + external_state_rx: vm.external_state_rx.clone(), + properties: vm.properties.clone(), + spec: vm.objects().await.instance_spec.clone(), + }); + + self.inner.write().unwrap().state = new_state; } async fn complete_rundown(&self) { diff --git a/bin/propolis-server/src/lib/vm/services.rs b/bin/propolis-server/src/lib/vm/services.rs index 586b38db1..5d01b7516 100644 --- a/bin/propolis-server/src/lib/vm/services.rs +++ b/bin/propolis-server/src/lib/vm/services.rs @@ -43,22 +43,21 @@ impl VmServices { "should have a producer registry if metrics are configured", ); - let state = - register_oximeter_producer(log, cfg, registry, vm_properties) - .await; - - state + register_oximeter_producer(log, cfg, registry, vm_properties).await } else { OximeterState::default() }; let vnc_server = ensure_options.vnc_server.clone(); if let Some(ramfb) = &vm_objects.framebuffer { - vnc_server.server.initialize( - crate::vnc::RamFb::new(ramfb.get_framebuffer_spec()), - vm_objects.ps2ctrl.clone(), - vm.clone(), - ); + vnc_server + .server + .initialize( + crate::vnc::RamFb::new(ramfb.get_framebuffer_spec()), + vm_objects.ps2ctrl.clone(), + vm.clone(), + ) + .await; let notifier_server_ref = vnc_server.clone(); let rt = tokio::runtime::Handle::current(); @@ -115,8 +114,8 @@ async fn register_oximeter_producer( // don't need our own task for that or way to shut it down. 
oximeter_state.server = match crate::stats::start_oximeter_server( virtual_machine.instance_id, - &cfg, - &log, + cfg, + log, registry, ) { Ok(server) => { @@ -140,7 +139,7 @@ async fn register_oximeter_producer( oximeter_state.stats = match crate::stats::register_server_metrics( registry, virtual_machine, - &log, + log, ) .await { diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 80cad3c29..f06c12836 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -306,7 +306,7 @@ pub(super) async fn run_state_driver( // Order matters here: once the ensure result is sent, an external // caller needs to observe that an active VM is present. vm.make_active(active_vm.clone()); - ensure_result_tx.send(Ok( + let _ = ensure_result_tx.send(Ok( propolis_api_types::InstanceEnsureResponse { migrate: None }, )); @@ -332,7 +332,7 @@ pub(super) async fn run_state_driver( impl StateDriver { pub(super) async fn run(mut self) -> super::InstanceStateTx { self.run_loop().await; - self.parent.set_rundown(); + self.parent.set_rundown().await; self.external_state_tx } @@ -371,7 +371,7 @@ impl StateDriver { let (vm_objects, vcpu_tasks) = self.vm_objects_and_cpus().await; match start_reason { VmStartReason::ExplicitRequest => { - reset_vcpus(&*vm_objects, vcpu_tasks); + reset_vcpus(&vm_objects, vcpu_tasks); } VmStartReason::MigratedIn => { vm_objects.resume_vm(); @@ -447,11 +447,11 @@ impl StateDriver { } ExternalRequest::MigrateAsSource { .. } => todo!("gjc"), ExternalRequest::Reboot => { - self.do_reboot(); + self.do_reboot().await; HandleEventOutcome::Continue } ExternalRequest::Stop => { - self.do_halt(); + self.do_halt().await; HandleEventOutcome::Exit } ExternalRequest::ReconfigureCrucibleVolume { @@ -460,7 +460,7 @@ impl StateDriver { new_vcr_json, result_tx, } => { - result_tx.send( + let _ = result_tx.send( self.reconfigure_crucible_volume( disk_name, &backend_id, @@ -493,7 +493,7 @@ impl StateDriver { // Reset all entities and the VM's bhyve state, then reset the // vCPUs. The vCPU reset must come after the bhyve reset. vm_objects.reset_devices_and_machine(); - reset_vcpus(&*vm_objects, vcpu_tasks); + reset_vcpus(&vm_objects, vcpu_tasks); // Resume devices so they're ready to do more work, then resume // vCPUs. 
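            // (This mirrors the halt path, where vCPUs stop entering the
            // guest before devices are halted: on the way back up, devices
            // are ready before any vCPU re-enters the guest.)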
@@ -525,7 +525,7 @@ impl StateDriver { } self.vcpu_tasks.exit_all(); - self.vm_objects().await.halt_devices(); + self.vm_objects().await.halt_devices().await; self.publish_steady_state(InstanceState::Stopped); } @@ -731,8 +731,8 @@ async fn initialize_vm_from_spec( devices: Default::default(), block_backends: Default::default(), crucible_backends: Default::default(), - spec: &v0_spec, - properties: &properties, + spec: v0_spec, + properties, toml_config: &options.toml_config, producer_registry: options.oximeter_registry.clone(), state: MachineInitializerState::default(), diff --git a/bin/propolis-server/src/lib/vnc.rs b/bin/propolis-server/src/lib/vnc.rs index 8f92dbbf4..c9736e4d0 100644 --- a/bin/propolis-server/src/lib/vnc.rs +++ b/bin/propolis-server/src/lib/vnc.rs @@ -77,7 +77,7 @@ impl PropolisVncServer { } } - pub async fn initialize( + pub(crate) async fn initialize( &self, fb: RamFb, ps2ctrl: Arc, From 8cf4b9a7d1445431fb85e2d4eba47ec1f25346ac Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 25 Jun 2024 20:54:09 +0000 Subject: [PATCH 17/55] [WIP] oops I blocked the executor --- bin/propolis-server/src/lib/vm/mod.rs | 6 +-- .../src/lib/vm/state_driver.rs | 42 ++++++++++++------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index fc3f17b41..1605f2496 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -410,8 +410,7 @@ impl Vm { log: slog::Logger, ensure_request: propolis_api_types::InstanceSpecEnsureRequest, options: EnsureOptions, - ) -> anyhow::Result - { + ) -> Result { let (ensure_reply_tx, ensure_rx) = tokio::sync::oneshot::channel(); // Take the lock for writing, since in the common case this call will be @@ -430,9 +429,10 @@ impl Vm { guard.state = VmState::WaitingForInit; let vm_for_driver = self.clone(); + let log_for_driver = log.clone(); guard.driver = Some(tokio::spawn(async move { state_driver::run_state_driver( - log, + log_for_driver, vm_for_driver, ensure_request, ensure_reply_tx, diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index f06c12836..f1debd3ae 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -16,7 +16,7 @@ use propolis_api_types::{ }, InstanceProperties, InstanceState, }; -use slog::{error, info}; +use slog::{debug, error, info, trace, warn}; use uuid::Uuid; use crate::{ @@ -120,21 +120,23 @@ impl InputQueue { } fn wait_for_next_event(&self) -> InputQueueEvent { - let guard = self.inner.lock().unwrap(); - let mut guard = self - .cv - .wait_while(guard, |i| { - i.external_requests.is_empty() && i.guest_events.is_empty() - }) - .unwrap(); - - if let Some(guest_event) = guard.guest_events.pop_front() { - InputQueueEvent::GuestEvent(guest_event) - } else { - InputQueueEvent::ExternalRequest( - guard.external_requests.pop_front().unwrap(), - ) - } + tokio::task::block_in_place(|| { + let guard = self.inner.lock().unwrap(); + let mut guard = self + .cv + .wait_while(guard, |i| { + i.external_requests.is_empty() && i.guest_events.is_empty() + }) + .unwrap(); + + if let Some(guest_event) = guard.guest_events.pop_front() { + InputQueueEvent::GuestEvent(guest_event) + } else { + InputQueueEvent::ExternalRequest( + guard.external_requests.pop_front().unwrap(), + ) + } + }) } fn notify_instance_state_change( @@ -264,6 +266,8 @@ pub(super) async fn run_state_driver( let (vcpu_tasks, active_vm) = 
match ensure_request.migrate { None => { + trace!(log, "starting VM initialization"); + let (vm_objects, vcpu_tasks) = match initialize_vm_from_spec( &log, &input_queue, @@ -281,6 +285,8 @@ pub(super) async fn run_state_driver( } }; + trace!(log, "initialized VM objects"); + let services = super::services::VmServices::new( &log, &vm, @@ -290,6 +296,8 @@ pub(super) async fn run_state_driver( ) .await; + trace!(log, "initialized VM services"); + let active_vm = Arc::new(super::ActiveVm { parent: vm.clone(), log: log.clone(), @@ -310,6 +318,8 @@ pub(super) async fn run_state_driver( propolis_api_types::InstanceEnsureResponse { migrate: None }, )); + trace!(log, "made VM active"); + (vcpu_tasks, active_vm) } Some(_migrate) => todo!("gjc"), From c2fc6d2f26e6afd02865efa208c1e2c035d111c4 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 26 Jun 2024 02:43:44 +0000 Subject: [PATCH 18/55] [WIP] improve log inheritance --- bin/propolis-server/src/lib/server.rs | 6 ++--- bin/propolis-server/src/lib/vm/mod.rs | 22 +++++++++++++++---- .../src/lib/vm/request_queue.rs | 2 -- .../src/lib/vm/state_driver.rs | 4 ++-- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index e70fa30c4..14c8b1c18 100644 --- a/bin/propolis-server/src/lib/server.rs +++ b/bin/propolis-server/src/lib/server.rs @@ -86,8 +86,6 @@ pub struct DropshotEndpointContext { static_config: StaticConfig, vnc_server: Arc>, pub(crate) vm: Arc, - - #[allow(dead_code)] log: Logger, } @@ -107,7 +105,7 @@ impl DropshotEndpointContext { metrics: metric_config, }, vnc_server, - vm: crate::vm::Vm::new(), + vm: crate::vm::Vm::new(&log), log, } } @@ -257,7 +255,7 @@ async fn instance_ensure_common( server_context .vm - .ensure(rqctx.log.clone(), request, ensure_options) + .ensure(&server_context.log, request, ensure_options) .await .map(HttpResponseCreated) .map_err(|e| match e { diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 1605f2496..c926b2a15 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -82,6 +82,7 @@ pub(crate) enum VmError { /// `Arc`. pub(crate) struct Vm { inner: RwLock, + log: slog::Logger, } struct VmInner { @@ -153,6 +154,12 @@ impl VmObjects { } } +impl Drop for VmObjects { + fn drop(&mut self) { + info!(self.log, "dropping VM objects"); + } +} + /// The state stored in a [`Vm`] when there is an actual underlying virtual /// machine. 
pub(super) struct ActiveVm { @@ -227,6 +234,8 @@ impl ActiveVm { impl Drop for ActiveVm { fn drop(&mut self) { + info!(self.log, "dropping active VM"); + let driver = self .parent .inner @@ -314,9 +323,10 @@ pub(super) struct EnsureOptions { } impl Vm { - pub fn new() -> Arc { + pub fn new(log: &slog::Logger) -> Arc { + let log = log.new(slog::o!("component" => "vm_wrapper")); let inner = VmInner { state: VmState::NoVm, driver: None }; - Arc::new(Self { inner: RwLock::new(inner) }) + Arc::new(Self { inner: RwLock::new(inner), log }) } pub(super) fn active_vm(&self) -> Option> { @@ -370,6 +380,7 @@ impl Vm { } fn make_active(&self, active: Arc) { + info!(self.log, "installing active VM"); let mut guard = self.inner.write().unwrap(); let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { @@ -387,6 +398,7 @@ impl Vm { .active_vm() .expect("VM should be active before being run down"); + info!(self.log, "setting VM rundown"); let new_state = VmState::Rundown(RundownVm { external_state_rx: vm.external_state_rx.clone(), properties: vm.properties.clone(), @@ -397,6 +409,7 @@ impl Vm { } async fn complete_rundown(&self) { + info!(self.log, "completing VM rundown"); let mut guard = self.inner.write().unwrap(); let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { @@ -407,7 +420,7 @@ impl Vm { pub(crate) async fn ensure( self: &Arc, - log: slog::Logger, + log: &slog::Logger, ensure_request: propolis_api_types::InstanceSpecEnsureRequest, options: EnsureOptions, ) -> Result { @@ -429,7 +442,8 @@ impl Vm { guard.state = VmState::WaitingForInit; let vm_for_driver = self.clone(); - let log_for_driver = log.clone(); + let log_for_driver = + log.new(slog::o!("component" => "vm_state_driver")); guard.driver = Some(tokio::spawn(async move { state_driver::run_state_driver( log_for_driver, diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs index 4fa538519..076709f9a 100644 --- a/bin/propolis-server/src/lib/vm/request_queue.rs +++ b/bin/propolis-server/src/lib/vm/request_queue.rs @@ -318,8 +318,6 @@ impl ExternalRequestQueue { // Requests to stop the instance block other requests from being // queued. Additional requests to stop are ignored for idempotency. 
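            // (Concretely: once a stop request is queued, later start and
            // migrate-out requests are denied with `HaltPending`, while a
            // repeated stop request reports success without enqueueing
            // anything.)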
ChangeReason::ApiRequest(ExternalRequest::Stop) => { - assert!(matches!(self.allowed.start, Disposition::Ignore)); - let reason = DenyReason::HaltPending; AllowedRequests { start: Disposition::Deny(reason), diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index f1debd3ae..a2f80a979 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -261,7 +261,7 @@ pub(super) async fn run_state_driver( ); let input_queue = Arc::new(InputQueue::new( - log.new(slog::o!("component" => "vmm_request_queue")), + log.new(slog::o!("component" => "request_queue")), )); let (vcpu_tasks, active_vm) = match ensure_request.migrate { @@ -326,7 +326,7 @@ pub(super) async fn run_state_driver( }; let state_driver = StateDriver { - log: log.new(slog::o!("component" => "vmm_state_driver")), + log, parent: vm.clone(), active_vm, input_queue, From 290cfbd4080d88bdd0eae57c81fa0b2f32f9a27c Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 26 Jun 2024 17:25:47 +0000 Subject: [PATCH 19/55] [WIP] hook up destination side of migration --- .../src/lib/migrate/destination.rs | 107 +++-- bin/propolis-server/src/lib/migrate/mod.rs | 20 +- bin/propolis-server/src/lib/server.rs | 1 + .../src/lib/vm/migrate_commands.rs | 13 +- bin/propolis-server/src/lib/vm/mod.rs | 22 +- .../src/lib/vm/state_driver.rs | 381 +++++++++++------- .../src/lib/vm/state_publisher.rs | 97 +++++ 7 files changed, 444 insertions(+), 197 deletions(-) create mode 100644 bin/propolis-server/src/lib/vm/state_publisher.rs diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index f966f3916..a258681da 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -24,24 +24,31 @@ use crate::migrate::probes; use crate::migrate::{ Device, MigrateError, MigratePhase, MigrateRole, MigrationState, PageIter, }; -use crate::vm::{migrate_commands::MigrateTargetCommand, ActiveVm}; +use crate::vm::migrate_commands::MigrateTargetCommand; +use crate::vm::migrate_commands::MigrateTargetResponse; use super::protocol::Protocol; /// Launches an attempt to migrate into a supplied instance using the supplied /// source connection. pub async fn migrate( - vm: Arc, + log: &slog::Logger, command_tx: tokio::sync::mpsc::Sender, + response_rx: tokio::sync::mpsc::Receiver, conn: WebSocketStream, local_addr: SocketAddr, protocol: Protocol, ) -> Result<(), MigrateError> { let err_tx = command_tx.clone(); + let log = log.new(slog::o!("component" => "migration_target_protocol")); let mut proto = match protocol { - Protocol::RonV0 => { - DestinationProtocol::new(vm, command_tx, conn, local_addr) - } + Protocol::RonV0 => DestinationProtocol::new( + log, + command_tx, + response_rx, + conn, + local_addr, + ), }; if let Err(err) = proto.run().await { @@ -64,33 +71,49 @@ pub async fn migrate( } struct DestinationProtocol { - /// The VM controller for the instance of interest. - vm: Arc, + /// The logger for messages from this protocol. + log: slog::Logger, /// The channel to use to send messages to the state worker coordinating /// this migration. command_tx: tokio::sync::mpsc::Sender, + /// The channel that receives responses from the state worker coordinating + /// this migration. + response_rx: tokio::sync::mpsc::Receiver, + /// Transport to the source Instance. 
conn: WebSocketStream, /// Local propolis-server address /// (to inform the source-side where to redirect its clients) local_addr: SocketAddr, + + /// The VM objects into which to import the source VM's state. Only + /// initialized after the sync phase. + vm_objects: Option>, } impl DestinationProtocol { fn new( - vm: Arc, + log: slog::Logger, command_tx: tokio::sync::mpsc::Sender, + response_rx: tokio::sync::mpsc::Receiver, conn: WebSocketStream, local_addr: SocketAddr, ) -> Self { - Self { vm, command_tx, conn, local_addr } + Self { + log, + command_tx, + response_rx, + conn, + local_addr, + vm_objects: None, + } } fn log(&self) -> &slog::Logger { - self.vm.log() + &self.log } async fn update_state(&mut self, state: MigrationState) { @@ -170,9 +193,22 @@ impl DestinationProtocol { } }?; info!(self.log(), "Destination read Preamble: {:?}", preamble); - if let Err(e) = preamble - .is_migration_compatible(&*self.vm.objects().await.instance_spec()) - { + + self.command_tx + .send(MigrateTargetCommand::InitializeFromExternalSpec) + .await + .map_err(|_| MigrateError::StateDriverChannelClosed)?; + + let MigrateTargetResponse::VmObjectsInitialized(vm_objects) = self + .response_rx + .recv() + .await + .ok_or(MigrateError::StateDriverChannelClosed)?; + + self.vm_objects = Some(vm_objects); + if let Err(e) = preamble.is_migration_compatible( + self.vm_objects.as_ref().unwrap().instance_spec(), + ) { error!( self.log(), "Source and destination instance specs incompatible: {}", e @@ -315,27 +351,23 @@ impl DestinationProtocol { info!(self.log(), "Devices: {devices:#?}"); - { - let objects = self.vm.objects().await; - let machine = objects.machine(); - let migrate_ctx = - MigrateCtx { mem: &machine.acc_mem.access().unwrap() }; - for device in devices { - info!( - self.log(), - "Applying state to device {}", device.instance_name - ); + let objects = self.vm_objects.as_ref().unwrap(); + let migrate_ctx = + MigrateCtx { mem: &objects.machine().acc_mem.access().unwrap() }; + for device in devices { + info!( + self.log(), + "Applying state to device {}", device.instance_name + ); - let target = objects - .device_by_name(&device.instance_name) - .ok_or_else(|| { - MigrateError::UnknownDevice( - device.instance_name.clone(), - ) - })?; - self.import_device(&target, &device, &migrate_ctx)?; - } + let target = objects + .device_by_name(&device.instance_name) + .ok_or_else(|| { + MigrateError::UnknownDevice(device.instance_name.clone()) + })?; + self.import_device(&target, &device, &migrate_ctx)?; } + self.send_msg(codec::Message::Okay).await } @@ -368,7 +400,7 @@ impl DestinationProtocol { // Take a snapshot of the host hrtime/wall clock time, then adjust // time data appropriately. - let vmm_hdl = &self.vm.objects().await.machine().hdl.clone(); + let vmm_hdl = &self.vm_objects.as_ref().unwrap().machine().hdl.clone(); let (dst_hrt, dst_wc) = vmm::time::host_time_snapshot(vmm_hdl) .map_err(|e| { MigrateError::TimeData(format!( @@ -561,9 +593,9 @@ impl DestinationProtocol { } }; - self.vm - .objects() - .await + self.vm_objects + .as_ref() + .unwrap() .com1() .import(&com1_history) .await @@ -583,6 +615,7 @@ impl DestinationProtocol { // Now that control is definitely being transferred, publish that the // migration has succeeded. 
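+        // Drop this task's clone of the VM objects before publishing: once
+        // the migration task exits, the state driver expects to reclaim
+        // exclusive ownership of the objects via `Arc::try_unwrap`.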
+        drop(self.vm_objects.take());
         self.update_state(MigrationState::Finish).await;
         Ok(())
     }
@@ -638,7 +671,7 @@ impl DestinationProtocol {
         addr: GuestAddr,
         buf: &[u8],
     ) -> Result<(), MigrateError> {
-        let objects = self.vm.objects().await;
+        let objects = self.vm_objects.as_ref().unwrap();
         let memctx = objects.machine().acc_mem.access().unwrap();
         let len = buf.len();
         memctx.write_from(addr, buf, len);
diff --git a/bin/propolis-server/src/lib/migrate/mod.rs b/bin/propolis-server/src/lib/migrate/mod.rs
index c0f2cbd53..3b7dbcd35 100644
--- a/bin/propolis-server/src/lib/migrate/mod.rs
+++ b/bin/propolis-server/src/lib/migrate/mod.rs
@@ -144,6 +144,14 @@ pub enum MigrateError {
     /// The other end of the migration ran into an error
     #[error("{0:?} migration instance encountered error: {1}")]
     RemoteError(MigrateRole, String),
+
+    /// Sending/receiving from the VM state driver command/response channels
+    /// returned an error.
+    #[error("unable to communicate with VM state driver")]
+    StateDriverChannelClosed,
+
+    #[error("request to VM state driver returned failure")]
+    StateDriverResponseFailed,
 }
 
 impl From for MigrateError {
@@ -181,7 +189,9 @@ impl From<MigrateError> for HttpError {
             | MigrateError::TimeData(_)
             | MigrateError::DeviceState(_)
             | MigrateError::RemoteError(_, _)
-            | MigrateError::StateMachine(_) => {
+            | MigrateError::StateMachine(_)
+            | MigrateError::StateDriverChannelClosed
+            | MigrateError::StateDriverResponseFailed => {
                 HttpError::for_internal_error(msg)
             }
             MigrateError::MigrationAlreadyInProgress
@@ -313,8 +323,9 @@ pub(crate) struct DestinationContext<
 /// Once we've successfully established the connection, we can begin the
 /// migration process (destination-side).
 pub(crate) async fn dest_initiate(
-    rqctx: &RequestContext<Arc<DropshotEndpointContext>>,
+    log: &slog::Logger,
     migrate_info: api::InstanceMigrateInitiateRequest,
+    local_server_addr: SocketAddr,
 ) -> Result<
     DestinationContext<
         tokio_tungstenite::MaybeTlsStream<tokio::net::TcpStream>,
     >,
     MigrateError,
> {
     let migration_id = migrate_info.migration_id;
 
     // Create a new log context for the migration
-    let log = rqctx.log.new(o!(
+    let log = log.new(o!(
         "migration_id" => migration_id.to_string(),
         "migrate_role" => "destination",
         "migrate_src_addr" => migrate_info.src_addr
@@ -385,12 +396,11 @@ pub(crate) async fn dest_initiate(
             return Err(MigrateError::Initiate);
         }
     };
-    let local_addr = rqctx.server.local_addr;
 
     Ok(DestinationContext {
         migration_id,
         conn,
-        local_addr,
+        local_addr: local_server_addr,
         protocol: selected,
     })
 }
diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs
index 14c8b1c18..2352732da 100644
--- a/bin/propolis-server/src/lib/server.rs
+++ b/bin/propolis-server/src/lib/server.rs
@@ -251,6 +251,7 @@ async fn instance_ensure_common(
         oximeter_registry,
         nexus_client,
         vnc_server: server_context.vnc_server.clone(),
+        local_server_addr: rqctx.server.local_addr,
     };
 
     server_context
diff --git a/bin/propolis-server/src/lib/vm/migrate_commands.rs b/bin/propolis-server/src/lib/vm/migrate_commands.rs
index 280ad4036..63c4ecb7e 100644
--- a/bin/propolis-server/src/lib/vm/migrate_commands.rs
+++ b/bin/propolis-server/src/lib/vm/migrate_commands.rs
@@ -5,16 +5,27 @@
 //! Requests and responses between the VM state driver and the live migration
 //! protocol.
 
+use std::sync::Arc;
+
 use crate::migrate::MigrateError;
 
 /// A message sent from a live migration destination task to update the
 /// externally visible state of the migration attempt.
#[derive(Clone, Copy, Debug)] pub enum MigrateTargetCommand { + /// Initialize VM objects using the instance spec supplied to the state + /// driver by its creator. + InitializeFromExternalSpec, + /// Update the externally-visible migration state. UpdateState(propolis_api_types::MigrationState), } +#[derive(Clone)] +pub enum MigrateTargetResponse { + VmObjectsInitialized(Arc), +} + /// A message sent from a live migration driver to the state worker, asking it /// to act on source instance components on the task's behalf. #[derive(Clone, Copy, Debug)] @@ -49,7 +60,7 @@ pub enum MigrateSourceResponse { /// An event raised by a migration task that must be handled by the state /// worker. #[derive(Debug)] -enum MigrateTaskEvent { +pub(super) enum MigrateTaskEvent { /// The task completed with the associated result. TaskExited(Result<(), MigrateError>), diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index c926b2a15..16aaf8d5c 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -7,6 +7,7 @@ use std::{ collections::BTreeMap, + net::SocketAddr, sync::{Arc, RwLock}, }; @@ -22,6 +23,7 @@ use propolis_api_types::{ use request_queue::ExternalRequest; use rfb::server::VncServer; use slog::info; +use state_publisher::{ExternalStateUpdate, StatePublisher}; use uuid::Uuid; use crate::{ @@ -34,6 +36,7 @@ pub(crate) mod migrate_commands; mod request_queue; mod services; mod state_driver; +mod state_publisher; pub(crate) type LifecycleMap = BTreeMap>; @@ -87,7 +90,7 @@ pub(crate) struct Vm { struct VmInner { state: VmState, - driver: Option>, + driver: Option>, } pub(crate) struct VmObjects { @@ -259,18 +262,10 @@ impl Drop for ActiveVm { drop(objects); services.stop(&log).await; - let tx = driver.await.expect("state driver shouldn't panic"); - let new_state = { - let old_state = tx.borrow(); - InstanceStateMonitorResponse { - gen: old_state.gen + 1, - state: propolis_api_types::InstanceState::Destroyed, - migration: old_state.migration.clone(), - } - }; - - tx.send(new_state).expect("VM in rundown should hold a receiver"); - + let mut tx = driver.await.expect("state driver shouldn't panic"); + tx.update(ExternalStateUpdate::Instance( + propolis_api_types::InstanceState::Destroyed, + )); parent.complete_rundown().await; }); } @@ -320,6 +315,7 @@ pub(super) struct EnsureOptions { pub oximeter_registry: Option, pub nexus_client: Option, pub vnc_server: Arc>, + pub local_server_addr: SocketAddr, } impl Vm { diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index a2f80a979..b52572429 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -16,7 +16,7 @@ use propolis_api_types::{ }, InstanceProperties, InstanceState, }; -use slog::{debug, error, info, trace, warn}; +use slog::{error, info}; use uuid::Uuid; use crate::{ @@ -25,52 +25,20 @@ use crate::{ }, migrate::MigrateRole, vcpu_tasks::VcpuTaskController, + vm::{ + migrate_commands::MigrateTargetCommand, + state_publisher::ExternalStateUpdate, + }, }; use super::{ guest_event::{self, GuestEvent}, + migrate_commands::{MigrateTargetResponse, MigrateTaskEvent}, request_queue::ExternalRequest, - InstanceStateTx, VmError, VmObjects, + state_publisher::{MigrationStateUpdate, StatePublisher}, + VmError, VmObjects, }; -struct MigrationStateUpdate { - state: propolis_api_types::MigrationState, - id: Uuid, - role: MigrateRole, -} - -impl MigrationStateUpdate { - fn 
apply_to( - self, - old: propolis_api_types::InstanceMigrateStatusResponse, - ) -> propolis_api_types::InstanceMigrateStatusResponse { - let new = propolis_api_types::InstanceMigrationStatus { - id: self.id, - state: self.state, - }; - match self.role { - MigrateRole::Destination => { - propolis_api_types::InstanceMigrateStatusResponse { - migration_in: Some(new), - migration_out: old.migration_out, - } - } - MigrateRole::Source => { - propolis_api_types::InstanceMigrateStatusResponse { - migration_in: old.migration_in, - migration_out: Some(new), - } - } - } - } -} - -enum ExternalStateUpdate { - Instance(InstanceState), - Migration(MigrationStateUpdate), - Complete(InstanceState, MigrationStateUpdate), -} - #[derive(Debug, PartialEq, Eq)] enum HandleEventOutcome { Continue, @@ -225,7 +193,7 @@ struct StateDriver { parent: Arc, active_vm: Arc, input_queue: Arc, - external_state_tx: super::InstanceStateTx, + external_state: StatePublisher, paused: bool, vcpu_tasks: Box, migration_src_state: crate::migrate::source::PersistentState, @@ -239,8 +207,9 @@ pub(super) async fn run_state_driver( Result, >, ensure_options: super::EnsureOptions, -) -> InstanceStateTx { - let (external_tx, external_rx) = tokio::sync::watch::channel( +) -> StatePublisher { + let (mut external_publisher, external_rx) = StatePublisher::new( + &log, propolis_api_types::InstanceStateMonitorResponse { gen: 1, state: if ensure_request.migrate.is_some() { @@ -264,11 +233,10 @@ pub(super) async fn run_state_driver( log.new(slog::o!("component" => "request_queue")), )); - let (vcpu_tasks, active_vm) = match ensure_request.migrate { + let migrated_in = ensure_request.migrate.is_some(); + let (vm_objects, vcpu_tasks) = match match ensure_request.migrate { None => { - trace!(log, "starting VM initialization"); - - let (vm_objects, vcpu_tasks) = match initialize_vm_from_spec( + initialize_vm_from_spec( &log, &input_queue, &ensure_request.properties, @@ -276,79 +244,88 @@ pub(super) async fn run_state_driver( &ensure_options, ) .await - { - Ok(objects) => objects, - Err(e) => { - let _ = ensure_result_tx - .send(Err(VmError::InitializationFailed(e))); - return external_tx; - } - }; - - trace!(log, "initialized VM objects"); - - let services = super::services::VmServices::new( + } + Some(migrate_request) => { + migrate_as_target( &log, - &vm, - &vm_objects, + &input_queue, &ensure_request.properties, + &ensure_request.instance_spec, &ensure_options, + migrate_request, + &mut external_publisher, ) - .await; - - trace!(log, "initialized VM services"); - - let active_vm = Arc::new(super::ActiveVm { - parent: vm.clone(), - log: log.clone(), - state_driver_queue: input_queue.clone(), - external_state_rx: external_rx, - properties: ensure_request.properties, - objects: Some(tokio::sync::RwLock::new(vm_objects)), - services: Some(services), - }); - - // All the VM components now exist, so allow external callers to - // interact with the VM. - // - // Order matters here: once the ensure result is sent, an external - // caller needs to observe that an active VM is present. 
- vm.make_active(active_vm.clone()); - let _ = ensure_result_tx.send(Ok( - propolis_api_types::InstanceEnsureResponse { migrate: None }, - )); - - trace!(log, "made VM active"); - - (vcpu_tasks, active_vm) + .await + } + } { + Ok(objects) => objects, + Err(e) => { + let _ = + ensure_result_tx.send(Err(VmError::InitializationFailed(e))); + return external_publisher; } - Some(_migrate) => todo!("gjc"), }; + let services = super::services::VmServices::new( + &log, + &vm, + &vm_objects, + &ensure_request.properties, + &ensure_options, + ) + .await; + + let active_vm = Arc::new(super::ActiveVm { + parent: vm.clone(), + log: log.clone(), + state_driver_queue: input_queue.clone(), + external_state_rx: external_rx, + properties: ensure_request.properties, + objects: Some(tokio::sync::RwLock::new(vm_objects)), + services: Some(services), + }); + + // All the VM components now exist, so allow external callers to + // interact with the VM. + // + // Order matters here: once the ensure result is sent, an external + // caller needs to observe that an active VM is present. + vm.make_active(active_vm.clone()); + let _ = ensure_result_tx + .send(Ok(propolis_api_types::InstanceEnsureResponse { migrate: None })); + let state_driver = StateDriver { log, parent: vm.clone(), active_vm, input_queue, - external_state_tx: external_tx, + external_state: external_publisher, paused: false, vcpu_tasks, migration_src_state: Default::default(), }; - state_driver.run().await + state_driver.run(migrated_in).await } impl StateDriver { - pub(super) async fn run(mut self) -> super::InstanceStateTx { - self.run_loop().await; + pub(super) async fn run(mut self, migrated_in: bool) -> StatePublisher { + info!(self.log, "state driver launched"); + + if migrated_in { + if self.start_vm(VmStartReason::MigratedIn).await.is_ok() { + self.run_loop().await; + } + } else { + self.run_loop().await; + } + self.parent.set_rundown().await; - self.external_state_tx + self.external_state } async fn run_loop(&mut self) { - info!(self.log, "state driver launched"); - + info!(self.log, "state driver entered main loop"); loop { let event = self.input_queue.wait_for_next_event(); info!(self.log, "state driver handling event"; "event" => ?event); @@ -486,9 +463,8 @@ impl StateDriver { async fn do_reboot(&mut self) { info!(self.log, "resetting instance"); - self.update_external_state(ExternalStateUpdate::Instance( - InstanceState::Rebooting, - )); + self.external_state + .update(ExternalStateUpdate::Instance(InstanceState::Rebooting)); { let (vm_objects, vcpu_tasks) = self.vm_objects_and_cpus().await; @@ -516,16 +492,14 @@ impl StateDriver { self.input_queue.notify_instance_state_change( super::request_queue::InstanceStateChange::Rebooted, ); - self.update_external_state(ExternalStateUpdate::Instance( - InstanceState::Running, - )); + self.external_state + .update(ExternalStateUpdate::Instance(InstanceState::Running)); } async fn do_halt(&mut self) { info!(self.log, "stopping instance"); - self.update_external_state(ExternalStateUpdate::Instance( - InstanceState::Stopping, - )); + self.external_state + .update(ExternalStateUpdate::Instance(InstanceState::Stopping)); // Entities expect to be paused before being halted. 
Note that the VM
        // may be paused already if it is being torn down after a successful
        // migration out.
@@ -579,42 +553,7 @@ impl StateDriver {
         };
 
         self.input_queue.notify_instance_state_change(change);
-        self.update_external_state(ExternalStateUpdate::Instance(state));
+        self.external_state.update(ExternalStateUpdate::Instance(state));
     }
 
-    fn update_external_state(&mut self, state: ExternalStateUpdate) {
-        let (instance_state, migration_state) = match state {
-            ExternalStateUpdate::Instance(i) => (Some(i), None),
-            ExternalStateUpdate::Migration(m) => (None, Some(m)),
-            ExternalStateUpdate::Complete(i, m) => (Some(i), Some(m)),
-        };
-
-        let propolis_api_types::InstanceStateMonitorResponse {
-            state: old_instance,
-            migration: old_migration,
-            gen: old_gen,
-        } = self.external_state_tx.borrow().clone();
-
-        let state = instance_state.unwrap_or(old_instance);
-        let migration = if let Some(migration_state) = migration_state {
-            migration_state.apply_to(old_migration)
-        } else {
-            old_migration
-        };
-
-        let gen = old_gen + 1;
-        info!(self.log, "publishing new instance state";
-              "gen" => gen,
-              "state" => ?state,
-              "migration" => ?migration);
-
-        let _ = self.external_state_tx.send(
-            propolis_api_types::InstanceStateMonitorResponse {
-                gen,
-                state,
-                migration,
-            },
-        );
-    }
 
     async fn vm_objects(&self) -> tokio::sync::RwLockReadGuard<'_, VmObjects> {
@@ -803,3 +742,163 @@ async fn initialize_vm_from_spec(
         vcpu_tasks as Box<dyn VcpuTaskController>,
     ))
 }
+
+async fn migrate_as_target(
+    log: &slog::Logger,
+    event_queue: &Arc<InputQueue>,
+    properties: &InstanceProperties,
+    spec: &VersionedInstanceSpec,
+    options: &super::EnsureOptions,
+    api_request: propolis_api_types::InstanceMigrateInitiateRequest,
+    external_state: &mut StatePublisher,
+) -> anyhow::Result<(VmObjects, Box<dyn VcpuTaskController>)> {
+    // Use the information in the supplied migration request to connect to the
+    // migration source and negotiate the protocol version to use.
+    let migrate_ctx = crate::migrate::dest_initiate(
+        log,
+        api_request,
+        options.local_server_addr,
+    )
+    .await?;
+
+    // Spin up a task to run the migration protocol proper. To avoid sending the
+    // entire VM context over to the migration task, create command and response
+    // channels to allow the migration task to delegate work back to this
+    // routine.
+    let log_for_task = log.clone();
+    let (command_tx, mut command_rx) = tokio::sync::mpsc::channel(1);
+    let (response_tx, response_rx) = tokio::sync::mpsc::channel(1);
+    let mut migrate_task = tokio::spawn(async move {
+        crate::migrate::destination::migrate(
+            &log_for_task,
+            command_tx,
+            response_rx,
+            migrate_ctx.conn,
+            migrate_ctx.local_addr,
+            migrate_ctx.protocol,
+        )
+        .await
+    });
+
+    // Migration cannot proceed (in any protocol version) until the target
+    // kernel VMM and Propolis components have been set up. The first command
+    // from the migration task should be a request to set up these components.
+    let init_command = command_rx.recv().await.ok_or_else(|| {
+        anyhow::anyhow!("migration task unexpectedly closed channel")
+    })?;
+
+    // TODO(#706) The only extant protocol version (V0 with RON encoding)
+    // assumes that migration targets get an instance spec from the caller of
+    // the `instance_ensure` API, that the target VM will be initialized from
+    // this spec, and that device state will be imported in a later migration
+    // phase. There are other ways to approach this problem:
+    //
+    // - This task can initialize a VM using the *source's* instance spec
+    //   (possibly with amended configuration supplied via the API).
+ // - This task can initialize components using device state payload + // forwarded on from the migration task. + // + // For now, initialize the target VM in the conventional way. + let MigrateTargetCommand::InitializeFromExternalSpec = init_command else { + anyhow::bail!("migration protocol didn't first ask to init objects"); + }; + + let (vm_objects, mut vcpu_tasks) = + initialize_vm_from_spec(log, event_queue, properties, spec, options) + .await?; + + // The migration task imports device state by operating directly on the + // newly-created VM objects. Before sending them to the task and allowing + // migration to continue, prepare the VM's vCPUs and objects to have state + // migrated into them. + // + // Ensure the VM's vCPUs are activated properly so that they can enter the + // guest after migration. Do this before allowing the migration task to + // continue so that reset doesn't overwrite any state written by migration. + // + // Pause the kernel VM so that emulated device state can be imported + // consistently. + reset_vcpus(&vm_objects, vcpu_tasks.as_mut()); + vm_objects.pause_vm(); + + // Everything is ready, so send a reference to the newly-created VM to the + // migration task. When the task exits, it drops this reference, allowing + // this task to reclaim an owned `VmObjects` from the `Arc` wrapper. + let vm_objects = Arc::new(vm_objects); + if response_tx + .send(MigrateTargetResponse::VmObjectsInitialized(vm_objects.clone())) + .await + .is_err() + { + vm_objects.resume_vm(); + anyhow::bail!("migration task unexpectedly closed channel"); + } + + loop { + let action = + next_migrate_task_event(&mut migrate_task, &mut command_rx, log) + .await; + + match action { + MigrateTaskEvent::TaskExited(res) => match res { + Ok(()) => { + let Ok(vm_objects) = Arc::try_unwrap(vm_objects) else { + panic!( + "migration task should have dropped its VM objects", + ); + }; + + return Ok((vm_objects, vcpu_tasks)); + } + Err(e) => { + vm_objects.resume_vm(); + return Err(e.into()); + } + }, + MigrateTaskEvent::Command(MigrateTargetCommand::UpdateState( + state, + )) => { + external_state.update(ExternalStateUpdate::Migration( + MigrationStateUpdate { + state, + id: migrate_ctx.migration_id, + role: MigrateRole::Destination, + }, + )); + } + MigrateTaskEvent::Command( + MigrateTargetCommand::InitializeFromExternalSpec, + ) => { + panic!("already received initialize-from-spec command"); + } + } + } +} + +async fn next_migrate_task_event( + task: &mut tokio::task::JoinHandle< + Result<(), crate::migrate::MigrateError>, + >, + command_rx: &mut tokio::sync::mpsc::Receiver, + log: &slog::Logger, +) -> MigrateTaskEvent { + if let Some(cmd) = command_rx.recv().await { + return MigrateTaskEvent::Command(cmd); + } + + // The sender side of the command channel is dropped, which means the + // migration task is exiting. Wait for it to finish and snag its result. 
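+    // (A cancelled or panicked migration task is treated as a programming
+    // error: both join-error cases below panic instead of being mapped to a
+    // migration failure.)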
+ match task.await { + Ok(res) => { + info!(log, "Migration source task exited: {:?}", res); + MigrateTaskEvent::TaskExited(res) + } + Err(join_err) => { + if join_err.is_cancelled() { + panic!("Migration task canceled"); + } else { + panic!("Migration task panicked: {:?}", join_err.into_panic()); + } + } + } +} diff --git a/bin/propolis-server/src/lib/vm/state_publisher.rs b/bin/propolis-server/src/lib/vm/state_publisher.rs new file mode 100644 index 000000000..319ea9bf1 --- /dev/null +++ b/bin/propolis-server/src/lib/vm/state_publisher.rs @@ -0,0 +1,97 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helper types for publishing instance states as made visible through the +//! external API. + +use propolis_api_types::{ + InstanceMigrateStatusResponse, InstanceMigrationStatus, InstanceState, + InstanceStateMonitorResponse, +}; +use slog::info; +use uuid::Uuid; + +use crate::migrate::MigrateRole; + +use super::{InstanceStateRx, InstanceStateTx}; + +pub(super) struct MigrationStateUpdate { + pub state: propolis_api_types::MigrationState, + pub id: Uuid, + pub role: MigrateRole, +} + +impl MigrationStateUpdate { + fn apply_to( + self, + old: InstanceMigrateStatusResponse, + ) -> InstanceMigrateStatusResponse { + let new = InstanceMigrationStatus { id: self.id, state: self.state }; + match self.role { + MigrateRole::Destination => InstanceMigrateStatusResponse { + migration_in: Some(new), + migration_out: old.migration_out, + }, + MigrateRole::Source => InstanceMigrateStatusResponse { + migration_in: old.migration_in, + migration_out: Some(new), + }, + } + } +} + +pub(super) enum ExternalStateUpdate { + Instance(InstanceState), + Migration(MigrationStateUpdate), + Complete(InstanceState, MigrationStateUpdate), +} + +pub(super) struct StatePublisher { + tx: InstanceStateTx, + log: slog::Logger, +} + +impl StatePublisher { + pub(super) fn new( + log: &slog::Logger, + initial_state: InstanceStateMonitorResponse, + ) -> (Self, InstanceStateRx) { + let (tx, rx) = tokio::sync::watch::channel(initial_state); + (Self { tx, log: log.clone() }, rx) + } + + pub(super) fn update(&mut self, update: ExternalStateUpdate) { + let (instance_state, migration_state) = match update { + ExternalStateUpdate::Instance(i) => (Some(i), None), + ExternalStateUpdate::Migration(m) => (None, Some(m)), + ExternalStateUpdate::Complete(i, m) => (Some(i), Some(m)), + }; + + let InstanceStateMonitorResponse { + state: old_instance, + migration: old_migration, + gen: old_gen, + } = self.tx.borrow().clone(); + + let state = instance_state.unwrap_or(old_instance); + let migration = if let Some(migration_state) = migration_state { + migration_state.apply_to(old_migration) + } else { + old_migration + }; + + let gen = old_gen + 1; + info!(self.log, "publishing new instance state"; + "gen" => gen, + "state" => ?state, + "migration" => ?migration); + + let _ = + self.tx.send(propolis_api_types::InstanceStateMonitorResponse { + gen, + state, + migration, + }); + } +} From 2b4767f9f06c2cf936c88db7b82a66b3a46d945d Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 26 Jun 2024 20:25:19 +0000 Subject: [PATCH 20/55] [WIP] hook up source side of migration --- bin/propolis-server/src/lib/migrate/mod.rs | 31 ++--- bin/propolis-server/src/lib/migrate/source.rs | 48 ++++++- bin/propolis-server/src/lib/server.rs | 13 +- bin/propolis-server/src/lib/vm/mod.rs | 25 ++-- 
.../src/lib/vm/request_queue.rs | 103 +++++++-------- .../src/lib/vm/state_driver.rs | 125 +++++++++++++++++- 6 files changed, 243 insertions(+), 102 deletions(-) diff --git a/bin/propolis-server/src/lib/migrate/mod.rs b/bin/propolis-server/src/lib/migrate/mod.rs index 3b7dbcd35..45933c8b5 100644 --- a/bin/propolis-server/src/lib/migrate/mod.rs +++ b/bin/propolis-server/src/lib/migrate/mod.rs @@ -3,10 +3,9 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use std::net::SocketAddr; -use std::sync::Arc; use bit_field::BitField; -use dropshot::{HttpError, RequestContext}; +use dropshot::HttpError; use futures::{SinkExt, StreamExt}; use propolis::migrate::MigrateStateError; use propolis_api_types::{self as api, MigrationState}; @@ -19,8 +18,6 @@ use tokio_tungstenite::tungstenite::protocol::CloseFrame; use tokio_tungstenite::{tungstenite, WebSocketStream}; use uuid::Uuid; -use crate::server::DropshotEndpointContext; - mod codec; pub mod destination; mod memx; @@ -35,7 +32,7 @@ pub enum MigrateRole { } // N.B. Keep in sync with scripts/live-migration-times.d. -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] enum MigratePhase { MigrateSync, Pause, @@ -226,30 +223,30 @@ struct DevicePayload { pub data: String, } +pub(crate) struct SourceContext< + T: AsyncRead + AsyncWrite + Unpin + Send + 'static, +> { + pub conn: WebSocketStream, + pub protocol: crate::migrate::protocol::Protocol, +} + /// Begin the migration process (source-side). /// /// This will check protocol version and then begin the migration in a separate task. pub async fn source_start< T: AsyncRead + AsyncWrite + Unpin + Send + 'static, >( - rqctx: RequestContext>, + log: &slog::Logger, migration_id: Uuid, mut conn: WebSocketStream, -) -> Result<(), MigrateError> { +) -> Result, MigrateError> { // Create a new log context for the migration - let log = rqctx.log.new(o!( + let log = log.new(o!( "migration_id" => migration_id.to_string(), "migrate_role" => "source" )); info!(log, "Migration Source"); - let active_vm = rqctx - .context() - .vm - .active_vm() - .ok_or_else(|| MigrateError::InstanceNotInitialized)? - .clone(); - let selected = match conn.next().await { Some(Ok(tungstenite::Message::Text(dst_protocols))) => { info!(log, "destination offered protocols: {}", dst_protocols); @@ -302,9 +299,7 @@ pub async fn source_start< } }; - todo!("gjc"); // need a method on ActiveVm for this - // controller.request_migration_from(migration_id, conn, selected)?; - Ok(()) + Ok(SourceContext { conn, protocol: selected }) } pub(crate) struct DestinationContext< diff --git a/bin/propolis-server/src/lib/migrate/source.rs b/bin/propolis-server/src/lib/migrate/source.rs index c0435a704..bf4ce4fec 100644 --- a/bin/propolis-server/src/lib/migrate/source.rs +++ b/bin/propolis-server/src/lib/migrate/source.rs @@ -163,10 +163,12 @@ pub async fn migrate( // phase. // // Record that now so we never try to do this again. - // - // TODO(gjc) - // proto.vm.migration_src_state().has_redirtying_ever_failed = - // true; + proto + .command_tx + .send(MigrateSourceCommand::RedirtyingFailed) + .await + .map_err(|_| MigrateError::StateDriverChannelClosed)?; + error!( proto.log(), "failed to restore dirty bits: {e}"; @@ -197,7 +199,7 @@ pub(crate) struct PersistentState { /// migration attempt. If this occurs, we can no longer offer only dirty /// pages in a subsequent migration attempt, as some pages which should be /// marked as dirty may not be. 
- has_redirtying_ever_failed: bool, + pub(crate) has_redirtying_ever_failed: bool, } struct SourceProtocol { @@ -362,11 +364,43 @@ impl SourceProtocol { vmm_ram_range ); - // Determine whether we can offer only dirty pages, or if we must offer - // all pages. + // In the pre-pause phase, it is safe to offer only dirty pages if (1) + // there is some prospect of being able to restore the kernel dirty page + // bitmap if migration fails, and (2) a prior attempt to restore the + // bitmap hasn't failed (thereby rendering the bitmap's contents + // untrustworthy). The first prong was checked when the protocol + // started, but the second prong requires input from the VM state + // driver. If this routine is being called from the pre-pause phase, and + // the dirty page map looks viable, ask the state driver if it's OK to + // proceed with transmitting only dirty pages. // // Refer to the giant comment on `RamOfferDiscipline` above for more // details about this determination. + if *phase == MigratePhase::RamPushPrePause && self.dirt.is_some() { + self.command_tx + .send(MigrateSourceCommand::QueryRedirtyingFailed) + .await + .map_err(|_| MigrateError::StateDriverChannelClosed)?; + + let response = self + .response_rx + .recv() + .await + .ok_or(MigrateError::StateDriverChannelClosed)?; + + match response { + MigrateSourceResponse::RedirtyingFailed(has_failed) => { + if has_failed { + self.dirt = None; + } + } + _ => panic!( + "unexpected response {:?} to request for redirtying info", + response + ), + } + } + let offer_discipline = match phase { // If we are in the pre-pause RAM push phase, and we don't have // VM_NPT_OPERATION to put back any dirty bits if the migration diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index 2352732da..6e8ffd1b1 100644 --- a/bin/propolis-server/src/lib/server.rs +++ b/bin/propolis-server/src/lib/server.rs @@ -260,7 +260,7 @@ async fn instance_ensure_common( .await .map(HttpResponseCreated) .map_err(|e| match e { - VmError::EnsureResultClosed => HttpError::for_internal_error( + VmError::ResultChannelClosed => HttpError::for_internal_error( "state driver unexpectedly dropped result channel".to_string(), ), VmError::WaitingToInitialize @@ -544,15 +544,10 @@ async fn instance_migrate_start( path_params: Path, websock: WebsocketConnection, ) -> dropshot::WebsocketChannelResult { + let ctx = rqctx.context(); let migration_id = path_params.into_inner().migration_id; - let conn = WebSocketStream::from_raw_socket( - websock.into_inner(), - Role::Server, - None, - ) - .await; - crate::migrate::source_start(rqctx, migration_id, conn).await?; - Ok(()) + let vm = ctx.vm.active_vm().ok_or_else(not_created_error)?; + Ok(vm.request_migration_out(migration_id, websock).await?) 
}
 
 #[endpoint {
diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs
index 16aaf8d5c..f47567c9d 100644
--- a/bin/propolis-server/src/lib/vm/mod.rs
+++ b/bin/propolis-server/src/lib/vm/mod.rs
@@ -18,7 +18,7 @@ use propolis::{
 };
 use propolis_api_types::{
     instance_spec::{v0::InstanceSpecV0, VersionedInstanceSpec},
-    InstanceProperties, InstanceStateMonitorResponse, InstanceStateRequested,
+    InstanceProperties, InstanceStateRequested,
 };
 use request_queue::ExternalRequest;
 use rfb::server::VncServer;
 use slog::info;
@@ -59,8 +59,8 @@ pub(crate) type CrucibleReplaceResultTx =
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum VmError {
-    #[error("VM ensure result channel unexpectedly closed")]
-    EnsureResultClosed,
+    #[error("VM operation result channel unexpectedly closed")]
+    ResultChannelClosed,
 
     #[error("VM not created")]
     NotCreated,
@@ -121,10 +121,6 @@ impl VmObjects {
         self.lifecycle_components.get(name).cloned()
     }
 
-    pub(crate) fn block_backends(&self) -> &BlockBackendMap {
-        &self.block_backends
-    }
-
     pub(crate) fn crucible_backends(&self) -> &CrucibleBackendMap {
         &self.crucible_backends
     }
@@ -211,6 +207,19 @@ impl ActiveVm {
             .map_err(Into::into)
     }
 
+    pub(crate) async fn request_migration_out(
+        &self,
+        migration_id: Uuid,
+        websock: dropshot::WebsocketConnection,
+    ) -> Result<(), VmError> {
+        Ok(self.state_driver_queue.queue_external_request(
+            ExternalRequest::MigrateAsSource {
+                migration_id,
+                websock: websock.into(),
+            },
+        )?)
+    }
+
     pub(crate) fn reconfigure_crucible_volume(
         &self,
         disk_name: String,
@@ -452,6 +461,6 @@ impl Vm {
             }));
         }
 
-        ensure_rx.await.map_err(|_| VmError::EnsureResultClosed)?
+        ensure_rx.await.map_err(|_| VmError::ResultChannelClosed)?
     }
 }
diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs
index 076709f9a..08260f29c 100644
--- a/bin/propolis-server/src/lib/vm/request_queue.rs
+++ b/bin/propolis-server/src/lib/vm/request_queue.rs
@@ -27,33 +27,40 @@ use slog::{debug, info, Logger};
 use thiserror::Error;
 use uuid::Uuid;
 
-use crate::migrate::MigrateError;
+/// Wraps a [`dropshot::WebsocketConnection`] for inclusion in an
+/// [`ExternalRequest`].
+//
+// A newtype is used here to allow this module's tests (which want to verify
+// queuing dispositions and don't care about request contents) to construct a
+// `MigrateAsSource` request without having to conjure up a real websocket
+// connection.
+pub(crate) struct WebsocketConnection(Option<dropshot::WebsocketConnection>);
+
+impl From<dropshot::WebsocketConnection> for WebsocketConnection {
+    fn from(value: dropshot::WebsocketConnection) -> Self {
+        Self(Some(value))
+    }
+}
 
-use super::migrate_commands::{MigrateSourceCommand, MigrateSourceResponse};
+impl WebsocketConnection {
+    /// Yields the wrapped [`dropshot::WebsocketConnection`].
+    pub(crate) fn into_inner(self) -> dropshot::WebsocketConnection {
+        // Unwrapping is safe here because the only way an external consumer can
+        // get an instance of this wrapper is to use the From impl, which always
+        // wraps a `Some`.
+        self.0.unwrap()
+    }
+}
 
 /// An external request made of a VM controller via the server API. Handled by
 /// the controller's state driver thread.
-#[derive(Debug)]
 pub enum ExternalRequest {
     Start,
 
     /// Asks the state worker to start a migration-source task.
     MigrateAsSource {
-        /// The ID of the live migration for which this VM will be the source.
         migration_id: Uuid,
-
-        /// A handle to the task that will execute the migration procedure.
- task: tokio::task::JoinHandle>, - - /// The sender side of a one-shot channel that, when signaled, tells the - /// migration task to start its work. - start_tx: tokio::sync::oneshot::Sender<()>, - - /// A channel that receives commands from the migration task. - command_rx: tokio::sync::mpsc::Receiver, - - /// A channel used to send responses to migration commands. - response_tx: tokio::sync::mpsc::Sender, + websock: WebsocketConnection, }, /// Resets the guest by pausing all devices, resetting them to their @@ -87,6 +94,27 @@ pub enum ExternalRequest { }, } +impl std::fmt::Debug for ExternalRequest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Start => write!(f, "Start"), + Self::MigrateAsSource { migration_id, .. } => f + .debug_struct("MigrateAsSource") + .field("migration_id", migration_id) + .finish(), + Self::Reboot => write!(f, "Reboot"), + Self::Stop => write!(f, "Stop"), + Self::ReconfigureCrucibleVolume { + disk_name, backend_id, .. + } => f + .debug_struct("ReconfigureCrucibleVolume") + .field("disk_name", disk_name) + .field("backend_id", backend_id) + .finish(), + } + } +} + /// A set of reasons why a request to queue an external state transition can /// fail. #[derive(Copy, Clone, Debug, Error)] @@ -94,9 +122,6 @@ pub enum RequestDeniedReason { #[error("Operation requires an active instance")] InstanceNotActive, - #[error("Already migrating into this instance")] - MigrationTargetInProgress, - #[error("Instance is currently starting")] StartInProgress, @@ -240,31 +265,6 @@ impl ExternalRequestQueue { .get_new_dispositions(DispositionChangeReason::StateChange(state)); } - /// Indicates whether the queue would allow a request to migrate out of this - /// instance. This can be used to avoid setting up migration tasks for - /// requests that will ultimately be denied. - /// - /// # Return value - /// - /// - `Ok(true)` if the request will be queued. - /// - `Ok(false)` if the request is allowed for idempotency reasons but will - /// not be queued. - /// - `Err` if the request is forbidden. - pub fn migrate_as_source_will_enqueue( - &self, - ) -> Result { - assert!(!matches!( - self.allowed.migrate_as_source, - RequestDisposition::Ignore - )); - - match self.allowed.migrate_as_source { - RequestDisposition::Enqueue => Ok(true), - RequestDisposition::Ignore => unreachable!(), - RequestDisposition::Deny(reason) => Err(reason), - } - } - /// Computes a new set of queue dispositions given the current state of the /// queue and the event that is changing those dispositions. fn get_new_dispositions( @@ -400,16 +400,9 @@ mod test { } fn make_migrate_as_source_request() -> ExternalRequest { - let task = tokio::task::spawn(async { Ok(()) }); - let (start_tx, _) = tokio::sync::oneshot::channel(); - let (_, command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, _) = tokio::sync::mpsc::channel(1); ExternalRequest::MigrateAsSource { migration_id: Uuid::new_v4(), - task, - start_tx, - command_rx, - response_tx, + websock: WebsocketConnection(None), } } @@ -420,7 +413,6 @@ mod test { queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Requests to migrate out should be allowed. 
- assert!(queue.migrate_as_source_will_enqueue().unwrap()); assert!(queue.try_queue(make_migrate_as_source_request()).is_ok()); // Once the request is queued, other requests to migrate out are @@ -431,7 +423,6 @@ mod test { // is assumed), but requests to migrate out are issued by the target // Propolis (which does not assume idempotency and issues only one // request per migration attempt). - assert!(queue.migrate_as_source_will_enqueue().is_err()); assert!(queue.try_queue(make_migrate_as_source_request()).is_err()); // If migration fails, the instance resumes running, and then another @@ -441,14 +432,12 @@ mod test { Some(ExternalRequest::MigrateAsSource { .. }) )); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - assert!(queue.migrate_as_source_will_enqueue().unwrap()); assert!(queue.try_queue(make_migrate_as_source_request()).is_ok()); // A successful migration stops the instance, which forecloses on future // requests to migrate out. queue.pop_front(); queue.notify_instance_state_change(InstanceStateChange::Stopped); - assert!(queue.migrate_as_source_will_enqueue().is_err()); assert!(queue.try_queue(make_migrate_as_source_request()).is_err()); } diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index b52572429..a151a2dc9 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -14,7 +14,7 @@ use propolis_api_types::{ components::backends::CrucibleStorageBackend, v0::StorageBackendV0, VersionedInstanceSpec, }, - InstanceProperties, InstanceState, + InstanceProperties, InstanceState, MigrationState, }; use slog::{error, info}; use uuid::Uuid; @@ -33,7 +33,10 @@ use crate::{ use super::{ guest_event::{self, GuestEvent}, - migrate_commands::{MigrateTargetResponse, MigrateTaskEvent}, + migrate_commands::{ + MigrateSourceCommand, MigrateSourceResponse, MigrateTargetResponse, + MigrateTaskEvent, + }, request_queue::ExternalRequest, state_publisher::{MigrationStateUpdate, StatePublisher}, VmError, VmObjects, @@ -432,7 +435,16 @@ impl StateDriver { Err(_) => HandleEventOutcome::Exit, } } - ExternalRequest::MigrateAsSource { .. } => todo!("gjc"), + ExternalRequest::MigrateAsSource { migration_id, websock } => { + self.migrate_as_source(migration_id, websock.into_inner()) + .await; + + // The callee either queues its own stop request (on a + // successful migration out) or resumes the VM (on a failed + // migration out). Either way, the main loop can just proceed to + // process the queue as normal. + HandleEventOutcome::Continue + } ExternalRequest::Reboot => { self.do_reboot().await; HandleEventOutcome::Continue @@ -575,6 +587,113 @@ impl StateDriver { (self.active_vm.objects().await, self.vcpu_tasks.as_mut()) } + async fn migrate_as_source( + &mut self, + migration_id: Uuid, + websock: dropshot::WebsocketConnection, + ) { + let conn = tokio_tungstenite::WebSocketStream::from_raw_socket( + websock.into_inner(), + tokio_tungstenite::tungstenite::protocol::Role::Server, + None, + ) + .await; + + // Negotiate the migration protocol version with the target. + let Ok(migrate_ctx) = + crate::migrate::source_start(&self.log, migration_id, conn).await + else { + return; + }; + + // Publish that migration is in progress before actually launching the + // migration task. 
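+        // Updating the state first means that external observers see the
+        // Migrating state no later than any state changes the migration
+        // task itself produces.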
+ self.external_state.update(ExternalStateUpdate::Complete( + InstanceState::Migrating, + MigrationStateUpdate { + state: MigrationState::Sync, + id: migration_id, + role: MigrateRole::Source, + }, + )); + + let (command_tx, mut command_rx) = tokio::sync::mpsc::channel(1); + let (response_tx, response_rx) = tokio::sync::mpsc::channel(1); + let vm_for_task = self.active_vm.clone(); + let mut migrate_task = tokio::spawn(async move { + crate::migrate::source::migrate( + vm_for_task, + command_tx, + response_rx, + migrate_ctx.conn, + migrate_ctx.protocol, + ) + .await + }); + + loop { + match next_migrate_task_event( + &mut migrate_task, + &mut command_rx, + &self.log, + ) + .await + { + MigrateTaskEvent::TaskExited(res) => { + if res.is_ok() { + self.active_vm + .state_driver_queue + .queue_external_request(ExternalRequest::Stop) + .expect("can always queue a request to stop"); + } else { + if self.paused { + self.resume().await; + } + + self.publish_steady_state(InstanceState::Running); + } + } + + // N.B. When handling a command that requires a reply, do not + // return early if the reply fails to send. Instead, + // loop back around and let the `TaskExited` path restore + // the VM to the correct state. + MigrateTaskEvent::Command(cmd) => match cmd { + MigrateSourceCommand::UpdateState(state) => { + self.external_state.update( + ExternalStateUpdate::Migration( + MigrationStateUpdate { + id: migration_id, + state, + role: MigrateRole::Source, + }, + ), + ); + } + MigrateSourceCommand::Pause => { + self.pause().await; + let _ = response_tx + .send(MigrateSourceResponse::Pause(Ok(()))) + .await; + } + MigrateSourceCommand::QueryRedirtyingFailed => { + let has_failed = + self.migration_src_state.has_redirtying_ever_failed; + let _ = response_tx + .send(MigrateSourceResponse::RedirtyingFailed( + has_failed, + )) + .await; + } + MigrateSourceCommand::RedirtyingFailed => { + self.migration_src_state.has_redirtying_ever_failed = + true; + } + }, + } + } + } + async fn reconfigure_crucible_volume( &self, disk_name: String, From 96e867c6ed0dba7f95bd5ca0f22b21d5cbb20a8a Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 26 Jun 2024 22:06:54 +0000 Subject: [PATCH 21/55] [WIP] fix migration bugs --- .../src/lib/migrate/destination.rs | 38 +++--- bin/propolis-server/src/lib/migrate/mod.rs | 14 +-- bin/propolis-server/src/lib/migrate/source.rs | 18 +-- .../src/lib/vm/migrate_commands.rs | 2 +- .../src/lib/vm/state_driver.rs | 114 +++++++++++++----- 5 files changed, 121 insertions(+), 65 deletions(-) diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index a258681da..a9e68f28b 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -52,10 +52,10 @@ pub async fn migrate( }; if let Err(err) = proto.run().await { - err_tx + // If the + let _ = err_tx .send(MigrateTargetCommand::UpdateState(MigrationState::Error)) - .await - .unwrap(); + .await; // We encountered an error, try to inform the remote before bailing // Note, we don't use `?` here as this is a best effort and we don't @@ -179,6 +179,21 @@ impl DestinationProtocol { } async fn sync(&mut self) -> Result<(), MigrateError> { + self.command_tx + .send(MigrateTargetCommand::InitializeFromExternalSpec) + .await + .map_err(|_| MigrateError::StateDriverChannelClosed)?; + + let MigrateTargetResponse::VmObjectsInitialized(vm_objects) = self + .response_rx + .recv() + .await + 
.ok_or(MigrateError::StateDriverChannelClosed)?; + + let vm_objects = vm_objects + .map_err(MigrateError::TargetInstanceInitializationFailed)?; + + self.vm_objects = Some(vm_objects); self.update_state(MigrationState::Sync).await; let preamble: Preamble = match self.read_msg().await? { codec::Message::Serialized(s) => { @@ -194,18 +209,6 @@ impl DestinationProtocol { }?; info!(self.log(), "Destination read Preamble: {:?}", preamble); - self.command_tx - .send(MigrateTargetCommand::InitializeFromExternalSpec) - .await - .map_err(|_| MigrateError::StateDriverChannelClosed)?; - - let MigrateTargetResponse::VmObjectsInitialized(vm_objects) = self - .response_rx - .recv() - .await - .ok_or(MigrateError::StateDriverChannelClosed)?; - - self.vm_objects = Some(vm_objects); if let Err(e) = preamble.is_migration_compatible( self.vm_objects.as_ref().unwrap().instance_spec(), ) { @@ -632,7 +635,10 @@ impl DestinationProtocol { // If this is an error message, lift that out .map(|msg| match msg.try_into()? { codec::Message::Error(err) => { - error!(self.log(), "remote error: {err}"); + error!( + self.log(), + "migration failed due to error from source: {err}" + ); Err(MigrateError::RemoteError( MigrateRole::Source, err.to_string(), diff --git a/bin/propolis-server/src/lib/migrate/mod.rs b/bin/propolis-server/src/lib/migrate/mod.rs index 45933c8b5..ca460ed1e 100644 --- a/bin/propolis-server/src/lib/migrate/mod.rs +++ b/bin/propolis-server/src/lib/migrate/mod.rs @@ -87,8 +87,8 @@ pub enum MigrateError { UpgradeExpected, /// Attempted to migrate an uninitialized instance - #[error("instance is not initialized")] - InstanceNotInitialized, + #[error("failed to initialize the target VM: {0}")] + TargetInstanceInitializationFailed(String), /// The given UUID does not match the existing instance/migration UUID #[error("unexpected Uuid")] @@ -144,11 +144,8 @@ pub enum MigrateError { /// Sending/receiving from the VM state driver command/response channels /// returned an error. 
- #[error("unable to communiciate with VM state driver")] + #[error("VM state driver unexpectedly closed channel")] StateDriverChannelClosed, - - #[error("request to VM state driver returned failure")] - StateDriverResponseFailed, } impl From for MigrateError { @@ -177,7 +174,7 @@ impl From for HttpError { | MigrateError::Initiate | MigrateError::ProtocolParse(_, _) | MigrateError::NoMatchingProtocol(_, _) - | MigrateError::InstanceNotInitialized + | MigrateError::TargetInstanceInitializationFailed(_) | MigrateError::InvalidInstanceState | MigrateError::Codec(_) | MigrateError::UnexpectedMessage @@ -187,8 +184,7 @@ impl From for HttpError { | MigrateError::DeviceState(_) | MigrateError::RemoteError(_, _) | MigrateError::StateMachine(_) - | MigrateError::StateDriverChannelClosed - | MigrateError::StateDriverResponseFailed => { + | MigrateError::StateDriverChannelClosed => { HttpError::for_internal_error(msg) } MigrateError::MigrationAlreadyInProgress diff --git a/bin/propolis-server/src/lib/migrate/source.rs b/bin/propolis-server/src/lib/migrate/source.rs index bf4ce4fec..c6452b4ef 100644 --- a/bin/propolis-server/src/lib/migrate/source.rs +++ b/bin/propolis-server/src/lib/migrate/source.rs @@ -167,7 +167,8 @@ pub async fn migrate( .command_tx .send(MigrateSourceCommand::RedirtyingFailed) .await - .map_err(|_| MigrateError::StateDriverChannelClosed)?; + .unwrap(); + // .map_err(|_| MigrateError::StateDriverChannelClosed)?; error!( proto.log(), @@ -380,13 +381,11 @@ impl SourceProtocol { self.command_tx .send(MigrateSourceCommand::QueryRedirtyingFailed) .await - .map_err(|_| MigrateError::StateDriverChannelClosed)?; + .unwrap(); + // .map_err(|_| MigrateError::StateDriverChannelClosed)?; - let response = self - .response_rx - .recv() - .await - .ok_or(MigrateError::StateDriverChannelClosed)?; + let response = self.response_rx.recv().await.unwrap(); + // .ok_or(MigrateError::StateDriverChannelClosed)?; match response { MigrateSourceResponse::RedirtyingFailed(has_failed) => { @@ -749,7 +748,10 @@ impl SourceProtocol { // If this is an error message, lift that out .map(|msg| match msg { codec::Message::Error(err) => { - error!(self.log(), "remote error: {err}"); + error!( + self.log(), + "migration failed due to error from target: {err}" + ); Err(MigrateError::RemoteError( MigrateRole::Destination, err.to_string(), diff --git a/bin/propolis-server/src/lib/vm/migrate_commands.rs b/bin/propolis-server/src/lib/vm/migrate_commands.rs index 63c4ecb7e..80ebb3b05 100644 --- a/bin/propolis-server/src/lib/vm/migrate_commands.rs +++ b/bin/propolis-server/src/lib/vm/migrate_commands.rs @@ -23,7 +23,7 @@ pub enum MigrateTargetCommand { #[derive(Clone)] pub enum MigrateTargetResponse { - VmObjectsInitialized(Arc), + VmObjectsInitialized(Result, String>), } /// A message sent from a live migration driver to the state worker, asking it diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index a151a2dc9..0d60367cf 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -14,7 +14,8 @@ use propolis_api_types::{ components::backends::CrucibleStorageBackend, v0::StorageBackendV0, VersionedInstanceSpec, }, - InstanceProperties, InstanceState, MigrationState, + InstanceMigrateInitiateResponse, InstanceProperties, InstanceState, + MigrationState, }; use slog::{error, info}; use uuid::Uuid; @@ -236,7 +237,8 @@ pub(super) async fn run_state_driver( log.new(slog::o!("component" => "request_queue")), )); - 
let migrated_in = ensure_request.migrate.is_some(); + let migration_in_id = + ensure_request.migrate.as_ref().map(|req| req.migration_id); let (vm_objects, vcpu_tasks) = match match ensure_request.migrate { None => { initialize_vm_from_spec( @@ -294,8 +296,11 @@ pub(super) async fn run_state_driver( // Order matters here: once the ensure result is sent, an external // caller needs to observe that an active VM is present. vm.make_active(active_vm.clone()); - let _ = ensure_result_tx - .send(Ok(propolis_api_types::InstanceEnsureResponse { migrate: None })); + let _ = + ensure_result_tx.send(Ok(propolis_api_types::InstanceEnsureResponse { + migrate: migration_in_id + .map(|id| InstanceMigrateInitiateResponse { migration_id: id }), + })); let state_driver = StateDriver { log, @@ -308,7 +313,7 @@ pub(super) async fn run_state_driver( migration_src_state: Default::default(), }; - state_driver.run(migrated_in).await + state_driver.run(migration_in_id.is_some()).await } impl StateDriver { @@ -652,6 +657,8 @@ impl StateDriver { self.publish_steady_state(InstanceState::Running); } + + return; } // N.B. When handling a command that requires a reply, do not @@ -899,32 +906,75 @@ async fn migrate_as_target( .await }); - // Migration cannot proceed (in any protocol version) until the target - // kernel VMM and Propolis components have been set up. The first command - // from the migration task should be a request to set up these components. - let init_command = command_rx.recv().await.ok_or_else(|| { - anyhow::anyhow!("migration task unexpectedly closed channel") - })?; - - // TODO(#706) The only extant protocol version (V0 with RON encoding) - // assumes that migration targets get an instance spec from the caller of - // the `instance_ensure` API, that the target VM will be initialized from - // this spec, and that device state will be imported in a later migration - // phase. There are other ways to approach this problem: - // - // - This task can initialize a VM using the *source's* instance spec - // (possibly with amended configuration supplied via the API). - // - This task can initialize components using device state payload - // forwarded on from the migration task. - // - // For now, initialize the target VM in the conventional way. - let MigrateTargetCommand::InitializeFromExternalSpec = init_command else { - anyhow::bail!("migration protocol didn't first ask to init objects"); - }; + async fn init_sequence( + log: &slog::Logger, + event_queue: &Arc, + properties: &InstanceProperties, + spec: &VersionedInstanceSpec, + options: &super::EnsureOptions, + command_rx: &mut tokio::sync::mpsc::Receiver, + ) -> anyhow::Result<(VmObjects, Box)> { + // Migration cannot proceed (in any protocol version) until the target + // kernel VMM and Propolis components have been set up. The first + // command from the migration task should be a request to set up these + // components. + let init_command = command_rx.recv().await.ok_or_else(|| { + anyhow::anyhow!("migration task unexpectedly closed channel") + })?; + + // TODO(#706) The only extant protocol version (V0 with RON encoding) + // assumes that migration targets get an instance spec from the caller + // of the `instance_ensure` API, that the target VM will be initialized + // from this spec, and that device state will be imported in a later + // migration phase. 
Another approach is to get an instance spec from the + // source, amend it with information passed to the target, execute + // enough of the migration protocol to get device state payloads, and + // initialize everything in one fell swoop using the spec and payloads + // as inputs. + // + // This requires a new protocol version, so for now, only look for a + // request to initialize the VM from the caller-provided spec. + let MigrateTargetCommand::InitializeFromExternalSpec = init_command + else { + error!(log, "migration protocol didn't init objects first"; + "first_cmd" => ?init_command); + anyhow::bail!( + "migration protocol didn't first ask to init objects" + ); + }; - let (vm_objects, mut vcpu_tasks) = initialize_vm_from_spec(log, event_queue, properties, spec, options) - .await?; + .await + } + + let (vm_objects, mut vcpu_tasks) = match init_sequence( + log, + event_queue, + properties, + spec, + options, + &mut command_rx, + ) + .await + { + Ok(o) => o, + Err(e) => { + let _ = response_tx + .send(MigrateTargetResponse::VmObjectsInitialized(Err( + e.to_string() + ))) + .await; + external_state.update(ExternalStateUpdate::Migration( + MigrationStateUpdate { + id: migrate_ctx.migration_id, + state: MigrationState::Error, + role: MigrateRole::Source, + }, + )); + + return Err(e); + } + }; // The migration task imports device state by operating directly on the // newly-created VM objects. Before sending them to the task and allowing @@ -945,7 +995,9 @@ async fn migrate_as_target( // this task to reclaim an owned `VmObjects` from the `Arc` wrapper. let vm_objects = Arc::new(vm_objects); if response_tx - .send(MigrateTargetResponse::VmObjectsInitialized(vm_objects.clone())) + .send(MigrateTargetResponse::VmObjectsInitialized(Ok( + vm_objects.clone() + ))) .await .is_err() { @@ -1009,7 +1061,7 @@ async fn next_migrate_task_event( // migration task is exiting. Wait for it to finish and snag its result. 
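     // Note that `task.await` fails only if the migration task panicked or
     // was cancelled; errors the task itself returns still arrive here as
     // `Ok(Err(_))` and are surfaced as `TaskExited` events.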
match task.await { Ok(res) => { - info!(log, "Migration source task exited: {:?}", res); + info!(log, "Migration task exited: {:?}", res); MigrateTaskEvent::TaskExited(res) } Err(join_err) => { From 96d24f019f6d7ebb1be37760c593cd9c51c82234 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 26 Jun 2024 23:02:32 +0000 Subject: [PATCH 22/55] [WIP] todo cleanup --- bin/propolis-server/src/lib/initializer.rs | 5 ++--- bin/propolis-server/src/lib/migrate/source.rs | 17 +++++++---------- bin/propolis-server/src/lib/server.rs | 9 +-------- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index 971862562..3fb8413eb 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -11,8 +11,8 @@ use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; use crate::serial::Serial; -use crate::server::{BlockBackendMap, CrucibleBackendMap, DeviceMap}; use crate::stats::virtual_machine::VirtualMachine; +use crate::vm::{BlockBackendMap, CrucibleBackendMap, LifecycleMap}; use anyhow::{Context, Result}; use crucible_client_types::VolumeConstructionRequest; pub use nexus_client::Client as NexusClient; @@ -110,8 +110,7 @@ pub struct MachineInitializerState { pub struct MachineInitializer<'a> { pub(crate) log: slog::Logger, pub(crate) machine: &'a Machine, - // TODO(gjc) clean up types here - pub(crate) devices: DeviceMap, + pub(crate) devices: LifecycleMap, pub(crate) block_backends: BlockBackendMap, pub(crate) crucible_backends: CrucibleBackendMap, pub(crate) spec: &'a InstanceSpecV0, diff --git a/bin/propolis-server/src/lib/migrate/source.rs b/bin/propolis-server/src/lib/migrate/source.rs index c6452b4ef..e34432582 100644 --- a/bin/propolis-server/src/lib/migrate/source.rs +++ b/bin/propolis-server/src/lib/migrate/source.rs @@ -250,17 +250,14 @@ impl SourceProtocol { response_rx: tokio::sync::mpsc::Receiver, conn: WebSocketStream, ) -> Self { + // Create a (prospective) dirty page map if bhyve supports the NPT + // API. If this map is present and the VM hasn't recorded that it's + // possibly unhealthy, it will be used to offer only dirty pages during + // the pre-pause RAM push. let dirt = { let can_npt_operate = vm.objects().await.machine().hdl.can_npt_operate(); - // TODO(gjc) the pre-pause offer phase needs to look at whether - // redirtying has previously failed. This is done over the command - // channel (command_tx/response_rx) but that can't be used here - // because the state driver isn't actually coordinating with - // anything yet (the point of this function is to create the objects - // that need to be stuffed into a message to send to the state - // driver) if can_npt_operate { Some(Default::default()) } else { @@ -378,15 +375,15 @@ impl SourceProtocol { // Refer to the giant comment on `RamOfferDiscipline` above for more // details about this determination. if *phase == MigratePhase::RamPushPrePause && self.dirt.is_some() { + // The state driver should keep the command channels alive until the + // migration task exits, so these sends and receives should always + // work. 
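+            // A failure here thus indicates a bug in the state driver
+            // rather than a condition the protocol should try to recover
+            // from.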
self.command_tx .send(MigrateSourceCommand::QueryRedirtyingFailed) .await .unwrap(); - // .map_err(|_| MigrateError::StateDriverChannelClosed)?; let response = self.response_rx.recv().await.unwrap(); - // .ok_or(MigrateError::StateDriverChannelClosed)?; - match response { MigrateSourceResponse::RedirtyingFailed(has_failed) => { if has_failed { diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index 6e8ffd1b1..0703cbc5e 100644 --- a/bin/propolis-server/src/lib/server.rs +++ b/bin/propolis-server/src/lib/server.rs @@ -11,9 +11,9 @@ use std::convert::TryFrom; use std::net::Ipv6Addr; +use std::net::SocketAddr; use std::net::SocketAddrV6; use std::sync::Arc; -use std::{collections::BTreeMap, net::SocketAddr}; use crate::serial::history_buffer::SerialHistoryOffset; use crate::vm::VmError; @@ -41,13 +41,6 @@ use tokio_tungstenite::WebSocketStream; use crate::spec::{ServerSpecBuilder, ServerSpecBuilderError}; use crate::vnc::PropolisVncServer; -pub(crate) type DeviceMap = - BTreeMap>; -pub(crate) type BlockBackendMap = - BTreeMap>; -pub(crate) type CrucibleBackendMap = - BTreeMap>; - /// Configuration used to set this server up to provide Oximeter metrics. #[derive(Debug, Clone)] pub struct MetricsEndpointConfig { From 10a21790fa5f27c255981933492c048d08cf8169 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 26 Jun 2024 23:07:35 +0000 Subject: [PATCH 23/55] [WIP] fix standalone build --- bin/propolis-standalone/src/config.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/propolis-standalone/src/config.rs b/bin/propolis-standalone/src/config.rs index cc9bde61f..3aff6c0da 100644 --- a/bin/propolis-standalone/src/config.rs +++ b/bin/propolis-standalone/src/config.rs @@ -330,7 +330,11 @@ fn create_crucible_backend( }; info!(log, "Creating Crucible disk from request {:?}", req); // QUESTION: is producer_registry: None correct here? - block::CrucibleBackend::create(req, opts, None, None, log.clone()).unwrap() + tokio::runtime::Handle::current().block_on(async move { + block::CrucibleBackend::create(req, opts, None, None, log.clone()) + .await + .unwrap() + }) } #[cfg(feature = "crucible")] @@ -345,7 +349,11 @@ fn create_crucible_mem_backend( } let parsed: CrucibleMemConfig = opt_deser(&be.options).unwrap(); - block::CrucibleBackend::create_mem(parsed.size, opts, log.clone()).unwrap() + tokio::runtime::Handle::current().block_on(async move { + block::CrucibleBackend::create_mem(parsed.size, opts, log.clone()) + .await + .unwrap() + }) } #[cfg(not(feature = "crucible"))] From 855950ed62d6e5b13f2ce1cd684487144714c890 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 26 Jun 2024 23:11:17 +0000 Subject: [PATCH 24/55] fix pedantic softnpu clippy warning --- lib/propolis/src/hw/virtio/softnpu.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/propolis/src/hw/virtio/softnpu.rs b/lib/propolis/src/hw/virtio/softnpu.rs index ef55e5827..cf1740fae 100644 --- a/lib/propolis/src/hw/virtio/softnpu.rs +++ b/lib/propolis/src/hw/virtio/softnpu.rs @@ -978,7 +978,7 @@ impl P9Handler for SoftNpuP9Handler { // may discover us from trying to use us as some sort of normal P9 // file system. It also helps clients that are actually looking for the // SoftNpu P9 device to identify us as such. 
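         // `clone_into` can reuse the allocation already behind `msg.version`
         // instead of dropping it and allocating a fresh `String`, which is
         // what the pedantic clippy lint flags about the `to_owned` form.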
- msg.version = "9P2000.P4".to_owned(); + "9P2000.P4".clone_into(&mut msg.version); let mut out = ispf::to_bytes_le(&msg).unwrap(); let buf = out.as_mut_slice(); From df479df99b028c710f0c47080be68ee3f43ac768 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 26 Jun 2024 23:13:18 +0000 Subject: [PATCH 25/55] [WIP] display external status while initializing VMs In the old ensure code, migrating into a target Propolis initialized a VM outside the live migration protocol, made that VM legible to the server (moving its VM wrapper to the "Created" state), and then executed the live migration protocol against it. If migration in failed, the VM would be marked as failed and the appropriate cleanup task would move the server's VM wrapper to the "Destroyed" state, which was enough for clients to observe that migration failed. In the new code, VMs never leave the "WaitingToStart" state if a migration in fails. They should instead go to the "Rundown" state immediately to provide similar legibility. This fixes the "incompatible instance spec" PHD test. --- .../src/lib/migrate/destination.rs | 1 - bin/propolis-server/src/lib/vm/mod.rs | 101 +++++++++++++----- .../src/lib/vm/state_driver.rs | 42 ++------ 3 files changed, 85 insertions(+), 59 deletions(-) diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index a9e68f28b..d5804d2b5 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -52,7 +52,6 @@ pub async fn migrate( }; if let Err(err) = proto.run().await { - // If the let _ = err_tx .send(MigrateTargetCommand::UpdateState(MigrationState::Error)) .await; diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index f47567c9d..2d98a155a 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -280,7 +280,7 @@ impl Drop for ActiveVm { } } -struct RundownVm { +struct UninitVm { external_state_rx: InstanceStateRx, properties: InstanceProperties, spec: InstanceSpecV0, @@ -304,17 +304,10 @@ struct RundownVm { enum VmState { /// This state machine has never held a VM. NoVm, - - /// There is an active state driver task, but it is currently creating VM - /// components and/or starting VM services. 
- WaitingForInit, - - /// There is an active virtual machine + WaitingForInit(UninitVm), Active(Arc), - - /// The active VM's state driver has exited, but the - Rundown(RundownVm), - RundownComplete(RundownVm), + Rundown(UninitVm), + RundownComplete(UninitVm), } pub(super) struct EnsureOptions { @@ -350,11 +343,10 @@ impl Vm { VmState::NoVm => { return Err(VmError::NotCreated); } - VmState::WaitingForInit => { - return Err(VmError::WaitingToInitialize); - } VmState::Active(vm) => vm.clone(), - VmState::Rundown(vm) | VmState::RundownComplete(vm) => { + VmState::WaitingForInit(vm) + | VmState::Rundown(vm) + | VmState::RundownComplete(vm) => { return Ok(propolis_api_types::InstanceSpecGetResponse { properties: vm.properties.clone(), state: vm.external_state_rx.borrow().state, @@ -376,21 +368,37 @@ impl Vm { let guard = self.inner.read().unwrap(); match &guard.state { VmState::NoVm => Err(VmError::NotCreated), - VmState::WaitingForInit => Err(VmError::WaitingToInitialize), VmState::Active(vm) => Ok(vm.external_state_rx.clone()), - VmState::Rundown(vm) | VmState::RundownComplete(vm) => { - Ok(vm.external_state_rx.clone()) - } + VmState::WaitingForInit(vm) + | VmState::Rundown(vm) + | VmState::RundownComplete(vm) => Ok(vm.external_state_rx.clone()), } } - fn make_active(&self, active: Arc) { + fn make_active( + self: &Arc, + log: &slog::Logger, + state_driver_queue: Arc, + objects: VmObjects, + services: services::VmServices, + ) -> Arc { info!(self.log, "installing active VM"); let mut guard = self.inner.write().unwrap(); let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { - VmState::WaitingForInit => { + VmState::WaitingForInit(vm) => { + let active = Arc::new(ActiveVm { + parent: self.clone(), + log: log.clone(), + state_driver_queue, + external_state_rx: vm.external_state_rx, + properties: vm.properties, + objects: Some(tokio::sync::RwLock::new(objects)), + services: Some(services), + }); + guard.state = VmState::Active(active.clone()); + active } _ => unreachable!( "only a starting VM's state worker calls make_active" @@ -398,13 +406,25 @@ impl Vm { } } + fn start_failed(&self) { + let mut guard = self.inner.write().unwrap(); + let old = std::mem::replace(&mut guard.state, VmState::NoVm); + match old { + VmState::WaitingForInit(vm) => { + guard.state = VmState::RundownComplete(vm) + } + _ => unreachable!( + "start failures should only occur before an active VM is installed") + } + } + async fn set_rundown(&self) { let vm = self .active_vm() .expect("VM should be active before being run down"); info!(self.log, "setting VM rundown"); - let new_state = VmState::Rundown(RundownVm { + let new_state = VmState::Rundown(UninitVm { external_state_rx: vm.external_state_rx.clone(), properties: vm.properties.clone(), spec: vm.objects().await.instance_spec.clone(), @@ -429,7 +449,30 @@ impl Vm { ensure_request: propolis_api_types::InstanceSpecEnsureRequest, options: EnsureOptions, ) -> Result { + let log_for_driver = + log.new(slog::o!("component" => "vm_state_driver")); + let (ensure_reply_tx, ensure_rx) = tokio::sync::oneshot::channel(); + let (external_publisher, external_rx) = StatePublisher::new( + &log_for_driver, + propolis_api_types::InstanceStateMonitorResponse { + gen: 1, + state: if ensure_request.migrate.is_some() { + propolis_api_types::InstanceState::Migrating + } else { + propolis_api_types::InstanceState::Starting + }, + migration: propolis_api_types::InstanceMigrateStatusResponse { + migration_in: ensure_request.migrate.as_ref().map(|req| { + 
propolis_api_types::InstanceMigrationStatus { + id: req.migration_id, + state: propolis_api_types::MigrationState::Sync, + } + }), + migration_out: None, + }, + }, + ); // Take the lock for writing, since in the common case this call will be // creating a new VM and there's no easy way to upgrade from a reader @@ -437,7 +480,7 @@ impl Vm { { let mut guard = self.inner.write().unwrap(); match guard.state { - VmState::WaitingForInit => { + VmState::WaitingForInit(_) => { return Err(VmError::WaitingToInitialize) } VmState::Active(_) => return Err(VmError::AlreadyInitialized), @@ -445,14 +488,20 @@ impl Vm { _ => {} } - guard.state = VmState::WaitingForInit; + let VersionedInstanceSpec::V0(v0_spec) = + ensure_request.instance_spec.clone(); + guard.state = VmState::WaitingForInit(UninitVm { + external_state_rx: external_rx.clone(), + properties: ensure_request.properties.clone(), + spec: v0_spec, + }); + let vm_for_driver = self.clone(); - let log_for_driver = - log.new(slog::o!("component" => "vm_state_driver")); guard.driver = Some(tokio::spawn(async move { state_driver::run_state_driver( log_for_driver, vm_for_driver, + external_publisher, ensure_request, ensure_reply_tx, options, diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 0d60367cf..7d58f2935 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -206,33 +206,13 @@ struct StateDriver { pub(super) async fn run_state_driver( log: slog::Logger, vm: Arc, + mut external_publisher: StatePublisher, ensure_request: propolis_api_types::InstanceSpecEnsureRequest, ensure_result_tx: tokio::sync::oneshot::Sender< Result, >, ensure_options: super::EnsureOptions, ) -> StatePublisher { - let (mut external_publisher, external_rx) = StatePublisher::new( - &log, - propolis_api_types::InstanceStateMonitorResponse { - gen: 1, - state: if ensure_request.migrate.is_some() { - propolis_api_types::InstanceState::Migrating - } else { - propolis_api_types::InstanceState::Starting - }, - migration: propolis_api_types::InstanceMigrateStatusResponse { - migration_in: ensure_request.migrate.as_ref().map(|req| { - propolis_api_types::InstanceMigrationStatus { - id: req.migration_id, - state: propolis_api_types::MigrationState::Sync, - } - }), - migration_out: None, - }, - }, - ); - let input_queue = Arc::new(InputQueue::new( log.new(slog::o!("component" => "request_queue")), )); @@ -265,6 +245,9 @@ pub(super) async fn run_state_driver( } { Ok(objects) => objects, Err(e) => { + external_publisher + .update(ExternalStateUpdate::Instance(InstanceState::Failed)); + vm.start_failed(); let _ = ensure_result_tx.send(Err(VmError::InitializationFailed(e))); return external_publisher; @@ -280,22 +263,14 @@ pub(super) async fn run_state_driver( ) .await; - let active_vm = Arc::new(super::ActiveVm { - parent: vm.clone(), - log: log.clone(), - state_driver_queue: input_queue.clone(), - external_state_rx: external_rx, - properties: ensure_request.properties, - objects: Some(tokio::sync::RwLock::new(vm_objects)), - services: Some(services), - }); - // All the VM components now exist, so allow external callers to // interact with the VM. // // Order matters here: once the ensure result is sent, an external // caller needs to observe that an active VM is present. 
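     // Otherwise a caller could see a successful ensure result and still get
     // `VmError::NotCreated` from an immediately following API call.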
-    vm.make_active(active_vm.clone());
+    let active_vm =
+        vm.make_active(&log, input_queue.clone(), vm_objects, services);
+
     let _ =
         ensure_result_tx.send(Ok(propolis_api_types::InstanceEnsureResponse {
             migrate: migration_in_id
@@ -1022,6 +997,9 @@ async fn migrate_as_target(
             return Ok((vm_objects, vcpu_tasks));
         }
         Err(e) => {
+            error!(log, "target migration task failed";
+                   "error" => %e);
+
             vm_objects.resume_vm();
             return Err(e.into());
         }

From 0dc6512ae1976a7d7a8312fef3617201ded46a3a Mon Sep 17 00:00:00 2001
From: Greg Colombo
Date: Thu, 27 Jun 2024 00:13:29 +0000
Subject: [PATCH 26/55] [WIP] set up correct external queue dispositions on
 migration in

Before, when "migrate as target" was its own external request queue
action, queuing such a request changed the queue's disposition for start
requests from "enqueue" to "ignore." Now that there's no such request,
the queue needs to start with the correct disposition when trying to
migrate in.
---
 .../src/lib/vm/request_queue.rs               | 36 ++++++++++++++-----
 .../src/lib/vm/state_driver.rs                | 17 ++++++---
 2 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs
index 08260f29c..53d1aa871 100644
--- a/bin/propolis-server/src/lib/vm/request_queue.rs
+++ b/bin/propolis-server/src/lib/vm/request_queue.rs
@@ -30,8 +30,8 @@ use uuid::Uuid;
 /// Wraps a [`dropshot::WebsocketConnection`] for inclusion in an
 /// [`ExternalRequest`].
 //
-// A newtype is used here to allow this module's tests (which want to verify
-// queuing dispositions and don't care about request contents) to construct a
+// This newtype allows this module's tests (which want to verify queuing
+// dispositions and don't care about request contents) to construct a
 // `MigrateAsSource` request without having to conjure up a real websocket
 // conection.
 pub(crate) struct WebsocketConnection(Option<dropshot::WebsocketConnection>);
@@ -187,13 +187,26 @@ pub struct ExternalRequestQueue {
     log: Logger,
 }
 
+pub enum InstanceAutoStart {
+    Yes,
+    No,
+}
+
 impl ExternalRequestQueue {
     /// Creates a new queue that logs to the supplied logger.
-    pub fn new(log: Logger) -> Self {
+    pub fn new(log: Logger, auto_start: InstanceAutoStart) -> Self {
+        // If the queue is being created for an instance that will start
+        // automatically (e.g. due to a migration in), set the request
+        // disposition for future start requests to Ignore for idempotency.
+        let start = match auto_start {
+            InstanceAutoStart::Yes => RequestDisposition::Ignore,
+            InstanceAutoStart::No => RequestDisposition::Enqueue,
+        };
+
         Self {
             queue: VecDeque::new(),
             allowed: AllowedRequests {
-                start: RequestDisposition::Enqueue,
+                start,
                 migrate_as_source: RequestDisposition::Deny(
                     RequestDeniedReason::InstanceNotActive,
                 ),
@@ -291,7 +304,11 @@ impl ExternalRequestQueue {
             ChangeReason::ApiRequest(ExternalRequest::MigrateAsSource {
                 ..
             }) => {
-                assert!(matches!(self.allowed.start, Disposition::Ignore));
+                assert!(
+                    matches!(self.allowed.start, Disposition::Ignore),
+                    "{:?}",
+                    self.allowed
+                );
 
                 AllowedRequests {
                     start: self.allowed.start,
@@ -409,7 +426,8 @@ mod test {
     #[tokio::test]
     async fn migrate_as_source_is_not_idempotent() {
         // Simulate a running instance.
-        let mut queue = ExternalRequestQueue::new(test_logger());
+        let mut queue =
+            ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No);
         queue.notify_instance_state_change(InstanceStateChange::StartedRunning);
 
         // Requests to migrate out should be allowed.
@@ -443,7 +461,8 @@ mod test { #[tokio::test] async fn stop_requests_enqueue_after_vm_failure() { - let mut queue = ExternalRequestQueue::new(test_logger()); + let mut queue = + ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No); queue.notify_instance_state_change(InstanceStateChange::Failed); assert!(queue.try_queue(ExternalRequest::Stop).is_ok()); @@ -452,7 +471,8 @@ mod test { #[tokio::test] async fn reboot_requests_are_idempotent_except_when_stopping() { - let mut queue = ExternalRequestQueue::new(test_logger()); + let mut queue = + ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Once the instance is started, reboot requests should be allowed, but diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 7d58f2935..395426778 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -38,7 +38,7 @@ use super::{ MigrateSourceCommand, MigrateSourceResponse, MigrateTargetResponse, MigrateTaskEvent, }, - request_queue::ExternalRequest, + request_queue::{ExternalRequest, InstanceAutoStart}, state_publisher::{MigrationStateUpdate, StatePublisher}, VmError, VmObjects, }; @@ -68,10 +68,10 @@ struct InputQueueInner { } impl InputQueueInner { - fn new(log: slog::Logger) -> Self { + fn new(log: slog::Logger, auto_start: InstanceAutoStart) -> Self { Self { external_requests: super::request_queue::ExternalRequestQueue::new( - log, + log, auto_start, ), guest_events: super::guest_event::GuestEventQueue::default(), } @@ -84,9 +84,12 @@ pub(super) struct InputQueue { } impl InputQueue { - pub(super) fn new(log: slog::Logger) -> Self { + pub(super) fn new( + log: slog::Logger, + auto_start: InstanceAutoStart, + ) -> Self { Self { - inner: Mutex::new(InputQueueInner::new(log)), + inner: Mutex::new(InputQueueInner::new(log, auto_start)), cv: Condvar::new(), } } @@ -215,6 +218,10 @@ pub(super) async fn run_state_driver( ) -> StatePublisher { let input_queue = Arc::new(InputQueue::new( log.new(slog::o!("component" => "request_queue")), + match ensure_request.migrate { + Some(_) => InstanceAutoStart::Yes, + None => InstanceAutoStart::No, + }, )); let migration_in_id = From a25e38f544652929dacd7f40621c5050e9656016 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 27 Jun 2024 01:15:37 +0000 Subject: [PATCH 27/55] [WIP] start clean instances in Creating; fix Crucible block_on --- bin/propolis-server/src/lib/initializer.rs | 2 +- bin/propolis-server/src/lib/vm/mod.rs | 2 +- bin/propolis-server/src/lib/vm/request_queue.rs | 2 +- lib/propolis/src/block/crucible.rs | 6 ++---- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index 3fb8413eb..2bf69b631 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -412,7 +412,7 @@ impl<'a> MachineInitializer<'a> { ) .await?; - let crucible = Some((be.get_uuid()?, be.clone())); + let crucible = Some((be.get_uuid().await?, be.clone())); Ok(StorageBackendInstance { be, crucible }) } instance_spec::v0::StorageBackendV0::File(spec) => { diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 2d98a155a..b1cc7fbe3 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -460,7 +460,7 @@ impl Vm { state: if 
ensure_request.migrate.is_some() {
                     propolis_api_types::InstanceState::Migrating
                 } else {
-                    propolis_api_types::InstanceState::Starting
+                    propolis_api_types::InstanceState::Creating
                 },
diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs
index 53d1aa871..7ba185b2f 100644
--- a/bin/propolis-server/src/lib/vm/request_queue.rs
+++ b/bin/propolis-server/src/lib/vm/request_queue.rs
@@ -33,7 +33,7 @@ use uuid::Uuid;
 // This newtype allows this module's tests (which want to verify queuing
 // dispositions and don't care about request contents) to construct a
 // `MigrateAsSource` request without having to conjure up a real websocket
-// conection.
+// connection.
 pub(crate) struct WebsocketConnection(Option<dropshot::WebsocketConnection>);
 
 impl From<dropshot::WebsocketConnection> for WebsocketConnection {
diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs
index 9505ce2f7..e5fb5dbb6 100644
--- a/lib/propolis/src/block/crucible.rs
+++ b/lib/propolis/src/block/crucible.rs
@@ -239,10 +239,8 @@ impl CrucibleBackend {
     }
 
     /// Retrieve the UUID identifying this Crucible backend.
-    pub fn get_uuid(&self) -> io::Result<uuid::Uuid> {
-        let rt = tokio::runtime::Handle::current();
-        rt.block_on(async { self.state.volume.get_uuid().await })
-            .map_err(CrucibleError::into)
+    pub async fn get_uuid(&self) -> io::Result<uuid::Uuid> {
+        self.state.volume.get_uuid().await.map_err(CrucibleError::into)
     }
 
     /// Issue a snapshot request

From 28d3e6a6e90c7e5bf5b470ddc6dd9d65134eb7d0 Mon Sep 17 00:00:00 2001
From: Greg Colombo
Date: Thu, 27 Jun 2024 01:25:15 +0000
Subject: [PATCH 28/55] remove old files

---
 bin/propolis-server/src/lib/vm_old/mod.rs     | 1138 --------------
 .../src/lib/vm_old/request_queue.rs           |  604 -------
 .../src/lib/vm_old/state_driver.rs            | 1384 -----------------
 3 files changed, 3126 deletions(-)
 delete mode 100644 bin/propolis-server/src/lib/vm_old/mod.rs
 delete mode 100644 bin/propolis-server/src/lib/vm_old/request_queue.rs
 delete mode 100644 bin/propolis-server/src/lib/vm_old/state_driver.rs

diff --git a/bin/propolis-server/src/lib/vm_old/mod.rs b/bin/propolis-server/src/lib/vm_old/mod.rs
deleted file mode 100644
index 76363fa2a..000000000
--- a/bin/propolis-server/src/lib/vm_old/mod.rs
+++ /dev/null
@@ -1,1138 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at https://mozilla.org/MPL/2.0/.
-
-//! Implements the VM controller: the public interface to a single Propolis
-//! instance.
-//!
-//! The VM controller serves two purposes. First, it collects all of the objects
-//! describing a single Propolis VM (the Propolis `Instance` itself, the
-//! instance's spec, direct references to components in the instance, etc.).
-//! Second, it records requests and events that affect how a VM moves through
-//! the stages of its lifecycle, i.e. how and when it boots, reboots, migrates,
-//! and stops.
-//!
-//! Each VM controller has a single "state driver" thread that processes
-//! requests and events recorded by its controller and acts on the underlying
-//! Propolis instance to move the VM into the appropriate states. Doing this
-//! work on a single thread ensures that a VM can only undergo one state change
-//! at a time, that there are no races to start/pause/resume/halt a VM's
-//! 
components, and that there is a single source of truth as to a VM's current -//! state (and as to the steps that are required to move it to a different -//! state). Operations like live migration that require components to pause and -//! resume coordinate directly with the state driver thread. -//! -//! The VM controller's public API allows a Propolis Dropshot server to query a -//! VM's current state, to ask to change that state, and to obtain references to -//! objects in a VM as needed to handle other requests made of the server (e.g. -//! requests to connect to an instance's serial console or to take a disk -//! snapshot). The controller also implements traits that allow a VM's -//! components to raise events for the state driver to process (e.g. a request -//! from a VM's chipset to reboot or halt the VM). - -use crate::migrate; - -use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt}; -use std::{ - collections::{BTreeMap, VecDeque}, - fmt::Debug, - net::SocketAddr, - pin::Pin, - sync::{Arc, Condvar, Mutex, MutexGuard, Weak}, - task::{Context, Poll}, - thread::JoinHandle, - time::Duration, -}; - -use oximeter::types::ProducerRegistry; -use propolis::{ - hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart}, - vmm::Machine, -}; -use propolis_api_types::{ - instance_spec::VersionedInstanceSpec, - InstanceMigrateStatusResponse as ApiMigrateStatusResponse, - InstanceMigrationStatus as ApiMigrationStatus, InstanceProperties, - InstanceState as ApiInstanceState, - InstanceStateMonitorResponse as ApiMonitoredState, - InstanceStateRequested as ApiInstanceStateRequested, - MigrationState as ApiMigrationState, -}; -use slog::{debug, error, info, Logger}; -use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::sync::oneshot; -use tokio_tungstenite::WebSocketStream; -use uuid::Uuid; - -use crate::{ - initializer::{ - build_instance, MachineInitializer, MachineInitializerState, - }, - migrate::{MigrateError, MigrateRole}, - serial::Serial, - server::{BlockBackendMap, CrucibleBackendMap, DeviceMap, StaticConfig}, - vm::request_queue::ExternalRequest, -}; - -use self::request_queue::{ExternalRequestQueue, RequestDeniedReason}; -pub use nexus_client::Client as NexusClient; - -mod request_queue; -mod state_driver; - -#[derive(Debug, Error)] -pub enum VmControllerError { - #[error("The requested operation requires an active instance")] - InstanceNotActive, - - #[error("The instance has a pending request to halt")] - InstanceHaltPending, - - #[error("Instance is already a migration source")] - AlreadyMigrationSource, - - #[error("Cannot request state {0:?} while migration is in progress")] - InvalidRequestForMigrationSource(ApiInstanceStateRequested), - - #[error("A migration into this instance is in progress")] - MigrationTargetInProgress, - - #[error("Another live migration into this instance already occurred")] - MigrationTargetPreviouslyCompleted, - - #[error("The most recent attempt to migrate into this instance failed")] - MigrationTargetFailed, - - #[error("Can't migrate into a running instance")] - TooLateToBeMigrationTarget, - - #[error("Failed to queue requested state change: {0}")] - StateChangeRequestDenied(#[from] request_queue::RequestDeniedReason), - - #[error("Migration protocol error: {0:?}")] - MigrationProtocolError(#[from] MigrateError), - - #[error("Failed to start vCPU workers")] - VcpuWorkerCreationFailed(#[from] super::vcpu_tasks::VcpuTaskError), - - #[error("Failed to create state worker: {0}")] - 
StateWorkerCreationFailed(std::io::Error), -} - -impl From for dropshot::HttpError { - fn from(vm_error: VmControllerError) -> Self { - use dropshot::HttpError; - match vm_error { - VmControllerError::AlreadyMigrationSource - | VmControllerError::InvalidRequestForMigrationSource(_) - | VmControllerError::MigrationTargetInProgress - | VmControllerError::MigrationTargetFailed - | VmControllerError::TooLateToBeMigrationTarget - | VmControllerError::StateChangeRequestDenied(_) - | VmControllerError::InstanceNotActive - | VmControllerError::InstanceHaltPending - | VmControllerError::MigrationTargetPreviouslyCompleted => { - HttpError::for_status( - Some(format!("Instance operation failed: {}", vm_error)), - http::status::StatusCode::FORBIDDEN, - ) - } - VmControllerError::MigrationProtocolError(_) - | VmControllerError::VcpuWorkerCreationFailed(_) - | VmControllerError::StateWorkerCreationFailed(_) => { - HttpError::for_internal_error(format!( - "Instance operation failed: {}", - vm_error - )) - } - } - } -} - -/// A collection of objects that describe an instance and references to that -/// instance and its components. -pub(crate) struct VmObjects { - /// The underlying Propolis `Machine` this controller is managing. - machine: Option, - - /// The instance properties supplied when this controller was created. - properties: InstanceProperties, - - /// The instance spec used to create this controller's VM. - spec: tokio::sync::Mutex, - - /// Map of the emulated devices associated with the VM - devices: DeviceMap, - - /// Map of the instance's active block backends. - block_backends: BlockBackendMap, - - /// Map of the instance's active Crucible backends. - crucible_backends: CrucibleBackendMap, - - /// A wrapper around the instance's first COM port, suitable for providing a - /// connection to a guest's serial console. - com1: Arc>, - - /// An optional reference to the guest's framebuffer. - framebuffer: Option>, - - /// A reference to the guest's PS/2 controller. - ps2ctrl: Arc, - - /// A notification receiver to which the state worker publishes the most - /// recent instance state information. - monitor_rx: tokio::sync::watch::Receiver, -} - -/// A message sent from a live migration destination task to update the -/// externally visible state of the migration attempt. -#[derive(Clone, Copy, Debug)] -pub enum MigrateTargetCommand { - /// Update the externally-visible migration state. - UpdateState(ApiMigrationState), -} - -/// A message sent from a live migration driver to the state worker, asking it -/// to act on source instance components on the task's behalf. -#[derive(Clone, Copy, Debug)] -pub enum MigrateSourceCommand { - /// Update the externally-visible migration state. - UpdateState(ApiMigrationState), - - /// Pause the instance's devices and CPUs. - Pause, -} - -/// A message sent from the state worker to the live migration driver in -/// response to a previous command. -#[derive(Debug)] -pub enum MigrateSourceResponse { - /// A request to pause completed with the attached result. - Pause(Result<(), std::io::Error>), -} - -/// An event raised by a migration task that must be handled by the state -/// worker. -#[derive(Debug)] -enum MigrateTaskEvent { - /// The task completed with the associated result. - TaskExited(Result<(), MigrateError>), - - /// The task sent a command requesting work. - Command(T), -} - -/// An event raised by some component in the instance (e.g. a vCPU or the -/// chipset) that the state worker must handle. 
-/// -/// The vCPU-sourced events carry a time element (duration since VM boot) as -/// emitted by the kernel vmm. This is used to deduplicate events when all -/// vCPUs running in-kernel are kicked out for the suspend state. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -enum GuestEvent { - /// VM entered halt state - VcpuSuspendHalt(Duration), - /// VM entered reboot state - VcpuSuspendReset(Duration), - /// vCPU encounted triple-fault - VcpuSuspendTripleFault(i32, Duration), - /// Chipset signaled halt condition - ChipsetHalt, - /// Chipset signaled reboot condition - ChipsetReset, -} - -/// Shared instance state guarded by the controller's state mutex. This state is -/// accessed from the controller API and the VM's state worker. -#[derive(Debug)] -struct SharedVmStateInner { - external_request_queue: ExternalRequestQueue, - - /// The state worker's queue of unprocessed events from guest devices. - guest_event_queue: VecDeque, - - /// The expected ID of the next live migration this instance will - /// participate in (either in or out). If this is `Some`, external callers - /// who query migration state will observe that a live migration is in - /// progress even if the state driver has yet to pick up the live migration - /// tasks from its queue. - pending_migration_id: Option<(Uuid, MigrateRole)>, -} - -impl SharedVmStateInner { - fn new(parent_log: &Logger) -> Self { - let queue_log = - parent_log.new(slog::o!("component" => "external_request_queue")); - Self { - external_request_queue: ExternalRequestQueue::new(queue_log), - guest_event_queue: VecDeque::new(), - pending_migration_id: None, - } - } -} - -#[derive(Debug)] -pub(crate) struct SharedVmState { - inner: Mutex, - cv: Condvar, -} - -/// A VM controller: a wrapper around a Propolis instance that supplies the -/// functions needed for the Propolis server to implement its own API. -pub struct VmController { - /// A collection of objects that don't change once an instance is ensured: - /// the instance itself, a description of it, and convenience references to - /// some of its members (used to avoid rummaging through the instance's - /// inventory). - vm_objects: VmObjects, - - /// A wrapper for the runtime state of this instance, managed by the state - /// worker thread. This also serves as a sink for hardware events (e.g. from - /// vCPUs and the chipset), so it is wrapped in an Arc so that it can be - /// shared with those events' sources. - worker_state: Arc, - - /// A handle to the state worker thread for this instance. - worker_thread: Mutex< - Option>>, - >, - - /// This controller's logger. - log: Logger, - - /// A handle to a tokio runtime onto which this controller can spawn tasks - /// (e.g. migration tasks). - runtime_hdl: tokio::runtime::Handle, - - /// Migration source state persisted across multiple migration attempts. - migration_src_state: Mutex, - - /// A weak reference to this controller, suitable for upgrading and passing - /// to tasks the controller spawns. 
- this: Weak, -} - -impl SharedVmState { - fn new(parent_log: &Logger) -> Self { - Self { - inner: Mutex::new(SharedVmStateInner::new(parent_log)), - cv: Condvar::new(), - } - } - - fn queue_external_request( - &self, - request: ExternalRequest, - ) -> Result<(), RequestDeniedReason> { - let mut inner = self.inner.lock().unwrap(); - let result = inner.external_request_queue.try_queue(request); - if result.is_ok() { - self.cv.notify_one(); - } - result - } - - fn wait_for_next_event(&self) -> StateDriverEvent { - let guard = self.inner.lock().unwrap(); - let mut guard = self - .cv - .wait_while(guard, |i| { - i.external_request_queue.is_empty() - && i.guest_event_queue.is_empty() - }) - .unwrap(); - - if let Some(guest_event) = guard.guest_event_queue.pop_front() { - StateDriverEvent::Guest(guest_event) - } else { - StateDriverEvent::External( - guard.external_request_queue.pop_front().unwrap(), - ) - } - } - - /// Add a guest event to the queue, so long as it does not appear to be a - /// duplicate of an existing event. - fn enqueue_guest_event(&self, event: GuestEvent) { - let mut inner = self.inner.lock().unwrap(); - if !inner.guest_event_queue.iter().any(|ev| *ev == event) { - // Only queue event if nothing else in the queue is a direct match - inner.guest_event_queue.push_back(event); - self.cv.notify_one(); - } - } - - pub fn suspend_halt_event(&self, when: Duration) { - self.enqueue_guest_event(GuestEvent::VcpuSuspendHalt(when)); - } - - pub fn suspend_reset_event(&self, when: Duration) { - self.enqueue_guest_event(GuestEvent::VcpuSuspendReset(when)); - } - - pub fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration) { - self.enqueue_guest_event(GuestEvent::VcpuSuspendTripleFault( - vcpu_id, when, - )); - } - - pub fn unhandled_vm_exit( - &self, - vcpu_id: i32, - exit: propolis::exits::VmExitKind, - ) { - panic!("vCPU {}: Unhandled VM exit: {:?}", vcpu_id, exit); - } - - pub fn io_error_event(&self, vcpu_id: i32, error: std::io::Error) { - panic!("vCPU {}: Unhandled vCPU error: {}", vcpu_id, error); - } - - pub fn clear_pending_migration(&self) { - let mut inner = self.inner.lock().unwrap(); - inner.pending_migration_id = None; - } -} - -/// Functions called by a Propolis chipset to notify another component that an -/// event occurred. -pub trait ChipsetEventHandler: Send + Sync { - fn chipset_halt(&self); - fn chipset_reset(&self); -} - -impl ChipsetEventHandler for SharedVmState { - fn chipset_halt(&self) { - self.enqueue_guest_event(GuestEvent::ChipsetHalt); - } - - fn chipset_reset(&self) { - self.enqueue_guest_event(GuestEvent::ChipsetReset); - } -} - -impl VmController { - #[allow(clippy::too_many_arguments)] - pub fn new( - instance_spec: VersionedInstanceSpec, - properties: InstanceProperties, - &StaticConfig { vm: ref toml_config, use_reservoir, .. }: &StaticConfig, - producer_registry: Option, - nexus_client: Option, - log: Logger, - runtime_hdl: tokio::runtime::Handle, - stop_ch: oneshot::Sender<()>, - ) -> anyhow::Result> { - let bootrom = &toml_config.bootrom; - info!(log, "initializing new VM"; - "spec" => #?instance_spec, - "properties" => #?properties, - "use_reservoir" => use_reservoir, - "bootrom" => %bootrom.display()); - - let vmm_log = log.new(slog::o!("component" => "vmm")); - - // Set up the 'shell' instance into which the rest of this routine will - // add components. 
- let VersionedInstanceSpec::V0(v0_spec) = &instance_spec; - let machine = build_instance( - &properties.vm_name(), - v0_spec, - use_reservoir, - vmm_log, - )?; - - // Create the state monitor channel and the worker state struct that - // depends on it. The state struct can then be passed to device - // initialization as an event sink. - let (monitor_tx, monitor_rx) = - tokio::sync::watch::channel(ApiMonitoredState { - gen: 0, - state: ApiInstanceState::Creating, - migration: ApiMigrateStatusResponse { - migration_in: None, - migration_out: None, - }, - }); - - let worker_state = Arc::new(SharedVmState::new(&log)); - - // Create and initialize devices in the new instance. - let mut init = MachineInitializer { - log: log.clone(), - machine: &machine, - devices: DeviceMap::new(), - block_backends: BlockBackendMap::new(), - crucible_backends: CrucibleBackendMap::new(), - spec: v0_spec, - properties: &properties, - toml_config, - producer_registry, - state: MachineInitializerState::default(), - }; - - init.initialize_rom(bootrom.as_path())?; - let chipset = init.initialize_chipset( - &(worker_state.clone() as Arc), - )?; - init.initialize_rtc(&chipset)?; - init.initialize_hpet()?; - - let com1 = Arc::new(init.initialize_uart(&chipset)?); - let ps2ctrl = init.initialize_ps2(&chipset)?; - init.initialize_qemu_debug_port()?; - init.initialize_qemu_pvpanic((&properties).into())?; - init.initialize_network_devices(&chipset)?; - - #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&toml_config.devices)?; - #[cfg(feature = "omicron-build")] - info!( - log, - "`omicron-build` feature enabled, ignoring any test devices" - ); - - #[cfg(feature = "falcon")] - init.initialize_softnpu_ports(&chipset)?; - #[cfg(feature = "falcon")] - init.initialize_9pfs(&chipset)?; - init.initialize_storage_devices(&chipset, nexus_client)?; - let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; - init.initialize_cpus()?; - let vcpu_tasks = super::vcpu_tasks::VcpuTasks::new( - &machine, - worker_state.clone(), - log.new(slog::o!("component" => "vcpu_tasks")), - )?; - - let MachineInitializer { - devices, - block_backends, - crucible_backends, - .. - } = init; - - // The instance is fully set up; pass it to the new controller. - let shared_state_for_worker = worker_state.clone(); - let controller = Arc::new_cyclic(|this| Self { - vm_objects: VmObjects { - machine: Some(machine), - properties, - spec: tokio::sync::Mutex::new(instance_spec), - devices, - block_backends, - crucible_backends, - com1, - framebuffer: Some(ramfb), - ps2ctrl, - monitor_rx, - }, - worker_state, - worker_thread: Mutex::new(None), - migration_src_state: Default::default(), - log: log.new(slog::o!("component" => "vm_controller")), - runtime_hdl: runtime_hdl.clone(), - this: this.clone(), - }); - - // Now that the controller exists, launch the state worker that will - // drive state transitions for this instance. When the VM halts, the - // worker will exit and drop its reference to the controller. - let ctrl_for_worker = controller.clone(); - let log_for_worker = - log.new(slog::o!("component" => "vm_state_worker")); - let worker_thread = std::thread::Builder::new() - .name("vm_state_worker".to_string()) - .spawn(move || { - let driver = state_driver::StateDriver::new( - runtime_hdl, - ctrl_for_worker, - shared_state_for_worker, - vcpu_tasks, - log_for_worker, - monitor_tx, - ); - - let monitor_tx = driver.run_state_worker(); - - // Signal back to the server state once the worker has exited. 
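
The `stop_ch` send on the next line is one half of a two-channel handoff: the worker signals the server when it exits, and returns the watch sender through its `JoinHandle` so the controller's `Drop` impl can publish one final state. A runnable sketch of that shape (hypothetical channel payloads; tokio assumed):

    use std::thread;
    use tokio::sync::{oneshot, watch};

    fn main() {
        let (stop_tx, mut stop_rx) = oneshot::channel::<()>();
        let (state_tx, state_rx) = watch::channel(0u32);

        // The worker owns the watch sender while it runs...
        let worker = thread::spawn(move || {
            state_tx.send(1).ok();
            // ...signals the server when it is done...
            let _ = stop_tx.send(());
            // ...and hands the sender back through the JoinHandle.
            state_tx
        });

        // Whoever joins the worker can publish a final state.
        let state_tx = worker.join().unwrap();
        state_tx.send(2).ok();
        assert_eq!(*state_rx.borrow(), 2);
        assert!(stop_rx.try_recv().is_ok());
    }
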
- let _ = stop_ch.send(()); - monitor_tx - }) - .map_err(VmControllerError::StateWorkerCreationFailed)?; - - *controller.worker_thread.lock().unwrap() = Some(worker_thread); - Ok(controller) - } - - pub fn properties(&self) -> &InstanceProperties { - &self.vm_objects.properties - } - - pub fn machine(&self) -> &Machine { - // Unwrap safety: The machine is created when the controller is created - // and removed only when the controller is dropped. - self.vm_objects - .machine - .as_ref() - .expect("VM controller always has a valid machine") - } - - pub(crate) fn migration_src_state( - &self, - ) -> MutexGuard<'_, migrate::source::PersistentState> { - self.migration_src_state.lock().unwrap() - } - - pub async fn instance_spec( - &self, - ) -> tokio::sync::MutexGuard<'_, VersionedInstanceSpec> { - self.vm_objects.spec.lock().await - } - - pub fn com1(&self) -> &Arc> { - &self.vm_objects.com1 - } - - pub fn framebuffer(&self) -> Option<&Arc> { - self.vm_objects.framebuffer.as_ref() - } - - pub fn ps2ctrl(&self) -> &Arc { - &self.vm_objects.ps2ctrl - } - - pub fn crucible_backends( - &self, - ) -> &BTreeMap> { - &self.vm_objects.crucible_backends - } - - pub fn log(&self) -> &Logger { - &self.log - } - - pub fn external_instance_state(&self) -> ApiInstanceState { - self.vm_objects.monitor_rx.borrow().state - } - - pub fn inject_nmi(&self) { - if let Some(machine) = &self.vm_objects.machine { - match machine.inject_nmi() { - Ok(_) => { - info!(self.log, "Sending NMI to instance"); - } - Err(e) => { - error!(self.log, "Could not send NMI to instance: {}", e); - } - }; - } - } - - pub fn state_watcher( - &self, - ) -> &tokio::sync::watch::Receiver { - &self.vm_objects.monitor_rx - } - - /// Asks to queue a request to start a source migration task for this VM. - /// The migration will have the supplied `migration_id` and will obtain its - /// connection to the target by calling `upgrade_fn` to obtain a future that - /// yields the necessary connection. - /// - /// This routine fails if the VM was not marked as a migration source or if - /// it has another pending request that precludes migration. Note that this - /// routine does not fail if the future returned from `upgrade_fn` fails to - /// produce a connection to the destination. - /// - /// On success, clients may query the instance's migration status to - /// determine how the migration has progressed. - pub fn request_migration_from< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, - >( - &self, - migration_id: Uuid, - conn: WebSocketStream, - protocol: crate::migrate::protocol::Protocol, - ) -> Result<(), VmControllerError> { - let mut inner = self.worker_state.inner.lock().unwrap(); - - // Check that the request can be enqueued before setting up the - // migration task. - if !inner.external_request_queue.migrate_as_source_will_enqueue()? { - return Ok(()); - } - - let migration_request = - self.launch_source_migration_task(migration_id, conn, protocol); - - // Unwrap is safe because the queue state was checked under the lock. - inner.external_request_queue.try_queue(migration_request).unwrap(); - self.worker_state.cv.notify_one(); - Ok(()) - } - - /// Launches a task that will execute a live migration out of this VM. - /// Returns a state change request message to queue to the state driver, - /// which will coordinate with this task to run the migration. 
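
`launch_source_migration_task` (next) spawns the task eagerly but parks it on a oneshot channel, so the expensive setup happens up front while the state driver still decides when the work begins. A runnable sketch of that gate, assuming only tokio:

    use tokio::sync::oneshot;

    async fn demo() {
        let (start_tx, start_rx) = oneshot::channel::<()>();

        // Created and scheduled now, but does no work yet.
        let task = tokio::spawn(async move {
            // Parked until told to start.
            start_rx.await.expect("starter dropped");
            // ... the migration protocol would run here ...
            "done"
        });

        // The state driver fires the gate when it dequeues the request.
        start_tx.send(()).expect("task exited early");
        assert_eq!(task.await.unwrap(), "done");
    }

    fn main() {
        tokio::runtime::Runtime::new().unwrap().block_on(demo());
    }
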
- fn launch_source_migration_task< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, - >( - &self, - migration_id: Uuid, - conn: WebSocketStream, - protocol: crate::migrate::protocol::Protocol, - ) -> ExternalRequest { - let log_for_task = - self.log.new(slog::o!("component" => "migrate_source_task")); - let ctrl_for_task = self.this.upgrade().unwrap(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, response_rx) = tokio::sync::mpsc::channel(1); - - // The migration process uses async operations when communicating with - // the migration target. Run that work on the async runtime. - info!(self.log, "Launching migration source task"); - let task = self.runtime_hdl.spawn(async move { - info!(log_for_task, "Waiting to be told to start"); - start_rx.await.unwrap(); - - info!(log_for_task, "Starting migration procedure"); - if let Err(e) = crate::migrate::source::migrate( - ctrl_for_task, - command_tx, - response_rx, - conn, - protocol, - ) - .await - { - error!(log_for_task, "Migration task failed: {}", e); - return Err(e); - } - - Ok(()) - }); - - ExternalRequest::MigrateAsSource { - migration_id, - task, - start_tx, - command_rx, - response_tx, - } - } - - /// Asks to queue a request to start a destination migration task for this - /// VM. The migration will have the supplied `migration_id` and will obtain - /// its connection to the source by calling `upgrade_fn` to obtain a future - /// that yields the necessary connection. - /// - /// This routine fails if the VM has already begun to run or if a previous - /// migration in was attempted (regardless of its outcome). Note that this - /// routine does not fail if the future returned from `upgrade_fn` - /// subsequently fails to produce a connection to the destination (though - /// the migration attempt will then fail). - /// - /// On success, clients may query the instance's migration status to - /// determine how the migration has progressed. - pub fn request_migration_into< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, - >( - &self, - migration_id: Uuid, - conn: WebSocketStream, - local_addr: SocketAddr, - protocol: crate::migrate::protocol::Protocol, - ) -> Result<(), VmControllerError> { - let mut inner = self.worker_state.inner.lock().unwrap(); - if !inner.external_request_queue.migrate_as_target_will_enqueue()? { - return Ok(()); - } - - // Check that the request can be enqueued before setting up the - // migration task. - let migration_request = self.launch_target_migration_task( - migration_id, - conn, - local_addr, - protocol, - ); - - // Unwrap is safe because the queue state was checked under the lock. - inner.external_request_queue.try_queue(migration_request).unwrap(); - self.worker_state.cv.notify_one(); - Ok(()) - } - - /// Launches a task that will execute a live migration into this VM. - /// Returns a state change request message to queue to the state driver, - /// which will coordinate with this task to run the migration. 
- fn launch_target_migration_task< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, - >( - &self, - migration_id: Uuid, - conn: WebSocketStream, - local_addr: SocketAddr, - protocol: crate::migrate::protocol::Protocol, - ) -> ExternalRequest { - let log_for_task = - self.log.new(slog::o!("component" => "migrate_source_task")); - let ctrl_for_task = self.this.upgrade().unwrap(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - - // The migration process uses async operations when communicating with - // the migration target. Run that work on the async runtime. - info!(self.log, "Launching migration target task"); - let task = self.runtime_hdl.spawn(async move { - info!(log_for_task, "Waiting to be told to start"); - start_rx.await.unwrap(); - - info!(log_for_task, "Starting migration procedure"); - if let Err(e) = crate::migrate::destination::migrate( - ctrl_for_task, - command_tx, - conn, - local_addr, - protocol, - ) - .await - { - error!(log_for_task, "Migration task failed: {}", e); - return Err(e); - } - - Ok(()) - }); - - ExternalRequest::MigrateAsTarget { - migration_id, - task, - start_tx, - command_rx, - } - } - - /// Handles a request to change the wrapped instance's state. - pub fn put_state( - &self, - requested: ApiInstanceStateRequested, - ) -> Result<(), VmControllerError> { - info!(self.log(), "Requested state {:?} via API", requested); - - self.worker_state - .queue_external_request(match requested { - ApiInstanceStateRequested::Run => ExternalRequest::Start, - ApiInstanceStateRequested::Stop => ExternalRequest::Stop, - ApiInstanceStateRequested::Reboot => ExternalRequest::Reboot, - }) - .map_err(Into::into) - } - - pub fn migrate_status(&self) -> ApiMigrateStatusResponse { - let mut published = - self.vm_objects.monitor_rx.borrow().migration.clone(); - - // There's a window between the point where a request to migrate returns - // and the point where the state worker actually picks up the migration - // and publishes its state. To ensure that migrations are visible as - // soon as they're queued, pick up the queued migration (if there is - // one) and insert it into the output in the appropriate position. The - // state driver will consume the pending migration before actually - // executing it. 
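
The merge the next few lines perform is simple but easy to get backwards: the pending (queued but unstarted) migration must overwrite only the slot that matches its role. A sketch with pared-down stand-ins for the API status types:

    use uuid::Uuid;

    #[derive(Clone, Debug, Default)]
    struct Status {
        migration_in: Option<Uuid>,
        migration_out: Option<Uuid>,
    }

    enum Role {
        Source,
        Destination,
    }

    // Overlay a queued-but-unprocessed migration onto the last published
    // status so callers see it as soon as the request is accepted.
    fn merged_status(published: &Status, pending: Option<(Uuid, Role)>) -> Status {
        let mut out = published.clone();
        match pending {
            Some((id, Role::Destination)) => out.migration_in = Some(id),
            Some((id, Role::Source)) => out.migration_out = Some(id),
            None => {}
        }
        out
    }
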
- let inner = self.worker_state.inner.lock().unwrap(); - if let Some((id, role)) = inner.pending_migration_id { - match role { - MigrateRole::Destination => { - published.migration_in = Some(ApiMigrationStatus { - id, - state: ApiMigrationState::Sync, - }); - } - MigrateRole::Source => { - published.migration_out = Some(ApiMigrationStatus { - id, - state: ApiMigrationState::Sync, - }); - } - } - } - - published - } - - pub(crate) fn for_each_device( - &self, - mut func: impl FnMut(&str, &Arc), - ) { - for (name, dev) in self.vm_objects.devices.iter() { - func(name, dev); - } - } - - pub(crate) fn for_each_device_fallible( - &self, - mut func: F, - ) -> std::result::Result<(), E> - where - F: FnMut( - &str, - &Arc, - ) -> std::result::Result<(), E>, - { - for (name, dev) in self.vm_objects.devices.iter() { - func(name, dev)?; - } - Ok(()) - } - - pub(crate) fn device_by_name( - &self, - name: &String, - ) -> Option> { - self.vm_objects.devices.get(name).cloned() - } -} - -impl Drop for VmController { - fn drop(&mut self) { - info!(self.log, "Dropping VM controller"); - let machine = self - .vm_objects - .machine - .take() - .expect("VM controller should have an instance at drop"); - - // Destroy the underlying kernel VMM resource - let hdl = machine.destroy(); - let _ = hdl.destroy(); - - // Detach block backends so they can do any final clean-up - debug!(self.log, "Detaching block backends"); - for backend in self.vm_objects.block_backends.values() { - let _ = backend.attachment().detach(); - } - - // A fully-initialized controller is kept alive in part by its worker - // thread, which owns the sender side of the controller's state-change - // notification channel. Since the controller is being dropped, the - // worker is gone, so reclaim the sender from it and use it to publish - // that the controller is being destroyed. - if let Some(thread) = self.worker_thread.lock().unwrap().take() { - let api_state = thread.join().unwrap(); - let old_state = api_state.borrow().clone(); - - // Preserve the instance's state if it failed so that clients can - // distinguish gracefully-stopped instances from failed instances. - if matches!(old_state.state, ApiInstanceState::Failed) { - return; - } - - let gen = old_state.gen + 1; - let _ = api_state.send(ApiMonitoredState { - gen, - state: ApiInstanceState::Destroyed, - ..old_state - }); - } - } -} - -/// An event that a VM's state driver must process. -#[derive(Debug)] -enum StateDriverEvent { - /// An event that was raised from within the guest. - Guest(GuestEvent), - - /// An event that was raised by an external entity (e.g. an API call to the - /// server). - External(ExternalRequest), -} - -/// Commands issued by the state driver back to its VM controller. These are -/// abstracted into a trait to allow them to be mocked out for testing without -/// having to supply mock implementations of the rest of the VM controller's -/// functionality. -#[cfg_attr(test, mockall::automock)] -trait StateDriverVmController { - /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated - /// devices and vCPUs are brought to a consistent state. - /// - /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl) - /// will fail. A corresponding `resume_vm()` call must be made prior to - /// allowing vCPU tasks to run. - fn pause_vm(&self); - - /// Resume a previously-paused VM at the kernel VMM level. This will resume - /// any timers driving in-kernel-emulated devices, and allow the vCPU to run - /// again. 
- fn resume_vm(&self); - - /// Sends a reset request to each device in the instance, then sends a - /// reset command to the instance's bhyve VM. - fn reset_devices_and_machine(&self); - - /// Sends each device (and backend) a start request. - fn start_devices(&self) -> anyhow::Result<()>; - - /// Sends each device a pause request, then waits for all these requests to - /// complete. - fn pause_devices(&self); - - /// Sends each device a resume request. - fn resume_devices(&self); - - /// Sends each device (and backend) a halt request. - fn halt_devices(&self); - - /// Resets the state of each vCPU in the instance to its on-reboot state. - fn reset_vcpu_state(&self); -} - -impl StateDriverVmController for VmController { - fn pause_vm(&self) { - info!(self.log, "Pausing kernel VMM resources"); - self.machine().hdl.pause().expect("VM_PAUSE should succeed") - } - - fn resume_vm(&self) { - info!(self.log, "Resuming kernel VMM resources"); - self.machine().hdl.resume().expect("VM_RESUME should succeed") - } - - fn reset_devices_and_machine(&self) { - let _rtguard = self.runtime_hdl.enter(); - self.for_each_device(|name, dev| { - info!(self.log, "Sending reset request to {}", name); - dev.reset(); - }); - - self.machine().reinitialize().unwrap(); - } - - fn start_devices(&self) -> anyhow::Result<()> { - let _rtguard = self.runtime_hdl.enter(); - self.for_each_device_fallible(|name, dev| { - info!(self.log, "Sending startup complete to {}", name); - let res = dev.start(); - if let Err(e) = &res { - error!(self.log, "Startup failed for {}: {:?}", name, e); - } - res - })?; - for (name, backend) in self.vm_objects.block_backends.iter() { - debug!(self.log, "Starting block backend {}", name); - let res = backend.start(); - if let Err(e) = &res { - error!(self.log, "Startup failed for {}: {:?}", name, e); - return res; - } - } - Ok(()) - } - - fn pause_devices(&self) { - let _rtguard = self.runtime_hdl.enter(); - self.for_each_device(|name, dev| { - info!(self.log, "Sending pause request to {}", name); - dev.pause(); - }); - - // Create a Future that returns the name of the device that has finished - // pausing: this allows keeping track of which devices have and haven't - // completed pausing yet. 
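
The hand-rolled `NamedFuture` that follows exists only to tag each pause future with its device name. The same bookkeeping can be written with plain `async move` blocks; a sketch, using oneshot receivers as stand-ins for `dev.paused()`:

    use futures::stream::{FuturesUnordered, StreamExt};
    use tokio::sync::oneshot;

    // Wrap each pause future in an async block that resolves to the
    // device's name; the stream then yields names in completion order.
    async fn wait_all_paused(devs: Vec<(String, oneshot::Receiver<()>)>) {
        let mut stream: FuturesUnordered<_> = devs
            .into_iter()
            .map(|(name, paused)| async move {
                let _ = paused.await;
                name
            })
            .collect();

        while let Some(name) = stream.next().await {
            println!("device {name} finished pausing");
        }
    }
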
-        struct NamedFuture {
-            name: String,
-            future: BoxFuture<'static, ()>,
-        }
-
-        impl std::future::Future for NamedFuture {
-            type Output = String;
-
-            fn poll(
-                self: Pin<&mut Self>,
-                cx: &mut Context<'_>,
-            ) -> Poll<Self::Output> {
-                let mut_self = self.get_mut();
-                match Pin::new(&mut mut_self.future).poll(cx) {
-                    Poll::Pending => Poll::Pending,
-                    Poll::Ready(()) => Poll::Ready(mut_self.name.clone()),
-                }
-            }
-        }
-
-        info!(self.log, "Waiting for devices to pause");
-        self.runtime_hdl.block_on(async {
-            let mut stream: FuturesUnordered<_> = self
-                .vm_objects
-                .devices
-                .iter()
-                .map(|(name, dev)| {
-                    info!(self.log, "Got paused future from dev {}", name);
-                    NamedFuture { name: name.to_string(), future: dev.paused() }
-                })
-                .collect();
-
-            loop {
-                match stream.next().await {
-                    Some(name) => {
-                        info!(self.log, "dev {} completed pause", name);
-                    }
-
-                    None => {
-                        // done
-                        info!(self.log, "all devices paused");
-                        break;
-                    }
-                }
-            }
-        });
-    }
-
-    fn resume_devices(&self) {
-        let _rtguard = self.runtime_hdl.enter();
-        self.for_each_device(|name, dev| {
-            info!(self.log, "Sending resume request to {}", name);
-            dev.resume();
-        });
-    }
-
-    fn halt_devices(&self) {
-        let _rtguard = self.runtime_hdl.enter();
-        self.for_each_device(|name, dev| {
-            info!(self.log, "Sending halt request to {}", name);
-            dev.halt();
-        });
-        for (name, backend) in self.vm_objects.block_backends.iter() {
-            debug!(self.log, "Stopping and detaching block backend {}", name);
-            backend.stop();
-            if let Err(err) = backend.detach() {
-                error!(
-                    self.log,
-                    "Error while detaching block backend {name}: {err:?}",
-                );
-            }
-        }
-    }
-
-    fn reset_vcpu_state(&self) {
-        for vcpu in self.machine().vcpus.iter() {
-            info!(self.log, "Resetting vCPU {}", vcpu.id);
-            vcpu.activate().unwrap();
-            vcpu.reboot_state().unwrap();
-            if vcpu.is_bsp() {
-                info!(self.log, "Resetting BSP vCPU {}", vcpu.id);
-                vcpu.set_run_state(propolis::bhyve_api::VRS_RUN, None).unwrap();
-                vcpu.set_reg(
-                    propolis::bhyve_api::vm_reg_name::VM_REG_GUEST_RIP,
-                    0xfff0,
-                )
-                .unwrap();
-            }
-        }
-    }
-}
diff --git a/bin/propolis-server/src/lib/vm_old/request_queue.rs b/bin/propolis-server/src/lib/vm_old/request_queue.rs
deleted file mode 100644
index 9d23faa26..000000000
--- a/bin/propolis-server/src/lib/vm_old/request_queue.rs
+++ /dev/null
@@ -1,604 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at https://mozilla.org/MPL/2.0/.
-
-//! Handles requests to change a Propolis server's state via the external API.
-//!
-//! An instance accepts or rejects requests to change state based on a
-//! combination of its current state and its knowledge of the requests it has
-//! previously queued but not processed yet. The latter knowledge is used to
-//! reject requests that will never be fulfilled (because they're preceded by an
-//! action that will forbid them; consider rebooting after stopping) or that may
-//! need to be redirected to a migration target.
-//!
-//! The queue maintains a disposition for each kind of request that can be sent
-//! to it, which allows that request to be enqueued, denied, or silently ignored
-//! (for idempotency purposes). These dispositions can change as new requests
-//! are queued. The queue also provides callbacks to the VM state driver that
-//! allow the driver to advise the queue of state changes that further affect
-//! what requests should be accepted.
-//!
-//! Users who want to share a queue must wrap it in the synchronization objects
-//! of their choice.
-
-use std::collections::VecDeque;
-
-use slog::{debug, info, Logger};
-use thiserror::Error;
-use uuid::Uuid;
-
-use crate::migrate::MigrateError;
-
-use super::{
-    MigrateSourceCommand, MigrateSourceResponse, MigrateTargetCommand,
-};
-
-/// An external request made of a VM controller via the server API. Handled by
-/// the controller's state driver thread.
-#[derive(Debug)]
-pub enum ExternalRequest {
-    /// Initializes the VM through live migration by running a
-    /// migration-destination task.
-    MigrateAsTarget {
-        /// The ID of the live migration to use when initializing.
-        migration_id: Uuid,
-
-        /// A handle to the task that will execute the migration procedure.
-        task: tokio::task::JoinHandle<Result<(), MigrateError>>,
-
-        /// The sender side of a one-shot channel that, when signaled, tells the
-        /// migration task to start its work.
-        start_tx: tokio::sync::oneshot::Sender<()>,
-
-        /// A channel that receives commands from the migration task.
-        command_rx: tokio::sync::mpsc::Receiver<MigrateTargetCommand>,
-    },
-
-    /// Resets all the VM's devices and CPUs, then starts the VM.
-    Start,
-
-    /// Asks the state worker to start a migration-source task.
-    MigrateAsSource {
-        /// The ID of the live migration for which this VM will be the source.
-        migration_id: Uuid,
-
-        /// A handle to the task that will execute the migration procedure.
-        task: tokio::task::JoinHandle<Result<(), MigrateError>>,
-
-        /// The sender side of a one-shot channel that, when signaled, tells the
-        /// migration task to start its work.
-        start_tx: tokio::sync::oneshot::Sender<()>,
-
-        /// A channel that receives commands from the migration task.
-        command_rx: tokio::sync::mpsc::Receiver<MigrateSourceCommand>,
-
-        /// A channel used to send responses to migration commands.
-        response_tx: tokio::sync::mpsc::Sender<MigrateSourceResponse>,
-    },
-
-    /// Resets the guest by pausing all devices, resetting them to their
-    /// cold-boot states, and resuming the devices. Note that this is not a
-    /// graceful reboot and does not coordinate with guest software.
-    Reboot,
-
-    /// Halts the VM. Note that this is not a graceful shutdown and does not
-    /// coordinate with guest software.
-    Stop,
-}
-
-/// A set of reasons why a request to queue an external state transition can
-/// fail.
-#[derive(Copy, Clone, Debug, Error)]
-pub enum RequestDeniedReason {
-    #[error("Operation requires an active instance")]
-    InstanceNotActive,
-
-    #[error("Already migrating into this instance")]
-    MigrationTargetInProgress,
-
-    #[error("Instance is currently starting")]
-    StartInProgress,
-
-    #[error("Instance is already a migration source")]
-    AlreadyMigrationSource,
-
-    #[error("Operation cannot be performed on a migration source")]
-    InvalidRequestForMigrationSource,
-
-    #[error("Instance is preparing to stop")]
-    HaltPending,
-
-    #[error("Instance failed to start or halted due to a failure")]
-    InstanceFailed,
-}
-
-/// The set of instance state changes that should change the dispositions of
-/// future requests to the queue.
-#[derive(Copy, Clone, Debug)]
-pub enum InstanceStateChange {
-    StartedRunning,
-    Rebooted,
-    Stopped,
-    Failed,
-}
-
-/// A reason for a change in the queue's request dispositions.
-#[derive(Debug)]
-enum DispositionChangeReason<'a> {
-    ApiRequest(&'a ExternalRequest),
-    StateChange(InstanceStateChange),
-}
-
-/// The possible methods of handling a request to queue a state change.
-#[derive(Copy, Clone, Debug)]
-enum RequestDisposition {
-    /// Put the state change on the queue.
-    Enqueue,
-
-    /// Drop the state change silently. This is used to make requests appear
-    /// idempotent to callers without making the state driver deal with the
-    /// consequences of queuing the same state change request twice.
-    Ignore,
-
-    /// Deny the request to change state.
-    Deny(RequestDeniedReason),
-}
-
-/// The current disposition for each kind of incoming request.
-#[derive(Copy, Clone, Debug)]
-struct AllowedRequests {
-    migrate_as_target: RequestDisposition,
-    start: RequestDisposition,
-    migrate_as_source: RequestDisposition,
-    reboot: RequestDisposition,
-    stop: RequestDisposition,
-}
-
-#[derive(Debug)]
-pub struct ExternalRequestQueue {
-    queue: VecDeque<ExternalRequest>,
-    allowed: AllowedRequests,
-    log: Logger,
-}
-
-impl ExternalRequestQueue {
-    /// Creates a new queue that logs to the supplied logger.
-    pub fn new(log: Logger) -> Self {
-        Self {
-            queue: VecDeque::new(),
-            allowed: AllowedRequests {
-                migrate_as_target: RequestDisposition::Enqueue,
-                start: RequestDisposition::Enqueue,
-                migrate_as_source: RequestDisposition::Deny(
-                    RequestDeniedReason::InstanceNotActive,
-                ),
-                reboot: RequestDisposition::Deny(
-                    RequestDeniedReason::InstanceNotActive,
-                ),
-                stop: RequestDisposition::Enqueue,
-            },
-            log,
-        }
-    }
-
-    /// Pops the request at the front of the queue.
-    pub fn pop_front(&mut self) -> Option<ExternalRequest> {
-        self.queue.pop_front()
-    }
-
-    /// Indicates whether the queue is empty.
-    pub fn is_empty(&self) -> bool {
-        self.queue.is_empty()
-    }
-
-    /// Asks to place the supplied request on the queue. If the request is
-    /// enqueued, updates the dispositions to use for future requests.
-    pub fn try_queue(
-        &mut self,
-        request: ExternalRequest,
-    ) -> Result<(), RequestDeniedReason> {
-        let disposition = match request {
-            ExternalRequest::MigrateAsTarget { .. } => {
-                self.allowed.migrate_as_target
-            }
-            ExternalRequest::Start => self.allowed.start,
-            ExternalRequest::MigrateAsSource { .. } => {
-                self.allowed.migrate_as_source
-            }
-            ExternalRequest::Reboot => self.allowed.reboot,
-
-            // Requests to stop always succeed. Note that a request to stop a VM
-            // that hasn't started should still be queued to the state worker so
-            // that the worker can exit and drop its references to the instance.
-            ExternalRequest::Stop => self.allowed.stop,
-        };
-
-        info!(&self.log, "Queuing external request";
-              "request" => ?request,
-              "disposition" => ?disposition);
-
-        match disposition {
-            RequestDisposition::Enqueue => {}
-            RequestDisposition::Ignore => return Ok(()),
-            RequestDisposition::Deny(reason) => return Err(reason),
-        };
-
-        self.allowed = self.get_new_dispositions(
-            DispositionChangeReason::ApiRequest(&request),
-        );
-        self.queue.push_back(request);
-        Ok(())
-    }
-
-    /// Notifies the queue that the instance's state has changed and that its
-    /// disposition should be updated accordingly.
-    pub fn notify_instance_state_change(&mut self, state: InstanceStateChange) {
-        self.allowed = self
-            .get_new_dispositions(DispositionChangeReason::StateChange(state));
-    }
-
-    /// Indicates whether the queue would allow a request to migrate into this
-    /// instance. This can be used to avoid setting up migration tasks for
-    /// requests that will ultimately be denied.
-    ///
-    /// # Return value
-    ///
-    /// - `Ok(true)` if the request will be queued.
-    /// - `Ok(false)` if the request is allowed for idempotency reasons but will
-    ///   not be queued.
-    /// - `Err` if the request is forbidden.
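
The point of the two `*_will_enqueue` probes defined next is ordering: `request_migration_from`/`request_migration_into` (earlier in this patch) hold the queue lock across the check, the task setup, and the final `try_queue`, so the disposition cannot change in between and the trailing unwrap is safe. A distilled sketch of that check-then-enqueue shape, with hypothetical types:

    use std::sync::Mutex;

    #[derive(Debug)]
    enum Deny {
        NotActive,
    }

    struct Queue {
        allow: bool,
        items: Vec<&'static str>,
    }

    impl Queue {
        fn will_enqueue(&self) -> Result<bool, Deny> {
            if self.allow { Ok(true) } else { Err(Deny::NotActive) }
        }

        fn try_queue(&mut self, item: &'static str) -> Result<(), Deny> {
            if self.allow {
                self.items.push(item);
                Ok(())
            } else {
                Err(Deny::NotActive)
            }
        }
    }

    fn request(shared: &Mutex<Queue>) -> Result<(), Deny> {
        // One lock acquisition covers both the check and the enqueue.
        let mut q = shared.lock().unwrap();
        if !q.will_enqueue()? {
            return Ok(()); // idempotent no-op, nothing to set up
        }
        // ...expensive task setup would happen here, still under the lock...
        q.try_queue("migrate-as-source").expect("checked above");
        Ok(())
    }
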
- pub fn migrate_as_target_will_enqueue( - &self, - ) -> Result { - match self.allowed.migrate_as_target { - RequestDisposition::Enqueue => Ok(true), - RequestDisposition::Ignore => Ok(false), - RequestDisposition::Deny(reason) => Err(reason), - } - } - - /// Indicates whether the queue would allow a request to migrate out of this - /// instance. This can be used to avoid setting up migration tasks for - /// requests that will ultimately be denied. - /// - /// # Return value - /// - /// - `Ok(true)` if the request will be queued. - /// - `Ok(false)` if the request is allowed for idempotency reasons but will - /// not be queued. - /// - `Err` if the request is forbidden. - pub fn migrate_as_source_will_enqueue( - &self, - ) -> Result { - assert!(!matches!( - self.allowed.migrate_as_source, - RequestDisposition::Ignore - )); - - match self.allowed.migrate_as_source { - RequestDisposition::Enqueue => Ok(true), - RequestDisposition::Ignore => unreachable!(), - RequestDisposition::Deny(reason) => Err(reason), - } - } - - /// Computes a new set of queue dispositions given the current state of the - /// queue and the event that is changing those dispositions. - fn get_new_dispositions( - &self, - reason: DispositionChangeReason, - ) -> AllowedRequests { - debug!(self.log, "Computing new queue dispositions"; - "reason" => ?reason); - - use DispositionChangeReason as ChangeReason; - use RequestDeniedReason as DenyReason; - use RequestDisposition as Disposition; - match reason { - // Starting the instance, whether via migration or cold boot, - // forecloses on further attempts to migrate in. For idempotency, - // further requests to start are allowed when an instance-starting - // transition is enqueued. - ChangeReason::ApiRequest(ExternalRequest::MigrateAsTarget { - .. - }) - | ChangeReason::ApiRequest(ExternalRequest::Start) => { - let (migrate_as_target_disposition, deny_reason) = match reason - { - // If this is a request to migrate in, make sure future - // requests to migrate in are handled idempotently. - ChangeReason::ApiRequest( - ExternalRequest::MigrateAsTarget { .. }, - ) => ( - Disposition::Ignore, - DenyReason::MigrationTargetInProgress, - ), - ChangeReason::ApiRequest(ExternalRequest::Start) => ( - Disposition::Deny(DenyReason::StartInProgress), - DenyReason::StartInProgress, - ), - _ => unreachable!(), - }; - - AllowedRequests { - migrate_as_target: migrate_as_target_disposition, - start: Disposition::Ignore, - migrate_as_source: Disposition::Deny(deny_reason), - reboot: Disposition::Deny(deny_reason), - stop: self.allowed.stop, - } - } - ChangeReason::ApiRequest(ExternalRequest::MigrateAsSource { - .. - }) => { - assert!(matches!(self.allowed.start, Disposition::Ignore)); - - // Requests to migrate into the instance should not be enqueued - // from this point, but whether they're dropped or ignored - // depends on how the instance was originally initialized. - assert!(!matches!( - self.allowed.migrate_as_target, - Disposition::Enqueue - )); - - AllowedRequests { - migrate_as_target: self.allowed.migrate_as_target, - start: self.allowed.start, - migrate_as_source: Disposition::Deny( - DenyReason::AlreadyMigrationSource, - ), - reboot: Disposition::Deny( - DenyReason::InvalidRequestForMigrationSource, - ), - stop: self.allowed.stop, - } - } - - // Requests to reboot prevent additional reboot requests from being - // queued, but do not affect other operations. 
- ChangeReason::ApiRequest(ExternalRequest::Reboot) => { - assert!(matches!(self.allowed.start, Disposition::Ignore)); - assert!(!matches!( - self.allowed.migrate_as_target, - Disposition::Enqueue - )); - - AllowedRequests { reboot: Disposition::Ignore, ..self.allowed } - } - - // Requests to stop the instance block other requests from being - // queued. Additional requests to stop are ignored for idempotency. - ChangeReason::ApiRequest(ExternalRequest::Stop) => { - AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::HaltPending, - ), - start: Disposition::Deny(DenyReason::HaltPending), - migrate_as_source: Disposition::Deny( - DenyReason::HaltPending, - ), - reboot: Disposition::Deny(DenyReason::HaltPending), - stop: Disposition::Ignore, - } - } - - // When an instance begins running, requests to migrate out of it or - // to reboot it become valid. - ChangeReason::StateChange(InstanceStateChange::StartedRunning) => { - AllowedRequests { - migrate_as_target: self.allowed.migrate_as_target, - start: self.allowed.start, - migrate_as_source: Disposition::Enqueue, - reboot: Disposition::Enqueue, - stop: self.allowed.stop, - } - } - - // When an instance finishes rebooting, allow new reboot requests to - // be queued again, unless reboot requests began to be denied in the - // meantime. - ChangeReason::StateChange(InstanceStateChange::Rebooted) => { - let new_reboot = - if let Disposition::Ignore = self.allowed.reboot { - Disposition::Enqueue - } else { - self.allowed.reboot - }; - - AllowedRequests { reboot: new_reboot, ..self.allowed } - } - - // When an instance stops or fails, requests to do anything other - // than stop it are denied with an appropriate deny reason. Note - // that an instance may stop or fail due to guest activity, so the - // previous dispositions for migrate and reboot requests may not be - // "deny". 
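
The `Stopped` and `Failed` arms that follow collapse to one rule: once the VM is gone, deny everything except stop, and pick the deny reason by how it went away. A sketch of just that table logic (field set trimmed to three):

    #[derive(Clone, Copy, Debug)]
    enum Disposition {
        Enqueue,
        Ignore,
        Deny(&'static str),
    }

    #[derive(Clone, Copy)]
    struct Allowed {
        start: Disposition,
        reboot: Disposition,
        stop: Disposition,
    }

    // After a stop or a failure, everything except "stop" is denied; the
    // reason records whether the instance halted cleanly or failed.
    fn after_halt(failed: bool, prev: &Allowed) -> Allowed {
        let why = if failed { "instance failed" } else { "instance not active" };
        Allowed {
            start: Disposition::Deny(why),
            reboot: Disposition::Deny(why),
            // A failed VM keeps its previous stop disposition so late stop
            // requests can still be queued; a stopped VM ignores them.
            stop: if failed { prev.stop } else { Disposition::Ignore },
        }
    }
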
- ChangeReason::StateChange(InstanceStateChange::Stopped) => { - AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::InstanceNotActive, - ), - start: Disposition::Deny(DenyReason::InstanceNotActive), - migrate_as_source: Disposition::Deny( - DenyReason::InstanceNotActive, - ), - reboot: Disposition::Deny(DenyReason::InstanceNotActive), - stop: Disposition::Ignore, - } - } - ChangeReason::StateChange(InstanceStateChange::Failed) => { - AllowedRequests { - migrate_as_target: Disposition::Deny( - DenyReason::InstanceFailed, - ), - start: Disposition::Deny(DenyReason::InstanceFailed), - migrate_as_source: Disposition::Deny( - DenyReason::InstanceFailed, - ), - reboot: Disposition::Deny(DenyReason::InstanceFailed), - stop: self.allowed.stop, - } - } - } - } -} - -#[cfg(test)] -mod test { - use super::*; - - use uuid::Uuid; - - fn test_logger() -> slog::Logger { - slog::Logger::root(slog::Discard, slog::o!()) - } - - fn make_migrate_as_target_request() -> ExternalRequest { - let task = tokio::task::spawn(async { Ok(()) }); - let (start_tx, _) = tokio::sync::oneshot::channel(); - let (_, command_rx) = tokio::sync::mpsc::channel(1); - ExternalRequest::MigrateAsTarget { - migration_id: Uuid::new_v4(), - task, - start_tx, - command_rx, - } - } - - fn make_migrate_as_source_request() -> ExternalRequest { - let task = tokio::task::spawn(async { Ok(()) }); - let (start_tx, _) = tokio::sync::oneshot::channel(); - let (_, command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, _) = tokio::sync::mpsc::channel(1); - ExternalRequest::MigrateAsSource { - migration_id: Uuid::new_v4(), - task, - start_tx, - command_rx, - response_tx, - } - } - - #[tokio::test] - async fn migrate_as_target_is_idempotent() { - let mut queue = ExternalRequestQueue::new(test_logger()); - - // Requests to migrate as a target should queue normally at first. - assert!(queue.migrate_as_target_will_enqueue().unwrap()); - - // After queuing such a request, subsequent requests should be allowed - // without enqueuing anything. - assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); - assert!(!queue.migrate_as_target_will_enqueue().unwrap()); - - // Pop the request and tell the queue the instance is running. - assert!(matches!( - queue.pop_front(), - Some(ExternalRequest::MigrateAsTarget { .. }) - )); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - - // Because the instance was started via migration in, future requests - // to migrate in should be allowed. - assert!(queue.try_queue(make_migrate_as_target_request()).is_ok()); - assert!(!queue.migrate_as_target_will_enqueue().unwrap()); - } - - #[tokio::test] - async fn migrate_as_target_is_forbidden_after_cold_boot() { - let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - - assert!(queue.migrate_as_target_will_enqueue().is_err()); - assert!(queue.try_queue(make_migrate_as_target_request()).is_err()); - } - - #[tokio::test] - async fn migrate_as_source_is_not_idempotent() { - // Simulate a running instance. - let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - - // Requests to migrate out should be allowed. 
- assert!(queue.migrate_as_source_will_enqueue().unwrap()); - assert!(queue.try_queue(make_migrate_as_source_request()).is_ok()); - - // Once the request is queued, other requests to migrate out are - // disallowed until the queued request is disposed of. - // - // This differs from the migration-in case in that requests to migrate - // in are issued by the sled agent as part of a saga (where idempotency - // is assumed), but requests to migrate out are issued by the target - // Propolis (which does not assume idempotency and issues only one - // request per migration attempt). - assert!(queue.migrate_as_source_will_enqueue().is_err()); - assert!(queue.try_queue(make_migrate_as_source_request()).is_err()); - - // If migration fails, the instance resumes running, and then another - // request to migrate out should be allowed. - assert!(matches!( - queue.pop_front(), - Some(ExternalRequest::MigrateAsSource { .. }) - )); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - assert!(queue.migrate_as_source_will_enqueue().unwrap()); - assert!(queue.try_queue(make_migrate_as_source_request()).is_ok()); - - // A successful migration stops the instance, which forecloses on future - // requests to migrate out. - queue.pop_front(); - queue.notify_instance_state_change(InstanceStateChange::Stopped); - assert!(queue.migrate_as_source_will_enqueue().is_err()); - assert!(queue.try_queue(make_migrate_as_source_request()).is_err()); - } - - #[tokio::test] - async fn stop_requests_enqueue_after_vm_failure() { - let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); - queue.notify_instance_state_change(InstanceStateChange::Failed); - - assert!(queue.try_queue(ExternalRequest::Stop).is_ok()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Stop))); - } - - #[tokio::test] - async fn reboot_requests_are_idempotent_except_when_stopping() { - let mut queue = ExternalRequestQueue::new(test_logger()); - assert!(queue.try_queue(ExternalRequest::Start).is_ok()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); - queue.notify_instance_state_change(InstanceStateChange::StartedRunning); - - // Once the instance is started, reboot requests should be allowed, but - // after the first, subsequent requests should be dropped for - // idempotency. - assert!(queue.is_empty()); - for _ in 0..5 { - assert!(queue.try_queue(ExternalRequest::Reboot).is_ok()); - } - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Reboot))); - assert!(queue.is_empty()); - - // Once the instance has rebooted, new requests can be queued. - queue.notify_instance_state_change(InstanceStateChange::Rebooted); - assert!(queue.try_queue(ExternalRequest::Reboot).is_ok()); - assert!(!queue.is_empty()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Reboot))); - queue.notify_instance_state_change(InstanceStateChange::Rebooted); - - // If a request to reboot is queued, and then a request to stop is - // queued, new requests to reboot should always fail, even after the - // instance finishes rebooting. 
- assert!(queue.try_queue(ExternalRequest::Reboot).is_ok()); - assert!(!queue.is_empty()); - assert!(queue.try_queue(ExternalRequest::Stop).is_ok()); - assert!(queue.try_queue(ExternalRequest::Reboot).is_err()); - assert!(matches!(queue.pop_front(), Some(ExternalRequest::Reboot))); - queue.notify_instance_state_change(InstanceStateChange::Rebooted); - assert!(queue.try_queue(ExternalRequest::Reboot).is_err()); - } -} diff --git a/bin/propolis-server/src/lib/vm_old/state_driver.rs b/bin/propolis-server/src/lib/vm_old/state_driver.rs deleted file mode 100644 index 4a2832f64..000000000 --- a/bin/propolis-server/src/lib/vm_old/state_driver.rs +++ /dev/null @@ -1,1384 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use std::sync::Arc; - -use crate::migrate::{MigrateError, MigrateRole}; -use crate::vcpu_tasks::VcpuTaskController; - -use super::{ - request_queue, ExternalRequest, GuestEvent, MigrateSourceCommand, - MigrateSourceResponse, MigrateTargetCommand, MigrateTaskEvent, - SharedVmState, StateDriverEvent, -}; - -use propolis_api_types::{ - InstanceMigrateStatusResponse as ApiMigrateStatusResponse, - InstanceMigrationStatus as ApiMigrationStatus, - InstanceState as ApiInstanceState, - InstanceStateMonitorResponse as ApiMonitoredState, - MigrationState as ApiMigrationState, -}; -use slog::{error, info, Logger}; -use uuid::Uuid; - -#[usdt::provider(provider = "propolis")] -mod probes { - fn state_driver_pause() {} - fn state_driver_resume() {} -} - -/// Tells the state driver whether or not to continue running after responding -/// to an event. -#[derive(Debug, PartialEq, Eq)] -enum HandleEventOutcome { - Continue, - Exit, -} - -/// A reason for starting a VM. -#[derive(Debug, PartialEq, Eq)] -enum VmStartReason { - MigratedIn, - ExplicitRequest, -} - -/// A wrapper around all the data needed to describe the status of a live -/// migration. -struct PublishedMigrationState { - state: ApiMigrationState, - id: Uuid, - role: MigrateRole, -} - -impl PublishedMigrationState { - /// Updates an `old` migration status response to contain information about - /// the migration described by `self`. - fn apply_to( - self, - old: ApiMigrateStatusResponse, - ) -> ApiMigrateStatusResponse { - let new = ApiMigrationStatus { id: self.id, state: self.state }; - match self.role { - MigrateRole::Destination => ApiMigrateStatusResponse { - migration_in: Some(new), - migration_out: old.migration_out, - }, - MigrateRole::Source => ApiMigrateStatusResponse { - migration_in: old.migration_in, - migration_out: Some(new), - }, - } - } -} - -enum PublishedState { - Instance(ApiInstanceState), - Migration(PublishedMigrationState), - Complete(ApiInstanceState, PublishedMigrationState), -} - -pub(super) struct StateDriver< - V: super::StateDriverVmController, - C: VcpuTaskController, -> { - /// A handle to the host server's tokio runtime, useful for spawning tasks - /// that need to interact with async code (e.g. spinning up migration - /// tasks). - runtime_hdl: tokio::runtime::Handle, - - /// A reference to the command sink to which this driver should send its - /// requests to send messages to devices or update other VM controller - /// state. - controller: Arc, - - /// A reference to the state this driver shares with its VM controller. - shared_state: Arc, - - /// The controller for this instance's vCPU tasks. 
- vcpu_tasks: C, - - /// The state worker's logger. - log: Logger, - - /// The generation number to use when publishing externally-visible state - /// updates. - state_gen: u64, - - /// Whether the worker's VM's devices are paused. - paused: bool, - - /// The sender side of the monitor that reflects the instance's current - /// externally-visible state (including migration state). - api_state_tx: tokio::sync::watch::Sender, -} - -impl StateDriver -where - V: super::StateDriverVmController, - C: VcpuTaskController, -{ - /// Constructs a new state driver context. - pub(super) fn new( - runtime_hdl: tokio::runtime::Handle, - controller: Arc, - shared_controller_state: Arc, - vcpu_tasks: C, - log: Logger, - api_state_tx: tokio::sync::watch::Sender, - ) -> Self { - Self { - runtime_hdl, - controller, - shared_state: shared_controller_state, - vcpu_tasks, - log, - state_gen: 0, - paused: false, - api_state_tx, - } - } - - /// Yields the current externally-visible instance state. - fn get_instance_state(&self) -> ApiInstanceState { - self.api_state_tx.borrow().state - } - - /// Retrieves the most recently published migration state from the external - /// migration state channel. - /// - /// This function does not return the borrowed monitor, so the state may - /// change again as soon as this function returns. - fn get_migration_status(&self) -> ApiMigrateStatusResponse { - self.api_state_tx.borrow().migration.clone() - } - - /// Sets the published instance and/or migration state and increases the - /// state generation number. - fn set_published_state(&mut self, state: PublishedState) { - let (instance_state, migration_state) = match state { - PublishedState::Instance(i) => (Some(i), None), - PublishedState::Migration(m) => (None, Some(m)), - PublishedState::Complete(i, m) => (Some(i), Some(m)), - }; - - let ApiMonitoredState { - state: old_state, - migration: old_migration, - .. - } = self.api_state_tx.borrow().clone(); - - let state = instance_state.unwrap_or(old_state); - let migration = if let Some(migration_state) = migration_state { - migration_state.apply_to(old_migration) - } else { - old_migration - }; - - info!(self.log, "publishing new instance state"; - "gen" => self.state_gen, - "state" => ?state, - "migration" => ?migration); - - self.state_gen += 1; - let _ = self.api_state_tx.send(ApiMonitoredState { - gen: self.state_gen, - state, - migration, - }); - } - - /// Publishes the supplied externally-visible instance state to the external - /// instance state channel. - fn set_instance_state(&mut self, state: ApiInstanceState) { - self.set_published_state(PublishedState::Instance(state)); - } - - /// Publishes the supplied externally-visible migration status to the - /// instance state channel. - fn set_migration_state( - &mut self, - role: MigrateRole, - migration_id: Uuid, - state: ApiMigrationState, - ) { - self.set_published_state(PublishedState::Migration( - PublishedMigrationState { state, id: migration_id, role }, - )); - } - - /// Publishes that an instance is migrating and sets its migration state in - /// a single transaction, then consumes the pending migration information - /// from the shared VM state block. - fn publish_migration_start( - &mut self, - migration_id: Uuid, - role: MigrateRole, - ) { - // Order matters here. The 'pending migration' field exists so that - // migration status is available through the external API as soon as an - // external request to migrate returns, even if the migration hasn't yet - // been picked up off the queue. 
To ensure the migration is continuously - // visible, publish the "actual" migration before consuming the pending - // one. - self.set_published_state(PublishedState::Complete( - ApiInstanceState::Migrating, - PublishedMigrationState { - state: ApiMigrationState::Sync, - id: migration_id, - role, - }, - )); - - self.shared_state.clear_pending_migration(); - } - - /// Manages an instance's lifecycle once it has moved to the Running state. - pub(super) fn run_state_worker( - mut self, - ) -> tokio::sync::watch::Sender { - info!(self.log, "State worker launched"); - - loop { - let event = self.shared_state.wait_for_next_event(); - info!(self.log, "State worker handling event"; "event" => ?event); - - let outcome = self.handle_event(event); - info!(self.log, "State worker handled event"; "outcome" => ?outcome); - if matches!(outcome, HandleEventOutcome::Exit) { - break; - } - } - - info!(self.log, "State worker exiting"); - - self.api_state_tx - } - - fn handle_event(&mut self, event: StateDriverEvent) -> HandleEventOutcome { - let next_action = match event { - StateDriverEvent::Guest(guest_event) => { - return self.handle_guest_event(guest_event); - } - StateDriverEvent::External(external_event) => external_event, - }; - - match next_action { - ExternalRequest::MigrateAsTarget { - migration_id, - task, - start_tx, - command_rx, - } => { - self.migrate_as_target( - migration_id, - task, - start_tx, - command_rx, - ); - HandleEventOutcome::Continue - } - ExternalRequest::Start => { - self.start_vm(VmStartReason::ExplicitRequest); - HandleEventOutcome::Continue - } - ExternalRequest::Reboot => { - self.do_reboot(); - HandleEventOutcome::Continue - } - ExternalRequest::MigrateAsSource { - migration_id, - task, - start_tx, - command_rx, - response_tx, - } => { - self.migrate_as_source( - migration_id, - task, - start_tx, - command_rx, - response_tx, - ); - HandleEventOutcome::Continue - } - ExternalRequest::Stop => { - self.do_halt(); - HandleEventOutcome::Exit - } - } - } - - fn handle_guest_event(&mut self, event: GuestEvent) -> HandleEventOutcome { - match event { - GuestEvent::VcpuSuspendHalt(_when) => { - info!(self.log, "Halting due to VM suspend event",); - self.do_halt(); - HandleEventOutcome::Exit - } - GuestEvent::VcpuSuspendReset(_when) => { - info!(self.log, "Resetting due to VM suspend event"); - self.do_reboot(); - HandleEventOutcome::Continue - } - GuestEvent::VcpuSuspendTripleFault(vcpu_id, _when) => { - info!( - self.log, - "Resetting due to triple fault on vCPU {}", vcpu_id - ); - self.do_reboot(); - HandleEventOutcome::Continue - } - GuestEvent::ChipsetHalt => { - info!(self.log, "Halting due to chipset-driven halt"); - self.do_halt(); - HandleEventOutcome::Exit - } - GuestEvent::ChipsetReset => { - info!(self.log, "Resetting due to chipset-driven reset"); - self.do_reboot(); - HandleEventOutcome::Continue - } - } - } - - fn start_vm(&mut self, start_reason: VmStartReason) { - info!(self.log, "Starting instance"; "reason" => ?start_reason); - - // Only move to the Starting state if this VM is starting by explicit - // request (as opposed to the implicit start that happens after a - // migration in). In this case, no one has initialized vCPU state yet, - // so explicitly initialize it here. - // - // In the migration-in case, remain in the Migrating state until the - // VM is actually running. Note that this is contractual behavior--sled - // agent relies on this to represent that a migrating instance is - // continuously running through a successful migration. 
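
Every state publication in this file funnels through `set_published_state` above, which bumps a generation number so watchers can tell successive publications apart even when the state value repeats. A minimal sketch of that publish pattern over a `tokio::sync::watch` channel (field renamed `generation` here; the real struct calls it `gen`):

    use tokio::sync::watch;

    #[derive(Clone, Debug)]
    struct Monitored {
        generation: u64,
        state: &'static str,
    }

    // Bump the generation on every send so "running -> running" is still
    // observable as two distinct publications.
    fn publish(tx: &watch::Sender<Monitored>, generation: &mut u64, state: &'static str) {
        *generation += 1;
        let _ = tx.send(Monitored { generation: *generation, state });
    }

    fn main() {
        let (tx, rx) = watch::channel(Monitored { generation: 0, state: "creating" });
        let mut generation = 0;
        publish(&tx, &mut generation, "running");
        publish(&tx, &mut generation, "running"); // same state, new generation
        assert_eq!(rx.borrow().generation, 2);
    }
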
- match start_reason { - VmStartReason::ExplicitRequest => { - self.set_instance_state(ApiInstanceState::Starting); - self.reset_vcpus(); - } - VmStartReason::MigratedIn => { - assert_eq!( - self.get_instance_state(), - ApiInstanceState::Migrating - ); - // Signal the kernel VMM to resume devices which are handled by - // the in-kernel emulation. They were kept paused for - // consistency while migration state was loaded. - self.controller.resume_vm(); - } - } - - match self.controller.start_devices() { - Ok(()) => { - self.vcpu_tasks.resume_all(); - self.publish_steady_state(ApiInstanceState::Running); - } - Err(e) => { - error!(&self.log, "Failed to start devices: {:?}", e); - self.publish_steady_state(ApiInstanceState::Failed); - } - } - } - - fn do_reboot(&mut self) { - info!(self.log, "Resetting instance"); - - self.set_instance_state(ApiInstanceState::Rebooting); - - // Reboot is implemented as a pause -> reset -> resume transition. - // - // First, pause the vCPUs and all devices so no partially-completed - // work is present. - self.vcpu_tasks.pause_all(); - self.controller.pause_devices(); - - // Reset all the entities and the VM's bhyve state, then reset the - // vCPUs. The vCPU reset must come after the bhyve reset. - self.controller.reset_devices_and_machine(); - self.reset_vcpus(); - - // Resume devices so they're ready to do more work, then resume vCPUs. - self.controller.resume_devices(); - self.vcpu_tasks.resume_all(); - - // Notify the request queue that this reboot request was processed. - // This does not use the `publish_steady_state` path because the queue - // treats an instance's initial transition to "Running" as a one-time - // event that's different from a return to the running state from a - // transient intermediate state. - self.notify_request_queue(request_queue::InstanceStateChange::Rebooted); - self.set_instance_state(ApiInstanceState::Running); - } - - fn do_halt(&mut self) { - info!(self.log, "Stopping instance"); - self.set_instance_state(ApiInstanceState::Stopping); - - // Entities expect to be paused before being halted. Note that the VM - // may be paused already if it is being torn down after a successful - // migration out. - if !self.paused { - self.pause(); - } - - self.vcpu_tasks.exit_all(); - self.controller.halt_devices(); - self.publish_steady_state(ApiInstanceState::Stopped); - } - - fn migrate_as_target( - &mut self, - migration_id: Uuid, - mut task: tokio::task::JoinHandle>, - start_tx: tokio::sync::oneshot::Sender<()>, - mut command_rx: tokio::sync::mpsc::Receiver, - ) { - self.publish_migration_start(migration_id, MigrateRole::Destination); - - // Ensure the VM's vCPUs are activated properly so that they can enter - // the guest after migration. Do this before allowing the migration task - // to start so that reset doesn't overwrite any state written by - // migration. - self.reset_vcpus(); - - // Place the VM in a paused state so we can load emulated device state - // in a consistent manner - self.controller.pause_vm(); - - start_tx.send(()).unwrap(); - loop { - let action = self.runtime_hdl.block_on(async { - Self::next_migrate_task_event( - &mut task, - &mut command_rx, - &self.log, - ) - .await - }); - - match action { - MigrateTaskEvent::TaskExited(res) => { - if res.is_ok() { - // Clients that observe that migration has finished - // need to observe that the instance is running before - // they are guaranteed to be able to do anything else - // that requires a running instance. 
- assert!(matches!( - self.get_migration_status() - .migration_in - .unwrap() - .state, - ApiMigrationState::Finish - )); - - self.start_vm(VmStartReason::MigratedIn); - } else { - assert!(matches!( - self.get_migration_status() - .migration_in - .unwrap() - .state, - ApiMigrationState::Error - )); - - // Resume the kernel VM so that if this state driver is - // asked to halt, the pause resulting therefrom won't - // observe that the VM is already paused. - self.controller.resume_vm(); - self.publish_steady_state(ApiInstanceState::Failed); - }; - - break; - } - MigrateTaskEvent::Command( - MigrateTargetCommand::UpdateState(state), - ) => { - self.set_migration_state( - MigrateRole::Destination, - migration_id, - state, - ); - } - } - } - } - - fn migrate_as_source( - &mut self, - migration_id: Uuid, - mut task: tokio::task::JoinHandle>, - start_tx: tokio::sync::oneshot::Sender<()>, - mut command_rx: tokio::sync::mpsc::Receiver, - response_tx: tokio::sync::mpsc::Sender, - ) { - self.publish_migration_start(migration_id, MigrateRole::Source); - start_tx.send(()).unwrap(); - - // Wait either for the migration task to exit or for it to ask the - // worker to pause or resume the instance's devices. - loop { - let action = self.runtime_hdl.block_on(async { - Self::next_migrate_task_event( - &mut task, - &mut command_rx, - &self.log, - ) - .await - }); - - match action { - // If the task exited, bubble its result back up to the main - // state worker loop to decide on the instance's next state. - // - // If migration failed while devices were paused, this instance - // is allowed to resume, so resume its components here. - MigrateTaskEvent::TaskExited(res) => { - if res.is_ok() { - assert!(matches!( - self.get_migration_status() - .migration_out - .unwrap() - .state, - ApiMigrationState::Finish - )); - - self.shared_state - .queue_external_request(ExternalRequest::Stop) - .expect("can always queue a request to stop"); - } else { - assert!(matches!( - self.get_migration_status() - .migration_out - .unwrap() - .state, - ApiMigrationState::Error - )); - - if self.paused { - self.resume(); - self.publish_steady_state( - ApiInstanceState::Running, - ); - } - } - - break; - } - MigrateTaskEvent::Command(cmd) => match cmd { - MigrateSourceCommand::UpdateState(state) => { - self.set_migration_state( - MigrateRole::Source, - migration_id, - state, - ); - } - MigrateSourceCommand::Pause => { - self.pause(); - response_tx - .blocking_send(MigrateSourceResponse::Pause(Ok(()))) - .unwrap(); - } - }, - } - } - } - - async fn next_migrate_task_event( - task: &mut tokio::task::JoinHandle>, - command_rx: &mut tokio::sync::mpsc::Receiver, - log: &Logger, - ) -> MigrateTaskEvent { - if let Some(cmd) = command_rx.recv().await { - return MigrateTaskEvent::Command(cmd); - } - - // The sender side of the command channel is dropped, which means the - // migration task is exiting. Wait for it to finish and snag its result. 
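
The match that follows leans on a channel invariant: `recv()` returned `None`, so every sender is dropped, which here means the task is wrapping up and `task.await` will complete promptly. A compilable sketch of the same two-phase wait (`String` commands standing in for the migrate command enums):

    use tokio::sync::mpsc;
    use tokio::task::JoinHandle;

    enum TaskEvent {
        Command(String),
        Exited(Result<(), String>),
    }

    // Drain commands until the channel closes, then join the task and
    // surface its result.
    async fn next_task_event(
        task: &mut JoinHandle<Result<(), String>>,
        commands: &mut mpsc::Receiver<String>,
    ) -> TaskEvent {
        if let Some(cmd) = commands.recv().await {
            return TaskEvent::Command(cmd);
        }
        // recv() yielded None: all senders are gone, so the task is exiting.
        match task.await {
            Ok(res) => TaskEvent::Exited(res),
            Err(join_err) => panic!("migration task failed: {join_err}"),
        }
    }
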
- match task.await { - Ok(res) => { - info!(log, "Migration source task exited: {:?}", res); - MigrateTaskEvent::TaskExited(res) - } - Err(join_err) => { - if join_err.is_cancelled() { - panic!("Migration task canceled"); - } else { - panic!( - "Migration task panicked: {:?}", - join_err.into_panic() - ); - } - } - } - } - - fn pause(&mut self) { - assert!(!self.paused); - probes::state_driver_pause!(|| ()); - self.vcpu_tasks.pause_all(); - self.controller.pause_devices(); - self.controller.pause_vm(); - self.paused = true; - } - - fn resume(&mut self) { - assert!(self.paused); - probes::state_driver_resume!(|| ()); - self.controller.resume_vm(); - self.controller.resume_devices(); - self.vcpu_tasks.resume_all(); - self.paused = false; - } - - fn reset_vcpus(&self) { - self.vcpu_tasks.new_generation(); - self.controller.reset_vcpu_state(); - } - - fn publish_steady_state(&mut self, state: ApiInstanceState) { - let change = match state { - ApiInstanceState::Running => { - request_queue::InstanceStateChange::StartedRunning - } - ApiInstanceState::Stopped => { - request_queue::InstanceStateChange::Stopped - } - ApiInstanceState::Failed => { - request_queue::InstanceStateChange::Failed - } - _ => panic!( - "Called publish_steady_state on non-terminal state {:?}", - state - ), - }; - - self.notify_request_queue(change); - self.set_instance_state(state); - } - - fn notify_request_queue( - &self, - queue_change: request_queue::InstanceStateChange, - ) { - self.shared_state - .inner - .lock() - .unwrap() - .external_request_queue - .notify_instance_state_change(queue_change); - } -} - -#[cfg(test)] -mod test { - use anyhow::bail; - use mockall::Sequence; - - use super::*; - use crate::vcpu_tasks::MockVcpuTaskController; - use crate::vm::MockStateDriverVmController; - - struct TestStateDriver { - driver: - StateDriver, - state_rx: tokio::sync::watch::Receiver, - } - - impl TestStateDriver { - fn api_state(&self) -> ApiInstanceState { - self.state_rx.borrow().state - } - } - - struct TestObjects { - vm_ctrl: MockStateDriverVmController, - vcpu_ctrl: MockVcpuTaskController, - shared_state: Arc, - } - - fn make_state_driver(objects: TestObjects) -> TestStateDriver { - let logger = slog::Logger::root(slog::Discard, slog::o!()); - let (state_tx, state_rx) = - tokio::sync::watch::channel(ApiMonitoredState { - gen: 0, - state: ApiInstanceState::Creating, - migration: ApiMigrateStatusResponse { - migration_in: None, - migration_out: None, - }, - }); - - TestStateDriver { - driver: StateDriver::new( - tokio::runtime::Handle::current(), - Arc::new(objects.vm_ctrl), - objects.shared_state.clone(), - objects.vcpu_ctrl, - logger, - state_tx, - ), - state_rx, - } - } - - /// Generates default mocks for the VM controller and vCPU task controller - /// that accept unlimited requests to read state. - fn make_default_mocks() -> TestObjects { - let logger = slog::Logger::root(slog::Discard, slog::o!()); - let vm_ctrl = MockStateDriverVmController::new(); - let vcpu_ctrl = MockVcpuTaskController::new(); - TestObjects { - vm_ctrl, - vcpu_ctrl, - shared_state: Arc::new(SharedVmState::new(&logger)), - } - } - - fn add_reboot_expectations( - vm_ctrl: &mut MockStateDriverVmController, - vcpu_ctrl: &mut MockVcpuTaskController, - ) { - // The reboot process requires careful ordering of steps to make sure - // the VM's vCPUs are put into the correct state when the machine starts - // up. - let mut seq = Sequence::new(); - - // First, reboot has to pause everything. 
It doesn't actually matter - // whether vCPUs or devices pause first, but there's no way to specify - // that these events must be sequenced before other expectations but - // have no ordering with respect to each other. - vcpu_ctrl - .expect_pause_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_pause_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - - // The devices and--importantly--the bhyve VM itself must be reset - // before resetting any vCPU state (so that bhyve will accept the ioctls - // sent to the vCPUs during the reset process). - vm_ctrl - .expect_reset_devices_and_machine() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vcpu_ctrl - .expect_new_generation() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_reset_vcpu_state() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - - // Entities and vCPUs can technically be resumed in either order, but - // resuming devices first allows them to be ready when the vCPUs start - // creating work for them to do. - vm_ctrl - .expect_resume_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vcpu_ctrl - .expect_resume_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - } - - #[tokio::test] - async fn guest_triple_fault_reboots() { - let mut test_objects = make_default_mocks(); - - add_reboot_expectations( - &mut test_objects.vm_ctrl, - &mut test_objects.vcpu_ctrl, - ); - let mut driver = make_state_driver(test_objects); - driver.driver.handle_event(StateDriverEvent::Guest( - GuestEvent::VcpuSuspendTripleFault( - 0, - std::time::Duration::default(), - ), - )); - - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - } - - #[tokio::test] - async fn guest_chipset_reset_reboots() { - let mut test_objects = make_default_mocks(); - - add_reboot_expectations( - &mut test_objects.vm_ctrl, - &mut test_objects.vcpu_ctrl, - ); - let mut driver = make_state_driver(test_objects); - driver - .driver - .handle_event(StateDriverEvent::Guest(GuestEvent::ChipsetReset)); - - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - } - - #[tokio::test] - async fn start_from_cold_boot() { - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - let mut seq = Sequence::new(); - vcpu_ctrl - .expect_new_generation() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_reset_vcpu_state() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_start_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| Ok(())); - vcpu_ctrl - .expect_resume_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - - let mut driver = make_state_driver(test_objects); - driver - .driver - .handle_event(StateDriverEvent::External(ExternalRequest::Start)); - - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - } - - #[tokio::test] - async fn device_start_failure_causes_instance_failure() { - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - let mut seq = Sequence::new(); - vcpu_ctrl - .expect_new_generation() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_reset_vcpu_state() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_start_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| bail!("injected failure into 
start_devices!")); - - let mut driver = make_state_driver(test_objects); - - // Failure allows the instance to be preserved for debugging. - assert_eq!( - driver.driver.handle_event(StateDriverEvent::External( - ExternalRequest::Start - )), - HandleEventOutcome::Continue - ); - - assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); - } - - #[tokio::test] - async fn devices_pause_before_halting() { - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - let mut seq = Sequence::new(); - vcpu_ctrl - .expect_pause_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_pause_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_pause_vm() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vcpu_ctrl - .expect_exit_all() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - vm_ctrl - .expect_halt_devices() - .times(1) - .in_sequence(&mut seq) - .returning(|| ()); - - let mut driver = make_state_driver(test_objects); - driver - .driver - .handle_event(StateDriverEvent::External(ExternalRequest::Stop)); - - assert!(matches!(driver.api_state(), ApiInstanceState::Stopped)); - } - - #[tokio::test] - async fn devices_pause_once_when_halting_after_migration_out() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, mut response_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - // This test will simulate a migration out (with a pause command), then - // order the state driver to halt. This should produce exactly one set - // of pause commands and one set of halt commands with no resume - // commands. - vm_ctrl.expect_pause_devices().times(1).returning(|| ()); - vcpu_ctrl.expect_pause_all().times(1).returning(|| ()); - vcpu_ctrl.expect_exit_all().times(1).returning(|| ()); - vm_ctrl.expect_halt_devices().times(1).returning(|| ()); - vm_ctrl.expect_resume_devices().never(); - vcpu_ctrl.expect_resume_all().never(); - vm_ctrl.expect_pause_vm().times(1).returning(|| ()); - vm_ctrl.expect_resume_vm().never(); - - let mut driver = make_state_driver(test_objects); - - // The state driver expects to run on an OS thread outside the async - // runtime so that it can call `block_on` to wait for messages from the - // migration task. - let hdl = std::thread::spawn(move || { - driver.driver.handle_event(StateDriverEvent::External( - ExternalRequest::MigrateAsSource { - migration_id, - task: migrate_task, - start_tx, - command_rx, - response_tx, - }, - )); - - // Return the driver (which has the mocks attached) when the thread - // is joined so the test can continue using it. - driver - }); - - // Simulate a pause and the successful completion of migration. 
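// The `Sequence` idiom used throughout these tests, reduced to a
// stand-alone sketch (`Controller` here is illustrative, not a Propolis
// trait): expectations added to the same sequence must be satisfied in
// that order, while expectations outside it stay unordered.
mockall::mock! {
    pub Controller {
        fn pause(&self);
        fn resume(&self);
    }
}

#[test]
fn pause_is_required_before_resume() {
    let mut ctrl = MockController::new();
    let mut seq = mockall::Sequence::new();
    ctrl.expect_pause().times(1).in_sequence(&mut seq).returning(|| ());
    ctrl.expect_resume().times(1).in_sequence(&mut seq).returning(|| ());

    // Calling these in the opposite order would fail the test.
    ctrl.pause();
    ctrl.resume();
}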
- command_tx.send(MigrateSourceCommand::Pause).await.unwrap(); - let resp = response_rx.recv().await.unwrap(); - assert!(matches!(resp, MigrateSourceResponse::Pause(Ok(())))); - command_tx - .send(MigrateSourceCommand::UpdateState(ApiMigrationState::Finish)) - .await - .unwrap(); - - drop(command_tx); - task_exit_tx.send(Ok(())).unwrap(); - - // Wait for the call to `handle_event` to return before tearing anything - // else down. - driver = tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - // The migration should appear to have finished. The state driver will - // queue a "stop" command to itself in this case, but because the driver - // is not directly processing the queue here, the test has to issue this - // call itself. - assert_eq!( - driver.driver.get_migration_status().migration_out.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Finish - } - ); - - driver - .driver - .handle_event(StateDriverEvent::External(ExternalRequest::Stop)); - - assert!(matches!(driver.api_state(), ApiInstanceState::Stopped)); - } - - #[tokio::test] - async fn paused_vm_resumes_after_failed_migration_out() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, mut response_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - // This test will simulate a migration out up through pausing the - // source, then fail migration. This should pause and resume all the - // devices and the vCPUs. - vm_ctrl.expect_pause_devices().times(1).returning(|| ()); - vm_ctrl.expect_resume_devices().times(1).returning(|| ()); - vcpu_ctrl.expect_pause_all().times(1).returning(|| ()); - vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); - - // VMM will be paused once prior to exporting state, and then resumed - // afterwards when the migration fails. - let mut pause_seq = Sequence::new(); - vm_ctrl - .expect_pause_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - vm_ctrl - .expect_resume_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - - let mut driver = make_state_driver(test_objects); - let hdl = std::thread::spawn(move || { - let outcome = driver.driver.handle_event( - StateDriverEvent::External(ExternalRequest::MigrateAsSource { - migration_id, - task: migrate_task, - start_tx, - command_rx, - response_tx, - }), - ); - - (driver, outcome) - }); - - // Simulate a successful pause. - command_tx.send(MigrateSourceCommand::Pause).await.unwrap(); - let resp = response_rx.recv().await.unwrap(); - assert!(matches!(resp, MigrateSourceResponse::Pause(Ok(())))); - - // Simulate failure. The migration protocol must both update the state - // to Error and make the task return `Err`. - command_tx - .send(MigrateSourceCommand::UpdateState(ApiMigrationState::Error)) - .await - .unwrap(); - drop(command_tx); - task_exit_tx.send(Err(MigrateError::UnexpectedMessage)).unwrap(); - - // Wait for the call to `handle_event` to return. 
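// Why the join below goes through `spawn_blocking`: joining a std
// thread blocks the calling thread, which would stall a tokio worker if
// done directly in async code, so the join is pushed onto the blocking
// pool. A minimal sketch of just that move:
#[tokio::test]
async fn join_os_thread_off_the_runtime() {
    let hdl = std::thread::spawn(|| 42);
    let result = tokio::task::spawn_blocking(move || hdl.join().unwrap())
        .await
        .unwrap();
    assert_eq!(result, 42);
}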
- let (driver, outcome) = - tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - // The VM should be running and the state driver should continue - // operating normally. - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - assert_eq!(outcome, HandleEventOutcome::Continue); - assert_eq!( - driver.driver.get_migration_status().migration_out.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Error - } - ); - } - - #[tokio::test] - async fn vm_starts_after_migration_in() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); - vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); - vm_ctrl.expect_start_devices().times(1).returning(|| Ok(())); - vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); - - let mut pause_seq = Sequence::new(); - vm_ctrl - .expect_pause_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - vm_ctrl - .expect_resume_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - - let mut driver = make_state_driver(test_objects); - - // The state driver expects to run on an OS thread outside the async - // runtime so that it can call `block_on` to wait for messages from the - // migration task. - let hdl = std::thread::spawn(move || { - driver.driver.handle_event(StateDriverEvent::External( - ExternalRequest::MigrateAsTarget { - migration_id, - task: migrate_task, - start_tx, - command_rx, - }, - )); - - driver - }); - - // Explicitly drop the command channel to signal to the driver that - // the migration task is completing. - command_tx - .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Finish)) - .await - .unwrap(); - drop(command_tx); - task_exit_tx.send(Ok(())).unwrap(); - - // Wait for the call to `handle_event` to return before tearing anything - // else down. 
- let driver = tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - assert_eq!( - driver.driver.get_migration_status().migration_in.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Finish - } - ); - assert!(matches!(driver.api_state(), ApiInstanceState::Running)); - } - - #[tokio::test] - async fn failed_migration_in_fails_instance() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); - vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); - vm_ctrl.expect_pause_vm().times(1).returning(|| ()); - vm_ctrl.expect_resume_vm().times(1).returning(|| ()); - let mut driver = make_state_driver(test_objects); - - // The state driver expects to run on an OS thread outside the async - // runtime so that it can call `block_on` to wait for messages from the - // migration task. - let hdl = std::thread::spawn(move || { - let outcome = driver.driver.handle_event( - StateDriverEvent::External(ExternalRequest::MigrateAsTarget { - migration_id, - task: migrate_task, - start_tx, - command_rx, - }), - ); - - (driver, outcome) - }); - - // The migration task is required to update the migration state to - // "Error" before exiting when migration fails. - command_tx - .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Error)) - .await - .unwrap(); - drop(command_tx); - task_exit_tx.send(Err(MigrateError::UnexpectedMessage)).unwrap(); - - // Wait for the call to `handle_event` to return. - let (driver, outcome) = - tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - // The migration should appear to have failed, but the VM should be - // preserved for debugging. 
- assert_eq!(outcome, HandleEventOutcome::Continue); - assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); - assert_eq!( - driver.driver.get_migration_status().migration_in.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Error - } - ); - } - - #[tokio::test] - async fn failed_vm_start_after_migration_in_fails_instance() { - let migration_id = Uuid::new_v4(); - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - let (task_exit_tx, task_exit_rx) = tokio::sync::oneshot::channel(); - let (command_tx, command_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - start_rx.await.unwrap(); - task_exit_rx.await.unwrap() - }); - - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); - vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); - - let mut pause_seq = Sequence::new(); - vm_ctrl - .expect_pause_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - vm_ctrl - .expect_resume_vm() - .times(1) - .in_sequence(&mut pause_seq) - .returning(|| ()); - - vm_ctrl - .expect_start_devices() - .times(1) - .returning(|| bail!("injected failure into start_devices!")); - - let mut driver = make_state_driver(test_objects); - - // The state driver expects to run on an OS thread outside the async - // runtime so that it can call `block_on` to wait for messages from the - // migration task. - let hdl = std::thread::spawn(move || { - let outcome = driver.driver.handle_event( - StateDriverEvent::External(ExternalRequest::MigrateAsTarget { - migration_id, - task: migrate_task, - start_tx, - command_rx, - }), - ); - - (driver, outcome) - }); - - // Explicitly drop the command channel to signal to the driver that - // the migration task is completing. - command_tx - .send(MigrateTargetCommand::UpdateState(ApiMigrationState::Finish)) - .await - .unwrap(); - drop(command_tx); - task_exit_tx.send(Ok(())).unwrap(); - - // Wait for the call to `handle_event` to return. - let (driver, outcome) = - tokio::task::spawn_blocking(move || hdl.join().unwrap()) - .await - .unwrap(); - - // The instance should have failed, but should also be preserved for - // debugging. - assert_eq!(outcome, HandleEventOutcome::Continue); - assert!(matches!(driver.api_state(), ApiInstanceState::Failed)); - - // The migration has still succeeded in this case. - assert_eq!( - driver.driver.get_migration_status().migration_in.unwrap(), - ApiMigrationStatus { - id: migration_id, - state: ApiMigrationState::Finish - } - ); - } - - #[tokio::test] - async fn start_vm_after_migration_in_does_not_publish_starting_state() { - let mut test_objects = make_default_mocks(); - let vm_ctrl = &mut test_objects.vm_ctrl; - let vcpu_ctrl = &mut test_objects.vcpu_ctrl; - - // A call to start a VM after a successful migration should start vCPUs - // and devices without resetting anything. - vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); - vm_ctrl.expect_start_devices().times(1).returning(|| Ok(())); - - // As noted below, the instance state is being magicked directly into a - // `Migrating` state, rather than executing the logic which would - // typically carry it there. As such, `pause_vm()` will not be called - // as part of setup. Since instance start _is_ being tested here, the - // `resume_vm()` call is expected. 
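// A property of `tokio::sync::watch` that the generation check in the
// test below leans on: a receiver sees only the most recent value, so
// intermediate states can be missed entirely. That is why the test
// checks the final state plus a generation counter rather than every
// transition. A sketch with illustrative state names:
#[tokio::test]
async fn watch_receivers_only_see_latest() {
    let (tx, rx) = tokio::sync::watch::channel((0u64, "creating"));
    tx.send((1, "starting")).unwrap();
    tx.send((2, "running")).unwrap();
    // "starting" may never be observed; only the latest value is.
    assert_eq!(*rx.borrow(), (2, "running"));
}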
- vm_ctrl.expect_pause_vm().never(); - vm_ctrl.expect_resume_vm().times(1).returning(|| ()); - - // Skip the rigmarole of standing up a fake migration. Instead, just - // push the driver into the state it would have after a successful - // migration to appease the assertions in `start_vm`. - // - // Faking an entire migration, as in the previous tests, requires the - // state driver to run on its own worker thread. This is fine for tests - // that only want to examine state after the driver has finished an - // operation, but this test wants to test side effects of a specific - // part of the state driver's actions, which are tough to synchronize - // with when the driver is running on another thread. - let mut driver = make_state_driver(test_objects); - driver.driver.set_instance_state(ApiInstanceState::Migrating); - - // The driver starts in the Migrating state and should go directly to - // the Running state without passing through Starting. Because there's - // no way to guarantee that the test will see all intermediate states - // that `start_vm` publishes, instead assert that the final state of - // Running is correct and that the state generation only went up by 1 - // (implying that there were no intervening transitions). - let migrating_gen = driver.driver.api_state_tx.borrow().gen; - driver.driver.start_vm(VmStartReason::MigratedIn); - let new_state = driver.driver.api_state_tx.borrow().clone(); - assert!(matches!(new_state.state, ApiInstanceState::Running)); - assert_eq!(new_state.gen, migrating_gen + 1); - } -} From 72439b371aec2edbe7387c3424cc0997acd98c67 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 27 Jun 2024 01:31:16 +0000 Subject: [PATCH 29/55] request queue tests --- .../src/lib/vm/request_queue.rs | 85 ++++++++++++++++++- 1 file changed, 81 insertions(+), 4 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs index 7ba185b2f..33a376bd2 100644 --- a/bin/propolis-server/src/lib/vm/request_queue.rs +++ b/bin/propolis-server/src/lib/vm/request_queue.rs @@ -30,7 +30,7 @@ use uuid::Uuid; /// Wraps a [`dropshot::WebsocketConnection`] for inclusion in an /// [`ExternalRequest`]. // -// This newtype allowsthis module's tests (which want to verify queuing +// This newtype allows this module's tests (which want to verify queuing // dispositions and don't care about request contents) to construct a // `MigrateAsSource` request without having to conjure up a real websocket // connection. @@ -328,7 +328,11 @@ impl ExternalRequestQueue { // Requests to reboot prevent additional reboot requests from being // queued, but do not affect other operations. ChangeReason::ApiRequest(ExternalRequest::Reboot) => { - assert!(matches!(self.allowed.start, Disposition::Ignore)); + assert!( + matches!(self.allowed.start, Disposition::Ignore), + "{:?}", + self.allowed + ); AllowedRequests { reboot: Disposition::Ignore, ..self.allowed } } @@ -423,11 +427,42 @@ mod test { } } + fn make_reconfigure_crucible_request() -> ExternalRequest { + let (tx, _rx) = tokio::sync::oneshot::channel(); + ExternalRequest::ReconfigureCrucibleVolume { + disk_name: "".to_string(), + backend_id: Uuid::new_v4(), + new_vcr_json: "".to_string(), + result_tx: tx, + } + } + + #[tokio::test] + async fn start_requests_become_idempotent_after_first_request() { + let mut queue = + ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No); + + // The first request to start should succeed. 
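// The disposition mechanism exercised in the tests below, sketched with
// pared-down illustrative types (the real queue tracks several request
// kinds at once): each kind carries a current disposition, and
// `try_queue` either enqueues, silently ignores (idempotent success),
// or rejects.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Disp {
    Enqueue,
    Ignore,
    Deny(&'static str),
}

struct StartQueue {
    start: Disp,
    queued: std::collections::VecDeque<&'static str>,
}

impl StartQueue {
    fn try_queue_start(&mut self) -> Result<(), &'static str> {
        match self.start {
            Disp::Enqueue => {
                self.queued.push_back("start");
                // Later start requests are redundant, not erroneous.
                self.start = Disp::Ignore;
                Ok(())
            }
            Disp::Ignore => Ok(()),
            Disp::Deny(why) => Err(why),
        }
    }
}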
+ assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + + // The second one should too, but only for idempotency: the queue should + // then have only one start request on it. + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); + assert!(queue.pop_front().is_none()); + + // Start requests continue to be ignored even after the instance starts + // to run. + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(queue.pop_front().is_none()); + } + #[tokio::test] async fn migrate_as_source_is_not_idempotent() { // Simulate a running instance. let mut queue = - ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No); + ExternalRequestQueue::new(test_logger(), InstanceAutoStart::Yes); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Requests to migrate out should be allowed. @@ -472,7 +507,7 @@ mod test { #[tokio::test] async fn reboot_requests_are_idempotent_except_when_stopping() { let mut queue = - ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No); + ExternalRequestQueue::new(test_logger(), InstanceAutoStart::Yes); queue.notify_instance_state_change(InstanceStateChange::StartedRunning); // Once the instance is started, reboot requests should be allowed, but @@ -503,4 +538,46 @@ mod test { queue.notify_instance_state_change(InstanceStateChange::Rebooted); assert!(queue.try_queue(ExternalRequest::Reboot).is_err()); } + + #[tokio::test] + async fn mutation_requires_running_and_not_migrating_out() { + let mut queue = + ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No); + + // Mutating a VM before it has started is not allowed. + assert!(queue.try_queue(make_reconfigure_crucible_request()).is_err()); + + // Merely dequeuing the start request doesn't allow mutation; the VM + // actually has to be running. + assert!(queue.try_queue(ExternalRequest::Start).is_ok()); + assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start))); + assert!(queue.try_queue(make_reconfigure_crucible_request()).is_err()); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + assert!(queue.try_queue(make_reconfigure_crucible_request()).is_ok()); + assert!(matches!( + queue.pop_front(), + Some(ExternalRequest::ReconfigureCrucibleVolume { .. }) + )); + + // Successfully requesting migration out should block new mutation + // requests (they should wait for the migration to resolve and then go + // to the target). + assert!(queue.try_queue(make_migrate_as_source_request()).is_ok()); + assert!(queue.try_queue(make_reconfigure_crucible_request()).is_err()); + + // But if the VM resumes (due to a failed migration out) these requests + // should succeed again. 
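// The `result_tx` idiom behind `make_reconfigure_crucible_request`
// above, in isolation: the caller embeds a oneshot sender in the queued
// request and awaits the receiver, letting a synchronous queue deliver
// an asynchronous result. (Names illustrative, not Propolis APIs.)
#[tokio::test]
async fn queued_request_returns_result_via_oneshot() {
    let (result_tx, result_rx) = tokio::sync::oneshot::channel();
    // Stand-in for the state driver dequeuing and finishing the work.
    tokio::spawn(async move {
        result_tx.send(Ok::<(), String>(())).unwrap();
    });
    assert!(result_rx.await.unwrap().is_ok());
}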
+ assert!(queue.pop_front().is_some()); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + assert!(queue.try_queue(make_reconfigure_crucible_request()).is_ok()); + } + + #[tokio::test] + async fn mutation_disallowed_after_stop() { + let mut queue = + ExternalRequestQueue::new(test_logger(), InstanceAutoStart::Yes); + queue.notify_instance_state_change(InstanceStateChange::StartedRunning); + queue.notify_instance_state_change(InstanceStateChange::Stopped); + assert!(queue.try_queue(make_reconfigure_crucible_request()).is_err()); + } } From d404d3e17c80a01dd4680632e07e594c45c50325 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 27 Jun 2024 16:48:02 +0000 Subject: [PATCH 30/55] [WIP] flatten out VM object hierarchy Also clean up VM init code in the state driver. --- .../src/lib/migrate/destination.rs | 53 +- bin/propolis-server/src/lib/migrate/mod.rs | 2 +- bin/propolis-server/src/lib/migrate/source.rs | 89 ++- bin/propolis-server/src/lib/server.rs | 69 +-- bin/propolis-server/src/lib/vm/active.rs | 87 +++ .../src/lib/vm/lifecycle_ops.rs | 151 ----- .../src/lib/vm/migrate_commands.rs | 2 +- bin/propolis-server/src/lib/vm/mod.rs | 305 ++-------- bin/propolis-server/src/lib/vm/objects.rs | 372 ++++++++++++ bin/propolis-server/src/lib/vm/services.rs | 11 +- .../src/lib/vm/state_driver.rs | 537 +++++++----------- bin/propolis-server/src/lib/vnc.rs | 4 +- 12 files changed, 864 insertions(+), 818 deletions(-) create mode 100644 bin/propolis-server/src/lib/vm/active.rs delete mode 100644 bin/propolis-server/src/lib/vm/lifecycle_ops.rs create mode 100644 bin/propolis-server/src/lib/vm/objects.rs diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index d5804d2b5..1ac32e997 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -90,7 +90,7 @@ struct DestinationProtocol { /// The VM objects into which to import the source VM's state. Only /// initialized after the sync phase. 
- vm_objects: Option>, + vm_objects: Option>, } impl DestinationProtocol { @@ -209,7 +209,7 @@ impl DestinationProtocol { info!(self.log(), "Destination read Preamble: {:?}", preamble); if let Err(e) = preamble.is_migration_compatible( - self.vm_objects.as_ref().unwrap().instance_spec(), + self.vm_objects.as_ref().unwrap().read().await.instance_spec(), ) { error!( self.log(), @@ -353,21 +353,26 @@ impl DestinationProtocol { info!(self.log(), "Devices: {devices:#?}"); - let objects = self.vm_objects.as_ref().unwrap(); - let migrate_ctx = - MigrateCtx { mem: &objects.machine().acc_mem.access().unwrap() }; - for device in devices { - info!( - self.log(), - "Applying state to device {}", device.instance_name - ); + { + let objects = self.vm_objects.as_ref().unwrap().read().await; + let migrate_ctx = MigrateCtx { + mem: &objects.machine().acc_mem.access().unwrap(), + }; + for device in devices { + info!( + self.log(), + "Applying state to device {}", device.instance_name + ); - let target = objects - .device_by_name(&device.instance_name) - .ok_or_else(|| { - MigrateError::UnknownDevice(device.instance_name.clone()) - })?; - self.import_device(&target, &device, &migrate_ctx)?; + let target = objects + .device_by_name(&device.instance_name) + .ok_or_else(|| { + MigrateError::UnknownDevice( + device.instance_name.clone(), + ) + })?; + self.import_device(&target, &device, &migrate_ctx)?; + } } self.send_msg(codec::Message::Okay).await @@ -402,7 +407,16 @@ impl DestinationProtocol { // Take a snapshot of the host hrtime/wall clock time, then adjust // time data appropriately. - let vmm_hdl = &self.vm_objects.as_ref().unwrap().machine().hdl.clone(); + let vmm_hdl = &self + .vm_objects + .as_ref() + .unwrap() + .read() + .await + .machine() + .hdl + .clone(); + let (dst_hrt, dst_wc) = vmm::time::host_time_snapshot(vmm_hdl) .map_err(|e| { MigrateError::TimeData(format!( @@ -598,10 +612,13 @@ impl DestinationProtocol { self.vm_objects .as_ref() .unwrap() + .read() + .await .com1() .import(&com1_history) .await .map_err(|e| MigrateError::Codec(e.to_string()))?; + self.send_msg(codec::Message::Okay).await } @@ -676,7 +693,7 @@ impl DestinationProtocol { addr: GuestAddr, buf: &[u8], ) -> Result<(), MigrateError> { - let objects = self.vm_objects.as_ref().unwrap(); + let objects = self.vm_objects.as_ref().unwrap().read().await; let memctx = objects.machine().acc_mem.access().unwrap(); let len = buf.len(); memctx.write_from(addr, buf, len); diff --git a/bin/propolis-server/src/lib/migrate/mod.rs b/bin/propolis-server/src/lib/migrate/mod.rs index ca460ed1e..9f50ff854 100644 --- a/bin/propolis-server/src/lib/migrate/mod.rs +++ b/bin/propolis-server/src/lib/migrate/mod.rs @@ -315,7 +315,7 @@ pub(crate) struct DestinationContext< /// migration process (destination-side). pub(crate) async fn dest_initiate( log: &slog::Logger, - migrate_info: api::InstanceMigrateInitiateRequest, + migrate_info: &api::InstanceMigrateInitiateRequest, local_server_addr: SocketAddr, ) -> Result< DestinationContext< diff --git a/bin/propolis-server/src/lib/migrate/source.rs b/bin/propolis-server/src/lib/migrate/source.rs index e34432582..94f69e6c9 100644 --- a/bin/propolis-server/src/lib/migrate/source.rs +++ b/bin/propolis-server/src/lib/migrate/source.rs @@ -33,7 +33,7 @@ use crate::migrate::{ use crate::vm::migrate_commands::{ MigrateSourceCommand, MigrateSourceResponse, }; -use crate::vm::ActiveVm; +use crate::vm::objects::VmObjects; /// Specifies which pages should be offered during a RAM transfer phase. 
/// @@ -115,7 +115,7 @@ enum RamOfferDiscipline { } pub async fn migrate( - vm: Arc, + vm: Arc, command_tx: tokio::sync::mpsc::Sender, response_rx: tokio::sync::mpsc::Receiver, conn: WebSocketStream, @@ -147,39 +147,36 @@ pub async fn migrate( // // See the lengthy comment on `RamOfferDiscipline` above for more // details about what's going on here. - for (&GuestAddr(gpa), dirtiness) in proto.dirt.iter().flatten() { - if let Err(e) = proto - .vm - .objects() - .await - .machine() - .hdl - .set_dirty_pages(gpa, dirtiness) - { - // Bad news! Our attempt to re-set the dirty bit on these - // pages has failed! Thus, subsequent migration attempts - // /!\ CAN NO LONGER RELY ON DIRTY PAGE TRACKING /!\ - // and must always offer all pages in the initial RAM push - // phase. - // - // Record that now so we never try to do this again. - proto - .command_tx - .send(MigrateSourceCommand::RedirtyingFailed) - .await - .unwrap(); - // .map_err(|_| MigrateError::StateDriverChannelClosed)?; - - error!( - proto.log(), - "failed to restore dirty bits: {e}"; - "gpa" => gpa, - ); - // No sense continuing to try putting back any remaining - // dirty bits, as we won't be using them any longer. - break; - } else { - debug!(proto.log(), "re-dirtied pages at {gpa:#x}",); + { + let objects = proto.vm.read().await; + let machine = objects.machine(); + for (&GuestAddr(gpa), dirtiness) in proto.dirt.iter().flatten() { + if let Err(e) = machine.hdl.set_dirty_pages(gpa, dirtiness) { + // Bad news! Our attempt to re-set the dirty bit on these + // pages has failed! Thus, subsequent migration attempts + // /!\ CAN NO LONGER RELY ON DIRTY PAGE TRACKING /!\ + // and must always offer all pages in the initial RAM push + // phase. + // + // Record that now so we never try to do this again. + proto + .command_tx + .send(MigrateSourceCommand::RedirtyingFailed) + .await + .unwrap(); + // .map_err(|_| MigrateError::StateDriverChannelClosed)?; + + error!( + proto.log(), + "failed to restore dirty bits: {e}"; + "gpa" => gpa, + ); + // No sense continuing to try putting back any remaining + // dirty bits, as we won't be using them any longer. + break; + } else { + debug!(proto.log(), "re-dirtied pages at {gpa:#x}",); + } } } @@ -204,8 +201,8 @@ pub(crate) struct PersistentState { } struct SourceProtocol { - /// The VM controller for the instance of interest. - vm: Arc, + /// The source instance's VM objects. + vm: Arc, /// The channel to use to send messages to the state worker coordinating /// this migration. @@ -245,7 +242,7 @@ type PageBitmap = [u8; PAGE_BITMAP_SIZE]; impl SourceProtocol { async fn new( - vm: Arc, + vm: Arc, command_tx: tokio::sync::mpsc::Sender, response_rx: tokio::sync::mpsc::Receiver, conn: WebSocketStream, @@ -256,7 +253,7 @@ impl SourceProtocol { // the pre-pause RAM push. 
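// A pattern worth noting in this refactor: read guards on the new
// `tokio::sync::RwLock`-wrapped objects are taken inside tight blocks
// (as in the re-dirtying loop above), which, presumably, keeps the
// guard from being held across later awaits. A stand-alone sketch with
// an illustrative payload type:
async fn copy_out_then_await(objects: &tokio::sync::RwLock<u64>) -> u64 {
    let snapshot = {
        let guard = objects.read().await;
        *guard
        // Guard dropped at the end of this block, before the await
        // below, so writers are not blocked while this task does other
        // async work.
    };
    tokio::task::yield_now().await;
    snapshot
}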
let dirt = { let can_npt_operate = - vm.objects().await.machine().hdl.can_npt_operate(); + vm.read().await.machine().hdl.can_npt_operate(); if can_npt_operate { Some(Default::default()) @@ -329,7 +326,7 @@ impl SourceProtocol { async fn sync(&mut self) -> Result<(), MigrateError> { self.update_state(MigrationState::Sync).await; let preamble = Preamble::new(VersionedInstanceSpec::V0( - self.vm.objects().await.instance_spec().clone(), + self.vm.read().await.instance_spec().clone(), )); let s = ron::ser::to_string(&preamble) .map_err(codec::ProtocolError::from)?; @@ -579,7 +576,7 @@ impl SourceProtocol { self.update_state(MigrationState::Device).await; let mut device_states = vec![]; { - let objects = self.vm.objects().await; + let objects = self.vm.read().await; let machine = objects.machine(); let migrate_ctx = MigrateCtx { mem: &machine.acc_mem.access().unwrap() }; @@ -643,7 +640,7 @@ impl SourceProtocol { // Read and send over the time data async fn time_data(&mut self) -> Result<(), MigrateError> { - let vmm_hdl = &self.vm.objects().await.machine().hdl.clone(); + let vmm_hdl = &self.vm.read().await.machine().hdl.clone(); let vm_time_data = vmm::time::export_time_data(vmm_hdl).map_err(|e| { MigrateError::TimeData(format!( @@ -682,7 +679,7 @@ impl SourceProtocol { _ => return Err(MigrateError::UnexpectedMessage), }; let com1_history = - self.vm.objects().await.com1().export_history(remote_addr).await?; + self.vm.read().await.com1().export_history(remote_addr).await?; self.send_msg(codec::Message::Serialized(com1_history)).await?; self.read_ok().await } @@ -795,7 +792,7 @@ impl SourceProtocol { async fn vmm_ram_bounds( &mut self, ) -> Result, MigrateError> { - let objects = self.vm.objects().await; + let objects = self.vm.read().await; let machine = objects.machine(); let memctx = machine.acc_mem.access().unwrap(); memctx.mem_bounds().ok_or(MigrateError::InvalidInstanceState) @@ -807,7 +804,7 @@ impl SourceProtocol { bits: &mut [u8], ) -> Result<(), MigrateError> { self.vm - .objects() + .read() .await .machine() .hdl @@ -820,7 +817,7 @@ impl SourceProtocol { addr: GuestAddr, buf: &mut [u8], ) -> Result<(), MigrateError> { - let objects = self.vm.objects().await; + let objects = self.vm.read().await; let machine = objects.machine(); let memctx = machine.acc_mem.access().unwrap(); let len = buf.len(); diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index 0703cbc5e..cfadbb8a3 100644 --- a/bin/propolis-server/src/lib/server.rs +++ b/bin/propolis-server/src/lib/server.rs @@ -369,14 +369,15 @@ async fn instance_state_monitor( ) -> Result, HttpError> { let ctx = rqctx.context(); let gen = request.into_inner().gen; - let mut state_watcher = ctx.vm.state_watcher().map_err(|e| match e { - VmError::NotCreated | VmError::WaitingToInitialize => { - not_created_error() - } - _ => HttpError::for_internal_error(format!( - "unexpected error from VM controller: {e}" - )), - })?; + let mut state_watcher = + ctx.vm.state_watcher().await.map_err(|e| match e { + VmError::NotCreated | VmError::WaitingToInitialize => { + not_created_error() + } + _ => HttpError::for_internal_error(format!( + "unexpected error from VM controller: {e}" + )), + })?; loop { let last = state_watcher.borrow().clone(); @@ -411,7 +412,7 @@ async fn instance_state_put( ) -> Result { let ctx = rqctx.context(); let requested_state = request.into_inner(); - let vm = ctx.vm.active_vm().ok_or_else(not_created_error)?; + let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?; let result = vm 
.put_state(requested_state) .map(|_| HttpResponseUpdatedNoContent {}) @@ -453,8 +454,8 @@ async fn instance_serial_history_get( ) -> Result, HttpError> { let ctx = rqctx.context(); - let vm = ctx.vm.active_vm().ok_or_else(not_created_error)?; - let serial = vm.objects().await.com1().clone(); + let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?; + let serial = vm.objects().read().await.com1().clone(); let query_params = query.into_inner(); let byte_offset = SerialHistoryOffset::try_from(&query_params)?; @@ -481,8 +482,8 @@ async fn instance_serial( websock: WebsocketConnection, ) -> dropshot::WebsocketChannelResult { let ctx = rqctx.context(); - let vm = ctx.vm.active_vm().ok_or_else(not_created_error)?; - let serial = vm.objects().await.com1().clone(); + let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?; + let serial = vm.objects().read().await.com1().clone(); // Use the default buffering paramters for the websocket configuration // @@ -539,7 +540,7 @@ async fn instance_migrate_start( ) -> dropshot::WebsocketChannelResult { let ctx = rqctx.context(); let migration_id = path_params.into_inner().migration_id; - let vm = ctx.vm.active_vm().ok_or_else(not_created_error)?; + let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?; Ok(vm.request_migration_out(migration_id, websock).await?) } @@ -553,6 +554,7 @@ async fn instance_migrate_status( let ctx = rqctx.context(); ctx.vm .state_watcher() + .await .map(|rx| HttpResponseOk(rx.borrow().migration.clone())) .map_err(|e| match e { VmError::NotCreated | VmError::WaitingToInitialize => { @@ -573,15 +575,16 @@ async fn instance_issue_crucible_snapshot_request( rqctx: RequestContext>, path_params: Path, ) -> Result, HttpError> { - let vm = rqctx.context().vm.active_vm().ok_or_else(not_created_error)?; - let objects = vm.objects().await; - let crucible_backends = objects.crucible_backends(); + let vm = + rqctx.context().vm.active_vm().await.ok_or_else(not_created_error)?; + let objects = vm.objects().read().await; let path_params = path_params.into_inner(); - let backend = crucible_backends.get(&path_params.id).ok_or_else(|| { - let s = format!("no disk with id {}!", path_params.id); - HttpError::for_not_found(Some(s.clone()), s) - })?; + let backend = + objects.crucible_backends().get(&path_params.id).ok_or_else(|| { + let s = format!("no disk with id {}!", path_params.id); + HttpError::for_not_found(Some(s.clone()), s) + })?; backend.snapshot(path_params.snapshot_id).await.map_err(|e| { HttpError::for_bad_request(Some(e.to_string()), e.to_string()) })?; @@ -599,13 +602,14 @@ async fn disk_volume_status( path_params: Path, ) -> Result, HttpError> { let path_params = path_params.into_inner(); - let vm = rqctx.context().vm.active_vm().ok_or_else(not_created_error)?; - let objects = vm.objects().await; - let crucible_backends = objects.crucible_backends(); - let backend = crucible_backends.get(&path_params.id).ok_or_else(|| { - let s = format!("No crucible backend for id {}", path_params.id); - HttpError::for_not_found(Some(s.clone()), s) - })?; + let vm = + rqctx.context().vm.active_vm().await.ok_or_else(not_created_error)?; + let objects = vm.objects().read().await; + let backend = + objects.crucible_backends().get(&path_params.id).ok_or_else(|| { + let s = format!("No crucible backend for id {}", path_params.id); + HttpError::for_not_found(Some(s.clone()), s) + })?; Ok(HttpResponseOk(api::VolumeStatus { active: backend.volume_is_active().await.map_err(|e| { @@ -630,7 +634,8 @@ async fn 
instance_issue_crucible_vcr_request( let disk_name = request.name; let (tx, rx) = tokio::sync::oneshot::channel(); - let vm = rqctx.context().vm.active_vm().ok_or_else(not_created_error)?; + let vm = + rqctx.context().vm.active_vm().await.ok_or_else(not_created_error)?; vm.reconfigure_crucible_volume(disk_name, path_params.id, new_vcr_json, tx) .map_err(|e| match e { @@ -660,9 +665,9 @@ async fn instance_issue_crucible_vcr_request( async fn instance_issue_nmi( rqctx: RequestContext>, ) -> Result, HttpError> { - let vm = rqctx.context().vm.active_vm().ok_or_else(not_created_error)?; - let objects = vm.objects().await; - let _ = objects.machine().inject_nmi(); + let vm = + rqctx.context().vm.active_vm().await.ok_or_else(not_created_error)?; + let _ = vm.objects().read().await.machine().inject_nmi(); Ok(HttpResponseOk(())) } diff --git a/bin/propolis-server/src/lib/vm/active.rs b/bin/propolis-server/src/lib/vm/active.rs new file mode 100644 index 000000000..2d9ded45c --- /dev/null +++ b/bin/propolis-server/src/lib/vm/active.rs @@ -0,0 +1,87 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The `ActiveVm` wrapper owns all of the components and services that make up +//! a running Propolis instance. + +use std::sync::Arc; + +use propolis_api_types::{InstanceProperties, InstanceStateRequested}; +use slog::info; +use uuid::Uuid; + +use crate::vm::request_queue::ExternalRequest; + +use super::{ + objects::VmObjects, services::VmServices, CrucibleReplaceResultTx, + InstanceStateRx, VmError, +}; + +/// The components and services that make up an active Propolis VM. +pub(crate) struct ActiveVm { + pub(super) log: slog::Logger, + pub(super) state_driver_queue: Arc, + pub(super) external_state_rx: InstanceStateRx, + pub(super) properties: InstanceProperties, + pub(super) objects: Arc, + pub(super) services: VmServices, +} + +impl ActiveVm { + pub(crate) fn objects(&self) -> &Arc { + &self.objects + } + + pub(crate) fn put_state( + &self, + requested: InstanceStateRequested, + ) -> Result<(), VmError> { + info!(self.log, "requested state via API"; + "state" => ?requested); + + self.state_driver_queue + .queue_external_request(match requested { + InstanceStateRequested::Run => ExternalRequest::Start, + InstanceStateRequested::Stop => ExternalRequest::Stop, + InstanceStateRequested::Reboot => ExternalRequest::Reboot, + }) + .map_err(Into::into) + } + + pub(crate) async fn request_migration_out( + &self, + migration_id: Uuid, + websock: dropshot::WebsocketConnection, + ) -> Result<(), VmError> { + Ok(self.state_driver_queue.queue_external_request( + ExternalRequest::MigrateAsSource { + migration_id, + websock: websock.into(), + }, + )?) 
+ } + + pub(crate) fn reconfigure_crucible_volume( + &self, + disk_name: String, + backend_id: Uuid, + new_vcr_json: String, + result_tx: CrucibleReplaceResultTx, + ) -> Result<(), VmError> { + self.state_driver_queue + .queue_external_request( + ExternalRequest::ReconfigureCrucibleVolume { + disk_name, + backend_id, + new_vcr_json, + result_tx, + }, + ) + .map_err(Into::into) + } + + pub(crate) fn services(&self) -> &VmServices { + &self.services + } +} diff --git a/bin/propolis-server/src/lib/vm/lifecycle_ops.rs b/bin/propolis-server/src/lib/vm/lifecycle_ops.rs deleted file mode 100644 index 4843969d2..000000000 --- a/bin/propolis-server/src/lib/vm/lifecycle_ops.rs +++ /dev/null @@ -1,151 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use std::{ - pin::Pin, - task::{Context, Poll}, -}; - -use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt}; -use slog::{error, info}; - -impl super::VmObjects { - /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated - /// devices and vCPUs are brought to a consistent state. - /// - /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl) - /// will fail. A corresponding `resume_vm()` call must be made prior to - /// allowing vCPU tasks to run. - pub(super) fn pause_vm(&self) { - info!(self.log, "pausing kernel VMM resources"); - self.machine.hdl.pause().expect("VM_PAUSE should succeed"); - } - - pub(super) fn resume_vm(&self) { - info!(self.log, "resuming kernel VMM resources"); - self.machine.hdl.resume().expect("VM_RESUME should succeed"); - } - - pub(super) fn reset_devices_and_machine(&self) { - self.for_each_device(|name, dev| { - info!(self.log, "sending reset request to {}", name); - dev.reset(); - }); - - self.machine.reinitialize().unwrap(); - } - - pub(super) async fn start_devices(&self) -> anyhow::Result<()> { - self.for_each_device_fallible(|name, dev| { - info!(self.log, "sending startup complete to {}", name); - let res = dev.start(); - if let Err(e) = &res { - error!(self.log, "startup failed for {}: {:?}", name, e); - } - res - })?; - - for (name, backend) in self.block_backends.iter() { - info!(self.log, "starting block backend {}", name); - let res = backend.start().await; - if let Err(e) = &res { - error!(self.log, "Startup failed for {}: {:?}", name, e); - return res; - } - } - - Ok(()) - } - - pub(super) async fn pause_devices(&self) { - self.for_each_device(|name, dev| { - info!(self.log, "sending pause request to {}", name); - dev.pause(); - }); - - struct NamedFuture { - name: String, - future: BoxFuture<'static, ()>, - } - - impl std::future::Future for NamedFuture { - type Output = String; - - fn poll( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll { - let mut_self = self.get_mut(); - match Pin::new(&mut mut_self.future).poll(cx) { - Poll::Pending => Poll::Pending, - Poll::Ready(()) => Poll::Ready(mut_self.name.clone()), - } - } - } - - info!(self.log, "waiting for devices to pause"); - let mut stream: FuturesUnordered<_> = self - .lifecycle_components - .iter() - .map(|(name, dev)| { - info!(self.log, "got paused future from dev {}", name); - NamedFuture { name: name.clone(), future: dev.paused() } - }) - .collect(); - - loop { - match stream.next().await { - Some(name) => { - info!(self.log, "dev {} completed pause", name); - } - - None => { - info!(self.log, "all devices paused"); - break; - } - 
} - } - } - - pub(super) fn resume_devices(&self) { - self.for_each_device(|name, dev| { - info!(self.log, "sending resume request to {}", name); - dev.resume(); - }) - } - - pub(super) async fn halt_devices(&self) { - self.for_each_device(|name, dev| { - info!(self.log, "sending halt request to {}", name); - dev.halt(); - }); - - for (name, backend) in self.block_backends.iter() { - info!(self.log, "stopping and detaching block backend {}", name); - backend.stop().await; - if let Err(err) = backend.detach() { - error!(self.log, "error detaching block backend"; - "name" => name, - "error" => ?err); - } - } - } - - pub(super) fn reset_vcpu_state(&self) { - for vcpu in self.machine.vcpus.iter() { - info!(self.log, "resetting vCPU {}", vcpu.id); - vcpu.activate().unwrap(); - vcpu.reboot_state().unwrap(); - if vcpu.is_bsp() { - info!(self.log, "Resetting BSP vCPU {}", vcpu.id); - vcpu.set_run_state(propolis::bhyve_api::VRS_RUN, None).unwrap(); - vcpu.set_reg( - propolis::bhyve_api::vm_reg_name::VM_REG_GUEST_RIP, - 0xfff0, - ) - .unwrap(); - } - } - } -} diff --git a/bin/propolis-server/src/lib/vm/migrate_commands.rs b/bin/propolis-server/src/lib/vm/migrate_commands.rs index 80ebb3b05..f5d1cb8eb 100644 --- a/bin/propolis-server/src/lib/vm/migrate_commands.rs +++ b/bin/propolis-server/src/lib/vm/migrate_commands.rs @@ -23,7 +23,7 @@ pub enum MigrateTargetCommand { #[derive(Clone)] pub enum MigrateTargetResponse { - VmObjectsInitialized(Result, String>), + VmObjectsInitialized(Result, String>), } /// A message sent from a live migration driver to the state worker, asking it diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index b1cc7fbe3..c54d904f0 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -5,34 +5,24 @@ //! This module implements the `Vm` wrapper type that encapsulates a single //! instance on behalf of a Propolis server. -use std::{ - collections::BTreeMap, - net::SocketAddr, - sync::{Arc, RwLock}, -}; +use std::{collections::BTreeMap, net::SocketAddr, sync::Arc}; +use active::ActiveVm; use oximeter::types::ProducerRegistry; -use propolis::{ - hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart}, - vmm::Machine, -}; use propolis_api_types::{ instance_spec::{v0::InstanceSpecV0, VersionedInstanceSpec}, - InstanceProperties, InstanceStateRequested, + InstanceProperties, }; -use request_queue::ExternalRequest; use rfb::server::VncServer; use slog::info; -use state_publisher::{ExternalStateUpdate, StatePublisher}; -use uuid::Uuid; +use state_publisher::StatePublisher; -use crate::{ - serial::Serial, server::MetricsEndpointConfig, vnc::PropolisVncServer, -}; +use crate::{server::MetricsEndpointConfig, vnc::PropolisVncServer}; +mod active; pub(crate) mod guest_event; -mod lifecycle_ops; pub(crate) mod migrate_commands; +pub(crate) mod objects; mod request_queue; mod services; mod state_driver; @@ -84,7 +74,7 @@ pub(crate) enum VmError { /// The top-level VM wrapper type. Callers are expected to wrap this in an /// `Arc`. 
pub(crate) struct Vm { - inner: RwLock, + inner: tokio::sync::RwLock, log: slog::Logger, } @@ -93,193 +83,6 @@ struct VmInner { driver: Option>, } -pub(crate) struct VmObjects { - log: slog::Logger, - instance_spec: InstanceSpecV0, - machine: Machine, - lifecycle_components: LifecycleMap, - block_backends: BlockBackendMap, - crucible_backends: CrucibleBackendMap, - com1: Arc>, - framebuffer: Option>, - ps2ctrl: Arc, -} - -impl VmObjects { - pub(crate) fn instance_spec(&self) -> &InstanceSpecV0 { - &self.instance_spec - } - - pub(crate) fn machine(&self) -> &Machine { - &self.machine - } - - pub(crate) fn device_by_name( - &self, - name: &str, - ) -> Option> { - self.lifecycle_components.get(name).cloned() - } - - pub(crate) fn crucible_backends(&self) -> &CrucibleBackendMap { - &self.crucible_backends - } - - pub(crate) fn com1(&self) -> &Arc> { - &self.com1 - } - - pub(crate) fn for_each_device( - &self, - mut func: impl FnMut(&str, &Arc), - ) { - for (name, dev) in self.lifecycle_components.iter() { - func(name, dev); - } - } - - pub(crate) fn for_each_device_fallible( - &self, - mut func: impl FnMut( - &str, - &Arc, - ) -> std::result::Result<(), E>, - ) -> std::result::Result<(), E> { - for (name, dev) in self.lifecycle_components.iter() { - func(name, dev)?; - } - - Ok(()) - } -} - -impl Drop for VmObjects { - fn drop(&mut self) { - info!(self.log, "dropping VM objects"); - } -} - -/// The state stored in a [`Vm`] when there is an actual underlying virtual -/// machine. -pub(super) struct ActiveVm { - parent: Arc, - log: slog::Logger, - - state_driver_queue: Arc, - external_state_rx: InstanceStateRx, - - properties: InstanceProperties, - - objects: Option>, - services: Option, -} - -impl ActiveVm { - pub(crate) fn log(&self) -> &slog::Logger { - &self.log - } - - pub(crate) async fn objects( - &self, - ) -> tokio::sync::RwLockReadGuard<'_, VmObjects> { - self.objects.as_ref().unwrap().read().await - } - - async fn objects_mut( - &self, - ) -> tokio::sync::RwLockWriteGuard<'_, VmObjects> { - self.objects.as_ref().unwrap().write().await - } - - pub(crate) fn put_state( - &self, - requested: InstanceStateRequested, - ) -> Result<(), VmError> { - info!(self.log, "requested state via API"; - "state" => ?requested); - - self.state_driver_queue - .queue_external_request(match requested { - InstanceStateRequested::Run => ExternalRequest::Start, - InstanceStateRequested::Stop => ExternalRequest::Stop, - InstanceStateRequested::Reboot => ExternalRequest::Reboot, - }) - .map_err(Into::into) - } - - pub(crate) async fn request_migration_out( - &self, - migration_id: Uuid, - websock: dropshot::WebsocketConnection, - ) -> Result<(), VmError> { - Ok(self.state_driver_queue.queue_external_request( - ExternalRequest::MigrateAsSource { - migration_id, - websock: websock.into(), - }, - )?) 
- } - - pub(crate) fn reconfigure_crucible_volume( - &self, - disk_name: String, - backend_id: Uuid, - new_vcr_json: String, - result_tx: CrucibleReplaceResultTx, - ) -> Result<(), VmError> { - self.state_driver_queue - .queue_external_request( - ExternalRequest::ReconfigureCrucibleVolume { - disk_name, - backend_id, - new_vcr_json, - result_tx, - }, - ) - .map_err(Into::into) - } - - pub(crate) fn services(&self) -> &services::VmServices { - self.services.as_ref().expect("active VMs always have services") - } -} - -impl Drop for ActiveVm { - fn drop(&mut self) { - info!(self.log, "dropping active VM"); - - let driver = self - .parent - .inner - .write() - .unwrap() - .driver - .take() - .expect("active VMs always have a driver"); - - let objects = - self.objects.take().expect("active VMs should always have objects"); - - let services = self - .services - .take() - .expect("active VMs should always have services"); - - let parent = self.parent.clone(); - let log = self.log.clone(); - tokio::spawn(async move { - drop(objects); - services.stop(&log).await; - - let mut tx = driver.await.expect("state driver shouldn't panic"); - tx.update(ExternalStateUpdate::Instance( - propolis_api_types::InstanceState::Destroyed, - )); - parent.complete_rundown().await; - }); - } -} - struct UninitVm { external_state_rx: InstanceStateRx, properties: InstanceProperties, @@ -305,7 +108,7 @@ enum VmState { /// This state machine has never held a VM. NoVm, WaitingForInit(UninitVm), - Active(Arc), + Active(active::ActiveVm), Rundown(UninitVm), RundownComplete(UninitVm), } @@ -324,26 +127,34 @@ impl Vm { pub fn new(log: &slog::Logger) -> Arc { let log = log.new(slog::o!("component" => "vm_wrapper")); let inner = VmInner { state: VmState::NoVm, driver: None }; - Arc::new(Self { inner: RwLock::new(inner), log }) + Arc::new(Self { inner: tokio::sync::RwLock::new(inner), log }) } - pub(super) fn active_vm(&self) -> Option> { - let guard = self.inner.read().unwrap(); - if let VmState::Active(vm) = &guard.state { - Some(vm.clone()) - } else { - None - } + pub(super) async fn active_vm( + &self, + ) -> Option> { + tokio::sync::RwLockReadGuard::try_map( + self.inner.read().await, + |inner| { + if let VmState::Active(vm) = &inner.state { + Some(vm) + } else { + None + } + }, + ) + .ok() } pub(super) async fn get( &self, ) -> Result { - let vm = match &self.inner.read().unwrap().state { + let guard = self.inner.read().await; + let vm = match &guard.state { VmState::NoVm => { return Err(VmError::NotCreated); } - VmState::Active(vm) => vm.clone(), + VmState::Active(vm) => vm, VmState::WaitingForInit(vm) | VmState::Rundown(vm) | VmState::RundownComplete(vm) => { @@ -355,7 +166,7 @@ impl Vm { } }; - let spec = vm.objects().await.instance_spec().clone(); + let spec = vm.objects().read().await.instance_spec().clone(); let state = vm.external_state_rx.borrow().clone(); Ok(propolis_api_types::InstanceSpecGetResponse { properties: vm.properties.clone(), @@ -364,8 +175,10 @@ impl Vm { }) } - pub(super) fn state_watcher(&self) -> Result { - let guard = self.inner.read().unwrap(); + pub(super) async fn state_watcher( + &self, + ) -> Result { + let guard = self.inner.read().await; match &guard.state { VmState::NoVm => Err(VmError::NotCreated), VmState::Active(vm) => Ok(vm.external_state_rx.clone()), @@ -375,30 +188,26 @@ impl Vm { } } - fn make_active( + async fn make_active( self: &Arc, log: &slog::Logger, state_driver_queue: Arc, - objects: VmObjects, + objects: &Arc, services: services::VmServices, - ) -> Arc { + ) { info!(self.log, 
"installing active VM"); - let mut guard = self.inner.write().unwrap(); + let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { VmState::WaitingForInit(vm) => { - let active = Arc::new(ActiveVm { - parent: self.clone(), + guard.state = VmState::Active(ActiveVm { log: log.clone(), state_driver_queue, external_state_rx: vm.external_state_rx, properties: vm.properties, - objects: Some(tokio::sync::RwLock::new(objects)), - services: Some(services), + objects: objects.clone(), + services, }); - - guard.state = VmState::Active(active.clone()); - active } _ => unreachable!( "only a starting VM's state worker calls make_active" @@ -406,8 +215,8 @@ impl Vm { } } - fn start_failed(&self) { - let mut guard = self.inner.write().unwrap(); + async fn start_failed(&self) { + let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { VmState::WaitingForInit(vm) => { @@ -419,23 +228,31 @@ impl Vm { } async fn set_rundown(&self) { - let vm = self - .active_vm() - .expect("VM should be active before being run down"); - info!(self.log, "setting VM rundown"); - let new_state = VmState::Rundown(UninitVm { - external_state_rx: vm.external_state_rx.clone(), - properties: vm.properties.clone(), - spec: vm.objects().await.instance_spec.clone(), - }); + let services = { + let mut guard = self.inner.write().await; + let VmState::Active(vm) = + std::mem::replace(&mut guard.state, VmState::NoVm) + else { + panic!("VM should be active before being run down"); + }; + + let spec = vm.objects().read().await.instance_spec().clone(); + let ActiveVm { external_state_rx, properties, .. } = vm; + guard.state = VmState::Rundown(UninitVm { + external_state_rx, + properties, + spec, + }); + vm.services + }; - self.inner.write().unwrap().state = new_state; + services.stop(&self.log).await; } async fn complete_rundown(&self) { info!(self.log, "completing VM rundown"); - let mut guard = self.inner.write().unwrap(); + let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { VmState::Rundown(vm) => guard.state = VmState::RundownComplete(vm), @@ -478,7 +295,7 @@ impl Vm { // creating a new VM and there's no easy way to upgrade from a reader // lock to a writer lock. { - let mut guard = self.inner.write().unwrap(); + let mut guard = self.inner.write().await; match guard.state { VmState::WaitingForInit(_) => { return Err(VmError::WaitingToInitialize) diff --git a/bin/propolis-server/src/lib/vm/objects.rs b/bin/propolis-server/src/lib/vm/objects.rs new file mode 100644 index 000000000..141b4ae7a --- /dev/null +++ b/bin/propolis-server/src/lib/vm/objects.rs @@ -0,0 +1,372 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Provides a type that collects all of the components that make up a Propolis +//! VM. 
+ +use std::{ + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt}; +use propolis::{ + hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart}, + Machine, +}; +use propolis_api_types::instance_spec::v0::InstanceSpecV0; +use slog::{error, info}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; + +use crate::{serial::Serial, vcpu_tasks::VcpuTaskController}; + +use super::{ + state_driver::VmStartReason, BlockBackendMap, CrucibleBackendMap, + LifecycleMap, +}; + +pub(crate) struct VmObjects { + log: slog::Logger, + parent: Arc, + inner: RwLock, +} + +pub(super) struct InputVmObjects { + pub instance_spec: InstanceSpecV0, + pub vcpu_tasks: Box, + pub machine: Machine, + pub lifecycle_components: LifecycleMap, + pub block_backends: BlockBackendMap, + pub crucible_backends: CrucibleBackendMap, + pub com1: Arc>, + pub framebuffer: Option>, + pub ps2ctrl: Arc, +} + +pub(crate) struct VmObjectsLocked { + log: slog::Logger, + instance_spec: InstanceSpecV0, + vcpu_tasks: Box, + machine: Machine, + lifecycle_components: LifecycleMap, + block_backends: BlockBackendMap, + crucible_backends: CrucibleBackendMap, + com1: Arc>, + framebuffer: Option>, + ps2ctrl: Arc, +} + +impl VmObjects { + pub(super) fn new( + log: slog::Logger, + parent: Arc, + input: InputVmObjects, + ) -> Self { + let inner = VmObjectsLocked::new(&log, input); + Self { log, parent, inner: tokio::sync::RwLock::new(inner) } + } + + pub(crate) fn log(&self) -> &slog::Logger { + &self.log + } + + pub(crate) async fn read(&self) -> RwLockReadGuard { + self.inner.read().await + } + + pub(super) async fn write(&self) -> RwLockWriteGuard { + self.inner.write().await + } +} + +impl VmObjectsLocked { + fn new(log: &slog::Logger, input: InputVmObjects) -> Self { + Self { + log: log.clone(), + instance_spec: input.instance_spec, + vcpu_tasks: input.vcpu_tasks, + machine: input.machine, + lifecycle_components: input.lifecycle_components, + block_backends: input.block_backends, + crucible_backends: input.crucible_backends, + com1: input.com1, + framebuffer: input.framebuffer, + ps2ctrl: input.ps2ctrl, + } + } + + pub(crate) fn instance_spec(&self) -> &InstanceSpecV0 { + &self.instance_spec + } + + pub(crate) fn instance_spec_mut(&mut self) -> &mut InstanceSpecV0 { + &mut self.instance_spec + } + + pub(crate) fn machine(&self) -> &Machine { + &self.machine + } + + pub(crate) fn device_by_name( + &self, + name: &str, + ) -> Option> { + self.lifecycle_components.get(name).cloned() + } + + pub(crate) fn crucible_backends(&self) -> &CrucibleBackendMap { + &self.crucible_backends + } + + pub(crate) fn com1(&self) -> &Arc> { + &self.com1 + } + + pub(crate) fn framebuffer(&self) -> &Option> { + &self.framebuffer + } + + pub(crate) fn ps2ctrl(&self) -> &Arc { + &self.ps2ctrl + } + + pub(crate) fn for_each_device( + &self, + mut func: impl FnMut(&str, &Arc), + ) { + for (name, dev) in self.lifecycle_components.iter() { + func(name, dev); + } + } + + pub(crate) fn for_each_device_fallible( + &self, + mut func: impl FnMut( + &str, + &Arc, + ) -> std::result::Result<(), E>, + ) -> std::result::Result<(), E> { + for (name, dev) in self.lifecycle_components.iter() { + func(name, dev)?; + } + + Ok(()) + } + + /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated + /// devices and vCPUs are brought to a consistent state. + /// + /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl) + /// will fail. 
A corresponding `resume_kernel_vm()` call must be made prior to
+    /// allowing vCPU tasks to run.
+    pub(super) fn pause_kernel_vm(&self) {
+        info!(self.log, "pausing kernel VMM resources");
+        self.machine.hdl.pause().expect("VM_PAUSE should succeed");
+    }
+
+    pub(super) fn resume_kernel_vm(&self) {
+        info!(self.log, "resuming kernel VMM resources");
+        self.machine.hdl.resume().expect("VM_RESUME should succeed");
+    }
+
+    pub(super) fn reset_devices_and_machine(&self) {
+        self.for_each_device(|name, dev| {
+            info!(self.log, "sending reset request to {}", name);
+            dev.reset();
+        });
+
+        self.machine.reinitialize().unwrap();
+    }
+
+    pub(super) async fn start(
+        &mut self,
+        reason: VmStartReason,
+    ) -> anyhow::Result<()> {
+        match reason {
+            VmStartReason::ExplicitRequest => {
+                self.reset_vcpus();
+            }
+            VmStartReason::MigratedIn => {
+                self.resume_kernel_vm();
+            }
+        }
+
+        let result = self.start_devices().await;
+        if result.is_ok() {
+            self.vcpu_tasks.resume_all();
+        }
+
+        result
+    }
+
+    pub(super) async fn pause(&mut self) {
+        self.vcpu_tasks.pause_all();
+        self.pause_devices().await;
+        self.pause_kernel_vm();
+    }
+
+    pub(super) fn resume(&mut self) {
+        self.resume_kernel_vm();
+        self.resume_devices();
+        self.vcpu_tasks.resume_all();
+    }
+
+    pub(super) async fn halt(&mut self) {
+        self.vcpu_tasks.exit_all();
+        self.halt_devices().await;
+    }
+
+    pub(super) fn reset_vcpus(&self) {
+        self.vcpu_tasks.new_generation();
+        self.reset_vcpu_state();
+    }
+
+    pub(super) async fn reboot(&mut self) {
+        // Reboot is implemented as a pause -> reset -> resume transition.
+        //
+        // First, pause the vCPUs and all devices so no partially-completed
+        // work is present.
+        self.vcpu_tasks.pause_all();
+        self.pause_devices().await;
+
+        // Reset all entities and the VM's bhyve state, then reset the
+        // vCPUs. The vCPU reset must come after the bhyve reset.
+        self.reset_devices_and_machine();
+        self.reset_vcpus();
+
+        // Resume devices so they're ready to do more work, then resume
+        // vCPUs.
+ self.resume_devices(); + self.vcpu_tasks.resume_all(); + } + + pub(super) async fn start_devices(&self) -> anyhow::Result<()> { + self.for_each_device_fallible(|name, dev| { + info!(self.log, "sending startup complete to {}", name); + let res = dev.start(); + if let Err(e) = &res { + error!(self.log, "startup failed for {}: {:?}", name, e); + } + res + })?; + + for (name, backend) in self.block_backends.iter() { + info!(self.log, "starting block backend {}", name); + let res = backend.start().await; + if let Err(e) = &res { + error!(self.log, "Startup failed for {}: {:?}", name, e); + return res; + } + } + + Ok(()) + } + + pub(super) async fn pause_devices(&self) { + self.for_each_device(|name, dev| { + info!(self.log, "sending pause request to {}", name); + dev.pause(); + }); + + struct NamedFuture { + name: String, + future: BoxFuture<'static, ()>, + } + + impl std::future::Future for NamedFuture { + type Output = String; + + fn poll( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll { + let mut_self = self.get_mut(); + match Pin::new(&mut mut_self.future).poll(cx) { + Poll::Pending => Poll::Pending, + Poll::Ready(()) => Poll::Ready(mut_self.name.clone()), + } + } + } + + info!(self.log, "waiting for devices to pause"); + let mut stream: FuturesUnordered<_> = self + .lifecycle_components + .iter() + .map(|(name, dev)| { + info!(self.log, "got paused future from dev {}", name); + NamedFuture { name: name.clone(), future: dev.paused() } + }) + .collect(); + + loop { + match stream.next().await { + Some(name) => { + info!(self.log, "dev {} completed pause", name); + } + + None => { + info!(self.log, "all devices paused"); + break; + } + } + } + } + + pub(super) fn resume_devices(&self) { + self.for_each_device(|name, dev| { + info!(self.log, "sending resume request to {}", name); + dev.resume(); + }) + } + + pub(super) async fn halt_devices(&self) { + self.for_each_device(|name, dev| { + info!(self.log, "sending halt request to {}", name); + dev.halt(); + }); + + for (name, backend) in self.block_backends.iter() { + info!(self.log, "stopping and detaching block backend {}", name); + backend.stop().await; + if let Err(err) = backend.detach() { + error!(self.log, "error detaching block backend"; + "name" => name, + "error" => ?err); + } + } + } + + pub(super) fn reset_vcpu_state(&self) { + for vcpu in self.machine.vcpus.iter() { + info!(self.log, "resetting vCPU {}", vcpu.id); + vcpu.activate().unwrap(); + vcpu.reboot_state().unwrap(); + if vcpu.is_bsp() { + info!(self.log, "Resetting BSP vCPU {}", vcpu.id); + vcpu.set_run_state(propolis::bhyve_api::VRS_RUN, None).unwrap(); + vcpu.set_reg( + propolis::bhyve_api::vm_reg_name::VM_REG_GUEST_RIP, + 0xfff0, + ) + .unwrap(); + } + } + } +} + +impl Drop for VmObjects { + fn drop(&mut self) { + // Signal to these objects' owning VM that rundown has completed and a + // new VM can be created. + // + // It is always safe to complete rundown at this point because an + // `ActiveVm` always holds a reference to its `VmObjects`, and the + // parent VM doesn't drop its `ActiveVm` until rundown begins. 
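
For comparison, the hand-rolled `NamedFuture` wrapper in `pause_devices` above can also be expressed with `FutureExt::map`, which tags each pause future with its device name. A sketch under the assumption that each device hands back a boxed `()`-returning future:

    use futures::{
        future::BoxFuture, stream::FuturesUnordered, FutureExt, StreamExt,
    };

    // Drain a set of named futures in completion order, logging each name.
    async fn wait_all_paused(futures: Vec<(String, BoxFuture<'static, ()>)>) {
        let mut stream: FuturesUnordered<_> = futures
            .into_iter()
            // Adapt each future so it resolves to its device's name.
            .map(|(name, fut)| fut.map(move |()| name))
            .collect();

        while let Some(name) = stream.next().await {
            println!("dev {name} completed pause");
        }
    }

The manual `poll` implementation avoids the extra combinator allocation, but the two are otherwise equivalent.
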
+ let parent = self.parent.clone(); + tokio::spawn(async move { + parent.complete_rundown().await; + }); + } +} diff --git a/bin/propolis-server/src/lib/vm/services.rs b/bin/propolis-server/src/lib/vm/services.rs index 5d01b7516..7deac845d 100644 --- a/bin/propolis-server/src/lib/vm/services.rs +++ b/bin/propolis-server/src/lib/vm/services.rs @@ -16,7 +16,7 @@ use crate::{ stats::virtual_machine::VirtualMachine, vnc::PropolisVncServer, }; -use super::VmObjects; +use super::objects::{VmObjects, VmObjectsLocked}; #[derive(Default)] pub(crate) struct OximeterState { @@ -48,13 +48,14 @@ impl VmServices { OximeterState::default() }; + let vm_objects = vm_objects.read().await; let vnc_server = ensure_options.vnc_server.clone(); - if let Some(ramfb) = &vm_objects.framebuffer { + if let Some(ramfb) = vm_objects.framebuffer() { vnc_server .server .initialize( crate::vnc::RamFb::new(ramfb.get_framebuffer_spec()), - vm_objects.ps2ctrl.clone(), + vm_objects.ps2ctrl().clone(), vm.clone(), ) .await; @@ -67,7 +68,7 @@ impl VmServices { })); } - let serial_task = start_serial_task(log, vm_objects).await; + let serial_task = start_serial_task(log, &vm_objects).await; Self { serial_task: tokio::sync::Mutex::new(Some(serial_task)), @@ -162,7 +163,7 @@ async fn register_oximeter_producer( async fn start_serial_task( log: &slog::Logger, - vm_objects: &VmObjects, + vm_objects: &tokio::sync::RwLockReadGuard<'_, VmObjectsLocked>, ) -> crate::serial::SerialTask { let (websocks_ch, websocks_recv) = tokio::sync::mpsc::channel(1); let (control_ch, control_recv) = tokio::sync::mpsc::channel(1); diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 395426778..08114671a 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -14,8 +14,8 @@ use propolis_api_types::{ components::backends::CrucibleStorageBackend, v0::StorageBackendV0, VersionedInstanceSpec, }, - InstanceMigrateInitiateResponse, InstanceProperties, InstanceState, - MigrationState, + InstanceMigrateInitiateResponse, InstanceProperties, + InstanceSpecEnsureRequest, InstanceState, MigrationState, }; use slog::{error, info}; use uuid::Uuid; @@ -25,9 +25,8 @@ use crate::{ build_instance, MachineInitializer, MachineInitializerState, }, migrate::MigrateRole, - vcpu_tasks::VcpuTaskController, vm::{ - migrate_commands::MigrateTargetCommand, + migrate_commands::MigrateTargetCommand, objects::InputVmObjects, state_publisher::ExternalStateUpdate, }, }; @@ -38,9 +37,10 @@ use super::{ MigrateSourceCommand, MigrateSourceResponse, MigrateTargetResponse, MigrateTaskEvent, }, + objects::VmObjects, request_queue::{ExternalRequest, InstanceAutoStart}, state_publisher::{MigrationStateUpdate, StatePublisher}, - VmError, VmObjects, + VmError, }; #[derive(Debug, PartialEq, Eq)] @@ -51,7 +51,7 @@ enum HandleEventOutcome { /// A reason for starting a VM. #[derive(Debug, PartialEq, Eq)] -enum VmStartReason { +pub(super) enum VmStartReason { MigratedIn, ExplicitRequest, } @@ -197,12 +197,10 @@ impl guest_event::ChipsetEventHandler for InputQueue { /// The context for a VM state driver task. 
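
The `Drop` impl above relies on handing cleanup to the runtime, since `drop` is synchronous and cannot `.await`. A sketch of that pattern, with a hypothetical `Parent` type standing in for the owning VM:

    use std::sync::Arc;

    struct Parent;

    impl Parent {
        async fn complete_rundown(&self) { /* async cleanup goes here */ }
    }

    struct Objects {
        parent: Arc<Parent>,
    }

    impl Drop for Objects {
        fn drop(&mut self) {
            let parent = self.parent.clone();
            // Note: this requires the value to be dropped inside a Tokio
            // runtime context; `tokio::spawn` panics otherwise.
            tokio::spawn(async move {
                parent.complete_rundown().await;
            });
        }
    }
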
struct StateDriver { log: slog::Logger, - parent: Arc, - active_vm: Arc, + objects: Arc, input_queue: Arc, external_state: StatePublisher, paused: bool, - vcpu_tasks: Box, migration_src_state: crate::migrate::source::PersistentState, } @@ -210,51 +208,28 @@ pub(super) async fn run_state_driver( log: slog::Logger, vm: Arc, mut external_publisher: StatePublisher, - ensure_request: propolis_api_types::InstanceSpecEnsureRequest, + ensure_request: InstanceSpecEnsureRequest, ensure_result_tx: tokio::sync::oneshot::Sender< Result, >, ensure_options: super::EnsureOptions, ) -> StatePublisher { - let input_queue = Arc::new(InputQueue::new( - log.new(slog::o!("component" => "request_queue")), - match ensure_request.migrate { - Some(_) => InstanceAutoStart::Yes, - None => InstanceAutoStart::No, - }, - )); - let migration_in_id = ensure_request.migrate.as_ref().map(|req| req.migration_id); - let (vm_objects, vcpu_tasks) = match match ensure_request.migrate { - None => { - initialize_vm_from_spec( - &log, - &input_queue, - &ensure_request.properties, - &ensure_request.instance_spec, - &ensure_options, - ) - .await - } - Some(migrate_request) => { - migrate_as_target( - &log, - &input_queue, - &ensure_request.properties, - &ensure_request.instance_spec, - &ensure_options, - migrate_request, - &mut external_publisher, - ) - .await - } - } { + let (vm_objects, input_queue) = match build_vm( + &log, + &vm, + &ensure_request, + &ensure_options, + &mut external_publisher, + ) + .await + { Ok(objects) => objects, Err(e) => { external_publisher .update(ExternalStateUpdate::Instance(InstanceState::Failed)); - vm.start_failed(); + vm.start_failed().await; let _ = ensure_result_tx.send(Err(VmError::InitializationFailed(e))); return external_publisher; @@ -275,9 +250,7 @@ pub(super) async fn run_state_driver( // // Order matters here: once the ensure result is sent, an external // caller needs to observe that an active VM is present. 
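
The ordering constraint described in that comment (install the active VM first, then send the ensure result) falls out naturally from a `oneshot` channel handshake. A minimal sketch with placeholder payload types:

    use tokio::sync::oneshot;

    // Driver side: owns the sender; sends only after the VM is installed.
    async fn driver_side(tx: oneshot::Sender<Result<u32, String>>) {
        // ... install the active VM first ...
        let _ = tx.send(Ok(42)); // ignore send errors: the caller may be gone
    }

    // API side: awaits the receiver, so success implies the VM is visible.
    async fn api_side() -> Result<u32, String> {
        let (tx, rx) = oneshot::channel();
        tokio::spawn(driver_side(tx));
        rx.await.map_err(|_| "driver dropped the sender".to_string())?
    }
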
- let active_vm = - vm.make_active(&log, input_queue.clone(), vm_objects, services); - + vm.make_active(&log, input_queue.clone(), &vm_objects, services).await; let _ = ensure_result_tx.send(Ok(propolis_api_types::InstanceEnsureResponse { migrate: migration_in_id @@ -286,16 +259,16 @@ pub(super) async fn run_state_driver( let state_driver = StateDriver { log, - parent: vm.clone(), - active_vm, + objects: vm_objects, input_queue, external_state: external_publisher, paused: false, - vcpu_tasks, migration_src_state: Default::default(), }; - state_driver.run(migration_in_id.is_some()).await + let external_tx = state_driver.run(migration_in_id.is_some()).await; + vm.set_rundown().await; + external_tx } impl StateDriver { @@ -310,7 +283,6 @@ impl StateDriver { self.run_loop().await; } - self.parent.set_rundown().await; self.external_state } @@ -344,27 +316,11 @@ impl StateDriver { ) -> anyhow::Result<()> { info!(self.log, "starting instance"; "reason" => ?start_reason); - let start_result = { - let (vm_objects, vcpu_tasks) = self.vm_objects_and_cpus().await; - match start_reason { - VmStartReason::ExplicitRequest => { - reset_vcpus(&vm_objects, vcpu_tasks); - } - VmStartReason::MigratedIn => { - vm_objects.resume_vm(); - } - } - - let result = vm_objects.start_devices().await; - if result.is_ok() { - vcpu_tasks.resume_all(); - } - - result - }; - + let start_result = self.objects.write().await.start(start_reason).await; match &start_result { - Ok(()) => self.publish_steady_state(InstanceState::Running), + Ok(()) => { + self.publish_steady_state(InstanceState::Running); + } Err(e) => { error!(&self.log, "failed to start devices"; "error" => ?e); @@ -465,26 +421,7 @@ impl StateDriver { self.external_state .update(ExternalStateUpdate::Instance(InstanceState::Rebooting)); - { - let (vm_objects, vcpu_tasks) = self.vm_objects_and_cpus().await; - - // Reboot is implemented as a pause -> reset -> resume transition. - // - // First, pause the vCPUs and all devices so no partially-completed - // work is present. - vcpu_tasks.pause_all(); - vm_objects.pause_devices().await; - - // Reset all entities and the VM's bhyve state, then reset the - // vCPUs. The vCPU reset must come after the bhyve reset. - vm_objects.reset_devices_and_machine(); - reset_vcpus(&vm_objects, vcpu_tasks); - - // Resume devices so they're ready to do more work, then resume - // vCPUs. - vm_objects.resume_devices(); - vcpu_tasks.resume_all(); - } + self.objects.write().await.reboot().await; // Notify other consumers that the instance successfully rebooted and is // now back to Running. @@ -500,37 +437,32 @@ impl StateDriver { self.external_state .update(ExternalStateUpdate::Instance(InstanceState::Stopping)); - // Entities expect to be paused before being halted. Note that the VM - // may be paused already if it is being torn down after a successful - // migration out. - if !self.paused { - self.pause().await; + { + let mut guard = self.objects.write().await; + + // Entities expect to be paused before being halted. Note that the VM + // may be paused already if it is being torn down after a successful + // migration out. 
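
The halt path below takes one write guard across the whole pause-then-halt sequence, so no other lock user can observe the objects between the two steps. A condensed sketch with a stand-in `Objects` type:

    use tokio::sync::RwLock;

    struct Objects;

    impl Objects {
        async fn pause(&mut self) {}
        async fn halt(&mut self) {}
    }

    async fn do_halt(lock: &RwLock<Objects>, paused: &mut bool) {
        // One guard for the whole sequence keeps pause and halt atomic
        // with respect to other readers and writers of the lock.
        let mut guard = lock.write().await;
        if !*paused {
            guard.pause().await;
            *paused = true;
        }
        guard.halt().await;
    }
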
+            if !self.paused {
+                guard.pause().await;
+                self.paused = true;
+            }
+
+            guard.halt().await;
+        }
 
-        self.vcpu_tasks.exit_all();
-        self.vm_objects().await.halt_devices().await;
         self.publish_steady_state(InstanceState::Stopped);
     }
 
     async fn pause(&mut self) {
         assert!(!self.paused);
-        self.vcpu_tasks.pause_all();
-        {
-            let objects = self.vm_objects().await;
-            objects.pause_devices().await;
-            objects.pause_vm();
-        }
+        self.objects.write().await.pause().await;
         self.paused = true;
     }
 
     async fn resume(&mut self) {
         assert!(self.paused);
-        {
-            let objects = self.vm_objects().await;
-            objects.resume_vm();
-            objects.resume_devices();
-        }
-        self.vcpu_tasks.resume_all();
+        self.objects.write().await.resume();
         self.paused = false;
     }
 
@@ -555,25 +487,6 @@ impl StateDriver {
         self.external_state.update(ExternalStateUpdate::Instance(state));
     }
 
-    async fn vm_objects(&self) -> tokio::sync::RwLockReadGuard<'_, VmObjects> {
-        self.active_vm.objects().await
-    }
-
-    async fn vm_objects_mut(
-        &self,
-    ) -> tokio::sync::RwLockWriteGuard<'_, VmObjects> {
-        self.active_vm.objects_mut().await
-    }
-
-    async fn vm_objects_and_cpus(
-        &mut self,
-    ) -> (
-        tokio::sync::RwLockReadGuard<'_, VmObjects>,
-        &mut dyn VcpuTaskController,
-    ) {
-        (self.active_vm.objects().await, self.vcpu_tasks.as_mut())
-    }
-
     async fn migrate_as_source(
         &mut self,
         migration_id: Uuid,
@@ -606,10 +519,10 @@ impl StateDriver {
         let (command_tx, mut command_rx) = tokio::sync::mpsc::channel(1);
         let (response_tx, response_rx) = tokio::sync::mpsc::channel(1);
 
-        let vm_for_task = self.active_vm.clone();
+        let objects_for_task = self.objects.clone();
         let mut migrate_task = tokio::spawn(async move {
             crate::migrate::source::migrate(
                 objects_for_task,
                 command_tx,
                 response_rx,
                 migrate_ctx.conn,
@@ -618,6 +531,9 @@ impl StateDriver {
             .await
         });
 
+        // The migration task may try to acquire the VM object lock for
+        // reading, so this task cannot hold it exclusively while waiting for
+        // the migration task to send an event.
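
A compact illustration of the hazard that comment describes, using hypothetical types: if the write guard stayed alive across the `recv().await`, a reader on the other side of the channel could deadlock with this task.

    use std::sync::Arc;

    use tokio::sync::{mpsc, RwLock};

    async fn wait_for_event(
        lock: Arc<RwLock<u32>>,
        mut rx: mpsc::Receiver<u32>,
    ) {
        // Take the lock only in short, bounded critical sections...
        {
            let mut guard = lock.write().await;
            *guard += 1;
        } // ...and release it before awaiting the other task.
        let _event = rx.recv().await;
    }
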
loop { match next_migrate_task_event( &mut migrate_task, @@ -628,8 +544,7 @@ impl StateDriver { { MigrateTaskEvent::TaskExited(res) => { if res.is_ok() { - self.active_vm - .state_driver_queue + self.input_queue .queue_external_request(ExternalRequest::Stop) .expect("can always queue a request to stop"); } else { @@ -693,16 +608,15 @@ impl StateDriver { "disk_name" => %disk_name, "backend_id" => %backend_id); - let mut objects = self.vm_objects_mut().await; - fn spec_element_not_found(disk_name: &str) -> dropshot::HttpError { let msg = format!("Crucible backend for {:?} not found", disk_name); dropshot::HttpError::for_not_found(Some(msg.clone()), msg) } + let mut objects = self.objects.write().await; let (readonly, old_vcr_json) = { let StorageBackendV0::Crucible(bes) = objects - .instance_spec + .instance_spec() .backends .storage_backends .get(&disk_name) @@ -715,8 +629,10 @@ impl StateDriver { }; let replace_result = { - let backend = - objects.crucible_backends.get(backend_id).ok_or_else(|| { + let backend = objects + .crucible_backends() + .get(backend_id) + .ok_or_else(|| { let msg = format!("No crucible backend for id {backend_id}"); dropshot::HttpError::for_not_found(Some(msg.clone()), msg) @@ -738,7 +654,7 @@ impl StateDriver { }); objects - .instance_spec + .instance_spec_mut() .backends .storage_backends .insert(disk_name, new_bes); @@ -749,122 +665,50 @@ impl StateDriver { } } -fn reset_vcpus( - vm_objects: &VmObjects, - vcpu_tasks: &mut dyn VcpuTaskController, -) { - vcpu_tasks.new_generation(); - vm_objects.reset_vcpu_state(); -} - -async fn initialize_vm_from_spec( +async fn build_vm( log: &slog::Logger, - event_queue: &Arc, - properties: &InstanceProperties, - spec: &VersionedInstanceSpec, + parent: &Arc, + request: &InstanceSpecEnsureRequest, options: &super::EnsureOptions, -) -> anyhow::Result<(VmObjects, Box)> { - info!(log, "initializing new VM"; - "spec" => #?spec, - "properties" => #?properties, - "use_reservoir" => options.use_reservoir, - "bootrom" => %options.toml_config.bootrom.display()); - - let vmm_log = log.new(slog::o!("component" => "vmm")); - - // Set up the 'shell' instance into which the rest of this routine will - // add components. 
- let VersionedInstanceSpec::V0(v0_spec) = &spec; - let machine = build_instance( - &properties.vm_name(), - v0_spec, - options.use_reservoir, - vmm_log, - )?; - - let mut init = MachineInitializer { - log: log.clone(), - machine: &machine, - devices: Default::default(), - block_backends: Default::default(), - crucible_backends: Default::default(), - spec: v0_spec, - properties, - toml_config: &options.toml_config, - producer_registry: options.oximeter_registry.clone(), - state: MachineInitializerState::default(), - }; - - init.initialize_rom(options.toml_config.bootrom.as_path())?; - let chipset = init.initialize_chipset( - &(event_queue.clone() - as Arc), - )?; - - init.initialize_rtc(&chipset)?; - init.initialize_hpet()?; - - let com1 = Arc::new(init.initialize_uart(&chipset)?); - let ps2ctrl = init.initialize_ps2(&chipset)?; - init.initialize_qemu_debug_port()?; - init.initialize_qemu_pvpanic(properties.into())?; - init.initialize_network_devices(&chipset)?; - - #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&options.toml_config.devices)?; - #[cfg(feature = "omicron-build")] - info!(log, "`omicron-build` feature enabled, ignoring any test devices"); - - #[cfg(feature = "falcon")] - init.initialize_softnpu_ports(&chipset)?; - #[cfg(feature = "falcon")] - init.initialize_9pfs(&chipset)?; + state_publisher: &mut StatePublisher, +) -> anyhow::Result<(Arc, Arc)> { + let input_queue = Arc::new(InputQueue::new( + log.new(slog::o!("component" => "request_queue")), + match request.migrate { + Some(_) => InstanceAutoStart::Yes, + None => InstanceAutoStart::No, + }, + )); - init.initialize_storage_devices(&chipset, options.nexus_client.clone()) + // If the caller didn't ask to initialize by live migration in, immediately + // create the VM objects and return them. + let Some(migrate_request) = &request.migrate else { + let input_objects = initialize_vm_objects_from_spec( + log, + &input_queue, + &request.properties, + &request.instance_spec, + options, + ) .await?; - let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; - init.initialize_cpus()?; - let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( - &machine, - event_queue.clone() as Arc, - log.new(slog::o!("component" => "vcpu_tasks")), - )?); - - let MachineInitializer { - devices, block_backends, crucible_backends, .. - } = init; + let vm_objects = Arc::new(VmObjects::new( + log.clone(), + parent.clone(), + input_objects, + )); - Ok(( - VmObjects { - log: log.clone(), - instance_spec: v0_spec.clone(), - machine, - lifecycle_components: devices, - block_backends, - crucible_backends, - com1, - framebuffer: Some(ramfb), - ps2ctrl, - }, - vcpu_tasks as Box, - )) -} + return Ok((vm_objects, input_queue)); + }; -async fn migrate_as_target( - log: &slog::Logger, - event_queue: &Arc, - properties: &InstanceProperties, - spec: &VersionedInstanceSpec, - options: &super::EnsureOptions, - api_request: propolis_api_types::InstanceMigrateInitiateRequest, - external_state: &mut StatePublisher, -) -> anyhow::Result<(VmObjects, Box)> { - // Use the information in the supplied migration request to connect to the - // migration source and negotiate the protocol verison to use. + // The caller has asked to initialize by live migration in. Initialize VM + // objects at the live migration task's request. + // + // Begin by contacting the source Propolis and obtaining the connection that + // the actual migration task will need. 
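
The command/response channel arrangement set up just below can be sketched in isolation; the `Command` and `Response` enums here are illustrative stand-ins for the real migration protocol messages:

    use tokio::sync::mpsc;

    #[derive(Debug)]
    enum Command { Init }
    #[derive(Debug)]
    enum Response { Ready }

    async fn spawn_protocol_task() {
        let (cmd_tx, mut cmd_rx) = mpsc::channel::<Command>(1);
        let (resp_tx, mut resp_rx) = mpsc::channel::<Response>(1);

        // The spawned task owns one end of each channel, so no shared VM
        // state has to be moved into it up front.
        let task = tokio::spawn(async move {
            cmd_tx.send(Command::Init).await.ok();
            let _ = resp_rx.recv().await; // wait for the driver's answer
        });

        // Driver side: serve commands until the task closes its sender.
        while let Some(cmd) = cmd_rx.recv().await {
            println!("got {cmd:?}");
            resp_tx.send(Response::Ready).await.ok();
        }
        task.await.unwrap();
    }
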
let migrate_ctx = crate::migrate::dest_initiate( log, - api_request, + migrate_request, options.local_server_addr, ) .await?; @@ -888,65 +732,40 @@ async fn migrate_as_target( .await }); - async fn init_sequence( - log: &slog::Logger, - event_queue: &Arc, - properties: &InstanceProperties, - spec: &VersionedInstanceSpec, - options: &super::EnsureOptions, - command_rx: &mut tokio::sync::mpsc::Receiver, - ) -> anyhow::Result<(VmObjects, Box)> { - // Migration cannot proceed (in any protocol version) until the target - // kernel VMM and Propolis components have been set up. The first - // command from the migration task should be a request to set up these - // components. - let init_command = command_rx.recv().await.ok_or_else(|| { - anyhow::anyhow!("migration task unexpectedly closed channel") - })?; - - // TODO(#706) The only extant protocol version (V0 with RON encoding) - // assumes that migration targets get an instance spec from the caller - // of the `instance_ensure` API, that the target VM will be initialized - // from this spec, and that device state will be imported in a later - // migration phase. Another approach is to get an instance spec from the - // source, amend it with information passed to the target, execute - // enough of the migration protocol to get device state payloads, and - // initialize everything in one fell swoop using the spec and payloads - // as inputs. - // - // This requires a new protocol version, so for now, only look for a - // request to initialize the VM from the caller-provided spec. + let init_command = command_rx.recv().await.ok_or_else(|| { + anyhow::anyhow!("migration task unexpectedly closed channel") + })?; + + let input_objects = 'init: { let MigrateTargetCommand::InitializeFromExternalSpec = init_command else { error!(log, "migration protocol didn't init objects first"; - "first_cmd" => ?init_command); - anyhow::bail!( - "migration protocol didn't first ask to init objects" - ); + "command" => ?init_command); + break 'init Err(anyhow::anyhow!( + "migration protocol didn't init objects first" + )); }; - initialize_vm_from_spec(log, event_queue, properties, spec, options) - .await - } + initialize_vm_objects_from_spec( + log, + &input_queue, + &request.properties, + &request.instance_spec, + options, + ) + .await + .map_err(Into::into) + }; - let (vm_objects, mut vcpu_tasks) = match init_sequence( - log, - event_queue, - properties, - spec, - options, - &mut command_rx, - ) - .await - { - Ok(o) => o, + let vm_objects = match input_objects { + Ok(o) => Arc::new(VmObjects::new(log.clone(), parent.clone(), o)), Err(e) => { let _ = response_tx .send(MigrateTargetResponse::VmObjectsInitialized(Err( e.to_string() ))) .await; - external_state.update(ExternalStateUpdate::Migration( + state_publisher.update(ExternalStateUpdate::Migration( MigrationStateUpdate { id: migrate_ctx.migration_id, state: MigrationState::Error, @@ -959,23 +778,20 @@ async fn migrate_as_target( }; // The migration task imports device state by operating directly on the - // newly-created VM objects. Before sending them to the task and allowing - // migration to continue, prepare the VM's vCPUs and objects to have state - // migrated into them. - // - // Ensure the VM's vCPUs are activated properly so that they can enter the - // guest after migration. Do this before allowing the migration task to - // continue so that reset doesn't overwrite any state written by migration. - // - // Pause the kernel VM so that emulated device state can be imported + // newly-created VM objects. 
Before sending them to the task, make sure the + // objects are ready to have state imported into them. Specifically, ensure + // that the VM's vCPUs are activated so they can enter the guest after + // migration and pause the kernel VM to allow it to import device state // consistently. - reset_vcpus(&vm_objects, vcpu_tasks.as_mut()); - vm_objects.pause_vm(); + // + // Drop the lock after this operation so that the migration task can acquire + // it. + { + let guard = vm_objects.read().await; + guard.reset_vcpus(); + guard.pause_kernel_vm(); + } - // Everything is ready, so send a reference to the newly-created VM to the - // migration task. When the task exits, it drops this reference, allowing - // this task to reclaim an owned `VmObjects` from the `Arc` wrapper. - let vm_objects = Arc::new(vm_objects); if response_tx .send(MigrateTargetResponse::VmObjectsInitialized(Ok( vm_objects.clone() @@ -983,7 +799,7 @@ async fn migrate_as_target( .await .is_err() { - vm_objects.resume_vm(); + vm_objects.write().await.resume_kernel_vm(); anyhow::bail!("migration task unexpectedly closed channel"); } @@ -995,26 +811,20 @@ async fn migrate_as_target( match action { MigrateTaskEvent::TaskExited(res) => match res { Ok(()) => { - let Ok(vm_objects) = Arc::try_unwrap(vm_objects) else { - panic!( - "migration task should have dropped its VM objects", - ); - }; - - return Ok((vm_objects, vcpu_tasks)); + return Ok((vm_objects, input_queue)); } Err(e) => { error!(log, "target migration task failed"; "error" => %e); - vm_objects.resume_vm(); + vm_objects.write().await.resume_kernel_vm(); return Err(e.into()); } }, MigrateTaskEvent::Command(MigrateTargetCommand::UpdateState( state, )) => { - external_state.update(ExternalStateUpdate::Migration( + state_publisher.update(ExternalStateUpdate::Migration( MigrationStateUpdate { state, id: migrate_ctx.migration_id, @@ -1031,6 +841,97 @@ async fn migrate_as_target( } } +async fn initialize_vm_objects_from_spec( + log: &slog::Logger, + event_queue: &Arc, + properties: &InstanceProperties, + spec: &VersionedInstanceSpec, + options: &super::EnsureOptions, +) -> anyhow::Result { + info!(log, "initializing new VM"; + "spec" => #?spec, + "properties" => #?properties, + "use_reservoir" => options.use_reservoir, + "bootrom" => %options.toml_config.bootrom.display()); + + let vmm_log = log.new(slog::o!("component" => "vmm")); + + // Set up the 'shell' instance into which the rest of this routine will + // add components. 
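
The labeled `'init:` block used in `build_vm` above is a relatively recent Rust idiom (block expressions with `break`-with-value are stable since 1.65) for exiting a computation early with a result. A minimal standalone sketch:

    fn parse(input: &str) -> Result<i64, String> {
        let value = 'init: {
            if input.is_empty() {
                // `break 'init <value>` ends the block with that value,
                // avoiding a one-off closure or helper function.
                break 'init Err("empty input".to_string());
            }
            input.parse::<i64>().map_err(|e| e.to_string())
        };
        value
    }
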
+ let VersionedInstanceSpec::V0(v0_spec) = &spec; + let machine = build_instance( + &properties.vm_name(), + v0_spec, + options.use_reservoir, + vmm_log, + )?; + + let mut init = MachineInitializer { + log: log.clone(), + machine: &machine, + devices: Default::default(), + block_backends: Default::default(), + crucible_backends: Default::default(), + spec: v0_spec, + properties, + toml_config: &options.toml_config, + producer_registry: options.oximeter_registry.clone(), + state: MachineInitializerState::default(), + }; + + init.initialize_rom(options.toml_config.bootrom.as_path())?; + let chipset = init.initialize_chipset( + &(event_queue.clone() + as Arc), + )?; + + init.initialize_rtc(&chipset)?; + init.initialize_hpet()?; + + let com1 = Arc::new(init.initialize_uart(&chipset)?); + let ps2ctrl = init.initialize_ps2(&chipset)?; + init.initialize_qemu_debug_port()?; + init.initialize_qemu_pvpanic(properties.into())?; + init.initialize_network_devices(&chipset)?; + + #[cfg(not(feature = "omicron-build"))] + init.initialize_test_devices(&options.toml_config.devices)?; + #[cfg(feature = "omicron-build")] + info!(log, "`omicron-build` feature enabled, ignoring any test devices"); + + #[cfg(feature = "falcon")] + init.initialize_softnpu_ports(&chipset)?; + #[cfg(feature = "falcon")] + init.initialize_9pfs(&chipset)?; + + init.initialize_storage_devices(&chipset, options.nexus_client.clone()) + .await?; + + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; + init.initialize_cpus()?; + let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( + &machine, + event_queue.clone() as Arc, + log.new(slog::o!("component" => "vcpu_tasks")), + )?); + + let MachineInitializer { + devices, block_backends, crucible_backends, .. + } = init; + + Ok(InputVmObjects { + instance_spec: v0_spec.clone(), + vcpu_tasks, + machine, + lifecycle_components: devices, + block_backends, + crucible_backends, + com1, + framebuffer: Some(ramfb), + ps2ctrl, + }) +} + async fn next_migrate_task_event( task: &mut tokio::task::JoinHandle< Result<(), crate::migrate::MigrateError>, diff --git a/bin/propolis-server/src/lib/vnc.rs b/bin/propolis-server/src/lib/vnc.rs index c9736e4d0..3912f7367 100644 --- a/bin/propolis-server/src/lib/vnc.rs +++ b/bin/propolis-server/src/lib/vnc.rs @@ -152,8 +152,8 @@ impl Server for PropolisVncServer { let len = fb.height as usize * fb.width as usize * 4; let mut buf = vec![0u8; len]; - if let Some(vm) = inner.vm.as_ref().unwrap().active_vm() { - let vm_objects = vm.objects().await; + if let Some(vm) = inner.vm.as_ref().unwrap().active_vm().await { + let vm_objects = vm.objects().read().await; let read = tokio::task::block_in_place(|| { let machine = vm_objects.machine(); let memctx = machine.acc_mem.access().unwrap(); From 52a9a5e77d575b400620b483baae48e200c68862 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 28 Jun 2024 03:23:31 +0000 Subject: [PATCH 31/55] [WIP] allow WaitingForInit -> RundownComplete --- bin/propolis-server/src/lib/vm/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index c54d904f0..68e494728 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -255,7 +255,9 @@ impl Vm { let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { - VmState::Rundown(vm) => guard.state = VmState::RundownComplete(vm), + VmState::WaitingForInit(vm) | VmState::Rundown(vm) => 
{ + guard.state = VmState::RundownComplete(vm) + } _ => unreachable!("VM rundown completed from invalid prior state"), } } From 6a4567e62bf69fbbbabc0b5c789a539751a22ab7 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 28 Jun 2024 03:29:35 +0000 Subject: [PATCH 32/55] [WIP] move to destroyed state on rundown complete --- bin/propolis-server/src/lib/vm/mod.rs | 26 ++++++- .../src/lib/vm/state_driver.rs | 76 +++++++++++-------- 2 files changed, 68 insertions(+), 34 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 68e494728..1413e54b2 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -15,6 +15,7 @@ use propolis_api_types::{ }; use rfb::server::VncServer; use slog::info; +use state_driver::StateDriverOutput; use state_publisher::StatePublisher; use crate::{server::MetricsEndpointConfig, vnc::PropolisVncServer}; @@ -80,7 +81,7 @@ pub(crate) struct Vm { struct VmInner { state: VmState, - driver: Option>, + driver: Option>, } struct UninitVm { @@ -215,15 +216,21 @@ impl Vm { } } - async fn start_failed(&self) { + async fn start_failed(&self, wait_for_objects: bool) { let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { VmState::WaitingForInit(vm) => { - guard.state = VmState::RundownComplete(vm) + guard.state = if wait_for_objects { + VmState::Rundown(vm) + } else { + VmState::RundownComplete(vm) + }; } _ => unreachable!( - "start failures should only occur before an active VM is installed") + "start failures should only occur before an active VM is \ + installed" + ), } } @@ -260,6 +267,17 @@ impl Vm { } _ => unreachable!("VM rundown completed from invalid prior state"), } + + let StateDriverOutput { mut state_publisher, final_state } = guard + .driver + .take() + .expect("driver must exist in rundown") + .await + .expect("state driver shouldn't panic"); + + state_publisher.update(state_publisher::ExternalStateUpdate::Instance( + final_state, + )); } pub(crate) async fn ensure( diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 08114671a..9a4a7b5a4 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -204,35 +204,53 @@ struct StateDriver { migration_src_state: crate::migrate::source::PersistentState, } +pub(super) struct StateDriverOutput { + pub state_publisher: StatePublisher, + pub final_state: InstanceState, +} + pub(super) async fn run_state_driver( log: slog::Logger, vm: Arc, - mut external_publisher: StatePublisher, + mut state_publisher: StatePublisher, ensure_request: InstanceSpecEnsureRequest, ensure_result_tx: tokio::sync::oneshot::Sender< Result, >, ensure_options: super::EnsureOptions, -) -> StatePublisher { +) -> StateDriverOutput { let migration_in_id = ensure_request.migrate.as_ref().map(|req| req.migration_id); - let (vm_objects, input_queue) = match build_vm( + + let input_queue = Arc::new(InputQueue::new( + log.new(slog::o!("component" => "request_queue")), + match &ensure_request.migrate { + Some(_) => InstanceAutoStart::Yes, + None => InstanceAutoStart::No, + }, + )); + + let vm_objects = match build_vm( &log, &vm, &ensure_request, &ensure_options, - &mut external_publisher, + &input_queue, + &mut state_publisher, ) .await { Ok(objects) => objects, - Err(e) => { - external_publisher + Err((e, objects)) => { + state_publisher 
.update(ExternalStateUpdate::Instance(InstanceState::Failed)); - vm.start_failed().await; + vm.start_failed(objects.is_some()).await; let _ = ensure_result_tx.send(Err(VmError::InitializationFailed(e))); - return external_publisher; + return StateDriverOutput { + state_publisher, + final_state: InstanceState::Failed, + }; } }; @@ -261,14 +279,14 @@ pub(super) async fn run_state_driver( log, objects: vm_objects, input_queue, - external_state: external_publisher, + external_state: state_publisher, paused: false, migration_src_state: Default::default(), }; - let external_tx = state_driver.run(migration_in_id.is_some()).await; + let state_publisher = state_driver.run(migration_in_id.is_some()).await; vm.set_rundown().await; - external_tx + StateDriverOutput { state_publisher, final_state: InstanceState::Destroyed } } impl StateDriver { @@ -670,27 +688,21 @@ async fn build_vm( parent: &Arc, request: &InstanceSpecEnsureRequest, options: &super::EnsureOptions, + input_queue: &Arc, state_publisher: &mut StatePublisher, -) -> anyhow::Result<(Arc, Arc)> { - let input_queue = Arc::new(InputQueue::new( - log.new(slog::o!("component" => "request_queue")), - match request.migrate { - Some(_) => InstanceAutoStart::Yes, - None => InstanceAutoStart::No, - }, - )); - +) -> anyhow::Result, (anyhow::Error, Option>)> { // If the caller didn't ask to initialize by live migration in, immediately // create the VM objects and return them. let Some(migrate_request) = &request.migrate else { let input_objects = initialize_vm_objects_from_spec( log, - &input_queue, + input_queue, &request.properties, &request.instance_spec, options, ) - .await?; + .await + .map_err(|e| (e, None))?; let vm_objects = Arc::new(VmObjects::new( log.clone(), @@ -698,7 +710,7 @@ async fn build_vm( input_objects, )); - return Ok((vm_objects, input_queue)); + return Ok(vm_objects); }; // The caller has asked to initialize by live migration in. Initialize VM @@ -711,7 +723,8 @@ async fn build_vm( migrate_request, options.local_server_addr, ) - .await?; + .await + .map_err(|e| (e.into(), None))?; // Spin up a task to run the migration protocol proper. 
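
The new `build_vm` error type above pairs the failure with any objects that were already created, so the caller can choose the right rundown path. A sketch of that shape with a stand-in `Objects` type:

    use std::sync::Arc;

    struct Objects;

    fn build() -> Result<Arc<Objects>, (anyhow::Error, Option<Arc<Objects>>)> {
        let objects = Arc::new(Objects);
        // A late failure hands the already-built objects back with the error.
        Err((anyhow::anyhow!("late failure"), Some(objects)))
    }

    fn caller() {
        match build() {
            Ok(_objects) => {}
            Err((e, objects)) => {
                eprintln!("build failed: {e}");
                // Whether objects exist drives Rundown vs. RundownComplete.
                let _wait_for_rundown = objects.is_some();
            }
        }
    }
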
To avoid sending the // entire VM context over to the migration task, create command and response @@ -733,7 +746,7 @@ async fn build_vm( }); let init_command = command_rx.recv().await.ok_or_else(|| { - anyhow::anyhow!("migration task unexpectedly closed channel") + (anyhow::anyhow!("migration task unexpectedly closed channel"), None) })?; let input_objects = 'init: { @@ -748,7 +761,7 @@ async fn build_vm( initialize_vm_objects_from_spec( log, - &input_queue, + input_queue, &request.properties, &request.instance_spec, options, @@ -773,7 +786,7 @@ async fn build_vm( }, )); - return Err(e); + return Err((e, None)); } }; @@ -800,7 +813,10 @@ async fn build_vm( .is_err() { vm_objects.write().await.resume_kernel_vm(); - anyhow::bail!("migration task unexpectedly closed channel"); + return Err(( + anyhow::anyhow!("migration task unexpectedly closed channel"), + Some(vm_objects), + )); } loop { @@ -811,14 +827,14 @@ async fn build_vm( match action { MigrateTaskEvent::TaskExited(res) => match res { Ok(()) => { - return Ok((vm_objects, input_queue)); + return Ok(vm_objects); } Err(e) => { error!(log, "target migration task failed"; "error" => %e); vm_objects.write().await.resume_kernel_vm(); - return Err(e.into()); + return Err((e.into(), Some(vm_objects))); } }, MigrateTaskEvent::Command(MigrateTargetCommand::UpdateState( From 57711e319f605e84381618c2fa3e23ec400f38bf Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 28 Jun 2024 16:53:55 +0000 Subject: [PATCH 33/55] [WIP] send ensure result before bulk of migration protocol --- .../src/lib/vm/migrate_commands.rs | 28 ++ bin/propolis-server/src/lib/vm/mod.rs | 1 + bin/propolis-server/src/lib/vm/startup.rs | 326 +++++++++++++++ .../src/lib/vm/state_driver.rs | 373 ++---------------- 4 files changed, 398 insertions(+), 330 deletions(-) create mode 100644 bin/propolis-server/src/lib/vm/startup.rs diff --git a/bin/propolis-server/src/lib/vm/migrate_commands.rs b/bin/propolis-server/src/lib/vm/migrate_commands.rs index f5d1cb8eb..787a970c7 100644 --- a/bin/propolis-server/src/lib/vm/migrate_commands.rs +++ b/bin/propolis-server/src/lib/vm/migrate_commands.rs @@ -67,3 +67,31 @@ pub(super) enum MigrateTaskEvent { /// The task sent a command requesting work. Command(T), } + +pub(super) async fn next_migrate_task_event( + task: &mut tokio::task::JoinHandle< + Result<(), crate::migrate::MigrateError>, + >, + command_rx: &mut tokio::sync::mpsc::Receiver, + log: &slog::Logger, +) -> MigrateTaskEvent { + if let Some(cmd) = command_rx.recv().await { + return MigrateTaskEvent::Command(cmd); + } + + // The sender side of the command channel is dropped, which means the + // migration task is exiting. Wait for it to finish and snag its result. 
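
Harvesting the task's result through its `JoinHandle`, as the code below does, also distinguishes cancellation from a panic. A condensed sketch of that bookkeeping:

    async fn harvest(task: tokio::task::JoinHandle<Result<(), String>>) {
        match task.await {
            Ok(res) => println!("task exited: {res:?}"),
            Err(join_err) if join_err.is_cancelled() => {
                panic!("task was cancelled");
            }
            Err(join_err) => {
                // Not cancelled, so the join error must carry a panic.
                std::panic::resume_unwind(join_err.into_panic());
            }
        }
    }
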
+ match task.await { + Ok(res) => { + slog::info!(log, "Migration task exited: {:?}", res); + MigrateTaskEvent::TaskExited(res) + } + Err(join_err) => { + if join_err.is_cancelled() { + panic!("Migration task canceled"); + } else { + panic!("Migration task panicked: {:?}", join_err.into_panic()); + } + } + } +} diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 1413e54b2..dcb75c747 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -26,6 +26,7 @@ pub(crate) mod migrate_commands; pub(crate) mod objects; mod request_queue; mod services; +mod startup; mod state_driver; mod state_publisher; diff --git a/bin/propolis-server/src/lib/vm/startup.rs b/bin/propolis-server/src/lib/vm/startup.rs new file mode 100644 index 000000000..610e22b80 --- /dev/null +++ b/bin/propolis-server/src/lib/vm/startup.rs @@ -0,0 +1,326 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Functionality used to create a new VM and possibly live migrate into it. + +use std::sync::Arc; + +use propolis_api_types::{ + instance_spec::VersionedInstanceSpec, InstanceProperties, + InstanceSpecEnsureRequest, MigrationState, +}; +use slog::{error, info}; +use uuid::Uuid; + +use crate::{ + initializer::{ + build_instance, MachineInitializer, MachineInitializerState, + }, + migrate::{MigrateError, MigrateRole}, +}; + +use super::{ + migrate_commands::{ + next_migrate_task_event, MigrateTargetCommand, MigrateTargetResponse, + MigrateTaskEvent, + }, + objects::{InputVmObjects, VmObjects}, + state_driver::InputQueue, + state_publisher::{ + ExternalStateUpdate, MigrationStateUpdate, StatePublisher, + }, +}; + +pub(super) struct MigrateAsTargetContext { + vm_objects: Arc, + log: slog::Logger, + migration_id: Uuid, + migrate_task: tokio::task::JoinHandle>, + command_rx: tokio::sync::mpsc::Receiver, + response_tx: tokio::sync::mpsc::Sender, +} + +pub(super) struct BuildVmOutput { + pub vm_objects: Arc, + pub migration_in: Option, +} + +pub(super) async fn build_vm( + log: &slog::Logger, + parent: &Arc, + request: &InstanceSpecEnsureRequest, + options: &super::EnsureOptions, + input_queue: &Arc, + state_publisher: &mut StatePublisher, +) -> anyhow::Result>)> { + // If the caller didn't ask to initialize by live migration in, immediately + // create the VM objects and return them. + let Some(migrate_request) = &request.migrate else { + let input_objects = initialize_vm_objects_from_spec( + log, + input_queue, + &request.properties, + &request.instance_spec, + options, + ) + .await + .map_err(|e| (e, None))?; + + let vm_objects = Arc::new(VmObjects::new( + log.clone(), + parent.clone(), + input_objects, + )); + + return Ok(BuildVmOutput { vm_objects, migration_in: None }); + }; + + // The caller has asked to initialize by live migration in. Initialize VM + // objects at the live migration task's request. + // + // Begin by contacting the source Propolis and obtaining the connection that + // the actual migration task will need. + let migrate_ctx = crate::migrate::dest_initiate( + log, + migrate_request, + options.local_server_addr, + ) + .await + .map_err(|e| (e.into(), None))?; + + // Spin up a task to run the migration protocol proper. 
To avoid sending the + // entire VM context over to the migration task, create command and response + // channels to allow the migration task to delegate work back to this + // routine. + let log_for_task = log.clone(); + let (command_tx, mut command_rx) = tokio::sync::mpsc::channel(1); + let (response_tx, response_rx) = tokio::sync::mpsc::channel(1); + let migrate_task = tokio::spawn(async move { + crate::migrate::destination::migrate( + &log_for_task, + command_tx, + response_rx, + migrate_ctx.conn, + migrate_ctx.local_addr, + migrate_ctx.protocol, + ) + .await + }); + + let init_command = command_rx.recv().await.ok_or_else(|| { + (anyhow::anyhow!("migration task unexpectedly closed channel"), None) + })?; + + let input_objects = 'init: { + let MigrateTargetCommand::InitializeFromExternalSpec = init_command + else { + error!(log, "migration protocol didn't init objects first"; + "command" => ?init_command); + break 'init Err(anyhow::anyhow!( + "migration protocol didn't init objects first" + )); + }; + + initialize_vm_objects_from_spec( + log, + input_queue, + &request.properties, + &request.instance_spec, + options, + ) + .await + .map_err(Into::into) + }; + + let vm_objects = match input_objects { + Ok(o) => Arc::new(VmObjects::new(log.clone(), parent.clone(), o)), + Err(e) => { + let _ = response_tx + .send(MigrateTargetResponse::VmObjectsInitialized(Err( + e.to_string() + ))) + .await; + state_publisher.update(ExternalStateUpdate::Migration( + MigrationStateUpdate { + id: migrate_ctx.migration_id, + state: MigrationState::Error, + role: MigrateRole::Source, + }, + )); + + return Err((e, None)); + } + }; + + let migration_in = MigrateAsTargetContext { + vm_objects: vm_objects.clone(), + log: log.clone(), + migration_id: migrate_ctx.migration_id, + migrate_task, + command_rx, + response_tx, + }; + + Ok(BuildVmOutput { vm_objects, migration_in: Some(migration_in) }) +} + +async fn initialize_vm_objects_from_spec( + log: &slog::Logger, + event_queue: &Arc, + properties: &InstanceProperties, + spec: &VersionedInstanceSpec, + options: &super::EnsureOptions, +) -> anyhow::Result { + info!(log, "initializing new VM"; + "spec" => #?spec, + "properties" => #?properties, + "use_reservoir" => options.use_reservoir, + "bootrom" => %options.toml_config.bootrom.display()); + + let vmm_log = log.new(slog::o!("component" => "vmm")); + + // Set up the 'shell' instance into which the rest of this routine will + // add components. 
+ let VersionedInstanceSpec::V0(v0_spec) = &spec; + let machine = build_instance( + &properties.vm_name(), + v0_spec, + options.use_reservoir, + vmm_log, + )?; + + let mut init = MachineInitializer { + log: log.clone(), + machine: &machine, + devices: Default::default(), + block_backends: Default::default(), + crucible_backends: Default::default(), + spec: v0_spec, + properties, + toml_config: &options.toml_config, + producer_registry: options.oximeter_registry.clone(), + state: MachineInitializerState::default(), + }; + + init.initialize_rom(options.toml_config.bootrom.as_path())?; + let chipset = init.initialize_chipset( + &(event_queue.clone() + as Arc), + )?; + + init.initialize_rtc(&chipset)?; + init.initialize_hpet()?; + + let com1 = Arc::new(init.initialize_uart(&chipset)?); + let ps2ctrl = init.initialize_ps2(&chipset)?; + init.initialize_qemu_debug_port()?; + init.initialize_qemu_pvpanic(properties.into())?; + init.initialize_network_devices(&chipset)?; + + #[cfg(not(feature = "omicron-build"))] + init.initialize_test_devices(&options.toml_config.devices)?; + #[cfg(feature = "omicron-build")] + info!(log, "`omicron-build` feature enabled, ignoring any test devices"); + + #[cfg(feature = "falcon")] + init.initialize_softnpu_ports(&chipset)?; + #[cfg(feature = "falcon")] + init.initialize_9pfs(&chipset)?; + + init.initialize_storage_devices(&chipset, options.nexus_client.clone()) + .await?; + + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; + init.initialize_cpus()?; + let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( + &machine, + event_queue.clone() as Arc, + log.new(slog::o!("component" => "vcpu_tasks")), + )?); + + let MachineInitializer { + devices, block_backends, crucible_backends, .. + } = init; + + Ok(InputVmObjects { + instance_spec: v0_spec.clone(), + vcpu_tasks, + machine, + lifecycle_components: devices, + block_backends, + crucible_backends, + com1, + framebuffer: Some(ramfb), + ps2ctrl, + }) +} + +impl MigrateAsTargetContext { + pub(super) async fn run( + mut self, + state_publisher: &mut StatePublisher, + ) -> Result<(), MigrateError> { + // The migration task imports device state by operating directly on the + // newly-created VM objects. Before sending them to the task, make sure the + // objects are ready to have state imported into them. Specifically, ensure + // that the VM's vCPUs are activated so they can enter the guest after + // migration and pause the kernel VM to allow it to import device state + // consistently. + // + // Drop the lock after this operation so that the migration task can acquire + // it. 
+ { + let guard = self.vm_objects.read().await; + guard.reset_vcpus(); + guard.pause_kernel_vm(); + } + + self.response_tx + .send(MigrateTargetResponse::VmObjectsInitialized(Ok(self + .vm_objects + .clone()))) + .await + .expect("migration task shouldn't exit while awaiting driver"); + + loop { + let action = next_migrate_task_event( + &mut self.migrate_task, + &mut self.command_rx, + &self.log, + ) + .await; + + match action { + MigrateTaskEvent::TaskExited(res) => match res { + Ok(()) => { + return Ok(()); + } + Err(e) => { + error!(self.log, "target migration task failed"; + "error" => %e); + + self.vm_objects.write().await.resume_kernel_vm(); + return Err(e); + } + }, + MigrateTaskEvent::Command( + MigrateTargetCommand::UpdateState(state), + ) => { + state_publisher.update(ExternalStateUpdate::Migration( + MigrationStateUpdate { + state, + id: self.migration_id, + role: MigrateRole::Destination, + }, + )); + } + MigrateTaskEvent::Command( + MigrateTargetCommand::InitializeFromExternalSpec, + ) => { + panic!("already received initialize-from-spec command"); + } + } + } + } +} diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 9a4a7b5a4..f1268f0a1 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -12,33 +12,24 @@ use std::{ use propolis_api_types::{ instance_spec::{ components::backends::CrucibleStorageBackend, v0::StorageBackendV0, - VersionedInstanceSpec, }, - InstanceMigrateInitiateResponse, InstanceProperties, - InstanceSpecEnsureRequest, InstanceState, MigrationState, + InstanceMigrateInitiateResponse, InstanceSpecEnsureRequest, InstanceState, + MigrationState, }; use slog::{error, info}; use uuid::Uuid; -use crate::{ - initializer::{ - build_instance, MachineInitializer, MachineInitializerState, - }, - migrate::MigrateRole, - vm::{ - migrate_commands::MigrateTargetCommand, objects::InputVmObjects, - state_publisher::ExternalStateUpdate, - }, -}; +use crate::{migrate::MigrateRole, vm::state_publisher::ExternalStateUpdate}; use super::{ guest_event::{self, GuestEvent}, migrate_commands::{ - MigrateSourceCommand, MigrateSourceResponse, MigrateTargetResponse, + next_migrate_task_event, MigrateSourceCommand, MigrateSourceResponse, MigrateTaskEvent, }, objects::VmObjects, request_queue::{ExternalRequest, InstanceAutoStart}, + startup::BuildVmOutput, state_publisher::{MigrationStateUpdate, StatePublisher}, VmError, }; @@ -230,29 +221,31 @@ pub(super) async fn run_state_driver( }, )); - let vm_objects = match build_vm( - &log, - &vm, - &ensure_request, - &ensure_options, - &input_queue, - &mut state_publisher, - ) - .await - { - Ok(objects) => objects, - Err((e, objects)) => { - state_publisher - .update(ExternalStateUpdate::Instance(InstanceState::Failed)); - vm.start_failed(objects.is_some()).await; - let _ = - ensure_result_tx.send(Err(VmError::InitializationFailed(e))); - return StateDriverOutput { - state_publisher, - final_state: InstanceState::Failed, - }; - } - }; + let BuildVmOutput { vm_objects, migration_in } = + match super::startup::build_vm( + &log, + &vm, + &ensure_request, + &ensure_options, + &input_queue, + &mut state_publisher, + ) + .await + { + Ok(objects) => objects, + Err((e, objects)) => { + state_publisher.update(ExternalStateUpdate::Instance( + InstanceState::Failed, + )); + vm.start_failed(objects.is_some()).await; + let _ = ensure_result_tx + .send(Err(VmError::InitializationFailed(e))); + return StateDriverOutput { + 
state_publisher, + final_state: InstanceState::Failed, + }; + } + }; let services = super::services::VmServices::new( &log, @@ -275,6 +268,19 @@ pub(super) async fn run_state_driver( .map(|id| InstanceMigrateInitiateResponse { migration_id: id }), })); + if let Some(migration_in) = migration_in { + if let Err(e) = migration_in.run(&mut state_publisher).await { + error!(log, "inbound live migration task failed"; + "error" => ?e); + + vm.start_failed(true).await; + return StateDriverOutput { + state_publisher, + final_state: InstanceState::Failed, + }; + } + } + let state_driver = StateDriver { log, objects: vm_objects, @@ -682,296 +688,3 @@ impl StateDriver { Ok(replace_result) } } - -async fn build_vm( - log: &slog::Logger, - parent: &Arc, - request: &InstanceSpecEnsureRequest, - options: &super::EnsureOptions, - input_queue: &Arc, - state_publisher: &mut StatePublisher, -) -> anyhow::Result, (anyhow::Error, Option>)> { - // If the caller didn't ask to initialize by live migration in, immediately - // create the VM objects and return them. - let Some(migrate_request) = &request.migrate else { - let input_objects = initialize_vm_objects_from_spec( - log, - input_queue, - &request.properties, - &request.instance_spec, - options, - ) - .await - .map_err(|e| (e, None))?; - - let vm_objects = Arc::new(VmObjects::new( - log.clone(), - parent.clone(), - input_objects, - )); - - return Ok(vm_objects); - }; - - // The caller has asked to initialize by live migration in. Initialize VM - // objects at the live migration task's request. - // - // Begin by contacting the source Propolis and obtaining the connection that - // the actual migration task will need. - let migrate_ctx = crate::migrate::dest_initiate( - log, - migrate_request, - options.local_server_addr, - ) - .await - .map_err(|e| (e.into(), None))?; - - // Spin up a task to run the migration protocol proper. To avoid sending the - // entire VM context over to the migration task, create command and response - // channels to allow the migration task to delegate work back to this - // routine. 
- let log_for_task = log.clone(); - let (command_tx, mut command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, response_rx) = tokio::sync::mpsc::channel(1); - let mut migrate_task = tokio::spawn(async move { - crate::migrate::destination::migrate( - &log_for_task, - command_tx, - response_rx, - migrate_ctx.conn, - migrate_ctx.local_addr, - migrate_ctx.protocol, - ) - .await - }); - - let init_command = command_rx.recv().await.ok_or_else(|| { - (anyhow::anyhow!("migration task unexpectedly closed channel"), None) - })?; - - let input_objects = 'init: { - let MigrateTargetCommand::InitializeFromExternalSpec = init_command - else { - error!(log, "migration protocol didn't init objects first"; - "command" => ?init_command); - break 'init Err(anyhow::anyhow!( - "migration protocol didn't init objects first" - )); - }; - - initialize_vm_objects_from_spec( - log, - input_queue, - &request.properties, - &request.instance_spec, - options, - ) - .await - .map_err(Into::into) - }; - - let vm_objects = match input_objects { - Ok(o) => Arc::new(VmObjects::new(log.clone(), parent.clone(), o)), - Err(e) => { - let _ = response_tx - .send(MigrateTargetResponse::VmObjectsInitialized(Err( - e.to_string() - ))) - .await; - state_publisher.update(ExternalStateUpdate::Migration( - MigrationStateUpdate { - id: migrate_ctx.migration_id, - state: MigrationState::Error, - role: MigrateRole::Source, - }, - )); - - return Err((e, None)); - } - }; - - // The migration task imports device state by operating directly on the - // newly-created VM objects. Before sending them to the task, make sure the - // objects are ready to have state imported into them. Specifically, ensure - // that the VM's vCPUs are activated so they can enter the guest after - // migration and pause the kernel VM to allow it to import device state - // consistently. - // - // Drop the lock after this operation so that the migration task can acquire - // it. 
- { - let guard = vm_objects.read().await; - guard.reset_vcpus(); - guard.pause_kernel_vm(); - } - - if response_tx - .send(MigrateTargetResponse::VmObjectsInitialized(Ok( - vm_objects.clone() - ))) - .await - .is_err() - { - vm_objects.write().await.resume_kernel_vm(); - return Err(( - anyhow::anyhow!("migration task unexpectedly closed channel"), - Some(vm_objects), - )); - } - - loop { - let action = - next_migrate_task_event(&mut migrate_task, &mut command_rx, log) - .await; - - match action { - MigrateTaskEvent::TaskExited(res) => match res { - Ok(()) => { - return Ok(vm_objects); - } - Err(e) => { - error!(log, "target migration task failed"; - "error" => %e); - - vm_objects.write().await.resume_kernel_vm(); - return Err((e.into(), Some(vm_objects))); - } - }, - MigrateTaskEvent::Command(MigrateTargetCommand::UpdateState( - state, - )) => { - state_publisher.update(ExternalStateUpdate::Migration( - MigrationStateUpdate { - state, - id: migrate_ctx.migration_id, - role: MigrateRole::Destination, - }, - )); - } - MigrateTaskEvent::Command( - MigrateTargetCommand::InitializeFromExternalSpec, - ) => { - panic!("already received initialize-from-spec command"); - } - } - } -} - -async fn initialize_vm_objects_from_spec( - log: &slog::Logger, - event_queue: &Arc, - properties: &InstanceProperties, - spec: &VersionedInstanceSpec, - options: &super::EnsureOptions, -) -> anyhow::Result { - info!(log, "initializing new VM"; - "spec" => #?spec, - "properties" => #?properties, - "use_reservoir" => options.use_reservoir, - "bootrom" => %options.toml_config.bootrom.display()); - - let vmm_log = log.new(slog::o!("component" => "vmm")); - - // Set up the 'shell' instance into which the rest of this routine will - // add components. - let VersionedInstanceSpec::V0(v0_spec) = &spec; - let machine = build_instance( - &properties.vm_name(), - v0_spec, - options.use_reservoir, - vmm_log, - )?; - - let mut init = MachineInitializer { - log: log.clone(), - machine: &machine, - devices: Default::default(), - block_backends: Default::default(), - crucible_backends: Default::default(), - spec: v0_spec, - properties, - toml_config: &options.toml_config, - producer_registry: options.oximeter_registry.clone(), - state: MachineInitializerState::default(), - }; - - init.initialize_rom(options.toml_config.bootrom.as_path())?; - let chipset = init.initialize_chipset( - &(event_queue.clone() - as Arc), - )?; - - init.initialize_rtc(&chipset)?; - init.initialize_hpet()?; - - let com1 = Arc::new(init.initialize_uart(&chipset)?); - let ps2ctrl = init.initialize_ps2(&chipset)?; - init.initialize_qemu_debug_port()?; - init.initialize_qemu_pvpanic(properties.into())?; - init.initialize_network_devices(&chipset)?; - - #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&options.toml_config.devices)?; - #[cfg(feature = "omicron-build")] - info!(log, "`omicron-build` feature enabled, ignoring any test devices"); - - #[cfg(feature = "falcon")] - init.initialize_softnpu_ports(&chipset)?; - #[cfg(feature = "falcon")] - init.initialize_9pfs(&chipset)?; - - init.initialize_storage_devices(&chipset, options.nexus_client.clone()) - .await?; - - let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; - init.initialize_cpus()?; - let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( - &machine, - event_queue.clone() as Arc, - log.new(slog::o!("component" => "vcpu_tasks")), - )?); - - let MachineInitializer { - devices, block_backends, crucible_backends, .. 
- } = init; - - Ok(InputVmObjects { - instance_spec: v0_spec.clone(), - vcpu_tasks, - machine, - lifecycle_components: devices, - block_backends, - crucible_backends, - com1, - framebuffer: Some(ramfb), - ps2ctrl, - }) -} - -async fn next_migrate_task_event( - task: &mut tokio::task::JoinHandle< - Result<(), crate::migrate::MigrateError>, - >, - command_rx: &mut tokio::sync::mpsc::Receiver, - log: &slog::Logger, -) -> MigrateTaskEvent { - if let Some(cmd) = command_rx.recv().await { - return MigrateTaskEvent::Command(cmd); - } - - // The sender side of the command channel is dropped, which means the - // migration task is exiting. Wait for it to finish and snag its result. - match task.await { - Ok(res) => { - info!(log, "Migration task exited: {:?}", res); - MigrateTaskEvent::TaskExited(res) - } - Err(join_err) => { - if join_err.is_cancelled() { - panic!("Migration task canceled"); - } else { - panic!("Migration task panicked: {:?}", join_err.into_panic()); - } - } - } -} From 2eef0b44868aeb805b7a395d474defae569c5528 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 28 Jun 2024 18:27:44 +0000 Subject: [PATCH 34/55] drive state machine properly after migration in fails --- Cargo.lock | 1 + bin/propolis-server/src/lib/vm/mod.rs | 16 ++++++++++++++++ bin/propolis-server/src/lib/vm/state_driver.rs | 2 +- phd-tests/framework/Cargo.toml | 1 + phd-tests/framework/src/test_vm/mod.rs | 18 ++++++++++++++---- 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3dd861e12..6cf1f1eee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3072,6 +3072,7 @@ dependencies = [ "flate2", "futures", "hex", + "http 0.2.12", "libc", "propolis-client", "propolis-server-config", diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index dcb75c747..afd86adad 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -115,6 +115,22 @@ enum VmState { RundownComplete(UninitVm), } +impl std::fmt::Display for VmState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + match self { + Self::NoVm => "NoVm", + Self::WaitingForInit(_) => "WaitingForInit", + Self::Active(_) => "Active", + Self::Rundown(_) => "Rundown", + Self::RundownComplete(_) => "RundownComplete", + } + ) + } +} + pub(super) struct EnsureOptions { pub toml_config: Arc, pub use_reservoir: bool, diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index f1268f0a1..382411722 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -273,7 +273,7 @@ pub(super) async fn run_state_driver( error!(log, "inbound live migration task failed"; "error" => ?e); - vm.start_failed(true).await; + vm.set_rundown().await; return StateDriverOutput { state_publisher, final_state: InstanceState::Failed, diff --git a/phd-tests/framework/Cargo.toml b/phd-tests/framework/Cargo.toml index fd7cf8307..53b63afbe 100644 --- a/phd-tests/framework/Cargo.toml +++ b/phd-tests/framework/Cargo.toml @@ -19,6 +19,7 @@ errno.workspace = true futures.workspace = true flate2.workspace = true hex.workspace = true +http.workspace = true libc.workspace = true propolis-client.workspace = true propolis-server-config.workspace = true diff --git a/phd-tests/framework/src/test_vm/mod.rs b/phd-tests/framework/src/test_vm/mod.rs index 4c415baac..c1201ba0e 100644 --- a/phd-tests/framework/src/test_vm/mod.rs +++ 
b/phd-tests/framework/src/test_vm/mod.rs @@ -976,10 +976,20 @@ async fn try_ensure_vm_destroyed(client: &Client) { .send() .await { - error!( - %error, - "error stopping VM to move it to Destroyed" - ); + // If the put fails because the instance was already run down, there's + // nothing else to do. If it fails for some other reason, there's + // nothing else that *can* be done, but the error is unusual and should + // be logged. + match error.status() { + Some(http::status::StatusCode::FAILED_DEPENDENCY) => {} + _ => { + error!( + %error, + "error stopping VM to move it to Destroyed" + ); + } + } + return; } From 3a7b4845797694f3883bc1ab77c454082a62dbc3 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 28 Jun 2024 21:14:00 +0000 Subject: [PATCH 35/55] all the doc comments --- bin/propolis-server/src/lib/server.rs | 12 +- bin/propolis-server/src/lib/vcpu_tasks.rs | 4 +- bin/propolis-server/src/lib/vm/active.rs | 36 ++- bin/propolis-server/src/lib/vm/guest_event.rs | 6 +- .../src/lib/vm/migrate_commands.rs | 6 + bin/propolis-server/src/lib/vm/mod.rs | 233 +++++++++++++++--- bin/propolis-server/src/lib/vm/objects.rs | 105 +++++++- .../src/lib/vm/request_queue.rs | 10 +- bin/propolis-server/src/lib/vm/services.rs | 18 ++ bin/propolis-server/src/lib/vm/startup.rs | 62 ++++- .../src/lib/vm/state_driver.rs | 110 +++++++-- .../src/lib/vm/state_publisher.rs | 17 ++ 12 files changed, 535 insertions(+), 84 deletions(-) diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs index cfadbb8a3..d00cddcc3 100644 --- a/bin/propolis-server/src/lib/server.rs +++ b/bin/propolis-server/src/lib/server.rs @@ -6,8 +6,8 @@ //! //! Functions in this module verify parameters and convert between types (API //! request types to Propolis-native types and Propolis-native error types to -//! HTTP error codes) before sending operations to other components (e.g. the VM -//! controller) for processing. +//! HTTP error codes) before sending operations to the VM state machine for +//! processing. use std::convert::TryFrom; use std::net::Ipv6Addr; @@ -524,10 +524,10 @@ async fn instance_serial( .map_err(|e| format!("Serial socket hand-off failed: {}", e).into()) } -// This endpoint is meant to only be called during a migration from the destination -// instance to the source instance as part of the HTTP connection upgrade used to -// establish the migration link. We don't actually want this exported via OpenAPI -// clients. +// This endpoint is meant to only be called during a migration from the +// destination instance to the source instance as part of the HTTP connection +// upgrade used to establish the migration link. We don't actually want this +// exported via OpenAPI clients. 
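The `try_ensure_vm_destroyed` hunk in the previous patch reduces to a small triage rule: one status code is expected and stays silent, everything else gets logged. A sketch of that rule in isolation (the helper name and `bool` return are illustrative, not part of the patch; only the `http` status constant comes from the change itself):

```rust
/// Returns true if a failed stop request is unusual enough to log.
fn stop_error_is_loggable(status: Option<http::StatusCode>) -> bool {
    match status {
        // 424 Failed Dependency means the instance was already run down; a
        // failed stop is expected here and there's nothing else to do.
        Some(http::StatusCode::FAILED_DEPENDENCY) => false,
        // Any other failure can't be acted on either, but it's unusual and
        // worth recording.
        _ => true,
    }
}
```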
#[channel { protocol = WEBSOCKETS, path = "/instance/migrate/{migration_id}/start", diff --git a/bin/propolis-server/src/lib/vcpu_tasks.rs b/bin/propolis-server/src/lib/vcpu_tasks.rs index ba79dedce..be2d37f35 100644 --- a/bin/propolis-server/src/lib/vcpu_tasks.rs +++ b/bin/propolis-server/src/lib/vcpu_tasks.rs @@ -40,7 +40,7 @@ pub(crate) trait VcpuTaskController: Send + Sync + 'static { impl VcpuTasks { pub(crate) fn new( machine: &propolis::Machine, - event_handler: Arc, + event_handler: Arc, log: slog::Logger, ) -> Result { let generation = Arc::new(AtomicUsize::new(0)); @@ -72,7 +72,7 @@ impl VcpuTasks { fn vcpu_loop( vcpu: &Vcpu, task: propolis::tasks::TaskHdl, - event_handler: Arc, + event_handler: Arc, generation: Arc, log: slog::Logger, ) { diff --git a/bin/propolis-server/src/lib/vm/active.rs b/bin/propolis-server/src/lib/vm/active.rs index 2d9ded45c..fd0f2dcff 100644 --- a/bin/propolis-server/src/lib/vm/active.rs +++ b/bin/propolis-server/src/lib/vm/active.rs @@ -2,8 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! The `ActiveVm` wrapper owns all of the components and services that make up -//! a running Propolis instance. +//! Implements a wrapper around an active VM. use std::sync::Arc; @@ -20,19 +19,35 @@ use super::{ /// The components and services that make up an active Propolis VM. pub(crate) struct ActiveVm { + /// The VM's logger. pub(super) log: slog::Logger, + + /// The input queue that receives external requests to change the VM's + /// state. pub(super) state_driver_queue: Arc, + + /// Receives external state updates from the state driver. pub(super) external_state_rx: InstanceStateRx, + + /// The wrapped VM's properties. pub(super) properties: InstanceProperties, + + /// A reference to the wrapped VM's components. Callers with a reference to + /// an `ActiveVm` can clone this to get a handle to those components. pub(super) objects: Arc, + + /// Services that interact with VM users or the control plane outside the + /// Propolis API (e.g. the serial console, VNC, and metrics reporting). pub(super) services: VmServices, } impl ActiveVm { + /// Yields a clonable reference to the active VM's components. pub(crate) fn objects(&self) -> &Arc { &self.objects } + /// Pushes a state change request to the VM's state change queue. pub(crate) fn put_state( &self, requested: InstanceStateRequested, @@ -49,6 +64,9 @@ impl ActiveVm { .map_err(Into::into) } + /// Pushes a request to migrate out of a VM to the VM's state change queue. + /// The migration protocol will communicate with the destination over the + /// provided websocket. pub(crate) async fn request_migration_out( &self, migration_id: Uuid, @@ -62,6 +80,19 @@ impl ActiveVm { )?) } + /// Pushes a request to reconfigure a Crucible volume to the VM's state + /// change queue. + /// + /// # Arguments + /// + /// - `disk_name`: The name of the Crucible disk component (in the instance + /// spec) to modify. + /// - `backend_id`: The UUID to use to find the Crucible backend in the + /// VM's Crucible backend map. + /// - `new_vcr_json`: The new volume construction request to supply to the + /// selected backend. + /// - `result_tx`: The channel to which the state driver should send the + /// replacement result after it completes this operation. pub(crate) fn reconfigure_crucible_volume( &self, disk_name: String, @@ -81,6 +112,7 @@ impl ActiveVm { .map_err(Into::into) } + /// Yields a reference to this VM's services. 
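Taken together, these `ActiveVm` methods are meant to be driven from the API layer roughly as follows. A hypothetical handler sketch (`handle_put_state` is not part of the patch; it assumes the patch's `Vm`, `ActiveVm`, and `InstanceStateRequested` types, plus `anyhow` for brevity):

```rust
use std::sync::Arc;

async fn handle_put_state(
    vm: &Arc<Vm>,
    requested: InstanceStateRequested,
) -> anyhow::Result<()> {
    // `active_vm` yields `Some` only while the state machine is `Active`.
    let Some(active) = vm.active_vm().await else {
        anyhow::bail!("instance is not active");
    };

    // `put_state` only queues the request; the actual transition is applied
    // later by the state driver and observed via the state watch channel.
    active.put_state(requested)?;
    Ok(())
}
```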
pub(crate) fn services(&self) -> &VmServices { &self.services } diff --git a/bin/propolis-server/src/lib/vm/guest_event.rs b/bin/propolis-server/src/lib/vm/guest_event.rs index 05e0f4998..b95d9c14b 100644 --- a/bin/propolis-server/src/lib/vm/guest_event.rs +++ b/bin/propolis-server/src/lib/vm/guest_event.rs @@ -2,6 +2,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +//! Types and traits for handling guest-emitted events on the VM state driver. + use std::{collections::VecDeque, time::Duration}; /// An event raised by some component in the instance (e.g. a vCPU or the @@ -29,7 +31,8 @@ pub(super) struct GuestEventQueue { queue: VecDeque, } -pub(crate) trait GuestEventHandler: Send + Sync { +/// A sink for events raised by a VM's vCPU tasks. +pub(crate) trait VcpuEventHandler: Send + Sync { fn suspend_halt_event(&self, when: Duration); fn suspend_reset_event(&self, when: Duration); fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration); @@ -41,6 +44,7 @@ pub(crate) trait GuestEventHandler: Send + Sync { fn io_error_event(&self, vcpu_id: i32, error: std::io::Error); } +/// A sink for events raised by a VM's chipset. pub(crate) trait ChipsetEventHandler: Send + Sync { fn chipset_halt(&self); fn chipset_reset(&self); diff --git a/bin/propolis-server/src/lib/vm/migrate_commands.rs b/bin/propolis-server/src/lib/vm/migrate_commands.rs index 787a970c7..403432331 100644 --- a/bin/propolis-server/src/lib/vm/migrate_commands.rs +++ b/bin/propolis-server/src/lib/vm/migrate_commands.rs @@ -21,8 +21,12 @@ pub enum MigrateTargetCommand { UpdateState(propolis_api_types::MigrationState), } +/// A response sent from a migration target's state driver to its migration +/// task. #[derive(Clone)] pub enum MigrateTargetResponse { + /// An attempt to initialize VM objects produced the supplied objects or + /// failed for the supplied reason. VmObjectsInitialized(Result, String>), } @@ -68,6 +72,8 @@ pub(super) enum MigrateTaskEvent { Command(T), } +/// Given a migration executing in `task` that sends commands on `command_rx`, +/// gets the next event for the task's state driver to process. pub(super) async fn next_migrate_task_event( task: &mut tokio::task::JoinHandle< Result<(), crate::migrate::MigrateError>, diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index afd86adad..0413558f6 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -2,8 +2,81 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! This module implements the `Vm` wrapper type that encapsulates a single -//! instance on behalf of a Propolis server. +//! Implements the [`Vm`] type, which encapsulates a single Propolis virtual +//! machine instance and provides a public interface thereto to the Propolis +//! Dropshot server. +//! +//! The VM state machine looks like this: +//! +//! ```text +//! [NoVm] +//! | +//! | +//! v +//! +---- WaitingForInit <----+ +//! | | | +//! | | | +//! | v | +//! | Active | +//! | | | +//! | | | +//! | v | +//! +-------> Rundown | +//! | | | +//! | | | +//! | v | +//! +---> RundownComplete ----+ +//! ``` +//! +//! In the happy case where new VMs always start successfully, this state +//! machine transitions as follows: +//! +//! - New state machines start in the `NoVm` state. +//! - A request to create a new VM moves to the `WaitingForInit` state. 
+//! - Once all of the VM's components are created, the VM moves to `Active`. +//! - When the VM stops, the VM moves to `Rundown`. +//! - When all references to the VM's components are dropped, the VM moves to +//! `RundownComplete`. A request to create a new VM will move back to +//! `WaitingForInit`. +//! +//! In any state except `NoVm`, the state machine holds enough state to describe +//! the most recent VM known to the state machine, whether it is being created +//! (`WaitingForInit`), running (`Active`), or being torn down (`Rundown` and +//! `RundownComplete`). +//! +//! In the `Active` state, the VM wrapper holds an [`active::ActiveVm`] and +//! allows API-layer callers to obtain references to it. These callers use these +//! references to ask to change a VM's state or change its configuration. An +//! active VM holds a reference to a [`objects::VmObjects`] structure that +//! bundles up all of the Propolis components (kernel VM, devices, and backends) +//! that make up an instance and a spec that describes that instance; API-layer +//! callers may use this structure to read the instance's properties and query +//! component state, but cannot mutate the VM's structure this way. +//! +//! Requests to change a VM's state or configuration (and events from a running +//! guest that might change a VM's state, like an in-guest shutdown or reboot +//! request or a triple fault) are placed in an [input +//! queue](state_driver::InputQueue) that is serviced by a single "state driver" +//! task. When an instance stops, this task moves the state machine to the +//! `Rundown` state, which renders new API-layer callers unable to clone new +//! references to the VM's `VmObjects`. When all outstanding references to the +//! objects are dropped, the VM moves to the `RundownComplete` state, obtains +//! the final instance state from the (joined) state driver task, and publishes +//! that state. At that point the VM may be reinitialized. +//! +//! The VM state machine delegates VM creation to the state driver task. This +//! task can fail to initialize a VM in two ways: +//! +//! 1. It may fail to create all of the VM's component objects (e.g. due to +//! bad configuration or resource exhaustion). +//! 2. It may successfully create all of the VM's component objects, but then +//! fail to populate their initial state via live migration from another +//! instance. +//! +//! In the former case, where no VM objects are ever created, the state driver +//! moves the state machine directly from `WaitingForInit` to `RundownComplete`. +//! In the latter case, the driver moves to `Rundown` and allows `VmObjects` +//! teardown to drive the state machine to `RundownComplete`. use std::{collections::BTreeMap, net::SocketAddr, sync::Arc}; @@ -30,25 +103,42 @@ mod startup; mod state_driver; mod state_publisher; +/// Maps component names to lifecycle trait objects that allow +/// components to be started, paused, resumed, and halted. pub(crate) type LifecycleMap = BTreeMap>; + +/// Maps component names to block backend trait objects. pub(crate) type BlockBackendMap = BTreeMap>; + +/// Maps component names to Crucible backend objects. pub(crate) type CrucibleBackendMap = BTreeMap>; +/// Type alias for the sender side of the channel that receives +/// externally-visible instance state updates. type InstanceStateTx = tokio::sync::watch::Sender< propolis_api_types::InstanceStateMonitorResponse, >; + +/// Type alias for the receiver side of the channel that receives +/// externally-visible instance state updates. 
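The transition rules spelled out in the module comment above condense into a small table. A stand-alone sketch (the payload-free `S` enum is a simplification of the patch's `VmState`, kept separate so it stays copy-paste runnable):

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
enum S {
    NoVm,
    WaitingForInit,
    Active,
    Rundown,
    RundownComplete,
}

/// Returns true for exactly the edges drawn in the module comment's diagram.
fn is_legal_transition(from: S, to: S) -> bool {
    use S::*;
    matches!(
        (from, to),
        // An instance-ensure request begins initialization.
        (NoVm, WaitingForInit) | (RundownComplete, WaitingForInit)
            // Objects were created and any migration in succeeded.
            | (WaitingForInit, Active)
            // Objects were created, but startup or migration in failed.
            | (WaitingForInit, Rundown)
            // Object creation itself failed; there is nothing to run down.
            | (WaitingForInit, RundownComplete)
            // A running VM stopped.
            | (Active, Rundown)
            // The last reference to the VM's objects was dropped.
            | (Rundown, RundownComplete)
    )
}
```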
 type InstanceStateRx = tokio::sync::watch::Receiver<
     propolis_api_types::InstanceStateMonitorResponse,
 >;
 
+/// Type alias for the results sent by the state driver in response to a request
+/// to change a Crucible backend's configuration.
 pub(crate) type CrucibleReplaceResult =
     Result<crucible_client_types::ReplaceResult, dropshot::HttpError>;
+
+/// Type alias for the sender side of a channel that receives Crucible backend
+/// reconfiguration results.
 pub(crate) type CrucibleReplaceResultTx =
     tokio::sync::oneshot::Sender<CrucibleReplaceResult>;
 
+/// Errors generated by the VM controller and its subcomponents.
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum VmError {
     #[error("VM operation result channel unexpectedly closed")]
@@ -73,46 +163,60 @@ pub(crate) enum VmError {
     ForbiddenStateChange(#[from] request_queue::RequestDeniedReason),
 }
 
-/// The top-level VM wrapper type. Callers are expected to wrap this in an
-/// `Arc`.
+/// The top-level VM wrapper type.
 pub(crate) struct Vm {
+    /// Lock wrapper for the VM state machine's contents.
+    ///
+    /// Routines that need to read VM properties or obtain a `VmObjects` handle
+    /// acquire this lock shared.
+    ///
+    /// Routines that drive the VM state machine acquire this lock exclusive.
     inner: tokio::sync::RwLock<VmInner>,
+
+    /// A logger for this VM.
     log: slog::Logger,
 }
 
+/// Holds a VM state machine and state driver task handle.
 struct VmInner {
+    /// The VM's current state.
     state: VmState,
+
+    /// A handle to the VM's current state driver task, if it has one.
    driver: Option<tokio::task::JoinHandle<StateDriverOutput>>,
 }
 
-struct UninitVm {
+/// Describes a past or future VM and its properties.
+struct VmDescription {
+    /// Records the VM's last externally-visible state.
     external_state_rx: InstanceStateRx,
+
+    /// The VM's API-level instance properties.
     properties: InstanceProperties,
+
+    /// The VM's last-known instance specification.
     spec: InstanceSpecV0,
 }
 
-/// An enum representing the VM state machine. The API layer's Dropshot context
-/// holds a reference to this state machine via the [`Vm`] wrapper struct.
-///
-/// When an instance is running, its components and services are stored in an
-/// [`ActiveVm`] whose lifecycle is managed by a "state driver" task. The VM is
-/// kept alive by this task's strong reference. API calls that need to access
-/// the active VM try to upgrade the state machine's weak reference to the VM.
-///
-/// When an active VM halts, the state driver moves the state machine to the
-/// `Rundown` state, preventing new API calls from obtaining new strong
-/// references to the underlying VM while allowing existing calls to finish.
-/// Eventually (barring a leak), the active VM will be dropped. This launches a
-/// task that finishes cleaning up the VM and then moves to the
-/// `RundownComplete` state, which allows a new VM to start.
+/// The states in the VM state machine. See the module comment for more details.
 #[allow(clippy::large_enum_variant)]
 enum VmState {
     /// This state machine has never held a VM.
     NoVm,
-    WaitingForInit(UninitVm),
+
+    /// A new state driver is attempting to initialize objects for a VM with the
+    /// enclosed description.
+    WaitingForInit(VmDescription),
+
+    /// The VM is active, and callers can obtain a handle to its objects.
     Active(active::ActiveVm),
-    Rundown(UninitVm),
-    RundownComplete(UninitVm),
+
+    /// The previous VM is shutting down, but its objects have not been fully
+    /// destroyed yet.
    Rundown(VmDescription),
+
+    /// The previous VM and its objects have been cleaned up.
+ RundownComplete(VmDescription), } impl std::fmt::Display for VmState { @@ -131,23 +235,45 @@ impl std::fmt::Display for VmState { } } +/// Parameters to an instance ensure operation. pub(super) struct EnsureOptions { + /// A reference to the VM configuration specified in the config TOML passed + /// to this propolis-server process. pub toml_config: Arc, + + /// True if VMs should allocate memory from the kernel VMM reservoir. pub use_reservoir: bool, + + /// Configuration used to serve Oximeter metrics from this server. pub metrics_config: Option, + + /// An Oximeter producer registry to pass to components that will emit + /// Oximeter metrics. pub oximeter_registry: Option, + + /// A Nexus client handle to pass to components that can make upcalls to + /// Nexus. pub nexus_client: Option, + + /// A reference to the process's VNC server, used to connect the server to + /// a new VM's framebuffer. pub vnc_server: Arc>, + + /// The address of this Propolis process, used by the live migration + /// protocol to transfer serial console connections. pub local_server_addr: SocketAddr, } impl Vm { + /// Creates a new VM. pub fn new(log: &slog::Logger) -> Arc { let log = log.new(slog::o!("component" => "vm_wrapper")); let inner = VmInner { state: VmState::NoVm, driver: None }; Arc::new(Self { inner: tokio::sync::RwLock::new(inner), log }) } + /// If the VM is `Active`, yields a shared lock guard with a reference to + /// the relevant `ActiveVm`. Returns `None` if there is no active VM. pub(super) async fn active_vm( &self, ) -> Option> { @@ -164,6 +290,8 @@ impl Vm { .ok() } + /// Returns the state, properties, and instance spec for the instance most + /// recently wrapped by this `Vm`. pub(super) async fn get( &self, ) -> Result { @@ -193,6 +321,8 @@ impl Vm { }) } + /// Yields a handle to the most recent instance state receiver wrapped by + /// this `Vm`. pub(super) async fn state_watcher( &self, ) -> Result { @@ -206,6 +336,15 @@ impl Vm { } } + /// Moves this VM from the `WaitingForInit` state to the `Active` state, + /// creating an `ActiveVm` with the supplied input queue, VM objects, and VM + /// services. + /// + /// This routine should only be called by the state driver. + /// + /// # Panics + /// + /// Panics if the VM is not in the `WaitingForInit` state. async fn make_active( self: &Arc, log: &slog::Logger, @@ -233,7 +372,21 @@ impl Vm { } } - async fn start_failed(&self, wait_for_objects: bool) { + /// Moves this VM from the `WaitingForInit` state to a rundown state in + /// response to an instance initialization failure. + /// + /// This routine should only be called by the state driver. + /// + /// # Arguments + /// + /// - `wait_for_objects`: True if the caller successfully created VM + /// objects that need to be destroyed before rundown can be completed. + /// False if the caller did not create any VM objects. + /// + /// # Panics + /// + /// Panics if the VM is not in the `WaitingForInit` state. + async fn vm_init_failed(&self, wait_for_objects: bool) { let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { @@ -251,6 +404,13 @@ impl Vm { } } + /// Moves this VM from the `Active` state to the `Rundown` state. + /// + /// This routine should only be called by the state driver. + /// + /// # Panics + /// + /// Panics if the VM is not in the `Active` state. 
async fn set_rundown(&self) { info!(self.log, "setting VM rundown"); let services = { @@ -263,7 +423,7 @@ impl Vm { let spec = vm.objects().read().await.instance_spec().clone(); let ActiveVm { external_state_rx, properties, .. } = vm; - guard.state = VmState::Rundown(UninitVm { + guard.state = VmState::Rundown(VmDescription { external_state_rx, properties, spec, @@ -274,14 +434,19 @@ impl Vm { services.stop(&self.log).await; } + /// Moves this VM from the `Rundown` state to the `RundownComplete` state. + /// + /// This routine should only be called when dropping VM objects. + /// + /// # Panics + /// + /// Panics if the VM is not in the `Rundown` state. async fn complete_rundown(&self) { info!(self.log, "completing VM rundown"); let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { - VmState::WaitingForInit(vm) | VmState::Rundown(vm) => { - guard.state = VmState::RundownComplete(vm) - } + VmState::Rundown(vm) => guard.state = VmState::RundownComplete(vm), _ => unreachable!("VM rundown completed from invalid prior state"), } @@ -297,6 +462,8 @@ impl Vm { )); } + /// Attempts to move this VM to the `Active` state by setting up a state + /// driver task and directing it to initialize a new VM. pub(crate) async fn ensure( self: &Arc, log: &slog::Logger, @@ -306,7 +473,16 @@ impl Vm { let log_for_driver = log.new(slog::o!("component" => "vm_state_driver")); + // This routine will create a state driver task that actually + // initializes the VM. The external instance-ensure API shouldn't return + // until that task has disposed of the initialization request. Create a + // channel to allow the state driver task to send back an ensure result + // at the appropriate moment. let (ensure_reply_tx, ensure_rx) = tokio::sync::oneshot::channel(); + + // The external state receiver needs to exist as soon as this routine + // returns, so create the appropriate channel here. The sender side of + // the channel will move to the state driver task. let (external_publisher, external_rx) = StatePublisher::new( &log_for_driver, propolis_api_types::InstanceStateMonitorResponse { @@ -344,7 +520,7 @@ impl Vm { let VersionedInstanceSpec::V0(v0_spec) = ensure_request.instance_spec.clone(); - guard.state = VmState::WaitingForInit(UninitVm { + guard.state = VmState::WaitingForInit(VmDescription { external_state_rx: external_rx.clone(), properties: ensure_request.properties.clone(), spec: v0_spec, @@ -364,6 +540,7 @@ impl Vm { })); } + // Wait for the state driver task to dispose of this request. ensure_rx.await.map_err(|_| VmError::ResultChannelClosed)? } } diff --git a/bin/propolis-server/src/lib/vm/objects.rs b/bin/propolis-server/src/lib/vm/objects.rs index 141b4ae7a..a597ff1d9 100644 --- a/bin/propolis-server/src/lib/vm/objects.rs +++ b/bin/propolis-server/src/lib/vm/objects.rs @@ -2,8 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Provides a type that collects all of the components that make up a Propolis -//! VM. +//! A collection of all of the components that make up a Propolis VM instance. use std::{ pin::Pin, @@ -27,12 +26,25 @@ use super::{ LifecycleMap, }; +/// A collection of components that make up a Propolis VM instance. pub(crate) struct VmObjects { + /// The objects' associated logger. log: slog::Logger, + + /// A reference to the VM state machine that created these objects. Used to + /// complete rundown when the objects are dropped. 
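The `ensure` path above wires up two channels with different lifetimes: a oneshot that carries the one-time ensure result back to the API call, and a watch channel that keeps carrying state updates afterward. A reduced, self-contained model of that handshake (`String` stands in for the real request and state types):

```rust
use tokio::sync::{oneshot, watch};

async fn ensure_handshake_sketch() {
    let (ensure_reply_tx, ensure_rx) =
        oneshot::channel::<Result<(), String>>();
    let (state_tx, state_rx) = watch::channel("Creating".to_string());

    // State-driver half: publish progress, then report the ensure outcome.
    tokio::spawn(async move {
        state_tx.send("Starting".to_string()).expect("receiver is alive");
        let _ = ensure_reply_tx.send(Ok(()));
    });

    // API half: block the ensure call on the driver's one-time reply...
    ensure_rx.await.expect("driver sent a reply").expect("ensure succeeded");

    // ...while the watch receiver outlives the call for later monitoring.
    println!("latest state: {}", *state_rx.borrow());
}
```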
parent: Arc, + + /// Synchronizes access to the VM's objects. + /// + /// API-layer callers that want to enumerate a VM's devices or read its spec + /// acquire this lock shared. The state driver acquires this lock exclusive + /// to mutate the VM. inner: RwLock, } +/// A collection of objects that should eventually be wrapped in a lock and +/// stored in a `VmObjects` structure. See [`VmObjectsLocked`]. pub(super) struct InputVmObjects { pub instance_spec: InstanceSpecV0, pub vcpu_tasks: Box, @@ -45,20 +57,43 @@ pub(super) struct InputVmObjects { pub ps2ctrl: Arc, } +/// The collection of objects and state that make up a Propolis instance. pub(crate) struct VmObjectsLocked { + /// The objects' associated logger. log: slog::Logger, + + /// The instance spec that describes this collection of objects. instance_spec: InstanceSpecV0, + + /// The set of tasks that run this VM's vCPUs. vcpu_tasks: Box, + + /// The Propolis kernel VMM for this instance. machine: Machine, + + /// Maps from component names to the trait objects that implement lifecycle + /// operations (e.g. pause and resume) for eligible components. lifecycle_components: LifecycleMap, + + /// Maps from component names to trait objects that implement the block + /// storage backend trait. block_backends: BlockBackendMap, + + /// Maps from component names to Crucible backend objects. crucible_backends: CrucibleBackendMap, + + /// A handle to the serial console connection to the VM's first COM port. com1: Arc>, + + /// A handle to the VM's framebuffer. framebuffer: Option>, + + /// A handle to the VM's PS/2 controller. ps2ctrl: Arc, } impl VmObjects { + /// Creates a new VM object container. pub(super) fn new( log: slog::Logger, parent: Arc, @@ -68,20 +103,32 @@ impl VmObjects { Self { log, parent, inner: tokio::sync::RwLock::new(inner) } } + /// Yields the logger associated with these objects. pub(crate) fn log(&self) -> &slog::Logger { &self.log } + /// Yields a shared lock guard referring to the underlying object + /// collection. + /// + /// This function is crate-visible to allow the API layer to read (but not + /// mutate) VM objects. pub(crate) async fn read(&self) -> RwLockReadGuard { self.inner.read().await } + /// Yields an exclusive lock guard referring to the underlying object + /// collection. + /// + /// This function is only visible within the `vm` module so that only the + /// state driver can obtain a mutable reference to the underlying objects. pub(super) async fn write(&self) -> RwLockWriteGuard { self.inner.write().await } } impl VmObjectsLocked { + /// Associates a collection of VM objects with a logger. fn new(log: &slog::Logger, input: InputVmObjects) -> Self { Self { log: log.clone(), @@ -97,18 +144,23 @@ impl VmObjectsLocked { } } + /// Yields the VM's current instance spec. pub(crate) fn instance_spec(&self) -> &InstanceSpecV0 { &self.instance_spec } + /// Yields a mutable reference to the VM's current instance spec. pub(crate) fn instance_spec_mut(&mut self) -> &mut InstanceSpecV0 { &mut self.instance_spec } + /// Yields the VM's current kernel VMM handle. pub(crate) fn machine(&self) -> &Machine { &self.machine } + /// Obtains a handle to the lifecycle trait object for the component with + /// the supplied `name`. pub(crate) fn device_by_name( &self, name: &str, @@ -116,22 +168,29 @@ impl VmObjectsLocked { self.lifecycle_components.get(name).cloned() } + /// Yields the VM's current Crucible backend map. 
pub(crate) fn crucible_backends(&self) -> &CrucibleBackendMap { &self.crucible_backends } + /// Yields a clonable reference to the serial console for this VM's first + /// COM port. pub(crate) fn com1(&self) -> &Arc> { &self.com1 } + /// Yields a clonable reference to this VM's framebuffer. pub(crate) fn framebuffer(&self) -> &Option> { &self.framebuffer } + /// Yields a clonable reference to this VM's PS/2 controller. pub(crate) fn ps2ctrl(&self) -> &Arc { &self.ps2ctrl } + /// Iterates over all of the lifecycle trait objects in this VM and calls + /// `func` on each one. pub(crate) fn for_each_device( &self, mut func: impl FnMut(&str, &Arc), @@ -141,6 +200,9 @@ impl VmObjectsLocked { } } + /// Iterates over all of the lifecycle objects in this VM and calls `func` + /// on each one. If any invocation of `func` fails, this routine returns + /// immediately and yields the relevant error. pub(crate) fn for_each_device_fallible( &self, mut func: impl FnMut( @@ -155,7 +217,7 @@ impl VmObjectsLocked { Ok(()) } - /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated + /// Pauses the VM at the kernel VMM level, ensuring that in-kernel-emulated /// devices and vCPUs are brought to a consistent state. /// /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl) @@ -166,11 +228,13 @@ impl VmObjectsLocked { self.machine.hdl.pause().expect("VM_PAUSE should succeed"); } + /// Resumes the VM at the kernel VMM level. pub(super) fn resume_kernel_vm(&self) { info!(self.log, "resuming kernel VMM resources"); self.machine.hdl.resume().expect("VM_RESUME should succeed"); } + /// Reinitializes the VM by resetting all of its devices and its kernel VMM. pub(super) fn reset_devices_and_machine(&self) { self.for_each_device(|name, dev| { info!(self.log, "sending reset request to {}", name); @@ -180,6 +244,12 @@ impl VmObjectsLocked { self.machine.reinitialize().unwrap(); } + /// Starts a VM's devices and allows all of its vCPU tasks to run. + /// + /// This function may be called either after initializing a new VM from + /// scratch or after an inbound live migration. In the latter case, this + /// routine assumes that the caller initialized and activated the VM's vCPUs + /// prior to importing state from the migration source. pub(super) async fn start( &mut self, reason: VmStartReason, @@ -201,28 +271,34 @@ impl VmObjectsLocked { result } + /// Pauses this VM's devices and its kernel VMM. pub(super) async fn pause(&mut self) { self.vcpu_tasks.pause_all(); self.pause_devices().await; self.pause_kernel_vm(); } + /// Resumes this VM's devices and its kernel VMM. pub(super) fn resume(&mut self) { self.resume_kernel_vm(); self.resume_devices(); self.vcpu_tasks.resume_all(); } + /// Stops the VM's vCPU tasks and devices. pub(super) async fn halt(&mut self) { self.vcpu_tasks.exit_all(); self.halt_devices().await; } + /// Resets the VM's kernel vCPU state. pub(super) fn reset_vcpus(&self) { self.vcpu_tasks.new_generation(); self.reset_vcpu_state(); } + /// Hard-resets a VM by pausing, resetting, and resuming all its devices and + /// vCPUs. pub(super) async fn reboot(&mut self) { // Reboot is implemented as a pause -> reset -> resume transition. // @@ -242,7 +318,9 @@ impl VmObjectsLocked { self.vcpu_tasks.resume_all(); } - pub(super) async fn start_devices(&self) -> anyhow::Result<()> { + /// Starts all of a VM's devices and allows its block backends to process + /// requests from their devices. 
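An illustrative caller of the enumeration helper documented above, visiting every lifecycle component by name. This is a sketch against the patch's `VmObjectsLocked` and `for_each_device`; the logging helper itself is hypothetical:

```rust
fn log_component_names(objects: &VmObjectsLocked, log: &slog::Logger) {
    // `for_each_device` hands the closure each component's name and its
    // lifecycle trait object; this caller only uses the name.
    objects.for_each_device(|name, _dev| {
        slog::info!(log, "component present"; "name" => name);
    });
}
```

The fallible variant, `for_each_device_fallible`, follows the same shape but lets the closure return an error, which stops the iteration at the first failing component.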
+ async fn start_devices(&self) -> anyhow::Result<()> { self.for_each_device_fallible(|name, dev| { info!(self.log, "sending startup complete to {}", name); let res = dev.start(); @@ -264,7 +342,8 @@ impl VmObjectsLocked { Ok(()) } - pub(super) async fn pause_devices(&self) { + /// Pauses all of a VM's devices. + async fn pause_devices(&self) { self.for_each_device(|name, dev| { info!(self.log, "sending pause request to {}", name); dev.pause(); @@ -314,14 +393,17 @@ impl VmObjectsLocked { } } - pub(super) fn resume_devices(&self) { + /// Resumes all of a VM's devices. + fn resume_devices(&self) { self.for_each_device(|name, dev| { info!(self.log, "sending resume request to {}", name); dev.resume(); }) } - pub(super) async fn halt_devices(&self) { + /// Stops all of a VM's devices and detaches its block backends from their + /// devices. + async fn halt_devices(&self) { self.for_each_device(|name, dev| { info!(self.log, "sending halt request to {}", name); dev.halt(); @@ -338,7 +420,8 @@ impl VmObjectsLocked { } } - pub(super) fn reset_vcpu_state(&self) { + /// Resets a VM's kernel vCPU objects to their initial states. + fn reset_vcpu_state(&self) { for vcpu in self.machine.vcpus.iter() { info!(self.log, "resetting vCPU {}", vcpu.id); vcpu.activate().unwrap(); @@ -361,9 +444,9 @@ impl Drop for VmObjects { // Signal to these objects' owning VM that rundown has completed and a // new VM can be created. // - // It is always safe to complete rundown at this point because an - // `ActiveVm` always holds a reference to its `VmObjects`, and the - // parent VM doesn't drop its `ActiveVm` until rundown begins. + // It is always safe to complete rundown at this point because the state + // driver ensures that if it creates VM objects, then it will not drop + // them without first moving the VM to the Rundown state. let parent = self.parent.clone(); tokio::spawn(async move { parent.complete_rundown().await; diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs index 33a376bd2..3c3e2d184 100644 --- a/bin/propolis-server/src/lib/vm/request_queue.rs +++ b/bin/propolis-server/src/lib/vm/request_queue.rs @@ -55,13 +55,12 @@ impl WebsocketConnection { /// An external request made of a VM controller via the server API. Handled by /// the controller's state driver thread. pub enum ExternalRequest { + /// Asks the state worker to start a brand-new VM (i.e. not one initialized + /// by live migration, which implicitly starts the VM). Start, /// Asks the state worker to start a migration-source task. - MigrateAsSource { - migration_id: Uuid, - websock: WebsocketConnection, - }, + MigrateAsSource { migration_id: Uuid, websock: WebsocketConnection }, /// Resets the guest by pausing all devices, resetting them to their /// cold-boot states, and resuming the devices. Note that this is not a @@ -180,6 +179,7 @@ struct AllowedRequests { stop: RequestDisposition, } +/// A queue for external requests to change an instance's state. #[derive(Debug)] pub struct ExternalRequestQueue { queue: VecDeque, @@ -187,6 +187,8 @@ pub struct ExternalRequestQueue { log: Logger, } +/// Indicates whether this queue's creator will start the relevant instance +/// without waiting for a Start request from the queue. 
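The `Drop` impl for `VmObjects` shown above relies on an idiom worth isolating: `drop` cannot await, so it clones the handle it needs and lets a spawned task perform the async half of the cleanup. A reduced sketch with stand-in types:

```rust
use std::sync::Arc;

// Stand-in for the parent `Vm`; only the rundown hook matters here.
struct Parent;
impl Parent {
    async fn complete_rundown(&self) {
        // move the state machine from Rundown to RundownComplete
    }
}

struct Objects {
    parent: Arc<Parent>,
}

impl Drop for Objects {
    fn drop(&mut self) {
        let parent = self.parent.clone();
        // Requires that drop runs on a thread with an active tokio runtime,
        // which holds here because the objects live inside server tasks.
        tokio::spawn(async move {
            parent.complete_rundown().await;
        });
    }
}
```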
pub enum InstanceAutoStart { Yes, No, diff --git a/bin/propolis-server/src/lib/vm/services.rs b/bin/propolis-server/src/lib/vm/services.rs index 7deac845d..800f0c1f2 100644 --- a/bin/propolis-server/src/lib/vm/services.rs +++ b/bin/propolis-server/src/lib/vm/services.rs @@ -18,19 +18,33 @@ use crate::{ use super::objects::{VmObjects, VmObjectsLocked}; +/// Information used to serve Oximeter metrics. #[derive(Default)] pub(crate) struct OximeterState { + /// The Oximeter server to which Oximeter clients connect to query for + /// metrics. server: Option, + + /// The statistics object used by the API layer to record its metrics. pub stats: Option, } +/// A collection of services visible to consumers outside this Propolis that +/// depend on the functionality supplied by an extant VM. pub(crate) struct VmServices { + /// A VM's serial console handler task. pub serial_task: tokio::sync::Mutex>, + + /// A VM's Oximeter server. pub oximeter: tokio::sync::Mutex, + + /// A reference to the VM's host process's VNC server. pub vnc_server: Arc>, } impl VmServices { + /// Starts a new set of VM services using the supplied VM objects and server + /// configuration. pub(super) async fn new( log: &slog::Logger, vm: &Arc, @@ -77,6 +91,7 @@ impl VmServices { } } + /// Directs all the services in this service block to stop. pub(super) async fn stop(&self, log: &Logger) { self.vnc_server.stop().await; @@ -100,6 +115,8 @@ impl VmServices { } } +/// Creates an Oximeter producer and registers it with Oximeter, which will call +/// back into the server to gather the producer's metrics. async fn register_oximeter_producer( log: &slog::Logger, cfg: &MetricsEndpointConfig, @@ -161,6 +178,7 @@ async fn register_oximeter_producer( oximeter_state } +/// Launches a serial console handler task. async fn start_serial_task( log: &slog::Logger, vm_objects: &tokio::sync::RwLockReadGuard<'_, VmObjectsLocked>, diff --git a/bin/propolis-server/src/lib/vm/startup.rs b/bin/propolis-server/src/lib/vm/startup.rs index 610e22b80..721a13ad9 100644 --- a/bin/propolis-server/src/lib/vm/startup.rs +++ b/bin/propolis-server/src/lib/vm/startup.rs @@ -32,20 +32,53 @@ use super::{ }, }; +/// The context needed to finish a live migration into a VM after its initial +/// Sync phase has concluded and produced a set of VM objects (into which the +/// migration will import the source VM's state). pub(super) struct MigrateAsTargetContext { + /// The objects into which to import state from the source. vm_objects: Arc, + + /// The logger associated with this migration. log: slog::Logger, + + /// The migration's ID. migration_id: Uuid, + + /// A handle to the task that's driving the migration. migrate_task: tokio::task::JoinHandle>, + + /// Receives commands from the migration task. command_rx: tokio::sync::mpsc::Receiver, + + /// Sends command responses to the migration task. response_tx: tokio::sync::mpsc::Sender, } +/// The output of a call to [`build_vm`]. pub(super) struct BuildVmOutput { + /// A reference to the VM objects created by the request to build a new VM. pub vm_objects: Arc, + + /// If the VM is initializing via migration in, the context needed to + /// complete that migration. pub migration_in: Option, } +/// Builds a new set of VM objects from the supplied ensure `request`. +/// +/// If the request asks to create a new VM without migrating, this routine +/// simply sets up the new VM's objects and returns them. 
+/// +/// Callers who ask to initialize a VM via live migration expect their API calls +/// to succeed as soon as there's an initialized VM and a running migration +/// task, even if the migration hasn't completed yet. To facilitate this, when +/// initializing via live migration, this routine executes only enough of the +/// live migration protocol to create VM objects, then immediately returns those +/// objects and a context the caller can use to finish the migration task. This +/// allows the caller to complete any external ensure calls it has pending +/// before completing migration and allowing the state driver to process state +/// change requests. pub(super) async fn build_vm( log: &slog::Logger, parent: &Arc, @@ -108,6 +141,15 @@ pub(super) async fn build_vm( .await }); + // In the initial phases of live migration, the migration protocol decides + // whether the source and destination VMs have compatible configurations. If + // they do, the migration task asks this routine to initialize a VM on its + // behalf. Execute this part of the protocol now in order to create a set of + // VM objects to return. + // + // TODO(#706): Future versions of the protocol can extend this further, + // specifying an instance spec and/or an initial set of device payloads that + // the task should use to initialize its VM objects. let init_command = command_rx.recv().await.ok_or_else(|| { (anyhow::anyhow!("migration task unexpectedly closed channel"), None) })?; @@ -153,6 +195,8 @@ pub(super) async fn build_vm( } }; + // The VM's objects are initialized. Return them to the caller along with a + // continuation context that it can use to complete migration. let migration_in = MigrateAsTargetContext { vm_objects: vm_objects.clone(), log: log.clone(), @@ -165,6 +209,7 @@ pub(super) async fn build_vm( Ok(BuildVmOutput { vm_objects, migration_in: Some(migration_in) }) } +/// Initializes a set of Propolis components from the supplied instance spec. async fn initialize_vm_objects_from_spec( log: &slog::Logger, event_queue: &Arc, @@ -235,7 +280,7 @@ async fn initialize_vm_objects_from_spec( init.initialize_cpus()?; let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( &machine, - event_queue.clone() as Arc, + event_queue.clone() as Arc, log.new(slog::o!("component" => "vcpu_tasks")), )?); @@ -257,19 +302,20 @@ async fn initialize_vm_objects_from_spec( } impl MigrateAsTargetContext { + /// Runs a partially-completed inbound live migration to completion. pub(super) async fn run( mut self, state_publisher: &mut StatePublisher, ) -> Result<(), MigrateError> { // The migration task imports device state by operating directly on the - // newly-created VM objects. Before sending them to the task, make sure the - // objects are ready to have state imported into them. Specifically, ensure - // that the VM's vCPUs are activated so they can enter the guest after - // migration and pause the kernel VM to allow it to import device state - // consistently. + // newly-created VM objects. Before sending them to the task, make sure + // the objects are ready to have state imported into them. Specifically, + // ensure that the VM's vCPUs are activated so they can enter the guest + // after migration and pause the kernel VM to allow it to import device + // state consistently. // - // Drop the lock after this operation so that the migration task can acquire - // it. + // Drop the lock after this operation so that the migration task can + // acquire it to enumerate devices and import state into them. 
{ let guard = self.vm_objects.read().await; guard.reset_vcpus(); diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 382411722..22b3cbe1f 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! It drives the state vroom vroom +//! A task to handle requests to change a VM's state or configuration. use std::{ sync::{Arc, Condvar, Mutex}, @@ -34,10 +34,11 @@ use super::{ VmError, }; +/// Tells the state driver what to do after handling an event. #[derive(Debug, PartialEq, Eq)] enum HandleEventOutcome { Continue, - Exit, + Exit { final_state: InstanceState }, } /// A reason for starting a VM. @@ -47,14 +48,20 @@ pub(super) enum VmStartReason { ExplicitRequest, } +/// A kind of event the state driver can handle. #[derive(Debug)] enum InputQueueEvent { ExternalRequest(ExternalRequest), GuestEvent(GuestEvent), } +/// The lock-guarded parts of a state driver's input queue. struct InputQueueInner { + /// State change requests from the external API. external_requests: super::request_queue::ExternalRequestQueue, + + /// State change requests from the VM's components. These take precedence + /// over external state change requests. guest_events: super::guest_event::GuestEventQueue, } @@ -69,12 +76,14 @@ impl InputQueueInner { } } +/// A queue for external state change requests and guest-driven state changes. pub(super) struct InputQueue { inner: Mutex, cv: Condvar, } impl InputQueue { + /// Creates a new state driver input queue. pub(super) fn new( log: slog::Logger, auto_start: InstanceAutoStart, @@ -85,7 +94,15 @@ impl InputQueue { } } + /// Waits for an event to arrive on the input queue and returns it for + /// processing. + /// + /// External requests and guest events are stored in separate queues. If + /// both queues have events when this routine is called, the guest event + /// queue takes precedence. fn wait_for_next_event(&self) -> InputQueueEvent { + // `block_in_place` is required to avoid blocking the executor while + // waiting on the condvar. tokio::task::block_in_place(|| { let guard = self.inner.lock().unwrap(); let mut guard = self @@ -105,6 +122,9 @@ impl InputQueue { }) } + /// Notifies the external request queue that the instance's state has + /// changed so that it can change the dispositions for new state change + /// requests. fn notify_instance_state_change( &self, state: super::request_queue::InstanceStateChange, @@ -113,6 +133,7 @@ impl InputQueue { guard.external_requests.notify_instance_state_change(state); } + /// Submits an external state change request to the queue. pub(super) fn queue_external_request( &self, request: ExternalRequest, @@ -126,7 +147,7 @@ impl InputQueue { } } -impl guest_event::GuestEventHandler for InputQueue { +impl guest_event::VcpuEventHandler for InputQueue { fn suspend_halt_event(&self, when: Duration) { let mut guard = self.inner.lock().unwrap(); if guard @@ -185,21 +206,40 @@ impl guest_event::ChipsetEventHandler for InputQueue { } } -/// The context for a VM state driver task. +/// The context for a VM state driver task's main loop. struct StateDriver { + /// The state driver's associated logger. log: slog::Logger, + + /// The VM objects this driver is managing. objects: Arc, + + /// The input queue this driver gets events from. 
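The queue semantics documented above (guest events outrank external requests, and the consumer parks until either queue has work) reduce to the sketch below. In the patch the blocking wait additionally sits inside `tokio::task::block_in_place` so it does not stall the async executor; `&'static str` stands in for the real event types:

```rust
use std::collections::VecDeque;
use std::sync::{Condvar, Mutex};

struct Queues {
    guest: VecDeque<&'static str>,
    external: VecDeque<&'static str>,
}

struct InputQueueSketch {
    inner: Mutex<Queues>,
    cv: Condvar,
}

impl InputQueueSketch {
    fn wait_for_next_event(&self) -> &'static str {
        let guard = self.inner.lock().unwrap();
        // Park until at least one of the two queues has an entry.
        let mut guard = self
            .cv
            .wait_while(guard, |q| {
                q.guest.is_empty() && q.external.is_empty()
            })
            .unwrap();

        // Guest events outrank external requests when both are pending.
        if let Some(ev) = guard.guest.pop_front() {
            ev
        } else {
            guard.external.pop_front().unwrap()
        }
    }
}
```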
input_queue: Arc, + + /// The channel to which this driver publishes external instance state + /// changes. external_state: StatePublisher, + + /// True if the VM is paused. paused: bool, + + /// State persisted from previous attempts to migrate out of this VM. migration_src_state: crate::migrate::source::PersistentState, } +/// The values returned by a state driver task when it exits. pub(super) struct StateDriverOutput { + /// The channel this driver used to publish external instance state changes. pub state_publisher: StatePublisher, + + /// The terminal state of this instance. When the instance completes + /// rundown, the parent VM publishes this state to the associated channel. pub final_state: InstanceState, } +/// Creates a new set of VM objects in response to an `ensure_request` directed +/// to the supplied `vm`. pub(super) async fn run_state_driver( log: slog::Logger, vm: Arc, @@ -237,7 +277,8 @@ pub(super) async fn run_state_driver( state_publisher.update(ExternalStateUpdate::Instance( InstanceState::Failed, )); - vm.start_failed(objects.is_some()).await; + + vm.vm_init_failed(objects.is_some()).await; let _ = ensure_result_tx .send(Err(VmError::InitializationFailed(e))); return StateDriverOutput { @@ -268,6 +309,15 @@ pub(super) async fn run_state_driver( .map(|id| InstanceMigrateInitiateResponse { migration_id: id }), })); + // If the VM was initialized via migration in, complete that migration now. + // + // External callers who ask to initialize an instance via migration in + // expect their API calls to complete once the relevant VM is initialized + // and the migration task has started (as opposed to when the entire + // migration attempt has completed), so this must happen after the ensure + // result is published. (Note that it's OK for the migration to fail after + // this point: the ensure request succeeds, but the instance goes to the + // Failed state and the migration appears to have failed.) if let Some(migration_in) = migration_in { if let Err(e) = migration_in.run(&mut state_publisher).await { error!(log, "inbound live migration task failed"; @@ -290,27 +340,31 @@ pub(super) async fn run_state_driver( migration_src_state: Default::default(), }; - let state_publisher = state_driver.run(migration_in_id.is_some()).await; + // Run the VM until it exits, then set rundown on the parent VM so that no + // new external callers can access its objects or services. 
+ let output = state_driver.run(migration_in_id.is_some()).await; vm.set_rundown().await; - StateDriverOutput { state_publisher, final_state: InstanceState::Destroyed } + output } impl StateDriver { - pub(super) async fn run(mut self, migrated_in: bool) -> StatePublisher { + pub(super) async fn run(mut self, migrated_in: bool) -> StateDriverOutput { info!(self.log, "state driver launched"); - if migrated_in { + let final_state = if migrated_in { if self.start_vm(VmStartReason::MigratedIn).await.is_ok() { - self.run_loop().await; + self.run_loop().await + } else { + InstanceState::Failed } } else { - self.run_loop().await; - } + self.run_loop().await + }; - self.external_state + StateDriverOutput { state_publisher: self.external_state, final_state } } - async fn run_loop(&mut self) { + async fn run_loop(&mut self) -> InstanceState { info!(self.log, "state driver entered main loop"); loop { let event = self.input_queue.wait_for_next_event(); @@ -326,12 +380,16 @@ impl StateDriver { }; info!(self.log, "state driver handled event"; "outcome" => ?outcome); - if outcome == HandleEventOutcome::Exit { - break; + match outcome { + HandleEventOutcome::Continue => {} + HandleEventOutcome::Exit { final_state } => { + info!(self.log, "state driver exiting"; + "final_state" => ?final_state); + + return final_state; + } } } - - info!(self.log, "state driver exiting"); } async fn start_vm( @@ -363,7 +421,9 @@ impl StateDriver { GuestEvent::VcpuSuspendHalt(_when) => { info!(self.log, "Halting due to VM suspend event",); self.do_halt().await; - HandleEventOutcome::Exit + HandleEventOutcome::Exit { + final_state: InstanceState::Destroyed, + } } GuestEvent::VcpuSuspendReset(_when) => { info!(self.log, "Resetting due to VM suspend event"); @@ -381,7 +441,9 @@ impl StateDriver { GuestEvent::ChipsetHalt => { info!(self.log, "Halting due to chipset-driven halt"); self.do_halt().await; - HandleEventOutcome::Exit + HandleEventOutcome::Exit { + final_state: InstanceState::Destroyed, + } } GuestEvent::ChipsetReset => { info!(self.log, "Resetting due to chipset-driven reset"); @@ -399,7 +461,9 @@ impl StateDriver { ExternalRequest::Start => { match self.start_vm(VmStartReason::ExplicitRequest).await { Ok(_) => HandleEventOutcome::Continue, - Err(_) => HandleEventOutcome::Exit, + Err(_) => HandleEventOutcome::Exit { + final_state: InstanceState::Failed, + }, } } ExternalRequest::MigrateAsSource { migration_id, websock } => { @@ -418,7 +482,9 @@ impl StateDriver { } ExternalRequest::Stop => { self.do_halt().await; - HandleEventOutcome::Exit + HandleEventOutcome::Exit { + final_state: InstanceState::Destroyed, + } } ExternalRequest::ReconfigureCrucibleVolume { disk_name, diff --git a/bin/propolis-server/src/lib/vm/state_publisher.rs b/bin/propolis-server/src/lib/vm/state_publisher.rs index 319ea9bf1..f8e2bb7c7 100644 --- a/bin/propolis-server/src/lib/vm/state_publisher.rs +++ b/bin/propolis-server/src/lib/vm/state_publisher.rs @@ -16,13 +16,21 @@ use crate::migrate::MigrateRole; use super::{InstanceStateRx, InstanceStateTx}; +/// An update to an instance's migration's state. pub(super) struct MigrationStateUpdate { + /// The migration's new state. pub state: propolis_api_types::MigrationState, + + /// The migration's ID. pub id: Uuid, + + /// The role this VM was playing in the migration of interest. pub role: MigrateRole, } impl MigrationStateUpdate { + /// Applies an update to a previous migration status and returns the new + /// status. 
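The `Exit { final_state }` variant above moves the choice of terminal state to the point where each event is handled, instead of hardcoding it at the loop's exit. Stripped to its shape (hypothetical event encoding, not the real driver):

    #[derive(Clone, Copy, Debug)]
    enum InstanceState {
        Destroyed,
        Failed,
    }

    enum Outcome {
        Continue,
        Exit { final_state: InstanceState },
    }

    fn run_loop(mut next_event: impl FnMut() -> u8) -> InstanceState {
        loop {
            // Each handled event either keeps the loop running or names the
            // state the VM should be left in when the loop exits.
            let outcome = match next_event() {
                0 => Outcome::Exit { final_state: InstanceState::Destroyed },
                1 => Outcome::Exit { final_state: InstanceState::Failed },
                _ => Outcome::Continue,
            };

            if let Outcome::Exit { final_state } = outcome {
                return final_state;
            }
        }
    }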
fn apply_to( self, old: InstanceMigrateStatusResponse, @@ -41,12 +49,19 @@ impl MigrationStateUpdate { } } +/// A kind of state update to publish. pub(super) enum ExternalStateUpdate { + /// Update the instance state (but not any migration state). Instance(InstanceState), + + /// Update migration state (but not the instance's state). Migration(MigrationStateUpdate), + + /// Update both instance and migration state. Complete(InstanceState, MigrationStateUpdate), } +/// A channel to which to publish externally-visible instance state updates. pub(super) struct StatePublisher { tx: InstanceStateTx, log: slog::Logger, @@ -61,6 +76,8 @@ impl StatePublisher { (Self { tx, log: log.clone() }, rx) } + /// Updates an instance's externally-visible state and publishes that state + /// with a successor generation number. pub(super) fn update(&mut self, update: ExternalStateUpdate) { let (instance_state, migration_state) = match update { ExternalStateUpdate::Instance(i) => (Some(i), None), From d933bc9778f9af8931a22d923410b367cb459c3f Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 28 Jun 2024 23:54:35 +0000 Subject: [PATCH 36/55] notify Crucible reconfiguration waiters when they're preempted by instance stop --- .../src/lib/vm/request_queue.rs | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs index 3c3e2d184..3b0fcbe0d 100644 --- a/bin/propolis-server/src/lib/vm/request_queue.rs +++ b/bin/propolis-server/src/lib/vm/request_queue.rs @@ -412,6 +412,46 @@ impl ExternalRequestQueue { } } +// It's possible for an external request queue to be dropped with outstanding +// requests if an event from the guest shuts down the VM before the queue can be +// drained. If this happens, notify anyone waiting on a specific request on the +// queue that the VM is gone. +impl Drop for ExternalRequestQueue { + fn drop(&mut self) { + for req in self.queue.drain(..) { + match req { + // Crucible VCR change requestors wait for their requests to be + // retired. + ExternalRequest::ReconfigureCrucibleVolume { + result_tx, + .. + } => { + let _ = + result_tx.send(Err(dropshot::HttpError::for_status( + Some( + "VM destroyed before request could be handled" + .to_string(), + ), + http::StatusCode::GONE, + ))); + } + + // Requests to start, reboot, and stop are handled + // asynchronously (calls to change the instance's state return + // as soon as they're queued). + ExternalRequest::Start + | ExternalRequest::Reboot + | ExternalRequest::Stop => {} + + // Dropping a request to migrate out drops the embedded + // connection to the migration target, thus notifying it that + // the source is gone. + ExternalRequest::MigrateAsSource { .. } => {} + } + } + } +} + #[cfg(test)] mod test { use super::*; From 82376e59be588e9c1e7935c4c9e597c63adf020d Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 2 Jul 2024 00:01:34 +0000 Subject: [PATCH 37/55] TaskGroup mutex can still be std --- lib/propolis/src/tasks.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/propolis/src/tasks.rs b/lib/propolis/src/tasks.rs index 3920b82d5..12076e6ab 100644 --- a/lib/propolis/src/tasks.rs +++ b/lib/propolis/src/tasks.rs @@ -419,26 +419,29 @@ impl Drop for CtrlHeld<'_> { /// Holds a group of tokio task [task::JoinHandle]s to be later joined as a /// group when they have all concluded. 
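Patch 36's `Drop` impl, a few hunks up, exists so a caller parked on a reconfiguration result channel is woken rather than hung when a guest-driven shutdown discards the queue. The pattern in isolation, with a plain tokio oneshot standing in for the dropshot result type (names hypothetical):

    use std::collections::VecDeque;
    use tokio::sync::oneshot;

    enum Request {
        Reconfigure { result_tx: oneshot::Sender<Result<(), String>> },
        Stop,
    }

    struct RequestQueue {
        queue: VecDeque<Request>,
    }

    impl Drop for RequestQueue {
        fn drop(&mut self) {
            for req in self.queue.drain(..) {
                match req {
                    // Requests with waiters must hear that the VM is gone;
                    // fire-and-forget requests need no notification.
                    Request::Reconfigure { result_tx } => {
                        let _ = result_tx.send(Err(
                            "VM destroyed before request could be handled"
                                .to_string(),
                        ));
                    }
                    Request::Stop => {}
                }
            }
        }
    }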
-pub struct TaskGroup(tokio::sync::Mutex>>); +pub struct TaskGroup(Mutex>>); impl TaskGroup { pub fn new() -> Self { - Self(tokio::sync::Mutex::new(Vec::new())) + Self(Mutex::new(Vec::new())) } /// Add to the group of contained tasks - pub async fn extend(&self, tasks: I) + pub fn extend(&self, tasks: I) where I: Iterator>, { - let mut guard = self.0.lock().await; + let mut guard = self.0.lock().unwrap(); guard.extend(tasks); } /// Block until all held tasks have been joined, returning any resulting /// [task::JoinError]s after doing so. pub async fn block_until_joined(&self) -> Option> { - let mut guard = self.0.lock().await; - let workers = std::mem::replace(&mut *guard, Vec::new()); + let workers = { + let mut guard = self.0.lock().unwrap(); + std::mem::replace(&mut *guard, Vec::new()) + }; + if workers.is_empty() { return None; } From 2e70f5883a8b93e46f1dfe75800fe243c745c7b9 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 2 Jul 2024 16:15:00 +0000 Subject: [PATCH 38/55] use async_trait to reduce some paperwork --- Cargo.lock | 1 + lib/propolis/Cargo.toml | 1 + lib/propolis/src/block/crucible.rs | 48 +++++++++++++--------------- lib/propolis/src/block/file.rs | 30 ++++++++---------- lib/propolis/src/block/in_memory.rs | 30 ++++++++---------- lib/propolis/src/block/mem_async.rs | 49 +++++++++++++---------------- lib/propolis/src/block/mod.rs | 6 ++-- 7 files changed, 73 insertions(+), 92 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6cf1f1eee..8921fe9d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3422,6 +3422,7 @@ name = "propolis" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "bhyve_api", "bitflags 2.5.0", "bitstruct", diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml index 6d75a067f..c663b437f 100644 --- a/lib/propolis/Cargo.toml +++ b/lib/propolis/Cargo.toml @@ -34,6 +34,7 @@ crucible-client-types = { workspace = true, optional = true } crucible = { workspace = true, optional = true } oximeter = { workspace = true, optional = true } nexus-client = { workspace = true, optional = true } +async-trait.workspace = true # falcon libloading = { workspace = true, optional = true } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index e5fb5dbb6..b8c09aacf 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -16,7 +16,6 @@ use crucible::{ BlockIO, Buffer, CrucibleError, ReplaceResult, SnapshotDetails, Volume, }; use crucible_client_types::VolumeConstructionRequest; -use futures::future::BoxFuture; use oximeter::types::ProducerRegistry; use slog::{error, info}; use thiserror::Error; @@ -272,21 +271,19 @@ impl CrucibleBackend { async fn spawn_workers(&self) { // TODO: make this tunable? 
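Patch 38's point can be shown in two trait declarations. The `#[async_trait]` form expands to roughly the hand-boxed form the code wrote before, so the change is paperwork reduction rather than a behavior change. Illustrative only (assumes the `async-trait`, `futures`, and `anyhow` crates; not the real `Backend` trait):

    use futures::future::BoxFuture;

    // The shape written by hand before this patch:
    trait BackendManual {
        fn start(&self) -> BoxFuture<'_, anyhow::Result<()>>;
    }

    // With async_trait, the trait states its intent directly; the macro
    // desugars the method to approximately the boxed form above.
    #[async_trait::async_trait]
    trait BackendAsync {
        async fn start(&self) -> anyhow::Result<()>;
    }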
let worker_count = 8; - self.workers - .extend((0..worker_count).map(|n| { - let worker_state = self.state.clone(); - let worker_acc = self - .state - .attachment - .accessor_mem(|acc_mem| { - acc_mem.child(Some(format!("crucible worker {n}"))) - }) - .expect("backend is attached"); - tokio::spawn(async move { - worker_state.process_loop(worker_acc).await + self.workers.extend((0..worker_count).map(|n| { + let worker_state = self.state.clone(); + let worker_acc = self + .state + .attachment + .accessor_mem(|acc_mem| { + acc_mem.child(Some(format!("crucible worker {n}"))) }) - })) - .await; + .expect("backend is attached"); + tokio::spawn( + async move { worker_state.process_loop(worker_acc).await }, + ) + })); } pub async fn volume_is_active(&self) -> Result { @@ -294,6 +291,7 @@ impl CrucibleBackend { } } +#[async_trait::async_trait] impl block::Backend for CrucibleBackend { fn attachment(&self) -> &block::BackendAttachment { &self.state.attachment @@ -301,19 +299,15 @@ impl block::Backend for CrucibleBackend { fn info(&self) -> DeviceInfo { self.state.info } - fn start(&self) -> BoxFuture<'_, anyhow::Result<()>> { - Box::pin(async { - self.state.volume.activate().await?; - self.state.attachment.start(); - self.spawn_workers().await; - Ok(()) - }) + async fn start(&self) -> anyhow::Result<()> { + self.state.volume.activate().await?; + self.state.attachment.start(); + self.spawn_workers().await; + Ok(()) } - fn stop(&self) -> BoxFuture<'_, ()> { - Box::pin(async { - self.state.attachment.stop(); - self.workers.block_until_joined().await; - }) + async fn stop(&self) -> () { + self.state.attachment.stop(); + self.workers.block_until_joined().await; } } diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs index 8f7265df2..aa8b3e99d 100644 --- a/lib/propolis/src/block/file.rs +++ b/lib/propolis/src/block/file.rs @@ -18,7 +18,6 @@ use crate::util::ioctl; use crate::vmm::{MappingExt, MemCtx}; use anyhow::Context; -use futures::future::BoxFuture; // XXX: completely arb for now const MAX_WORKERS: usize = 32; @@ -212,6 +211,7 @@ impl FileBackend { } } +#[async_trait::async_trait] impl block::Backend for FileBackend { fn attachment(&self) -> &block::BackendAttachment { &self.state.attachment @@ -221,24 +221,20 @@ impl block::Backend for FileBackend { self.state.info } - fn start(&self) -> BoxFuture<'_, anyhow::Result<()>> { - Box::pin(async { - self.state.attachment.start(); - if let Err(e) = self.spawn_workers() { - self.state.attachment.stop(); - self.workers.block_until_joined(); - Err(e).context("failure while spawning workers") - } else { - Ok(()) - } - }) - } - - fn stop(&self) -> BoxFuture<'_, ()> { - Box::pin(async { + async fn start(&self) -> anyhow::Result<()> { + self.state.attachment.start(); + if let Err(e) = self.spawn_workers() { self.state.attachment.stop(); self.workers.block_until_joined(); - }) + Err(e).context("failure while spawning workers") + } else { + Ok(()) + } + } + + async fn stop(&self) -> () { + self.state.attachment.stop(); + self.workers.block_until_joined(); } } diff --git a/lib/propolis/src/block/in_memory.rs b/lib/propolis/src/block/in_memory.rs index 1eb967a20..3636bc8fa 100644 --- a/lib/propolis/src/block/in_memory.rs +++ b/lib/propolis/src/block/in_memory.rs @@ -12,7 +12,6 @@ use crate::tasks::ThreadGroup; use crate::vmm::{MemCtx, SubMapping}; use anyhow::Context; -use futures::future::BoxFuture; pub struct InMemoryBackend { state: Arc, @@ -141,6 +140,7 @@ impl InMemoryBackend { } } +#[async_trait::async_trait] impl block::Backend for 
InMemoryBackend { fn attachment(&self) -> &block::BackendAttachment { &self.state.attachment @@ -150,24 +150,20 @@ impl block::Backend for InMemoryBackend { self.state.info } - fn start(&self) -> BoxFuture<'_, anyhow::Result<()>> { - Box::pin(async { - self.state.attachment.start(); - if let Err(e) = self.spawn_workers() { - self.state.attachment.stop(); - self.workers.block_until_joined(); - Err(e).context("failure while spawning workers") - } else { - Ok(()) - } - }) - } - - fn stop(&self) -> BoxFuture<'_, ()> { - Box::pin(async { + async fn start(&self) -> anyhow::Result<()> { + self.state.attachment.start(); + if let Err(e) = self.spawn_workers() { self.state.attachment.stop(); self.workers.block_until_joined(); - }) + Err(e).context("failure while spawning workers") + } else { + Ok(()) + } + } + + async fn stop(&self) -> () { + self.state.attachment.stop(); + self.workers.block_until_joined(); } } diff --git a/lib/propolis/src/block/mem_async.rs b/lib/propolis/src/block/mem_async.rs index 1feaf7e0e..e6683563e 100644 --- a/lib/propolis/src/block/mem_async.rs +++ b/lib/propolis/src/block/mem_async.rs @@ -7,8 +7,6 @@ use std::num::NonZeroUsize; use std::ptr::NonNull; use std::sync::Arc; -use futures::future::BoxFuture; - use crate::accessors::MemAccessor; use crate::block; use crate::tasks::TaskGroup; @@ -143,22 +141,20 @@ impl MemAsyncBackend { })) } - async fn spawn_workers(&self) { - self.workers - .extend((0..self.worker_count.get()).map(|n| { - let worker_state = self.work_state.clone(); - let worker_acc = self - .work_state - .attachment - .accessor_mem(|acc_mem| { - acc_mem.child(Some(format!("worker {n}"))) - }) - .expect("backend is attached"); - tokio::spawn(async move { - worker_state.processing_loop(worker_acc).await + fn spawn_workers(&self) { + self.workers.extend((0..self.worker_count.get()).map(|n| { + let worker_state = self.work_state.clone(); + let worker_acc = self + .work_state + .attachment + .accessor_mem(|acc_mem| { + acc_mem.child(Some(format!("worker {n}"))) }) - })) - .await; + .expect("backend is attached"); + tokio::spawn(async move { + worker_state.processing_loop(worker_acc).await + }) + })) } } @@ -209,6 +205,7 @@ impl Drop for MmapSeg { unsafe impl Send for MmapSeg {} unsafe impl Sync for MmapSeg {} +#[async_trait::async_trait] impl block::Backend for MemAsyncBackend { fn info(&self) -> block::DeviceInfo { self.work_state.info @@ -218,18 +215,14 @@ impl block::Backend for MemAsyncBackend { &self.work_state.attachment } - fn start(&self) -> BoxFuture<'_, anyhow::Result<()>> { - Box::pin(async { - self.work_state.attachment.start(); - self.spawn_workers().await; - Ok(()) - }) + async fn start(&self) -> anyhow::Result<()> { + self.work_state.attachment.start(); + self.spawn_workers(); + Ok(()) } - fn stop(&self) -> BoxFuture<'_, ()> { - Box::pin(async { - self.work_state.attachment.stop(); - self.workers.block_until_joined().await; - }) + async fn stop(&self) -> () { + self.work_state.attachment.stop(); + self.workers.block_until_joined().await; } } diff --git a/lib/propolis/src/block/mod.rs b/lib/propolis/src/block/mod.rs index 6e47701b7..5e838e62d 100644 --- a/lib/propolis/src/block/mod.rs +++ b/lib/propolis/src/block/mod.rs @@ -13,7 +13,6 @@ use crate::vmm::{MemCtx, SubMapping}; mod file; pub use file::FileBackend; -use futures::future::BoxFuture; #[cfg(feature = "crucible")] mod crucible; @@ -234,6 +233,7 @@ pub trait Device: Send + Sync + 'static { /// Top-level trait for block backends which will attach to [Device]s in order /// to process [Request]s 
posted by the guest. +#[async_trait::async_trait] pub trait Backend: Send + Sync + 'static { /// Access to the [BackendAttachment] representing this backend. fn attachment(&self) -> &BackendAttachment; @@ -245,7 +245,7 @@ pub trait Backend: Send + Sync + 'static { /// /// Spawning of any tasks required to do such request processing can be done /// as part of this start-up. - fn start(&self) -> BoxFuture<'_, anyhow::Result<()>>; + async fn start(&self) -> anyhow::Result<()>; /// Stop attempting to process new [Request]s from [Device] (if attached) /// @@ -254,7 +254,7 @@ pub trait Backend: Send + Sync + 'static { /// /// If any tasks were spawned as part of [Backend::start()], they should be /// brought to rest as part of this call. - fn stop(&self) -> BoxFuture<'_, ()>; + async fn stop(&self) -> (); /// Attempt to detach from associated [Device] /// From f9efa928885b436b7da42623f34060de5ec30862 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 2 Jul 2024 17:25:32 +0000 Subject: [PATCH 39/55] LifecycleMap -> DeviceMap --- bin/propolis-server/src/lib/initializer.rs | 4 ++-- bin/propolis-server/src/lib/vm/mod.rs | 2 +- bin/propolis-server/src/lib/vm/objects.rs | 7 +++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index 2bf69b631..060934d74 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -12,7 +12,7 @@ use std::time::{SystemTime, UNIX_EPOCH}; use crate::serial::Serial; use crate::stats::virtual_machine::VirtualMachine; -use crate::vm::{BlockBackendMap, CrucibleBackendMap, LifecycleMap}; +use crate::vm::{BlockBackendMap, CrucibleBackendMap, DeviceMap}; use anyhow::{Context, Result}; use crucible_client_types::VolumeConstructionRequest; pub use nexus_client::Client as NexusClient; @@ -110,7 +110,7 @@ pub struct MachineInitializerState { pub struct MachineInitializer<'a> { pub(crate) log: slog::Logger, pub(crate) machine: &'a Machine, - pub(crate) devices: LifecycleMap, + pub(crate) devices: DeviceMap, pub(crate) block_backends: BlockBackendMap, pub(crate) crucible_backends: CrucibleBackendMap, pub(crate) spec: &'a InstanceSpecV0, diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 0413558f6..b596590d7 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -105,7 +105,7 @@ mod state_publisher; /// Maps component names to lifecycle trait objects that allow /// components to be started, paused, resumed, and halted. -pub(crate) type LifecycleMap = +pub(crate) type DeviceMap = BTreeMap>; /// Maps component names to block backend trait objects. diff --git a/bin/propolis-server/src/lib/vm/objects.rs b/bin/propolis-server/src/lib/vm/objects.rs index a597ff1d9..4c0100a2c 100644 --- a/bin/propolis-server/src/lib/vm/objects.rs +++ b/bin/propolis-server/src/lib/vm/objects.rs @@ -22,8 +22,7 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use crate::{serial::Serial, vcpu_tasks::VcpuTaskController}; use super::{ - state_driver::VmStartReason, BlockBackendMap, CrucibleBackendMap, - LifecycleMap, + state_driver::VmStartReason, BlockBackendMap, CrucibleBackendMap, DeviceMap, }; /// A collection of components that make up a Propolis VM instance. 
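The map behind patch 39's rename is simple enough to sketch: component name to lifecycle trait object, with `BTreeMap` giving a deterministic iteration order. A hypothetical reduction (not the real `Lifecycle` trait):

    use std::{collections::BTreeMap, sync::Arc};

    trait Lifecycle: Send + Sync {
        fn pause(&self);
    }

    type DeviceMap = BTreeMap<String, Arc<dyn Lifecycle>>;

    fn pause_all(devices: &DeviceMap) {
        // BTreeMap iterates in name order, so pause operations happen in a
        // stable, reproducible sequence.
        for dev in devices.values() {
            dev.pause();
        }
    }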
@@ -49,7 +48,7 @@ pub(super) struct InputVmObjects { pub instance_spec: InstanceSpecV0, pub vcpu_tasks: Box, pub machine: Machine, - pub lifecycle_components: LifecycleMap, + pub lifecycle_components: DeviceMap, pub block_backends: BlockBackendMap, pub crucible_backends: CrucibleBackendMap, pub com1: Arc>, @@ -73,7 +72,7 @@ pub(crate) struct VmObjectsLocked { /// Maps from component names to the trait objects that implement lifecycle /// operations (e.g. pause and resume) for eligible components. - lifecycle_components: LifecycleMap, + lifecycle_components: DeviceMap, /// Maps from component names to trait objects that implement the block /// storage backend trait. From 3b4870a23e5217bb746494939854b9cb190d858a Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 2 Jul 2024 17:34:07 +0000 Subject: [PATCH 40/55] clean up interface to VmObjects - use wrapper types to hide the tokio-ness of the underlying reader-writer lock - provide direct access to some subcomponents of `Machine` for brevity --- .../src/lib/migrate/destination.rs | 25 +++++---- bin/propolis-server/src/lib/migrate/source.rs | 42 ++++++++------- bin/propolis-server/src/lib/server.rs | 10 ++-- bin/propolis-server/src/lib/vm/mod.rs | 4 +- bin/propolis-server/src/lib/vm/objects.rs | 53 +++++++++++++++++-- bin/propolis-server/src/lib/vm/services.rs | 8 +-- bin/propolis-server/src/lib/vm/startup.rs | 7 ++- .../src/lib/vm/state_driver.rs | 13 ++--- bin/propolis-server/src/lib/vnc.rs | 5 +- 9 files changed, 109 insertions(+), 58 deletions(-) diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index 1ac32e997..a423667b3 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -209,7 +209,12 @@ impl DestinationProtocol { info!(self.log(), "Destination read Preamble: {:?}", preamble); if let Err(e) = preamble.is_migration_compatible( - self.vm_objects.as_ref().unwrap().read().await.instance_spec(), + self.vm_objects + .as_ref() + .unwrap() + .lock_shared() + .await + .instance_spec(), ) { error!( self.log(), @@ -354,10 +359,9 @@ impl DestinationProtocol { info!(self.log(), "Devices: {devices:#?}"); { - let objects = self.vm_objects.as_ref().unwrap().read().await; - let migrate_ctx = MigrateCtx { - mem: &objects.machine().acc_mem.access().unwrap(), - }; + let objects = self.vm_objects.as_ref().unwrap().lock_shared().await; + let migrate_ctx = + MigrateCtx { mem: &objects.access_mem().unwrap() }; for device in devices { info!( self.log(), @@ -411,10 +415,9 @@ impl DestinationProtocol { .vm_objects .as_ref() .unwrap() - .read() + .lock_shared() .await - .machine() - .hdl + .vmm_hdl() .clone(); let (dst_hrt, dst_wc) = vmm::time::host_time_snapshot(vmm_hdl) @@ -612,7 +615,7 @@ impl DestinationProtocol { self.vm_objects .as_ref() .unwrap() - .read() + .lock_shared() .await .com1() .import(&com1_history) @@ -693,8 +696,8 @@ impl DestinationProtocol { addr: GuestAddr, buf: &[u8], ) -> Result<(), MigrateError> { - let objects = self.vm_objects.as_ref().unwrap().read().await; - let memctx = objects.machine().acc_mem.access().unwrap(); + let objects = self.vm_objects.as_ref().unwrap().lock_shared().await; + let memctx = objects.access_mem().unwrap(); let len = buf.len(); memctx.write_from(addr, buf, len); Ok(()) diff --git a/bin/propolis-server/src/lib/migrate/source.rs b/bin/propolis-server/src/lib/migrate/source.rs index 94f69e6c9..6da0e14b8 100644 --- a/bin/propolis-server/src/lib/migrate/source.rs +++ 
b/bin/propolis-server/src/lib/migrate/source.rs @@ -148,10 +148,11 @@ pub async fn migrate( // See the lengthy comment on `RamOfferDiscipline` above for more // details about what's going on here. { - let objects = proto.vm.read().await; - let machine = objects.machine(); + let objects = proto.vm.lock_shared().await; for (&GuestAddr(gpa), dirtiness) in proto.dirt.iter().flatten() { - if let Err(e) = machine.hdl.set_dirty_pages(gpa, dirtiness) { + if let Err(e) = + objects.vmm_hdl().set_dirty_pages(gpa, dirtiness) + { // Bad news! Our attempt to re-set the dirty bit on these // pages has failed! Thus, subsequent migration attempts // /!\ CAN NO LONGER RELY ON DIRTY PAGE TRACKING /!\ @@ -253,7 +254,7 @@ impl SourceProtocol { // the pre-pause RAM push. let dirt = { let can_npt_operate = - vm.read().await.machine().hdl.can_npt_operate(); + vm.lock_shared().await.vmm_hdl().can_npt_operate(); if can_npt_operate { Some(Default::default()) @@ -326,7 +327,7 @@ impl SourceProtocol { async fn sync(&mut self) -> Result<(), MigrateError> { self.update_state(MigrationState::Sync).await; let preamble = Preamble::new(VersionedInstanceSpec::V0( - self.vm.read().await.instance_spec().clone(), + self.vm.lock_shared().await.instance_spec().clone(), )); let s = ron::ser::to_string(&preamble) .map_err(codec::ProtocolError::from)?; @@ -576,10 +577,9 @@ impl SourceProtocol { self.update_state(MigrationState::Device).await; let mut device_states = vec![]; { - let objects = self.vm.read().await; - let machine = objects.machine(); + let objects = self.vm.lock_shared().await; let migrate_ctx = - MigrateCtx { mem: &machine.acc_mem.access().unwrap() }; + MigrateCtx { mem: &objects.access_mem().unwrap() }; // Collect together the serialized state for all the devices objects.for_each_device_fallible(|name, devop| { @@ -640,7 +640,7 @@ impl SourceProtocol { // Read and send over the time data async fn time_data(&mut self) -> Result<(), MigrateError> { - let vmm_hdl = &self.vm.read().await.machine().hdl.clone(); + let vmm_hdl = &self.vm.lock_shared().await.vmm_hdl().clone(); let vm_time_data = vmm::time::export_time_data(vmm_hdl).map_err(|e| { MigrateError::TimeData(format!( @@ -678,8 +678,13 @@ impl SourceProtocol { } _ => return Err(MigrateError::UnexpectedMessage), }; - let com1_history = - self.vm.read().await.com1().export_history(remote_addr).await?; + let com1_history = self + .vm + .lock_shared() + .await + .com1() + .export_history(remote_addr) + .await?; self.send_msg(codec::Message::Serialized(com1_history)).await?; self.read_ok().await } @@ -792,9 +797,8 @@ impl SourceProtocol { async fn vmm_ram_bounds( &mut self, ) -> Result, MigrateError> { - let objects = self.vm.read().await; - let machine = objects.machine(); - let memctx = machine.acc_mem.access().unwrap(); + let objects = self.vm.lock_shared().await; + let memctx = objects.access_mem().unwrap(); memctx.mem_bounds().ok_or(MigrateError::InvalidInstanceState) } @@ -804,10 +808,9 @@ impl SourceProtocol { bits: &mut [u8], ) -> Result<(), MigrateError> { self.vm - .read() + .lock_shared() .await - .machine() - .hdl + .vmm_hdl() .track_dirty_pages(start_gpa.0, bits) .map_err(|_| MigrateError::InvalidInstanceState) } @@ -817,9 +820,8 @@ impl SourceProtocol { addr: GuestAddr, buf: &mut [u8], ) -> Result<(), MigrateError> { - let objects = self.vm.read().await; - let machine = objects.machine(); - let memctx = machine.acc_mem.access().unwrap(); + let objects = self.vm.lock_shared().await; + let memctx = objects.access_mem().unwrap(); let len = buf.len(); 
        memctx.direct_read_into(addr, buf, len);
        Ok(())
diff --git a/bin/propolis-server/src/lib/server.rs b/bin/propolis-server/src/lib/server.rs
index d00cddcc3..eba893858 100644
--- a/bin/propolis-server/src/lib/server.rs
+++ b/bin/propolis-server/src/lib/server.rs
@@ -455,7 +455,7 @@ async fn instance_serial_history_get(
 {
     let ctx = rqctx.context();
     let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;
-    let serial = vm.objects().read().await.com1().clone();
+    let serial = vm.objects().lock_shared().await.com1().clone();
 
     let query_params = query.into_inner();
     let byte_offset = SerialHistoryOffset::try_from(&query_params)?;
@@ -483,7 +483,7 @@ async fn instance_serial(
 ) -> dropshot::WebsocketChannelResult {
     let ctx = rqctx.context();
     let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;
-    let serial = vm.objects().read().await.com1().clone();
+    let serial = vm.objects().lock_shared().await.com1().clone();
 
     // Use the default buffering parameters for the websocket configuration
     //
@@ -577,7 +577,7 @@ async fn instance_issue_crucible_snapshot_request(
 ) -> Result<HttpResponseOk<()>, HttpError> {
     let vm =
         rqctx.context().vm.active_vm().await.ok_or_else(not_created_error)?;
-    let objects = vm.objects().read().await;
+    let objects = vm.objects().lock_shared().await;
     let path_params = path_params.into_inner();
 
     let backend =
@@ -604,7 +604,7 @@ async fn disk_volume_status(
     let path_params = path_params.into_inner();
     let vm =
         rqctx.context().vm.active_vm().await.ok_or_else(not_created_error)?;
-    let objects = vm.objects().read().await;
+    let objects = vm.objects().lock_shared().await;
     let backend =
         objects.crucible_backends().get(&path_params.id).ok_or_else(|| {
             let s = format!("No crucible backend for id {}", path_params.id);
@@ -667,7 +667,7 @@ async fn instance_issue_nmi(
 ) -> Result<HttpResponseOk<()>, HttpError> {
     let vm =
         rqctx.context().vm.active_vm().await.ok_or_else(not_created_error)?;
-    let _ = vm.objects().read().await.machine().inject_nmi();
+    let _ = vm.objects().lock_shared().await.machine().inject_nmi();
 
     Ok(HttpResponseOk(()))
 }
diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs
index b596590d7..96d334d35 100644
--- a/bin/propolis-server/src/lib/vm/mod.rs
+++ b/bin/propolis-server/src/lib/vm/mod.rs
@@ -312,7 +312,7 @@ impl Vm {
             }
         };
 
-        let spec = vm.objects().read().await.instance_spec().clone();
+        let spec = vm.objects().lock_shared().await.instance_spec().clone();
         let state = vm.external_state_rx.borrow().clone();
         Ok(propolis_api_types::InstanceSpecGetResponse {
             properties: vm.properties.clone(),
@@ -421,7 +421,7 @@ impl Vm {
             panic!("VM should be active before being run down");
         };
 
-        let spec = vm.objects().read().await.instance_spec().clone();
+        let spec = vm.objects().lock_shared().await.instance_spec().clone();
         let ActiveVm { external_state_rx, properties, .. } = vm;
         guard.state = VmState::Rundown(VmDescription {
             external_state_rx,
diff --git a/bin/propolis-server/src/lib/vm/objects.rs b/bin/propolis-server/src/lib/vm/objects.rs
index 4c0100a2c..ddc7fa1f9 100644
--- a/bin/propolis-server/src/lib/vm/objects.rs
+++ b/bin/propolis-server/src/lib/vm/objects.rs
@@ -5,6 +5,7 @@
 //! A collection of all of the components that make up a Propolis VM instance.
use std::{ + ops::{Deref, DerefMut}, pin::Pin, sync::Arc, task::{Context, Poll}, @@ -13,6 +14,7 @@ use std::{ use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt}; use propolis::{ hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart}, + vmm::VmmHdl, Machine, }; use propolis_api_types::instance_spec::v0::InstanceSpecV0; @@ -112,8 +114,8 @@ impl VmObjects { /// /// This function is crate-visible to allow the API layer to read (but not /// mutate) VM objects. - pub(crate) async fn read(&self) -> RwLockReadGuard { - self.inner.read().await + pub(crate) async fn lock_shared(&self) -> VmObjectsShared { + VmObjectsShared(self.inner.read().await) } /// Yields an exclusive lock guard referring to the underlying object @@ -121,8 +123,8 @@ impl VmObjects { /// /// This function is only visible within the `vm` module so that only the /// state driver can obtain a mutable reference to the underlying objects. - pub(super) async fn write(&self) -> RwLockWriteGuard { - self.inner.write().await + pub(super) async fn lock_exclusive(&self) -> VmObjectsExclusive { + VmObjectsExclusive(self.inner.write().await) } } @@ -153,11 +155,24 @@ impl VmObjectsLocked { &mut self.instance_spec } - /// Yields the VM's current kernel VMM handle. + /// Yields the VM's current Propolis VM aggregation. pub(crate) fn machine(&self) -> &Machine { &self.machine } + /// Yields the VM's current kernel VMM handle. + pub(crate) fn vmm_hdl(&self) -> &Arc { + &self.machine.hdl + } + + /// Yields an accessor to the VM's memory context, or None if guest memory + /// is not currently accessible. + pub(crate) fn access_mem( + &self, + ) -> Option> { + self.machine.acc_mem.access() + } + /// Obtains a handle to the lifecycle trait object for the component with /// the supplied `name`. pub(crate) fn device_by_name( @@ -452,3 +467,31 @@ impl Drop for VmObjects { }); } } + +/// A shared lock on the contents of a [`VmObjects`]. +pub(crate) struct VmObjectsShared<'o>(RwLockReadGuard<'o, VmObjectsLocked>); + +/// An exclusive lock on the contents of a [`VmObjects`]. +pub(crate) struct VmObjectsExclusive<'o>(RwLockWriteGuard<'o, VmObjectsLocked>); + +impl Deref for VmObjectsShared<'_> { + type Target = VmObjectsLocked; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Deref for VmObjectsExclusive<'_> { + type Target = VmObjectsLocked; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for VmObjectsExclusive<'_> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/bin/propolis-server/src/lib/vm/services.rs b/bin/propolis-server/src/lib/vm/services.rs index 800f0c1f2..16d12141b 100644 --- a/bin/propolis-server/src/lib/vm/services.rs +++ b/bin/propolis-server/src/lib/vm/services.rs @@ -16,7 +16,7 @@ use crate::{ stats::virtual_machine::VirtualMachine, vnc::PropolisVncServer, }; -use super::objects::{VmObjects, VmObjectsLocked}; +use super::objects::{VmObjects, VmObjectsShared}; /// Information used to serve Oximeter metrics. #[derive(Default)] @@ -62,7 +62,7 @@ impl VmServices { OximeterState::default() }; - let vm_objects = vm_objects.read().await; + let vm_objects = vm_objects.lock_shared().await; let vnc_server = ensure_options.vnc_server.clone(); if let Some(ramfb) = vm_objects.framebuffer() { vnc_server @@ -179,9 +179,9 @@ async fn register_oximeter_producer( } /// Launches a serial console handler task. 
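The guard wrappers introduced in the hunk above distill to newtypes over the tokio guards plus `Deref`, so call sites get a purpose-named handle and never see the lock implementation. A minimal sketch (hypothetical `Inner` type):

    use std::ops::Deref;
    use tokio::sync::{RwLock, RwLockReadGuard};

    struct Inner {
        instance_spec: String,
    }

    struct Objects {
        inner: RwLock<Inner>,
    }

    struct Shared<'o>(RwLockReadGuard<'o, Inner>);

    impl Deref for Shared<'_> {
        type Target = Inner;
        fn deref(&self) -> &Inner {
            &self.0
        }
    }

    impl Objects {
        // Naming this lock_shared (rather than read) also makes call sites
        // read as a locking operation instead of a cheap accessor.
        async fn lock_shared(&self) -> Shared<'_> {
            Shared(self.inner.read().await)
        }
    }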
-async fn start_serial_task( +async fn start_serial_task<'vm>( log: &slog::Logger, - vm_objects: &tokio::sync::RwLockReadGuard<'_, VmObjectsLocked>, + vm_objects: &VmObjectsShared<'vm>, ) -> crate::serial::SerialTask { let (websocks_ch, websocks_recv) = tokio::sync::mpsc::channel(1); let (control_ch, control_recv) = tokio::sync::mpsc::channel(1); diff --git a/bin/propolis-server/src/lib/vm/startup.rs b/bin/propolis-server/src/lib/vm/startup.rs index 721a13ad9..a439c1777 100644 --- a/bin/propolis-server/src/lib/vm/startup.rs +++ b/bin/propolis-server/src/lib/vm/startup.rs @@ -317,7 +317,7 @@ impl MigrateAsTargetContext { // Drop the lock after this operation so that the migration task can // acquire it to enumerate devices and import state into them. { - let guard = self.vm_objects.read().await; + let guard = self.vm_objects.lock_shared().await; guard.reset_vcpus(); guard.pause_kernel_vm(); } @@ -346,7 +346,10 @@ impl MigrateAsTargetContext { error!(self.log, "target migration task failed"; "error" => %e); - self.vm_objects.write().await.resume_kernel_vm(); + self.vm_objects + .lock_exclusive() + .await + .resume_kernel_vm(); return Err(e); } }, diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 22b3cbe1f..daecaa95e 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -398,7 +398,8 @@ impl StateDriver { ) -> anyhow::Result<()> { info!(self.log, "starting instance"; "reason" => ?start_reason); - let start_result = self.objects.write().await.start(start_reason).await; + let start_result = + self.objects.lock_exclusive().await.start(start_reason).await; match &start_result { Ok(()) => { self.publish_steady_state(InstanceState::Running); @@ -511,7 +512,7 @@ impl StateDriver { self.external_state .update(ExternalStateUpdate::Instance(InstanceState::Rebooting)); - self.objects.write().await.reboot().await; + self.objects.lock_exclusive().await.reboot().await; // Notify other consumers that the instance successfully rebooted and is // now back to Running. @@ -528,7 +529,7 @@ impl StateDriver { .update(ExternalStateUpdate::Instance(InstanceState::Stopping)); { - let mut guard = self.objects.write().await; + let mut guard = self.objects.lock_exclusive().await; // Entities expect to be paused before being halted. 
Note that the VM // may be paused already if it is being torn down after a successful @@ -546,13 +547,13 @@ impl StateDriver { async fn pause(&mut self) { assert!(!self.paused); - self.objects.write().await.pause().await; + self.objects.lock_exclusive().await.pause().await; self.paused = true; } async fn resume(&mut self) { assert!(self.paused); - self.objects.write().await.resume(); + self.objects.lock_exclusive().await.resume(); self.paused = false; } @@ -703,7 +704,7 @@ impl StateDriver { dropshot::HttpError::for_not_found(Some(msg.clone()), msg) } - let mut objects = self.objects.write().await; + let mut objects = self.objects.lock_exclusive().await; let (readonly, old_vcr_json) = { let StorageBackendV0::Crucible(bes) = objects .instance_spec() diff --git a/bin/propolis-server/src/lib/vnc.rs b/bin/propolis-server/src/lib/vnc.rs index 3912f7367..9a93643f7 100644 --- a/bin/propolis-server/src/lib/vnc.rs +++ b/bin/propolis-server/src/lib/vnc.rs @@ -153,10 +153,9 @@ impl Server for PropolisVncServer { let len = fb.height as usize * fb.width as usize * 4; let mut buf = vec![0u8; len]; if let Some(vm) = inner.vm.as_ref().unwrap().active_vm().await { - let vm_objects = vm.objects().read().await; + let vm_objects = vm.objects().lock_shared().await; let read = tokio::task::block_in_place(|| { - let machine = vm_objects.machine(); - let memctx = machine.acc_mem.access().unwrap(); + let memctx = vm_objects.access_mem().unwrap(); memctx.read_into(GuestAddr(fb.addr), &mut buf, len) }); From 4e80e26760d0456599c40eca70e3adf572c832c7 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 2 Jul 2024 17:55:06 +0000 Subject: [PATCH 41/55] clean up unused async --- lib/propolis/src/block/crucible.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index b8c09aacf..9825a8c76 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -268,7 +268,7 @@ impl CrucibleBackend { .map_err(CrucibleError::into) } - async fn spawn_workers(&self) { + fn spawn_workers(&self) { // TODO: make this tunable? let worker_count = 8; self.workers.extend((0..worker_count).map(|n| { @@ -283,7 +283,7 @@ impl CrucibleBackend { tokio::spawn( async move { worker_state.process_loop(worker_acc).await }, ) - })); + })) } pub async fn volume_is_active(&self) -> Result { @@ -302,7 +302,7 @@ impl block::Backend for CrucibleBackend { async fn start(&self) -> anyhow::Result<()> { self.state.volume.activate().await?; self.state.attachment.start(); - self.spawn_workers().await; + self.spawn_workers(); Ok(()) } async fn stop(&self) -> () { From 746b34f2db00b39401f8fc60c25b651e6912eb0a Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 11 Jul 2024 17:50:59 -0700 Subject: [PATCH 42/55] Run live migration directly on the state driver task (#720) Run live migration protocols on the state driver's tokio task without using `block_on`: - Define `SourceProtocol` and `DestinationProtocol` traits that describe the routines the state driver uses to run a generic migration irrespective of its protocol version. (This will be useful for protocol versioning later.) - Move the `migrate::source_start` and `migrate::dest_initiate` routines into factory functions that connect to the peer Propolis, negotiate protocol versions, and return an appropriate protocol impl. - Use the protocol impls to run migration on the state driver task. Remove all the types and constructs used to pass messages between it and migration tasks. 
Also, improve the interface between the `vm` and `migrate` modules for inbound migrations by defining some objects that migrations can use either to fully initialize a VM or to unwind correctly if migration fails. This allows migration to take control of when precisely a VM's components get created (and from what spec) without exposing to the migration task all the complexity of unwinding from a failed attempt to create a VM. Tested via full PHD run with a Debian 11 guest. --- .../src/lib/migrate/destination.rs | 469 ++++++++++++------ bin/propolis-server/src/lib/migrate/mod.rs | 205 +------- bin/propolis-server/src/lib/migrate/source.rs | 435 +++++++++------- bin/propolis-server/src/lib/vm/ensure.rs | 364 ++++++++++++++ bin/propolis-server/src/lib/vm/mod.rs | 36 +- bin/propolis-server/src/lib/vm/objects.rs | 22 +- .../src/lib/vm/state_driver.rs | 290 +++++------ .../src/lib/vm/state_publisher.rs | 8 +- phd-tests/tests/src/migrate.rs | 18 + 9 files changed, 1107 insertions(+), 740 deletions(-) create mode 100644 bin/propolis-server/src/lib/vm/ensure.rs diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index a423667b3..6d683c9a6 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -9,13 +9,16 @@ use propolis::migrate::{ MigrateCtx, MigrateStateError, Migrator, PayloadOffer, PayloadOffers, }; use propolis::vmm; +use propolis_api_types::InstanceMigrateInitiateRequest; use slog::{error, info, trace, warn}; use std::convert::TryInto; use std::io; use std::net::SocketAddr; use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_tungstenite::WebSocketStream; +use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode; +use tokio_tungstenite::tungstenite::protocol::CloseFrame; +use tokio_tungstenite::{tungstenite, WebSocketStream}; +use uuid::Uuid; use crate::migrate::codec; use crate::migrate::memx; @@ -24,176 +27,311 @@ use crate::migrate::probes; use crate::migrate::{ Device, MigrateError, MigratePhase, MigrateRole, MigrationState, PageIter, }; -use crate::vm::migrate_commands::MigrateTargetCommand; -use crate::vm::migrate_commands::MigrateTargetResponse; +use crate::vm::ensure::{VmEnsureActive, VmEnsureNotStarted}; +use crate::vm::state_publisher::{ + ExternalStateUpdate, MigrationStateUpdate, StatePublisher, +}; use super::protocol::Protocol; +use super::MigrateConn; + +/// The interface to an arbitrary version of the target half of the live +/// migration protocol. +// +// Use `async_trait` here to help generate a `Send` bound on the futures +// returned by the functions in this trait. +#[async_trait::async_trait] +pub(crate) trait DestinationProtocol { + /// Runs live migration as a target, attempting to create a set of VM + /// objects in the process. On success, returns an "active VM" placeholder + /// that the caller can use to set up and start a state driver loop. + async fn run<'ensure>( + mut self, + ensure: VmEnsureNotStarted<'ensure>, + ) -> Result, MigrateError>; +} -/// Launches an attempt to migrate into a supplied instance using the supplied -/// source connection. -pub async fn migrate( +/// Connects to a live migration source using the migration request information +/// in `migrate_info`, then negotiates a protocol version with that source. +/// Returns a [`DestinationProtocol`] implementation for the negotiated version +/// that the caller can use to run the migration. 
+pub(crate) async fn initiate( log: &slog::Logger, - command_tx: tokio::sync::mpsc::Sender, - response_rx: tokio::sync::mpsc::Receiver, - conn: WebSocketStream, + migrate_info: &InstanceMigrateInitiateRequest, local_addr: SocketAddr, - protocol: Protocol, -) -> Result<(), MigrateError> { - let err_tx = command_tx.clone(); - let log = log.new(slog::o!("component" => "migration_target_protocol")); - let mut proto = match protocol { - Protocol::RonV0 => DestinationProtocol::new( - log, - command_tx, - response_rx, - conn, - local_addr, - ), - }; +) -> Result { + let migration_id = migrate_info.migration_id; + + let log = log.new(slog::o!( + "migration_id" => migration_id.to_string(), + "migrate_role" => "destination", + "migrate_src_addr" => migrate_info.src_addr + )); + + info!(log, "negotiating migration as destination"); + + // Build upgrade request to the source instance + // (we do this by hand because it's hidden from the OpenAPI spec) + // TODO(#165): https (wss) + // TODO: We need to make sure the src_addr is a valid target + let src_migrate_url = format!( + "ws://{}/instance/migrate/{}/start", + migrate_info.src_addr, migration_id, + ); + info!(log, "Begin migration"; "src_migrate_url" => &src_migrate_url); + let (mut conn, _) = + tokio_tungstenite::connect_async(src_migrate_url).await?; + + // Generate a list of protocols that this target supports, then send them to + // the source and allow it to choose its favorite. + let dst_protocols = super::protocol::make_protocol_offer(); + conn.send(tungstenite::Message::Text(dst_protocols)).await?; + let src_selected = match conn.next().await { + Some(Ok(tungstenite::Message::Text(selected))) => selected, + x => { + error!( + log, + "source instance failed to negotiate protocol version: {:?}", x + ); - if let Err(err) = proto.run().await { - let _ = err_tx - .send(MigrateTargetCommand::UpdateState(MigrationState::Error)) - .await; - - // We encountered an error, try to inform the remote before bailing - // Note, we don't use `?` here as this is a best effort and we don't - // want an error encountered during this send to shadow the run error - // from the caller. - if let Ok(e) = codec::Message::Error(err.clone()).try_into() { - let _ = proto.conn.send(e).await; + // Tell the source about its mistake. This is best-effort. + if let Err(e) = conn + .send(tungstenite::Message::Close(Some(CloseFrame { + code: CloseCode::Protocol, + reason: "did not respond to version handshake.".into(), + }))) + .await + { + warn!(log, "failed to send handshake failure to source"; + "error" => ?e); + } + + return Err(MigrateError::Initiate); } - return Err(err); - } + }; + + // Make sure the source's selected protocol parses correctly and is in the + // list of protocols this target supports. If the source's choice is valid, + // use the protocol it picked. 
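The offer/select handshake sketched in the comments above is, at bottom, string matching: the target offers its protocol list, the source picks one, and the target checks the pick against what it offered. A toy version of that check (not the real `protocol` module's encoding):

    fn make_offer(supported: &[&str]) -> String {
        supported.join(",")
    }

    fn validate_selection<'a>(offer: &'a str, selected: &str) -> Option<&'a str> {
        // A selection is valid only if it appears verbatim in our offer.
        offer.split(',').find(|candidate| *candidate == selected)
    }

    fn main() {
        let offer = make_offer(&["ron-v0"]);
        assert_eq!(validate_selection(&offer, "ron-v0"), Some("ron-v0"));
        assert_eq!(validate_selection(&offer, "ron-v9"), None);
    }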
+ let selected = + match super::protocol::select_protocol_from_offer(&src_selected) { + Ok(Some(selected)) => selected, + Ok(None) => { + let offered = super::protocol::make_protocol_offer(); + error!(log, "source selected protocol not on offer"; + "offered" => &offered, + "selected" => &src_selected); + + return Err(MigrateError::NoMatchingProtocol( + src_selected, + offered, + )); + } + Err(e) => { + error!(log, "source selected protocol failed to parse"; + "selected" => &src_selected); + + return Err(MigrateError::ProtocolParse( + src_selected, + e.to_string(), + )); + } + }; - Ok(()) + Ok(match selected { + Protocol::RonV0 => RonV0::new(log, migration_id, conn, local_addr), + }) } -struct DestinationProtocol { +/// The runner for version 0 of the LM protocol, using RON encoding. +struct RonV0 { + /// The ID for this migration. + migration_id: Uuid, + /// The logger for messages from this protocol. log: slog::Logger, /// The channel to use to send messages to the state worker coordinating /// this migration. - command_tx: tokio::sync::mpsc::Sender, - - /// The channel that receives responses from the state worker coordinating - /// this migration. - response_rx: tokio::sync::mpsc::Receiver, - - /// Transport to the source Instance. conn: WebSocketStream, /// Local propolis-server address /// (to inform the source-side where to redirect its clients) local_addr: SocketAddr, +} + +#[async_trait::async_trait] +impl DestinationProtocol for RonV0 { + async fn run<'ensure>( + mut self, + mut ensure: VmEnsureNotStarted<'ensure>, + ) -> Result, MigrateError> { + info!(self.log(), "entering destination migration task"); + + let result = async { + // Run the sync phase to ensure that the source's instance spec is + // compatible with the spec supplied in the ensure parameters. + if let Err(e) = self.run_sync_phases(&mut ensure).await { + self.update_state( + ensure.state_publisher(), + MigrationState::Error, + ); + let e = ensure.fail(e.into()).await; + return Err(e + .downcast::() + .expect("original error was a MigrateError")); + } + + // The sync phase succeeded, so it's OK to go ahead with creating + // the objects in the target's instance spec. + let mut objects_created = + ensure.create_objects().await.map_err(|e| { + MigrateError::TargetInstanceInitializationFailed( + e.to_string(), + ) + })?; + objects_created.prepare_for_migration().await; + let mut ensure = objects_created.ensure_active().await; + + // Now that the VM's objects exist, run the rest of the protocol to + // import state into them. + if let Err(e) = self.run_import_phases(&mut ensure).await { + self.update_state( + ensure.state_publisher(), + MigrationState::Error, + ); + ensure.fail().await; + return Err(e); + } - /// The VM objects into which to import the source VM's state. Only - /// initialized after the sync phase. - vm_objects: Option>, + Ok(ensure) + } + .await; + + match result { + Ok(vm) => { + info!(self.log(), "migration in succeeded"); + Ok(vm) + } + Err(err) => { + error!(self.log(), "migration in failed"; "error" => ?err); + + // We encountered an error, try to inform the remote before + // bailing Note, we don't use `?` here as this is a best effort + // and we don't want an error encountered during this send to + // shadow the run error from the caller. 
+ if let Ok(e) = codec::Message::Error(err.clone()).try_into() { + let _ = self.conn.send(e).await; + } + Err(err) + } + } + } } -impl DestinationProtocol { +impl RonV0 { fn new( log: slog::Logger, - command_tx: tokio::sync::mpsc::Sender, - response_rx: tokio::sync::mpsc::Receiver, + migration_id: Uuid, conn: WebSocketStream, local_addr: SocketAddr, ) -> Self { - Self { - log, - command_tx, - response_rx, - conn, - local_addr, - vm_objects: None, - } + Self { log, migration_id, conn, local_addr } } fn log(&self) -> &slog::Logger { &self.log } - async fn update_state(&mut self, state: MigrationState) { - // When migrating into an instance, the VM state worker blocks waiting - // for the disposition of the migration attempt, so the channel should - // never be closed before the attempt completes. - self.command_tx - .send(MigrateTargetCommand::UpdateState(state)) - .await - .unwrap(); + fn update_state( + &self, + publisher: &mut StatePublisher, + state: MigrationState, + ) { + publisher.update(ExternalStateUpdate::Migration( + MigrationStateUpdate { + state, + id: self.migration_id, + role: MigrateRole::Destination, + }, + )); } - async fn run_phase( + async fn run_sync_phases( &mut self, - step: MigratePhase, + ensure_ctx: &mut VmEnsureNotStarted<'_>, ) -> Result<(), MigrateError> { - probes::migrate_phase_begin!(|| { step.to_string() }); - - let res = match step { - MigratePhase::MigrateSync => self.sync().await, - - // no pause step on the dest side - MigratePhase::Pause => unreachable!(), - MigratePhase::RamPushPrePause | MigratePhase::RamPushPostPause => { - self.ram_push(&step).await - } - MigratePhase::DeviceState => self.device_state().await, - MigratePhase::TimeData => self.time_data().await, - MigratePhase::RamPull => self.ram_pull().await, - MigratePhase::ServerState => self.server_state().await, - MigratePhase::Finish => self.finish().await, - }; + let step = MigratePhase::MigrateSync; + probes::migrate_phase_begin!(|| { step.to_string() }); + self.sync(ensure_ctx).await?; probes::migrate_phase_end!(|| { step.to_string() }); - res + Ok(()) } - async fn run(&mut self) -> Result<(), MigrateError> { - info!(self.log(), "Entering Destination Migration Task"); - - self.run_phase(MigratePhase::MigrateSync).await?; - + async fn run_import_phases( + &mut self, + ensure_ctx: &mut VmEnsureActive<'_>, + ) -> Result<(), MigrateError> { // The RAM transfer phase runs twice, once before the source pauses and // once after. There is no explicit pause phase on the destination, // though, so that step does not appear here even though there are // pre- and post-pause steps. - self.run_phase(MigratePhase::RamPushPrePause).await?; - self.run_phase(MigratePhase::RamPushPostPause).await?; + self.run_import_phase(MigratePhase::RamPushPrePause, ensure_ctx) + .await?; + self.run_import_phase(MigratePhase::RamPushPostPause, ensure_ctx) + .await?; // Import of the time data *must* be done before we import device // state: the proper functioning of device timers depends on an adjusted // boot_hrtime. 
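The begin/end probe bracketing used by `run_import_phase` above is a small higher-order pattern worth isolating. A generic sketch, with `println!` standing in for the crate's DTrace probes (hypothetical helper, not part of the patch):

    use std::fmt::Display;
    use std::future::Future;

    async fn run_phase<T, E>(
        name: impl Display,
        phase: impl Future<Output = Result<T, E>>,
    ) -> Result<T, E> {
        // Stand-ins for migrate_phase_begin!/migrate_phase_end!.
        println!("phase begin: {name}");
        let res = phase.await;
        println!("phase end: {name}");
        res
    }

Note that the end probe fires whether the phase succeeded or failed, which matches how the real code records the end of a phase before propagating the result.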
- self.run_phase(MigratePhase::TimeData).await?; - self.run_phase(MigratePhase::DeviceState).await?; - self.run_phase(MigratePhase::RamPull).await?; - self.run_phase(MigratePhase::ServerState).await?; - self.run_phase(MigratePhase::Finish).await?; - - info!(self.log(), "Destination Migration Successful"); + self.run_import_phase(MigratePhase::TimeData, ensure_ctx).await?; + self.run_import_phase(MigratePhase::DeviceState, ensure_ctx).await?; + self.run_import_phase(MigratePhase::RamPull, ensure_ctx).await?; + self.run_import_phase(MigratePhase::ServerState, ensure_ctx).await?; + self.run_import_phase(MigratePhase::Finish, ensure_ctx).await?; Ok(()) } - async fn sync(&mut self) -> Result<(), MigrateError> { - self.command_tx - .send(MigrateTargetCommand::InitializeFromExternalSpec) - .await - .map_err(|_| MigrateError::StateDriverChannelClosed)?; + async fn run_import_phase( + &mut self, + step: MigratePhase, + ensure_ctx: &mut VmEnsureActive<'_>, + ) -> Result<(), MigrateError> { + probes::migrate_phase_begin!(|| { step.to_string() }); - let MigrateTargetResponse::VmObjectsInitialized(vm_objects) = self - .response_rx - .recv() - .await - .ok_or(MigrateError::StateDriverChannelClosed)?; + let res = match step { + MigratePhase::MigrateSync => { + unreachable!("sync phase runs before import") + } + + // no pause step on the dest side + MigratePhase::Pause => { + unreachable!("no explicit pause phase on dest") + } + + MigratePhase::RamPushPrePause | MigratePhase::RamPushPostPause => { + self.ram_push(&step, ensure_ctx).await + } + MigratePhase::DeviceState => self.device_state(ensure_ctx).await, + MigratePhase::TimeData => self.time_data(ensure_ctx).await, + MigratePhase::RamPull => self.ram_pull(ensure_ctx).await, + MigratePhase::ServerState => self.server_state(ensure_ctx).await, + MigratePhase::Finish => self.finish(ensure_ctx).await, + }; - let vm_objects = vm_objects - .map_err(MigrateError::TargetInstanceInitializationFailed)?; + probes::migrate_phase_end!(|| { step.to_string() }); - self.vm_objects = Some(vm_objects); - self.update_state(MigrationState::Sync).await; + res + } + + async fn sync( + &mut self, + ensure_ctx: &mut VmEnsureNotStarted<'_>, + ) -> Result<(), MigrateError> { + self.update_state(ensure_ctx.state_publisher(), MigrationState::Sync); let preamble: Preamble = match self.read_msg().await? { codec::Message::Serialized(s) => { Ok(ron::de::from_str(&s).map_err(codec::ProtocolError::from)?) 
@@ -208,14 +346,9 @@ impl DestinationProtocol { }?; info!(self.log(), "Destination read Preamble: {:?}", preamble); - if let Err(e) = preamble.is_migration_compatible( - self.vm_objects - .as_ref() - .unwrap() - .lock_shared() - .await - .instance_spec(), - ) { + if let Err(e) = + preamble.is_migration_compatible(ensure_ctx.instance_spec()) + { error!( self.log(), "Source and destination instance specs incompatible: {}", e @@ -229,17 +362,15 @@ impl DestinationProtocol { async fn ram_push( &mut self, phase: &MigratePhase, + ensure_ctx: &mut VmEnsureActive<'_>, ) -> Result<(), MigrateError> { - match phase { - MigratePhase::RamPushPrePause => { - self.update_state(MigrationState::RamPush).await - } - MigratePhase::RamPushPostPause => { - self.update_state(MigrationState::RamPushDirty).await - } + let state = match phase { + MigratePhase::RamPushPrePause => MigrationState::RamPush, + MigratePhase::RamPushPostPause => MigrationState::RamPushDirty, _ => unreachable!("should only push RAM in a RAM push phase"), - } + }; + self.update_state(ensure_ctx.state_publisher(), state); let (dirty, highest) = self.query_ram().await?; for (k, region) in dirty.as_raw_slice().chunks(4096).enumerate() { if region.iter().all(|&b| b == 0) { @@ -276,13 +407,13 @@ impl DestinationProtocol { // space or non-existent RAM regions. While we de facto // do not because of the way access is implemented, we // should probably disallow it at the protocol level. - self.xfer_ram(start, end, &bits).await?; + self.xfer_ram(ensure_ctx, start, end, &bits).await?; } _ => return Err(MigrateError::UnexpectedMessage), }; } self.send_msg(codec::Message::MemDone).await?; - self.update_state(MigrationState::Pause).await; + self.update_state(ensure_ctx.state_publisher(), MigrationState::Pause); Ok(()) } @@ -329,6 +460,7 @@ impl DestinationProtocol { async fn xfer_ram( &mut self, + ensure_ctx: &VmEnsureActive<'_>, start: u64, end: u64, bits: &[u8], @@ -336,13 +468,16 @@ impl DestinationProtocol { info!(self.log(), "ram_push: xfer RAM between {} and {}", start, end); for addr in PageIter::new(start, end, bits) { let bytes = self.read_page().await?; - self.write_guest_ram(GuestAddr(addr), &bytes).await?; + self.write_guest_ram(ensure_ctx, GuestAddr(addr), &bytes).await?; } Ok(()) } - async fn device_state(&mut self) -> Result<(), MigrateError> { - self.update_state(MigrationState::Device).await; + async fn device_state( + &mut self, + ensure_ctx: &mut VmEnsureActive<'_>, + ) -> Result<(), MigrateError> { + self.update_state(ensure_ctx.state_publisher(), MigrationState::Device); let devices: Vec<Device> = match self.read_msg().await?
{ codec::Message::Serialized(encoded) => { @@ -359,22 +494,20 @@ impl DestinationProtocol { info!(self.log(), "Devices: {devices:#?}"); { - let objects = self.vm_objects.as_ref().unwrap().lock_shared().await; + let vm_objects = ensure_ctx.vm_objects().lock_shared().await; let migrate_ctx = - MigrateCtx { mem: &objects.access_mem().unwrap() }; + MigrateCtx { mem: &vm_objects.access_mem().unwrap() }; for device in devices { info!( self.log(), "Applying state to device {}", device.instance_name ); - let target = objects + let target = vm_objects .device_by_name(&device.instance_name) .ok_or_else(|| { - MigrateError::UnknownDevice( - device.instance_name.clone(), - ) - })?; + MigrateError::UnknownDevice(device.instance_name.clone()) + })?; self.import_device(&target, &device, &migrate_ctx)?; } } @@ -384,7 +517,10 @@ impl DestinationProtocol { // Get the guest time data from the source, make updates to it based on the // new host, and write the data out to bhyve. - async fn time_data(&mut self) -> Result<(), MigrateError> { + async fn time_data( + &mut self, + ensure_ctx: &VmEnsureActive<'_>, + ) -> Result<(), MigrateError> { // Read time data sent by the source and deserialize let raw: String = match self.read_msg().await? { codec::Message::Serialized(encoded) => encoded, @@ -411,14 +547,8 @@ impl DestinationProtocol { // Take a snapshot of the host hrtime/wall clock time, then adjust // time data appropriately. - let vmm_hdl = &self - .vm_objects - .as_ref() - .unwrap() - .lock_shared() - .await - .vmm_hdl() - .clone(); + let vmm_hdl = + &ensure_ctx.vm_objects().lock_shared().await.vmm_hdl().clone(); let (dst_hrt, dst_wc) = vmm::time::host_time_snapshot(vmm_hdl) .map_err(|e| { @@ -589,16 +719,25 @@ impl DestinationProtocol { Ok(()) } - async fn ram_pull(&mut self) -> Result<(), MigrateError> { - self.update_state(MigrationState::RamPull).await; + async fn ram_pull( + &mut self, + ensure_ctx: &mut VmEnsureActive<'_>, + ) -> Result<(), MigrateError> { + self.update_state( + ensure_ctx.state_publisher(), + MigrationState::RamPull, + ); self.send_msg(codec::Message::MemQuery(0, !0)).await?; let m = self.read_msg().await?; info!(self.log(), "ram_pull: got end {:?}", m); self.send_msg(codec::Message::MemDone).await } - async fn server_state(&mut self) -> Result<(), MigrateError> { - self.update_state(MigrationState::Server).await; + async fn server_state( + &mut self, + ensure_ctx: &mut VmEnsureActive<'_>, + ) -> Result<(), MigrateError> { + self.update_state(ensure_ctx.state_publisher(), MigrationState::Server); self.send_msg(codec::Message::Serialized( ron::to_string(&self.local_addr) .map_err(codec::ProtocolError::from)?, @@ -612,9 +751,8 @@ impl DestinationProtocol { } }; - self.vm_objects - .as_ref() - .unwrap() + ensure_ctx + .vm_objects() .lock_shared() .await .com1() @@ -625,7 +763,10 @@ impl DestinationProtocol { self.send_msg(codec::Message::Okay).await } - async fn finish(&mut self) -> Result<(), MigrateError> { + async fn finish( + &mut self, + ensure_ctx: &mut VmEnsureActive<'_>, + ) -> Result<(), MigrateError> { // Tell the source this destination is ready to run the VM. self.send_msg(codec::Message::Okay).await?; @@ -635,10 +776,9 @@ impl DestinationProtocol { // that it should resume the VM. self.read_ok().await?; - // Now that control is definitely being transferred, publish that the - // migration has succeeded.
- drop(self.vm_objects.take()); - self.update_state(MigrationState::Finish).await; + // The source has acknowledged the migration is complete, so it's safe + // to declare victory publicly. + self.update_state(ensure_ctx.state_publisher(), MigrationState::Finish); Ok(()) } @@ -693,10 +833,11 @@ impl DestinationProtocol { async fn write_guest_ram( &mut self, + ensure_ctx: &VmEnsureActive<'_>, addr: GuestAddr, buf: &[u8], ) -> Result<(), MigrateError> { - let objects = self.vm_objects.as_ref().unwrap().lock_shared().await; + let objects = ensure_ctx.vm_objects().lock_shared().await; let memctx = objects.access_mem().unwrap(); let len = buf.len(); memctx.write_from(addr, buf, len); diff --git a/bin/propolis-server/src/lib/migrate/mod.rs b/bin/propolis-server/src/lib/migrate/mod.rs index 9f50ff854..a59e8c85c 100644 --- a/bin/propolis-server/src/lib/migrate/mod.rs +++ b/bin/propolis-server/src/lib/migrate/mod.rs @@ -2,21 +2,14 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::net::SocketAddr; - use bit_field::BitField; use dropshot::HttpError; -use futures::{SinkExt, StreamExt}; use propolis::migrate::MigrateStateError; -use propolis_api_types::{self as api, MigrationState}; +use propolis_api_types::MigrationState; use serde::{Deserialize, Serialize}; -use slog::{error, info, o}; +use slog::error; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode; -use tokio_tungstenite::tungstenite::protocol::CloseFrame; -use tokio_tungstenite::{tungstenite, WebSocketStream}; -use uuid::Uuid; mod codec; pub mod destination; @@ -25,6 +18,15 @@ mod preamble; pub mod protocol; pub mod source; +/// Trait bounds for connection objects used in live migrations. +pub(crate) trait MigrateConn: + AsyncRead + AsyncWrite + Unpin + Send +{ +} + +impl MigrateConn for tokio_tungstenite::MaybeTlsStream {} +impl MigrateConn for hyper::upgrade::Upgraded {} + #[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] pub enum MigrateRole { Source, @@ -141,11 +143,6 @@ pub enum MigrateError { /// The other end of the migration ran into an error #[error("{0:?} migration instance encountered error: {1}")] RemoteError(MigrateRole, String), - - /// Sending/receiving from the VM state driver command/response channels - /// returned an error. - #[error("VM state driver unexpectedly closed channel")] - StateDriverChannelClosed, } impl From for MigrateError { @@ -183,8 +180,7 @@ impl From for HttpError { | MigrateError::TimeData(_) | MigrateError::DeviceState(_) | MigrateError::RemoteError(_, _) - | MigrateError::StateMachine(_) - | MigrateError::StateDriverChannelClosed => { + | MigrateError::StateMachine(_) => { HttpError::for_internal_error(msg) } MigrateError::MigrationAlreadyInProgress @@ -219,183 +215,6 @@ struct DevicePayload { pub data: String, } -pub(crate) struct SourceContext< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, -> { - pub conn: WebSocketStream, - pub protocol: crate::migrate::protocol::Protocol, -} - -/// Begin the migration process (source-side). -/// -/// This will check protocol version and then begin the migration in a separate task. 
-pub async fn source_start< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, ->( - log: &slog::Logger, - migration_id: Uuid, - mut conn: WebSocketStream, -) -> Result, MigrateError> { - // Create a new log context for the migration - let log = log.new(o!( - "migration_id" => migration_id.to_string(), - "migrate_role" => "source" - )); - info!(log, "Migration Source"); - - let selected = match conn.next().await { - Some(Ok(tungstenite::Message::Text(dst_protocols))) => { - info!(log, "destination offered protocols: {}", dst_protocols); - match protocol::select_protocol_from_offer(&dst_protocols) { - Ok(Some(selected)) => { - info!(log, "selected protocol {:?}", selected); - conn.send(tungstenite::Message::Text( - selected.offer_string(), - )) - .await?; - selected - } - Ok(None) => { - let src_protocols = protocol::make_protocol_offer(); - error!( - log, - "no compatible destination protocols"; - "dst_protocols" => &dst_protocols, - "src_protocols" => &src_protocols, - ); - return Err(MigrateError::NoMatchingProtocol( - src_protocols, - dst_protocols, - )); - } - Err(e) => { - error!(log, "failed to parse destination protocol offer"; - "dst_protocols" => &dst_protocols, - "error" => %e); - return Err(MigrateError::ProtocolParse( - dst_protocols, - e.to_string(), - )); - } - } - } - x => { - conn.send(tungstenite::Message::Close(Some(CloseFrame { - code: CloseCode::Protocol, - reason: "did not begin with version handshake.".into(), - }))) - .await?; - error!( - log, - "destination side did not begin migration version handshake: \ - {:?}", - x - ); - return Err(MigrateError::Initiate); - } - }; - - Ok(SourceContext { conn, protocol: selected }) -} - -pub(crate) struct DestinationContext< - T: AsyncRead + AsyncWrite + Unpin + Send + 'static, -> { - pub migration_id: Uuid, - pub conn: WebSocketStream, - pub local_addr: SocketAddr, - pub protocol: crate::migrate::protocol::Protocol, -} - -/// Initiate a migration to the given source instance. -/// -/// This will attempt to open a websocket to the given source instance and -/// check that the migrate protocol version is compatible ("equal" presently). -/// Once we've successfully established the connection, we can begin the -/// migration process (destination-side). 
-pub(crate) async fn dest_initiate( - log: &slog::Logger, - migrate_info: &api::InstanceMigrateInitiateRequest, - local_server_addr: SocketAddr, -) -> Result< - DestinationContext< - tokio_tungstenite::MaybeTlsStream, - >, - MigrateError, -> { - let migration_id = migrate_info.migration_id; - - // Create a new log context for the migration - let log = log.new(o!( - "migration_id" => migration_id.to_string(), - "migrate_role" => "destination", - "migrate_src_addr" => migrate_info.src_addr - )); - info!(log, "Migration Destination"); - - // Build upgrade request to the source instance - // (we do this by hand because it's hidden from the OpenAPI spec) - // TODO(#165): https (wss) - // TODO: We need to make sure the src_addr is a valid target - let src_migrate_url = format!( - "ws://{}/instance/migrate/{}/start", - migrate_info.src_addr, migration_id, - ); - info!(log, "Begin migration"; "src_migrate_url" => &src_migrate_url); - let (mut conn, _) = - tokio_tungstenite::connect_async(src_migrate_url).await?; - - let dst_protocols = protocol::make_protocol_offer(); - conn.send(tungstenite::Message::Text(dst_protocols)).await?; - let selected = match conn.next().await { - Some(Ok(tungstenite::Message::Text(selected_protocol))) => { - info!(log, "source negotiated protocol {}", selected_protocol); - match protocol::select_protocol_from_offer(&selected_protocol) { - Ok(Some(selected)) => selected, - Ok(None) => { - let offered = protocol::make_protocol_offer(); - error!(log, "source selected protocol not on offer"; - "offered" => &offered, - "selected" => &selected_protocol); - - return Err(MigrateError::NoMatchingProtocol( - selected_protocol, - offered, - )); - } - Err(e) => { - error!(log, "source selected protocol failed to parse"; - "selected" => &selected_protocol); - - return Err(MigrateError::ProtocolParse( - selected_protocol, - e.to_string(), - )); - } - } - } - x => { - conn.send(tungstenite::Message::Close(Some(CloseFrame { - code: CloseCode::Protocol, - reason: "did not respond to version handshake.".into(), - }))) - .await?; - error!( - log, - "source instance failed to negotiate protocol version: {:?}", x - ); - return Err(MigrateError::Initiate); - } - }; - - Ok(DestinationContext { - migration_id, - conn, - local_addr: local_server_addr, - protocol: selected, - }) -} - // We should probably turn this into some kind of ValidatedBitmap // data structure, so that we're only parsing it once. 
struct PageIter<'a> { diff --git a/bin/propolis-server/src/lib/migrate/source.rs b/bin/propolis-server/src/lib/migrate/source.rs index 6da0e14b8..a8c3c5d9a 100644 --- a/bin/propolis-server/src/lib/migrate/source.rs +++ b/bin/propolis-server/src/lib/migrate/source.rs @@ -10,30 +10,33 @@ use propolis::migrate::{ }; use propolis::vmm; use propolis_api_types::instance_spec::VersionedInstanceSpec; -use slog::{debug, error, info, trace}; +use slog::{debug, error, info, trace, warn}; use std::collections::HashMap; use std::convert::TryInto; use std::io; use std::ops::{Range, RangeInclusive}; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_tungstenite::WebSocketStream; +use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode; +use tokio_tungstenite::tungstenite::protocol::CloseFrame; +use tokio_tungstenite::{tungstenite, WebSocketStream}; +use uuid::Uuid; -use crate::migrate::codec; use crate::migrate::codec::Message; use crate::migrate::memx; use crate::migrate::preamble::Preamble; use crate::migrate::probes; use crate::migrate::protocol::Protocol; +use crate::migrate::{codec, protocol}; use crate::migrate::{ Device, DevicePayload, MigrateError, MigratePhase, MigrateRole, MigrationState, PageIter, }; -use crate::vm::migrate_commands::{ - MigrateSourceCommand, MigrateSourceResponse, -}; use crate::vm::objects::VmObjects; +use crate::vm::state_publisher::{ + ExternalStateUpdate, MigrationStateUpdate, StatePublisher, +}; + +use super::MigrateConn; /// Specifies which pages should be offered during a RAM transfer phase. /// @@ -114,77 +117,116 @@ enum RamOfferDiscipline { OfferDirty, } -pub async fn migrate( - vm: Arc, - command_tx: tokio::sync::mpsc::Sender, - response_rx: tokio::sync::mpsc::Receiver, - conn: WebSocketStream, - protocol: super::protocol::Protocol, -) -> Result<(), MigrateError> { - let err_tx = command_tx.clone(); - let mut proto = match protocol { - Protocol::RonV0 => { - SourceProtocol::new(vm, command_tx, response_rx, conn).await - } - }; +/// The interface to an arbitrary version of the source half of the live +/// migration protocol. +// +// Use `async_trait` here to help generate a `Send` bound on the futures +// returned by the functions in this trait. +#[async_trait::async_trait] +pub(crate) trait SourceProtocol { + /// Runs live migration out of the supplied `vm_objects`, writing back any + /// state that must be saved for future migration attempts to + /// `persistent_state`. + /// + /// This routine guarantees that the supplied `vm_objects` are paused on + /// success and resumed on failure. + async fn run( + self, + vm_objects: &VmObjects, + publisher: &mut StatePublisher, + persistent_state: &mut PersistentState, + ) -> Result<(), MigrateError>; +} - if let Err(err) = proto.run().await { - err_tx - .send(MigrateSourceCommand::UpdateState(MigrationState::Error)) - .await - .unwrap(); - - // We encountered an error, try to inform the remote before bailing - // Note, we don't use `?` here as this is a best effort and we don't - // want an error encountered during this send to shadow the run error - // from the caller. - let _ = proto.send_msg(codec::Message::Error(err.clone())).await; - - // If we are capable of setting the dirty bit on guest page table - // entries, re-dirty them, so that a later migration attempt can also - // only offer dirty pages. If we can't use VM_NPT_OPERATION, a - // subsequent migration attempt will offer all pages. 
- // - // See the lengthy comment on `RamOfferDiscipline` above for more - // details about what's going on here. - { - let objects = proto.vm.lock_shared().await; - for (&GuestAddr(gpa), dirtiness) in proto.dirt.iter().flatten() { - if let Err(e) = - objects.vmm_hdl().set_dirty_pages(gpa, dirtiness) - { - // Bad news! Our attempt to re-set the dirty bit on these - // pages has failed! Thus, subsequent migration attempts - // /!\ CAN NO LONGER RELY ON DIRTY PAGE TRACKING /!\ - // and must always offer all pages in the initial RAM push - // phase. - // - // Record that now so we never try to do this again. - proto - .command_tx - .send(MigrateSourceCommand::RedirtyingFailed) - .await - .unwrap(); - // .map_err(|_| MigrateError::StateDriverChannelClosed)?; +/// Negotiates a live migration protocol version with a target who has connected +/// over `conn`. If this is successful, returns a `SourceProtocol` +/// implementation that can be used to run the requested migration. +pub(crate) async fn initiate( + log: &slog::Logger, + migration_id: Uuid, + mut conn: WebSocketStream, + vm_objects: &VmObjects, + persistent_state: &PersistentState, +) -> Result { + // Create a new log context for the migration + let log = log.new(slog::o!( + "migration_id" => migration_id.to_string(), + "migrate_role" => "source" + )); + info!(log, "negotiating migration as source"); + + // The protocol should start with some text from the destination identifying + // the protocol versions it supports. + let dst_protocols = match conn.next().await { + Some(Ok(tungstenite::Message::Text(dst_protocols))) => dst_protocols, + x => { + error!( + log, + "destination side did not begin migration version handshake: \ + {:?}", + x + ); - error!( - proto.log(), - "failed to restore dirty bits: {e}"; - "gpa" => gpa, - ); - // No sense continuing to try putting back any remaining - // dirty bits, as we won't be using them any longer. - break; - } else { - debug!(proto.log(), "re-dirtied pages at {gpa:#x}",); - } + // Tell the destination it misbehaved. This is best-effort. + if let Err(e) = conn + .send(tungstenite::Message::Close(Some(CloseFrame { + code: CloseCode::Protocol, + reason: "did not begin with version handshake.".into(), + }))) + .await + { + warn!(log, "failed to send handshake failed message to source"; + "error" => ?e); } + + return Err(MigrateError::Initiate); } + }; - return Err(err); - } + // Pick the most favorable protocol from the list the destination supplied + // and send it back to the destination. 
+ info!(log, "destination offered protocols: {}", dst_protocols); + let selected = match protocol::select_protocol_from_offer(&dst_protocols) { + Ok(Some(selected)) => { + conn.send(tungstenite::Message::Text(selected.offer_string())) + .await?; + selected + } + Ok(None) => { + let src_protocols = protocol::make_protocol_offer(); + error!( + log, + "no compatible destination protocols"; + "dst_protocols" => &dst_protocols, + "src_protocols" => &src_protocols, + ); + return Err(MigrateError::NoMatchingProtocol( + src_protocols, + dst_protocols, + )); + } + Err(e) => { + error!(log, "failed to parse destination protocol offer"; + "dst_protocols" => &dst_protocols, + "error" => %e); + return Err(MigrateError::ProtocolParse( + dst_protocols, + e.to_string(), + )); + } + }; - Ok(()) + info!(log, "selected protocol {:?}", selected); + match selected { + Protocol::RonV0 => Ok(RonV0::new( + log, + vm_objects, + migration_id, + conn, + persistent_state, + ) + .await), + } } /// State which must be stored across multiple migration attempts. @@ -201,17 +243,13 @@ pub(crate) struct PersistentState { pub(crate) has_redirtying_ever_failed: bool, } -struct SourceProtocol { - /// The source instance's VM objects. - vm: Arc, +/// Context for the source side of protocol version 0 using the RON encoding. +struct RonV0 { + /// The logger to which to log messages from this migration attempt. + log: slog::Logger, - /// The channel to use to send messages to the state worker coordinating - /// this migration. - command_tx: tokio::sync::mpsc::Sender, - - /// The channel to use to receive messages from the state worker - /// coordinating this migration. - response_rx: tokio::sync::mpsc::Receiver, + /// The migration's ID. + migration_id: Uuid, /// Transport to the destination Instance. conn: WebSocketStream, @@ -241,12 +279,13 @@ struct SourceProtocol { const PAGE_BITMAP_SIZE: usize = 4096; type PageBitmap = [u8; PAGE_BITMAP_SIZE]; -impl SourceProtocol { +impl RonV0 { async fn new( - vm: Arc, - command_tx: tokio::sync::mpsc::Sender, - response_rx: tokio::sync::mpsc::Receiver, + log: slog::Logger, + vm: &VmObjects, + migration_id: Uuid, conn: WebSocketStream, + persistent_state: &PersistentState, ) -> Self { // Create a (prospective) dirty page map if bhyve supports the NPT // API. 
If this map is present and the VM hasn't recorded that it's @@ -256,31 +295,83 @@ impl SourceProtocol { let can_npt_operate = vm.lock_shared().await.vmm_hdl().can_npt_operate(); - if can_npt_operate { + let has_redirtying_ever_failed = + persistent_state.has_redirtying_ever_failed; + if can_npt_operate && !has_redirtying_ever_failed { Some(Default::default()) } else { info!( - vm.log(), - "NPT operations not supported, will offer all pages pre-push"; + log, + "guest pages not redirtyable, will offer all pages in pre-pause"; + "can_npt_operate" => can_npt_operate, + "has_redirtying_ever_failed" => has_redirtying_ever_failed ); None } }; - Self { vm, command_tx, response_rx, conn, dirt } + Self { log, migration_id, conn, dirt } } +} + +#[async_trait::async_trait] +impl SourceProtocol for RonV0 { + async fn run( + self, + vm_objects: &VmObjects, + publisher: &mut StatePublisher, + persistent_state: &mut PersistentState, + ) -> Result<(), MigrateError> { + let mut runner = RonV0Runner { + log: self.log, + migration_id: self.migration_id, + conn: self.conn, + dirt: self.dirt, + vm: vm_objects, + state_publisher: publisher, + persistent_state, + paused: false, + }; + + runner.run().await + } +} +struct RonV0Runner<'vm, T: MigrateConn> { + log: slog::Logger, + migration_id: Uuid, + conn: WebSocketStream, + dirt: Option>, + vm: &'vm VmObjects, + state_publisher: &'vm mut StatePublisher, + persistent_state: &'vm mut PersistentState, + paused: bool, +} + +impl<'vm, T: MigrateConn> RonV0Runner<'vm, T> { fn log(&self) -> &slog::Logger { - self.vm.log() + &self.log } - async fn update_state(&mut self, state: MigrationState) { - // When migrating into an instance, the VM state worker blocks waiting - // for the disposition of the migration attempt, so the channel should - // never be closed before the attempt completes. 
- self.command_tx - .send(MigrateSourceCommand::UpdateState(state)) - .await - .unwrap(); + fn update_state(&mut self, state: MigrationState) { + self.state_publisher.update(ExternalStateUpdate::Migration( + MigrationStateUpdate { + state, + id: self.migration_id, + role: MigrateRole::Source, + }, + )); + } + + async fn pause_vm(&mut self) { + assert!(!self.paused); + self.paused = true; + self.vm.lock_exclusive().await.pause().await; + } + + async fn resume_vm(&mut self) { + assert!(self.paused); + self.paused = false; + self.vm.lock_exclusive().await.resume(); } async fn run_phase( @@ -310,22 +401,72 @@ impl SourceProtocol { async fn run(&mut self) -> Result<(), MigrateError> { info!(self.log(), "Entering Source Migration Task"); - self.run_phase(MigratePhase::MigrateSync).await?; - self.run_phase(MigratePhase::RamPushPrePause).await?; - self.run_phase(MigratePhase::Pause).await?; - self.run_phase(MigratePhase::RamPushPostPause).await?; - self.run_phase(MigratePhase::TimeData).await?; - self.run_phase(MigratePhase::DeviceState).await?; - self.run_phase(MigratePhase::RamPull).await?; - self.run_phase(MigratePhase::ServerState).await?; - self.run_phase(MigratePhase::Finish).await?; - - info!(self.log(), "Source Migration Successful"); - Ok(()) + let result: Result<_, MigrateError> = async { + self.run_phase(MigratePhase::MigrateSync).await?; + self.run_phase(MigratePhase::RamPushPrePause).await?; + self.run_phase(MigratePhase::Pause).await?; + self.run_phase(MigratePhase::RamPushPostPause).await?; + self.run_phase(MigratePhase::TimeData).await?; + self.run_phase(MigratePhase::DeviceState).await?; + self.run_phase(MigratePhase::RamPull).await?; + self.run_phase(MigratePhase::ServerState).await?; + self.run_phase(MigratePhase::Finish).await?; + Ok(()) + } + .await; + + if let Err(err) = result { + self.update_state(MigrationState::Error); + let _ = self.send_msg(codec::Message::Error(err.clone())).await; + + // If we are capable of setting the dirty bit on guest page table + // entries, re-dirty them, so that a later migration attempt can also + // only offer dirty pages. If we can't use VM_NPT_OPERATION, a + // subsequent migration attempt will offer all pages. + // + // See the lengthy comment on `RamOfferDiscipline` above for more + // details about what's going on here. + let vmm_hdl = self.vm.lock_shared().await.vmm_hdl().clone(); + for (&GuestAddr(gpa), dirtiness) in self.dirt.iter().flatten() { + if let Err(e) = vmm_hdl.set_dirty_pages(gpa, dirtiness) { + // Bad news! Our attempt to re-set the dirty bit on these + // pages has failed! Thus, subsequent migration attempts + // /!\ CAN NO LONGER RELY ON DIRTY PAGE TRACKING /!\ + // and must always offer all pages in the initial RAM push + // phase. + // + // Record that now so we never try to do this again. + self.persistent_state.has_redirtying_ever_failed = true; + error!( + self.log(), + "failed to restore dirty bits: {e}"; + "gpa" => gpa, + ); + // No sense continuing to try putting back any remaining + // dirty bits, as we won't be using them any longer. + break; + } else { + debug!(self.log(), "re-dirtied pages at {gpa:#x}",); + } + } + + if self.paused { + self.resume_vm().await; + } + + Err(err) + } else { + // The VM should be paused after successfully migrating out; the + // state driver assumes as much when subsequently halting the + // instance. 
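+            // (Correspondingly, `migrate_as_source` in state_driver.rs sets its own `paused` flag and queues a Stop request when this run returns Ok.)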
+ assert!(self.paused); + info!(self.log(), "Source Migration Successful"); + Ok(()) + } } async fn sync(&mut self) -> Result<(), MigrateError> { - self.update_state(MigrationState::Sync).await; + self.update_state(MigrationState::Sync); let preamble = Preamble::new(VersionedInstanceSpec::V0( self.vm.lock_shared().await.instance_spec().clone(), )); @@ -342,10 +483,10 @@ impl SourceProtocol { ) -> Result<(), MigrateError> { match phase { MigratePhase::RamPushPrePause => { - self.update_state(MigrationState::RamPush).await + self.update_state(MigrationState::RamPush) } MigratePhase::RamPushPostPause => { - self.update_state(MigrationState::RamPushDirty).await + self.update_state(MigrationState::RamPushDirty) } _ => unreachable!("should only push RAM in a RAM push phase"), } @@ -360,41 +501,11 @@ impl SourceProtocol { vmm_ram_range ); - // In the pre-pause phase, it is safe to offer only dirty pages if (1) - // there is some prospect of being able to restore the kernel dirty page - // bitmap if migration fails, and (2) a prior attempt to restore the - // bitmap hasn't failed (thereby rendering the bitmap's contents - // untrustworthy). The first prong was checked when the protocol - // started, but the second prong requires input from the VM state - // driver. If this routine is being called from the pre-pause phase, and - // the dirty page map looks viable, ask the state driver if it's OK to - // proceed with transmitting only dirty pages. + // Determine whether we can offer only dirty pages, or if we must offer + // all pages. // // Refer to the giant comment on `RamOfferDiscipline` above for more // details about this determination. - if *phase == MigratePhase::RamPushPrePause && self.dirt.is_some() { - // The state driver should keep the command channels alive until the - // migration task exits, so these sends and receives should always - // work. - self.command_tx - .send(MigrateSourceCommand::QueryRedirtyingFailed) - .await - .unwrap(); - - let response = self.response_rx.recv().await.unwrap(); - match response { - MigrateSourceResponse::RedirtyingFailed(has_failed) => { - if has_failed { - self.dirt = None; - } - } - _ => panic!( - "unexpected response {:?} to request for redirtying info", - response - ), - } - } - let offer_discipline = match phase { // If we are in the pre-pause RAM push phase, and we don't have // VM_NPT_OPERATION to put back any dirty bits if the migration @@ -444,7 +555,7 @@ impl SourceProtocol { }; } info!(self.log(), "ram_push: done sending ram"); - self.update_state(MigrationState::Pause).await; + self.update_state(MigrationState::Pause); Ok(()) } @@ -555,26 +666,16 @@ impl SourceProtocol { } async fn pause(&mut self) -> Result<(), MigrateError> { - self.update_state(MigrationState::Pause).await; + self.update_state(MigrationState::Pause); // Ask the instance to begin transitioning to the paused state // This will inform each device to pause. 
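// (Via `VmObjects::pause`, this stops the vCPU tasks first, then pauses devices, and finally the kernel VMM; see objects.rs below.)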
info!(self.log(), "Pausing devices"); - self.command_tx.send(MigrateSourceCommand::Pause).await.unwrap(); - let resp = self.response_rx.recv().await.unwrap(); - match resp { - MigrateSourceResponse::Pause(Ok(())) => Ok(()), - _ => { - info!( - self.log(), - "Unexpected pause response from state worker: {:?}", resp - ); - Err(MigrateError::SourcePause) - } - } + self.pause_vm().await; + Ok(()) } async fn device_state(&mut self) -> Result<(), MigrateError> { - self.update_state(MigrationState::Device).await; + self.update_state(MigrationState::Device); let mut device_states = vec![]; { let objects = self.vm.lock_shared().await; @@ -659,11 +760,11 @@ impl SourceProtocol { } async fn ram_pull(&mut self) -> Result<(), MigrateError> { - self.update_state(MigrationState::RamPush).await; + self.update_state(MigrationState::RamPush); let m = self.read_msg().await?; info!(self.log(), "ram_pull: got query {:?}", m); - self.update_state(MigrationState::Pause).await; - self.update_state(MigrationState::RamPushDirty).await; + self.update_state(MigrationState::Pause); + self.update_state(MigrationState::RamPushDirty); self.send_msg(codec::Message::MemEnd(0, !0)).await?; let m = self.read_msg().await?; info!(self.log(), "ram_pull: got done {:?}", m); @@ -671,7 +772,7 @@ impl SourceProtocol { } async fn server_state(&mut self) -> Result<(), MigrateError> { - self.update_state(MigrationState::Server).await; + self.update_state(MigrationState::Server); let remote_addr = match self.read_msg().await? { Message::Serialized(s) => { ron::from_str(&s).map_err(codec::ProtocolError::from)? @@ -704,7 +805,7 @@ impl SourceProtocol { // Now that handoff is complete, publish that the migration has // succeeded. - self.update_state(MigrationState::Finish).await; + self.update_state(MigrationState::Finish); // This VMM is going away, so if any guest memory is still dirty, it // won't be transferred. Assert that there is no such memory. diff --git a/bin/propolis-server/src/lib/vm/ensure.rs b/bin/propolis-server/src/lib/vm/ensure.rs new file mode 100644 index 000000000..4ba939952 --- /dev/null +++ b/bin/propolis-server/src/lib/vm/ensure.rs @@ -0,0 +1,364 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Tools for handling instance ensure requests. +//! +//! To initialize a new VM, the server must (1) create a set of VM objects from +//! an instance spec, (2) set up VM services that use those objects, (3) use the +//! objects and services to drive the VM state machine to the `ActiveVm` state, +//! and (4) notify the original caller of the "instance ensure" API of the +//! completion of its request. If VM initialization fails, the actions required +//! to compensate and drive the state machine to `RundownComplete` depend on how +//! many steps were completed. +//! +//! When live migrating into an instance, the live migration task interleaves +//! initialization steps with the steps of the live migration protocol, and +//! needs to be able to unwind initialization correctly whenever the migration +//! protocol fails. +//! +//! The `VmEnsure` types in this module exist to hide the gory details of +//! initializing and unwinding from higher-level operations like the live +//! migration task. Each type represents a phase of the initialization process +//! and has a routine that consumes the current phase and moves to the next +//! phase. 
If a higher-level operation fails, it can call a failure handler on +//! its current phase to unwind the whole operation and drive the VM state +//! machine to the correct resting state. + +use std::sync::Arc; + +use propolis_api_types::{ + instance_spec::{v0::InstanceSpecV0, VersionedInstanceSpec}, + InstanceEnsureResponse, InstanceMigrateInitiateResponse, + InstanceSpecEnsureRequest, InstanceState, +}; +use slog::{debug, info}; + +use crate::{ + initializer::{ + build_instance, MachineInitializer, MachineInitializerState, + }, + vm::request_queue::InstanceAutoStart, +}; + +use super::{ + objects::{InputVmObjects, VmObjects}, + services::VmServices, + state_driver::InputQueue, + state_publisher::{ExternalStateUpdate, StatePublisher}, + EnsureOptions, InstanceEnsureResponseTx, VmError, +}; + +/// Holds state about an instance ensure request that has not yet produced any +/// VM objects or driven the VM state machine to the `ActiveVm` state. +pub(crate) struct VmEnsureNotStarted<'a> { + log: &'a slog::Logger, + vm: &'a Arc<super::Vm>, + ensure_request: &'a InstanceSpecEnsureRequest, + ensure_options: &'a EnsureOptions, + ensure_response_tx: InstanceEnsureResponseTx, + state_publisher: &'a mut StatePublisher, +} + +impl<'a> VmEnsureNotStarted<'a> { + pub(super) fn new( + log: &'a slog::Logger, + vm: &'a Arc<super::Vm>, + ensure_request: &'a InstanceSpecEnsureRequest, + ensure_options: &'a EnsureOptions, + ensure_response_tx: InstanceEnsureResponseTx, + state_publisher: &'a mut StatePublisher, + ) -> Self { + Self { + log, + vm, + ensure_request, + ensure_options, + ensure_response_tx, + state_publisher, + } + } + + pub(crate) fn instance_spec(&self) -> &InstanceSpecV0 { + let VersionedInstanceSpec::V0(v0) = &self.ensure_request.instance_spec; + v0 + } + + pub(crate) fn state_publisher(&mut self) -> &mut StatePublisher { + self.state_publisher + } + + /// Creates a set of VM objects using the instance spec stored in this + /// ensure request, but does not install them as an active VM. + pub(crate) async fn create_objects( + self, + ) -> anyhow::Result<VmEnsureObjectsCreated<'a>> { + debug!(self.log, "creating VM objects"); + + let input_queue = Arc::new(InputQueue::new( + self.log.new(slog::o!("component" => "request_queue")), + match &self.ensure_request.migrate { + Some(_) => InstanceAutoStart::Yes, + None => InstanceAutoStart::No, + }, + )); + + match self.initialize_vm_objects_from_spec(&input_queue).await { + Ok(objects) => { + // N.B. Once these `VmObjects` exist, it is no longer safe to + // call `vm_init_failed`.
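+                // (Once the objects exist, unwinding instead goes through `VmEnsureActive::fail`, which runs the VM down via `set_rundown` so that dropping the objects completes the teardown.)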
+ let objects = Arc::new(VmObjects::new( + self.log.clone(), + self.vm.clone(), + objects, + )); + + Ok(VmEnsureObjectsCreated { + log: self.log, + vm: self.vm, + ensure_request: self.ensure_request, + ensure_options: self.ensure_options, + ensure_response_tx: self.ensure_response_tx, + state_publisher: self.state_publisher, + vm_objects: objects, + input_queue, + kernel_vm_paused: false, + }) + } + Err(e) => Err(self.fail(e).await), + } + } + + pub(crate) async fn fail(self, reason: anyhow::Error) -> anyhow::Error { + self.state_publisher + .update(ExternalStateUpdate::Instance(InstanceState::Failed)); + + self.vm.vm_init_failed().await; + let _ = self + .ensure_response_tx + .send(Err(VmError::InitializationFailed(reason.to_string()))); + + reason + } + + async fn initialize_vm_objects_from_spec( + &self, + event_queue: &Arc, + ) -> anyhow::Result { + let properties = &self.ensure_request.properties; + let spec = &self.ensure_request.instance_spec; + let options = self.ensure_options; + + info!(self.log, "initializing new VM"; + "spec" => #?spec, + "properties" => #?properties, + "use_reservoir" => options.use_reservoir, + "bootrom" => %options.toml_config.bootrom.display()); + + let vmm_log = self.log.new(slog::o!("component" => "vmm")); + + // Set up the 'shell' instance into which the rest of this routine will + // add components. + let VersionedInstanceSpec::V0(v0_spec) = &spec; + let machine = build_instance( + &properties.vm_name(), + v0_spec, + options.use_reservoir, + vmm_log, + )?; + + let mut init = MachineInitializer { + log: self.log.clone(), + machine: &machine, + devices: Default::default(), + block_backends: Default::default(), + crucible_backends: Default::default(), + spec: v0_spec, + properties, + toml_config: &options.toml_config, + producer_registry: options.oximeter_registry.clone(), + state: MachineInitializerState::default(), + }; + + init.initialize_rom(options.toml_config.bootrom.as_path())?; + let chipset = init.initialize_chipset( + &(event_queue.clone() + as Arc), + )?; + + init.initialize_rtc(&chipset)?; + init.initialize_hpet()?; + + let com1 = Arc::new(init.initialize_uart(&chipset)?); + let ps2ctrl = init.initialize_ps2(&chipset)?; + init.initialize_qemu_debug_port()?; + init.initialize_qemu_pvpanic(properties.into())?; + init.initialize_network_devices(&chipset)?; + + #[cfg(not(feature = "omicron-build"))] + init.initialize_test_devices(&options.toml_config.devices)?; + #[cfg(feature = "omicron-build")] + info!( + self.log, + "`omicron-build` feature enabled, ignoring any test devices" + ); + + #[cfg(feature = "falcon")] + { + init.initialize_softnpu_ports(&chipset)?; + init.initialize_9pfs(&chipset)?; + } + + init.initialize_storage_devices(&chipset, options.nexus_client.clone()) + .await?; + + let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; + init.initialize_cpus()?; + let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( + &machine, + event_queue.clone() + as Arc, + self.log.new(slog::o!("component" => "vcpu_tasks")), + )?); + + let MachineInitializer { + devices, + block_backends, + crucible_backends, + .. 
+ } = init; + + Ok(InputVmObjects { + instance_spec: v0_spec.clone(), + vcpu_tasks, + machine, + lifecycle_components: devices, + block_backends, + crucible_backends, + com1, + framebuffer: Some(ramfb), + ps2ctrl, + }) + } +} + +/// Represents an instance ensure request that has proceeded far enough to +/// create a set of VM objects, but that has not yet installed those objects as +/// an `ActiveVm` or notified the requestor that its request is complete. +pub(crate) struct VmEnsureObjectsCreated<'a> { + log: &'a slog::Logger, + vm: &'a Arc<super::Vm>, + ensure_request: &'a InstanceSpecEnsureRequest, + ensure_options: &'a EnsureOptions, + ensure_response_tx: InstanceEnsureResponseTx, + state_publisher: &'a mut StatePublisher, + vm_objects: Arc<VmObjects>, + input_queue: Arc<InputQueue>, + kernel_vm_paused: bool, +} + +impl<'a> VmEnsureObjectsCreated<'a> { + /// Prepares the VM's CPUs for an incoming live migration by activating them + /// (at the kernel VM level) and then pausing the kernel VM. This must be + /// done before importing any state into these objects. + /// + /// # Panics + /// + /// Panics if called more than once on the same set of objects. + pub(crate) async fn prepare_for_migration(&mut self) { + assert!(!self.kernel_vm_paused); + let guard = self.vm_objects.lock_exclusive().await; + guard.reset_vcpus(); + guard.pause_kernel_vm(); + self.kernel_vm_paused = true; + } + + /// Uses this struct's VM objects to create a set of VM services, then + /// installs an active VM into the parent VM state machine and notifies the + /// ensure requester that its request is complete. + pub(crate) async fn ensure_active(self) -> VmEnsureActive<'a> { + let vm_services = VmServices::new( + self.log, + self.vm, + &self.vm_objects, + &self.ensure_request.properties, + self.ensure_options, + ) + .await; + + self.vm + .make_active( + self.log, + self.input_queue.clone(), + &self.vm_objects, + vm_services, + ) + .await; + + // The response channel may be closed if the client who asked to ensure + // the VM timed out or disconnected. This is OK; now that the VM is + // active, a new client can recover by reading the current instance + // state and using the state change API to send commands to the state + // driver. + let _ = self.ensure_response_tx.send(Ok(InstanceEnsureResponse { + migrate: self.ensure_request.migrate.as_ref().map(|req| { + InstanceMigrateInitiateResponse { + migration_id: req.migration_id, + } + }), + })); + + VmEnsureActive { + vm: self.vm, + state_publisher: self.state_publisher, + vm_objects: self.vm_objects, + input_queue: self.input_queue, + kernel_vm_paused: self.kernel_vm_paused, + } + } +} + +/// Describes a set of VM objects that are fully initialized and referred to by +/// the `ActiveVm` in a VM state machine, but for which a state driver loop has +/// not started yet. +pub(crate) struct VmEnsureActive<'a> { + vm: &'a Arc<super::Vm>, + state_publisher: &'a mut StatePublisher, + vm_objects: Arc<VmObjects>, + input_queue: Arc<InputQueue>, + kernel_vm_paused: bool, +} + +impl<'a> VmEnsureActive<'a> { + pub(crate) fn vm_objects(&self) -> &Arc<VmObjects> { + &self.vm_objects + } + + pub(crate) fn state_publisher(&mut self) -> &mut StatePublisher { + self.state_publisher + } + + pub(crate) async fn fail(mut self) { + // If a caller asked to prepare the VM objects for migration in the + // previous phase, make sure that operation is undone before the VM + // objects are torn down.
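+        // (This undoes `prepare_for_migration` above, which reset the vCPUs and then paused the kernel VM so that migrated-in state could be imported.)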
+ if self.kernel_vm_paused { + let guard = self.vm_objects.lock_exclusive().await; + guard.resume_kernel_vm(); + self.kernel_vm_paused = false; + } + + self.state_publisher + .update(ExternalStateUpdate::Instance(InstanceState::Failed)); + + // Since there are extant VM objects, move to the Rundown state. The VM + // will move to RundownComplete when the objects are finally dropped. + self.vm.set_rundown().await; + } + + /// Yields the VM objects and input queue for this VM so that they can be + /// used to start a state driver loop. + pub(super) fn into_inner(self) -> (Arc<VmObjects>, Arc<InputQueue>) { + (self.vm_objects, self.input_queue) + } +} diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 96d334d35..815d20f36 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -94,14 +94,13 @@ use state_publisher::StatePublisher; use crate::{server::MetricsEndpointConfig, vnc::PropolisVncServer}; mod active; +pub(crate) mod ensure; pub(crate) mod guest_event; -pub(crate) mod migrate_commands; pub(crate) mod objects; mod request_queue; mod services; -mod startup; mod state_driver; -mod state_publisher; +pub(crate) mod state_publisher; /// Maps component names to lifecycle trait objects that allow /// components to be started, paused, resumed, and halted. @@ -138,6 +137,10 @@ pub(crate) type CrucibleReplaceResult = pub(crate) type CrucibleReplaceResultTx = tokio::sync::oneshot::Sender<CrucibleReplaceResult>; +type InstanceEnsureResponseTx = tokio::sync::oneshot::Sender< + Result<InstanceEnsureResponse, VmError>, +>; + /// Errors generated by the VM controller and its subcomponents. #[derive(Debug, thiserror::Error)] pub(crate) enum VmError { @@ -156,8 +159,8 @@ pub(crate) enum VmError { #[error("VM is currently shutting down")] RundownInProgress, - #[error("VM initialization failed")] - InitializationFailed(#[source] anyhow::Error), + #[error("VM initialization failed: {0}")] + InitializationFailed(String), #[error("Forbidden state change")] ForbiddenStateChange(#[from] request_queue::RequestDeniedReason), @@ -340,8 +343,6 @@ impl Vm { /// creating an `ActiveVm` with the supplied input queue, VM objects, and VM /// services. /// - /// This routine should only be called by the state driver. - /// /// # Panics /// /// Panics if the VM is not in the `WaitingForInit` state.
- async fn vm_init_failed(&self, wait_for_objects: bool) { + async fn vm_init_failed(&self) { let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { VmState::WaitingForInit(vm) => { - guard.state = if wait_for_objects { - VmState::Rundown(vm) - } else { - VmState::RundownComplete(vm) - }; + guard.state = VmState::RundownComplete(vm) } _ => unreachable!( "start failures should only occur before an active VM is \ diff --git a/bin/propolis-server/src/lib/vm/objects.rs b/bin/propolis-server/src/lib/vm/objects.rs index ddc7fa1f9..e5530560b 100644 --- a/bin/propolis-server/src/lib/vm/objects.rs +++ b/bin/propolis-server/src/lib/vm/objects.rs @@ -29,9 +29,6 @@ use super::{ /// A collection of components that make up a Propolis VM instance. pub(crate) struct VmObjects { - /// The objects' associated logger. - log: slog::Logger, - /// A reference to the VM state machine that created these objects. Used to /// complete rundown when the objects are dropped. parent: Arc, @@ -101,29 +98,18 @@ impl VmObjects { input: InputVmObjects, ) -> Self { let inner = VmObjectsLocked::new(&log, input); - Self { log, parent, inner: tokio::sync::RwLock::new(inner) } - } - - /// Yields the logger associated with these objects. - pub(crate) fn log(&self) -> &slog::Logger { - &self.log + Self { parent, inner: tokio::sync::RwLock::new(inner) } } /// Yields a shared lock guard referring to the underlying object /// collection. - /// - /// This function is crate-visible to allow the API layer to read (but not - /// mutate) VM objects. pub(crate) async fn lock_shared(&self) -> VmObjectsShared { VmObjectsShared(self.inner.read().await) } /// Yields an exclusive lock guard referring to the underlying object /// collection. - /// - /// This function is only visible within the `vm` module so that only the - /// state driver can obtain a mutable reference to the underlying objects. - pub(super) async fn lock_exclusive(&self) -> VmObjectsExclusive { + pub(crate) async fn lock_exclusive(&self) -> VmObjectsExclusive { VmObjectsExclusive(self.inner.write().await) } } @@ -286,14 +272,14 @@ impl VmObjectsLocked { } /// Pauses this VM's devices and its kernel VMM. - pub(super) async fn pause(&mut self) { + pub(crate) async fn pause(&mut self) { self.vcpu_tasks.pause_all(); self.pause_devices().await; self.pause_kernel_vm(); } /// Resumes this VM's devices and its kernel VMM. 
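/// (Resume runs in the opposite order of `pause`: the kernel VMM is resumed first, then the devices, then the vCPU tasks.)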
- pub(super) fn resume(&mut self) { + pub(crate) fn resume(&mut self) { self.resume_kernel_vm(); self.resume_devices(); self.vcpu_tasks.resume_all(); diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index daecaa95e..d5e87972f 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -9,27 +9,28 @@ use std::{ time::Duration, }; +use anyhow::Context; use propolis_api_types::{ instance_spec::{ components::backends::CrucibleStorageBackend, v0::StorageBackendV0, }, - InstanceMigrateInitiateResponse, InstanceSpecEnsureRequest, InstanceState, - MigrationState, + InstanceSpecEnsureRequest, InstanceState, MigrationState, }; use slog::{error, info}; use uuid::Uuid; -use crate::{migrate::MigrateRole, vm::state_publisher::ExternalStateUpdate}; +use crate::{ + migrate::{ + destination::DestinationProtocol, source::SourceProtocol, MigrateRole, + }, + vm::state_publisher::ExternalStateUpdate, +}; use super::{ + ensure::{VmEnsureActive, VmEnsureNotStarted}, guest_event::{self, GuestEvent}, - migrate_commands::{ - next_migrate_task_event, MigrateSourceCommand, MigrateSourceResponse, - MigrateTaskEvent, - }, objects::VmObjects, request_queue::{ExternalRequest, InstanceAutoStart}, - startup::BuildVmOutput, state_publisher::{MigrationStateUpdate, StatePublisher}, VmError, }; @@ -250,90 +251,30 @@ pub(super) async fn run_state_driver( >, ensure_options: super::EnsureOptions, ) -> StateDriverOutput { - let migration_in_id = - ensure_request.migrate.as_ref().map(|req| req.migration_id); - - let input_queue = Arc::new(InputQueue::new( - log.new(slog::o!("component" => "request_queue")), - match &ensure_request.migrate { - Some(_) => InstanceAutoStart::Yes, - None => InstanceAutoStart::No, - }, - )); - - let BuildVmOutput { vm_objects, migration_in } = - match super::startup::build_vm( - &log, - &vm, - &ensure_request, - &ensure_options, - &input_queue, - &mut state_publisher, - ) - .await - { - Ok(objects) => objects, - Err((e, objects)) => { - state_publisher.update(ExternalStateUpdate::Instance( - InstanceState::Failed, - )); - - vm.vm_init_failed(objects.is_some()).await; - let _ = ensure_result_tx - .send(Err(VmError::InitializationFailed(e))); - return StateDriverOutput { - state_publisher, - final_state: InstanceState::Failed, - }; - } - }; - - let services = super::services::VmServices::new( + let activated_vm = match create_and_activate_vm( &log, &vm, - &vm_objects, - &ensure_request.properties, + &mut state_publisher, + &ensure_request, + ensure_result_tx, &ensure_options, ) - .await; - - // All the VM components now exist, so allow external callers to - // interact with the VM. - // - // Order matters here: once the ensure result is sent, an external - // caller needs to observe that an active VM is present. - vm.make_active(&log, input_queue.clone(), &vm_objects, services).await; - let _ = - ensure_result_tx.send(Ok(propolis_api_types::InstanceEnsureResponse { - migrate: migration_in_id - .map(|id| InstanceMigrateInitiateResponse { migration_id: id }), - })); - - // If the VM was initialized via migration in, complete that migration now. - // - // External callers who ask to initialize an instance via migration in - // expect their API calls to complete once the relevant VM is initialized - // and the migration task has started (as opposed to when the entire - // migration attempt has completed), so this must happen after the ensure - // result is published. 
(Note that it's OK for the migration to fail after - // this point: the ensure request succeeds, but the instance goes to the - // Failed state and the migration appears to have failed.) - if let Some(migration_in) = migration_in { - if let Err(e) = migration_in.run(&mut state_publisher).await { - error!(log, "inbound live migration task failed"; - "error" => ?e); - - vm.set_rundown().await; + .await + { + Ok(activated) => activated, + Err(e) => { + error!(log, "failed to activate new VM"; "error" => #%e); return StateDriverOutput { state_publisher, final_state: InstanceState::Failed, }; } - } + }; + let (objects, input_queue) = activated_vm.into_inner(); let state_driver = StateDriver { log, - objects: vm_objects, + objects, input_queue, external_state: state_publisher, paused: false, @@ -342,11 +283,67 @@ pub(super) async fn run_state_driver( // Run the VM until it exits, then set rundown on the parent VM so that no // new external callers can access its objects or services. - let output = state_driver.run(migration_in_id.is_some()).await; + let output = state_driver.run(ensure_request.migrate.is_some()).await; vm.set_rundown().await; output } +/// Processes the supplied `ensure_request` to create a set of VM objects that +/// can be moved into a new `StateDriver`. +async fn create_and_activate_vm<'a>( + log: &'a slog::Logger, + vm: &'a Arc, + state_publisher: &'a mut StatePublisher, + ensure_request: &'a InstanceSpecEnsureRequest, + ensure_result_tx: tokio::sync::oneshot::Sender< + Result, + >, + ensure_options: &'a super::EnsureOptions, +) -> anyhow::Result> { + let ensure = VmEnsureNotStarted::new( + log, + vm, + ensure_request, + ensure_options, + ensure_result_tx, + state_publisher, + ); + + if let Some(migrate_request) = ensure_request.migrate.as_ref() { + let migration = match crate::migrate::destination::initiate( + log, + migrate_request, + ensure_options.local_server_addr, + ) + .await + { + Ok(mig) => mig, + Err(e) => { + return Err(ensure + .fail(e.into()) + .await + .context("creating migration protocol handler")); + } + }; + + // Delegate the rest of the activation process to the migration + // protocol. If the migration fails, the callee is responsible for + // dispatching failure messages to any API clients who are awaiting + // the results of their instance ensure calls. + Ok(migration + .run(ensure) + .await + .context("running live migration protocol")?) + } else { + let created = ensure + .create_objects() + .await + .context("creating VM objects for new instance")?; + + Ok(created.ensure_active().await) + } +} + impl StateDriver { pub(super) async fn run(mut self, migrated_in: bool) -> StateDriverOutput { info!(self.log, "state driver launched"); @@ -545,18 +542,6 @@ impl StateDriver { self.publish_steady_state(InstanceState::Stopped); } - async fn pause(&mut self) { - assert!(!self.paused); - self.objects.lock_exclusive().await.pause().await; - self.paused = true; - } - - async fn resume(&mut self) { - assert!(self.paused); - self.objects.lock_exclusive().await.resume(); - self.paused = false; - } - fn publish_steady_state(&mut self, state: InstanceState) { let change = match state { InstanceState::Running => { @@ -590,11 +575,27 @@ impl StateDriver { ) .await; - // Negotiate the migration protocol version with the target. 
- let Ok(migrate_ctx) = - crate::migrate::source_start(&self.log, migration_id, conn).await - else { - return; + let migration = match crate::migrate::source::initiate( + &self.log, + migration_id, + conn, + &self.objects, + &self.migration_src_state, + ) + .await + { + Ok(migration) => migration, + Err(_) => { + self.external_state.update(ExternalStateUpdate::Migration( + MigrationStateUpdate { + id: migration_id, + state: MigrationState::Error, + role: MigrateRole::Source, + }, + )); + + return; + } }; // Publish that migration is in progress before actually launching the @@ -608,83 +609,28 @@ impl StateDriver { }, )); - let (command_tx, mut command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, response_rx) = tokio::sync::mpsc::channel(1); - let objects_for_task = self.objects.clone(); - let mut migrate_task = tokio::spawn(async move { - crate::migrate::source::migrate( - objects_for_task, - command_tx, - response_rx, - migrate_ctx.conn, - migrate_ctx.protocol, + match migration + .run( + &self.objects, + &mut self.external_state, + &mut self.migration_src_state, ) .await - }); - - // The migration task may try to acquire the VM object lock shared, so - // this task cannot hold it excluive while waiting for the migration - // task to send an event. - loop { - match next_migrate_task_event( - &mut migrate_task, - &mut command_rx, - &self.log, - ) - .await - { - MigrateTaskEvent::TaskExited(res) => { - if res.is_ok() { - self.input_queue - .queue_external_request(ExternalRequest::Stop) - .expect("can always queue a request to stop"); - } else { - if self.paused { - self.resume().await; - } - - self.publish_steady_state(InstanceState::Running); - } - - return; - } + { + Ok(()) => { + info!(self.log, "migration out succeeded, queuing stop"); + // On a successful migration out, the protocol promises to leave + // the VM objects in a paused state, so don't pause them again. + self.paused = true; + self.input_queue + .queue_external_request(ExternalRequest::Stop) + .expect("can always queue a request to stop"); + } + Err(e) => { + info!(self.log, "migration out failed, resuming"; + "error" => ?e); - // N.B. When handling a command that requires a reply, do not - // return early if the reply fails to send. Instead, - // loop back around and let the `TaskExited` path restore - // the VM to the correct state. 
- MigrateTaskEvent::Command(cmd) => match cmd { - MigrateSourceCommand::UpdateState(state) => { - self.external_state.update( - ExternalStateUpdate::Migration( - MigrationStateUpdate { - id: migration_id, - state, - role: MigrateRole::Source, - }, - ), - ); - } - MigrateSourceCommand::Pause => { - self.pause().await; - let _ = response_tx - .send(MigrateSourceResponse::Pause(Ok(()))) - .await; - } - MigrateSourceCommand::QueryRedirtyingFailed => { - let has_failed = - self.migration_src_state.has_redirtying_ever_failed; - let _ = response_tx - .send(MigrateSourceResponse::RedirtyingFailed( - has_failed, - )) - .await; - } - MigrateSourceCommand::RedirtyingFailed => { - self.migration_src_state.has_redirtying_ever_failed = - true; - } - }, + self.publish_steady_state(InstanceState::Running); } } } diff --git a/bin/propolis-server/src/lib/vm/state_publisher.rs b/bin/propolis-server/src/lib/vm/state_publisher.rs index f8e2bb7c7..63a4f93a1 100644 --- a/bin/propolis-server/src/lib/vm/state_publisher.rs +++ b/bin/propolis-server/src/lib/vm/state_publisher.rs @@ -17,7 +17,7 @@ use crate::migrate::MigrateRole; use super::{InstanceStateRx, InstanceStateTx}; /// An update to an instance's migration's state. -pub(super) struct MigrationStateUpdate { +pub(crate) struct MigrationStateUpdate { /// The migration's new state. pub state: propolis_api_types::MigrationState, @@ -50,7 +50,7 @@ impl MigrationStateUpdate { } /// A kind of state update to publish. -pub(super) enum ExternalStateUpdate { +pub(crate) enum ExternalStateUpdate { /// Update the instance state (but not any migration state). Instance(InstanceState), @@ -62,7 +62,7 @@ pub(super) enum ExternalStateUpdate { } /// A channel to which to publish externally-visible instance state updates. -pub(super) struct StatePublisher { +pub(crate) struct StatePublisher { tx: InstanceStateTx, log: slog::Logger, } @@ -78,7 +78,7 @@ impl StatePublisher { /// Updates an instance's externally-visible state and publishes that state /// with a successor generation number. - pub(super) fn update(&mut self, update: ExternalStateUpdate) { + pub(crate) fn update(&mut self, update: ExternalStateUpdate) { let (instance_state, migration_state) = match update { ExternalStateUpdate::Instance(i) => (Some(i), None), ExternalStateUpdate::Migration(m) => (None, Some(m)), diff --git a/phd-tests/tests/src/migrate.rs b/phd-tests/tests/src/migrate.rs index b234bd4cf..c059ecc76 100644 --- a/phd-tests/tests/src/migrate.rs +++ b/phd-tests/tests/src/migrate.rs @@ -161,6 +161,15 @@ mod running_process { .unwrap_err(); info!(%error, "first migration failed as expected"); + // Also verify that the target reports that it failed. + let target_migration_state = target1 + .get_migration_state() + .await? + .migration_in + .expect("target should have a migration-in status") + .state; + assert_eq!(target_migration_state, MigrationState::Error); + // try again. this time, it should work! target2 .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default()) @@ -205,6 +214,15 @@ mod running_process { .unwrap_err(); info!(%error, "first migration failed as expected"); + // Also verify that the target reports that it failed. + let target_migration_state = target1 + .get_migration_state() + .await? + .migration_in + .expect("target should have a migration-in status") + .state; + assert_eq!(target_migration_state, MigrationState::Error); + // try again. this time, it should work! 
target2 .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default()) From e0ed0496ae51fab93bfe5eb13355124ef3633a66 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 16 Jul 2024 17:55:52 +0000 Subject: [PATCH 43/55] reimplement tokio task limits --- bin/propolis-server/src/lib/vm/active.rs | 4 ++ bin/propolis-server/src/lib/vm/mod.rs | 60 ++++++++++++++++++++---- 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/active.rs b/bin/propolis-server/src/lib/vm/active.rs index fd0f2dcff..1274782f9 100644 --- a/bin/propolis-server/src/lib/vm/active.rs +++ b/bin/propolis-server/src/lib/vm/active.rs @@ -39,6 +39,10 @@ pub(crate) struct ActiveVm { /// Services that interact with VM users or the control plane outside the /// Propolis API (e.g. the serial console, VNC, and metrics reporting). pub(super) services: VmServices, + + /// The runtime on which this VM's state driver and any tasks spawned by + /// the VM's components will run. + pub(super) tokio_rt: tokio::runtime::Runtime, } impl ActiveVm { diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 815d20f36..7a16df269 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -141,6 +141,15 @@ type InstanceEnsureResponseTx = tokio::sync::oneshot::Sender< Result, >; +/// The minimum number of threads to spawn in the Tokio runtime that runs the +/// state driver and any other VM-related tasks. +const VMM_MIN_RT_THREADS: usize = 8; + +/// When creating a new VM, add the VM's vCPU count to this value, then spawn +/// that many threads on its Tokio runtime or [`VMM_MIN_RT_THREADS`], whichever +/// is greater. +const VMM_BASE_RT_THREADS: usize = 4; + /// Errors generated by the VM controller and its subcomponents. #[derive(Debug, thiserror::Error)] pub(crate) enum VmError { @@ -164,6 +173,9 @@ pub(crate) enum VmError { #[error("Forbidden state change")] ForbiddenStateChange(#[from] request_queue::RequestDeniedReason), + + #[error("Failed to initialize VM's tokio runtime")] + TokioRuntimeInitializationFailed(#[source] std::io::Error), } /// The top-level VM wrapper type. @@ -199,6 +211,10 @@ struct VmDescription { /// The VM's last-known instance specification. spec: InstanceSpecV0, + + /// The runtime on which the VM's state driver is running (or on which it + /// ran). + tokio_rt: Option, } /// The states in the VM state machine. See the module comment for more details. @@ -365,6 +381,7 @@ impl Vm { properties: vm.properties, objects: objects.clone(), services, + tokio_rt: vm.tokio_rt.expect("WaitingForInit has runtime"), }); } _ => unreachable!( @@ -414,11 +431,12 @@ impl Vm { }; let spec = vm.objects().lock_shared().await.instance_spec().clone(); - let ActiveVm { external_state_rx, properties, .. } = vm; + let ActiveVm { external_state_rx, properties, tokio_rt, .. } = vm; guard.state = VmState::Rundown(VmDescription { external_state_rx, properties, spec, + tokio_rt: Some(tokio_rt), }); vm.services }; @@ -438,7 +456,14 @@ impl Vm { let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); match old { - VmState::Rundown(vm) => guard.state = VmState::RundownComplete(vm), + VmState::Rundown(mut vm) => { + // Tokio runtimes can't be dropped from an async context, so + // move the stashed state driver runtime onto an OS thread to + // drop it. 
+ let rt = vm.tokio_rt.take().expect("rundown VM has a runtime"); + std::thread::spawn(move || drop(rt)); + guard.state = VmState::RundownComplete(vm); + } _ => unreachable!("VM rundown completed from invalid prior state"), } @@ -503,23 +528,31 @@ impl Vm { let mut guard = self.inner.write().await; match guard.state { VmState::WaitingForInit(_) => { - return Err(VmError::WaitingToInitialize) + return Err(VmError::WaitingToInitialize); } VmState::Active(_) => return Err(VmError::AlreadyInitialized), VmState::Rundown(_) => return Err(VmError::RundownInProgress), _ => {} - } + }; let VersionedInstanceSpec::V0(v0_spec) = ensure_request.instance_spec.clone(); - guard.state = VmState::WaitingForInit(VmDescription { - external_state_rx: external_rx.clone(), - properties: ensure_request.properties.clone(), - spec: v0_spec, - }); + let thread_count = usize::max( + VMM_MIN_RT_THREADS, + VMM_BASE_RT_THREADS + v0_spec.devices.board.cpus as usize, + ); + + let tokio_rt = tokio::runtime::Builder::new_multi_thread() + .thread_name("tokio-rt-vmm") + .worker_threads(thread_count) + .enable_all() + .build() + .map_err(VmError::TokioRuntimeInitializationFailed)?; + + let properties = ensure_request.properties.clone(); let vm_for_driver = self.clone(); - guard.driver = Some(tokio::spawn(async move { + guard.driver = Some(tokio_rt.spawn(async move { state_driver::run_state_driver( log_for_driver, vm_for_driver, @@ -530,6 +563,13 @@ impl Vm { ) .await })); + + guard.state = VmState::WaitingForInit(VmDescription { + external_state_rx: external_rx.clone(), + properties, + spec: v0_spec, + tokio_rt: Some(tokio_rt), + }); } // Wait for the state driver task to dispose of this request. From 860692761361ca7903cd8ab74dda679e6db9a4a3 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 16 Jul 2024 16:55:47 +0000 Subject: [PATCH 44/55] lifecycle_components -> devices again --- bin/propolis-server/src/lib/vm/ensure.rs | 2 +- bin/propolis-server/src/lib/vm/objects.rs | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/ensure.rs b/bin/propolis-server/src/lib/vm/ensure.rs index 4ba939952..b3e3f1de8 100644 --- a/bin/propolis-server/src/lib/vm/ensure.rs +++ b/bin/propolis-server/src/lib/vm/ensure.rs @@ -232,7 +232,7 @@ impl<'a> VmEnsureNotStarted<'a> { instance_spec: v0_spec.clone(), vcpu_tasks, machine, - lifecycle_components: devices, + devices, block_backends, crucible_backends, com1, diff --git a/bin/propolis-server/src/lib/vm/objects.rs b/bin/propolis-server/src/lib/vm/objects.rs index e5530560b..1e031743e 100644 --- a/bin/propolis-server/src/lib/vm/objects.rs +++ b/bin/propolis-server/src/lib/vm/objects.rs @@ -47,7 +47,7 @@ pub(super) struct InputVmObjects { pub instance_spec: InstanceSpecV0, pub vcpu_tasks: Box, pub machine: Machine, - pub lifecycle_components: DeviceMap, + pub devices: DeviceMap, pub block_backends: BlockBackendMap, pub crucible_backends: CrucibleBackendMap, pub com1: Arc>, @@ -71,7 +71,7 @@ pub(crate) struct VmObjectsLocked { /// Maps from component names to the trait objects that implement lifecycle /// operations (e.g. pause and resume) for eligible components. - lifecycle_components: DeviceMap, + devices: DeviceMap, /// Maps from component names to trait objects that implement the block /// storage backend trait. 
@@ -122,7 +122,7 @@ impl VmObjectsLocked { instance_spec: input.instance_spec, vcpu_tasks: input.vcpu_tasks, machine: input.machine, - lifecycle_components: input.lifecycle_components, + devices: input.devices, block_backends: input.block_backends, crucible_backends: input.crucible_backends, com1: input.com1, @@ -165,7 +165,7 @@ impl VmObjectsLocked { &self, name: &str, ) -> Option> { - self.lifecycle_components.get(name).cloned() + self.devices.get(name).cloned() } /// Yields the VM's current Crucible backend map. @@ -195,7 +195,7 @@ impl VmObjectsLocked { &self, mut func: impl FnMut(&str, &Arc), ) { - for (name, dev) in self.lifecycle_components.iter() { + for (name, dev) in self.devices.iter() { func(name, dev); } } @@ -210,7 +210,7 @@ impl VmObjectsLocked { &Arc, ) -> std::result::Result<(), E>, ) -> std::result::Result<(), E> { - for (name, dev) in self.lifecycle_components.iter() { + for (name, dev) in self.devices.iter() { func(name, dev)?; } @@ -371,7 +371,7 @@ impl VmObjectsLocked { info!(self.log, "waiting for devices to pause"); let mut stream: FuturesUnordered<_> = self - .lifecycle_components + .devices .iter() .map(|(name, dev)| { info!(self.log, "got paused future from dev {}", name); From 7b87f5bdf09f7a4252f4f68162eb218964651c7e Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 16 Jul 2024 19:02:20 +0000 Subject: [PATCH 45/55] don't undercut the state driver when completing rundown --- bin/propolis-server/src/lib/vm/mod.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 7a16df269..e865f7902 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -455,17 +455,21 @@ impl Vm { info!(self.log, "completing VM rundown"); let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); - match old { + + // Extract the run-down VM's tokio runtime so that it can be shut down + // on a separate thread (runtimes must be dropped in a non-async + // context). + // + // The runtime can't be dropped until this routine extracts the final VM + // state from the state driver task. + let rt = match old { VmState::Rundown(mut vm) => { - // Tokio runtimes can't be dropped from an async context, so - // move the stashed state driver runtime onto an OS thread to - // drop it. 
let rt = vm.tokio_rt.take().expect("rundown VM has a runtime"); - std::thread::spawn(move || drop(rt)); guard.state = VmState::RundownComplete(vm); + rt } _ => unreachable!("VM rundown completed from invalid prior state"), - } + }; let StateDriverOutput { mut state_publisher, final_state } = guard .driver @@ -477,6 +481,8 @@ impl Vm { state_publisher.update(state_publisher::ExternalStateUpdate::Instance( final_state, )); + + std::thread::spawn(move || drop(rt)); } /// Attempts to move this VM to the `Active` state by setting up a state From 45e76d3ba266bce2dc2db730a6ee76165f7636d2 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 16 Jul 2024 19:41:16 +0000 Subject: [PATCH 46/55] use shutdown_background instead of moving to an OS thread --- bin/propolis-server/src/lib/vm/mod.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index e865f7902..fc7990821 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -455,13 +455,6 @@ impl Vm { info!(self.log, "completing VM rundown"); let mut guard = self.inner.write().await; let old = std::mem::replace(&mut guard.state, VmState::NoVm); - - // Extract the run-down VM's tokio runtime so that it can be shut down - // on a separate thread (runtimes must be dropped in a non-async - // context). - // - // The runtime can't be dropped until this routine extracts the final VM - // state from the state driver task. let rt = match old { VmState::Rundown(mut vm) => { let rt = vm.tokio_rt.take().expect("rundown VM has a runtime"); @@ -482,7 +475,13 @@ impl Vm { final_state, )); - std::thread::spawn(move || drop(rt)); + // Shut down the runtime without blocking to wait for tasks to complete + // (since blocking is illegal in an async context). + // + // This must happen after the state driver task has successfully joined + // (otherwise it might be canceled and will fail to yield the VM's final + // state). + rt.shutdown_background(); } /// Attempts to move this VM to the `Active` state by setting up a state From 75d6a9560d45895bc4c003625db857fd48711c72 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 16 Jul 2024 20:01:00 +0000 Subject: [PATCH 47/55] rm unused file mistakenly added back in merge --- .../src/lib/vm/migrate_commands.rs | 103 ------------------ bin/propolis-server/src/lib/vm/startup.rs | 4 - 2 files changed, 107 deletions(-) delete mode 100644 bin/propolis-server/src/lib/vm/migrate_commands.rs diff --git a/bin/propolis-server/src/lib/vm/migrate_commands.rs b/bin/propolis-server/src/lib/vm/migrate_commands.rs deleted file mode 100644 index 403432331..000000000 --- a/bin/propolis-server/src/lib/vm/migrate_commands.rs +++ /dev/null @@ -1,103 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Requests and responses between the VM state driver and the live migration -//! protocol. - -use std::sync::Arc; - -use crate::migrate::MigrateError; - -/// A message sent from a live migration destination task to update the -/// externally visible state of the migration attempt. -#[derive(Clone, Copy, Debug)] -pub enum MigrateTargetCommand { - /// Initialize VM objects using the instance spec supplied to the state - /// driver by its creator. - InitializeFromExternalSpec, - - /// Update the externally-visible migration state. 
- UpdateState(propolis_api_types::MigrationState), -} - -/// A response sent from a migration target's state driver to its migration -/// task. -#[derive(Clone)] -pub enum MigrateTargetResponse { - /// An attempt to initialize VM objects produced the supplied objects or - /// failed for the supplied reason. - VmObjectsInitialized(Result, String>), -} - -/// A message sent from a live migration driver to the state worker, asking it -/// to act on source instance components on the task's behalf. -#[derive(Clone, Copy, Debug)] -pub enum MigrateSourceCommand { - /// Update the externally-visible migration state. - UpdateState(propolis_api_types::MigrationState), - - /// Determine whether a previous attempt to restore the VM's dirty bitmap - /// has failed. - QueryRedirtyingFailed, - - /// Record that the guest's dirty page bitmap may be inconsistent so that - /// future attempts to migrate out transmit all pages. - RedirtyingFailed, - - /// Pause the instance's devices and CPUs. - Pause, -} - -/// A message sent from the state worker to the live migration driver in -/// response to a previous command. -#[derive(Debug)] -pub enum MigrateSourceResponse { - /// A previous migration out has (or has not) failed to restore the VM's - /// dirty bitmap. - RedirtyingFailed(bool), - - /// A request to pause completed with the attached result. - Pause(Result<(), std::io::Error>), -} - -/// An event raised by a migration task that must be handled by the state -/// worker. -#[derive(Debug)] -pub(super) enum MigrateTaskEvent { - /// The task completed with the associated result. - TaskExited(Result<(), MigrateError>), - - /// The task sent a command requesting work. - Command(T), -} - -/// Given a migration executing in `task` that sends commands on `command_rx`, -/// gets the next event for the task's state driver to process. -pub(super) async fn next_migrate_task_event( - task: &mut tokio::task::JoinHandle< - Result<(), crate::migrate::MigrateError>, - >, - command_rx: &mut tokio::sync::mpsc::Receiver, - log: &slog::Logger, -) -> MigrateTaskEvent { - if let Some(cmd) = command_rx.recv().await { - return MigrateTaskEvent::Command(cmd); - } - - // The sender side of the command channel is dropped, which means the - // migration task is exiting. Wait for it to finish and snag its result. 
- match task.await { - Ok(res) => { - slog::info!(log, "Migration task exited: {:?}", res); - MigrateTaskEvent::TaskExited(res) - } - Err(join_err) => { - if join_err.is_cancelled() { - panic!("Migration task canceled"); - } else { - panic!("Migration task panicked: {:?}", join_err.into_panic()); - } - } - } -} diff --git a/bin/propolis-server/src/lib/vm/startup.rs b/bin/propolis-server/src/lib/vm/startup.rs index a439c1777..fea9b696e 100644 --- a/bin/propolis-server/src/lib/vm/startup.rs +++ b/bin/propolis-server/src/lib/vm/startup.rs @@ -21,10 +21,6 @@ use crate::{ }; use super::{ - migrate_commands::{ - next_migrate_task_event, MigrateTargetCommand, MigrateTargetResponse, - MigrateTaskEvent, - }, objects::{InputVmObjects, VmObjects}, state_driver::InputQueue, state_publisher::{ From 09d942c44464b2ddeea698f2914764352570d7f2 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 16 Jul 2024 21:02:00 +0000 Subject: [PATCH 48/55] clean up a few fully-qualified paths --- bin/propolis-server/src/lib/vm/mod.rs | 54 +++++++++---------- bin/propolis-server/src/lib/vm/services.rs | 5 +- .../src/lib/vm/state_driver.rs | 30 +++++------ 3 files changed, 42 insertions(+), 47 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index fc7990821..214f40ef7 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -84,12 +84,16 @@ use active::ActiveVm; use oximeter::types::ProducerRegistry; use propolis_api_types::{ instance_spec::{v0::InstanceSpecV0, VersionedInstanceSpec}, - InstanceProperties, + InstanceEnsureResponse, InstanceMigrateStatusResponse, + InstanceMigrationStatus, InstanceProperties, InstanceSpecEnsureRequest, + InstanceSpecGetResponse, InstanceState, InstanceStateMonitorResponse, + MigrationState, }; use rfb::server::VncServer; use slog::info; use state_driver::StateDriverOutput; use state_publisher::StatePublisher; +use tokio::sync::{RwLock, RwLockReadGuard}; use crate::{server::MetricsEndpointConfig, vnc::PropolisVncServer}; @@ -117,15 +121,12 @@ pub(crate) type CrucibleBackendMap = /// Type alias for the sender side of the channel that receives /// externally-visible instance state updates. -type InstanceStateTx = tokio::sync::watch::Sender< - propolis_api_types::InstanceStateMonitorResponse, ->; +type InstanceStateTx = tokio::sync::watch::Sender; /// Type alias for the receiver side of the channel that receives /// externally-visible instance state updates. -type InstanceStateRx = tokio::sync::watch::Receiver< - propolis_api_types::InstanceStateMonitorResponse, ->; +type InstanceStateRx = + tokio::sync::watch::Receiver; /// Type alias for the results sent by the state driver in response to a request /// to change a Crucible backend's configuration. @@ -137,9 +138,10 @@ pub(crate) type CrucibleReplaceResult = pub(crate) type CrucibleReplaceResultTx = tokio::sync::oneshot::Sender; -type InstanceEnsureResponseTx = tokio::sync::oneshot::Sender< - Result, ->; +/// Type alias for the sender side of a channel that receives the results of +/// instance-ensure API calls. +type InstanceEnsureResponseTx = + tokio::sync::oneshot::Sender>; /// The minimum number of threads to spawn in the Tokio runtime that runs the /// state driver and any other VM-related tasks. @@ -186,7 +188,7 @@ pub(crate) struct Vm { /// acquire this lock shared. /// /// Routines that drive the VM state machine acquire this lock exclusive. - inner: tokio::sync::RwLock, + inner: RwLock, /// A logger for this VM. 
log: slog::Logger, @@ -288,15 +290,15 @@ impl Vm { pub fn new(log: &slog::Logger) -> Arc { let log = log.new(slog::o!("component" => "vm_wrapper")); let inner = VmInner { state: VmState::NoVm, driver: None }; - Arc::new(Self { inner: tokio::sync::RwLock::new(inner), log }) + Arc::new(Self { inner: RwLock::new(inner), log }) } /// If the VM is `Active`, yields a shared lock guard with a reference to /// the relevant `ActiveVm`. Returns `None` if there is no active VM. pub(super) async fn active_vm( &self, - ) -> Option> { - tokio::sync::RwLockReadGuard::try_map( + ) -> Option> { + RwLockReadGuard::try_map( self.inner.read().await, |inner| { if let VmState::Active(vm) = &inner.state { @@ -311,9 +313,7 @@ impl Vm { /// Returns the state, properties, and instance spec for the instance most /// recently wrapped by this `Vm`. - pub(super) async fn get( - &self, - ) -> Result { + pub(super) async fn get(&self) -> Result { let guard = self.inner.read().await; let vm = match &guard.state { VmState::NoVm => { @@ -323,7 +323,7 @@ impl Vm { VmState::WaitingForInit(vm) | VmState::Rundown(vm) | VmState::RundownComplete(vm) => { - return Ok(propolis_api_types::InstanceSpecGetResponse { + return Ok(InstanceSpecGetResponse { properties: vm.properties.clone(), state: vm.external_state_rx.borrow().state, spec: VersionedInstanceSpec::V0(vm.spec.clone()), @@ -333,7 +333,7 @@ impl Vm { let spec = vm.objects().lock_shared().await.instance_spec().clone(); let state = vm.external_state_rx.borrow().clone(); - Ok(propolis_api_types::InstanceSpecGetResponse { + Ok(InstanceSpecGetResponse { properties: vm.properties.clone(), spec: VersionedInstanceSpec::V0(spec), state: state.state, @@ -489,9 +489,9 @@ impl Vm { pub(crate) async fn ensure( self: &Arc, log: &slog::Logger, - ensure_request: propolis_api_types::InstanceSpecEnsureRequest, + ensure_request: InstanceSpecEnsureRequest, options: EnsureOptions, - ) -> Result { + ) -> Result { let log_for_driver = log.new(slog::o!("component" => "vm_state_driver")); @@ -507,18 +507,18 @@ impl Vm { // the channel will move to the state driver task. 
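        // The first state published depends on how the VM is being created:
        // a VM initializing via live migration in starts in `Migrating`,
        // while all others start in `Creating`. Generation numbers start at
        // 1 here; the publisher bumps them on every subsequent update.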
let (external_publisher, external_rx) = StatePublisher::new( &log_for_driver, - propolis_api_types::InstanceStateMonitorResponse { + InstanceStateMonitorResponse { gen: 1, state: if ensure_request.migrate.is_some() { - propolis_api_types::InstanceState::Migrating + InstanceState::Migrating } else { - propolis_api_types::InstanceState::Creating + InstanceState::Creating }, - migration: propolis_api_types::InstanceMigrateStatusResponse { + migration: InstanceMigrateStatusResponse { migration_in: ensure_request.migrate.as_ref().map(|req| { - propolis_api_types::InstanceMigrationStatus { + InstanceMigrationStatus { id: req.migration_id, - state: propolis_api_types::MigrationState::Sync, + state: MigrationState::Sync, } }), migration_out: None, diff --git a/bin/propolis-server/src/lib/vm/services.rs b/bin/propolis-server/src/lib/vm/services.rs index 16d12141b..095fa5050 100644 --- a/bin/propolis-server/src/lib/vm/services.rs +++ b/bin/propolis-server/src/lib/vm/services.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use oximeter::types::ProducerRegistry; +use propolis_api_types::InstanceProperties; use rfb::server::VncServer; use slog::{error, info, Logger}; @@ -49,7 +50,7 @@ impl VmServices { log: &slog::Logger, vm: &Arc, vm_objects: &VmObjects, - vm_properties: &propolis_api_types::InstanceProperties, + vm_properties: &InstanceProperties, ensure_options: &super::EnsureOptions, ) -> Self { let oximeter_state = if let Some(cfg) = &ensure_options.metrics_config { @@ -121,7 +122,7 @@ async fn register_oximeter_producer( log: &slog::Logger, cfg: &MetricsEndpointConfig, registry: &ProducerRegistry, - vm_properties: &propolis_api_types::InstanceProperties, + vm_properties: &InstanceProperties, ) -> OximeterState { let mut oximeter_state = OximeterState::default(); let virtual_machine = VirtualMachine::from(vm_properties); diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index d5e87972f..a749bd00e 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -30,9 +30,9 @@ use super::{ ensure::{VmEnsureActive, VmEnsureNotStarted}, guest_event::{self, GuestEvent}, objects::VmObjects, - request_queue::{ExternalRequest, InstanceAutoStart}, + request_queue::{self, ExternalRequest, InstanceAutoStart}, state_publisher::{MigrationStateUpdate, StatePublisher}, - VmError, + InstanceEnsureResponseTx, }; /// Tells the state driver what to do after handling an event. @@ -59,7 +59,7 @@ enum InputQueueEvent { /// The lock-guarded parts of a state driver's input queue. struct InputQueueInner { /// State change requests from the external API. - external_requests: super::request_queue::ExternalRequestQueue, + external_requests: request_queue::ExternalRequestQueue, /// State change requests from the VM's components. These take precedence /// over external state change requests. @@ -69,7 +69,7 @@ struct InputQueueInner { impl InputQueueInner { fn new(log: slog::Logger, auto_start: InstanceAutoStart) -> Self { Self { - external_requests: super::request_queue::ExternalRequestQueue::new( + external_requests: request_queue::ExternalRequestQueue::new( log, auto_start, ), guest_events: super::guest_event::GuestEventQueue::default(), @@ -128,7 +128,7 @@ impl InputQueue { /// requests. 
fn notify_instance_state_change( &self, - state: super::request_queue::InstanceStateChange, + state: request_queue::InstanceStateChange, ) { let mut guard = self.inner.lock().unwrap(); guard.external_requests.notify_instance_state_change(state); @@ -138,7 +138,7 @@ impl InputQueue { pub(super) fn queue_external_request( &self, request: ExternalRequest, - ) -> Result<(), super::request_queue::RequestDeniedReason> { + ) -> Result<(), request_queue::RequestDeniedReason> { let mut inner = self.inner.lock().unwrap(); let result = inner.external_requests.try_queue(request); if result.is_ok() { @@ -246,9 +246,7 @@ pub(super) async fn run_state_driver( vm: Arc, mut state_publisher: StatePublisher, ensure_request: InstanceSpecEnsureRequest, - ensure_result_tx: tokio::sync::oneshot::Sender< - Result, - >, + ensure_result_tx: InstanceEnsureResponseTx, ensure_options: super::EnsureOptions, ) -> StateDriverOutput { let activated_vm = match create_and_activate_vm( @@ -295,9 +293,7 @@ async fn create_and_activate_vm<'a>( vm: &'a Arc, state_publisher: &'a mut StatePublisher, ensure_request: &'a InstanceSpecEnsureRequest, - ensure_result_tx: tokio::sync::oneshot::Sender< - Result, - >, + ensure_result_tx: InstanceEnsureResponseTx, ensure_options: &'a super::EnsureOptions, ) -> anyhow::Result> { let ensure = VmEnsureNotStarted::new( @@ -514,7 +510,7 @@ impl StateDriver { // Notify other consumers that the instance successfully rebooted and is // now back to Running. self.input_queue.notify_instance_state_change( - super::request_queue::InstanceStateChange::Rebooted, + request_queue::InstanceStateChange::Rebooted, ); self.external_state .update(ExternalStateUpdate::Instance(InstanceState::Running)); @@ -545,14 +541,12 @@ impl StateDriver { fn publish_steady_state(&mut self, state: InstanceState) { let change = match state { InstanceState::Running => { - super::request_queue::InstanceStateChange::StartedRunning + request_queue::InstanceStateChange::StartedRunning } InstanceState::Stopped => { - super::request_queue::InstanceStateChange::Stopped - } - InstanceState::Failed => { - super::request_queue::InstanceStateChange::Failed + request_queue::InstanceStateChange::Stopped } + InstanceState::Failed => request_queue::InstanceStateChange::Failed, _ => panic!( "Called publish_steady_state on non-terminal state {:?}", state From 96a126354e6f50770f76120d3eed9dd6520f3e7e Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 16 Jul 2024 21:09:59 +0000 Subject: [PATCH 49/55] fmt --- bin/propolis-server/src/lib/vm/mod.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 214f40ef7..c40d47685 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -298,16 +298,13 @@ impl Vm { pub(super) async fn active_vm( &self, ) -> Option> { - RwLockReadGuard::try_map( - self.inner.read().await, - |inner| { - if let VmState::Active(vm) = &inner.state { - Some(vm) - } else { - None - } - }, - ) + RwLockReadGuard::try_map(self.inner.read().await, |inner| { + if let VmState::Active(vm) = &inner.state { + Some(vm) + } else { + None + } + }) .ok() } From fc8a45e5be3241cc5b31f48bc3bd7e935a4b667f Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 16 Jul 2024 23:09:09 +0000 Subject: [PATCH 50/55] rm one more unused file --- bin/propolis-server/src/lib/vm/startup.rs | 371 ---------------------- 1 file changed, 371 deletions(-) delete mode 100644 
bin/propolis-server/src/lib/vm/startup.rs diff --git a/bin/propolis-server/src/lib/vm/startup.rs b/bin/propolis-server/src/lib/vm/startup.rs deleted file mode 100644 index fea9b696e..000000000 --- a/bin/propolis-server/src/lib/vm/startup.rs +++ /dev/null @@ -1,371 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Functionality used to create a new VM and possibly live migrate into it. - -use std::sync::Arc; - -use propolis_api_types::{ - instance_spec::VersionedInstanceSpec, InstanceProperties, - InstanceSpecEnsureRequest, MigrationState, -}; -use slog::{error, info}; -use uuid::Uuid; - -use crate::{ - initializer::{ - build_instance, MachineInitializer, MachineInitializerState, - }, - migrate::{MigrateError, MigrateRole}, -}; - -use super::{ - objects::{InputVmObjects, VmObjects}, - state_driver::InputQueue, - state_publisher::{ - ExternalStateUpdate, MigrationStateUpdate, StatePublisher, - }, -}; - -/// The context needed to finish a live migration into a VM after its initial -/// Sync phase has concluded and produced a set of VM objects (into which the -/// migration will import the source VM's state). -pub(super) struct MigrateAsTargetContext { - /// The objects into which to import state from the source. - vm_objects: Arc, - - /// The logger associated with this migration. - log: slog::Logger, - - /// The migration's ID. - migration_id: Uuid, - - /// A handle to the task that's driving the migration. - migrate_task: tokio::task::JoinHandle>, - - /// Receives commands from the migration task. - command_rx: tokio::sync::mpsc::Receiver, - - /// Sends command responses to the migration task. - response_tx: tokio::sync::mpsc::Sender, -} - -/// The output of a call to [`build_vm`]. -pub(super) struct BuildVmOutput { - /// A reference to the VM objects created by the request to build a new VM. - pub vm_objects: Arc, - - /// If the VM is initializing via migration in, the context needed to - /// complete that migration. - pub migration_in: Option, -} - -/// Builds a new set of VM objects from the supplied ensure `request`. -/// -/// If the request asks to create a new VM without migrating, this routine -/// simply sets up the new VM's objects and returns them. -/// -/// Callers who ask to initialize a VM via live migration expect their API calls -/// to succeed as soon as there's an initialized VM and a running migration -/// task, even if the migration hasn't completed yet. To facilitate this, when -/// initializing via live migration, this routine executes only enough of the -/// live migration protocol to create VM objects, then immediately returns those -/// objects and a context the caller can use to finish the migration task. This -/// allows the caller to complete any external ensure calls it has pending -/// before completing migration and allowing the state driver to process state -/// change requests. -pub(super) async fn build_vm( - log: &slog::Logger, - parent: &Arc, - request: &InstanceSpecEnsureRequest, - options: &super::EnsureOptions, - input_queue: &Arc, - state_publisher: &mut StatePublisher, -) -> anyhow::Result>)> { - // If the caller didn't ask to initialize by live migration in, immediately - // create the VM objects and return them. 
- let Some(migrate_request) = &request.migrate else { - let input_objects = initialize_vm_objects_from_spec( - log, - input_queue, - &request.properties, - &request.instance_spec, - options, - ) - .await - .map_err(|e| (e, None))?; - - let vm_objects = Arc::new(VmObjects::new( - log.clone(), - parent.clone(), - input_objects, - )); - - return Ok(BuildVmOutput { vm_objects, migration_in: None }); - }; - - // The caller has asked to initialize by live migration in. Initialize VM - // objects at the live migration task's request. - // - // Begin by contacting the source Propolis and obtaining the connection that - // the actual migration task will need. - let migrate_ctx = crate::migrate::dest_initiate( - log, - migrate_request, - options.local_server_addr, - ) - .await - .map_err(|e| (e.into(), None))?; - - // Spin up a task to run the migration protocol proper. To avoid sending the - // entire VM context over to the migration task, create command and response - // channels to allow the migration task to delegate work back to this - // routine. - let log_for_task = log.clone(); - let (command_tx, mut command_rx) = tokio::sync::mpsc::channel(1); - let (response_tx, response_rx) = tokio::sync::mpsc::channel(1); - let migrate_task = tokio::spawn(async move { - crate::migrate::destination::migrate( - &log_for_task, - command_tx, - response_rx, - migrate_ctx.conn, - migrate_ctx.local_addr, - migrate_ctx.protocol, - ) - .await - }); - - // In the initial phases of live migration, the migration protocol decides - // whether the source and destination VMs have compatible configurations. If - // they do, the migration task asks this routine to initialize a VM on its - // behalf. Execute this part of the protocol now in order to create a set of - // VM objects to return. - // - // TODO(#706): Future versions of the protocol can extend this further, - // specifying an instance spec and/or an initial set of device payloads that - // the task should use to initialize its VM objects. - let init_command = command_rx.recv().await.ok_or_else(|| { - (anyhow::anyhow!("migration task unexpectedly closed channel"), None) - })?; - - let input_objects = 'init: { - let MigrateTargetCommand::InitializeFromExternalSpec = init_command - else { - error!(log, "migration protocol didn't init objects first"; - "command" => ?init_command); - break 'init Err(anyhow::anyhow!( - "migration protocol didn't init objects first" - )); - }; - - initialize_vm_objects_from_spec( - log, - input_queue, - &request.properties, - &request.instance_spec, - options, - ) - .await - .map_err(Into::into) - }; - - let vm_objects = match input_objects { - Ok(o) => Arc::new(VmObjects::new(log.clone(), parent.clone(), o)), - Err(e) => { - let _ = response_tx - .send(MigrateTargetResponse::VmObjectsInitialized(Err( - e.to_string() - ))) - .await; - state_publisher.update(ExternalStateUpdate::Migration( - MigrationStateUpdate { - id: migrate_ctx.migration_id, - state: MigrationState::Error, - role: MigrateRole::Source, - }, - )); - - return Err((e, None)); - } - }; - - // The VM's objects are initialized. Return them to the caller along with a - // continuation context that it can use to complete migration. - let migration_in = MigrateAsTargetContext { - vm_objects: vm_objects.clone(), - log: log.clone(), - migration_id: migrate_ctx.migration_id, - migrate_task, - command_rx, - response_tx, - }; - - Ok(BuildVmOutput { vm_objects, migration_in: Some(migration_in) }) -} - -/// Initializes a set of Propolis components from the supplied instance spec. 
-async fn initialize_vm_objects_from_spec( - log: &slog::Logger, - event_queue: &Arc, - properties: &InstanceProperties, - spec: &VersionedInstanceSpec, - options: &super::EnsureOptions, -) -> anyhow::Result { - info!(log, "initializing new VM"; - "spec" => #?spec, - "properties" => #?properties, - "use_reservoir" => options.use_reservoir, - "bootrom" => %options.toml_config.bootrom.display()); - - let vmm_log = log.new(slog::o!("component" => "vmm")); - - // Set up the 'shell' instance into which the rest of this routine will - // add components. - let VersionedInstanceSpec::V0(v0_spec) = &spec; - let machine = build_instance( - &properties.vm_name(), - v0_spec, - options.use_reservoir, - vmm_log, - )?; - - let mut init = MachineInitializer { - log: log.clone(), - machine: &machine, - devices: Default::default(), - block_backends: Default::default(), - crucible_backends: Default::default(), - spec: v0_spec, - properties, - toml_config: &options.toml_config, - producer_registry: options.oximeter_registry.clone(), - state: MachineInitializerState::default(), - }; - - init.initialize_rom(options.toml_config.bootrom.as_path())?; - let chipset = init.initialize_chipset( - &(event_queue.clone() - as Arc), - )?; - - init.initialize_rtc(&chipset)?; - init.initialize_hpet()?; - - let com1 = Arc::new(init.initialize_uart(&chipset)?); - let ps2ctrl = init.initialize_ps2(&chipset)?; - init.initialize_qemu_debug_port()?; - init.initialize_qemu_pvpanic(properties.into())?; - init.initialize_network_devices(&chipset)?; - - #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&options.toml_config.devices)?; - #[cfg(feature = "omicron-build")] - info!(log, "`omicron-build` feature enabled, ignoring any test devices"); - - #[cfg(feature = "falcon")] - init.initialize_softnpu_ports(&chipset)?; - #[cfg(feature = "falcon")] - init.initialize_9pfs(&chipset)?; - - init.initialize_storage_devices(&chipset, options.nexus_client.clone()) - .await?; - - let ramfb = init.initialize_fwcfg(v0_spec.devices.board.cpus)?; - init.initialize_cpus()?; - let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new( - &machine, - event_queue.clone() as Arc, - log.new(slog::o!("component" => "vcpu_tasks")), - )?); - - let MachineInitializer { - devices, block_backends, crucible_backends, .. - } = init; - - Ok(InputVmObjects { - instance_spec: v0_spec.clone(), - vcpu_tasks, - machine, - lifecycle_components: devices, - block_backends, - crucible_backends, - com1, - framebuffer: Some(ramfb), - ps2ctrl, - }) -} - -impl MigrateAsTargetContext { - /// Runs a partially-completed inbound live migration to completion. - pub(super) async fn run( - mut self, - state_publisher: &mut StatePublisher, - ) -> Result<(), MigrateError> { - // The migration task imports device state by operating directly on the - // newly-created VM objects. Before sending them to the task, make sure - // the objects are ready to have state imported into them. Specifically, - // ensure that the VM's vCPUs are activated so they can enter the guest - // after migration and pause the kernel VM to allow it to import device - // state consistently. - // - // Drop the lock after this operation so that the migration task can - // acquire it to enumerate devices and import state into them. 
- { - let guard = self.vm_objects.lock_shared().await; - guard.reset_vcpus(); - guard.pause_kernel_vm(); - } - - self.response_tx - .send(MigrateTargetResponse::VmObjectsInitialized(Ok(self - .vm_objects - .clone()))) - .await - .expect("migration task shouldn't exit while awaiting driver"); - - loop { - let action = next_migrate_task_event( - &mut self.migrate_task, - &mut self.command_rx, - &self.log, - ) - .await; - - match action { - MigrateTaskEvent::TaskExited(res) => match res { - Ok(()) => { - return Ok(()); - } - Err(e) => { - error!(self.log, "target migration task failed"; - "error" => %e); - - self.vm_objects - .lock_exclusive() - .await - .resume_kernel_vm(); - return Err(e); - } - }, - MigrateTaskEvent::Command( - MigrateTargetCommand::UpdateState(state), - ) => { - state_publisher.update(ExternalStateUpdate::Migration( - MigrationStateUpdate { - state, - id: self.migration_id, - role: MigrateRole::Destination, - }, - )); - } - MigrateTaskEvent::Command( - MigrateTargetCommand::InitializeFromExternalSpec, - ) => { - panic!("already received initialize-from-spec command"); - } - } - } - } -} From 749ee17cd3f555e4711e79cd5488f13979c791a8 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 17 Jul 2024 17:55:56 +0000 Subject: [PATCH 51/55] remove extra fn call now that block_on isn't needed --- lib/propolis/src/block/crucible.rs | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index 9825a8c76..dc82559d8 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -87,27 +87,10 @@ impl CrucibleBackend { nexus_client: Option, log: slog::Logger, ) -> io::Result> { - CrucibleBackend::_create( - request, - opts, - producer_registry, - nexus_client, - log, - ) - .await - .map_err(CrucibleError::into) - } - - async fn _create( - request: VolumeConstructionRequest, - opts: block::BackendOpts, - producer_registry: Option, - nexus_client: Option, - log: slog::Logger, - ) -> Result, crucible::CrucibleError> { // Construct the volume. - let volume = - Volume::construct(request, producer_registry, log.clone()).await?; + let volume = Volume::construct(request, producer_registry, log.clone()) + .await + .map_err(|e| io::Error::from(CrucibleError::from(e)))?; // Decide if we need to scrub this volume or not. 
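        // (Scrubbing copies data that still lives only in the volume's
        // read-only parent into the volume itself, so that the parent can
        // eventually be detached.)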
if volume.has_read_only_parent() { From c9b1bf6bea02d1d98b7d083900a50483cb14f068 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 17 Jul 2024 18:10:21 +0000 Subject: [PATCH 52/55] PR feedback: style fixes --- .../src/lib/migrate/destination.rs | 6 +- bin/propolis-server/src/lib/vm/guest_event.rs | 7 +- bin/propolis-server/src/lib/vm/mod.rs | 110 ++++++++++-------- bin/propolis-server/src/lib/vm/objects.rs | 24 ++-- 4 files changed, 75 insertions(+), 72 deletions(-) diff --git a/bin/propolis-server/src/lib/migrate/destination.rs b/bin/propolis-server/src/lib/migrate/destination.rs index 6d683c9a6..7f23beb15 100644 --- a/bin/propolis-server/src/lib/migrate/destination.rs +++ b/bin/propolis-server/src/lib/migrate/destination.rs @@ -119,8 +119,8 @@ pub(crate) async fn initiate( Ok(None) => { let offered = super::protocol::make_protocol_offer(); error!(log, "source selected protocol not on offer"; - "offered" => &offered, - "selected" => &src_selected); + "offered" => &offered, + "selected" => &src_selected); return Err(MigrateError::NoMatchingProtocol( src_selected, @@ -129,7 +129,7 @@ pub(crate) async fn initiate( } Err(e) => { error!(log, "source selected protocol failed to parse"; - "selected" => &src_selected); + "selected" => &src_selected); return Err(MigrateError::ProtocolParse( src_selected, diff --git a/bin/propolis-server/src/lib/vm/guest_event.rs b/bin/propolis-server/src/lib/vm/guest_event.rs index b95d9c14b..073858500 100644 --- a/bin/propolis-server/src/lib/vm/guest_event.rs +++ b/bin/propolis-server/src/lib/vm/guest_event.rs @@ -14,11 +14,12 @@ use std::{collections::VecDeque, time::Duration}; /// vCPUs running in-kernel are kicked out for the suspend state. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(super) enum GuestEvent { - /// VM entered halt state + /// Fired when the bhyve VM enters its halt state. VcpuSuspendHalt(Duration), - /// VM entered reboot state + /// Fired when the bhyve VM enters its reset state. VcpuSuspendReset(Duration), - /// vCPU encounted triple-fault + /// Fired when the bhyve VM resets due to a triple fault. The first element + /// identifies the vCPU that sent this notification. VcpuSuspendTripleFault(i32, Duration), /// Chipset signaled halt condition ChipsetHalt, diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index c40d47685..9b51a51d0 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -31,13 +31,14 @@ //! In the happy case where new VMs always start successfully, this state //! machine transitions as follows: //! -//! - New state machines start in the `NoVm` state. -//! - A request to create a new VM moves to the `WaitingForInit` state. -//! - Once all of the VM's components are created, the VM moves to `Active`. -//! - When the VM stops, the VM moves to `Rundown`. +//! - New state machines start in [`VmState::NoVm`]. +//! - A request to create a new VM moves to [`VmState::WaitingForInit`]. +//! - Once all of the VM's components are created, the VM moves to +//! [`VmState::Active`]. +//! - When the VM stops, the VM moves to [`VmState::Rundown`]. //! - When all references to the VM's components are dropped, the VM moves to -//! `RundownComplete`. A request to create a new VM will move back to -//! `WaitingForInit`. +//! [`VmState::RundownComplete`]. A request to create a new VM will move back +//! to `WaitingForInit`. //! //! In any state except `NoVm`, the state machine holds enough state to describe //! 
the most recent VM known to the state machine, whether it is being created @@ -93,7 +94,7 @@ use rfb::server::VncServer; use slog::info; use state_driver::StateDriverOutput; use state_publisher::StatePublisher; -use tokio::sync::{RwLock, RwLockReadGuard}; +use tokio::sync::{oneshot, watch, RwLock, RwLockReadGuard}; use crate::{server::MetricsEndpointConfig, vnc::PropolisVncServer}; @@ -121,12 +122,11 @@ pub(crate) type CrucibleBackendMap = /// Type alias for the sender side of the channel that receives /// externally-visible instance state updates. -type InstanceStateTx = tokio::sync::watch::Sender; +type InstanceStateTx = watch::Sender; /// Type alias for the receiver side of the channel that receives /// externally-visible instance state updates. -type InstanceStateRx = - tokio::sync::watch::Receiver; +type InstanceStateRx = watch::Receiver; /// Type alias for the results sent by the state driver in response to a request /// to change a Crucible backend's configuration. @@ -136,12 +136,12 @@ pub(crate) type CrucibleReplaceResult = /// Type alias for the sender side of a channel that receives Crucible backend /// reconfiguration results. pub(crate) type CrucibleReplaceResultTx = - tokio::sync::oneshot::Sender; + oneshot::Sender; /// Type alias for the sender side of a channel that receives the results of /// instance-ensure API calls. type InstanceEnsureResponseTx = - tokio::sync::oneshot::Sender>; + oneshot::Sender>; /// The minimum number of threads to spawn in the Tokio runtime that runs the /// state driver and any other VM-related tasks. @@ -187,7 +187,7 @@ pub(crate) struct Vm { /// Routines that need to read VM properties or obtain a `VmObjects` handle /// acquire this lock shared. /// - /// Routines that drive the VM state machine acquire this lock exclusive. + /// Routines that drive the VM state machine acquire this lock exclusively. inner: RwLock, /// A logger for this VM. @@ -260,29 +260,29 @@ impl std::fmt::Display for VmState { pub(super) struct EnsureOptions { /// A reference to the VM configuration specified in the config TOML passed /// to this propolis-server process. - pub toml_config: Arc, + pub(super) toml_config: Arc, /// True if VMs should allocate memory from the kernel VMM reservoir. - pub use_reservoir: bool, + pub(super) use_reservoir: bool, /// Configuration used to serve Oximeter metrics from this server. - pub metrics_config: Option, + pub(super) metrics_config: Option, /// An Oximeter producer registry to pass to components that will emit /// Oximeter metrics. - pub oximeter_registry: Option, + pub(super) oximeter_registry: Option, /// A Nexus client handle to pass to components that can make upcalls to /// Nexus. - pub nexus_client: Option, + pub(super) nexus_client: Option, /// A reference to the process's VNC server, used to connect the server to /// a new VM's framebuffer. - pub vnc_server: Arc>, + pub(super) vnc_server: Arc>, /// The address of this Propolis process, used by the live migration /// protocol to transfer serial console connections. - pub local_server_addr: SocketAddr, + pub(super) local_server_addr: SocketAddr, } impl Vm { @@ -312,29 +312,33 @@ impl Vm { /// recently wrapped by this `Vm`. 
pub(super) async fn get(&self) -> Result { let guard = self.inner.read().await; - let vm = match &guard.state { - VmState::NoVm => { - return Err(VmError::NotCreated); - } - VmState::Active(vm) => vm, - VmState::WaitingForInit(vm) - | VmState::Rundown(vm) - | VmState::RundownComplete(vm) => { - return Ok(InstanceSpecGetResponse { + match &guard.state { + // If no VM has ever been created, there's nothing to get. + VmState::NoVm => Err(VmError::NotCreated), + + // If the VM is active, pull the required data out of its objects. + VmState::Active(vm) => { + let spec = + vm.objects().lock_shared().await.instance_spec().clone(); + let state = vm.external_state_rx.borrow().clone(); + Ok(InstanceSpecGetResponse { properties: vm.properties.clone(), - state: vm.external_state_rx.borrow().state, - spec: VersionedInstanceSpec::V0(vm.spec.clone()), - }); + spec: VersionedInstanceSpec::V0(spec), + state: state.state, + }) } - }; - let spec = vm.objects().lock_shared().await.instance_spec().clone(); - let state = vm.external_state_rx.borrow().clone(); - Ok(InstanceSpecGetResponse { - properties: vm.properties.clone(), - spec: VersionedInstanceSpec::V0(spec), - state: state.state, - }) + // If the VM is not active yet, or there is only a + // previously-run-down VM, return the state saved in the state + // machine. + VmState::WaitingForInit(vm) + | VmState::Rundown(vm) + | VmState::RundownComplete(vm) => Ok(InstanceSpecGetResponse { + properties: vm.properties.clone(), + state: vm.external_state_rx.borrow().state, + spec: VersionedInstanceSpec::V0(vm.spec.clone()), + }), + } } /// Yields a handle to the most recent instance state receiver wrapped by @@ -381,8 +385,9 @@ impl Vm { tokio_rt: vm.tokio_rt.expect("WaitingForInit has runtime"), }); } - _ => unreachable!( - "only a starting VM's state worker calls make_active" + state => unreachable!( + "only a starting VM's state worker calls make_active \ + (current state: {state})" ), } } @@ -403,9 +408,9 @@ impl Vm { VmState::WaitingForInit(vm) => { guard.state = VmState::RundownComplete(vm) } - _ => unreachable!( + state => unreachable!( "start failures should only occur before an active VM is \ - installed" + installed (current state: {state})" ), } } @@ -421,10 +426,13 @@ impl Vm { info!(self.log, "setting VM rundown"); let services = { let mut guard = self.inner.write().await; - let VmState::Active(vm) = - std::mem::replace(&mut guard.state, VmState::NoVm) - else { - panic!("VM should be active before being run down"); + let old = std::mem::replace(&mut guard.state, VmState::NoVm); + let vm = match old { + VmState::Active(vm) => vm, + state => panic!( + "VM should be active before being run down (current state: \ + {state})" + ), }; let spec = vm.objects().lock_shared().await.instance_spec().clone(); @@ -458,7 +466,9 @@ impl Vm { guard.state = VmState::RundownComplete(vm); rt } - _ => unreachable!("VM rundown completed from invalid prior state"), + state => unreachable!( + "VM rundown completed from invalid prior state {state}" + ), }; let StateDriverOutput { mut state_publisher, final_state } = guard @@ -497,7 +507,7 @@ impl Vm { // until that task has disposed of the initialization request. Create a // channel to allow the state driver task to send back an ensure result // at the appropriate moment. - let (ensure_reply_tx, ensure_rx) = tokio::sync::oneshot::channel(); + let (ensure_reply_tx, ensure_rx) = oneshot::channel(); // The external state receiver needs to exist as soon as this routine // returns, so create the appropriate channel here. 
The sender side of diff --git a/bin/propolis-server/src/lib/vm/objects.rs b/bin/propolis-server/src/lib/vm/objects.rs index 1e031743e..8fff5bbb9 100644 --- a/bin/propolis-server/src/lib/vm/objects.rs +++ b/bin/propolis-server/src/lib/vm/objects.rs @@ -334,7 +334,7 @@ impl VmObjectsLocked { info!(self.log, "starting block backend {}", name); let res = backend.start().await; if let Err(e) = &res { - error!(self.log, "Startup failed for {}: {:?}", name, e); + error!(self.log, "startup failed for {}: {:?}", name, e); return res; } } @@ -362,10 +362,9 @@ impl VmObjectsLocked { cx: &mut Context<'_>, ) -> Poll { let mut_self = self.get_mut(); - match Pin::new(&mut mut_self.future).poll(cx) { - Poll::Pending => Poll::Pending, - Poll::Ready(()) => Poll::Ready(mut_self.name.clone()), - } + Pin::new(&mut mut_self.future) + .poll(cx) + .map(|_| mut_self.name.clone()) } } @@ -379,18 +378,11 @@ impl VmObjectsLocked { }) .collect(); - loop { - match stream.next().await { - Some(name) => { - info!(self.log, "dev {} completed pause", name); - } - - None => { - info!(self.log, "all devices paused"); - break; - } - } + while let Some(name) = stream.next().await { + info!(self.log, "dev {} completed pause", name); } + + info!(self.log, "all devices paused"); } /// Resumes all of a VM's devices. From f168f307e23f3134a521a3f65babb4eb380a1f7c Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 17 Jul 2024 23:13:05 +0000 Subject: [PATCH 53/55] remove block_in_place from input queue --- bin/propolis-server/src/lib/vm/guest_event.rs | 4 -- .../src/lib/vm/request_queue.rs | 3 +- .../src/lib/vm/state_driver.rs | 70 +++++++++++-------- 3 files changed, 43 insertions(+), 34 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/guest_event.rs b/bin/propolis-server/src/lib/vm/guest_event.rs index 073858500..b34135a30 100644 --- a/bin/propolis-server/src/lib/vm/guest_event.rs +++ b/bin/propolis-server/src/lib/vm/guest_event.rs @@ -64,8 +64,4 @@ impl GuestEventQueue { pub(super) fn pop_front(&mut self) -> Option { self.queue.pop_front() } - - pub(super) fn is_empty(&self) -> bool { - self.queue.is_empty() - } } diff --git a/bin/propolis-server/src/lib/vm/request_queue.rs b/bin/propolis-server/src/lib/vm/request_queue.rs index 3b0fcbe0d..56c630258 100644 --- a/bin/propolis-server/src/lib/vm/request_queue.rs +++ b/bin/propolis-server/src/lib/vm/request_queue.rs @@ -230,11 +230,12 @@ impl ExternalRequestQueue { } /// Indicates whether the queue is empty. + #[cfg(test)] pub fn is_empty(&self) -> bool { self.queue.is_empty() } - /// Asks to place the supplied request on the queue. If the requests is + /// Asks to place the supplied request on the queue. If the request is /// enqueued, updates the dispositions to use for future requests. pub fn try_queue( &mut self, diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index a749bd00e..1ce73be01 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -5,7 +5,7 @@ //! A task to handle requests to change a VM's state or configuration. use std::{ - sync::{Arc, Condvar, Mutex}, + sync::{Arc, Mutex}, time::Duration, }; @@ -17,6 +17,7 @@ use propolis_api_types::{ InstanceSpecEnsureRequest, InstanceState, MigrationState, }; use slog::{error, info}; +use tokio::sync::Notify; use uuid::Uuid; use crate::{ @@ -79,8 +80,16 @@ impl InputQueueInner { /// A queue for external state change requests and guest-driven state changes. 
pub(super) struct InputQueue { + /// Contains the input queue's sub-queues, one for external state change + /// requests and one for events emitted by the VM. inner: Mutex, - cv: Condvar, + + /// Notifies the state driver that a new event is present on the queue. + /// + /// Notifiers must use [`Notify::notify_one`] when signaling this `Notify` + /// to guarantee the state driver does not miss incoming messages. See the + /// comments in [`InputQueue::wait_for_next_event`]. + notify: Notify, } impl InputQueue { @@ -91,7 +100,7 @@ impl InputQueue { ) -> Self { Self { inner: Mutex::new(InputQueueInner::new(log, auto_start)), - cv: Condvar::new(), + notify: Notify::new(), } } @@ -101,26 +110,29 @@ impl InputQueue { /// External requests and guest events are stored in separate queues. If /// both queues have events when this routine is called, the guest event /// queue takes precedence. - fn wait_for_next_event(&self) -> InputQueueEvent { - // `block_in_place` is required to avoid blocking the executor while - // waiting on the condvar. - tokio::task::block_in_place(|| { - let guard = self.inner.lock().unwrap(); - let mut guard = self - .cv - .wait_while(guard, |i| { - i.external_requests.is_empty() && i.guest_events.is_empty() - }) - .unwrap(); - - if let Some(guest_event) = guard.guest_events.pop_front() { - InputQueueEvent::GuestEvent(guest_event) - } else { - InputQueueEvent::ExternalRequest( - guard.external_requests.pop_front().unwrap(), - ) + /// + /// # Synchronization + /// + /// This routine assumes that it is only ever called by one task (the state + /// driver). If multiple threads call this routine simultaneously, they may + /// miss wakeups and not return when new events are pushed to the queue or + /// cause a panic (see below). + async fn wait_for_next_event(&self) -> InputQueueEvent { + loop { + { + let mut guard = self.inner.lock().unwrap(); + if let Some(guest_event) = guard.guest_events.pop_front() { + return InputQueueEvent::GuestEvent(guest_event); + } else if let Some(req) = guard.external_requests.pop_front() { + return InputQueueEvent::ExternalRequest(req); + } } - }) + + // Notifiers in this module must use `notify_one` so that their + // notifications will not be lost if they arrive after this routine + // checks the queues but before it actually polls the notify. 
+ self.notify.notified().await; + } } /// Notifies the external request queue that the instance's state has @@ -142,7 +154,7 @@ impl InputQueue { let mut inner = self.inner.lock().unwrap(); let result = inner.external_requests.try_queue(request); if result.is_ok() { - self.cv.notify_one(); + self.notify.notify_one(); } result } @@ -155,7 +167,7 @@ impl guest_event::VcpuEventHandler for InputQueue { .guest_events .enqueue(guest_event::GuestEvent::VcpuSuspendHalt(when)) { - self.cv.notify_all(); + self.notify.notify_one(); } } @@ -165,7 +177,7 @@ impl guest_event::VcpuEventHandler for InputQueue { .guest_events .enqueue(guest_event::GuestEvent::VcpuSuspendReset(when)) { - self.cv.notify_all(); + self.notify.notify_one(); } } @@ -174,7 +186,7 @@ impl guest_event::VcpuEventHandler for InputQueue { if guard.guest_events.enqueue( guest_event::GuestEvent::VcpuSuspendTripleFault(vcpu_id, when), ) { - self.cv.notify_all(); + self.notify.notify_one(); } } @@ -195,14 +207,14 @@ impl guest_event::ChipsetEventHandler for InputQueue { fn chipset_halt(&self) { let mut guard = self.inner.lock().unwrap(); if guard.guest_events.enqueue(guest_event::GuestEvent::ChipsetHalt) { - self.cv.notify_all(); + self.notify.notify_one(); } } fn chipset_reset(&self) { let mut guard = self.inner.lock().unwrap(); if guard.guest_events.enqueue(guest_event::GuestEvent::ChipsetReset) { - self.cv.notify_all(); + self.notify.notify_one(); } } } @@ -360,7 +372,7 @@ impl StateDriver { async fn run_loop(&mut self) -> InstanceState { info!(self.log, "state driver entered main loop"); loop { - let event = self.input_queue.wait_for_next_event(); + let event = self.input_queue.wait_for_next_event().await; info!(self.log, "state driver handling event"; "event" => ?event); let outcome = match event { From c4c80f2e67d936945d256ea5393ae067069e679a Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 18 Jul 2024 01:45:43 +0000 Subject: [PATCH 54/55] rename async block_until_joined to join_all --- lib/propolis/src/block/crucible.rs | 2 +- lib/propolis/src/block/mem_async.rs | 2 +- lib/propolis/src/tasks.rs | 10 +++++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index dc82559d8..241019c60 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -290,7 +290,7 @@ impl block::Backend for CrucibleBackend { } async fn stop(&self) -> () { self.state.attachment.stop(); - self.workers.block_until_joined().await; + self.workers.join_all().await; } } diff --git a/lib/propolis/src/block/mem_async.rs b/lib/propolis/src/block/mem_async.rs index e6683563e..ff733c56e 100644 --- a/lib/propolis/src/block/mem_async.rs +++ b/lib/propolis/src/block/mem_async.rs @@ -223,6 +223,6 @@ impl block::Backend for MemAsyncBackend { async fn stop(&self) -> () { self.work_state.attachment.stop(); - self.workers.block_until_joined().await; + self.workers.join_all().await; } } diff --git a/lib/propolis/src/tasks.rs b/lib/propolis/src/tasks.rs index 12076e6ab..536742eea 100644 --- a/lib/propolis/src/tasks.rs +++ b/lib/propolis/src/tasks.rs @@ -434,9 +434,13 @@ impl TaskGroup { guard.extend(tasks); } - /// Block until all held tasks have been joined, returning any resulting - /// [task::JoinError]s after doing so. - pub async fn block_until_joined(&self) -> Option> { + /// Waits until all the workers in this task group have completed. + /// + /// # Return value + /// + /// `None` if all the tasks completed successfully. 
`Some` if at least one + /// task failed; the wrapped value is a `Vec` of all of the returned errors. + pub async fn join_all(&self) -> Option> { let workers = { let mut guard = self.0.lock().unwrap(); std::mem::replace(&mut *guard, Vec::new()) From e110a65692281ea70332268564c5b2d6a88f6c75 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 18 Jul 2024 16:17:04 +0000 Subject: [PATCH 55/55] explain more clearly why it's safe not to use enable() --- bin/propolis-server/src/lib/vm/state_driver.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 1ce73be01..a5fa9cdae 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -128,9 +128,13 @@ impl InputQueue { } } - // Notifiers in this module must use `notify_one` so that their - // notifications will not be lost if they arrive after this routine - // checks the queues but before it actually polls the notify. + // It's safe not to use `Notified::enable` here because (1) only one + // thread (the state driver) can call `wait_for_next_event` on a + // given input queue, and (2) all the methods of signaling the queue + // use `notify_one`, which buffers a permit if no one is waiting + // when the signal arrives. This means that if a notification is + // sent after the lock is dropped but before `notified()` is called + // here, the ensuing wait will be satisfied immediately. self.notify.notified().await; } }
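As a standalone illustration of the notification pattern that the last two
patches settle on, here is a minimal sketch of a single-consumer queue built
on tokio's `Notify`. The `EventQueue` type and its `push`/`pop` methods are
invented for this example and are not part of the Propolis source; they only
mirror the shape of `InputQueue` described above.

    use std::collections::VecDeque;
    use std::sync::Mutex;

    use tokio::sync::Notify;

    /// Illustrative single-consumer queue with the same shape as `InputQueue`.
    struct EventQueue<T> {
        inner: Mutex<VecDeque<T>>,
        notify: Notify,
    }

    impl<T> EventQueue<T> {
        fn new() -> Self {
            Self { inner: Mutex::new(VecDeque::new()), notify: Notify::new() }
        }

        /// Producers may call this from any task or thread. `notify_one`
        /// stores a permit if the consumer is not currently waiting, so a
        /// wakeup sent between the consumer's polls is buffered, not dropped.
        fn push(&self, item: T) {
            self.inner.lock().unwrap().push_back(item);
            self.notify.notify_one();
        }

        /// Must only ever be called by a single consumer task. The queue is
        /// re-checked before every await, so an item pushed after the check
        /// but before `notified()` is polled is picked up either via the
        /// buffered permit or on the next loop iteration.
        async fn pop(&self) -> T {
            loop {
                // The guard is a temporary and is dropped before the await.
                if let Some(item) = self.inner.lock().unwrap().pop_front() {
                    return item;
                }
                self.notify.notified().await;
            }
        }
    }

Because `notify_one` buffers a permit when no task is waiting, a `push` that
lands after the queue check but before the `notified().await` is not lost;
that is the property the final patch's comment relies on, and it holds only
while exactly one task consumes the queue.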