From 35dfd9abc6788a19e78e4723faf086a6e8bc79ed Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Wed, 3 Apr 2024 18:56:10 -0400
Subject: [PATCH] Prevent unintentional Upstairs takeovers (#5221)

Volumes are "checked out" from Nexus for many reasons, some of which
include sending to another service for use in `Volume::construct`. When
that service activates the resulting Volume, this will forcibly take
over any existing downstairs connections, based on the Upstairs'
generation number. This is intentional: it was designed so that Nexus,
in handing out Volumes with increasing generation numbers, can be sure
that the resulting Volume works no matter what (for example, even if a
previous Upstairs is wedged somehow, or if the service running the
previous Upstairs is no longer accepting network connections).

Up until now, Nexus wouldn't allow checking out a Volume if there was
any chance a Propolis could be running that might use that Volume. This
meant restricting certain operations, like creating a snapshot when a
disk is attached to an instance that is stopped: any action Nexus took
to attempt a snapshot using a Pantry would race with a user's request
to start that instance, and if the Volume checkouts occurred in the
wrong order the Pantry would take over connections from Propolis,
resulting in guest OS errors.

Nexus _can_ do this safely though: it has all the information required
to know when a checkout is safe to do, and when it may not be safe.
This commit adds checks to the Volume checkout transaction that are
based on the reason the checkout is occurring, and requires call sites
performing a checkout to say why they are. Because these checks are
performed inside a transaction, Nexus can say for sure when it is safe
to allow a Volume to be checked out for a certain reason.

For example, in the scenario of taking a snapshot of a disk attached to
an instance that is stopped, there are two checkout operations that
have the possibility of racing:

1) the one that Nexus will send to a Pantry during a snapshot create
   saga

2) the one that Nexus will send to a Propolis during an instance start
   saga

If 1 occurs before 2, then Propolis will take over the downstairs
connections that the Pantry has established, and the snapshot create
saga will fail, but the guest OS for that Propolis will not see any
errors. If 2 occurs before 1, then checkout 1 will fail due to one of
the conditions added in this commit: the checkout is being performed
for use with a Pantry, and a Propolis _may_ exist, so the checkout
attempt is rejected.

Fixes #3289.
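To make the new contract concrete for reviewers, below is a minimal
sketch of what a Pantry-side call site looks like after this change.
Only `volume_checkout` and `VolumeCheckoutReason` come from this diff;
the wrapper function, its name, and its arguments are illustrative
assumptions, not code in this patch.

```rust
// Sketch only: shows the call-site shape introduced by this patch, where
// every checkout must state why it is happening. `checkout_for_pantry`,
// `datastore`, and `disk` are placeholders assumed for the example.
use nexus_db_queries::db::datastore::VolumeCheckoutReason;
use nexus_db_queries::db::model::{Disk, Volume};
use nexus_db_queries::db::DataStore;
use omicron_common::api::external::Error;

async fn checkout_for_pantry(
    datastore: &DataStore,
    disk: &Disk,
) -> Result<Volume, Error> {
    // For a Pantry checkout, the transaction now rejects the request if the
    // backing disk is attached to an instance with an active Propolis id,
    // instead of letting the Pantry's Upstairs take over that Propolis'
    // downstairs connections.
    datastore
        .volume_checkout(disk.volume_id, VolumeCheckoutReason::Pantry)
        .await
}
```

When the checkout condition fails, the caller now sees
`Error::Conflict` (HTTP 409) rather than the 503 returned by the old
sled-based check, which is what the updated snapshot_create test
asserts.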
--- nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-queries/src/db/datastore/mod.rs | 1 + nexus/db-queries/src/db/datastore/volume.rs | 394 +++++++++++++++++- nexus/src/app/image.rs | 5 +- nexus/src/app/instance.rs | 23 +- nexus/src/app/sagas/common_storage.rs | 5 +- nexus/src/app/sagas/disk_create.rs | 10 +- nexus/src/app/sagas/finalize_disk.rs | 3 +- nexus/src/app/sagas/instance_migrate.rs | 7 +- nexus/src/app/sagas/instance_start.rs | 2 + nexus/src/app/sagas/snapshot_create.rs | 140 ++++--- nexus/src/app/snapshot.rs | 31 +- nexus/tests/integration_tests/snapshots.rs | 103 +++++ .../integration_tests/volume_management.rs | 144 ++++++- .../add-lookup-disk-by-volume-id-index/up.sql | 4 + schema/crdb/dbinit.sql | 7 +- 16 files changed, 764 insertions(+), 118 deletions(-) create mode 100644 schema/crdb/add-lookup-disk-by-volume-id-index/up.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 64ddc7c451..853db4195a 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(49, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(50, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(50, "add-lookup-disk-by-volume-id-index"), KnownVersion::new(49, "physical-disk-state-and-policy"), KnownVersion::new(48, "add-metrics-producers-time-modified-index"), KnownVersion::new(47, "add-view-for-bgp-peer-configs"), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index b40b641202..c753ac5436 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -114,6 +114,7 @@ pub use virtual_provisioning_collection::StorageType; pub use volume::read_only_resources_associated_with_volume; pub use volume::CrucibleResources; pub use volume::CrucibleTargets; +pub use volume::VolumeCheckoutReason; // Number of unique datasets required to back a region. // TODO: This should likely turn into a configuration option. 
diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index a9646b9ef6..0e80ee3e3c 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -11,8 +11,10 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; +use crate::db::model::Disk; use crate::db::model::DownstairsClientStopRequestNotification; use crate::db::model::DownstairsClientStoppedNotification; +use crate::db::model::Instance; use crate::db::model::Region; use crate::db::model::RegionSnapshot; use crate::db::model::UpstairsRepairNotification; @@ -25,6 +27,7 @@ use anyhow::bail; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::OptionalExtension; +use nexus_types::identity::Resource; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; @@ -44,6 +47,40 @@ use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; +#[derive(Debug, Clone, Copy)] +pub enum VolumeCheckoutReason { + /// Check out a read-only Volume. + ReadOnlyCopy, + + /// Check out a Volume to modify and store back to the database. + CopyAndModify, + + /// Check out a Volume to send to Propolis to start an instance. + InstanceStart { vmm_id: Uuid }, + + /// Check out a Volume to send to a migration destination Propolis. + InstanceMigrate { vmm_id: Uuid, target_vmm_id: Uuid }, + + /// Check out a Volume to send to a Pantry (for background maintenance + /// operations). + Pantry, +} + +#[derive(Debug, thiserror::Error)] +enum VolumeGetError { + #[error("Serde error during volume_checkout: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Updated {0} database rows, expected {1}")] + UnexpectedDatabaseUpdate(usize, usize), + + #[error("Checkout condition failed: {0}")] + CheckoutConditionFailed(String), + + #[error("Invalid Volume: {0}")] + InvalidVolume(String), +} + impl DataStore { pub async fn volume_create(&self, volume: Volume) -> CreateResult { use db::schema::volume::dsl; @@ -194,6 +231,244 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + async fn volume_checkout_allowed( + reason: &VolumeCheckoutReason, + vcr: &VolumeConstructionRequest, + maybe_disk: Option, + maybe_instance: Option, + ) -> Result<(), VolumeGetError> { + match reason { + VolumeCheckoutReason::ReadOnlyCopy => { + // When checking out to make a copy (usually for use as a + // read-only parent), the volume must be read only. Even if a + // call-site that uses Copy sends this copied Volume to a + // Propolis or Pantry, the Upstairs that will be created will be + // read-only, and will not take over from other read-only + // Upstairs. + + match volume_is_read_only(&vcr) { + Ok(read_only) => { + if !read_only { + return Err(VolumeGetError::CheckoutConditionFailed( + String::from("Non-read-only Volume Checkout for use Copy!") + )); + } + + Ok(()) + } + + Err(e) => Err(VolumeGetError::InvalidVolume(e.to_string())), + } + } + + VolumeCheckoutReason::CopyAndModify => { + // `CopyAndModify` is used when taking a read/write Volume, + // modifying it (for example, when taking a snapshot, to point + // to read-only resources), and committing it back to the DB. + // This is a checkout of a read/write Volume, so creating an + // Upstairs from it *may* take over from something else. 
The + // call-site must ensure this doesn't happen, but we can't do + // that here. + + Ok(()) + } + + VolumeCheckoutReason::InstanceStart { vmm_id } => { + // Check out this volume to send to Propolis to start an + // Instance. The VMM id in the enum must match the instance's + // propolis_id. + + let Some(instance) = &maybe_instance else { + return Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceStart {}: instance does not exist", + vmm_id + ), + )); + }; + + let runtime = instance.runtime(); + match (runtime.propolis_id, runtime.dst_propolis_id) { + (Some(_), Some(_)) => { + Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceStart {}: instance {} is undergoing migration", + vmm_id, + instance.id(), + ) + )) + } + + (None, None) => { + Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceStart {}: instance {} has no propolis ids", + vmm_id, + instance.id(), + ) + )) + } + + (Some(propolis_id), None) => { + if propolis_id != *vmm_id { + return Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceStart {}: instance {} propolis id {} mismatch", + vmm_id, + instance.id(), + propolis_id, + ) + )); + } + + Ok(()) + } + + (None, Some(dst_propolis_id)) => { + Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceStart {}: instance {} has no propolis id but dst propolis id {}", + vmm_id, + instance.id(), + dst_propolis_id, + ) + )) + } + } + } + + VolumeCheckoutReason::InstanceMigrate { vmm_id, target_vmm_id } => { + // Check out this volume to send to destination Propolis to + // migrate an Instance. Only take over from the specified source + // VMM. + + let Some(instance) = &maybe_instance else { + return Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance does not exist", + vmm_id, target_vmm_id + ), + )); + }; + + let runtime = instance.runtime(); + match (runtime.propolis_id, runtime.dst_propolis_id) { + (Some(propolis_id), Some(dst_propolis_id)) => { + if propolis_id != *vmm_id || dst_propolis_id != *target_vmm_id { + return Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} propolis id mismatches {} {}", + vmm_id, + target_vmm_id, + instance.id(), + propolis_id, + dst_propolis_id, + ) + )); + } + + Ok(()) + } + + (None, None) => { + Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} has no propolis ids", + vmm_id, + target_vmm_id, + instance.id(), + ) + )) + } + + (Some(propolis_id), None) => { + // XXX is this right? + if propolis_id != *vmm_id { + return Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} propolis id {} mismatch", + vmm_id, + target_vmm_id, + instance.id(), + propolis_id, + ) + )); + } + + Ok(()) + } + + (None, Some(dst_propolis_id)) => { + Err(VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} has no propolis id but dst propolis id {}", + vmm_id, + target_vmm_id, + instance.id(), + dst_propolis_id, + ) + )) + } + } + } + + VolumeCheckoutReason::Pantry => { + // Check out this Volume to send to a Pantry, which will create + // a read/write Upstairs, for background maintenance operations. + // There must not be any Propolis, otherwise this will take over + // from that and cause errors for guest OSes. + + let Some(disk) = maybe_disk else { + // This volume isn't backing a disk, it won't take over from + // a Propolis' Upstairs. 
+ return Ok(()); + }; + + let Some(attach_instance_id) = + disk.runtime().attach_instance_id + else { + // The volume is backing a disk that is not attached to an + // instance. At this moment it won't take over from a + // Propolis' Upstairs, so send it to a Pantry to create an + // Upstairs there. A future checkout that happens after + // this transaction that is sent to a Propolis _will_ take + // over from this checkout (sent to a Pantry), which is ok. + return Ok(()); + }; + + let Some(instance) = maybe_instance else { + // The instance, which the disk that this volume backs is + // attached to, doesn't exist? + // + // XXX this is a Nexus bug! + return Err(VolumeGetError::CheckoutConditionFailed( + format!( + "Pantry: instance {} backing disk {} does not exist?", + attach_instance_id, + disk.id(), + ) + )); + }; + + if let Some(propolis_id) = instance.runtime().propolis_id { + // The instance, which the disk that this volume backs is + // attached to, exists and has an active propolis ID. A + // propolis _may_ exist, so bail here - an activation from + // the Pantry is not allowed to take over from a Propolis. + Err(VolumeGetError::CheckoutConditionFailed(format!( + "Pantry: possible Propolis {}", + propolis_id + ))) + } else { + // The instance, which the disk that this volume backs is + // attached to, exists, but there is no active propolis ID. + // This is ok. + Ok(()) + } + } + } + } + /// Checkout a copy of the Volume from the database. /// This action (getting a copy) will increase the generation number /// of Volumes of the VolumeConstructionRequest::Volume type that have @@ -203,18 +478,10 @@ impl DataStore { pub async fn volume_checkout( &self, volume_id: Uuid, + reason: VolumeCheckoutReason, ) -> LookupResult { use db::schema::volume::dsl; - #[derive(Debug, thiserror::Error)] - enum VolumeGetError { - #[error("Serde error during volume_checkout: {0}")] - SerdeError(#[from] serde_json::Error), - - #[error("Updated {0} database rows, expected {1}")] - UnexpectedDatabaseUpdate(usize, usize), - } - // We perform a transaction here, to be sure that on completion // of this, the database contains an updated version of the // volume with the generation number incremented (for the volume @@ -241,6 +508,56 @@ impl DataStore { err.bail(VolumeGetError::SerdeError(e)) })?; + // The VolumeConstructionRequest resulting from this checkout will have its + // generation numbers bumped, and as result will (if it has non-read-only + // sub-volumes) take over from previous read/write activations when sent to a + // place that will `construct` a new Volume. Depending on the checkout reason, + // prevent creating multiple read/write Upstairs acting on the same Volume, + // except where the take over is intended. + + let (maybe_disk, maybe_instance) = { + use db::schema::instance::dsl as instance_dsl; + use db::schema::disk::dsl as disk_dsl; + + let maybe_disk: Option = disk_dsl::disk + .filter(disk_dsl::time_deleted.is_null()) + .filter(disk_dsl::volume_id.eq(volume_id)) + .select(Disk::as_select()) + .get_result_async(&conn) + .await + .optional()?; + + let maybe_instance: Option = if let Some(disk) = &maybe_disk { + if let Some(attach_instance_id) = disk.runtime().attach_instance_id { + instance_dsl::instance + .filter(instance_dsl::time_deleted.is_null()) + .filter(instance_dsl::id.eq(attach_instance_id)) + .select(Instance::as_select()) + .get_result_async(&conn) + .await + .optional()? 
+ } else { + // Disk not attached to an instance + None + } + } else { + // Volume not associated with disk + None + }; + + (maybe_disk, maybe_instance) + }; + + if let Err(e) = Self::volume_checkout_allowed( + &reason, + &vcr, + maybe_disk, + maybe_instance, + ) + .await { + return Err(err.bail(e)); + } + // Look to see if the VCR is a Volume type, and if so, look at // its sub_volumes. If they are of type Region, then we need // to update their generation numbers and record that update @@ -353,8 +670,17 @@ impl DataStore { .await .map_err(|e| { if let Some(err) = err.take() { - return Error::internal_error(&format!("Transaction error: {}", err)); + match err { + VolumeGetError::CheckoutConditionFailed(message) => { + return Error::conflict(message); + } + + _ => { + return Error::internal_error(&format!("Transaction error: {}", err)); + } + } } + public_error_from_diesel(e, ErrorHandler::Server) }) } @@ -447,8 +773,9 @@ impl DataStore { pub async fn volume_checkout_randomize_ids( &self, volume_id: Uuid, + reason: VolumeCheckoutReason, ) -> CreateResult { - let volume = self.volume_checkout(volume_id).await?; + let volume = self.volume_checkout(volume_id, reason).await?; let vcr: sled_agent_client::types::VolumeConstructionRequest = serde_json::from_str(volume.data())?; @@ -1309,6 +1636,51 @@ pub fn read_only_resources_associated_with_volume( } } +/// Returns true if the sub-volumes of a Volume are all read-only +pub fn volume_is_read_only( + vcr: &VolumeConstructionRequest, +) -> anyhow::Result { + match vcr { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + for sv in sub_volumes { + match sv { + VolumeConstructionRequest::Region { opts, .. } => { + if !opts.read_only { + return Ok(false); + } + } + + _ => { + bail!("Saw non-Region in sub-volume {:?}", sv); + } + } + } + + Ok(true) + } + + VolumeConstructionRequest::Region { .. } => { + // We don't support a pure Region VCR at the volume + // level in the database, so this choice should + // never be encountered, but I want to know if it is. + panic!("Region not supported as a top level volume"); + } + + VolumeConstructionRequest::File { .. } => { + // Effectively, this is read-only, as this BlockIO implementation + // does not have a `write` implementation. This will be hit if + // trying to make a snapshot or image out of a + // `YouCanBootAnythingAsLongAsItsAlpine` image source. + Ok(true) + } + + VolumeConstructionRequest::Url { .. } => { + // ImageSource::Url was deprecated + bail!("Saw VolumeConstructionRequest::Url"); + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/nexus/src/app/image.rs b/nexus/src/app/image.rs index a7fe75a464..96a3e6b06f 100644 --- a/nexus/src/app/image.rs +++ b/nexus/src/app/image.rs @@ -121,7 +121,10 @@ impl super::Nexus { let image_volume = self .db_datastore - .volume_checkout_randomize_ids(db_snapshot.volume_id) + .volume_checkout_randomize_ids( + db_snapshot.volume_id, + db::datastore::VolumeCheckoutReason::ReadOnlyCopy, + ) .await?; db::model::Image { diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index e29ed21192..a82a53331e 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -173,6 +173,13 @@ enum InstanceStateChangeRequestAction { SendToSled(Uuid), } +/// What is the higher level operation that is calling +/// `instance_ensure_registered`? 
+pub(crate) enum InstanceRegisterReason { + Start { vmm_id: Uuid }, + Migrate { vmm_id: Uuid, target_vmm_id: Uuid }, +} + impl super::Nexus { pub fn instance_lookup<'a>( &'a self, @@ -1010,6 +1017,7 @@ impl super::Nexus { db_instance: &db::model::Instance, propolis_id: &Uuid, initial_vmm: &db::model::Vmm, + operation: InstanceRegisterReason, ) -> Result<(), Error> { opctx.authorize(authz::Action::Modify, authz_instance).await?; @@ -1065,8 +1073,19 @@ impl super::Nexus { } }; - let volume = - self.db_datastore.volume_checkout(disk.volume_id).await?; + let volume = self + .db_datastore + .volume_checkout( + disk.volume_id, + match operation { + InstanceRegisterReason::Start { vmm_id } => + db::datastore::VolumeCheckoutReason::InstanceStart { vmm_id }, + InstanceRegisterReason::Migrate { vmm_id, target_vmm_id } => + db::datastore::VolumeCheckoutReason::InstanceMigrate { vmm_id, target_vmm_id }, + } + ) + .await?; + disk_reqs.push(sled_agent_client::types::DiskRequest { name: disk.name().to_string(), slot: sled_agent_client::types::Slot(slot.0), diff --git a/nexus/src/app/sagas/common_storage.rs b/nexus/src/app/sagas/common_storage.rs index 3b590f6205..bf530ef858 100644 --- a/nexus/src/app/sagas/common_storage.rs +++ b/nexus/src/app/sagas/common_storage.rs @@ -769,7 +769,10 @@ pub(crate) async fn call_pantry_attach_for_disk( let disk_volume = nexus .datastore() - .volume_checkout(disk.volume_id) + .volume_checkout( + disk.volume_id, + db::datastore::VolumeCheckoutReason::Pantry, + ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index 830a4dd96c..165bf7573c 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -390,7 +390,10 @@ async fn sdc_regions_ensure( let volume = osagactx .datastore() - .volume_checkout(db_snapshot.volume_id) + .volume_checkout( + db_snapshot.volume_id, + db::datastore::VolumeCheckoutReason::ReadOnlyCopy, + ) .await .map_err(ActionError::action_failed)?; @@ -433,7 +436,10 @@ async fn sdc_regions_ensure( let volume = osagactx .datastore() - .volume_checkout(image.volume_id) + .volume_checkout( + image.volume_id, + db::datastore::VolumeCheckoutReason::ReadOnlyCopy, + ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/finalize_disk.rs b/nexus/src/app/sagas/finalize_disk.rs index d4f6fc39aa..89893fb703 100644 --- a/nexus/src/app/sagas/finalize_disk.rs +++ b/nexus/src/app/sagas/finalize_disk.rs @@ -79,7 +79,8 @@ impl NexusSaga for SagaFinalizeDisk { silo_id: params.silo_id, project_id: params.project_id, disk_id: params.disk_id, - attached_instance_and_sled: None, + attach_instance_id: None, + use_the_pantry: true, create_params: params::SnapshotCreate { identity: external::IdentityMetadataCreateParams { name: snapshot_name.clone(), diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index da3b3e93ea..e4bdd989cc 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -4,7 +4,8 @@ use super::{NexusActionContext, NexusSaga, ACTION_GENERATE_ID}; use crate::app::instance::{ - InstanceStateChangeError, InstanceStateChangeRequest, + InstanceRegisterReason, InstanceStateChangeError, + InstanceStateChangeRequest, }; use crate::app::sagas::{ declare_saga_actions, instance_common::allocate_vmm_ipv6, @@ -356,6 +357,10 @@ async fn sim_ensure_destination_propolis( &db_instance, &vmm.id, &vmm, + InstanceRegisterReason::Migrate { + vmm_id: 
params.src_vmm.id, + target_vmm_id: vmm.id, + }, ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index b1d9506c31..98fcec13a7 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -10,6 +10,7 @@ use super::{ instance_common::allocate_vmm_ipv6, NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID, }; +use crate::app::instance::InstanceRegisterReason; use crate::app::instance::InstanceStateChangeError; use crate::app::sagas::declare_saga_actions; use chrono::Utc; @@ -527,6 +528,7 @@ async fn sis_ensure_registered( &db_instance, &propolis_id, &vmm_record, + InstanceRegisterReason::Start { vmm_id: propolis_id }, ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 8b6febf71a..ff57470a5f 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -130,7 +130,8 @@ pub(crate) struct Params { pub silo_id: Uuid, pub project_id: Uuid, pub disk_id: Uuid, - pub attached_instance_and_sled: Option<(Uuid, Uuid)>, + pub attach_instance_id: Option, + pub use_the_pantry: bool, pub create_params: params::SnapshotCreate, } @@ -251,8 +252,7 @@ impl NexusSaga for SagaSnapshotCreate { // (DB) Tracks virtual resource provisioning. builder.append(space_account_action()); - let use_the_pantry = params.attached_instance_and_sled.is_none(); - if !use_the_pantry { + if !params.use_the_pantry { // (Sleds) If the disk is attached to an instance, send a // snapshot request to sled-agent to create a ZFS snapshot. builder.append(send_snapshot_request_to_sled_agent_action()); @@ -284,7 +284,7 @@ impl NexusSaga for SagaSnapshotCreate { // (DB) Mark snapshot as "ready" builder.append(finalize_snapshot_record_action()); - if use_the_pantry { + if params.use_the_pantry { // (Pantry) Set the state back to Detached // // This has to be the last saga node! Otherwise, concurrent @@ -675,22 +675,47 @@ async fn ssc_send_snapshot_request_to_sled_agent( let snapshot_id = sagactx.lookup::("snapshot_id")?; // If this node was reached, the saga initiator thought the disk was - // attached to an instance that was running on a specific sled. Contact that - // sled and ask it to initiate a snapshot. Note that this is best-effort: - // the instance may have stopped (or may be have stopped, had the disk - // detached, and resumed running on the same sled) while the saga was - // executing. - let (instance_id, sled_id) = - params.attached_instance_and_sled.ok_or_else(|| { - ActionError::action_failed(Error::internal_error( - "snapshot saga in send_snapshot_request_to_sled_agent but no \ - instance/sled pair was provided", - )) - })?; + // attached to an instance that _may_ have a running Propolis. Contact that + // Propolis and ask it to initiate a snapshot. Note that this is + // best-effort: the instance may have stopped (or may be have stopped, had + // the disk detached, and resumed running on the same sled) while the saga + // was executing. 
+ let Some(attach_instance_id) = params.attach_instance_id else { + return Err(ActionError::action_failed(Error::internal_error( + "attach instance id is None!", + ))); + }; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(attach_instance_id) + .lookup_for(authz::Action::Read) + .await + .map_err(ActionError::action_failed)?; + + let sled_id = osagactx + .datastore() + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .map_err(ActionError::action_failed)? + .sled_id(); + + // If this instance does not currently have a sled, we can't continue this + // saga - the user will have to reissue the snapshot request and it will get + // run on a Pantry. + let Some(sled_id) = sled_id else { + return Err(ActionError::action_failed(Error::unavail( + "sled id is None!", + ))); + }; info!(log, "asking for disk snapshot from Propolis via sled agent"; "disk_id" => %params.disk_id, - "instance_id" => %instance_id, + "instance_id" => %attach_instance_id, "sled_id" => %sled_id); let sled_agent_client = osagactx @@ -702,7 +727,7 @@ async fn ssc_send_snapshot_request_to_sled_agent( retry_until_known_result(log, || async { sled_agent_client .instance_issue_disk_snapshot_request( - &instance_id, + &attach_instance_id, ¶ms.disk_id, &InstanceIssueDiskSnapshotRequestBody { snapshot_id }, ) @@ -838,6 +863,16 @@ async fn ssc_attach_disk_to_pantry( info!(log, "disk {} in state finalizing", params.disk_id); } + external::DiskState::Attached(attach_instance_id) => { + // No state change required + info!( + log, + "disk {} in state attached to instance id {}", + params.disk_id, + attach_instance_id + ); + } + _ => { // Return a 503 indicating that the user should retry return Err(ActionError::action_failed( @@ -1358,7 +1393,10 @@ async fn ssc_create_volume_record( let disk_volume = osagactx .datastore() - .volume_checkout(disk.volume_id) + .volume_checkout( + disk.volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) .await .map_err(ActionError::action_failed)?; @@ -1815,14 +1853,16 @@ mod test { project_id: Uuid, disk_id: Uuid, disk: NameOrId, - instance_and_sled: Option<(Uuid, Uuid)>, + attach_instance_id: Option, + use_the_pantry: bool, ) -> Params { Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), silo_id, project_id, disk_id, - attached_instance_and_sled: instance_and_sled, + attach_instance_id, + use_the_pantry, create_params: params::SnapshotCreate { identity: IdentityMetadataCreateParams { name: "my-snapshot".parse().expect("Invalid disk name"), @@ -1871,7 +1911,8 @@ mod test { project_id, disk_id, Name::from_str(DISK_NAME).unwrap().into(), - None, + None, // not attached to an instance + true, // use the pantry ); let dag = create_saga_dag::(params).unwrap(); let runnable_saga = nexus.create_runnable_saga(dag).await.unwrap(); @@ -2079,7 +2120,7 @@ mod test { // since this is just a test, bypass the normal // attachment machinery and just update the disk's // database record directly. 
- let instance_and_sled = if !use_the_pantry { + let attach_instance_id = if !use_the_pantry { let state = setup_test_instance( cptestctx, client, @@ -2092,11 +2133,7 @@ mod test { ) .await; - let sled_id = state - .sled_id() - .expect("running instance should have a vmm"); - - Some((state.instance().id(), sled_id)) + Some(state.instance().id()) } else { None }; @@ -2107,7 +2144,8 @@ mod test { project_id, disk_id, Name::from_str(DISK_NAME).unwrap().into(), - instance_and_sled, + attach_instance_id, + use_the_pantry, ) } }) @@ -2205,36 +2243,31 @@ mod test { Name::from_str(DISK_NAME).unwrap().into(), // The disk isn't attached at this time, so don't supply a sled. None, + true, // use the pantry ); let dag = create_saga_dag::(params).unwrap(); let runnable_saga = nexus.create_runnable_saga(dag).await.unwrap(); // Before running the saga, attach the disk to an instance! - let (.., authz_disk, db_disk) = - LookupPath::new(&opctx, nexus.datastore()) - .disk_id(disk_id) - .fetch_for(authz::Action::Read) - .await - .expect("Failed to look up created disk"); - - assert!(nexus - .datastore() - .disk_update_runtime( - &opctx, - &authz_disk, - &db_disk.runtime().attach(Uuid::new_v4()), - ) - .await - .expect("failed to attach disk")); + let _instance_and_vmm = setup_test_instance( + &cptestctx, + &client, + vec![params::InstanceDiskAttachment::Attach( + params::InstanceDiskAttach { + name: Name::from_str(DISK_NAME).unwrap(), + }, + )], + ) + .await; // Actually run the saga let output = nexus.run_saga(runnable_saga).await; - // Expect to see 503 + // Expect to see 409 match output { Err(e) => { - assert!(matches!(e, Error::ServiceUnavailable { .. })); + assert!(matches!(e, Error::Conflict { .. })); } Ok(_) => { @@ -2269,6 +2302,7 @@ mod test { Name::from_str(DISK_NAME).unwrap().into(), // The disk isn't attached at this time, so don't supply a sled. None, + true, // use the pantry ); let dag = create_saga_dag::(params).unwrap(); @@ -2313,8 +2347,6 @@ mod test { // the saga, stopping the instance, detaching the disk, and then letting // the saga run. 
let fake_instance_id = Uuid::new_v4(); - let fake_sled_id = - Uuid::parse_str(nexus_test_utils::SLED_AGENT_UUID).unwrap(); let params = new_test_params( &opctx, @@ -2322,7 +2354,8 @@ mod test { project_id, disk_id, Name::from_str(DISK_NAME).unwrap().into(), - Some((fake_instance_id, fake_sled_id)), + Some(fake_instance_id), + false, // use the pantry ); let dag = create_saga_dag::(params).unwrap(); @@ -2363,10 +2396,6 @@ mod test { ) .await; - let sled_id = instance_state - .sled_id() - .expect("running instance should have a vmm"); - // Rerun the saga let params = new_test_params( &opctx, @@ -2374,7 +2403,8 @@ mod test { project_id, disk_id, Name::from_str(DISK_NAME).unwrap().into(), - Some((instance_state.instance().id(), sled_id)), + Some(instance_state.instance().id()), + false, // use the pantry ); let dag = create_saga_dag::(params).unwrap(); diff --git a/nexus/src/app/snapshot.rs b/nexus/src/app/snapshot.rs index 0c90ac31fb..c28d180d3c 100644 --- a/nexus/src/app/snapshot.rs +++ b/nexus/src/app/snapshot.rs @@ -12,7 +12,6 @@ use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; -use omicron_common::api::external::InstanceState; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; @@ -93,7 +92,7 @@ impl super::Nexus { // If there isn't a running propolis, Nexus needs to use the Crucible // Pantry to make this snapshot - let instance_and_sled = if let Some(attach_instance_id) = + let use_the_pantry = if let Some(attach_instance_id) = &db_disk.runtime_state.attach_instance_id { let (.., authz_instance) = @@ -107,29 +106,12 @@ impl super::Nexus { .instance_fetch_with_vmm(&opctx, &authz_instance) .await?; - match instance_state.vmm().as_ref() { - None => None, - Some(vmm) => match vmm.runtime.state.0 { - // If the VM might be running, or it's rebooting (which - // doesn't deactivate the volume), send the snapshot request - // to the relevant VMM. Otherwise, there's no way to know if - // the instance has attached the volume or is in the process - // of detaching it, so bail. - InstanceState::Running | InstanceState::Rebooting => { - Some((*attach_instance_id, vmm.sled_id)) - } - _ => { - return Err(Error::invalid_request(&format!( - "cannot snapshot attached disk for instance in \ - state {}", - vmm.runtime.state.0 - ))); - } - }, - } + // If a Propolis _may_ exist, send the snapshot request there, + // otherwise use the pantry. + !instance_state.vmm().is_some() } else { // This disk is not attached to an instance, use the pantry. 
- None + true }; let saga_params = sagas::snapshot_create::Params { @@ -137,7 +119,8 @@ impl super::Nexus { silo_id: authz_silo.id(), project_id: authz_project.id(), disk_id: authz_disk.id(), - attached_instance_and_sled: instance_and_sled, + attach_instance_id: db_disk.runtime_state.attach_instance_id, + use_the_pantry, create_params: params.clone(), }; diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index 63ea81f13f..251b729f98 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ b/nexus/tests/integration_tests/snapshots.rs @@ -256,6 +256,109 @@ async fn test_snapshot_without_instance(cptestctx: &ControlPlaneTestContext) { assert_eq!(disk.state, DiskState::Detached); } +#[nexus_test] +async fn test_snapshot_stopped_instance(cptestctx: &ControlPlaneTestContext) { + let client = &cptestctx.external_client; + DiskTest::new(&cptestctx).await; + create_project_and_pool(client).await; + let disks_url = get_disks_url(); + + // Define a global image + let image_create_params = params::ImageCreate { + identity: IdentityMetadataCreateParams { + name: "alpine-edge".parse().unwrap(), + description: String::from( + "you can boot any image, as long as it's alpine", + ), + }, + source: params::ImageSource::YouCanBootAnythingAsLongAsItsAlpine, + os: "alpine".to_string(), + version: "edge".to_string(), + }; + + let images_url = format!("/v1/images?project={}", PROJECT_NAME); + let image = + NexusRequest::objects_post(client, &images_url, &image_create_params) + .authn_as(AuthnMode::PrivilegedUser) + .execute_and_parse_unwrap::() + .await; + + // Create a disk from this image + let disk_size = ByteCount::from_gibibytes_u32(2); + let base_disk_name: Name = "base-disk".parse().unwrap(); + let base_disk = params::DiskCreate { + identity: IdentityMetadataCreateParams { + name: base_disk_name.clone(), + description: String::from("sells rainsticks"), + }, + disk_source: params::DiskSource::Image { image_id: image.identity.id }, + size: disk_size, + }; + + let base_disk: Disk = NexusRequest::new( + RequestBuilder::new(client, Method::POST, &disks_url) + .body(Some(&base_disk)) + .expect_status(Some(StatusCode::CREATED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + + // Create a stopped instance with attached disk + let instances_url = format!("/v1/instances?project={}", PROJECT_NAME,); + let instance_name = "base-instance"; + + let instance: Instance = object_create( + client, + &instances_url, + ¶ms::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: instance_name.parse().unwrap(), + description: format!("instance {:?}", instance_name), + }, + ncpus: InstanceCpuCount(2), + memory: ByteCount::from_gibibytes_u32(1), + hostname: "base-instance".parse().unwrap(), + user_data: + b"#cloud-config\nsystem_info:\n default_user:\n name: oxide" + .to_vec(), + ssh_public_keys: Some(Vec::new()), + network_interfaces: + params::InstanceNetworkInterfaceAttachment::None, + disks: vec![params::InstanceDiskAttachment::Attach( + params::InstanceDiskAttach { name: base_disk_name.clone() }, + )], + external_ips: vec![], + start: false, + }, + ) + .await; + + assert_eq!(instance.runtime.run_state, external::InstanceState::Stopped); + + // Issue snapshot request + let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); + + let snapshot: views::Snapshot = object_create( + client, + &snapshots_url, + ¶ms::SnapshotCreate { + identity: IdentityMetadataCreateParams { + name: 
instance_name.parse().unwrap(), + description: format!("instance {:?}", instance_name), + }, + disk: base_disk_name.into(), + }, + ) + .await; + + assert_eq!(snapshot.disk_id, base_disk.identity.id); + assert_eq!(snapshot.size, base_disk.size); +} + #[nexus_test] async fn test_delete_snapshot(cptestctx: &ControlPlaneTestContext) { let client = &cptestctx.external_client; diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index daf78823ed..ecfa7cf0f1 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -9,6 +9,7 @@ use chrono::Utc; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; +use nexus_db_queries::db; use nexus_db_queries::db::DataStore; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; @@ -1375,7 +1376,13 @@ async fn test_volume_remove_read_only_parent_base( // Go and get the volume from the database, verify it no longer // has a read only parent. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); let vcr: VolumeConstructionRequest = serde_json::from_str(new_vol.data()).unwrap(); @@ -1394,7 +1401,13 @@ async fn test_volume_remove_read_only_parent_base( } // Verify the t_vid now has a ROP. - let new_vol = datastore.volume_checkout(t_vid).await.unwrap(); + let new_vol = datastore + .volume_checkout( + t_vid, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); let vcr: VolumeConstructionRequest = serde_json::from_str(new_vol.data()).unwrap(); @@ -1421,7 +1434,13 @@ async fn test_volume_remove_read_only_parent_base( // We want to verify we can call volume_remove_rop twice and the second // time through it won't change what it did the first time. This is // critical to supporting replay of the saga, should it be needed. 
- let new_vol = datastore.volume_checkout(t_vid).await.unwrap(); + let new_vol = datastore + .volume_checkout( + t_vid, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); let vcr: VolumeConstructionRequest = serde_json::from_str(new_vol.data()).unwrap(); @@ -1570,7 +1589,13 @@ async fn test_volume_remove_rop_saga(cptestctx: &ControlPlaneTestContext) { .await .unwrap(); - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); let vcr: VolumeConstructionRequest = serde_json::from_str(new_vol.data()).unwrap(); @@ -1628,7 +1653,13 @@ async fn test_volume_remove_rop_saga_twice( .unwrap(); println!("first returns {:?}", res); - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); let vcr: VolumeConstructionRequest = serde_json::from_str(new_vol.data()).unwrap(); @@ -1762,7 +1793,13 @@ async fn test_volume_remove_rop_saga_deleted_volume( .await .unwrap(); - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); let vcr: VolumeConstructionRequest = serde_json::from_str(new_vol.data()).unwrap(); @@ -1811,11 +1848,23 @@ async fn test_volume_checkout(cptestctx: &ControlPlaneTestContext) { // The first time back, we get 1 but internally the generation number goes // to 2. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![Some(1)]); // Request again, we should get 2 now. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![Some(2)]); } @@ -1853,9 +1902,21 @@ async fn test_volume_checkout_updates_nothing( .unwrap(); // Verify nothing happens to our non generation number volume. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![None]); - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![None]); } @@ -1894,15 +1955,33 @@ async fn test_volume_checkout_updates_multiple_gen( // The first time back, we get our original values, but internally the // generation number goes up. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![Some(3), Some(8)]); // Request again, we should see the incremented values now.. 
- let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![Some(4), Some(9)]); // Request one more, because why not. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![Some(5), Some(10)]); } @@ -1947,11 +2026,23 @@ async fn test_volume_checkout_updates_sparse_multiple_gen( // The first time back, we get our original values, but internally the // generation number goes up. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![None, Some(7), Some(9)]); // Request again, we should see the incremented values now.. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![None, Some(8), Some(10)]); } @@ -1996,11 +2087,23 @@ async fn test_volume_checkout_updates_sparse_mid_multiple_gen( // The first time back, we get our original values, but internally the // generation number goes up. - let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![Some(7), None, Some(9)]); // Request again, we should see the incremented values now.. 
- let new_vol = datastore.volume_checkout(volume_id).await.unwrap(); + let new_vol = datastore + .volume_checkout( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await + .unwrap(); volume_match_gen(new_vol, vec![Some(8), None, Some(10)]); } @@ -2038,7 +2141,12 @@ async fn test_volume_checkout_randomize_ids_only_read_only( .unwrap(); // volume_checkout_randomize_ids should fail - let r = datastore.volume_checkout_randomize_ids(volume_id).await; + let r = datastore + .volume_checkout_randomize_ids( + volume_id, + db::datastore::VolumeCheckoutReason::CopyAndModify, + ) + .await; assert!(r.is_err()); } diff --git a/schema/crdb/add-lookup-disk-by-volume-id-index/up.sql b/schema/crdb/add-lookup-disk-by-volume-id-index/up.sql new file mode 100644 index 0000000000..2f129f334c --- /dev/null +++ b/schema/crdb/add-lookup-disk-by-volume-id-index/up.sql @@ -0,0 +1,4 @@ +CREATE UNIQUE INDEX IF NOT EXISTS lookup_disk_by_volume_id ON omicron.public.disk ( + volume_id +) WHERE + time_deleted IS NULL; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index da3dbb3f4c..9f28efbd16 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1152,6 +1152,11 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_deleted_disk ON omicron.public.disk ( ) WHERE time_deleted IS NOT NULL; +CREATE UNIQUE INDEX IF NOT EXISTS lookup_disk_by_volume_id ON omicron.public.disk ( + volume_id +) WHERE + time_deleted IS NULL; + CREATE TABLE IF NOT EXISTS omicron.public.image ( /* Identity metadata (resource) */ id UUID PRIMARY KEY, @@ -3770,7 +3775,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '49.0.0', NULL) + ( TRUE, NOW(), NOW(), '50.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT;