diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index be068f0912..82488def6c 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -2451,26 +2451,22 @@ async fn cmd_db_region_find_deleted( struct Row { dataset_id: Uuid, region_id: Uuid, - region_snapshot_id: String, - volume_id: Uuid, + volume_id: String, } let rows: Vec = datasets_regions_volumes .into_iter() .map(|row| { - let (dataset, region, region_snapshot, volume) = row; + let (dataset, region, volume) = row; Row { dataset_id: dataset.id(), region_id: region.id(), - region_snapshot_id: if let Some(region_snapshot) = - region_snapshot - { - region_snapshot.snapshot_id.to_string() + volume_id: if let Some(volume) = volume { + volume.id().to_string() } else { String::from("") }, - volume_id: volume.id(), } }) .collect(); diff --git a/nexus/db-model/src/volume_resource_usage.rs b/nexus/db-model/src/volume_resource_usage.rs index 0095baf471..0a76b3718d 100644 --- a/nexus/db-model/src/volume_resource_usage.rs +++ b/nexus/db-model/src/volume_resource_usage.rs @@ -29,7 +29,7 @@ impl_enum_type!( /// /// Read-only resources can be used by many volumes, and because of this they /// need to have a reference count so they can be deleted when they're not -/// referenced anymore. The region_snapshot table uses a `volume_references` +/// referenced anymore. The region_snapshot table used a `volume_references` /// column, which counts how many uses there are. The region table does not have /// this column, and more over a simple integer works for reference counting but /// does not tell you _what_ volume that use is from. This can be determined diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index f555e0af6b..da32494d6f 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -362,6 +362,12 @@ impl DataStore { ) .execute_async(&conn).await?; } + + // Whenever a region is hard-deleted, validate invariants + // for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(()) } }) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot.rs b/nexus/db-queries/src/db/datastore/region_snapshot.rs index 242560a415..4a5db14bd6 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot.rs @@ -64,14 +64,25 @@ impl DataStore { ) -> DeleteResult { use db::schema::region_snapshot::dsl; - diesel::delete(dsl::region_snapshot) + let conn = self.pool_connection_unauthorized().await?; + + let result = diesel::delete(dsl::region_snapshot) .filter(dsl::dataset_id.eq(dataset_id)) .filter(dsl::region_id.eq(region_id)) .filter(dsl::snapshot_id.eq(snapshot_id)) - .execute_async(&*self.pool_connection_unauthorized().await?) 
+ .execute_async(&*conn) .await .map(|_rows_deleted| ()) - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)); + + // Whenever a region snapshot is hard-deleted, validate invariants for + // all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + result } /// Find region snapshots on expunged disks diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index 9d4900e2a4..b9f6b589d7 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -322,4 +322,23 @@ impl DataStore { .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + /// Get a snapshot, returning None if it does not exist (instead of a + /// NotFound error). + pub async fn snapshot_get( + &self, + opctx: &OpContext, + snapshot_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_authorized(opctx).await?; + + use db::schema::snapshot::dsl; + dsl::snapshot + .filter(dsl::id.eq(snapshot_id)) + .select(Snapshot::as_select()) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index d7d6b10081..c5dd3327d9 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -192,6 +192,10 @@ impl DataStore { } } + // After volume creation, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(volume) } @@ -1019,10 +1023,11 @@ impl DataStore { } /// Find regions for deleted volumes that do not have associated region - /// snapshots and are not being used by any other non-deleted volumes + /// snapshots and are not being used by any other non-deleted volumes, and + /// return them for garbage collection pub async fn find_deleted_volume_regions( &self, - ) -> ListResultVec<(Dataset, Region, Option, Volume)> { + ) -> ListResultVec<(Dataset, Region, Option)> { let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("find_deleted_volume_regions") .transaction(&conn, |conn| async move { @@ -1034,19 +1039,19 @@ impl DataStore { async fn find_deleted_volume_regions_txn( conn: &async_bb8_diesel::Connection, - ) -> Result< - Vec<(Dataset, Region, Option, Volume)>, - diesel::result::Error, - > { + ) -> Result)>, diesel::result::Error> + { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; use db::schema::volume::dsl as volume_dsl; - use db::schema::volume_resource_usage::dsl as ru_dsl; - // Find all regions and datasets - let tuples_superset = region_dsl::region - .inner_join( + // Find all read-write regions (read-only region cleanup is taken care + // of in soft_delete_volume_txn!) 
and their associated datasets + let unfiltered_deleted_regions = region_dsl::region + .filter(region_dsl::read_only.eq(false)) + // the volume may be hard deleted, so use a left join here + .left_join( volume_dsl::volume.on(region_dsl::volume_id.eq(volume_dsl::id)), ) .inner_join( @@ -1054,53 +1059,70 @@ impl DataStore { .on(region_dsl::dataset_id.eq(dataset_dsl::id)), ) // where there either are no region snapshots, or the region - // snapshot volume references have gone to zero + // snapshot volume has deleted = true .left_join( dsl::region_snapshot.on(dsl::region_id .eq(region_dsl::id) .and(dsl::dataset_id.eq(dataset_dsl::id))), ) - .filter( - dsl::volume_references - .eq(0) - // Despite the SQL specifying that this column is NOT NULL, - // this null check is required for this function to work! - // It's possible that the left join of region_snapshot above - // could join zero rows, making this null. - .or(dsl::volume_references.is_null()), - ) - // where the volume has already been soft-deleted - .filter(volume_dsl::time_deleted.is_not_null()) + .filter(dsl::deleting.eq(true).or(dsl::deleting.is_null())) // and return them (along with the volume so it can be hard deleted) .select(( Dataset::as_select(), Region::as_select(), Option::::as_select(), - Volume::as_select(), + // Diesel can't express a difference between + // + // a) the volume record existing and the nullable + // volume.time_deleted column being set to null + // b) the volume record does not exist (null due to left join) + // + // so return an Option and check below + Option::::as_select(), )) .load_async(conn) .await?; - // Filter out read-only regions that are still being used by volumes - let mut tuples = Vec::with_capacity(tuples_superset.len()); + let mut deleted_regions = + Vec::with_capacity(unfiltered_deleted_regions.len()); - for (dataset, region, region_snapshot, volume) in tuples_superset { - let region_usage_left = ru_dsl::volume_resource_usage - .filter( - ru_dsl::usage_type - .eq(VolumeResourceUsageType::ReadOnlyRegion), - ) - .filter(ru_dsl::region_id.eq(region.id())) - .count() - .get_result_async::(conn) - .await?; + for (dataset, region, region_snapshot, volume) in + unfiltered_deleted_regions + { + // only operate on soft or hard deleted volumes + let deleted = match &volume { + Some(volume) => volume.time_deleted.is_some(), + None => true, + }; - if region_usage_left == 0 { - tuples.push((dataset, region, region_snapshot, volume)); + if !deleted { + continue; } + + if region_snapshot.is_some() { + // We cannot delete this region: the presence of the region + // snapshot record means that the Crucible agent's snapshot has + // not been deleted yet (as the lifetime of the region snapshot + // record is equal to or longer than the lifetime of the + // Crucible agent's snapshot). + // + // This condition can occur when multiple volume delete sagas + // run concurrently: one will decrement the crucible resources + // (but hasn't made the appropriate DELETE calls to the + // appropriate Agents to tombstone the running snapshots and + // snapshots yet), and the other will be in the "delete freed + // regions" saga node trying to delete the region. Without this + // check, This race results in the Crucible Agent returning + // "must delete snapshots first" and causing saga unwinds. + // + // Another volume delete will pick up this region and remove it. 
+ continue; + } + + deleted_regions.push((dataset, region, volume)); } - Ok(tuples) + Ok(deleted_regions) } pub async fn read_only_resources_associated_with_volume( @@ -1477,6 +1499,10 @@ impl DataStore { } } + // After volume deletion, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(resources_to_delete) } @@ -1735,6 +1761,11 @@ impl DataStore { })?; } + // After read-only parent removal, validate + // invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(true) } } @@ -2713,6 +2744,11 @@ impl DataStore { }) })?; + // After region replacement, validate invariants for all + // volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(VolumeReplaceResult::Done) } }) @@ -3142,6 +3178,11 @@ impl DataStore { )); } + // After region snapshot replacement, validate invariants + // for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(VolumeReplaceResult::Done) } }) @@ -3540,6 +3581,120 @@ impl DataStore { } } +// Add some validation that runs only for tests +#[cfg(any(test, feature = "testing"))] +impl DataStore { + fn volume_invariant_violated(msg: String) -> diesel::result::Error { + diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::CheckViolation, + Box::new(msg), + ) + } + + /// Tests each Volume to see if invariants hold + /// + /// If an invariant is violated, this function returns a `CheckViolation` + /// with the text of what invariant was violated. + pub(crate) async fn validate_volume_invariants( + conn: &async_bb8_diesel::Connection, + ) -> Result<(), diesel::result::Error> { + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + + while let Some(p) = paginator.next() { + use db::schema::volume::dsl; + let haystack = + paginated(dsl::volume, dsl::id, &p.current_pagparams()) + .select(Volume::as_select()) + .get_results_async::(conn) + .await + .unwrap(); + + paginator = p.found_batch(&haystack, &|r| r.id()); + + for volume in haystack { + Self::validate_volume_has_all_resources(&conn, volume).await?; + } + } + + Ok(()) + } + + /// Assert that the resources that comprise non-deleted volumes have not + /// been prematurely deleted. 
+ async fn validate_volume_has_all_resources( + conn: &async_bb8_diesel::Connection, + volume: Volume, + ) -> Result<(), diesel::result::Error> { + if volume.time_deleted.is_some() { + // Do not need to validate resources for soft-deleted volumes + return Ok(()); + } + + let vcr: VolumeConstructionRequest = + serde_json::from_str(&volume.data()).unwrap(); + + // validate all read/write resources still exist + + let num_read_write_subvolumes = count_read_write_sub_volumes(&vcr); + + let mut read_write_targets = + Vec::with_capacity(3 * num_read_write_subvolumes); + + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); + + for target in read_write_targets { + let sub_err = OptionalError::new(); + + let maybe_region = DataStore::target_to_region( + conn, &sub_err, &target, false, // read-write + ) + .await + .unwrap(); + + let Some(_region) = maybe_region else { + return Err(Self::volume_invariant_violated(format!( + "could not find resource for {target}" + ))); + }; + } + + // validate all read-only resources still exist + + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &vcr, + &mut crucible_targets, + ); + crucible_targets + }; + + for read_only_target in &crucible_targets.read_only_targets { + let sub_err = OptionalError::new(); + + let maybe_usage = + DataStore::read_only_target_to_volume_resource_usage( + conn, + &sub_err, + read_only_target, + ) + .await + .unwrap(); + + let Some(_usage) = maybe_usage else { + return Err(Self::volume_invariant_violated(format!( + "could not find resource for {read_only_target}" + ))); + }; + } + + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index c402c8fc92..29878364e6 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -216,14 +216,16 @@ impl RegionSnapshotReplacementFindAffected { Ok(Some(region_snapshot)) => region_snapshot, Ok(None) => { + // If the associated region snapshot was deleted, then there + // are no more volumes that reference it. This is not an + // error! Continue processing the other requests. 
let s = format!( - "region snapshot {} {} {} not found!", + "region snapshot {} {} {} not found", request.old_dataset_id, request.old_region_id, request.old_snapshot_id, ); - error!(&log, "{s}"); - status.errors.push(s); + info!(&log, "{s}"); continue; } diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index d615b55612..4e6c3e1e16 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -58,7 +58,6 @@ use crate::app::db::datastore::ReplacementTarget; use crate::app::db::datastore::VolumeReplaceResult; use crate::app::db::datastore::VolumeToDelete; use crate::app::db::datastore::VolumeWithTarget; -use crate::app::db::lookup::LookupPath; use crate::app::sagas::common_storage::find_only_new_region; use crate::app::sagas::declare_saga_actions; use crate::app::RegionAllocationStrategy; @@ -242,12 +241,21 @@ async fn rsrss_get_alloc_region_params( ); // Look up the existing snapshot - let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) - .snapshot_id(params.request.old_snapshot_id) - .fetch() + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) .await .map_err(ActionError::action_failed)?; + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + // Find the region to replace let db_region = osagactx .datastore() @@ -480,12 +488,22 @@ async fn rsrss_get_old_snapshot_volume_id( ¶ms.serialized_authn, ); - let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) - .snapshot_id(params.request.old_snapshot_id) - .fetch() + // Look up the existing snapshot + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) .await .map_err(ActionError::action_failed)?; + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + Ok(db_snapshot.volume_id) } @@ -510,6 +528,13 @@ async fn rsrss_create_fake_volume( gen: 0, opts: CrucibleOpts { id: new_volume_id, + // Do not put the new region ID here: it will be deleted during + // the associated garbage collection saga if + // `volume_replace_snapshot` does not perform the swap (which + // happens when the snapshot volume is deleted) and we still + // want it to exist when performing other replacement steps. If + // the replacement does occur, then the old address will swapped + // in here. target: vec![], lossy: false, flush_timeout: None, @@ -673,14 +698,15 @@ async fn rsrss_replace_snapshot_in_volume( } VolumeReplaceResult::ExistingVolumeDeleted => { - // Unwind the saga here to clean up the resources allocated during - // this saga. The associated background task will transition this - // request's state to Completed. - - Err(ActionError::action_failed(format!( - "existing volume {} deleted", - replacement_params.old_volume_id - ))) + // If the snapshot volume was deleted, we still want to proceed with + // replacing the rest of the uses of the region snapshot. 
Note this + // also covers the case where this saga node runs (performing the + // replacement), the executor crashes before it can record that + // success, and then before this node is rerun the snapshot is + // deleted. If this saga unwound here, that would violate the + // property of idempotency. + + Ok(()) } } } diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index c5e1b49f61..12d48cb367 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -29,7 +29,7 @@ use super::NexusSaga; use crate::app::sagas::declare_saga_actions; use nexus_db_model::Dataset; use nexus_db_model::Region; -use nexus_db_model::RegionSnapshot; +use nexus_db_model::Volume; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::CrucibleResources; use nexus_types::identity::Asset; @@ -330,8 +330,7 @@ async fn svd_delete_crucible_snapshot_records( Ok(()) } -type FreedCrucibleRegions = - Vec<(Dataset, Region, Option, Uuid)>; +type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; /// Deleting region snapshots in a previous saga node may have freed up regions /// that were deleted in the DB but couldn't be deleted by the Crucible Agent @@ -394,7 +393,7 @@ type FreedCrucibleRegions = /// The disk's volume has no read only resources, while the snapshot's volume /// does. The disk volume's targets are all regions (backed by downstairs that /// are read/write) while the snapshot volume's targets are all snapshots -/// (backed by volumes that are read-only). The two volumes are linked in the +/// (backed by downstairs that are read-only). The two volumes are linked in the /// sense that the snapshots from the second are contained *within* the regions /// of the first, reflecting the resource nesting from ZFS. This is also /// reflected in the REST endpoint that the Crucible agent uses: @@ -436,7 +435,7 @@ async fn svd_find_freed_crucible_regions( // Don't serialize the whole Volume, as the data field contains key material! Ok(freed_datasets_regions_and_volumes .into_iter() - .map(|x| (x.0, x.1, x.2, x.3.id())) + .map(|x| (x.0, x.1, x.2.map(|v: Volume| v.id()))) .collect()) } @@ -451,23 +450,7 @@ async fn svd_delete_freed_crucible_regions( let freed_datasets_regions_and_volumes = sagactx.lookup::("freed_crucible_regions")?; - for (dataset, region, region_snapshot, volume_id) in - freed_datasets_regions_and_volumes - { - if region_snapshot.is_some() { - // We cannot delete this region yet, the snapshot has not been - // deleted. This can occur when multiple volume delete sagas run - // concurrently: one will decrement the crucible resources (but - // hasn't made the appropriate DELETE calls to remove the running - // snapshots and snapshots yet), and the other will be here trying - // to delete the region. This race results in the crucible agent - // returning "must delete snapshots first" and causing saga unwinds. - // - // Another volume delete (probably the one racing with this one!) - // will pick up this region and remove it. 
- continue; - } - + for (dataset, region, volume_id) in freed_datasets_regions_and_volumes { // Send DELETE calls to the corresponding Crucible agents osagactx .nexus() @@ -496,14 +479,16 @@ async fn svd_delete_freed_crucible_regions( })?; // Remove volume DB record - osagactx.datastore().volume_hard_delete(volume_id).await.map_err( - |e| { - ActionError::action_failed(format!( - "failed to volume_hard_delete {}: {:?}", - volume_id, e, - )) - }, - )?; + if let Some(volume_id) = volume_id { + osagactx.datastore().volume_hard_delete(volume_id).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to volume_hard_delete {}: {:?}", + volume_id, e, + )) + }, + )?; + } } Ok(()) diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index d6d7cb58ea..4b081d3e7f 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -2438,83 +2438,6 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools( assert_eq!(datasets_and_regions.len(), REGION_REDUNDANCY_THRESHOLD); } -// Confirm that a region set can start at N, a region can be deleted, and the -// allocation CTE can bring the redundancy back to N. -#[nexus_test] -async fn test_region_allocation_after_delete( - cptestctx: &ControlPlaneTestContext, -) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - - // Create three 10 GiB zpools, each with one dataset. - let _disk_test = DiskTest::new(&cptestctx).await; - - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - - // Create a disk - let client = &cptestctx.external_client; - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, DISK_NAME).await; - - // Assert disk has three allocated regions - let disk_id = disk.identity.id; - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk_id) - .fetch() - .await - .unwrap_or_else(|_| panic!("test disk {:?} should exist", disk_id)); - - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); - - // Delete one of the regions - let region_to_delete: &nexus_db_model::Region = &allocated_regions[0].1; - datastore - .regions_hard_delete(&opctx.log, vec![region_to_delete.id()]) - .await - .unwrap(); - - // Assert disk's volume has one less allocated region - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD - 1); - - let region_total_size: ByteCount = ByteCount::try_from( - region_to_delete.block_size().to_bytes() - * region_to_delete.blocks_per_extent() - * region_to_delete.extent_count(), - ) - .unwrap(); - - // Rerun disk region allocation - datastore - .disk_region_allocate( - &opctx, - db_disk.volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from( - region_to_delete.block_size().to_bytes() as u32, - ) - .unwrap(), - }, - region_total_size, - &RegionAllocationStrategy::Random { seed: None }, - ) - .await - .unwrap(); - - // Assert redundancy was restored - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); -} - #[nexus_test] async fn 
test_no_halt_disk_delete_one_region_on_expunged_agent( cptestctx: &ControlPlaneTestContext, diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 5a03a6b95a..ff88de52e0 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -13,6 +13,7 @@ use http::method::Method; use http::StatusCode; use nexus_config::RegionAllocationStrategy; use nexus_db_model::RegionSnapshotReplacement; +use nexus_db_model::RegionSnapshotReplacementState; use nexus_db_model::VolumeResourceUsage; use nexus_db_model::VolumeResourceUsageRecord; use nexus_db_queries::context::OpContext; @@ -50,6 +51,8 @@ use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; use omicron_common::api::internal; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::DownstairsRegionKind; use omicron_uuid_kinds::GenericUuid; @@ -3757,22 +3760,14 @@ impl TestReadOnlyRegionReferenceUsage { .any(|(_, r)| r.id() == self.region.id())); } - pub async fn region_returned_by_find_deleted_volume_regions(&self) { - let deleted_volume_regions = - self.datastore.find_deleted_volume_regions().await.unwrap(); - - assert!(deleted_volume_regions - .into_iter() - .any(|(_, r, _, _)| r.id() == self.region.id())); - } - + // read-only regions should never be returned by find_deleted_volume_regions pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { let deleted_volume_regions = self.datastore.find_deleted_volume_regions().await.unwrap(); assert!(!deleted_volume_regions .into_iter() - .any(|(_, r, _, _)| r.id() == self.region.id())); + .any(|(_, r, _)| r.id() == self.region.id())); } pub async fn create_first_volume_region_in_rop(&self) { @@ -3923,9 +3918,9 @@ async fn test_read_only_region_reference_usage_sanity( harness.validate_region_returned_for_cleanup().await; - // It should be returned by find_deleted_volume_regions. + // It should not be returned by find_deleted_volume_regions. - harness.region_returned_by_find_deleted_volume_regions().await; + harness.region_not_returned_by_find_deleted_volume_regions().await; } /// Assert that creating a volume with a read-only region in the ROP creates an @@ -3960,9 +3955,9 @@ async fn test_read_only_region_reference_sanity_rop( harness.validate_region_returned_for_cleanup().await; - // It should be returned by find_deleted_volume_regions. + // It should not be returned by find_deleted_volume_regions. - harness.region_returned_by_find_deleted_volume_regions().await; + harness.region_not_returned_by_find_deleted_volume_regions().await; } /// Assert that creating multiple volumes with a read-only region creates the @@ -4007,7 +4002,9 @@ async fn test_read_only_region_reference_sanity_multi( harness.validate_region_returned_for_cleanup().await; - harness.region_returned_by_find_deleted_volume_regions().await; + // It should not be returned by find_deleted_volume_regions. 
+ + harness.region_not_returned_by_find_deleted_volume_regions().await; } /// Assert that creating multiple volumes with a read-only region in the ROP @@ -4052,7 +4049,9 @@ async fn test_read_only_region_reference_sanity_rop_multi( harness.validate_region_returned_for_cleanup().await; - harness.region_returned_by_find_deleted_volume_regions().await; + // It should not be returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; } /// Assert that a read-only region is properly reference counted and not @@ -5530,3 +5529,302 @@ async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( assert_eq!(records_before, records_after); } + +#[nexus_test] +async fn test_double_layer_with_read_only_region_delete( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create disk, snapshot, then two disks from those snapshots + // 2) Replace one of the region snapshots in that snapshot volume with a + // read-only region + // 3) Delete in the following order: disk, snapshot, then two disks from the + // snapshot - after each delete, verify that crucible resources were not + // prematurely deleted + // 6) At the end, assert that all Crucible resources were cleaned up + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let _another_disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "another-disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's targets, + // causing a read-only region to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete the disk and snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete disk-from-snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Finally, delete another-disk-from-snapshot + + NexusRequest::object_delete( + client, + &get_disk_url("another-disk-from-snapshot"), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete another-disk-from-snapshot"); + + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_double_layer_snapshot_with_read_only_region_delete_2( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create disk, then a snapshot + // 2) Replace two of the region snapshots in that snapshot volume with + // read-only regions + // 3) Create a disk from the snapshot + // 4) Replace the last of the region snapshots in that snapshot volume with + // a read-only region + // 5) Delete in the following order: disk, snapshot, then the disk from the + // snapshot - after each delete, verify that crucible resources were not + // prematurely deleted + // 6) At the end, assert that all Crucible resources were cleaned up + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Perform region snapshot replacement for two of the snapshot's targets, + // causing two read-only regions to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + wait_for_condition( + || { + let datastore = datastore.clone(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(); + + let state = request.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + let (dataset, region) = &allocated_regions[1]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Create a disk from the snapshot + + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Replace the last of the region snapshot targets + + let (dataset, region) = &allocated_regions[2]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete the disk and snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + assert!(!disk_test.crucible_resources_deleted().await); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete disk-from-snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + // Assert everything was cleaned up + + assert!(disk_test.crucible_resources_deleted().await); +} diff --git a/schema/crdb/crucible-ref-count-records/up09.sql b/schema/crdb/crucible-ref-count-records/up09.sql new file mode 100644 index 0000000000..be56ed401c --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up09.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS lookup_regions_by_read_only + on omicron.public.region (read_only); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 
ae4dd3e65b..2984c62dd5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -622,6 +622,9 @@ CREATE INDEX IF NOT EXISTS lookup_regions_missing_ports on omicron.public.region (id) WHERE port IS NULL; +CREATE INDEX IF NOT EXISTS lookup_regions_by_read_only + on omicron.public.region (read_only); + /* * A snapshot of a region, within a dataset. */
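
The reworked find_deleted_volume_regions_txn LEFT JOINs the volume table, so the volume side of each row is an Option: a soft-deleted volume is a row with time_deleted set, while a hard-deleted volume is no row at all (surfaced as None by the join). A minimal standalone sketch of that "deleted or missing" check follows; VolumeRow is a stand-in for the real nexus_db_model::Volume, not code from this change.

    // Sketch only: stand-in type, not the real nexus_db_model::Volume
    // (which uses a chrono DateTime for time_deleted).
    use std::time::SystemTime;

    struct VolumeRow {
        time_deleted: Option<SystemTime>,
    }

    /// True when the region's volume is soft-deleted or hard-deleted,
    /// i.e. the region is a candidate for garbage collection.
    fn volume_deleted(volume: Option<&VolumeRow>) -> bool {
        match volume {
            // Row exists: deleted only if it has been soft-deleted.
            Some(row) => row.time_deleted.is_some(),
            // No row at all: the volume was hard-deleted.
            None => true,
        }
    }

    fn main() {
        assert!(volume_deleted(None));
        assert!(!volume_deleted(Some(&VolumeRow { time_deleted: None })));
        assert!(volume_deleted(Some(&VolumeRow {
            time_deleted: Some(SystemTime::now()),
        })));
    }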
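
The validate_volume_invariants calls added throughout the datastore are gated with #[cfg(any(test, feature = "testing"))], so both the checks and their call sites are compiled out of production builds. A generic sketch of that gating pattern is below, under illustrative names (Store, validate_invariants); it is not the datastore API itself.

    // Sketch of cfg-gated invariant checking: the validator and its call
    // compile only for `cfg(test)` or when a `testing` feature is enabled.
    struct Store {
        rows: Vec<u32>,
    }

    impl Store {
        fn mutate(&mut self, value: u32) -> Result<(), String> {
            self.rows.push(value);

            // Compiled out entirely in non-test, non-"testing" builds.
            #[cfg(any(test, feature = "testing"))]
            self.validate_invariants()?;

            Ok(())
        }

        #[cfg(any(test, feature = "testing"))]
        fn validate_invariants(&self) -> Result<(), String> {
            if self.rows.iter().any(|&v| v == 0) {
                return Err("invariant violated: zero row present".to_string());
            }
            Ok(())
        }
    }

    fn main() {
        let mut store = Store { rows: Vec::new() };
        store.mutate(7).unwrap();
    }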