diff --git a/Cargo.lock b/Cargo.lock index d99358a9ad..7dbaa3e008 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4464,6 +4464,7 @@ dependencies = [ "serde_with", "sled-agent-client", "slog", + "static_assertions", "steno", "strum", "subprocess", diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index eaf3dc1295..c16c0f5319 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -47,6 +47,7 @@ serde_urlencoded.workspace = true serde_with.workspace = true sled-agent-client.workspace = true slog.workspace = true +static_assertions.workspace = true steno.workspace = true thiserror.workspace = true tokio = { workspace = true, features = [ "full" ] } diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 14886ba018..6db99465a3 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -34,12 +34,15 @@ use crate::db::model::VpcUpdate; use crate::db::model::{Ipv4Net, Ipv6Net}; use crate::db::pagination::paginated; use crate::db::queries::vpc::InsertVpcQuery; +use crate::db::queries::vpc::VniSearchIter; use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; use crate::db::queries::vpc_subnet::SubnetError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; +use diesel::result::DatabaseErrorKind; +use diesel::result::Error as DieselError; use ipnetwork::IpNetwork; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; @@ -85,18 +88,23 @@ impl DataStore { SERVICES_VPC.clone(), Some(Vni(ExternalVni::SERVICES_VNI)), ); - let authz_vpc = self + let authz_vpc = match self .project_create_vpc_raw(opctx, &authz_project, vpc_query) .await - .map(|(authz_vpc, _)| authz_vpc) - .or_else(|e| match e { - Error::ObjectAlreadyExists { .. } => Ok(authz::Vpc::new( - authz_project.clone(), - *SERVICES_VPC_ID, - LookupType::ByName(SERVICES_VPC.identity.name.to_string()), - )), - _ => Err(e), - })?; + { + Ok(None) => { + let msg = "VNI exhaustion detected when creating built-in VPCs"; + error!(opctx.log, "{}", msg); + Err(Error::internal_error(msg)) + } + Ok(Some((authz_vpc, _))) => Ok(authz_vpc), + Err(Error::ObjectAlreadyExists { .. }) => Ok(authz::Vpc::new( + authz_project.clone(), + *SERVICES_VPC_ID, + LookupType::ByName(SERVICES_VPC.identity.name.to_string()), + )), + Err(e) => Err(e), + }?; // Also add the system router and internet gateway route @@ -287,22 +295,65 @@ impl DataStore { &self, opctx: &OpContext, authz_project: &authz::Project, - vpc: IncompleteVpc, + mut vpc: IncompleteVpc, ) -> Result<(authz::Vpc, Vpc), Error> { - self.project_create_vpc_raw( - opctx, - authz_project, - InsertVpcQuery::new(vpc), - ) - .await + // Generate an iterator that allows us to search the entire space of + // VNIs for this VPC, in manageable chunks to limit memory usage. + let vnis = VniSearchIter::new(vpc.vni.0); + for (i, vni) in vnis.enumerate() { + vpc.vni = Vni(vni); + let id = usdt::UniqueId::new(); + crate::probes::vni__search__range__start!(|| { + (&id, u32::from(vni), VniSearchIter::STEP_SIZE) + }); + match self + .project_create_vpc_raw( + opctx, + authz_project, + InsertVpcQuery::new(vpc.clone()), + ) + .await + { + Ok(Some((authz_vpc, vpc))) => { + crate::probes::vni__search__range__found!(|| { + (&id, u32::from(vpc.vni.0)) + }); + return Ok((authz_vpc, vpc)); + } + Err(e) => return Err(e), + Ok(None) => { + crate::probes::vni__search__range__empty!(|| (&id)); + debug!( + opctx.log, + "No VNIs available within current search range, retrying"; + "attempt" => i, + "vpc_name" => %vpc.identity.name, + "start_vni" => ?vni, + ); + } + } + } + + // We've failed to find a VNI after searching the entire range, so we'll + // return a 503 at this point. + error!( + opctx.log, + "failed to find a VNI after searching entire range"; + ); + Err(Error::unavail("Failed to find a free VNI for this VPC")) } + // Internal implementation for creating a VPC. + // + // This returns an optional VPC. If it is None, then we failed to insert a + // VPC specifically because there are no available VNIs. All other errors + // are returned in the `Result::Err` variant. async fn project_create_vpc_raw( &self, opctx: &OpContext, authz_project: &authz::Project, vpc_query: InsertVpcQuery, - ) -> Result<(authz::Vpc, Vpc), Error> { + ) -> Result, Error> { use db::schema::vpc::dsl; assert_eq!(authz_project.id(), vpc_query.vpc.project_id); @@ -312,30 +363,48 @@ impl DataStore { let project_id = vpc_query.vpc.project_id; let conn = self.pool_connection_authorized(opctx).await?; - let vpc: Vpc = Project::insert_resource( + let result: Result = Project::insert_resource( project_id, diesel::insert_into(dsl::vpc).values(vpc_query), ) .insert_and_get_result_async(&conn) - .await - .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => Error::ObjectNotFound { - type_name: ResourceType::Project, - lookup_type: LookupType::ById(project_id), - }, - AsyncInsertError::DatabaseError(e) => public_error_from_diesel( - e, - ErrorHandler::Conflict(ResourceType::Vpc, name.as_str()), - ), - })?; - Ok(( - authz::Vpc::new( - authz_project.clone(), - vpc.id(), - LookupType::ByName(vpc.name().to_string()), - ), - vpc, - )) + .await; + match result { + Ok(vpc) => Ok(Some(( + authz::Vpc::new( + authz_project.clone(), + vpc.id(), + LookupType::ByName(vpc.name().to_string()), + ), + vpc, + ))), + Err(AsyncInsertError::CollectionNotFound) => { + Err(Error::ObjectNotFound { + type_name: ResourceType::Project, + lookup_type: LookupType::ById(project_id), + }) + } + Err(AsyncInsertError::DatabaseError( + DieselError::DatabaseError( + DatabaseErrorKind::NotNullViolation, + info, + ), + )) if info + .message() + .starts_with("null value in column \"vni\"") => + { + // We failed the non-null check on the VNI column, which means + // we could not find a valid VNI in our search range. Return + // None instead to signal the error. + Ok(None) + } + Err(AsyncInsertError::DatabaseError(e)) => { + Err(public_error_from_diesel( + e, + ErrorHandler::Conflict(ResourceType::Vpc, name.as_str()), + )) + } + } } pub async fn project_update_vpc( @@ -1092,3 +1161,234 @@ impl DataStore { }) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::db::datastore::datastore_test; + use crate::db::model::Project; + use crate::db::queries::vpc::MAX_VNI_SEARCH_RANGE_SIZE; + use nexus_test_utils::db::test_setup_database; + use nexus_types::external_api::params; + use omicron_common::api::external; + use omicron_test_utils::dev; + use slog::info; + + // Test that we detect the right error condition and return None when we + // fail to insert a VPC due to VNI exhaustion. + // + // This is a bit awkward, but we'll test this by inserting a bunch of VPCs, + // and checking that we get the expected error response back from the + // `project_create_vpc_raw` call. + #[tokio::test] + async fn test_project_create_vpc_raw_returns_none_on_vni_exhaustion() { + usdt::register_probes().unwrap(); + let logctx = dev::test_setup_log( + "test_project_create_vpc_raw_returns_none_on_vni_exhaustion", + ); + let log = &logctx.log; + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create a project. + let project_params = params::ProjectCreate { + identity: IdentityMetadataCreateParams { + name: "project".parse().unwrap(), + description: String::from("test project"), + }, + }; + let project = Project::new(Uuid::new_v4(), project_params); + let (authz_project, _) = datastore + .project_create(&opctx, project) + .await + .expect("failed to create project"); + + let starting_vni = 2048; + let description = String::from("test vpc"); + for vni in 0..=MAX_VNI_SEARCH_RANGE_SIZE { + // Create an incomplete VPC and make sure it has the next available + // VNI. + let name: external::Name = format!("vpc{vni}").parse().unwrap(); + let mut incomplete_vpc = IncompleteVpc::new( + Uuid::new_v4(), + authz_project.id(), + Uuid::new_v4(), + params::VpcCreate { + identity: IdentityMetadataCreateParams { + name: name.clone(), + description: description.clone(), + }, + ipv6_prefix: None, + dns_name: name.clone(), + }, + ) + .expect("failed to create incomplete VPC"); + let this_vni = + Vni(external::Vni::try_from(starting_vni + vni).unwrap()); + incomplete_vpc.vni = this_vni; + info!( + log, + "creating initial VPC"; + "index" => vni, + "vni" => ?this_vni, + ); + let query = InsertVpcQuery::new(incomplete_vpc); + let (_, db_vpc) = datastore + .project_create_vpc_raw(&opctx, &authz_project, query) + .await + .expect("failed to create initial set of VPCs") + .expect("expected an actual VPC"); + info!( + log, + "created VPC"; + "vpc" => ?db_vpc, + ); + } + + // At this point, we've filled all the VNIs starting from 2048. Let's + // try to allocate one more, also starting from that position. This + // should fail, because we've explicitly filled the entire range we'll + // search above. + let name: external::Name = "dead-vpc".parse().unwrap(); + let mut incomplete_vpc = IncompleteVpc::new( + Uuid::new_v4(), + authz_project.id(), + Uuid::new_v4(), + params::VpcCreate { + identity: IdentityMetadataCreateParams { + name: name.clone(), + description: description.clone(), + }, + ipv6_prefix: None, + dns_name: name.clone(), + }, + ) + .expect("failed to create incomplete VPC"); + let this_vni = Vni(external::Vni::try_from(starting_vni).unwrap()); + incomplete_vpc.vni = this_vni; + info!( + log, + "creating VPC when all VNIs are allocated"; + "vni" => ?this_vni, + ); + let query = InsertVpcQuery::new(incomplete_vpc); + let Ok(None) = datastore + .project_create_vpc_raw(&opctx, &authz_project, query) + .await + else { + panic!("Expected Ok(None) when creating a VPC without any available VNIs"); + }; + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + // Test that we appropriately retry when there are no available VNIs. + // + // This is a bit awkward, but we'll test this by inserting a bunch of VPCs, + // and then check that we correctly retry + #[tokio::test] + async fn test_project_create_vpc_retries() { + usdt::register_probes().unwrap(); + let logctx = dev::test_setup_log("test_project_create_vpc_retries"); + let log = &logctx.log; + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create a project. + let project_params = params::ProjectCreate { + identity: IdentityMetadataCreateParams { + name: "project".parse().unwrap(), + description: String::from("test project"), + }, + }; + let project = Project::new(Uuid::new_v4(), project_params); + let (authz_project, _) = datastore + .project_create(&opctx, project) + .await + .expect("failed to create project"); + + let starting_vni = 2048; + let description = String::from("test vpc"); + for vni in 0..=MAX_VNI_SEARCH_RANGE_SIZE { + // Create an incomplete VPC and make sure it has the next available + // VNI. + let name: external::Name = format!("vpc{vni}").parse().unwrap(); + let mut incomplete_vpc = IncompleteVpc::new( + Uuid::new_v4(), + authz_project.id(), + Uuid::new_v4(), + params::VpcCreate { + identity: IdentityMetadataCreateParams { + name: name.clone(), + description: description.clone(), + }, + ipv6_prefix: None, + dns_name: name.clone(), + }, + ) + .expect("failed to create incomplete VPC"); + let this_vni = + Vni(external::Vni::try_from(starting_vni + vni).unwrap()); + incomplete_vpc.vni = this_vni; + info!( + log, + "creating initial VPC"; + "index" => vni, + "vni" => ?this_vni, + ); + let query = InsertVpcQuery::new(incomplete_vpc); + let (_, db_vpc) = datastore + .project_create_vpc_raw(&opctx, &authz_project, query) + .await + .expect("failed to create initial set of VPCs") + .expect("expected an actual VPC"); + info!( + log, + "created VPC"; + "vpc" => ?db_vpc, + ); + } + + // Similar to the above test, we've fill all available VPCs starting at + // `starting_vni`. Let's attempt to allocate one beginning there, which + // _should_ fail and be internally retried. Note that we're using + // `project_create_vpc()` here instead of the raw version, to check that + // retry logic. + let name: external::Name = "dead-at-first-vpc".parse().unwrap(); + let mut incomplete_vpc = IncompleteVpc::new( + Uuid::new_v4(), + authz_project.id(), + Uuid::new_v4(), + params::VpcCreate { + identity: IdentityMetadataCreateParams { + name: name.clone(), + description: description.clone(), + }, + ipv6_prefix: None, + dns_name: name.clone(), + }, + ) + .expect("failed to create incomplete VPC"); + let this_vni = Vni(external::Vni::try_from(starting_vni).unwrap()); + incomplete_vpc.vni = this_vni; + info!( + log, + "creating VPC when all VNIs are allocated"; + "vni" => ?this_vni, + ); + match datastore + .project_create_vpc(&opctx, &authz_project, incomplete_vpc.clone()) + .await + { + Ok((_, vpc)) => { + assert_eq!(vpc.id(), incomplete_vpc.identity.id); + let expected_vni = starting_vni + MAX_VNI_SEARCH_RANGE_SIZE + 1; + assert_eq!(u32::from(vpc.vni.0), expected_vni); + info!(log, "successfully created VPC after retries"; "vpc" => ?vpc); + } + Err(e) => panic!("Unexpected error when inserting VPC: {e}"), + }; + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/src/db/queries/vpc.rs b/nexus/db-queries/src/db/queries/vpc.rs index b1ac8fe1e1..c29a51adb0 100644 --- a/nexus/db-queries/src/db/queries/vpc.rs +++ b/nexus/db-queries/src/db/queries/vpc.rs @@ -245,15 +245,7 @@ struct NextVni { impl NextVni { fn new(vni: Vni) -> Self { - let base_u32 = u32::from(vni.0); - // The valid range is [0, 1 << 24], so the maximum shift is whatever - // gets us to 1 << 24, and the minimum is whatever gets us back to the - // minimum guest VNI. - let max_shift = i64::from(external::Vni::MAX_VNI - base_u32); - let min_shift = i64::from( - -i32::try_from(base_u32 - external::Vni::MIN_GUEST_VNI) - .expect("Expected a valid VNI at this point"), - ); + let VniShifts { min_shift, max_shift } = VniShifts::new(vni); let generator = DefaultShiftGenerator { base: vni, max_shift, min_shift }; let inner = NextItem::new_unscoped(generator); @@ -278,3 +270,208 @@ impl NextVni { } delegate_query_fragment_impl!(NextVni); + +// Helper type to compute the shift for a `NextItem` query to find VNIs. +#[derive(Clone, Copy, Debug, PartialEq)] +struct VniShifts { + // The minimum `ShiftGenerator` shift. + min_shift: i64, + // The maximum `ShiftGenerator` shift. + max_shift: i64, +} + +/// Restrict the search for a VNI to a small range. +/// +/// VNIs are pretty sparsely allocated (the number of VPCs), and the range is +/// quite large (24 bits). To avoid memory issues, we'll restrict a search +/// for an available VNI to a small range starting from the random starting +/// VNI. +// +// NOTE: This is very small for tests, to ensure we can accurately test the +// failure mode where there are no available VNIs. +#[cfg(not(test))] +pub const MAX_VNI_SEARCH_RANGE_SIZE: u32 = 2048; +#[cfg(test)] +pub const MAX_VNI_SEARCH_RANGE_SIZE: u32 = 10; + +// Ensure that we cannot search a range that extends beyond the valid guest VNI +// range. +static_assertions::const_assert!( + MAX_VNI_SEARCH_RANGE_SIZE + <= (external::Vni::MAX_VNI - external::Vni::MIN_GUEST_VNI) +); + +impl VniShifts { + fn new(vni: Vni) -> Self { + let base_u32 = u32::from(vni.0); + let range_end = base_u32 + MAX_VNI_SEARCH_RANGE_SIZE; + + // Clamp the maximum shift at the distance to the maximum allowed VNI, + // or the maximum of the range. + let max_shift = i64::from( + (external::Vni::MAX_VNI - base_u32).min(MAX_VNI_SEARCH_RANGE_SIZE), + ); + + // And any remaining part of the range wraps around starting at the + // beginning. + let min_shift = -i64::from( + range_end.checked_sub(external::Vni::MAX_VNI).unwrap_or(0), + ); + Self { min_shift, max_shift } + } +} + +/// An iterator yielding sequential starting VNIs. +/// +/// The VPC insertion query requires a search for the next available VNI, using +/// the `NextItem` query. We limit the search for each query to avoid memory +/// issues on any one query. If we fail to find a VNI, we need to search the +/// next range. This iterator yields the starting positions for the `NextItem` +/// query, so that the entire range can be search in chunks until a free VNI is +/// found. +// +// NOTE: It's technically possible for this to lead to searching the very +// initial portion of the range twice. If we end up wrapping around so that the +// last position yielded by this iterator is `start - x`, then we'll end up +// searching from `start - x` to `start + (MAX_VNI_SEARCH_RANGE_SIZE - x)`, and +// so search those first few after `start` again. This is both innocuous and +// really unlikely. +#[derive(Clone, Copy, Debug)] +pub struct VniSearchIter { + start: u32, + current: u32, + has_wrapped: bool, +} + +impl VniSearchIter { + pub const STEP_SIZE: u32 = MAX_VNI_SEARCH_RANGE_SIZE; + + /// Create a search range, starting from the provided VNI. + pub fn new(start: external::Vni) -> Self { + let start = u32::from(start); + Self { start, current: start, has_wrapped: false } + } +} + +impl std::iter::Iterator for VniSearchIter { + type Item = external::Vni; + + fn next(&mut self) -> Option { + // If we've wrapped around and the computed position is beyond where we + // started, then the ite + if self.has_wrapped && self.current > self.start { + return None; + } + + // Compute the next position. + // + // Make sure we wrap around to the mininum guest VNI. Note that we + // consider the end of the range inclusively, so we subtract one in the + // offset below to end up _at_ the min guest VNI. + let mut next = self.current + MAX_VNI_SEARCH_RANGE_SIZE; + if next > external::Vni::MAX_VNI { + next -= external::Vni::MAX_VNI; + next += external::Vni::MIN_GUEST_VNI - 1; + self.has_wrapped = true; + } + let current = self.current; + self.current = next; + Some(external::Vni::try_from(current).unwrap()) + } +} + +#[cfg(test)] +mod tests { + use super::external; + use super::Vni; + use super::VniSearchIter; + use super::VniShifts; + use super::MAX_VNI_SEARCH_RANGE_SIZE; + + // Ensure that when the search range lies entirely within the range of VNIs, + // we search from the start VNI through the maximum allowed range size. + #[test] + fn test_vni_shift_no_wrapping() { + let vni = Vni(external::Vni::try_from(2048).unwrap()); + let VniShifts { min_shift, max_shift } = VniShifts::new(vni); + assert_eq!(min_shift, 0); + assert_eq!(max_shift, i64::from(MAX_VNI_SEARCH_RANGE_SIZE)); + assert_eq!(max_shift - min_shift, i64::from(MAX_VNI_SEARCH_RANGE_SIZE)); + } + + // Ensure that we wrap correctly, when the starting VNI happens to land + // quite close to the end of the allowed range. + #[test] + fn test_vni_shift_with_wrapping() { + let offset = 5; + let vni = + Vni(external::Vni::try_from(external::Vni::MAX_VNI - offset) + .unwrap()); + let VniShifts { min_shift, max_shift } = VniShifts::new(vni); + assert_eq!(min_shift, -i64::from(MAX_VNI_SEARCH_RANGE_SIZE - offset)); + assert_eq!(max_shift, i64::from(offset)); + assert_eq!(max_shift - min_shift, i64::from(MAX_VNI_SEARCH_RANGE_SIZE)); + } + + #[test] + fn test_vni_search_iter_steps() { + let start = external::Vni::try_from(2048).unwrap(); + let mut it = VniSearchIter::new(start); + let next = it.next().unwrap(); + assert_eq!(next, start); + let next = it.next().unwrap(); + assert_eq!( + next, + external::Vni::try_from( + u32::from(start) + MAX_VNI_SEARCH_RANGE_SIZE + ) + .unwrap() + ); + } + + #[test] + fn test_vni_search_iter_full_count() { + let start = + external::Vni::try_from(external::Vni::MIN_GUEST_VNI).unwrap(); + + let last = VniSearchIter::new(start).last().unwrap(); + println!("{:?}", last); + + pub const fn div_ceil(x: u32, y: u32) -> u32 { + let d = x / y; + let r = x % y; + if r > 0 && y > 0 { + d + 1 + } else { + d + } + } + const N_EXPECTED: u32 = div_ceil( + external::Vni::MAX_VNI - external::Vni::MIN_GUEST_VNI, + MAX_VNI_SEARCH_RANGE_SIZE, + ); + let count = u32::try_from(VniSearchIter::new(start).count()).unwrap(); + assert_eq!(count, N_EXPECTED); + } + + #[test] + fn test_vni_search_iter_wrapping() { + // Start from just before the end of the range. + let start = + external::Vni::try_from(external::Vni::MAX_VNI - 1).unwrap(); + let mut it = VniSearchIter::new(start); + + // We should yield that start position first. + let next = it.next().unwrap(); + assert_eq!(next, start); + + // The next value should be wrapped around to the beginning. + // + // Subtract 2 because we _include_ the max VNI in the search range. + let next = it.next().unwrap(); + assert_eq!( + u32::from(next), + external::Vni::MIN_GUEST_VNI + MAX_VNI_SEARCH_RANGE_SIZE - 2 + ); + } +} diff --git a/nexus/db-queries/src/lib.rs b/nexus/db-queries/src/lib.rs index 29c33039ff..a693f7ff42 100644 --- a/nexus/db-queries/src/lib.rs +++ b/nexus/db-queries/src/lib.rs @@ -17,3 +17,22 @@ extern crate newtype_derive; #[cfg(test)] #[macro_use] extern crate diesel; + +#[usdt::provider(provider = "nexus__db__queries")] +mod probes { + // Fires before we start a search over a range for a VNI. + // + // Includes the starting VNI and the size of the range being searched. + fn vni__search__range__start( + _: &usdt::UniqueId, + start_vni: u32, + size: u32, + ) { + } + + // Fires when we successfully find a VNI. + fn vni__search__range__found(_: &usdt::UniqueId, vni: u32) {} + + // Fires when we fail to find a VNI in the provided range. + fn vni__search__range__empty(_: &usdt::UniqueId) {} +}