From 877a886e3ce33e0b1bf93bf479887d9f9c25af2a Mon Sep 17 00:00:00 2001 From: Justin Bennett Date: Wed, 13 Dec 2023 01:19:17 -0500 Subject: [PATCH] Add resource limits (#4605) This PR aims to introduce `quotas` as a concept into Nexus for allowing operators to enforce virtual resource limits at the silo level. The initial implementation will be limited to checks during instance start, disk creation, and snapshot creation. We will _not_ be doing advanced quota recalculation as system resources change. We will _not yet_ be enforcing intelligent quota caps where the sum of all quotas must be less than the theoretical available system virtual resources. The implementation of this functionality is shaped by [RFD-427](https://rfd.shared.oxide.computer/rfd/0427) but some desired functionality will be deferred given time/complexity constraints. Longer term I believe the shape of quotas and perhaps even their relationship to silos may change. This PR implements a simplified version that matches closely to how the virtual resource provisioning tables are already built out. I know there's some oddness around the shape of the quotas table with it not having its own ID and otherwise being mildly divergent from other resources, but this was largely to ensure we could migrate to another solution _and_ not overcomplicate the initial implementation. 
## TODO - [x] Add quota creation as a step of silo creation - [x] Add initialization checks in CTEs for instance create, etc to only proceed when quota unmet - [x] Wire up CTE sentinels in upstream callsites - [x] Add backfill migration for existing customers - [x] Add tests for quota enforcement - [x] Delete the quotas when the silo is deleted --------- Co-authored-by: Sean Klein --- common/src/api/external/mod.rs | 1 + end-to-end-tests/src/bin/bootstrap.rs | 17 +- end-to-end-tests/src/helpers/ctx.rs | 2 +- nexus/db-model/src/lib.rs | 2 + .../virtual_provisioning_collection_update.rs | 22 ++ nexus/db-model/src/quota.rs | 109 ++++++ nexus/db-model/src/schema.rs | 13 +- nexus/db-queries/src/db/datastore/mod.rs | 1 + nexus/db-queries/src/db/datastore/quota.rs | 127 +++++++ nexus/db-queries/src/db/datastore/rack.rs | 2 + nexus/db-queries/src/db/datastore/silo.rs | 46 ++- .../virtual_provisioning_collection.rs | 10 +- nexus/db-queries/src/db/fixed_data/silo.rs | 5 + .../virtual_provisioning_collection_update.rs | 228 ++++++++++++- nexus/src/app/external_endpoints.rs | 1 + nexus/src/app/mod.rs | 1 + nexus/src/app/quota.rs | 49 +++ nexus/src/app/rack.rs | 4 + nexus/src/external_api/http_entrypoints.rs | 92 +++++- nexus/test-utils/src/resource_helpers.rs | 1 + nexus/tests/integration_tests/certificates.rs | 10 + nexus/tests/integration_tests/endpoints.rs | 24 ++ nexus/tests/integration_tests/mod.rs | 1 + nexus/tests/integration_tests/quotas.rs | 312 ++++++++++++++++++ nexus/tests/integration_tests/silos.rs | 7 + nexus/tests/output/nexus_tags.txt | 3 + nexus/types/src/external_api/params.rs | 54 +++ nexus/types/src/external_api/views.rs | 8 + openapi/nexus.json | 261 +++++++++++++++ schema/crdb/20.0.0/up01.sql | 8 + schema/crdb/20.0.0/up02.sql | 28 ++ schema/crdb/dbinit.sql | 11 +- 32 files changed, 1429 insertions(+), 31 deletions(-) create mode 100644 nexus/db-model/src/quota.rs create mode 100644 nexus/db-queries/src/db/datastore/quota.rs create mode 100644 
nexus/src/app/quota.rs create mode 100644 nexus/tests/integration_tests/quotas.rs create mode 100644 schema/crdb/20.0.0/up01.sql create mode 100644 schema/crdb/20.0.0/up02.sql diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index a6d729593b..64a2e462ec 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -702,6 +702,7 @@ pub enum ResourceType { Silo, SiloUser, SiloGroup, + SiloQuotas, IdentityProvider, SamlIdentityProvider, SshKey, diff --git a/end-to-end-tests/src/bin/bootstrap.rs b/end-to-end-tests/src/bin/bootstrap.rs index 83a37b8c21..9ddd872bc2 100644 --- a/end-to-end-tests/src/bin/bootstrap.rs +++ b/end-to-end-tests/src/bin/bootstrap.rs @@ -4,11 +4,11 @@ use end_to_end_tests::helpers::{generate_name, get_system_ip_pool}; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use oxide_client::types::{ ByteCount, DeviceAccessTokenRequest, DeviceAuthRequest, DeviceAuthVerify, - DiskCreate, DiskSource, IpRange, Ipv4Range, + DiskCreate, DiskSource, IpRange, Ipv4Range, SiloQuotasUpdate, }; use oxide_client::{ ClientDisksExt, ClientHiddenExt, ClientProjectsExt, - ClientSystemNetworkingExt, + ClientSystemNetworkingExt, ClientSystemSilosExt, }; use serde::{de::DeserializeOwned, Deserialize}; use std::time::Duration; @@ -45,6 +45,19 @@ async fn main() -> Result<()> { .send() .await?; + // ===== SET UP QUOTAS ===== // + eprintln!("setting up quotas..."); + client + .silo_quotas_update() + .silo("recovery") + .body(SiloQuotasUpdate { + cpus: Some(16), + memory: Some(ByteCount(1024 * 1024 * 1024 * 10)), + storage: Some(ByteCount(1024 * 1024 * 1024 * 1024)), + }) + .send() + .await?; + // ===== ENSURE DATASETS ARE READY ===== // eprintln!("ensuring datasets are ready..."); let ctx = Context::from_client(client).await?; diff --git a/end-to-end-tests/src/helpers/ctx.rs b/end-to-end-tests/src/helpers/ctx.rs index 2c66bd4724..0132feafeb 100644 --- a/end-to-end-tests/src/helpers/ctx.rs +++ 
b/end-to-end-tests/src/helpers/ctx.rs @@ -78,7 +78,7 @@ fn rss_config() -> Result { let content = std::fs::read_to_string(&path).unwrap_or(RSS_CONFIG_STR.to_string()); toml::from_str(&content) - .with_context(|| format!("parsing config-rss as TOML")) + .with_context(|| "parsing config-rss as TOML".to_string()) } fn nexus_external_dns_name(config: &SetupServiceConfig) -> String { diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 43bf83fd34..908f6f2368 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -55,6 +55,7 @@ mod system_update; // for join-based marker trait generation. mod ipv4_nat_entry; pub mod queries; +mod quota; mod rack; mod region; mod region_snapshot; @@ -139,6 +140,7 @@ pub use physical_disk::*; pub use physical_disk_kind::*; pub use producer_endpoint::*; pub use project::*; +pub use quota::*; pub use rack::*; pub use region::*; pub use region_snapshot::*; diff --git a/nexus/db-model/src/queries/virtual_provisioning_collection_update.rs b/nexus/db-model/src/queries/virtual_provisioning_collection_update.rs index 6c684016b4..124ffe4db6 100644 --- a/nexus/db-model/src/queries/virtual_provisioning_collection_update.rs +++ b/nexus/db-model/src/queries/virtual_provisioning_collection_update.rs @@ -8,6 +8,7 @@ //! for the construction of this query. use crate::schema::silo; +use crate::schema::silo_quotas; use crate::schema::virtual_provisioning_collection; table! { @@ -28,11 +29,32 @@ table! { } } +table! { + quotas (silo_id) { + silo_id -> Uuid, + cpus -> Int8, + memory -> Int8, + storage -> Int8, + } +} + +table! 
{ + silo_provisioned { + id -> Uuid, + virtual_disk_bytes_provisioned -> Int8, + cpus_provisioned -> Int8, + ram_provisioned -> Int8, + } +} + diesel::allow_tables_to_appear_in_same_query!(silo, parent_silo,); diesel::allow_tables_to_appear_in_same_query!( virtual_provisioning_collection, + silo_quotas, parent_silo, all_collections, do_update, + quotas, + silo_provisioned ); diff --git a/nexus/db-model/src/quota.rs b/nexus/db-model/src/quota.rs new file mode 100644 index 0000000000..70a8ffa1fd --- /dev/null +++ b/nexus/db-model/src/quota.rs @@ -0,0 +1,109 @@ +use super::ByteCount; +use crate::schema::silo_quotas; +use chrono::{DateTime, Utc}; +use nexus_types::external_api::{params, views}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +#[derive( + Queryable, + Insertable, + Debug, + Clone, + Selectable, + Serialize, + Deserialize, + AsChangeset, +)] +#[diesel(table_name = silo_quotas)] +pub struct SiloQuotas { + pub silo_id: Uuid, + pub time_created: DateTime, + pub time_modified: DateTime, + + /// The number of CPUs that this silo is allowed to use + pub cpus: i64, + + /// The amount of memory (in bytes) that this silo is allowed to use + #[diesel(column_name = memory_bytes)] + pub memory: ByteCount, + + /// The amount of storage (in bytes) that this silo is allowed to use + #[diesel(column_name = storage_bytes)] + pub storage: ByteCount, +} + +impl SiloQuotas { + pub fn new( + silo_id: Uuid, + cpus: i64, + memory: ByteCount, + storage: ByteCount, + ) -> Self { + Self { + silo_id, + time_created: Utc::now(), + time_modified: Utc::now(), + cpus, + memory, + storage, + } + } + + pub fn arbitrarily_high_default(silo_id: Uuid) -> Self { + let count = params::SiloQuotasCreate::arbitrarily_high_default(); + Self::new( + silo_id, + count.cpus, + count.memory.into(), + count.storage.into(), + ) + } +} + +impl From for views::SiloQuotas { + fn from(silo_quotas: SiloQuotas) -> Self { + Self { + silo_id: silo_quotas.silo_id, + cpus: silo_quotas.cpus, + memory: 
silo_quotas.memory.into(), + storage: silo_quotas.storage.into(), + } + } +} + +impl From for SiloQuotas { + fn from(silo_quotas: views::SiloQuotas) -> Self { + Self { + silo_id: silo_quotas.silo_id, + time_created: Utc::now(), + time_modified: Utc::now(), + cpus: silo_quotas.cpus, + memory: silo_quotas.memory.into(), + storage: silo_quotas.storage.into(), + } + } +} + +// Describes a set of updates for the [`SiloQuotas`] model. +#[derive(AsChangeset)] +#[diesel(table_name = silo_quotas)] +pub struct SiloQuotasUpdate { + pub cpus: Option, + #[diesel(column_name = memory_bytes)] + pub memory: Option, + #[diesel(column_name = storage_bytes)] + pub storage: Option, + pub time_modified: DateTime, +} + +impl From for SiloQuotasUpdate { + fn from(params: params::SiloQuotasUpdate) -> Self { + Self { + cpus: params.cpus, + memory: params.memory.map(|f| f.into()), + storage: params.storage.map(|f| f.into()), + time_modified: Utc::now(), + } + } +} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 51501b4894..10fa8dcfac 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -409,6 +409,17 @@ table! { } } +table! { + silo_quotas(silo_id) { + silo_id -> Uuid, + time_created -> Timestamptz, + time_modified -> Timestamptz, + cpus -> Int8, + memory_bytes -> Int8, + storage_bytes -> Int8, + } +} + table! { network_interface (id) { id -> Uuid, @@ -1322,7 +1333,7 @@ table! { /// /// This should be updated whenever the schema is changed. 
For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(19, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(20, 0, 0); allow_tables_to_appear_in_same_query!( system_update, diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 761c3f995f..1609fc7101 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -68,6 +68,7 @@ mod network_interface; mod oximeter; mod physical_disk; mod project; +mod quota; mod rack; mod region; mod region_snapshot; diff --git a/nexus/db-queries/src/db/datastore/quota.rs b/nexus/db-queries/src/db/datastore/quota.rs new file mode 100644 index 0000000000..2066781e6b --- /dev/null +++ b/nexus/db-queries/src/db/datastore/quota.rs @@ -0,0 +1,127 @@ +use super::DataStore; +use crate::authz; +use crate::context::OpContext; +use crate::db; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::pagination::paginated; +use crate::db::pool::DbConnection; +use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::prelude::*; +use nexus_db_model::SiloQuotas; +use nexus_db_model::SiloQuotasUpdate; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::DeleteResult; +use omicron_common::api::external::Error; +use omicron_common::api::external::ListResultVec; +use omicron_common::api::external::ResourceType; +use omicron_common::api::external::UpdateResult; +use uuid::Uuid; + +impl DataStore { + /// Creates new quotas for a silo. This is grouped with silo creation + /// and shouldn't be called outside of that flow. + /// + /// An authz check _cannot_ be performed here because the authz initialization + /// isn't complete and will lead to a db deadlock. 
+ /// + /// See + pub async fn silo_quotas_create( + &self, + conn: &async_bb8_diesel::Connection, + authz_silo: &authz::Silo, + quotas: SiloQuotas, + ) -> Result<(), Error> { + let silo_id = authz_silo.id(); + use db::schema::silo_quotas::dsl; + + diesel::insert_into(dsl::silo_quotas) + .values(quotas) + .execute_async(conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::SiloQuotas, + &silo_id.to_string(), + ), + ) + }) + .map(|_| ()) + } + + pub async fn silo_quotas_delete( + &self, + opctx: &OpContext, + conn: &async_bb8_diesel::Connection, + authz_silo: &authz::Silo, + ) -> DeleteResult { + // Given that the quotas right now are somewhat of an extension of the + // Silo we just check for delete permission on the silo itself. + opctx.authorize(authz::Action::Delete, authz_silo).await?; + + use db::schema::silo_quotas::dsl; + diesel::delete(dsl::silo_quotas) + .filter(dsl::silo_id.eq(authz_silo.id())) + .execute_async(conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + + pub async fn silo_update_quota( + &self, + opctx: &OpContext, + authz_silo: &authz::Silo, + updates: SiloQuotasUpdate, + ) -> UpdateResult { + opctx.authorize(authz::Action::Modify, authz_silo).await?; + use db::schema::silo_quotas::dsl; + let silo_id = authz_silo.id(); + diesel::update(dsl::silo_quotas) + .filter(dsl::silo_id.eq(silo_id)) + .set(updates) + .returning(SiloQuotas::as_returning()) + .get_result_async(&*self.pool_connection_authorized(opctx).await?) 
+ .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::SiloQuotas, + &silo_id.to_string(), + ), + ) + }) + } + + pub async fn silo_quotas_view( + &self, + opctx: &OpContext, + authz_silo: &authz::Silo, + ) -> Result { + opctx.authorize(authz::Action::Read, authz_silo).await?; + use db::schema::silo_quotas::dsl; + dsl::silo_quotas + .filter(dsl::silo_id.eq(authz_silo.id())) + .first_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + pub async fn fleet_list_quotas( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::silo_quotas::dsl; + paginated(dsl::silo_quotas, dsl::silo_id, pagparams) + .select(SiloQuotas::as_select()) + .load_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } +} diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index a69386cfd0..728da0b0d1 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -912,6 +912,8 @@ mod test { name: "test-silo".parse().unwrap(), description: String::new(), }, + // Set a default quota of a half rack's worth of resources + quotas: external_params::SiloQuotasCreate::arbitrarily_high_default(), discoverable: false, identity_mode: SiloIdentityMode::LocalOnly, admin_group_name: None, diff --git a/nexus/db-queries/src/db/datastore/silo.rs b/nexus/db-queries/src/db/datastore/silo.rs index 437c171fb0..2c0c5f3c47 100644 --- a/nexus/db-queries/src/db/datastore/silo.rs +++ b/nexus/db-queries/src/db/datastore/silo.rs @@ -28,6 +28,7 @@ use chrono::Utc; use diesel::prelude::*; use nexus_db_model::Certificate; use nexus_db_model::ServiceKind; +use nexus_db_model::SiloQuotas; use 
nexus_types::external_api::params; use nexus_types::external_api::shared; use nexus_types::external_api::shared::SiloRole; @@ -61,13 +62,32 @@ impl DataStore { debug!(opctx.log, "attempting to create built-in silos"); use db::schema::silo::dsl; - let count = diesel::insert_into(dsl::silo) - .values([&*DEFAULT_SILO, &*INTERNAL_SILO]) - .on_conflict(dsl::id) - .do_nothing() - .execute_async(&*self.pool_connection_authorized(opctx).await?) + use db::schema::silo_quotas::dsl as quotas_dsl; + let count = self + .pool_connection_authorized(opctx) + .await? + .transaction_async(|conn| async move { + diesel::insert_into(quotas_dsl::silo_quotas) + .values(SiloQuotas::arbitrarily_high_default( + DEFAULT_SILO.id(), + )) + .on_conflict(quotas_dsl::silo_id) + .do_nothing() + .execute_async(&conn) + .await + .map_err(TransactionError::CustomError) + .unwrap(); + diesel::insert_into(dsl::silo) + .values([&*DEFAULT_SILO, &*INTERNAL_SILO]) + .on_conflict(dsl::id) + .do_nothing() + .execute_async(&conn) + .await + .map_err(TransactionError::CustomError) + }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .unwrap(); + info!(opctx.log, "created {} built-in silos", count); self.virtual_provisioning_collection_create( @@ -263,6 +283,18 @@ impl DataStore { self.dns_update(nexus_opctx, &conn, dns_update).await?; + self.silo_quotas_create( + &conn, + &authz_silo, + SiloQuotas::new( + authz_silo.id(), + new_silo_params.quotas.cpus, + new_silo_params.quotas.memory.into(), + new_silo_params.quotas.storage.into(), + ), + ) + .await?; + Ok::>(silo) }) .await?; @@ -380,6 +412,8 @@ impl DataStore { ))); } + self.silo_quotas_delete(opctx, &conn, &authz_silo).await?; + self.virtual_provisioning_collection_delete_on_connection( &opctx.log, &conn, id, ) diff --git a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs index 230c3941ff..348d277ddf 100644 --- 
a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs +++ b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs @@ -195,7 +195,9 @@ impl DataStore { ) .get_results_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .map_err(|e| { + crate::db::queries::virtual_provisioning_collection_update::from_diesel(e) + })?; self.virtual_provisioning_collection_producer .append_disk_metrics(&provisions)?; Ok(provisions) @@ -249,7 +251,7 @@ impl DataStore { ) .get_results_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .map_err(|e| crate::db::queries::virtual_provisioning_collection_update::from_diesel(e))?; self.virtual_provisioning_collection_producer .append_disk_metrics(&provisions)?; Ok(provisions) @@ -270,7 +272,7 @@ impl DataStore { ) .get_results_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .map_err(|e| crate::db::queries::virtual_provisioning_collection_update::from_diesel(e))?; self.virtual_provisioning_collection_producer .append_cpu_metrics(&provisions)?; Ok(provisions) @@ -300,7 +302,7 @@ impl DataStore { ) .get_results_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .map_err(|e| crate::db::queries::virtual_provisioning_collection_update::from_diesel(e))?; self.virtual_provisioning_collection_producer .append_cpu_metrics(&provisions)?; Ok(provisions) diff --git a/nexus/db-queries/src/db/fixed_data/silo.rs b/nexus/db-queries/src/db/fixed_data/silo.rs index d32c4211e9..6eba849ee3 100644 --- a/nexus/db-queries/src/db/fixed_data/silo.rs +++ b/nexus/db-queries/src/db/fixed_data/silo.rs @@ -24,6 +24,9 @@ lazy_static! 
{ name: "default-silo".parse().unwrap(), description: "default silo".to_string(), }, + // This quota is actually _unused_ because the default silo + // isn't constructed in the same way a normal silo would be. + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::LocalOnly, admin_group_name: None, @@ -49,6 +52,8 @@ lazy_static! { name: "oxide-internal".parse().unwrap(), description: "Built-in internal Silo.".to_string(), }, + // The internal silo contains no virtual resources, so it has no allotted capacity. + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::LocalOnly, admin_group_name: None, diff --git a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs index 0a383eb6f1..7672d5af9a 100644 --- a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs +++ b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs @@ -13,16 +13,69 @@ use crate::db::pool::DbConnection; use crate::db::schema::virtual_provisioning_collection; use crate::db::schema::virtual_provisioning_resource; use crate::db::subquery::{AsQuerySource, Cte, CteBuilder, CteQuery}; +use crate::db::true_or_cast_error::matches_sentinel; +use crate::db::true_or_cast_error::TrueOrCastError; use db_macros::Subquery; use diesel::pg::Pg; use diesel::query_builder::{AstPass, Query, QueryFragment, QueryId}; +use diesel::result::Error as DieselError; use diesel::{ - sql_types, CombineDsl, ExpressionMethods, IntoSql, - NullableExpressionMethods, QueryDsl, RunQueryDsl, SelectableHelper, + sql_types, BoolExpressionMethods, CombineDsl, ExpressionMethods, IntoSql, + JoinOnDsl, NullableExpressionMethods, QueryDsl, RunQueryDsl, + SelectableHelper, }; use nexus_db_model::queries::virtual_provisioning_collection_update::{ - all_collections, do_update, parent_silo, + all_collections, 
do_update, parent_silo, quotas, silo_provisioned, }; +use omicron_common::api::external; +use omicron_common::api::external::MessagePair; + +const NOT_ENOUGH_CPUS_SENTINEL: &'static str = "Not enough cpus"; +const NOT_ENOUGH_MEMORY_SENTINEL: &'static str = "Not enough memory"; +const NOT_ENOUGH_STORAGE_SENTINEL: &'static str = "Not enough storage"; + +/// Translates a generic pool error to an external error based +/// on messages which may be emitted when provisioning virtual resources +/// such as instances and disks. +pub fn from_diesel(e: DieselError) -> external::Error { + use crate::db::error; + + let sentinels = [ + NOT_ENOUGH_CPUS_SENTINEL, + NOT_ENOUGH_MEMORY_SENTINEL, + NOT_ENOUGH_STORAGE_SENTINEL, + ]; + if let Some(sentinel) = matches_sentinel(&e, &sentinels) { + match sentinel { + NOT_ENOUGH_CPUS_SENTINEL => { + return external::Error::InsufficientCapacity { + message: MessagePair::new_full( + "vCPU Limit Exceeded: Not enough vCPUs to complete request. Either stop unused instances to free up resources or contact the rack operator to request a capacity increase.".to_string(), + "User tried to allocate an instance but the virtual provisioning resource table indicated that there were not enough CPUs available to satisfy the request.".to_string(), + ) + } + } + NOT_ENOUGH_MEMORY_SENTINEL => { + return external::Error::InsufficientCapacity { + message: MessagePair::new_full( + "Memory Limit Exceeded: Not enough memory to complete request. Either stop unused instances to free up resources or contact the rack operator to request a capacity increase.".to_string(), + "User tried to allocate an instance but the virtual provisioning resource table indicated that there were not enough RAM available to satisfy the request.".to_string(), + ) + } + } + NOT_ENOUGH_STORAGE_SENTINEL => { + return external::Error::InsufficientCapacity { + message: MessagePair::new_full( + "Storage Limit Exceeded: Not enough storage to complete request. 
Either remove unneeded disks and snapshots to free up resources or contact the rack operator to request a capacity increase.".to_string(), + "User tried to allocate a disk or snapshot but the virtual provisioning resource table indicated that there were not enough storage available to satisfy the request.".to_string(), + ) + } + } + _ => {} + } + } + error::public_error_from_diesel(e, error::ErrorHandler::Server) +} #[derive(Subquery, QueryId)] #[subquery(name = parent_silo)] @@ -82,20 +135,86 @@ struct DoUpdate { } impl DoUpdate { - fn new_for_insert(id: uuid::Uuid) -> Self { + fn new_for_insert( + silo_provisioned: &SiloProvisioned, + quotas: &Quotas, + resource: VirtualProvisioningResource, + ) -> Self { use virtual_provisioning_resource::dsl; + let cpus_provisioned_delta = + resource.cpus_provisioned.into_sql::(); + let memory_provisioned_delta = + i64::from(resource.ram_provisioned).into_sql::(); + let storage_provisioned_delta = + i64::from(resource.virtual_disk_bytes_provisioned) + .into_sql::(); + let not_allocted = dsl::virtual_provisioning_resource - .find(id) + .find(resource.id) .count() .single_value() .assume_not_null() .eq(0); + let has_sufficient_cpus = quotas + .query_source() + .select(quotas::cpus) + .single_value() + .assume_not_null() + .ge(silo_provisioned + .query_source() + .select(silo_provisioned::cpus_provisioned) + .single_value() + .assume_not_null() + + cpus_provisioned_delta); + + let has_sufficient_memory = quotas + .query_source() + .select(quotas::memory) + .single_value() + .assume_not_null() + .ge(silo_provisioned + .query_source() + .select(silo_provisioned::ram_provisioned) + .single_value() + .assume_not_null() + + memory_provisioned_delta); + + let has_sufficient_storage = quotas + .query_source() + .select(quotas::storage) + .single_value() + .assume_not_null() + .ge(silo_provisioned + .query_source() + .select(silo_provisioned::virtual_disk_bytes_provisioned) + .single_value() + .assume_not_null() + + 
storage_provisioned_delta); + Self { query: Box::new(diesel::select((ExpressionAlias::new::< do_update::update, - >(not_allocted),))), + >( + not_allocted + .and(TrueOrCastError::new( + cpus_provisioned_delta.eq(0).or(has_sufficient_cpus), + NOT_ENOUGH_CPUS_SENTINEL, + )) + .and(TrueOrCastError::new( + memory_provisioned_delta + .eq(0) + .or(has_sufficient_memory), + NOT_ENOUGH_MEMORY_SENTINEL, + )) + .and(TrueOrCastError::new( + storage_provisioned_delta + .eq(0) + .or(has_sufficient_storage), + NOT_ENOUGH_STORAGE_SENTINEL, + )), + ),))), } } @@ -161,6 +280,67 @@ impl UpdatedProvisions { } } +#[derive(Subquery, QueryId)] +#[subquery(name = quotas)] +struct Quotas { + query: Box>, +} + +impl Quotas { + // TODO: We could potentially skip this in cases where we know we're removing a resource instead of inserting + fn new(parent_silo: &ParentSilo) -> Self { + use crate::db::schema::silo_quotas::dsl; + Self { + query: Box::new( + dsl::silo_quotas + .inner_join( + parent_silo + .query_source() + .on(dsl::silo_id.eq(parent_silo::id)), + ) + .select(( + dsl::silo_id, + dsl::cpus, + ExpressionAlias::new::( + dsl::memory_bytes, + ), + ExpressionAlias::new::( + dsl::storage_bytes, + ), + )), + ), + } + } +} + +#[derive(Subquery, QueryId)] +#[subquery(name = silo_provisioned)] +struct SiloProvisioned { + query: Box>, +} + +impl SiloProvisioned { + fn new(parent_silo: &ParentSilo) -> Self { + use virtual_provisioning_collection::dsl; + Self { + query: Box::new( + dsl::virtual_provisioning_collection + .inner_join( + parent_silo + .query_source() + .on(dsl::id.eq(parent_silo::id)), + ) + .select(( + dsl::id, + dsl::cpus_provisioned, + dsl::ram_provisioned, + dsl::virtual_disk_bytes_provisioned, + )), + ), + } + } +} + // This structure wraps a query, such that it can be used within a CTE. 
// // It generates a name that can be used by the "CteBuilder", but does not @@ -195,6 +375,15 @@ where } } +/// The virtual resource collection is only updated when a resource is inserted +/// or deleted from the resource provisioning table. By probing for the presence +/// or absence of a resource, we can update collections at the same time as we +/// create or destroy the resource, which helps make the operation idempotent. +enum UpdateKind { + Insert(VirtualProvisioningResource), + Delete(uuid::Uuid), +} + /// Constructs a CTE for updating resource provisioning information in all /// collections for a particular object. #[derive(QueryId)] @@ -220,7 +409,7 @@ impl VirtualProvisioningCollectionUpdate { // - values: The updated values to propagate through collections (iff // "do_update" evaluates to "true"). fn apply_update( - do_update: DoUpdate, + update_kind: UpdateKind, update: U, project_id: uuid::Uuid, values: V, @@ -237,6 +426,17 @@ impl VirtualProvisioningCollectionUpdate { &parent_silo, *crate::db::fixed_data::FLEET_ID, ); + + let quotas = Quotas::new(&parent_silo); + let silo_provisioned = SiloProvisioned::new(&parent_silo); + + let do_update = match update_kind { + UpdateKind::Insert(resource) => { + DoUpdate::new_for_insert(&silo_provisioned, "as, resource) + } + UpdateKind::Delete(id) => DoUpdate::new_for_delete(id), + }; + let updated_collections = UpdatedProvisions::new(&all_collections, &do_update, values); @@ -251,6 +451,8 @@ impl VirtualProvisioningCollectionUpdate { let cte = CteBuilder::new() .add_subquery(parent_silo) .add_subquery(all_collections) + .add_subquery(quotas) + .add_subquery(silo_provisioned) .add_subquery(do_update) .add_subquery(update) .add_subquery(updated_collections) @@ -273,8 +475,7 @@ impl VirtualProvisioningCollectionUpdate { provision.virtual_disk_bytes_provisioned = disk_byte_diff; Self::apply_update( - // We should insert the record if it does not already exist. 
- DoUpdate::new_for_insert(id), + UpdateKind::Insert(provision.clone()), // The query to actually insert the record. UnreferenceableSubquery( diesel::insert_into( @@ -305,8 +506,7 @@ impl VirtualProvisioningCollectionUpdate { use virtual_provisioning_resource::dsl as resource_dsl; Self::apply_update( - // We should delete the record if it exists. - DoUpdate::new_for_delete(id), + UpdateKind::Delete(id), // The query to actually delete the record. UnreferenceableSubquery( diesel::delete(resource_dsl::virtual_provisioning_resource) @@ -342,8 +542,7 @@ impl VirtualProvisioningCollectionUpdate { provision.ram_provisioned = ram_diff; Self::apply_update( - // We should insert the record if it does not already exist. - DoUpdate::new_for_insert(id), + UpdateKind::Insert(provision.clone()), // The query to actually insert the record. UnreferenceableSubquery( diesel::insert_into( @@ -378,8 +577,7 @@ impl VirtualProvisioningCollectionUpdate { use virtual_provisioning_resource::dsl as resource_dsl; Self::apply_update( - // We should delete the record if it exists. - DoUpdate::new_for_delete(id), + UpdateKind::Delete(id), // The query to actually delete the record. 
// // The filter condition here ensures that the provisioning record is diff --git a/nexus/src/app/external_endpoints.rs b/nexus/src/app/external_endpoints.rs index 1ab33c5c9c..0a6dd41db6 100644 --- a/nexus/src/app/external_endpoints.rs +++ b/nexus/src/app/external_endpoints.rs @@ -827,6 +827,7 @@ mod test { name: name.parse().unwrap(), description: String::new(), }, + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode, admin_group_name: None, diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index d4c2d596f8..b92714a365 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -51,6 +51,7 @@ mod metrics; mod network_interface; mod oximeter; mod project; +mod quota; mod rack; pub(crate) mod saga; mod session; diff --git a/nexus/src/app/quota.rs b/nexus/src/app/quota.rs new file mode 100644 index 0000000000..f59069a9ab --- /dev/null +++ b/nexus/src/app/quota.rs @@ -0,0 +1,49 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Resource limits and system quotas + +use nexus_db_queries::authz; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db; +use nexus_db_queries::db::lookup; +use nexus_types::external_api::params; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::Error; +use omicron_common::api::external::ListResultVec; +use omicron_common::api::external::UpdateResult; +use uuid::Uuid; + +impl super::Nexus { + pub async fn silo_quotas_view( + &self, + opctx: &OpContext, + silo_lookup: &lookup::Silo<'_>, + ) -> Result { + let (.., authz_silo) = + silo_lookup.lookup_for(authz::Action::Read).await?; + self.db_datastore.silo_quotas_view(opctx, &authz_silo).await + } + + pub(crate) async fn fleet_list_quotas( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + self.db_datastore.fleet_list_quotas(opctx, pagparams).await + } + + pub(crate) async fn silo_update_quota( + &self, + opctx: &OpContext, + silo_lookup: &lookup::Silo<'_>, + updates: &params::SiloQuotasUpdate, + ) -> UpdateResult { + let (.., authz_silo) = + silo_lookup.lookup_for(authz::Action::Modify).await?; + self.db_datastore + .silo_update_quota(opctx, &authz_silo, updates.clone().into()) + .await + } +} diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 1643ac301d..168e9eeaa3 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -203,6 +203,10 @@ impl super::Nexus { name: request.recovery_silo.silo_name, description: "built-in recovery Silo".to_string(), }, + // The recovery silo is initialized with no allocated capacity given it's + // not intended to be used to deploy workloads. Operators can add capacity + // after the fact if they want to use it for that purpose. 
+ quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: SiloIdentityMode::LocalOnly, admin_group_name: None, diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index a6fd7a3ccb..6720f95c39 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -45,7 +45,7 @@ use nexus_db_queries::db::model::Name; use nexus_db_queries::{ authz::ApiResource, db::fixed_data::silo::INTERNAL_SILO_ID, }; -use nexus_types::external_api::params::ProjectSelector; +use nexus_types::external_api::{params::ProjectSelector, views::SiloQuotas}; use nexus_types::{ external_api::views::{SledInstance, Switch}, identity::AssetIdentityMetadata, @@ -280,6 +280,11 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(silo_policy_view)?; api.register(silo_policy_update)?; + api.register(system_quotas_list)?; + + api.register(silo_quotas_view)?; + api.register(silo_quotas_update)?; + api.register(silo_identity_provider_list)?; api.register(saml_identity_provider_create)?; @@ -510,6 +515,91 @@ async fn policy_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Lists resource quotas for all silos +#[endpoint { + method = GET, + path = "/v1/system/silo-quotas", + tags = ["system/silos"], +}] +async fn system_quotas_list( + rqctx: RequestContext>, + query_params: Query, +) -> Result>, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.nexus; + + let query = query_params.into_inner(); + let pagparams = data_page_params_for(&rqctx, &query)?; + + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let quotas = nexus + .fleet_list_quotas(&opctx, &pagparams) + .await? 
+ .into_iter() + .map(|p| p.into()) + .collect(); + + Ok(HttpResponseOk(ScanById::results_page( + &query, + quotas, + &|_, quota: &SiloQuotas| quota.silo_id, + )?)) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// View the resource quotas of a given silo +#[endpoint { + method = GET, + path = "/v1/system/silos/{silo}/quotas", + tags = ["system/silos"], +}] +async fn silo_quotas_view( + rqctx: RequestContext>, + path_params: Path, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.nexus; + + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let silo_lookup = + nexus.silo_lookup(&opctx, path_params.into_inner().silo)?; + let quota = nexus.silo_quotas_view(&opctx, &silo_lookup).await?; + Ok(HttpResponseOk(quota.into())) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Update the resource quotas of a given silo +/// +/// If a quota value is not specified, it will remain unchanged. +#[endpoint { + method = PUT, + path = "/v1/system/silos/{silo}/quotas", + tags = ["system/silos"], +}] +async fn silo_quotas_update( + rqctx: RequestContext>, + path_params: Path, + new_quota: TypedBody, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.nexus; + + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let silo_lookup = + nexus.silo_lookup(&opctx, path_params.into_inner().silo)?; + let quota = nexus + .silo_update_quota(&opctx, &silo_lookup, &new_quota.into_inner()) + .await?; + Ok(HttpResponseOk(quota.into())) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + /// List silos /// /// Lists silos that are discoverable based on the current permissions. 
diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index 1848989bf9..0527d99490 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -287,6 +287,7 @@ pub async fn create_silo( name: silo_name.parse().unwrap(), description: "a silo".to_string(), }, + quotas: params::SiloQuotasCreate::arbitrarily_high_default(), discoverable, identity_mode, admin_group_name: None, diff --git a/nexus/tests/integration_tests/certificates.rs b/nexus/tests/integration_tests/certificates.rs index 1843fc28c8..5a34caab49 100644 --- a/nexus/tests/integration_tests/certificates.rs +++ b/nexus/tests/integration_tests/certificates.rs @@ -394,6 +394,11 @@ async fn test_silo_certificates() { .name(silo2.silo_name.clone()) .description("") .discoverable(false) + .quotas(oxide_client::types::SiloQuotasCreate { + cpus: 0, + memory: oxide_client::types::ByteCount(0), + storage: oxide_client::types::ByteCount(0), + }) .identity_mode(oxide_client::types::SiloIdentityMode::LocalOnly) .tls_certificates(vec![silo2_cert.try_into().unwrap()]), ) @@ -454,6 +459,11 @@ async fn test_silo_certificates() { .name(silo3.silo_name.clone()) .description("") .discoverable(false) + .quotas(oxide_client::types::SiloQuotasCreate { + cpus: 0, + memory: oxide_client::types::ByteCount(0), + storage: oxide_client::types::ByteCount(0), + }) .identity_mode(oxide_client::types::SiloIdentityMode::LocalOnly) .tls_certificates(vec![silo3_cert.try_into().unwrap()]), ) diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index e11902d0fe..bd6df210c0 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -85,12 +85,15 @@ lazy_static! 
{ format!("/v1/system/silos/{}", *DEMO_SILO_NAME); pub static ref DEMO_SILO_POLICY_URL: String = format!("/v1/system/silos/{}/policy", *DEMO_SILO_NAME); + pub static ref DEMO_SILO_QUOTAS_URL: String = + format!("/v1/system/silos/{}/quotas", *DEMO_SILO_NAME); pub static ref DEMO_SILO_CREATE: params::SiloCreate = params::SiloCreate { identity: IdentityMetadataCreateParams { name: DEMO_SILO_NAME.clone(), description: String::from(""), }, + quotas: params::SiloQuotasCreate::arbitrarily_high_default(), discoverable: true, identity_mode: shared::SiloIdentityMode::SamlJit, admin_group_name: None, @@ -950,6 +953,27 @@ lazy_static! { ), ], }, + VerifyEndpoint { + url: &DEMO_SILO_QUOTAS_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Get, + AllowedMethod::Put( + serde_json::to_value( + params::SiloQuotasCreate::empty() + ).unwrap() + ) + ], + }, + VerifyEndpoint { + url: "/v1/system/silo-quotas", + visibility: Visibility::Public, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Get + ], + }, VerifyEndpoint { url: "/v1/policy", visibility: Visibility::Public, diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs index 53de24c518..35c70bf874 100644 --- a/nexus/tests/integration_tests/mod.rs +++ b/nexus/tests/integration_tests/mod.rs @@ -24,6 +24,7 @@ mod oximeter; mod pantry; mod password_login; mod projects; +mod quotas; mod rack; mod role_assignments; mod roles_builtin; diff --git a/nexus/tests/integration_tests/quotas.rs b/nexus/tests/integration_tests/quotas.rs new file mode 100644 index 0000000000..2fddf4e05c --- /dev/null +++ b/nexus/tests/integration_tests/quotas.rs @@ -0,0 +1,312 @@ +use anyhow::Error; +use dropshot::test_util::ClientTestContext; +use dropshot::HttpErrorResponseBody; +use http::Method; +use nexus_test_utils::http_testing::AuthnMode; +use nexus_test_utils::http_testing::NexusRequest; +use 
nexus_test_utils::http_testing::RequestBuilder; +use nexus_test_utils::http_testing::TestResponse; +use nexus_test_utils::resource_helpers::create_local_user; +use nexus_test_utils::resource_helpers::grant_iam; +use nexus_test_utils::resource_helpers::object_create; +use nexus_test_utils::resource_helpers::populate_ip_pool; +use nexus_test_utils::resource_helpers::DiskTest; +use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::params; +use nexus_types::external_api::shared; +use nexus_types::external_api::shared::SiloRole; +use nexus_types::external_api::views::SiloQuotas; +use omicron_common::api::external::ByteCount; +use omicron_common::api::external::IdentityMetadataCreateParams; +use omicron_common::api::external::InstanceCpuCount; + +type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + +struct ResourceAllocator { + auth: AuthnMode, +} + +impl ResourceAllocator { + fn new(auth: AuthnMode) -> Self { + Self { auth } + } + + async fn set_quotas( + &self, + client: &ClientTestContext, + quotas: params::SiloQuotasUpdate, + ) -> Result { + NexusRequest::object_put( + client, + "/v1/system/silos/quota-test-silo/quotas", + Some(&quotas), + ) + .authn_as(self.auth.clone()) + .execute() + .await + } + + async fn get_quotas(&self, client: &ClientTestContext) -> SiloQuotas { + NexusRequest::object_get( + client, + "/v1/system/silos/quota-test-silo/quotas", + ) + .authn_as(self.auth.clone()) + .execute() + .await + .expect("failed to fetch quotas") + .parsed_body() + .expect("failed to parse quotas") + } + + async fn provision_instance( + &self, + client: &ClientTestContext, + name: &str, + cpus: u16, + memory: u32, + ) -> Result { + NexusRequest::objects_post( + client, + "/v1/instances?project=project", + &params::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: name.parse().unwrap(), + description: "".into(), + }, + ncpus: InstanceCpuCount(cpus), + memory: ByteCount::from_gibibytes_u32(memory), + hostname: 
"host".to_string(), + user_data: b"#cloud-config\nsystem_info:\n default_user:\n name: oxide" + .to_vec(), + network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, + external_ips: Vec::::new(), + disks: Vec::::new(), + start: false, + }, + ) + .authn_as(self.auth.clone()) + .execute() + .await + .expect("Instance should be created regardless of quotas"); + + NexusRequest::new( + RequestBuilder::new( + client, + Method::POST, + format!("/v1/instances/{}/start?project=project", name) + .as_str(), + ) + .body(None as Option<&serde_json::Value>), + ) + .authn_as(self.auth.clone()) + .execute() + .await + } + + async fn cleanup_instance( + &self, + client: &ClientTestContext, + name: &str, + ) -> TestResponse { + // Try to stop the instance + NexusRequest::new( + RequestBuilder::new( + client, + Method::POST, + format!("/v1/instances/{}/stop?project=project", name).as_str(), + ) + .body(None as Option<&serde_json::Value>), + ) + .authn_as(self.auth.clone()) + .execute() + .await + .expect("failed to stop instance"); + + NexusRequest::object_delete( + client, + format!("/v1/instances/{}?project=project", name).as_str(), + ) + .authn_as(self.auth.clone()) + .execute() + .await + .expect("failed to delete instance") + } + + async fn provision_disk( + &self, + client: &ClientTestContext, + name: &str, + size: u32, + ) -> Result { + NexusRequest::new( + RequestBuilder::new( + client, + Method::POST, + "/v1/disks?project=project", + ) + .body(Some(&params::DiskCreate { + identity: IdentityMetadataCreateParams { + name: name.parse().unwrap(), + description: "".into(), + }, + size: ByteCount::from_gibibytes_u32(size), + disk_source: params::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + })), + ) + .authn_as(self.auth.clone()) + .execute() + .await + } +} + +async fn setup_silo_with_quota( + client: &ClientTestContext, + silo_name: &str, + quotas: params::SiloQuotasCreate, +) -> ResourceAllocator { + let silo = object_create( + 
client, + "/v1/system/silos", + &params::SiloCreate { + identity: IdentityMetadataCreateParams { + name: silo_name.parse().unwrap(), + description: "".into(), + }, + quotas, + discoverable: true, + identity_mode: shared::SiloIdentityMode::LocalOnly, + admin_group_name: None, + tls_certificates: vec![], + mapped_fleet_roles: Default::default(), + }, + ) + .await; + + populate_ip_pool(&client, "default", None).await; + + // Create a silo user + let user = create_local_user( + client, + &silo, + &"user".parse().unwrap(), + params::UserPassword::LoginDisallowed, + ) + .await; + + // Make silo admin + grant_iam( + client, + format!("/v1/system/silos/{}", silo_name).as_str(), + SiloRole::Admin, + user.id, + AuthnMode::PrivilegedUser, + ) + .await; + + let auth_mode = AuthnMode::SiloUser(user.id); + + NexusRequest::objects_post( + client, + "/v1/projects", + &params::ProjectCreate { + identity: IdentityMetadataCreateParams { + name: "project".parse().unwrap(), + description: "".into(), + }, + }, + ) + .authn_as(auth_mode.clone()) + .execute() + .await + .unwrap(); + + ResourceAllocator::new(auth_mode) +} + +#[nexus_test] +async fn test_quotas(cptestctx: &ControlPlaneTestContext) { + let client = &cptestctx.external_client; + + // Simulate space for disks + DiskTest::new(&cptestctx).await; + + let system = setup_silo_with_quota( + &client, + "quota-test-silo", + params::SiloQuotasCreate::empty(), + ) + .await; + + // Ensure trying to provision an instance with empty quotas fails + let err = system + .provision_instance(client, "instance", 1, 1) + .await + .unwrap() + .parsed_body::() + .expect("failed to parse error body"); + assert!( + err.message.contains("vCPU Limit Exceeded"), + "Unexpected error: {0}", + err.message + ); + system.cleanup_instance(client, "instance").await; + + // Up the CPU, memory quotas + system + .set_quotas( + client, + params::SiloQuotasUpdate { + cpus: Some(4), + memory: Some(ByteCount::from_gibibytes_u32(15)), + storage: 
Some(ByteCount::from_gibibytes_u32(2)), + }, + ) + .await + .expect("failed to set quotas"); + + let quotas = system.get_quotas(client).await; + assert_eq!(quotas.cpus, 4); + assert_eq!(quotas.memory, ByteCount::from_gibibytes_u32(15)); + assert_eq!(quotas.storage, ByteCount::from_gibibytes_u32(2)); + + // Ensure memory quota is enforced + let err = system + .provision_instance(client, "instance", 1, 16) + .await + .unwrap() + .parsed_body::() + .expect("failed to parse error body"); + assert!( + err.message.contains("Memory Limit Exceeded"), + "Unexpected error: {0}", + err.message + ); + system.cleanup_instance(client, "instance").await; + + // Allocating instance should now succeed + system + .provision_instance(client, "instance", 2, 10) + .await + .expect("Instance should've had enough resources to be provisioned"); + + let err = system + .provision_disk(client, "disk", 3) + .await + .unwrap() + .parsed_body::() + .expect("failed to parse error body"); + assert!( + err.message.contains("Storage Limit Exceeded"), + "Unexpected error: {0}", + err.message + ); + + system + .provision_disk(client, "disk", 1) + .await + .expect("Disk should be provisioned"); +} diff --git a/nexus/tests/integration_tests/silos.rs b/nexus/tests/integration_tests/silos.rs index 3c69c8b7cd..a5d4b47eaa 100644 --- a/nexus/tests/integration_tests/silos.rs +++ b/nexus/tests/integration_tests/silos.rs @@ -68,6 +68,7 @@ async fn test_silos(cptestctx: &ControlPlaneTestContext) { name: cptestctx.silo_name.clone(), description: "a silo".to_string(), }, + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::LocalOnly, admin_group_name: None, @@ -284,6 +285,7 @@ async fn test_silo_admin_group(cptestctx: &ControlPlaneTestContext) { name: "silo-name".parse().unwrap(), description: "a silo".to_string(), }, + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::SamlJit, admin_group_name: 
Some("administrator".into()), @@ -2256,6 +2258,7 @@ async fn test_silo_authn_policy(cptestctx: &ControlPlaneTestContext) { name: silo_name, description: String::new(), }, + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::LocalOnly, admin_group_name: None, @@ -2332,6 +2335,7 @@ async fn check_fleet_privileges( name: SILO_NAME.parse().unwrap(), description: String::new(), }, + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::LocalOnly, admin_group_name: None, @@ -2360,6 +2364,7 @@ async fn check_fleet_privileges( name: SILO_NAME.parse().unwrap(), description: String::new(), }, + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::LocalOnly, admin_group_name: None, @@ -2387,6 +2392,7 @@ async fn check_fleet_privileges( name: SILO_NAME.parse().unwrap(), description: String::new(), }, + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::LocalOnly, admin_group_name: None, @@ -2419,6 +2425,7 @@ async fn check_fleet_privileges( name: SILO_NAME.parse().unwrap(), description: String::new(), }, + quotas: params::SiloQuotasCreate::empty(), discoverable: false, identity_mode: shared::SiloIdentityMode::LocalOnly, admin_group_name: None, diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index 5a4a61132e..3f77f4cb26 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -183,9 +183,12 @@ silo_identity_provider_list GET /v1/system/identity-providers silo_list GET /v1/system/silos silo_policy_update PUT /v1/system/silos/{silo}/policy silo_policy_view GET /v1/system/silos/{silo}/policy +silo_quotas_update PUT /v1/system/silos/{silo}/quotas +silo_quotas_view GET /v1/system/silos/{silo}/quotas silo_user_list GET /v1/system/users silo_user_view GET /v1/system/users/{user_id} silo_view GET 
/v1/system/silos/{silo} +system_quotas_list GET /v1/system/silo-quotas user_builtin_list GET /v1/system/users-builtin user_builtin_view GET /v1/system/users-builtin/{user} diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index cde448c5b7..f27a6619e2 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -288,6 +288,12 @@ pub struct SiloCreate { /// endpoints. These should be valid for the Silo's DNS name(s). pub tls_certificates: Vec, + /// Limits the amount of provisionable CPU, memory, and storage in the Silo. + /// CPU and memory are only consumed by running instances, while storage is + /// consumed by any disk or snapshot. A value of 0 means that resource is + /// *not* provisionable. + pub quotas: SiloQuotasCreate, + /// Mapping of which Fleet roles are conferred by each Silo role /// /// The default is that no Fleet roles are conferred by any Silo roles @@ -297,6 +303,54 @@ pub struct SiloCreate { BTreeMap>, } +/// The amount of provisionable resources for a Silo +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct SiloQuotasCreate { + /// The amount of virtual CPUs available for running instances in the Silo + pub cpus: i64, + /// The amount of RAM (in bytes) available for running instances in the Silo + pub memory: ByteCount, + /// The amount of storage (in bytes) available for disks or snapshots + pub storage: ByteCount, +} + +impl SiloQuotasCreate { + /// All quotas set to 0 + pub fn empty() -> Self { + Self { + cpus: 0, + memory: ByteCount::from(0), + storage: ByteCount::from(0), + } + } + + /// An arbitrarily high but identifiable default for quotas + /// that can be used for creating a Silo for testing + /// + /// The only silo that customers will see that this should be set on is the default + /// silo. 
Ultimately the default silo should only be initialized with an empty quota, + /// but as tests currently relying on it having a quota, we need to set something. + pub fn arbitrarily_high_default() -> Self { + Self { + cpus: 9999999999, + memory: ByteCount::try_from(999999999999999999_u64).unwrap(), + storage: ByteCount::try_from(999999999999999999_u64).unwrap(), + } + } +} + +/// Updateable properties of a Silo's resource limits. +/// If a value is omitted it will not be updated. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct SiloQuotasUpdate { + /// The amount of virtual CPUs available for running instances in the Silo + pub cpus: Option, + /// The amount of RAM (in bytes) available for running instances in the Silo + pub memory: Option, + /// The amount of storage (in bytes) available for disks or snapshots + pub storage: Option, +} + /// Create-time parameters for a `User` #[derive(Clone, Deserialize, Serialize, JsonSchema)] pub struct UserCreate { diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index af17e7e840..ecd459594a 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -49,6 +49,14 @@ pub struct Silo { BTreeMap>, } +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct SiloQuotas { + pub silo_id: Uuid, + pub cpus: i64, + pub memory: ByteCount, + pub storage: ByteCount, +} + // IDENTITY PROVIDER #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] diff --git a/openapi/nexus.json b/openapi/nexus.json index 7afb6cdc2f..2ddd5f0e94 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -6210,6 +6210,65 @@ } } }, + "/v1/system/silo-quotas": { + "get": { + "tags": [ + "system/silos" + ], + "summary": "Lists resource quotas for all silos", + "operationId": "system_quotas_list", + "parameters": [ + { + "in": "query", + "name": "limit", + "description": "Maximum number of items returned by a single call", + 
"schema": { + "nullable": true, + "type": "integer", + "format": "uint32", + "minimum": 1 + } + }, + { + "in": "query", + "name": "page_token", + "description": "Token returned by previous call to retrieve the subsequent page", + "schema": { + "nullable": true, + "type": "string" + } + }, + { + "in": "query", + "name": "sort_by", + "schema": { + "$ref": "#/components/schemas/IdSortMode" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SiloQuotasResultsPage" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + }, + "x-dropshot-pagination": { + "required": [] + } + } + }, "/v1/system/silos": { "get": { "tags": [ @@ -6458,6 +6517,91 @@ } } }, + "/v1/system/silos/{silo}/quotas": { + "get": { + "tags": [ + "system/silos" + ], + "summary": "View the resource quotas of a given silo", + "operationId": "silo_quotas_view", + "parameters": [ + { + "in": "path", + "name": "silo", + "description": "Name or ID of the silo", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SiloQuotas" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "tags": [ + "system/silos" + ], + "summary": "Update the resource quotas of a given silo", + "description": "If a quota value is not specified, it will remain unchanged.", + "operationId": "silo_quotas_update", + "parameters": [ + { + "in": "path", + "name": "silo", + "description": "Name or ID of the silo", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { 
+ "$ref": "#/components/schemas/SiloQuotasUpdate" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SiloQuotas" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/system/users": { "get": { "tags": [ @@ -13206,6 +13350,14 @@ "name": { "$ref": "#/components/schemas/Name" }, + "quotas": { + "description": "Limits the amount of provisionable CPU, memory, and storage in the Silo. CPU and memory are only consumed by running instances, while storage is consumed by any disk or snapshot. A value of 0 means that resource is *not* provisionable.", + "allOf": [ + { + "$ref": "#/components/schemas/SiloQuotasCreate" + } + ] + }, "tls_certificates": { "description": "Initial TLS certificates to be used for the new Silo's console and API endpoints. These should be valid for the Silo's DNS name(s).", "type": "array", @@ -13219,6 +13371,7 @@ "discoverable", "identity_mode", "name", + "quotas", "tls_certificates" ] }, @@ -13241,6 +13394,114 @@ } ] }, + "SiloQuotas": { + "type": "object", + "properties": { + "cpus": { + "type": "integer", + "format": "int64" + }, + "memory": { + "$ref": "#/components/schemas/ByteCount" + }, + "silo_id": { + "type": "string", + "format": "uuid" + }, + "storage": { + "$ref": "#/components/schemas/ByteCount" + } + }, + "required": [ + "cpus", + "memory", + "silo_id", + "storage" + ] + }, + "SiloQuotasCreate": { + "description": "The amount of provisionable resources for a Silo", + "type": "object", + "properties": { + "cpus": { + "description": "The amount of virtual CPUs available for running instances in the Silo", + "type": "integer", + "format": "int64" + }, + "memory": { + "description": "The amount of RAM (in bytes) available for running instances in the Silo", + "allOf": [ + { + "$ref": 
"#/components/schemas/ByteCount" + } + ] + }, + "storage": { + "description": "The amount of storage (in bytes) available for disks or snapshots", + "allOf": [ + { + "$ref": "#/components/schemas/ByteCount" + } + ] + } + }, + "required": [ + "cpus", + "memory", + "storage" + ] + }, + "SiloQuotasResultsPage": { + "description": "A single page of results", + "type": "object", + "properties": { + "items": { + "description": "list of items on this page of results", + "type": "array", + "items": { + "$ref": "#/components/schemas/SiloQuotas" + } + }, + "next_page": { + "nullable": true, + "description": "token used to fetch the next page of results (if any)", + "type": "string" + } + }, + "required": [ + "items" + ] + }, + "SiloQuotasUpdate": { + "description": "Updateable properties of a Silo's resource limits. If a value is omitted it will not be updated.", + "type": "object", + "properties": { + "cpus": { + "nullable": true, + "description": "The amount of virtual CPUs available for running instances in the Silo", + "type": "integer", + "format": "int64" + }, + "memory": { + "nullable": true, + "description": "The amount of RAM (in bytes) available for running instances in the Silo", + "allOf": [ + { + "$ref": "#/components/schemas/ByteCount" + } + ] + }, + "storage": { + "nullable": true, + "description": "The amount of storage (in bytes) available for disks or snapshots", + "allOf": [ + { + "$ref": "#/components/schemas/ByteCount" + } + ] + } + } + }, "SiloResultsPage": { "description": "A single page of results", "type": "object", diff --git a/schema/crdb/20.0.0/up01.sql b/schema/crdb/20.0.0/up01.sql new file mode 100644 index 0000000000..6a95c41e48 --- /dev/null +++ b/schema/crdb/20.0.0/up01.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS omicron.public.silo_quotas ( + silo_id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + time_modified TIMESTAMPTZ NOT NULL, + cpus INT8 NOT NULL, + memory_bytes INT8 NOT NULL, + storage_bytes INT8 NOT NULL +); \ No 
newline at end of file diff --git a/schema/crdb/20.0.0/up02.sql b/schema/crdb/20.0.0/up02.sql new file mode 100644 index 0000000000..2909e379ca --- /dev/null +++ b/schema/crdb/20.0.0/up02.sql @@ -0,0 +1,28 @@ +set + local disallow_full_table_scans = off; + +-- Adds quotas for any existing silos without them. +-- The selected quotas are based on the resources of a half rack +-- with 30% CPU and memory reserved for internal use and a 3.5x tax +-- on storage for replication, etc. +INSERT INTO + silo_quotas ( + silo_id, + time_created, + time_modified, + cpus, + memory_bytes, + storage_bytes + ) +SELECT + s.id AS silo_id, + NOW() AS time_created, + NOW() AS time_modified, + 9999999999 AS cpus, + 999999999999999999 AS memory_bytes, + 999999999999999999 AS storage_bytes +FROM + silo s + LEFT JOIN silo_quotas sq ON s.id = sq.silo_id +WHERE + sq.silo_id IS NULL; \ No newline at end of file diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 0bf365a2f1..be7291b4e4 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -827,6 +827,15 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_ssh_key_by_silo_user ON omicron.public. ) WHERE time_deleted IS NULL; +CREATE TABLE IF NOT EXISTS omicron.public.silo_quotas ( + silo_id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + time_modified TIMESTAMPTZ NOT NULL, + cpus INT8 NOT NULL, + memory_bytes INT8 NOT NULL, + storage_bytes INT8 NOT NULL +); + /* * Projects */ @@ -3062,7 +3071,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '19.0.0', NULL) + ( TRUE, NOW(), NOW(), '20.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT;