From 6d65e1e86f6caaab936bc895ebf690dc790c3a94 Mon Sep 17 00:00:00 2001
From: "Andrew J. Stone"
Date: Tue, 1 Oct 2024 23:40:29 -0400
Subject: [PATCH] Start integrating clickhouse clusters to blueprints (#6627)

This PR integrates clickhouse keeper and server node allocation via
`ClickhouseAllocator` into the `BlueprintBuilder`. It tests, via the
planner, that allocation works as expected. There are many more specific
tests for the `ClickhouseAllocator` itself; the tests here mainly check
that everything fits together and that the same results occur when run
through the planner.

There is an additional test and code to allow us to disable the cluster
policy and expunge all clickhouse keeper and server zones in one shot.

This code is safe to merge because it is currently inert. There is no way
to enable the `ClickhousePolicy` outside of tests yet. This will come in
one or two follow-up PRs where we add an internal nexus endpoint for
enabling the policy and then an OMDB command to trigger the endpoint.
Further OMDB support will be added for monitoring. We expect that for the
foreseeable future we will always deploy with
`ClickhousePolicy::deploy_with_standalone = true`. This is stage 1 of
RFD 468, where we run replicated clickhouse and the existing single-node
clickhouse together.

Lastly, there will be other PRs to plug in the actual inventory collection
and execution phases for clickhouse cluster reconfiguration. We shouldn't
bother implementing the OMDB policy enablement until all of that is
complete, as it just won't work.
---
 Cargo.lock | 1 +
 nexus/db-model/src/deployment.rs | 8 +-
 nexus/db-queries/Cargo.toml | 1 +
 .../db-queries/src/db/datastore/deployment.rs | 256 +++
 nexus/db-queries/src/db/datastore/rack.rs | 5 +
 nexus/inventory/src/builder.rs | 22 +-
 nexus/reconfigurator/execution/src/dns.rs | 1 +
 .../execution/src/omicron_physical_disks.rs | 1 +
 .../execution/src/omicron_zones.rs | 1 +
 .../planning/src/blueprint_builder/builder.rs | 201 +++++-
 .../src/blueprint_builder/clickhouse.rs | 128 ++--
 .../planning/src/blueprint_builder/mod.rs | 1 +
 nexus/reconfigurator/planning/src/planner.rs | 663 ++++++++++++++++++
 .../src/planner/omicron_zone_placement.rs | 16 +-
 nexus/reconfigurator/planning/src/system.rs | 11 +-
 .../background/tasks/blueprint_execution.rs | 1 +
 .../app/background/tasks/blueprint_load.rs | 1 +
 nexus/test-utils/src/lib.rs | 3 +
 nexus/types/src/deployment.rs | 5 +
 nexus/types/src/deployment/planning_input.rs | 20 +
 nexus/types/src/deployment/zone_type.rs | 15 +
 nexus/types/src/inventory.rs | 4 +-
 openapi/nexus-internal.json | 89 +++
 sled-agent/src/rack_setup/service.rs | 3 +
 24 files changed, 1385 insertions(+), 72 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2c4a71d510..9983704f23 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5505,6 +5505,7 @@ dependencies = [
  "camino",
  "camino-tempfile",
  "chrono",
+ "clickhouse-admin-types",
  "const_format",
  "db-macros",
  "diesel",
diff --git a/nexus/db-model/src/deployment.rs b/nexus/db-model/src/deployment.rs
index fab129e6ba..500d72dc9a 100644
--- a/nexus/db-model/src/deployment.rs
+++ b/nexus/db-model/src/deployment.rs
@@ -813,18 +813,20 @@ impl BpClickhouseClusterConfig {
                 .max_used_server_id
                 .0
                 .try_into()
-                .context("more than 2^63 IDs in use")?,
+                .context("more than 2^63 clickhouse server IDs in use")?,
             max_used_keeper_id: config
                 .max_used_keeper_id
                 .0
                 .try_into()
-                .context("more than 2^63 IDs in use")?,
+                .context("more than 2^63 clickhouse keeper IDs in use")?,
             cluster_name: 
config.cluster_name.clone(), cluster_secret: config.cluster_secret.clone(), highest_seen_keeper_leader_committed_log_index: config .highest_seen_keeper_leader_committed_log_index .try_into() - .context("more than 2^63 IDs in use")?, + .context( + "more than 2^63 clickhouse keeper log indexes in use", + )?, }) } } diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index c6c5caab6a..2059180c59 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -16,6 +16,7 @@ async-bb8-diesel.workspace = true async-trait.workspace = true camino.workspace = true chrono.workspace = true +clickhouse-admin-types.workspace = true const_format.workspace = true diesel.workspace = true diesel-dtrace.workspace = true diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 07c32ef818..13f7ea5aca 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -20,6 +20,7 @@ use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; +use clickhouse_admin_types::{KeeperId, ServerId}; use diesel::expression::SelectableHelper; use diesel::pg::Pg; use diesel::query_builder::AstPass; @@ -36,6 +37,9 @@ use diesel::OptionalExtension; use diesel::QueryDsl; use diesel::RunQueryDsl; use nexus_db_model::Blueprint as DbBlueprint; +use nexus_db_model::BpClickhouseClusterConfig; +use nexus_db_model::BpClickhouseKeeperZoneIdToNodeId; +use nexus_db_model::BpClickhouseServerZoneIdToNodeId; use nexus_db_model::BpOmicronPhysicalDisk; use nexus_db_model::BpOmicronZone; use nexus_db_model::BpOmicronZoneNic; @@ -49,6 +53,7 @@ use nexus_types::deployment::BlueprintPhysicalDisksConfig; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZonesConfig; +use nexus_types::deployment::ClickhouseClusterConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::external_api::views::SledState; use omicron_common::api::external::DataPageParams; @@ -58,6 +63,7 @@ use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::bail_unless; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use std::collections::BTreeMap; use uuid::Uuid; @@ -180,6 +186,42 @@ impl DataStore { }) .collect::, _>>()?; + let clickhouse_tables: Option<(_, _, _)> = if let Some(config) = + &blueprint.clickhouse_cluster_config + { + let mut keepers = vec![]; + for (zone_id, keeper_id) in &config.keepers { + let keeper = BpClickhouseKeeperZoneIdToNodeId::new( + blueprint_id, + *zone_id, + *keeper_id, + ) + .with_context(|| format!("zone {zone_id}, keeper {keeper_id}")) + .map_err(|e| Error::internal_error(&format!("{:#}", e)))?; + keepers.push(keeper) + } + + let mut servers = vec![]; + for (zone_id, server_id) in &config.servers { + let server = BpClickhouseServerZoneIdToNodeId::new( + blueprint_id, + *zone_id, + *server_id, + ) + .with_context(|| format!("zone {zone_id}, server {server_id}")) + .map_err(|e| Error::internal_error(&format!("{:#}", e)))?; + servers.push(server); + } + + let cluster_config = + BpClickhouseClusterConfig::new(blueprint_id, config) + .map_err(|e| Error::internal_error(&format!("{:#}", e)))?; + + Some((cluster_config, keepers, servers)) + } else { + None + }; + // This implementation inserts all records associated with the 
// blueprint in one transaction. This is required: we don't want // any planner or executor to see a half-inserted blueprint, nor do we @@ -258,7 +300,33 @@ impl DataStore { .await?; } + // Insert all clickhouse cluster related tables if necessary + if let Some((clickhouse_cluster_config, keepers, servers)) = clickhouse_tables { + { + use db::schema::bp_clickhouse_cluster_config::dsl; + let _ = diesel::insert_into(dsl::bp_clickhouse_cluster_config) + .values(clickhouse_cluster_config) + .execute_async(&conn) + .await?; + } + { + use db::schema::bp_clickhouse_keeper_zone_id_to_node_id::dsl; + let _ = diesel::insert_into(dsl::bp_clickhouse_keeper_zone_id_to_node_id) + .values(keepers) + .execute_async(&conn) + .await?; + } + { + use db::schema::bp_clickhouse_server_zone_id_to_node_id::dsl; + let _ = diesel::insert_into(dsl::bp_clickhouse_server_zone_id_to_node_id) + .values(servers) + .execute_async(&conn) + .await?; + } + } + Ok(()) + }) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -622,6 +690,156 @@ impl DataStore { disks_config.disks.sort_unstable_by_key(|d| d.id); } + // Load our `ClickhouseClusterConfig` if it exists + let clickhouse_cluster_config: Option = { + use db::schema::bp_clickhouse_cluster_config::dsl; + + let res = dsl::bp_clickhouse_cluster_config + .filter(dsl::blueprint_id.eq(blueprint_id)) + .select(BpClickhouseClusterConfig::as_select()) + .get_result_async(&*conn) + .await + .optional() + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + + match res { + None => None, + Some(bp_config) => { + // Load our clickhouse keeper configs for the given blueprint + let keepers: BTreeMap = { + use db::schema::bp_clickhouse_keeper_zone_id_to_node_id::dsl; + let mut keepers = BTreeMap::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + while let Some(p) = paginator.next() { + let batch = paginated( + dsl::bp_clickhouse_keeper_zone_id_to_node_id, + dsl::omicron_zone_id, + &p.current_pagparams(), + ) + .filter(dsl::blueprint_id.eq(blueprint_id)) + .select( + BpClickhouseKeeperZoneIdToNodeId::as_select(), + ) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + })?; + + paginator = + p.found_batch(&batch, &|k| k.omicron_zone_id); + + for k in batch { + let keeper_id = KeeperId( + u64::try_from(k.keeper_id).map_err( + |_| { + Error::internal_error(&format!( + "keeper id is negative: {}", + k.keeper_id + )) + }, + )?, + ); + keepers.insert( + k.omicron_zone_id.into(), + keeper_id, + ); + } + } + keepers + }; + + // Load our clickhouse server configs for the given blueprint + let servers: BTreeMap = { + use db::schema::bp_clickhouse_server_zone_id_to_node_id::dsl; + let mut servers = BTreeMap::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + while let Some(p) = paginator.next() { + let batch = paginated( + dsl::bp_clickhouse_server_zone_id_to_node_id, + dsl::omicron_zone_id, + &p.current_pagparams(), + ) + .filter(dsl::blueprint_id.eq(blueprint_id)) + .select( + BpClickhouseServerZoneIdToNodeId::as_select(), + ) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + })?; + + paginator = + p.found_batch(&batch, &|s| s.omicron_zone_id); + + for s in batch { + let server_id = ServerId( + u64::try_from(s.server_id).map_err( + |_| { + Error::internal_error(&format!( + "server id is negative: {}", + s.server_id + )) + }, + )?, + ); + servers.insert( + s.omicron_zone_id.into(), + server_id, + ); + } + } 
+ servers + }; + + Some(ClickhouseClusterConfig { + generation: bp_config.generation.into(), + max_used_server_id: ServerId( + u64::try_from(bp_config.max_used_server_id) + .map_err(|_| { + Error::internal_error(&format!( + "max server id is negative: {}", + bp_config.max_used_server_id + )) + })?, + ), + max_used_keeper_id: KeeperId( + u64::try_from(bp_config.max_used_keeper_id) + .map_err(|_| { + Error::internal_error(&format!( + "max keeper id is negative: {}", + bp_config.max_used_keeper_id + )) + })?, + ), + cluster_name: bp_config.cluster_name, + cluster_secret: bp_config.cluster_secret, + highest_seen_keeper_leader_committed_log_index: + u64::try_from( + bp_config.highest_seen_keeper_leader_committed_log_index, + ) + .map_err(|_| { + Error::internal_error(&format!( + "max server id is negative: {}", + bp_config.highest_seen_keeper_leader_committed_log_index + )) + })?, + keepers, + servers, + }) + } + } + }; + Ok(Blueprint { id: blueprint_id, blueprint_zones, @@ -632,6 +850,7 @@ impl DataStore { external_dns_version, cockroachdb_fingerprint, cockroachdb_setting_preserve_downgrade, + clickhouse_cluster_config, time_created, creator, comment, @@ -663,6 +882,9 @@ impl DataStore { nsled_agent_zones, nzones, nnics, + nclickhouse_cluster_configs, + nclickhouse_keepers, + nclickhouse_servers, ) = conn .transaction_async(|conn| async move { // Ensure that blueprint we're about to delete is not the @@ -759,6 +981,34 @@ impl DataStore { .await? }; + let nclickhouse_cluster_configs = { + use db::schema::bp_clickhouse_cluster_config::dsl; + diesel::delete( + dsl::bp_clickhouse_cluster_config + .filter(dsl::blueprint_id.eq(blueprint_id)), + ) + .execute_async(&conn) + .await? + }; + + let nclickhouse_keepers = { + use db::schema::bp_clickhouse_keeper_zone_id_to_node_id::dsl; + diesel::delete(dsl::bp_clickhouse_keeper_zone_id_to_node_id + .filter(dsl::blueprint_id.eq(blueprint_id)), + ) + .execute_async(&conn) + .await? + }; + + let nclickhouse_servers = { + use db::schema::bp_clickhouse_server_zone_id_to_node_id::dsl; + diesel::delete(dsl::bp_clickhouse_server_zone_id_to_node_id + .filter(dsl::blueprint_id.eq(blueprint_id)), + ) + .execute_async(&conn) + .await? 
+ }; + Ok(( nblueprints, nsled_states, @@ -767,6 +1017,9 @@ impl DataStore { nsled_agent_zones, nzones, nnics, + nclickhouse_cluster_configs, + nclickhouse_keepers, + nclickhouse_servers, )) }) .await @@ -786,6 +1039,9 @@ impl DataStore { "nsled_agent_zones" => nsled_agent_zones, "nzones" => nzones, "nnics" => nnics, + "nclickhouse_cluster_configs" => nclickhouse_cluster_configs, + "nclickhouse_keepers" => nclickhouse_keepers, + "nclickhouse_servers" => nclickhouse_servers ); Ok(()) diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index 8b7cf4804a..ce93891406 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -1066,6 +1066,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: Utc::now(), creator: "test suite".to_string(), comment: "test suite".to_string(), @@ -1549,6 +1550,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: now_db_precision(), creator: "test suite".to_string(), comment: "test blueprint".to_string(), @@ -1810,6 +1812,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: now_db_precision(), creator: "test suite".to_string(), comment: "test blueprint".to_string(), @@ -2024,6 +2027,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: now_db_precision(), creator: "test suite".to_string(), comment: "test blueprint".to_string(), @@ -2167,6 +2171,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: now_db_precision(), creator: "test suite".to_string(), comment: "test blueprint".to_string(), diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index 6e2d8ba28d..7b2ee42a6b 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -21,6 +21,7 @@ use nexus_types::inventory::BaseboardId; use nexus_types::inventory::Caboose; use nexus_types::inventory::CabooseFound; use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::ClickhouseKeeperClusterMembership; use nexus_types::inventory::Collection; use nexus_types::inventory::OmicronZonesFound; use nexus_types::inventory::RotPage; @@ -31,6 +32,7 @@ use nexus_types::inventory::ServiceProcessor; use nexus_types::inventory::SledAgent; use nexus_types::inventory::Zpool; use omicron_uuid_kinds::CollectionKind; +use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -92,6 +94,9 @@ pub struct CollectionBuilder { BTreeMap, RotPageFound>>, sleds: BTreeMap, omicron_zones: BTreeMap, + clickhouse_keeper_cluster_membership: + BTreeMap, + // We just generate one UUID for each collection. 
id_rng: TypedUuidRng, } @@ -119,6 +124,7 @@ impl CollectionBuilder { rot_pages_found: BTreeMap::new(), sleds: BTreeMap::new(), omicron_zones: BTreeMap::new(), + clickhouse_keeper_cluster_membership: BTreeMap::new(), id_rng: TypedUuidRng::from_entropy(), } } @@ -146,9 +152,8 @@ impl CollectionBuilder { rot_pages_found: self.rot_pages_found, sled_agents: self.sleds, omicron_zones: self.omicron_zones, - // Currently unused - // See: https://github.com/oxidecomputer/omicron/issues/6578 - clickhouse_keeper_cluster_membership: BTreeMap::new(), + clickhouse_keeper_cluster_membership: self + .clickhouse_keeper_cluster_membership, } } @@ -561,6 +566,16 @@ impl CollectionBuilder { Ok(()) } } + + /// Record information about Keeper cluster membership learned from the + /// clickhouse-admin service running in the keeper zones. + pub fn found_clickhouse_keeper_cluster_membership( + &mut self, + zone_id: OmicronZoneUuid, + membership: ClickhouseKeeperClusterMembership, + ) { + self.clickhouse_keeper_cluster_membership.insert(zone_id, membership); + } } /// Returns the current time, truncated to the previous microsecond. @@ -619,6 +634,7 @@ mod test { assert!(collection.rots.is_empty()); assert!(collection.cabooses_found.is_empty()); assert!(collection.rot_pages_found.is_empty()); + assert!(collection.clickhouse_keeper_cluster_membership.is_empty()); } // Simple test of a single, fairly typical collection that contains just diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 8cc0ed96cc..75c0db9b7e 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -849,6 +849,7 @@ mod test { internal_dns_version: initial_dns_generation, external_dns_version: Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: now_db_precision(), creator: "test-suite".to_string(), comment: "test blueprint".to_string(), diff --git a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs index d94bbe2e27..14af55fee6 100644 --- a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs +++ b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs @@ -187,6 +187,7 @@ mod test { internal_dns_version: Generation::new(), external_dns_version: Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: chrono::Utc::now(), creator: "test".to_string(), comment: "test blueprint".to_string(), diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 3e8ff84a0d..882d46913b 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -351,6 +351,7 @@ mod test { internal_dns_version: Generation::new(), external_dns_version: Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: chrono::Utc::now(), creator: "test".to_string(), comment: "test blueprint".to_string(), diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 35512a50f3..ca619fc4a9 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -8,6 +8,7 @@ use crate::ip_allocator::IpAllocator; use crate::planner::zone_needs_expungement; use 
crate::planner::ZoneExpungeReason; use anyhow::anyhow; +use clickhouse_admin_types::OXIMETER_CLUSTER; use ipnet::IpAdd; use nexus_inventory::now_db_precision; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; @@ -21,6 +22,7 @@ use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::BlueprintZonesConfig; +use nexus_types::deployment::ClickhouseClusterConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::DiskFilter; use nexus_types::deployment::OmicronZoneExternalFloatingAddr; @@ -75,6 +77,7 @@ use thiserror::Error; use typed_rng::TypedUuidRng; use typed_rng::UuidRng; +use super::clickhouse::ClickhouseAllocator; use super::external_networking::BuilderExternalNetworking; use super::external_networking::ExternalNetworkingChoice; use super::external_networking::ExternalSnatNetworkingChoice; @@ -175,6 +178,9 @@ impl fmt::Display for Operation { ZoneExpungeReason::SledExpunged => { "sled policy is expunged" } + ZoneExpungeReason::ClickhouseClusterDisabled => { + "clickhouse cluster disabled via policy" + } }; write!( f, @@ -216,6 +222,7 @@ pub struct BlueprintBuilder<'a> { sled_ip_allocators: BTreeMap, external_networking: OnceCell>, internal_dns_subnets: OnceCell, + clickhouse_allocator: Option, // These fields will become part of the final blueprint. See the // corresponding fields in `Blueprint`. @@ -278,6 +285,7 @@ impl<'a> BlueprintBuilder<'a> { .copied() .map(|sled_id| (sled_id, SledState::Active)) .collect(); + Blueprint { id: rng.blueprint_rng.next(), blueprint_zones, @@ -289,6 +297,7 @@ impl<'a> BlueprintBuilder<'a> { cockroachdb_fingerprint: String::new(), cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade::DoNotModify, + clickhouse_cluster_config: None, time_created: now_db_precision(), creator: creator.to_owned(), comment: format!("starting blueprint with {num_sleds} empty sleds"), @@ -334,6 +343,43 @@ impl<'a> BlueprintBuilder<'a> { || commissioned_sled_ids.contains(sled_id) }); + // If we have the clickhouse cluster setup enabled via policy and we + // don't yet have a `ClickhouseClusterConfiguration`, then we must create + // one and feed it to our `ClickhouseAllocator`. 
+ let clickhouse_allocator = if input.clickhouse_cluster_enabled() { + let parent_config = parent_blueprint + .clickhouse_cluster_config + .clone() + .unwrap_or_else(|| { + info!( + log, + concat!( + "Clickhouse cluster enabled by policy: ", + "generating initial 'ClickhouseClusterConfig' ", + "and 'ClickhouseAllocator'" + ) + ); + ClickhouseClusterConfig::new(OXIMETER_CLUSTER.to_string()) + }); + Some(ClickhouseAllocator::new( + log.clone(), + parent_config, + inventory.latest_clickhouse_keeper_membership(), + )) + } else { + if parent_blueprint.clickhouse_cluster_config.is_some() { + info!( + log, + concat!( + "clickhouse cluster disabled via policy ", + "discarding existing 'ClickhouseAllocator' and ", + "the resulting generated 'ClickhouseClusterConfig" + ) + ); + } + None + }; + Ok(BlueprintBuilder { log, parent_blueprint, @@ -347,6 +393,7 @@ impl<'a> BlueprintBuilder<'a> { sled_state, cockroachdb_setting_preserve_downgrade: parent_blueprint .cockroachdb_setting_preserve_downgrade, + clickhouse_allocator, creator: creator.to_owned(), operations: Vec::new(), comments: Vec::new(), @@ -427,6 +474,18 @@ impl<'a> BlueprintBuilder<'a> { let blueprint_disks = self .disks .into_disks_map(self.input.all_sled_ids(SledFilter::InService)); + + // If we have an allocator, use it to generate a new config. If an error + // is returned then log it and carry over the parent_config. + let clickhouse_cluster_config = self.clickhouse_allocator.map(|a| { + match a.plan(&(&blueprint_zones).into()) { + Ok(config) => config, + Err(e) => { + error!(self.log, "clickhouse allocator error: {e}"); + a.parent_config().clone() + } + } + }); Blueprint { id: self.rng.blueprint_rng.next(), blueprint_zones, @@ -442,6 +501,7 @@ impl<'a> BlueprintBuilder<'a> { .clone(), cockroachdb_setting_preserve_downgrade: self .cockroachdb_setting_preserve_downgrade, + clickhouse_cluster_config, time_created: now_db_precision(), creator: self.creator, comment: self @@ -501,6 +561,13 @@ impl<'a> BlueprintBuilder<'a> { "sled_id" => sled_id.to_string(), )); + // If there are any `ClickhouseServer` or `ClickhouseKeeper` zones that + // are not expunged and we no longer have a `ClickhousePolicy` which + // indicates replicated clickhouse clusters should be running, we need + // to expunge all such zones. + let clickhouse_cluster_enabled = + self.input.clickhouse_cluster_enabled(); + // Do any zones need to be marked expunged? 
let mut zones_to_expunge = BTreeMap::new(); @@ -512,9 +579,11 @@ impl<'a> BlueprintBuilder<'a> { "zone_id" => zone_id.to_string() )); - let Some(reason) = - zone_needs_expungement(sled_details, zone_config) - else { + let Some(reason) = zone_needs_expungement( + sled_details, + zone_config, + clickhouse_cluster_enabled, + ) else { continue; }; @@ -553,6 +622,13 @@ impl<'a> BlueprintBuilder<'a> { "expunged sled with non-expunged zone found" ); } + ZoneExpungeReason::ClickhouseClusterDisabled => { + info!( + &log, + "clickhouse cluster disabled via policy, \ + expunging related zone" + ); + } } zones_to_expunge.insert(zone_id, reason); @@ -583,6 +659,7 @@ impl<'a> BlueprintBuilder<'a> { let mut count_disk_expunged = 0; let mut count_sled_decommissioned = 0; let mut count_sled_expunged = 0; + let mut count_clickhouse_cluster_disabled = 0; for reason in zones_to_expunge.values() { match reason { ZoneExpungeReason::DiskExpunged => count_disk_expunged += 1, @@ -590,12 +667,19 @@ impl<'a> BlueprintBuilder<'a> { count_sled_decommissioned += 1; } ZoneExpungeReason::SledExpunged => count_sled_expunged += 1, + ZoneExpungeReason::ClickhouseClusterDisabled => { + count_clickhouse_cluster_disabled += 1 + } }; } let count_and_reason = [ (count_disk_expunged, ZoneExpungeReason::DiskExpunged), (count_sled_decommissioned, ZoneExpungeReason::SledDecommissioned), (count_sled_expunged, ZoneExpungeReason::SledExpunged), + ( + count_clickhouse_cluster_disabled, + ZoneExpungeReason::ClickhouseClusterDisabled, + ), ]; for (count, reason) in count_and_reason { if count > 0 { @@ -1118,6 +1202,117 @@ impl<'a> BlueprintBuilder<'a> { Ok(EnsureMultiple::Changed { added: num_crdb_to_add, removed: 0 }) } + pub fn sled_ensure_zone_multiple_clickhouse_server( + &mut self, + sled_id: SledUuid, + desired_zone_count: usize, + ) -> Result { + // How many clickhouse server zones do we want to add? + let clickhouse_server_count = self.sled_num_running_zones_of_kind( + sled_id, + ZoneKind::ClickhouseServer, + ); + let num_clickhouse_servers_to_add = + match desired_zone_count.checked_sub(clickhouse_server_count) { + Some(0) => return Ok(EnsureMultiple::NotNeeded), + Some(n) => n, + None => { + return Err(Error::Planner(anyhow!( + "removing a ClickhouseServer zone not yet supported \ + (sled {sled_id} has {clickhouse_server_count}; \ + planner wants {desired_zone_count})" + ))); + } + }; + for _ in 0..num_clickhouse_servers_to_add { + let zone_id = self.rng.zone_rng.next(); + let underlay_ip = self.sled_alloc_ip(sled_id)?; + let pool_name = + self.sled_select_zpool(sled_id, ZoneKind::ClickhouseServer)?; + let port = omicron_common::address::CLICKHOUSE_HTTP_PORT; + let address = SocketAddrV6::new(underlay_ip, port, 0, 0); + let zone_type = BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { + address, + dataset: OmicronZoneDataset { + pool_name: pool_name.clone(), + }, + }, + ); + let filesystem_pool = pool_name; + + let zone = BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id: zone_id, + underlay_address: underlay_ip, + filesystem_pool: Some(filesystem_pool), + zone_type, + }; + self.sled_add_zone(sled_id, zone)?; + } + + Ok(EnsureMultiple::Changed { + added: num_clickhouse_servers_to_add, + removed: 0, + }) + } + + pub fn sled_ensure_zone_multiple_clickhouse_keeper( + &mut self, + sled_id: SledUuid, + desired_zone_count: usize, + ) -> Result { + // How many clickhouse keeper zones do we want to add? 
+ let clickhouse_keeper_count = self.sled_num_running_zones_of_kind( + sled_id, + ZoneKind::ClickhouseKeeper, + ); + let num_clickhouse_keepers_to_add = + match desired_zone_count.checked_sub(clickhouse_keeper_count) { + Some(0) => return Ok(EnsureMultiple::NotNeeded), + Some(n) => n, + None => { + return Err(Error::Planner(anyhow!( + "removing a ClickhouseKeeper zone not yet supported \ + (sled {sled_id} has {clickhouse_keeper_count}; \ + planner wants {desired_zone_count})" + ))); + } + }; + + for _ in 0..num_clickhouse_keepers_to_add { + let zone_id = self.rng.zone_rng.next(); + let underlay_ip = self.sled_alloc_ip(sled_id)?; + let pool_name = + self.sled_select_zpool(sled_id, ZoneKind::ClickhouseKeeper)?; + let port = omicron_common::address::CLICKHOUSE_KEEPER_TCP_PORT; + let address = SocketAddrV6::new(underlay_ip, port, 0, 0); + let zone_type = BlueprintZoneType::ClickhouseKeeper( + blueprint_zone_type::ClickhouseKeeper { + address, + dataset: OmicronZoneDataset { + pool_name: pool_name.clone(), + }, + }, + ); + let filesystem_pool = pool_name; + + let zone = BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id: zone_id, + underlay_address: underlay_ip, + filesystem_pool: Some(filesystem_pool), + zone_type, + }; + self.sled_add_zone(sled_id, zone)?; + } + + Ok(EnsureMultiple::Changed { + added: num_clickhouse_keepers_to_add, + removed: 0, + }) + } + pub fn sled_promote_internal_ntp_to_boundary_ntp( &mut self, sled_id: SledUuid, diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/clickhouse.rs b/nexus/reconfigurator/planning/src/blueprint_builder/clickhouse.rs index 4071346632..0941806fc3 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/clickhouse.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/clickhouse.rs @@ -22,9 +22,9 @@ use thiserror::Error; // Will be removed once the planner starts using this code // See: https://github.com/oxidecomputer/omicron/issues/6577 #[allow(unused)] -struct ClickhouseZonesThatShouldBeRunning { - keepers: BTreeSet, - servers: BTreeSet, +pub struct ClickhouseZonesThatShouldBeRunning { + pub keepers: BTreeSet, + pub servers: BTreeSet, } impl From<&BTreeMap> @@ -64,7 +64,6 @@ impl From<&BTreeMap> #[allow(unused)] pub struct ClickhouseAllocator { log: Logger, - active_clickhouse_zones: ClickhouseZonesThatShouldBeRunning, parent_config: ClickhouseClusterConfig, // The latest clickhouse cluster membership from inventory inventory: Option, @@ -77,10 +76,6 @@ pub struct ClickhouseAllocator { #[allow(unused)] #[derive(Debug, Error)] pub enum KeeperAllocationError { - #[error("a clickhouse cluster configuration has not been created")] - NoConfig, - #[error("failed to retrieve clickhouse keeper membership from inventory")] - NoInventory, #[error("cannot add more than one keeper at a time: {added_keepers:?}")] BadMembershipChange { added_keepers: BTreeSet }, } @@ -91,13 +86,11 @@ pub enum KeeperAllocationError { impl ClickhouseAllocator { pub fn new( log: Logger, - zones_by_sled_id: &BTreeMap, clickhouse_cluster_config: ClickhouseClusterConfig, inventory: Option, ) -> ClickhouseAllocator { ClickhouseAllocator { log, - active_clickhouse_zones: zones_by_sled_id.into(), parent_config: clickhouse_cluster_config, inventory, } @@ -107,6 +100,7 @@ impl ClickhouseAllocator { /// on the parent blueprint and inventory pub fn plan( &self, + active_clickhouse_zones: &ClickhouseZonesThatShouldBeRunning, ) -> Result { let mut new_config = self.parent_config.clone(); @@ -122,10 +116,10 @@ impl ClickhouseAllocator 
{ // First, remove the clickhouse servers that are no longer in service new_config.servers.retain(|zone_id, _| { - self.active_clickhouse_zones.servers.contains(zone_id) + active_clickhouse_zones.servers.contains(zone_id) }); // Next, add any new clickhouse servers - for zone_id in &self.active_clickhouse_zones.servers { + for zone_id in &active_clickhouse_zones.servers { if !new_config.servers.contains_key(zone_id) { // Allocate a new `ServerId` and map it to the server zone new_config.max_used_server_id += 1.into(); @@ -136,16 +130,37 @@ impl ClickhouseAllocator { } // Now we need to configure the keepers. We can only add or remove - // one keeper at a time. + // one keeper at a time during a reconfiguration. // // We need to see if we have any keeper inventory so we can compare it // with our current configuration and see if any changes are required. // If we fail to retrieve any inventory for keepers in the current // collection than we must not modify our keeper config, as we don't // know whether a configuration is ongoing or not. + // + // There is an exception to this rule: on *new* clusters that have + // keeper zones deployed but do not have any keepers running we can + // create a full cluster configuration unconditionally. We can add + // more than one keeper because this is the initial configuration and + // not a "reconfiguration" that only allows adding or removing one + // node at a time. Furthermore, we have to start at last one keeper + // unconditionally in this case because we cannot retrieve keeper + // inventory if there are no keepers running. Without retrieving + // inventory, we cannot make further progress. let current_keepers: BTreeSet<_> = self.parent_config.keepers.values().cloned().collect(); let Some(inventory_membership) = &self.inventory else { + // Are we a new cluster ? + if new_config.max_used_keeper_id == 0.into() { + // Generate our initial configuration + for zone_id in &active_clickhouse_zones.keepers { + // Allocate a new `KeeperId` and map it to the zone_id + new_config.max_used_keeper_id += 1.into(); + new_config + .keepers + .insert(*zone_id, new_config.max_used_keeper_id); + } + } return bump_gen_if_necessary(new_config); }; @@ -219,7 +234,7 @@ impl ClickhouseAllocator { // Let's ensure that this zone has not been expunged yet. If it has that means // that adding the keeper will never succeed. - if !self.active_clickhouse_zones.keepers.contains(added_zone_id) { + if !active_clickhouse_zones.keepers.contains(added_zone_id) { // The zone has been expunged, so we must remove it from our configuration. new_config.keepers.remove(added_zone_id); @@ -245,7 +260,7 @@ impl ClickhouseAllocator { // We remove first, because the zones are already gone and therefore // don't help our quorum. for (zone_id, _) in &self.parent_config.keepers { - if !self.active_clickhouse_zones.keepers.contains(&zone_id) { + if !active_clickhouse_zones.keepers.contains(&zone_id) { // Remove the keeper for the first expunged zone we see. // Remember, we only do one keeper membership change at time. 
new_config.keepers.remove(zone_id); @@ -254,9 +269,9 @@ impl ClickhouseAllocator { } // Do we need to add any nodes to in service zones that don't have them - for zone_id in &self.active_clickhouse_zones.keepers { + for zone_id in &active_clickhouse_zones.keepers { if !new_config.keepers.contains_key(zone_id) { - // Allocate a new `KeeperId` and map it to the server zone + // Allocate a new `KeeperId` and map it to the keeper zone new_config.max_used_keeper_id += 1.into(); new_config .keepers @@ -268,6 +283,10 @@ impl ClickhouseAllocator { // We possibly added or removed clickhouse servers, but not keepers. bump_gen_if_necessary(new_config) } + + pub fn parent_config(&self) -> &ClickhouseClusterConfig { + &self.parent_config + } } #[cfg(test)] @@ -363,13 +382,12 @@ pub mod test { let mut allocator = ClickhouseAllocator { log: logctx.log.clone(), - active_clickhouse_zones, parent_config: parent_config.clone(), inventory, }; // Our clickhouse cluster config should not have changed - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); // Note that we cannot directly check equality here and // in a bunch of the test cases below, because we bump the @@ -380,7 +398,7 @@ pub mod test { // Running again without changing the inventory should be idempotent allocator.parent_config = new_config; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config, allocator.parent_config); logctx.cleanup_successful(); @@ -412,13 +430,12 @@ pub mod test { // allocator should allocate one more keeper. let mut allocator = ClickhouseAllocator { log: logctx.log.clone(), - active_clickhouse_zones, parent_config: parent_config.clone(), inventory, }; // Did our new config change as we expect? - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.generation, Generation::from_u32(2)); assert_eq!(new_config.generation, parent_config.generation.next()); assert_eq!(new_config.max_used_keeper_id, 4.into()); @@ -440,14 +457,14 @@ pub mod test { // itself does not modify the allocator and a new one is created by the // `BlueprintBuilder` on each planning round. allocator.parent_config = new_config; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config, allocator.parent_config); // Now let's update our inventory to reflect the new keeper. This should // trigger the planner to add a 5th keeper. allocator.inventory.as_mut().unwrap().raft_config.insert(4.into()); allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.generation, Generation::from_u32(3)); assert_eq!( new_config.generation, @@ -473,7 +490,7 @@ pub mod test { // inventory raft config. We should end up with the same config. allocator.parent_config = new_config; allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert!(!new_config.needs_generation_bump(&allocator.parent_config)); // Now let's modify the inventory to reflect that the 5th keeper node @@ -483,7 +500,7 @@ pub mod test { // our keeper zones have a keeper that is part of the cluster. 
allocator.inventory.as_mut().unwrap().raft_config.insert(5.into()); allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert!(!new_config.needs_generation_bump(&allocator.parent_config)); logctx.cleanup_successful(); @@ -497,7 +514,7 @@ pub mod test { let (n_keeper_zones, n_server_zones, n_keepers, n_servers) = (5, 2, 5, 2); - let (active_clickhouse_zones, parent_config) = initial_config( + let (mut active_clickhouse_zones, parent_config) = initial_config( n_keeper_zones, n_server_zones, n_keepers, @@ -512,23 +529,22 @@ pub mod test { let mut allocator = ClickhouseAllocator { log: logctx.log.clone(), - active_clickhouse_zones, parent_config: parent_config.clone(), inventory, }; // Our clickhouse cluster config should not have changed // We have 5 keepers and 5 zones and all of them are in the inventory - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert!(!new_config.needs_generation_bump(&parent_config)); // Now expunge 2 of the 5 keeper zones by removing them from the // in-service zones - allocator.active_clickhouse_zones.keepers.pop_first(); - allocator.active_clickhouse_zones.keepers.pop_first(); + active_clickhouse_zones.keepers.pop_first(); + active_clickhouse_zones.keepers.pop_first(); // Running the planner should remove one of the keepers from the new config - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.generation, Generation::from_u32(2)); assert_eq!( new_config.generation, @@ -554,14 +570,14 @@ pub mod test { // since the inventory hasn't reflected the change allocator.parent_config = new_config; allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert!(!new_config.needs_generation_bump(&allocator.parent_config)); // Reflecting the new config in inventory should remove another keeper allocator.inventory.as_mut().unwrap().raft_config = new_config.keepers.values().cloned().collect(); allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.generation, Generation::from_u32(3)); assert_eq!( @@ -588,7 +604,7 @@ pub mod test { // change, because the inventory doesn't reflect the removed keeper allocator.parent_config = new_config; allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&&active_clickhouse_zones).unwrap(); assert!(!new_config.needs_generation_bump(&allocator.parent_config)); // Reflecting the keeper removal in inventory should also result in no @@ -597,7 +613,7 @@ pub mod test { new_config.keepers.values().cloned().collect(); allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; allocator.parent_config = new_config; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert!(!new_config.needs_generation_bump(&allocator.parent_config)); logctx.cleanup_successful(); @@ -612,7 +628,7 @@ pub mod test { let (n_keeper_zones, n_server_zones, n_keepers, n_servers) = (5, 2, 4, 2); - let (active_clickhouse_zones, parent_config) 
= initial_config( + let (mut active_clickhouse_zones, parent_config) = initial_config( n_keeper_zones, n_server_zones, n_keepers, @@ -627,14 +643,13 @@ pub mod test { let mut allocator = ClickhouseAllocator { log: logctx.log.clone(), - active_clickhouse_zones, parent_config, inventory, }; // First run the planner to add a 5th keeper to our config assert_eq!(allocator.parent_config.keepers.len(), 4); - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.keepers.len(), 5); // Pick one of the keepers currently in our inventory, find the zone @@ -654,7 +669,7 @@ pub mod test { .find(|(_, &keeper_id)| keeper_id == keeper_to_expunge) .map(|(zone_id, _)| *zone_id) .unwrap(); - allocator.active_clickhouse_zones.keepers.remove(&zone_to_expunge); + active_clickhouse_zones.keepers.remove(&zone_to_expunge); // Bump the inventory commit index so we guarantee we perform the keeper // checks @@ -666,7 +681,7 @@ pub mod test { // Run the plan. Our configuration should stay the same because we can // only add or remove one keeper node from the cluster at a time and we // are already in the process of adding a node. - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert!(!new_config.needs_generation_bump(&allocator.parent_config)); // Now we change the inventory to reflect the addition of the node to @@ -675,7 +690,7 @@ pub mod test { allocator.parent_config = new_config; allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; allocator.inventory.as_mut().unwrap().raft_config.insert(5.into()); - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.keepers.len(), 4); // Let's make sure that the right keeper was expunged. 
@@ -694,8 +709,8 @@ pub mod test { .raft_config .remove(&keeper_to_expunge); let new_zone_id = OmicronZoneUuid::new_v4(); - allocator.active_clickhouse_zones.keepers.insert(new_zone_id); - let new_config = allocator.plan().unwrap(); + active_clickhouse_zones.keepers.insert(new_zone_id); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.keepers.len(), 5); assert_eq!(*new_config.keepers.get(&new_zone_id).unwrap(), KeeperId(6)); assert_eq!(new_config.max_used_keeper_id, 6.into()); @@ -712,7 +727,7 @@ pub mod test { let (n_keeper_zones, n_server_zones, n_keepers, n_servers) = (5, 2, 4, 2); - let (active_clickhouse_zones, parent_config) = initial_config( + let (mut active_clickhouse_zones, parent_config) = initial_config( n_keeper_zones, n_server_zones, n_keepers, @@ -727,14 +742,13 @@ pub mod test { let mut allocator = ClickhouseAllocator { log: logctx.log.clone(), - active_clickhouse_zones, parent_config, inventory, }; // First run the planner to add a 5th keeper to our config assert_eq!(allocator.parent_config.keepers.len(), 4); - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.keepers.len(), 5); // Find the zone for our new keeper and expunge it before it is @@ -749,12 +763,12 @@ pub mod test { .map(|(zone_id, _)| *zone_id) .unwrap(); allocator.parent_config = new_config; - allocator.active_clickhouse_zones.keepers.remove(&zone_to_expunge); + active_clickhouse_zones.keepers.remove(&zone_to_expunge); // Bump the inventory commit index so we guarantee we perform the keeper // checks allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.keepers.len(), 4); assert!(!new_config.keepers.contains_key(&zone_to_expunge)); @@ -770,7 +784,7 @@ pub mod test { let (n_keeper_zones, n_server_zones, n_keepers, n_servers) = (3, 5, 3, 2); - let (active_clickhouse_zones, parent_config) = initial_config( + let (mut active_clickhouse_zones, parent_config) = initial_config( n_keeper_zones, n_server_zones, n_keepers, @@ -785,7 +799,6 @@ pub mod test { let mut allocator = ClickhouseAllocator { log: logctx.log.clone(), - active_clickhouse_zones, parent_config, inventory, }; @@ -793,12 +806,12 @@ pub mod test { let zone_to_expunge = *allocator.parent_config.servers.keys().next().unwrap(); - allocator.active_clickhouse_zones.servers.remove(&zone_to_expunge); + active_clickhouse_zones.servers.remove(&zone_to_expunge); // After running the planner we should see 4 servers: // Start with 2, expunge 1, add 3 to reach the number of zones we have. 
assert_eq!(allocator.parent_config.servers.len(), 2); - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.servers.len(), 4); assert_eq!(new_config.max_used_server_id, 5.into()); assert_eq!(new_config.generation, Generation::from_u32(2)); @@ -810,11 +823,11 @@ pub mod test { // We can add a new keeper and server at the same time let new_keeper_zone = OmicronZoneUuid::new_v4(); let new_server_id = OmicronZoneUuid::new_v4(); - allocator.active_clickhouse_zones.keepers.insert(new_keeper_zone); - allocator.active_clickhouse_zones.servers.insert(new_server_id); + active_clickhouse_zones.keepers.insert(new_keeper_zone); + active_clickhouse_zones.servers.insert(new_server_id); allocator.parent_config = new_config; allocator.inventory.as_mut().unwrap().leader_committed_log_index += 1; - let new_config = allocator.plan().unwrap(); + let new_config = allocator.plan(&active_clickhouse_zones).unwrap(); assert_eq!(new_config.generation, Generation::from_u32(3)); assert_eq!(new_config.max_used_server_id, 6.into()); assert_eq!(new_config.max_used_keeper_id, 4.into()); @@ -850,7 +863,6 @@ pub mod test { let allocator = ClickhouseAllocator { log: logctx.log.clone(), - active_clickhouse_zones, parent_config, inventory, }; @@ -858,7 +870,7 @@ pub mod test { // We expect to get an error back. This can be used by higher level // software to trigger alerts, etc... In practice the `BlueprintBuilder` // should not change it's config when it receives an error. - assert!(allocator.plan().is_err()); + assert!(allocator.plan(&active_clickhouse_zones).is_err()); logctx.cleanup_successful(); } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs b/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs index b84bef6426..725835f4ae 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs @@ -11,3 +11,4 @@ mod internal_dns; mod zones; pub use builder::*; +pub use clickhouse::{ClickhouseAllocator, ClickhouseZonesThatShouldBeRunning}; diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index f02d8ec3de..e2534a4668 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -353,6 +353,8 @@ impl<'a> Planner<'a> { for zone_kind in [ DiscretionaryOmicronZone::BoundaryNtp, + DiscretionaryOmicronZone::ClickhouseKeeper, + DiscretionaryOmicronZone::ClickhouseServer, DiscretionaryOmicronZone::CockroachDb, DiscretionaryOmicronZone::InternalDns, DiscretionaryOmicronZone::ExternalDns, @@ -431,6 +433,12 @@ impl<'a> Planner<'a> { DiscretionaryOmicronZone::BoundaryNtp => { self.input.target_boundary_ntp_zone_count() } + DiscretionaryOmicronZone::ClickhouseKeeper => { + self.input.target_clickhouse_keeper_zone_count() + } + DiscretionaryOmicronZone::ClickhouseServer => { + self.input.target_clickhouse_server_zone_count() + } DiscretionaryOmicronZone::CockroachDb => { self.input.target_cockroachdb_zone_count() } @@ -518,6 +526,18 @@ impl<'a> Planner<'a> { DiscretionaryOmicronZone::BoundaryNtp => self .blueprint .sled_promote_internal_ntp_to_boundary_ntp(sled_id)?, + DiscretionaryOmicronZone::ClickhouseKeeper => { + self.blueprint.sled_ensure_zone_multiple_clickhouse_keeper( + sled_id, + new_total_zone_count, + )? 
+ } + DiscretionaryOmicronZone::ClickhouseServer => { + self.blueprint.sled_ensure_zone_multiple_clickhouse_server( + sled_id, + new_total_zone_count, + )? + } DiscretionaryOmicronZone::CockroachDb => { self.blueprint.sled_ensure_zone_multiple_cockroachdb( sled_id, @@ -695,6 +715,7 @@ fn sled_needs_all_zones_expunged( pub(crate) fn zone_needs_expungement( sled_details: &SledDetails, zone_config: &BlueprintZoneConfig, + clickhouse_cluster_enabled: bool, ) -> Option { // Should we expunge the zone because the sled is gone? if let Some(reason) = @@ -719,6 +740,16 @@ pub(crate) fn zone_needs_expungement( } }; + // Should we expunge the zone because clickhouse clusters are no longer + // enabled via policy? + if !clickhouse_cluster_enabled { + if zone_config.zone_type.is_clickhouse_keeper() + || zone_config.zone_type.is_clickhouse_server() + { + return Some(ZoneExpungeReason::ClickhouseClusterDisabled); + } + } + None } @@ -731,6 +762,7 @@ pub(crate) enum ZoneExpungeReason { DiskExpunged, SledDecommissioned, SledExpunged, + ClickhouseClusterDisabled, } #[cfg(test)] @@ -747,6 +779,7 @@ mod test { use chrono::NaiveDateTime; use chrono::TimeZone; use chrono::Utc; + use clickhouse_admin_types::KeeperId; use expectorate::assert_contents; use nexus_inventory::now_db_precision; use nexus_sled_agent_shared::inventory::ZoneKind; @@ -755,6 +788,7 @@ mod test { use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; + use nexus_types::deployment::ClickhousePolicy; use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::CockroachDbSettings; @@ -765,6 +799,7 @@ mod test { use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::external_api::views::SledState; + use nexus_types::inventory::ClickhouseKeeperClusterMembership; use nexus_types::inventory::OmicronZonesFound; use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; @@ -773,6 +808,7 @@ mod test { use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; + use std::collections::BTreeSet; use std::collections::HashMap; use std::mem; use std::net::IpAddr; @@ -2432,4 +2468,631 @@ mod test { logctx.cleanup_successful(); } + + /// Deploy all keeper nodes server nodes at once for a new cluster. + /// Then add keeper nodes 1 at a time. + #[test] + fn test_plan_deploy_all_clickhouse_cluster_nodes() { + static TEST_NAME: &str = "planner_deploy_all_keeper_nodes"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. 
+ let (mut collection, input, blueprint1) = + example(&log, TEST_NAME, DEFAULT_N_SLEDS); + verify_blueprint(&blueprint1); + + // We shouldn't have a clickhouse cluster config, as we don't have a + // clickhouse policy set yet + assert!(blueprint1.clickhouse_cluster_config.is_none()); + let target_keepers = 3; + let target_servers = 2; + + // Enable clickhouse clusters via policy + let mut input_builder = input.into_builder(); + input_builder.policy_mut().clickhouse_policy = Some(ClickhousePolicy { + deploy_with_standalone: true, + target_servers, + target_keepers, + }); + + // Creating a new blueprint should deploy all the new clickhouse zones + let input = input_builder.build(); + let blueprint2 = Planner::new_based_on( + log.clone(), + &blueprint1, + &input, + "test_blueprint2", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp2")) + .plan() + .expect("plan"); + + // We should see zones for 3 clickhouse keepers, and 2 servers created + let active_zones: Vec<_> = blueprint2 + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + .map(|(_, z)| z.clone()) + .collect(); + + let keeper_zone_ids: BTreeSet<_> = active_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_keeper()) + .map(|z| z.id) + .collect(); + let server_zone_ids: BTreeSet<_> = active_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_server()) + .map(|z| z.id) + .collect(); + + assert_eq!(keeper_zone_ids.len(), target_keepers); + assert_eq!(server_zone_ids.len(), target_servers); + + // We should be attempting to allocate all servers and keepers since + // this the initial configuration + { + let clickhouse_cluster_config = + blueprint2.clickhouse_cluster_config.as_ref().unwrap(); + assert_eq!(clickhouse_cluster_config.generation, 2.into()); + assert_eq!( + clickhouse_cluster_config.max_used_keeper_id, + (target_keepers as u64).into() + ); + assert_eq!( + clickhouse_cluster_config.max_used_server_id, + (target_servers as u64).into() + ); + assert_eq!(clickhouse_cluster_config.keepers.len(), target_keepers); + assert_eq!(clickhouse_cluster_config.servers.len(), target_servers); + + // Ensure that the added keepers are in server zones + for zone_id in clickhouse_cluster_config.keepers.keys() { + assert!(keeper_zone_ids.contains(zone_id)); + } + + // Ensure that the added servers are in server zones + for zone_id in clickhouse_cluster_config.servers.keys() { + assert!(server_zone_ids.contains(zone_id)); + } + } + + // Planning again without changing inventory should result in the same + // state + let blueprint3 = Planner::new_based_on( + log.clone(), + &blueprint2, + &input, + "test_blueprint3", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp3")) + .plan() + .expect("plan"); + + assert_eq!( + blueprint2.clickhouse_cluster_config, + blueprint3.clickhouse_cluster_config + ); + + // Updating the inventory to reflect the keepers + // should result in the same state, except for the + // `highest_seen_keeper_leader_committed_log_index` + let (zone_id, keeper_id) = blueprint3 + .clickhouse_cluster_config + .as_ref() + .unwrap() + .keepers + .first_key_value() + .unwrap(); + let membership = ClickhouseKeeperClusterMembership { + queried_keeper: *keeper_id, + leader_committed_log_index: 1, + raft_config: blueprint3 + .clickhouse_cluster_config + .as_ref() + .unwrap() + .keepers + .values() + .cloned() + .collect(), + }; + collection + .clickhouse_keeper_cluster_membership + .insert(*zone_id, membership); + + let blueprint4 = Planner::new_based_on( + 
log.clone(), + &blueprint3, + &input, + "test_blueprint4", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp4")) + .plan() + .expect("plan"); + + let bp3_config = blueprint3.clickhouse_cluster_config.as_ref().unwrap(); + let bp4_config = blueprint4.clickhouse_cluster_config.as_ref().unwrap(); + assert_eq!(bp4_config.generation, bp3_config.generation); + assert_eq!( + bp4_config.max_used_keeper_id, + bp3_config.max_used_keeper_id + ); + assert_eq!( + bp4_config.max_used_server_id, + bp3_config.max_used_server_id + ); + assert_eq!(bp4_config.keepers, bp3_config.keepers); + assert_eq!(bp4_config.servers, bp3_config.servers); + assert_eq!( + bp4_config.highest_seen_keeper_leader_committed_log_index, + 1 + ); + + // Let's bump the clickhouse target to 5 via policy so that we can add + // more nodes one at a time. Initial configuration deploys all nodes, + // but reconfigurations may only add or remove one node at a time. + // Enable clickhouse clusters via policy + let target_keepers = 5; + let mut input_builder = input.into_builder(); + input_builder.policy_mut().clickhouse_policy = Some(ClickhousePolicy { + deploy_with_standalone: true, + target_servers, + target_keepers, + }); + let input = input_builder.build(); + let blueprint5 = Planner::new_based_on( + log.clone(), + &blueprint4, + &input, + "test_blueprint5", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp5")) + .plan() + .expect("plan"); + + let active_zones: Vec<_> = blueprint5 + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + .map(|(_, z)| z.clone()) + .collect(); + + let new_keeper_zone_ids: BTreeSet<_> = active_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_keeper()) + .map(|z| z.id) + .collect(); + + // We should have allocated 2 new keeper zones + assert_eq!(new_keeper_zone_ids.len(), target_keepers); + assert!(keeper_zone_ids.is_subset(&new_keeper_zone_ids)); + + // We should be trying to provision one new keeper for a keeper zone + let bp4_config = blueprint4.clickhouse_cluster_config.as_ref().unwrap(); + let bp5_config = blueprint5.clickhouse_cluster_config.as_ref().unwrap(); + assert_eq!(bp5_config.generation, bp4_config.generation.next()); + assert_eq!( + bp5_config.max_used_keeper_id, + bp4_config.max_used_keeper_id + 1.into() + ); + assert_eq!( + bp5_config.keepers.len(), + bp5_config.max_used_keeper_id.0 as usize + ); + + // Planning again without updating inventory results in the same `ClickhouseClusterConfig` + let blueprint6 = Planner::new_based_on( + log.clone(), + &blueprint5, + &input, + "test_blueprint6", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp6")) + .plan() + .expect("plan"); + + let bp6_config = blueprint6.clickhouse_cluster_config.as_ref().unwrap(); + assert_eq!(bp5_config, bp6_config); + + // Updating the inventory to include the 4th node should add another + // keeper node + let membership = ClickhouseKeeperClusterMembership { + queried_keeper: *keeper_id, + leader_committed_log_index: 2, + raft_config: blueprint6 + .clickhouse_cluster_config + .as_ref() + .unwrap() + .keepers + .values() + .cloned() + .collect(), + }; + collection + .clickhouse_keeper_cluster_membership + .insert(*zone_id, membership); + + let blueprint7 = Planner::new_based_on( + log.clone(), + &blueprint6, + &input, + "test_blueprint7", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp7")) + .plan() + .expect("plan"); + + let bp7_config = 
blueprint7.clickhouse_cluster_config.as_ref().unwrap(); + assert_eq!(bp7_config.generation, bp6_config.generation.next()); + assert_eq!( + bp7_config.max_used_keeper_id, + bp6_config.max_used_keeper_id + 1.into() + ); + assert_eq!( + bp7_config.keepers.len(), + bp7_config.max_used_keeper_id.0 as usize + ); + assert_eq!(bp7_config.keepers.len(), target_keepers); + assert_eq!( + bp7_config.highest_seen_keeper_leader_committed_log_index, + 2 + ); + + // Updating the inventory to reflect the newest keeper node should not + // increase the cluster size since we have reached the target. + let membership = ClickhouseKeeperClusterMembership { + queried_keeper: *keeper_id, + leader_committed_log_index: 3, + raft_config: blueprint7 + .clickhouse_cluster_config + .as_ref() + .unwrap() + .keepers + .values() + .cloned() + .collect(), + }; + collection + .clickhouse_keeper_cluster_membership + .insert(*zone_id, membership); + let blueprint8 = Planner::new_based_on( + log.clone(), + &blueprint7, + &input, + "test_blueprint8", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp8")) + .plan() + .expect("plan"); + + let bp8_config = blueprint8.clickhouse_cluster_config.as_ref().unwrap(); + assert_eq!(bp8_config.generation, bp7_config.generation); + assert_eq!( + bp8_config.max_used_keeper_id, + bp7_config.max_used_keeper_id + ); + assert_eq!(bp8_config.keepers, bp7_config.keepers); + assert_eq!(bp7_config.keepers.len(), target_keepers); + assert_eq!( + bp8_config.highest_seen_keeper_leader_committed_log_index, + 3 + ); + + logctx.cleanup_successful(); + } + + // Start with an existing clickhouse cluster and expunge a keeper. This + // models what will happen after an RSS deployment with clickhouse policy + // enabled or an existing system already running a clickhouse cluster. + #[test] + fn test_expunge_clickhouse_clusters() { + static TEST_NAME: &str = "planner_expunge_clickhouse_clusters"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. 
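+ // The example system gives us an inventory collection, a planning
+ // input, and an initial blueprint. Below we enable the clickhouse
+ // policy, plan once to deploy the keeper and server zones, and then
+ // hand-edit the blueprint and inventory so the keeper cluster looks
+ // fully established before we expunge a sled hosting a keeper zone.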
+ let (mut collection, input, blueprint1) = + example(&log, TEST_NAME, DEFAULT_N_SLEDS); + + let target_keepers = 3; + let target_servers = 2; + + // Enable clickhouse clusters via policy + let mut input_builder = input.into_builder(); + input_builder.policy_mut().clickhouse_policy = Some(ClickhousePolicy { + deploy_with_standalone: true, + target_servers, + target_keepers, + }); + let input = input_builder.build(); + + // Create a new blueprint to deploy all our clickhouse zones + let mut blueprint2 = Planner::new_based_on( + log.clone(), + &blueprint1, + &input, + "test_blueprint2", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp2")) + .plan() + .expect("plan"); + + // We should see zones for 3 clickhouse keepers, and 2 servers created + let active_zones: Vec<_> = blueprint2 + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + .map(|(_, z)| z.clone()) + .collect(); + + let keeper_zone_ids: BTreeSet<_> = active_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_keeper()) + .map(|z| z.id) + .collect(); + let server_zone_ids: BTreeSet<_> = active_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_server()) + .map(|z| z.id) + .collect(); + + assert_eq!(keeper_zone_ids.len(), target_keepers); + assert_eq!(server_zone_ids.len(), target_servers); + + // Directly manipulate the blueprint and inventory so that the + // clickhouse clusters are stable + let config = blueprint2.clickhouse_cluster_config.as_mut().unwrap(); + config.max_used_keeper_id = (target_keepers as u64).into(); + config.keepers = keeper_zone_ids + .iter() + .enumerate() + .map(|(i, zone_id)| (*zone_id, KeeperId(i as u64))) + .collect(); + config.highest_seen_keeper_leader_committed_log_index = 1; + + let raft_config: BTreeSet<_> = + config.keepers.values().cloned().collect(); + + collection.clickhouse_keeper_cluster_membership = config + .keepers + .iter() + .map(|(zone_id, keeper_id)| { + ( + *zone_id, + ClickhouseKeeperClusterMembership { + queried_keeper: *keeper_id, + leader_committed_log_index: 1, + raft_config: raft_config.clone(), + }, + ) + }) + .collect(); + + // Let's run the planner. The blueprint shouldn't change with regards to + // clickhouse as our inventory reflects our desired state. + let blueprint3 = Planner::new_based_on( + log.clone(), + &blueprint2, + &input, + "test_blueprint3", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp3")) + .plan() + .expect("plan"); + + assert_eq!( + blueprint2.clickhouse_cluster_config, + blueprint3.clickhouse_cluster_config + ); + + // Find the sled containing one of the keeper zones and expunge it + let (sled_id, bp_zone_config) = blueprint3 + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + .find(|(_, z)| z.zone_type.is_clickhouse_keeper()) + .unwrap(); + + // Expunge a keeper zone + let mut builder = input.into_builder(); + builder.sleds_mut().get_mut(&sled_id).unwrap().policy = + SledPolicy::Expunged; + let input = builder.build(); + + let blueprint4 = Planner::new_based_on( + log.clone(), + &blueprint3, + &input, + "test_blueprint4", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp4")) + .plan() + .expect("plan"); + + // The planner should expunge a zone based on the sled being expunged. Since this + // is a clickhouse keeper zone, the clickhouse keeper configuration should change + // to reflect this. 
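+ // Concretely: the generation should be bumped, the expunged zone's
+ // keeper should be dropped from the `keepers` map, and no replacement
+ // keeper should be allocated yet, because keeper membership changes
+ // are made one at a time and only after inventory confirms the
+ // previous change.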
+ let old_config = blueprint3.clickhouse_cluster_config.as_ref().unwrap(); + let config = blueprint4.clickhouse_cluster_config.as_ref().unwrap(); + assert_eq!(config.generation, old_config.generation.next()); + assert!(!config.keepers.contains_key(&bp_zone_config.id)); + // We've only removed one keeper from our desired state + assert_eq!(config.keepers.len() + 1, old_config.keepers.len()); + // We haven't allocated any new keepers + assert_eq!(config.max_used_keeper_id, old_config.max_used_keeper_id); + + // Planning again will not change the keeper state because we haven't updated the inventory + let blueprint5 = Planner::new_based_on( + log.clone(), + &blueprint4, + &input, + "test_blueprint5", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp5")) + .plan() + .expect("plan"); + + assert_eq!( + blueprint4.clickhouse_cluster_config, + blueprint5.clickhouse_cluster_config + ); + + // Updating the inventory to reflect the removed keeper results in a new one being added + + // Remove the keeper for the expunged zone + collection + .clickhouse_keeper_cluster_membership + .remove(&bp_zone_config.id); + + // Update the inventory on at least one of the remaining nodes. + let existing = collection + .clickhouse_keeper_cluster_membership + .first_entry() + .unwrap() + .into_mut(); + existing.leader_committed_log_index = 3; + existing.raft_config = config.keepers.values().cloned().collect(); + + let blueprint6 = Planner::new_based_on( + log.clone(), + &blueprint5, + &input, + "test_blueprint6", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp6")) + .plan() + .expect("plan"); + + let old_config = blueprint5.clickhouse_cluster_config.as_ref().unwrap(); + let config = blueprint6.clickhouse_cluster_config.as_ref().unwrap(); + + // Our generation has changed to reflect the added keeper + assert_eq!(config.generation, old_config.generation.next()); + assert!(!config.keepers.contains_key(&bp_zone_config.id)); + // We've only added one keeper from our desired state + // This brings us back up to our target count + assert_eq!(config.keepers.len(), old_config.keepers.len() + 1); + assert_eq!(config.keepers.len(), target_keepers); + // We've allocated one new keeper + assert_eq!( + config.max_used_keeper_id, + old_config.max_used_keeper_id + 1.into() + ); + + logctx.cleanup_successful(); + } + + #[test] + fn test_expunge_all_clickhouse_cluster_zones_after_policy_is_disabled() { + static TEST_NAME: &str = "planner_expunge_all_clickhouse_cluster_zones_after_policy_is_disabled"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. 
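+ // As in the tests above, we enable the clickhouse policy and plan
+ // once to deploy the keeper and server zones. We then clear the
+ // policy and plan again, expecting every one of those zones to be
+ // expunged.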
+ let (collection, input, blueprint1) = + example(&log, TEST_NAME, DEFAULT_N_SLEDS); + + let target_keepers = 3; + let target_servers = 2; + + // Enable clickhouse clusters via policy + let mut input_builder = input.into_builder(); + input_builder.policy_mut().clickhouse_policy = Some(ClickhousePolicy { + deploy_with_standalone: true, + target_servers, + target_keepers, + }); + let input = input_builder.build(); + + // Create a new blueprint to deploy all our clickhouse zones + let blueprint2 = Planner::new_based_on( + log.clone(), + &blueprint1, + &input, + "test_blueprint2", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp2")) + .plan() + .expect("plan"); + + // We should see zones for 3 clickhouse keepers, and 2 servers created + let active_zones: Vec<_> = blueprint2 + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + .map(|(_, z)| z.clone()) + .collect(); + + let keeper_zone_ids: BTreeSet<_> = active_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_keeper()) + .map(|z| z.id) + .collect(); + let server_zone_ids: BTreeSet<_> = active_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_server()) + .map(|z| z.id) + .collect(); + + assert_eq!(keeper_zone_ids.len(), target_keepers); + assert_eq!(server_zone_ids.len(), target_servers); + + // Disable clickhouse clusters via policy + let mut input_builder = input.into_builder(); + input_builder.policy_mut().clickhouse_policy = None; + let input = input_builder.build(); + + // Create a new blueprint with the disabled policy + let blueprint3 = Planner::new_based_on( + log.clone(), + &blueprint2, + &input, + "test_blueprint3", + &collection, + ) + .expect("created planner") + .with_rng_seed((TEST_NAME, "bp3")) + .plan() + .expect("plan"); + + // All our clickhouse keeper and server zones that we created when we + // enabled our clickhouse policy should be expunged when we disable it. 
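+ // The check below gathers the zones that blueprint3 marks as
+ // expunged and compares their IDs against the keeper and server zone
+ // IDs we recorded when the zones were first deployed.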
+ let expunged_zones: Vec<_> = blueprint3 + .all_omicron_zones(BlueprintZoneFilter::Expunged) + .map(|(_, z)| z.clone()) + .collect(); + + let expunged_keeper_zone_ids: BTreeSet<_> = expunged_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_keeper()) + .map(|z| z.id) + .collect(); + let expunged_server_zone_ids: BTreeSet<_> = expunged_zones + .iter() + .filter(|z| z.zone_type.is_clickhouse_server()) + .map(|z| z.id) + .collect(); + + assert_eq!(keeper_zone_ids, expunged_keeper_zone_ids); + assert_eq!(server_zone_ids, expunged_server_zone_ids); + + logctx.cleanup_successful(); + } } diff --git a/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs b/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs index 0af0321768..8c1ec2ba7d 100644 --- a/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs +++ b/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs @@ -15,6 +15,8 @@ use std::mem; #[cfg_attr(test, derive(test_strategy::Arbitrary))] pub(crate) enum DiscretionaryOmicronZone { BoundaryNtp, + ClickhouseKeeper, + ClickhouseServer, CockroachDb, InternalDns, ExternalDns, @@ -28,14 +30,18 @@ impl DiscretionaryOmicronZone { ) -> Option { match zone_type { BlueprintZoneType::BoundaryNtp(_) => Some(Self::BoundaryNtp), + BlueprintZoneType::ClickhouseKeeper(_) => { + Some(Self::ClickhouseKeeper) + } + BlueprintZoneType::ClickhouseServer(_) => { + Some(Self::ClickhouseServer) + } BlueprintZoneType::CockroachDb(_) => Some(Self::CockroachDb), BlueprintZoneType::InternalDns(_) => Some(Self::InternalDns), BlueprintZoneType::ExternalDns(_) => Some(Self::ExternalDns), BlueprintZoneType::Nexus(_) => Some(Self::Nexus), // Zones that we should place but don't yet. BlueprintZoneType::Clickhouse(_) - | BlueprintZoneType::ClickhouseKeeper(_) - | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::Oximeter(_) => None, // Zones that get special handling for placement (all sleds get @@ -51,6 +57,12 @@ impl From for ZoneKind { fn from(zone: DiscretionaryOmicronZone) -> Self { match zone { DiscretionaryOmicronZone::BoundaryNtp => Self::BoundaryNtp, + DiscretionaryOmicronZone::ClickhouseKeeper => { + Self::ClickhouseKeeper + } + DiscretionaryOmicronZone::ClickhouseServer => { + Self::ClickhouseServer + } DiscretionaryOmicronZone::CockroachDb => Self::CockroachDb, DiscretionaryOmicronZone::InternalDns => Self::InternalDns, DiscretionaryOmicronZone::ExternalDns => Self::ExternalDns, diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index ccb4d4ab27..5b19590e35 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -14,6 +14,7 @@ use nexus_sled_agent_shared::inventory::Baseboard; use nexus_sled_agent_shared::inventory::Inventory; use nexus_sled_agent_shared::inventory::InventoryDisk; use nexus_sled_agent_shared::inventory::SledRole; +use nexus_types::deployment::ClickhousePolicy; use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbSettings; use nexus_types::deployment::PlanningInputBuilder; @@ -87,6 +88,7 @@ pub struct SystemDescription { service_ip_pool_ranges: Vec, internal_dns_version: Generation, external_dns_version: Generation, + clickhouse_policy: Option, } impl SystemDescription { @@ -164,6 +166,7 @@ impl SystemDescription { service_ip_pool_ranges, internal_dns_version: Generation::new(), external_dns_version: Generation::new(), + clickhouse_policy: 
None, } } @@ -209,6 +212,12 @@ impl SystemDescription { self } + /// Set the clickhouse policy + pub fn clickhouse_policy(&mut self, policy: ClickhousePolicy) -> &mut Self { + self.clickhouse_policy = Some(policy); + self + } + /// Add a sled to the system, as described by a SledBuilder pub fn sled(&mut self, sled: SledBuilder) -> anyhow::Result<&mut Self> { let sled_id = sled.id.unwrap_or_else(SledUuid::new_v4); @@ -338,7 +347,7 @@ impl SystemDescription { target_cockroachdb_zone_count: self.target_cockroachdb_zone_count, target_cockroachdb_cluster_version: self .target_cockroachdb_cluster_version, - clickhouse_policy: None, + clickhouse_policy: self.clickhouse_policy.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index c4fd916d5f..398655e9fb 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -247,6 +247,7 @@ mod test { internal_dns_version: dns_version, external_dns_version: dns_version, cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: chrono::Utc::now(), creator: "test".to_string(), comment: "test blueprint".to_string(), diff --git a/nexus/src/app/background/tasks/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs index 70fcf713bc..8b5c02dd80 100644 --- a/nexus/src/app/background/tasks/blueprint_load.rs +++ b/nexus/src/app/background/tasks/blueprint_load.rs @@ -225,6 +225,7 @@ mod test { internal_dns_version: Generation::new(), external_dns_version: Generation::new(), cockroachdb_fingerprint: String::new(), + clickhouse_cluster_config: None, time_created: now_db_precision(), creator: "test".to_string(), comment: "test blueprint".to_string(), diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 432478b8c2..e80f8c0ae0 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -834,6 +834,9 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { cockroachdb_fingerprint: String::new(), cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade::DoNotModify, + // Clickhouse clusters are not generated by RSS. One must run + // reconfigurator for that. + clickhouse_cluster_config: None, time_created: Utc::now(), creator: "nexus-test-utils".to_string(), comment: "initial test blueprint".to_string(), diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 55be9401a3..891e30b41b 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -59,6 +59,7 @@ pub use network_resources::OmicronZoneExternalSnatIp; pub use network_resources::OmicronZoneNetworkResources; pub use network_resources::OmicronZoneNic; pub use network_resources::OmicronZoneNicEntry; +pub use planning_input::ClickhousePolicy; pub use planning_input::CockroachDbClusterVersion; pub use planning_input::CockroachDbPreserveDowngrade; pub use planning_input::CockroachDbSettings; @@ -166,6 +167,10 @@ pub struct Blueprint { /// Whether to set `cluster.preserve_downgrade_option` and what to set it to pub cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade, + /// Allocation of Clickhouse Servers and Keepers for replicated clickhouse + /// setups. This is set to `None` if replicated clickhouse is not in use. 
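+ /// RSS always leaves this as `None`; it is only populated by the
+ /// reconfigurator.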
+ pub clickhouse_cluster_config: Option, + /// when this blueprint was generated (for debugging) pub time_created: chrono::DateTime, /// identity of the component that generated the blueprint (for debugging) diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 169c134c5d..f922f5e251 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -113,10 +113,30 @@ impl PlanningInput { self.policy.target_cockroachdb_cluster_version } + pub fn target_clickhouse_server_zone_count(&self) -> usize { + self.policy + .clickhouse_policy + .as_ref() + .map(|policy| policy.target_servers) + .unwrap_or(0) + } + + pub fn target_clickhouse_keeper_zone_count(&self) -> usize { + self.policy + .clickhouse_policy + .as_ref() + .map(|policy| policy.target_keepers) + .unwrap_or(0) + } + pub fn service_ip_pool_ranges(&self) -> &[IpRange] { &self.policy.service_ip_pool_ranges } + pub fn clickhouse_cluster_enabled(&self) -> bool { + self.policy.clickhouse_policy.is_some() + } + pub fn all_sleds( &self, filter: SledFilter, diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs index a67fb06563..16a53c1e09 100644 --- a/nexus/types/src/deployment/zone_type.rs +++ b/nexus/types/src/deployment/zone_type.rs @@ -100,6 +100,21 @@ impl BlueprintZoneType { matches!(self, BlueprintZoneType::Crucible(_)) } + /// Identifies whether this is a clickhouse keeper zone + pub fn is_clickhouse_keeper(&self) -> bool { + matches!(self, BlueprintZoneType::ClickhouseKeeper(_)) + } + + /// Identifies whether this is a clickhouse server zone + pub fn is_clickhouse_server(&self) -> bool { + matches!(self, BlueprintZoneType::ClickhouseServer(_)) + } + + /// Identifies whether this is a single-node clickhouse zone + pub fn is_clickhouse(&self) -> bool { + matches!(self, BlueprintZoneType::Clickhouse(_)) + } + /// Returns the durable dataset associated with this zone, if any exists. pub fn durable_dataset(&self) -> Option> { let (dataset, kind, &address) = match self { diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 563fd8a4d3..4f60f4d690 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -167,11 +167,11 @@ impl Collection { /// there is one. pub fn latest_clickhouse_keeper_membership( &self, - ) -> Option<(OmicronZoneUuid, ClickhouseKeeperClusterMembership)> { + ) -> Option { self.clickhouse_keeper_cluster_membership .iter() .max_by_key(|(_, membership)| membership.leader_committed_log_index) - .map(|(zone_id, membership)| (*zone_id, membership.clone())) + .map(|(_, membership)| (membership.clone())) } } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 9e05ff72f9..9cb476a775 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -1872,6 +1872,15 @@ "$ref": "#/components/schemas/BlueprintZonesConfig" } }, + "clickhouse_cluster_config": { + "nullable": true, + "description": "Allocation of Clickhouse Servers and Keepers for replicated clickhouse setups. 
This is set to `None` if replicated clickhouse is not in use.", + "allOf": [ + { + "$ref": "#/components/schemas/ClickhouseClusterConfig" + } + ] + }, "cockroachdb_fingerprint": { "description": "CockroachDB state fingerprint when this blueprint was created", "type": "string" @@ -2539,6 +2548,74 @@ "key" ] }, + "ClickhouseClusterConfig": { + "description": "Global configuration for all clickhouse servers (replicas) and keepers", + "type": "object", + "properties": { + "cluster_name": { + "description": "An arbitrary name for the Clickhouse cluster shared by all nodes", + "type": "string" + }, + "cluster_secret": { + "description": "An arbitrary string shared by all nodes used at runtime to determine whether nodes are part of the same cluster.", + "type": "string" + }, + "generation": { + "description": "The last update to the clickhouse cluster configuration\n\nThis is used by `clickhouse-admin` in the clickhouse server and keeper zones to discard old configurations.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "highest_seen_keeper_leader_committed_log_index": { + "description": "This is used as a marker to tell if the raft configuration in a new inventory collection is newer than the last collection. This serves as a surrogate for the log index of the last committed configuration, which clickhouse keeper doesn't expose.\n\nThis is necesssary because during inventory collection we poll multiple keeper nodes, and each returns their local knowledge of the configuration. But we may reach different nodes in different attempts, and some nodes in a following attempt may reflect stale configuration. Due to timing, we can always query old information. That is just normal polling. However, we never want to use old configuration if we have already seen and acted on newer configuration.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "keepers": { + "description": "The desired state of the clickhouse keeper cluster\n\nWe decouple deployment of zones that should contain clickhouse keeper processes from actually starting or stopping those processes, adding or removing them to/from the keeper cluster, and reconfiguring other keeper and clickhouse server nodes to reflect the new configuration.\n\nAs part of this decoupling, we keep track of the intended zone deployment in the blueprint, but that is not enough to track the desired state of the keeper cluster. We are only allowed to add or remove one keeper node at a time, and therefore we must track the desired state of the keeper cluster which may change multiple times until the keepers in the cluster match the deployed zones. An example may help:\n\n1. We start with 3 keeper nodes in 3 deployed keeper zones and need to add two to reach our desired policy of 5 keepers 2. The planner adds 2 new keeper zones to the blueprint 3. The planner will also add **one** new keeper to the `keepers` field below that matches one of the deployed zones. 4. The executor will start the new keeper process that was added to the `keepers` field, attempt to add it to the keeper cluster by pushing configuration updates to the other keepers, and then updating the clickhouse server configurations to know about the new keeper. 5. If the keeper is successfully added, as reflected in inventory, then steps 3 and 4 above will be repeated for the next keeper process. 6. If the keeper is not successfully added by the executor it will continue to retry indefinitely. 7. 
If the zone is expunged while the planner has it as part of its desired state in `keepers`, and the executor is trying to add it, the keeper will be removed from `keepers` in the next blueprint. If it has been added to the actual cluster by an executor in the meantime it will be removed on the next iteration of an executor.", + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/KeeperId" + } + }, + "max_used_keeper_id": { + "description": "Clickhouse Keeper IDs must be unique and are handed out monotonically. Keep track of the last used one.", + "allOf": [ + { + "$ref": "#/components/schemas/KeeperId" + } + ] + }, + "max_used_server_id": { + "description": "Clickhouse Server IDs must be unique and are handed out monotonically. Keep track of the last used one.", + "allOf": [ + { + "$ref": "#/components/schemas/ServerId" + } + ] + }, + "servers": { + "description": "The desired state of clickhouse server processes on the rack\n\nClickhouse servers do not have the same limitations as keepers and can be deployed all at once.", + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ServerId" + } + } + }, + "required": [ + "cluster_name", + "cluster_secret", + "generation", + "highest_seen_keeper_leader_committed_log_index", + "keepers", + "max_used_keeper_id", + "max_used_server_id", + "servers" + ] + }, "CockroachDbClusterVersion": { "description": "CockroachDB cluster versions we are aware of.\n\nCockroachDB can be upgraded from one major version to the next, e.g. v22.1 -> v22.2. Each major version introduces changes in how it stores data on disk to support new features, and each major version has support for reading the previous version's data so that it can perform an upgrade. The version of the data format is called the \"cluster version\", which is distinct from but related to the software version that's being run.\n\nWhile software version v22.2 is using cluster version v22.1, it's possible to downgrade back to v22.1. Once the cluster version is upgraded, there's no going back.\n\nTo give us some time to evaluate new versions of the software while retaining a downgrade path, we currently deploy new versions of CockroachDB across two releases of the Oxide software, in a \"tick-tock\" model:\n\n- In \"tick\" releases, we upgrade the version of the CockroachDB software to a new major version, and update `CockroachDbClusterVersion::NEWLY_INITIALIZED`. On upgraded racks, the new version is running with the previous cluster version; on newly-initialized racks, the new version is running with the new cluser version. - In \"tock\" releases, we change `CockroachDbClusterVersion::POLICY` to the major version we upgraded to in the last \"tick\" release. This results in a new blueprint that upgrades the cluster version, destroying the downgrade path but allowing us to eventually upgrade to the next release.\n\nThese presently describe major versions of CockroachDB. 
The order of these must be maintained in the correct order (the first variant must be the earliest version).", "type": "string", @@ -3639,6 +3716,12 @@ "last" ] }, + "KeeperId": { + "description": "A unique ID for a ClickHouse Keeper", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, "LastResult": { "oneOf": [ { @@ -4951,6 +5034,12 @@ } ] }, + "ServerId": { + "description": "A unique ID for a Clickhouse Server", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, "SledAgentInfo": { "description": "Sent by a sled agent to Nexus to inform about resources", "type": "object", diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index f4edf7f024..fa64a5c06c 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1486,6 +1486,9 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( cockroachdb_fingerprint: String::new(), cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade::DoNotModify, + // We do not create clickhouse clusters in RSS. We create them via + // reconfigurator only. + clickhouse_cluster_config: None, time_created: Utc::now(), creator: "RSS".to_string(), comment: "initial blueprint from rack setup".to_string(),
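
For orientation, here is a small usage sketch that is not part of the patch. It assumes `ClickhousePolicy` (re-exported from `nexus_types::deployment` above) and the planning-input builder pattern used by the planner tests are in scope; the helper name `example_policy` is made up for illustration.

    use nexus_types::deployment::ClickhousePolicy;

    fn example_policy() -> ClickhousePolicy {
        ClickhousePolicy {
            // Keep the existing single-node clickhouse zone deployed
            // alongside the replicated cluster.
            deploy_with_standalone: true,
            // Targets the planner tries to reach: servers may be deployed
            // all at once, keepers are added or removed one at a time.
            target_servers: 2,
            target_keepers: 3,
        }
    }

    // Installing the policy on a planning input, as the planner tests do:
    //
    //     let mut builder = input.into_builder();
    //     builder.policy_mut().clickhouse_policy = Some(example_policy());
    //     let input = builder.build();
    //
    // or on a SystemDescription before generating the input:
    //
    //     system.clickhouse_policy(example_policy());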