Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provision multiple CRDB nodes, if multiple sleds exist #2956

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
7b54128
[sled-agent] Make service_manager responsible for storage services too
smklein Apr 28, 2023
f410325
Merge branch 'main' into storage-manager-cleanup
smklein Apr 28, 2023
0b4b040
CRDB auto-format on boot
smklein Apr 28, 2023
ef9517c
better use of 'unique_name' (for storage zones), auto-launch storage …
smklein Apr 30, 2023
f1fd1f5
Merge branch 'main' into storage-manager-cleanup
smklein Apr 30, 2023
3a9ad87
Merge branch 'main' into storage-manager-cleanup
smklein Apr 30, 2023
5752a83
[RSS] Explicit set of Bootstrap Agents
smklein Apr 30, 2023
31b52a7
Refuse to enact a sled plan unless it's on the explicit set
smklein Apr 30, 2023
f77f235
wip
smklein Apr 28, 2023
2810f93
RSS explicitly calling to initialize CRDB
smklein Apr 30, 2023
5297950
Read the CRDB address from the running zone
smklein Apr 30, 2023
c21be48
Send requests to the right address
smklein Apr 30, 2023
f11c153
fmt
smklein Apr 30, 2023
4377f1d
Stop deleting chelsio addresses during uninstall (#2953)
smklein Apr 30, 2023
ec3b1e4
[RSS] Explicit set of Bootstrap Agents
smklein Apr 30, 2023
c812609
Merge branch 'storage-manager-cleanup' into rss-explicit
smklein Apr 30, 2023
cbfb8c8
Merge branch 'rss-explicit' into cockroach-init
smklein Apr 30, 2023
e2c3dc8
Provision multiple CRDB nodes, if multiple sleds exist
smklein Apr 30, 2023
9d00c93
Fix tests
smklein Apr 30, 2023
8a08090
Merge branch 'storage-manager-cleanup' into rss-explicit
smklein Apr 30, 2023
7155257
Merge branch 'rss-explicit' into cockroach-init
smklein Apr 30, 2023
7335cb7
Merge branch 'cockroach-init' into actually-provision-multiple-crdbs
smklein Apr 30, 2023
cfb7cbc
make serialization happier
smklein Apr 30, 2023
6346ca7
Merge branch 'rss-explicit' into cockroach-init
smklein Apr 30, 2023
d8efb8e
Merge branch 'cockroach-init' into actually-provision-multiple-crdbs
smklein Apr 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ impl RunningZone {
format!("{}/root", self.inner.zonepath.display())
}

/// Returns the [`AddrObject`] for this zone's control interface.
///
/// The object is built from the zone's control VNIC name paired with the
/// fixed `"omicron6"` name.
///
/// # Panics
///
/// Panics if [`AddrObject::new`] returns an error for that pair.
pub fn control_interface(&self) -> AddrObject {
    let vnic_name = self.inner.get_control_vnic_name();
    let addrobj = AddrObject::new(vnic_name, "omicron6");
    addrobj.unwrap()
}

/// Runs a command within the Zone, return the output.
pub fn run_cmd<I, S>(&self, args: I) -> Result<String, RunCommandError>
where
Expand Down Expand Up @@ -592,8 +596,8 @@ impl InstalledZone {
///
/// This results in a zone name which is distinct across different zpools,
/// but stable and predictable across reboots.
pub fn get_zone_name(zone_name: &str, unique_name: Option<&str>) -> String {
let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_name);
pub fn get_zone_name(zone_type: &str, unique_name: Option<&str>) -> String {
let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_type);
if let Some(suffix) = unique_name {
zone_name.push_str(&format!("_{}", suffix));
}
Expand All @@ -618,7 +622,7 @@ impl InstalledZone {
log: &Logger,
underlay_vnic_allocator: &VnicAllocator<Etherstub>,
zone_root_path: &Path,
zone_name: &str,
zone_type: &str,
unique_name: Option<&str>,
datasets: &[zone::Dataset],
filesystems: &[zone::Fs],
Expand All @@ -631,14 +635,14 @@ impl InstalledZone {
let control_vnic =
underlay_vnic_allocator.new_control(None).map_err(|err| {
InstallZoneError::CreateVnic {
zone: zone_name.to_string(),
zone: zone_type.to_string(),
err,
}
})?;

let full_zone_name = Self::get_zone_name(zone_name, unique_name);
let full_zone_name = Self::get_zone_name(zone_type, unique_name);
let zone_image_path =
PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_name));
PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_type));

let net_device_names: Vec<String> = opte_ports
.iter()
Expand Down
1 change: 1 addition & 0 deletions illumos-utils/src/zpool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ impl Zpool {
}

#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum ZpoolKind {
// This zpool is used for external storage (u.2)
External,
Expand Down
39 changes: 25 additions & 14 deletions internal-dns/src/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,31 @@ impl Resolver {
Ok(*address)
}

/// Looks up every [`Ipv6Addr`] recorded for the DNS name of `srv`.
///
/// Unlike the single-address lookups, every address in the response is
/// returned, in the order the response yields them.
///
/// # Errors
///
/// Propagates any failure of the underlying IPv6 lookup.
pub async fn lookup_all_ipv6(
    &self,
    srv: crate::ServiceName,
) -> Result<Vec<Ipv6Addr>, ResolveError> {
    let name = format!("{}.{}", srv.dns_name(), DNS_ZONE);
    // Log under this function's own name so its entries are not mistaken
    // for those of a different lookup.
    debug!(self.log, "lookup_all_ipv6 srv"; "dns_name" => &name);
    let response = self.inner.ipv6_lookup(&name).await?;
    Ok(response.iter().copied().collect())
}

/// Resolves the DNS name of `srv` to a single [`IpAddr`].
///
/// Only the first address yielded by the lookup response is returned.
///
/// # Errors
///
/// Propagates any failure of the underlying lookup, and returns
/// [`ResolveError::NotFound`] when the response yields no address.
pub async fn lookup_ip(
    &self,
    srv: crate::ServiceName,
) -> Result<IpAddr, ResolveError> {
    let fqdn = format!("{}.{}", srv.dns_name(), DNS_ZONE);
    debug!(self.log, "lookup srv"; "dns_name" => &fqdn);
    let answer = self.inner.lookup_ip(&fqdn).await?;
    match answer.iter().next() {
        Some(ip) => Ok(ip),
        None => Err(ResolveError::NotFound(srv)),
    }
}

/// Looks up a single [`SocketAddrV6`] based on the SRV name
/// Returns an error if the record does not exist.
pub async fn lookup_socket_v6(
Expand Down Expand Up @@ -156,20 +181,6 @@ impl Resolver {
}
})
}

/// Resolves the DNS name of `srv` to a single [`IpAddr`].
///
/// Only the first address yielded by the lookup response is returned.
///
/// # Errors
///
/// Propagates any failure of the underlying lookup, and returns
/// [`ResolveError::NotFound`] when the response yields no address.
pub async fn lookup_ip(
    &self,
    srv: crate::ServiceName,
) -> Result<IpAddr, ResolveError> {
    let fqdn = format!("{}.{}", srv.dns_name(), DNS_ZONE);
    debug!(self.log, "lookup srv"; "dns_name" => &fqdn);
    let answer = self.inner.lookup_ip(&fqdn).await?;
    match answer.iter().next() {
        Some(ip) => Ok(ip),
        None => Err(ResolveError::NotFound(srv)),
    }
}
}

#[cfg(test)]
Expand Down
128 changes: 116 additions & 12 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,24 @@
"version": "0.0.1"
},
"paths": {
"/cockroachdb": {
"post": {
"summary": "Initializes a CockroachDB cluster, calling:",
"description": "<https://www.cockroachlabs.com/docs/stable/cockroach-init.html>\nand also populating the contents of the filesystem with preliminary tables.",
"operationId": "cockroachdb_init",
"responses": {
"204": {
"description": "resource updated"
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/disks/{disk_id}": {
"put": {
"operationId": "disk_put",
Expand Down Expand Up @@ -640,13 +658,6 @@
{
"type": "object",
"properties": {
"all_addresses": {
"description": "The addresses of all nodes within the cluster.",
"type": "array",
"items": {
"type": "string"
}
},
"type": {
"type": "string",
"enum": [
Expand All @@ -655,7 +666,6 @@
}
},
"required": [
"all_addresses",
"type"
]
},
Expand Down Expand Up @@ -689,6 +699,21 @@
}
]
},
"DatasetName": {
"type": "object",
"properties": {
"kind": {
"$ref": "#/components/schemas/DatasetKind"
},
"pool_name": {
"$ref": "#/components/schemas/ZpoolName"
}
},
"required": [
"kind",
"pool_name"
]
},
"DendriteAsic": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -1738,7 +1763,7 @@
"pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$"
},
"ServiceEnsureBody": {
"description": "Used to request that the Sled initialize certain services on initialization.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.",
"description": "Used to request that the Sled initialize multiple services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.",
"type": "object",
"properties": {
"services": {
Expand Down Expand Up @@ -2036,6 +2061,48 @@
"mode",
"type"
]
},
{
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": [
"clickhouse"
]
}
},
"required": [
"type"
]
},
{
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": [
"cockroach_db"
]
}
},
"required": [
"type"
]
},
{
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": [
"crucible"
]
}
},
"required": [
"type"
]
}
]
},
Expand All @@ -2050,6 +2117,15 @@
"format": "ipv6"
}
},
"dataset": {
"nullable": true,
"default": null,
"allOf": [
{
"$ref": "#/components/schemas/DatasetName"
}
]
},
"gz_addresses": {
"default": [],
"type": "array",
Expand Down Expand Up @@ -2080,6 +2156,7 @@
]
},
"ServiceZoneService": {
"description": "Used to request that the Sled initialize a single service.",
"type": "object",
"properties": {
"details": {
Expand Down Expand Up @@ -2481,13 +2558,16 @@
"description": "The type of zone which may be requested from Sled Agent",
"type": "string",
"enum": [
"clickhouse",
"cockroach_db",
"crucible_pantry",
"crucible",
"external_dns",
"internal_dns",
"nexus",
"ntp",
"oximeter",
"switch",
"crucible_pantry",
"ntp"
"switch"
]
},
"Zpool": {
Expand All @@ -2505,6 +2585,30 @@
"disk_type",
"id"
]
},
"ZpoolKind": {
"type": "string",
"enum": [
"external",
"internal"
]
},
"ZpoolName": {
"description": "A wrapper around a zpool name.\n\nThis expects that the format will be: `ox{i,p}_<UUID>` - we parse the prefix when reading the structure, and validate that the UUID can be utilized.",
"type": "object",
"properties": {
"id": {
"type": "string",
"format": "uuid"
},
"kind": {
"$ref": "#/components/schemas/ZpoolKind"
}
},
"required": [
"id",
"kind"
]
}
}
}
Expand Down
5 changes: 1 addition & 4 deletions sled-agent/src/bootstrap/hardware.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,7 @@ impl HardwareMonitor {
let hardware = HardwareManager::new(log, sled_mode)
.map_err(|e| Error::Hardware(e))?;

// TODO: The coupling between the storage and service manager is growing
// pretty tight; we should consider merging them together.
let storage_manager =
StorageManager::new(&log, underlay_etherstub.clone()).await;
let storage_manager = StorageManager::new(&log).await;

let service_manager = ServiceManager::new(
log.clone(),
Expand Down
13 changes: 13 additions & 0 deletions sled-agent/src/bootstrap/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,19 @@ use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use std::borrow::Cow;
use std::collections::HashSet;
use std::net::{Ipv4Addr, Ipv6Addr, SocketAddrV6};
use uuid::Uuid;

/// Selects which bootstrap addresses are considered.
///
/// Serialized in adjacently tagged form: the variant name (snake_case) is
/// stored under `"type"` and, for [`Self::OnlyThese`], the address set is
/// stored under `"addrs"`.
// An internally tagged enum (`tag` without `content`) cannot represent a
// newtype variant that wraps a sequence, so `OnlyThese` would fail to
// serialize or deserialize at runtime. Adjacent tagging keeps the unit
// variant's encoding (`{"type": "only_ours"}`) unchanged.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)]
#[serde(rename_all = "snake_case", tag = "type", content = "addrs")]
pub enum BootstrapAddressDiscovery {
    /// Ignore all bootstrap addresses except our own.
    OnlyOurs,
    /// Ignore all bootstrap addresses except the following.
    OnlyThese(HashSet<Ipv6Addr>),
}

/// Configuration for the "rack setup service".
///
/// The Rack Setup Service should be responsible for one-time setup actions,
Expand All @@ -24,6 +34,9 @@ use uuid::Uuid;
pub struct RackInitializeRequest {
pub rack_subnet: Ipv6Addr,

/// Describes how bootstrap addresses should be collected during RSS.
pub bootstrap_discovery: BootstrapAddressDiscovery,

/// The minimum number of sleds required to unlock the rack secret.
///
/// If this value is less than 2, no rack secret will be created on startup;
Expand Down
20 changes: 20 additions & 0 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pub fn api() -> SledApiDescription {
fn register_endpoints(api: &mut SledApiDescription) -> Result<(), String> {
api.register(disk_put)?;
api.register(filesystem_put)?;
api.register(cockroachdb_init)?;
api.register(instance_issue_disk_snapshot_request)?;
api.register(instance_put_migration_ids)?;
api.register(instance_put_state)?;
Expand Down Expand Up @@ -103,6 +104,7 @@ async fn filesystem_put(
let sa = rqctx.context();
let body_args = body.into_inner();
sa.filesystem_ensure(
body_args.id,
body_args.zpool_id,
body_args.dataset_kind,
body_args.address,
Expand All @@ -112,6 +114,24 @@ async fn filesystem_put(
Ok(HttpResponseUpdatedNoContent())
}

/// Initializes a CockroachDB cluster, calling:
///
/// <https://www.cockroachlabs.com/docs/stable/cockroach-init.html>
///
/// and also populating the contents of the filesystem with preliminary
/// tables.
#[endpoint {
    method = POST,
    path = "/cockroachdb",
}]
async fn cockroachdb_init(
    rqctx: RequestContext<SledAgent>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
    // The doc comment above is kept verbatim: the same text appears as this
    // operation's summary and description in the OpenAPI document.
    //
    // The request carries no body, so the handler only delegates to the sled
    // agent; `?` converts a failure into the `HttpError` returned to the
    // client.
    rqctx.context().cockroachdb_initialize().await?;
    Ok(HttpResponseUpdatedNoContent())
}

/// Path parameters for Instance requests (sled agent API)
#[derive(Deserialize, JsonSchema)]
struct InstancePathParam {
Expand Down
Loading