diff --git a/Cargo.lock b/Cargo.lock index 6e10d4c12f..f902117c15 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1332,6 +1332,32 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +[[package]] +name = "clickana" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "chrono", + "clap", + "clickhouse-admin-server-client", + "clickhouse-admin-types", + "dropshot 0.13.0", + "futures", + "omicron-common", + "omicron-workspace-hack", + "ratatui", + "schemars", + "serde_json", + "slog", + "slog-async", + "slog-dtrace", + "slog-error-chain", + "slog-term", + "tokio", + "tokio-postgres", +] + [[package]] name = "clickhouse-admin-api" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index bb9ff0e80f..3d8efe3acb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ members = [ "cockroach-admin/types", "common", "dev-tools/cert-dev", + "dev-tools/clickana", "dev-tools/clickhouse-cluster-dev", "dev-tools/ch-dev", "dev-tools/crdb-seed", @@ -158,6 +159,7 @@ default-members = [ "cockroach-admin/types", "common", "dev-tools/cert-dev", + "dev-tools/clickana", "dev-tools/clickhouse-cluster-dev", "dev-tools/ch-dev", "dev-tools/crdb-seed", @@ -332,6 +334,7 @@ chrono = { version = "0.4", features = [ "serde" ] } chrono-tz = "0.10.0" ciborium = "0.2.2" clap = { version = "4.5", features = ["cargo", "derive", "env", "wrap_help"] } +clickana = { path = "dev-tools/clickana" } clickhouse-admin-api = { path = "clickhouse-admin/api" } clickhouse-admin-keeper-client = { path = "clients/clickhouse-admin-keeper-client" } clickhouse-admin-server-client = { path = "clients/clickhouse-admin-server-client" } diff --git a/clickhouse-admin/Cargo.toml b/clickhouse-admin/Cargo.toml index 80b080b2ff..65ceb3fdbf 100644 --- a/clickhouse-admin/Cargo.toml +++ b/clickhouse-admin/Cargo.toml @@ -22,7 +22,9 @@ slog.workspace = true slog-async.workspace = true slog-dtrace.workspace = true slog-error-chain.workspace = true +slog-term.workspace = true serde.workspace = true +serde_json.workspace = true thiserror.workspace = true tokio.workspace = true tokio-postgres.workspace = true diff --git a/clickhouse-admin/types/src/lib.rs b/clickhouse-admin/types/src/lib.rs index 3b8696438c..aa1437bedc 100644 --- a/clickhouse-admin/types/src/lib.rs +++ b/clickhouse-admin/types/src/lib.rs @@ -1200,7 +1200,7 @@ pub enum Timestamp { #[derive(Debug, Serialize, Deserialize, JsonSchema, PartialEq)] #[serde(rename_all = "snake_case")] pub struct SystemTimeSeries { - pub time: Timestamp, + pub time: String, pub value: f64, // TODO: Would be really nice to have an enum with possible units (s, ms, bytes) // Not sure if I can even add this, the system tables don't mention units at all. @@ -2099,15 +2099,15 @@ snapshot_storage_disk=LocalSnapshotDisk let expected = vec![ SystemTimeSeries { - time: crate::Timestamp::Unix("1732494720".to_string()), + time: "1732494720".to_string(), value: 110220450825.75238, }, SystemTimeSeries { - time: crate::Timestamp::Unix("1732494840".to_string()), + time: "1732494840".to_string(), value: 110339992917.33331, }, SystemTimeSeries { - time: crate::Timestamp::Unix("1732494960".to_string()), + time: "1732494960".to_string(), value: 110421854037.33331, }, ]; @@ -2127,21 +2127,15 @@ snapshot_storage_disk=LocalSnapshotDisk let expected = vec![ SystemTimeSeries { - time: crate::Timestamp::Utc( - "2024-11-25T00:34:00Z".parse::>().unwrap(), - ), + time: "2024-11-25T00:34:00Z".to_string(), value: 110220450825.75238, }, SystemTimeSeries { - time: crate::Timestamp::Utc( - "2024-11-25T00:35:00Z".parse::>().unwrap(), - ), + time: "2024-11-25T00:35:00Z".to_string(), value: 110339992917.33331, }, SystemTimeSeries { - time: crate::Timestamp::Utc( - "2024-11-25T00:36:00Z".parse::>().unwrap(), - ), + time: "2024-11-25T00:36:00Z".to_string(), value: 110421854037.33331, }, ]; @@ -2176,7 +2170,7 @@ snapshot_storage_disk=LocalSnapshotDisk assert_eq!( format!("{}", root_cause), - "data did not match any variant of untagged enum Timestamp at line 1 column 12", + "invalid type: integer `2024`, expected a string at line 1 column 12", ); } } diff --git a/dev-tools/clickana/Cargo.toml b/dev-tools/clickana/Cargo.toml new file mode 100644 index 0000000000..a9f91b890b --- /dev/null +++ b/dev-tools/clickana/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "clickana" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +anyhow.workspace = true +camino.workspace = true +chrono.workspace = true +clap.workspace = true +clickhouse-admin-types.workspace = true +clickhouse-admin-server-client.workspace = true +dropshot.workspace = true +futures.workspace = true +omicron-common.workspace = true +ratatui.workspace = true +schemars.workspace = true +slog.workspace = true +slog-async.workspace = true +slog-dtrace.workspace = true +slog-error-chain.workspace = true +slog-term.workspace = true +serde_json.workspace = true +tokio.workspace = true +tokio-postgres.workspace = true + +omicron-workspace-hack.workspace = true + +[lints] +workspace = true diff --git a/dev-tools/clickana/src/bin/clickana.rs b/dev-tools/clickana/src/bin/clickana.rs new file mode 100644 index 0000000000..0c8d06156e --- /dev/null +++ b/dev-tools/clickana/src/bin/clickana.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::Result; +use camino::Utf8PathBuf; +use clap::Parser; +use clickana::Clickana; +use std::net::SocketAddr; + +const CLICKANA_LOG_FILE: &str = "/tmp/clickana.log"; + +#[tokio::main] +async fn main() -> Result<()> { + let args = Cli::parse(); + + let terminal = ratatui::init(); + let result = Clickana::new( + args.clickhouse_addr, + args.log_path, + args.sampling_interval, + args.time_range, + args.refresh_interval, + ) + .run(terminal) + .await; + ratatui::restore(); + result +} + +#[derive(Debug, Parser)] +struct Cli { + /// Path to the log file + #[arg( + long, + short, + env = "CLICKANA_LOG_PATH", + default_value = CLICKANA_LOG_FILE, + )] + log_path: Utf8PathBuf, + + /// Address where a clickhouse admin server is listening on + #[arg(long, short = 'a')] + clickhouse_addr: SocketAddr, + + /// The interval to collect monitoring data in seconds + #[arg(long, short, default_value_t = 60)] + sampling_interval: u64, + + /// Range of time to collect monitoring data in seconds + #[arg(long, short, default_value_t = 3600)] + time_range: u64, + + /// The interval at which the dashboards will refresh + #[arg(long, short, default_value_t = 60)] + refresh_interval: u64, +} diff --git a/dev-tools/clickana/src/chart.rs b/dev-tools/clickana/src/chart.rs new file mode 100644 index 0000000000..f8a78fb63d --- /dev/null +++ b/dev-tools/clickana/src/chart.rs @@ -0,0 +1,765 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::{bail, Result}; +use chrono::{DateTime, Utc}; +use clickhouse_admin_server_client::types::{SystemTable, SystemTimeSeries}; +use ratatui::{ + layout::{Constraint, Rect}, + style::{Color, Style, Stylize}, + symbols::Marker, + text::Line, + widgets::{Axis, Block, Chart, Dataset, GraphType, LegendPosition}, + Frame, +}; +use std::fmt::Display; + +// Ratatui requires data points in a Dataset to be f64 +const GIBIBYTE_F64: f64 = 1073741824.0; +const MEBIBYTE_F64: f64 = 1048576.0; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Unit { + Count, + Gibibyte, + Mebibyte, +} + +impl Display for Unit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Unit::Count => "", + Unit::Gibibyte => "GiB", + Unit::Mebibyte => "MiB", + }; + write!(f, "{s}") + } +} + +impl Unit { + /// Returns the value of the unit represented in bytes. + fn as_bytes_f64(&self) -> Result { + let bytes = match self { + Unit::Gibibyte => GIBIBYTE_F64, + Unit::Mebibyte => MEBIBYTE_F64, + Unit::Count => bail!("Count cannot be converted into bytes"), + }; + Ok(bytes) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum MetricName { + DiskUsage, + MemoryTracking, + QueryCount, + RunningQueries, +} + +impl Display for MetricName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + MetricName::DiskUsage => "DiskUsed_default", + MetricName::MemoryTracking => "CurrentMetric_MemoryTracking", + MetricName::QueryCount => "ProfileEvent_Query", + MetricName::RunningQueries => "CurrentMetric_Query", + }; + write!(f, "{s}") + } +} + +impl MetricName { + /// Returns the associated table to query for each metric. + pub fn table(&self) -> SystemTable { + match self { + MetricName::DiskUsage => SystemTable::AsynchronousMetricLog, + MetricName::MemoryTracking + | MetricName::QueryCount + | MetricName::RunningQueries => SystemTable::MetricLog, + } + } + + /// Returns the unit the data values will be represented as. + fn unit(&self) -> Unit { + match self { + MetricName::DiskUsage => Unit::Gibibyte, + MetricName::MemoryTracking => Unit::Mebibyte, + MetricName::QueryCount | MetricName::RunningQueries => Unit::Count, + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ChartMetadata { + pub title: String, + pub unit: Unit, +} + +impl ChartMetadata { + pub fn new(metric: MetricName, title: String) -> Self { + let unit = metric.unit(); + Self { title, unit } + } +} + +#[derive(Debug)] +struct TimeSeriesValues { + /// A collection of all the values from the timeseries + values: Vec, +} + +impl TimeSeriesValues { + fn new(raw_data: &Vec) -> Self { + let values: Vec = raw_data.iter().map(|ts| ts.value).collect(); + Self { values } + } + + fn min(&self) -> Result<&f64> { + let Some(min_value) = + self.values.iter().min_by(|a, b| a.partial_cmp(b).unwrap()) + else { + bail!("no values have been retrieved") + }; + + Ok(min_value) + } + + fn max(&self) -> Result<&f64> { + let Some(max_value) = + self.values.iter().max_by(|a, b| a.partial_cmp(b).unwrap()) + else { + bail!("no values have been retrieved") + }; + + Ok(max_value) + } + + /// Returns the average value of the max and min values. + fn avg(&self, min_value_label: f64, max_value_label: f64) -> f64 { + (min_value_label + max_value_label) / 2.0 + } +} + +// The result of the following functions will not be precise, but it doesn't +// matter since we just want an estimate for the chart's labels and bounds. +// all we need are values that are larger than the maximum value in the +// timeseries or smaller than the minimum value in the timeseries. + +/// Returns the sum of the maximum raw value and 1 or the equivalent of 1 +/// MiB or GiB in bytes. +fn padded_max_value_raw(unit: Unit, max_value_raw: &f64) -> Result { + let ceil_value = max_value_raw.ceil(); + let padded_value = match unit { + Unit::Count => ceil_value + 1.0, + Unit::Gibibyte | Unit::Mebibyte => ceil_value + unit.as_bytes_f64()?, + }; + Ok(padded_value) +} + +/// Returns the sum of the max raw value and 1 or the equivalent of 1 +/// Mib or Gib. +fn padded_max_value_as_unit(unit: Unit, max_value_raw: &f64) -> Result { + let label_value = match unit { + Unit::Count => max_value_raw + 1.0, + Unit::Gibibyte | Unit::Mebibyte => { + (max_value_raw / unit.as_bytes_f64()?) + 1.0 + } + }; + Ok(label_value.ceil()) +} + +/// Returns the difference of the minimum raw value and 1 or the equivalent +/// of 1 in MiB or GiB in bytes. If the minimum is equal to or less than 1.0, +/// or the equivalent of 1 once converted from bytes to the expected unit +/// (e.g. less than or equal to 1048576 if we're using MiB) we'll use 0.0 as +/// the minimum value as we don't expect any of our charts +/// to require negative numbers for now. +fn padded_min_value_raw(unit: Unit, min_value_raw: &f64) -> Result { + let padded_value = match unit { + Unit::Count => { + if *min_value_raw <= 1.0 { + 0.0 + } else { + min_value_raw - 1.0 + } + } + Unit::Gibibyte | Unit::Mebibyte => { + let bytes = unit.as_bytes_f64()?; + if *min_value_raw <= bytes { + 0.0 + } else { + min_value_raw - bytes + } + } + }; + Ok(padded_value.floor()) +} + +/// Returns the difference of the minimum raw value and 1 or the equivalent +/// of 1 in MiB or GiB in bytes. If the minimum is less than 1, we'll use +/// 0 as the minimum value as we don't expect any of our charts to require +/// negative numbers for now. +fn padded_min_value_as_unit(unit: Unit, min_value_raw: &f64) -> Result { + let padded_value = match unit { + Unit::Count => { + if *min_value_raw < 1.0 { + 0.0 + } else { + min_value_raw - 1.0 + } + } + Unit::Gibibyte | Unit::Mebibyte => { + let value_as_unit = min_value_raw / unit.as_bytes_f64()?; + if value_as_unit < 1.0 { + 0.0 + } else { + value_as_unit - 1.0 + } + } + }; + Ok(padded_value.floor()) +} + +#[derive(Debug, PartialEq)] +struct YAxisValues { + lower_label: String, + mid_label: String, + upper_label: String, + lower_bound: f64, + upper_bound: f64, +} + +impl YAxisValues { + fn new(unit: Unit, raw_data: &Vec) -> Result { + // Retrieve values only to create Y axis bounds and labels + let values = TimeSeriesValues::new(&raw_data); + let max_value = values.max()?; + let min_value = values.min()?; + + // In case there is very little variance in the y axis, we will be adding some + // padding to the bounds and labels so we don't end up with repeated labels or + // straight lines too close to the upper bounds. + let upper_bound = padded_max_value_raw(unit, max_value)?; + let upper_label_as_unit = padded_max_value_as_unit(unit, max_value)?; + let lower_bound = padded_min_value_raw(unit, min_value)?; + let lower_label_as_unit = padded_min_value_as_unit(unit, min_value)?; + let mid_label_as_unit = + values.avg(lower_label_as_unit, upper_label_as_unit); + + // To nicely display the mid value label for the Y axis, we do the following: + // - It is not displayed it if it is a 0.0. + // - If it does not have a fractional number we display it as an integer. + // - Else, we display the number as is up to the first fractional number. + //let mid_value = mid_label; + let fractional_of_mid_value = + mid_label_as_unit - mid_label_as_unit.floor(); + let mid_value_formatted = format!("{:.1}", mid_label_as_unit); + let mid_label = if mid_value_formatted == *"0.0" { + "".to_string() + } else if fractional_of_mid_value == 0.0 { + format!( + "{} {}", + mid_value_formatted.split('.').next().unwrap(), + unit + ) + } else { + format!("{} {}", mid_value_formatted, unit) + }; + + let upper_label = format!("{} {}", upper_label_as_unit, unit); + let lower_label = format!("{} {}", lower_label_as_unit, unit); + + Ok(Self { + lower_label, + mid_label, + upper_label, + lower_bound, + upper_bound, + }) + } +} + +#[derive(Debug)] +struct TimeSeriesTimestamps { + /// A collection of all the timestamps from the timeseries + timestamps: Vec, +} + +impl TimeSeriesTimestamps { + fn new(raw_data: &Vec) -> Self { + let timestamps: Vec = raw_data + .iter() + .map(|ts| { + ts.time.trim_matches('"').parse::().unwrap_or_else(|_| { + panic!("could not parse timestamp {} into i64", ts.time) + }) + }) + .collect(); + Self { timestamps } + } + + fn min(&self) -> Result<&i64> { + let Some(start_time) = self.timestamps.iter().min() else { + bail!("failed to retrieve start time, timestamp list is empty") + }; + Ok(start_time) + } + + fn max(&self) -> Result<&i64> { + let Some(end_time) = self.timestamps.iter().max() else { + bail!("failed to retrieve end time, timestamp list is empty") + }; + Ok(end_time) + } + + fn avg(&self, start_time: &i64, end_time: &i64) -> i64 { + (start_time + end_time) / 2 + } +} + +#[derive(Debug, PartialEq)] +pub struct XAxisTimestamps { + mid_time_label: DateTime, + pub start_time_label: DateTime, + pub end_time_label: DateTime, + start_time_bound: f64, + end_time_bound: f64, +} + +impl XAxisTimestamps { + fn new(raw_data: &Vec) -> Result { + // Retrieve timestamps only to create chart bounds and labels + let timestamps = TimeSeriesTimestamps::new(&raw_data); + // These timestamps will be used to calculate start and end timestamps in order + // to create labels and set bounds for the X axis. As above, some of these conversions + // may lose precision, but it's OK as these values are only used to make sure the + // datapoints fit within the graph nicely. + let start_time = timestamps.min()?; + let end_time = timestamps.max()?; + let avg_time = timestamps.avg(start_time, end_time); + + let Some(start_time_label) = DateTime::from_timestamp(*start_time, 0) + else { + bail!( + "failed to convert timestamp to UTC date and time; + timestamp = {}", + start_time + ) + }; + let Some(end_time_label) = DateTime::from_timestamp(*end_time, 0) + else { + bail!( + "failed to convert timestamp to UTC date and time; + timestamp = {}", + end_time + ) + }; + let Some(mid_time_label) = DateTime::from_timestamp(avg_time, 0) else { + bail!( + "failed to convert timestamp to UTC date and time; + timestamp = {}", + avg_time + ) + }; + + let start_time_bound = *start_time as f64; + let end_time_bound = *end_time as f64; + + Ok(Self { + mid_time_label, + start_time_label, + end_time_label, + start_time_bound, + end_time_bound, + }) + } +} + +#[derive(Debug, PartialEq)] +struct DataPoints { + data: Vec<(f64, f64)>, +} + +impl DataPoints { + fn new(timeseries: &Vec) -> Self { + // These values will be used to render the graph and ratatui + // requires them to be f64 + let data: Vec<(f64, f64)> = timeseries + .iter() + .map(|ts| { + ( + ts.time.trim_matches('"').parse::().unwrap_or_else( + |_| { + panic!( + "could not parse timestamp {} into f64", + ts.time + ) + }, + ), + ts.value, + ) + }) + .collect(); + Self { data } + } +} + +#[derive(Debug, PartialEq)] +pub struct ChartData { + metadata: ChartMetadata, + data_points: DataPoints, + pub x_axis_timestamps: XAxisTimestamps, + y_axis_values: YAxisValues, +} + +impl ChartData { + pub fn new( + raw_data: Vec, + metadata: ChartMetadata, + ) -> Result { + // Retrieve datapoints that will be charted + let data_points = DataPoints::new(&raw_data); + + // Retrieve X axis bounds and labels + let x_axis_timestamps = XAxisTimestamps::new(&raw_data)?; + + // Retrieve X axis bounds and labels + let y_axis_values = YAxisValues::new(metadata.unit, &raw_data)?; + + Ok(Self { metadata, data_points, x_axis_timestamps, y_axis_values }) + } + + pub fn render_line_chart(&self, frame: &mut Frame, area: Rect) { + let datasets = vec![Dataset::default() + .marker(Marker::Braille) + .style(Style::default().fg(Color::LightGreen)) + .graph_type(GraphType::Line) + .data(&self.data_points.data)]; + + let chart = Chart::new(datasets) + .block( + Block::bordered() + .title(Line::from(self.title()).cyan().bold().centered()), + ) + .x_axis( + Axis::default() + .style(Style::default().gray()) + .bounds([self.start_time_bound(), self.end_time_bound()]) + .labels([ + self.start_time_label().bold(), + self.mid_time_label().bold(), + self.end_time_label().bold(), + ]), + ) + .y_axis( + Axis::default() + .style(Style::default().gray()) + .bounds([ + self.lower_value_bound(), + self.upper_value_bound(), + ]) + .labels([ + self.lower_value_label().bold(), + self.mid_value_label().bold(), + self.upper_value_label().bold(), + ]), + ) + .legend_position(Some(LegendPosition::TopLeft)) + .hidden_legend_constraints(( + Constraint::Ratio(1, 2), + Constraint::Ratio(1, 2), + )); + + frame.render_widget(chart, area); + } + + fn title(&self) -> String { + self.metadata.title.clone() + } + + pub fn start_date_time(&self) -> DateTime { + self.x_axis_timestamps.start_time_label + } + + pub fn end_date_time(&self) -> DateTime { + self.x_axis_timestamps.end_time_label + } + + fn start_time_label(&self) -> String { + self.x_axis_timestamps.start_time_label.time().to_string() + } + + fn mid_time_label(&self) -> String { + self.x_axis_timestamps.mid_time_label.time().to_string() + } + + fn end_time_label(&self) -> String { + self.x_axis_timestamps.end_time_label.time().to_string() + } + + fn start_time_bound(&self) -> f64 { + self.x_axis_timestamps.start_time_bound + } + + fn end_time_bound(&self) -> f64 { + self.x_axis_timestamps.end_time_bound + } + + fn lower_value_label(&self) -> String { + self.y_axis_values.lower_label.clone() + } + + fn mid_value_label(&self) -> String { + self.y_axis_values.mid_label.clone() + } + + fn upper_value_label(&self) -> String { + self.y_axis_values.upper_label.clone() + } + + fn lower_value_bound(&self) -> f64 { + self.y_axis_values.lower_bound + } + + fn upper_value_bound(&self) -> f64 { + self.y_axis_values.upper_bound + } +} + +#[cfg(test)] +mod tests { + use crate::{ + chart::{Unit, YAxisValues}, + ChartData, ChartMetadata, MetricName, + }; + use chrono::DateTime; + use clickhouse_admin_server_client::types::SystemTimeSeries; + + use super::{DataPoints, XAxisTimestamps}; + + #[test] + fn gather_chart_data_for_disk_usage_success() { + let metadata = + ChartMetadata::new(MetricName::DiskUsage, "Test Chart".to_string()); + let raw_data = vec![ + SystemTimeSeries { + time: "1732223400".to_string(), + value: 479551511587.3104, + }, + SystemTimeSeries { + time: "1732223520".to_string(), + value: 479555459822.93335, + }, + SystemTimeSeries { + time: "1732223640".to_string(), + value: 479560290201.6, + }, + ]; + + let expected_result = ChartData { + metadata: ChartMetadata { + title: "Test Chart".to_string(), + unit: Unit::Gibibyte, + }, + data_points: DataPoints { + data: vec![ + (1732223400.0, 479551511587.3104), + (1732223520.0, 479555459822.93335), + (1732223640.0, 479560290201.6), + ], + }, + x_axis_timestamps: XAxisTimestamps { + start_time_label: DateTime::from_timestamp(1732223400, 0) + .unwrap(), + mid_time_label: DateTime::from_timestamp(1732223520, 0) + .unwrap(), + end_time_label: DateTime::from_timestamp(1732223640, 0) + .unwrap(), + start_time_bound: 1732223400.0, + end_time_bound: 1732223640.0, + }, + y_axis_values: YAxisValues { + lower_label: "445 GiB".to_string(), + mid_label: "446.5 GiB".to_string(), + upper_label: "448 GiB".to_string(), + lower_bound: 478477769763.0, + upper_bound: 480634032026.0, + }, + }; + let result = ChartData::new(raw_data, metadata).unwrap(); + assert_eq!(result, expected_result); + } + + #[test] + fn gather_chart_data_for_memory_tracking_success() { + let metadata = ChartMetadata::new( + MetricName::MemoryTracking, + "Test Chart".to_string(), + ); + let raw_data = vec![ + SystemTimeSeries { + time: "1732223400".to_string(), + value: 479551511587.3104, + }, + SystemTimeSeries { + time: "1732223520".to_string(), + value: 479555459822.93335, + }, + SystemTimeSeries { + time: "1732223640".to_string(), + value: 479560290201.6, + }, + ]; + + let expected_result = ChartData { + metadata: ChartMetadata { + title: "Test Chart".to_string(), + unit: Unit::Mebibyte, + }, + data_points: DataPoints { + data: vec![ + (1732223400.0, 479551511587.3104), + (1732223520.0, 479555459822.93335), + (1732223640.0, 479560290201.6), + ], + }, + x_axis_timestamps: XAxisTimestamps { + start_time_label: DateTime::from_timestamp(1732223400, 0) + .unwrap(), + mid_time_label: DateTime::from_timestamp(1732223520, 0) + .unwrap(), + end_time_label: DateTime::from_timestamp(1732223640, 0) + .unwrap(), + start_time_bound: 1732223400.0, + end_time_bound: 1732223640.0, + }, + y_axis_values: YAxisValues { + lower_label: "457334 MiB".to_string(), + mid_label: "457340 MiB".to_string(), + upper_label: "457346 MiB".to_string(), + lower_bound: 479550463011.0, + upper_bound: 479561338778.0, + }, + }; + let result = ChartData::new(raw_data, metadata).unwrap(); + assert_eq!(result, expected_result); + } + + #[test] + fn gather_chart_data_for_query_count_success() { + let metadata = ChartMetadata::new( + MetricName::QueryCount, + "Test Chart".to_string(), + ); + let raw_data = vec![ + SystemTimeSeries { time: "1732223400".to_string(), value: 0.0 }, + SystemTimeSeries { time: "1732223520".to_string(), value: 0.004 }, + SystemTimeSeries { time: "1732223640".to_string(), value: 0.0 }, + ]; + + let expected_result = ChartData { + metadata: ChartMetadata { + title: "Test Chart".to_string(), + unit: Unit::Count, + }, + data_points: DataPoints { + data: vec![ + (1732223400.0, 0.0), + (1732223520.0, 0.004), + (1732223640.0, 0.0), + ], + }, + x_axis_timestamps: XAxisTimestamps { + start_time_label: DateTime::from_timestamp(1732223400, 0) + .unwrap(), + mid_time_label: DateTime::from_timestamp(1732223520, 0) + .unwrap(), + end_time_label: DateTime::from_timestamp(1732223640, 0) + .unwrap(), + start_time_bound: 1732223400.0, + end_time_bound: 1732223640.0, + }, + y_axis_values: YAxisValues { + lower_label: "0 ".to_string(), + mid_label: "1 ".to_string(), + upper_label: "2 ".to_string(), + lower_bound: 0.0, + upper_bound: 2.0, + }, + }; + let result = ChartData::new(raw_data, metadata).unwrap(); + assert_eq!(result, expected_result); + } + + #[test] + fn gather_chart_data_for_running_queries_success() { + let metadata = ChartMetadata::new( + MetricName::RunningQueries, + "Test Chart".to_string(), + ); + let raw_data = vec![ + SystemTimeSeries { time: "1732223400".to_string(), value: 1.554 }, + SystemTimeSeries { time: "1732223520".to_string(), value: 1.877 }, + SystemTimeSeries { time: "1732223640".to_string(), value: 1.3456 }, + ]; + + let expected_result = ChartData { + metadata: ChartMetadata { + title: "Test Chart".to_string(), + unit: Unit::Count, + }, + data_points: DataPoints { + data: vec![ + (1732223400.0, 1.554), + (1732223520.0, 1.877), + (1732223640.0, 1.3456), + ], + }, + x_axis_timestamps: XAxisTimestamps { + start_time_label: DateTime::from_timestamp(1732223400, 0) + .unwrap(), + mid_time_label: DateTime::from_timestamp(1732223520, 0) + .unwrap(), + end_time_label: DateTime::from_timestamp(1732223640, 0) + .unwrap(), + start_time_bound: 1732223400.0, + end_time_bound: 1732223640.0, + }, + y_axis_values: YAxisValues { + lower_label: "0 ".to_string(), + mid_label: "1.5 ".to_string(), + upper_label: "3 ".to_string(), + lower_bound: 0.0, + upper_bound: 3.0, + }, + }; + let result = ChartData::new(raw_data, metadata).unwrap(); + assert_eq!(result, expected_result); + } + + #[test] + #[should_panic( + expected = "could not parse timestamp Some nonsense string into f64" + )] + fn gather_chart_data_failure() { + let metadata = + ChartMetadata::new(MetricName::DiskUsage, "Test Chart".to_string()); + let raw_data = vec![ + SystemTimeSeries { + time: "Some nonsense string".to_string(), + value: 479551511587.3104, + }, + SystemTimeSeries { + time: "1732223520".to_string(), + value: 479555459822.93335, + }, + SystemTimeSeries { + time: "1732223640".to_string(), + value: 479560290201.6, + }, + ]; + + let _ = ChartData::new(raw_data, metadata); + } +} diff --git a/dev-tools/clickana/src/lib.rs b/dev-tools/clickana/src/lib.rs new file mode 100644 index 0000000000..76af35ba8d --- /dev/null +++ b/dev-tools/clickana/src/lib.rs @@ -0,0 +1,274 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::{anyhow, bail, Context, Result}; +use camino::Utf8PathBuf; +use chrono::{DateTime, Utc}; +use clickhouse_admin_server_client::types::{ + SystemTimeSeries, TimestampFormat, +}; +use clickhouse_admin_server_client::Client as ClickhouseServerClient; +use futures::stream::FuturesOrdered; +use futures::StreamExt; +use omicron_common::FileKv; +use ratatui::crossterm::event::{self, Event, KeyCode}; +use ratatui::layout::{Constraint, Layout, Rect}; +use ratatui::style::{Color, Style, Stylize}; +use ratatui::text::Span; +use ratatui::widgets::Paragraph; +use ratatui::{DefaultTerminal, Frame}; +use slog::{o, Drain, Logger}; +use slog_async::Async; +use slog_term::{FullFormat, PlainDecorator}; +use std::collections::BTreeMap; +use std::future::Future; +use std::net::SocketAddr; +use std::pin::Pin; +use std::time::{Duration, Instant}; + +use crate::chart::{ChartData, ChartMetadata, MetricName}; + +mod chart; + +#[derive(Debug)] +struct Dashboard { + start_time: DateTime, + end_time: DateTime, + top_left_frame: ChartData, + top_right_frame: ChartData, + bottom_left_frame: ChartData, + bottom_right_frame: ChartData, +} + +#[derive(Clone, Debug)] +pub struct Clickana { + clickhouse_addr: SocketAddr, + log_path: Utf8PathBuf, + sampling_interval: u64, + time_range: u64, + refresh_interval: u64, +} + +impl Clickana { + pub fn new( + clickhouse_addr: SocketAddr, + log_path: Utf8PathBuf, + sampling_interval: u64, + time_range: u64, + refresh_interval: u64, + ) -> Self { + Self { + clickhouse_addr, + log_path, + sampling_interval, + time_range, + refresh_interval, + } + } + + pub async fn run(self, mut terminal: DefaultTerminal) -> Result<()> { + let admin_url = format!("http://{}", self.clickhouse_addr); + let log = self.new_logger()?; + let client = ClickhouseServerClient::new(&admin_url, log.clone()); + + let tick_interval = Duration::from_secs(self.refresh_interval); + let mut last_tick = Instant::now(); + loop { + // Charts we will be showing in the dashboard + // + // We are hardcoding these for now. In the future these will likely be taken + // from a TOML config file. + let charts = BTreeMap::from([ + (MetricName::DiskUsage, "Disk Usage".to_string()), + ( + MetricName::MemoryTracking, + "Memory Allocated by the Server".to_string(), + ), + ( + MetricName::QueryCount, + "Queries Started per Second".to_string(), + ), + (MetricName::RunningQueries, "Queries Running".to_string()), + ]); + + let mut tasks = FuturesOrdered::< + Pin>>>, + >::new(); + + for (metric_name, title) in charts { + let s = self.clone(); + let c = client.clone(); + + let task = Box::pin(async move { + let metadata = ChartMetadata::new(metric_name, title); + let data = s.get_api_data(&c, metric_name).await?; + ChartData::new(data, metadata) + }); + tasks.push_back(task); + } + + if tasks.len() != 4 { + bail!( + "expected information for 4 charts, received {} instead", + tasks.len() + ); + } + + // TODO: Eventually we may want to not have a set amount of charts and make the + // dashboard a bit more dynamic. Perhaps taking a toml configuration file or + // something like that. We can then create a vector of "ChartData"s for Dashboard + // to take and create the layout dynamically. + // + // IDEA (ajs): I think it would be useful to be able to have a little menu of charts + // on the side of the pane, and then you can scroll and select which ones to show + // without having to restart the app, or mess with a toml file. + // You could also allow toggling between a set of predefined layouts to make it always + // look nice. So you could show, 1, 2, 4, 6, 8 charts or something and allow selecting + // which to show in each view. You could even remember which charts to show in each layout, + // so you could toggle back and forth between different layouts and see all the charts, + // some with more detail. + // + // We have already checked that the length of tasks is 4, so it's safe to unwrap + let top_left_frame: ChartData = tasks.next().await.unwrap()?; + let top_right_frame: ChartData = tasks.next().await.unwrap()?; + let bottom_left_frame: ChartData = tasks.next().await.unwrap()?; + let bottom_right_frame: ChartData = tasks.next().await.unwrap()?; + + // We only need to retrieve from one chart as they will all be relatively the same. + // Rarely, the charts may have a variance of a second or so depending on when + // the API calls were made, but for the header block we don't need exact precision. + let start_time = top_left_frame.start_date_time(); + let end_time = top_left_frame.end_date_time(); + + let dashboard = Dashboard { + start_time, + end_time, + top_left_frame, + top_right_frame, + bottom_left_frame, + bottom_right_frame, + }; + terminal.draw(|frame| self.draw(frame, dashboard))?; + + let timeout = tick_interval.saturating_sub(last_tick.elapsed()); + if event::poll(timeout)? { + if let Event::Key(key) = event::read()? { + // To exit the dashboard press the "q" key + if key.code == KeyCode::Char('q') { + return Ok(()); + } + } + } + + if last_tick.elapsed() >= tick_interval { + last_tick = Instant::now(); + } + } + } + + fn draw(&self, frame: &mut Frame, dashboard: Dashboard) { + let [heading, top, bottom] = Layout::vertical([ + Constraint::Length(4), + // TODO: If we make the dashboard with too many charts + // we may want to reconsider setting sizes instead of filling + // the space + Constraint::Fill(1), + Constraint::Fill(1), + ]) + .areas(frame.area()); + let [title] = + Layout::horizontal([Constraint::Fill(1); 1]).areas(heading); + let [top_left_frame, top_right_frame] = + Layout::horizontal([Constraint::Fill(1); 2]).areas(top); + let [bottom_left_frame, bottom_right_frame] = + Layout::horizontal([Constraint::Fill(1); 2]).areas(bottom); + + self.render_title_bar(frame, title, &dashboard); + + dashboard.top_left_frame.render_line_chart(frame, top_left_frame); + dashboard.top_right_frame.render_line_chart(frame, top_right_frame); + dashboard.bottom_left_frame.render_line_chart(frame, bottom_left_frame); + dashboard + .bottom_right_frame + .render_line_chart(frame, bottom_right_frame); + } + + fn render_title_bar( + &self, + frame: &mut Frame, + area: Rect, + dashboard: &Dashboard, + ) { + let style = Style::new().fg(Color::Green).bold(); + let title = vec![ + Span::styled("CLICKANA", style).into_centered_line(), + Span::styled( + format!("Sampling Interval: {}s", self.sampling_interval), + style, + ) + .into_left_aligned_line(), + Span::styled( + format!( + "Time Range: {} - {} ({}s)", + dashboard.start_time, dashboard.end_time, self.time_range + ), + style, + ) + .into_left_aligned_line(), + Span::styled( + format!("Refresh Interval {}s", self.refresh_interval), + style, + ) + .into_left_aligned_line(), + ]; + let p = Paragraph::new(title); + + frame.render_widget(p, area); + } + + async fn get_api_data( + &self, + client: &ClickhouseServerClient, + metric: MetricName, + ) -> Result> { + let timeseries = client + .system_timeseries_avg( + metric.table(), + &format!("{metric}"), + Some(self.sampling_interval), + Some(self.time_range), + Some(TimestampFormat::UnixEpoch), + ) + .await + .map(|t| t.into_inner()) + .map_err(|e| { + anyhow!( + concat!( + "failed to retrieve timeseries from clickhouse server; ", + "error = {}", + ), + e + ) + }); + + timeseries + } + + fn new_logger(&self) -> Result { + let file = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(self.log_path.clone()) + .with_context(|| { + format!("error opening log file {}", self.log_path) + })?; + + let decorator = PlainDecorator::new(file); + let drain = FullFormat::new(decorator).build().fuse(); + let drain = Async::new(drain).build().fuse(); + + Ok(slog::Logger::root(drain, o!(FileKv))) + } +} diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 667a666375..e501e650b1 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -797,12 +797,23 @@ enum ValidateCommands { /// Validate each `volume_references` column in the region snapshots table ValidateVolumeReferences, + /// Find either regions Nexus knows about that the corresponding Crucible + /// agent says were deleted, or regions that Nexus doesn't know about. + ValidateRegions(ValidateRegionsArgs), + /// Find either region snapshots Nexus knows about that the corresponding /// Crucible agent says were deleted, or region snapshots that Nexus doesn't /// know about. ValidateRegionSnapshots, } +#[derive(Debug, Args)] +struct ValidateRegionsArgs { + /// Delete Regions Nexus is unaware of + #[clap(long, default_value_t = false)] + clean_up_orphaned_regions: bool, +} + #[derive(Debug, Args)] struct VolumeArgs { #[command(subcommand)] @@ -1093,6 +1104,20 @@ impl DbArgs { DbCommands::Validate(ValidateArgs { command: ValidateCommands::ValidateVolumeReferences, }) => cmd_db_validate_volume_references(&datastore).await, + DbCommands::Validate(ValidateArgs { + command: ValidateCommands::ValidateRegions(args), + }) => { + let clean_up_orphaned_regions = + if args.clean_up_orphaned_regions { + let token = omdb.check_allow_destructive()?; + CleanUpOrphanedRegions::Yes { _token: token } + } else { + CleanUpOrphanedRegions::No + }; + + cmd_db_validate_regions(&datastore, clean_up_orphaned_regions) + .await + } DbCommands::Validate(ValidateArgs { command: ValidateCommands::ValidateRegionSnapshots, }) => cmd_db_validate_region_snapshots(&datastore).await, @@ -2566,39 +2591,48 @@ async fn cmd_db_region_used_by( async fn cmd_db_region_find_deleted( datastore: &DataStore, ) -> Result<(), anyhow::Error> { - let datasets_regions_volumes = + let freed_crucible_resources = datastore.find_deleted_volume_regions().await?; #[derive(Tabled)] - struct Row { + struct RegionRow { dataset_id: DatasetUuid, region_id: Uuid, - volume_id: String, } - let rows: Vec = datasets_regions_volumes - .into_iter() + #[derive(Tabled)] + struct VolumeRow { + volume_id: Uuid, + } + + let region_rows: Vec = freed_crucible_resources + .datasets_and_regions + .iter() .map(|row| { - let (dataset, region, volume) = row; + let (dataset, region) = row; - Row { - dataset_id: dataset.id(), - region_id: region.id(), - volume_id: if let Some(volume) = volume { - volume.id().to_string() - } else { - String::from("") - }, - } + RegionRow { dataset_id: dataset.id(), region_id: region.id() } }) .collect(); - let table = tabled::Table::new(rows) + let table = tabled::Table::new(region_rows) .with(tabled::settings::Style::psql()) .to_string(); println!("{}", table); + let volume_rows: Vec = freed_crucible_resources + .volumes + .iter() + .map(|volume_id| VolumeRow { volume_id: *volume_id }) + .collect(); + + let volume_table = tabled::Table::new(volume_rows) + .with(tabled::settings::Style::psql()) + .to_string(); + + println!("{}", volume_table); + Ok(()) } @@ -4526,6 +4560,283 @@ async fn cmd_db_validate_volume_references( Ok(()) } +enum CleanUpOrphanedRegions { + Yes { _token: DestructiveOperationToken }, + No, +} + +async fn cmd_db_validate_regions( + datastore: &DataStore, + clean_up_orphaned_regions: CleanUpOrphanedRegions, +) -> Result<(), anyhow::Error> { + // *Lifetime note*: + // + // The lifetime of the region record in cockroachdb is longer than the time + // the Crucible agent's region is in a non-destroyed state: Nexus will + // perform the query to allocate regions (inserting them into the database) + // before it ensures those regions are created (i.e. making the POST request + // to the appropriate Crucible agent to create them), and it will request + // that the regions be deleted (then wait for that region to transition to + // the destroyed state) before hard-deleting the records in the database. + + // First, get all region records (with their corresponding dataset) + let datasets_and_regions: Vec<(Dataset, Region)> = datastore + .pool_connection_for_tests() + .await? + .transaction_async(|conn| async move { + // Selecting all datasets and regions requires a full table scan + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::region::dsl; + + dsl::region + .inner_join( + dataset_dsl::dataset + .on(dsl::dataset_id.eq(dataset_dsl::id)), + ) + .select((Dataset::as_select(), Region::as_select())) + .get_results_async(&conn) + .await + }) + .await?; + + #[derive(Tabled)] + struct Row { + dataset_id: DatasetUuid, + region_id: Uuid, + dataset_addr: std::net::SocketAddrV6, + error: String, + } + + let mut rows = Vec::new(); + + // Reconcile with the corresponding Crucible Agent: are they aware of each + // region in the database? + for (dataset, region) in &datasets_and_regions { + // If the dataset was expunged, do not attempt to contact the Crucible + // agent! + let in_service = + datastore.dataset_physical_disk_in_service(dataset.id()).await?; + + if !in_service { + eprintln!( + "dataset {} {:?} is not in service, skipping", + dataset.id(), + dataset.address(), + ); + continue; + } + + use crucible_agent_client::types::RegionId; + use crucible_agent_client::types::State; + use crucible_agent_client::Client as CrucibleAgentClient; + + let Some(dataset_addr) = dataset.address() else { + eprintln!("Dataset {} missing an IP address", dataset.id()); + continue; + }; + + let url = format!("http://{}", dataset_addr); + let client = CrucibleAgentClient::new(&url); + + let actual_region = + match client.region_get(&RegionId(region.id().to_string())).await { + Ok(region) => region.into_inner(), + + Err(e) => { + // Either there was a communication error, or the agent is + // unaware of the Region (this would be a 404). + match e { + crucible_agent_client::Error::ErrorResponse(rv) + if rv.status() == http::StatusCode::NOT_FOUND => + { + rows.push(Row { + dataset_id: dataset.id(), + region_id: region.id(), + dataset_addr, + error: String::from( + "Agent does not know about this region!", + ), + }); + } + + _ => { + eprintln!( + "{} region_get {:?}: {e}", + dataset_addr, + region.id(), + ); + } + } + + continue; + } + }; + + // The Agent is aware of this region, but is it in the appropriate + // state? + + match actual_region.state { + State::Destroyed => { + // If it is destroyed, then this is invalid as the record should + // be hard-deleted as well (see the lifetime note above). Note + // that omdb could be racing a Nexus that is performing region + // deletion: if the region transitioned to Destroyed but Nexus + // is waiting to re-poll, it will not have hard-deleted the + // region record yet. + + rows.push(Row { + dataset_id: dataset.id(), + region_id: region.id(), + dataset_addr, + error: String::from( + "region may need to be manually hard-deleted", + ), + }); + } + + _ => { + // ok + } + } + } + + // Reconcile with the Crucible agents: are there regions that Nexus does not + // know about? Ask each Crucible agent for its list of regions, then check + // in the database: if that region is _not_ in the database, then either it + // was never created by Nexus, or it was hard-deleted by Nexus. Either way, + // omdb should (if the command line argument is supplied) request that the + // orphaned region be deleted. + // + // Note: This should not delete what is actually a valid region, see the + // lifetime note above. + + let mut orphaned_bytes: u64 = 0; + + let db_region_ids: BTreeSet = + datasets_and_regions.iter().map(|(_, r)| r.id()).collect(); + + // Find all the Crucible datasets + let datasets: Vec = datastore + .pool_connection_for_tests() + .await? + .transaction_async(|conn| async move { + // Selecting all datasets and regions requires a full table scan + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + + use db::schema::dataset::dsl; + + dsl::dataset + .filter(dsl::kind.eq(nexus_db_model::DatasetKind::Crucible)) + .select(Dataset::as_select()) + .get_results_async(&conn) + .await + }) + .await?; + + for dataset in &datasets { + // If the dataset was expunged, do not attempt to contact the Crucible + // agent! + let in_service = + datastore.dataset_physical_disk_in_service(dataset.id()).await?; + + if !in_service { + eprintln!( + "dataset {} {:?} is not in service, skipping", + dataset.id(), + dataset.address(), + ); + continue; + } + + use crucible_agent_client::types::State; + use crucible_agent_client::Client as CrucibleAgentClient; + + let Some(dataset_addr) = dataset.address() else { + eprintln!("Dataset {} missing an IP address", dataset.id()); + continue; + }; + + let url = format!("http://{}", dataset_addr); + let client = CrucibleAgentClient::new(&url); + + let actual_regions = match client.region_list().await { + Ok(v) => v.into_inner(), + Err(e) => { + eprintln!("{} region_list: {e}", dataset_addr); + continue; + } + }; + + for actual_region in actual_regions { + // Skip doing anything if the region is already tombstoned or + // destroyed + match actual_region.state { + State::Destroyed | State::Tombstoned => { + // the Crucible agent will eventually clean this up, or + // already has. + continue; + } + + State::Failed | State::Requested | State::Created => { + // this region needs cleaning up if there isn't an + // associated db record + } + } + + let actual_region_id: Uuid = actual_region.id.0.parse().unwrap(); + if !db_region_ids.contains(&actual_region_id) { + orphaned_bytes += actual_region.block_size + * actual_region.extent_size + * u64::from(actual_region.extent_count); + + match clean_up_orphaned_regions { + CleanUpOrphanedRegions::Yes { .. } => { + match client.region_delete(&actual_region.id).await { + Ok(_) => { + eprintln!( + "{} region {} deleted ok", + dataset_addr, actual_region.id, + ); + } + + Err(e) => { + eprintln!( + "{} region_delete {:?}: {e}", + dataset_addr, actual_region.id, + ); + } + } + } + + CleanUpOrphanedRegions::No => { + // Do not delete this region, just print a row + rows.push(Row { + dataset_id: dataset.id(), + region_id: actual_region_id, + dataset_addr, + error: String::from( + "Nexus does not know about this region!", + ), + }); + } + } + } + } + } + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .to_string(); + + println!("{}", table); + + eprintln!("found {} orphaned bytes", orphaned_bytes); + + Ok(()) +} + async fn cmd_db_validate_region_snapshots( datastore: &DataStore, ) -> Result<(), anyhow::Error> { @@ -4581,6 +4892,15 @@ async fn cmd_db_validate_region_snapshots( .or_default() .insert(region_snapshot.snapshot_id); + // If the dataset was expunged, do not attempt to contact the Crucible + // agent! + let in_service = + datastore.dataset_physical_disk_in_service(dataset.id()).await?; + + if !in_service { + continue; + } + use crucible_agent_client::types::RegionId; use crucible_agent_client::types::State; use crucible_agent_client::Client as CrucibleAgentClient; @@ -4593,11 +4913,21 @@ async fn cmd_db_validate_region_snapshots( let url = format!("http://{}", dataset_addr); let client = CrucibleAgentClient::new(&url); - let actual_region_snapshots = client + let actual_region_snapshots = match client .region_get_snapshots(&RegionId( region_snapshot.region_id.to_string(), )) - .await?; + .await + { + Ok(v) => v, + Err(e) => { + eprintln!( + "{} region_get_snapshots {:?}: {e}", + dataset_addr, region_snapshot.region_id, + ); + continue; + } + }; let snapshot_id = region_snapshot.snapshot_id.to_string(); @@ -4741,6 +5071,15 @@ async fn cmd_db_validate_region_snapshots( // Reconcile with the Crucible agents: are there snapshots that Nexus does // not know about? for (dataset, region) in datasets_and_regions { + // If the dataset was expunged, do not attempt to contact the Crucible + // agent! + let in_service = + datastore.dataset_physical_disk_in_service(dataset.id()).await?; + + if !in_service { + continue; + } + use crucible_agent_client::types::RegionId; use crucible_agent_client::types::State; use crucible_agent_client::Client as CrucibleAgentClient; @@ -4753,9 +5092,20 @@ async fn cmd_db_validate_region_snapshots( let url = format!("http://{}", dataset_addr); let client = CrucibleAgentClient::new(&url); - let actual_region_snapshots = client + let actual_region_snapshots = match client .region_get_snapshots(&RegionId(region.id().to_string())) - .await?; + .await + { + Ok(v) => v, + Err(e) => { + eprintln!( + "{} region_get_snapshots {:?}: {e}", + dataset_addr, + region.id(), + ); + continue; + } + }; let default = HashSet::default(); let nexus_region_snapshots: &HashSet = diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index fe16b4076e..eb7d74b340 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1655,6 +1655,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!( + " total requests completed ok: {}", + status.requests_completed_ok.len(), + ); + for line in &status.requests_completed_ok { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in &status.errors { println!(" > {line}"); @@ -1720,6 +1728,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!( + " total steps set to volume_deleted ok: {}", + status.step_set_volume_deleted_ok.len(), + ); + for line in &status.step_set_volume_deleted_ok { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in &status.errors { println!(" > {line}"); @@ -1831,10 +1847,11 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { Ok(status) => { println!( - " total records transitioned to done: {}", - status.records_set_to_done.len(), + " region snapshot replacement finish sagas started \ + ok: {}", + status.finish_invoked_ok.len() ); - for line in &status.records_set_to_done { + for line in &status.finish_invoked_ok { println!(" > {line}"); } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 0c1d5402fb..742ed1e840 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -631,7 +631,7 @@ task: "region_snapshot_replacement_finish" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total records transitioned to done: 0 + region snapshot replacement finish sagas started ok: 0 errors: 0 task: "region_snapshot_replacement_garbage_collection" @@ -649,6 +649,7 @@ task: "region_snapshot_replacement_start" started at (s ago) and ran for ms total requests created ok: 0 total start saga invoked ok: 0 + total requests completed ok: 0 errors: 0 task: "region_snapshot_replacement_step" @@ -659,6 +660,7 @@ task: "region_snapshot_replacement_step" total step records created ok: 0 total step garbage collect saga invoked ok: 0 total step saga invoked ok: 0 + total steps set to volume_deleted ok: 0 errors: 0 task: "saga_recovery" @@ -1081,7 +1083,7 @@ task: "region_snapshot_replacement_finish" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total records transitioned to done: 0 + region snapshot replacement finish sagas started ok: 0 errors: 0 task: "region_snapshot_replacement_garbage_collection" @@ -1099,6 +1101,7 @@ task: "region_snapshot_replacement_start" started at (s ago) and ran for ms total requests created ok: 0 total start saga invoked ok: 0 + total requests completed ok: 0 errors: 0 task: "region_snapshot_replacement_step" @@ -1109,6 +1112,7 @@ task: "region_snapshot_replacement_step" total step records created ok: 0 total step garbage collect saga invoked ok: 0 total step saga invoked ok: 0 + total steps set to volume_deleted ok: 0 errors: 0 task: "saga_recovery" diff --git a/internal-dns/resolver/src/resolver.rs b/internal-dns/resolver/src/resolver.rs index af47bb23ad..8d0e071e19 100644 --- a/internal-dns/resolver/src/resolver.rs +++ b/internal-dns/resolver/src/resolver.rs @@ -12,7 +12,7 @@ use omicron_common::address::{ get_internal_dns_server_addresses, Ipv6Subnet, AZ_PREFIX, DNS_PORT, }; use slog::{debug, error, info, trace}; -use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; +use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; #[derive(Debug, Clone, thiserror::Error)] pub enum ResolveError { @@ -323,20 +323,6 @@ impl Resolver { } } - pub async fn lookup_ip( - &self, - srv: ServiceName, - ) -> Result { - let name = srv.srv_name(); - debug!(self.log, "lookup srv"; "dns_name" => &name); - let response = self.resolver.lookup_ip(&name).await?; - let address = response - .iter() - .next() - .ok_or_else(|| ResolveError::NotFound(srv))?; - Ok(address) - } - /// Returns an iterator of [`SocketAddrV6`]'s for the targets of the given /// SRV lookup response. // SRV records have a target, which is itself another DNS name that needs @@ -534,7 +520,7 @@ mod test { let resolver = dns_server.resolver().unwrap(); let err = resolver - .lookup_ip(ServiceName::Cockroach) + .lookup_srv(ServiceName::Cockroach) .await .expect_err("Looking up non-existent service should fail"); diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 50f84d0b59..8a60b332b6 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -157,7 +157,7 @@ async fn check_execution_environment( // The only real requirement for these tests is that they're run from a // place with connectivity to the underlay network of a deployed control // plane. The easiest way to tell is to look up something in internal DNS. - resolver.lookup_ip(ServiceName::InternalDns).await.map_err(|e| { + resolver.lookup_srv(ServiceName::InternalDns).await.map_err(|e| { let text = format!( "check_execution_environment(): failed to look up internal DNS \ in the internal DNS servers.\n\n \ diff --git a/nexus/db-model/src/region_snapshot_replacement.rs b/nexus/db-model/src/region_snapshot_replacement.rs index bcbd55028d..28627d8379 100644 --- a/nexus/db-model/src/region_snapshot_replacement.rs +++ b/nexus/db-model/src/region_snapshot_replacement.rs @@ -28,6 +28,7 @@ impl_enum_type!( ReplacementDone => b"replacement_done" DeletingOldVolume => b"deleting_old_volume" Running => b"running" + Completing => b"completing" Complete => b"complete" ); @@ -46,6 +47,7 @@ impl std::str::FromStr for RegionSnapshotReplacementState { Ok(RegionSnapshotReplacementState::DeletingOldVolume) } "running" => Ok(RegionSnapshotReplacementState::Running), + "completing" => Ok(RegionSnapshotReplacementState::Completing), "complete" => Ok(RegionSnapshotReplacementState::Complete), _ => Err(format!("unrecognized value {} for enum", s)), } @@ -79,9 +81,14 @@ impl std::str::FromStr for RegionSnapshotReplacementState { /// | | /// v --- /// --- -/// Running | -/// | set in region snapshot replacement -/// | | finish background task +/// Running <-- | +/// | | +/// | | | +/// v | | +/// | | responsibility of region snapshot +/// Completing -- | replacement finish saga +/// | +/// | | /// v | /// | /// Complete --- @@ -133,6 +140,12 @@ pub struct RegionSnapshotReplacement { pub replacement_state: RegionSnapshotReplacementState, pub operating_saga_id: Option, + + /// In order for the newly created region not to be deleted inadvertently, + /// an additional reference count bump is required. This volume should live + /// as long as this request so that all necessary replacements can be + /// completed. + pub new_region_volume_id: Option, } impl RegionSnapshotReplacement { @@ -157,6 +170,7 @@ impl RegionSnapshotReplacement { old_snapshot_id, old_snapshot_volume_id: None, new_region_id: None, + new_region_volume_id: None, replacement_state: RegionSnapshotReplacementState::Requested, operating_saga_id: None, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 1f5b43669d..a8e1141db6 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1946,6 +1946,7 @@ table! { new_region_id -> Nullable, replacement_state -> crate::RegionSnapshotReplacementStateEnum, operating_saga_id -> Nullable, + new_region_volume_id -> Nullable, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 20d81944ce..4542283aac 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(117, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(118, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,7 +29,8 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), - KnownVersion::new(117, "support-bundles"), + KnownVersion::new(118, "support-bundles"), + KnownVersion::new(117, "add-completing-and-new-region-volume"), KnownVersion::new(116, "bp-physical-disk-disposition"), KnownVersion::new(115, "inv-omicron-physical-disks-generation"), KnownVersion::new(114, "crucible-ref-count-records"), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index a12e0abc1d..db3eab6175 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -127,16 +127,7 @@ pub use support_bundle::SupportBundleExpungementReport; pub use switch_port::SwitchPortSettingsCombinedResult; pub use virtual_provisioning_collection::StorageType; pub use vmm::VmmStateUpdateResult; -pub use volume::read_only_resources_associated_with_volume; -pub use volume::CrucibleResources; -pub use volume::CrucibleTargets; -pub use volume::ExistingTarget; -pub use volume::ReplacementTarget; -pub use volume::VolumeCheckoutReason; -pub use volume::VolumeReplaceResult; -pub use volume::VolumeReplacementParams; -pub use volume::VolumeToDelete; -pub use volume::VolumeWithTarget; +pub use volume::*; // Number of unique datasets required to back a region. // TODO: This should likely turn into a configuration option. diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 885cb622b8..8e59462aa3 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -422,8 +422,8 @@ impl DataStore { } } - /// Find regions on expunged disks - pub async fn find_regions_on_expunged_physical_disks( + /// Find read/write regions on expunged disks + pub async fn find_read_write_regions_on_expunged_physical_disks( &self, opctx: &OpContext, ) -> LookupResult> { @@ -450,6 +450,8 @@ impl DataStore { )) .select(dataset_dsl::id) )) + // only return read-write regions here + .filter(region_dsl::read_only.eq(false)) .select(Region::as_select()) .load_async(&*conn) .await diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index 0fda6b46ba..8aad7f2cfd 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -37,6 +37,13 @@ impl DataStore { opctx: &OpContext, region: &Region, ) -> Result { + if region.read_only() { + return Err(Error::invalid_request(format!( + "region {} is read-only", + region.id(), + ))); + } + let request = RegionReplacement::for_region(region); let request_id = request.id; @@ -52,19 +59,22 @@ impl DataStore { opctx: &OpContext, request: RegionReplacement, ) -> Result<(), Error> { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper("insert_region_replacement_request") .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_replacement::dsl; - Self::volume_repair_insert_query( + Self::volume_repair_insert_in_txn( + &conn, + err, request.volume_id, request.id, ) - .execute_async(&conn) .await?; diesel::insert_into(dsl::region_replacement) @@ -76,7 +86,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_replacement_request_by_id( @@ -908,6 +924,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { @@ -919,6 +936,20 @@ mod test { let region_2_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionReplacement::new(region_1_id, volume_id); let request_2 = RegionReplacement::new(region_2_id, volume_id); @@ -950,6 +981,20 @@ mod test { let region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::Running; @@ -1042,6 +1087,20 @@ mod test { let region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::ReplacementDone; diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 76a83cca2a..90b014c582 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -16,7 +16,6 @@ use crate::db::model::RegionSnapshotReplacement; use crate::db::model::RegionSnapshotReplacementState; use crate::db::model::RegionSnapshotReplacementStep; use crate::db::model::RegionSnapshotReplacementStepState; -use crate::db::model::VolumeRepair; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::update_and_check::UpdateAndCheck; @@ -93,6 +92,7 @@ impl DataStore { request: RegionSnapshotReplacement, volume_id: Uuid, ) -> Result<(), Error> { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper( @@ -100,20 +100,24 @@ impl DataStore { ) .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_snapshot_replacement::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; - // An associated volume repair record isn't _strictly_ needed: - // snapshot volumes should never be directly constructed, and - // therefore won't ever have an associated Upstairs that - // receives a volume replacement request. However it's being - // done in an attempt to be overly cautious. - - diesel::insert_into(volume_repair_dsl::volume_repair) - .values(VolumeRepair { volume_id, repair_id: request.id }) - .execute_async(&conn) - .await?; + // An associated volume repair record isn't _strictly_ + // needed: snapshot volumes should never be directly + // constructed, and therefore won't ever have an associated + // Upstairs that receives a volume replacement request. + // However it's being done in an attempt to be overly + // cautious, and it validates that the volume exist: + // otherwise it would be possible to create a region + // snapshot replacement request for a volume that didn't + // exist! + + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, request.id, + ) + .await?; diesel::insert_into(dsl::region_snapshot_replacement) .values(request) @@ -124,7 +128,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_request_by_id( @@ -342,6 +352,7 @@ impl DataStore { region_snapshot_replacement_id: Uuid, operating_saga_id: Uuid, new_region_id: Uuid, + new_region_volume_id: Uuid, old_snapshot_volume_id: Uuid, ) -> Result<(), Error> { use db::schema::region_snapshot_replacement::dsl; @@ -357,6 +368,7 @@ impl DataStore { .eq(RegionSnapshotReplacementState::ReplacementDone), dsl::old_snapshot_volume_id.eq(Some(old_snapshot_volume_id)), dsl::new_region_id.eq(Some(new_region_id)), + dsl::new_region_volume_id.eq(Some(new_region_volume_id)), dsl::operating_saga_id.eq(Option::::None), )) .check_if_exists::( @@ -375,6 +387,8 @@ impl DataStore { && record.replacement_state == RegionSnapshotReplacementState::ReplacementDone && record.new_region_id == Some(new_region_id) + && record.new_region_volume_id + == Some(new_region_volume_id) && record.old_snapshot_volume_id == Some(old_snapshot_volume_id) { @@ -557,15 +571,201 @@ impl DataStore { } } - /// Transition a RegionSnapshotReplacement record from Running to Complete. - /// Also removes the `volume_repair` record that is taking a "lock" on the - /// Volume. Note this doesn't occur from a saga context, and therefore 1) - /// doesn't accept an operating saga id parameter, and 2) checks that - /// operating_saga_id is null for the corresponding record. + /// Transition a RegionSnapshotReplacement record from Running to + /// Completing, setting a unique id at the same time. + pub async fn set_region_snapshot_replacement_completing( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + ) + .filter(dsl::operating_saga_id.is_null()) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + dsl::operating_saga_id.eq(operating_saga_id), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == Some(operating_saga_id) + && record.replacement_state + == RegionSnapshotReplacementState::Completing + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionReplacement record from Completing to Running, + /// clearing the operating saga id. + pub async fn undo_set_region_snapshot_replacement_completing( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + ) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + dsl::operating_saga_id.eq(Option::::None), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == None + && record.replacement_state + == RegionSnapshotReplacementState::Running + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionSnapshotReplacement record from Completing to + /// Complete. Also removes the `volume_repair` record that is taking a + /// "lock" on the Volume. pub async fn set_region_snapshot_replacement_complete( &self, opctx: &OpContext, region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "set_region_snapshot_replacement_complete", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_id), + ), + ) + .execute_async(&conn) + .await?; + + use db::schema::region_snapshot_replacement::dsl; + + let result = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + ) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete), + dsl::operating_saga_id.eq(Option::::None), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementState::Complete + { + Ok(()) + } else { + Err(err.bail(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + )))) + } + } + } + } + }) + .await + .map_err(|e| match err.take() { + Some(error) => error, + None => public_error_from_diesel(e, ErrorHandler::Server), + }) + } + + /// Transition a RegionSnapshotReplacement record from Requested to Complete + /// - this is required when the region snapshot is hard-deleted, which means + /// that all volume references are gone and no replacement is required. Also + /// removes the `volume_repair` record that is taking a "lock" on the + /// Volume. + pub async fn set_region_snapshot_replacement_complete_from_requested( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, ) -> Result<(), Error> { type TxnError = TransactionError; @@ -577,6 +777,7 @@ impl DataStore { let err = err.clone(); async move { use db::schema::volume_repair::dsl as volume_repair_dsl; + use db::schema::region_snapshot_replacement::dsl; diesel::delete( volume_repair_dsl::volume_repair.filter( @@ -587,17 +788,16 @@ impl DataStore { .execute_async(&conn) .await?; - use db::schema::region_snapshot_replacement::dsl; - let result = diesel::update(dsl::region_snapshot_replacement) .filter(dsl::id.eq(region_snapshot_replacement_id)) .filter( dsl::replacement_state - .eq(RegionSnapshotReplacementState::Running), + .eq(RegionSnapshotReplacementState::Requested), ) .filter(dsl::operating_saga_id.is_null()) - .set((dsl::replacement_state - .eq(RegionSnapshotReplacementState::Complete),)) + .filter(dsl::new_region_volume_id.is_null()) + .set(dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete)) .check_if_exists::( region_snapshot_replacement_id, ) @@ -621,7 +821,7 @@ impl DataStore { region_snapshot_replacement_id, record.replacement_state, record.operating_saga_id, - ), + ) )))) } } @@ -651,6 +851,7 @@ impl DataStore { opctx: &OpContext, request: RegionSnapshotReplacementStep, ) -> Result { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper( @@ -658,10 +859,10 @@ impl DataStore { ) .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_snapshot_replacement_step::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; // Skip inserting this new record if we found another region // snapshot replacement step with this volume in the step's @@ -714,13 +915,13 @@ impl DataStore { // volume replacement: create an associated volume repair // record. - diesel::insert_into(volume_repair_dsl::volume_repair) - .values(VolumeRepair { - volume_id: request.volume_id, - repair_id: request.id, - }) - .execute_async(&conn) - .await?; + Self::volume_repair_insert_in_txn( + &conn, + err, + request.volume_id, + request.id, + ) + .await?; let request_id = request.id; @@ -733,7 +934,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_step_by_id( @@ -1073,6 +1280,87 @@ impl DataStore { Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), } } + + /// Transition from Requested to VolumeDeleted, and remove the associated + /// `volume_repair` record. This occurs when the associated snapshot's + /// volume is deleted. Note this doesn't occur from a saga context, and + /// therefore 1) doesn't accept an operating saga id parameter, and 2) + /// checks that operating_saga_id is null for the corresponding record. + pub async fn set_region_snapshot_replacement_step_volume_deleted_from_requested( + &self, + opctx: &OpContext, + region_snapshot_replacement_step: RegionSnapshotReplacementStep, + ) -> Result<(), Error> { + let conn = self.pool_connection_authorized(opctx).await?; + let err = OptionalError::new(); + + self.transaction_retry_wrapper( + "set_region_snapshot_replacement_complete", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + + async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_step.id), + ), + ) + .execute_async(&conn) + .await?; + + use db::schema::region_snapshot_replacement_step::dsl; + let result = + diesel::update(dsl::region_snapshot_replacement_step) + .filter(dsl::id.eq(region_snapshot_replacement_step.id)) + .filter(dsl::operating_saga_id.is_null()) + .filter(dsl::old_snapshot_volume_id.is_null()) + .filter( + dsl::replacement_state.eq( + RegionSnapshotReplacementStepState::Requested, + ), + ) + .set(dsl::replacement_state.eq( + RegionSnapshotReplacementStepState::VolumeDeleted, + )) + .check_if_exists::( + region_snapshot_replacement_step.id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementStepState::VolumeDeleted + { + Ok(()) + } else { + Err(err.bail(Error::conflict(format!( + "region snapshot replacement step {} set \ + to {:?} (operating saga id {:?})", + region_snapshot_replacement_step.id, + record.replacement_state, + record.operating_saga_id, + )))) + } + } + } + } + }) + .await + .map_err(|e| match err.take() { + Some(error) => error, + None => public_error_from_diesel(e, ErrorHandler::Server), + }) + } } #[cfg(test)] @@ -1083,6 +1371,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { @@ -1100,6 +1389,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionSnapshotReplacement::new( dataset_1_id, region_1_id, @@ -1146,6 +1449,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionSnapshotReplacement::new( dataset_1_id, region_1_id, @@ -1182,6 +1499,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); @@ -1214,11 +1545,25 @@ mod test { // Insert some replacement steps, and make sure counting works + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); let result = datastore .insert_region_snapshot_replacement_step(&opctx, step) @@ -1247,11 +1592,25 @@ mod test { 1, ); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let mut step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let mut step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Running; @@ -1283,11 +1642,25 @@ mod test { 1, ); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let mut step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let mut step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); // VolumeDeleted does not count as "in-progress" step.replacement_state = @@ -1337,6 +1710,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let step = RegionSnapshotReplacementStep::new(Uuid::new_v4(), volume_id); let first_request_id = step.id; @@ -1431,6 +1818,22 @@ mod test { let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut request = RegionSnapshotReplacement::new( DatasetUuid::new_v4(), Uuid::new_v4(), @@ -1442,9 +1845,7 @@ mod test { datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -1457,8 +1858,24 @@ mod test { .unwrap() .is_empty()); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; let result = datastore @@ -1468,8 +1885,24 @@ mod test { assert!(matches!(result, InsertStepResult::Inserted { .. })); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; let result = datastore @@ -1509,6 +1942,34 @@ mod test { let volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + datastore + .volume_create(nexus_db_model::Volume::new( + old_snapshot_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = RegionSnapshotReplacementStep::new(request_id, volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; @@ -1558,6 +2019,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); datastore diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 4e0d1ccac1..505533da50 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -59,6 +59,7 @@ use serde::Deserialize; use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; +use std::collections::HashSet; use std::collections::VecDeque; use std::net::AddrParseError; use std::net::SocketAddr; @@ -179,6 +180,23 @@ enum ReplaceSnapshotError { MultipleResourceUsageRecords(String), } +/// Crucible resources freed by previous volume deletes +#[derive(Debug, Serialize, Deserialize)] +pub struct FreedCrucibleResources { + /// Regions that previously could not be deleted (often due to region + /// snaphots) that were freed by a volume delete + pub datasets_and_regions: Vec<(Dataset, Region)>, + + /// Previously soft-deleted volumes that can now be hard-deleted + pub volumes: Vec, +} + +impl FreedCrucibleResources { + pub fn is_empty(&self) -> bool { + self.datasets_and_regions.is_empty() && self.volumes.is_empty() + } +} + impl DataStore { async fn volume_create_in_txn( conn: &async_bb8_diesel::Connection, @@ -412,7 +430,7 @@ impl DataStore { }) } - async fn volume_get_impl( + pub(super) async fn volume_get_impl( conn: &async_bb8_diesel::Connection, volume_id: Uuid, ) -> Result, diesel::result::Error> { @@ -1115,12 +1133,12 @@ impl DataStore { .await } - /// Find regions for deleted volumes that do not have associated region - /// snapshots and are not being used by any other non-deleted volumes, and - /// return them for garbage collection + /// Find read/write regions for deleted volumes that do not have associated + /// region snapshots and are not being used by any other non-deleted + /// volumes, and return them for garbage collection pub async fn find_deleted_volume_regions( &self, - ) -> ListResultVec<(Dataset, Region, Option)> { + ) -> LookupResult { let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("find_deleted_volume_regions") .transaction(&conn, |conn| async move { @@ -1132,8 +1150,7 @@ impl DataStore { async fn find_deleted_volume_regions_in_txn( conn: &async_bb8_diesel::Connection, - ) -> Result)>, diesel::result::Error> - { + ) -> Result { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; @@ -1179,6 +1196,9 @@ impl DataStore { let mut deleted_regions = Vec::with_capacity(unfiltered_deleted_regions.len()); + let mut volume_set: HashSet = + HashSet::with_capacity(unfiltered_deleted_regions.len()); + for (dataset, region, region_snapshot, volume) in unfiltered_deleted_regions { @@ -1212,10 +1232,61 @@ impl DataStore { continue; } + if let Some(volume) = &volume { + volume_set.insert(volume.id()); + } + deleted_regions.push((dataset, region, volume)); } - Ok(deleted_regions) + let regions_for_deletion: HashSet = + deleted_regions.iter().map(|(_, region, _)| region.id()).collect(); + + let mut volumes = Vec::with_capacity(deleted_regions.len()); + + for volume_id in volume_set { + // Do not return a volume hard-deletion if there are still lingering + // read/write regions, unless all those lingering read/write regions + // will be deleted from the result of returning from this function. + let allocated_rw_regions: HashSet = + Self::get_allocated_regions_query(volume_id) + .get_results_async::<(Dataset, Region)>(conn) + .await? + .into_iter() + .filter_map(|(_, region)| { + if !region.read_only() { + Some(region.id()) + } else { + None + } + }) + .collect(); + + if allocated_rw_regions.is_subset(®ions_for_deletion) { + // If all the allocated rw regions for this volume are in the + // set of regions being returned for deletion, then we can + // hard-delete this volume. Read-only region accounting should + // have already been updated by soft-deleting this volume. + // + // Note: we'll be in this branch if allocated_rw_regions is + // empty. I believe the only time we'll hit this empty case is + // when the volume is fully populated with read-only resources + // (read-only regions and region snapshots). + volumes.push(volume_id); + } else { + // Not all r/w regions allocated to this volume are being + // deleted here, so we can't hard-delete the volume yet. + } + } + + Ok(FreedCrucibleResources { + datasets_and_regions: deleted_regions + .into_iter() + .map(|(d, r, _)| (d, r)) + .collect(), + + volumes, + }) } pub async fn read_only_resources_associated_with_volume( @@ -2621,8 +2692,11 @@ pub enum VolumeReplaceResult { // this call performed the replacement Done, - // the "existing" volume was deleted - ExistingVolumeDeleted, + // the "existing" volume was soft deleted + ExistingVolumeSoftDeleted, + + // the "existing" volume was hard deleted + ExistingVolumeHardDeleted, } impl DataStore { @@ -2747,14 +2821,14 @@ impl DataStore { // perform the region replacement now, and this will short-circuit // the rest of the process. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeHardDeleted); }; if old_volume.time_deleted.is_some() { // Existing volume was soft-deleted, so return here for the same // reason: the region replacement process should be short-circuited // now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeSoftDeleted); } let old_vcr: VolumeConstructionRequest = @@ -3001,14 +3075,14 @@ impl DataStore { // perform the region replacement now, and this will short-circuit // the rest of the process. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeHardDeleted); }; if old_volume.time_deleted.is_some() { // Existing volume was soft-deleted, so return here for the same // reason: the region replacement process should be short-circuited // now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeSoftDeleted); } let old_vcr: VolumeConstructionRequest = @@ -3682,7 +3756,15 @@ impl DataStore { let mut paginator = Paginator::new(SQL_BATCH_SIZE); let conn = self.pool_connection_authorized(opctx).await?; - let needle = address.to_string(); + let needle = match address { + SocketAddr::V4(_) => { + return Err(Error::internal_error(&format!( + "find_volumes_referencing_socket_addr not ipv6: {address}" + ))); + } + + SocketAddr::V6(addr) => addr, + }; while let Some(p) = paginator.next() { use db::schema::volume::dsl; @@ -3699,7 +3781,23 @@ impl DataStore { paginator = p.found_batch(&haystack, &|r| r.id()); for volume in haystack { - if volume.data().contains(&needle) { + let vcr: VolumeConstructionRequest = + match serde_json::from_str(&volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(Error::internal_error(&format!( + "cannot deserialize volume data for {}: {e}", + volume.id(), + ))); + } + }; + + let rw_reference = region_in_vcr(&vcr, &needle) + .map_err(|e| Error::internal_error(&e.to_string()))?; + let ro_reference = read_only_target_in_vcr(&vcr, &needle) + .map_err(|e| Error::internal_error(&e.to_string()))?; + + if rw_reference || ro_reference { volumes.push(volume); } } diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs index 7ea88c8542..598d9d77a2 100644 --- a/nexus/db-queries/src/db/datastore/volume_repair.rs +++ b/nexus/db-queries/src/db/datastore/volume_repair.rs @@ -11,6 +11,8 @@ use crate::db::datastore::RunnableQuery; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::VolumeRepair; +use crate::db::DbConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::result::DatabaseErrorKind; @@ -19,14 +21,75 @@ use omicron_common::api::external::Error; use uuid::Uuid; impl DataStore { - pub(super) fn volume_repair_insert_query( + /// Insert a volume repair record, taking a "lock" on the volume pointed to + /// by volume id with some repair id. + /// + /// If there exists a record that has a matching volume id and repair id, + /// return Ok(()). + /// + /// If there is no volume that matches the given volume id, return an error: + /// it should not be possible to lock a volume that does not exist! Note + /// that it is possible to lock a soft-deleted volume. + /// + /// If there is already an existing record that has a matching volume id but + /// a different repair id, then this function returns an Error::conflict. + pub(super) async fn volume_repair_insert_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, volume_id: Uuid, repair_id: Uuid, - ) -> impl RunnableQuery { + ) -> Result<(), diesel::result::Error> { use db::schema::volume_repair::dsl; - diesel::insert_into(dsl::volume_repair) + // If a lock that matches the arguments exists already, return Ok + // + // Note: if rerunning this function (for example if a saga node was + // rerun), the volume could have existed when this lock was inserted the + // first time, but have been deleted now. + let maybe_lock = dsl::volume_repair + .filter(dsl::repair_id.eq(repair_id)) + .filter(dsl::volume_id.eq(volume_id)) + .first_async::(conn) + .await + .optional()?; + + if maybe_lock.is_some() { + return Ok(()); + } + + // Do not allow a volume repair record to be created if the volume does + // not exist, or was hard-deleted! + let maybe_volume = Self::volume_get_impl(conn, volume_id).await?; + + if maybe_volume.is_none() { + return Err(err.bail(Error::invalid_request(format!( + "cannot create record: volume {volume_id} does not exist" + )))); + } + + // Do not check for soft-deletion here: We may want to request locks for + // soft-deleted volumes. + + match diesel::insert_into(dsl::volume_repair) .values(VolumeRepair { volume_id, repair_id }) + .execute_async(conn) + .await + { + Ok(_) => Ok(()), + + Err(e) => match e { + DieselError::DatabaseError( + DatabaseErrorKind::UniqueViolation, + ref error_information, + ) if error_information.constraint_name() + == Some("volume_repair_pkey") => + { + Err(err.bail(Error::conflict("volume repair lock"))) + } + + _ => Err(e), + }, + } } pub async fn volume_repair_lock( @@ -36,21 +99,25 @@ impl DataStore { repair_id: Uuid, ) -> Result<(), Error> { let conn = self.pool_connection_authorized(opctx).await?; - Self::volume_repair_insert_query(volume_id, repair_id) - .execute_async(&*conn) + let err = OptionalError::new(); + + self.transaction_retry_wrapper("volume_repair_lock") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, repair_id, + ) + .await + } + }) .await - .map(|_| ()) - .map_err(|e| match e { - DieselError::DatabaseError( - DatabaseErrorKind::UniqueViolation, - ref error_information, - ) if error_information.constraint_name() - == Some("volume_repair_pkey") => - { - Error::conflict("volume repair lock") + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - - _ => public_error_from_diesel(e, ErrorHandler::Server), }) } @@ -102,6 +169,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn volume_lock_conflict_error_returned() { @@ -113,6 +181,20 @@ mod test { let lock_2 = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore.volume_repair_lock(&opctx, volume_id, lock_1).await.unwrap(); let err = datastore @@ -125,4 +207,55 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + /// Assert that you can't take a volume repair lock if the volume does not + /// exist yet! + #[tokio::test] + async fn volume_lock_should_fail_without_volume() { + let logctx = + dev::test_setup_log("volume_lock_should_fail_without_volume"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let lock_1 = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_repair_lock(&opctx, volume_id, lock_1) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn volume_lock_relock_allowed() { + let logctx = dev::test_setup_log("volume_lock_relock_allowed"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let lock_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + datastore.volume_repair_lock(&opctx, volume_id, lock_id).await.unwrap(); + datastore.volume_repair_lock(&opctx, volume_id, lock_id).await.unwrap(); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 19cff79057..354a10ad74 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -850,7 +850,7 @@ impl BackgroundTasksInitializer { done", period: config.region_snapshot_replacement_finish.period_secs, task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( - datastore, + datastore, sagas, )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index f86ba8eb8f..caa262bc8c 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -23,6 +23,7 @@ use nexus_db_model::RegionReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionReplacementStatus; +use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TypedUuid; use serde_json::json; @@ -42,7 +43,7 @@ impl RegionReplacementDetector { &self, serialized_authn: authn::saga::Serialized, request: RegionReplacement, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_replacement_start::Params { serialized_authn, request, @@ -68,17 +69,18 @@ impl BackgroundTask for RegionReplacementDetector { let mut status = RegionReplacementStatus::default(); - // Find regions on expunged physical disks + // Find read/write regions on expunged physical disks let regions_to_be_replaced = match self .datastore - .find_regions_on_expunged_physical_disks(opctx) + .find_read_write_regions_on_expunged_physical_disks(opctx) .await { Ok(regions) => regions, Err(e) => { let s = format!( - "find_regions_on_expunged_physical_disks failed: {e}" + "find_read_write_regions_on_expunged_physical_disks \ + failed: {e}" ); error!(&log, "{s}"); status.errors.push(s); @@ -134,15 +136,31 @@ impl BackgroundTask for RegionReplacementDetector { } Err(e) => { - let s = format!( - "error adding region replacement request for \ - region {} volume id {}: {e}", - region.id(), - region.volume_id(), - ); - error!(&log, "{s}"); + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + let s = format!( + "error adding region replacement \ + request for region {} volume id {}: \ + {e}", + region.id(), + region.volume_id(), + ); + error!(&log, "{s}"); + + status.errors.push(s); + } + } - status.errors.push(s); continue; } } @@ -173,7 +191,9 @@ impl BackgroundTask for RegionReplacementDetector { // If the replacement request is in the `requested` state and // the request's volume was soft-deleted or hard-deleted, avoid // sending the start request and instead transition the request - // to completed + // to completed. Note the saga will do the right thing if the + // volume is deleted, but this avoids the overhead of starting + // it. let volume_deleted = match self .datastore @@ -314,6 +334,21 @@ mod test { // Add a region replacement request for a fake region let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); let request_id = request.id; diff --git a/nexus/src/app/background/tasks/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs index e7fe0d6338..6cc28f9dfd 100644 --- a/nexus/src/app/background/tasks/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -258,6 +258,7 @@ mod test { use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -288,6 +289,20 @@ mod test { let new_region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::Running; @@ -382,6 +397,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state ReplacementDone. Set the new_region_id to the region created // above. @@ -481,6 +510,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state Running. Set the new_region_id to the region created above. let request = { @@ -630,6 +673,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state Running. Set the new_region_id to the region created above. let request = { diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 0eebd37fb7..61a84c579d 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -8,9 +8,15 @@ //! Once all related region snapshot replacement steps are done, the region //! snapshot replacement can be completed. +use crate::app::authn; use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; +use crate::app::sagas; +use crate::app::sagas::region_snapshot_replacement_finish::*; +use crate::app::sagas::NexusSaga; use futures::future::BoxFuture; use futures::FutureExt; +use nexus_db_model::RegionSnapshotReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus; @@ -19,11 +25,31 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFinishDetector { datastore: Arc, + sagas: Arc, } impl RegionSnapshotReplacementFinishDetector { - pub fn new(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore } + pub fn new(datastore: Arc, sagas: Arc) -> Self { + RegionSnapshotReplacementFinishDetector { datastore, sagas } + } + + async fn send_finish_request( + &self, + opctx: &OpContext, + request: RegionSnapshotReplacement, + ) -> Result<(), omicron_common::api::external::Error> { + let params = sagas::region_snapshot_replacement_finish::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + request, + }; + + let saga_dag = SagaRegionSnapshotReplacementFinish::prepare(¶ms)?; + + // We only care that the saga was started, and don't wish to wait for it + // to complete, so use `StartSaga::saga_start`, rather than `saga_run`. + self.sagas.saga_start(saga_dag).await?; + + Ok(()) } async fn transition_requests_to_done( @@ -120,21 +146,23 @@ impl RegionSnapshotReplacementFinishDetector { } }; - // Transition region snapshot replacement to Complete - match self - .datastore - .set_region_snapshot_replacement_complete(opctx, request.id) - .await - { + let request_id = request.id; + + match self.send_finish_request(opctx, request).await { Ok(()) => { - let s = format!("set request {} to done", request.id); + let s = format!( + "region snapshot replacement finish invoked ok for \ + {request_id}" + ); + info!(&log, "{s}"); - status.records_set_to_done.push(s); + status.finish_invoked_ok.push(s); } Err(e) => { let s = format!( - "marking snapshot replacement as done failed: {e}" + "invoking region snapshot replacement finish for \ + {request_id} failed: {e}", ); error!(&log, "{s}"); status.errors.push(s); @@ -170,6 +198,7 @@ mod test { use nexus_db_queries::db::datastore::region_snapshot_replacement; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -186,8 +215,10 @@ mod test { datastore.clone(), ); - let mut task = - RegionSnapshotReplacementFinishDetector::new(datastore.clone()); + let mut task = RegionSnapshotReplacementFinishDetector::new( + datastore.clone(), + nexus.sagas.clone(), + ); // Noop test let result: RegionSnapshotReplacementFinishStatus = @@ -208,11 +239,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -232,6 +277,7 @@ mod test { .unwrap(); let new_region_id = Uuid::new_v4(); + let new_region_volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); datastore @@ -240,6 +286,7 @@ mod test { request_id, operating_saga_id, new_region_id, + new_region_volume_id, old_snapshot_volume_id, ) .await @@ -267,14 +314,44 @@ mod test { let operating_saga_id = Uuid::new_v4(); + let step_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step_1 = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step_1.replacement_state = RegionSnapshotReplacementStepState::Complete; step_1.operating_saga_id = Some(operating_saga_id); let step_1_id = step_1.id; + let step_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step_2 = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step_2.replacement_state = RegionSnapshotReplacementStepState::Complete; step_2.operating_saga_id = Some(operating_saga_id); let step_2_id = step_2.id; @@ -335,8 +412,9 @@ mod test { assert_eq!( result, RegionSnapshotReplacementFinishStatus { - records_set_to_done: vec![format!( - "set request {request_id} to done" + finish_invoked_ok: vec![format!( + "region snapshot replacement finish invoked ok for \ + {request_id}" )], errors: vec![], }, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index db30e1d074..57bbf3741c 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -155,6 +155,7 @@ mod test { use nexus_db_model::RegionSnapshotReplacementState; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -199,11 +200,25 @@ mod test { let request_1_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -219,11 +234,25 @@ mod test { let request_2_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 04d86a1268..f2b82a3943 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -23,6 +23,7 @@ use nexus_db_model::RegionSnapshotReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; +use omicron_common::api::external::Error; use serde_json::json; use std::sync::Arc; @@ -40,7 +41,7 @@ impl RegionSnapshotReplacementDetector { &self, serialized_authn: authn::saga::Serialized, request: RegionSnapshotReplacement, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_snapshot_replacement_start::Params { serialized_authn, request, @@ -138,17 +139,33 @@ impl RegionSnapshotReplacementDetector { } Err(e) => { - let s = - format!("error creating replacement request: {e}"); - - error!( - &log, - "{s}"; - "snapshot_id" => %region_snapshot.snapshot_id, - "region_id" => %region_snapshot.region_id, - "dataset_id" => %region_snapshot.dataset_id, - ); - status.errors.push(s); + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + let s = format!( + "error creating replacement request: {e}" + ); + + error!( + &log, + "{s}"; + "snapshot_id" => %region_snapshot.snapshot_id, + "region_id" => %region_snapshot.region_id, + "dataset_id" => %region_snapshot.dataset_id, + ); + + status.errors.push(s); + } + } } } } @@ -185,6 +202,67 @@ impl RegionSnapshotReplacementDetector { for request in requests { let request_id = request.id; + // If the region snapshot is gone, then there are no more references + // in any volume, and the whole region snapshot replacement can be + // fast-tracked to Complete. + + let maybe_region_snapshot = match self + .datastore + .region_snapshot_get( + request.old_dataset_id.into(), + request.old_region_id, + request.old_snapshot_id, + ) + .await + { + Ok(maybe_region_snapshot) => maybe_region_snapshot, + + Err(e) => { + let s = format!("query for region snapshot failed: {e}"); + + error!( + &log, + "{s}"; + "request.snapshot_id" => %request.old_snapshot_id, + "request.region_id" => %request.old_region_id, + "request.dataset_id" => %request.old_dataset_id, + ); + status.errors.push(s); + return; + } + }; + + if maybe_region_snapshot.is_none() { + match self + .datastore + .set_region_snapshot_replacement_complete_from_requested( + &opctx, request.id, + ) + .await + { + Ok(()) => { + let s = format!( + "region snapshot replacement {request_id} \ + completed ok" + ); + info!(&log, "{s}"); + status.requests_completed_ok.push(s); + } + + Err(e) => { + let s = format!( + "query to set region snapshot request state \ + to complete failed: {e}" + ); + + error!(&log, "{s}"; "request.id" => %request_id); + status.errors.push(s); + } + } + + continue; + } + let result = self .send_start_request( authn::saga::Serialized::for_opctx(opctx), @@ -269,6 +347,7 @@ mod test { use nexus_db_model::Snapshot; use nexus_db_model::SnapshotIdentity; use nexus_db_model::SnapshotState; + use nexus_db_model::VolumeResourceUsage; use nexus_db_queries::authz; use nexus_db_queries::db::lookup::LookupPath; use nexus_test_utils::resource_helpers::create_project; @@ -276,6 +355,8 @@ mod test { use omicron_common::api::external; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; + use sled_agent_client::types::CrucibleOpts; + use sled_agent_client::types::VolumeConstructionRequest; use std::collections::BTreeMap; use uuid::Uuid; @@ -309,19 +390,43 @@ mod test { // Add a region snapshot replacement request for a fake region snapshot - let request = RegionSnapshotReplacement::new( - DatasetUuid::new_v4(), // dataset id - Uuid::new_v4(), // region id - Uuid::new_v4(), // snapshot id + let dataset_id = DatasetUuid::new_v4(); + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + "[::]:12345".to_string(), ); + datastore.region_snapshot_create(region_snapshot).await.unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -339,6 +444,7 @@ mod test { "region snapshot replacement start invoked ok for \ {request_id}" )], + requests_completed_ok: vec![], errors: vec![], }, ); @@ -407,6 +513,22 @@ mod test { .await .unwrap(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .project_ensure_snapshot( &opctx, @@ -426,7 +548,7 @@ mod test { project_id, disk_id: Uuid::new_v4(), - volume_id: Uuid::new_v4(), + volume_id, destination_volume_id: Uuid::new_v4(), gen: Generation::new(), @@ -508,4 +630,194 @@ mod test { dataset_to_zpool.get(&first_zpool.id.to_string()).unwrap(); assert_eq!(&request.old_dataset_id.to_string(), dataset_id); } + + #[nexus_test(server = crate::Server)] + async fn test_delete_region_snapshot_replacement_volume_causes_complete( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementDetector::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementStartStatus::default()); + assert_eq!(starter.count_reset(), 0); + + // The volume reference counting machinery needs a fake dataset to exist + // (region snapshots are joined with the dataset table when creating the + // CrucibleResources object) + + let disk_test = DiskTest::new(cptestctx).await; + + let dataset_id = disk_test.zpools().next().unwrap().datasets[0].id; + + // Add a region snapshot replacement request for a fake region snapshot + + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + "[::1]:12345".to_string(), + ); + + datastore + .region_snapshot_create(region_snapshot.clone()) + .await + .unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + + let request_id = request.id; + + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + // the region snapshot + String::from("[::1]:12345"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Assert usage + + let records = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id.into(), + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert!(!records.is_empty()); + assert_eq!(records[0].volume_id, volume_id); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, request, volume_id, + ) + .await + .unwrap(); + + // Before the task starts, soft-delete the volume, and delete the + // region snapshot (like the volume delete saga would do). + + let crucible_resources = + datastore.soft_delete_volume(volume_id).await.unwrap(); + + // Assert no more usage + + let records = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id.into(), + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert!(records.is_empty()); + + // The region snapshot should have been returned for deletion + + let datasets_and_snapshots = + datastore.snapshots_to_delete(&crucible_resources).await.unwrap(); + + assert!(!datasets_and_snapshots.is_empty()); + + let region_snapshot_to_delete = &datasets_and_snapshots[0].1; + + assert_eq!( + region_snapshot_to_delete.dataset_id, + region_snapshot.dataset_id, + ); + assert_eq!( + region_snapshot_to_delete.region_id, + region_snapshot.region_id, + ); + assert_eq!( + region_snapshot_to_delete.snapshot_id, + region_snapshot.snapshot_id, + ); + + // So delete it! + + datastore + .region_snapshot_remove( + region_snapshot_to_delete.dataset_id.into(), + region_snapshot_to_delete.region_id, + region_snapshot_to_delete.snapshot_id, + ) + .await + .unwrap(); + + // Activate the task - it should pick the request up but not attempt to + // run the start saga + + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + assert_eq!( + result, + RegionSnapshotReplacementStartStatus { + requests_created_ok: vec![], + start_invoked_ok: vec![], + requests_completed_ok: vec![format!( + "region snapshot replacement {request_id} completed ok" + )], + errors: vec![], + }, + ); + + // Assert start saga not invoked + assert_eq!(starter.count_reset(), 0); + } } diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index ac259ecba8..f481126312 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -32,10 +32,11 @@ use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_model::RegionSnapshotReplacementStep; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::datastore::region_snapshot_replacement; +use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use omicron_common::api::external::Error; use serde_json::json; use std::sync::Arc; @@ -53,7 +54,7 @@ impl RegionSnapshotReplacementFindAffected { &self, opctx: &OpContext, request: RegionSnapshotReplacementStep, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_snapshot_replacement_step::Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), request, @@ -70,7 +71,7 @@ impl RegionSnapshotReplacementFindAffected { &self, opctx: &OpContext, request: RegionSnapshotReplacementStep, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let Some(old_snapshot_volume_id) = request.old_snapshot_volume_id else { // This state is illegal! @@ -79,9 +80,7 @@ impl RegionSnapshotReplacementFindAffected { request.id, ); - return Err(omicron_common::api::external::Error::internal_error( - &s, - )); + return Err(Error::internal_error(&s)); }; let params = @@ -315,6 +314,21 @@ impl RegionSnapshotReplacementFindAffected { // functions execute), an indefinite amount of work would be // created, continually "moving" the snapshot_addr from // temporary volume to temporary volume. + // + // If the volume was soft deleted, then skip making a step for + // it. + + if volume.time_deleted.is_some() { + info!( + log, + "volume was soft-deleted, skipping creating a step for \ + it"; + "request id" => ?request.id, + "volume id" => ?volume.id(), + ); + + continue; + } match self .datastore @@ -326,7 +340,7 @@ impl RegionSnapshotReplacementFindAffected { .await { Ok(insertion_result) => match insertion_result { - region_snapshot_replacement::InsertStepResult::Inserted { step_id } => { + InsertStepResult::Inserted { step_id } => { let s = format!("created {step_id}"); info!( log, @@ -337,7 +351,7 @@ impl RegionSnapshotReplacementFindAffected { status.step_records_created_ok.push(s); } - region_snapshot_replacement::InsertStepResult::AlreadyHandled { .. } => { + InsertStepResult::AlreadyHandled { .. } => { info!( log, "step already exists for volume id"; @@ -345,17 +359,32 @@ impl RegionSnapshotReplacementFindAffected { "volume id" => ?volume.id(), ); } - } + }, Err(e) => { let s = format!("error creating step request: {e}"); - error!( + warn!( log, "{s}"; "request id" => ?request.id, "volume id" => ?volume.id(), ); - status.errors.push(s); + + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + status.errors.push(s); + } + } } } } @@ -392,13 +421,81 @@ impl RegionSnapshotReplacementFindAffected { }; for request in step_requests { - let request_id = request.id; + let request_step_id = request.id; + + // Check if the volume was deleted _after_ the replacement step was + // created. Avoid launching the region snapshot replacement step + // saga if it was deleted: the saga will do the right thing if it is + // deleted, but this avoids the overhead of starting it. + + let volume_deleted = + match self.datastore.volume_deleted(request.volume_id).await { + Ok(volume_deleted) => volume_deleted, + + Err(e) => { + let s = format!( + "error checking if volume id {} was \ + deleted: {e}", + request.volume_id, + ); + error!(&log, "{s}"); + + status.errors.push(s); + continue; + } + }; + + if volume_deleted { + // Volume was soft or hard deleted, so proceed with clean up, + // which if this is in state Requested there won't be any + // additional associated state, so transition the record to + // Completed. + + info!( + &log, + "request {} step {} volume {} was soft or hard deleted!", + request.request_id, + request_step_id, + request.volume_id, + ); + + let result = self + .datastore + .set_region_snapshot_replacement_step_volume_deleted_from_requested( + opctx, request, + ) + .await; + + match result { + Ok(()) => { + let s = format!( + "request step {request_step_id} transitioned from \ + requested to volume_deleted" + ); + + info!(&log, "{s}"); + status.step_set_volume_deleted_ok.push(s); + } + + Err(e) => { + let s = format!( + "error transitioning {request_step_id} from \ + requested to complete: {e}" + ); + + error!(&log, "{s}"); + status.errors.push(s); + } + } + + continue; + } match self.send_start_request(opctx, request.clone()).await { Ok(()) => { let s = format!( "region snapshot replacement step saga invoked ok for \ - {request_id}" + {request_step_id}" ); info!( @@ -413,7 +510,7 @@ impl RegionSnapshotReplacementFindAffected { Err(e) => { let s = format!( "invoking region snapshot replacement step saga for \ - {request_id} failed: {e}" + {request_step_id} failed: {e}" ); error!( @@ -575,11 +672,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -599,6 +710,7 @@ mod test { .unwrap(); let new_region_id = Uuid::new_v4(); + let new_region_volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); datastore @@ -607,6 +719,7 @@ mod test { request_id, operating_saga_id, new_region_id, + new_region_volume_id, old_snapshot_volume_id, ) .await @@ -731,11 +844,27 @@ mod test { // Now, add some Complete records and make sure the garbage collection // saga is invoked. + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let result = datastore .insert_region_snapshot_replacement_step(&opctx, { let mut record = RegionSnapshotReplacementStep::new( Uuid::new_v4(), - Uuid::new_v4(), + volume_id, ); record.replacement_state = @@ -747,16 +876,29 @@ mod test { .await .unwrap(); - assert!(matches!( - result, - region_snapshot_replacement::InsertStepResult::Inserted { .. } - )); + assert!(matches!(result, InsertStepResult::Inserted { .. })); + + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); let result = datastore .insert_region_snapshot_replacement_step(&opctx, { let mut record = RegionSnapshotReplacementStep::new( Uuid::new_v4(), - Uuid::new_v4(), + volume_id, ); record.replacement_state = @@ -768,10 +910,7 @@ mod test { .await .unwrap(); - assert!(matches!( - result, - region_snapshot_replacement::InsertStepResult::Inserted { .. } - )); + assert!(matches!(result, InsertStepResult::Inserted { .. })); // Activate the task - it should pick the complete steps up and try to // run the region snapshot replacement step garbage collect saga diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 405a972976..d8ba6abbdd 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -39,6 +39,7 @@ pub mod project_create; pub mod region_replacement_drive; pub mod region_replacement_finish; pub mod region_replacement_start; +pub mod region_snapshot_replacement_finish; pub mod region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; pub mod region_snapshot_replacement_step; @@ -173,7 +174,8 @@ fn make_action_registry() -> ActionRegistry { region_snapshot_replacement_start::SagaRegionSnapshotReplacementStart, region_snapshot_replacement_garbage_collect::SagaRegionSnapshotReplacementGarbageCollect, region_snapshot_replacement_step::SagaRegionSnapshotReplacementStep, - region_snapshot_replacement_step_garbage_collect::SagaRegionSnapshotReplacementStepGarbageCollect + region_snapshot_replacement_step_garbage_collect::SagaRegionSnapshotReplacementStepGarbageCollect, + region_snapshot_replacement_finish::SagaRegionSnapshotReplacementFinish ]; #[cfg(test)] diff --git a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs index c7efa2f03f..2212e6fdf3 100644 --- a/nexus/src/app/sagas/region_replacement_finish.rs +++ b/nexus/src/app/sagas/region_replacement_finish.rs @@ -311,6 +311,20 @@ pub(crate) mod test { operating_saga_id: None, }; + datastore + .volume_create(nexus_db_model::Volume::new( + new_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: new_volume_id, + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_replacement_request(&opctx, request.clone()) .await diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index ce063dc5be..aa9e83c037 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -491,7 +491,7 @@ async fn srrs_get_old_region_address( async fn srrs_replace_region_in_volume( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; @@ -555,8 +555,6 @@ async fn srrs_replace_region_in_volume( .await .map_err(ActionError::action_failed)?; - debug!(log, "replacement returned {:?}", volume_replace_region_result); - match volume_replace_region_result { VolumeReplaceResult::AlreadyHappened | VolumeReplaceResult::Done => { // The replacement was done either by this run of this saga node, or @@ -565,10 +563,11 @@ async fn srrs_replace_region_in_volume( // with the rest of the saga (to properly clean up allocated // resources). - Ok(()) + Ok(volume_replace_region_result) } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // Unwind the saga here to clean up the resources allocated during // this saga. The associated background task will transition this // request's state to Completed. diff --git a/nexus/src/app/sagas/region_snapshot_replacement_finish.rs b/nexus/src/app/sagas/region_snapshot_replacement_finish.rs new file mode 100644 index 0000000000..d992f753d6 --- /dev/null +++ b/nexus/src/app/sagas/region_snapshot_replacement_finish.rs @@ -0,0 +1,211 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! After the change to store a "new region volume" in the region snapshot +//! replacement request, that volume requires garbage collection before the +//! region snapshot replacement transitions to Complete. It's this saga's +//! responsibility to ensure that cleanup. This saga handles the following +//! region snapshot replacement request state transitions: +//! +//! ```text +//! Running <-- +//! | +//! | | +//! v | +//! | +//! Completing -- +//! +//! | +//! v +//! +//! Complete +//! ``` +//! +//! The first thing this saga does is set itself as the "operating saga" for the +//! request, and change the state to "Completing". Then, it performs the volume +//! delete sub-saga for the new region volume. Finally, it updates the region +//! snapshot replacement request by clearing the operating saga id and changing +//! the state to "Complete". +//! +//! Any unwind will place the state back into Running. + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::sagas::declare_saga_actions; +use crate::app::sagas::volume_delete; +use crate::app::{authn, db}; +use serde::Deserialize; +use serde::Serialize; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// region snapshot replacement finish saga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub serialized_authn: authn::saga::Serialized, + pub request: db::model::RegionSnapshotReplacement, +} + +// region snapshot replacement finish saga: actions + +declare_saga_actions! { + region_snapshot_replacement_finish; + SET_SAGA_ID -> "unused_1" { + + rsrfs_set_saga_id + - rsrfs_set_saga_id_undo + } + UPDATE_REQUEST_RECORD -> "unused_4" { + + rsrfs_update_request_record + } +} + +// region snapshot replacement finish saga: definition + +#[derive(Debug)] +pub(crate) struct SagaRegionSnapshotReplacementFinish; +impl NexusSaga for SagaRegionSnapshotReplacementFinish { + const NAME: &'static str = "region-snapshot-replacement-finish"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + region_snapshot_replacement_finish_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(Node::action( + "saga_id", + "GenerateSagaId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(set_saga_id_action()); + + if let Some(new_region_volume_id) = params.request.new_region_volume_id + { + let subsaga_params = volume_delete::Params { + serialized_authn: params.serialized_authn.clone(), + volume_id: new_region_volume_id, + }; + + let subsaga_dag = { + let subsaga_builder = steno::DagBuilder::new( + steno::SagaName::new(volume_delete::SagaVolumeDelete::NAME), + ); + volume_delete::SagaVolumeDelete::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? + }; + + builder.append(Node::constant( + "params_for_volume_delete_subsaga", + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + "params_for_volume_delete_subsaga".to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "volume_delete_subsaga_no_result", + subsaga_dag, + "params_for_volume_delete_subsaga", + )); + } + + builder.append(update_request_record_action()); + + Ok(builder.build()?) + } +} + +// region snapshot replacement finish saga: action implementations + +async fn rsrfs_set_saga_id( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Change the request record here to an intermediate "completing" state to + // block out other sagas that will be triggered for the same request. + osagactx + .datastore() + .set_region_snapshot_replacement_completing( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrfs_set_saga_id_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + osagactx + .datastore() + .undo_set_region_snapshot_replacement_completing( + &opctx, + params.request.id, + saga_id, + ) + .await?; + + Ok(()) +} + +async fn rsrfs_update_request_record( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Update the replacement request record to 'Complete' and clear the + // operating saga id. There is no undo step for this, it should succeed + // idempotently. + datastore + .set_region_snapshot_replacement_complete( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} diff --git a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs index 762182724b..675b2b0cb3 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs @@ -284,11 +284,27 @@ pub(crate) mod test { RegionSnapshotReplacementState::ReplacementDone; request.old_snapshot_volume_id = Some(old_snapshot_volume_id); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( &opctx, request.clone(), - Uuid::new_v4(), + volume_id, ) .await .unwrap(); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 4855f64ac2..3cc08395ec 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -101,9 +101,47 @@ declare_saga_actions! { FIND_NEW_REGION -> "new_dataset_and_region" { + rsrss_find_new_region } + // One of the common sharp edges of sagas is that the compensating action of + // a node does _not_ run if the forward action fails. Said another way, for + // this node: + // + // EXAMPLE -> "output" { + // + forward_action + // - forward_action_undo + // } + // + // If `forward_action` fails, `forward_action_undo` is never executed. + // Forward actions are therefore required to be atomic, in that they either + // fully apply or don't apply at all. + // + // Sagas with nodes that ensure multiple regions exist cannot be atomic + // because they can partially fail (for example: what if only 2 out of 3 + // ensures succeed?). In order for the compensating action to be run, it + // must exist as a separate node that has a no-op forward action: + // + // EXAMPLE_UNDO -> "not_used" { + // + noop + // - forward_action_undo + // } + // EXAMPLE -> "output" { + // + forward_action + // } + // + // This saga will only ever ensure that a single region exists, so you might + // think you could get away with a single node that combines the forward and + // compensating action - you'd be mistaken! The Crucible agent's region + // ensure is not atomic in all cases: if the region fails to create, it + // enters the `failed` state, but is not deleted. Nexus must clean these up. + NEW_REGION_ENSURE_UNDO -> "not_used" { + + rsrss_noop + - rsrss_new_region_ensure_undo + } NEW_REGION_ENSURE -> "ensured_dataset_and_region" { + rsrss_new_region_ensure - - rsrss_new_region_ensure_undo + } + NEW_REGION_VOLUME_CREATE -> "new_region_volume" { + + rsrss_new_region_volume_create + - rsrss_new_region_volume_create_undo } GET_OLD_SNAPSHOT_VOLUME_ID -> "old_snapshot_volume_id" { + rsrss_get_old_snapshot_volume_id @@ -149,11 +187,19 @@ impl NexusSaga for SagaRegionSnapshotReplacementStart { ACTION_GENERATE_ID.as_ref(), )); + builder.append(Node::action( + "new_region_volume_id", + "GenerateNewRegionVolumeId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(set_saga_id_action()); builder.append(get_alloc_region_params_action()); builder.append(alloc_new_region_action()); builder.append(find_new_region_action()); + builder.append(new_region_ensure_undo_action()); builder.append(new_region_ensure_action()); + builder.append(new_region_volume_create_action()); builder.append(get_old_snapshot_volume_id_action()); builder.append(create_fake_volume_action()); builder.append(replace_snapshot_in_volume_action()); @@ -380,6 +426,10 @@ async fn rsrss_find_new_region( Ok(dataset_and_region) } +async fn rsrss_noop(_sagactx: NexusActionContext) -> Result<(), ActionError> { + Ok(()) +} + async fn rsrss_new_region_ensure( sagactx: NexusActionContext, ) -> Result< @@ -390,10 +440,6 @@ async fn rsrss_new_region_ensure( let osagactx = sagactx.user_data(); let log = osagactx.log(); - // With a list of datasets and regions to ensure, other sagas need to have a - // separate no-op forward step for the undo action to ensure that the undo - // step occurs in the case that the ensure partially fails. Here this is not - // required, there's only one dataset and region. let new_dataset_and_region = sagactx .lookup::<(db::model::Dataset, db::model::Region)>( "new_dataset_and_region", @@ -474,6 +520,94 @@ async fn rsrss_new_region_ensure_undo( Ok(()) } +async fn rsrss_new_region_volume_create( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + + let (new_dataset, ensured_region) = sagactx.lookup::<( + db::model::Dataset, + crucible_agent_client::types::Region, + )>( + "ensured_dataset_and_region", + )?; + + let Some(new_dataset_address) = new_dataset.address() else { + return Err(ActionError::action_failed(format!( + "dataset {} does not have an address!", + new_dataset.id(), + ))); + }; + + let new_region_address = SocketAddrV6::new( + *new_dataset_address.ip(), + ensured_region.port_number, + 0, + 0, + ); + + // Create a volume to inflate the reference count of the newly created + // read-only region. If this is not done it's possible that a user could + // delete the snapshot volume _after_ the new read-only region was swapped + // in, removing the last reference to it and causing garbage collection. + + let volume_construction_request = VolumeConstructionRequest::Volume { + id: new_region_volume_id, + block_size: 0, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 0, + blocks_per_extent: 0, + extent_count: 0, + gen: 0, + opts: CrucibleOpts { + id: new_region_volume_id, + target: vec![new_region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }; + + let volume_data = serde_json::to_string(&volume_construction_request) + .map_err(|e| { + ActionError::action_failed(Error::internal_error(&e.to_string())) + })?; + + let volume = db::model::Volume::new(new_region_volume_id, volume_data); + + osagactx + .datastore() + .volume_create(volume) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrss_new_region_volume_create_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + + // Delete the volume. + + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + osagactx.datastore().volume_hard_delete(new_region_volume_id).await?; + + Ok(()) +} + async fn rsrss_get_old_snapshot_volume_id( sagactx: NexusActionContext, ) -> Result { @@ -660,7 +794,7 @@ async fn get_replace_params( async fn rsrss_replace_snapshot_in_volume( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); @@ -694,10 +828,11 @@ async fn rsrss_replace_snapshot_in_volume( // if the transaction occurred on the non-deleted volume so proceed // with the rest of the saga. - Ok(()) + Ok(volume_replace_snapshot_result) } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // If the snapshot volume was deleted, we still want to proceed with // replacing the rest of the uses of the region snapshot. Note this // also covers the case where this saga node runs (performing the @@ -706,7 +841,7 @@ async fn rsrss_replace_snapshot_in_volume( // deleted. If this saga unwound here, that would violate the // property of idempotency. - Ok(()) + Ok(volume_replace_snapshot_result) } } } @@ -780,6 +915,9 @@ async fn rsrss_update_request_record( let old_region_volume_id = sagactx.lookup::("new_volume_id")?; + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + // Now that the region has been ensured and the construction request has // been updated, update the replacement request record to 'ReplacementDone' // and clear the operating saga id. There is no undo step for this, it @@ -790,6 +928,7 @@ async fn rsrss_update_request_record( params.request.id, saga_id, new_region_id, + new_region_volume_id, old_region_volume_id, ) .await @@ -830,7 +969,7 @@ pub(crate) mod test { /// Create four zpools, a disk, and a snapshot of that disk async fn prepare_for_test( cptestctx: &ControlPlaneTestContext, - ) -> PrepareResult { + ) -> PrepareResult<'_> { let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; let datastore = nexus.datastore(); @@ -876,20 +1015,21 @@ pub(crate) mod test { panic!("test snapshot {:?} should exist", snapshot_id) }); - PrepareResult { db_disk, snapshot, db_snapshot } + PrepareResult { db_disk, snapshot, db_snapshot, disk_test } } - struct PrepareResult { + struct PrepareResult<'a> { db_disk: nexus_db_model::Disk, snapshot: views::Snapshot, db_snapshot: nexus_db_model::Snapshot, + disk_test: DiskTest<'a, crate::Server>, } #[nexus_test(server = crate::Server)] async fn test_region_snapshot_replacement_start_saga( cptestctx: &ControlPlaneTestContext, ) { - let PrepareResult { db_disk, snapshot, db_snapshot } = + let PrepareResult { db_disk, snapshot, db_snapshot, .. } = prepare_for_test(cptestctx).await; let nexus = &cptestctx.server.server_context().nexus; @@ -990,8 +1130,10 @@ pub(crate) mod test { .await .unwrap(); - assert_eq!(volumes.len(), 1); - assert_eq!(volumes[0].id(), db_snapshot.volume_id); + assert!(volumes + .iter() + .map(|v| v.id()) + .any(|vid| vid == db_snapshot.volume_id)); } fn new_test_params( @@ -1009,9 +1151,11 @@ pub(crate) mod test { pub(crate) async fn verify_clean_slate( cptestctx: &ControlPlaneTestContext, + test: &DiskTest<'_, crate::Server>, request: &RegionSnapshotReplacement, affected_volume_original: &Volume, ) { + let sled_agent = &cptestctx.sled_agent.sled_agent; let datastore = cptestctx.server.server_context().nexus.datastore(); crate::app::sagas::test_helpers::assert_no_failed_undo_steps( @@ -1024,6 +1168,10 @@ pub(crate) mod test { // original disk, and three for the (currently unused) snapshot // destination volume assert_eq!(region_allocations(&datastore).await, 6); + + // Assert that only those six provisioned regions are non-destroyed + assert_no_other_ensured_regions(sled_agent, test, &datastore).await; + assert_region_snapshot_replacement_request_untouched( cptestctx, &datastore, &request, ) @@ -1031,11 +1179,12 @@ pub(crate) mod test { assert_volume_untouched(&datastore, &affected_volume_original).await; } - async fn region_allocations(datastore: &DataStore) -> usize { + async fn regions(datastore: &DataStore) -> Vec { use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use async_bb8_diesel::AsyncSimpleConnection; use diesel::QueryDsl; + use diesel::SelectableHelper; use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_queries::db::schema::region::dsl; @@ -1047,15 +1196,56 @@ pub(crate) mod test { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); dsl::region - .count() - .get_result_async(&conn) + .select(db::model::Region::as_select()) + .get_results_async(&conn) .await - .map(|x: i64| x as usize) }) .await .unwrap() } + async fn region_allocations(datastore: &DataStore) -> usize { + regions(datastore).await.len() + } + + async fn assert_no_other_ensured_regions( + sled_agent: &omicron_sled_agent::sim::SledAgent, + test: &DiskTest<'_, crate::Server>, + datastore: &DataStore, + ) { + let mut non_destroyed_regions_from_agent = vec![]; + + for zpool in test.zpools() { + for dataset in &zpool.datasets { + let crucible_dataset = + sled_agent.get_crucible_dataset(zpool.id, dataset.id); + for region in crucible_dataset.list() { + match region.state { + crucible_agent_client::types::State::Tombstoned + | crucible_agent_client::types::State::Destroyed => { + // ok + } + + _ => { + non_destroyed_regions_from_agent + .push(region.clone()); + } + } + } + } + } + + let db_regions = regions(datastore).await; + let db_region_ids: Vec = + db_regions.iter().map(|x| x.id()).collect(); + + for region in non_destroyed_regions_from_agent { + let region_id = region.id.0.parse().unwrap(); + let contains = db_region_ids.contains(®ion_id); + assert!(contains, "db does not have {:?}", region_id); + } + } + async fn assert_region_snapshot_replacement_request_untouched( cptestctx: &ControlPlaneTestContext, datastore: &DataStore, @@ -1094,11 +1284,78 @@ pub(crate) mod test { assert_eq!(actual, expected); } + #[nexus_test(server = crate::Server)] + async fn test_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let PrepareResult { db_disk, snapshot, db_snapshot, disk_test } = + prepare_for_test(cptestctx).await; + + let log = &cptestctx.logctx.log; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(cptestctx); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + assert_eq!(disk_allocated_regions.len(), 3); + + let region: &nexus_db_model::Region = &disk_allocated_regions[0].1; + let snapshot_id = snapshot.identity.id; + + let region_snapshot = datastore + .region_snapshot_get(region.dataset_id(), region.id(), snapshot_id) + .await + .unwrap() + .unwrap(); + + let request = + RegionSnapshotReplacement::for_region_snapshot(®ion_snapshot); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request.clone()) + .await + .unwrap(); + + let affected_volume_original = + datastore.volume_get(db_snapshot.volume_id).await.unwrap().unwrap(); + + verify_clean_slate( + &cptestctx, + &disk_test, + &request, + &affected_volume_original, + ) + .await; + + crate::app::sagas::test_helpers::action_failure_can_unwind::< + SagaRegionSnapshotReplacementStart, + _, + _, + >( + nexus, + || Box::pin(async { new_test_params(&opctx, &request) }), + || { + Box::pin(async { + verify_clean_slate( + &cptestctx, + &disk_test, + &request, + &affected_volume_original, + ) + .await; + }) + }, + log, + ) + .await; + } + #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let PrepareResult { db_disk, snapshot, db_snapshot } = + let PrepareResult { db_disk, snapshot, db_snapshot, disk_test } = prepare_for_test(cptestctx).await; let log = &cptestctx.logctx.log; @@ -1130,8 +1387,13 @@ pub(crate) mod test { let affected_volume_original = datastore.volume_get(db_snapshot.volume_id).await.unwrap().unwrap(); - verify_clean_slate(&cptestctx, &request, &affected_volume_original) - .await; + verify_clean_slate( + &cptestctx, + &disk_test, + &request, + &affected_volume_original, + ) + .await; crate::app::sagas::test_helpers::action_failure_can_unwind_idempotently::< SagaRegionSnapshotReplacementStart, @@ -1143,6 +1405,7 @@ pub(crate) mod test { || Box::pin(async { verify_clean_slate( &cptestctx, + &disk_test, &request, &affected_volume_original, ).await; @@ -1155,7 +1418,7 @@ pub(crate) mod test { async fn test_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let PrepareResult { db_disk, snapshot, db_snapshot: _ } = + let PrepareResult { db_disk, snapshot, .. } = prepare_for_test(cptestctx).await; let nexus = &cptestctx.server.server_context().nexus; @@ -1192,4 +1455,66 @@ pub(crate) mod test { ) .await; } + + /// Assert this saga does not leak regions if the replacement read-only + /// region cannot be created. + #[nexus_test(server = crate::Server)] + async fn test_no_leak_region(cptestctx: &ControlPlaneTestContext) { + let PrepareResult { db_disk, snapshot, db_snapshot, disk_test } = + prepare_for_test(cptestctx).await; + + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(cptestctx); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + assert_eq!(disk_allocated_regions.len(), 3); + + let region: &nexus_db_model::Region = &disk_allocated_regions[0].1; + let snapshot_id = snapshot.identity.id; + + let region_snapshot = datastore + .region_snapshot_get(region.dataset_id(), region.id(), snapshot_id) + .await + .unwrap() + .unwrap(); + + let request = + RegionSnapshotReplacement::for_region_snapshot(®ion_snapshot); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request.clone()) + .await + .unwrap(); + + let affected_volume_original = + datastore.volume_get(db_snapshot.volume_id).await.unwrap().unwrap(); + + disk_test.set_always_fail_callback().await; + + // Run the region snapshot replacement start saga + let dag = + create_saga_dag::(Params { + serialized_authn: Serialized::for_opctx(&opctx), + request: request.clone(), + allocation_strategy: RegionAllocationStrategy::Random { + seed: None, + }, + }) + .unwrap(); + + let runnable_saga = nexus.sagas.saga_prepare(dag).await.unwrap(); + + // Actually run the saga + runnable_saga.run_to_completion().await.unwrap(); + + verify_clean_slate( + &cptestctx, + &disk_test, + &request, + &affected_volume_original, + ) + .await; + } } diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs index 7b1d598861..a236fcf62c 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -375,7 +375,8 @@ async fn rsrss_replace_snapshot_in_volume( // with the saga. } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // Proceed with the saga but skip the notification step. } } @@ -423,6 +424,20 @@ async fn rsrss_notify_upstairs( let params = sagactx.saga_params::()?; let log = sagactx.user_data().log(); + // If the associated volume was deleted, then skip this notification step as + // there is no Upstairs to talk to. Continue with the saga to transition the + // step request to Complete, and then perform the associated clean up. + + let volume_replace_snapshot_result = sagactx + .lookup::("volume_replace_snapshot_result")?; + if matches!( + volume_replace_snapshot_result, + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted + ) { + return Ok(()); + } + // Make an effort to notify a Propolis if one was booted for this volume. // This is best effort: if there is a failure, this saga will unwind and be // triggered again for the same request. If there is no Propolis booted for diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs index b83f917a70..15c6a39651 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs @@ -187,8 +187,24 @@ pub(crate) mod test { .await .unwrap(); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut request = - RegionSnapshotReplacementStep::new(Uuid::new_v4(), Uuid::new_v4()); + RegionSnapshotReplacementStep::new(Uuid::new_v4(), step_volume_id); request.replacement_state = RegionSnapshotReplacementStepState::Complete; request.old_snapshot_volume_id = Some(old_snapshot_volume_id); diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index a9be1f34ee..a8ded4e33c 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -27,12 +27,9 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; use crate::app::sagas::declare_saga_actions; -use nexus_db_model::Dataset; -use nexus_db_model::Region; -use nexus_db_model::Volume; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::CrucibleResources; -use nexus_types::identity::Asset; +use nexus_db_queries::db::datastore::FreedCrucibleResources; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -330,8 +327,6 @@ async fn svd_delete_crucible_snapshot_records( Ok(()) } -type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; - /// Deleting region snapshots in a previous saga node may have freed up regions /// that were deleted in the DB but couldn't be deleted by the Crucible Agent /// because a snapshot existed. Look for those here. These will be a different @@ -417,7 +412,7 @@ type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; /// another snapshot delete. async fn svd_find_freed_crucible_regions( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); // Find regions freed up for deletion by a previous saga node deleting the @@ -432,11 +427,7 @@ async fn svd_find_freed_crucible_regions( }, )?; - // Don't serialize the whole Volume, as the data field contains key material! - Ok(freed_datasets_regions_and_volumes - .into_iter() - .map(|x| (x.0, x.1, x.2.map(|v: Volume| v.id()))) - .collect()) + Ok(freed_datasets_regions_and_volumes) } async fn svd_delete_freed_crucible_regions( @@ -448,9 +439,11 @@ async fn svd_delete_freed_crucible_regions( // Find regions freed up for deletion by a previous saga node deleting the // region snapshots. let freed_datasets_regions_and_volumes = - sagactx.lookup::("freed_crucible_regions")?; + sagactx.lookup::("freed_crucible_regions")?; - for (dataset, region, volume_id) in freed_datasets_regions_and_volumes { + for (dataset, region) in + &freed_datasets_regions_and_volumes.datasets_and_regions + { // Send DELETE calls to the corresponding Crucible agents osagactx .nexus() @@ -477,18 +470,17 @@ async fn svd_delete_freed_crucible_regions( e, )) })?; + } - // Remove volume DB record - if let Some(volume_id) = volume_id { - osagactx.datastore().volume_hard_delete(volume_id).await.map_err( - |e| { - ActionError::action_failed(format!( - "failed to volume_hard_delete {}: {:?}", - volume_id, e, - )) - }, - )?; - } + for volume_id in &freed_datasets_regions_and_volumes.volumes { + osagactx.datastore().volume_hard_delete(*volume_id).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to volume_hard_delete {}: {:?}", + volume_id, e, + )) + }, + )?; } Ok(()) diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 32a2f24d9d..7c8857123c 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -305,7 +305,7 @@ pub async fn run_region_snapshot_replacement_finish( assert!(status.errors.is_empty()); - status.records_set_to_done.len() + status.finish_invoked_ok.len() } /// Run all replacement related background tasks until they aren't doing diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 621df47448..e7f641226b 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -5,9 +5,12 @@ //! Tests related to region and region snapshot replacement use dropshot::test_util::ClientTestContext; +use nexus_client::types::LastResult; use nexus_db_model::PhysicalDiskPolicy; use nexus_db_model::RegionReplacementState; +use nexus_db_model::RegionSnapshotReplacementState; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; use nexus_test_utils::background::*; @@ -15,11 +18,22 @@ use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::resource_helpers::create_default_ip_pool; use nexus_test_utils::resource_helpers::create_disk; +use nexus_test_utils::resource_helpers::create_disk_from_snapshot; use nexus_test_utils::resource_helpers::create_project; +use nexus_test_utils::resource_helpers::create_snapshot; +use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::params; +use nexus_types::external_api::views; +use nexus_types::identity::Asset; +use nexus_types::internal_api::background::*; +use omicron_common::api::external; +use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use omicron_uuid_kinds::GenericUuid; use slog::Logger; +use std::collections::HashSet; +use std::net::SocketAddr; use std::sync::Arc; use uuid::Uuid; @@ -40,12 +54,37 @@ fn get_disk_url(disk_name: &str) -> String { format!("/v1/disks/{disk_name}?project={}", PROJECT_NAME) } +fn get_disks_url() -> String { + format!("/v1/disks?project={}", PROJECT_NAME) +} + +fn get_snapshot_url(snapshot_name: &str) -> String { + format!("/v1/snapshots/{snapshot_name}?project={}", PROJECT_NAME) +} + +fn get_snapshots_url() -> String { + format!("/v1/snapshots?project={}", PROJECT_NAME) +} + async fn create_project_and_pool(client: &ClientTestContext) -> Uuid { create_default_ip_pool(client).await; let project = create_project(client, PROJECT_NAME).await; project.identity.id } +async fn collection_list( + client: &ClientTestContext, + list_url: &str, +) -> Vec +where + T: Clone + serde::de::DeserializeOwned, +{ + NexusRequest::iter_collection_authn(client, list_url, "", None) + .await + .expect("failed to list") + .all_items +} + /// Assert that the first part of region replacement does not create a freed /// crucible region (that would be picked up by a volume delete saga) #[nexus_test] @@ -118,279 +157,354 @@ async fn test_region_replacement_does_not_create_freed_region( assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); } -struct RegionReplacementDeletedVolumeTest<'a> { - log: Logger, - datastore: Arc, - disk_test: DiskTest<'a>, - client: ClientTestContext, - internal_client: ClientTestContext, - replacement_request_id: Uuid, -} - -#[derive(Debug)] -struct ExpectedEndState(pub RegionReplacementState); +mod region_replacement { + use super::*; -#[derive(Debug)] -struct ExpectedIntermediateState(pub RegionReplacementState); + #[derive(Debug)] + struct ExpectedEndState(pub RegionReplacementState); -impl<'a> RegionReplacementDeletedVolumeTest<'a> { - pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { - let nexus = &cptestctx.server.server_context().nexus; + #[derive(Debug)] + struct ExpectedIntermediateState(pub RegionReplacementState); - // Create four zpools, each with one dataset. This is required for - // region and region snapshot replacement to have somewhere to move the - // data. - let disk_test = DiskTestBuilder::new(&cptestctx) - .on_specific_sled(cptestctx.first_sled()) - .with_zpool_count(4) - .build() - .await; - - let client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - let datastore = nexus.datastore().clone(); - - let opctx = OpContext::for_tests( - cptestctx.logctx.log.new(o!()), - datastore.clone(), - ); - - // Create a disk - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, "disk").await; - - // Manually create the region replacement request for the first - // allocated region of that disk + #[derive(Debug)] + struct ExpectedStartState(pub RegionReplacementState); - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk.identity.id) - .fetch() - .await - .unwrap(); - - assert_eq!(db_disk.id(), disk.identity.id); - - let disk_allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - let (_, region) = &disk_allocated_regions[0]; - - let replacement_request_id = datastore - .create_region_replacement_request_for_region(&opctx, ®ion) - .await - .unwrap(); - - // Assert the request is in state Requested + pub(super) struct DeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, + } - let region_replacement = datastore - .get_region_replacement_request_by_id( - &opctx, + impl<'a> DeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = &cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move + // the data. + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + // Manually create the region replacement request for the first + // allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = datastore + .get_allocated_regions(db_disk.volume_id) + .await + .unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let replacement_request_id = datastore + .create_region_replacement_request_for_region(&opctx, ®ion) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Requested, + ); + + DeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), replacement_request_id, - ) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Requested, - ); - - RegionReplacementDeletedVolumeTest { - log: cptestctx.logctx.log.new(o!()), - datastore, - disk_test, - client: client.clone(), - internal_client: internal_client.clone(), - replacement_request_id, + } } - } - pub fn opctx(&self) -> OpContext { - OpContext::for_tests(self.log.clone(), self.datastore.clone()) - } + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } - pub async fn delete_the_disk(&self) { - let disk_url = get_disk_url("disk"); - NexusRequest::object_delete(&self.client, &disk_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("failed to delete disk"); - } + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } - /// Make sure: - /// - /// - all region replacement related background tasks run to completion - /// - this harness' region replacement request has transitioned to Complete - /// - no Crucible resources are leaked - pub async fn finish_test(&self) { - // Make sure that all the background tasks can run to completion. + /// Make sure: + /// + /// - all region replacement related background tasks run to completion + /// - this harness' region replacement request has transitioned to + /// Complete + /// - no Crucible resources are leaked + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. - run_replacement_tasks_to_completion(&self.internal_client).await; + run_replacement_tasks_to_completion(&self.internal_client).await; - // Assert the request is in state Complete + // Assert the request is in state Complete - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, - ) - .await - .unwrap(); + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Complete, - ); + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Complete, + ); - // Assert there are no more Crucible resources + // Assert there are no more Crucible resources - assert!(self.disk_test.crucible_resources_deleted().await); - } + assert!(self.disk_test.crucible_resources_deleted().await); + } - async fn wait_for_request_state( - &self, - expected_end_state: ExpectedEndState, - expected_intermediate_state: ExpectedIntermediateState, - ) { - wait_for_condition( - || { - let datastore = self.datastore.clone(); - let opctx = self.opctx(); - let replacement_request_id = self.replacement_request_id; - - async move { - let region_replacement = datastore - .get_region_replacement_request_by_id( - &opctx, - replacement_request_id, - ) - .await - .unwrap(); - - let state = region_replacement.replacement_state; - - if state == expected_end_state.0 { - // The saga transitioned the request ok - Ok(()) - } else if state == expected_intermediate_state.0 { - // The saga is still running - Err(CondCheckError::<()>::NotYet) - } else { - // Any other state is not expected - panic!("unexpected state {state:?}!"); + async fn wait_for_request_state( + &self, + expected_end_state: ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + expected_start_state: ExpectedStartState, + ) { + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + // If the expected start and end state are the same + // (i.e. there's a back edge in the associated request's + // state machine), then it's impossible to determine + // when the saga starts and stops based on the state. + if expected_end_state.0 == expected_start_state.0 { + if state == expected_end_state.0 { + // The saga transitioned the request ok, or + // hasn't started yet. Either way we have to + // return here, and the call site should perform + // an additional check for some associated + // expected result. + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } else { + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 + || state == expected_start_state.0 + { + // The saga is still running, or hasn't started + // yet. + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } } - } - }, - &std::time::Duration::from_millis(500), - &std::time::Duration::from_secs(60), - ) - .await - .expect("request transitioned to expected state"); - - // Assert the request state - - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), ) .await - .unwrap(); - - assert_eq!(region_replacement.replacement_state, expected_end_state.0); - } - - /// Run the "region replacement" task to transition the request to Running. - pub async fn transition_request_to_running(&self) { - // Activate the "region replacement" background task + .expect("request transitioned to expected state"); + + // Assert the request state + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + expected_end_state.0 + ); + } - run_region_replacement(&self.internal_client).await; + /// Run the "region replacement" task to transition the request to + /// Running. + pub async fn transition_request_to_running(&self) { + // Activate the "region replacement" background task - // The activation above could only have started the associated saga, so - // wait until the request is in state Running. + run_region_replacement(&self.internal_client).await; - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::Running), - ExpectedIntermediateState(RegionReplacementState::Allocating), - ) - .await; - } + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. - /// Call the region replacement drive task to attach the associated volume - /// to the simulated pantry, ostensibly for reconciliation - pub async fn attach_request_volume_to_pantry(&self) { - // Run the "region replacement driver" task to attach the associated - // volume to the simulated pantry. - - run_region_replacement_driver(&self.internal_client).await; + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Allocating), + ExpectedStartState(RegionReplacementState::Requested), + ) + .await; + } - // The activation above could only have started the associated saga, so - // wait until the request is in the expected end state. + /// Call the region replacement drive task to attach the associated volume + /// to the simulated pantry, ostensibly for reconciliation + pub async fn attach_request_volume_to_pantry(&self) { + // Run the "region replacement driver" task to attach the associated + // volume to the simulated pantry. - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::Running), - ExpectedIntermediateState(RegionReplacementState::Driving), - ) - .await; + run_region_replacement_driver(&self.internal_client).await; - // Additionally, assert that the drive saga recorded that it sent the - // attachment request to the simulated pantry + // The activation above could only have started the associated saga, + // so wait until the request is in the expected end state. - let most_recent_step = self - .datastore - .current_region_replacement_request_step( - &self.opctx(), - self.replacement_request_id, + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Driving), + ExpectedStartState(RegionReplacementState::Running), ) - .await - .unwrap() - .unwrap(); - - assert!(most_recent_step.pantry_address().is_some()); - } + .await; - /// Manually activate the background attachment for the request volume - pub async fn manually_activate_attached_volume( - &self, - cptestctx: &'a ControlPlaneTestContext, - ) { - let pantry = - cptestctx.sled_agent.pantry_server.as_ref().unwrap().pantry.clone(); - - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, + // Additionally, assert that the drive saga recorded that it sent + // the attachment request to the simulated pantry. + // + // If `wait_for_request_state` has the same expected start and end + // state (as it does above), it's possible to exit that function + // having not yet started the saga yet, and this requires an + // additional `wait_for_condition` to wait for the expected recorded + // step. + + let most_recent_step = wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + match datastore + .current_region_replacement_request_step( + &opctx, + replacement_request_id, + ) + .await + .unwrap() + { + Some(step) => Ok(step), + + None => { + // The saga either has not started yet or is + // still running - see the comment before this + // check for more info. + Err(CondCheckError::<()>::NotYet) + } + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), ) .await - .unwrap(); + .expect("most recent step"); - pantry - .activate_background_attachment( - region_replacement.volume_id.to_string(), - ) - .unwrap(); - } + assert!(most_recent_step.pantry_address().is_some()); + } - /// Transition request to ReplacementDone via the region replacement drive - /// saga - pub async fn transition_request_to_replacement_done(&self) { - // Run the "region replacement driver" task + /// Manually activate the background attachment for the request volume + pub async fn manually_activate_attached_volume( + &self, + cptestctx: &'a ControlPlaneTestContext, + ) { + let pantry = cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .clone(); + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + pantry + .activate_background_attachment( + region_replacement.volume_id.to_string(), + ) + .unwrap(); + } - run_region_replacement_driver(&self.internal_client).await; + /// Transition request to ReplacementDone via the region replacement + /// drive saga + pub async fn transition_request_to_replacement_done(&self) { + // Run the "region replacement driver" task - // The activation above could only have started the associated saga, so - // wait until the request is in the expected end state. + run_region_replacement_driver(&self.internal_client).await; - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::ReplacementDone), - ExpectedIntermediateState(RegionReplacementState::Driving), - ) - .await; + // The activation above could only have started the associated saga, + // so wait until the request is in the expected end state. + + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::ReplacementDone), + ExpectedIntermediateState(RegionReplacementState::Driving), + ExpectedStartState(RegionReplacementState::Running), + ) + .await; + } } } @@ -400,7 +514,8 @@ impl<'a> RegionReplacementDeletedVolumeTest<'a> { async fn test_delete_volume_region_replacement_state_requested( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: delete the // disk, then finish the test. @@ -416,7 +531,8 @@ async fn test_delete_volume_region_replacement_state_requested( async fn test_delete_volume_region_replacement_state_running( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -436,7 +552,8 @@ async fn test_delete_volume_region_replacement_state_running( async fn test_delete_volume_region_replacement_state_running_on_pantry( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -458,7 +575,8 @@ async fn test_delete_volume_region_replacement_state_running_on_pantry( async fn test_delete_volume_region_replacement_state_replacement_done( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -480,3 +598,1111 @@ async fn test_delete_volume_region_replacement_state_replacement_done( test_harness.finish_test().await; } + +/// Assert that the problem experienced in issue 6353 is fixed +#[nexus_test] +async fn test_racing_replacements_for_soft_deleted_disk_volume( + cptestctx: &ControlPlaneTestContext, +) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create four zpools, each with one dataset. This is required for region + // and region snapshot replacement to have somewhere to move the data. + let sled_id = cptestctx.first_sled(); + let mut disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(4) + .build() + .await; + + // Create a disk, then a snapshot of that disk + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); + + let snapshot: views::Snapshot = object_create( + client, + &snapshots_url, + ¶ms::SnapshotCreate { + identity: IdentityMetadataCreateParams { + name: "snapshot".parse().unwrap(), + description: String::from("a snapshot"), + }, + disk: disk.identity.name.into(), + }, + ) + .await; + + // Before deleting the disk, save the DB model + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + // Next, expunge a physical disk that contains a region snapshot (which + // means it'll have the region too) + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let (dataset, region) = &disk_allocated_regions[0]; + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + // Only one region snapshot should be been returned by the following call + // due to the allocation policy. + + let expunged_region_snapshots = datastore + .find_region_snapshots_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + assert_eq!(expunged_region_snapshots.len(), 1); + + for expunged_region_snapshot in expunged_region_snapshots { + assert_eq!(expunged_region_snapshot.snapshot_id, snapshot.identity.id); + } + + // Either one or two read/write regions will be returned: + // + // - one for the disk, and + // - one for the snapshot destination volume, depending on if it was + // allocated on to the physical disk that was expunged. + + let expunged_regions = datastore + .find_read_write_regions_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + match expunged_regions.len() { + 1 => { + assert_eq!(expunged_regions[0].id(), region.id()); + } + + 2 => { + assert!(expunged_regions.iter().any(|r| r.id() == region.id())); + + let (.., db_snapshot) = LookupPath::new(&opctx, datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let snapshot_allocated_datasets_and_regions = datastore + .get_allocated_regions(db_snapshot.destination_volume_id) + .await + .unwrap(); + + let snapshot_allocated_regions: Vec = + snapshot_allocated_datasets_and_regions + .into_iter() + .map(|(_, r)| r.id()) + .collect(); + + assert!(expunged_regions.iter().any(|region| { + snapshot_allocated_regions.contains(®ion.id()) + })); + } + + _ => { + panic!("unexpected number of expunged regions!"); + } + } + + // Now, race the region replacement with the region snapshot replacement: + // + // 1) region replacement will allocate a new region and swap it into the + // disk volume. + + let internal_client = &cptestctx.internal_client; + + let _ = + activate_background_task(&internal_client, "region_replacement").await; + + // After that task invocation, there should be one running region + // replacement for the disk's region. Filter out the replacement request for + // the snapshot destination volume if it's there. The above background task + // only starts the associated saga, so wait for it to complete. + + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let region_replacements: Vec<_> = datastore + .get_running_region_replacements(&opctx) + .await + .unwrap() + .into_iter() + .filter(|x| x.old_region_id == region.id()) + .collect(); + + if region_replacements.len() == 1 { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(20), + ) + .await + .expect("request transitioned to expected state"); + + let region_replacements: Vec<_> = datastore + .get_running_region_replacements(&opctx) + .await + .unwrap() + .into_iter() + .filter(|x| x.old_region_id == region.id()) + .collect(); + + assert_eq!(region_replacements.len(), 1); + + // 2) region snapshot replacement start will replace the region snapshot in + // the snapshot volume + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_start", + ) + .await; + + // After that, there should be one "replacement done" region snapshot + // replacement for the associated region snapshot. The above background task + // only starts the associated saga, so wait for it to complete. + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let region_snapshot_replacements = datastore + .get_replacement_done_region_snapshot_replacements(&opctx) + .await + .unwrap(); + + if region_snapshot_replacements.len() == 1 { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(20), + ) + .await + .expect("request transitioned to expected state"); + + let region_snapshot_replacements = datastore + .get_replacement_done_region_snapshot_replacements(&opctx) + .await + .unwrap(); + + assert_eq!(region_snapshot_replacements.len(), 1); + assert_eq!( + region_snapshot_replacements[0].old_dataset_id, + dataset.id().into() + ); + assert_eq!(region_snapshot_replacements[0].old_region_id, region.id()); + assert_eq!( + region_snapshot_replacements[0].old_snapshot_id, + snapshot.identity.id + ); + assert_eq!( + region_snapshot_replacements[0].replacement_state, + RegionSnapshotReplacementState::ReplacementDone, + ); + + assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); + + // 3) Delete the disk + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // The volume should be soft-deleted now. The region snapshot replacement + // swapped out the region snapshot from the snapshot volume to the temporary + // volume for later deletion, but has not actually deleted that temporary + // volume yet, so the count will not have gone to 0. + + let volume = datastore.volume_get(db_disk.volume_id).await.unwrap(); + assert!(volume.is_some()); + assert!(volume.unwrap().time_deleted.is_some()); + + // 4) region snapshot replacement garbage collect will delete the temporary + // volume with the stashed reference to the region snapshot, bringing the + // reference count to zero. + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_garbage_collection", + ) + .await; + + // Assert the region snapshot was deleted. + assert!(datastore + .region_snapshot_get(dataset.id(), region.id(), snapshot.identity.id) + .await + .unwrap() + .is_none()); + + // Assert that the disk's volume is still only soft-deleted, because the two + // other associated region snapshots still exist. + let volume = datastore.volume_get(db_disk.volume_id).await.unwrap(); + assert!(volume.is_some()); + + // Check on the old region id - it should not be deleted + let maybe_region = + datastore.get_region_optional(region.id()).await.unwrap(); + + eprintln!("old_region_id: {:?}", &maybe_region); + assert!(maybe_region.is_some()); + + // But the new region id will be! + let maybe_region = datastore + .get_region_optional(region_replacements[0].new_region_id.unwrap()) + .await + .unwrap(); + + eprintln!("new region id: {:?}", &maybe_region); + assert!(maybe_region.is_none()); + + // The region_replacement drive task should invoke the drive saga now, which + // will skip over all notification steps and transition the request to + // ReplacementDone + + let last_background_task = + activate_background_task(&internal_client, "region_replacement_driver") + .await; + + assert!(match last_background_task.last { + LastResult::Completed(last_result_completed) => { + match serde_json::from_value::( + last_result_completed.details, + ) { + Err(e) => { + eprintln!("{e}"); + false + } + + Ok(v) => !v.drive_invoked_ok.is_empty(), + } + } + + _ => { + false + } + }); + + // wait for the drive saga to complete here + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let replacement_request_id = region_replacements[0].id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == RegionReplacementState::ReplacementDone { + // The saga transitioned the request ok + Ok(()) + } else if state == RegionReplacementState::Driving { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else if state == RegionReplacementState::Completing { + // The saga transitioned the request ok, and it's now being + // finished by the region replacement finish saga + Ok(()) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // After the region snapshot replacement process runs to completion, there + // should be no more crucible resources left. Run the "region snapshot + // replacement step" background task until there's nothing left, then the + // "region snapshot replacement finish", then make sure there are no + // crucible resources left. + + let mut count = 0; + loop { + let actions_taken = + run_region_snapshot_replacement_step(&internal_client).await; + + if actions_taken == 0 { + break; + } + + count += 1; + + if count > 20 { + assert!(false); + } + } + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_finish", + ) + .await; + + // Ensure the region snapshot replacement request went to Complete + + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let request_id = region_snapshot_replacements[0].id; + + async move { + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(); + + let state = region_snapshot_replacement.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + Ok(()) + } else { + // Any other state is not expected + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // Delete the snapshot + + let snapshot_url = get_snapshot_url("snapshot"); + NexusRequest::object_delete(client, &snapshot_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + // and now there should be no higher level resources left + + let disks_url = get_disks_url(); + assert_eq!( + collection_list::(&client, &disks_url).await.len(), + 0 + ); + + let snapshots_url = get_snapshots_url(); + assert_eq!( + collection_list::(&client, &snapshots_url).await.len(), + 0 + ); + + // Make sure that all the background tasks can run to completion. + + run_replacement_tasks_to_completion(&internal_client).await; + + // The disk volume should be deleted by the snapshot delete: wait until this + // happens + + wait_for_condition( + || { + let datastore = datastore.clone(); + let volume_id = db_disk.volume_id; + + async move { + let volume = datastore.volume_get(volume_id).await.unwrap(); + if volume.is_none() { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("disk volume deleted"); + + // There should be no more crucible resources left. Don't just check for + // `crucible_resources_deleted` here! We have set one of the physical disk + // policies to expunged, so Nexus will not attempt to clean up any resources + // on that physical disk. + + disk_test.remove_zpool(db_zpool.id()).await; + + // Now, assert that all crucible resources are cleaned up + + assert!(disk_test.crucible_resources_deleted().await); +} + +mod region_snapshot_replacement { + use super::*; + + #[derive(Debug)] + struct ExpectedEndState(pub RegionSnapshotReplacementState); + + #[derive(Debug)] + struct ExpectedIntermediateState(pub RegionSnapshotReplacementState); + + #[derive(Debug)] + struct ExpectedStartState(pub RegionSnapshotReplacementState); + + pub(super) struct DeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, + snapshot_socket_addr: SocketAddr, + } + + impl<'a> DeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = &cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move + // the data. + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk, a snapshot of that disk, and a disk from that + // snapshot + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot") + .await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Manually create the region snapshot replacement request for the + // first allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = datastore + .get_allocated_regions(db_disk.volume_id) + .await + .unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let region_snapshot = datastore + .region_snapshot_get( + region.dataset_id(), + region.id(), + snapshot.identity.id, + ) + .await + .expect("found region snapshot without error") + .unwrap(); + + let replacement_request_id = datastore + .create_region_snapshot_replacement_request( + &opctx, + ®ion_snapshot, + ) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Requested, + ); + + // Assert two volumes reference the snapshot addr + + let snapshot_socket_addr = + region_snapshot.snapshot_addr.parse().unwrap(); + + let volumes = datastore + .find_volumes_referencing_socket_addr( + &opctx, + snapshot_socket_addr, + ) + .await + .unwrap(); + + assert_eq!(volumes.len(), 2); + + // Validate that they are snapshot and disk from snapshot + + let volumes_set: HashSet = + volumes.into_iter().map(|v| v.id()).collect(); + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let (.., db_disk_from_snapshot) = + LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap(); + + assert!(volumes_set.contains(&db_snapshot.volume_id)); + assert!(volumes_set.contains(&db_disk_from_snapshot.volume_id)); + + DeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), + replacement_request_id, + snapshot_socket_addr, + } + } + + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } + + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } + + pub async fn delete_the_snapshot(&self) { + let snapshot_url = get_snapshot_url("snapshot"); + NexusRequest::object_delete(&self.client, &snapshot_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + } + + pub async fn delete_the_disk_from_snapshot(&self) { + let disk_url = get_disk_url("disk-from-snapshot"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + } + + /// Make sure: + /// + /// - all region snapshot replacement related background tasks run to + /// completion + /// - this harness' region snapshot replacement request has transitioned + /// to Complete + /// - there are no more volumes that reference the request's region + /// snapshot + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. + + run_replacement_tasks_to_completion(&self.internal_client).await; + + // Assert the request is in state Complete + + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("request transitioned to expected state"); + + let region_snapshot_replacement = self + .datastore + .get_region_snapshot_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Complete, + ); + + // Assert no volumes are referencing the snapshot address + + let volumes = self + .datastore + .find_volumes_referencing_socket_addr( + &self.opctx(), + self.snapshot_socket_addr, + ) + .await + .unwrap(); + + if !volumes.is_empty() { + eprintln!("{:?}", volumes); + } + + assert!(volumes.is_empty()); + } + + /// Assert no Crucible resources are leaked + pub async fn assert_no_crucible_resources_leaked(&self) { + assert!(self.disk_test.crucible_resources_deleted().await); + } + + async fn wait_for_request_state( + &self, + expected_end_state: ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + expected_start_state: ExpectedStartState, + ) { + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = request.replacement_state; + + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else if state == expected_start_state.0 { + // The saga hasn't started yet + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // Assert the request state + + let region_snapshot_replacement = self + .datastore + .get_region_snapshot_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + expected_end_state.0, + ); + } + + /// Run the "region snapshot replacement" task to transition the request + /// to ReplacementDone. + pub async fn transition_request_to_replacement_done(&self) { + // Activate the "region snapshot replacement start" background task + + run_region_snapshot_replacement_start(&self.internal_client).await; + + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. + + self.wait_for_request_state( + ExpectedEndState( + RegionSnapshotReplacementState::ReplacementDone, + ), + ExpectedIntermediateState( + RegionSnapshotReplacementState::Allocating, + ), + ExpectedStartState(RegionSnapshotReplacementState::Requested), + ) + .await; + } + + /// Run the "region snapshot replacement garbage collection" task to + /// transition the request to Running. + pub async fn transition_request_to_running(&self) { + // Activate the "region snapshot replacement garbage collection" + // background task + + run_region_snapshot_replacement_garbage_collection( + &self.internal_client, + ) + .await; + + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. + + self.wait_for_request_state( + ExpectedEndState(RegionSnapshotReplacementState::Running), + ExpectedIntermediateState( + RegionSnapshotReplacementState::DeletingOldVolume, + ), + ExpectedStartState( + RegionSnapshotReplacementState::ReplacementDone, + ), + ) + .await; + } + + /// Manually create a region snapshot replacement step for the disk + /// created from the snapshot + pub async fn create_manual_region_snapshot_replacement_step(&self) { + let disk_url = get_disk_url("disk-from-snapshot"); + + let disk_from_snapshot: external::Disk = + NexusRequest::object_get(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + + let (.., db_disk_from_snapshot) = + LookupPath::new(&self.opctx(), &self.datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let result = self + .datastore + .create_region_snapshot_replacement_step( + &self.opctx(), + self.replacement_request_id, + db_disk_from_snapshot.volume_id, + ) + .await + .unwrap(); + + match result { + InsertStepResult::Inserted { .. } => {} + + _ => { + assert!(false, "bad result from create_region_snapshot_replacement_step"); + } + } + } + } +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have its snapshot deleted and still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: delete + // the snapshot, then finish the test. + + test_harness.delete_the_snapshot().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk().await; + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have its snapshot deleted, and the snapshot's source disk can be deleted, +/// and the request will still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested_2( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - delete the snapshot + // - delete the snapshot's source disk + // - the only thing that will remain is the disk-from-snap that was created + // - finally, call finish_test + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have everything be deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested_3( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - delete the snapshot + // - delete the snapshot's source disk + // - delete the disk created from the snapshot + // - finally, call finish_test + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + test_harness.delete_the_disk_from_snapshot().await; + + test_harness.finish_test().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "ReplacementDone" +/// can have its snapshot deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_replacement_done( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - delete the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Running" +/// can have its snapshot deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_running( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - transition the request to "Running" + // - delete the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + test_harness.transition_request_to_running().await; + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement step can have its associated +/// volume deleted and still transition to VolumeDeleted +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_step( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - transition the request to "Running" + // - manually create a region snapshot replacement step for the disk created + // from the snapshot + // - delete the disk created from the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + test_harness.transition_request_to_running().await; + + test_harness.create_manual_region_snapshot_replacement_step().await; + test_harness.delete_the_disk_from_snapshot().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk().await; + test_harness.delete_the_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 5da4e49a3a..309d113d73 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -769,10 +769,9 @@ async fn test_disk_region_creation_failure( .await .unwrap(); - // After the failed allocation, the disk should be Faulted + // After the failed allocation, the disk creation should have unwound let disks = disks_list(&client, &disks_url).await; - assert_eq!(disks.len(), 1); - assert_eq!(disks[0].state, DiskState::Faulted); + assert_eq!(disks.len(), 0); } // Tests that invalid block sizes are rejected @@ -2574,7 +2573,7 @@ async fn test_disk_expunge(cptestctx: &ControlPlaneTestContext) { // All three regions should be returned let expunged_regions = datastore - .find_regions_on_expunged_physical_disks(&opctx) + .find_read_write_regions_on_expunged_physical_disks(&opctx) .await .unwrap(); diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 543e66ee30..f8cff2361d 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -3760,12 +3760,13 @@ impl TestReadOnlyRegionReferenceUsage { // read-only regions should never be returned by find_deleted_volume_regions pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { - let deleted_volume_regions = + let freed_crucible_resources = self.datastore.find_deleted_volume_regions().await.unwrap(); - assert!(!deleted_volume_regions + assert!(!freed_crucible_resources + .datasets_and_regions .into_iter() - .any(|(_, r, _)| r.id() == self.region.id())); + .any(|(_, r)| r.id() == self.region.id())); } pub async fn create_first_volume_region_in_rop(&self) { diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index cf3d652587..bee2f56c34 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -37,6 +37,7 @@ pub struct LookupRegionPortStatus { pub struct RegionSnapshotReplacementStartStatus { pub requests_created_ok: Vec, pub start_invoked_ok: Vec, + pub requests_completed_ok: Vec, pub errors: Vec, } @@ -55,13 +56,14 @@ pub struct RegionSnapshotReplacementStepStatus { pub step_records_created_ok: Vec, pub step_garbage_collect_invoked_ok: Vec, pub step_invoked_ok: Vec, + pub step_set_volume_deleted_ok: Vec, pub errors: Vec, } /// The status of a `region_snapshot_replacement_finish` background task activation #[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] pub struct RegionSnapshotReplacementFinishStatus { - pub records_set_to_done: Vec, + pub finish_invoked_ok: Vec, pub errors: Vec, } diff --git a/openapi/clickhouse-admin-server.json b/openapi/clickhouse-admin-server.json index c82c7c0d8e..50c526569e 100644 --- a/openapi/clickhouse-admin-server.json +++ b/openapi/clickhouse-admin-server.json @@ -616,7 +616,7 @@ "type": "object", "properties": { "time": { - "$ref": "#/components/schemas/Timestamp" + "type": "string" }, "value": { "type": "number", @@ -628,17 +628,6 @@ "value" ] }, - "Timestamp": { - "anyOf": [ - { - "type": "string", - "format": "date-time" - }, - { - "type": "string" - } - ] - }, "SystemTable": { "description": "Available metrics tables in the `system` database", "type": "string", diff --git a/openapi/clickhouse-admin-single.json b/openapi/clickhouse-admin-single.json index b00bf56314..c6b99da245 100644 --- a/openapi/clickhouse-admin-single.json +++ b/openapi/clickhouse-admin-single.json @@ -131,7 +131,7 @@ "type": "object", "properties": { "time": { - "$ref": "#/components/schemas/Timestamp" + "type": "string" }, "value": { "type": "number", @@ -143,17 +143,6 @@ "value" ] }, - "Timestamp": { - "anyOf": [ - { - "type": "string", - "format": "date-time" - }, - { - "type": "string" - } - ] - }, "SystemTable": { "description": "Available metrics tables in the `system` database", "type": "string", diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index e924cb2ee3..7e28831fa0 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -745,9 +745,9 @@ mod tests { let count = stats.collections.datum.value() as usize; assert!(count != 0); - assert_eq!( - count, - collection_count.load(Ordering::SeqCst), + let server_count = collection_count.load(Ordering::SeqCst); + assert!( + count == server_count || count - 1 == server_count, "number of collections reported by the collection \ task differs from the number reported by the empty \ producer server itself" @@ -892,9 +892,16 @@ mod tests { assert_eq!(stats.collections.datum.value(), 0); assert!(count != 0); - assert_eq!( - count, - collection_count.load(Ordering::SeqCst), + + // The server may have handled a request that we've not yet recorded on + // our collection task side, so we allow the server count to be greater + // than our own. But since the collection task is single-threaded, it + // cannot ever be more than _one_ greater than our count, since we + // should increment that counter before making another request to the + // server. + let server_count = collection_count.load(Ordering::SeqCst); + assert!( + count == server_count || count - 1 == server_count, "number of collections reported by the collection \ task differs from the number reported by the always-ded \ producer server itself" diff --git a/package-manifest.toml b/package-manifest.toml index 83b1ba8168..b28ac7d59f 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -166,7 +166,8 @@ source.packages = [ "internal-dns-cli.tar.gz", "omicron-clickhouse-admin.tar.gz", "zone-setup.tar.gz", - "zone-network-install.tar.gz" + "zone-network-install.tar.gz", + "clickana.tar.gz" ] output.type = "zone" @@ -197,7 +198,8 @@ source.packages = [ "internal-dns-cli.tar.gz", "omicron-clickhouse-admin.tar.gz", "zone-setup.tar.gz", - "zone-network-install.tar.gz" + "zone-network-install.tar.gz", + "clickana.tar.gz" ] output.type = "zone" @@ -924,3 +926,12 @@ service_name = "probe" source.type = "composite" source.packages = ["thundermuffin.tar.gz"] output.type = "zone" + +[package.clickana] +service_name = "clickana" +only_for_targets.image = "standard" +source.type = "local" +source.rust.binary_names = ["clickana"] +source.rust.release = true +output.type = "zone" +output.intermediate_only = true \ No newline at end of file diff --git a/schema/crdb/add-completing-and-new-region-volume/up01.sql b/schema/crdb/add-completing-and-new-region-volume/up01.sql new file mode 100644 index 0000000000..6a973eb3c3 --- /dev/null +++ b/schema/crdb/add-completing-and-new-region-volume/up01.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.region_snapshot_replacement_state ADD VALUE IF NOT EXISTS 'completing' AFTER 'complete'; diff --git a/schema/crdb/add-completing-and-new-region-volume/up02.sql b/schema/crdb/add-completing-and-new-region-volume/up02.sql new file mode 100644 index 0000000000..42c0028ff5 --- /dev/null +++ b/schema/crdb/add-completing-and-new-region-volume/up02.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region_snapshot_replacement ADD COLUMN IF NOT EXISTS new_region_volume_id UUID; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 37380f6e43..ce6764bd17 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4487,7 +4487,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.region_snapshot_replacement_state AS EN 'replacement_done', 'deleting_old_volume', 'running', - 'complete' + 'complete', + 'completing' ); CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( @@ -4505,7 +4506,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( replacement_state omicron.public.region_snapshot_replacement_state NOT NULL, - operating_saga_id UUID + operating_saga_id UUID, + + new_region_volume_id UUID ); CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_by_state on omicron.public.region_snapshot_replacement (replacement_state); @@ -4754,7 +4757,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '117.0.0', NULL) + (TRUE, NOW(), NOW(), '118.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index db250eb914..7c5fb44d13 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -76,6 +76,7 @@ use omicron_common::address::WICKETD_NEXUS_PROXY_PORT; use omicron_common::address::WICKETD_PORT; use omicron_common::address::{ get_internal_dns_server_addresses, CLICKHOUSE_ADMIN_PORT, + CLICKHOUSE_TCP_PORT, }; use omicron_common::address::{Ipv6Subnet, NEXUS_TECHPORT_EXTERNAL_PORT}; use omicron_common::address::{BOOTSTRAP_ARTIFACT_PORT, COCKROACH_ADMIN_PORT}; @@ -1597,13 +1598,20 @@ impl ServiceManager { addr.to_string() }; + // The ClickHouse client connects via the TCP port + let ch_address = { + let mut addr = *address; + addr.set_port(CLICKHOUSE_TCP_PORT); + addr.to_string() + }; + let clickhouse_admin_config = PropertyGroupBuilder::new("config") .add_property("http_address", "astring", admin_address) .add_property( "ch_address", "astring", - address.to_string(), + ch_address.to_string(), ) .add_property( "ch_binary", @@ -1668,13 +1676,20 @@ impl ServiceManager { addr.to_string() }; + // The ClickHouse client connects via the TCP port + let ch_address = { + let mut addr = *address; + addr.set_port(CLICKHOUSE_TCP_PORT); + addr.to_string() + }; + let clickhouse_admin_config = PropertyGroupBuilder::new("config") .add_property("http_address", "astring", admin_address) .add_property( "ch_address", "astring", - address.to_string(), + ch_address.to_string(), ) .add_property( "ch_binary", diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 07cb4e1ccf..ed5b99b7ad 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -173,12 +173,6 @@ impl CrucibleDataInner { let id = Uuid::from_str(&id.0).unwrap(); if let Some(region) = self.regions.get_mut(&id) { - if region.state == State::Failed { - // The real Crucible agent would not let a Failed region be - // deleted - bail!("cannot delete in state Failed"); - } - region.state = State::Destroyed; self.used_ports.remove(®ion.port_number); Ok(Some(region.clone())) diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index b9bf703933..37526690cb 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -962,12 +962,21 @@ impl SledAgent { continue; }; - // First, ensure the dataset exists - let dataset_id = zone.id.into_untyped_uuid(); - self.inner - .storage - .upsert_filesystem(dataset_id, dataset_name) - .await?; + // NOTE: This code will be deprecated by https://github.com/oxidecomputer/omicron/pull/7160 + // + // However, we need to ensure that all blueprints have datasets + // within them before we can remove this back-fill. + // + // Therefore, we do something hairy here: We ensure the filesystem + // exists, but don't specify any dataset UUID value. + // + // This means that: + // - If the dataset exists and has a UUID, this will be a no-op + // - If the dataset doesn't exist, it'll be created without its + // oxide:uuid zfs property set + // - If a subsequent call to "datasets_ensure" tries to set a UUID, + // it should be able to get set (once). + self.inner.storage.upsert_filesystem(None, dataset_name).await?; } self.inner diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index f0653c7905..d6ffd42d0a 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -35,7 +35,6 @@ use std::collections::HashSet; use std::future::Future; use tokio::sync::{mpsc, oneshot, watch}; use tokio::time::{interval, Duration, MissedTickBehavior}; -use uuid::Uuid; // The size of the mpsc bounded channel used to communicate // between the `StorageHandle` and `StorageManager`. @@ -100,7 +99,7 @@ enum StorageManagerState { #[derive(Debug)] pub(crate) struct NewFilesystemRequest { - dataset_id: Uuid, + dataset_id: Option, dataset_name: DatasetName, responder: DebugIgnore>>, } @@ -526,7 +525,7 @@ impl StorageHandle { // and ask for the set of all datasets from Nexus. pub async fn upsert_filesystem( &self, - dataset_id: Uuid, + dataset_id: Option, dataset_name: DatasetName, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); @@ -1499,27 +1498,9 @@ impl StorageManager { zoned, encryption_details, size_details, - id: Some(DatasetUuid::from_untyped_uuid(request.dataset_id)), + id: request.dataset_id, additional_options: None, })?; - // Ensure the dataset has a usable UUID. - if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { - if let Ok(id) = id_str.parse::() { - if id != request.dataset_id { - return Err(Error::UuidMismatch { - name: request.dataset_name.full_name(), - old: id, - new: request.dataset_id, - }); - } - return Ok(()); - } - } - Zfs::set_oxide_value( - &fs_name, - "uuid", - &request.dataset_id.to_string(), - )?; Ok(()) } @@ -1544,7 +1525,6 @@ mod tests { use std::collections::BTreeMap; use std::str::FromStr; use std::sync::atomic::Ordering; - use uuid::Uuid; // A helper struct to advance time. struct TimeTravel {} @@ -2005,16 +1985,92 @@ mod tests { .expect("Ensuring disks should work after key manager is ready"); assert!(!result.has_error(), "{:?}", result); - // Create a filesystem on the newly formatted U.2 - let dataset_id = Uuid::new_v4(); + // Create a filesystem on the newly formatted U.2. + // + // We can call "upsert_filesystem" both with and without a UUID. + let dataset_id = DatasetUuid::new_v4(); + let zpool_name = ZpoolName::new_external(config.disks[0].pool_id); + let dataset_name = + DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); + harness + .handle() + .upsert_filesystem(Some(dataset_id), dataset_name.clone()) + .await + .unwrap(); + // Observe the dataset exists, and the UUID is set. + let observed_dataset = &Zfs::get_dataset_properties( + &[dataset_name.full_name()], + WhichDatasets::SelfOnly, + ) + .unwrap()[0]; + assert_eq!(observed_dataset.id, Some(dataset_id)); + + harness + .handle() + .upsert_filesystem(None, dataset_name.clone()) + .await + .unwrap(); + // Observe the dataset still exists, and the UUID is still set, + // even though we did not ask for a new value explicitly. + let observed_dataset = &Zfs::get_dataset_properties( + &[dataset_name.full_name()], + WhichDatasets::SelfOnly, + ) + .unwrap()[0]; + assert_eq!(observed_dataset.id, Some(dataset_id)); + + harness.cleanup().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn upsert_filesystem_no_uuid() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("upsert_filesystem"); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + + // Test setup: Add a U.2 and M.2, adopt them into the "control plane" + // for usage. + harness.handle().key_manager_ready().await; + let raw_disks = + harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; + let config = harness.make_config(1, &raw_disks); + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Ensuring disks should work after key manager is ready"); + assert!(!result.has_error(), "{:?}", result); + + // Create a filesystem on the newly formatted U.2, without a UUID let zpool_name = ZpoolName::new_external(config.disks[0].pool_id); let dataset_name = DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); harness .handle() - .upsert_filesystem(dataset_id, dataset_name) + .upsert_filesystem(None, dataset_name.clone()) .await .unwrap(); + let observed_dataset = &Zfs::get_dataset_properties( + &[dataset_name.full_name()], + WhichDatasets::SelfOnly, + ) + .unwrap()[0]; + assert_eq!(observed_dataset.id, None); + + // Later, we can set the UUID to a specific value + let dataset_id = DatasetUuid::new_v4(); + harness + .handle() + .upsert_filesystem(Some(dataset_id), dataset_name.clone()) + .await + .unwrap(); + let observed_dataset = &Zfs::get_dataset_properties( + &[dataset_name.full_name()], + WhichDatasets::SelfOnly, + ) + .unwrap()[0]; + assert_eq!(observed_dataset.id, Some(dataset_id)); harness.cleanup().await; logctx.cleanup_successful(); diff --git a/tools/console_version b/tools/console_version index 85ca41f755..ef52d38564 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="c1ebd8d9acae4ff7a09b2517265fba52ebdfe82e" -SHA2="840dbfda1c0def66212e7602d7be6e8acf1b26ba218f10ce3e627df49f5ce9e2" +COMMIT="d583ae70786db46ace23f0e45bc9fdcffc21e6ae" +SHA2="590fda51f0599879effea4f4b2754ec572f6c1f4d5a586016ff6a2e2d8b6f5f9"