Skip to content

Commit

Permalink
Detect and un-delete phantom disks (#4547)
Browse files Browse the repository at this point in the history
A "phantom" disk is a disk where the disk delete saga has run for it but
unwound: this leaves that disk soft-deleted, but the resources and
accounting for that disk remain. Users cannot request that the disk be
deleted again, and it remains a phantom.

There are two fixes for this:

1. Change the disk delete saga so that, during an unwind, it undoes the
disk's soft delete and sets the disk to faulted. This way, users can retry
the disk delete request until it succeeds.

2. Create a background task that detects these phantom disks and does
the same thing: un-delete them and set them to faulted.

This requires adding an index on `id` to the `disk` table, so the schema
is bumped to 18.0.0.

Fixes oxidecomputer/customer-support#58.
  • Loading branch information
jmpesp authored Dec 4, 2023
1 parent 15115a4 commit c915eeb
Show file tree
Hide file tree
Showing 18 changed files with 667 additions and 11 deletions.
24 changes: 20 additions & 4 deletions common/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,8 @@ pub struct BackgroundTaskConfig {
pub nat_cleanup: NatCleanupConfig,
/// configuration for inventory tasks
pub inventory: InventoryConfig,
/// configuration for phantom disks task
pub phantom_disks: PhantomDiskConfig,
}

#[serde_as]
Expand Down Expand Up @@ -386,7 +388,7 @@ pub struct NatCleanupConfig {
pub struct InventoryConfig {
/// period (in seconds) for periodic activations of this background task
///
/// Each activation fetches information about all harware and software in
/// Each activation fetches information about all hardware and software in
/// the system and inserts it into the database. This generates a moderate
/// amount of data.
#[serde_as(as = "DurationSeconds<u64>")]
Expand All @@ -405,6 +407,14 @@ pub struct InventoryConfig {
pub disable: bool,
}

/// Configuration for the phantom disks background task.
#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct PhantomDiskConfig {
/// period (in seconds) for periodic activations of this background task
///
/// Serialized as a whole number of seconds (`DurationSeconds<u64>`) and
/// held in memory as a `Duration`.
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}

/// Configuration for a nexus server
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct PackageConfig {
Expand Down Expand Up @@ -508,8 +518,9 @@ mod test {
BackgroundTaskConfig, Config, ConfigDropshotWithTls, ConsoleConfig,
Database, DeploymentConfig, DnsTasksConfig, DpdConfig,
ExternalEndpointsConfig, InternalDns, InventoryConfig, LoadError,
LoadErrorKind, MgdConfig, NatCleanupConfig, PackageConfig, SchemeName,
TimeseriesDbConfig, Tunables, UpdatesConfig,
LoadErrorKind, MgdConfig, NatCleanupConfig, PackageConfig,
PhantomDiskConfig, SchemeName, TimeseriesDbConfig, Tunables,
UpdatesConfig,
};
use crate::address::{Ipv6Subnet, RACK_PREFIX};
use crate::api::internal::shared::SwitchLocation;
Expand Down Expand Up @@ -663,6 +674,7 @@ mod test {
inventory.period_secs = 10
inventory.nkeep = 11
inventory.disable = false
phantom_disks.period_secs = 30
[default_region_allocation_strategy]
type = "random"
seed = 0
Expand Down Expand Up @@ -764,7 +776,10 @@ mod test {
period_secs: Duration::from_secs(10),
nkeep: 11,
disable: false,
}
},
phantom_disks: PhantomDiskConfig {
period_secs: Duration::from_secs(30),
},
},
default_region_allocation_strategy:
crate::nexus_config::RegionAllocationStrategy::Random {
Expand Down Expand Up @@ -822,6 +837,7 @@ mod test {
inventory.period_secs = 10
inventory.nkeep = 3
inventory.disable = false
phantom_disks.period_secs = 30
[default_region_allocation_strategy]
type = "random"
"##,
Expand Down
26 changes: 26 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,32 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
);
}
};
} else if name == "phantom_disks" {
// Shape of the `details` JSON expected for the "phantom_disks" task; it is
// decoded below with `serde_json::from_value`.
//
// NOTE(review): the field names say "deleted", while the task is described
// as un-deleting phantom disks -- presumably these count un-delete attempts;
// confirm against the task implementation.
#[derive(Deserialize)]
struct TaskSuccess {
/// how many phantom disks were deleted ok
phantom_disk_deleted_ok: usize,

/// how many phantom disks could not be deleted
phantom_disk_deleted_err: usize,
}

match serde_json::from_value::<TaskSuccess>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(success) => {
println!(
" number of phantom disks deleted: {}",
success.phantom_disk_deleted_ok
);
println!(
" number of phantom disk delete errors: {}",
success.phantom_disk_deleted_err
);
}
};
} else {
println!(
"warning: unknown background task: {:?} \
Expand Down
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ task: "nat_v4_garbage_collector"
predetermined retention policy


task: "phantom_disks"
detects and un-deletes phantom disks


---------------------------------------------
stderr:
note: using Nexus URL http://127.0.0.1:REDACTED_PORT
Expand Down Expand Up @@ -131,6 +135,10 @@ task: "nat_v4_garbage_collector"
predetermined retention policy


task: "phantom_disks"
detects and un-deletes phantom disks


---------------------------------------------
stderr:
note: Nexus URL not specified. Will pick one from DNS.
Expand Down Expand Up @@ -183,6 +191,10 @@ task: "nat_v4_garbage_collector"
predetermined retention policy


task: "phantom_disks"
detects and un-deletes phantom disks


---------------------------------------------
stderr:
note: Nexus URL not specified. Will pick one from DNS.
Expand Down
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,10 @@ task: "nat_v4_garbage_collector"
predetermined retention policy


task: "phantom_disks"
detects and un-deletes phantom disks


---------------------------------------------
stderr:
note: using Nexus URL http://127.0.0.1:REDACTED_PORT/
Expand Down Expand Up @@ -357,6 +361,14 @@ task: "inventory_collection"
last collection started: <REDACTED_TIMESTAMP>
last collection done: <REDACTED_TIMESTAMP>

task: "phantom_disks"
configured period: every 30s
currently executing: no
last completed activation: iter 2, triggered by an explicit signal
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
number of phantom disks deleted: 0
number of phantom disk delete errors: 0

---------------------------------------------
stderr:
note: using Nexus URL http://127.0.0.1:REDACTED_PORT/
Expand Down
6 changes: 5 additions & 1 deletion nexus/db-model/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1301,7 +1301,7 @@ table! {
///
/// This should be updated whenever the schema is changed. For more details,
/// refer to: schema/crdb/README.adoc
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(17, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(18, 0, 0);

allow_tables_to_appear_in_same_query!(
system_update,
Expand Down Expand Up @@ -1370,3 +1370,7 @@ allow_tables_to_appear_in_same_query!(
switch_port,
switch_port_settings_bgp_peer_config
);

allow_tables_to_appear_in_same_query!(disk, virtual_provisioning_resource);

allow_tables_to_appear_in_same_query!(volume, virtual_provisioning_resource);
Loading

0 comments on commit c915eeb

Please sign in to comment.