From 30e756512a40a284efae766110f710fdd5b0e6fb Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 9 May 2023 10:07:30 -0400 Subject: [PATCH 1/6] Actually provision running read-only downstairs! My normal flow for booting an instance is: - create an image - create a disk with that image as a source - attach that disk to an instance, and boot This masked the problem that this PR fixes: the crucible agent does not call `apply_smf` when creating or destroying "running snapshots" (aka read-only downstairs for snapshots), **only** when creating or destroying regions. If a user creates an image, the read-only downstairs are not provisioned. If a user then creates a new disk with that image as a source, `apply_smf` is called as part of creating the region for that new disk, and this will also provision the read-only downstairs. If a user only created a disk from a bulk import (like oxidecomputer/omicron#3034) then the read-only downstairs would not be started. If that disk is then attached to an instance, it will not boot because the Upstairs cannot connect to the non-existent read-only downstairs: May 07 06:23:09.145 INFO [1] connecting to [fd00:1122:3344:102::a]:19001, looper: 1 May 07 06:23:09.146 INFO [0] connecting to [fd00:1122:3344:105::5]:19001, looper: 0 May 07 06:23:09.146 INFO [2] connecting to [fd00:1122:3344:10b::b]:19001, looper: 2 May 07 06:23:19.155 INFO [1] connecting to [fd00:1122:3344:102::a]:19001, looper: 1 May 07 06:23:19.158 INFO [0] connecting to [fd00:1122:3344:105::5]:19001, looper: 0 May 07 06:23:19.158 INFO [2] connecting to [fd00:1122:3344:10b::b]:19001, looper: 2 Fixes oxidecomputer/omicron#3034 Fixes oxidecomputer/crucible#704 --- agent/src/datafile.rs | 167 ++++++++++++++++++++++++++++++++------ agent/src/main.rs | 182 ++++++++++++++++++++++++++++++------------ agent/src/model.rs | 7 ++ 3 files changed, 281 insertions(+), 75 deletions(-) diff --git a/agent/src/datafile.rs b/agent/src/datafile.rs index 673eb3449..563861a27 100644 --- a/agent/src/datafile.rs +++ b/agent/src/datafile.rs @@ -259,7 +259,7 @@ impl DataFile { id: request.id.clone(), name: request.name.clone(), port_number, - state: State::Created, // no work to do, just run smf + state: State::Requested, }; info!( @@ -302,26 +302,26 @@ impl DataFile { let running_snapshots = inner.running_snapshots.get_mut(&request.id).unwrap(); - if running_snapshots.get(&request.name).is_none() { - info!( - self.log, - "not running for region {} snapshot {}, returning Ok", - request.id.0, - request.name - ); - - return Ok(()); - } + match running_snapshots.get_mut(&request.name) { + None => { + info!( + self.log, + "not running for region {} snapshot {}, returning Ok", + request.id.0, + request.name + ); - info!( - self.log, - "removing snapshot {}-{}", request.id.0, request.name - ); + return Ok(()); + } - running_snapshots.remove(&request.name); + Some(request) => { + info!( + self.log, + "removing snapshot {}-{}", request.id.0, request.name + ); - if running_snapshots.is_empty() { - inner.running_snapshots.remove(&request.id); + request.state = State::Tombstoned; + } } /* @@ -435,6 +435,36 @@ impl DataFile { self.store(inner); } + /** + * Mark a particular running snapshot as failed to provision. 
+ */ + pub fn fail_rs(&self, region_id: &RegionId, snapshot_name: &str) { + let mut inner = self.inner.lock().unwrap(); + + let mut rs = inner + .running_snapshots + .get_mut(region_id) + .unwrap() + .get_mut(snapshot_name) + .unwrap(); + + let nstate = State::Failed; + if rs.state == nstate { + return; + } + + info!( + self.log, + "running snapshot {} state: {:?} -> {:?}", + rs.id.0, + rs.state, + nstate, + ); + rs.state = nstate; + + self.store(inner); + } + /** * Mark a particular region as provisioned. */ @@ -465,6 +495,51 @@ impl DataFile { Ok(()) } + /** + * Mark a particular running snapshot as provisioned. + */ + pub fn created_rs( + &self, + region_id: &RegionId, + snapshot_name: &str, + ) -> Result<()> { + let mut inner = self.inner.lock().unwrap(); + + let mut rs = inner + .running_snapshots + .get_mut(region_id) + .unwrap() + .get_mut(snapshot_name) + .unwrap(); + + let nstate = State::Created; + + match &rs.state { + State::Requested => (), + State::Tombstoned => { + /* + * Nexus requested that we destroy this running snapshot before + * we finished provisioning it. + */ + return Ok(()); + } + + x => bail!("created running_snapshot in weird state {:?}", x), + } + + info!( + self.log, + "running snapshot {} state: {:?} -> {:?}", + rs.id.0, + rs.state, + nstate, + ); + rs.state = nstate; + + self.store(inner); + Ok(()) + } + /** * Mark a particular region as destroyed. */ @@ -490,6 +565,40 @@ impl DataFile { Ok(()) } + /** + * Mark a particular running snapshot as destroyed. + */ + pub fn destroyed_rs( + &self, + region_id: &RegionId, + snapshot_name: &str, + ) -> Result<()> { + let mut inner = self.inner.lock().unwrap(); + + let mut rs = inner + .running_snapshots + .get_mut(region_id) + .unwrap() + .get_mut(snapshot_name) + .unwrap(); + + let nstate = State::Destroyed; + + info!( + self.log, + "running snapshot {} state: {:?} -> {:?}", + rs.id.0, + rs.state, + nstate, + ); + rs.state = nstate; + + // XXX shouldn't this remove the record? + + self.store(inner); + Ok(()) + } + /** * Nexus has requested that we destroy this particular region. Returns * true if the region is already destroyed, false if not destroyed @@ -547,11 +656,11 @@ impl DataFile { } /** - * The worker thread will request the first region that is in a - * particular state. If there are no tasks in the provided state, - * we sleep waiting for work to do. + * The worker thread will request the first resource that is in a + * particular state. If there are no resources in the provided state, + * wait on the condition variable. 
*/
-    pub fn first_in_states(&self, states: &[State]) -> Region {
+    pub fn first_in_states(&self, states: &[State]) -> Resource {
         let mut inner = self.inner.lock().unwrap();
 
         loop {
@@ -565,7 +674,19 @@ impl DataFile {
             for s in states {
                 for r in inner.regions.values() {
                     if &r.state == s {
-                        return r.clone();
+                        return Resource::Region(r.clone());
+                    }
+                }
+
+                for (rid, r) in &inner.running_snapshots {
+                    for (name, rs) in r {
+                        if &rs.state == s {
+                            return Resource::RunningSnapshot(
+                                rid.clone(),
+                                name.clone(),
+                                rs.clone(),
+                            );
+                        }
                     }
                 }
             }
diff --git a/agent/src/main.rs b/agent/src/main.rs
index fc515c4a6..ce73ffca2 100644
--- a/agent/src/main.rs
+++ b/agent/src/main.rs
@@ -18,6 +18,7 @@ mod datafile;
 mod model;
 mod server;
 
+use model::Resource;
 use model::State;
 
 #[derive(Debug, Parser)]
@@ -337,8 +338,9 @@ fn apply_smf(
         .iter()
         .flat_map(|(_, n)| {
             n.iter()
-                .map(|(_, s)| {
-                    format!("{}-{}-{}", snapshot_prefix, s.id.0, s.name)
+                .filter(|(_, rs)| rs.state == State::Created)
+                .map(|(_, rs)| {
+                    format!("{}-{}-{}", snapshot_prefix, rs.id.0, rs.name)
                 })
                 .collect::<Vec<String>>()
         })
@@ -786,51 +788,114 @@ fn worker(
      * - create a running snapshot
      * - delete a running snapshot
      *
-     * We use first_in_states to both get the next available Region
-     * and to wait on the condvar when there are no regions available.
-     *
-     * If the region is State::Requested, we create that region then run
-     * the apply_smf().
-     * If the region is State:Tombstoned, we apply_smf() first, then we
-     * finish up destroying the region.
+     * We use first_in_states both to get the next available Resource, an
+     * enum that wraps either a Region or a RunningSnapshot that has changed,
+     * and to wait on the condvar when there is no work to do.
      */
-    let r = df.first_in_states(&[State::Tombstoned, State::Requested]);
+    let work = df.first_in_states(&[State::Tombstoned, State::Requested]);
 
-    match &r.state {
-        State::Requested => {
-            // if regions need to be created, do that before apply_smf.
-            let region_dataset =
-                regions_dataset.ensure_child_dataset(&r.id.0).unwrap();
+    match work {
+        Resource::Region(r) => {
+            /*
+             * If the region is State::Requested, we create that region then run
+             * the apply_smf().
+             *
+             * If the region is State::Tombstoned, we apply_smf() first, then we
+             * finish up destroying the region.
+             */
+            match &r.state {
+                State::Requested => {
+                    // if regions need to be created, do that before apply_smf.
+                    let region_dataset = regions_dataset
+                        .ensure_child_dataset(&r.id.0)
+                        .unwrap();
+
+                    let res = worker_region_create(
+                        &log,
+                        &downstairs_program,
+                        &r,
+                        &region_dataset.path().unwrap(),
+                    )
+                    .and_then(|_| df.created(&r.id));
+
+                    if let Err(e) = res {
+                        error!(
+                            log,
+                            "region {:?} create failed: {:?}", r.id.0, e
+                        );
+                        df.fail(&r.id);
+                    }
 
-            let res = worker_region_create(
-                &log,
-                &downstairs_program,
-                &r,
-                &region_dataset.path().unwrap(),
-            )
-            .and_then(|_| df.created(&r.id));
+                    info!(log, "applying SMF actions post create...");
+                    let result = apply_smf(
+                        &log,
+                        &df,
+                        regions_dataset.path().unwrap(),
+                        &downstairs_prefix,
+                        &snapshot_prefix,
+                    );
+                    if let Err(e) = result {
+                        error!(log, "SMF application failure: {:?}", e);
+                    } else {
+                        info!(log, "SMF ok!");
+                    }
+                }
+                State::Tombstoned => {
+                    info!(log, "applying SMF actions before removal...");
+                    let result = apply_smf(
+                        &log,
+                        &df,
+                        regions_dataset.path().unwrap(),
+                        &downstairs_prefix,
+                        &snapshot_prefix,
+                    );
+
+                    if let Err(e) = result {
+                        error!(log, "SMF application failure: {:?}", e);
+                    } else {
+                        info!(log, "SMF ok!");
+                    }
 
-            if let Err(e) = res {
-                error!(log, "region {:?} create failed: {:?}", r.id.0, e);
-                df.fail(&r.id);
-            }
+                    // After SMF successfully shuts off downstairs, remove zfs
+                    // dataset.
+                    let region_dataset = regions_dataset
+                        .from_child_dataset(&r.id.0)
+                        .unwrap();
 
-            info!(log, "applying SMF actions post create...");
-            let result = apply_smf(
-                &log,
-                &df,
-                regions_dataset.path().unwrap(),
-                &downstairs_prefix,
-                &snapshot_prefix,
-            );
-            if let Err(e) = result {
-                error!(log, "SMF application failure: {:?}", e);
-            } else {
-                info!(log, "SMF ok!");
+                    let res =
+                        worker_region_destroy(&log, &r, region_dataset)
+                            .and_then(|_| df.destroyed(&r.id));
+
+                    if let Err(e) = res {
+                        error!(
+                            log,
+                            "region {:?} destroy failed: {:?}", r.id.0, e
+                        );
+                        df.fail(&r.id);
+                    }
+                }
+                _ => {
+                    eprintln!(
+                        "worker got unexpected region state: {:?}",
+                        r
+                    );
+                    std::process::exit(1);
+                }
             }
         }
-        State::Tombstoned => {
-            info!(log, "applying SMF actions before removal...");
+
+        Resource::RunningSnapshot(region_id, snapshot_name, rs) => {
+            /*
+             * No matter what the state is, we run apply_smf(). Creating and
+             * deleting running snapshots only requires us to create and
+             * delete services. The snapshots are not created by us.
+             */
+            info!(
+                log,
+                "applying SMF actions for running snapshot {} (state {:?})...",
+                rs.id.0,
+                rs.state,
+            );
             let result = apply_smf(
                 &log,
                 &df,
@@ -845,23 +910,36 @@ fn worker(
                 info!(log, "SMF ok!");
             }
 
-            // After SMF successfully shuts off downstairs, remove zfs
-            // dataset.
-            let region_dataset =
-                regions_dataset.from_child_dataset(&r.id.0).unwrap();
+            // `apply_smf` returned Ok, so the desired state transition
+            // succeeded: update the datafile.
+            let res = match &rs.state {
+                State::Requested => {
+                    df.created_rs(&region_id, &snapshot_name)
+                }
 
-            let res = worker_region_destroy(&log, &r, region_dataset)
-                .and_then(|_| df.destroyed(&r.id));
+                State::Tombstoned => {
+                    df.destroyed_rs(&region_id, &snapshot_name)
+                }
+
+                _ => {
+                    eprintln!(
+                        "worker got unexpected running snapshot state: {:?}",
+                        rs,
+                    );
+                    std::process::exit(1);
+                }
+            };
 
             if let Err(e) = res {
-                error!(log, "region {:?} destroy failed: {:?}", r.id.0, e);
-                df.fail(&r.id);
+                error!(
+                    log,
+                    "running snapshot {} state change failed: {:?}",
+                    rs.id.0,
+                    e
+                );
+                df.fail_rs(&region_id, &snapshot_name);
             }
         }
-        _ => {
-            eprintln!("worker got unexpected region state: {:?}", r);
-            std::process::exit(1);
-        }
     }
 }
diff --git a/agent/src/model.rs b/agent/src/model.rs
index 7e0c24bce..fc8d38405 100644
--- a/agent/src/model.rs
+++ b/agent/src/model.rs
@@ -291,6 +291,13 @@ pub struct DeleteSnapshotRequest {
     pub name: String,
 }
 
+// The different types of resources the worker thread monitors for changes.
+// This wraps the object that has been added or changed.
+pub enum Resource {
+    Region(Region),
+    RunningSnapshot(RegionId, String, RunningSnapshot),
+}
+
 #[cfg(test)]
 mod test {
     use super::*;

From 3db67ec855b1b8e39d5e482ab88e51e16100f991 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Tue, 9 May 2023 11:17:49 -0400
Subject: [PATCH 2/6] disable the smf without re-enabling it

---
 agent/src/main.rs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/agent/src/main.rs b/agent/src/main.rs
index ce73ffca2..15f0e8354 100644
--- a/agent/src/main.rs
+++ b/agent/src/main.rs
@@ -538,6 +538,10 @@ fn apply_smf(
 
     for (_, region_snapshots) in running_snapshots.iter_mut() {
         for snapshot in region_snapshots.values_mut() {
+            if snapshot.state != State::Requested {
+                continue;
+            }
+
             let name = format!(
                 "{}-{}-{}",
                 snapshot_prefix, snapshot.id.0, snapshot.name
@@ -889,6 +893,13 @@ fn worker(
              * No matter what the state is, we run apply_smf(). Creating and
              * deleting running snapshots only requires us to create and
              * delete services. The snapshots are not created by us.
+             *
+             * If the running snapshot is Requested, we apply_smf() first,
+             * then set the state to Created. This is a little different
+             * from how Regions are handled.
+             *
+             * If the running snapshot is Tombstoned, we apply_smf() first,
+             * then we set the state to Destroyed.
              */
             info!(
                 log,

From 6f7e1680994ff95d38d1a39543fa383e68a9a681 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Wed, 10 May 2023 10:37:30 -0400
Subject: [PATCH 3/6] provisioned -> created

---
 agent/src/datafile.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/agent/src/datafile.rs b/agent/src/datafile.rs
index 563861a27..bbb7ee0cc 100644
--- a/agent/src/datafile.rs
+++ b/agent/src/datafile.rs
@@ -496,7 +496,7 @@ impl DataFile {
     }
 
     /**
-     * Mark a particular running snapshot as provisioned.
+     * Mark a particular running snapshot as created.
      */
     pub fn created_rs(
         &self,

From 03086e68e2038439a954b92a8f91a00e23d9102e Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Wed, 10 May 2023 10:54:01 -0400
Subject: [PATCH 4/6] remove XXX, add note about sagas

---
 agent/src/datafile.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/agent/src/datafile.rs b/agent/src/datafile.rs
index bbb7ee0cc..765027b28 100644
--- a/agent/src/datafile.rs
+++ b/agent/src/datafile.rs
@@ -541,7 +541,9 @@ impl DataFile {
     }
 
     /**
-     * Mark a particular region as destroyed.
+     * Mark a particular region as destroyed. Do not remove the record: it's
+     * important that calls to the agent be idempotent so they can safely be
+     * used in a saga.
      */
     pub fn destroyed(&self, id: &RegionId) -> Result<()> {
         let mut inner = self.inner.lock().unwrap();
@@ -559,14 +561,14 @@ impl DataFile {
         );
         r.state = nstate;
 
-        // XXX shouldn't this remove the record?
-
         self.store(inner);
         Ok(())
     }
 
     /**
-     * Mark a particular running snapshot as destroyed.
+     * Mark a particular running snapshot as destroyed. Do not remove the
+     * record: it's important that calls to the agent be idempotent so they
+     * can safely be used in a saga.
      */
     pub fn destroyed_rs(
         &self,
@@ -593,8 +595,6 @@ impl DataFile {
         );
         rs.state = nstate;
 
-        // XXX shouldn't this remove the record?
-
         self.store(inner);
         Ok(())
     }

From bc0ba2da3f5135d8649f1e801db2c98f85e22f8e Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Wed, 10 May 2023 10:57:11 -0400
Subject: [PATCH 5/6] move datafile update into "SMF ok!" branch

---
 agent/src/main.rs | 53 +++++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/agent/src/main.rs b/agent/src/main.rs
index 15f0e8354..4ff78688d 100644
--- a/agent/src/main.rs
+++ b/agent/src/main.rs
@@ -844,6 +844,7 @@ fn worker(
                         info!(log, "SMF ok!");
                     }
                 }
+
                 State::Tombstoned => {
                     info!(log, "applying SMF actions before removal...");
                     let result = apply_smf(
@@ -907,6 +908,7 @@ fn worker(
                 rs.id.0,
                 rs.state,
             );
+
             let result = apply_smf(
                 &log,
                 &df,
@@ -919,36 +921,37 @@ fn worker(
                 error!(log, "SMF application failure: {:?}", e);
             } else {
                 info!(log, "SMF ok!");
-            }
 
-            // `apply_smf` returned Ok, so the desired state transition
-            // succeeded: update the datafile.
-            let res = match &rs.state {
-                State::Requested => {
-                    df.created_rs(&region_id, &snapshot_name)
-                }
+                // `apply_smf` returned Ok, so the desired state transition
+                // succeeded: update the datafile.
+                let res = match &rs.state {
+                    State::Requested => {
+                        df.created_rs(&region_id, &snapshot_name)
+                    }
 
-                State::Tombstoned => {
-                    df.destroyed_rs(&region_id, &snapshot_name)
-                }
+                    State::Tombstoned => {
+                        df.destroyed_rs(&region_id, &snapshot_name)
+                    }
 
-                _ => {
-                    eprintln!(
-                        "worker got unexpected running snapshot state: {:?}",
-                        rs,
+                    _ => {
+                        eprintln!(
+                            "worker got unexpected running snapshot state: {:?}",
+                            rs,
+                        );
+                        std::process::exit(1);
+                    }
+                };
+
+                if let Err(e) = res {
+                    error!(
+                        log,
+                        "running snapshot {} state change failed: {:?}",
+                        rs.id.0,
+                        e
                     );
-                    std::process::exit(1);
-                }
-            };
 
-            if let Err(e) = res {
-                error!(
-                    log,
-                    "running snapshot {} state change failed: {:?}",
-                    rs.id.0,
-                    e
-                );
-                df.fail_rs(&region_id, &snapshot_name);
+                    df.fail_rs(&region_id, &snapshot_name);
+                }
             }
         }
     }

From 473cf73454820941c5aa82d27a1fe6189636f560 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Wed, 10 May 2023 10:59:05 -0400
Subject: [PATCH 6/6] eprintln -> error

---
 agent/src/main.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/agent/src/main.rs b/agent/src/main.rs
index 4ff78688d..70b309e8e 100644
--- a/agent/src/main.rs
+++ b/agent/src/main.rs
@@ -880,9 +880,9 @@ fn worker(
                     }
                 }
                 _ => {
-                    eprintln!(
-                        "worker got unexpected region state: {:?}",
-                        r
+                    error!(
+                        log,
+                        "worker got unexpected region state: {:?}", r
                    );
                    std::process::exit(1);
                }
@@ -934,7 +934,8 @@ fn worker(
                 }
 
                 _ => {
-                    eprintln!(
+                    error!(
+                        log,
                         "worker got unexpected running snapshot state: {:?}",
                         rs,
                     );
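
A note for readers following the worker changes above: the loop in worker()
is built on a classic Mutex + Condvar work queue. Below is a minimal,
self-contained Rust sketch of that pattern only. It is not crucible-agent
code: the real DataFile keeps separate `regions` and `running_snapshots`
maps and returns the `Resource` enum added in PATCH 1/6, while this sketch
uses a single map, and the `bell` field and `set_state` helper are
illustrative names chosen for the example.

use std::collections::BTreeMap;
use std::sync::{Arc, Condvar, Mutex};

// Simplified stand-ins for the agent's model types.
#[derive(Clone, Debug, PartialEq)]
enum State {
    Requested,
    Created,
    Tombstoned,
}

#[derive(Clone, Debug)]
struct Region {
    id: String,
    state: State,
}

struct DataFile {
    inner: Mutex<BTreeMap<String, Region>>,
    bell: Condvar,
}

impl DataFile {
    // Return the first resource whose state matches, or sleep on the
    // condvar until a mutating call rings the bell.
    fn first_in_states(&self, states: &[State]) -> Region {
        let mut inner = self.inner.lock().unwrap();
        loop {
            for s in states {
                if let Some(r) = inner.values().find(|r| &r.state == s) {
                    return r.clone();
                }
            }
            // Nothing to do: atomically release the lock and wait.
            inner = self.bell.wait(inner).unwrap();
        }
    }

    // Every state mutation must notify, or the worker sleeps forever.
    fn set_state(&self, id: &str, state: State) {
        let mut inner = self.inner.lock().unwrap();
        inner.insert(id.to_string(), Region { id: id.to_string(), state });
        self.bell.notify_all();
    }
}

fn main() {
    let df = Arc::new(DataFile {
        inner: Mutex::new(BTreeMap::new()),
        bell: Condvar::new(),
    });

    let worker_df = Arc::clone(&df);
    let worker = std::thread::spawn(move || {
        // Block until some resource needs work, like the agent's worker loop.
        let r = worker_df
            .first_in_states(&[State::Tombstoned, State::Requested]);
        println!("worker picked up {} in state {:?}", r.id, r.state);
        worker_df.set_state(&r.id, State::Created);
    });

    // Simulate a request arriving: store the new state, then ring the bell.
    df.set_state("r1", State::Requested);
    worker.join().unwrap();
}

The invariant to keep in mind while reviewing: every state mutation must
notify the condvar, and first_in_states must scan every kind of resource the
worker is expected to service. PATCH 1/6 exists precisely because running
snapshots were never returned to the worker at all.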