Skip to content

Commit

Permalink
Actually provision read-only downstairs! (#728)
Browse files Browse the repository at this point in the history
My normal flow for booting an instance is:

- create an image
- create a disk with that image as a source
- attach that disk to an instance, and boot

This masked the problem that this PR fixes: the crucible agent does not
call `apply_smf` when creating or destroying "running snapshots" (aka
read-only downstairs for snapshots), **only** when creating or
destroying regions.

If a user creates an image, the read-only downstairs are not
provisioned. If a user then creates a new disk with that image as a
source, `apply_smf` is called as part of creating the region for that
new disk, and this will also provision the read-only downstairs. If a
user only created a disk from a bulk import (like
oxidecomputer/omicron#3034) then the read-only downstairs would not be
started. If that disk is then attached to an instance, it will not boot
because the Upstairs cannot connect to the non-existent read-only
downstairs:

    May 07 06:23:09.145 INFO [1] connecting to [fd00:1122:3344:102::a]:19001, looper: 1
    May 07 06:23:09.146 INFO [0] connecting to [fd00:1122:3344:105::5]:19001, looper: 0
    May 07 06:23:09.146 INFO [2] connecting to [fd00:1122:3344:10b::b]:19001, looper: 2
    May 07 06:23:19.155 INFO [1] connecting to [fd00:1122:3344:102::a]:19001, looper: 1
    May 07 06:23:19.158 INFO [0] connecting to [fd00:1122:3344:105::5]:19001, looper: 0
    May 07 06:23:19.158 INFO [2] connecting to [fd00:1122:3344:10b::b]:19001, looper: 2

Fixes oxidecomputer/omicron#3034
Fixes #704
  • Loading branch information
jmpesp authored May 10, 2023
1 parent 92ae012 commit 9f69dea
Show file tree
Hide file tree
Showing 3 changed files with 300 additions and 79 deletions.
171 changes: 146 additions & 25 deletions agent/src/datafile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ impl DataFile {
id: request.id.clone(),
name: request.name.clone(),
port_number,
state: State::Created, // no work to do, just run smf
state: State::Requested,
};

info!(
Expand Down Expand Up @@ -302,26 +302,26 @@ impl DataFile {
let running_snapshots =
inner.running_snapshots.get_mut(&request.id).unwrap();

if running_snapshots.get(&request.name).is_none() {
info!(
self.log,
"not running for region {} snapshot {}, returning Ok",
request.id.0,
request.name
);

return Ok(());
}
match running_snapshots.get_mut(&request.name) {
None => {
info!(
self.log,
"not running for region {} snapshot {}, returning Ok",
request.id.0,
request.name
);

info!(
self.log,
"removing snapshot {}-{}", request.id.0, request.name
);
return Ok(());
}

running_snapshots.remove(&request.name);
Some(request) => {
info!(
self.log,
"removing snapshot {}-{}", request.id.0, request.name
);

if running_snapshots.is_empty() {
inner.running_snapshots.remove(&request.id);
request.state = State::Tombstoned;
}
}

/*
Expand Down Expand Up @@ -435,6 +435,36 @@ impl DataFile {
self.store(inner);
}

/**
 * Mark a particular running snapshot as failed to provision.
 *
 * The transition to `State::Failed` is idempotent: if the snapshot is
 * already failed we return without logging or rewriting the datafile.
 *
 * # Panics
 *
 * Panics if no running snapshot with this region id and snapshot name
 * exists — callers must only fail a snapshot they previously requested.
 */
pub fn fail_rs(&self, region_id: &RegionId, snapshot_name: &str) {
    let mut inner = self.inner.lock().unwrap();

    // `rs` is a `&mut` reference already; the binding itself does not
    // need to be `mut` to assign through it.
    let rs = inner
        .running_snapshots
        .get_mut(region_id)
        .unwrap()
        .get_mut(snapshot_name)
        .unwrap();

    let nstate = State::Failed;
    if rs.state == nstate {
        // Already failed: nothing to change or persist.
        return;
    }

    info!(
        self.log,
        "running snapshot {} state: {:?} -> {:?}",
        rs.id.0,
        rs.state,
        nstate,
    );
    rs.state = nstate;

    // Persist the state transition to disk.
    self.store(inner);
}

/**
* Mark a particular region as provisioned.
*/
Expand Down Expand Up @@ -466,7 +496,54 @@ impl DataFile {
}

/**
 * Mark a particular running snapshot as created (provisioned).
 *
 * Only a snapshot in `State::Requested` moves to `State::Created`. A
 * `State::Tombstoned` snapshot is left alone: Nexus requested that we
 * destroy it before provisioning finished, so the destroy path owns
 * its state from here. Any other state is a bug and returns an error.
 *
 * # Panics
 *
 * Panics if no running snapshot with this region id and snapshot name
 * exists.
 */
pub fn created_rs(
    &self,
    region_id: &RegionId,
    snapshot_name: &str,
) -> Result<()> {
    let mut inner = self.inner.lock().unwrap();

    // `rs` is a `&mut` reference already; the binding itself does not
    // need to be `mut` to assign through it.
    let rs = inner
        .running_snapshots
        .get_mut(region_id)
        .unwrap()
        .get_mut(snapshot_name)
        .unwrap();

    let nstate = State::Created;

    match &rs.state {
        State::Requested => (),
        State::Tombstoned => {
            /*
             * Nexus requested that we destroy this running snapshot before
             * we finished provisioning it.
             */
            return Ok(());
        }

        x => bail!("created running_snapshot in weird state {:?}", x),
    }

    info!(
        self.log,
        "running snapshot {} state: {:?} -> {:?}",
        rs.id.0,
        rs.state,
        nstate,
    );
    rs.state = nstate;

    // Persist the state transition to disk.
    self.store(inner);
    Ok(())
}

/**
* Mark a particular region as destroyed. Do not remove the record: it's
* important for calls to the agent to be idempotent in order to safely be
* used in a saga.
*/
pub fn destroyed(&self, id: &RegionId) -> Result<()> {
let mut inner = self.inner.lock().unwrap();
Expand All @@ -484,7 +561,39 @@ impl DataFile {
);
r.state = nstate;

// XXX shouldn't this remove the record?
self.store(inner);
Ok(())
}

/**
* Mark a particular running snapshot as destroyed. Do not remove the
* record: it's important for calls to the agent to be idempotent in order
* to safely be used in a saga.
*/
pub fn destroyed_rs(
&self,
region_id: &RegionId,
snapshot_name: &str,
) -> Result<()> {
let mut inner = self.inner.lock().unwrap();

let mut rs = inner
.running_snapshots
.get_mut(region_id)
.unwrap()
.get_mut(snapshot_name)
.unwrap();

let nstate = State::Destroyed;

info!(
self.log,
"running snapshot {} state: {:?} -> {:?}",
rs.id.0,
rs.state,
nstate,
);
rs.state = nstate;

self.store(inner);
Ok(())
Expand Down Expand Up @@ -547,11 +656,11 @@ impl DataFile {
}

/**
* The worker thread will request the first region that is in a
* particular state. If there are no tasks in the provided state,
* we sleep waiting for work to do.
* The worker thread will request the first resource that is in a
* particular state. If there are no resources in the provided state,
* wait on the condition variable.
*/
pub fn first_in_states(&self, states: &[State]) -> Region {
pub fn first_in_states(&self, states: &[State]) -> Resource {
let mut inner = self.inner.lock().unwrap();

loop {
Expand All @@ -565,7 +674,19 @@ impl DataFile {
for s in states {
for r in inner.regions.values() {
if &r.state == s {
return r.clone();
return Resource::Region(r.clone());
}
}

for (rid, r) in &inner.running_snapshots {
for (name, rs) in r {
if &rs.state == s {
return Resource::RunningSnapshot(
rid.clone(),
name.clone(),
rs.clone(),
);
}
}
}
}
Expand Down
Loading

0 comments on commit 9f69dea

Please sign in to comment.