Split starting running snapshots into two steps (#3097)
When a saga node fails, its own undo function is not executed; during
unwind, only the undo functions of previously completed nodes run. If a
saga node can partially succeed before failing, its undo therefore has
to be split out into a separate, preceding saga step in order to
properly unwind the resources the node created. This wasn't being done
for running snapshots, so this commit corrects that and adds a test for
it.

Fixes #3085
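
To make the failure semantics concrete, below is a minimal, self-contained sketch of the unwind rule this fix relies on. It is illustrative only: the real executor is the steno saga crate used by Nexus, and the types and names here (Node, run_saga, and so on) are invented for the example. The rule being modeled: an undo runs only for nodes whose action completed successfully, so cleanup for an action that can fail partway must hang off an earlier node whose action cannot fail.

// Hypothetical mini-executor demonstrating saga unwind semantics.
type Action = fn() -> Result<(), String>;
type Undo = fn();

struct Node {
    action: Action,
    undo: Option<Undo>,
}

fn run_saga(nodes: &[Node]) -> Result<(), String> {
    let mut completed: Vec<&Node> = Vec::new();
    for node in nodes {
        match (node.action)() {
            Ok(()) => completed.push(node),
            Err(e) => {
                // Unwind: run the undos of *completed* nodes in reverse
                // order. The failing node's own undo is never invoked,
                // even if its action created resources before erroring.
                for done in completed.iter().rev() {
                    if let Some(undo) = done.undo {
                        undo();
                    }
                }
                return Err(e);
            }
        }
    }
    Ok(())
}

fn main() {
    // The pattern from this commit: a no-op node carries the undo, so
    // the cleanup runs when the fallible node after it fails partway.
    let saga = [
        Node { action: || Ok(()), undo: Some(|| println!("cleanup ran")) },
        Node { action: || Err("partial failure".into()), undo: None },
    ];
    assert!(run_saga(&saga).is_err()); // prints "cleanup ran"
}

Without the no-op node, that is, with the undo attached to the failing node itself, nothing would be printed and the partially created resources would leak, which is what #3085 reported for running snapshots.
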
jmpesp authored May 15, 2023
1 parent 7980ab4 commit b417a6c
Showing 4 changed files with 138 additions and 3 deletions.
nexus/src/app/sagas/disk_create.rs (2 additions, 0 deletions)
@@ -979,6 +979,8 @@ pub(crate) mod test {
         );
         assert!(no_region_allocations_exist(datastore, &test).await);
         assert!(no_regions_ensured(&sled_agent, &test).await);
+
+        assert!(test.crucible_resources_deleted().await);
     }

     #[nexus_test(server = crate::Server)]
nexus/src/app/sagas/snapshot_create.rs (9 additions, 3 deletions)
@@ -181,9 +181,12 @@ declare_saga_actions! {
         + ssc_detach_disk_from_pantry
     }

+    START_RUNNING_SNAPSHOT_UNDO -> "ssc_not_used" {
+        + ssc_noop
+        - ssc_start_running_snapshot_undo
+    }
     START_RUNNING_SNAPSHOT -> "replace_sockets_map" {
         + ssc_start_running_snapshot
-        - ssc_start_running_snapshot_undo
     }
     CREATE_VOLUME_RECORD -> "created_volume" {
         + ssc_create_volume_record
@@ -266,8 +269,7 @@ impl NexusSaga for SagaSnapshotCreate {

         // (Sleds + DB) Start snapshot downstairs, add an entry in the DB for
         // the dataset's snapshot.
-        //
-        // TODO: Should this be two separate saga steps?
+        builder.append(start_running_snapshot_undo_action());
         builder.append(start_running_snapshot_action());
         // (DB) Copy and modify the disk volume construction request to point
         // to the new running snapshot
@@ -289,6 +291,10 @@ impl NexusSaga for SagaSnapshotCreate {

 // snapshot create saga: action implementations

+async fn ssc_noop(_sagactx: NexusActionContext) -> Result<(), ActionError> {
+    Ok(())
+}
+
 async fn ssc_alloc_regions(
     sagactx: NexusActionContext,
 ) -> Result<Vec<(db::model::Dataset, db::model::Region)>, ActionError> {
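
A note on reading the hunks above: the `+`/`-` inside the `declare_saga_actions!` braces are the macro's own markers for a node's action and undo function, not diff markers. After this change the two nodes read as follows. Because steno unwinds in reverse order, and START_RUNNING_SNAPSHOT_UNDO always completes (its action is the new ssc_noop), ssc_start_running_snapshot_undo is guaranteed to run whenever ssc_start_running_snapshot fails after partially succeeding:

START_RUNNING_SNAPSHOT_UNDO -> "ssc_not_used" {
    + ssc_noop
    - ssc_start_running_snapshot_undo
}
START_RUNNING_SNAPSHOT -> "replace_sockets_map" {
    + ssc_start_running_snapshot
}
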
nexus/tests/integration_tests/snapshots.rs (113 additions, 0 deletions)
@@ -45,6 +45,10 @@ fn get_disks_url() -> String {
     format!("/v1/disks?project={}", PROJECT_NAME)
 }

+fn get_disk_url(name: &str) -> String {
+    format!("/v1/disks/{}?project={}", name, PROJECT_NAME)
+}
+
 async fn create_org_and_project(client: &ClientTestContext) -> Uuid {
     let project = create_project(client, PROJECT_NAME).await;
     project.identity.id
@@ -747,6 +751,115 @@ async fn test_cannot_snapshot_if_no_space(cptestctx: &ControlPlaneTestContext) {
         .expect("unexpected success creating snapshot");
 }

+#[nexus_test]
+async fn test_snapshot_unwind(cptestctx: &ControlPlaneTestContext) {
+    let client = &cptestctx.external_client;
+    let disk_test = DiskTest::new(&cptestctx).await;
+    populate_ip_pool(&client, "default", None).await;
+    create_org_and_project(client).await;
+    let disks_url = get_disks_url();
+
+    // Create an image backed by a mock HTTP server
+    let server = ServerBuilder::new().run().unwrap();
+    server.expect(
+        Expectation::matching(request::method_path("HEAD", "/image.raw"))
+            .times(1..)
+            .respond_with(
+                status_code(200).append_header(
+                    "Content-Length",
+                    format!("{}", 4096 * 1000),
+                ),
+            ),
+    );
+
+    let image_create_params = params::ImageCreate {
+        identity: IdentityMetadataCreateParams {
+            name: "alpine-edge".parse().unwrap(),
+            description: String::from(
+                "you can boot any image, as long as it's alpine",
+            ),
+        },
+        source: params::ImageSource::Url {
+            url: server.url("/image.raw").to_string(),
+        },
+        os: "alpine".to_string(),
+        version: "edge".to_string(),
+        block_size: params::BlockSize::try_from(512).unwrap(),
+    };
+
+    let images_url = format!("/v1/images?project={}", PROJECT_NAME);
+    let image =
+        NexusRequest::objects_post(client, &images_url, &image_create_params)
+            .authn_as(AuthnMode::PrivilegedUser)
+            .execute_and_parse_unwrap::<views::Image>()
+            .await;
+
+    // Create a disk from this image
+    let disk_size = ByteCount::from_gibibytes_u32(2);
+    let base_disk_name: Name = "base-disk".parse().unwrap();
+    let base_disk = params::DiskCreate {
+        identity: IdentityMetadataCreateParams {
+            name: base_disk_name.clone(),
+            description: String::from("sells rainsticks"),
+        },
+        disk_source: params::DiskSource::Image { image_id: image.identity.id },
+        size: disk_size,
+    };
+
+    let _base_disk: Disk = NexusRequest::new(
+        RequestBuilder::new(client, Method::POST, &disks_url)
+            .body(Some(&base_disk))
+            .expect_status(Some(StatusCode::CREATED)),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .unwrap()
+    .parsed_body()
+    .unwrap();
+
+    // Set the third zpool's dataset so creating a running snapshot fails
+    let zpool = &disk_test.zpools[2];
+    let dataset = &zpool.datasets[0];
+    disk_test
+        .sled_agent
+        .get_crucible_dataset(zpool.id, dataset.id)
+        .await
+        .set_creating_a_running_snapshot_should_fail()
+        .await;
+
+    // Issue the snapshot request, expecting it to fail
+    let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME);
+
+    NexusRequest::expect_failure_with_body(
+        client,
+        StatusCode::INTERNAL_SERVER_ERROR,
+        Method::POST,
+        &snapshots_url,
+        &params::SnapshotCreate {
+            identity: IdentityMetadataCreateParams {
+                name: "snapshot".parse().unwrap(),
+                description: String::from("a snapshot"),
+            },
+            disk: base_disk_name.clone(),
+        },
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .expect("snapshot request unexpectedly succeeded");
+
+    // Delete the disk
+    NexusRequest::object_delete(client, &get_disk_url(base_disk_name.as_str()))
+        .authn_as(AuthnMode::PrivilegedUser)
+        .execute()
+        .await
+        .expect("failed to delete disk");
+
+    // Assert everything was cleaned up
+    assert!(disk_test.crucible_resources_deleted().await);
+}
+

// Test that the code that Saga nodes call is idempotent

#[nexus_test]
sled-agent/src/sim/storage.rs (14 additions, 0 deletions)
@@ -37,6 +37,7 @@ struct CrucibleDataInner {
     snapshots: HashMap<Uuid, Vec<Snapshot>>,
     running_snapshots: HashMap<Uuid, HashMap<String, RunningSnapshot>>,
     on_create: Option<CreateCallback>,
+    creating_a_running_snapshot_should_fail: bool,
     next_port: u16,
 }

@@ -47,6 +48,7 @@ impl CrucibleDataInner {
             snapshots: HashMap::new(),
             running_snapshots: HashMap::new(),
             on_create: None,
+            creating_a_running_snapshot_should_fail: false,
             next_port: crucible_port,
         }
     }
@@ -175,11 +177,19 @@ impl CrucibleDataInner {
         Ok(())
     }

+    fn set_creating_a_running_snapshot_should_fail(&mut self) {
+        self.creating_a_running_snapshot_should_fail = true;
+    }
+
     fn create_running_snapshot(
         &mut self,
         id: &RegionId,
         name: &str,
     ) -> Result<RunningSnapshot> {
+        if self.creating_a_running_snapshot_should_fail {
+            bail!("failure creating running snapshot");
+        }
+
         let id = Uuid::from_str(&id.0).unwrap();

         let map =
@@ -308,6 +318,10 @@ impl CrucibleData {
         self.inner.lock().await.delete_snapshot(id, name)
     }

+    pub async fn set_creating_a_running_snapshot_should_fail(&self) {
+        self.inner.lock().await.set_creating_a_running_snapshot_should_fail();
+    }
+
     pub async fn create_running_snapshot(
         &self,
         id: &RegionId,
