From 90a97da83170a4a81c7e7ba6274e0c181408bc75 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 15 Sep 2022 17:17:29 -0400 Subject: [PATCH 1/3] Set instance state to failed on bad instance_put If a call to sled-agent's `instance_put` fails with anything other than a 400 (bad request), set instance's state to failed - we cannot know what state the instance is in. This exposed another two problems: - Nexus would only delete network interfaces off instances that were stopped - Nexus would only detach disks off instances that were creating or stopped. This commit changes the relevant network interface queries to allow deletion of network interfaces for instances that are either stopped or failed, and adds Failed to the list of ok_to_detach_instance_states for disks. Note that network interface insertion still requires an instance to be stopped, not failed. Closes https://github.com/oxidecomputer/omicron/issues/1713 --- nexus/src/app/instance.rs | 79 ++++++++++++++-- nexus/src/db/datastore/disk.rs | 1 + nexus/src/db/queries/network_interface.rs | 99 ++++++++++++-------- nexus/tests/integration_tests/instances.rs | 102 ++++++++++++++++++++- sled-agent/src/sim/sled_agent.rs | 7 ++ 5 files changed, 237 insertions(+), 51 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 47f1a68b1d..81b6ddda4e 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -604,25 +604,84 @@ impl super::Nexus { let sa = self.instance_sled(&db_instance).await?; - let new_runtime = sa + let instance_put_result = sa .instance_put( &db_instance.id(), &sled_agent_client::types::InstanceEnsureBody { initial: instance_hardware, - target: requested, + target: requested.clone(), migrate: None, }, ) - .await - .map_err(Error::from)?; + .await; - let new_runtime: nexus::InstanceRuntimeState = - new_runtime.into_inner().into(); + match instance_put_result { + Ok(new_runtime) => { + let new_runtime: nexus::InstanceRuntimeState = + new_runtime.into_inner().into(); - self.db_datastore - .instance_update_runtime(&db_instance.id(), &new_runtime.into()) - .await - .map(|_| ()) + self.db_datastore + .instance_update_runtime( + &db_instance.id(), + &new_runtime.into(), + ) + .await + .map(|_| ()) + } + + Err(e) => { + // The sled-agent has told us that it can't do what we + // requested, but does that mean a failure? One example would be + // if we try to "reboot" a stopped instance. That shouldn't + // transition the instance to failed. But if the sled-agent + // *can't* boot a stopped instance, that should transition + // to failed. + // + // Without a richer error type, let the sled-agent tell Nexus + // what to do with status codes. + error!(self.log, "saw {} from instance_put!", &e,); + + // this is unfortunate, but sled_agent_client::Error doesn't + // implement Copy, and can't be match'ed upon below without this + // line. + let e = e.into(); + + match &e { + // Bad request shouldn't change the instance state. + Error::InvalidRequest { .. } => Err(e), + + // Internal server error (or anything else) should change + // the instance state to failed, we don't know what state + // the instance is in. + _ => { + let new_runtime = db::model::InstanceRuntimeState { + state: db::model::InstanceState::new( + InstanceState::Failed, + ), + gen: db_instance.runtime_state.gen.next().into(), + ..db_instance.runtime_state.clone() + }; + + // XXX what if this fails? + let result = self + .db_datastore + .instance_update_runtime( + &db_instance.id(), + &new_runtime, + ) + .await; + + error!( + self.log, + "saw {:?} from setting InstanceState::Failed after bad instance_put", + result, + ); + + Err(e) + } + } + } + } } /// Lists disks attached to the instance. diff --git a/nexus/src/db/datastore/disk.rs b/nexus/src/db/datastore/disk.rs index 3b7a421d00..e6a6f2d4ee 100644 --- a/nexus/src/db/datastore/disk.rs +++ b/nexus/src/db/datastore/disk.rs @@ -273,6 +273,7 @@ impl DataStore { let ok_to_detach_instance_states = vec![ db::model::InstanceState(api::external::InstanceState::Creating), db::model::InstanceState(api::external::InstanceState::Stopped), + db::model::InstanceState(api::external::InstanceState::Failed), ]; let detached_label = api::external::DiskState::Detached.label(); diff --git a/nexus/src/db/queries/network_interface.rs b/nexus/src/db/queries/network_interface.rs index f44ab6c37d..fdae2f5e84 100644 --- a/nexus/src/db/queries/network_interface.rs +++ b/nexus/src/db/queries/network_interface.rs @@ -33,11 +33,14 @@ use uuid::Uuid; // These are sentinel values and other constants used to verify the state of the // system when operating on network interfaces lazy_static::lazy_static! { - // State an instance must be in to operate on its network interfaces, in + // States an instance must be in to operate on its network interfaces, in // most situations. static ref INSTANCE_STOPPED: db::model::InstanceState = db::model::InstanceState(external::InstanceState::Stopped); + static ref INSTANCE_FAILED: db::model::InstanceState = + db::model::InstanceState(external::InstanceState::Failed); + // An instance can be in the creating state while we manipulate its // interfaces. The intention is for this only to be the case during sagas. static ref INSTANCE_CREATING: db::model::InstanceState = @@ -51,19 +54,19 @@ lazy_static::lazy_static! { static ref NO_INSTANCE_SENTINEL_STRING: String = String::from(NO_INSTANCE_SENTINEL); - static ref INSTANCE_NOT_STOPPED_SENTINEL_STRING: String = - String::from(INSTANCE_NOT_STOPPED_SENTINEL); + static ref INSTANCE_BAD_STATE_SENTINEL_STRING: String = + String::from(INSTANCE_BAD_STATE_SENTINEL); } // Uncastable sentinel used to detect when an instance exists, but is not -// stopped -const INSTANCE_NOT_STOPPED_SENTINEL: &'static str = "not-stopped"; +// in the right state to have its network interfaces altered +const INSTANCE_BAD_STATE_SENTINEL: &'static str = "bad-state"; // Error message generated when we're attempting to operate on an instance, // either inserting or deleting an interface, and that instance exists but is -// not stopped. -const INSTANCE_NOT_STOPPED_ERROR_MESSAGE: &'static str = - "could not parse \"not-stopped\" as type uuid: uuid: incorrect UUID length: not-stopped"; +// in a state we can't work on. +const INSTANCE_BAD_STATE_ERROR_MESSAGE: &'static str = + "could not parse \"bad-state\" as type uuid: uuid: incorrect UUID length: bad-state"; // Uncastable sentinel used to detect when an instance doesn't exist const NO_INSTANCE_SENTINEL: &'static str = "no-instance"; @@ -289,15 +292,15 @@ fn decode_database_error( } // This catches the UUID-cast failure intentionally introduced by - // `push_instance_stopped_verification_subquery`, which verifies that + // `push_instance_state_verification_subquery`, which verifies that // the instance is actually stopped when running this query. PoolError::Connection(ConnectionError::Query( Error::DatabaseError(DatabaseErrorKind::Unknown, ref info), - )) if info.message() == INSTANCE_NOT_STOPPED_ERROR_MESSAGE => { + )) if info.message() == INSTANCE_BAD_STATE_ERROR_MESSAGE => { InsertError::InstanceMustBeStopped(interface.instance_id) } // This catches the UUID-cast failure intentionally introduced by - // `push_instance_stopped_verification_subquery`, which verifies that + // `push_instance_state_verification_subquery`, which verifies that // the instance doesn't even exist when running this query. PoolError::Connection(ConnectionError::Query( Error::DatabaseError(DatabaseErrorKind::Unknown, ref info), @@ -731,13 +734,14 @@ fn push_instance_validation_cte<'a>( out.push_sql(" AS "); out.push_identifier(dsl::subnet_id::NAME)?; - // Push the subquery to ensure the instance is actually stopped when trying - // to insert the new interface. + // Push the subquery to ensure the instance state when trying to insert the + // new interface. out.push_sql(", ("); - push_instance_stopped_verification_subquery( + push_instance_state_verification_subquery( instance_id, instance_id_str, out.reborrow(), + false, )?; // Push the subquery used to select and validate the slot number for the @@ -1116,8 +1120,8 @@ impl QueryFragment for IsPrimaryNic { type InstanceFromClause = FromClause; const INSTANCE_FROM_CLAUSE: InstanceFromClause = InstanceFromClause::new(); -// Subquery used to ensure an instance both exists and is stopped before -// inserting or deleting a network interface. +// Subquery used to ensure an instance both exists and is either stopped (or +// optionally failed) before inserting or deleting a network interface. // // This pushes a subquery like: // @@ -1138,26 +1142,34 @@ const INSTANCE_FROM_CLAUSE: InstanceFromClause = InstanceFromClause::new(); // ) // WHEN 'stopped' THEN '' -- Instance UUID as a string // WHEN 'creating' THEN '' -- Instance UUID as a string +// WHEN 'failed' THEN '' -- Instance UUID as a string // WHEN 'destroyed' THEN 'no-instance' -- Sentinel for an instance not existing -// ELSE 'not-stopped' -- Any other state is invalid for operating on instances +// ELSE 'bad-state' -- Any other state is invalid for operating on instances // END // AS UUID) // ``` // // This uses the familiar cast-fail trick to select the instance's UUID if the -// instance is stopped, or a sentinel of `'running'` if not. It also ensures the -// instance exists at all with the sentinel `'no-instance'`. +// instance is in a state that can be altered, or a sentinel of `'running'` if +// not. It also ensures the instance exists at all with the sentinel +// `'no-instance'`. +// +// 'failed' is conditionally an accepted state: it would not be accepted as part +// of InsertQuery, but it should be as part of DeleteQuery (for example if the +// instance creation saga failed). // -// Note that both 'stopped' and 'creating' are considered valid states. The -// former is used for most situations, especially client-facing, but the latter -// is critical for the instance-creation saga. When we first provision the -// instance, its in the 'creating' state until a sled agent responds telling us -// that the instance has actually been launched. This additional case supports -// adding interfaces during that provisioning process. -fn push_instance_stopped_verification_subquery<'a>( +// Note that 'stopped', 'failed', and 'creating' are considered valid states. +// 'stopped' is used for most situations, especially client-facing, but +// 'creating' is critical for the instance-creation saga. When we first +// provision the instance, its in the 'creating' state until a sled agent +// responds telling us that the instance has actually been launched. This +// additional case supports adding interfaces during that provisioning process. + +fn push_instance_state_verification_subquery<'a>( instance_id: &'a Uuid, instance_id_str: &'a String, mut out: AstPass<'_, 'a, Pg>, + failed_ok: bool, ) -> QueryResult<()> { out.push_sql("CAST(CASE COALESCE((SELECT "); out.push_identifier(db::schema::instance::dsl::state::NAME)?; @@ -1179,6 +1191,13 @@ fn push_instance_stopped_verification_subquery<'a>( out.push_bind_param::(&INSTANCE_CREATING)?; out.push_sql(" THEN "); out.push_bind_param::(instance_id_str)?; + if failed_ok { + // FAILED is ok for DeleteQuery, but not for InsertQuery! + out.push_sql(" WHEN "); + out.push_bind_param::(&INSTANCE_FAILED)?; + out.push_sql(" THEN "); + out.push_bind_param::(instance_id_str)?; + } out.push_sql(" WHEN "); out.push_bind_param::(&INSTANCE_DESTROYED)?; out.push_sql(" THEN "); @@ -1187,7 +1206,7 @@ fn push_instance_stopped_verification_subquery<'a>( )?; out.push_sql(" ELSE "); out.push_bind_param::( - &INSTANCE_NOT_STOPPED_SENTINEL_STRING, + &INSTANCE_BAD_STATE_SENTINEL_STRING, )?; out.push_sql(" END AS UUID)"); Ok(()) @@ -1239,9 +1258,10 @@ fn push_instance_stopped_verification_subquery<'a>( /// time_deleted IS NULL /// )), 'destroyed') /// WHEN 'stopped' THEN '' -/// WHEN 'creating' the '' +/// WHEN 'creating' THEN '' +/// WHEN 'failed' THEN '' /// WHEN 'destroyed' THEN 'no-instance' -/// ELSE 'not-stopped' +/// ELSE 'bad-state' /// ) AS UUID) /// ) /// UPDATE @@ -1287,10 +1307,11 @@ impl QueryFragment for DeleteQuery { mut out: AstPass<'_, 'a, Pg>, ) -> diesel::QueryResult<()> { out.push_sql("WITH instance AS MATERIALIZED (SELECT "); - push_instance_stopped_verification_subquery( + push_instance_state_verification_subquery( &self.instance_id, &self.instance_id_str, out.reborrow(), + true, )?; out.push_sql( "), interface AS MATERIALIZED (SELECT CAST(IF((SELECT NOT ", @@ -1341,8 +1362,8 @@ pub enum DeleteError { /// Attempting to delete the primary interface, while there still exist /// secondary interfaces. InstanceHasSecondaryInterfaces(Uuid), - /// Instance must be stopped prior to deleting interfaces from it - InstanceMustBeStopped(Uuid), + /// Instance must be stopped or failed prior to deleting interfaces from it + InstanceBadState(Uuid), /// The instance does not exist at all, or is in the destroyed state. InstanceNotFound(Uuid), /// Any other error @@ -1393,9 +1414,9 @@ impl DeleteError { are still attached", ) } - DeleteError::InstanceMustBeStopped(_) => { + DeleteError::InstanceBadState(_) => { external::Error::invalid_request( - "Instance must be stopped to detach a network interface", + "Instance must be stopped or failed to detach a network interface", ) } DeleteError::InstanceNotFound(id) => { @@ -1444,15 +1465,15 @@ fn decode_delete_network_interface_database_error( } // This catches the UUID-cast failure intentionally introduced by - // `push_instance_stopped_verification_subquery`, which verifies that - // the instance is actually stopped when running this query. + // `push_instance_state_verification_subquery`, which verifies that + // the instance can be worked on when running this query. PoolError::Connection(ConnectionError::Query( Error::DatabaseError(DatabaseErrorKind::Unknown, ref info), - )) if info.message() == INSTANCE_NOT_STOPPED_ERROR_MESSAGE => { - DeleteError::InstanceMustBeStopped(instance_id) + )) if info.message() == INSTANCE_BAD_STATE_ERROR_MESSAGE => { + DeleteError::InstanceBadState(instance_id) } // This catches the UUID-cast failure intentionally introduced by - // `push_instance_stopped_verification_subquery`, which verifies that + // `push_instance_state_verification_subquery`, which verifies that // the instance doesn't even exist when running this query. PoolError::Connection(ConnectionError::Query( Error::DatabaseError(DatabaseErrorKind::Unknown, ref info), diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 1a0afe2014..edba7b4ef4 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -287,8 +287,9 @@ async fn test_instances_create_reboot_halt( instances_eq(&instance, &instance_next); let instance_next = instance_get(&client, &instance_url).await; instances_eq(&instance, &instance_next); + assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); - // Attempt to reboot the halted instance. This should fail. + // Attempt to reboot the halted instance. This should fail. let _error: HttpErrorResponseBody = NexusRequest::expect_failure( client, StatusCode::BAD_REQUEST, @@ -306,6 +307,10 @@ async fn test_instances_create_reboot_halt( // client, and expressing that as a rich error type. // assert_eq!(error.message, "cannot reboot instance in state \"stopped\""); + // State should still be stopped. + let instance = instance_get(&client, &instance_url).await; + assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + // Start the instance. While it's starting, issue a reboot. This should // succeed, having stopped in between. let instance = instance_next; @@ -1162,7 +1167,7 @@ async fn test_instance_create_delete_network_interface( .expect("Failed to parse error response body"); assert_eq!( err.message, - "Instance must be stopped to detach a network interface", + "Instance must be stopped or failed to detach a network interface", "Expected an InvalidRequest response when detaching an interface from a running instance" ); } @@ -1798,6 +1803,99 @@ async fn test_attach_one_disk_to_instance(cptestctx: &ControlPlaneTestContext) { assert_eq!(disks[0].state, DiskState::Attached(instance.identity.id)); } +#[nexus_test] +async fn test_instance_fails_to_boot_with_disk( + cptestctx: &ControlPlaneTestContext, +) { + // Test that the saga correctly unwinds if the sled_agent's instance_put fails + // see: https://github.com/oxidecomputer/omicron/issues/1713 + let client = &cptestctx.external_client; + + const POOL_NAME: &str = "p0"; + const ORGANIZATION_NAME: &str = "bobs-barrel-of-bytes"; + const PROJECT_NAME: &str = "bit-barrel"; + + // Test pre-reqs + DiskTest::new(&cptestctx).await; + create_ip_pool(&client, POOL_NAME, None, None).await; + create_organization(&client, ORGANIZATION_NAME).await; + create_project(client, ORGANIZATION_NAME, PROJECT_NAME).await; + + // Create the "probablydata" disk + create_disk(&client, ORGANIZATION_NAME, PROJECT_NAME, "probablydata").await; + + // Verify disk is there and currently detached + let url_project_disks = format!( + "/organizations/{}/projects/{}/disks", + ORGANIZATION_NAME, PROJECT_NAME, + ); + let disks: Vec = NexusRequest::iter_collection_authn( + client, + &url_project_disks, + "", + None, + ) + .await + .expect("failed to list disks") + .all_items; + assert_eq!(disks.len(), 1); + assert_eq!(disks[0].state, DiskState::Detached); + + // Create the instance + let instance_params = params::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: Name::try_from(String::from("nfs")).unwrap(), + description: String::from("probably serving data"), + }, + ncpus: InstanceCpuCount::try_from(32).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), + hostname: String::from("nfs"), + user_data: vec![], + network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, + external_ips: vec![], + disks: vec![params::InstanceDiskAttachment::Attach( + params::InstanceDiskAttach { + name: Name::try_from(String::from("probablydata")).unwrap(), + }, + )], + start: true, + }; + + let url_instances = format!( + "/organizations/{}/projects/{}/instances", + ORGANIZATION_NAME, PROJECT_NAME + ); + + let builder = + RequestBuilder::new(client, http::Method::POST, &url_instances) + .body(Some(&instance_params)) + .expect_status(Some(http::StatusCode::INTERNAL_SERVER_ERROR)); + + NexusRequest::new(builder) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Expected instance creation to fail!"); + + // Verify disk is not attached to the instance + let url_instance_disks = format!( + "/organizations/{}/projects/{}/disks", + ORGANIZATION_NAME, PROJECT_NAME, + ); + let disks: Vec = NexusRequest::iter_collection_authn( + client, + &url_instance_disks, + "", + None, + ) + .await + .expect("failed to list disks") + .all_items; + + assert_eq!(disks.len(), 1); + assert_eq!(disks[0].state, DiskState::Detached); +} + // Test that 8 disks is supported #[nexus_test] async fn test_attach_eight_disks_to_instance( diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index b0ca1e7392..8874dd35ce 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -194,6 +194,13 @@ impl SledAgent { initial_hardware: InstanceHardware, target: InstanceRuntimeStateRequested, ) -> Result { + let ncpus: i64 = (&initial_hardware.runtime.ncpus).into(); + if ncpus > 16 { + return Err(Error::internal_error( + &"instances with more than 16 CPUs not supported!", + )); + }; + for disk in &initial_hardware.disks { let initial_state = DiskRuntimeState { disk_state: omicron_common::api::external::DiskState::Attached( From 6e85d66e5fc10d911fcbb55f432bf854e6adc6e5 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 16 Sep 2022 17:11:04 -0400 Subject: [PATCH 2/3] remove unnecessary borrow and comma --- nexus/src/app/instance.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 81b6ddda4e..ed2417d8a3 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -639,7 +639,7 @@ impl super::Nexus { // // Without a richer error type, let the sled-agent tell Nexus // what to do with status codes. - error!(self.log, "saw {} from instance_put!", &e,); + error!(self.log, "saw {} from instance_put!", e); // this is unfortunate, but sled_agent_client::Error doesn't // implement Copy, and can't be match'ed upon below without this From be9f7a37ce7a43199195d3c334e19885c09bc483 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 16 Sep 2022 17:27:45 -0400 Subject: [PATCH 3/3] add comments explaining why a 500 is generated in simulated sled agent --- nexus/tests/integration_tests/instances.rs | 5 +++++ sled-agent/src/sim/sled_agent.rs | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index edba7b4ef4..170e03cfe1 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1847,6 +1847,11 @@ async fn test_instance_fails_to_boot_with_disk( name: Name::try_from(String::from("nfs")).unwrap(), description: String::from("probably serving data"), }, + // there's a specific line in the simulated sled agent that will return + // a 500 if you try to allocate an instance with more than 16 CPUs. a + // 500 error is required to exercise the undo nodes of the instance + // create saga (where provision fails, instead of just responding with a + // bad request). ncpus: InstanceCpuCount::try_from(32).unwrap(), memory: ByteCount::from_gibibytes_u32(4), hostname: String::from("nfs"), diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 8874dd35ce..b05b2c39eb 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -194,10 +194,12 @@ impl SledAgent { initial_hardware: InstanceHardware, target: InstanceRuntimeStateRequested, ) -> Result { + // respond with a fake 500 level failure if asked to ensure an instance + // with more than 16 CPUs. let ncpus: i64 = (&initial_hardware.runtime.ncpus).into(); if ncpus > 16 { return Err(Error::internal_error( - &"instances with more than 16 CPUs not supported!", + &"could not allocate an instance: ran out of CPUs!", )); };