diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 47f1a68b1d..ed2417d8a3 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -604,25 +604,84 @@ impl super::Nexus { let sa = self.instance_sled(&db_instance).await?; - let new_runtime = sa + let instance_put_result = sa .instance_put( &db_instance.id(), &sled_agent_client::types::InstanceEnsureBody { initial: instance_hardware, - target: requested, + target: requested.clone(), migrate: None, }, ) - .await - .map_err(Error::from)?; + .await; - let new_runtime: nexus::InstanceRuntimeState = - new_runtime.into_inner().into(); + match instance_put_result { + Ok(new_runtime) => { + let new_runtime: nexus::InstanceRuntimeState = + new_runtime.into_inner().into(); - self.db_datastore - .instance_update_runtime(&db_instance.id(), &new_runtime.into()) - .await - .map(|_| ()) + self.db_datastore + .instance_update_runtime( + &db_instance.id(), + &new_runtime.into(), + ) + .await + .map(|_| ()) + } + + Err(e) => { + // The sled-agent has told us that it can't do what we + // requested, but does that mean a failure? One example would be + // if we try to "reboot" a stopped instance. That shouldn't + // transition the instance to failed. But if the sled-agent + // *can't* boot a stopped instance, that should transition + // to failed. + // + // Without a richer error type, let the sled-agent tell Nexus + // what to do with status codes. + error!(self.log, "saw {} from instance_put!", e); + + // this is unfortunate, but sled_agent_client::Error doesn't + // implement Copy, and can't be match'ed upon below without this + // line. + let e = e.into(); + + match &e { + // Bad request shouldn't change the instance state. + Error::InvalidRequest { .. } => Err(e), + + // Internal server error (or anything else) should change + // the instance state to failed, we don't know what state + // the instance is in. 
+ _ => { + let new_runtime = db::model::InstanceRuntimeState { + state: db::model::InstanceState::new( + InstanceState::Failed, + ), + gen: db_instance.runtime_state.gen.next().into(), + ..db_instance.runtime_state.clone() + }; + + // XXX what if this fails? + let result = self + .db_datastore + .instance_update_runtime( + &db_instance.id(), + &new_runtime, + ) + .await; + + error!( + self.log, + "saw {:?} from setting InstanceState::Failed after bad instance_put", + result, + ); + + Err(e) + } + } + } + } } /// Lists disks attached to the instance. diff --git a/nexus/src/db/datastore/disk.rs b/nexus/src/db/datastore/disk.rs index 3b7a421d00..e6a6f2d4ee 100644 --- a/nexus/src/db/datastore/disk.rs +++ b/nexus/src/db/datastore/disk.rs @@ -273,6 +273,7 @@ impl DataStore { let ok_to_detach_instance_states = vec![ db::model::InstanceState(api::external::InstanceState::Creating), db::model::InstanceState(api::external::InstanceState::Stopped), + db::model::InstanceState(api::external::InstanceState::Failed), ]; let detached_label = api::external::DiskState::Detached.label(); diff --git a/nexus/src/db/queries/network_interface.rs b/nexus/src/db/queries/network_interface.rs index f44ab6c37d..fdae2f5e84 100644 --- a/nexus/src/db/queries/network_interface.rs +++ b/nexus/src/db/queries/network_interface.rs @@ -33,11 +33,14 @@ use uuid::Uuid; // These are sentinel values and other constants used to verify the state of the // system when operating on network interfaces lazy_static::lazy_static! { - // State an instance must be in to operate on its network interfaces, in + // States an instance must be in to operate on its network interfaces, in // most situations. static ref INSTANCE_STOPPED: db::model::InstanceState = db::model::InstanceState(external::InstanceState::Stopped); + static ref INSTANCE_FAILED: db::model::InstanceState = + db::model::InstanceState(external::InstanceState::Failed); + // An instance can be in the creating state while we manipulate its // interfaces. 
The intention is for this only to be the case during sagas. static ref INSTANCE_CREATING: db::model::InstanceState = @@ -51,19 +54,19 @@ lazy_static::lazy_static! { static ref NO_INSTANCE_SENTINEL_STRING: String = String::from(NO_INSTANCE_SENTINEL); - static ref INSTANCE_NOT_STOPPED_SENTINEL_STRING: String = - String::from(INSTANCE_NOT_STOPPED_SENTINEL); + static ref INSTANCE_BAD_STATE_SENTINEL_STRING: String = + String::from(INSTANCE_BAD_STATE_SENTINEL); } // Uncastable sentinel used to detect when an instance exists, but is not -// stopped -const INSTANCE_NOT_STOPPED_SENTINEL: &'static str = "not-stopped"; +// in the right state to have its network interfaces altered +const INSTANCE_BAD_STATE_SENTINEL: &'static str = "bad-state"; // Error message generated when we're attempting to operate on an instance, // either inserting or deleting an interface, and that instance exists but is -// not stopped. -const INSTANCE_NOT_STOPPED_ERROR_MESSAGE: &'static str = - "could not parse \"not-stopped\" as type uuid: uuid: incorrect UUID length: not-stopped"; +// in a state we can't work on. +const INSTANCE_BAD_STATE_ERROR_MESSAGE: &'static str = + "could not parse \"bad-state\" as type uuid: uuid: incorrect UUID length: bad-state"; // Uncastable sentinel used to detect when an instance doesn't exist const NO_INSTANCE_SENTINEL: &'static str = "no-instance"; @@ -289,15 +292,15 @@ fn decode_database_error( } // This catches the UUID-cast failure intentionally introduced by - // `push_instance_stopped_verification_subquery`, which verifies that + // `push_instance_state_verification_subquery`, which verifies that // the instance is actually stopped when running this query. 
PoolError::Connection(ConnectionError::Query( Error::DatabaseError(DatabaseErrorKind::Unknown, ref info), - )) if info.message() == INSTANCE_NOT_STOPPED_ERROR_MESSAGE => { + )) if info.message() == INSTANCE_BAD_STATE_ERROR_MESSAGE => { InsertError::InstanceMustBeStopped(interface.instance_id) } // This catches the UUID-cast failure intentionally introduced by - // `push_instance_stopped_verification_subquery`, which verifies that + // `push_instance_state_verification_subquery`, which verifies that // the instance doesn't even exist when running this query. PoolError::Connection(ConnectionError::Query( Error::DatabaseError(DatabaseErrorKind::Unknown, ref info), @@ -731,13 +734,14 @@ fn push_instance_validation_cte<'a>( out.push_sql(" AS "); out.push_identifier(dsl::subnet_id::NAME)?; - // Push the subquery to ensure the instance is actually stopped when trying - // to insert the new interface. + // Push the subquery to ensure the instance state when trying to insert the + // new interface. out.push_sql(", ("); - push_instance_stopped_verification_subquery( + push_instance_state_verification_subquery( instance_id, instance_id_str, out.reborrow(), + false, )?; // Push the subquery used to select and validate the slot number for the @@ -1116,8 +1120,8 @@ impl QueryFragment for IsPrimaryNic { type InstanceFromClause = FromClause; const INSTANCE_FROM_CLAUSE: InstanceFromClause = InstanceFromClause::new(); -// Subquery used to ensure an instance both exists and is stopped before -// inserting or deleting a network interface. +// Subquery used to ensure an instance both exists and is either stopped (or +// optionally failed) before inserting or deleting a network interface. 
// // This pushes a subquery like: // // @@ -1138,26 +1142,34 @@ const INSTANCE_FROM_CLAUSE: InstanceFromClause = InstanceFromClause::new(); // ) // WHEN 'stopped' THEN '' -- Instance UUID as a string // WHEN 'creating' THEN '' -- Instance UUID as a string +// WHEN 'failed' THEN '' -- Instance UUID as a string // WHEN 'destroyed' THEN 'no-instance' -- Sentinel for an instance not existing -// ELSE 'not-stopped' -- Any other state is invalid for operating on instances +// ELSE 'bad-state' -- Any other state is invalid for operating on instances // END // AS UUID) // ``` // // This uses the familiar cast-fail trick to select the instance's UUID if the -// instance is stopped, or a sentinel of `'running'` if not. It also ensures the -// instance exists at all with the sentinel `'no-instance'`. +// instance is in a state that can be altered, or a sentinel of `'bad-state'` if +// not. It also ensures the instance exists at all with the sentinel +// `'no-instance'`. +// +// 'failed' is conditionally an accepted state: it would not be accepted as part +// of InsertQuery, but it should be as part of DeleteQuery (for example if the +// instance creation saga failed). // -// Note that both 'stopped' and 'creating' are considered valid states. The -// former is used for most situations, especially client-facing, but the latter -// is critical for the instance-creation saga. When we first provision the -// instance, its in the 'creating' state until a sled agent responds telling us -// that the instance has actually been launched. This additional case supports -// adding interfaces during that provisioning process. -fn push_instance_stopped_verification_subquery<'a>( +// Note that 'stopped', 'failed', and 'creating' are considered valid states. +// 'stopped' is used for most situations, especially client-facing, but +// 'creating' is critical for the instance-creation saga.
When we first +// provision the instance, its in the 'creating' state until a sled agent +// responds telling us that the instance has actually been launched. This +// additional case supports adding interfaces during that provisioning process. + +fn push_instance_state_verification_subquery<'a>( instance_id: &'a Uuid, instance_id_str: &'a String, mut out: AstPass<'_, 'a, Pg>, + failed_ok: bool, ) -> QueryResult<()> { out.push_sql("CAST(CASE COALESCE((SELECT "); out.push_identifier(db::schema::instance::dsl::state::NAME)?; @@ -1179,6 +1191,13 @@ fn push_instance_stopped_verification_subquery<'a>( out.push_bind_param::(&INSTANCE_CREATING)?; out.push_sql(" THEN "); out.push_bind_param::(instance_id_str)?; + if failed_ok { + // FAILED is ok for DeleteQuery, but not for InsertQuery! + out.push_sql(" WHEN "); + out.push_bind_param::(&INSTANCE_FAILED)?; + out.push_sql(" THEN "); + out.push_bind_param::(instance_id_str)?; + } out.push_sql(" WHEN "); out.push_bind_param::(&INSTANCE_DESTROYED)?; out.push_sql(" THEN "); @@ -1187,7 +1206,7 @@ fn push_instance_stopped_verification_subquery<'a>( )?; out.push_sql(" ELSE "); out.push_bind_param::( - &INSTANCE_NOT_STOPPED_SENTINEL_STRING, + &INSTANCE_BAD_STATE_SENTINEL_STRING, )?; out.push_sql(" END AS UUID)"); Ok(()) @@ -1239,9 +1258,10 @@ fn push_instance_stopped_verification_subquery<'a>( /// time_deleted IS NULL /// )), 'destroyed') /// WHEN 'stopped' THEN '' -/// WHEN 'creating' the '' +/// WHEN 'creating' THEN '' +/// WHEN 'failed' THEN '' /// WHEN 'destroyed' THEN 'no-instance' -/// ELSE 'not-stopped' +/// ELSE 'bad-state' /// ) AS UUID) /// ) /// UPDATE @@ -1287,10 +1307,11 @@ impl QueryFragment for DeleteQuery { mut out: AstPass<'_, 'a, Pg>, ) -> diesel::QueryResult<()> { out.push_sql("WITH instance AS MATERIALIZED (SELECT "); - push_instance_stopped_verification_subquery( + push_instance_state_verification_subquery( &self.instance_id, &self.instance_id_str, out.reborrow(), + true, )?; out.push_sql( "), interface AS 
MATERIALIZED (SELECT CAST(IF((SELECT NOT ", @@ -1341,8 +1362,8 @@ pub enum DeleteError { /// Attempting to delete the primary interface, while there still exist /// secondary interfaces. InstanceHasSecondaryInterfaces(Uuid), - /// Instance must be stopped prior to deleting interfaces from it - InstanceMustBeStopped(Uuid), + /// Instance must be stopped or failed prior to deleting interfaces from it + InstanceBadState(Uuid), /// The instance does not exist at all, or is in the destroyed state. InstanceNotFound(Uuid), /// Any other error @@ -1393,9 +1414,9 @@ impl DeleteError { are still attached", ) } - DeleteError::InstanceMustBeStopped(_) => { + DeleteError::InstanceBadState(_) => { external::Error::invalid_request( - "Instance must be stopped to detach a network interface", + "Instance must be stopped or failed to detach a network interface", ) } DeleteError::InstanceNotFound(id) => { @@ -1444,15 +1465,15 @@ fn decode_delete_network_interface_database_error( } // This catches the UUID-cast failure intentionally introduced by - // `push_instance_stopped_verification_subquery`, which verifies that - // the instance is actually stopped when running this query. + // `push_instance_state_verification_subquery`, which verifies that + // the instance can be worked on when running this query. PoolError::Connection(ConnectionError::Query( Error::DatabaseError(DatabaseErrorKind::Unknown, ref info), - )) if info.message() == INSTANCE_NOT_STOPPED_ERROR_MESSAGE => { - DeleteError::InstanceMustBeStopped(instance_id) + )) if info.message() == INSTANCE_BAD_STATE_ERROR_MESSAGE => { + DeleteError::InstanceBadState(instance_id) } // This catches the UUID-cast failure intentionally introduced by - // `push_instance_stopped_verification_subquery`, which verifies that + // `push_instance_state_verification_subquery`, which verifies that // the instance doesn't even exist when running this query. 
PoolError::Connection(ConnectionError::Query( Error::DatabaseError(DatabaseErrorKind::Unknown, ref info), diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 1a0afe2014..170e03cfe1 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -287,8 +287,9 @@ async fn test_instances_create_reboot_halt( instances_eq(&instance, &instance_next); let instance_next = instance_get(&client, &instance_url).await; instances_eq(&instance, &instance_next); + assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); - // Attempt to reboot the halted instance. This should fail. + // Attempt to reboot the halted instance. This should fail. let _error: HttpErrorResponseBody = NexusRequest::expect_failure( client, StatusCode::BAD_REQUEST, @@ -306,6 +307,10 @@ async fn test_instances_create_reboot_halt( // client, and expressing that as a rich error type. // assert_eq!(error.message, "cannot reboot instance in state \"stopped\""); + // State should still be stopped. + let instance = instance_get(&client, &instance_url).await; + assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + // Start the instance. While it's starting, issue a reboot. This should // succeed, having stopped in between. 
let instance = instance_next; @@ -1162,7 +1167,7 @@ async fn test_instance_create_delete_network_interface( .expect("Failed to parse error response body"); assert_eq!( err.message, - "Instance must be stopped to detach a network interface", + "Instance must be stopped or failed to detach a network interface", "Expected an InvalidRequest response when detaching an interface from a running instance" ); } @@ -1798,6 +1803,104 @@ async fn test_attach_one_disk_to_instance(cptestctx: &ControlPlaneTestContext) { assert_eq!(disks[0].state, DiskState::Attached(instance.identity.id)); } +#[nexus_test] +async fn test_instance_fails_to_boot_with_disk( + cptestctx: &ControlPlaneTestContext, +) { + // Test that the saga correctly unwinds if the sled_agent's instance_put fails + // see: https://github.com/oxidecomputer/omicron/issues/1713 + let client = &cptestctx.external_client; + + const POOL_NAME: &str = "p0"; + const ORGANIZATION_NAME: &str = "bobs-barrel-of-bytes"; + const PROJECT_NAME: &str = "bit-barrel"; + + // Test pre-reqs + DiskTest::new(&cptestctx).await; + create_ip_pool(&client, POOL_NAME, None, None).await; + create_organization(&client, ORGANIZATION_NAME).await; + create_project(client, ORGANIZATION_NAME, PROJECT_NAME).await; + + // Create the "probablydata" disk + create_disk(&client, ORGANIZATION_NAME, PROJECT_NAME, "probablydata").await; + + // Verify disk is there and currently detached + let url_project_disks = format!( + "/organizations/{}/projects/{}/disks", + ORGANIZATION_NAME, PROJECT_NAME, + ); + let disks: Vec = NexusRequest::iter_collection_authn( + client, + &url_project_disks, + "", + None, + ) + .await + .expect("failed to list disks") + .all_items; + assert_eq!(disks.len(), 1); + assert_eq!(disks[0].state, DiskState::Detached); + + // Create the instance + let instance_params = params::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: Name::try_from(String::from("nfs")).unwrap(), + description: String::from("probably serving 
data"), + }, + // there's a specific line in the simulated sled agent that will return + // a 500 if you try to allocate an instance with more than 16 CPUs. a + // 500 error is required to exercise the undo nodes of the instance + // create saga (where provision fails, instead of just responding with a + // bad request). + ncpus: InstanceCpuCount::try_from(32).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), + hostname: String::from("nfs"), + user_data: vec![], + network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, + external_ips: vec![], + disks: vec![params::InstanceDiskAttachment::Attach( + params::InstanceDiskAttach { + name: Name::try_from(String::from("probablydata")).unwrap(), + }, + )], + start: true, + }; + + let url_instances = format!( + "/organizations/{}/projects/{}/instances", + ORGANIZATION_NAME, PROJECT_NAME + ); + + let builder = + RequestBuilder::new(client, http::Method::POST, &url_instances) + .body(Some(&instance_params)) + .expect_status(Some(http::StatusCode::INTERNAL_SERVER_ERROR)); + + NexusRequest::new(builder) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Expected instance creation to fail!"); + + // Verify disk is not attached to the instance + let url_instance_disks = format!( + "/organizations/{}/projects/{}/disks", + ORGANIZATION_NAME, PROJECT_NAME, + ); + let disks: Vec = NexusRequest::iter_collection_authn( + client, + &url_instance_disks, + "", + None, + ) + .await + .expect("failed to list disks") + .all_items; + + assert_eq!(disks.len(), 1); + assert_eq!(disks[0].state, DiskState::Detached); +} + // Test that 8 disks is supported #[nexus_test] async fn test_attach_eight_disks_to_instance( diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index b0ca1e7392..b05b2c39eb 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -194,6 +194,15 @@ impl SledAgent { initial_hardware: InstanceHardware, target: 
InstanceRuntimeStateRequested, ) -> Result { + // respond with a fake 500 level failure if asked to ensure an instance + // with more than 16 CPUs. + let ncpus: i64 = (&initial_hardware.runtime.ncpus).into(); + if ncpus > 16 { + return Err(Error::internal_error( + &"could not allocate an instance: ran out of CPUs!", + )); + }; + for disk in &initial_hardware.disks { let initial_state = DiskRuntimeState { disk_state: omicron_common::api::external::DiskState::Attached(