From e3ec364eed53e1486879a0b96bc17aac3dcbcd0e Mon Sep 17 00:00:00 2001
From: Rain <rain@oxide.computer>
Date: Tue, 27 Aug 2024 14:28:00 -0700
Subject: [PATCH] [wicket] attempt to fix test_inventory (#6456)

I haven't been able to reproduce this locally, but this is my best guess
as to what's going wrong here: MGS/wicketd learns about SPs but due to a
race/load on the system, misses out on populating their state and
instead leaves it empty. That causes the SPs to be filtered out here:

https://github.com/oxidecomputer/omicron/blob/7a6f45c5504bb092ce738d165cc88736ba4a9092/wicketd/src/rss_config.rs#L129

This theory is buttressed by the fact that in failing logs, the returned
inventory is a lot smaller than what I'm seeing locally.

For example, in the logs for [this failing
test](https://buildomat.eng.oxide.computer/wg/0/details/01J69AR918WAQNFKSBS85EAQPV/kkFMDYhAM3Vxb5ujRHlyAO9thmIAc7mHjHuicct0gS2bL8xu/01J69ARHYXXSKXKG8J49SRZVTA)
I see [a 1430 byte
response](https://buildomat.eng.oxide.computer/wg/0/artefact/01J69AR918WAQNFKSBS85EAQPV/kkFMDYhAM3Vxb5ujRHlyAO9thmIAc7mHjHuicct0gS2bL8xu/01J69ARHYXXSKXKG8J49SRZVTA/01J69ENP3EF3A212GVAGEMBDVQ/mod-ff551cc639cd8d16-test_inventory.21679.0.log?format=x-bunyan#L640):

```
test_inventory (wicketd test client): client response
    result = Ok(Response { url: Url { scheme: "http", cannot_be_a_base: false, username: "", password: None, host: Some(Ipv6(::1)), port: Some(45364), path: "/inventory", query: None, fragment: None }, status: 200, headers: {"content-type": "application/json", "x-request-id": "e68141e2-4c4f-46ec-a49b-9f8aa11a3410", "content-length": "1430", "date": "Tue, 27 Aug 2024 08:13:01 GMT"} })
```

But in passing runs locally, I see a much larger 8654 byte response
([full
logs](https://gist.github.com/sunshowers/b9c1868ba4c8c4bd3eec49cc4b56516d)):

```
19:32:43.847Z DEBG test_inventory (wicketd test client): client response
    result = Ok(Response { url: Url { scheme: "http", cannot_be_a_base: false, username: "", password: None, host: Some(Ipv6(::1)), port: Some(44183), path: "/inventory", query: None, fragment: None }, status: 200, headers: {"content-type": "application/json", "x-request-id": "8b48dae0-025d-426a-82f0-1dd8323670d5", "content-length": "8654", "date": "Tue, 27 Aug 2024 19:32:43 GMT"} })
```

Based on this theory, this PR changes the exit condition for the poll
loop to also consider all of the SP states being present.

In case there's something else going on, the PR also adds a bunch of
additional logging.

Fixes #6300.
---
 wicketd/src/http_entrypoints.rs              |  2 +
 wicketd/src/rss_config.rs                    | 12 +++++-
 wicketd/tests/integration_tests/inventory.rs | 39 +++++++++++++++-----
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs
index 55b4d61c9a..3f460f1e37 100644
--- a/wicketd/src/http_entrypoints.rs
+++ b/wicketd/src/http_entrypoints.rs
@@ -82,6 +82,7 @@ impl WicketdApi for WicketdApiImpl {
         config.update_with_inventory_and_bootstrap_peers(
             &inventory,
             &ctx.bootstrap_peers,
+            &ctx.log,
         );
 
         Ok(HttpResponseOk((&*config).into()))
@@ -101,6 +102,7 @@ impl WicketdApi for WicketdApiImpl {
         config.update_with_inventory_and_bootstrap_peers(
             &inventory,
             &ctx.bootstrap_peers,
+            &ctx.log,
         );
         config
             .update(body.into_inner(), ctx.baseboard.as_ref())
diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs
index 56e83fcd41..46ede25eaa 100644
--- a/wicketd/src/rss_config.rs
+++ b/wicketd/src/rss_config.rs
@@ -26,6 +26,7 @@ use omicron_common::api::external::AllowedSourceIps;
 use omicron_common::api::external::SwitchLocation;
 use once_cell::sync::Lazy;
 use sled_hardware_types::Baseboard;
+use slog::debug;
 use slog::warn;
 use std::collections::btree_map;
 use std::collections::BTreeMap;
@@ -115,6 +116,7 @@ impl CurrentRssConfig {
         &mut self,
         inventory: &RackV1Inventory,
         bootstrap_peers: &BootstrapPeers,
+        log: &slog::Logger,
     ) {
         let bootstrap_sleds = bootstrap_peers.sleds();
 
@@ -126,7 +128,15 @@ impl CurrentRssConfig {
                     return None;
                 }
 
-                let state = sp.state.as_ref()?;
+                let Some(state) = sp.state.as_ref() else {
+                    debug!(
+                        log,
+                        "in update_with_inventory_and_bootstrap_peers, \
+                         filtering out SP with no state";
+                        "sp" => ?sp,
+                    );
+                    return None;
+                };
                 let baseboard = Baseboard::new_gimlet(
                     state.serial_number.clone(),
                     state.model.clone(),
diff --git a/wicketd/tests/integration_tests/inventory.rs b/wicketd/tests/integration_tests/inventory.rs
index ed5ad22d5d..c7057e3adc 100644
--- a/wicketd/tests/integration_tests/inventory.rs
+++ b/wicketd/tests/integration_tests/inventory.rs
@@ -10,6 +10,7 @@ use super::setup::WicketdTestContext;
 use gateway_messages::SpPort;
 use gateway_test_utils::setup as gateway_setup;
 use sled_hardware_types::Baseboard;
+use slog::{info, warn};
 use wicket::OutputKind;
 use wicket_common::inventory::{SpIdentifier, SpType};
 use wicket_common::rack_setup::BootstrapSledDescription;
@@ -32,13 +33,29 @@ async fn test_inventory() {
                 .into_inner();
             match response {
                 GetInventoryResponse::Response { inventory, .. } => {
-                    break inventory
-                }
-                GetInventoryResponse::Unavailable => {
-                    // Keep polling wicketd until it receives its first results from MGS.
-                    tokio::time::sleep(Duration::from_millis(100)).await;
+                    // Ensure that the SP state is populated -- if it's not,
+                    // then the `configured-bootstrap-sleds` command below
+                    // might return an empty list.
+                    let sp_state_none: Vec<_> = inventory
+                        .sps
+                        .iter()
+                        .filter(|sp| sp.state.is_none())
+                        .collect();
+                    if sp_state_none.is_empty() {
+                        break inventory;
+                    }
+
+                    warn!(
+                        wicketd_testctx.log(),
+                        "SP state not yet populated for some SPs, retrying";
+                        "sps" => ?sp_state_none
+                    )
                 }
+                GetInventoryResponse::Unavailable => {}
             }
+
+            // Keep polling wicketd until it receives its first results from MGS.
+            tokio::time::sleep(Duration::from_millis(100)).await;
         }
     };
     let inventory =
@@ -46,6 +63,8 @@ async fn test_inventory() {
             .await
             .expect("get_inventory completed within 10 seconds");
 
+    info!(wicketd_testctx.log(), "inventory returned"; "inventory" => ?inventory);
+
     // 4 SPs attached to the inventory.
     assert_eq!(inventory.sps.len(), 4);
 
@@ -70,17 +89,17 @@ async fn test_inventory() {
             serde_json::from_slice(&stdout).expect("stdout is valid JSON");
 
         // This only tests the case that we get sleds back with no current
-        // bootstrap IP. This does provide svalue: it check that the command
-        // exists, accesses data within wicket, and returns it in the schema we
-        // expect. But it does not test the case where a sled does have a
-        // bootstrap IP.
+        // bootstrap IP. This does provide some value: it checks that the
+        // command exists, accesses data within wicket, and returns it in the
+        // schema we expect. But it does not test the case where a sled does
+        // have a bootstrap IP.
         //
         // Unfortunately, that's a difficult thing to test today. Wicket gets
         // that information by enumerating the IPs on the bootstrap network and
         // reaching out to the bootstrap_agent on them directly to ask them who
         // they are. Our testing setup does not have a way to provide such an
         // IP, or run a bootstrap_agent on an IP to respond. We should update
-        // this test when we do have that capabilitiy.
+        // this test when we do have that capability.
         assert_eq!(
             response,
             vec![