Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distinguish between Scrimlets and Gimlets #1510

Merged
merged 3 commits into from
Jul 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions common/src/sql/dbinit.sql
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ CREATE TABLE omicron.public.sled (
/* FK into the Rack table */
rack_id UUID NOT NULL,

/* Idenfities if this Sled is a Scrimlet */
is_scrimlet BOOL NOT NULL,

/* The IP address and bound port of the sled agent server. */
ip INET NOT NULL,
port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL,
Expand Down
1 change: 1 addition & 0 deletions nexus/db-model/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ table! {
rcgen -> Int8,

rack_id -> Uuid,
is_scrimlet -> Bool,
ip -> Inet,
port -> Int4,
last_used_address -> Inet,
Expand Down
14 changes: 13 additions & 1 deletion nexus/db-model/src/sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ pub struct Sled {

pub rack_id: Uuid,

is_scrimlet: bool,

// ServiceAddress (Sled Agent).
pub ip: ipv6::Ipv6Addr,
pub port: SqlU16,
Expand All @@ -33,7 +35,12 @@ pub struct Sled {
}

impl Sled {
pub fn new(id: Uuid, addr: SocketAddrV6, rack_id: Uuid) -> Self {
pub fn new(
id: Uuid,
addr: SocketAddrV6,
is_scrimlet: bool,
rack_id: Uuid,
) -> Self {
let last_used_address = {
let mut segments = addr.ip().segments();
segments[7] += omicron_common::address::RSS_RESERVED_ADDRESSES;
Expand All @@ -44,12 +51,17 @@ impl Sled {
time_deleted: None,
rcgen: Generation::new(),
rack_id,
is_scrimlet,
ip: ipv6::Ipv6Addr::from(addr.ip()),
port: addr.port().into(),
last_used_address,
}
}

pub fn is_scrimlet(&self) -> bool {
self.is_scrimlet
}

pub fn ip(&self) -> Ipv6Addr {
self.ip.into()
}
Expand Down
21 changes: 17 additions & 4 deletions nexus/src/app/sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ use crate::db::identity::Asset;
use crate::db::lookup::LookupPath;
use crate::db::model::DatasetKind;
use crate::db::model::ServiceKind;
use crate::internal_api::params::ZpoolPutRequest;
use crate::internal_api::params::{
SledAgentStartupInfo, SledRole, ZpoolPutRequest,
};
use omicron_common::api::external::DataPageParams;
use omicron_common::api::external::Error;
use omicron_common::api::external::ListResultVec;
use omicron_common::api::external::LookupResult;
use sled_agent_client::Client as SledAgentClient;
use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6};
use std::net::{Ipv6Addr, SocketAddr};
use std::sync::Arc;
use uuid::Uuid;

Expand All @@ -28,10 +30,21 @@ impl super::Nexus {
pub async fn upsert_sled(
&self,
id: Uuid,
address: SocketAddrV6,
info: SledAgentStartupInfo,
) -> Result<(), Error> {
info!(self.log, "registered sled agent"; "sled_uuid" => id.to_string());
let sled = db::model::Sled::new(id, address, self.rack_id);

let is_scrimlet = match info.role {
SledRole::Gimlet => false,
SledRole::Scrimlet => true,
};

let sled = db::model::Sled::new(
id,
info.sa_address,
is_scrimlet,
self.rack_id,
);
self.db_datastore.sled_upsert(sled).await?;
Ok(())
}
Expand Down
8 changes: 5 additions & 3 deletions nexus/src/db/datastore/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ mod test {
);
let rack_id = Uuid::new_v4();
let sled_id = Uuid::new_v4();
let sled = Sled::new(sled_id, bogus_addr.clone(), rack_id);
let is_scrimlet = false;
let sled = Sled::new(sled_id, bogus_addr.clone(), is_scrimlet, rack_id);
datastore.sled_upsert(sled).await.unwrap();
sled_id
}
Expand Down Expand Up @@ -790,12 +791,13 @@ mod test {
let rack_id = Uuid::new_v4();
let addr1 = "[fd00:1de::1]:12345".parse().unwrap();
let sled1_id = "0de4b299-e0b4-46f0-d528-85de81a7095f".parse().unwrap();
let sled1 = db::model::Sled::new(sled1_id, addr1, rack_id);
let is_scrimlet = false;
let sled1 = db::model::Sled::new(sled1_id, addr1, is_scrimlet, rack_id);
datastore.sled_upsert(sled1).await.unwrap();

let addr2 = "[fd00:1df::1]:12345".parse().unwrap();
let sled2_id = "66285c18-0c79-43e0-e54f-95271f271314".parse().unwrap();
let sled2 = db::model::Sled::new(sled2_id, addr2, rack_id);
let sled2 = db::model::Sled::new(sled2_id, addr2, is_scrimlet, rack_id);
datastore.sled_upsert(sled2).await.unwrap();

let ip = datastore.next_ipv6_address(&opctx, sled1_id).await.unwrap();
Expand Down
2 changes: 2 additions & 0 deletions nexus/src/db/datastore/sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ impl DataStore {
dsl::time_modified.eq(Utc::now()),
dsl::ip.eq(sled.ip),
dsl::port.eq(sled.port),
dsl::rack_id.eq(sled.rack_id),
dsl::is_scrimlet.eq(sled.is_scrimlet()),
))
.returning(Sled::as_returning())
.get_result_async(self.pool())
Expand Down
12 changes: 4 additions & 8 deletions nexus/src/internal_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ type NexusApiDescription = ApiDescription<Arc<ServerContext>>;
/// Returns a description of the internal nexus API
pub fn internal_api() -> NexusApiDescription {
fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> {
api.register(cpapi_sled_agents_post)?;
api.register(sled_agent_put)?;
api.register(rack_initialization_complete)?;
api.register(zpool_put)?;
api.register(dataset_put)?;
Expand All @@ -63,26 +63,22 @@ struct SledAgentPathParam {
}

/// Report that the sled agent for the specified sled has come online.
// TODO: Should probably be "PUT", since:
// 1. We're upserting the value
// 2. The client supplies the UUID
// 3. This call is idempotent (mod "time_modified").
#[endpoint {
method = POST,
path = "/sled-agents/{sled_id}",
}]
async fn cpapi_sled_agents_post(
async fn sled_agent_put(
rqctx: Arc<RequestContext<Arc<ServerContext>>>,
path_params: Path<SledAgentPathParam>,
sled_info: TypedBody<SledAgentStartupInfo>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
let apictx = rqctx.context();
let nexus = &apictx.nexus;
let path = path_params.into_inner();
let si = sled_info.into_inner();
let info = sled_info.into_inner();
let sled_id = &path.sled_id;
let handler = async {
nexus.upsert_sled(*sled_id, si.sa_address).await?;
nexus.upsert_sled(*sled_id, info).await?;
Ok(HttpResponseUpdatedNoContent())
};
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
Expand Down
19 changes: 18 additions & 1 deletion nexus/types/src/internal_api/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,28 @@ use std::net::SocketAddrV6;
use std::str::FromStr;
use uuid::Uuid;

/// Describes the role of the sled within the rack.
///
/// Note that this may change if the sled is physically moved
/// within the rack.
#[derive(Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum SledRole {
/// The sled is a general compute sled.
Gimlet,
/// The sled is attached to the network switch, and has additional
/// responsibilities.
Scrimlet,
}

/// Sent by a sled agent on startup to Nexus to request further instruction
#[derive(Serialize, Deserialize, JsonSchema)]
pub struct SledAgentStartupInfo {
/// the address of the sled agent's API endpoint
/// The address of the sled agent's API endpoint
pub sa_address: SocketAddrV6,

/// Describes the responsibilities of the sled
pub role: SledRole,
}

/// Sent by a sled agent on startup to Nexus to request further instruction
Expand Down
21 changes: 19 additions & 2 deletions openapi/nexus-internal.json
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@
"/sled-agents/{sled_id}": {
"post": {
"summary": "Report that the sled agent for the specified sled has come online.",
"operationId": "cpapi_sled_agents_post",
"operationId": "sled_agent_put",
"parameters": [
{
"in": "path",
Expand Down Expand Up @@ -1788,15 +1788,32 @@
"description": "Sent by a sled agent on startup to Nexus to request further instruction",
"type": "object",
"properties": {
"role": {
"description": "Describes the responsibilities of the sled",
"allOf": [
{
"$ref": "#/components/schemas/SledRole"
}
]
},
"sa_address": {
"description": "the address of the sled agent's API endpoint",
"description": "The address of the sled agent's API endpoint",
"type": "string"
}
},
"required": [
"role",
"sa_address"
]
},
"SledRole": {
"description": "Describes the role of the sled within the rack.\n\nNote that this may change if the sled is physically moved within the rack.",
"type": "string",
"enum": [
"gimlet",
"scrimlet"
]
},
"ZpoolPutRequest": {
"description": "Sent by a sled agent on startup to Nexus to request further instruction",
"type": "object",
Expand Down
9 changes: 9 additions & 0 deletions sled-agent/src/bootstrap/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,20 @@ impl Agent {

return Ok(SledAgentResponse { id: server.id() });
}

// TODO(https://github.com/oxidecomputer/omicron/issues/823):
// Currently, the prescence or abscence of RSS is our signal
// for "is this a scrimlet or not".
// Longer-term, we should make this call based on the underlying

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a note that this changes at runtime with respect to hardware. It's not a static thing and the underlying hardware presence will come and go as the Sidecar is power cycled.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That matches my understanding. I do think there is more work that'll need to smooth out the case of "a sled has been removed from one location in the rack, and plugged in elsewhere" - perhaps a call to Nexus, to figure out what services should change - but otherwise, this makes sense.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, while the movement bit seemed captured, I just wanted to make sure it was clear that you might choose to start a bootstrap agent before this is known and wanted to call that out as part of this given what appeared the initial start the agent piece was encoding this knowledge right now.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, thanks Robert. I believe Sean added functionality to update the database's record including the is_scrimlet value in d172c9c.

One question I have though is about when that value is updated. At RSS-time, it's definitely possible that the sled agent comes up before Sidecar, so we'd need to support updating it from false to true. But it seems kind of "sticky" to me. That is, I'm not sure it makes sense to set this to false, at least not right away, when Sidecar is cycled. If that's done as part of an administrative action or by Nexus, it seems like we'd want to keep the knowledge that this is or was a Scrimlet, and report 503s from dpd or the sled agent. If Sidecar cycles unexpectedly, we probably still want to do that, along with generating an alert of some kind. I don't know if there's any action here, but spelling that out seems useful to me.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect an initial policy of stickiness across a given boot will be a pretty useful way to go here. That is, when it's rebooted, it'll be something that is determined fresh and resets stickiness.

// hardware.
let is_scrimlet = self.rss.lock().await.is_some();

// Server does not exist, initialize it.
let server = SledServer::start(
&self.sled_config,
self.parent_log.clone(),
sled_address,
is_scrimlet,
request.rack_id,
)
.await
Expand Down
2 changes: 1 addition & 1 deletion sled-agent/src/mocks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ mock! {
pub fn new(server_addr: &str, log: Logger) -> Self;
pub fn client(&self) -> reqwest::Client;
pub fn baseurl(&self) -> &'static str;
pub async fn cpapi_sled_agents_post(
pub async fn sled_agent_put(
&self,
id: &Uuid,
info: &SledAgentStartupInfo,
Expand Down
10 changes: 9 additions & 1 deletion sled-agent/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ impl Server {
config: &Config,
log: Logger,
addr: SocketAddrV6,
is_scrimlet: bool,
rack_id: Uuid,
) -> Result<Server, String> {
info!(log, "setting up sled agent server");
Expand Down Expand Up @@ -83,15 +84,22 @@ impl Server {
log,
"contacting server nexus, registering sled: {}", sled_id
);
let role = if is_scrimlet {
nexus_client::types::SledRole::Scrimlet
} else {
nexus_client::types::SledRole::Gimlet
};

let nexus_client = lazy_nexus_client
.get()
.await
.map_err(|err| BackoffError::transient(err.to_string()))?;
nexus_client
.cpapi_sled_agents_post(
.sled_agent_put(
&sled_id,
&nexus_client::types::SledAgentStartupInfo {
sa_address: sled_address.to_string(),
role,
},
)
.await
Expand Down
3 changes: 2 additions & 1 deletion sled-agent/src/sim/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ impl Server {
let notify_nexus = || async {
debug!(log, "contacting server nexus");
(nexus_client
.cpapi_sled_agents_post(
.sled_agent_put(
&config.id,
&nexus_client::types::SledAgentStartupInfo {
sa_address: sa_address.to_string(),
role: nexus_client::types::SledRole::Gimlet,
},
)
.await)
Expand Down