From deba63e066a35906adfefb888d591b85c02581ba Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 7 Jun 2022 17:09:47 -0400 Subject: [PATCH] Replace bootstrap-agent dropshot server with sprockets session Removes sprockets proxies, fixing #1161. --- Cargo.lock | 14 +- openapi/bootstrap-agent.json | 199 -------------- sled-agent/Cargo.toml | 2 +- sled-agent/src/bin/sled-agent.rs | 74 +----- sled-agent/src/bootstrap/agent.rs | 5 +- sled-agent/src/bootstrap/client.rs | 154 ++++++++++- sled-agent/src/bootstrap/config.rs | 7 +- sled-agent/src/bootstrap/http_entrypoints.rs | 95 ------- sled-agent/src/bootstrap/mod.rs | 2 +- sled-agent/src/bootstrap/params.rs | 27 +- sled-agent/src/bootstrap/rss_handle.rs | 106 +------- sled-agent/src/bootstrap/server.rs | 248 ++++++++++++------ sled-agent/src/bootstrap/views.rs | 19 +- sled-agent/src/rack_setup/service.rs | 3 - sled-agent/src/sp.rs | 92 ++++++- .../tests/integration_tests/commands.rs | 25 +- 16 files changed, 463 insertions(+), 609 deletions(-) delete mode 100644 openapi/bootstrap-agent.json delete mode 100644 sled-agent/src/bootstrap/http_entrypoints.rs diff --git a/Cargo.lock b/Cargo.lock index 268e855769..81b63c9a7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2992,6 +2992,7 @@ dependencies = [ "schemars", "serde", "serde_json", + "serde_repr", "serial_test", "sled-agent-client", "slog", @@ -3003,7 +3004,6 @@ dependencies = [ "sp-sim", "spdm", "sprockets-host", - "sprockets-proxy", "structopt", "subprocess", "tar", @@ -5065,18 +5065,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "sprockets-proxy" -version = "0.1.0" -source = "git+http://github.com/oxidecomputer/sprockets?rev=0361fd13ff19cda6696242fe40f1325fca30d3d1#0361fd13ff19cda6696242fe40f1325fca30d3d1" -dependencies = [ - "serde", - "slog", - "sprockets-host", - "thiserror", - "tokio", -] - [[package]] name = "sprockets-rot" version = "0.1.0" diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json deleted file mode 100644 index 7a42c6d66a..0000000000 --- a/openapi/bootstrap-agent.json +++ /dev/null @@ -1,199 +0,0 @@ -{ - "openapi": "3.0.3", - "info": { - "title": "Oxide Bootstrap Agent API", - "description": "API for interacting with bootstrapping agents", - "contact": { - "url": "https://oxide.computer", - "email": "api@oxide.computer" - }, - "version": "0.0.1" - }, - "paths": { - "/request_share": { - "get": { - "operationId": "request_share", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ShareRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ShareResponse" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, - "/start_sled": { - "put": { - "operationId": "start_sled", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SledAgentRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SledAgentResponse" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - } - }, - "components": { - "responses": { - "Error": { - "description": "Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Error" - } - } - } - } - }, - "schemas": { - "Error": { - "description": "Error information from a response.", - "type": "object", - "properties": { - "error_code": { - "type": "string" - }, - "message": { - "type": "string" - }, - "request_id": { - "type": "string" - } - }, - "required": [ - "message", - "request_id" - ] - }, - "Ipv6Net": { - "example": "fd12:3456::/64", - "title": "An IPv6 subnet", - "description": "An IPv6 subnet, including prefix and subnet mask", - "type": "string", - "pattern": "^(fd|FD)[0-9a-fA-F]{2}:((([0-9a-fA-F]{1,4}\\:){6}[0-9a-fA-F]{1,4})|(([0-9a-fA-F]{1,4}:){1,6}:))/(6[4-9]|[7-9][0-9]|1[0-1][0-9]|12[0-6])$", - "maxLength": 43 - }, - "Ipv6Subnet": { - "description": "Wraps an [`Ipv6Network`] with a compile-time prefix length.", - "type": "object", - "properties": { - "net": { - "$ref": "#/components/schemas/Ipv6Net" - } - }, - "required": [ - "net" - ] - }, - "ShareRequest": { - "description": "Identity signed by local RoT and Oxide certificate chain.", - "type": "object", - "properties": { - "identity": { - "type": "array", - "items": { - "type": "integer", - "format": "uint8", - "minimum": 0 - } - } - }, - "required": [ - "identity" - ] - }, - "ShareResponse": { - "description": "Sent between bootstrap agents to establish trust quorum.", - "type": "object", - "properties": { - "shared_secret": { - "type": "array", - "items": { - "type": "integer", - "format": "uint8", - "minimum": 0 - } - } - }, - "required": [ - "shared_secret" - ] - }, - "SledAgentRequest": { - "description": "Configuration information for launching a Sled Agent.", - "type": "object", - "properties": { - "subnet": { - "description": "Portion of the IP space to be managed by the Sled Agent.", - "allOf": [ - { - "$ref": "#/components/schemas/Ipv6Subnet" - } - ] - } - }, - "required": [ - "subnet" - ] - }, - "SledAgentResponse": { - "description": "Describes the Sled Agent running on the device.", - "type": "object", - "properties": { - "id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "id" - ] - } - } - } -} \ No newline at end of file diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index a2b322bc1c..1a4bb9fe2f 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -30,6 +30,7 @@ reqwest = { version = "0.11.8", default-features = false, features = ["rustls-tl schemars = { version = "0.8.10", features = [ "chrono", "uuid1" ] } serde = { version = "1.0", features = [ "derive" ] } serde_json = "1.0" +serde_repr = "0.1" sled-agent-client = { path = "../sled-agent-client" } slog = { version = "2.5", features = [ "max_level_trace", "release_max_level_debug" ] } slog-dtrace = "0.2" @@ -37,7 +38,6 @@ smf = "0.2" spdm = { git = "https://github.com/oxidecomputer/spdm", rev = "9742f6e" } sp-sim = { path = "../sp-sim" } sprockets-host = { git = "http://github.com/oxidecomputer/sprockets", rev = "0361fd13ff19cda6696242fe40f1325fca30d3d1" } -sprockets-proxy = { git = "http://github.com/oxidecomputer/sprockets", rev = "0361fd13ff19cda6696242fe40f1325fca30d3d1" } socket2 = { version = "0.4", features = [ "all" ] } structopt = "0.3" tar = "0.4" diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index c8d361db6e..fba2d0d5de 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -4,8 +4,6 @@ //! Executable program to run the sled agent -use dropshot::ConfigDropshot; -use omicron_common::api::external::Error; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; use omicron_sled_agent::bootstrap::{ @@ -15,31 +13,9 @@ use omicron_sled_agent::bootstrap::{ use omicron_sled_agent::rack_setup::config::SetupServiceConfig as RssConfig; use omicron_sled_agent::{config::Config as SledConfig, server as sled_server}; use sp_sim::config::GimletConfig; -use std::net::SocketAddr; use std::path::PathBuf; use structopt::StructOpt; -#[derive(Debug)] -enum ApiRequest { - Bootstrap, - Sled, -} - -impl std::str::FromStr for ApiRequest { - type Err = Error; - fn from_str(s: &str) -> Result { - match s { - "bootstrap" => Ok(ApiRequest::Bootstrap), - "sled" => Ok(ApiRequest::Sled), - _ => Err(Error::InvalidValue { - label: s.to_string(), - message: "Invalid value: try one of {bootstrap, sled}" - .to_string(), - }), - } - } -} - #[derive(Debug, StructOpt)] #[structopt( name = "sled_agent", @@ -47,10 +23,7 @@ impl std::str::FromStr for ApiRequest { )] enum Args { /// Generates the OpenAPI specification. - Openapi { - #[structopt(name = "api_type", parse(try_from_str))] - api_requested: ApiRequest, - }, + Openapi, /// Runs the Sled Agent server. Run { #[structopt(name = "CONFIG_FILE_PATH", parse(from_os_str))] @@ -71,14 +44,7 @@ async fn do_run() -> Result<(), CmdError> { })?; match args { - Args::Openapi { api_requested } => match api_requested { - ApiRequest::Bootstrap => { - bootstrap_server::run_openapi().map_err(CmdError::Failure) - } - ApiRequest::Sled => { - sled_server::run_openapi().map_err(CmdError::Failure) - } - }, + Args::Openapi => sled_server::run_openapi().map_err(CmdError::Failure), Args::Run { config_path } => { let config = SledConfig::from_file(&config_path) .map_err(|e| CmdError::Failure(e.to_string()))?; @@ -131,46 +97,12 @@ async fn do_run() -> Result<(), CmdError> { let bootstrap_address = bootstrap_address(link) .map_err(|e| CmdError::Failure(e.to_string()))?; - // Are we going to simulate a local SP? If so: - // - // 1. The bootstrap dropshot server listens on localhost - // 2. A sprockets proxy listens on `bootstrap_address` (and relays - // incoming connections to the localhost dropshot server) - // - // If we're not simulating a local SP, we can't establish sprockets - // sessions, so we'll have the bootstrap dropshot server listen on - // `bootstrap_address` (and no sprockets proxy). - // - // TODO-security: With this configuration, dropshot itself is - // running plain HTTP and blindly trusting all connections from - // localhost. We have a similar sprockets proxy on the client side, - // where the proxy blindly trusts all connections from localhost - // (although the client-side proxy only runs while is being made, - // while our dropshot server is always listening). Can we secure - // these connections sufficiently? Other options include expanding - // dropshot/progenitor to allow a custom connection layer (supported - // by hyper, but not reqwest), keeping the sprockets proxy but using - // something other than TCP that we can lock down, or abandoning - // dropshot and using a bespoke protocol over a raw - // sprockets-encrypted TCP connection. - let (bootstrap_dropshot_addr, sprockets_proxy_bind_addr) = - if sp_config.is_some() { - ("[::1]:0".parse().unwrap(), Some(bootstrap_address)) - } else { - (SocketAddr::V6(bootstrap_address), None) - }; - // Configure and run the Bootstrap server. let bootstrap_config = BootstrapConfig { id: config.id, - dropshot: ConfigDropshot { - bind_address: bootstrap_dropshot_addr, - request_body_max_bytes: 1024 * 1024, - ..Default::default() - }, + bind_address: bootstrap_address, log: config.log.clone(), rss_config, - sprockets_proxy_bind_addr, sp_config, }; diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 224f4c1e25..bc0f84c2ba 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -210,13 +210,14 @@ impl Agent { )?, ) .map_err(|err| BootstrapError::Toml { path: request_path, err })?; - agent.request_agent(sled_request).await?; + agent.request_agent(&sled_request).await?; } Ok(agent) } /// Implements the "request share" API. + #[allow(dead_code)] // Currently uncalled; will be used soon! pub async fn request_share( &self, identity: Vec, @@ -234,7 +235,7 @@ impl Agent { /// been initialized. pub async fn request_agent( &self, - request: SledAgentRequest, + request: &SledAgentRequest, ) -> Result { info!(&self.log, "Loading Sled Agent: {:?}", request); diff --git a/sled-agent/src/bootstrap/client.rs b/sled-agent/src/bootstrap/client.rs index 6ddc1f2566..8e0dd392dd 100644 --- a/sled-agent/src/bootstrap/client.rs +++ b/sled-agent/src/bootstrap/client.rs @@ -3,9 +3,155 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Interface for making API requests to a Sled Agent's Bootstrap API. -//! This is not its own crate because the only intended consumer is other -//! bootstrap peers within the cluster. -use omicron_common::generate_logging_api; +use super::params::Request; +use super::params::RequestEnvelope; +use super::params::SledAgentRequest; +use super::params::Version; +use super::views::SledAgentResponse; +use crate::bootstrap::views::Response; +use crate::bootstrap::views::ResponseEnvelope; +use crate::sp::SpHandle; +use crate::sp::SprocketsRole; +use slog::Logger; +use std::borrow::Cow; +use std::io; +use std::net::SocketAddrV6; +use thiserror::Error; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncWriteExt; +use tokio::net::TcpStream; -generate_logging_api!("../openapi/bootstrap-agent.json"); +#[derive(Debug, Error)] +pub(crate) enum Error { + #[error("Could not connect to {addr}: {err}")] + Connect { addr: SocketAddrV6, err: io::Error }, + + #[error("Could not establish sprockets session: {0}")] + SprocketsSession(String), + + #[error("Failed serializing request: {0}")] + Serialize(serde_json::Error), + + #[error("Failed writing request length prefix: {0}")] + WriteLengthPrefix(io::Error), + + #[error("Failed writing request: {0}")] + WriteRequest(io::Error), + + #[error("Failed flushing request: {0}")] + FlushRequest(io::Error), + + #[error("Failed reading response length prefix: {0}")] + ReadLengthPrefix(io::Error), + + #[error("Received bogus response length: {0}")] + BadResponseLength(u32), + + #[error("Failed reading response: {0}")] + ReadResponse(io::Error), + + #[error("Failed deserializing response: {0}")] + Deserialize(serde_json::Error), + + #[error("Request failed: {0}")] + ServerFailure(String), + + #[error("Bogus response from server (expected {expected} but received {received})")] + InvalidResponse { expected: &'static str, received: &'static str }, +} + +pub(crate) struct Client<'a> { + addr: SocketAddrV6, + sp: &'a Option, + log: Logger, +} + +impl<'a> Client<'a> { + pub(crate) fn new( + addr: SocketAddrV6, + sp: &'a Option, + log: Logger, + ) -> Self { + Self { addr, sp, log } + } + + pub(crate) async fn start_sled( + &self, + request: &SledAgentRequest, + ) -> Result { + let request = Request::SledAgentRequest(Cow::Borrowed(request)); + + match self.request_response(request).await? { + Response::SledAgentResponse(response) => Ok(response), + Response::ShareResponse(_) => Err(Error::InvalidResponse { + expected: "SledAgentResponse", + received: "ShareResponse", + }), + } + } + + async fn request_response( + &self, + request: Request<'_>, + ) -> Result { + // Bound to avoid allocating an unreasonable amount of memory from a + // bogus length prefix from a server. We authenticate servers via + // sprockets before allocating based on the length prefix they send, so + // it should be fine to be a little sloppy here and just pick something + // far larger than we ever expect to see. + const MAX_RESPONSE_LEN: u32 = 16 << 20; + + // Establish connection and sprockets connection (if possible). + let stream = TcpStream::connect(self.addr) + .await + .map_err(|err| Error::Connect { addr: self.addr, err })?; + + let mut stream = crate::sp::maybe_wrap_stream( + stream, + self.sp, + SprocketsRole::Client, + &self.log, + ) + .await + .map_err(|err| Error::SprocketsSession(err.to_string()))?; + + // Build and serialize our request. + let envelope = RequestEnvelope { version: Version::V1, request }; + let mut buf = + serde_json::to_vec(&envelope).map_err(Error::Serialize)?; + let request_length = u32::try_from(buf.len()) + .expect("serialized bootstrap-agent request length overflowed u32"); + + // Write our request with a length prefix. + stream + .write_u32(request_length) + .await + .map_err(Error::WriteLengthPrefix)?; + stream.write_all(&buf).await.map_err(Error::WriteRequest)?; + stream.flush().await.map_err(Error::FlushRequest)?; + + // Read the response, length prefix first. + let response_length = + stream.read_u32().await.map_err(Error::ReadLengthPrefix)?; + // Sanity check / guard against malformed lengths + if response_length > MAX_RESPONSE_LEN { + return Err(Error::BadResponseLength(response_length)); + } + + buf.resize(response_length as usize, 0); + stream.read_exact(&mut buf).await.map_err(Error::ReadResponse)?; + + // Deserialize and handle the response. + let envelope: ResponseEnvelope = + serde_json::from_slice(&buf).map_err(Error::Deserialize)?; + + // Currently we only have one version, so there's nothing to do in this + // match, but we leave it here as a breadcrumb for future changes. + match envelope.version { + Version::V1 => (), + } + + envelope.response.map_err(Error::ServerFailure) + } +} diff --git a/sled-agent/src/bootstrap/config.rs b/sled-agent/src/bootstrap/config.rs index 945b03a10e..8cab95750d 100644 --- a/sled-agent/src/bootstrap/config.rs +++ b/sled-agent/src/bootstrap/config.rs @@ -4,7 +4,6 @@ //! Interfaces for working with bootstrap agent configuration -use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use serde::Deserialize; use serde::Serialize; @@ -18,14 +17,10 @@ pub const BOOTSTRAP_AGENT_PORT: u16 = 12346; #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] pub struct Config { pub id: Uuid, - pub dropshot: ConfigDropshot, + pub bind_address: SocketAddrV6, pub log: ConfigLogging, pub rss_config: Option, - // If present, `dropshot` should bind to a localhost address, and we'll - // configure a sprockets-proxy pointed to it that listens on this - // (non-localhost) address. - pub sprockets_proxy_bind_addr: Option, pub sp_config: Option, } diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs deleted file mode 100644 index 8c2952a805..0000000000 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ /dev/null @@ -1,95 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! HTTP entrypoint functions for the bootstrap agent's exposed API -//! -//! Note that care must be taken when evolving this interface. In particular, -//! changes need to consider forward- and backward-compatibility of bootstrap -//! agents on other sleds that may be running different versions. -//! -//! Note also that changes to the interface will necessarily require updates to -//! code where we use the client. To do this, follow the method prescribed in -//! the repo README: -//! -//! 1. Update the interface -//! 2. `cargo build` will now succeed but `cargo test` will fail because the -//! `bootstrap-agent.json` file is out of date (and because the client calls no -//! longer match the server) -//! 3. Update the `bootstrap-agent.json` file: `EXPECTORATE=overwrite cargo -//! test` -//! 4. The build will now fail, so update the client calls as needed. -//! -//! Do not update client calls before updating `bootstrap-agent.json` or else -//! you won't be able to build the crate and therefore won't be able to -//! automatically updates `boostrap-agent.json`. - -use dropshot::endpoint; -use dropshot::ApiDescription; -use dropshot::HttpError; -use dropshot::HttpResponseOk; -use dropshot::RequestContext; -use dropshot::TypedBody; -use omicron_common::api::external::Error as ExternalError; -use std::sync::Arc; - -use super::agent::Agent; -use super::{ - params::{ShareRequest, SledAgentRequest}, - views::{ShareResponse, SledAgentResponse}, -}; - -/// Returns a description of the bootstrap agent API -pub(crate) fn ba_api() -> ApiDescription> { - fn register_endpoints( - api: &mut ApiDescription>, - ) -> Result<(), String> { - api.register(request_share)?; - api.register(start_sled)?; - Ok(()) - } - - let mut api = ApiDescription::new(); - if let Err(err) = register_endpoints(&mut api) { - panic!("failed to register entrypoints: {}", err); - } - api -} - -#[endpoint { - method = GET, - path = "/request_share", -}] -async fn request_share( - rqctx: Arc>>, - request: TypedBody, -) -> Result, HttpError> { - let bootstrap_agent = rqctx.context(); - - let request = request.into_inner(); - Ok(HttpResponseOk( - bootstrap_agent - .request_share(request.identity) - .await - .map_err(|e| ExternalError::from(e))?, - )) -} - -#[endpoint { - method = PUT, - path = "/start_sled", -}] -async fn start_sled( - rqctx: Arc>>, - request: TypedBody, -) -> Result, HttpError> { - let bootstrap_agent = rqctx.context(); - - let request = request.into_inner(); - Ok(HttpResponseOk( - bootstrap_agent - .request_agent(request) - .await - .map_err(|e| ExternalError::from(e))?, - )) -} diff --git a/sled-agent/src/bootstrap/mod.rs b/sled-agent/src/bootstrap/mod.rs index cbba39a632..27bc0b1c79 100644 --- a/sled-agent/src/bootstrap/mod.rs +++ b/sled-agent/src/bootstrap/mod.rs @@ -8,7 +8,7 @@ pub mod agent; pub mod client; pub mod config; pub mod discovery; -mod http_entrypoints; +//mod http_entrypoints; pub mod multicast; pub(crate) mod params; pub(crate) mod rss_handle; diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 55d6e2c117..a16a89c712 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -2,11 +2,15 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Request body types for the bootstrap agent +//! Request types for the bootstrap agent + +use std::borrow::Cow; use omicron_common::address::{Ipv6Subnet, SLED_PREFIX}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use serde_repr::Deserialize_repr; +use serde_repr::Serialize_repr; /// Identity signed by local RoT and Oxide certificate chain. #[derive(Serialize, Deserialize, JsonSchema)] @@ -21,3 +25,24 @@ pub struct SledAgentRequest { /// Portion of the IP space to be managed by the Sled Agent. pub subnet: Ipv6Subnet, } + +#[derive(Clone, Copy, Debug, Serialize_repr, Deserialize_repr, PartialEq)] +#[repr(u32)] +pub enum Version { + V1 = 1, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub enum Request<'a> { + /// Send configuration information for launching a Sled Agent. + SledAgentRequest(Cow<'a, SledAgentRequest>), + + /// Request the sled's share of the rack secret. + ShareRequest, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct RequestEnvelope<'a> { + pub version: Version, + pub request: Request<'a>, +} diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index 1367a8addc..d945aa169b 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ -16,10 +16,7 @@ use omicron_common::backoff::internal_service_policy; use omicron_common::backoff::retry_notify; use omicron_common::backoff::BackoffError; use slog::Logger; -use std::net::SocketAddr; use std::net::SocketAddrV6; -use std::time::Duration; -use thiserror::Error; use tokio::sync::mpsc; use tokio::sync::oneshot; use tokio::task::JoinHandle; @@ -64,111 +61,22 @@ impl RssHandle { } } -#[derive(Debug, Error)] -enum InitializeSledAgentError { - #[error("Failed to construct an HTTP client: {0}")] - HttpClient(#[from] reqwest::Error), - - #[error("Failed to start sprockets proxy: {0}")] - SprocketsProxy(#[from] sprockets_proxy::Error), - - #[error("Error making HTTP request to Bootstrap Agent: {0}")] - BootstrapApi( - #[from] - bootstrap_agent_client::Error, - ), -} - async fn initialize_sled_agent( log: &Logger, bootstrap_addr: SocketAddrV6, request: &SledAgentRequest, sp: &Option, -) -> Result<(), InitializeSledAgentError> { - let dur = std::time::Duration::from_secs(60); - - let client = reqwest::ClientBuilder::new() - .connect_timeout(dur) - .timeout(dur) - .build()?; - - let (url, _proxy_task) = if let Some(sp) = sp.as_ref() { - // We have an SP; spawn a sprockets proxy for this connection. - let proxy_config = sprockets_proxy::Config { - bind_address: "[::1]:0".parse().unwrap(), - target_address: SocketAddr::V6(bootstrap_addr), - role: sprockets_proxy::Role::Client, - }; - // TODO-cleanup The `Duration` passed to `Proxy::new()` is the timeout - // for communicating with the RoT. Currently it can be set to anything - // at all (our simulated RoT always responds immediately). Should the - // value move to our config? - let proxy = sprockets_proxy::Proxy::new( - &proxy_config, - sp.manufacturing_public_key(), - sp.rot_handle(), - sp.rot_certs(), - Duration::from_secs(5), - log.new(o!("BootstrapAgentClientSprocketsProxy" - => proxy_config.target_address)), - ) - .await?; - - let proxy_addr = proxy.local_addr(); - - let proxy_task = tokio::spawn(async move { - // TODO-robustness `proxy.run()` only fails if `accept()`ing on our - // already-bound listening socket fails, which means something has - // gone very wrong. Do we have any recourse other than panicking? - // What does dropshot do if `accept()` fails? - proxy.run().await.expect("sprockets client proxy failed"); - }); - - // Wrap `proxy_task` in `AbortOnDrop`, which will abort it (shutting - // down the proxy) when we return. - let proxy_task = AbortOnDrop(proxy_task); - - info!( - log, "Sending request to peer agent via sprockets proxy"; - "peer" => %bootstrap_addr, - "sprockets_proxy" => %proxy_addr, - ); - (format!("http://{}", proxy_addr), Some(proxy_task)) - } else { - // We have no SP; connect directly. - info!( - log, "Sending request to peer agent"; - "peer" => %bootstrap_addr, - ); - (format!("http://{}", bootstrap_addr), None) - }; - - let client = bootstrap_agent_client::Client::new_with_client( - &url, - client, - log.new(o!("BootstrapAgentClient" => url.clone())), +) -> Result<(), bootstrap_agent_client::Error> { + let client = bootstrap_agent_client::Client::new( + bootstrap_addr, + sp, + log.new(o!("BootstrapAgentClient" => bootstrap_addr.to_string())), ); let sled_agent_initialize = || async { - client - .start_sled(&bootstrap_agent_client::types::SledAgentRequest { - subnet: bootstrap_agent_client::types::Ipv6Subnet { - net: bootstrap_agent_client::types::Ipv6Net( - request.subnet.net().to_string(), - ), - }, - }) - .await - .map_err(BackoffError::transient)?; + client.start_sled(request).await.map_err(BackoffError::transient)?; - Ok::< - (), - BackoffError< - bootstrap_agent_client::Error< - bootstrap_agent_client::types::Error, - >, - >, - >(()) + Ok::<(), BackoffError>(()) }; let log_failure = |error, _| { diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index f3a0cdc9a7..86653376c5 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -6,23 +6,30 @@ use super::agent::Agent; use super::config::Config; -use super::http_entrypoints::ba_api as http_api; +use super::params::Request; +use super::params::RequestEnvelope; +use super::params::Version; +use super::views::Response; +use super::views::ResponseEnvelope; use crate::config::Config as SledConfig; use crate::sp::SpHandle; -use dropshot::HttpServer; +use crate::sp::SprocketsRole; use slog::Drain; use slog::Logger; use std::net::Ipv6Addr; -use std::net::SocketAddr; use std::net::SocketAddrV6; use std::sync::Arc; -use std::time::Duration; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncWriteExt; +use tokio::net::TcpListener; +use tokio::net::TcpStream; +use tokio::task::JoinHandle; /// Wraps a [Agent] object, and provides helper methods for exposing it /// via an HTTP interface. pub struct Server { bootstrap_agent: Arc, - http_server: dropshot::HttpServer>, + inner: JoinHandle>, } impl Server { @@ -58,38 +65,18 @@ impl Server { ); let ba = Arc::clone(&bootstrap_agent); - let dropshot_log = log.new(o!("component" => "dropshot (Bootstrap)")); - let http_server = dropshot::HttpServerStarter::new( - &config.dropshot, - http_api(), - ba, - &dropshot_log, - ) - .map_err(|error| format!("initializing server: {}", error))? - .start(); - - // Are connections to our bootstrap dropshot server being tunneled - // through a sprockets proxy? If so, start up our half. - if let Some(sprockets_proxy_bind_addr) = - config.sprockets_proxy_bind_addr - { - spawn_sprockets_proxy( - &sp, - &http_server, - sprockets_proxy_bind_addr, - &log, - ) - .await?; - } + let ba_log = log.new(o!("component" => "BootstrapAgentServer")); + let inner = + Inner::start(config.bind_address, sp.clone(), ba, ba_log).await?; - let server = Server { bootstrap_agent, http_server }; + let server = Server { bootstrap_agent, inner }; // Initialize the bootstrap agent *after* the server has started. // This ordering allows the bootstrap agent to communicate with // other bootstrap agents on the rack during the initialization // process. if let Err(e) = server.bootstrap_agent.initialize(&config).await { - let _ = server.close().await; + server.inner.abort(); return Err(e.to_string()); } @@ -97,75 +84,164 @@ impl Server { } pub async fn wait_for_finish(self) -> Result<(), String> { - self.http_server.await + match self.inner.await { + Ok(result) => result, + Err(err) => { + if err.is_cancelled() { + // We control cancellation of `inner`, which only happens if + // we intentionally abort it in `close()`; that should not + // result in an error here. + Ok(()) + } else { + Err(format!("Join on server tokio task failed: {err}")) + } + } + } } pub async fn close(self) -> Result<(), String> { - self.http_server.close().await + self.inner.abort(); + self.wait_for_finish().await } } -pub fn run_openapi() -> Result<(), String> { - http_api() - .openapi("Oxide Bootstrap Agent API", "0.0.1") - .description("API for interacting with bootstrapping agents") - .contact_url("https://oxide.computer") - .contact_email("api@oxide.computer") - .write(&mut std::io::stdout()) - .map_err(|e| e.to_string()) +struct Inner { + listener: TcpListener, + sp: Option, + bootstrap_agent: Arc, + log: Logger, +} + +impl Inner { + async fn start( + bind_address: SocketAddrV6, + sp: Option, + bootstrap_agent: Arc, + log: Logger, + ) -> Result>, String> { + let listener = + TcpListener::bind(bind_address).await.map_err(|err| { + format!("could not bind to {bind_address}: {err}") + })?; + info!(log, "Started listening"; "local_addr" => %bind_address); + let inner = Inner { listener, sp, bootstrap_agent, log }; + Ok(tokio::spawn(inner.run())) + } + + // Run our sprockets server. The only `.await` point is when `accept()`ing + // on our bound socket, so we can be cleanly shut down by our caller + // `.abort()`ing our task, which will not affect any already-accepted + // sockets (which are spawned onto detached tokio tasks). + async fn run(self) -> Result<(), String> { + loop { + let (stream, remote_addr) = + self.listener.accept().await.map_err(|err| { + format!("accept() on already-bound socket failed: {err}") + })?; + + let log = self.log.new(o!("remote_addr" => remote_addr)); + info!(log, "Accepted connection"); + + let sp = self.sp.clone(); + let ba = Arc::clone(&self.bootstrap_agent); + tokio::spawn(async move { + match serve_single_request(stream, sp, ba, &log).await { + Ok(()) => info!(log, "Connection closed"), + Err(err) => warn!(log, "Connection failed"; "err" => err), + } + }); + } + } } -async fn spawn_sprockets_proxy( - sp: &Option, - http_server: &HttpServer>, - sprockets_proxy_bind_addr: SocketAddrV6, +async fn serve_single_request( + stream: TcpStream, + sp: Option, + bootstrap_agent: Arc, log: &Logger, ) -> Result<(), String> { - // We can only start a sprockets proxy if we have an SP. - let sp = sp.as_ref().ok_or( - "Misconfiguration: cannot start a sprockets proxy without an SP", - )?; - - // If we're running a sprockets proxy, our dropshot server should be - // listening on localhost. - let dropshot_addr = http_server.local_addr(); - if !dropshot_addr.ip().is_loopback() { - return Err(concat!( - "Misconfiguration: bootstrap dropshot IP address should ", - "be loopback when using a sprockets proxy" - ) - .into()); + // Bound to avoid allocating an unreasonable amount of memory from a bogus + // length prefix from a client. We authenticate clients via sprockets before + // allocating based on the length prefix they send, so it should be fine to + // be a little sloppy here and just pick something far larger than we ever + // expect to see. + const MAX_REQUEST_LEN: u32 = 128 << 20; + + // Establish sprockets session (if we have an SP). + let mut stream = + crate::sp::maybe_wrap_stream(stream, &sp, SprocketsRole::Server, log) + .await + .map_err(|err| { + format!("Failed to establish sprockets session: {err}") + })?; + + // Read request, length prefix first. + let request_length = stream + .read_u32() + .await + .map_err(|err| format!("Failed to read length prefix: {err}"))?; + + // Sanity check / guard against malformed lengths + if request_length > MAX_REQUEST_LEN { + return Err(format!( + "Rejecting incoming message with enormous length {request_length}" + )); + } + + let mut buf = vec![0; request_length as usize]; + stream.read_exact(&mut buf).await.map_err(|err| { + format!("Failed to read message of length {request_length}: {err}") + })?; + + // Deserialize request. + let envelope: RequestEnvelope = + serde_json::from_slice(&buf).map_err(|err| { + format!("Failed to deserialize request envelope: {err}") + })?; + + // Currently we only have one version, so there's nothing to do in this + // match, but we leave it here as a breadcrumb for future changes. + match envelope.version { + Version::V1 => (), } - let proxy_config = sprockets_proxy::Config { - bind_address: SocketAddr::V6(sprockets_proxy_bind_addr), - target_address: dropshot_addr, - role: sprockets_proxy::Role::Server, + // Handle request. + let response = match envelope.request { + Request::SledAgentRequest(request) => { + match bootstrap_agent.request_agent(&*request).await { + Ok(response) => Ok(Response::SledAgentResponse(response)), + Err(err) => { + warn!(log, "Sled agent request failed"; "err" => %err); + Err(format!("Sled agent request failed: {err}")) + } + } + } + Request::ShareRequest => { + Err("share request currently unsupported".to_string()) + } }; - let proxy_log = log.new(o!("component" => "sprockets-proxy (Bootstrap)")); - - // TODO-cleanup The `Duration` passed to `Proxy::new()` is the timeout - // for communicating with the RoT. Currently it can be set to anything - // at all (our simulated RoT always responds immediately). Should the - // value move to our config? - let proxy = sprockets_proxy::Proxy::new( - &proxy_config, - sp.manufacturing_public_key(), - sp.rot_handle(), - sp.rot_certs(), - Duration::from_secs(5), - proxy_log, - ) - .await - .map_err(|err| format!("Failed to start sprockets proxy: {err}"))?; - - tokio::spawn(async move { - // TODO-robustness `proxy.run()` only fails if `accept()`ing on our - // already-bound listening socket fails, which means something has - // gone very wrong. Do we have any recourse other than panicking? - // What does dropshot do if `accept()` fails? - proxy.run().await.expect("sprockets server proxy failed"); - }); + + // Build and serialize response. + let envelope = ResponseEnvelope { version: Version::V1, response }; + buf.clear(); + serde_json::to_writer(&mut buf, &envelope) + .map_err(|err| format!("Failed to serialize response: {err}"))?; + + // Write response, length prefix first. + let response_length = u32::try_from(buf.len()) + .expect("serialized bootstrap-agent response length overflowed u32"); + + stream.write_u32(response_length).await.map_err(|err| { + format!("Failed to write response length prefix: {err}") + })?; + stream + .write_all(&buf) + .await + .map_err(|err| format!("Failed to write response body: {err}"))?; + stream + .flush() + .await + .map_err(|err| format!("Failed to flush response body: {err}"))?; Ok(()) } diff --git a/sled-agent/src/bootstrap/views.rs b/sled-agent/src/bootstrap/views.rs index 56d3a9b80d..dae2259d8b 100644 --- a/sled-agent/src/bootstrap/views.rs +++ b/sled-agent/src/bootstrap/views.rs @@ -4,19 +4,34 @@ //! Response types for the bootstrap agent +use super::params::Version; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use uuid::Uuid; /// Sent between bootstrap agents to establish trust quorum. -#[derive(Serialize, Deserialize, JsonSchema)] +// Note: We intentionally do not derive `Debug` on this type, to avoid +// accidentally debug-logging the secret share. +#[derive(Serialize, Deserialize, JsonSchema, PartialEq)] pub struct ShareResponse { // TODO-completeness: format TBD; currently opaque. pub shared_secret: Vec, } /// Describes the Sled Agent running on the device. -#[derive(Serialize, Deserialize, JsonSchema)] +#[derive(Serialize, Deserialize, JsonSchema, PartialEq)] pub struct SledAgentResponse { pub id: Uuid, } + +#[derive(Serialize, Deserialize, PartialEq)] +pub enum Response { + SledAgentResponse(SledAgentResponse), + ShareResponse(ShareResponse), +} + +#[derive(Serialize, Deserialize, PartialEq)] +pub struct ResponseEnvelope { + pub version: Version, + pub response: Result, +} diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 0fef7054d2..602c7b7d8e 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -46,9 +46,6 @@ pub enum SetupServiceError { #[error("Failed to construct an HTTP client: {0}")] HttpClient(reqwest::Error), - - #[error("Failed to construct a sprockets proxy: {0}")] - SprocketsProxy(#[from] sprockets_proxy::Error), } // The workload / information allocated to a single sled. diff --git a/sled-agent/src/sp.rs b/sled-agent/src/sp.rs index 6148e2d5e7..f47bb4110d 100644 --- a/sled-agent/src/sp.rs +++ b/sled-agent/src/sp.rs @@ -22,12 +22,17 @@ use sprockets_host::RotManagerHandle; use sprockets_host::RotOpV1; use sprockets_host::RotResultV1; use sprockets_host::RotTransport; +use sprockets_host::Session; +use sprockets_host::SessionHandshakeError; use std::collections::VecDeque; use std::net::Ipv6Addr; use std::sync::Arc; use std::thread; +use std::time::Duration; use std::time::Instant; use thiserror::Error; +use tokio::io::AsyncRead; +use tokio::io::AsyncWrite; // These error cases are mostly simulation-specific; the list will grow once we // have real hardware (and may shrink if/when we remove or collapse simulated @@ -82,8 +87,8 @@ impl SpHandle { } } - // TODO The error type here leaks that we only currently support simulated - // SPs and will need work once we support a real SP. + // TODO-cleanup The error type here leaks that we only currently support + // simulated SPs and will need work once we support a real SP. pub fn rot_handle(&self) -> RotManagerHandle { match &self.inner { Inner::SimulatedSp(sim) => sim.rot_handle.clone(), @@ -95,6 +100,89 @@ impl SpHandle { Inner::SimulatedSp(sim) => sim.rot_certs, } } + + // TODO-cleanup The error type here leaks that we only currently support + // simulated SPs and will need work once we support a real SP. + pub(crate) async fn wrap_stream( + &self, + stream: T, + role: SprocketsRole, + log: &Logger, + ) -> Result, SessionHandshakeError> { + // TODO-cleanup Do we want this timeout to be configurable? + const ROT_TIMEOUT: Duration = Duration::from_secs(30); + + let session = match role { + SprocketsRole::Client => { + sprockets_host::Session::new_client( + stream, + self.manufacturing_public_key(), + self.rot_handle(), + self.rot_certs(), + ROT_TIMEOUT, + ) + .await? + } + SprocketsRole::Server => { + sprockets_host::Session::new_server( + stream, + self.manufacturing_public_key(), + self.rot_handle(), + self.rot_certs(), + ROT_TIMEOUT, + ) + .await? + } + }; + + let remote_identity = session.remote_identity(); + // TODO-correctness We must check `remote_identity` against the list + // of devices expected in our trust quorum (once we have such a + // list!). + + info!( + log, "Negotiated sprockets session"; + "peer_serial_number" => ?remote_identity.certs.serial_number, + ); + Ok(session) + } +} + +#[derive(Debug, Clone, Copy)] +pub(crate) enum SprocketsRole { + Client, + Server, +} + +pub(crate) trait AsyncReadWrite: + AsyncRead + AsyncWrite + Send + Unpin +{ +} +impl AsyncReadWrite for T where T: AsyncRead + AsyncWrite + Send + Unpin {} + +/// Helper function to wrap a stream in a sprockets session if we have an SP, or +/// return it wrapped in an unauthenticated `BufStream` if not. +/// +/// TODO-cleanup This function should be removed when we start requiring an SP +/// (even if simulated) to be present. +pub(crate) async fn maybe_wrap_stream( + stream: T, + sp: &Option, + role: SprocketsRole, + log: &Logger, +) -> Result, SessionHandshakeError> +{ + match sp.as_ref() { + Some(sp) => { + info!(log, "SP available; establishing sprockets session"); + let session = sp.wrap_stream(stream, role, log).await?; + Ok(Box::new(session)) + } + None => { + info!(log, "No SP available; proceeding without sprockets auth"); + Ok(Box::new(tokio::io::BufStream::new(stream))) + } + } } #[derive(Clone)] diff --git a/sled-agent/tests/integration_tests/commands.rs b/sled-agent/tests/integration_tests/commands.rs index 25960b5998..3f7fda5584 100644 --- a/sled-agent/tests/integration_tests/commands.rs +++ b/sled-agent/tests/integration_tests/commands.rs @@ -55,32 +55,9 @@ fn test_sled_agent_no_args() { assert_contents("tests/output/cmd-sled-agent-noargs-stderr", &stderr_text); } -#[test] -fn test_sled_agent_openapi_bootagent() { - let exec = Exec::cmd(path_to_sled_agent()).arg("openapi").arg("bootstrap"); - let (exit_status, stdout_text, stderr_text) = run_command(exec); - assert_exit_code(exit_status, EXIT_SUCCESS); - assert_contents( - "tests/output/cmd-sled-agent-openapi-bootstrap-stderr", - &stderr_text, - ); - - let spec: OpenAPI = serde_json::from_str(&stdout_text) - .expect("stdout was not valid OpenAPI"); - - // Check for lint errors. - let errors = openapi_lint::validate(&spec); - assert!(errors.is_empty(), "{}", errors.join("\n\n")); - - // Confirm that the output hasn't changed. It's expected that we'll change - // this file as the API evolves, but pay attention to the diffs to ensure - // that the changes match your expectations. - assert_contents("../openapi/bootstrap-agent.json", &stdout_text); -} - #[test] fn test_sled_agent_openapi_sled() { - let exec = Exec::cmd(path_to_sled_agent()).arg("openapi").arg("sled"); + let exec = Exec::cmd(path_to_sled_agent()).arg("openapi"); let (exit_status, stdout_text, stderr_text) = run_command(exec); assert_exit_code(exit_status, EXIT_SUCCESS); assert_contents(