From a5052d0ec05348b734bab5fd02f76847b12729ed Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Wed, 2 Oct 2024 21:55:24 +0000
Subject: [PATCH 01/28] sled agent APIs to manage update artifact storage

---
 Cargo.lock                             |  28 ++
 Cargo.toml                             |   6 +
 clients/repo-depot-client/Cargo.toml   |  16 +
 clients/repo-depot-client/src/lib.rs   |  24 ++
 common/src/address.rs                  |   1 +
 dev-tools/openapi-manager/Cargo.toml   |   7 +-
 dev-tools/openapi-manager/src/spec.rs  |   9 +
 openapi/repo-depot.json                |  82 +++++
 openapi/sled-agent.json                | 111 ++++++
 sled-agent/Cargo.toml                  |   4 +
 sled-agent/api/src/lib.rs              |  42 ++-
 sled-agent/repo-depot-api/Cargo.toml   |  15 +
 sled-agent/repo-depot-api/src/lib.rs   |  28 ++
 sled-agent/src/artifact_store.rs       | 481 +++++++++++++++++++++++++
 sled-agent/src/http_entrypoints.rs     |  34 ++
 sled-agent/src/lib.rs                  |   1 +
 sled-agent/src/sim/artifact_store.rs   |  45 +++
 sled-agent/src/sim/http_entrypoints.rs |  34 ++
 sled-agent/src/sim/mod.rs              |   1 +
 sled-agent/src/sim/sled_agent.rs       |  24 +-
 sled-agent/src/sled_agent.rs           |  18 +
 21 files changed, 1001 insertions(+), 10 deletions(-)
 create mode 100644 clients/repo-depot-client/Cargo.toml
 create mode 100644 clients/repo-depot-client/src/lib.rs
 create mode 100644 openapi/repo-depot.json
 create mode 100644 sled-agent/repo-depot-api/Cargo.toml
 create mode 100644 sled-agent/repo-depot-api/src/lib.rs
 create mode 100644 sled-agent/src/artifact_store.rs
 create mode 100644 sled-agent/src/sim/artifact_store.rs

diff --git a/Cargo.lock b/Cargo.lock
index 1ec40e5cb6..23627d9f1e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6866,6 +6866,7 @@ dependencies = [
  "glob",
  "guppy",
  "hex",
+ "hex-literal",
  "http 1.1.0",
  "hyper 1.4.1",
  "hyper-staticfile",
@@ -6899,12 +6900,15 @@ dependencies = [
  "propolis_api_types",
  "rand",
  "rcgen",
+ "repo-depot-api",
+ "repo-depot-client",
  "reqwest 0.12.7",
  "schemars",
  "semver 1.0.23",
  "serde",
  "serde_human_bytes",
  "serde_json",
+ "sha2",
  "sha3",
  "sled-agent-api",
  "sled-agent-client",
@@ -7204,6 +7208,7 @@ dependencies = [
  "openapiv3",
  "owo-colors",
  "oximeter-api",
+ "repo-depot-api",
  "serde_json",
  "similar",
  "sled-agent-api",
@@ -9073,6 +9078,29 @@ version = "1.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd84827f5b437bd84827f5b4"

+[[package]]
+name = "repo-depot-api"
+version = "0.1.0"
+dependencies = [
+ "dropshot 0.12.0",
+ "omicron-common",
+ "omicron-workspace-hack",
+ "schemars",
+ "serde",
+]
+
+[[package]]
+name = "repo-depot-client"
+version = "0.1.0"
+dependencies = [
+ "omicron-workspace-hack",
+ "progenitor",
+ "reqwest 0.12.7",
+ "schemars",
+ "serde",
+ "slog",
+]
+
 [[package]]
 name = "reqwest"
 version = "0.11.27"
diff --git a/Cargo.toml b/Cargo.toml
index 5411e233b2..4fd82e6f91 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,7 @@ members = [
     "clients/nexus-client",
     "clients/oxide-client",
     "clients/oximeter-client",
+    "clients/repo-depot-client",
     "clients/sled-agent-client",
     "clients/wicketd-client",
     "cockroach-admin",
@@ -97,6 +98,7 @@ members = [
     "sled-agent",
     "sled-agent/api",
     "sled-agent/bootstrap-agent-api",
+    "sled-agent/repo-depot-api",
    "sled-agent/types",
    "sled-hardware",
    "sled-hardware/types",
@@ -135,6 +137,7 @@ default-members = [
    "clients/nexus-client",
    "clients/oxide-client",
    "clients/oximeter-client",
+   "clients/repo-depot-client",
    "clients/sled-agent-client",
    "clients/wicketd-client",
    "cockroach-admin",
@@ -219,6 +222,7 @@ default-members = [
    "sled-agent",
    "sled-agent/api",
    "sled-agent/bootstrap-agent-api",
+   "sled-agent/repo-depot-api",
"sled-agent/types", "sled-hardware", "sled-hardware/types", @@ -529,6 +533,8 @@ reedline = "0.33.0" ref-cast = "1.0" regex = "1.10.6" regress = "0.9.1" +repo-depot-api = { path = "sled-agent/repo-depot-api" } +repo-depot-client = { path = "clients/repo-depot-client" } reqwest = { version = "0.12", default-features = false } ring = "0.17.8" rpassword = "7.3.1" diff --git a/clients/repo-depot-client/Cargo.toml b/clients/repo-depot-client/Cargo.toml new file mode 100644 index 0000000000..858c75632f --- /dev/null +++ b/clients/repo-depot-client/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "repo-depot-client" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +omicron-workspace-hack.workspace = true +progenitor.workspace = true +reqwest.workspace = true +schemars.workspace = true +serde.workspace = true +slog.workspace = true diff --git a/clients/repo-depot-client/src/lib.rs b/clients/repo-depot-client/src/lib.rs new file mode 100644 index 0000000000..69e21cdaf3 --- /dev/null +++ b/clients/repo-depot-client/src/lib.rs @@ -0,0 +1,24 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Interface for Sled Agent's Repo Depot to make API requests. + +progenitor::generate_api!( + spec = "../../openapi/repo-depot.json", + inner_type = slog::Logger, + pre_hook = (|log: &slog::Logger, request: &reqwest::Request| { + slog::debug!(log, "client request"; + "method" => %request.method(), + "uri" => %request.url(), + "body" => ?&request.body(), + ); + }), + post_hook = (|log: &slog::Logger, result: &Result<_, _>| { + slog::debug!(log, "client response"; "result" => ?result); + }), + derives = [schemars::JsonSchema], +); + +/// A type alias for errors returned by this crate. 
+pub type ClientError = crate::Error;
diff --git a/common/src/address.rs b/common/src/address.rs
index 7cf00d5228..7e6d68ebc8 100644
--- a/common/src/address.rs
+++ b/common/src/address.rs
@@ -29,6 +29,7 @@ pub const MIN_PORT: u16 = u16::MIN;
 pub const DNS_PORT: u16 = 53;
 pub const DNS_HTTP_PORT: u16 = 5353;
 pub const SLED_AGENT_PORT: u16 = 12345;
+pub const REPO_DEPOT_PORT: u16 = 12348;
 pub const COCKROACH_PORT: u16 = 32221;
 pub const COCKROACH_ADMIN_PORT: u16 = 32222;
diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml
index 211e134016..d32477caf3 100644
--- a/dev-tools/openapi-manager/Cargo.toml
+++ b/dev-tools/openapi-manager/Cargo.toml
@@ -12,9 +12,9 @@ anyhow.workspace = true
 atomicwrites.workspace = true
 bootstrap-agent-api.workspace = true
 camino.workspace = true
+clap.workspace = true
 clickhouse-admin-api.workspace = true
 cockroach-admin-api.workspace = true
-clap.workspace = true
 dns-server-api.workspace = true
 dropshot.workspace = true
 fs-err.workspace = true
@@ -24,13 +24,14 @@ installinator-api.workspace = true
 nexus-external-api.workspace = true
 nexus-internal-api.workspace = true
 omicron-workspace-hack.workspace = true
-openapiv3.workspace = true
 openapi-lint.workspace = true
 openapi-manager-types.workspace = true
+openapiv3.workspace = true
 owo-colors.workspace = true
 oximeter-api.workspace = true
+repo-depot-api.workspace = true
 serde_json.workspace = true
-sled-agent-api.workspace = true
 similar.workspace = true
+sled-agent-api.workspace = true
 supports-color.workspace = true
 wicketd-api.workspace = true
diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs
index 7d734218fc..46cba6afb7 100644
--- a/dev-tools/openapi-manager/src/spec.rs
+++ b/dev-tools/openapi-manager/src/spec.rs
@@ -110,6 +110,15 @@ pub fn all_apis() -> Vec<ApiSpec> {
         filename: "oximeter.json",
         extra_validation: None,
     },
+    ApiSpec {
+        title: "Oxide TUF Repo Depot API",
+        version: "0.0.1",
+        description: "API for fetching update artifacts",
+        boundary: ApiBoundary::Internal,
+        api_description: repo_depot_api::repo_depot_api_mod::stub_api_description,
+        filename: "repo-depot.json",
+        extra_validation: None,
+    },
     ApiSpec {
         title: "Oxide Sled Agent API",
         version: "0.0.1",
diff --git a/openapi/repo-depot.json b/openapi/repo-depot.json
new file mode 100644
index 0000000000..0c0019cf8d
--- /dev/null
+++ b/openapi/repo-depot.json
@@ -0,0 +1,82 @@
+{
+  "openapi": "3.0.3",
+  "info": {
+    "title": "Oxide TUF Repo Depot API",
+    "description": "API for fetching update artifacts",
+    "contact": {
+      "url": "https://oxide.computer",
+      "email": "api@oxide.computer"
+    },
+    "version": "0.0.1"
+  },
+  "paths": {
+    "/artifact/sha256/{sha256}": {
+      "get": {
+        "summary": "Fetch an artifact from the depot.",
+        "operationId": "artifact_get_by_sha256",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "sha256",
+            "required": true,
+            "schema": {
+              "type": "string",
+              "format": "hex string (32 bytes)"
+            }
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "*/*": {
+                "schema": {}
+              }
+            }
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    }
+  },
+  "components": {
+    "schemas": {
+      "Error": {
+        "description": "Error information from a response.",
+        "type": "object",
+        "properties": {
+          "error_code": {
+            "type": "string"
+          },
+          "message": {
+            "type": "string"
+          },
+          "request_id": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "message",
+          "request_id"
+        ]
+      }
+    },
+    "responses": {
+      "Error": {
+        "description": "Error",
+        "content": {
+          "application/json": {
+            "schema": {
+              "$ref": "#/components/schemas/Error"
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json
index c6f029ea9a..8bdf0647f6 100644
--- a/openapi/sled-agent.json
+++ b/openapi/sled-agent.json
@@ -10,6 +10,106 @@
     "version": "0.0.1"
   },
   "paths": {
+    "/artifacts/{sha256}": {
+      "put": {
+        "operationId": "artifact_put",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "sha256",
+            "required": true,
+            "schema": {
+              "type": "string",
+              "format": "hex string (32 bytes)"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/octet-stream": {
+              "schema": {
+                "type": "string",
+                "format": "binary"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "resource updated"
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      },
+      "delete": {
+        "operationId": "artifact_delete",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "sha256",
+            "required": true,
+            "schema": {
+              "type": "string",
+              "format": "hex string (32 bytes)"
+            }
+          }
+        ],
+        "responses": {
+          "204": {
+            "description": "successful deletion"
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    },
+    "/artifacts/{sha256}/copy-from-depot": {
+      "post": {
+        "operationId": "artifact_copy_from_depot",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "sha256",
+            "required": true,
+            "schema": {
+              "type": "string",
+              "format": "hex string (32 bytes)"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/ArtifactCopyFromDepotBody"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "resource updated"
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    },
     "/boot-disk/{boot_disk}/os/write": {
       "post": {
         "summary": "Write a new host OS image to the specified boot disk",
@@ -1411,6 +1511,17 @@
           "start_request"
         ]
       },
+      "ArtifactCopyFromDepotBody": {
+        "type": "object",
+        "properties": {
+          "address": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "address"
+        ]
+      },
       "Baseboard": {
         "description": "Describes properties that should uniquely identify a Gimlet.",
         "oneOf": [
diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml
index 3208f1c031..3ef367e0ea 100644
--- a/sled-agent/Cargo.toml
+++ b/sled-agent/Cargo.toml
@@ -63,12 +63,15 @@ propolis_api_types.workspace = true
 propolis-client.workspace = true
 propolis-mock-server.workspace = true # Only used by the simulated sled agent
 rand = { workspace = true, features = ["getrandom"] }
+repo-depot-api.workspace = true
+repo-depot-client.workspace = true
 reqwest = { workspace = true, features = ["rustls-tls", "stream"] }
 schemars = { workspace = true, features = ["chrono", "uuid1"] }
 semver.workspace = true
 serde.workspace = true
 serde_human_bytes.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
+sha2.workspace = true
 sha3.workspace = true
 sled-agent-api.workspace = true
 sled-agent-client.workspace = true
@@ -104,6 +107,7 @@ opte-ioctl.workspace = true
 assert_matches.workspace = true
 expectorate.workspace = true
 guppy.workspace = true
+hex-literal.workspace = true
 http.workspace = true
 hyper.workspace = true
 omicron-test-utils.workspace = true
diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs
index d9e49a5c56..73f9cad2d5 100644
--- a/sled-agent/api/src/lib.rs
+++ b/sled-agent/api/src/lib.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use std::{collections::BTreeMap, time::Duration};
+use std::{collections::BTreeMap, net::SocketAddrV6, time::Duration};
 
 use camino::Utf8PathBuf;
 use dropshot::{
@@ -25,6 +25,7 @@ use omicron_common::{
         DatasetsConfig, DatasetsManagementResult, DiskVariant,
         DisksManagementResult, OmicronPhysicalDisksConfig,
     },
+    update::ArtifactHash,
 };
 use omicron_uuid_kinds::{PropolisUuid, ZpoolUuid};
 use schemars::JsonSchema;
@@ -309,6 +310,35 @@ pub trait SledAgentApi {
         artifact: TypedBody<UpdateArtifactId>,
     ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
 
+    #[endpoint {
+        method = POST,
+        path = "/artifacts/{sha256}/copy-from-depot"
+    }]
+    async fn artifact_copy_from_depot(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+        body: TypedBody<ArtifactCopyFromDepotBody>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
+    #[endpoint {
+        method = PUT,
+        path = "/artifacts/{sha256}"
+    }]
+    async fn artifact_put(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+        body: StreamingBody,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
+    #[endpoint {
+        method = DELETE,
+        path = "/artifacts/{sha256}"
+    }]
+    async fn artifact_delete(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+    ) -> Result<HttpResponseDeleted, HttpError>;
+
     /// Take a snapshot of a disk that is attached to an instance
     #[endpoint {
         method = POST,
@@ -545,6 +575,16 @@ pub struct DiskPathParam {
     pub disk_id: Uuid,
 }
 
+#[derive(Deserialize, JsonSchema)]
+pub struct ArtifactPathParam {
+    pub sha256: ArtifactHash,
+}
+
+#[derive(Deserialize, JsonSchema)]
+pub struct ArtifactCopyFromDepotBody {
+    pub address: SocketAddrV6,
+}
+
 #[derive(Deserialize, JsonSchema)]
 pub struct VmmIssueDiskSnapshotRequestPathParam {
     pub propolis_id: PropolisUuid,
diff --git a/sled-agent/repo-depot-api/Cargo.toml b/sled-agent/repo-depot-api/Cargo.toml
new file mode 100644
index 0000000000..f9fa60ad8b
--- /dev/null
+++ b/sled-agent/repo-depot-api/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "repo-depot-api"
+version = "0.1.0"
+edition = "2021"
+license = "MPL-2.0"
+
+[lints]
+workspace = true
+
+[dependencies]
+dropshot.workspace = true
+omicron-common.workspace = true
+omicron-workspace-hack.workspace = true
+schemars.workspace = true
+serde.workspace = true
diff --git a/sled-agent/repo-depot-api/src/lib.rs b/sled-agent/repo-depot-api/src/lib.rs
new file mode 100644
index 0000000000..236b9c8e7a
--- /dev/null
+++ b/sled-agent/repo-depot-api/src/lib.rs
@@ -0,0 +1,28 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use dropshot::{FreeformBody, HttpError, HttpResponseOk, Path, RequestContext};
+use omicron_common::update::ArtifactHash;
+use schemars::JsonSchema;
+use serde::Deserialize;
+
+#[dropshot::api_description]
+pub trait RepoDepotApi {
+    type Context;
+
+    /// Fetch an artifact from the depot.
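+    // (Illustrative request shape, an editorial note rather than part of the
+    // generated OpenAPI description: `GET /artifact/sha256/<hex digest>`
+    // returns the raw artifact bytes with a 200 status.)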
+    #[endpoint {
+        method = GET,
+        path = "/artifact/sha256/{sha256}",
+    }]
+    async fn artifact_get_by_sha256(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParams>,
+    ) -> Result<HttpResponseOk<FreeformBody>, HttpError>;
+}
+
+#[derive(Clone, Debug, Deserialize, JsonSchema)]
+pub struct ArtifactPathParams {
+    pub sha256: ArtifactHash,
+}
diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
new file mode 100644
index 0000000000..b1f452a3d9
--- /dev/null
+++ b/sled-agent/src/artifact_store.rs
@@ -0,0 +1,481 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Manages update artifacts stored on this sled. The implementation is a very
+//! basic content-addressed object store.
+//!
+//! GET operations are handled by the "Repo Depot" API, which is deliberately
+//! a separate Dropshot service from the rest of Sled Agent. This is to avoid a
+//! circular logical dependency, because we expect Sled Agent to fetch artifacts
+//! it does not have from another Repo Depot that does have them (at Nexus's
+//! direction). This API's implementation is also part of this module.
+//!
+//! POST, PUT, and DELETE operations are handled by the Sled Agent API.
+
+use std::io::ErrorKind;
+use std::net::SocketAddrV6;
+use std::sync::LazyLock;
+use std::time::Duration;
+
+use camino::{Utf8Path, Utf8PathBuf};
+use camino_tempfile::NamedUtf8TempFile;
+use dropshot::{
+    Body, ConfigDropshot, FreeformBody, HttpError, HttpResponseOk,
+    HttpServerStarter, Path, RequestContext, StreamingBody,
+};
+use futures::{Stream, TryStreamExt};
+use illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT;
+use omicron_common::address::REPO_DEPOT_PORT;
+use omicron_common::disk::{DatasetKind, DatasetsConfig};
+use omicron_common::update::ArtifactHash;
+use repo_depot_api::*;
+use sha2::{Digest, Sha256};
+use sled_storage::manager::StorageHandle;
+use slog::{info, Logger};
+use tokio::fs::File;
+use tokio::io::AsyncWriteExt;
+
+#[derive(Clone)]
+pub(crate) struct ArtifactStore<T = StorageHandle> {
+    log: Logger,
+    storage: T,
+}
+
+impl<T> ArtifactStore<T> {
+    pub(crate) fn new(log: &Logger, storage: T) -> ArtifactStore<T> {
+        ArtifactStore {
+            log: log.new(slog::o!("component" => "ArtifactStore")),
+            storage,
+        }
+    }
+}
+
+impl ArtifactStore {
+    pub(crate) fn start(
+        self,
+        sled_address: SocketAddrV6,
+        dropshot_config: &ConfigDropshot,
+    ) -> Result<
+        dropshot::HttpServer<ArtifactStore<StorageHandle>>,
+        Box<dyn std::error::Error>,
+    > {
+        let mut depot_address = sled_address;
+        depot_address.set_port(REPO_DEPOT_PORT);
+
+        let log = self.log.new(o!("component" => "dropshot (Repo Depot)"));
+        Ok(HttpServerStarter::new(
+            &ConfigDropshot {
+                bind_address: depot_address.into(),
+                ..dropshot_config.clone()
+            },
+            repo_depot_api_mod::api_description::<RepoDepotImpl>()
+                .expect("registered entrypoints"),
+            self,
+            &log,
+        )?
+        .start())
+    }
+}
+
+impl<T: StorageBackend> ArtifactStore<T> {
+    /// GET operation (served by Repo Depot API)
+    pub(crate) async fn get(
+        &self,
+        sha256: ArtifactHash,
+    ) -> Result<File, Error> {
+        let sha256 = sha256.to_string();
+        for dataset in self.datasets().await? {
+            let path = dataset.join(&sha256);
+            match File::open(&path).await {
+                Ok(file) => {
+                    info!(
+                        &self.log,
+                        "Retrieved artifact";
+                        "sha256" => &sha256,
+                        "path" => path.as_str()
+                    );
+                    return Ok(file);
+                }
+                Err(err) if err.kind() == ErrorKind::NotFound => {}
+                Err(err) => {
+                    return Err(Error::File { verb: "open", path, err });
+                }
+            }
+        }
+        Err(Error::NotFound { sha256 })
+    }
+
+    /// PUT operation (served by Sled Agent API)
+    pub(crate) async fn put(
+        &self,
+        sha256: ArtifactHash,
+        stream: impl Stream<Item = Result<impl AsRef<[u8]>, Error>>,
+    ) -> Result<(), Error> {
+        let ds = self.datasets().await?.next().ok_or(Error::NoUpdateDataset)?;
+        let final_path = ds.join(sha256.to_string());
+
+        let (file, temp_path) = tokio::task::spawn_blocking(move || {
+            NamedUtf8TempFile::new_in(&ds).map_err(|err| Error::File {
+                verb: "create temporary file in",
+                path: ds,
+                err,
+            })
+        })
+        .await??
+        .into_parts();
+        let file =
+            NamedUtf8TempFile::from_parts(File::from_std(file), temp_path);
+
+        let (mut file, hasher) = stream
+            .try_fold(
+                (file, Sha256::new()),
+                |(mut file, mut hasher), chunk| async move {
+                    hasher.update(&chunk);
+                    match file.as_file_mut().write_all(chunk.as_ref()).await {
+                        Ok(()) => Ok((file, hasher)),
+                        Err(err) => Err(Error::File {
+                            verb: "write to",
+                            path: file.path().to_owned(),
+                            err,
+                        }),
+                    }
+                },
+            )
+            .await?;
+        file.as_file_mut().sync_data().await.map_err(|err| Error::File {
+            verb: "sync",
+            path: final_path.clone(),
+            err,
+        })?;
+
+        let digest = hasher.finalize();
+        if digest.as_slice() != sha256.as_ref() {
+            return Err(Error::HashMismatch {
+                expected: sha256,
+                actual: ArtifactHash(digest.into()),
+            });
+        }
+
+        let moved_final_path = final_path.clone();
+        tokio::task::spawn_blocking(move || {
+            file.persist(&moved_final_path).map_err(|err| Error::File {
+                verb: "rename temporary file to",
+                path: moved_final_path,
+                err: err.error,
+            })
+        })
+        .await??;
+        info!(
+            &self.log,
+            "Wrote artifact";
+            "sha256" => &sha256.to_string(),
+            "path" => final_path.as_str()
+        );
+        Ok(())
+    }
+
+    /// PUT operation (served by Sled Agent API) which takes a [`StreamingBody`]
+    pub(crate) async fn put_body(
+        &self,
+        sha256: ArtifactHash,
+        body: StreamingBody,
+    ) -> Result<(), Error> {
+        self.put(sha256, body.into_stream().map_err(Error::Body)).await
+    }
+
+    /// POST operation (served by Sled Agent API)
+    pub(crate) async fn copy_from_depot(
+        &self,
+        sha256: ArtifactHash,
+        address: SocketAddrV6,
+    ) -> Result<(), Error> {
+        static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
+            reqwest::ClientBuilder::new()
+                .connect_timeout(Duration::from_secs(15))
+                .read_timeout(Duration::from_secs(15))
+                .build()
+                .unwrap()
+        });
+
+        let client = repo_depot_client::Client::new_with_client(
+            &format!("http://{address}"),
+            CLIENT.clone(),
+            self.log.new(
+                slog::o!("component" => "Repo Depot client (ArtifactStore)"),
+            ),
+        );
+        let stream = client
+            .artifact_get_by_sha256(&sha256.to_string())
+            .await
+            .map_err(|err| Error::DepotCopy { sha256, address, err })?
+            .into_inner()
+            .into_inner()
+            .map_err(|err| Error::DepotCopy {
+                sha256,
+                address,
+                err: repo_depot_client::ClientError::ResponseBodyError(err),
+            });
+        self.put(sha256, stream).await
+    }
+
+    /// DELETE operation (served by Sled Agent API)
+    pub(crate) async fn delete(
+        &self,
+        sha256: ArtifactHash,
+    ) -> Result<(), Error> {
+        let sha256 = sha256.to_string();
+        let mut any_datasets = false;
+        for dataset in self.datasets().await? {
+            any_datasets = true;
+            let path = dataset.join(&sha256);
+            match tokio::fs::remove_file(&path).await {
+                Ok(()) => {
+                    info!(
+                        &self.log,
+                        "Removed artifact";
+                        "sha256" => &sha256,
+                        "path" => path.as_str()
+                    );
+                }
+                Err(err) if err.kind() == ErrorKind::NotFound => {}
+                Err(err) => {
+                    return Err(Error::File { verb: "remove", path, err });
+                }
+            }
+        }
+        if any_datasets {
+            Ok(())
+        } else {
+            // If we're here because there aren't any update datasets, we should
+            // report Service Unavailable instead of a successful result.
+            Err(Error::NoUpdateDataset)
+        }
+    }
+
+    async fn datasets(
+        &self,
+    ) -> Result<impl Iterator<Item = Utf8PathBuf> + '_, Error> {
+        let config = self.storage.datasets_config_list().await?;
+        let mountpoint_root = self.storage.mountpoint_root();
+        Ok(config
+            .datasets
+            .into_values()
+            .filter(|dataset| *dataset.name.dataset() == DatasetKind::Update)
+            .map(|dataset| dataset.name.mountpoint(mountpoint_root)))
+    }
+}
+
+enum RepoDepotImpl {}
+
+impl RepoDepotApi for RepoDepotImpl {
+    type Context = ArtifactStore;
+
+    async fn artifact_get_by_sha256(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParams>,
+    ) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
+        let sha256 = path_params.into_inner().sha256;
+        let file = rqctx.context().get(sha256).await?;
+        let file_access = hyper_staticfile::vfs::TokioFileAccess::new(file);
+        let file_stream =
+            hyper_staticfile::util::FileBytesStream::new(file_access);
+        let body = Body::wrap(hyper_staticfile::Body::Full(file_stream));
+        Ok(HttpResponseOk(FreeformBody(body)))
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum Error {
+    #[error(transparent)]
+    Body(dropshot::HttpError),
+
+    #[error("Error retrieving dataset configuration: {0}")]
+    DatasetConfig(#[source] sled_storage::error::Error),
+
+    #[error("Error fetching artifact {sha256} from depot {address}: {err}")]
+    DepotCopy {
+        sha256: ArtifactHash,
+        address: SocketAddrV6,
+        #[source]
+        err: repo_depot_client::ClientError,
+    },
+
+    #[error("Failed to {verb} `{path}`: {err}")]
+    File {
+        verb: &'static str,
+        path: Utf8PathBuf,
+        #[source]
+        err: std::io::Error,
+    },
+
+    #[error("Digest mismatch: expected {expected}, actual {actual}")]
+    HashMismatch { expected: ArtifactHash, actual: ArtifactHash },
+
+    #[error(transparent)]
+    Join(#[from] tokio::task::JoinError),
+
+    #[error("Artifact {sha256} not found")]
+    NotFound { sha256: String },
+
+    #[error("No update datasets present")]
+    NoUpdateDataset,
+}
+
+impl From<Error> for HttpError {
+    fn from(err: Error) -> HttpError {
+        match err {
+            Error::Body(inner) => inner,
+            Error::DatasetConfig(_) | Error::NoUpdateDataset => {
+                HttpError::for_unavail(None, err.to_string())
+            }
+            Error::DepotCopy { .. } | Error::File { .. } | Error::Join(_) => {
+                HttpError::for_internal_error(err.to_string())
+            }
+            Error::HashMismatch { .. } => {
+                HttpError::for_bad_request(None, err.to_string())
+            }
+            Error::NotFound { .. } => {
+                HttpError::for_not_found(None, err.to_string())
+            }
+        }
+    }
+}
+
+pub(crate) trait StorageBackend {
+    async fn datasets_config_list(&self) -> Result<DatasetsConfig, Error>;
+    fn mountpoint_root(&self) -> &Utf8Path;
+}
+
+impl StorageBackend for StorageHandle {
+    async fn datasets_config_list(&self) -> Result<DatasetsConfig, Error> {
+        self.datasets_config_list().await.map_err(Error::DatasetConfig)
+    }
+
+    fn mountpoint_root(&self) -> &Utf8Path {
+        ZPOOL_MOUNTPOINT_ROOT.into()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use camino::Utf8Path;
+    use camino_tempfile::Utf8TempDir;
+    use futures::stream;
+    use hex_literal::hex;
+    use omicron_common::disk::{
+        DatasetConfig, DatasetKind, DatasetName, DatasetsConfig,
+    };
+    use omicron_common::update::ArtifactHash;
+    use omicron_common::zpool_name::ZpoolName;
+    use omicron_test_utils::dev::test_setup_log;
+    use omicron_uuid_kinds::{DatasetUuid, ZpoolUuid};
+    use tokio::io::AsyncReadExt;
+
+    use super::{ArtifactStore, Error, StorageBackend};
+
+    struct TestBackend {
+        datasets: DatasetsConfig,
+        mountpoint_root: Utf8TempDir,
+    }
+
+    impl TestBackend {
+        fn new(len: usize) -> TestBackend {
+            let mountpoint_root = camino_tempfile::tempdir().unwrap();
+
+            let mut datasets = DatasetsConfig::default();
+            if len > 0 {
+                datasets.generation = datasets.generation.next();
+            }
+            for _ in 0..len {
+                let dataset = DatasetConfig {
+                    id: DatasetUuid::new_v4(),
+                    name: DatasetName::new(
+                        ZpoolName::new_external(ZpoolUuid::new_v4()),
+                        DatasetKind::Update,
+                    ),
+                    compression: Default::default(),
+                    quota: None,
+                    reservation: None,
+                };
+                let mountpoint =
+                    dataset.name.mountpoint(mountpoint_root.path());
+                std::fs::create_dir_all(mountpoint).unwrap();
+                datasets.datasets.insert(dataset.id, dataset);
+            }
+
+            TestBackend { datasets, mountpoint_root }
+        }
+    }
+
+    impl StorageBackend for TestBackend {
+        async fn datasets_config_list(&self) -> Result<DatasetsConfig, Error> {
+            Ok(self.datasets.clone())
+        }
+
+        fn mountpoint_root(&self) -> &Utf8Path {
+            self.mountpoint_root.path()
+        }
+    }
+
+    const TEST_ARTIFACT: &[u8] = b"I'm an artifact!\n";
+    const TEST_HASH: ArtifactHash = ArtifactHash(hex!(
+        "ab3581cd62f6645518f61a8e4391af6c062d5d60111edb0e51b37bd84827f5b4"
+    ));
+
+    #[tokio::test]
+    async fn get_put_delete() {
+        let log = test_setup_log("get_put_delete");
+        let backend = TestBackend::new(1);
+        let store = ArtifactStore::new(&log.log, backend);
+
+        // get fails, because it doesn't exist yet
+        assert!(matches!(
+            store.get(TEST_HASH).await,
+            Err(Error::NotFound { .. })
+        ));
+        // delete does not fail because we don't fail if the artifact is not
+        // present
+        assert!(matches!(store.delete(TEST_HASH).await, Ok(())));
+
+        // put succeeds
+        store
+            .put(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) }))
+            .await
+            .unwrap();
+
+        // get now succeeds, file reads back OK
+        let mut file = store.get(TEST_HASH).await.unwrap();
+        let mut vec = Vec::new();
+        file.read_to_end(&mut vec).await.unwrap();
+        assert_eq!(vec, TEST_ARTIFACT);
+        // delete succeeds
+        store.delete(TEST_HASH).await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn no_dataset() {
+        // If there are no update datasets, all gets should fail with
+        // `Error::NotFound`, and all other operations should fail with
+        // `Error::NoUpdateDataset`.
+
+        let log = test_setup_log("no_dataset");
+        let backend = TestBackend::new(0);
+        let store = ArtifactStore::new(&log.log, backend);
+
+        assert!(matches!(
+            store
+                .put(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) }))
+                .await,
+            Err(Error::NoUpdateDataset)
+        ));
+        assert!(matches!(
+            store.get(TEST_HASH).await,
+            Err(Error::NotFound { .. })
+        ));
+        assert!(matches!(
+            store.delete(TEST_HASH).await,
+            Err(Error::NoUpdateDataset)
+        ));
+    }
+}
diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs
index d008f4a45a..fd6f6bc48f 100644
--- a/sled-agent/src/http_entrypoints.rs
+++ b/sled-agent/src/http_entrypoints.rs
@@ -408,6 +408,40 @@ impl SledAgentApi for SledAgentImpl {
         Ok(HttpResponseUpdatedNoContent())
     }
 
+    async fn artifact_copy_from_depot(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+        body: TypedBody<ArtifactCopyFromDepotBody>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sha256 = path_params.into_inner().sha256;
+        let address = body.into_inner().address;
+        rqctx
+            .context()
+            .artifact_store()
+            .copy_from_depot(sha256, address)
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    }
+
+    async fn artifact_put(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+        body: StreamingBody,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sha256 = path_params.into_inner().sha256;
+        rqctx.context().artifact_store().put_body(sha256, body).await?;
+        Ok(HttpResponseUpdatedNoContent())
+    }
+
+    async fn artifact_delete(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+    ) -> Result<HttpResponseDeleted, HttpError> {
+        let sha256 = path_params.into_inner().sha256;
+        rqctx.context().artifact_store().delete(sha256).await?;
+        Ok(HttpResponseDeleted())
+    }
+
     async fn vmm_issue_disk_snapshot_request(
         rqctx: RequestContext<Self::Context>,
         path_params: Path<VmmIssueDiskSnapshotRequestPathParam>,
diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs
index a2421528e2..8f15f4bb89 100644
--- a/sled-agent/src/lib.rs
+++ b/sled-agent/src/lib.rs
@@ -15,6 +15,7 @@ pub mod sim;
 pub mod common;
 
 // Modules for the non-simulated sled agent.
+mod artifact_store;
 mod backing_fs;
 mod boot_disk_os_writer;
 pub mod bootstrap;
diff --git a/sled-agent/src/sim/artifact_store.rs b/sled-agent/src/sim/artifact_store.rs
new file mode 100644
index 0000000000..3d7ec2dee4
--- /dev/null
+++ b/sled-agent/src/sim/artifact_store.rs
@@ -0,0 +1,45 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Implementation of `crate::artifact_store::StorageBackend` for our simulated
+//! storage.
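+//!
+//! A sketch of how this backend gets wired up (mirroring, in simplified form,
+//! the call in the simulated `SledAgent::new` later in this patch):
+//!
+//! ```ignore
+//! let storage = Arc::new(Mutex::new(Storage::new(id, ip, log.clone())));
+//! let artifacts =
+//!     ArtifactStore::new(&log, SimArtifactStorage::new(storage.clone()));
+//! ```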
+
+use std::sync::Arc;
+
+use camino::Utf8Path;
+use camino_tempfile::Utf8TempDir;
+use futures::lock::Mutex;
+use omicron_common::disk::DatasetsConfig;
+
+use super::storage::Storage;
+use crate::artifact_store::{Error, StorageBackend};
+
+pub(super) struct SimArtifactStorage {
+    root: Utf8TempDir,
+    backend: Arc<Mutex<Storage>>,
+}
+
+impl SimArtifactStorage {
+    pub(super) fn new(backend: Arc<Mutex<Storage>>) -> SimArtifactStorage {
+        SimArtifactStorage {
+            root: camino_tempfile::tempdir().unwrap(),
+            backend,
+        }
+    }
+}
+
+impl StorageBackend for SimArtifactStorage {
+    async fn datasets_config_list(&self) -> Result<DatasetsConfig, Error> {
+        self.backend
+            .lock()
+            .await
+            .datasets_config_list()
+            .await
+            .map_err(|_| Error::NoUpdateDataset)
+    }
+
+    fn mountpoint_root(&self) -> &Utf8Path {
+        self.root.path()
+    }
+}
diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs
index 53d209725d..1d23f3e6ea 100644
--- a/sled-agent/src/sim/http_entrypoints.rs
+++ b/sled-agent/src/sim/http_entrypoints.rs
@@ -181,6 +181,40 @@ impl SledAgentApi for SledAgentSimImpl {
         Ok(HttpResponseUpdatedNoContent())
     }
 
+    async fn artifact_copy_from_depot(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+        body: TypedBody<ArtifactCopyFromDepotBody>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sha256 = path_params.into_inner().sha256;
+        let address = body.into_inner().address;
+        rqctx
+            .context()
+            .artifact_store()
+            .copy_from_depot(sha256, address)
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    }
+
+    async fn artifact_put(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+        body: StreamingBody,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sha256 = path_params.into_inner().sha256;
+        rqctx.context().artifact_store().put_body(sha256, body).await?;
+        Ok(HttpResponseUpdatedNoContent())
+    }
+
+    async fn artifact_delete(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<ArtifactPathParam>,
+    ) -> Result<HttpResponseDeleted, HttpError> {
+        let sha256 = path_params.into_inner().sha256;
+        rqctx.context().artifact_store().delete(sha256).await?;
+        Ok(HttpResponseDeleted())
+    }
+
     async fn vmm_issue_disk_snapshot_request(
         rqctx: RequestContext<Self::Context>,
         path_params: Path<VmmIssueDiskSnapshotRequestPathParam>,
diff --git a/sled-agent/src/sim/mod.rs b/sled-agent/src/sim/mod.rs
index 14d980cf79..ab3b155b36 100644
--- a/sled-agent/src/sim/mod.rs
+++ b/sled-agent/src/sim/mod.rs
@@ -4,6 +4,7 @@
 
 //! Simulated sled agent implementation
 
+mod artifact_store;
 mod collection;
 mod config;
 mod disk;
diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs
index 0652c021cb..722ac276d9 100644
--- a/sled-agent/src/sim/sled_agent.rs
+++ b/sled-agent/src/sim/sled_agent.rs
@@ -4,12 +4,14 @@
 
 //! Simulated sled agent implementation
+use super::artifact_store::SimArtifactStorage;
 use super::collection::{PokeMode, SimCollection};
 use super::config::Config;
 use super::disk::SimDisk;
 use super::instance::{self, SimInstance};
 use super::storage::CrucibleData;
 use super::storage::Storage;
+use crate::artifact_store::ArtifactStore;
 use crate::nexus::NexusClient;
 use crate::sim::simulatable::Simulatable;
 use crate::updates::UpdateManager;
@@ -72,7 +74,7 @@ pub struct SledAgent {
     vmms: Arc<SimCollection<SimInstance>>,
     /// collection of simulated disks, indexed by disk uuid
     disks: Arc<SimCollection<SimDisk>>,
-    storage: Mutex<Storage>,
+    storage: Arc<Mutex<Storage>>,
     updates: UpdateManager,
     nexus_address: SocketAddr,
     pub nexus_client: Arc<NexusClient>,
@@ -88,6 +90,7 @@ pub struct SledAgent {
     fake_zones: Mutex<OmicronZonesConfig>,
     instance_ensure_state_error: Mutex<Option<Error>>,
     pub bootstore_network_config: Mutex<EarlyNetworkConfig>,
+    artifacts: ArtifactStore<SimArtifactStorage>,
     pub log: Logger,
 }
 
@@ -165,6 +168,14 @@ impl SledAgent {
             },
         });
 
+        let storage = Arc::new(Mutex::new(Storage::new(
+            id.into_untyped_uuid(),
+            config.storage.ip,
+            storage_log,
+        )));
+        let artifacts =
+            ArtifactStore::new(&log, SimArtifactStorage::new(storage.clone()));
+
         Arc::new(SledAgent {
             id,
             ip: config.dropshot.bind_address.ip(),
@@ -178,11 +189,7 @@ impl SledAgent {
                 disk_log,
                 sim_mode,
             )),
-            storage: Mutex::new(Storage::new(
-                id.into_untyped_uuid(),
-                config.storage.ip,
-                storage_log,
-            )),
+            storage,
             updates: UpdateManager::new(config.updates.clone()),
             nexus_address,
             nexus_client,
@@ -197,6 +204,7 @@ impl SledAgent {
                 zones: vec![],
             }),
             instance_ensure_state_error: Mutex::new(None),
+            artifacts,
             log,
             bootstore_network_config,
         })
@@ -560,6 +568,10 @@ impl SledAgent {
         &self.updates
     }
 
+    pub(super) fn artifact_store(&self) -> &ArtifactStore<SimArtifactStorage> {
+        &self.artifacts
+    }
+
     pub async fn vmm_count(&self) -> usize {
         self.vmms.size().await
     }
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 78a61c894e..668e62cb95 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -4,6 +4,7 @@
 
 //! Sled agent implementation
 
+use crate::artifact_store::ArtifactStore;
 use crate::boot_disk_os_writer::BootDiskOsWriter;
 use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT;
 use crate::bootstrap::early_networking::EarlyNetworkSetupError;
@@ -167,6 +168,9 @@ pub enum Error {
 
     #[error("Expected revision to fit in a u32, but found {0}")]
     UnexpectedRevision(i64),
+
+    #[error("Error starting Repo Depot service: {0}")]
+    RepoDepotStart(#[source] Box<dyn std::error::Error>),
 }
 
 impl From<Error> for omicron_common::api::external::Error {
@@ -350,6 +354,9 @@ struct SledAgentInner {
 
     // Component of Sled Agent responsible for managing instrumentation probes.
     probes: ProbeManager,
+
+    // Component of Sled Agent responsible for managing the artifact store.
+    repo_depot: dropshot::HttpServer<ArtifactStore>,
 }
 
 impl SledAgentInner {
@@ -582,6 +589,10 @@ impl SledAgent {
             log.new(o!("component" => "ProbeManager")),
         );
 
+        let repo_depot = ArtifactStore::new(&log, storage_manager.clone())
+            .start(sled_address, &config.dropshot)
+            .map_err(Error::RepoDepotStart)?;
+
         let sled_agent = SledAgent {
             inner: Arc::new(SledAgentInner {
                 id: request.body.id,
@@ -604,6 +615,7 @@ impl SledAgent {
                 bootstore: long_running_task_handles.bootstore.clone(),
                 _metrics_manager: metrics_manager,
                 boot_disk_os_writer: BootDiskOsWriter::new(&parent_log),
+                repo_depot,
             }),
             log: log.clone(),
             sprockets: config.sprockets.clone(),
@@ -1084,6 +1096,8 @@ impl SledAgent {
     }
 
     /// Downloads and applies an artifact.
+    // TODO: This is being split into "download" (i.e. store an artifact in the
+    // artifact store) and "apply" (perform an update using an artifact).
     pub async fn update_artifact(
         &self,
         artifact: UpdateArtifactId,
@@ -1095,6 +1109,10 @@ impl SledAgent {
         Ok(())
     }
 
+    pub fn artifact_store(&self) -> &ArtifactStore {
+        &self.inner.repo_depot.app_private()
+    }
+
     /// Issue a snapshot request for a Crucible disk attached to an instance
     pub async fn vmm_issue_disk_snapshot_request(
         &self,

From ce1bc4256bd4311ff460cb8cf8e43b3d3e67d9a0 Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Fri, 4 Oct 2024 01:21:42 +0000
Subject: [PATCH 02/28] fn datasets -> fn dataset_mountpoints

---
 sled-agent/src/artifact_store.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index b1f452a3d9..dcf60d0ab9 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -85,7 +85,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
         sha256: ArtifactHash,
     ) -> Result<File, Error> {
         let sha256 = sha256.to_string();
-        for dataset in self.datasets().await? {
+        for dataset in self.dataset_mountpoints().await? {
             let path = dataset.join(&sha256);
             match File::open(&path).await {
                 Ok(file) => {
@@ -112,7 +112,11 @@ impl<T: StorageBackend> ArtifactStore<T> {
         sha256: ArtifactHash,
         stream: impl Stream<Item = Result<impl AsRef<[u8]>, Error>>,
     ) -> Result<(), Error> {
-        let ds = self.datasets().await?.next().ok_or(Error::NoUpdateDataset)?;
+        let ds = self
+            .dataset_mountpoints()
+            .await?
+            .next()
+            .ok_or(Error::NoUpdateDataset)?;
         let final_path = ds.join(sha256.to_string());
 
         let (file, temp_path) = tokio::task::spawn_blocking(move || {
@@ -226,7 +230,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
     ) -> Result<(), Error> {
         let sha256 = sha256.to_string();
         let mut any_datasets = false;
-        for dataset in self.datasets().await? {
+        for dataset in self.dataset_mountpoints().await? {
             any_datasets = true;
             let path = dataset.join(&sha256);
             match tokio::fs::remove_file(&path).await {
@@ -253,7 +257,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
         }
     }
 
-    async fn datasets(
+    async fn dataset_mountpoints(
         &self,
     ) -> Result<impl Iterator<Item = Utf8PathBuf> + '_, Error> {
         let config = self.storage.datasets_config_list().await?;

From 5ad16a1db20b13444691587105b6cc14e059fc7a Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Fri, 4 Oct 2024 01:31:45 +0000
Subject: [PATCH 03/28] be more resilient in the face of io errors

---
 sled-agent/src/artifact_store.rs | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index dcf60d0ab9..4f7804f716 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -32,7 +32,7 @@ use omicron_common::update::ArtifactHash;
 use repo_depot_api::*;
 use sha2::{Digest, Sha256};
 use sled_storage::manager::StorageHandle;
-use slog::{info, Logger};
+use slog::{error, info, Logger};
 use tokio::fs::File;
 use tokio::io::AsyncWriteExt;
 
@@ -85,6 +85,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
         sha256: ArtifactHash,
     ) -> Result<File, Error> {
         let sha256 = sha256.to_string();
+        let mut last_error = None;
         for dataset in self.dataset_mountpoints().await? {
             let path = dataset.join(&sha256);
             match File::open(&path).await {
                 Ok(file) => {
@@ -99,11 +100,21 @@ impl<T: StorageBackend> ArtifactStore<T> {
                 }
                 Err(err) if err.kind() == ErrorKind::NotFound => {}
                 Err(err) => {
-                    return Err(Error::File { verb: "open", path, err });
+                    error!(
+                        &self.log,
+                        "Failed to open file";
+                        "error" => &err,
+                        "path" => path.as_str(),
+                    );
+                    last_error = Some(Error::File { verb: "open", path, err });
                 }
             }
         }
-        Err(Error::NotFound { sha256 })
+        if let Some(last_error) = last_error {
+            Err(last_error)
+        } else {
+            Err(Error::NotFound { sha256 })
+        }
     }
 
     /// PUT operation (served by Sled Agent API)
@@ -230,6 +241,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
     ) -> Result<(), Error> {
         let sha256 = sha256.to_string();
         let mut any_datasets = false;
+        let mut last_error = None;
         for dataset in self.dataset_mountpoints().await? {
             any_datasets = true;
             let path = dataset.join(&sha256);
@@ -244,11 +256,20 @@ impl<T: StorageBackend> ArtifactStore<T> {
                 }
                 Err(err) if err.kind() == ErrorKind::NotFound => {}
                 Err(err) => {
-                    return Err(Error::File { verb: "remove", path, err });
+                    error!(
+                        &self.log,
+                        "Failed to remove file";
+                        "error" => &err,
+                        "path" => path.as_str(),
+                    );
+                    last_error =
+                        Some(Error::File { verb: "remove", path, err });
                 }
             }
         }
-        if any_datasets {
+        if let Some(last_error) = last_error {
+            Err(last_error)
+        } else if any_datasets {
             Ok(())
         } else {
             // If we're here because there aren't any update datasets, we should

From 485ee4057097fa109c593de463bdf6f987d921c3 Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Fri, 4 Oct 2024 18:52:55 +0000
Subject: [PATCH 04/28] clean up temporary files on startup

---
 sled-agent/src/artifact_store.rs | 147 ++++++++++++++++++++++++-------
 sled-agent/src/lib.rs            |   2 +-
 sled-agent/src/sled_agent.rs     |   6 +-
 3 files changed, 118 insertions(+), 37 deletions(-)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index 4f7804f716..b5a908f628 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -36,6 +36,8 @@ use slog::{error, info, Logger};
 use tokio::fs::File;
 use tokio::io::AsyncWriteExt;
 
+const TEMP_SUBDIR: &str = "tmp";
+
 #[derive(Clone)]
 pub(crate) struct ArtifactStore<T = StorageHandle> {
     log: Logger,
@@ -52,14 +54,54 @@ impl ArtifactStore {
-    pub(crate) fn start(
+    pub(crate) async fn start(
         self,
         sled_address: SocketAddrV6,
         dropshot_config: &ConfigDropshot,
-    ) -> Result<
-        dropshot::HttpServer<ArtifactStore<StorageHandle>>,
-        Box<dyn std::error::Error>,
-    > {
+    ) -> Result<dropshot::HttpServer<ArtifactStore>, StartError>
+    {
+        // This function is only called when the real Sled Agent starts up (it
+        // is only defined over `ArtifactStore<StorageHandle>`). In the real
+        // Sled Agent, these datasets are durable and may retain temporary
+        // files leaked during a crash. Upon startup, we attempt to remove the
+        // subdirectory we store temporary files in, logging an error if that
+        // fails.
+
+        // This `datasets_config_list` call is clear enough to the
+        // compiler, but perhaps not to the person reading this code,
+        // because there are two relevant methods in scope: the concrete
+        // `StorageHandle::datasets_config_list` (because T = StorageHandle) and
+        // the generic `<StorageHandle as StorageBackend>::datasets_config_list`.
+        // The reason the concrete function is selected is because these
+        // functions return two different error types, and the `.map_err`
+        // implies that we want a `sled_storage::error::Error`.
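+        // (Editorial note: concretely, this resolves to the inherent
+        // `StorageHandle::datasets_config_list`, not the fully-qualified
+        // `<StorageHandle as StorageBackend>::datasets_config_list`.)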
+        let config = self
+            .storage
+            .datasets_config_list()
+            .await
+            .map_err(StartError::DatasetConfig)?;
+        for mountpoint in
+            update_dataset_mountpoints(config, ZPOOL_MOUNTPOINT_ROOT.into())
+        {
+            let path = mountpoint.join(TEMP_SUBDIR);
+            if let Err(err) = tokio::fs::remove_dir_all(&path).await {
+                if err.kind() != ErrorKind::NotFound {
+                    // We log an error here because we expect that if we are
+                    // having disk I/O errors, something else (fmd?) will
+                    // identify those issues and bubble them up to the operator.
+                    // (As of writing this comment that is not true but we
+                    // expect this to exist in the limit, and refusing to start
+                    // Sled Agent because of a problem with a single FRU seems
+                    // inappropriate.)
+                    error!(
+                        &self.log,
+                        "Failed to remove stale temporary artifacts";
+                        "error" => &err,
+                        "path" => path.as_str(),
+                    );
+                }
+            }
+        }
+
         let mut depot_address = sled_address;
         depot_address.set_port(REPO_DEPOT_PORT);
 
@@ -73,11 +115,21 @@ impl ArtifactStore {
             .expect("registered entrypoints"),
             self,
             &log,
-        )?
+        )
+        .map_err(StartError::Dropshot)?
         .start())
     }
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum StartError {
+    #[error("Error retrieving dataset configuration: {0}")]
+    DatasetConfig(#[source] sled_storage::error::Error),
+
+    #[error("Dropshot error while starting Repo Depot service: {0}")]
+    Dropshot(#[source] Box<dyn std::error::Error + Send + Sync>),
+}
+
 impl<T: StorageBackend> ArtifactStore<T> {
     /// GET operation (served by Repo Depot API)
     pub(crate) async fn get(
@@ -138,8 +180,8 @@ impl<T: StorageBackend> ArtifactStore<T> {
                         &self.log,
                         "Retrieved artifact";
                         "sha256" => &sha256,
-                        "path" => path.as_str()
+                        "path" => path.as_str(),
                     );
                     return Ok(file);
                 }
@@ -163,17 +205,27 @@ impl<T: StorageBackend> ArtifactStore<T> {
         sha256: ArtifactHash,
         stream: impl Stream<Item = Result<impl AsRef<[u8]>, Error>>,
     ) -> Result<(), Error> {
-        let ds = self
+        let mountpoint = self
             .dataset_mountpoints()
             .await?
             .next()
             .ok_or(Error::NoUpdateDataset)?;
-        let final_path = ds.join(sha256.to_string());
-
+        let final_path = mountpoint.join(sha256.to_string());
+
+        let temp_dir = mountpoint.join(TEMP_SUBDIR);
+        if let Err(err) = tokio::fs::create_dir(&temp_dir).await {
+            if err.kind() != ErrorKind::AlreadyExists {
+                return Err(Error::File {
+                    verb: "create directory",
+                    path: temp_dir,
+                    err,
+                });
+            }
+        }
         let (file, temp_path) = tokio::task::spawn_blocking(move || {
-            NamedUtf8TempFile::new_in(&ds).map_err(|err| Error::File {
+            NamedUtf8TempFile::new_in(&temp_dir).map_err(|err| Error::File {
                 verb: "create temporary file in",
-                path: ds,
+                path: temp_dir,
                 err,
             })
        })
@@ -226,7 +268,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
             &self.log,
             "Wrote artifact";
             "sha256" => &sha256.to_string(),
-            "path" => final_path.as_str()
+            "path" => final_path.as_str(),
         );
         Ok(())
     }
@@ -274,7 +316,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
                         &self.log,
                         "Removed artifact";
                         "sha256" => &sha256,
-                        "path" => path.as_str()
+                        "path" => path.as_str(),
                     );
                 }
                 Err(err) if err.kind() == ErrorKind::NotFound => {}
@@ -282,15 +324,21 @@ impl<T: StorageBackend> ArtifactStore<T> {
     async fn dataset_mountpoints(
         &self,
     ) -> Result<impl Iterator<Item = Utf8PathBuf> + '_, Error> {
         let config = self.storage.datasets_config_list().await?;
-        let mountpoint_root = self.storage.mountpoint_root();
-        Ok(config
-            .datasets
-            .into_values()
-            .filter(|dataset| *dataset.name.dataset() == DatasetKind::Update)
-            .map(|dataset| dataset.name.mountpoint(mountpoint_root)))
+        Ok(update_dataset_mountpoints(config, self.storage.mountpoint_root()))
     }
 }
 
+fn update_dataset_mountpoints(
+    config: DatasetsConfig,
+    root: &Utf8Path,
+) -> impl Iterator<Item = Utf8PathBuf> + '_ {
+    config
+        .datasets
+        .into_values()
+        .filter(|dataset| *dataset.name.dataset() == DatasetKind::Update)
+        .map(|dataset| dataset.name.mountpoint(root))
+}
+
 enum RepoDepotImpl {}
@@ -522,19 +570,32 @@ mod test {
         assert!(matches!(store.delete(TEST_HASH).await, Ok(())));
 
-        // put succeeds
-        store
-            .put(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) }))
-            .await
-            .unwrap();
-
-        // get now succeeds, file reads back OK
-        let mut file = store.get(TEST_HASH).await.unwrap();
-        let mut vec = Vec::new();
-        file.read_to_end(&mut vec).await.unwrap();
-        assert_eq!(vec, TEST_ARTIFACT);
-        // delete succeeds
-        store.delete(TEST_HASH).await.unwrap();
+        // test several things here:
+        // 1. put succeeds
+        // 2. put is idempotent (we don't care if it clobbers a file as long as
+        //    the hash is okay)
+        // 3. we don't fail trying to create TEMP_SUBDIR twice
+        for _ in 0..2 {
+            store
+                .put(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) }))
+                .await
+                .unwrap();
+            // get succeeds, file reads back OK
+            let mut file = store.get(TEST_HASH).await.unwrap();
+            let mut vec = Vec::new();
+            file.read_to_end(&mut vec).await.unwrap();
+            assert_eq!(vec, TEST_ARTIFACT);
+        }
+
+        // delete succeeds and is idempotent
+        for _ in 0..2 {
+            store.delete(TEST_HASH).await.unwrap();
+            // get now fails because it no longer exists
+            assert!(matches!(
+                store.get(TEST_HASH).await,
+                Err(Error::NotFound { .. })
+            ));
+        }
     }
 
     #[tokio::test]
diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs
index 8f15f4bb89..b2d78c4a5e 100644
--- a/sled-agent/src/lib.rs
+++ b/sled-agent/src/lib.rs
@@ -15,7 +15,7 @@ pub mod sim;
 pub mod common;
 
 // Modules for the non-simulated sled agent.
-mod artifact_store;
+pub mod artifact_store;
 mod backing_fs;
 mod boot_disk_os_writer;
 pub mod bootstrap;
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 668e62cb95..70c7dd376c 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -169,8 +169,8 @@ pub enum Error {
     #[error("Expected revision to fit in a u32, but found {0}")]
     UnexpectedRevision(i64),
 
-    #[error("Error starting Repo Depot service: {0}")]
-    RepoDepotStart(#[source] Box<dyn std::error::Error>),
+    #[error(transparent)]
+    RepoDepotStart(#[from] crate::artifact_store::StartError),
 }
 
 impl From<Error> for omicron_common::api::external::Error {
@@ -591,7 +591,7 @@ impl SledAgent {
         let repo_depot = ArtifactStore::new(&log, storage_manager.clone())
             .start(sled_address, &config.dropshot)
-            .map_err(Error::RepoDepotStart)?;
+            .await?;

From 26f410731162d468bef5465a39ebefed6c566c6f Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Fri, 4 Oct 2024 18:57:15 +0000
Subject: [PATCH 05/28] naming consistency

---
 sled-agent/src/artifact_store.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index b5a908f628..de3fb6392c 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -138,8 +138,8 @@ impl<T: StorageBackend> ArtifactStore<T> {
     ) -> Result<File, Error> {
         let sha256 = sha256.to_string();
         let mut last_error = None;
-        for dataset in self.dataset_mountpoints().await? {
-            let path = dataset.join(&sha256);
+        for mountpoint in self.dataset_mountpoints().await? {
+            let path = mountpoint.join(&sha256);
             match File::open(&path).await {
                 Ok(file) => {
                     info!(
@@ -304,9 +304,9 @@ impl<T: StorageBackend> ArtifactStore<T> {
         let sha256 = sha256.to_string();
         let mut any_datasets = false;
         let mut last_error = None;
-        for dataset in self.dataset_mountpoints().await? {
+        for mountpoint in self.dataset_mountpoints().await? {
             any_datasets = true;
-            let path = dataset.join(&sha256);
+            let path = mountpoint.join(&sha256);
             match tokio::fs::remove_file(&path).await {
                 Ok(()) => {
                     info!(

From 893980e7d198db0e45436d768ef3e9d3f6ec66cc Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Fri, 4 Oct 2024 23:08:02 +0000
Subject: [PATCH 06/28] log.cleanup_successful();

---
 sled-agent/src/artifact_store.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index de3fb6392c..aeca03c06c 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -557,6 +557,8 @@ mod test {
             Err(Error::NotFound { .. })
         ));
         }
+
+        log.cleanup_successful();
     }
 
     #[tokio::test]
@@ -583,5 +585,7 @@ mod test {
             store.delete(TEST_HASH).await,
             Err(Error::NoUpdateDataset)
         ));
+
+        log.cleanup_successful();
     }
 }

From efcfb9258b9d7326fb9fcf62b1b6a9bd9a2bc6a6 Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Fri, 11 Oct 2024 16:59:15 +0000
Subject: [PATCH 07/28] document ArtifactStore

Co-authored-by: David Pacheco <dap@oxidecomputer.com>
---
 sled-agent/src/artifact_store.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index aeca03c06c..541888cab6 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -38,6 +38,24 @@ use tokio::io::AsyncWriteExt;
 
 const TEMP_SUBDIR: &str = "tmp";
 
+/// Content-addressable local storage for software artifacts.
+///
+/// Storage for artifacts is backed by datasets that are explicitly designated
+/// for this purpose. The `T: StorageBackend` parameter, which varies between
+/// the real sled agent, the simulated sled agent, and unit tests, specifies
+/// exactly which datasets are available for artifact storage. That's the only
+/// thing `T` is used for. The behavior of storing artifacts as files under
+/// one or more paths is identical for all callers (i.e., both the real and
+/// simulated sled agents).
+///
+/// A given artifact is generally stored on exactly one of the datasets
+/// designated for artifact storage. There's no way to know from its key which
+/// dataset it's stored on. This means:
+///
+/// - for PUT, we pick a dataset and store it there
+/// - for GET, we look in every dataset for it until we find it
+/// - for DELETE, we attempt to delete it from every dataset (even after we've
+///   found it in one of them)
 #[derive(Clone)]
 pub(crate) struct ArtifactStore<T = StorageHandle> {
     log: Logger,
     storage: T,
 }

From e8b2673867ea5951e0a6226fcaee329268cff42b Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Fri, 11 Oct 2024 17:05:13 +0000
Subject: [PATCH 08/28] fn put -> put_impl

---
 sled-agent/src/artifact_store.rs | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index 541888cab6..845ae5ca8f 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -187,8 +187,9 @@ impl<T: StorageBackend> ArtifactStore<T> {
         }
     }
 
-    /// PUT operation (served by Sled Agent API)
-    pub(crate) async fn put(
+    /// Common implementation for all artifact write operations, generic over a
+    /// stream of bytes.
+    async fn put_impl(
         &self,
         sha256: ArtifactHash,
         stream: impl Stream<Item = Result<impl AsRef<[u8]>, Error>>,
@@ -276,7 +277,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
         sha256: ArtifactHash,
         body: StreamingBody,
     ) -> Result<(), Error> {
-        self.put(sha256, body.into_stream().map_err(Error::Body)).await
+        self.put_impl(sha256, body.into_stream().map_err(Error::Body)).await
     }
 
     /// POST operation (served by Sled Agent API)
@@ -311,7 +312,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
             address,
             err: repo_depot_client::ClientError::ResponseBodyError(err),
         });
-        self.put(sha256, stream).await
+        self.put_impl(sha256, stream).await
     }
 
     /// DELETE operation (served by Sled Agent API)
@@ -556,7 +557,7 @@ mod test {
         // 3. we don't fail trying to create TEMP_SUBDIR twice
         for _ in 0..2 {
             store
-                .put(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) }))
+                .put_impl(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) }))
                 .await
                 .unwrap();
             // get succeeds, file reads back OK
@@ -591,7 +592,7 @@ mod test {
 
         assert!(matches!(
             store
-                .put(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) }))
+                .put_impl(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) }))
                 .await,
             Err(Error::NoUpdateDataset)
         ));

From 3b15f3bcf379d4a5da9ce39586ec80d4ca64ff20 Mon Sep 17 00:00:00 2001
From: iliana etaoin <iliana@oxide.computer>
Date: Tue, 15 Oct 2024 17:11:42 +0000
Subject: [PATCH 09/28] copy_from_depot should take a URL

---
 openapi/sled-agent.json                |  4 ++--
 sled-agent/api/src/lib.rs              |  4 ++--
 sled-agent/src/artifact_store.rs       | 25 ++++++++++++++++---------
 sled-agent/src/http_entrypoints.rs     |  4 ++--
 sled-agent/src/sim/http_entrypoints.rs |  4 ++--
 5 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json
index d0155d9a69..66462ad9ac 100644
--- a/openapi/sled-agent.json
+++ b/openapi/sled-agent.json
@@ -1520,12 +1520,12 @@
       "ArtifactCopyFromDepotBody": {
         "type": "object",
         "properties": {
-          "address": {
+          "depot_base_url": {
             "type": "string"
           }
         },
         "required": [
-          "address"
+          "depot_base_url"
         ]
       },
       "Baseboard": {
diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs
index 509e0ba844..6213435154 100644
--- a/sled-agent/api/src/lib.rs
+++ b/sled-agent/api/src/lib.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use std::{collections::BTreeMap, net::SocketAddrV6, time::Duration};
+use std::{collections::BTreeMap, time::Duration};
 
 use camino::Utf8PathBuf;
 use dropshot::{
@@ -584,7 +584,7 @@ pub struct ArtifactPathParam {
 
 #[derive(Deserialize, JsonSchema)]
 pub struct ArtifactCopyFromDepotBody {
-    pub address: SocketAddrV6,
+    pub depot_base_url: String,
 }
 
 #[derive(Deserialize, JsonSchema)]
diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index 845ae5ca8f..8389dbe7d8 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -284,7 +284,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
     pub(crate) async fn copy_from_depot(
         &self,
         sha256: ArtifactHash,
-        address: SocketAddrV6,
+        depot_base_url: &str,
     ) -> Result<(), Error> {
         static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
             reqwest::ClientBuilder::new()
@@ -295,21 +295,26 @@ impl<T: StorageBackend> ArtifactStore<T> {
         });
 
         let client = repo_depot_client::Client::new_with_client(
-            &format!("http://{address}"),
+            depot_base_url,
             CLIENT.clone(),
-            self.log.new(
-                slog::o!("component" => "Repo Depot client (ArtifactStore)"),
-            ),
+            self.log.new(slog::o!(
+                "component" => "Repo Depot client (ArtifactStore)",
+                "base_url" => depot_base_url.to_owned(),
+            )),
         );
         let stream = client
             .artifact_get_by_sha256(&sha256.to_string())
             .await
-            .map_err(|err| Error::DepotCopy { sha256, address, err })?
+            .map_err(|err| Error::DepotCopy {
+                sha256,
+                base_url: depot_base_url.to_owned(),
+                err,
+            })?
.into_inner() .into_inner() .map_err(|err| Error::DepotCopy { sha256, - address, + base_url: depot_base_url.to_owned(), err: repo_depot_client::ClientError::ResponseBodyError(err), }); self.put_impl(sha256, stream).await @@ -405,10 +410,12 @@ pub(crate) enum Error { #[error("Error retrieving dataset configuration: {0}")] DatasetConfig(#[source] sled_storage::error::Error), - #[error("Error fetching artifact {sha256} from depot {address}: {err}")] + #[error( + "Error fetching artifact {sha256} from depot at {base_url}: {err}" + )] DepotCopy { sha256: ArtifactHash, - address: SocketAddrV6, + base_url: String, #[source] err: repo_depot_client::ClientError, }, diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 99fbcfd32e..e5a9255e4d 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -407,11 +407,11 @@ impl SledAgentApi for SledAgentImpl { body: TypedBody, ) -> Result { let sha256 = path_params.into_inner().sha256; - let address = body.into_inner().address; + let depot_base_url = body.into_inner().depot_base_url; rqctx .context() .artifact_store() - .copy_from_depot(sha256, address) + .copy_from_depot(sha256, &depot_base_url) .await?; Ok(HttpResponseUpdatedNoContent()) } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 02c99ad68a..e13437f7dc 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -188,11 +188,11 @@ impl SledAgentApi for SledAgentSimImpl { body: TypedBody, ) -> Result { let sha256 = path_params.into_inner().sha256; - let address = body.into_inner().address; + let depot_base_url = body.into_inner().depot_base_url; rqctx .context() .artifact_store() - .copy_from_depot(sha256, address) + .copy_from_depot(sha256, &depot_base_url) .await?; Ok(HttpResponseUpdatedNoContent()) } From c909649574c4b13f7fb1681f60ee485a5bdf2c79 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 15 Oct 2024 17:13:46 +0000 Subject: [PATCH 10/28] reduce semantic satiation --- sled-agent/src/artifact_store.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index 8389dbe7d8..c503d8f420 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Manages update artifacts stored on this sled. The implementation is a very +//! Manages TUF artifacts stored on this sled. The implementation is a very //! basic content-addressed object store. //! //! 
GET operations are handled by the "Repo Depot" API, which is deliberately
@@ -98,7 +98,7 @@ impl ArtifactStore {
             .await
             .map_err(StartError::DatasetConfig)?;
         for mountpoint in
-            update_dataset_mountpoints(config, ZPOOL_MOUNTPOINT_ROOT.into())
+            filter_dataset_mountpoints(config, ZPOOL_MOUNTPOINT_ROOT.into())
         {
             let path = mountpoint.join(TEMP_SUBDIR);
             if let Err(err) = tokio::fs::remove_dir_all(&path).await {
@@ -368,11 +368,11 @@ impl<T: StorageBackend> ArtifactStore<T> {
         &self,
     ) -> Result<impl Iterator<Item = Utf8PathBuf> + '_, Error> {
         let config = self.storage.datasets_config_list().await?;
-        Ok(update_dataset_mountpoints(config, self.storage.mountpoint_root()))
+        Ok(filter_dataset_mountpoints(config, self.storage.mountpoint_root()))
     }
 }
 
-fn update_dataset_mountpoints(
+fn filter_dataset_mountpoints(
     config: DatasetsConfig,
     root: &Utf8Path,
 ) -> impl Iterator<Item = Utf8PathBuf> + '_ {

From 43250773913b9dfc6b535d0e2eca3870dc131775 Mon Sep 17 00:00:00 2001
From: iliana etaoin
Date: Tue, 15 Oct 2024 17:15:00 +0000
Subject: [PATCH 11/28] remove default type parameter

---
 sled-agent/src/artifact_store.rs | 9 +++++----
 sled-agent/src/sled_agent.rs     | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index c503d8f420..0a74b1e36e 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -57,7 +57,7 @@ const TEMP_SUBDIR: &str = "tmp";
 /// - for DELETE, we attempt to delete it from every dataset (even after we've
 ///   found it in one of them)
 #[derive(Clone)]
-pub(crate) struct ArtifactStore<T: StorageBackend = StorageHandle> {
+pub(crate) struct ArtifactStore<T: StorageBackend> {
     log: Logger,
     storage: T,
 }
@@ -71,12 +71,13 @@ impl<T: StorageBackend> ArtifactStore<T> {
     }
 }
 
-impl ArtifactStore {
+impl ArtifactStore<StorageHandle> {
     pub(crate) async fn start(
         self,
         sled_address: SocketAddrV6,
         dropshot_config: &ConfigDropshot,
-    ) -> Result<dropshot::HttpServer<ArtifactStore>, StartError> {
+    ) -> Result<dropshot::HttpServer<ArtifactStore<StorageHandle>>, StartError>
+    {
         // This function is only called when the real Sled Agent starts up (it
         // is only defined over `ArtifactStore<StorageHandle>`). In the real
         // Sled Agent, these datasets are durable and may retain temporary
         // files leaked during a crash. Upon startup, we attempt to remove the
         // subdirectory we store temporary files in, logging an error if that
         // fails.
@@ -386,7 +387,7 @@ fn filter_dataset_mountpoints(
 enum RepoDepotImpl {}
 
 impl RepoDepotApi for RepoDepotImpl {
-    type Context = ArtifactStore;
+    type Context = ArtifactStore<StorageHandle>;
 
     async fn artifact_get_by_sha256(
         rqctx: RequestContext<Self::Context>,
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 5c0b453748..9bbd571164 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -356,7 +356,7 @@ struct SledAgentInner {
     probes: ProbeManager,
 
     // Component of Sled Agent responsible for managing the artifact store.
-    repo_depot: dropshot::HttpServer<ArtifactStore>,
+    repo_depot: dropshot::HttpServer<ArtifactStore<StorageHandle>>,
 }
 
 impl SledAgentInner {
@@ -1104,7 +1104,7 @@ impl SledAgent {
         Ok(())
     }
 
-    pub fn artifact_store(&self) -> &ArtifactStore {
+    pub fn artifact_store(&self) -> &ArtifactStore<StorageHandle> {
         &self.inner.repo_depot.app_private()
     }

From 86b8047113f931ad24fc3a4475a98122aa3896e5 Mon Sep 17 00:00:00 2001
From: iliana etaoin
Date: Tue, 15 Oct 2024 17:51:11 +0000
Subject: [PATCH 12/28] StorageBackend -> DatasetsManager; attempt clean up

---
 sled-agent/src/artifact_store.rs     | 109 +++++++++++++--------------
 sled-agent/src/sim/artifact_store.rs |  25 +++---
 2 files changed, 65 insertions(+), 69 deletions(-)

diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index 0a74b1e36e..afd5e07304 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -31,6 +31,7 @@ use omicron_common::disk::{DatasetKind, DatasetsConfig};
 use omicron_common::update::ArtifactHash;
 use repo_depot_api::*;
 use sha2::{Digest, Sha256};
+use sled_storage::error::Error as StorageError;
 use sled_storage::manager::StorageHandle;
 use slog::{error, info, Logger};
 use tokio::fs::File;
@@ -41,7 +42,7 @@ const TEMP_SUBDIR: &str = "tmp";
 /// Content-addressable local storage for software artifacts.
 ///
 /// Storage for artifacts is backed by datasets that are explicitly designated
-/// for this purpose. The `T: StorageBackend` parameter, which varies between
+/// for this purpose. The `T: DatasetsManager` parameter, which varies between
 /// the real sled agent, the simulated sled agent, and unit tests, specifies
 /// exactly which datasets are available for artifact storage. That's the only
 /// thing `T` is used for. The behavior of storing artifacts as files under
@@ -57,12 +58,12 @@ const TEMP_SUBDIR: &str = "tmp";
 /// - for DELETE, we attempt to delete it from every dataset (even after we've
 ///   found it in one of them)
 #[derive(Clone)]
-pub(crate) struct ArtifactStore<T: StorageBackend> {
+pub(crate) struct ArtifactStore<T: DatasetsManager> {
     log: Logger,
     storage: T,
 }
 
-impl<T: StorageBackend> ArtifactStore<T> {
+impl<T: DatasetsManager> ArtifactStore<T> {
     pub(crate) fn new(log: &Logger, storage: T) -> ArtifactStore<T> {
         ArtifactStore {
             log: log.new(slog::o!("component" => "ArtifactStore")),
@@ -78,28 +79,20 @@ impl ArtifactStore<StorageHandle> {
         dropshot_config: &ConfigDropshot,
     ) -> Result<dropshot::HttpServer<ArtifactStore<StorageHandle>>, StartError>
     {
-        // This function is only called when the real Sled Agent starts up (it
-        // is only defined over `ArtifactStore<StorageHandle>`). In the real
-        // Sled Agent, these datasets are durable and may retain temporary
-        // files leaked during a crash. Upon startup, we attempt to remove the
-        // subdirectory we store temporary files in, logging an error if that
-        // fails.
-
-        // This `datasets_config_list` call is clear enough to the
-        // compiler, but perhaps not to the person reading this code,
-        // because there's two relevant methods in scope: the concrete
-        // `StorageHandle::datasets_config_list` (because T = StorageHandle) and
-        // the generic `<Self as StorageBackend>::datasets_config_list`.
-        // The reason the concrete function is selected is because these
-        // functions return two different error types, and the `.map_err`
-        // implies that we want a `sled_storage::error::Error`.
-        let config = self
+        // In the real sled agent, the update datasets are durable and may
+        // retain temporary files leaked during a crash. Upon startup, we
+        // attempt to remove the subdirectory we store temporary files in,
+        // logging an error if that fails.
+        //
+        // (This function is part of `start` instead of `new` out of
+        // convenience: this function already needs to be async and fallible,
+        // but `new` doesn't; and all the sled agent implementations that don't
+        // call this function also don't need to run cleanup.)
+        for mountpoint in self
             .storage
-            .datasets_config_list()
+            .artifact_storage_paths()
             .await
-            .map_err(StartError::DatasetConfig)?;
-        for mountpoint in
-            filter_dataset_mountpoints(config, ZPOOL_MOUNTPOINT_ROOT.into())
+            .map_err(StartError::DatasetConfig)?
         {
             let path = mountpoint.join(TEMP_SUBDIR);
             if let Err(err) = tokio::fs::remove_dir_all(&path).await {
@@ -149,7 +142,7 @@ pub enum StartError {
     Dropshot(#[source] Box<dyn std::error::Error + Send + Sync>),
 }
 
-impl<T: StorageBackend> ArtifactStore<T> {
+impl<T: DatasetsManager> ArtifactStore<T> {
     /// GET operation (served by Repo Depot API)
     pub(crate) async fn get(
         &self,
@@ -157,7 +150,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
     ) -> Result<File, Error> {
         let sha256 = sha256.to_string();
         let mut last_error = None;
-        for mountpoint in self.dataset_mountpoints().await? {
+        for mountpoint in self.storage.artifact_storage_paths().await? {
             let path = mountpoint.join(&sha256);
             match File::open(&path).await {
                 Ok(file) => {
@@ -196,7 +189,8 @@ impl<T: StorageBackend> ArtifactStore<T> {
         stream: impl Stream<Item = Result<impl AsRef<[u8]>, Error>>,
     ) -> Result<(), Error> {
         let mountpoint = self
-            .dataset_mountpoints()
+            .storage
+            .artifact_storage_paths()
             .await?
             .next()
             .ok_or(Error::NoUpdateDataset)?;
@@ -329,7 +323,7 @@ impl<T: StorageBackend> ArtifactStore<T> {
         let sha256 = sha256.to_string();
         let mut any_datasets = false;
         let mut last_error = None;
-        for mountpoint in self.dataset_mountpoints().await? {
+        for mountpoint in self.storage.artifact_storage_paths().await? {
             any_datasets = true;
             let path = mountpoint.join(&sha256);
             match tokio::fs::remove_file(&path).await {
@@ -364,16 +358,19 @@ impl<T: StorageBackend> ArtifactStore<T> {
             Err(Error::NoUpdateDataset)
         }
     }
+}
 
-    async fn dataset_mountpoints(
+
+/// Abstracts over what kind of sled agent we are; each of the real sled agent,
+/// simulated sled agent, and this module's unit tests have different ways of
+/// keeping track of the datasets on the system.
+pub(crate) trait DatasetsManager {
+    async fn artifact_storage_paths(
         &self,
-    ) -> Result<impl Iterator<Item = Utf8PathBuf> + '_, Error> {
-        let config = self.storage.datasets_config_list().await?;
-        Ok(filter_dataset_mountpoints(config, self.storage.mountpoint_root()))
-    }
+    ) -> Result<impl Iterator<Item = Utf8PathBuf> + '_, StorageError>;
 }
 
-fn filter_dataset_mountpoints(
+/// Iterator `.filter().map()` common to `DatasetsManager` implementations.
+pub(crate) fn filter_dataset_mountpoints( config: DatasetsConfig, root: &Utf8Path, ) -> impl Iterator + '_ { @@ -384,6 +381,15 @@ fn filter_dataset_mountpoints( .map(|dataset| dataset.name.mountpoint(root)) } +impl DatasetsManager for StorageHandle { + async fn artifact_storage_paths( + &self, + ) -> Result + '_, StorageError> { + let config = self.datasets_config_list().await?; + Ok(filter_dataset_mountpoints(config, ZPOOL_MOUNTPOINT_ROOT.into())) + } +} + enum RepoDepotImpl {} impl RepoDepotApi for RepoDepotImpl { @@ -409,7 +415,7 @@ pub(crate) enum Error { Body(dropshot::HttpError), #[error("Error retrieving dataset configuration: {0}")] - DatasetConfig(#[source] sled_storage::error::Error), + DatasetConfig(#[from] sled_storage::error::Error), #[error( "Error fetching artifact {sha256} from depot at {base_url}: {err}" @@ -462,24 +468,8 @@ impl From for HttpError { } } -pub(crate) trait StorageBackend { - async fn datasets_config_list(&self) -> Result; - fn mountpoint_root(&self) -> &Utf8Path; -} - -impl StorageBackend for StorageHandle { - async fn datasets_config_list(&self) -> Result { - self.datasets_config_list().await.map_err(Error::DatasetConfig) - } - - fn mountpoint_root(&self) -> &Utf8Path { - ZPOOL_MOUNTPOINT_ROOT.into() - } -} - #[cfg(test)] mod test { - use camino::Utf8Path; use camino_tempfile::Utf8TempDir; use futures::stream; use hex_literal::hex; @@ -490,9 +480,10 @@ mod test { use omicron_common::zpool_name::ZpoolName; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::{DatasetUuid, ZpoolUuid}; + use sled_storage::error::Error as StorageError; use tokio::io::AsyncReadExt; - use super::{ArtifactStore, Error, StorageBackend}; + use super::{ArtifactStore, DatasetsManager, Error}; struct TestBackend { datasets: DatasetsConfig, @@ -528,13 +519,15 @@ mod test { } } - impl StorageBackend for TestBackend { - async fn datasets_config_list(&self) -> Result { - Ok(self.datasets.clone()) - } - - fn mountpoint_root(&self) -> &Utf8Path { - self.mountpoint_root.path() + impl DatasetsManager for TestBackend { + async fn artifact_storage_paths( + &self, + ) -> Result + '_, StorageError> + { + Ok(super::filter_dataset_mountpoints( + self.datasets.clone(), + self.mountpoint_root.path(), + )) } } diff --git a/sled-agent/src/sim/artifact_store.rs b/sled-agent/src/sim/artifact_store.rs index 3d7ec2dee4..d73f5a2880 100644 --- a/sled-agent/src/sim/artifact_store.rs +++ b/sled-agent/src/sim/artifact_store.rs @@ -7,13 +7,12 @@ use std::sync::Arc; -use camino::Utf8Path; use camino_tempfile::Utf8TempDir; use futures::lock::Mutex; -use omicron_common::disk::DatasetsConfig; +use sled_storage::error::Error as StorageError; use super::storage::Storage; -use crate::artifact_store::{Error, StorageBackend}; +use crate::artifact_store::DatasetsManager; pub(super) struct SimArtifactStorage { root: Utf8TempDir, @@ -29,17 +28,21 @@ impl SimArtifactStorage { } } -impl StorageBackend for SimArtifactStorage { - async fn datasets_config_list(&self) -> Result { - self.backend +impl DatasetsManager for SimArtifactStorage { + async fn artifact_storage_paths( + &self, + ) -> Result + '_, StorageError> + { + let config = self + .backend .lock() .await .datasets_config_list() .await - .map_err(|_| Error::NoUpdateDataset) - } - - fn mountpoint_root(&self) -> &Utf8Path { - self.root.path() + .map_err(|_| StorageError::LedgerNotFound)?; + Ok(crate::artifact_store::filter_dataset_mountpoints( + config, + self.root.path(), + )) } } From 599089a7c311cf211ed79de1d3564b5bf89dc646 Mon Sep 17 00:00:00 2001 
From: iliana etaoin Date: Tue, 15 Oct 2024 17:54:02 +0000 Subject: [PATCH 13/28] create reqwest client at startup, not on first use --- sled-agent/src/artifact_store.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index afd5e07304..783f9372f3 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -15,7 +15,6 @@ use std::io::ErrorKind; use std::net::SocketAddrV6; -use std::sync::LazyLock; use std::time::Duration; use camino::{Utf8Path, Utf8PathBuf}; @@ -60,6 +59,7 @@ const TEMP_SUBDIR: &str = "tmp"; #[derive(Clone)] pub(crate) struct ArtifactStore { log: Logger, + reqwest_client: reqwest::Client, storage: T, } @@ -67,6 +67,11 @@ impl ArtifactStore { pub(crate) fn new(log: &Logger, storage: T) -> ArtifactStore { ArtifactStore { log: log.new(slog::o!("component" => "ArtifactStore")), + reqwest_client: reqwest::ClientBuilder::new() + .connect_timeout(Duration::from_secs(15)) + .read_timeout(Duration::from_secs(15)) + .build() + .unwrap(), storage, } } @@ -281,17 +286,9 @@ impl ArtifactStore { sha256: ArtifactHash, depot_base_url: &str, ) -> Result<(), Error> { - static CLIENT: LazyLock = LazyLock::new(|| { - reqwest::ClientBuilder::new() - .connect_timeout(Duration::from_secs(15)) - .read_timeout(Duration::from_secs(15)) - .build() - .unwrap() - }); - let client = repo_depot_client::Client::new_with_client( depot_base_url, - CLIENT.clone(), + self.reqwest_client.clone(), self.log.new(slog::o!( "component" => "Repo Depot client (ArtifactStore)", "base_url" => depot_base_url.to_owned(), From f624e5d86ced3ed67d9cae72b1cd1221e751fca3 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 15 Oct 2024 17:56:04 +0000 Subject: [PATCH 14/28] don't embed source error strings --- sled-agent/src/artifact_store.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index 783f9372f3..b484a1b414 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -140,10 +140,10 @@ impl ArtifactStore { #[derive(Debug, thiserror::Error)] pub enum StartError { - #[error("Error retrieving dataset configuration: {0}")] + #[error("Error retrieving dataset configuration")] DatasetConfig(#[source] sled_storage::error::Error), - #[error("Dropshot error while starting Repo Depot service: {0}")] + #[error("Dropshot error while starting Repo Depot service")] Dropshot(#[source] Box), } @@ -411,12 +411,10 @@ pub(crate) enum Error { #[error(transparent)] Body(dropshot::HttpError), - #[error("Error retrieving dataset configuration: {0}")] + #[error("Error retrieving dataset configuration")] DatasetConfig(#[from] sled_storage::error::Error), - #[error( - "Error fetching artifact {sha256} from depot at {base_url}: {err}" - )] + #[error("Error fetching artifact {sha256} from depot at {base_url}")] DepotCopy { sha256: ArtifactHash, base_url: String, @@ -424,7 +422,7 @@ pub(crate) enum Error { err: repo_depot_client::ClientError, }, - #[error("Failed to {verb} `{path}`: {err}")] + #[error("Failed to {verb} `{path}`")] File { verb: &'static str, path: Utf8PathBuf, From b44d06bb08c1079a196b85d08192e022117a6ac8 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 15 Oct 2024 18:01:44 +0000 Subject: [PATCH 15/28] fewer contextless errors --- sled-agent/src/artifact_store.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/artifact_store.rs 
b/sled-agent/src/artifact_store.rs index b484a1b414..bb7a54cec3 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -408,7 +408,7 @@ impl RepoDepotApi for RepoDepotImpl { #[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[error(transparent)] + #[error("Error while reading request body")] Body(dropshot::HttpError), #[error("Error retrieving dataset configuration")] @@ -433,7 +433,7 @@ pub(crate) enum Error { #[error("Digest mismatch: expected {expected}, actual {actual}")] HashMismatch { expected: ArtifactHash, actual: ArtifactHash }, - #[error(transparent)] + #[error("Blocking task failed")] Join(#[from] tokio::task::JoinError), #[error("Artifact {sha256} not found")] From 013e67f0f83e445226e72c06d08a4ea18a386de6 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 15 Oct 2024 18:06:21 +0000 Subject: [PATCH 16/28] another docstring --- sled-agent/src/artifact_store.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index bb7a54cec3..4616aa8d86 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -387,6 +387,8 @@ impl DatasetsManager for StorageHandle { } } +/// Implementation of the Repo Depot API backed by an +/// `ArtifactStore`. enum RepoDepotImpl {} impl RepoDepotApi for RepoDepotImpl { From 5eefb6e83ab64cf356844e20e2ce4b2b42c9ebe4 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 15 Oct 2024 18:18:11 +0000 Subject: [PATCH 17/28] add the repo depot API to api-manifest.toml --- dev-tools/ls-apis/api-manifest.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dev-tools/ls-apis/api-manifest.toml b/dev-tools/ls-apis/api-manifest.toml index f91aba74aa..2c471a4ab2 100644 --- a/dev-tools/ls-apis/api-manifest.toml +++ b/dev-tools/ls-apis/api-manifest.toml @@ -278,6 +278,11 @@ client_package_name = "sled-agent-client" label = "Sled Agent" server_package_name = "sled-agent-api" +[[apis]] +client_package_name = "repo-depot-client" +label = "Repo Depot API" +server_package_name = "repo-depot-api" + [[apis]] client_package_name = "wicketd-client" label = "Wicketd" From aebfe3270b5dcb8ee53b957bbdc8e71c5bd7931d Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 16 Oct 2024 22:17:40 +0000 Subject: [PATCH 18/28] add list artifacts operation --- openapi/sled-agent.json | 29 +++++++++ sled-agent/api/src/lib.rs | 11 +++- sled-agent/src/artifact_store.rs | 85 +++++++++++++++++++++++++- sled-agent/src/http_entrypoints.rs | 9 ++- sled-agent/src/sim/http_entrypoints.rs | 8 +++ 5 files changed, 139 insertions(+), 3 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 66462ad9ac..627d254dd8 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -10,6 +10,35 @@ "version": "0.0.1" }, "paths": { + "/artifacts": { + "get": { + "operationId": "artifact_list", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Set_of_hex_schema", + "type": "array", + "items": { + "type": "string", + "format": "hex string (32 bytes)" + }, + "uniqueItems": true + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/artifacts/{sha256}": { "put": { "operationId": "artifact_put", diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 6213435154..9bf7947423 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -2,7 
+2,8 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use std::{collections::BTreeMap, time::Duration};
+use std::collections::{BTreeMap, BTreeSet};
+use std::time::Duration;
 
 use camino::Utf8PathBuf;
 use dropshot::{
@@ -302,6 +303,14 @@ pub trait SledAgentApi {
         artifact: TypedBody<UpdateArtifactId>,
     ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
 
+    #[endpoint {
+        method = GET,
+        path = "/artifacts"
+    }]
+    async fn artifact_list(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<BTreeSet<ArtifactHash>>, HttpError>;
+
     #[endpoint {
         method = POST,
         path = "/artifacts/{sha256}/copy-from-depot"
diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index 4616aa8d86..0fb418f01a 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -13,8 +13,10 @@
 //!
 //! POST, PUT, and DELETE operations are handled by the Sled Agent API.
 
+use std::collections::BTreeSet;
 use std::io::ErrorKind;
 use std::net::SocketAddrV6;
+use std::str::FromStr;
 use std::time::Duration;
@@ -149,6 +151,10 @@ pub enum StartError {
 
 impl<T: DatasetsManager> ArtifactStore<T> {
     /// GET operation (served by Repo Depot API)
+    ///
+    /// We try all datasets, returning early if we find the artifact, logging
+    /// errors as we go. If we don't find it we return the most recent error we
+    /// logged or a NotFound.
     pub(crate) async fn get(
         &self,
@@ -186,6 +192,73 @@ impl<T: DatasetsManager> ArtifactStore<T> {
         }
     }
 
+    /// List operation (served by Sled Agent API)
+    ///
+    /// We try all datasets, logging errors as we go; if we're experiencing I/O
+    /// errors, Nexus should still be aware of the artifacts we think we have.
+    pub(crate) async fn list(&self) -> Result<BTreeSet<ArtifactHash>, Error> {
+        let mut set = BTreeSet::new();
+        let mut any_datasets = false;
+        for mountpoint in self.storage.artifact_storage_paths().await? {
+            any_datasets = true;
+            let mut read_dir = match tokio::fs::read_dir(&mountpoint).await {
+                Ok(read_dir) => read_dir,
+                Err(err) => {
+                    error!(
+                        &self.log,
+                        "Failed to read dir";
+                        "error" => &err,
+                        "path" => mountpoint.as_str(),
+                    );
+                    continue;
+                }
+            };
+            // The semantics of tokio::fs::ReadDir are weird. At least with
+            // `std::fs::ReadDir`, we know when the end of the iterator is,
+            // because `.next()` returns `Option<Result<DirEntry>>`; we could
+            // theoretically log the error and continue trying to retrieve
+            // elements from the iterator (but whether this makes sense to do
+            // is not documented and likely system-dependent).
+            //
+            // The Tokio version returns `Result<Option<DirEntry>>`, which
+            // has no indication of whether there might be more items in
+            // the stream! (The stream adapter in tokio-stream simply calls
+            // `Result::transpose()`, so in theory an error is not the end of
+            // the stream.)
+            //
+            // For lack of any direction we stop reading entries from the stream
+            // on the first error. That way we at least don't get stuck retrying
+            // an operation that will always fail.
+            loop {
+                match read_dir.next_entry().await {
+                    Ok(Some(entry)) => {
+                        if let Ok(file_name) = entry.file_name().into_string() {
+                            if let Ok(hash) = ArtifactHash::from_str(&file_name)
+                            {
+                                set.insert(hash);
+                            }
+                        }
+                    }
+                    Ok(None) => break,
+                    Err(err) => {
+                        error!(
+                            &self.log,
+                            "Failed to read dir";
+                            "error" => &err,
+                            "path" => mountpoint.as_str(),
+                        );
+                        break;
+                    }
+                }
+            }
+        }
+        if any_datasets {
+            Ok(set)
+        } else {
+            Err(Error::NoUpdateDataset)
+        }
+    }
+
     /// Common implementation for all artifact write operations, generic over a
     /// stream of bytes.
async fn put_impl( @@ -313,6 +386,9 @@ impl ArtifactStore { } /// DELETE operation (served by Sled Agent API) + /// + /// We attempt to delete the artifact in all datasets, logging errors as we + /// go. If any errors occurred we return the most recent error we logged. pub(crate) async fn delete( &self, sha256: ArtifactHash, @@ -534,11 +610,13 @@ mod test { )); #[tokio::test] - async fn get_put_delete() { + async fn list_get_put_delete() { let log = test_setup_log("get_put_delete"); let backend = TestBackend::new(1); let store = ArtifactStore::new(&log.log, backend); + // list succeeds with an empty result + assert!(store.list().await.unwrap().is_empty()); // get fails, because it doesn't exist yet assert!(matches!( store.get(TEST_HASH).await, @@ -558,6 +636,8 @@ mod test { .put_impl(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) })) .await .unwrap(); + // list lists the file + assert!(store.list().await.unwrap().into_iter().eq([TEST_HASH])); // get succeeds, file reads back OK let mut file = store.get(TEST_HASH).await.unwrap(); let mut vec = Vec::new(); @@ -568,6 +648,8 @@ mod test { // delete succeeds and is idempotent for _ in 0..2 { store.delete(TEST_HASH).await.unwrap(); + // list succeeds with an empty result + assert!(store.list().await.unwrap().is_empty()); // get now fails because it no longer exists assert!(matches!( store.get(TEST_HASH).await, @@ -598,6 +680,7 @@ mod test { store.get(TEST_HASH).await, Err(Error::NotFound { .. }) )); + assert!(matches!(store.list().await, Err(Error::NoUpdateDataset))); assert!(matches!( store.delete(TEST_HASH).await, Err(Error::NoUpdateDataset) diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index e5a9255e4d..d70eeb1b9a 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -31,6 +31,7 @@ use omicron_common::disk::{ DatasetsConfig, DatasetsManagementResult, DiskVariant, DisksManagementResult, M2Slot, OmicronPhysicalDisksConfig, }; +use omicron_common::update::ArtifactHash; use sled_agent_api::*; use sled_agent_types::boot_disk::{ BootDiskOsWriteStatus, BootDiskPathParams, BootDiskUpdatePathParams, @@ -50,7 +51,7 @@ use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, StorageLimit, ZoneBundleId, ZoneBundleMetadata, }; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; type SledApiDescription = ApiDescription; @@ -401,6 +402,12 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseUpdatedNoContent()) } + async fn artifact_list( + rqctx: RequestContext, + ) -> Result>, HttpError> { + Ok(HttpResponseOk(rqctx.context().artifact_store().list().await?)) + } + async fn artifact_copy_from_depot( rqctx: RequestContext, path_params: Path, diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index e13437f7dc..c5353a16ed 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -35,6 +35,7 @@ use omicron_common::disk::DatasetsConfig; use omicron_common::disk::DatasetsManagementResult; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; +use omicron_common::update::ArtifactHash; use sled_agent_api::*; use sled_agent_types::boot_disk::BootDiskOsWriteStatus; use sled_agent_types::boot_disk::BootDiskPathParams; @@ -57,6 +58,7 @@ use sled_agent_types::zone_bundle::CleanupCount; use sled_agent_types::zone_bundle::ZoneBundleId; use sled_agent_types::zone_bundle::ZoneBundleMetadata; 
use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::sync::Arc; use super::sled_agent::SledAgent; @@ -182,6 +184,12 @@ impl SledAgentApi for SledAgentSimImpl { Ok(HttpResponseUpdatedNoContent()) } + async fn artifact_list( + rqctx: RequestContext, + ) -> Result>, HttpError> { + Ok(HttpResponseOk(rqctx.context().artifact_store().list().await?)) + } + async fn artifact_copy_from_depot( rqctx: RequestContext, path_params: Path, From 2ae374316c08d76c88fee415d73fb7541250facc Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Mon, 28 Oct 2024 17:26:49 +0000 Subject: [PATCH 19/28] create an update artifact dataset on both M.2s --- sled-storage/src/dataset.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index a715a33a69..0f2b610c42 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -30,6 +30,7 @@ pub const CLUSTER_DATASET: &'static str = "cluster"; pub const CONFIG_DATASET: &'static str = "config"; pub const M2_DEBUG_DATASET: &'static str = "debug"; pub const M2_BACKING_DATASET: &'static str = "backing"; +pub const M2_ARTIFACT_DATASET: &'static str = "update"; pub const DEBUG_DATASET_QUOTA: ByteCount = if cfg!(any(test, feature = "testing")) { @@ -46,6 +47,10 @@ pub const DUMP_DATASET_QUOTA: ByteCount = ByteCount::from_gibibytes_u32(100); // passed to zfs create -o compression= pub const DUMP_DATASET_COMPRESSION: CompressionAlgorithm = CompressionAlgorithm::GzipN { level: GzipLevel::new::<9>() }; +// TODO-correctness: This value of 20 GiB is a wild guess -- given TUF repo +// sizes as of Oct 2024, it would be capable of storing about 10 distinct system +// versions. +pub const ARTIFACT_DATASET_QUOTA: ByteCount = ByteCount::from_gibibytes_u32(20); // U.2 datasets live under the encrypted dataset and inherit encryption pub const ZONE_DATASET: &'static str = "crypt/zone"; @@ -65,7 +70,7 @@ const U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [ .compression(DUMP_DATASET_COMPRESSION), ]; -const M2_EXPECTED_DATASET_COUNT: usize = 6; +const M2_EXPECTED_DATASET_COUNT: usize = 7; const M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ // Stores software images. // @@ -90,6 +95,9 @@ const M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ ExpectedDataset::new(CONFIG_DATASET), // Store debugging data, such as service bundles. ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA), + // Stores software artifacts (zones, OS images, Hubris images, etc.) + // extracted from TUF repos by Nexus. + ExpectedDataset::new(M2_ARTIFACT_DATASET).quota(ARTIFACT_DATASET_QUOTA), ]; // Helper type for describing expected datasets and their optional quota. 
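A quick aside on the quota sizing in the patch above, separate from the patch
series itself: the "about 10 distinct system versions" estimate follows from
assuming an extracted TUF repo occupies roughly 2 GiB on disk. A minimal
sketch of that arithmetic, using hypothetical variable names that do not
appear in the patches:

fn main() {
    // Assumption (per the Oct 2024 estimate in the TODO-correctness comment):
    // one extracted TUF repo is about 2 GiB, and the quota is 20 GiB.
    let quota_gib: u32 = 20; // mirrors ARTIFACT_DATASET_QUOTA above
    let estimated_repo_gib: u32 = 2;
    // 20 GiB / ~2 GiB per system version ≈ 10 versions per M.2 dataset.
    assert_eq!(quota_gib / estimated_repo_gib, 10);
}
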
From 45c4cac06c869199e96d84d7a441406affaf1f0c Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Mon, 28 Oct 2024 22:38:22 +0000 Subject: [PATCH 20/28] PUT artifacts to all artifact datasets --- sled-agent/src/artifact_store.rs | 355 +++++++++++++++++++++++-------- 1 file changed, 263 insertions(+), 92 deletions(-) diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index 0fb418f01a..3f142cc1ce 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -20,22 +20,23 @@ use std::str::FromStr; use std::time::Duration; use camino::{Utf8Path, Utf8PathBuf}; -use camino_tempfile::NamedUtf8TempFile; +use camino_tempfile::{NamedUtf8TempFile, Utf8TempPath}; use dropshot::{ Body, ConfigDropshot, FreeformBody, HttpError, HttpResponseOk, HttpServerStarter, Path, RequestContext, StreamingBody, }; use futures::{Stream, TryStreamExt}; -use illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT; +use http::StatusCode; use omicron_common::address::REPO_DEPOT_PORT; use omicron_common::disk::{DatasetKind, DatasetsConfig}; use omicron_common::update::ArtifactHash; use repo_depot_api::*; use sha2::{Digest, Sha256}; +use sled_storage::dataset::M2_ARTIFACT_DATASET; use sled_storage::error::Error as StorageError; use sled_storage::manager::StorageHandle; use slog::{error, info, Logger}; -use tokio::fs::File; +use tokio::fs::{File, OpenOptions}; use tokio::io::AsyncWriteExt; const TEMP_SUBDIR: &str = "tmp"; @@ -50,14 +51,15 @@ const TEMP_SUBDIR: &str = "tmp"; /// one or more paths is identical for all callers (i.e., both the real and /// simulated sled agents). /// -/// A given artifact is generally stored on exactly one of the datasets -/// designated for artifact storage. There's no way to know from its key which -/// dataset it's stored on. This means: +/// A given artifact is generally stored on both datasets designated for +/// artifact storage across both M.2 devices, but we attempt to be resilient to +/// a failing or missing M.2 device. This means: /// -/// - for PUT, we pick a dataset and store it there -/// - for GET, we look in every dataset for it until we find it -/// - for DELETE, we attempt to delete it from every dataset (even after we've -/// found it in one of them) +/// - for PUT, we try to write to all datasets, logging errors as we go; if we +/// successfully write the artifact to at least one, we return OK. +/// - for GET, we look in each dataset until we find it. +/// - for DELETE, we attempt to delete it from each dataset, logging errors as +/// we go, and failing if we saw any errors. #[derive(Clone)] pub(crate) struct ArtifactStore { log: Logger, @@ -149,6 +151,18 @@ pub enum StartError { Dropshot(#[source] Box), } +macro_rules! 
log_and_store { + ($last_error:expr, $log:expr, $verb:literal, $path:expr, $err:expr) => {{ + error!( + $log, + concat!("Failed to ", $verb, " path"); + "error" => &$err, + "path" => $path.as_str(), + ); + $last_error = Some(Error::File { verb: $verb, path: $path, err: $err }); + }}; +} + impl ArtifactStore { /// GET operation (served by Repo Depot API) /// @@ -175,13 +189,7 @@ impl ArtifactStore { } Err(err) if err.kind() == ErrorKind::NotFound => {} Err(err) => { - error!( - &self.log, - "Failed to open file"; - "error" => &err, - "path" => path.as_str(), - ); - last_error = Some(Error::File { verb: "open", path, err }); + log_and_store!(last_error, &self.log, "open", path, err); } } } @@ -259,64 +267,85 @@ impl ArtifactStore { } } - /// Common implementation for all artifact write operations, generic over a - /// stream of bytes. + /// Common implementation for all artifact write operations that creates + /// a temporary file on all datasets. + /// + /// Errors are logged and ignored unless a temporary file already exists + /// (another task is writing to this artifact) or no temporary files could + /// be created. + async fn put_writer( + &self, + sha256: ArtifactHash, + ) -> Result { + let mut inner = Vec::new(); + let mut last_error = None; + for mountpoint in self.storage.artifact_storage_paths().await? { + let temp_dir = mountpoint.join(TEMP_SUBDIR); + if let Err(err) = tokio::fs::create_dir(&temp_dir).await { + if err.kind() != ErrorKind::AlreadyExists { + log_and_store!( + last_error, &self.log, "create", temp_dir, err + ); + continue; + } + } + + let temp_path = + Utf8TempPath::from_path(temp_dir.join(sha256.to_string())); + let file = match OpenOptions::new() + .write(true) + .create_new(true) + .open(&temp_path) + .await + { + Ok(file) => file, + Err(err) => { + if err.kind() == ErrorKind::AlreadyExists { + return Err(Error::AlreadyInProgress { sha256 }); + } else { + let path = temp_path.to_path_buf(); + log_and_store!( + last_error, &self.log, "create", path, err + ); + continue; + } + } + }; + let file = NamedUtf8TempFile::from_parts(file, temp_path); + + inner.push((file, mountpoint, sha256)); + } + if inner.is_empty() { + Err(last_error.unwrap_or(Error::NoUpdateDataset)) + } else { + Ok(MultiWriter { inner }) + } + } + + /// Common implementation for all artifact write operations that copies a + /// generic stream of bytes to a set of temporary files, then persists those + /// files. async fn put_impl( &self, + multi_writer: Option, sha256: ArtifactHash, stream: impl Stream, Error>>, ) -> Result<(), Error> { - let mountpoint = self - .storage - .artifact_storage_paths() - .await? - .next() - .ok_or(Error::NoUpdateDataset)?; - let final_path = mountpoint.join(sha256.to_string()); - - let temp_dir = mountpoint.join(TEMP_SUBDIR); - if let Err(err) = tokio::fs::create_dir(&temp_dir).await { - if err.kind() != ErrorKind::AlreadyExists { - return Err(Error::File { - verb: "create directory", - path: temp_dir, - err, - }); - } - } - let (file, temp_path) = tokio::task::spawn_blocking(move || { - NamedUtf8TempFile::new_in(&temp_dir).map_err(|err| Error::File { - verb: "create temporary file in", - path: temp_dir, - err, - }) - }) - .await?? 
- .into_parts(); - let file = - NamedUtf8TempFile::from_parts(File::from_std(file), temp_path); + let multi_writer = match multi_writer { + Some(multi_writer) => multi_writer, + None => self.put_writer(sha256).await?, + }; - let (mut file, hasher) = stream + let (multi_writer, hasher) = stream .try_fold( - (file, Sha256::new()), - |(mut file, mut hasher), chunk| async move { + (multi_writer, Sha256::new()), + |(mut multi_writer, mut hasher), chunk| async move { hasher.update(&chunk); - match file.as_file_mut().write_all(chunk.as_ref()).await { - Ok(()) => Ok((file, hasher)), - Err(err) => Err(Error::File { - verb: "write to", - path: file.path().to_owned(), - err, - }), - } + multi_writer.write(&self.log, chunk).await?; + Ok((multi_writer, hasher)) }, ) .await?; - file.as_file_mut().sync_data().await.map_err(|err| Error::File { - verb: "sync", - path: final_path.clone(), - err, - })?; let digest = hasher.finalize(); if digest.as_slice() != sha256.as_ref() { @@ -326,20 +355,11 @@ impl ArtifactStore { }); } - let moved_final_path = final_path.clone(); - tokio::task::spawn_blocking(move || { - file.persist(&moved_final_path).map_err(|err| Error::File { - verb: "rename temporary file to", - path: moved_final_path, - err: err.error, - }) - }) - .await??; + multi_writer.finalize(&self.log).await?; info!( &self.log, "Wrote artifact"; "sha256" => &sha256.to_string(), - "path" => final_path.as_str(), ); Ok(()) } @@ -350,7 +370,8 @@ impl ArtifactStore { sha256: ArtifactHash, body: StreamingBody, ) -> Result<(), Error> { - self.put_impl(sha256, body.into_stream().map_err(Error::Body)).await + self.put_impl(None, sha256, body.into_stream().map_err(Error::Body)) + .await } /// POST operation (served by Sled Agent API) @@ -367,6 +388,8 @@ impl ArtifactStore { "base_url" => depot_base_url.to_owned(), )), ); + // Check that there's no conflict before we send the upstream request. + let multi_writer = self.put_writer(sha256).await?; let stream = client .artifact_get_by_sha256(&sha256.to_string()) .await @@ -382,7 +405,7 @@ impl ArtifactStore { base_url: depot_base_url.to_owned(), err: repo_depot_client::ClientError::ResponseBodyError(err), }); - self.put_impl(sha256, stream).await + self.put_impl(Some(multi_writer), sha256, stream).await } /// DELETE operation (served by Sled Agent API) @@ -410,14 +433,7 @@ impl ArtifactStore { } Err(err) if err.kind() == ErrorKind::NotFound => {} Err(err) => { - error!( - &self.log, - "Failed to remove file"; - "error" => &err, - "path" => path.as_str(), - ); - last_error = - Some(Error::File { verb: "remove", path, err }); + log_and_store!(last_error, &self.log, "remove", path, err); } } } @@ -458,8 +474,127 @@ impl DatasetsManager for StorageHandle { async fn artifact_storage_paths( &self, ) -> Result + '_, StorageError> { - let config = self.datasets_config_list().await?; - Ok(filter_dataset_mountpoints(config, ZPOOL_MOUNTPOINT_ROOT.into())) + // TODO: When datasets are managed by Reconfigurator (#6229), + // this should be changed to use `self.datasets_config_list()` and + // `filter_dataset_mountpoints`. + Ok(self + .get_latest_disks() + .await + .all_m2_mountpoints(M2_ARTIFACT_DATASET) + .into_iter()) + } +} + +/// Abstraction for `ArtifactStore::put_impl` that handles writing to several +/// temporary files. +struct MultiWriter { + inner: Vec<(NamedUtf8TempFile, Utf8PathBuf, ArtifactHash)>, +} + +impl MultiWriter { + /// Write `chunk` to all files. If an error occurs, it is logged and the + /// temporary file is dropped. 
If there are no files left to write to, the + /// most recently-seen error is returned. + async fn write( + &mut self, + log: &Logger, + chunk: impl AsRef<[u8]>, + ) -> Result<(), Error> { + // We would rather use `Vec::retain_mut` here but we'd need an async + // version... + let to_write = std::mem::take(&mut self.inner); + self.inner.reserve_exact(to_write.len()); + let mut last_error = None; + for (mut file, mountpoint, sha256) in to_write { + match file.as_file_mut().write_all(chunk.as_ref()).await { + Ok(()) => self.inner.push((file, mountpoint, sha256)), + Err(err) => { + let path = file.path().to_owned(); + log_and_store!(last_error, log, "write to", path, err); + // `file` and `final_path` are dropped here, cleaning up + // the file + } + } + } + + if self.inner.is_empty() { + Err(last_error.unwrap_or(Error::NoUpdateDataset)) + } else { + Ok(()) + } + } + + /// Rename all files to their final paths. If an error occurs, it is logged. + /// If none of the files are renamed successfully, the most recently-seen + /// error is returned. + async fn finalize(self, log: &Logger) -> Result<(), Error> { + let mut last_error = None; + let mut any_success = false; + for (mut file, mountpoint, sha256) in self.inner { + // 1. Open the mountpoint and its temp dir so we can fsync them at + // the end. + let parent_dir = match File::open(&mountpoint).await { + Ok(dir) => dir, + Err(err) => { + log_and_store!(last_error, log, "open", mountpoint, err); + continue; + } + }; + let temp_dir_path = mountpoint.join(TEMP_SUBDIR); + let temp_dir = match File::open(&temp_dir_path).await { + Ok(dir) => dir, + Err(err) => { + log_and_store!(last_error, log, "open", temp_dir_path, err); + continue; + } + }; + // 2. fsync the file. + if let Err(err) = file.as_file_mut().sync_all().await { + let path = file.path().to_owned(); + log_and_store!(last_error, log, "sync", path, err); + continue; + } + // 3. Rename temporary file. + let final_path = mountpoint.join(sha256.to_string()); + let moved_final_path = final_path.clone(); + if let Err(err) = tokio::task::spawn_blocking(move || { + file.persist(&moved_final_path) + }) + .await? + { + error!( + log, + "Failed to rename temporary file"; + "error" => &err.error, + "from" => err.file.path().as_str(), + "to" => final_path.as_str(), + ); + last_error = Some(Error::FileRename { + from: err.file.path().to_owned(), + to: final_path, + err: err.error, + }); + continue; + } + // 4. fsync the parent directory for both the final path and its + // previous path. 
+ if let Err(err) = parent_dir.sync_all().await { + log_and_store!(last_error, log, "sync", mountpoint, err); + continue; + } + if let Err(err) = temp_dir.sync_all().await { + log_and_store!(last_error, log, "sync", temp_dir_path, err); + continue; + } + + any_success = true; + } + + if any_success { + Ok(()) + } else { + Err(last_error.unwrap_or(Error::NoUpdateDataset)) + } } } @@ -486,6 +621,9 @@ impl RepoDepotApi for RepoDepotImpl { #[derive(Debug, thiserror::Error)] pub(crate) enum Error { + #[error("Another task is already writing artifact {sha256}")] + AlreadyInProgress { sha256: ArtifactHash }, + #[error("Error while reading request body")] Body(dropshot::HttpError), @@ -508,6 +646,14 @@ pub(crate) enum Error { err: std::io::Error, }, + #[error("Failed to rename `{from}` to `{to}`")] + FileRename { + from: Utf8PathBuf, + to: Utf8PathBuf, + #[source] + err: std::io::Error, + }, + #[error("Digest mismatch: expected {expected}, actual {actual}")] HashMismatch { expected: ArtifactHash, actual: ArtifactHash }, @@ -524,13 +670,19 @@ pub(crate) enum Error { impl From for HttpError { fn from(err: Error) -> HttpError { match err { + Error::AlreadyInProgress { .. } => HttpError::for_client_error( + None, + StatusCode::CONFLICT, + err.to_string(), + ), Error::Body(inner) => inner, Error::DatasetConfig(_) | Error::NoUpdateDataset => { HttpError::for_unavail(None, err.to_string()) } - Error::DepotCopy { .. } | Error::File { .. } | Error::Join(_) => { - HttpError::for_internal_error(err.to_string()) - } + Error::DepotCopy { .. } + | Error::File { .. } + | Error::FileRename { .. } + | Error::Join(_) => HttpError::for_internal_error(err.to_string()), Error::HashMismatch { .. } => { HttpError::for_bad_request(None, err.to_string()) } @@ -612,7 +764,7 @@ mod test { #[tokio::test] async fn list_get_put_delete() { let log = test_setup_log("get_put_delete"); - let backend = TestBackend::new(1); + let backend = TestBackend::new(2); let store = ArtifactStore::new(&log.log, backend); // list succeeds with an empty result @@ -633,7 +785,11 @@ mod test { // 3. 
we don't fail trying to create TEMP_SUBDIR twice for _ in 0..2 { store - .put_impl(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) })) + .put_impl( + None, + TEST_HASH, + stream::once(async { Ok(TEST_ARTIFACT) }), + ) .await .unwrap(); // list lists the file @@ -645,6 +801,17 @@ mod test { assert_eq!(vec, TEST_ARTIFACT); } + // all datasets should have the artifact + for mountpoint in store.storage.artifact_storage_paths().await.unwrap() + { + assert_eq!( + tokio::fs::read(mountpoint.join(TEST_HASH.to_string())) + .await + .unwrap(), + TEST_ARTIFACT + ); + } + // delete succeeds and is idempotent for _ in 0..2 { store.delete(TEST_HASH).await.unwrap(); @@ -672,7 +839,11 @@ mod test { assert!(matches!( store - .put_impl(TEST_HASH, stream::once(async { Ok(TEST_ARTIFACT) })) + .put_impl( + None, + TEST_HASH, + stream::once(async { Ok(TEST_ARTIFACT) }) + ) .await, Err(Error::NoUpdateDataset) )); From 3c2f86684acd5e0403f3dfd2efdb648e0f6a477f Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 29 Oct 2024 01:04:41 +0000 Subject: [PATCH 21/28] change list API to return a count of each artifact --- openapi/sled-agent.json | 14 +++++++------- sled-agent/api/src/lib.rs | 4 ++-- sled-agent/src/artifact_store.rs | 19 +++++++++++++------ sled-agent/src/http_entrypoints.rs | 4 ++-- sled-agent/src/sim/http_entrypoints.rs | 3 +-- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 75075c8f30..e52076fa4b 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -19,13 +19,13 @@ "content": { "application/json": { "schema": { - "title": "Set_of_hex_schema", - "type": "array", - "items": { - "type": "string", - "format": "hex string (32 bytes)" - }, - "uniqueItems": true + "title": "Map_of_uint", + "type": "object", + "additionalProperties": { + "type": "integer", + "format": "uint", + "minimum": 0 + } } } } diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 9bf7947423..470d499f7f 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::BTreeMap; use std::time::Duration; use camino::Utf8PathBuf; @@ -309,7 +309,7 @@ pub trait SledAgentApi { }] async fn artifact_list( rqctx: RequestContext, - ) -> Result>, HttpError>; + ) -> Result>, HttpError>; #[endpoint { method = POST, diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index 3f142cc1ce..d2db0e8c20 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -13,7 +13,7 @@ //! //! POST, PUT, and DELETE operations are handled by the Sled Agent API. -use std::collections::BTreeSet; +use std::collections::BTreeMap; use std::io::ErrorKind; use std::net::SocketAddrV6; use std::str::FromStr; @@ -204,8 +204,10 @@ impl ArtifactStore { /// /// We try all datasets, logging errors as we go; if we're experiencing I/O /// errors, Nexus should still be aware of the artifacts we think we have. - pub(crate) async fn list(&self) -> Result, Error> { - let mut set = BTreeSet::new(); + pub(crate) async fn list( + &self, + ) -> Result, Error> { + let mut map = BTreeMap::new(); let mut any_datasets = false; for mountpoint in self.storage.artifact_storage_paths().await? 
{ any_datasets = true; @@ -243,7 +245,7 @@ impl ArtifactStore { if let Ok(file_name) = entry.file_name().into_string() { if let Ok(hash) = ArtifactHash::from_str(&file_name) { - set.insert(hash); + *map.entry(hash).or_default() += 1; } } } @@ -261,7 +263,7 @@ impl ArtifactStore { } } if any_datasets { - Ok(set) + Ok(map) } else { Err(Error::NoUpdateDataset) } @@ -793,7 +795,12 @@ mod test { .await .unwrap(); // list lists the file - assert!(store.list().await.unwrap().into_iter().eq([TEST_HASH])); + assert!(store + .list() + .await + .unwrap() + .into_iter() + .eq([(TEST_HASH, 2)])); // get succeeds, file reads back OK let mut file = store.get(TEST_HASH).await.unwrap(); let mut vec = Vec::new(); diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index d70eeb1b9a..3661c347df 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -51,7 +51,7 @@ use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, StorageLimit, ZoneBundleId, ZoneBundleMetadata, }; -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::BTreeMap; type SledApiDescription = ApiDescription; @@ -404,7 +404,7 @@ impl SledAgentApi for SledAgentImpl { async fn artifact_list( rqctx: RequestContext, - ) -> Result>, HttpError> { + ) -> Result>, HttpError> { Ok(HttpResponseOk(rqctx.context().artifact_store().list().await?)) } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index c5353a16ed..f401f1642d 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -58,7 +58,6 @@ use sled_agent_types::zone_bundle::CleanupCount; use sled_agent_types::zone_bundle::ZoneBundleId; use sled_agent_types::zone_bundle::ZoneBundleMetadata; use std::collections::BTreeMap; -use std::collections::BTreeSet; use std::sync::Arc; use super::sled_agent::SledAgent; @@ -186,7 +185,7 @@ impl SledAgentApi for SledAgentSimImpl { async fn artifact_list( rqctx: RequestContext, - ) -> Result>, HttpError> { + ) -> Result>, HttpError> { Ok(HttpResponseOk(rqctx.context().artifact_store().list().await?)) } From 743a67bd8c1f7952f69fef94c7cea84b13cad641 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 29 Oct 2024 06:22:54 +0000 Subject: [PATCH 22/28] make copy_from_depot create a task and return --- sled-agent/src/artifact_store.rs | 253 ++++++++++++++++++------------- 1 file changed, 150 insertions(+), 103 deletions(-) diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index d2db0e8c20..e01db03edf 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -36,6 +36,7 @@ use sled_storage::dataset::M2_ARTIFACT_DATASET; use sled_storage::error::Error as StorageError; use sled_storage::manager::StorageHandle; use slog::{error, info, Logger}; +use slog_error_chain::SlogInlineError; use tokio::fs::{File, OpenOptions}; use tokio::io::AsyncWriteExt; @@ -275,10 +276,10 @@ impl ArtifactStore { /// Errors are logged and ignored unless a temporary file already exists /// (another task is writing to this artifact) or no temporary files could /// be created. - async fn put_writer( + async fn writer( &self, sha256: ArtifactHash, - ) -> Result { + ) -> Result { let mut inner = Vec::new(); let mut last_error = None; for mountpoint in self.storage.artifact_storage_paths().await? 
{ @@ -315,55 +316,18 @@ impl ArtifactStore { }; let file = NamedUtf8TempFile::from_parts(file, temp_path); - inner.push((file, mountpoint, sha256)); + inner.push(Some((file, mountpoint))); } if inner.is_empty() { Err(last_error.unwrap_or(Error::NoUpdateDataset)) } else { - Ok(MultiWriter { inner }) - } - } - - /// Common implementation for all artifact write operations that copies a - /// generic stream of bytes to a set of temporary files, then persists those - /// files. - async fn put_impl( - &self, - multi_writer: Option, - sha256: ArtifactHash, - stream: impl Stream, Error>>, - ) -> Result<(), Error> { - let multi_writer = match multi_writer { - Some(multi_writer) => multi_writer, - None => self.put_writer(sha256).await?, - }; - - let (multi_writer, hasher) = stream - .try_fold( - (multi_writer, Sha256::new()), - |(mut multi_writer, mut hasher), chunk| async move { - hasher.update(&chunk); - multi_writer.write(&self.log, chunk).await?; - Ok((multi_writer, hasher)) - }, - ) - .await?; - - let digest = hasher.finalize(); - if digest.as_slice() != sha256.as_ref() { - return Err(Error::HashMismatch { - expected: sha256, - actual: ArtifactHash(digest.into()), - }); + Ok(ArtifactWriter { + hasher: Sha256::new(), + files: inner, + log: self.log.clone(), + sha256, + }) } - - multi_writer.finalize(&self.log).await?; - info!( - &self.log, - "Wrote artifact"; - "sha256" => &sha256.to_string(), - ); - Ok(()) } /// PUT operation (served by Sled Agent API) which takes a [`StreamingBody`] @@ -372,7 +336,9 @@ impl ArtifactStore { sha256: ArtifactHash, body: StreamingBody, ) -> Result<(), Error> { - self.put_impl(None, sha256, body.into_stream().map_err(Error::Body)) + self.writer(sha256) + .await? + .write_stream(body.into_stream().map_err(Error::Body)) .await } @@ -391,23 +357,35 @@ impl ArtifactStore { )), ); // Check that there's no conflict before we send the upstream request. - let multi_writer = self.put_writer(sha256).await?; - let stream = client + let writer = self.writer(sha256).await?; + let response = client .artifact_get_by_sha256(&sha256.to_string()) .await .map_err(|err| Error::DepotCopy { sha256, base_url: depot_base_url.to_owned(), err, - })? - .into_inner() - .into_inner() - .map_err(|err| Error::DepotCopy { - sha256, - base_url: depot_base_url.to_owned(), - err: repo_depot_client::ClientError::ResponseBodyError(err), + })?; + // Copy from the stream on its own task and immediately return. + let log = self.log.clone(); + let base_url = depot_base_url.to_owned(); + tokio::task::spawn(async move { + let stream = response.into_inner().into_inner().map_err(|err| { + Error::DepotCopy { + sha256, + base_url: base_url.clone(), + err: repo_depot_client::ClientError::ResponseBodyError(err), + } }); - self.put_impl(Some(multi_writer), sha256, stream).await + if let Err(err) = writer.write_stream(stream).await { + error!( + &log, + "Failed to write artifact"; + "err" => &err, + ); + } + }); + Ok(()) } /// DELETE operation (served by Sled Agent API) @@ -454,7 +432,7 @@ impl ArtifactStore { /// Abstracts over what kind of sled agent we are; each of the real sled agent, /// simulated sled agent, and this module's unit tests have different ways of /// keeping track of the datasets on the system. 
-pub(crate) trait DatasetsManager { +pub(crate) trait DatasetsManager: Sync { async fn artifact_storage_paths( &self, ) -> Result + '_, StorageError>; @@ -487,39 +465,55 @@ impl DatasetsManager for StorageHandle { } } -/// Abstraction for `ArtifactStore::put_impl` that handles writing to several -/// temporary files. -struct MultiWriter { - inner: Vec<(NamedUtf8TempFile, Utf8PathBuf, ArtifactHash)>, +/// Abstraction that handles writing to several temporary files. +struct ArtifactWriter { + files: Vec, Utf8PathBuf)>>, + hasher: Sha256, + log: Logger, + sha256: ArtifactHash, } -impl MultiWriter { +impl ArtifactWriter { + async fn write_stream( + self, + stream: impl Stream, Error>>, + ) -> Result<(), Error> { + let writer = stream + .try_fold(self, |mut writer, chunk| async { + writer.write(chunk).await?; + Ok(writer) + }) + .await?; + writer.finalize().await + } + /// Write `chunk` to all files. If an error occurs, it is logged and the /// temporary file is dropped. If there are no files left to write to, the /// most recently-seen error is returned. - async fn write( - &mut self, - log: &Logger, - chunk: impl AsRef<[u8]>, - ) -> Result<(), Error> { - // We would rather use `Vec::retain_mut` here but we'd need an async - // version... - let to_write = std::mem::take(&mut self.inner); - self.inner.reserve_exact(to_write.len()); + async fn write(&mut self, chunk: impl AsRef<[u8]>) -> Result<(), Error> { + self.hasher.update(&chunk); + let mut last_error = None; - for (mut file, mountpoint, sha256) in to_write { - match file.as_file_mut().write_all(chunk.as_ref()).await { - Ok(()) => self.inner.push((file, mountpoint, sha256)), - Err(err) => { - let path = file.path().to_owned(); - log_and_store!(last_error, log, "write to", path, err); - // `file` and `final_path` are dropped here, cleaning up - // the file + for option in &mut self.files { + if let Some((mut file, mountpoint)) = option.take() { + match file.as_file_mut().write_all(chunk.as_ref()).await { + Ok(()) => { + *option = Some((file, mountpoint)); + } + Err(err) => { + let path = file.path().to_owned(); + log_and_store!( + last_error, &self.log, "write to", path, err + ); + // `file` and `final_path` are dropped here, cleaning up + // the file + } } } } - if self.inner.is_empty() { + self.files.retain(Option::is_some); + if self.files.is_empty() { Err(last_error.unwrap_or(Error::NoUpdateDataset)) } else { Ok(()) @@ -529,16 +523,26 @@ impl MultiWriter { /// Rename all files to their final paths. If an error occurs, it is logged. /// If none of the files are renamed successfully, the most recently-seen /// error is returned. - async fn finalize(self, log: &Logger) -> Result<(), Error> { + async fn finalize(self) -> Result<(), Error> { + let digest = self.hasher.finalize(); + if digest.as_slice() != self.sha256.as_ref() { + return Err(Error::HashMismatch { + expected: self.sha256, + actual: ArtifactHash(digest.into()), + }); + } + let mut last_error = None; let mut any_success = false; - for (mut file, mountpoint, sha256) in self.inner { + for (mut file, mountpoint) in self.files.into_iter().flatten() { // 1. Open the mountpoint and its temp dir so we can fsync them at // the end. 
let parent_dir = match File::open(&mountpoint).await { Ok(dir) => dir, Err(err) => { - log_and_store!(last_error, log, "open", mountpoint, err); + log_and_store!( + last_error, &self.log, "open", mountpoint, err + ); continue; } }; @@ -546,18 +550,24 @@ impl MultiWriter { let temp_dir = match File::open(&temp_dir_path).await { Ok(dir) => dir, Err(err) => { - log_and_store!(last_error, log, "open", temp_dir_path, err); + log_and_store!( + last_error, + &self.log, + "open", + temp_dir_path, + err + ); continue; } }; // 2. fsync the file. if let Err(err) = file.as_file_mut().sync_all().await { let path = file.path().to_owned(); - log_and_store!(last_error, log, "sync", path, err); + log_and_store!(last_error, &self.log, "sync", path, err); continue; } // 3. Rename temporary file. - let final_path = mountpoint.join(sha256.to_string()); + let final_path = mountpoint.join(self.sha256.to_string()); let moved_final_path = final_path.clone(); if let Err(err) = tokio::task::spawn_blocking(move || { file.persist(&moved_final_path) @@ -565,7 +575,7 @@ impl MultiWriter { .await? { error!( - log, + &self.log, "Failed to rename temporary file"; "error" => &err.error, "from" => err.file.path().as_str(), @@ -581,11 +591,17 @@ impl MultiWriter { // 4. fsync the parent directory for both the final path and its // previous path. if let Err(err) = parent_dir.sync_all().await { - log_and_store!(last_error, log, "sync", mountpoint, err); + log_and_store!(last_error, &self.log, "sync", mountpoint, err); continue; } if let Err(err) = temp_dir.sync_all().await { - log_and_store!(last_error, log, "sync", temp_dir_path, err); + log_and_store!( + last_error, + &self.log, + "sync", + temp_dir_path, + err + ); continue; } @@ -593,6 +609,11 @@ impl MultiWriter { } if any_success { + info!( + &self.log, + "Wrote artifact"; + "sha256" => &self.sha256.to_string(), + ); Ok(()) } else { Err(last_error.unwrap_or(Error::NoUpdateDataset)) @@ -621,7 +642,7 @@ impl RepoDepotApi for RepoDepotImpl { } } -#[derive(Debug, thiserror::Error)] +#[derive(Debug, thiserror::Error, SlogInlineError)] pub(crate) enum Error { #[error("Another task is already writing artifact {sha256}")] AlreadyInProgress { sha256: ArtifactHash }, @@ -787,11 +808,10 @@ mod test { // 3. 
we don't fail trying to create TEMP_SUBDIR twice for _ in 0..2 { store - .put_impl( - None, - TEST_HASH, - stream::once(async { Ok(TEST_ARTIFACT) }), - ) + .writer(TEST_HASH) + .await + .unwrap() + .write_stream(stream::once(async { Ok(TEST_ARTIFACT) })) .await .unwrap(); // list lists the file @@ -845,13 +865,7 @@ mod test { let store = ArtifactStore::new(&log.log, backend); assert!(matches!( - store - .put_impl( - None, - TEST_HASH, - stream::once(async { Ok(TEST_ARTIFACT) }) - ) - .await, + store.writer(TEST_HASH).await, Err(Error::NoUpdateDataset) )); assert!(matches!( @@ -866,4 +880,37 @@ mod test { log.cleanup_successful(); } + + #[tokio::test] + async fn wrong_hash() { + const ACTUAL: ArtifactHash = ArtifactHash(hex!( + "4d27a9d1ddb65e0f2350a400cf73157e42ae2ca687a4220aa0a73b9bb2d211f7" + )); + + let log = test_setup_log("wrong_hash"); + let backend = TestBackend::new(2); + let store = ArtifactStore::new(&log.log, backend); + let err = store + .writer(TEST_HASH) + .await + .unwrap() + .write_stream(stream::once(async { + Ok(b"This isn't right at all.") + })) + .await + .unwrap_err(); + match err { + Error::HashMismatch { expected, actual } => { + assert_eq!(expected, TEST_HASH); + assert_eq!(actual, ACTUAL); + } + err => panic!("wrong error: {err}"), + } + assert!(matches!( + store.get(TEST_HASH).await, + Err(Error::NotFound { .. }) + )); + + log.cleanup_successful(); + } } From 58b9cbe61efde81211a0a1ab99d9ee644fb69b69 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 29 Oct 2024 06:25:26 +0000 Subject: [PATCH 23/28] ls-apis expectorate --- dev-tools/ls-apis/tests/api_dependencies.out | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index a902484a75..3fa9942f85 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -71,6 +71,9 @@ Propolis (client: propolis-client) Crucible Repair (client: repair-client) consumed by: crucible-downstairs (crucible/downstairs) +Repo Depot API (client: repo-depot-client) + consumed by: omicron-sled-agent (omicron/sled-agent) + Sled Agent (client: sled-agent-client) consumed by: dpd (dendrite/dpd) consumed by: omicron-nexus (omicron/nexus) From cd7dc7b9f8b984b365758d526d580d2afcb82894 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 30 Oct 2024 16:46:56 +0000 Subject: [PATCH 24/28] review comments --- sled-agent/src/artifact_store.rs | 55 +++++++++----------------------- 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index e01db03edf..a7cab622de 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -11,7 +11,8 @@ //! it does not have from another Repo Depot that does have them (at Nexus's //! direction). This API's implementation is also part of this module. //! -//! POST, PUT, and DELETE operations are handled by the Sled Agent API. +//! POST, PUT, and DELETE operations are called by Nexus and handled by the Sled +//! Agent API. use std::collections::BTreeMap; use std::io::ErrorKind; @@ -280,7 +281,7 @@ impl ArtifactStore { &self, sha256: ArtifactHash, ) -> Result { - let mut inner = Vec::new(); + let mut files = Vec::new(); let mut last_error = None; for mountpoint in self.storage.artifact_storage_paths().await? 
{ let temp_dir = mountpoint.join(TEMP_SUBDIR); @@ -316,14 +317,14 @@ impl ArtifactStore { }; let file = NamedUtf8TempFile::from_parts(file, temp_path); - inner.push(Some((file, mountpoint))); + files.push(Some((file, mountpoint))); } - if inner.is_empty() { + if files.is_empty() { Err(last_error.unwrap_or(Error::NoUpdateDataset)) } else { Ok(ArtifactWriter { hasher: Sha256::new(), - files: inner, + files, log: self.log.clone(), sha256, }) @@ -535,8 +536,13 @@ impl ArtifactWriter { let mut last_error = None; let mut any_success = false; for (mut file, mountpoint) in self.files.into_iter().flatten() { - // 1. Open the mountpoint and its temp dir so we can fsync them at - // the end. + // 1. fsync the temporary file. + if let Err(err) = file.as_file_mut().sync_all().await { + let path = file.path().to_owned(); + log_and_store!(last_error, &self.log, "sync", path, err); + continue; + } + // 2. Open the parent directory so we can fsync it. let parent_dir = match File::open(&mountpoint).await { Ok(dir) => dir, Err(err) => { @@ -546,27 +552,7 @@ impl ArtifactWriter { continue; } }; - let temp_dir_path = mountpoint.join(TEMP_SUBDIR); - let temp_dir = match File::open(&temp_dir_path).await { - Ok(dir) => dir, - Err(err) => { - log_and_store!( - last_error, - &self.log, - "open", - temp_dir_path, - err - ); - continue; - } - }; - // 2. fsync the file. - if let Err(err) = file.as_file_mut().sync_all().await { - let path = file.path().to_owned(); - log_and_store!(last_error, &self.log, "sync", path, err); - continue; - } - // 3. Rename temporary file. + // 3. Rename the temporary file. let final_path = mountpoint.join(self.sha256.to_string()); let moved_final_path = final_path.clone(); if let Err(err) = tokio::task::spawn_blocking(move || { @@ -588,22 +574,11 @@ impl ArtifactWriter { }); continue; } - // 4. fsync the parent directory for both the final path and its - // previous path. + // 4. fsync the parent directory. if let Err(err) = parent_dir.sync_all().await { log_and_store!(last_error, &self.log, "sync", mountpoint, err); continue; } - if let Err(err) = temp_dir.sync_all().await { - log_and_store!( - last_error, - &self.log, - "sync", - temp_dir_path, - err - ); - continue; - } any_success = true; } From e967f02c9a8ad266bdaabe8fe3fee64c1596728e Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 30 Oct 2024 16:56:50 +0000 Subject: [PATCH 25/28] propagate non-fatal write errors to `finalize()` --- sled-agent/src/artifact_store.rs | 75 +++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index a7cab622de..31033cf3bb 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -272,11 +272,18 @@ impl ArtifactStore { } /// Common implementation for all artifact write operations that creates - /// a temporary file on all datasets. + /// a temporary file on all datasets. Returns an [`ArtifactWriter`] that + /// can be used to write the artifact to all temporary files, then move all + /// temporary files to their final paths. /// - /// Errors are logged and ignored unless a temporary file already exists - /// (another task is writing to this artifact) or no temporary files could - /// be created. + /// Most errors during the write process are considered non-fatal errors. + /// All non-fatal errors are logged, and the most recently-seen non-fatal + /// error is returned by [`ArtifactWriter::finalize`]. 
+ /// + /// In this method, possible fatal errors are: + /// - No temporary files could be created. + /// - A temporary file already exists (another task is writing to this + /// artifact). async fn writer( &self, sha256: ArtifactHash, @@ -327,6 +334,7 @@ impl ArtifactStore { files, log: self.log.clone(), sha256, + last_error, }) } } @@ -472,9 +480,13 @@ struct ArtifactWriter { hasher: Sha256, log: Logger, sha256: ArtifactHash, + last_error: Option, } impl ArtifactWriter { + /// Calls [`ArtifactWriter::write`] for each chunk in the stream, then + /// [`ArtifactWriter::finalize`]. See the documentation for these functions + /// for error handling information. async fn write_stream( self, stream: impl Stream, Error>>, @@ -488,13 +500,17 @@ impl ArtifactWriter { writer.finalize().await } - /// Write `chunk` to all files. If an error occurs, it is logged and the - /// temporary file is dropped. If there are no files left to write to, the - /// most recently-seen error is returned. + /// Write `chunk` to all temporary files. + /// + /// Errors in this method are considered non-fatal errors. All non-fatal + /// errors are logged, and the most recently-seen non-fatal error is + /// returned by [`ArtifactWriter::finalize`]. + /// + /// If all files have failed, this method returns the most recently-seen + /// non-fatal error as a fatal error. async fn write(&mut self, chunk: impl AsRef<[u8]>) -> Result<(), Error> { self.hasher.update(&chunk); - let mut last_error = None; for option in &mut self.files { if let Some((mut file, mountpoint)) = option.take() { match file.as_file_mut().write_all(chunk.as_ref()).await { @@ -504,7 +520,11 @@ impl ArtifactWriter { Err(err) => { let path = file.path().to_owned(); log_and_store!( - last_error, &self.log, "write to", path, err + self.last_error, + &self.log, + "write to", + path, + err ); // `file` and `final_path` are dropped here, cleaning up // the file @@ -515,16 +535,18 @@ impl ArtifactWriter { self.files.retain(Option::is_some); if self.files.is_empty() { - Err(last_error.unwrap_or(Error::NoUpdateDataset)) + Err(self.last_error.take().unwrap_or(Error::NoUpdateDataset)) } else { Ok(()) } } - /// Rename all files to their final paths. If an error occurs, it is logged. - /// If none of the files are renamed successfully, the most recently-seen - /// error is returned. - async fn finalize(self) -> Result<(), Error> { + /// Rename all files to their final paths. + /// + /// Errors in this method are considered non-fatal errors, but this method + /// will return the most recently-seen error by any method in the write + /// process. + async fn finalize(mut self) -> Result<(), Error> { let digest = self.hasher.finalize(); if digest.as_slice() != self.sha256.as_ref() { return Err(Error::HashMismatch { @@ -533,13 +555,12 @@ impl ArtifactWriter { }); } - let mut last_error = None; let mut any_success = false; for (mut file, mountpoint) in self.files.into_iter().flatten() { // 1. fsync the temporary file. if let Err(err) = file.as_file_mut().sync_all().await { let path = file.path().to_owned(); - log_and_store!(last_error, &self.log, "sync", path, err); + log_and_store!(self.last_error, &self.log, "sync", path, err); continue; } // 2. Open the parent directory so we can fsync it. 
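The persist sequence this patch settles on is the classic crash-safe write pattern: fsync the temporary file, rename it into place, then fsync the parent directory so the rename itself survives a crash. Here is a minimal standalone sketch of that pattern in synchronous std-only Rust for clarity; the function name and paths are illustrative, not from the patch, and fsyncing a directory via File::open is a Unix-specific assumption:

    use std::fs::{self, File};
    use std::io::{self, Write};
    use std::path::Path;

    // Illustrative helper, not part of the patch: durably write `data`
    // to `mountpoint/name` via a temporary file.
    fn persist_artifact(
        mountpoint: &Path,
        name: &str,
        data: &[u8],
    ) -> io::Result<()> {
        let temp_dir = mountpoint.join("tmp");
        fs::create_dir_all(&temp_dir)?;
        let temp_path = temp_dir.join(name);

        // 1. Write and fsync the temporary file so its contents are
        //    durable before anything points at them.
        let mut file = File::create(&temp_path)?;
        file.write_all(data)?;
        file.sync_all()?;

        // 2. Rename into place; within one filesystem this replaces any
        //    existing file atomically.
        fs::rename(&temp_path, mountpoint.join(name))?;

        // 3. fsync the parent directory so the directory entry created
        //    by the rename is durable too (works on Unix).
        File::open(mountpoint)?.sync_all()?;
        Ok(())
    }

The ordering matters: syncing the file before the rename means a crash can never leave the final path holding incomplete data, which is why the patch moves the file fsync ahead of opening the parent directory.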
@@ -547,7 +568,11 @@ impl ArtifactWriter { Ok(dir) => dir, Err(err) => { log_and_store!( - last_error, &self.log, "open", mountpoint, err + self.last_error, + &self.log, + "open", + mountpoint, + err ); continue; } @@ -567,7 +592,7 @@ impl ArtifactWriter { "from" => err.file.path().as_str(), "to" => final_path.as_str(), ); - last_error = Some(Error::FileRename { + self.last_error = Some(Error::FileRename { from: err.file.path().to_owned(), to: final_path, err: err.error, @@ -576,14 +601,22 @@ impl ArtifactWriter { } // 4. fsync the parent directory. if let Err(err) = parent_dir.sync_all().await { - log_and_store!(last_error, &self.log, "sync", mountpoint, err); + log_and_store!( + self.last_error, + &self.log, + "sync", + mountpoint, + err + ); continue; } any_success = true; } - if any_success { + if let Some(last_error) = self.last_error { + Err(last_error) + } else if any_success { info!( &self.log, "Wrote artifact"; @@ -591,7 +624,7 @@ impl ArtifactWriter { ); Ok(()) } else { - Err(last_error.unwrap_or(Error::NoUpdateDataset)) + Err(Error::NoUpdateDataset) } } } From 508861d5084b4e40296dcd1dbd852a7d41581d91 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 30 Oct 2024 18:07:01 +0000 Subject: [PATCH 26/28] expectoraaaaate --- dev-tools/ls-apis/tests/api_dependencies.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index 5f2fa1ddb3..aee8cd7a70 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -72,7 +72,7 @@ Crucible Repair (client: repair-client) consumed by: crucible-downstairs (crucible/downstairs) via 1 path Repo Depot API (client: repo-depot-client) - consumed by: omicron-sled-agent (omicron/sled-agent) + consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path Sled Agent (client: sled-agent-client) consumed by: dpd (dendrite/dpd) via 1 path From 32afac51e2a18318a2690744684356a1c7eeb40c Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Thu, 31 Oct 2024 16:43:01 +0000 Subject: [PATCH 27/28] improved API responses for PUT/POST --- openapi/sled-agent.json | 44 ++++++++++-- sled-agent/api/src/lib.rs | 20 ++++-- sled-agent/src/artifact_store.rs | 92 +++++++++++--------------- sled-agent/src/http_entrypoints.rs | 20 +++--- sled-agent/src/sim/http_entrypoints.rs | 13 ++-- 5 files changed, 113 insertions(+), 76 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 1554125714..d526943231 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -65,8 +65,15 @@ "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ArtifactPutResponse" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -127,8 +134,15 @@ "required": true }, "responses": { - "204": { - "description": "resource updated" + "202": { + "description": "successfully enqueued operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ArtifactCopyFromDepotResponse" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -1557,6 +1571,28 @@ "depot_base_url" ] }, + "ArtifactCopyFromDepotResponse": { + "type": "object" + }, + "ArtifactPutResponse": { + "type": "object", + "properties": { + "datasets": { + "type": "integer", + "format": "uint", + "minimum": 0 + }, + "successful_writes": { + 
"type": "integer", + "format": "uint", + "minimum": 0 + } + }, + "required": [ + "datasets", + "successful_writes" + ] + }, "Baseboard": { "description": "Describes properties that should uniquely identify a Gimlet.", "oneOf": [ diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 470d499f7f..bdf792e943 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -7,9 +7,10 @@ use std::time::Duration; use camino::Utf8PathBuf; use dropshot::{ - FreeformBody, HttpError, HttpResponseCreated, HttpResponseDeleted, - HttpResponseHeaders, HttpResponseOk, HttpResponseUpdatedNoContent, Path, - Query, RequestContext, StreamingBody, TypedBody, + FreeformBody, HttpError, HttpResponseAccepted, HttpResponseCreated, + HttpResponseDeleted, HttpResponseHeaders, HttpResponseOk, + HttpResponseUpdatedNoContent, Path, Query, RequestContext, StreamingBody, + TypedBody, }; use nexus_sled_agent_shared::inventory::{ Inventory, OmicronZonesConfig, SledRole, @@ -319,7 +320,7 @@ pub trait SledAgentApi { rqctx: RequestContext, path_params: Path, body: TypedBody, - ) -> Result; + ) -> Result, HttpError>; #[endpoint { method = PUT, @@ -329,7 +330,7 @@ pub trait SledAgentApi { rqctx: RequestContext, path_params: Path, body: StreamingBody, - ) -> Result; + ) -> Result, HttpError>; #[endpoint { method = DELETE, @@ -596,6 +597,15 @@ pub struct ArtifactCopyFromDepotBody { pub depot_base_url: String, } +#[derive(Serialize, JsonSchema)] +pub struct ArtifactCopyFromDepotResponse {} + +#[derive(Debug, Serialize, JsonSchema)] +pub struct ArtifactPutResponse { + pub datasets: usize, + pub successful_writes: usize, +} + #[derive(Deserialize, JsonSchema)] pub struct VmmIssueDiskSnapshotRequestPathParam { pub propolis_id: PropolisUuid, diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index 31033cf3bb..fc0dc4a20a 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -33,6 +33,7 @@ use omicron_common::disk::{DatasetKind, DatasetsConfig}; use omicron_common::update::ArtifactHash; use repo_depot_api::*; use sha2::{Digest, Sha256}; +use sled_agent_api::ArtifactPutResponse; use sled_storage::dataset::M2_ARTIFACT_DATASET; use sled_storage::error::Error as StorageError; use sled_storage::manager::StorageHandle; @@ -175,16 +176,16 @@ impl ArtifactStore { &self, sha256: ArtifactHash, ) -> Result { - let sha256 = sha256.to_string(); + let sha256_str = sha256.to_string(); let mut last_error = None; for mountpoint in self.storage.artifact_storage_paths().await? { - let path = mountpoint.join(&sha256); + let path = mountpoint.join(&sha256_str); match File::open(&path).await { Ok(file) => { info!( &self.log, "Retrieved artifact"; - "sha256" => &sha256, + "sha256" => &sha256_str, "path" => path.as_str(), ); return Ok(file); @@ -195,11 +196,7 @@ impl ArtifactStore { } } } - if let Some(last_error) = last_error { - Err(last_error) - } else { - Err(Error::NotFound { sha256 }) - } + Err(last_error.unwrap_or(Error::NotFound { sha256 })) } /// List operation (served by Sled Agent API) @@ -276,9 +273,8 @@ impl ArtifactStore { /// can be used to write the artifact to all temporary files, then move all /// temporary files to their final paths. /// - /// Most errors during the write process are considered non-fatal errors. - /// All non-fatal errors are logged, and the most recently-seen non-fatal - /// error is returned by [`ArtifactWriter::finalize`]. 
+ /// Most errors during the write process are considered non-fatal errors, + /// which are logged instead of immediately returned. /// /// In this method, possible fatal errors are: /// - No temporary files could be created. @@ -290,7 +286,9 @@ impl ArtifactStore { ) -> Result { let mut files = Vec::new(); let mut last_error = None; + let mut datasets = 0; for mountpoint in self.storage.artifact_storage_paths().await? { + datasets += 1; let temp_dir = mountpoint.join(TEMP_SUBDIR); if let Err(err) = tokio::fs::create_dir(&temp_dir).await { if err.kind() != ErrorKind::AlreadyExists { @@ -330,11 +328,11 @@ impl ArtifactStore { Err(last_error.unwrap_or(Error::NoUpdateDataset)) } else { Ok(ArtifactWriter { + datasets, hasher: Sha256::new(), files, log: self.log.clone(), sha256, - last_error, }) } } @@ -344,7 +342,7 @@ impl ArtifactStore { &self, sha256: ArtifactHash, body: StreamingBody, - ) -> Result<(), Error> { + ) -> Result { self.writer(sha256) .await? .write_stream(body.into_stream().map_err(Error::Body)) @@ -476,11 +474,11 @@ impl DatasetsManager for StorageHandle { /// Abstraction that handles writing to several temporary files. struct ArtifactWriter { + datasets: usize, files: Vec, Utf8PathBuf)>>, hasher: Sha256, log: Logger, sha256: ArtifactHash, - last_error: Option, } impl ArtifactWriter { @@ -490,7 +488,7 @@ impl ArtifactWriter { async fn write_stream( self, stream: impl Stream, Error>>, - ) -> Result<(), Error> { + ) -> Result { let writer = stream .try_fold(self, |mut writer, chunk| async { writer.write(chunk).await?; @@ -502,15 +500,13 @@ impl ArtifactWriter { /// Write `chunk` to all temporary files. /// - /// Errors in this method are considered non-fatal errors. All non-fatal - /// errors are logged, and the most recently-seen non-fatal error is - /// returned by [`ArtifactWriter::finalize`]. - /// - /// If all files have failed, this method returns the most recently-seen - /// non-fatal error as a fatal error. + /// Errors in this method are considered non-fatal errors. All errors + /// are logged. If all files have failed, this method returns the most + /// recently-seen non-fatal error as a fatal error. async fn write(&mut self, chunk: impl AsRef<[u8]>) -> Result<(), Error> { self.hasher.update(&chunk); + let mut last_error = None; for option in &mut self.files { if let Some((mut file, mountpoint)) = option.take() { match file.as_file_mut().write_all(chunk.as_ref()).await { @@ -520,11 +516,7 @@ impl ArtifactWriter { Err(err) => { let path = file.path().to_owned(); log_and_store!( - self.last_error, - &self.log, - "write to", - path, - err + last_error, &self.log, "write to", path, err ); // `file` and `final_path` are dropped here, cleaning up // the file @@ -535,7 +527,7 @@ impl ArtifactWriter { self.files.retain(Option::is_some); if self.files.is_empty() { - Err(self.last_error.take().unwrap_or(Error::NoUpdateDataset)) + Err(last_error.unwrap_or(Error::NoUpdateDataset)) } else { Ok(()) } @@ -543,10 +535,10 @@ impl ArtifactWriter { /// Rename all files to their final paths. /// - /// Errors in this method are considered non-fatal errors, but this method - /// will return the most recently-seen error by any method in the write - /// process. - async fn finalize(mut self) -> Result<(), Error> { + /// Errors in this method are considered non-fatal errors. If all files have + /// failed in some way, the most recently-seen error is returned as a fatal + /// error. 
+ async fn finalize(self) -> Result { let digest = self.hasher.finalize(); if digest.as_slice() != self.sha256.as_ref() { return Err(Error::HashMismatch { @@ -555,12 +547,13 @@ impl ArtifactWriter { }); } - let mut any_success = false; + let mut last_error = None; + let mut successful_writes = 0; for (mut file, mountpoint) in self.files.into_iter().flatten() { // 1. fsync the temporary file. if let Err(err) = file.as_file_mut().sync_all().await { let path = file.path().to_owned(); - log_and_store!(self.last_error, &self.log, "sync", path, err); + log_and_store!(last_error, &self.log, "sync", path, err); continue; } // 2. Open the parent directory so we can fsync it. @@ -568,11 +561,7 @@ impl ArtifactWriter { Ok(dir) => dir, Err(err) => { log_and_store!( - self.last_error, - &self.log, - "open", - mountpoint, - err + last_error, &self.log, "open", mountpoint, err ); continue; } @@ -592,7 +581,7 @@ impl ArtifactWriter { "from" => err.file.path().as_str(), "to" => final_path.as_str(), ); - self.last_error = Some(Error::FileRename { + last_error = Some(Error::FileRename { from: err.file.path().to_owned(), to: final_path, err: err.error, @@ -601,30 +590,27 @@ impl ArtifactWriter { } // 4. fsync the parent directory. if let Err(err) = parent_dir.sync_all().await { - log_and_store!( - self.last_error, - &self.log, - "sync", - mountpoint, - err - ); + log_and_store!(last_error, &self.log, "sync", mountpoint, err); continue; } - any_success = true; + successful_writes += 1; } - if let Some(last_error) = self.last_error { - Err(last_error) - } else if any_success { + if successful_writes > 0 { info!( &self.log, "Wrote artifact"; "sha256" => &self.sha256.to_string(), + "datasets" => self.datasets, + "successful_writes" => successful_writes, ); - Ok(()) + Ok(ArtifactPutResponse { + datasets: self.datasets, + successful_writes, + }) } else { - Err(Error::NoUpdateDataset) + Err(last_error.unwrap_or(Error::NoUpdateDataset)) } } } @@ -692,7 +678,7 @@ pub(crate) enum Error { Join(#[from] tokio::task::JoinError), #[error("Artifact {sha256} not found")] - NotFound { sha256: String }, + NotFound { sha256: ArtifactHash }, #[error("No update datasets present")] NoUpdateDataset, diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 3661c347df..cd3afc44ac 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -11,10 +11,10 @@ use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; use display_error_chain::DisplayErrorChain; use dropshot::{ - ApiDescription, Body, FreeformBody, HttpError, HttpResponseCreated, - HttpResponseDeleted, HttpResponseHeaders, HttpResponseOk, - HttpResponseUpdatedNoContent, Path, Query, RequestContext, StreamingBody, - TypedBody, + ApiDescription, Body, FreeformBody, HttpError, HttpResponseAccepted, + HttpResponseCreated, HttpResponseDeleted, HttpResponseHeaders, + HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, + StreamingBody, TypedBody, }; use nexus_sled_agent_shared::inventory::{ Inventory, OmicronZonesConfig, SledRole, @@ -412,7 +412,8 @@ impl SledAgentApi for SledAgentImpl { rqctx: RequestContext, path_params: Path, body: TypedBody, - ) -> Result { + ) -> Result, HttpError> + { let sha256 = path_params.into_inner().sha256; let depot_base_url = body.into_inner().depot_base_url; rqctx @@ -420,17 +421,18 @@ impl SledAgentApi for SledAgentImpl { .artifact_store() .copy_from_depot(sha256, &depot_base_url) .await?; - Ok(HttpResponseUpdatedNoContent()) + 
Ok(HttpResponseAccepted(ArtifactCopyFromDepotResponse {})) } async fn artifact_put( rqctx: RequestContext, path_params: Path, body: StreamingBody, - ) -> Result { + ) -> Result, HttpError> { let sha256 = path_params.into_inner().sha256; - rqctx.context().artifact_store().put_body(sha256, body).await?; - Ok(HttpResponseUpdatedNoContent()) + Ok(HttpResponseOk( + rqctx.context().artifact_store().put_body(sha256, body).await?, + )) } async fn artifact_delete( diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index daea732940..fdffb249cf 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -10,6 +10,7 @@ use dropshot::endpoint; use dropshot::ApiDescription; use dropshot::FreeformBody; use dropshot::HttpError; +use dropshot::HttpResponseAccepted; use dropshot::HttpResponseCreated; use dropshot::HttpResponseDeleted; use dropshot::HttpResponseHeaders; @@ -192,7 +193,8 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, path_params: Path, body: TypedBody, - ) -> Result { + ) -> Result, HttpError> + { let sha256 = path_params.into_inner().sha256; let depot_base_url = body.into_inner().depot_base_url; rqctx @@ -200,17 +202,18 @@ impl SledAgentApi for SledAgentSimImpl { .artifact_store() .copy_from_depot(sha256, &depot_base_url) .await?; - Ok(HttpResponseUpdatedNoContent()) + Ok(HttpResponseAccepted(ArtifactCopyFromDepotResponse {})) } async fn artifact_put( rqctx: RequestContext, path_params: Path, body: StreamingBody, - ) -> Result { + ) -> Result, HttpError> { let sha256 = path_params.into_inner().sha256; - rqctx.context().artifact_store().put_body(sha256, body).await?; - Ok(HttpResponseUpdatedNoContent()) + Ok(HttpResponseOk( + rqctx.context().artifact_store().put_body(sha256, body).await?, + )) } async fn artifact_delete( From 4d00c16a363b2d6d3782e05689c2272b3955616b Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Thu, 31 Oct 2024 19:01:11 +0000 Subject: [PATCH 28/28] document ArtifactPutResponse fields --- openapi/sled-agent.json | 2 ++ sled-agent/api/src/lib.rs | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index d526943231..b79cb467ec 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -1578,11 +1578,13 @@ "type": "object", "properties": { "datasets": { + "description": "The number of valid M.2 artifact datasets we found on the sled. There is typically one of these datasets for each functional M.2.", "type": "integer", "format": "uint", "minimum": 0 }, "successful_writes": { + "description": "The number of valid writes to the M.2 artifact datasets. This should be less than or equal to the number of artifact datasets.", "type": "integer", "format": "uint", "minimum": 0 diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index bdf792e943..b5608602f2 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -602,7 +602,12 @@ pub struct ArtifactCopyFromDepotResponse {} #[derive(Debug, Serialize, JsonSchema)] pub struct ArtifactPutResponse { + /// The number of valid M.2 artifact datasets we found on the sled. There is + /// typically one of these datasets for each functional M.2. pub datasets: usize, + + /// The number of valid writes to the M.2 artifact datasets. This should be + /// less than or equal to the number of artifact datasets. pub successful_writes: usize, }
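Taken together, the final shape of the PUT response lets a caller distinguish a fully replicated write from a degraded one. A hypothetical caller-side check, not part of these patches (the struct is restated locally so the snippet stands alone):

    // Local mirror of the response type added above, so this compiles
    // on its own.
    struct ArtifactPutResponse {
        datasets: usize,
        successful_writes: usize,
    }

    // Hypothetical policy: a successful PUT with fewer successful
    // writes than datasets means the artifact landed on at least one
    // M.2 artifact dataset, but not all of them.
    fn write_is_degraded(resp: &ArtifactPutResponse) -> bool {
        resp.successful_writes < resp.datasets
    }

    fn main() {
        let resp = ArtifactPutResponse { datasets: 2, successful_writes: 1 };
        assert!(write_is_degraded(&resp));
    }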