Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[repo depot 2/n] sled agent APIs to manage update artifact storage #6764

Merged
merged 33 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
a5052d0
sled agent APIs to manage update artifact storage
iliana Oct 2, 2024
ce1bc42
fn datasets -> fn dataset_mountpoints
iliana Oct 4, 2024
5ad16a1
be more resilient in the face of io errors
iliana Oct 4, 2024
485ee40
clean up temporary files on startup
iliana Oct 4, 2024
26f4107
naming consistency
iliana Oct 4, 2024
893980e
log.cleanup_successful();
iliana Oct 4, 2024
03a51c7
Merge remote-tracking branch 'origin/main' into iliana/tuf-repo-depot
iliana Oct 15, 2024
efcfb92
document ArtifactStore
iliana Oct 11, 2024
e8b2673
fn put -> put_impl
iliana Oct 11, 2024
3b15f3b
copy_from_depot should take a URL
iliana Oct 15, 2024
c909649
reduce semantic satiation
iliana Oct 15, 2024
4325077
remove default type parameter
iliana Oct 15, 2024
86b8047
StorageBackend -> DatasetsManager; attempt clean up
iliana Oct 15, 2024
599089a
create reqwest client at startup, not on first use
iliana Oct 15, 2024
f624e5d
don't embed source error strings
iliana Oct 15, 2024
b44d06b
fewer contextless errors
iliana Oct 15, 2024
013e67f
another docstring
iliana Oct 15, 2024
5eefb6e
add the repo depot API to api-manifest.toml
iliana Oct 15, 2024
aebfe32
add list artifacts operation
iliana Oct 16, 2024
d743727
Merge remote-tracking branch 'origin/main' into iliana/tuf-repo-depot
iliana Oct 28, 2024
2ae3743
create an update artifact dataset on both M.2s
iliana Oct 28, 2024
45c4cac
PUT artifacts to all artifact datasets
iliana Oct 28, 2024
3c2f866
change list API to return a count of each artifact
iliana Oct 29, 2024
743a67b
make copy_from_depot create a task and return
iliana Oct 29, 2024
58b9cbe
ls-apis expectorate
iliana Oct 29, 2024
6906679
Merge remote-tracking branch 'origin/main' into iliana/tuf-repo-depot
iliana Oct 30, 2024
cd7dc7b
review comments
iliana Oct 30, 2024
e967f02
propagate non-fatal write errors to `finalize()`
iliana Oct 30, 2024
508861d
expectoraaaaate
iliana Oct 30, 2024
9f96f71
Merge remote-tracking branch 'origin/main' into iliana/tuf-repo-depot
iliana Oct 31, 2024
32afac5
improved API responses for PUT/POST
iliana Oct 31, 2024
4d00c16
document ArtifactPutResponse fields
iliana Oct 31, 2024
9b3691a
Merge remote-tracking branch 'origin/main' into iliana/tuf-repo-depot
iliana Oct 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dev-tools/ls-apis/tests/api_dependencies.out
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ Crucible Repair (client: repair-client)
consumed by: crucible-downstairs (crucible/downstairs) via 1 path

Repo Depot API (client: repo-depot-client)
consumed by: omicron-sled-agent (omicron/sled-agent)
consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path

Sled Agent (client: sled-agent-client)
consumed by: dpd (dendrite/dpd) via 1 path
Expand Down
112 changes: 60 additions & 52 deletions sled-agent/src/artifact_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
//! it does not have from another Repo Depot that does have them (at Nexus's
//! direction). This API's implementation is also part of this module.
//!
//! POST, PUT, and DELETE operations are handled by the Sled Agent API.
//! POST, PUT, and DELETE operations are called by Nexus and handled by the Sled
//! Agent API.

use std::collections::BTreeMap;
use std::io::ErrorKind;
Expand Down Expand Up @@ -271,16 +272,23 @@ impl<T: DatasetsManager> ArtifactStore<T> {
}

/// Common implementation for all artifact write operations that creates
/// a temporary file on all datasets.
/// a temporary file on all datasets. Returns an [`ArtifactWriter`] that
/// can be used to write the artifact to all temporary files, then move all
/// temporary files to their final paths.
///
/// Errors are logged and ignored unless a temporary file already exists
/// (another task is writing to this artifact) or no temporary files could
/// be created.
/// Most errors during the write process are considered non-fatal errors.
/// All non-fatal errors are logged, and the most recently-seen non-fatal
/// error is returned by [`ArtifactWriter::finalize`].
///
/// In this method, possible fatal errors are:
/// - No temporary files could be created.
/// - A temporary file already exists (another task is writing to this
/// artifact).
async fn writer(
&self,
sha256: ArtifactHash,
) -> Result<ArtifactWriter, Error> {
let mut inner = Vec::new();
let mut files = Vec::new();
let mut last_error = None;
for mountpoint in self.storage.artifact_storage_paths().await? {
let temp_dir = mountpoint.join(TEMP_SUBDIR);
Expand Down Expand Up @@ -316,16 +324,17 @@ impl<T: DatasetsManager> ArtifactStore<T> {
};
let file = NamedUtf8TempFile::from_parts(file, temp_path);

inner.push(Some((file, mountpoint)));
files.push(Some((file, mountpoint)));
}
if inner.is_empty() {
if files.is_empty() {
Err(last_error.unwrap_or(Error::NoUpdateDataset))
} else {
Ok(ArtifactWriter {
hasher: Sha256::new(),
files: inner,
files,
log: self.log.clone(),
sha256,
last_error,
})
}
}
Expand Down Expand Up @@ -471,9 +480,13 @@ struct ArtifactWriter {
hasher: Sha256,
log: Logger,
sha256: ArtifactHash,
last_error: Option<Error>,
}

impl ArtifactWriter {
/// Calls [`ArtifactWriter::write`] for each chunk in the stream, then
/// [`ArtifactWriter::finalize`]. See the documentation for these functions
/// for error handling information.
async fn write_stream(
self,
stream: impl Stream<Item = Result<impl AsRef<[u8]>, Error>>,
Expand All @@ -487,13 +500,17 @@ impl ArtifactWriter {
writer.finalize().await
}

/// Write `chunk` to all files. If an error occurs, it is logged and the
/// temporary file is dropped. If there are no files left to write to, the
/// most recently-seen error is returned.
/// Write `chunk` to all temporary files.
///
/// Errors in this method are considered non-fatal errors. All non-fatal
/// errors are logged, and the most recently-seen non-fatal error is
/// returned by [`ArtifactWriter::finalize`].
///
/// If all files have failed, this method returns the most recently-seen
/// non-fatal error as a fatal error.
async fn write(&mut self, chunk: impl AsRef<[u8]>) -> Result<(), Error> {
self.hasher.update(&chunk);

let mut last_error = None;
for option in &mut self.files {
if let Some((mut file, mountpoint)) = option.take() {
match file.as_file_mut().write_all(chunk.as_ref()).await {
Expand All @@ -503,7 +520,11 @@ impl ArtifactWriter {
Err(err) => {
let path = file.path().to_owned();
log_and_store!(
last_error, &self.log, "write to", path, err
self.last_error,
&self.log,
"write to",
path,
err
);
// `file` and `final_path` are dropped here, cleaning up
// the file
Expand All @@ -514,16 +535,18 @@ impl ArtifactWriter {

self.files.retain(Option::is_some);
if self.files.is_empty() {
Err(last_error.unwrap_or(Error::NoUpdateDataset))
Err(self.last_error.take().unwrap_or(Error::NoUpdateDataset))
} else {
Ok(())
}
}

/// Rename all files to their final paths. If an error occurs, it is logged.
/// If none of the files are renamed successfully, the most recently-seen
/// error is returned.
async fn finalize(self) -> Result<(), Error> {
/// Rename all files to their final paths.
///
/// Errors in this method are considered non-fatal errors, but this method
/// will return the most recently-seen error by any method in the write
/// process.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, to clarify end-to-end behavior for the PUT operation from Nexus:

  • If we fail to write to either of the M.2s, Nexus will see an error
  • If we successfully write to one of the M.2s, we'll still see an error, but it'll finish the write to one of them?

From Nexus's perspective, it seems like we can't really distinguish between these cases through the PUT API. Do you think this matters?

My concern is mostly "do we keep operating successfully, even in a scenario where we have reduced M.2 capacity".

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW one possible solution here would be to propagate a result back through the API, that indicates:

  • "We wrote to two M.2s successfully"
  • "We wrote to one M.2 successfully, and the other failed (here's the error)"
  • "Neither write completed successfully, here are the errors (or the most recent error)"

Then this decision is punted up to Nexus, and Nexus could decide "at least one write counts as success, but I'll log the error and keep moving".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nexus could call the list API to see if there was a partial success, I suppose; or maybe it's worth instead returning something like {"datasets": 2, "successful_writes": 1} as a non-4xx error?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the actual answer to this question depends on the exact design of how Nexus is going to replicate artifacts across sleds. If Nexus is able to try another sled, maybe this current design is fine. But if it's a saga where the sleds are picked out in advance and there's no retry flow (this seems like a poor design) then it would be better to return OK here; at least there's a copy on this one sled.

Copy link
Collaborator

@smklein smklein Oct 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a hypothetical world where we have all sleds operating with one M.2 - shouldn't this be able to succeed? We are operating at reduced redundancy, but we do have a copy that got successfully written.

(Agreed that we could make all PUT calls query the list API afterwards? But if that's our inclination, this also seems like it should be part of the error result)

basically, I think it's critical for Nexus to be able to distinguish between the cases of:

  • We successfully wrote to at least one M.2, and
  • Every other possible outcome

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll implement the {"datasets": 2, "successful_writes": 1} 200 OK response so that Nexus can make a decision with that information. I don't think it matters to return the error since sled agent is logging all I/O errors it runs into.

async fn finalize(mut self) -> Result<(), Error> {
let digest = self.hasher.finalize();
if digest.as_slice() != self.sha256.as_ref() {
return Err(Error::HashMismatch {
Expand All @@ -532,41 +555,29 @@ impl ArtifactWriter {
});
}

let mut last_error = None;
let mut any_success = false;
for (mut file, mountpoint) in self.files.into_iter().flatten() {
// 1. Open the mountpoint and its temp dir so we can fsync them at
// the end.
// 1. fsync the temporary file.
if let Err(err) = file.as_file_mut().sync_all().await {
let path = file.path().to_owned();
log_and_store!(self.last_error, &self.log, "sync", path, err);
continue;
}
// 2. Open the parent directory so we can fsync it.
let parent_dir = match File::open(&mountpoint).await {
Ok(dir) => dir,
Err(err) => {
log_and_store!(
last_error, &self.log, "open", mountpoint, err
);
continue;
}
};
let temp_dir_path = mountpoint.join(TEMP_SUBDIR);
let temp_dir = match File::open(&temp_dir_path).await {
Ok(dir) => dir,
Err(err) => {
log_and_store!(
last_error,
self.last_error,
&self.log,
"open",
temp_dir_path,
mountpoint,
err
);
continue;
}
};
// 2. fsync the file.
if let Err(err) = file.as_file_mut().sync_all().await {
let path = file.path().to_owned();
log_and_store!(last_error, &self.log, "sync", path, err);
continue;
}
// 3. Rename temporary file.
// 3. Rename the temporary file.
let final_path = mountpoint.join(self.sha256.to_string());
let moved_final_path = final_path.clone();
if let Err(err) = tokio::task::spawn_blocking(move || {
Expand All @@ -581,25 +592,20 @@ impl ArtifactWriter {
"from" => err.file.path().as_str(),
"to" => final_path.as_str(),
);
last_error = Some(Error::FileRename {
self.last_error = Some(Error::FileRename {
from: err.file.path().to_owned(),
to: final_path,
err: err.error,
});
continue;
}
// 4. fsync the parent directory for both the final path and its
// previous path.
// 4. fsync the parent directory.
if let Err(err) = parent_dir.sync_all().await {
log_and_store!(last_error, &self.log, "sync", mountpoint, err);
continue;
}
if let Err(err) = temp_dir.sync_all().await {
log_and_store!(
last_error,
self.last_error,
&self.log,
"sync",
temp_dir_path,
mountpoint,
err
);
continue;
Expand All @@ -608,15 +614,17 @@ impl ArtifactWriter {
any_success = true;
}

if any_success {
if let Some(last_error) = self.last_error {
Err(last_error)
} else if any_success {
info!(
&self.log,
"Wrote artifact";
"sha256" => &self.sha256.to_string(),
);
Ok(())
} else {
Err(last_error.unwrap_or(Error::NoUpdateDataset))
Err(Error::NoUpdateDataset)
}
}
}
Expand Down
Loading