Skip to content

Commit

Permalink
Create zone bundles from ZFS snapshots
Browse files Browse the repository at this point in the history
- Fixes #4010
- Previously, we copied log files directly out of their original
  locations, which meant we contended with several other components:
  `logadm` rotating the log file; the log archiver moving them to
  longer-term storage; and the program writing to the file itself. This
  commit changes the operation of the bundler to first create a ZFS
  snapshot of the filesystem(s) containing the log files, clone them,
  and then copy files out of the clones. We destroy those clones /
  snapshots after completing, and again when the sled-agent starts, to
  help with crash-safety.
  • Loading branch information
bnaecker committed Oct 6, 2023
1 parent c030293 commit ada2189
Show file tree
Hide file tree
Showing 5 changed files with 743 additions and 127 deletions.
78 changes: 14 additions & 64 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,13 +391,16 @@ pub struct RunningZone {
}

impl RunningZone {
/// The path to the zone's root filesystem (i.e., `/`), within zonepath.
pub const ROOT_FS_PATH: &'static str = "root";

/// Returns the zone name stored on this zone's inner record, borrowed from
/// `self`.
pub fn name(&self) -> &str {
&self.inner.name
}

/// Returns the filesystem path to the zone's root
/// Returns the filesystem path to the zone's root in the GZ.
pub fn root(&self) -> Utf8PathBuf {
self.inner.zonepath.join("root")
self.inner.zonepath.join(Self::ROOT_FS_PATH)
}

pub fn control_interface(&self) -> AddrObject {
Expand Down Expand Up @@ -958,13 +961,11 @@ impl RunningZone {
};
let binary = Utf8PathBuf::from(path);

// Fetch any log files for this SMF service.
let Some((log_file, rotated_log_files)) =
self.service_log_files(&service_name)?
let Some(log_file) = self.service_log_file(&service_name)?
else {
error!(
self.inner.log,
"failed to find log files for existing service";
"failed to find log file for existing service";
"service_name" => &service_name,
);
continue;
Expand All @@ -975,7 +976,6 @@ impl RunningZone {
binary,
pid,
log_file,
rotated_log_files,
});
}
}
Expand All @@ -992,72 +992,24 @@ impl RunningZone {
.collect())
}

/// Return any SMF log files associated with the named service.
/// Return any SMF log file associated with the named service.
///
/// Given a named service, this returns a tuple of the latest or current log
/// file, and an array of any rotated log files. If the service does not
/// exist, or there are no log files, `None` is returned.
pub fn service_log_files(
/// Given a named service, this returns the path of the current log file.
/// This path can be used to find rotated or archived log files, but keep in
/// mind that this returns only the current log file, if it exists.
pub fn service_log_file(
&self,
name: &str,
) -> Result<Option<(Utf8PathBuf, Vec<Utf8PathBuf>)>, ServiceError> {
) -> Result<Option<Utf8PathBuf>, ServiceError> {
let output = self.run_cmd(&["svcs", "-L", name])?;
let mut lines = output.lines();
let Some(current) = lines.next() else {
return Ok(None);
};
// We need to prepend the zonepath root to get the path in the GZ. We
// can do this with `join()`, but that will _replace_ the path if the
// second one is absolute. So trim any prefixed `/` from each path.
let root = self.root();
let current_log_file =
root.join(current.trim().trim_start_matches('/'));

// The rotated log files should have the same prefix as the current, but
// with an index appended. We'll search the parent directory for
// matching names, skipping the current file.
//
// See https://illumos.org/man/8/logadm for details on the naming
// conventions around these files.
let dir = current_log_file.parent().unwrap();
let mut rotated_files: Vec<Utf8PathBuf> = Vec::new();
for entry in dir.read_dir_utf8()? {
let entry = entry?;
let path = entry.path();

// Camino's Utf8Path only considers whole path components to match,
// so convert both paths into a &str and use that object's
// starts_with. See the `camino_starts_with_behaviour` test.
let path_ref: &str = path.as_ref();
let current_log_file_ref: &str = current_log_file.as_ref();
if path != current_log_file
&& path_ref.starts_with(current_log_file_ref)
{
rotated_files.push(path.clone().into());
}
}

Ok(Some((current_log_file, rotated_files)))
return Ok(Some(Utf8PathBuf::from(current.trim())));
}
}

#[test]
fn camino_starts_with_behaviour() {
    // Camino's path `starts_with` only matches on whole path components,
    // while `str::starts_with` matches on raw characters. A rotated log file
    // shares only a character prefix with the current log file, so the two
    // checks are expected to disagree.
    let current =
        Utf8PathBuf::from("/zonepath/var/svc/log/oxide-nexus:default.log");
    let rotated =
        Utf8PathBuf::from("/zonepath/var/svc/log/oxide-nexus:default.log.0");

    let current_str: &str = current.as_ref();
    let rotated_str: &str = rotated.as_ref();

    // The two paths differ, both as paths and as strings.
    assert_ne!(current, rotated);
    assert_ne!(current_str, rotated_str);

    // Component-wise comparison: the current file is not a prefix.
    assert!(!rotated.starts_with(&current));
    // Character-wise comparison: the current file is a prefix.
    assert!(rotated_str.starts_with(current_str));
}

impl Drop for RunningZone {
fn drop(&mut self) {
if let Some(_) = self.id.take() {
Expand Down Expand Up @@ -1088,8 +1040,6 @@ pub struct ServiceProcess {
pub pid: u32,
/// The path for the current log file.
pub log_file: Utf8PathBuf,
/// The paths for any rotated log files.
pub rotated_log_files: Vec<Utf8PathBuf>,
}

/// Errors returned from [`InstalledZone::install`].
Expand Down
146 changes: 146 additions & 0 deletions illumos-utils/src/zfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ use std::fmt;
pub const ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT: &str = "/zone";
pub const ZONE_ZFS_RAMDISK_DATASET: &str = "rpool/zone";

/// The name of a dataset used for creating zone bundles.
///
/// See `sled_agent/src/zone_bundle.rs` for details on the purpose and use of
/// this dataset.
pub const ZONE_BUNDLE_ZFS_DATASET: &str = "rpool/oxide-sled-agent-zone-bundle";

pub const ZFS: &str = "/usr/sbin/zfs";
pub const KEYPATH_ROOT: &str = "/var/run/oxide/";

Expand Down Expand Up @@ -105,6 +111,35 @@ pub struct GetValueError {
err: GetValueErrorRaw,
}

/// Error returned when listing ZFS snapshots fails.
///
/// Wraps the underlying command-execution error, from which it can be
/// converted directly thanks to the `#[from]` attribute.
#[derive(Debug, thiserror::Error)]
#[error("Failed to list snapshots: {0}")]
pub struct ListSnapshotsError(#[from] crate::ExecutionError);

/// Error returned when creating a ZFS snapshot fails.
#[derive(Debug, thiserror::Error)]
#[error("Failed to create snapshot '{snap_name}' from filesystem '{filesystem}': {err}")]
pub struct CreateSnapshotError {
/// Name of the filesystem that was being snapshotted.
filesystem: String,
/// Name requested for the snapshot.
snap_name: String,
/// The underlying command-execution error.
err: crate::ExecutionError,
}

/// Error returned when destroying a ZFS snapshot fails.
///
/// The snapshot is reported in the message as `filesystem@snap_name`.
#[derive(Debug, thiserror::Error)]
#[error("Failed to delete snapshot '{filesystem}@{snap_name}': {err}")]
pub struct DestroySnapshotError {
/// Name of the filesystem the snapshot belongs to.
filesystem: String,
/// Name of the snapshot that was to be destroyed.
snap_name: String,
/// The underlying command-execution error.
err: crate::ExecutionError,
}

/// Error returned when cloning a ZFS snapshot fails.
///
/// The source snapshot is reported in the message as `filesystem@snap_name`.
#[derive(Debug, thiserror::Error)]
#[error("Failed to create clone '{clone_name}' from snapshot '{filesystem}@{snap_name}': {err}")]
pub struct CloneSnapshotError {
/// Name of the filesystem the source snapshot belongs to.
filesystem: String,
/// Name of the source snapshot.
snap_name: String,
/// Name requested for the clone.
clone_name: String,
/// The underlying command-execution error.
err: crate::ExecutionError,
}

/// Wraps commands for interacting with ZFS.
pub struct Zfs {}

Expand Down Expand Up @@ -181,6 +216,20 @@ impl Zfs {
Ok(filesystems)
}

/// Look up the dataset name for a ZFS object.
///
/// `object` may be a dataset name or a filesystem path. A path is resolved
/// to the _mounted_ ZFS dataset that contains it.
///
/// The name is taken from the trimmed standard output of
/// `zfs get -Hpo name name <object>`. If the command fails, a
/// [`ListDatasetsError`] naming `object` is returned.
pub fn get_dataset_name(object: &str) -> Result<String, ListDatasetsError> {
    let mut zfs = std::process::Command::new(ZFS);
    let cmd = zfs.args(&["get", "-Hpo", "name", "name", object]);
    match execute(cmd) {
        Ok(output) => {
            let stdout = String::from_utf8_lossy(&output.stdout);
            Ok(stdout.trim().to_string())
        }
        Err(err) => Err(ListDatasetsError { name: object.to_string(), err }),
    }
}

/// Destroys a dataset.
pub fn destroy_dataset(name: &str) -> Result<(), DestroyDatasetError> {
let mut command = std::process::Command::new(PFEXEC);
Expand Down Expand Up @@ -353,6 +402,7 @@ impl Zfs {
}
}

/// Set the value of an Oxide-managed ZFS property.
pub fn set_oxide_value(
filesystem_name: &str,
name: &str,
Expand All @@ -378,6 +428,7 @@ impl Zfs {
Ok(())
}

/// Get the value of an Oxide-managed ZFS property.
pub fn get_oxide_value(
filesystem_name: &str,
name: &str,
Expand Down Expand Up @@ -408,6 +459,96 @@ impl Zfs {
}
Ok(value.to_string())
}

/// List all extant snapshots.
///
/// Runs `zfs list -H -o name -t snapshot` and parses each line of its
/// output, expected to have the form `filesystem@snap_name`, into a
/// [`Snapshot`].
///
/// Lines without an `@` separator are skipped, so unexpected command output
/// cannot panic the caller.
///
/// # Errors
///
/// Returns a [`ListSnapshotsError`] if the command itself fails to execute.
pub fn list_snapshots() -> Result<Vec<Snapshot>, ListSnapshotsError> {
    let mut command = std::process::Command::new(ZFS);
    let cmd = command.args(&["list", "-H", "-o", "name", "-t", "snapshot"]);
    let output = execute(cmd).map_err(ListSnapshotsError::from)?;
    let stdout = String::from_utf8_lossy(&output.stdout);
    let snapshots = stdout
        .trim()
        .lines()
        .filter_map(|line| {
            // Split on the first `@` only: everything before it is the
            // filesystem, everything after it is the snapshot name.
            let (filesystem, snap_name) = line.split_once('@')?;
            Some(Snapshot {
                filesystem: filesystem.to_string(),
                snap_name: snap_name.to_string(),
            })
        })
        .collect();
    Ok(snapshots)
}

/// Create a snapshot of a filesystem.
///
/// The snapshot is created as `{filesystem}@{snap_name}` by running
/// `zfs snapshot`.
///
/// # Errors
///
/// Returns a [`CreateSnapshotError`] carrying both names if the command
/// fails.
pub fn create_snapshot(
    filesystem: &str,
    snap_name: &str,
) -> Result<(), CreateSnapshotError> {
    let snapshot = format!("{filesystem}@{snap_name}");
    let mut zfs = std::process::Command::new(ZFS);
    let cmd = zfs.args(&["snapshot", &snapshot]);
    match execute(cmd) {
        Ok(_) => Ok(()),
        Err(err) => Err(CreateSnapshotError {
            filesystem: filesystem.to_string(),
            snap_name: snap_name.to_string(),
            err,
        }),
    }
}

/// Destroy a named snapshot of a filesystem.
///
/// Runs `zfs destroy` against `{filesystem}@{snap_name}`.
///
/// # Errors
///
/// Returns a [`DestroySnapshotError`] carrying both names if the command
/// fails.
pub fn destroy_snapshot(
    filesystem: &str,
    snap_name: &str,
) -> Result<(), DestroySnapshotError> {
    let snapshot = format!("{filesystem}@{snap_name}");
    let mut zfs = std::process::Command::new(ZFS);
    let cmd = zfs.args(&["destroy", &snapshot]);
    if let Err(err) = execute(cmd) {
        return Err(DestroySnapshotError {
            filesystem: filesystem.to_string(),
            snap_name: snap_name.to_string(),
            err,
        });
    }
    Ok(())
}

/// Create a clone of a snapshot.
///
/// Runs `zfs clone`, with `{filesystem}@{snap_name}` as the source snapshot
/// and `clone_name` as the target.
///
/// # Errors
///
/// Returns a [`CloneSnapshotError`] carrying all three names if the command
/// fails.
pub fn clone_snapshot(
    filesystem: &str,
    snap_name: &str,
    clone_name: &str,
) -> Result<(), CloneSnapshotError> {
    let source = format!("{filesystem}@{snap_name}");
    let mut zfs = std::process::Command::new(ZFS);
    let cmd = zfs.args(&["clone", &source, clone_name]);
    match execute(cmd) {
        Ok(_) => Ok(()),
        Err(err) => Err(CloneSnapshotError {
            filesystem: filesystem.to_string(),
            snap_name: snap_name.to_string(),
            clone_name: clone_name.to_string(),
            err,
        }),
    }
}
}

/// A read-only snapshot of a ZFS filesystem.
///
/// Formats as `filesystem@snap_name` through its `Display` implementation.
#[derive(Clone, Debug)]
pub struct Snapshot {
    /// Name of the filesystem the snapshot belongs to (the part before `@`).
    pub filesystem: String,
    /// Name of the snapshot itself (the part after `@`).
    pub snap_name: String,
}

impl fmt::Display for Snapshot {
    /// Writes the snapshot as `filesystem@snap_name`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Self { filesystem, snap_name } = self;
        write!(f, "{filesystem}@{snap_name}")
    }
}

/// A clone of a ZFS snapshot.
///
/// Pairs a [`Snapshot`] with the name of a clone.
#[derive(Clone, Debug)]
pub struct ZfsClone {
/// The snapshot associated with this clone; presumably the one it was
/// cloned from (TODO confirm against callers).
pub snapshot: Snapshot,
/// The name of the clone; presumably the dataset name given to `zfs clone`
/// (TODO confirm against callers).
pub clone_name: String,
}

/// Returns all datasets managed by Omicron
Expand Down Expand Up @@ -444,5 +585,10 @@ pub fn get_all_omicron_datasets_for_delete() -> anyhow::Result<Vec<String>> {
}
};

// Delete the zone-bundle dataset, if it exists.
if let Ok(zb_dataset) = Zfs::get_dataset_name(&ZONE_BUNDLE_ZFS_DATASET) {
datasets.push(zb_dataset);
}

Ok(datasets)
}
Binary file not shown.
25 changes: 25 additions & 0 deletions sled-agent/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Tools for producing metrics from the sled-agent for `oximeter` to collect.
use oximeter_instruments::kstat::foo;
use oximeter_producer::Server as ProducerServer;

/// Holds the pieces the sled-agent uses to produce metrics.
///
/// NOTE(review): nothing in this file constructs or uses this type yet; the
/// field descriptions below are inferred from the field types and should be
/// confirmed.
pub struct MetricsManager {
// The `oximeter_producer` server; presumably what `oximeter` collects from.
producer_server: ProducerServer,
// The kstat sampler; presumably the source of kstat-based metrics.
kstats: foo::KstatSampler,
}

// Spawn a task to manage all the metrics; callers can send it messages to
// register new kstat-based targets / metrics.
//
// Questions:
//
// - How do we "unregister" anything? Do we? Or maybe just detect when the kstat
// is gone? That'll work for VNICs, and we don't want to unregister physical
// link stats.
//
// - Does this produce _all_ metrics for the sled agent? That makes the
// interface trickier, if it's using message-passing.
Loading

0 comments on commit ada2189

Please sign in to comment.