From 13f41155f2f7609149350473d36272a8c32a0df2 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Wed, 11 Dec 2024 13:00:29 -0800 Subject: [PATCH] Read CPU percentage from cgroup data (#1140) ### What does this PR do? This commit expands the cgroup sub-observer to inspect the CPU consumption of the target group, calculating a percentage utilization from this. ### Related issues Resolves SMPTNG-385 --- lading/src/observer/linux/cgroup.rs | 8 +- lading/src/observer/linux/cgroup/v2.rs | 2 + lading/src/observer/linux/cgroup/v2/cpu.rs | 104 +++++++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 lading/src/observer/linux/cgroup/v2/cpu.rs diff --git a/lading/src/observer/linux/cgroup.rs b/lading/src/observer/linux/cgroup.rs index 94c2a2254..99065720f 100644 --- a/lading/src/observer/linux/cgroup.rs +++ b/lading/src/observer/linux/cgroup.rs @@ -114,7 +114,13 @@ impl Sampler { ); if let Err(err) = v2::poll(&cgroup_path, &self.labels).await { error!( - "Unable to poll cgroup metrics for {path}: {err}", + "Unable to poll cgroup memory metrics for {path}: {err}", + path = cgroup_path.to_string_lossy() + ); + } + if let Err(err) = v2::cpu::poll(&cgroup_path, &self.labels).await { + error!( + "Unable to poll cgroup CPU metrics for {path}: {err}", path = cgroup_path.to_string_lossy() ); } diff --git a/lading/src/observer/linux/cgroup/v2.rs b/lading/src/observer/linux/cgroup/v2.rs index b00f205d5..173bf0eed 100644 --- a/lading/src/observer/linux/cgroup/v2.rs +++ b/lading/src/observer/linux/cgroup/v2.rs @@ -1,3 +1,5 @@ +pub(crate) mod cpu; + use core::f64; use std::{ io, diff --git a/lading/src/observer/linux/cgroup/v2/cpu.rs b/lading/src/observer/linux/cgroup/v2/cpu.rs new file mode 100644 index 000000000..f83b16d92 --- /dev/null +++ b/lading/src/observer/linux/cgroup/v2/cpu.rs @@ -0,0 +1,104 @@ +use metrics::gauge; +use once_cell::sync::OnceCell; +use std::path::Path; +use std::sync::Mutex; +use std::time::Instant; +use tokio::fs; + +#[derive(thiserror::Error, Debug)] +/// Errors produced by functions in this module +pub(crate) enum Error { + /// Wrapper for [`std::io::Error`] + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + #[error("Float Parsing: {0}")] + ParseFloat(#[from] std::num::ParseFloatError), + #[error("Integer Parsing: {0}")] + ParseInt(#[from] std::num::ParseIntError), +} + +struct PrevStats { + usage_usec: u64, + user_usec: u64, + system_usec: u64, + last_instant: Instant, +} + +static PREV: OnceCell> = OnceCell::new(); + +// Read cgroup CPU data and calculate a percentage of usage. +pub(crate) async fn poll(group_prefix: &Path, labels: &[(String, String)]) -> Result<(), Error> { + // Read cpu.max (cgroup v2) + let cpu_max = fs::read_to_string(group_prefix.join("cpu.max")).await?; + let parts: Vec<&str> = cpu_max.split_whitespace().collect(); + let (max_str, period_str) = (parts[0], parts[1]); + let allowed_cores = if max_str == "max" { + // If the target cgroup has no CPU limit we assume it has access to all + // physical cores. + num_cpus::get_physical() as f64 + } else { + let max_val = max_str.parse::()?; + let period_val = period_str.parse::()?; + max_val / period_val + }; + + // Read cpu.stat + let cpu_stat = fs::read_to_string(group_prefix.join("cpu.stat")).await?; + let mut usage_usec = 0u64; + let mut user_usec = 0u64; + let mut system_usec = 0u64; + + for line in cpu_stat.lines() { + let mut parts = line.split_whitespace(); + let key = parts.next().expect("no key"); + let value: u64 = parts.next().expect("no value").parse()?; + match key { + "usage_usec" => usage_usec = value, + "user_usec" => user_usec = value, + "system_usec" => system_usec = value, + _ => {} + } + } + + // Get or initialize the previous stats. Note that the first time this is + // initialized we intentionally set last_instance to now to avoid scheduling + // shenanigans. + let now = Instant::now(); + let mut prev = PREV + .get_or_init(|| { + Mutex::new(PrevStats { + usage_usec, + user_usec, + system_usec, + last_instant: now, + }) + }) + .lock() + .expect("could not lock stats, poisoned by my constituents"); + + let delta_time = now.duration_since(prev.last_instant).as_micros(); + let delta_usage = usage_usec.saturating_sub(prev.usage_usec); + let delta_user = user_usec.saturating_sub(prev.user_usec); + let delta_system = system_usec.saturating_sub(prev.system_usec); + + // Update previous stats and if there's a time delta calculate the CPU + // usage. + prev.usage_usec = usage_usec; + prev.user_usec = user_usec; + prev.system_usec = system_usec; + prev.last_instant = now; + if delta_time > 0 { + let delta_time = delta_time as f64; + + // Compute CPU usage as a percentage of the cgroup CPU allocation + let total_cpu = (delta_usage as f64 / delta_time) / allowed_cores * 100.0; + let user_cpu = (delta_user as f64 / delta_time) / allowed_cores * 100.0; + let system_cpu = (delta_system as f64 / delta_time) / allowed_cores * 100.0; + + gauge!("cgroup_total_cpu_percentage", labels).set(total_cpu); + gauge!("cgroup_user_cpu_percentage", labels).set(user_cpu); + gauge!("cgroup_system_cpu_percentage", labels).set(system_cpu); + } + + Ok(()) +}