Skip to content

Commit

Permalink
add monitoring for open file descriptors stat
Browse files Browse the repository at this point in the history
  • Loading branch information
HaoranYi committed Nov 23, 2022
1 parent e87ce35 commit b49d0d3
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ log = "0.4.17"
lru = "0.7.7"
min-max-heap = "1.3.0"
num_enum = "0.5.7"
procfs = "0.14.1"
rand = "0.7.0"
rand_chacha = "0.2.2"
rayon = "1.5.3"
Expand Down
43 changes: 43 additions & 0 deletions core/src/system_monitor_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use num_enum::{IntoPrimitive, TryFromPrimitive};
#[cfg(target_os = "linux")]
use std::{fs::File, io::BufReader};
use {
procfs::process::{LimitValue, Process},
solana_sdk::timing::AtomicInterval,
std::{
collections::HashMap,
Expand All @@ -29,6 +30,7 @@ const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H;
const SAMPLE_INTERVAL_MEM_MS: u64 = MS_PER_S;
const SAMPLE_INTERVAL_CPU_MS: u64 = 10 * MS_PER_S;
const SAMPLE_INTERVAL_DISK_MS: u64 = MS_PER_S;
const SAMPLE_INTERVAL_OPEN_FD_MS: u64 = 30 * MS_PER_S;
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);

#[cfg(target_os = "linux")]
Expand Down Expand Up @@ -392,6 +394,7 @@ impl SystemMonitorService {
report_os_network_stats: bool,
report_os_cpu_stats: bool,
report_os_disk_stats: bool,
report_os_open_fd_stats: bool,
) -> Self {
info!("Starting SystemMonitorService");
let thread_hdl = Builder::new()
Expand All @@ -403,6 +406,7 @@ impl SystemMonitorService {
report_os_network_stats,
report_os_cpu_stats,
report_os_disk_stats,
report_os_open_fd_stats,
);
})
.unwrap();
Expand Down Expand Up @@ -832,6 +836,40 @@ impl SystemMonitorService {
Self::report_cpuid_values();
}

fn get_open_fd_stats() -> Option<(usize, usize, usize)> {
let proc = Process::myself().ok()?;
let curr_num_open_fd = proc.fd_count().unwrap();
let max_open_fd_limit = proc.limits().unwrap().max_open_files;

let max_open_fd_soft_limit = match max_open_fd_limit.soft_limit {
LimitValue::Unlimited => usize::MAX,
LimitValue::Value(x) => x as usize,
};
let max_open_fd_hard_limit = match max_open_fd_limit.hard_limit {
LimitValue::Unlimited => usize::MAX,
LimitValue::Value(x) => x as usize,
};

Some((
curr_num_open_fd,
max_open_fd_soft_limit,
max_open_fd_hard_limit,
))
}

/// Emits an `open-fd-stats` datapoint with the current number of open file
/// descriptors and the soft/hard limits, all tagged as `i64` fields.
///
/// Does nothing when `get_open_fd_stats` returns `None`.
fn report_open_fd_stats() {
    let (open_fds, soft_limit, hard_limit) = match Self::get_open_fd_stats() {
        Some(stats) => stats,
        // Stats unavailable: skip this sample.
        None => return,
    };

    // NOTE(review): limits may be `usize::MAX` (unlimited); confirm how the
    // metrics macro converts such a value to `i64`.
    datapoint_info!(
        "open-fd-stats",
        ("number_open_files", open_fds, i64),
        ("max_open_files_hard_limit", hard_limit, i64),
        ("max_open_files_soft_limit", soft_limit, i64),
    );
}

#[cfg(target_os = "linux")]
fn process_disk_stats(disk_stats: &mut Option<DiskStats>) {
match read_disk_stats() {
Expand Down Expand Up @@ -973,6 +1011,7 @@ impl SystemMonitorService {
report_os_network_stats: bool,
report_os_cpu_stats: bool,
report_os_disk_stats: bool,
report_os_open_fd_stats: bool,
) {
let mut udp_stats = None;
let mut disk_stats = None;
Expand All @@ -981,6 +1020,7 @@ impl SystemMonitorService {
let mem_timer = AtomicInterval::default();
let cpu_timer = AtomicInterval::default();
let disk_timer = AtomicInterval::default();
let open_fd_timer = AtomicInterval::default();

loop {
if exit.load(Ordering::Relaxed) {
Expand All @@ -1003,6 +1043,9 @@ impl SystemMonitorService {
if report_os_disk_stats && disk_timer.should_update(SAMPLE_INTERVAL_DISK_MS) {
Self::process_disk_stats(&mut disk_stats);
}
if report_os_open_fd_stats && open_fd_timer.should_update(SAMPLE_INTERVAL_OPEN_FD_MS) {
Self::report_open_fd_stats();
}
sleep(SLEEP_INTERVAL);
}
}
Expand Down
3 changes: 3 additions & 0 deletions core/src/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ pub struct ValidatorConfig {
pub no_os_network_stats_reporting: bool,
pub no_os_cpu_stats_reporting: bool,
pub no_os_disk_stats_reporting: bool,
pub no_os_open_fd_stats_reporting: bool,
pub poh_pinned_cpu_core: usize,
pub poh_hashes_per_batch: u64,
pub process_ledger_before_services: bool,
Expand Down Expand Up @@ -218,6 +219,7 @@ impl Default for ValidatorConfig {
no_os_network_stats_reporting: true,
no_os_cpu_stats_reporting: true,
no_os_disk_stats_reporting: true,
no_os_open_fd_stats_reporting: true,
poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE,
poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH,
process_ledger_before_services: false,
Expand Down Expand Up @@ -500,6 +502,7 @@ impl Validator {
!config.no_os_network_stats_reporting,
!config.no_os_cpu_stats_reporting,
!config.no_os_disk_stats_reporting,
!config.no_os_open_fd_stats_reporting,
));

let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded();
Expand Down
1 change: 1 addition & 0 deletions ledger-tool/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2711,6 +2711,7 @@ fn main() {
false,
false,
false,
false,
);

accounts_index_config.index_limit_mb = if let Some(limit) =
Expand Down
1 change: 1 addition & 0 deletions local-cluster/src/validator_configs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
no_os_network_stats_reporting: config.no_os_network_stats_reporting,
no_os_cpu_stats_reporting: config.no_os_cpu_stats_reporting,
no_os_disk_stats_reporting: config.no_os_disk_stats_reporting,
no_os_open_fd_stats_reporting: config.no_os_open_fd_stats_reporting,
poh_pinned_cpu_core: config.poh_pinned_cpu_core,
account_indexes: config.account_indexes.clone(),
accounts_db_caching_enabled: config.accounts_db_caching_enabled,
Expand Down
1 change: 1 addition & 0 deletions programs/sbf/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions validator/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,11 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> {
.long("no-os-disk-stats-reporting")
.help("Disable reporting of OS disk statistics.")
)
.arg(
Arg::with_name("no_os_open_fd_stats_reporting")
.long("no-os-open-fd-stats-reporting")
.help("Disable reporting of open file descriptors statistics for current process.")
)
.arg(
Arg::with_name("accounts-hash-interval-slots")
.long("accounts-hash-interval-slots")
Expand Down
1 change: 1 addition & 0 deletions validator/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,7 @@ pub fn main() {
no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"),
no_os_cpu_stats_reporting: matches.is_present("no_os_cpu_stats_reporting"),
no_os_disk_stats_reporting: matches.is_present("no_os_disk_stats_reporting"),
no_os_open_fd_stats_reporting: matches.is_present("no_os_open_fd_stats_reporting"),
poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core")
.unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE),
poh_hashes_per_batch: value_of(&matches, "poh_hashes_per_batch")
Expand Down

0 comments on commit b49d0d3

Please sign in to comment.