From 502f249904cd4074988a6578cbd78a15c5e81299 Mon Sep 17 00:00:00 2001 From: Brennan Watt Date: Wed, 20 Jul 2022 11:44:36 -0700 Subject: [PATCH] Add proc net dev metrics to net stats (#26603) * Add proc net dev metrics to net stats --- core/src/system_monitor_service.rs | 289 +++++++++++++++++++++++++---- core/src/validator.rs | 6 +- 2 files changed, 260 insertions(+), 35 deletions(-) diff --git a/core/src/system_monitor_service.rs b/core/src/system_monitor_service.rs index c545718b2c43f6..a3973b9bfbe2c0 100644 --- a/core/src/system_monitor_service.rs +++ b/core/src/system_monitor_service.rs @@ -1,5 +1,5 @@ #[cfg(target_os = "linux")] -use std::{fs::File, io::BufReader, path::Path}; +use std::{fs::File, io::BufReader}; use { solana_sdk::timing::AtomicInterval, std::{ @@ -26,6 +26,8 @@ const SLEEP_INTERVAL: Duration = Duration::from_millis(500); #[cfg(target_os = "linux")] const PROC_NET_SNMP_PATH: &str = "/proc/net/snmp"; +#[cfg(target_os = "linux")] +const PROC_NET_DEV_PATH: &str = "/proc/net/dev"; pub struct SystemMonitorService { thread_hdl: JoinHandle<()>, @@ -43,6 +45,50 @@ struct UdpStats { ignored_multi: usize, } +#[derive(Default)] +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] +// These stats are aggregated across all network devices excluding the loopback interface. +struct NetDevStats { + // Number of bytes received + rx_bytes: u64, + // Number of packets received + rx_packets: u64, + // Number of receive errors detected by device driver + rx_errs: u64, + // Number of receive packets dropped by the device driver (not included in error count) + rx_drops: u64, + // Number of receive FIFO buffer errors + rx_fifo: u64, + // Number of receive packet framing errors + rx_frame: u64, + // Number of compressed packets received + rx_compressed: u64, + // Number of multicast frames received by device driver + rx_multicast: u64, + // Number of bytes transmitted + tx_bytes: u64, + // Number of packets transmitted + tx_packets: u64, + // Number of transmit errors detected by device driver + tx_errs: u64, + // Number of transmit packets dropped by device driver + tx_drops: u64, + // Number of transmit FIFO buffer errors + tx_fifo: u64, + // Number of transmit collisions detected + tx_colls: u64, + // Number of transmit carrier losses detected by device driver + tx_carrier: u64, + // Number of compressed packets transmitted + tx_compressed: u64, +} + +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] +struct NetStats { + udp_stats: UdpStats, + net_dev_stats: NetDevStats, +} + struct CpuInfo { cpu_num: u32, cpu_freq_mhz: u64, @@ -75,16 +121,27 @@ fn platform_id() -> String { } #[cfg(target_os = "linux")] -fn read_udp_stats(file_path: impl AsRef) -> Result { - let file = File::open(file_path).map_err(|e| e.to_string())?; - let mut reader = BufReader::new(file); - parse_udp_stats(&mut reader) +fn read_net_stats() -> Result { + let file_path_snmp = PROC_NET_SNMP_PATH; + let file_snmp = File::open(file_path_snmp).map_err(|e| e.to_string())?; + let mut reader_snmp = BufReader::new(file_snmp); + + let file_path_dev = PROC_NET_DEV_PATH; + let file_dev = File::open(file_path_dev).map_err(|e| e.to_string())?; + let mut reader_dev = BufReader::new(file_dev); + + let udp_stats = parse_udp_stats(&mut reader_snmp)?; + let net_dev_stats = parse_net_dev_stats(&mut reader_dev)?; + Ok(NetStats { + udp_stats, + net_dev_stats, + }) } #[cfg_attr(not(target_os = "linux"), allow(dead_code))] -fn parse_udp_stats(reader: &mut impl BufRead) -> Result { +fn parse_udp_stats(reader_snmp: &mut impl BufRead) -> Result { let mut udp_lines = Vec::default(); - for line in reader.lines() { + for line in reader_snmp.lines() { let line = line.map_err(|e| e.to_string())?; if line.starts_with("Udp:") { udp_lines.push(line); @@ -110,18 +167,59 @@ fn parse_udp_stats(reader: &mut impl BufRead) -> Result { .collect(); let stats = UdpStats::from_map(&udp_stats); + Ok(stats) +} + +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] +fn parse_net_dev_stats(reader_dev: &mut impl BufRead) -> Result { + let mut stats = NetDevStats::default(); + for (line_number, line) in reader_dev.lines().enumerate() { + if line_number < 2 { + // Skip first two lines with header information. + continue; + } + + let line = line.map_err(|e| e.to_string())?; + let values: Vec<_> = line.split_ascii_whitespace().collect(); + + if values.len() != 17 { + return Err("parse error, expected exactly 17 stat elements".to_string()); + } + if values[0] == "lo:" { + // Filter out the loopback network interface as we are only concerned with + // external traffic. + continue; + } + + stats.rx_bytes += values[1].parse::().map_err(|e| e.to_string())?; + stats.rx_packets += values[2].parse::().map_err(|e| e.to_string())?; + stats.rx_errs += values[3].parse::().map_err(|e| e.to_string())?; + stats.rx_drops += values[4].parse::().map_err(|e| e.to_string())?; + stats.rx_fifo += values[5].parse::().map_err(|e| e.to_string())?; + stats.rx_frame += values[6].parse::().map_err(|e| e.to_string())?; + stats.rx_compressed += values[7].parse::().map_err(|e| e.to_string())?; + stats.rx_multicast += values[8].parse::().map_err(|e| e.to_string())?; + stats.tx_bytes += values[9].parse::().map_err(|e| e.to_string())?; + stats.tx_packets += values[10].parse::().map_err(|e| e.to_string())?; + stats.tx_errs += values[11].parse::().map_err(|e| e.to_string())?; + stats.tx_drops += values[12].parse::().map_err(|e| e.to_string())?; + stats.tx_fifo += values[13].parse::().map_err(|e| e.to_string())?; + stats.tx_colls += values[14].parse::().map_err(|e| e.to_string())?; + stats.tx_carrier += values[15].parse::().map_err(|e| e.to_string())?; + stats.tx_compressed += values[16].parse::().map_err(|e| e.to_string())?; + } Ok(stats) } #[cfg(target_os = "linux")] -pub fn verify_udp_stats_access() -> Result<(), String> { - read_udp_stats(PROC_NET_SNMP_PATH)?; +pub fn verify_net_stats_access() -> Result<(), String> { + read_net_stats()?; Ok(()) } #[cfg(not(target_os = "linux"))] -pub fn verify_udp_stats_access() -> Result<(), String> { +pub fn verify_net_stats_access() -> Result<(), String> { Ok(()) } @@ -235,68 +333,164 @@ impl SystemMonitorService { } #[cfg(target_os = "linux")] - fn process_udp_stats(udp_stats: &mut Option) { - match read_udp_stats(PROC_NET_SNMP_PATH) { + fn process_net_stats(net_stats: &mut Option) { + match read_net_stats() { Ok(new_stats) => { - if let Some(old_stats) = udp_stats { - Self::report_udp_stats(old_stats, &new_stats); + if let Some(old_stats) = net_stats { + Self::report_net_stats(old_stats, &new_stats); } - *udp_stats = Some(new_stats); + *net_stats = Some(new_stats); } - Err(e) => warn!("read_udp_stats: {}", e), + Err(e) => warn!("read_net_stats: {}", e), } } #[cfg(not(target_os = "linux"))] - fn process_udp_stats(_udp_stats: &mut Option) {} + fn process_net_stats(_net_stats: &mut Option) {} #[cfg(target_os = "linux")] - fn report_udp_stats(old_stats: &UdpStats, new_stats: &UdpStats) { + fn report_net_stats(old_stats: &NetStats, new_stats: &NetStats) { datapoint_info!( "net-stats-validator", ( "in_datagrams_delta", - new_stats.in_datagrams - old_stats.in_datagrams, + new_stats.udp_stats.in_datagrams - old_stats.udp_stats.in_datagrams, i64 ), ( "no_ports_delta", - new_stats.no_ports - old_stats.no_ports, + new_stats.udp_stats.no_ports - old_stats.udp_stats.no_ports, i64 ), ( "in_errors_delta", - new_stats.in_errors - old_stats.in_errors, + new_stats.udp_stats.in_errors - old_stats.udp_stats.in_errors, i64 ), ( "out_datagrams_delta", - new_stats.out_datagrams - old_stats.out_datagrams, + new_stats.udp_stats.out_datagrams - old_stats.udp_stats.out_datagrams, i64 ), ( "rcvbuf_errors_delta", - new_stats.rcvbuf_errors - old_stats.rcvbuf_errors, + new_stats.udp_stats.rcvbuf_errors - old_stats.udp_stats.rcvbuf_errors, i64 ), ( "sndbuf_errors_delta", - new_stats.sndbuf_errors - old_stats.sndbuf_errors, + new_stats.udp_stats.sndbuf_errors - old_stats.udp_stats.sndbuf_errors, i64 ), ( "in_csum_errors_delta", - new_stats.in_csum_errors - old_stats.in_csum_errors, + new_stats.udp_stats.in_csum_errors - old_stats.udp_stats.in_csum_errors, i64 ), ( "ignored_multi_delta", - new_stats.ignored_multi - old_stats.ignored_multi, + new_stats.udp_stats.ignored_multi - old_stats.udp_stats.ignored_multi, + i64 + ), + ("in_errors", new_stats.udp_stats.in_errors, i64), + ("rcvbuf_errors", new_stats.udp_stats.rcvbuf_errors, i64), + ("sndbuf_errors", new_stats.udp_stats.sndbuf_errors, i64), + ( + "rx_bytes_delta", + new_stats + .net_dev_stats + .rx_bytes + .saturating_sub(old_stats.net_dev_stats.rx_bytes), + i64 + ), + ( + "rx_packets_delta", + new_stats + .net_dev_stats + .rx_packets + .saturating_sub(old_stats.net_dev_stats.rx_packets), + i64 + ), + ( + "rx_errs_delta", + new_stats + .net_dev_stats + .rx_errs + .saturating_sub(old_stats.net_dev_stats.rx_errs), + i64 + ), + ( + "rx_drops_delta", + new_stats + .net_dev_stats + .rx_drops + .saturating_sub(old_stats.net_dev_stats.rx_drops), + i64 + ), + ( + "rx_fifo_delta", + new_stats + .net_dev_stats + .rx_fifo + .saturating_sub(old_stats.net_dev_stats.rx_fifo), + i64 + ), + ( + "rx_frame_delta", + new_stats + .net_dev_stats + .rx_frame + .saturating_sub(old_stats.net_dev_stats.rx_frame), + i64 + ), + ( + "tx_bytes_delta", + new_stats + .net_dev_stats + .tx_bytes + .saturating_sub(old_stats.net_dev_stats.tx_bytes), + i64 + ), + ( + "tx_packets_delta", + new_stats + .net_dev_stats + .tx_packets + .saturating_sub(old_stats.net_dev_stats.tx_packets), + i64 + ), + ( + "tx_errs_delta", + new_stats + .net_dev_stats + .tx_errs + .saturating_sub(old_stats.net_dev_stats.tx_errs), + i64 + ), + ( + "tx_drops_delta", + new_stats + .net_dev_stats + .tx_drops + .saturating_sub(old_stats.net_dev_stats.tx_drops), + i64 + ), + ( + "tx_fifo_delta", + new_stats + .net_dev_stats + .tx_fifo + .saturating_sub(old_stats.net_dev_stats.tx_fifo), + i64 + ), + ( + "tx_colls_delta", + new_stats + .net_dev_stats + .tx_colls + .saturating_sub(old_stats.net_dev_stats.tx_colls), i64 ), - ("in_errors", new_stats.in_errors, i64), - ("rcvbuf_errors", new_stats.rcvbuf_errors, i64), - ("sndbuf_errors", new_stats.sndbuf_errors, i64), ); } @@ -399,7 +593,7 @@ impl SystemMonitorService { Self::check_os_network_limits(); } if udp_timer.should_update(SAMPLE_INTERVAL_UDP_MS) { - Self::process_udp_stats(&mut udp_stats); + Self::process_net_stats(&mut udp_stats); } } if report_os_memory_stats && mem_timer.should_update(SAMPLE_INTERVAL_MEM_MS) { @@ -423,7 +617,7 @@ mod tests { #[test] fn test_parse_udp_stats() { - let mut mock_snmp = + const MOCK_SNMP: &[u8] = b"Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 1 64 357 0 2 0 0 0 355 315 0 6 0 0 0 0 0 0 0 Icmp: InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps @@ -436,15 +630,46 @@ Udp: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumE Udp: 27 7 0 30 0 0 0 0 UdpLite: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti UdpLite: 0 0 0 0 0 0 0 0" as &[u8]; + const UNEXPECTED_DATA: &[u8] = b"unexpected data" as &[u8]; + + let mut mock_snmp = MOCK_SNMP; let stats = parse_udp_stats(&mut mock_snmp).unwrap(); assert_eq!(stats.out_datagrams, 30); assert_eq!(stats.no_ports, 7); - let mut mock_snmp = b"unexpected data" as &[u8]; + mock_snmp = UNEXPECTED_DATA; let stats = parse_udp_stats(&mut mock_snmp); assert!(stats.is_err()); } + #[test] + fn test_parse_net_dev_stats() { + const MOCK_DEV: &[u8] = +b"Inter-| Receive | Transmit +face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed +lo: 50 1 0 0 0 0 0 0 100 2 1 0 0 0 0 0 +eno1: 100 1 0 0 0 0 0 0 200 3 2 0 0 0 0 0 +ens4: 400 4 0 1 0 0 0 0 250 5 0 0 0 0 0 0" as &[u8]; + const UNEXPECTED_DATA: &[u8] = b"un +expected +data" as &[u8]; + + let mut mock_dev = MOCK_DEV; + let stats = parse_net_dev_stats(&mut mock_dev).unwrap(); + assert_eq!(stats.rx_bytes, 500); + assert_eq!(stats.rx_packets, 5); + assert_eq!(stats.rx_errs, 0); + assert_eq!(stats.rx_drops, 1); + assert_eq!(stats.tx_bytes, 450); + assert_eq!(stats.tx_packets, 8); + assert_eq!(stats.tx_errs, 2); + assert_eq!(stats.tx_drops, 0); + + let mut mock_dev = UNEXPECTED_DATA; + let stats = parse_net_dev_stats(&mut mock_dev); + assert!(stats.is_err()); + } + #[test] fn test_calc_percent() { assert!(SystemMonitorService::calc_percent(99, 100) < 100.0); diff --git a/core/src/validator.rs b/core/src/validator.rs index c94ae7dff543bd..21e78a294cd46e 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -18,7 +18,7 @@ use { sigverify, snapshot_packager_service::SnapshotPackagerService, stats_reporter_service::StatsReporterService, - system_monitor_service::{verify_udp_stats_access, SystemMonitorService}, + system_monitor_service::{verify_net_stats_access, SystemMonitorService}, tower_storage::TowerStorage, tpu::{Tpu, TpuSockets, DEFAULT_TPU_COALESCE_MS}, tvu::{Tvu, TvuConfig, TvuSockets}, @@ -390,8 +390,8 @@ impl Validator { warn!("vote account: {}", vote_account); if !config.no_os_network_stats_reporting { - verify_udp_stats_access().unwrap_or_else(|err| { - error!("Failed to access UDP stats: {}. Bypass check with --no-os-network-stats-reporting.", err); + verify_net_stats_access().unwrap_or_else(|err| { + error!("Failed to access Network stats: {}. Bypass check with --no-os-network-stats-reporting.", err); abort(); }); }