From ba640fdf65403a5e79e11fc0138440ae64026854 Mon Sep 17 00:00:00 2001 From: yuanchao Date: Wed, 23 Oct 2024 15:03:53 +0800 Subject: [PATCH] feat: memory limit supports metric available --- agent/src/config/config.rs | 46 +++++++++++++++++++------- agent/src/config/handler.rs | 45 ++++++++++++++------------ agent/src/monitor.rs | 32 +++++++++++++----- agent/src/utils/guard.rs | 54 ++++++++++++++++++------------- agent/src/utils/process/mod.rs | 9 ++++-- message/agent.proto | 5 +++ message/trident.proto | 8 ++++- server/agent_config/README-CH.md | 39 +++++++++++++++++++--- server/agent_config/README.md | 41 +++++++++++++++++++---- server/agent_config/config.go | 5 +-- server/agent_config/example.yaml | 17 ++++++---- server/agent_config/template.yaml | 30 +++++++++++++---- 12 files changed, 238 insertions(+), 93 deletions(-) diff --git a/agent/src/config/config.rs b/agent/src/config/config.rs index 8a5da3c22bd..dc53187a73b 100644 --- a/agent/src/config/config.rs +++ b/agent/src/config/config.rs @@ -57,7 +57,7 @@ use public::{ bitmap::Bitmap, consts::NPB_DEFAULT_PORT, proto::{ - agent::{self, SocketType, SystemLoadMetric}, + agent::{self, SocketType, SysMemoryMetric, SystemLoadMetric}, common, trident::{self, KubernetesClusterIdRequest, TapMode}, }, @@ -1814,10 +1814,26 @@ impl Default for Alerts { } } +fn to_sys_memory_metric<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + match u8::deserialize(deserializer)? 
{ + 0 => Ok(agent::SysMemoryMetric::Free), + 1 => Ok(agent::SysMemoryMetric::Available), + other => Err(de::Error::invalid_value( + Unexpected::Unsigned(other as u64), + &"[0-1]", + )), + } +} + #[derive(Clone, Copy, Default, Debug, Deserialize, PartialEq, Eq)] #[serde(default)] -pub struct SysFreeMemoryPercentage { +pub struct SysMemoryPercentage { pub trigger_threshold: u32, + #[serde(deserialize_with = "to_sys_memory_metric")] + pub metric: agent::SysMemoryMetric, } fn to_system_load_metric<'de, D>(deserializer: D) -> Result @@ -1841,14 +1857,14 @@ pub struct RelativeSysLoad { pub trigger_threshold: f32, pub recovery_threshold: f32, #[serde(deserialize_with = "to_system_load_metric")] - pub system_load_circuit_breaker_metric: agent::SystemLoadMetric, + pub metric: agent::SystemLoadMetric, } impl PartialEq for RelativeSysLoad { fn eq(&self, other: &Self) -> bool { self.trigger_threshold == other.trigger_threshold || self.recovery_threshold == other.recovery_threshold - || self.system_load_circuit_breaker_metric == other.system_load_circuit_breaker_metric + || self.metric == other.metric } } impl Eq for RelativeSysLoad {} @@ -1858,7 +1874,7 @@ impl Default for RelativeSysLoad { RelativeSysLoad { trigger_threshold: 1.0, recovery_threshold: 0.9, - system_load_circuit_breaker_metric: agent::SystemLoadMetric::Load15, + metric: agent::SystemLoadMetric::Load15, } } } @@ -1882,7 +1898,7 @@ impl Default for TxThroughput { #[derive(Clone, Copy, Default, Debug, Deserialize, PartialEq, Eq)] #[serde(default)] pub struct CircuitBreakers { - pub sys_free_memory_percentage: SysFreeMemoryPercentage, + pub sys_memory_percentage: SysMemoryPercentage, pub relative_sys_load: RelativeSysLoad, pub tx_throughput: TxThroughput, } @@ -2339,13 +2355,15 @@ impl From<&RuntimeConfig> for UserConfig { check_core_file_disabled: rc.yaml_config.check_core_file_disabled, }, circuit_breakers: CircuitBreakers { - sys_free_memory_percentage: SysFreeMemoryPercentage { - trigger_threshold: 
rc.sys_free_memory_limit, + sys_memory_percentage: SysMemoryPercentage { + trigger_threshold: rc.sys_memory_limit, + metric: SysMemoryMetric::from_str_name(rc.sys_memory_metric.as_str_name()) + .unwrap_or(SysMemoryMetric::Free), }, relative_sys_load: RelativeSysLoad { trigger_threshold: rc.system_load_circuit_breaker_threshold, recovery_threshold: rc.system_load_circuit_breaker_recover, - system_load_circuit_breaker_metric: SystemLoadMetric::from_str_name( + metric: SystemLoadMetric::from_str_name( rc.system_load_circuit_breaker_metric.as_str_name(), ) .unwrap_or(SystemLoadMetric::Load15), @@ -4507,7 +4525,9 @@ pub struct RuntimeConfig { pub kubernetes_api_enabled: bool, #[serde(deserialize_with = "bool_from_int")] pub ntp_enabled: bool, - pub sys_free_memory_limit: u32, + pub sys_memory_limit: u32, + #[serde(skip)] + pub sys_memory_metric: trident::SysMemoryMetric, pub log_file_size: u32, #[serde(deserialize_with = "bool_from_int")] pub external_agent_http_proxy_enabled: bool, @@ -4613,7 +4633,8 @@ impl RuntimeConfig { l4_performance_enabled: true, kubernetes_api_enabled: false, ntp_enabled: false, - sys_free_memory_limit: 0, + sys_memory_limit: 0, + sys_memory_metric: trident::SysMemoryMetric::Free, log_file_size: 1000, external_agent_http_proxy_enabled: false, external_agent_http_proxy_port: 38086, @@ -4871,7 +4892,8 @@ impl TryFrom for RuntimeConfig { l4_performance_enabled: conf.l4_performance_enabled(), kubernetes_api_enabled: conf.kubernetes_api_enabled(), ntp_enabled: conf.ntp_enabled(), - sys_free_memory_limit: conf.sys_free_memory_limit(), + sys_memory_limit: conf.sys_memory_limit(), + sys_memory_metric: conf.sys_memory_metric(), log_file_size: conf.log_file_size(), external_agent_http_proxy_enabled: conf.external_agent_http_proxy_enabled(), external_agent_http_proxy_port: conf.external_agent_http_proxy_port() as u16, diff --git a/agent/src/config/handler.rs b/agent/src/config/handler.rs index 3a04a27cada..f9a047710d5 100755 --- 
a/agent/src/config/handler.rs +++ b/agent/src/config/handler.rs @@ -197,7 +197,8 @@ pub struct EnvironmentConfig { pub max_millicpus: u32, pub process_threshold: u32, pub thread_threshold: u32, - pub sys_free_memory_limit: u32, + pub sys_memory_limit: u32, + pub sys_memory_metric: agent::SysMemoryMetric, pub log_file_size: u32, pub capture_mode: PacketCaptureType, pub guard_interval: Duration, @@ -1567,11 +1568,12 @@ impl TryFrom<(Config, UserConfig, DynamicConfig)> for ModuleConfig { max_millicpus: conf.global.limits.max_millicpus, process_threshold: conf.global.alerts.process_threshold, thread_threshold: conf.global.alerts.thread_threshold, - sys_free_memory_limit: conf + sys_memory_limit: conf .global .circuit_breakers - .sys_free_memory_percentage + .sys_memory_percentage .trigger_threshold, + sys_memory_metric: conf.global.circuit_breakers.sys_memory_percentage.metric, log_file_size: conf.global.limits.max_local_log_file_size, capture_mode: conf.inputs.cbpf.common.capture_mode, guard_interval: conf.global.tunning.resource_monitoring_interval, @@ -1589,7 +1591,7 @@ impl TryFrom<(Config, UserConfig, DynamicConfig)> for ModuleConfig { .global .circuit_breakers .relative_sys_load - .system_load_circuit_breaker_metric, + .metric, }, synchronizer: SynchronizerConfig { sync_interval: conf.global.communication.proactive_request_interval, @@ -3499,28 +3501,31 @@ impl ConfigHandler { relative_sys_load.recovery_threshold, new_relative_sys_load.recovery_threshold); relative_sys_load.recovery_threshold = new_relative_sys_load.recovery_threshold; } - if relative_sys_load.system_load_circuit_breaker_metric - != new_relative_sys_load.system_load_circuit_breaker_metric - { - info!("Update global.circuit_breakers.relative_sys_load.system_load_circuit_breaker_metric from {:?} to {:?}.", - relative_sys_load.system_load_circuit_breaker_metric, new_relative_sys_load.system_load_circuit_breaker_metric); - relative_sys_load.system_load_circuit_breaker_metric = - 
new_relative_sys_load.system_load_circuit_breaker_metric; + if relative_sys_load.metric != new_relative_sys_load.metric { + info!( + "Update global.circuit_breakers.relative_sys_load.metric from {:?} to {:?}.", + relative_sys_load.metric, new_relative_sys_load.metric + ); + relative_sys_load.metric = new_relative_sys_load.metric; } if relative_sys_load.trigger_threshold != new_relative_sys_load.trigger_threshold { info!("Update global.circuit_breakers.relative_sys_load.trigger_threshold from {:?} to {:?}.", relative_sys_load.trigger_threshold, new_relative_sys_load.trigger_threshold); relative_sys_load.trigger_threshold = new_relative_sys_load.trigger_threshold; } - let sys_free_memory_percentage = &mut circuit_breakers.sys_free_memory_percentage; - let new_sys_free_memory_percentage = &mut new_circuit_breakers.sys_free_memory_percentage; - if sys_free_memory_percentage.trigger_threshold - != new_sys_free_memory_percentage.trigger_threshold - { - info!("Update global.circuit_breakers.sys_free_memory_percentage.trigger_threshold from {:?} to {:?}.", - sys_free_memory_percentage.trigger_threshold, new_sys_free_memory_percentage.trigger_threshold); - sys_free_memory_percentage.trigger_threshold = - new_sys_free_memory_percentage.trigger_threshold; + let sys_memory_percentage = &mut circuit_breakers.sys_memory_percentage; + let new_sys_memory_percentage = &mut new_circuit_breakers.sys_memory_percentage; + if sys_memory_percentage.trigger_threshold != new_sys_memory_percentage.trigger_threshold { + info!("Update global.circuit_breakers.sys_memory_percentage.trigger_threshold from {:?} to {:?}.", + sys_memory_percentage.trigger_threshold, new_sys_memory_percentage.trigger_threshold); + sys_memory_percentage.trigger_threshold = new_sys_memory_percentage.trigger_threshold; + } + if sys_memory_percentage.metric != new_sys_memory_percentage.metric { + info!( + "Update global.circuit_breakers.sys_memory_percentage.metric from {:?} to {:?}.", + sys_memory_percentage.metric, 
new_sys_memory_percentage.metric + ); + sys_memory_percentage.metric = new_sys_memory_percentage.metric; } let tx_throughput = &mut circuit_breakers.tx_throughput; let new_tx_throughput = &mut new_circuit_breakers.tx_throughput; diff --git a/agent/src/monitor.rs b/agent/src/monitor.rs index f464f34bc5f..7d8c28e5654 100644 --- a/agent/src/monitor.rs +++ b/agent/src/monitor.rs @@ -33,7 +33,7 @@ use crate::config::handler::EnvironmentAccess; use crate::{ error::{Error, Result}, utils::{ - process::{get_current_sys_free_memory_percentage, get_file_and_size_sum}, + process::{get_current_sys_memory_percentage, get_file_and_size_sum}, stats::{ self, Collector, Countable, Counter, CounterType, CounterValue, RefCountable, StatsOption, @@ -211,24 +211,40 @@ impl RefCountable for SysStatusBroker { } let mut metrics = vec![]; - let current_sys_free_memory_percentage = get_current_sys_free_memory_percentage(); + let (current_sys_free_memory_percentage, current_sys_available_memory_percentage) = + get_current_sys_memory_percentage(); metrics.push(( "sys_free_memory", CounterType::Gauged, CounterValue::Unsigned(current_sys_free_memory_percentage as u64), )); + metrics.push(( + "sys_available_memory", + CounterType::Gauged, + CounterValue::Unsigned(current_sys_available_memory_percentage as u64), + )); - let sys_free_memory_limit = self.config.load().sys_free_memory_limit as f64; - let sys_free_memory_limit_ratio = if sys_free_memory_limit > 0.0 { - current_sys_free_memory_percentage as f64 / sys_free_memory_limit - } else { - 0.0 // If sys_free_memory_limit is set to 0, it means that there is no need to check if the system's free memory is too low. In this case, 0.0 will be directly returned, indicating that there will be no low system free memory alert. 
- }; + let sys_memory_limit = self.config.load().sys_memory_limit as f64; + + let (sys_free_memory_limit_ratio, sys_available_memory_limit_ratio) = + if sys_memory_limit > 0.0 { + ( + current_sys_free_memory_percentage as f64 / sys_memory_limit, + current_sys_available_memory_percentage as f64 / sys_memory_limit, + ) + } else { + (0.0, 0.0) // If sys_memory_limit is set to 0, it means that there is no need to check if the system's free/available memory is too low. In this case, 0.0 will be directly returned, indicating that there will be no low system free/available memory alert. + }; metrics.push(( "sys_free_memory_limit_ratio", CounterType::Gauged, CounterValue::Float(sys_free_memory_limit_ratio), )); + metrics.push(( + "sys_available_memory_limit_ratio", + CounterType::Gauged, + CounterValue::Float(sys_available_memory_limit_ratio), + )); match get_file_and_size_sum(&self.log_dir) { Ok(file_and_size_sum) => { diff --git a/agent/src/utils/guard.rs b/agent/src/utils/guard.rs index f861932c15a..16bbb3d003c 100644 --- a/agent/src/utils/guard.rs +++ b/agent/src/utils/guard.rs @@ -33,7 +33,7 @@ use log::{debug, error, info, warn}; use sysinfo::{get_current_pid, Pid, ProcessExt, ProcessRefreshKind, System, SystemExt}; use super::process::{ - get_current_sys_free_memory_percentage, get_file_and_size_sum, get_memory_rss, get_thread_num, + get_current_sys_memory_percentage, get_file_and_size_sum, get_memory_rss, get_thread_num, FileAndSizeSum, }; use crate::common::{ @@ -45,7 +45,7 @@ use crate::exception::ExceptionHandler; use crate::rpc::get_timestamp; use crate::utils::{cgroups::is_kernel_available_for_cgroups, environment::running_in_container}; -use public::proto::agent::{Exception, PacketCaptureType, SystemLoadMetric}; +use public::proto::agent::{Exception, PacketCaptureType, SysMemoryMetric, SystemLoadMetric}; struct SystemLoadGuard { system: Arc>, @@ -267,43 +267,51 @@ impl Guard { (cpu_limit / 10) as f32 > cpu_usage // The cpu_usage is in percentage, and the unit 
of cpu_limit is milli-cores. Divide cpu_limit by 10 to align the units } - fn check_sys_free_memory( - sys_free_memory_limit: f64, - under_sys_free_memory_limit: &mut bool, + fn check_sys_memory( + sys_memory_limit: f64, + sys_memory_metric: SysMemoryMetric, + under_sys_memory_limit: &mut bool, last_exceeded: &mut Duration, exception_handler: &ExceptionHandler, ) { - let current_sys_free_memory_percentage = get_current_sys_free_memory_percentage() as f64; + let (current_sys_free_memory_percentage, current_sys_available_memory_percentage) = + get_current_sys_memory_percentage(); debug!( - "current_sys_free_memory_percentage: {}, sys_free_memory_limit: {}", - current_sys_free_memory_percentage, sys_free_memory_limit + "current_sys_memory_percentage: [ free: {}, available: {} ], sys_memory_metric: {:?} sys_memory_limit: {}", + current_sys_free_memory_percentage, current_sys_available_memory_percentage, sys_memory_metric, sys_memory_limit ); - if sys_free_memory_limit != 0.0 { - if current_sys_free_memory_percentage < sys_free_memory_limit * 0.7 { + let current_memory_percentage = if sys_memory_metric == SysMemoryMetric::Free { + current_sys_free_memory_percentage as f64 + } else { + current_sys_available_memory_percentage as f64 + }; + + if sys_memory_limit != 0.0 { + if current_memory_percentage < sys_memory_limit * 0.7 { *last_exceeded = get_timestamp(0); exception_handler.set(Exception::FreeMemExceeded); - *under_sys_free_memory_limit = true; + *under_sys_memory_limit = true; error!( - "current system free memory percentage is less than the 70% of sys_free_memory_limit, current system free memory percentage={}%, sys_free_memory_limit={}%, deepflow-agent restart...", - current_sys_free_memory_percentage, sys_free_memory_limit + "current system {:?} memory percentage is less than the 70% of sys_memory_limit, current system free memory percentage={}%, sys_memory_limit={}%, deepflow-agent restart...", + sys_memory_metric, current_memory_percentage, sys_memory_limit ); 
crate::utils::notify_exit(-1); - } else if current_sys_free_memory_percentage < sys_free_memory_limit { + } else if current_memory_percentage < sys_memory_limit { *last_exceeded = get_timestamp(0); exception_handler.set(Exception::FreeMemExceeded); - *under_sys_free_memory_limit = true; + *under_sys_memory_limit = true; error!( - "current system free memory percentage is less than sys_free_memory_limit, current system free memory percentage={}%, sys_free_memory_limit={}%, set the agent to disabled", - current_sys_free_memory_percentage, sys_free_memory_limit + "current system {:?} memory percentage is less than sys_memory_limit, current system free memory percentage={}%, sys_memory_limit={}%, set the agent to disabled", + sys_memory_metric, current_memory_percentage, sys_memory_limit ); - } else if current_sys_free_memory_percentage >= sys_free_memory_limit * 1.1 { + } else if current_memory_percentage >= sys_memory_limit * 1.1 { let now = get_timestamp(0); - if *under_sys_free_memory_limit && now > *last_exceeded + CONTINUOUS_SAFETY_TIME { + if *under_sys_memory_limit && now > *last_exceeded + CONTINUOUS_SAFETY_TIME { exception_handler.clear(Exception::FreeMemExceeded); - *under_sys_free_memory_limit = false; + *under_sys_memory_limit = false; info!( - "current system free memory percentage: {}% remains above sys_free_memory_limit: {} * 110%, set the agent to enabled.", - current_sys_free_memory_percentage, sys_free_memory_limit + "current system {:?} memory percentage: {}% remains above sys_memory_limit: {} * 110%, set the agent to enabled.", + sys_memory_metric, current_memory_percentage, sys_memory_limit ); } } @@ -453,7 +461,7 @@ impl Guard { } } - Self::check_sys_free_memory(config.sys_free_memory_limit as f64, &mut under_sys_free_memory_limit, &mut last_exceeded, &exception_handler); + Self::check_sys_memory(config.sys_memory_limit as f64, config.sys_memory_metric, &mut under_sys_free_memory_limit, &mut last_exceeded, &exception_handler); match 
get_thread_num() { Ok(thread_num) => { diff --git a/agent/src/utils/process/mod.rs b/agent/src/utils/process/mod.rs index 13b69cbff53..97cb6c8c0ea 100644 --- a/agent/src/utils/process/mod.rs +++ b/agent/src/utils/process/mod.rs @@ -29,7 +29,7 @@ pub use self::windows::*; use sysinfo::{System, SystemExt}; /// 返回当前系统的空闲内存数目,单位:% -pub fn get_current_sys_free_memory_percentage() -> u32 { +pub fn get_current_sys_memory_percentage() -> (u32, u32) { // don't use new_all(), we only need meminfo, new_all() will refresh all things(include cpu, users, etc). // It could be problematic for processes using a lot of files and using sysinfo at the same time. // https://github.com/GuillaumeGomez/sysinfo/blob/master/src/linux/system.rs#L21 @@ -37,8 +37,11 @@ pub fn get_current_sys_free_memory_percentage() -> u32 { s.refresh_memory(); let total_memory = s.total_memory(); if total_memory > 0 { - (s.free_memory() * 100 / total_memory) as u32 + ( + (s.free_memory() * 100 / total_memory) as u32, + (s.available_memory() * 100 / total_memory) as u32, + ) } else { - 0 + (0, 0) } } diff --git a/message/agent.proto b/message/agent.proto index 6b9576d4d18..5c7b4fe9196 100644 --- a/message/agent.proto +++ b/message/agent.proto @@ -198,6 +198,11 @@ enum SystemLoadMetric { Load15 = 2; } +enum SysMemoryMetric { + Free = 0; + Available = 1; +} + message Segment { // e.g. 
single LAN area optional uint32 id = 1; repeated string mac = 2; diff --git a/message/trident.proto b/message/trident.proto index 012e32674c3..755d0a29cbd 100644 --- a/message/trident.proto +++ b/message/trident.proto @@ -203,6 +203,11 @@ enum SystemLoadMetric { Load15 = 2; } +enum SysMemoryMetric { + Free = 0; + Available = 1; +} + message Config { optional bool enabled = 1 [default = true]; optional uint32 max_cpus = 2 [default = 1]; @@ -304,13 +309,14 @@ message Config { optional PluginConfig plugins = 420; - optional uint32 sys_free_memory_limit = 501 [default = 0]; + optional uint32 sys_memory_limit = 501 [default = 0]; optional uint32 log_file_size = 502 [default = 1000]; optional TapMode tap_mode = 503 [default = LOCAL]; optional float system_load_circuit_breaker_threshold = 504 [default = 1.0]; optional SystemLoadMetric system_load_circuit_breaker_metric = 505 [default = Load15]; optional float system_load_circuit_breaker_recover = 506 [default = 0.9]; optional string secret_key = 507; // secret key for dataplane + optional SysMemoryMetric sys_memory_metric = 508 [default = Free]; optional string local_config = 510; // 全量的配置文件内容 } diff --git a/server/agent_config/README-CH.md b/server/agent_config/README-CH.md index dcfcfccbc99..f0f6062ac07 100644 --- a/server/agent_config/README-CH.md +++ b/server/agent_config/README-CH.md @@ -303,11 +303,11 @@ check provides a switch to prevent the process hang. Additional links: 控制 deepflow-agent 在一定的环境条件下停止运行或停止部分功能。 -### 系统空闲内存百分比 {#global.circuit_breakers.sys_free_memory_percentage} +### 系统空闲内存百分比 {#global.circuit_breakers.sys_memory_percentage} 计算公式:`(free_memory / total_memory) * 100%` -#### 触发阈值 {#global.circuit_breakers.sys_free_memory_percentage.trigger_threshold} +#### 触发阈值 {#global.circuit_breakers.sys_memory_percentage.trigger_threshold} **标签**: @@ -315,15 +315,15 @@ check provides a switch to prevent the process hang. 
Additional links: **FQCN**: -`global.circuit_breakers.sys_free_memory_percentage.trigger_threshold` +`global.circuit_breakers.sys_memory_percentage.trigger_threshold` -Upgrade from old version: `sys_free_memory_limit` +Upgrade from old version: `sys_memory_limit` **默认值**: ```yaml global: circuit_breakers: - sys_free_memory_percentage: + sys_memory_percentage: trigger_threshold: 0 ``` @@ -338,6 +338,35 @@ global: 当系统空闲内存低于此阈值的 90% 时,deepflow-agent 将自动重启。 +#### 观测指标 {#global.circuit_breakers.sys_memory_percentage.metric} + +**标签**: + +`hot_update` + +**FQCN**: + +`global.circuit_breakers.sys_memory_percentage.metric` + +Upgrade from old version: `sys_memory_metric` + +**默认值**: +```yaml +global: + circuit_breakers: + sys_memory_percentage: + metric: free +``` + +**模式**: +| Key | Value | +| ---- | ---------------------------- | +| Type | string | + +**详细描述**: + +deepflow-agent 观测该内存指标的百分比 + ### 相对系统负载 {#global.circuit_breakers.relative_sys_load} 计算公式: `system_load / total_cpu_cores` diff --git a/server/agent_config/README.md b/server/agent_config/README.md index c0c3faead1a..5f34c645bba 100644 --- a/server/agent_config/README.md +++ b/server/agent_config/README.md @@ -301,11 +301,11 @@ check provides a switch to prevent the process hang. 
Additional links: ## Circuit Breakers {#global.circuit_breakers} -### System Free Memory Percentage {#global.circuit_breakers.sys_free_memory_percentage} +### System Free Memory Percentage {#global.circuit_breakers.sys_memory_percentage} Calculation Method: `(free_memory / total_memory) * 100%` -#### Trigger Threshold {#global.circuit_breakers.sys_free_memory_percentage.trigger_threshold} +#### Trigger Threshold {#global.circuit_breakers.sys_memory_percentage.trigger_threshold} **Tags**: @@ -313,7 +313,7 @@ Calculation Method: `(free_memory / total_memory) * 100%` **FQCN**: -`global.circuit_breakers.sys_free_memory_percentage.trigger_threshold` +`global.circuit_breakers.sys_memory_percentage.trigger_threshold` Upgrade from old version: `sys_free_memory_limit` @@ -321,7 +321,7 @@ Upgrade from old version: `sys_free_memory_limit` ```yaml global: circuit_breakers: - sys_free_memory_percentage: + sys_memory_percentage: trigger_threshold: 0 ``` @@ -342,6 +342,35 @@ Setting sys_free_memory_limit to 0 indicates that the system free memory ratio i 3. When the current system free memory ratio remains above sys_free_memory_limit * 110%, the agent recovers from the disabled state. +#### Metric {#global.circuit_breakers.sys_memory_percentage.metric} + +**Tags**: + +`hot_update` + +**FQCN**: + +`global.circuit_breakers.sys_memory_percentage.metric` + +Upgrade from old version: `sys_memory_metric` + +**Default value**: +```yaml +global: + circuit_breakers: + sys_memory_percentage: + metric: free +``` + +**Schema**: +| Key | Value | +| ---- | ---------------------------- | +| Type | string | + +**Description**: + +deepflow-agent observes the percentage of this memory metric. + ### Relative System Load {#global.circuit_breakers.relative_sys_load} Calculation Method: `system_load / total_cpu_cores` @@ -414,7 +443,7 @@ minutes, the agent can recover from the circuit breaker disabled state, and setting it to 0 means turning off the circuit breaker feature. 
-#### Metric {#global.circuit_breakers.relative_sys_load.system_load_circuit_breaker_metric} +#### Metric {#global.circuit_breakers.relative_sys_load.metric} **Tags**: @@ -431,7 +460,7 @@ Upgrade from old version: `system_load_circuit_breaker_metric` global: circuit_breakers: relative_sys_load: - system_load_circuit_breaker_metric: load15 + metric: load15 ``` **Enum options**: diff --git a/server/agent_config/config.go b/server/agent_config/config.go index 045e8b48c31..de0d36e883c 100644 --- a/server/agent_config/config.go +++ b/server/agent_config/config.go @@ -85,8 +85,9 @@ type AgentGroupConfig struct { Domains []string `json:"DOMAINS" yaml:"domains,omitempty"` // domains info, separate by "," DecapType []int `json:"DECAP_TYPE" yaml:"decap_type,omitempty"` // separate by "," HTTPLogSpanID *string `json:"HTTP_LOG_SPAN_ID" yaml:"http_log_span_id,omitempty"` - SysFreeMemoryLimit *int `json:"SYS_FREE_MEMORY_LIMIT" yaml:"sys_free_memory_limit,omitempty"` // unit: % - LogFileSize *int `json:"LOG_FILE_SIZE" yaml:"log_file_size,omitempty"` // unit: MB + SysMemoryLimit *int `json:"SYS_FREE_MEMORY_LIMIT" yaml:"sys_memory_limit,omitempty"` // unit: % + SysMemoryMetric *int `json:"SYS_FREE_MEMORY_LIMIT" yaml:"sys_memory_metric,omitempty"` // memory metric selector, not a percentage; NOTE(review): JSON tag duplicates SYS_FREE_MEMORY_LIMIT of SysMemoryLimit - confirm the intended key + LogFileSize *int `json:"LOG_FILE_SIZE" yaml:"log_file_size,omitempty"` // unit: MB HTTPLogXRequestID *string `json:"HTTP_LOG_X_REQUEST_ID" yaml:"http_log_x_request_id,omitempty"` ExternalAgentHTTPProxyEnabled *int `json:"EXTERNAL_AGENT_HTTP_PROXY_ENABLED" yaml:"external_agent_http_proxy_enabled,omitempty"` ExternalAgentHTTPProxyPort *int `json:"EXTERNAL_AGENT_HTTP_PROXY_PORT" yaml:"external_agent_http_proxy_port,omitempty"` diff --git a/server/agent_config/example.yaml b/server/agent_config/example.yaml index 4511ceb9b59..e5e4f9d8d38 100644 --- a/server/agent_config/example.yaml +++ b/server/agent_config/example.yaml @@ -25,17 +25,22 @@ max_millicpus: 1000 # Note: deepflow-agent uses cgroups to limit memory usage.
max_memory: 768 -# System Free Memory Limit +# System Memory Limit # Unit: %. Default: 0. Range: [0, 100] # Note: The limit of the percentage of system free memory. -# Setting sys_free_memory_limit to 0 indicates that the system free memory ratio is not checked. -# 1. When the current system free memory ratio is below sys_free_memory_limit * 70%, +# Setting sys_memory_limit to 0 indicates that the system free/available memory ratio is not checked. +# 1. When the current system free/available memory ratio is below sys_memory_limit * 70%, # the agent will automatically restart. -# 2. When the current system free memory ratio is below sys_free_memory_limit but above 70%, +# 2. When the current system free/available memory ratio is below sys_memory_limit but not below sys_memory_limit * 70%, # the agent enters the disabled state. -# 3. When the current system free memory ratio remains above sys_free_memory_limit * 110%, +# 3. When the current system free/available memory ratio remains above sys_memory_limit * 110%, # the agent recovers from the disabled state. -sys_free_memory_limit: 0 +sys_memory_limit: 0 + +# System Memory Metric +# Default: free +# Supported values: free, available +sys_memory_metric: free # Packet Capture Rate Limit # Unit: Kpps. Default: 200. Range: [1, 1000000] diff --git a/server/agent_config/template.yaml b/server/agent_config/template.yaml index 25ec17231e4..bd7b3dab0d3 100644 --- a/server/agent_config/template.yaml +++ b/server/agent_config/template.yaml @@ -254,7 +254,7 @@ global: # Calculation Method: `(free_memory / total_memory) * 100%` # ch: |- # 计算公式:`(free_memory / total_memory) * 100%` - sys_free_memory_percentage: + sys_memory_percentage: # type: int # name: # en: Trigger Threshold @@ -266,17 +266,33 @@ global: # ee_feature: false # description: # en: |- - # Setting sys_free_memory_limit to 0 indicates that the system free memory ratio is not checked. - # 1.
When the current system free memory ratio is below sys_free_memory_limit * 70%, + # Setting sys_memory_limit to 0 indicates that the system free/available memory ratio is not checked. + # 1. When the current system free/available memory ratio is below sys_memory_limit * 70%, # the agent will automatically restart. - # 2. When the current system free memory ratio is below sys_free_memory_limit but above 70%, + # 2. When the current system free/available memory ratio is below sys_memory_limit but not below sys_memory_limit * 70%, # the agent enters the disabled state. - # 3. When the current system free memory ratio remains above sys_free_memory_limit * 110%, + # 3. When the current system free/available memory ratio remains above sys_memory_limit * 110%, # the agent recovers from the disabled state. # ch: |- # 当系统空闲内存低于此阈值的 90% 时,deepflow-agent 将自动重启。 - # upgrade_from: sys_free_memory_limit + # upgrade_from: sys_memory_limit trigger_threshold: 0 + # type: string + # name: + # en: Metric + # ch: 观测指标 + # unit: + # range: [] + # enum_options: [free, available] + # modification: hot_update + # ee_feature: false + # description: + # en: |- + # deepflow-agent observes the percentage of this memory metric + # ch: |- + # deepflow-agent 观测该内存指标的百分比 + # upgrade_from: sys_memory_metric + metric: free # type: section # name: # en: Relative System Load @@ -347,7 +363,7 @@ global: # ch: |- # deepflow-agent 默认每 10 秒监控一次所设定的系统负载指标项。 # upgrade_from: system_load_circuit_breaker_metric - system_load_circuit_breaker_metric: load15 + metric: load15 # type: section # name: # en: Tx Throughput