Skip to content

Commit

Permalink
[monit] Adding patch to enhance syslog error message generation for m…
Browse files Browse the repository at this point in the history
…onit alert action when status is failed. (#5720)

Why/How I did:

Make sure first error syslog is triggered based on FAULT TOLERANCE condition.

Added support for the repeat clause with the alert action. This is used as the trigger
for generating periodic syslog error messages if the error is persistent.

Updated the monit conf files with repeat every x cycles for the alert action

Signed-off-by: Guohan Lu <[email protected]>
  • Loading branch information
lguohan committed Dec 9, 2020
1 parent 0e5f12e commit 88cfe6a
Show file tree
Hide file tree
Showing 21 changed files with 118 additions and 43 deletions.
2 changes: 1 addition & 1 deletion dockers/docker-database/base_image_files/monit_database
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## redis_server
###############################################################################
check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
10 changes: 5 additions & 5 deletions dockers/docker-fpm-frr/base_image_files/monit_bgp
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@
## bgpcfgd
###############################################################################
check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
6 changes: 3 additions & 3 deletions dockers/docker-lldp/base_image_files/monit_lldp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
## lldpmgrd
###############################################################################
check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
20 changes: 10 additions & 10 deletions dockers/docker-orchagent/base_image_files/monit_swss
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,31 @@
## vxlanmgrd
##############################################################################
check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
2 changes: 1 addition & 1 deletion dockers/docker-sflow/base_image_files/monit_sflow
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## sflowmgrd
###############################################################################
check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
4 changes: 2 additions & 2 deletions dockers/docker-snmp/base_image_files/monit_snmp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
## snmpd_subagent
###############################################################################
check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3 -m sonic_ax_impl"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## restapi
###############################################################################
check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
## dialout_client
###############################################################################
check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
4 changes: 2 additions & 2 deletions dockers/docker-teamd/base_image_files/monit_teamd
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
## teammgrd
###############################################################################
check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
20 changes: 15 additions & 5 deletions files/image_config/monit/conf.d/sonic-host
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,27 @@
###############################################################################

check filesystem root-overlay with path /
if space usage > 90% for 10 times within 20 cycles then alert
if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles

check filesystem var-log with path /var/log
if space usage > 90% for 10 times within 20 cycles then alert
if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles

check system $HOST
if memory usage > 90% for 10 times within 20 cycles then alert
if cpu usage (user) > 90% for 10 times within 20 cycles then alert
if cpu usage (system) > 90% for 10 times within 20 cycles then alert
if memory usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
if cpu usage (user) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
if cpu usage (system) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles

check process rsyslog with pidfile /var/run/rsyslogd.pid
start program = "/bin/systemctl start rsyslog.service"
stop program = "/bin/systemctl stop rsyslog.service"
if totalmem > 800 MB for 10 times within 20 cycles then restart

# route_check.py verifies that routes in APPL-DB and ASIC-DB are in sync.
# For any discrepancy, details are logged and a non-zero code is returned,
# which triggers a monit alert.
# Hence, for any discrepancy, there will be "ERR"-level log messages
# from both route_check.py and monit.
#
check program routeCheck with path "/usr/bin/route_check.py"
every 5 cycles
if status != 0 for 3 cycle then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
## dsserve
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
## syncd
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
## dsserve
###############################################################################
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
64 changes: 64 additions & 0 deletions src/monit/patch/0002-change_monit_alert_log_error.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
From 97a5defc6a7fcc6a00f691bb5314ceb8fb7704e9 Mon Sep 17 00:00:00 2001
From: Abhishek Dosi <[email protected]>
Date: Mon, 26 Oct 2020 11:40:02 -0700
Subject: [PATCH] Patch on top of commit Patch is addressing these changes:-

a) Enable the repeat keyword for the alert action. Using this, we can log
a syslog error message for a persistent failure condition.

b) Make sure an error message is logged if the state changes to failed for the first time (fault tolerance condition)
or if the alert has a repeat clause

Signed-off-by: Abhishek Dosi <[email protected]>

---
src/event.c | 6 +++++-
src/p.y | 8 +++++++-
2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/event.c b/src/event.c
index ed363ee..9d08fc0 100644
--- a/src/event.c
+++ b/src/event.c
@@ -336,7 +336,8 @@ static void _handleEvent(Service_T S, Event_T E) {
if (E->state != State_Init || E->state_map & 0x1) {
if (E->state == State_Succeeded || E->state == State_ChangedNot || E->id == Event_Instance || E->id == Event_Action)
LogInfo("'%s' %s\n", S->name, E->message);
- else
+ /* Log an error when the state changes to failed for the first time, or periodically if the action has a repeat clause */
+ else if ((E->state_changed) || (E->state == State_Failed && E->action->failed->repeat && E->count % E->action->failed->repeat == 0))
LogError("'%s' %s\n", S->name, E->message);
}
if (E->state == State_Init)
return;
diff --git a/src/p.y b/src/p.y
index a57807d..b46b1a1 100644
--- a/src/p.y
+++ b/src/p.y
@@ -2250,9 +2250,12 @@ repeat : /* EMPTY */ {
}
;

-action : ALERT {
+action : ALERT repeat{
$<number>$ = Action_Alert;
}
+ | ALERT {
+ $<number>$ = Action_Alert;
+ }
| EXEC argumentlist repeat {
$<number>$ = Action_Exec;
}
@@ -2281,6 +2284,9 @@ action1 : action {
repeat = 0;
command1 = command;
command = NULL;
+ } else if ($<number>1 == Action_Alert) {
+ repeat1 = repeat;
+ repeat = 0;
}
}
;
--
2.17.1

1 change: 1 addition & 0 deletions src/monit/patch/series
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# This series applies on GIT commit dc9bc1c949125140d967edfc598dfad47eedc552
0001-used_system_memory_sysdep-Use-MemAvailable-value-if-.patch
0002-change_monit_alert_log_error.patch

0 comments on commit 88cfe6a

Please sign in to comment.