Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[202006][Monit] Add changes to adjust error message generation for monit alert action #6145

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dockers/docker-database/base_image_files/monit_database
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## redis_server
###############################################################################
check process redis_server matching "/usr/bin/redis-server"
if does not exist for 5 times within 5 cycles then alert
check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
20 changes: 10 additions & 10 deletions dockers/docker-fpm-frr/base_image_files/monit_bgp
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
## staticd
## bgpcfgd
###############################################################################
check process zebra matching "/usr/lib/frr/zebra"
if does not exist for 5 times within 5 cycles then alert
check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process fpmsyncd matching "fpmsyncd"
if does not exist for 5 times within 5 cycles then alert
check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process bgpd matching "/usr/lib/frr/bgpd"
if does not exist for 5 times within 5 cycles then alert
check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process staticd matching "/usr/lib/frr/staticd"
if does not exist for 5 times within 5 cycles then alert
check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process bgpcfgd matching "python /usr/local/bin/bgpcfgd"
if does not exist for 5 times within 5 cycles then alert
check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
12 changes: 6 additions & 6 deletions dockers/docker-lldp/base_image_files/monit_lldp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
## lldp-syncd
## lldpmgrd
###############################################################################
check process lldpd_monitor matching "lldpd: "
if does not exist for 5 times within 5 cycles then alert
check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process lldp_syncd matching "python2 -m lldp_syncd"
if does not exist for 5 times within 5 cycles then alert
check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process lldpmgrd matching "python /usr/bin/lldpmgrd"
if does not exist for 5 times within 5 cycles then alert
check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
42 changes: 21 additions & 21 deletions dockers/docker-orchagent/base_image_files/monit_swss
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,33 @@
## buffermgrd
## nbrmgrd
## vxlanmgrd
###############################################################################
check process orchagent matching "/usr/bin/orchagent -d /var/log/swss"
if does not exist for 5 times within 5 cycles then alert
##############################################################################
check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process portsyncd matching "/usr/bin/portsyncd"
if does not exist for 5 times within 5 cycles then alert
check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process neighsyncd matching "/usr/bin/neighsyncd"
if does not exist for 5 times within 5 cycles then alert
check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process vrfmgrd matching "/usr/bin/vrfmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process vlanmgrd matching "/usr/bin/vlanmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process intfmgrd matching "/usr/bin/intfmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process portmgrd matching "/usr/bin/portmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process buffermgrd matching "/usr/bin/buffermgrd -l"
if does not exist for 5 times within 5 cycles then alert
check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process nbrmgrd matching "/usr/bin/nbrmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process vxlanmgrd matching "/usr/bin/vxlanmgrd"
if does not exist for 5 times within 5 cycles then alert
check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
4 changes: 2 additions & 2 deletions dockers/docker-sflow/base_image_files/monit_sflow
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## sflowmgrd
###############################################################################
check process sflowmgrd matching "/usr/bin/sflowmgrd"
if does not exist for 5 times within 5 cycles then alert
check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
8 changes: 4 additions & 4 deletions dockers/docker-snmp/base_image_files/monit_snmp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
## snmpd
## snmpd_subagent
###############################################################################
check process snmpd matching "/usr/sbin/snmpd\s"
if does not exist for 5 times within 5 cycles then alert
check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process snmp_subagent matching "python3 -m sonic_ax_impl"
if does not exist for 5 times within 5 cycles then alert
check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3 -m sonic_ax_impl"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
4 changes: 2 additions & 2 deletions dockers/docker-sonic-restapi/base_image_files/monit_restapi
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## restapi
###############################################################################
check process restapi matching "/usr/sbin/go-server-server"
if does not exist for 5 times within 5 cycles then alert
check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
## telemetry
## dialout_client
###############################################################################
check process telemetry matching "/usr/sbin/telemetry"
if does not exist for 5 times within 5 cycles then alert
check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process dialout_client matching "/usr/sbin/dialout_client_cli"
if does not exist for 5 times within 5 cycles then alert
check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
11 changes: 11 additions & 0 deletions dockers/docker-teamd/base_image_files/monit_teamd
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
###############################################################################
## Monit configuration for teamd container
## process list:
## teamsyncd
## teammgrd
##
## Each check below runs /usr/bin/process_checker with the container name
## ("teamd") followed by the command line of the process to look for, and
## raises an alert when the checker exits with a non-zero status on 5 of the
## last 5 cycles.
## NOTE(review): "repeat every 1 cycles" presumably re-sends the alert on every
## cycle while the failure persists -- confirm against the Monit manual.
###############################################################################
check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
5 changes: 5 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ sudo rm -rf $FILESYSTEM_ROOT/$REDIS_DUMP_LOAD_PY2_WHEEL_NAME
# Install Python module for ipaddress
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install ipaddress

# Install Python module for psutil
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install psutil

# Install SwSS SDK Python 3 package
# Note: the scripts will be overwritten by corresponding Python 2 package
if [ -e {{swsssdk_py3_wheel_path}} ]; then
Expand Down Expand Up @@ -222,6 +225,8 @@ sudo cp $IMAGE_CONFIGS/monit/monitrc $FILESYSTEM_ROOT/etc/monit/
sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/monitrc
sudo cp $IMAGE_CONFIGS/monit/conf.d/* $FILESYSTEM_ROOT/etc/monit/conf.d/
sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/*
sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker

# Copy crontabs
sudo cp -f $IMAGE_CONFIGS/cron.d/* $FILESYSTEM_ROOT/etc/cron.d/
Expand Down
21 changes: 16 additions & 5 deletions files/image_config/monit/conf.d/sonic-host
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,28 @@
###############################################################################

check filesystem root-overlay with path /
if space usage > 90% for 10 times within 20 cycles then alert
if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles

check filesystem var-log with path /var/log
if space usage > 90% for 10 times within 20 cycles then alert
if space usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles

check system $HOST
if memory usage > 90% for 10 times within 20 cycles then alert
if cpu usage (user) > 90% for 10 times within 20 cycles then alert
if cpu usage (system) > 90% for 10 times within 20 cycles then alert
if memory usage > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
if cpu usage (user) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles
if cpu usage (system) > 90% for 10 times within 20 cycles then alert repeat every 1 cycles

check process rsyslog with pidfile /var/run/rsyslogd.pid
start program = "/bin/systemctl start rsyslog.service"
stop program = "/bin/systemctl stop rsyslog.service"
if totalmem > 800 MB for 10 times within 20 cycles then restart

# route_check.py verifies that the routes in APPL-DB and ASIC-DB are in sync.
# For any discrepancy, details are logged and a non-zero code is returned,
# which triggers a monit alert.
# Hence, for any discrepancy, there will be "ERR"-level log messages
# from both route_check.py and monit.
#
check program routeCheck with path "/usr/bin/route_check.py"
every 5 cycles
if status != 0 for 3 cycle then alert repeat every 1 cycles

57 changes: 57 additions & 0 deletions files/image_config/monit/process_checker
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/python
import argparse
import sys
import syslog

import psutil
import swsssdk


def check_process_existence(container_name, process_cmdline):
    """
    @summary: Check whether the process in the specified container is running or not.
              If it is not running, a message is printed to stdout and the script
              exits with status 1 so that Monit raises an alert.
    @param container_name: Name of the container, used as the key into the
                           'FEATURE' table of Config DB.
    @param process_cmdline: Command line prefix of the process to look for. A process
                            matches when its space-joined command line starts with
                            this string.
    @return: None. Calls sys.exit(0) if the container is disabled and sys.exit(1) if
             no matching process is found. If the container is not present in the
             'FEATURE' table, an error is written to syslog and the function returns.
    """
    config_db = swsssdk.ConfigDBConnector()
    config_db.connect()
    feature_table = config_db.get_table("FEATURE")

    if container_name not in feature_table:
        syslog.syslog(syslog.LOG_ERR, "container '{}' is not included in SONiC image or the given container name is invalid!"
                      .format(container_name))
        return

    # We look into the 'FEATURE' table to verify whether the container is disabled or not.
    # If the container is disabled, there is nothing to check and we exit successfully.
    feature_entry = feature_table[container_name]
    if "state" in feature_entry and feature_entry["state"] == "disabled":
        sys.exit(0)

    # We leverage the psutil library to check whether the process is running or not.
    # If the process entity is found in the process tree and it is also in the 'running'
    # or 'sleeping' state, then it will be marked as 'running'.
    #
    # The attributes are read from the 'info' snapshot that process_iter() collected
    # instead of calling process.cmdline()/process.status() again: a process can exit
    # (or be inaccessible) between enumeration and a second query, and the resulting
    # psutil exception would crash this script and trigger a bogus alert.
    is_running = False
    for process in psutil.process_iter(["cmdline", "status"]):
        # A value is None when psutil was not allowed to read it for this process.
        cmdline = process.info.get("cmdline") or []
        status = process.info.get("status")
        if ' '.join(cmdline).startswith(process_cmdline) and status in ["running", "sleeping"]:
            is_running = True
            break

    if not is_running:
        # If this script is run by Monit, then the following output will be appended to
        # Monit's syslog message.
        print("'{}' is not running.".format(process_cmdline))
        sys.exit(1)


def main():
    """
    @summary: Parse the command line and check whether the given process is running.
              The first positional argument is the container name; everything after
              it is joined with single spaces and treated as the process command line.
              An empty process command line is rejected with a usage error (exit
              status 2), because an empty prefix would match every process and make
              the check pass unconditionally.
    """
    parser = argparse.ArgumentParser(
        description="Check whether the process in the specified "
                    "container is running and an alerting message will be written into syslog if it "
                    "failed to run.",
        usage="/usr/bin/process_checker <container_name> <process_cmdline>")
    parser.add_argument("container_name", help="container name")
    parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process command line")
    args = parser.parse_args()

    process_cmdline = ' '.join(args.process_cmdline)
    if not process_cmdline.strip():
        # parser.error() prints the usage plus this message to stderr and exits with 2.
        parser.error("process command line must not be empty")

    check_process_existence(args.container_name, process_cmdline)


if __name__ == '__main__':
main()
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
## syncd
## dsserve
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd"
if does not exist for 5 times within 5 cycles then alert
check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
## process list:
## syncd
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
## syncd
## dsserve
###############################################################################
check process syncd matching "/usr/bin/syncd\s"
if does not exist for 5 times within 5 cycles then alert
check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles

check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd"
if does not exist for 5 times within 5 cycles then alert
check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
1 change: 1 addition & 0 deletions rules/docker-teamd.mk
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@ $(DOCKER_TEAMD)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_TEAMD)_RUN_OPT += -v /host/warmboot:/var/warmboot

$(DOCKER_TEAMD)_BASE_IMAGE_FILES += teamdctl:/usr/bin/teamdctl
$(DOCKER_TEAMD)_BASE_IMAGE_FILES += monit_teamd:/etc/monit/conf.d
$(DOCKER_TEAMD)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
Loading