From ad64dc6b8a227081a7491e62b0523263206af0db Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 11 Aug 2020 22:28:46 +0000 Subject: [PATCH 01/12] [monit] Unmonitor the processes in containers which are disabled. Signed-off-by: Yong Zhao --- .../base_image_files/monit_database | 4 +- .../docker-fpm-frr/base_image_files/monit_bgp | 20 +++--- .../docker-lldp/base_image_files/monit_lldp | 12 ++-- .../base_image_files/monit_swss | 40 ++++++------ .../docker-sflow/base_image_files/monit_sflow | 4 +- .../docker-snmp/base_image_files/monit_snmp | 8 +-- .../base_image_files/monit_restapi | 4 +- .../base_image_files/monit_telemetry | 8 +-- .../build_templates/sonic_debian_extension.j2 | 3 + files/image_config/monit/process_checker | 64 +++++++++++++++++++ .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 8 +-- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 8 +-- 19 files changed, 139 insertions(+), 72 deletions(-) create mode 100755 files/image_config/monit/process_checker diff --git a/dockers/docker-database/base_image_files/monit_database b/dockers/docker-database/base_image_files/monit_database index c5508922864e..5b0d99746aed 100644 --- a/dockers/docker-database/base_image_files/monit_database +++ b/dockers/docker-database/base_image_files/monit_database @@ -3,5 +3,5 @@ ## process list: ## redis_server ############################################################################### -check process redis_server matching "/usr/bin/redis-server" - if does not exist for 5 times within 5 cycles then alert +check program container_process_redis_server with path "/usr/bin/process_checker database redis-server /usr/bin/redis-server" + if status != 0 for 5 times within 5 cycles then alert diff --git 
a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp index 5dbb794c346b..6fa6b723ad09 100644 --- a/dockers/docker-fpm-frr/base_image_files/monit_bgp +++ b/dockers/docker-fpm-frr/base_image_files/monit_bgp @@ -7,17 +7,17 @@ ## staticd ## bgpcfgd ############################################################################### -check process zebra matching "/usr/lib/frr/zebra" - if does not exist for 5 times within 5 cycles then alert +check program container_process_zebra with path "/usr/bin/process_checker bgp zebra /usr/lib/frr/zebra" + if status != 0 for 5 times within 5 cycles then alert -check process fpmsyncd matching "fpmsyncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd fpmsyncd" + if status != 0 for 5 times within 5 cycles then alert -check process bgpd matching "/usr/lib/frr/bgpd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_bgpd with path "/usr/bin/process_checker bgp bgpd /usr/lib/frr/bgpd" + if status != 0 for 5 times within 5 cycles then alert -check process staticd matching "/usr/lib/frr/staticd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_staticd with path "/usr/bin/process_checker bgp staticd /usr/lib/frr/staticd" + if status != 0 for 5 times within 5 cycles then alert -check process bgpcfgd matching "python /usr/local/bin/bgpcfgd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_bgpcfgd with path "/usr/bin/process_checker bgp bgpcfgd python /usr/local/bin/bgpcfgd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-lldp/base_image_files/monit_lldp b/dockers/docker-lldp/base_image_files/monit_lldp index 200c52c7d332..1039d1e115fb 100644 --- a/dockers/docker-lldp/base_image_files/monit_lldp +++ 
b/dockers/docker-lldp/base_image_files/monit_lldp @@ -5,11 +5,11 @@ ## lldp-syncd ## lldpmgrd ############################################################################### -check process lldpd_monitor matching "lldpd: " - if does not exist for 5 times within 5 cycles then alert +check program container_process_lldpd_monitor with path "/usr/bin/process_checker lldp lldpd lldpd:" + if status != 0 for 5 times within 5 cycles then alert -check process lldp_syncd matching "python2 -m lldp_syncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_lldp_syncd with path "/usr/bin/process_checker lldp lldp_syncd python2 -m lldp_syncd" + if status != 0 for 5 times within 5 cycles then alert -check process lldpmgrd matching "python /usr/bin/lldpmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_lldpmgrd with path "/usr/bin/process_checker lldp lldpmgrd python /usr/bin/lldpmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-orchagent/base_image_files/monit_swss b/dockers/docker-orchagent/base_image_files/monit_swss index 5928dbd4ddb0..e32b1196bda1 100644 --- a/dockers/docker-orchagent/base_image_files/monit_swss +++ b/dockers/docker-orchagent/base_image_files/monit_swss @@ -12,32 +12,32 @@ ## nbrmgrd ## vxlanmgrd ############################################################################### -check process orchagent matching "/usr/bin/orchagent -d /var/log/swss" - if does not exist for 5 times within 5 cycles then alert +check program container_process_orchagent with path "/usr/bin/process_checker swss orchagent /usr/bin/orchagent -d /var/log/swss" + if status != 0 for 5 times within 5 cycles then alert -check process portsyncd matching "/usr/bin/portsyncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_portsyncd with path "/usr/bin/process_checker swss portsyncd /usr/bin/portsyncd" + if status != 0 for 5 
times within 5 cycles then alert -check process neighsyncd matching "/usr/bin/neighsyncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_neighsyncd with path "/usr/bin/process_checker swss neighsyncd /usr/bin/neighsyncd" + if status != 0 for 5 times within 5 cycles then alert -check process vrfmgrd matching "/usr/bin/vrfmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_vrfmgrd with path "/usr/bin/process_checker swss vrfmgrd /usr/bin/vrfmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process vlanmgrd matching "/usr/bin/vlanmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_vlanmgrd with path "/usr/bin/process_checker swss vlanmgrd /usr/bin/vlanmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process intfmgrd matching "/usr/bin/intfmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_intfmgrd with path "/usr/bin/process_checker swss intfmgrd /usr/bin/intfmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process portmgrd matching "/usr/bin/portmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_portmgrd with path "/usr/bin/process_checker swss portmgrd /usr/bin/portmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process buffermgrd matching "/usr/bin/buffermgrd -l" - if does not exist for 5 times within 5 cycles then alert +check program container_process_buffermgrd with path "/usr/bin/process_checker swss buffermgrd /usr/bin/buffermgrd -l" + if status != 0 for 5 times within 5 cycles then alert -check process nbrmgrd matching "/usr/bin/nbrmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_nbrmgrd with path "/usr/bin/process_checker swss nbrmgrd /usr/bin/nbrmgrd" + if status != 0 for 5 times within 5 cycles then 
alert -check process vxlanmgrd matching "/usr/bin/vxlanmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_vxlanmgrd with path "/usr/bin/process_checker swss vxlanmgrd /usr/bin/vxlanmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sflow/base_image_files/monit_sflow b/dockers/docker-sflow/base_image_files/monit_sflow index d041f81001ea..44f3159aa773 100644 --- a/dockers/docker-sflow/base_image_files/monit_sflow +++ b/dockers/docker-sflow/base_image_files/monit_sflow @@ -3,5 +3,5 @@ ## process list: ## sflowmgrd ############################################################################### -check process sflowmgrd matching "/usr/bin/sflowmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_sflowmgrd with path "/usr/bin/process_checker sflow sflowmgrd /usr/bin/sflowmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-snmp/base_image_files/monit_snmp b/dockers/docker-snmp/base_image_files/monit_snmp index cfb1a2b66831..47689fb0b433 100644 --- a/dockers/docker-snmp/base_image_files/monit_snmp +++ b/dockers/docker-snmp/base_image_files/monit_snmp @@ -4,8 +4,8 @@ ## snmpd ## snmpd_subagent ############################################################################### -check process snmpd matching "/usr/sbin/snmpd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_snmpd with path "/usr/bin/process_checker snmp snmpd /usr/sbin/snmpd" + if status != 0 for 5 times within 5 cycles then alert -check process snmp_subagent matching "python3 -m sonic_ax_impl" - if does not exist for 5 times within 5 cycles then alert +check program container_process_snmp_subagent with path "/usr/bin/process_checker snmp snmp_subagent python3 -m sonic_ax_impl" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-restapi/base_image_files/monit_restapi 
b/dockers/docker-sonic-restapi/base_image_files/monit_restapi index 2e90baf30d57..7ef05080c938 100644 --- a/dockers/docker-sonic-restapi/base_image_files/monit_restapi +++ b/dockers/docker-sonic-restapi/base_image_files/monit_restapi @@ -3,5 +3,5 @@ ## process list: ## restapi ############################################################################### -check process restapi matching "/usr/sbin/go-server-server" - if does not exist for 5 times within 5 cycles then alert +check program container_process_restapi with path "/usr/bin/process_checker restapi restapi /usr/sbin/go-server-server" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry index a82c652f8179..5db332a90ffb 100644 --- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry +++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry @@ -4,8 +4,8 @@ ## telemetry ## dialout_client ############################################################################### -check process telemetry matching "/usr/sbin/telemetry" - if does not exist for 5 times within 5 cycles then alert +check program container_process_telemetry with path "/usr/bin/process_checker telemetry telemetry /usr/sbin/telemetry" + if status != 0 for 5 times within 5 cycles then alert -check process dialout_client matching "/usr/sbin/dialout_client_cli" - if does not exist for 5 times within 5 cycles then alert +check program container_process_dialout_client with path "/usr/bin/process_checker telemetry dialout_client /usr/sbin/dialout_client_cli" + if status != 0 for 5 times within 5 cycles then alert diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index 19502e0320ab..7d18c1c3ac23 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -228,6 +228,9 @@ 
sudo cp $IMAGE_CONFIGS/monit/monitrc $FILESYSTEM_ROOT/etc/monit/ sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/monitrc sudo cp $IMAGE_CONFIGS/monit/conf.d/* $FILESYSTEM_ROOT/etc/monit/conf.d/ sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/* +sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/ +sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker + # Copy crontabs sudo cp -f $IMAGE_CONFIGS/cron.d/* $FILESYSTEM_ROOT/etc/cron.d/ diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker new file mode 100755 index 000000000000..5de8addfe6d8 --- /dev/null +++ b/files/image_config/monit/process_checker @@ -0,0 +1,64 @@ +#!/usr/bin/python +import argparse +import sys +import syslog + +import psutil +import swsssdk + + +def check_process_existence(container_name, process_name, process_cmdline): + """ + @summary: Check whether the process in the specified container is running or not and + an alerting message will be written into syslog if it failed to run. + """ + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + feature_table = config_db.get_table("FEATURE") + + if container_name in feature_table.keys(): + # We look into the 'FEATURE' table to verify whether the container is disabled or not. + # If the container is disabled, we exit. + if ("state" in feature_table[container_name].keys() + and feature_table[container_name]["state"] == "disabled"): + sys.exit(0) + else: + # We leveraged the psutil library to help us check whether the process is running or not. + # If the process entity is found in process tree and it is also in the 'running' or 'sleeping' + # state, then it will be marked as 'running'. + is_running = False + for process in psutil.process_iter(["name", "cmdline", "status"]): + # The script process_checker has the command line format '/usr/bin/process_checker + # <container_name> <process_name> <process_cmdline>' such as '/usr/bin/process_checker bgp fpmsyncd fpmsyncd'.
So + # when using psutil to search process 'fpmsyncd', we should skip the process which ran process_checker + # since it is not 'fpmsyncd' process although 'fpmsyncd' is a substring of its cmdline. + if process.name() == "process_checker": + continue + + if ((process_name == process.name() or process_cmdline in ' '.join(process.cmdline())) + and process.status() in ["running", "sleeping"]): + is_running = True + break + + if not is_running: + print("'{}' is not running.".format(process_name)) + sys.exit(1) + else: + syslog.syslog(syslog.LOG_ERR, "contianer '{}' is not included in SONiC image or the given container name is invalid!" + .format(container_name)) + + +def main(): + parser = argparse.ArgumentParser(description="Check whether the process in the specified \ + container is running and an alerting message will be written into syslog if it \ + failed to run.", usage="/usr/bin/process_checker <container_name> <process_name> <process_cmdline>") + parser.add_argument("container_name", help="container name") + parser.add_argument("process_name", help="process name") + parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process name") + args = parser.parse_args() + + check_process_existence(args.container_name, args.process_name, ' '.join(args.process_cmdline)) + + +if __name__ == '__main__': + main() diff --git a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd +++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5
cycles then alert diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd index 81c0b6ef6bc6..d9857a432bce 100644 --- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd +++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert -check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd +++ b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- 
a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd +++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 
5 cycles then alert diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd +++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd index 81c0b6ef6bc6..d9857a432bce 100644 --- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd +++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 
cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert -check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert From b1cb0ba3d1441081d57e5ac8ba7f708d2a357942 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 11 Aug 2020 23:13:59 +0000 Subject: [PATCH 02/12] [monit] Update the process snmp command line. Signed-off-by: Yong Zhao --- dockers/docker-snmp/base_image_files/monit_snmp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/docker-snmp/base_image_files/monit_snmp b/dockers/docker-snmp/base_image_files/monit_snmp index 47689fb0b433..f1bc0c612f39 100644 --- a/dockers/docker-snmp/base_image_files/monit_snmp +++ b/dockers/docker-snmp/base_image_files/monit_snmp @@ -7,5 +7,5 @@ check program container_process_snmpd with path "/usr/bin/process_checker snmp snmpd /usr/sbin/snmpd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_snmp_subagent with path "/usr/bin/process_checker snmp snmp_subagent python3 -m sonic_ax_impl" +check program container_process_snmp_subagent with path "/usr/bin/process_checker snmp snmp_subagent python3.6 -m sonic_ax_impl" if status != 0 for 5 times within 5 cycles then alert From 55237eea116d2133eec7670153215c859459d0ae Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 15 Sep 2020 18:32:31 -0700 Subject: [PATCH 03/12] [Monit] Change the format of process checking in Monit configuration files. 
Signed-off-by: Yong Zhao --- .../base_image_files/monit_database | 2 +- .../docker-fpm-frr/base_image_files/monit_bgp | 10 ++++----- .../docker-lldp/base_image_files/monit_lldp | 6 ++--- .../base_image_files/monit_swss | 22 +++++++++---------- .../docker-sflow/base_image_files/monit_sflow | 2 +- .../docker-snmp/base_image_files/monit_snmp | 4 ++-- .../base_image_files/monit_restapi | 2 +- .../base_image_files/monit_telemetry | 4 ++-- .../docker-teamd/base_image_files/monit_teamd | 11 ++++++++++ .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 4 ++-- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 4 ++-- 18 files changed, 48 insertions(+), 37 deletions(-) create mode 100644 dockers/docker-teamd/base_image_files/monit_teamd diff --git a/dockers/docker-database/base_image_files/monit_database b/dockers/docker-database/base_image_files/monit_database index 5b0d99746aed..fc33bba0e744 100644 --- a/dockers/docker-database/base_image_files/monit_database +++ b/dockers/docker-database/base_image_files/monit_database @@ -3,5 +3,5 @@ ## process list: ## redis_server ############################################################################### -check program container_process_redis_server with path "/usr/bin/process_checker database redis-server /usr/bin/redis-server" +check program database|redis_server with path "/usr/bin/process_checker database redis-server /usr/bin/redis-server" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp index 6fa6b723ad09..b2b0de9ab916 100644 --- a/dockers/docker-fpm-frr/base_image_files/monit_bgp +++ b/dockers/docker-fpm-frr/base_image_files/monit_bgp @@ -7,17 +7,17 @@ ## staticd 
## bgpcfgd ############################################################################### -check program container_process_zebra with path "/usr/bin/process_checker bgp zebra /usr/lib/frr/zebra" +check program bgp|zebra with path "/usr/bin/process_checker bgp zebra /usr/lib/frr/zebra" if status != 0 for 5 times within 5 cycles then alert -check program container_process_fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd fpmsyncd" +check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd fpmsyncd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_bgpd with path "/usr/bin/process_checker bgp bgpd /usr/lib/frr/bgpd" +check program bgp|bgpd with path "/usr/bin/process_checker bgp bgpd /usr/lib/frr/bgpd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_staticd with path "/usr/bin/process_checker bgp staticd /usr/lib/frr/staticd" +check program bgp|staticd with path "/usr/bin/process_checker bgp staticd /usr/lib/frr/staticd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_bgpcfgd with path "/usr/bin/process_checker bgp bgpcfgd python /usr/local/bin/bgpcfgd" +check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp bgpcfgd python /usr/local/bin/bgpcfgd" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-lldp/base_image_files/monit_lldp b/dockers/docker-lldp/base_image_files/monit_lldp index 1039d1e115fb..fc91f91bfd1b 100644 --- a/dockers/docker-lldp/base_image_files/monit_lldp +++ b/dockers/docker-lldp/base_image_files/monit_lldp @@ -5,11 +5,11 @@ ## lldp-syncd ## lldpmgrd ############################################################################### -check program container_process_lldpd_monitor with path "/usr/bin/process_checker lldp lldpd lldpd:" +check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd lldpd:" if status != 0 for 5 times within 5 cycles then alert 
-check program container_process_lldp_syncd with path "/usr/bin/process_checker lldp lldp_syncd python2 -m lldp_syncd" +check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp lldp_syncd python2 -m lldp_syncd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_lldpmgrd with path "/usr/bin/process_checker lldp lldpmgrd python /usr/bin/lldpmgrd" +check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp lldpmgrd python /usr/bin/lldpmgrd" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-orchagent/base_image_files/monit_swss b/dockers/docker-orchagent/base_image_files/monit_swss index e32b1196bda1..1151b01995a7 100644 --- a/dockers/docker-orchagent/base_image_files/monit_swss +++ b/dockers/docker-orchagent/base_image_files/monit_swss @@ -11,33 +11,33 @@ ## buffermgrd ## nbrmgrd ## vxlanmgrd -############################################################################### -check program container_process_orchagent with path "/usr/bin/process_checker swss orchagent /usr/bin/orchagent -d /var/log/swss" +############################################################################## +check program swss|orchagent with path "/usr/bin/process_checker swss orchagent /usr/bin/orchagent -d /var/log/swss" if status != 0 for 5 times within 5 cycles then alert -check program container_process_portsyncd with path "/usr/bin/process_checker swss portsyncd /usr/bin/portsyncd" +check program swss|portsyncd with path "/usr/bin/process_checker swss portsyncd /usr/bin/portsyncd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_neighsyncd with path "/usr/bin/process_checker swss neighsyncd /usr/bin/neighsyncd" +check program swss|neighsyncd with path "/usr/bin/process_checker swss neighsyncd /usr/bin/neighsyncd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_vrfmgrd with path "/usr/bin/process_checker swss vrfmgrd 
/usr/bin/vrfmgrd" +check program swss|vrfmgrd with path "/usr/bin/process_checker swss vrfmgrd /usr/bin/vrfmgrd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_vlanmgrd with path "/usr/bin/process_checker swss vlanmgrd /usr/bin/vlanmgrd" +check program swss|vlanmgrd with path "/usr/bin/process_checker swss vlanmgrd /usr/bin/vlanmgrd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_intfmgrd with path "/usr/bin/process_checker swss intfmgrd /usr/bin/intfmgrd" +check program swss|intfmgrd with path "/usr/bin/process_checker swss intfmgrd /usr/bin/intfmgrd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_portmgrd with path "/usr/bin/process_checker swss portmgrd /usr/bin/portmgrd" +check program swss|portmgrd with path "/usr/bin/process_checker swss portmgrd /usr/bin/portmgrd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_buffermgrd with path "/usr/bin/process_checker swss buffermgrd /usr/bin/buffermgrd -l" +check program swss|buffermgrd with path "/usr/bin/process_checker swss buffermgrd /usr/bin/buffermgrd -l" if status != 0 for 5 times within 5 cycles then alert -check program container_process_nbrmgrd with path "/usr/bin/process_checker swss nbrmgrd /usr/bin/nbrmgrd" +check program swss|nbrmgrd with path "/usr/bin/process_checker swss nbrmgrd /usr/bin/nbrmgrd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_vxlanmgrd with path "/usr/bin/process_checker swss vxlanmgrd /usr/bin/vxlanmgrd" +check program swss|vxlanmgrd with path "/usr/bin/process_checker swss vxlanmgrd /usr/bin/vxlanmgrd" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sflow/base_image_files/monit_sflow b/dockers/docker-sflow/base_image_files/monit_sflow index 44f3159aa773..765632f54c58 100644 --- a/dockers/docker-sflow/base_image_files/monit_sflow +++ 
b/dockers/docker-sflow/base_image_files/monit_sflow @@ -3,5 +3,5 @@ ## process list: ## sflowmgrd ############################################################################### -check program container_process_sflowmgrd with path "/usr/bin/process_checker sflow sflowmgrd /usr/bin/sflowmgrd" +check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow sflowmgrd /usr/bin/sflowmgrd" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-snmp/base_image_files/monit_snmp b/dockers/docker-snmp/base_image_files/monit_snmp index f1bc0c612f39..e018c75af39a 100644 --- a/dockers/docker-snmp/base_image_files/monit_snmp +++ b/dockers/docker-snmp/base_image_files/monit_snmp @@ -4,8 +4,8 @@ ## snmpd ## snmpd_subagent ############################################################################### -check program container_process_snmpd with path "/usr/bin/process_checker snmp snmpd /usr/sbin/snmpd" +check program snmp|snmpd with path "/usr/bin/process_checker snmp snmpd /usr/sbin/snmpd" if status != 0 for 5 times within 5 cycles then alert -check program container_process_snmp_subagent with path "/usr/bin/process_checker snmp snmp_subagent python3.6 -m sonic_ax_impl" +check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp snmp_subagent python3.6 -m sonic_ax_impl" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-restapi/base_image_files/monit_restapi b/dockers/docker-sonic-restapi/base_image_files/monit_restapi index 7ef05080c938..0062851fa9cc 100644 --- a/dockers/docker-sonic-restapi/base_image_files/monit_restapi +++ b/dockers/docker-sonic-restapi/base_image_files/monit_restapi @@ -3,5 +3,5 @@ ## process list: ## restapi ############################################################################### -check program container_process_restapi with path "/usr/bin/process_checker restapi restapi /usr/sbin/go-server-server" +check program restapi|restapi with path "/usr/bin/process_checker 
restapi restapi /usr/sbin/go-server-server" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry index 5db332a90ffb..75a9699717c9 100644 --- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry +++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry @@ -4,8 +4,8 @@ ## telemetry ## dialout_client ############################################################################### -check program container_process_telemetry with path "/usr/bin/process_checker telemetry telemetry /usr/sbin/telemetry" +check program telemetry|telemetry with path "/usr/bin/process_checker telemetry telemetry /usr/sbin/telemetry" if status != 0 for 5 times within 5 cycles then alert -check program container_process_dialout_client with path "/usr/bin/process_checker telemetry dialout_client /usr/sbin/dialout_client_cli" +check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry dialout_client /usr/sbin/dialout_client_cli" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-teamd/base_image_files/monit_teamd b/dockers/docker-teamd/base_image_files/monit_teamd new file mode 100644 index 000000000000..34b4d4b33897 --- /dev/null +++ b/dockers/docker-teamd/base_image_files/monit_teamd @@ -0,0 +1,11 @@ +############################################################################### +## Monit configuration for teamd container +## process list: +## teamsyncd +## teammgrd +############################################################################### +check program teamd|teamsyncd with path "/usr/bin/process_checker teamd teamsyncd /usr/bin/teamsyncd" + if status != 0 for 5 times within 5 cycles then alert + +check program teamd|teammgrd with path "/usr/bin/process_checker teamd teammgrd /usr/bin/teammgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git 
a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd index b207e6116fd1..b9bd4414ac1b 100644 --- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd +++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd index d9857a432bce..f5159c2fa0fc 100644 --- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd +++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert -check program container_process_dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" +check program syncd|dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd index b207e6116fd1..b9bd4414ac1b 100644 --- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd +++ 
b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd index b207e6116fd1..b9bd4414ac1b 100644 --- a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd +++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd index b207e6116fd1..b9bd4414ac1b 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd 
b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd index b207e6116fd1..b9bd4414ac1b 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd index b207e6116fd1..b9bd4414ac1b 100644 --- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd index b207e6116fd1..b9bd4414ac1b 100644 --- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd +++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd 
/usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd index d9857a432bce..f5159c2fa0fc 100644 --- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd +++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" if status != 0 for 5 times within 5 cycles then alert -check program container_process_dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" +check program syncd|dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert From a671f45e3f48339e15b7780d2f9d73cb4f1010c3 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 15 Sep 2020 18:36:24 -0700 Subject: [PATCH 04/12] [Monit] Add copying the Monit configuration file of teamd container to host under /etc/monit/conf.d in docker-teamd.mk. 
Signed-off-by: Yong Zhao --- rules/docker-teamd.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/rules/docker-teamd.mk b/rules/docker-teamd.mk index c0fe6bfb6bb1..5442d5bf6b3f 100644 --- a/rules/docker-teamd.mk +++ b/rules/docker-teamd.mk @@ -27,4 +27,5 @@ $(DOCKER_TEAMD)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro $(DOCKER_TEAMD)_RUN_OPT += -v /host/warmboot:/var/warmboot $(DOCKER_TEAMD)_BASE_IMAGE_FILES += teamdctl:/usr/bin/teamdctl +$(DOCKER_TEAMD)_BASE_IMAGE_FILES += monit_teamd:/etc/monit/conf.d $(DOCKER_TEAMD)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) From 9ecb61de4a4b236bfc8eb00ae1b1cb13474a90fe Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 15 Sep 2020 22:30:59 -0700 Subject: [PATCH 05/12] [Monit] Fix the typo. Signed-off-by: Yong Zhao --- files/image_config/monit/process_checker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker index 5de8addfe6d8..5907f6e708c0 100755 --- a/files/image_config/monit/process_checker +++ b/files/image_config/monit/process_checker @@ -44,7 +44,7 @@ def check_process_existence(container_name, process_name, process_cmdline): print("'{}' is not running.".format(process_name)) sys.exit(1) else: - syslog.syslog(syslog.LOG_ERR, "contianer '{}' is not included in SONiC image or the given container name is invalid!" + syslog.syslog(syslog.LOG_ERR, "container '{}' is not included in SONiC image or the given container name is invalid!" .format(container_name)) From e4c08242ae80c93f7e0b70b3be8c430e3d9d996f Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 18 Sep 2020 12:40:34 -0700 Subject: [PATCH 06/12] [Monit] Use the string '/usr/bin/syncd' in Monit configuration files to match the syncd process. 
Signed-off-by: Yong Zhao --- files/image_config/monit/process_checker | 2 +- platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd | 2 +- .../broadcom/docker-syncd-brcm/base_image_files/monit_syncd | 2 +- platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd | 2 +- .../centec/docker-syncd-centec/base_image_files/monit_syncd | 2 +- .../docker-syncd-mrvl/base_image_files/monit_syncd | 2 +- .../docker-syncd-mrvl/base_image_files/monit_syncd | 2 +- platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd | 2 +- .../mellanox/docker-syncd-mlnx/base_image_files/monit_syncd | 2 +- .../nephos/docker-syncd-nephos/base_image_files/monit_syncd | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker index 5907f6e708c0..25ffdea08582 100755 --- a/files/image_config/monit/process_checker +++ b/files/image_config/monit/process_checker @@ -35,7 +35,7 @@ def check_process_existence(container_name, process_name, process_cmdline): if process.name() == "process_checker": continue - if ((process_name == process.name() or process_cmdline in ' '.join(process.cmdline())) + if ((process_name == process.name() or (' '.join(process.cmdline())).startswith(process_cmdline)) and process.status() in ["running", "sleeping"]): is_running = True break diff --git a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd index b9bd4414ac1b..8f4e1eb0cc57 100644 --- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd +++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if 
status != 0 for 5 times within 5 cycles then alert diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd index f5159c2fa0fc..363850b2f366 100644 --- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd +++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd @@ -4,7 +4,7 @@ ## syncd ## dsserve ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert check program syncd|dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd index b9bd4414ac1b..8f4e1eb0cc57 100644 --- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd +++ b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd index b9bd4414ac1b..8f4e1eb0cc57 100644 --- a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd +++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program 
syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd index b9bd4414ac1b..8f4e1eb0cc57 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd index b9bd4414ac1b..8f4e1eb0cc57 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd index b9bd4414ac1b..8f4e1eb0cc57 100644 --- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## 
process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd index b9bd4414ac1b..8f4e1eb0cc57 100644 --- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd +++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd index f5159c2fa0fc..363850b2f366 100644 --- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd +++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd @@ -4,7 +4,7 @@ ## syncd ## dsserve ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" +check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert check program syncd|dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" From fe167c405a106055aa82b34171f0a7283b2fcd01 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 18 Sep 2020 15:44:00 -0700 Subject: [PATCH 07/12] [Monit] 
Process name is no longer passed as a parameter to the script process_checker. Signed-off-by: Yong Zhao --- .../base_image_files/monit_database | 2 +- .../docker-fpm-frr/base_image_files/monit_bgp | 10 +++++----- .../docker-lldp/base_image_files/monit_lldp | 6 +++--- .../base_image_files/monit_swss | 20 +++++++++---------- .../docker-sflow/base_image_files/monit_sflow | 2 +- .../docker-snmp/base_image_files/monit_snmp | 4 ++-- .../base_image_files/monit_restapi | 2 +- .../base_image_files/monit_telemetry | 4 ++-- .../docker-teamd/base_image_files/monit_teamd | 4 ++-- files/image_config/monit/process_checker | 15 +++----------- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 4 ++-- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 2 +- .../base_image_files/monit_syncd | 4 ++-- 19 files changed, 41 insertions(+), 50 deletions(-) diff --git a/dockers/docker-database/base_image_files/monit_database b/dockers/docker-database/base_image_files/monit_database index fc33bba0e744..c1addd8a6f05 100644 --- a/dockers/docker-database/base_image_files/monit_database +++ b/dockers/docker-database/base_image_files/monit_database @@ -3,5 +3,5 @@ ## process list: ## redis_server ############################################################################### -check program database|redis_server with path "/usr/bin/process_checker database redis-server /usr/bin/redis-server" +check program database|redis_server with path "/usr/bin/process_checker database /usr/bin/redis-server" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp index b2b0de9ab916..2be4232dda6e 100644 --- a/dockers/docker-fpm-frr/base_image_files/monit_bgp +++
b/dockers/docker-fpm-frr/base_image_files/monit_bgp @@ -7,17 +7,17 @@ ## staticd ## bgpcfgd ############################################################################### -check program bgp|zebra with path "/usr/bin/process_checker bgp zebra /usr/lib/frr/zebra" +check program bgp|zebra with path "/usr/bin/process_checker bgp /usr/lib/frr/zebra" if status != 0 for 5 times within 5 cycles then alert -check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd fpmsyncd" +check program bgp|fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd" if status != 0 for 5 times within 5 cycles then alert -check program bgp|bgpd with path "/usr/bin/process_checker bgp bgpd /usr/lib/frr/bgpd" +check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd" if status != 0 for 5 times within 5 cycles then alert -check program bgp|staticd with path "/usr/bin/process_checker bgp staticd /usr/lib/frr/staticd" +check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd" if status != 0 for 5 times within 5 cycles then alert -check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp bgpcfgd python /usr/local/bin/bgpcfgd" +check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp python /usr/local/bin/bgpcfgd" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-lldp/base_image_files/monit_lldp b/dockers/docker-lldp/base_image_files/monit_lldp index fc91f91bfd1b..194fa14a3088 100644 --- a/dockers/docker-lldp/base_image_files/monit_lldp +++ b/dockers/docker-lldp/base_image_files/monit_lldp @@ -5,11 +5,11 @@ ## lldp-syncd ## lldpmgrd ############################################################################### -check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd lldpd:" +check program lldp|lldpd_monitor with path "/usr/bin/process_checker lldp lldpd:" if status != 0 for 5 times within 5 cycles then alert -check program lldp|lldp_syncd with path 
"/usr/bin/process_checker lldp lldp_syncd python2 -m lldp_syncd" +check program lldp|lldp_syncd with path "/usr/bin/process_checker lldp python2 -m lldp_syncd" if status != 0 for 5 times within 5 cycles then alert -check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp lldpmgrd python /usr/bin/lldpmgrd" +check program lldp|lldpmgrd with path "/usr/bin/process_checker lldp python /usr/bin/lldpmgrd" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-orchagent/base_image_files/monit_swss b/dockers/docker-orchagent/base_image_files/monit_swss index 1151b01995a7..f5f4389f3fe4 100644 --- a/dockers/docker-orchagent/base_image_files/monit_swss +++ b/dockers/docker-orchagent/base_image_files/monit_swss @@ -12,32 +12,32 @@ ## nbrmgrd ## vxlanmgrd ############################################################################## -check program swss|orchagent with path "/usr/bin/process_checker swss orchagent /usr/bin/orchagent -d /var/log/swss" +check program swss|orchagent with path "/usr/bin/process_checker swss /usr/bin/orchagent -d /var/log/swss" if status != 0 for 5 times within 5 cycles then alert -check program swss|portsyncd with path "/usr/bin/process_checker swss portsyncd /usr/bin/portsyncd" +check program swss|portsyncd with path "/usr/bin/process_checker swss /usr/bin/portsyncd" if status != 0 for 5 times within 5 cycles then alert -check program swss|neighsyncd with path "/usr/bin/process_checker swss neighsyncd /usr/bin/neighsyncd" +check program swss|neighsyncd with path "/usr/bin/process_checker swss /usr/bin/neighsyncd" if status != 0 for 5 times within 5 cycles then alert -check program swss|vrfmgrd with path "/usr/bin/process_checker swss vrfmgrd /usr/bin/vrfmgrd" +check program swss|vrfmgrd with path "/usr/bin/process_checker swss /usr/bin/vrfmgrd" if status != 0 for 5 times within 5 cycles then alert -check program swss|vlanmgrd with path "/usr/bin/process_checker swss vlanmgrd /usr/bin/vlanmgrd" +check program 
swss|vlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vlanmgrd" if status != 0 for 5 times within 5 cycles then alert -check program swss|intfmgrd with path "/usr/bin/process_checker swss intfmgrd /usr/bin/intfmgrd" +check program swss|intfmgrd with path "/usr/bin/process_checker swss /usr/bin/intfmgrd" if status != 0 for 5 times within 5 cycles then alert -check program swss|portmgrd with path "/usr/bin/process_checker swss portmgrd /usr/bin/portmgrd" +check program swss|portmgrd with path "/usr/bin/process_checker swss /usr/bin/portmgrd" if status != 0 for 5 times within 5 cycles then alert -check program swss|buffermgrd with path "/usr/bin/process_checker swss buffermgrd /usr/bin/buffermgrd -l" +check program swss|buffermgrd with path "/usr/bin/process_checker swss /usr/bin/buffermgrd -l" if status != 0 for 5 times within 5 cycles then alert -check program swss|nbrmgrd with path "/usr/bin/process_checker swss nbrmgrd /usr/bin/nbrmgrd" +check program swss|nbrmgrd with path "/usr/bin/process_checker swss /usr/bin/nbrmgrd" if status != 0 for 5 times within 5 cycles then alert -check program swss|vxlanmgrd with path "/usr/bin/process_checker swss vxlanmgrd /usr/bin/vxlanmgrd" +check program swss|vxlanmgrd with path "/usr/bin/process_checker swss /usr/bin/vxlanmgrd" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sflow/base_image_files/monit_sflow b/dockers/docker-sflow/base_image_files/monit_sflow index 765632f54c58..217f2e625835 100644 --- a/dockers/docker-sflow/base_image_files/monit_sflow +++ b/dockers/docker-sflow/base_image_files/monit_sflow @@ -3,5 +3,5 @@ ## process list: ## sflowmgrd ############################################################################### -check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow sflowmgrd /usr/bin/sflowmgrd" +check program sflow|sflowmgrd with path "/usr/bin/process_checker sflow /usr/bin/sflowmgrd" if status != 0 for 5 times within 5 cycles then alert diff 
--git a/dockers/docker-snmp/base_image_files/monit_snmp b/dockers/docker-snmp/base_image_files/monit_snmp index e018c75af39a..a943985abcef 100644 --- a/dockers/docker-snmp/base_image_files/monit_snmp +++ b/dockers/docker-snmp/base_image_files/monit_snmp @@ -4,8 +4,8 @@ ## snmpd ## snmpd_subagent ############################################################################### -check program snmp|snmpd with path "/usr/bin/process_checker snmp snmpd /usr/sbin/snmpd" +check program snmp|snmpd with path "/usr/bin/process_checker snmp /usr/sbin/snmpd" if status != 0 for 5 times within 5 cycles then alert -check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp snmp_subagent python3.6 -m sonic_ax_impl" +check program snmp|snmp_subagent with path "/usr/bin/process_checker snmp python3.6 -m sonic_ax_impl" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-restapi/base_image_files/monit_restapi b/dockers/docker-sonic-restapi/base_image_files/monit_restapi index 0062851fa9cc..84e4366f4ac5 100644 --- a/dockers/docker-sonic-restapi/base_image_files/monit_restapi +++ b/dockers/docker-sonic-restapi/base_image_files/monit_restapi @@ -3,5 +3,5 @@ ## process list: ## restapi ############################################################################### -check program restapi|restapi with path "/usr/bin/process_checker restapi restapi /usr/sbin/go-server-server" +check program restapi|restapi with path "/usr/bin/process_checker restapi /usr/sbin/go-server-server" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry index 75a9699717c9..7365ce51d1fd 100644 --- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry +++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry @@ -4,8 +4,8 @@ ## telemetry ## dialout_client 
############################################################################### -check program telemetry|telemetry with path "/usr/bin/process_checker telemetry telemetry /usr/sbin/telemetry" +check program telemetry|telemetry with path "/usr/bin/process_checker telemetry /usr/sbin/telemetry" if status != 0 for 5 times within 5 cycles then alert -check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry dialout_client /usr/sbin/dialout_client_cli" +check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli" if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-teamd/base_image_files/monit_teamd b/dockers/docker-teamd/base_image_files/monit_teamd index 34b4d4b33897..256482aef2bf 100644 --- a/dockers/docker-teamd/base_image_files/monit_teamd +++ b/dockers/docker-teamd/base_image_files/monit_teamd @@ -4,8 +4,8 @@ ## teamsyncd ## teammgrd ############################################################################### -check program teamd|teamsyncd with path "/usr/bin/process_checker teamd teamsyncd /usr/bin/teamsyncd" +check program teamd|teamsyncd with path "/usr/bin/process_checker teamd /usr/bin/teamsyncd" if status != 0 for 5 times within 5 cycles then alert -check program teamd|teammgrd with path "/usr/bin/process_checker teamd teammgrd /usr/bin/teammgrd" +check program teamd|teammgrd with path "/usr/bin/process_checker teamd /usr/bin/teammgrd" if status != 0 for 5 times within 5 cycles then alert diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker index 25ffdea08582..2d9949ab6a6a 100755 --- a/files/image_config/monit/process_checker +++ b/files/image_config/monit/process_checker @@ -28,15 +28,7 @@ def check_process_existence(container_name, process_name, process_cmdline): # state, then it will be marked as 'running'. 
is_running = False for process in psutil.process_iter(["name", "cmdline", "status"]): - # The script process_checker has the command line format '/usr/bin/process_checker - # <container_name> <process_name> <process_cmdline>' such as '/usr/bin/process_checker bgp fpmsyncd fpmsyncd'. So - # when using psutil to search process 'fpmsyncd', we should skip the process which ran process_checker - # since it is not 'fpmsyncd' process although 'fpmsyncd' is a sustring of its cmdline. - if process.name() == "process_checker": - continue - - if ((process_name == process.name() or (' '.join(process.cmdline())).startswith(process_cmdline)) - and process.status() in ["running", "sleeping"]): + if ((' '.join(process.cmdline())).startswith(process_cmdline) and process.status() in ["running", "sleeping"]): is_running = True break @@ -51,13 +43,12 @@ def check_process_existence(container_name, process_name, process_cmdline): def main(): parser = argparse.ArgumentParser(description="Check whether the process in the specified \ container is running and an alerting message will be written into syslog if it \ - failed to run.", usage="/usr/bin/process_checker <container_name> <process_name> <process_cmdline>") + failed to run.", usage="/usr/bin/process_checker <container_name> <process_cmdline>") parser.add_argument("container_name", help="container name") - parser.add_argument("process_name", help="process name") parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process name") args = parser.parse_args() - check_process_existence(args.container_name, args.process_name, ' '.join(args.process_cmdline)) + check_process_existence(args.container_name, ' '.join(args.process_cmdline)) if __name__ == '__main__': diff --git a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd index 8f4e1eb0cc57..14789c67c3b8 100644 --- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd +++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd
############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd index 363850b2f366..119548770096 100644 --- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd +++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert -check program syncd|dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" +check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd index 8f4e1eb0cc57..14789c67c3b8 100644 --- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd +++ b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git 
a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd index 8f4e1eb0cc57..14789c67c3b8 100644 --- a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd +++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd index 8f4e1eb0cc57..14789c67c3b8 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd index 8f4e1eb0cc57..14789c67c3b8 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path "/usr/bin/process_checker syncd 
/usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd index 8f4e1eb0cc57..14789c67c3b8 100644 --- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd index 8f4e1eb0cc57..14789c67c3b8 100644 --- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd +++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path "/usr/bin/process_checker syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd index 363850b2f366..119548770096 100644 --- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd +++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check program syncd|syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd" +check program syncd|syncd with path 
"/usr/bin/process_checker syncd /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert -check program syncd|dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" +check program syncd|dsserve with path "/usr/bin/process_checker syncd /usr/bin/dsserve /usr/bin/syncd" if status != 0 for 5 times within 5 cycles then alert From b576e19c1802363d18bcfe5a3b8390568f28e0f7 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 18 Sep 2020 16:04:24 -0700 Subject: [PATCH 08/12] [Monit] Install the psutil module. Signed-off-by: Yong Zhao --- dockers/docker-fpm-frr/base_image_files/monit_bgp | 2 +- files/build_templates/sonic_debian_extension.j2 | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp index 2be4232dda6e..dba5d1667262 100644 --- a/dockers/docker-fpm-frr/base_image_files/monit_bgp +++ b/dockers/docker-fpm-frr/base_image_files/monit_bgp @@ -19,5 +19,5 @@ check program bgp|bgpd with path "/usr/bin/process_checker bgp /usr/lib/frr/bgpd check program bgp|staticd with path "/usr/bin/process_checker bgp /usr/lib/frr/staticd" if status != 0 for 5 times within 5 cycles then alert -check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp python /usr/local/bin/bgpcfgd" +check program bgp|bgpcfgd with path "/usr/bin/process_checker bgp /usr/bin/python /usr/local/bin/bgpcfgd" if status != 0 for 5 times within 5 cycles then alert diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index 7d18c1c3ac23..fc0dda3d055a 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -116,6 +116,9 @@ sudo rm -rf $FILESYSTEM_ROOT/$REDIS_DUMP_LOAD_PY2_WHEEL_NAME # Install Python module for ipaddress sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install ipaddress +# Install Python module 
for psutil +sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install psutil + # Install SwSS SDK Python 3 package # Note: the scripts will be overwritten by corresponding Python 2 package if [ -e {{swsssdk_py3_wheel_path}} ]; then From b0c86e864f71ed7d6d7771817360150ba28edad6 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 21 Sep 2020 11:34:50 -0700 Subject: [PATCH 09/12] [Monit] Use process command line to signify the process is not running instead of process name in syslog. Signed-off-by: Yong Zhao --- files/image_config/monit/process_checker | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker index 2d9949ab6a6a..306ed44e3c7f 100755 --- a/files/image_config/monit/process_checker +++ b/files/image_config/monit/process_checker @@ -7,7 +7,7 @@ import psutil import swsssdk -def check_process_existence(container_name, process_name, process_cmdline): +def check_process_existence(container_name, process_cmdline): """ @summary: Check whether the process in the specified container is running or not and an alerting message will written into syslog if it failed to run. @@ -27,13 +27,13 @@ def check_process_existence(container_name, process_name, process_cmdline): # If the process entity is found in process tree and it is also in the 'running' or 'sleeping' # state, then it will be marked as 'running'. is_running = False - for process in psutil.process_iter(["name", "cmdline", "status"]): + for process in psutil.process_iter(["cmdline", "status"]): if ((' '.join(process.cmdline())).startswith(process_cmdline) and process.status() in ["running", "sleeping"]): is_running = True break if not is_running: - print("'{}' is not running.".format(process_name)) + print("'{}' is not running.".format(process_cmdline)) sys.exit(1) else: syslog.syslog(syslog.LOG_ERR, "container '{}' is not included in SONiC image or the given container name is invalid!" 
@@ -45,7 +45,7 @@ def main(): container is running and an alerting message will be written into syslog if it \ failed to run.", usage="/usr/bin/process_checker <container_name> <process_cmdline>") parser.add_argument("container_name", help="container name") - parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process name") + parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process command line") args = parser.parse_args() check_process_existence(args.container_name, ' '.join(args.process_cmdline)) From 1a4e481f8e5c623967cff2925bd4cff23c13cd79 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 21 Sep 2020 11:41:02 -0700 Subject: [PATCH 10/12] [Monit] Delete a blank line. Signed-off-by: Yong Zhao --- files/build_templates/sonic_debian_extension.j2 | 1 - 1 file changed, 1 deletion(-) diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index fc0dda3d055a..2ea3a25b9ae0 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -234,7 +234,6 @@ sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/* sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/ sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker - # Copy crontabs sudo cp -f $IMAGE_CONFIGS/cron.d/* $FILESYSTEM_ROOT/etc/cron.d/ From 875a2a54561c37756eb16c42fe6986257dff8d76 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 23 Sep 2020 09:54:42 -0700 Subject: [PATCH 11/12] [Monit] Add a comment for the `print(...)` statement in the script process_checker.
Signed-off-by: Yong Zhao --- files/image_config/monit/process_checker | 2 ++ 1 file changed, 2 insertions(+) diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker index 306ed44e3c7f..7ee3f868807d 100755 --- a/files/image_config/monit/process_checker +++ b/files/image_config/monit/process_checker @@ -33,6 +33,8 @@ def check_process_existence(container_name, process_cmdline): break if not is_running: + # If this script is run by Monit, then the following output will be appneded to + # Monit's syslog message. print("'{}' is not running.".format(process_cmdline)) sys.exit(1) else: From 4fd77ccf05e80bd9ab34b65652601a93d9a67721 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 23 Sep 2020 10:13:03 -0700 Subject: [PATCH 12/12] [Monit] Fix a typo. Signed-off-by: Yong Zhao --- files/image_config/monit/process_checker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker index 7ee3f868807d..ba48e37729aa 100755 --- a/files/image_config/monit/process_checker +++ b/files/image_config/monit/process_checker @@ -33,7 +33,7 @@ def check_process_existence(container_name, process_cmdline): break if not is_running: - # If this script is run by Monit, then the following output will be appneded to + # If this script is run by Monit, then the following output will be appended to # Monit's syslog message. print("'{}' is not running.".format(process_cmdline)) sys.exit(1)