From ad64dc6b8a227081a7491e62b0523263206af0db Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 11 Aug 2020 22:28:46 +0000 Subject: [PATCH] [monit] Unmonitor the processes in containers which are disabled. Signed-off-by: Yong Zhao --- .../base_image_files/monit_database | 4 +- .../docker-fpm-frr/base_image_files/monit_bgp | 20 +++--- .../docker-lldp/base_image_files/monit_lldp | 12 ++-- .../base_image_files/monit_swss | 40 ++++++------ .../docker-sflow/base_image_files/monit_sflow | 4 +- .../docker-snmp/base_image_files/monit_snmp | 8 +-- .../base_image_files/monit_restapi | 4 +- .../base_image_files/monit_telemetry | 8 +-- .../build_templates/sonic_debian_extension.j2 | 3 + files/image_config/monit/process_checker | 64 +++++++++++++++++++ .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 8 +-- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 4 +- .../base_image_files/monit_syncd | 8 +-- 19 files changed, 139 insertions(+), 72 deletions(-) create mode 100755 files/image_config/monit/process_checker diff --git a/dockers/docker-database/base_image_files/monit_database b/dockers/docker-database/base_image_files/monit_database index c5508922864e..5b0d99746aed 100644 --- a/dockers/docker-database/base_image_files/monit_database +++ b/dockers/docker-database/base_image_files/monit_database @@ -3,5 +3,5 @@ ## process list: ## redis_server ############################################################################### -check process redis_server matching "/usr/bin/redis-server" - if does not exist for 5 times within 5 cycles then alert +check program container_process_redis_server with path "/usr/bin/process_checker database redis-server /usr/bin/redis-server" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-fpm-frr/base_image_files/monit_bgp b/dockers/docker-fpm-frr/base_image_files/monit_bgp index 5dbb794c346b..6fa6b723ad09 100644 --- a/dockers/docker-fpm-frr/base_image_files/monit_bgp +++ b/dockers/docker-fpm-frr/base_image_files/monit_bgp @@ -7,17 +7,17 @@ ## staticd ## bgpcfgd ############################################################################### -check process zebra matching "/usr/lib/frr/zebra" - if does not exist for 5 times within 5 cycles then alert +check program container_process_zebra with path "/usr/bin/process_checker bgp zebra /usr/lib/frr/zebra" + if status != 0 for 5 times within 5 cycles then alert -check process fpmsyncd matching "fpmsyncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_fpmsyncd with path "/usr/bin/process_checker bgp fpmsyncd fpmsyncd" + if status != 0 for 5 times within 5 cycles then alert -check process bgpd matching "/usr/lib/frr/bgpd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_bgpd with path "/usr/bin/process_checker bgp bgpd /usr/lib/frr/bgpd" + if status != 0 for 5 times within 5 cycles then alert -check process staticd matching "/usr/lib/frr/staticd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_staticd with path "/usr/bin/process_checker bgp staticd /usr/lib/frr/staticd" + if status != 0 for 5 times within 5 cycles then alert -check process bgpcfgd matching "python /usr/local/bin/bgpcfgd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_bgpcfgd with path "/usr/bin/process_checker bgp bgpcfgd python /usr/local/bin/bgpcfgd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-lldp/base_image_files/monit_lldp b/dockers/docker-lldp/base_image_files/monit_lldp index 200c52c7d332..1039d1e115fb 100644 --- a/dockers/docker-lldp/base_image_files/monit_lldp +++ b/dockers/docker-lldp/base_image_files/monit_lldp @@ -5,11 +5,11 @@ ## lldp-syncd ## lldpmgrd ############################################################################### -check process lldpd_monitor matching "lldpd: " - if does not exist for 5 times within 5 cycles then alert +check program container_process_lldpd_monitor with path "/usr/bin/process_checker lldp lldpd lldpd:" + if status != 0 for 5 times within 5 cycles then alert -check process lldp_syncd matching "python2 -m lldp_syncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_lldp_syncd with path "/usr/bin/process_checker lldp lldp_syncd python2 -m lldp_syncd" + if status != 0 for 5 times within 5 cycles then alert -check process lldpmgrd matching "python /usr/bin/lldpmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_lldpmgrd with path "/usr/bin/process_checker lldp lldpmgrd python /usr/bin/lldpmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-orchagent/base_image_files/monit_swss b/dockers/docker-orchagent/base_image_files/monit_swss index 5928dbd4ddb0..e32b1196bda1 100644 --- a/dockers/docker-orchagent/base_image_files/monit_swss +++ b/dockers/docker-orchagent/base_image_files/monit_swss @@ -12,32 +12,32 @@ ## nbrmgrd ## vxlanmgrd ############################################################################### -check process orchagent matching "/usr/bin/orchagent -d /var/log/swss" - if does not exist for 5 times within 5 cycles then alert +check program container_process_orchagent with path "/usr/bin/process_checker swss orchagent /usr/bin/orchagent -d /var/log/swss" + if status != 0 for 5 times within 5 cycles then alert -check process portsyncd matching "/usr/bin/portsyncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_portsyncd with path "/usr/bin/process_checker swss portsyncd /usr/bin/portsyncd" + if status != 0 for 5 times within 5 cycles then alert -check process neighsyncd matching "/usr/bin/neighsyncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_neighsyncd with path "/usr/bin/process_checker swss neighsyncd /usr/bin/neighsyncd" + if status != 0 for 5 times within 5 cycles then alert -check process vrfmgrd matching "/usr/bin/vrfmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_vrfmgrd with path "/usr/bin/process_checker swss vrfmgrd /usr/bin/vrfmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process vlanmgrd matching "/usr/bin/vlanmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_vlanmgrd with path "/usr/bin/process_checker swss vlanmgrd /usr/bin/vlanmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process intfmgrd matching "/usr/bin/intfmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_intfmgrd with path "/usr/bin/process_checker swss intfmgrd /usr/bin/intfmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process portmgrd matching "/usr/bin/portmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_portmgrd with path "/usr/bin/process_checker swss portmgrd /usr/bin/portmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process buffermgrd matching "/usr/bin/buffermgrd -l" - if does not exist for 5 times within 5 cycles then alert +check program container_process_buffermgrd with path "/usr/bin/process_checker swss buffermgrd /usr/bin/buffermgrd -l" + if status != 0 for 5 times within 5 cycles then alert -check process nbrmgrd matching "/usr/bin/nbrmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_nbrmgrd with path "/usr/bin/process_checker swss nbrmgrd /usr/bin/nbrmgrd" + if status != 0 for 5 times within 5 cycles then alert -check process vxlanmgrd matching "/usr/bin/vxlanmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_vxlanmgrd with path "/usr/bin/process_checker swss vxlanmgrd /usr/bin/vxlanmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sflow/base_image_files/monit_sflow b/dockers/docker-sflow/base_image_files/monit_sflow index d041f81001ea..44f3159aa773 100644 --- a/dockers/docker-sflow/base_image_files/monit_sflow +++ b/dockers/docker-sflow/base_image_files/monit_sflow @@ -3,5 +3,5 @@ ## process list: ## sflowmgrd ############################################################################### -check process sflowmgrd matching "/usr/bin/sflowmgrd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_sflowmgrd with path "/usr/bin/process_checker sflow sflowmgrd /usr/bin/sflowmgrd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-snmp/base_image_files/monit_snmp b/dockers/docker-snmp/base_image_files/monit_snmp index cfb1a2b66831..47689fb0b433 100644 --- a/dockers/docker-snmp/base_image_files/monit_snmp +++ b/dockers/docker-snmp/base_image_files/monit_snmp @@ -4,8 +4,8 @@ ## snmpd ## snmpd_subagent ############################################################################### -check process snmpd matching "/usr/sbin/snmpd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_snmpd with path "/usr/bin/process_checker snmp snmpd /usr/sbin/snmpd" + if status != 0 for 5 times within 5 cycles then alert -check process snmp_subagent matching "python3 -m sonic_ax_impl" - if does not exist for 5 times within 5 cycles then alert +check program container_process_snmp_subagent with path "/usr/bin/process_checker snmp snmp_subagent python3 -m sonic_ax_impl" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-restapi/base_image_files/monit_restapi b/dockers/docker-sonic-restapi/base_image_files/monit_restapi index 2e90baf30d57..7ef05080c938 100644 --- a/dockers/docker-sonic-restapi/base_image_files/monit_restapi +++ b/dockers/docker-sonic-restapi/base_image_files/monit_restapi @@ -3,5 +3,5 @@ ## process list: ## restapi ############################################################################### -check process restapi matching "/usr/sbin/go-server-server" - if does not exist for 5 times within 5 cycles then alert +check program container_process_restapi with path "/usr/bin/process_checker restapi restapi /usr/sbin/go-server-server" + if status != 0 for 5 times within 5 cycles then alert diff --git a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry index a82c652f8179..5db332a90ffb 100644 --- a/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry +++ b/dockers/docker-sonic-telemetry/base_image_files/monit_telemetry @@ -4,8 +4,8 @@ ## telemetry ## dialout_client ############################################################################### -check process telemetry matching "/usr/sbin/telemetry" - if does not exist for 5 times within 5 cycles then alert +check program container_process_telemetry with path "/usr/bin/process_checker telemetry telemetry /usr/sbin/telemetry" + if status != 0 for 5 times within 5 cycles then alert -check process dialout_client matching "/usr/sbin/dialout_client_cli" - if does not exist for 5 times within 5 cycles then alert +check program container_process_dialout_client with path "/usr/bin/process_checker telemetry dialout_client /usr/sbin/dialout_client_cli" + if status != 0 for 5 times within 5 cycles then alert diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index 19502e0320ab..7d18c1c3ac23 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -228,6 +228,9 @@ sudo cp $IMAGE_CONFIGS/monit/monitrc $FILESYSTEM_ROOT/etc/monit/ sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/monitrc sudo cp $IMAGE_CONFIGS/monit/conf.d/* $FILESYSTEM_ROOT/etc/monit/conf.d/ sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/* +sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/ +sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker + # Copy crontabs sudo cp -f $IMAGE_CONFIGS/cron.d/* $FILESYSTEM_ROOT/etc/cron.d/ diff --git a/files/image_config/monit/process_checker b/files/image_config/monit/process_checker new file mode 100755 index 000000000000..5de8addfe6d8 --- /dev/null +++ b/files/image_config/monit/process_checker @@ -0,0 +1,64 @@ +#!/usr/bin/python +import argparse +import sys +import syslog + +import psutil +import swsssdk + + +def check_process_existence(container_name, process_name, process_cmdline): + """ + @summary: Check whether the process in the specified container is running or not and + an alerting message will written into syslog if it failed to run. + """ + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + feature_table = config_db.get_table("FEATURE") + + if container_name in feature_table.keys(): + # We look into the 'FEATURE' table to verify whether the container is disabled or not. + # If the container is diabled, we exit. + if ("state" in feature_table[container_name].keys() + and feature_table[container_name]["state"] == "disabled"): + sys.exit(0) + else: + # We leveraged the psutil library to help us check whether the process is running or not. + # If the process entity is found in process tree and it is also in the 'running' or 'sleeping' + # state, then it will be marked as 'running'. + is_running = False + for process in psutil.process_iter(["name", "cmdline", "status"]): + # The script process_checker has the command line format '/usr/bin/process_checker + # ' such as '/usr/bin/process_checker bgp fpmsyncd fpmsyncd'. So + # when using psutil to search process 'fpmsyncd', we should skip the process which ran process_checker + # since it is not 'fpmsyncd' process although 'fpmsyncd' is a sustring of its cmdline. + if process.name() == "process_checker": + continue + + if ((process_name == process.name() or process_cmdline in ' '.join(process.cmdline())) + and process.status() in ["running", "sleeping"]): + is_running = True + break + + if not is_running: + print("'{}' is not running.".format(process_name)) + sys.exit(1) + else: + syslog.syslog(syslog.LOG_ERR, "contianer '{}' is not included in SONiC image or the given container name is invalid!" + .format(container_name)) + + +def main(): + parser = argparse.ArgumentParser(description="Check whether the process in the specified \ + container is running and an alerting message will be written into syslog if it \ + failed to run.", usage="/usr/bin/process_checker ") + parser.add_argument("container_name", help="container name") + parser.add_argument("process_name", help="process name") + parser.add_argument("process_cmdline", nargs=argparse.REMAINDER, help="process name") + args = parser.parse_args() + + check_process_existence(args.container_name, args.process_name, ' '.join(args.process_cmdline)) + + +if __name__ == '__main__': + main() diff --git a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd +++ b/platform/barefoot/docker-syncd-bfn/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd index 81c0b6ef6bc6..d9857a432bce 100644 --- a/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd +++ b/platform/broadcom/docker-syncd-brcm/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert -check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd +++ b/platform/cavium/docker-syncd-cavm/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/centec/docker-syncd-centec/base_image_files/monit_syncd +++ b/platform/centec/docker-syncd-centec/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-arm64/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell-armhf/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd +++ b/platform/marvell/docker-syncd-mrvl/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd index 75391f90ac32..b207e6116fd1 100644 --- a/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd +++ b/platform/mellanox/docker-syncd-mlnx/base_image_files/monit_syncd @@ -3,5 +3,5 @@ ## process list: ## syncd ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert diff --git a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd index 81c0b6ef6bc6..d9857a432bce 100644 --- a/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd +++ b/platform/nephos/docker-syncd-nephos/base_image_files/monit_syncd @@ -4,8 +4,8 @@ ## syncd ## dsserve ############################################################################### -check process syncd matching "/usr/bin/syncd\s" - if does not exist for 5 times within 5 cycles then alert +check program container_process_syncd with path "/usr/bin/process_checker syncd syncd /usr/bin/syncd --diag" + if status != 0 for 5 times within 5 cycles then alert -check process dsserve matching "/usr/bin/dsserve /usr/bin/syncd" - if does not exist for 5 times within 5 cycles then alert +check program container_process_dsserve with path "/usr/bin/process_checker syncd dsserve /usr/bin/dsserve /usr/bin/syncd" + if status != 0 for 5 times within 5 cycles then alert