From 2e8d64119dfec47b8d52a7e1ad38083e7009781f Mon Sep 17 00:00:00 2001
From: lguohan
Date: Sun, 26 Apr 2020 16:54:15 -0700
Subject: [PATCH] [sanity_checks]: add critical process check in sanity checks (#1617)

- first read the critical process list (/etc/supervisor/critical_processes) from the container, and then check whether any of the processes have crashed.
- add the snmp container to the critical service list
- add auto_recover support

Signed-off-by: Guohan Lu

* add comment

Signed-off-by: Guohan Lu

* add auto_recover support if process check failed
---
 tests/common/devices.py                        | 62 +++++++++++++++++---
 tests/common/plugins/sanity_check/checks.py    | 44 +++++++++++++
 .../common/plugins/sanity_check/constants.py   |  2 +-
 tests/common/plugins/sanity_check/recover.py   |  2 +-
 4 files changed, 102 insertions(+), 8 deletions(-)

diff --git a/tests/common/devices.py b/tests/common/devices.py
index 9a4c394d70..3ede6e9474 100644
--- a/tests/common/devices.py
+++ b/tests/common/devices.py
@@ -10,6 +10,7 @@
 import json
 import logging
 import os
+import re
 
 from multiprocessing.pool import ThreadPool
 from datetime import datetime
@@ -91,7 +92,7 @@ class SonicHost(AnsibleHostBase):
     For running ansible module on the SONiC switch
     """
 
-    CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp"]
+    CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp", "snmp"]
 
     def __init__(self, ansible_adhoc, hostname, gather_facts=False):
         AnsibleHostBase.__init__(self, ansible_adhoc, hostname)
@@ -214,6 +215,55 @@ def critical_services_fully_started(self):
         logging.debug("Status of critical services: %s" % str(result))
         return all(result.values())
 
+    def critical_process_status(self, service):
+        """
+        @summary: Check the status of the critical processes of a service.
+
+        @param service: Name of the SONiC service
+        """
+        result = {'status': True}
+        result['exited_critical_process'] = []
+        result['running_critical_process'] = []
+        critical_process_list = []
+
+        # Return a False status if the service is not fully started
+        service_status = self.is_service_fully_started(service)
+        if service_status == False:
+            result['status'] = False
+            return result
+
+        # Get the critical process list for the service
+        output = self.command("docker exec {} bash -c '[ -f /etc/supervisor/critical_processes ] && cat /etc/supervisor/critical_processes'".format(service), module_ignore_errors=True)
+        for l in output['stdout'].split():
+            critical_process_list.append(l.rstrip())
+        if len(critical_process_list) == 0:
+            return result
+
+        # Get the process status for the service
+        output = self.command("docker exec {} supervisorctl status".format(service))
+        logging.info("====== supervisor process status for service {} ======".format(service))
+
+        for l in output['stdout_lines']:
+            (pname, status, info) = re.split("\s+", l, 2)
+            if status != "RUNNING":
+                if pname in critical_process_list:
+                    result['exited_critical_process'].append(pname)
+                    result['status'] = False
+            else:
+                if pname in critical_process_list:
+                    result['running_critical_process'].append(pname)
+
+        return result
+
+    def all_critical_process_status(self):
+        """
+        @summary: Check the status of critical processes for all critical services
+        """
+        result = {}
+        for service in self.CRITICAL_SERVICES:
+            result[service] = self.critical_process_status(service)
+        return result
+
     def get_crm_resources(self):
         """
         @summary: Run the "crm show resources all" command and parse its output
@@ -257,7 +307,7 @@ def get_pmon_daemon_states(self):
 
         daemons = self.shell('docker exec pmon supervisorctl status')['stdout_lines']
 
-        daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ] 
+        daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ]
 
         daemon_ctl_key_prefix = 'skip_'
         daemon_config_file_path = os.path.join('/usr/share/sonic/device', self.facts["platform"], 'pmon_daemon_control.json')
@@ -294,7 +344,7 @@ def num_npus(self):
         return the number of NPUs on the DUT
         """
         return self.facts["num_npu"]
-    
+
     def get_syncd_docker_names(self):
         """
         @summary: get the list of syncd dockers names for the number of NPUs present on the DUT
@@ -454,10 +504,10 @@ def get_fanout_os(self):
 
     def get_fanout_type(self):
         return self.type
-    
+
     def shutdown(self, interface_name):
         return self.host.shutdown(interface_name)[self.hostname]
-    
+
     def no_shutdown(self, interface_name):
         return self.host.no_shutdown(interface_name)[self.hostname]
 
@@ -466,7 +516,7 @@ def command(self, cmd):
 
     def __str__(self):
         return "{ os: '%s', hostname: '%s', device_type: '%s' }" % (self.os, self.hostname, self.type)
-    
+
     def __repr__(self):
         return self.__str__()
 
diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py
index ba2bd687ba..0c322d2901 100644
--- a/tests/common/plugins/sanity_check/checks.py
+++ b/tests/common/plugins/sanity_check/checks.py
@@ -97,6 +97,8 @@ def check_interfaces(dut):
     return check_result
 
 def check_dbmemory(dut):
+    logger.info("Checking database memory...")
+
     total_omem = 0
     re_omem = re.compile("omem=(\d+)")
     res = dut.command("/usr/bin/redis-cli client list")
@@ -115,6 +117,46 @@
     logger.info("Done checking database memory")
     return check_result
 
+def check_processes(dut):
+    logger.info("Checking process status...")
+
+    networking_uptime = dut.get_networking_uptime().seconds
+    timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0)
+    interval = 20
+    logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \
+        (networking_uptime, timeout, interval))
+
+    check_result = {"failed": False, "check_item": "processes"}
+    if timeout == 0:    # Check processes status, do not retry.
+        processes_status = dut.all_critical_process_status()
+        check_result["processes_status"] = processes_status
+        check_result["services_status"] = {}
+        for k, v in processes_status.items():
+            if v['status'] == False or len(v['exited_critical_process']) > 0:
+                check_result['failed'] = True
+            check_result["services_status"].update({k: v['status']})
+    else:    # Retry checking processes status
+        start = time.time()
+        elapsed = 0
+        while elapsed < timeout:
+            processes_status = dut.all_critical_process_status()
+            check_result["processes_status"] = processes_status
+            check_result["services_status"] = {}
+            for k, v in processes_status.items():
+                if v['status'] == False or len(v['exited_critical_process']) > 0:
+                    check_result['failed'] = True
+                check_result["services_status"].update({k: v['status']})
+
+            if check_result["failed"]:
+                wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \
+                    (interval, int(timeout - elapsed), str(check_result["processes_status"])))
+                elapsed = time.time() - start
+            else:
+                break
+
+    logger.info("Done checking processes status.")
+    return check_result
+
 def do_checks(dut, check_items):
     results = []
     for item in check_items:
@@ -124,6 +166,8 @@
             results.append(check_interfaces(dut))
         elif item == "dbmemory":
             results.append(check_dbmemory(dut))
+        elif item == "processes":
+            results.append(check_processes(dut))
 
     return results
 
diff --git a/tests/common/plugins/sanity_check/constants.py b/tests/common/plugins/sanity_check/constants.py
index 1f8b6e5d36..fa0e4abfaa 100644
--- a/tests/common/plugins/sanity_check/constants.py
+++ b/tests/common/plugins/sanity_check/constants.py
@@ -20,4 +20,4 @@
     "adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30},
 }   # All supported recover methods
 
-SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory"]  # Supported checks
+SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes"]  # Supported checks
diff --git a/tests/common/plugins/sanity_check/recover.py b/tests/common/plugins/sanity_check/recover.py
index 98fe73754e..0abacd5818 100644
--- a/tests/common/plugins/sanity_check/recover.py
+++ b/tests/common/plugins/sanity_check/recover.py
@@ -58,7 +58,7 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time):
             logging.info("Restoring {}".format(result))
             if result['check_item'] == 'interfaces':
                 __recover_interfaces(dut, fanouthosts, result, wait_time)
-            elif result['check_item'] == 'services':
+            elif result['check_item'] in ['services', 'processes']:
                 action = __recover_services(dut, result)
             # Only allow outstanding_action be overridden when it is
             # None. In case the outstanding_action has already been
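
Editor's note: below is a minimal, standalone sketch (not part of the patch) of the parsing approach that the new critical_process_status() helper applies to `supervisorctl status` output: split each line into process name, state, and trailing info, then bucket critical processes into running vs. exited. The sample supervisorctl output, the process names, and the classify() helper are illustrative assumptions, not code from the repository.

# Standalone sketch of the supervisorctl-status parsing used by the check.
# Sample text and process names below are hypothetical.
import re

SAMPLE_SUPERVISORCTL_STATUS = """\
snmpd                            RUNNING   pid 35, uptime 0:10:31
snmp-subagent                    EXITED    Apr 26 11:53 PM
supervisor-proc-exit-listener    RUNNING   pid 1, uptime 0:10:40
"""

CRITICAL_PROCESSES = ["snmpd", "snmp-subagent"]

def classify(status_text, critical_processes):
    """Return (running, exited) critical processes from supervisorctl output."""
    running, exited = [], []
    for line in status_text.splitlines():
        if not line.strip():
            continue
        # Each line is "<name> <state> <extra info>"; split on the first two
        # whitespace runs so the trailing info stays as one field.
        pname, state, _info = re.split(r"\s+", line.strip(), 2)
        if pname not in critical_processes:
            continue
        (running if state == "RUNNING" else exited).append(pname)
    return running, exited

if __name__ == "__main__":
    running, exited = classify(SAMPLE_SUPERVISORCTL_STATUS, CRITICAL_PROCESSES)
    print("running:", running)   # ['snmpd']
    print("exited:", exited)     # ['snmp-subagent']

Any critical process that is not in state RUNNING is what the sanity check reports in exited_critical_process, which in turn marks the service (and the processes check) as failed.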