From 076b5c234321a71ce11d4d079eae801f971bc9bd Mon Sep 17 00:00:00 2001
From: Guohan Lu
Date: Wed, 8 Apr 2020 16:46:33 +0000
Subject: [PATCH 1/3] [sanity_checks]: add critical process check in sanity checks

First read the critical process list (/etc/supervisor/critical_processes)
from the container, then check whether any of the critical processes has
crashed.

Also add the snmp container to the critical service list.

Signed-off-by: Guohan Lu
---
 tests/common/devices.py                      | 62 +++++++++++++++++--
 tests/common/plugins/sanity_check/checks.py  | 40 ++++++++++++
 .../common/plugins/sanity_check/constants.py |  2 +-
 3 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/tests/common/devices.py b/tests/common/devices.py
index 9a4c394d70..848d7c7c56 100644
--- a/tests/common/devices.py
+++ b/tests/common/devices.py
@@ -10,6 +10,7 @@
 import json
 import logging
 import os
+import re
 
 from multiprocessing.pool import ThreadPool
 from datetime import datetime
@@ -91,7 +92,7 @@ class SonicHost(AnsibleHostBase):
     For running ansible module on the SONiC switch
     """
 
-    CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp"]
+    CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp", "snmp"]
 
     def __init__(self, ansible_adhoc, hostname, gather_facts=False):
         AnsibleHostBase.__init__(self, ansible_adhoc, hostname)
@@ -214,6 +215,55 @@ def critical_services_fully_started(self):
         logging.debug("Status of critical services: %s" % str(result))
         return all(result.values())
 
+    def critical_process_status(self, service):
+        """
+        @summary: Check the status of critical processes for a service.
+
+        @param service: Name of the SONiC service
+        """
+        result = {}
+        result['exited_critical_process'] = []
+        result['running_critical_process'] = []
+        critical_process_list = []
+
+        service_status = self.is_service_fully_started(service)
+        if service_status == False:
+            result['service'] = False
+            return result
+        else:
+            result['service'] = True
+
+        # get critical process list for the service
+        output = self.command("docker exec {} bash -c '[ -f /etc/supervisor/critical_processes ] && cat /etc/supervisor/critical_processes'".format(service), module_ignore_errors=True)
+        for l in output['stdout'].split():
+            critical_process_list.append(l.rstrip())
+        if len(critical_process_list) == 0:
+            return result
+
+        # get process status for the service
+        output = self.command("docker exec {} supervisorctl status".format(service))
+        logging.info("====== supervisor process status for service {} ======".format(service))
+
+        for l in output['stdout_lines']:
+            (pname, status, info) = re.split("\s+", l, 2)
+            if status != "RUNNING":
+                if pname in critical_process_list:
+                    result['exited_critical_process'].append(pname)
+            else:
+                if pname in critical_process_list:
+                    result['running_critical_process'].append(pname)
+
+        return result
+
+    def all_critical_process_status(self):
+        """
+        @summary: Check the status of critical processes for all critical services.
+        """
+        result = {}
+        for service in self.CRITICAL_SERVICES:
+            result[service] = self.critical_process_status(service)
+        return result
+
     def get_crm_resources(self):
         """
         @summary: Run the "crm show resources all" command and parse its output
@@ -257,7 +307,7 @@ def get_pmon_daemon_states(self):
 
         daemons = self.shell('docker exec pmon supervisorctl status')['stdout_lines']
 
-        daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ] 
+        daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ]
 
         daemon_ctl_key_prefix = 'skip_'
         daemon_config_file_path = os.path.join('/usr/share/sonic/device', self.facts["platform"], 'pmon_daemon_control.json')
@@ -294,7 +344,7 @@ def num_npus(self):
         return the number of NPUs on the DUT
         """
         return self.facts["num_npu"]
-    
+
     def get_syncd_docker_names(self):
         """
         @summary: get the list of syncd dockers names for the number of NPUs present on the DUT
@@ -454,10 +504,10 @@ def get_fanout_os(self):
 
     def get_fanout_type(self):
         return self.type
-    
+
     def shutdown(self, interface_name):
         return self.host.shutdown(interface_name)[self.hostname]
-    
+
     def no_shutdown(self, interface_name):
         return self.host.no_shutdown(interface_name)[self.hostname]
 
@@ -466,7 +516,7 @@ def command(self, cmd):
 
     def __str__(self):
         return "{ os: '%s', hostname: '%s', device_type: '%s' }" % (self.os, self.hostname, self.type)
-    
+
     def __repr__(self):
         return self.__str__()
 
diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py
index ba2bd687ba..272d112d63 100644
--- a/tests/common/plugins/sanity_check/checks.py
+++ b/tests/common/plugins/sanity_check/checks.py
@@ -97,6 +97,8 @@ def check_interfaces(dut):
     return check_result
 
 def check_dbmemory(dut):
+    logger.info("Checking database memory...")
+
     total_omem = 0
     re_omem = re.compile("omem=(\d+)")
     res = dut.command("/usr/bin/redis-cli client list")
@@ -115,6 +117,42 @@ def check_dbmemory(dut):
     logger.info("Done checking database memory")
     return check_result
 
+def check_processes(dut):
+    logger.info("Checking process status...")
+
+    networking_uptime = dut.get_networking_uptime().seconds
+    timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0)
+    interval = 20
+    logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \
+                (networking_uptime, timeout, interval))
+
+    check_result = {"failed": False, "check_item": "processes"}
+    if timeout == 0:    # Check processes status, do not retry.
+        processes_status = dut.all_critical_process_status()
+        check_result["processes_status"] = processes_status
+        for k, v in processes_status.items():
+            if v['service'] == False or len(v['exited_critical_process']) > 0:
+                check_result['failed'] = True
+    else:               # Retry checking processes status
+        start = time.time()
+        elapsed = 0
+        while elapsed < timeout:
+            processes_status = dut.all_critical_process_status()
+            check_result["processes_status"] = processes_status
+            for k, v in processes_status.items():
+                if v['service'] == False or len(v['exited_critical_process']) > 0:
+                    check_result['failed'] = True
+
+            if check_result["failed"]:
+                wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \
+                    (interval, int(timeout - elapsed), str(check_result["processes_status"])))
+                elapsed = time.time() - start
+            else:
+                break
+
+    logger.info("Done checking processes status.")
+    return check_result
+
 def do_checks(dut, check_items):
     results = []
     for item in check_items:
@@ -124,6 +162,8 @@ def do_checks(dut, check_items):
             results.append(check_interfaces(dut))
         elif item == "dbmemory":
             results.append(check_dbmemory(dut))
+        elif item == "processes":
+            results.append(check_processes(dut))
 
     return results
 
diff --git a/tests/common/plugins/sanity_check/constants.py b/tests/common/plugins/sanity_check/constants.py
index 1f8b6e5d36..fa0e4abfaa 100644
--- a/tests/common/plugins/sanity_check/constants.py
+++ b/tests/common/plugins/sanity_check/constants.py
@@ -20,4 +20,4 @@
     "adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30},
 }   # All supported recover methods
 
-SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory"]  # Supported checks
+SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes"]  # Supported checks

From 2f12279b4a835232c004b538cef913e56b5c49ff Mon Sep 17 00:00:00 2001
From: Guohan Lu
Date: Sun, 26 Apr 2020 21:46:13 +0000
Subject: [PATCH 2/3] add comment

Signed-off-by: Guohan Lu
---
 tests/common/devices.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/common/devices.py b/tests/common/devices.py
index 848d7c7c56..738abf5926 100644
--- a/tests/common/devices.py
+++ b/tests/common/devices.py
@@ -226,6 +226,7 @@ def critical_process_status(self, service):
         result['running_critical_process'] = []
         critical_process_list = []
 
+        # return false if the service is not started
         service_status = self.is_service_fully_started(service)
         if service_status == False:
             result['service'] = False

From 925927616a4d379d3ce1fd6caaae7390c3543ddf Mon Sep 17 00:00:00 2001
From: Guohan Lu
Date: Sun, 26 Apr 2020 23:40:56 +0000
Subject: [PATCH 3/3] add auto_recover support if process check failed

---
 tests/common/devices.py                      | 7 +++----
 tests/common/plugins/sanity_check/checks.py  | 8 ++++++--
 tests/common/plugins/sanity_check/recover.py | 2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/common/devices.py b/tests/common/devices.py
index 738abf5926..3ede6e9474 100644
--- a/tests/common/devices.py
+++ b/tests/common/devices.py
@@ -221,7 +221,7 @@ def critical_process_status(self, service):
 
         @param service: Name of the SONiC service
         """
-        result = {}
+        result = {'status': True}
         result['exited_critical_process'] = []
         result['running_critical_process'] = []
         critical_process_list = []
@@ -229,10 +229,8 @@ def critical_process_status(self, service):
         # return false if the service is not started
         service_status = self.is_service_fully_started(service)
         if service_status == False:
-            result['service'] = False
+            result['status'] = False
             return result
-        else:
-            result['service'] = True
 
         # get critical process list for the service
         output = self.command("docker exec {} bash -c '[ -f /etc/supervisor/critical_processes ] && cat /etc/supervisor/critical_processes'".format(service), module_ignore_errors=True)
@@ -250,6 +248,7 @@ def critical_process_status(self, service):
             if status != "RUNNING":
                 if pname in critical_process_list:
                     result['exited_critical_process'].append(pname)
+                    result['status'] = False
             else:
                 if pname in critical_process_list:
                     result['running_critical_process'].append(pname)

diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py
index 272d112d63..0c322d2901 100644
--- a/tests/common/plugins/sanity_check/checks.py
+++ b/tests/common/plugins/sanity_check/checks.py
@@ -130,18 +130,22 @@ def check_processes(dut):
     if timeout == 0:    # Check processes status, do not retry.
         processes_status = dut.all_critical_process_status()
         check_result["processes_status"] = processes_status
+        check_result["services_status"] = {}
         for k, v in processes_status.items():
-            if v['service'] == False or len(v['exited_critical_process']) > 0:
+            if v['status'] == False or len(v['exited_critical_process']) > 0:
                 check_result['failed'] = True
+            check_result["services_status"].update({k: v['status']})
     else:               # Retry checking processes status
         start = time.time()
         elapsed = 0
         while elapsed < timeout:
             processes_status = dut.all_critical_process_status()
             check_result["processes_status"] = processes_status
+            check_result["services_status"] = {}
             for k, v in processes_status.items():
-                if v['service'] == False or len(v['exited_critical_process']) > 0:
+                if v['status'] == False or len(v['exited_critical_process']) > 0:
                     check_result['failed'] = True
+                check_result["services_status"].update({k: v['status']})
 
             if check_result["failed"]:
                 wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \
diff --git a/tests/common/plugins/sanity_check/recover.py b/tests/common/plugins/sanity_check/recover.py
index 98fe73754e..0abacd5818 100644
--- a/tests/common/plugins/sanity_check/recover.py
+++ b/tests/common/plugins/sanity_check/recover.py
@@ -58,7 +58,7 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time):
             logging.info("Restoring {}".format(result))
             if result['check_item'] == 'interfaces':
                 __recover_interfaces(dut, fanouthosts, result, wait_time)
-            elif result['check_item'] == 'services':
+            elif result['check_item'] in ['services', 'processes']:
                 action = __recover_services(dut, result)
             # Only allow outstanding_action be overridden when it is
             # None. In case the outstanding_action has already been
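
Editor's note: the minimal sketch below illustrates the parsing step that the new
critical_process_status() method applies to "supervisorctl status" output: split each
line into name, state, and details, then bucket only the critical processes by whether
they are RUNNING, clearing the 'status' flag when one has exited (the behavior added in
PATCH 3). The sample output and critical-process list are invented for illustration; on
a real device they come from "docker exec <service> supervisorctl status" and from
/etc/supervisor/critical_processes inside the container.

import re

# Hypothetical supervisorctl output and critical-process list, for illustration only.
SAMPLE_SUPERVISORCTL_OUTPUT = """\
snmpd                            RUNNING   pid 35, uptime 1:23:45
snmp-subagent                    EXITED    Apr 26 11:02 PM
supervisor-proc-exit-listener    RUNNING   pid 1, uptime 1:23:50
"""

SAMPLE_CRITICAL_PROCESSES = ["snmpd", "snmp-subagent"]


def classify_processes(supervisorctl_output, critical_processes):
    """Bucket critical processes by whether supervisord reports them RUNNING,
    mirroring the classification done by critical_process_status() in the patch."""
    result = {'status': True,
              'exited_critical_process': [],
              'running_critical_process': []}
    for line in supervisorctl_output.splitlines():
        if not line.strip():
            continue
        # Each status line is "<name>  <state>  <details...>".
        pname, state, _info = re.split(r"\s+", line, 2)
        if pname not in critical_processes:
            continue
        if state != "RUNNING":
            result['exited_critical_process'].append(pname)
            result['status'] = False
        else:
            result['running_critical_process'].append(pname)
    return result


if __name__ == "__main__":
    # Prints: {'status': False, 'exited_critical_process': ['snmp-subagent'],
    #          'running_critical_process': ['snmpd']}
    print(classify_processes(SAMPLE_SUPERVISORCTL_OUTPUT, SAMPLE_CRITICAL_PROCESSES))

With the sample data, snmp-subagent is reported as an exited critical process; in the
sanity check this is what sets check_result['failed'] = True in check_processes() and,
after PATCH 3, routes the failure through the same recovery path as the 'services'
check in adaptive_recover().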