From 076b5c234321a71ce11d4d079eae801f971bc9bd Mon Sep 17 00:00:00 2001
From: Guohan Lu
Date: Wed, 8 Apr 2020 16:46:33 +0000
Subject: [PATCH 1/3] [sanity_checks]: add critical process check in sanity checks

First read the critical process list (/etc/supervisor/critical_processes)
from the container, then check whether any of the critical processes has
crashed.

Also add the snmp container to the critical service list.

Signed-off-by: Guohan Lu
---
 tests/common/devices.py                      | 62 +++++++++++++++++--
 tests/common/plugins/sanity_check/checks.py  | 40 ++++++++++++
 .../common/plugins/sanity_check/constants.py |  2 +-
 3 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/tests/common/devices.py b/tests/common/devices.py
index 9a4c394d70..848d7c7c56 100644
--- a/tests/common/devices.py
+++ b/tests/common/devices.py
@@ -10,6 +10,7 @@
 import json
 import logging
 import os
+import re
 
 from multiprocessing.pool import ThreadPool
 from datetime import datetime
@@ -91,7 +92,7 @@ class SonicHost(AnsibleHostBase):
     For running ansible module on the SONiC switch
     """
 
-    CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp"]
+    CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp", "snmp"]
 
     def __init__(self, ansible_adhoc, hostname, gather_facts=False):
         AnsibleHostBase.__init__(self, ansible_adhoc, hostname)
@@ -214,6 +215,55 @@ def critical_services_fully_started(self):
         logging.debug("Status of critical services: %s" % str(result))
         return all(result.values())
 
+    def critical_process_status(self, service):
+        """
+        @summary: Check the status of critical processes for a service.
+
+        @param service: Name of the SONiC service
+        """
+        result = {}
+        result['exited_critical_process'] = []
+        result['running_critical_process'] = []
+        critical_process_list = []
+
+        service_status = self.is_service_fully_started(service)
+        if service_status == False:
+            result['service'] = False
+            return result
+        else:
+            result['service'] = True
+
+        # get critical process list for the service
+        output = self.command("docker exec {} bash -c '[ -f /etc/supervisor/critical_processes ] && cat /etc/supervisor/critical_processes'".format(service), module_ignore_errors=True)
+        for l in output['stdout'].split():
+            critical_process_list.append(l.rstrip())
+        if len(critical_process_list) == 0:
+            return result
+
+        # get process status for the service
+        output = self.command("docker exec {} supervisorctl status".format(service))
+        logging.info("====== supervisor process status for service {} ======".format(service))
+
+        for l in output['stdout_lines']:
+            (pname, status, info) = re.split("\s+", l, 2)
+            if status != "RUNNING":
+                if pname in critical_process_list:
+                    result['exited_critical_process'].append(pname)
+            else:
+                if pname in critical_process_list:
+                    result['running_critical_process'].append(pname)
+
+        return result
+
+    def all_critical_process_status(self):
+        """
+        @summary: Check the status of critical processes for all critical services.
+        """
+        result = {}
+        for service in self.CRITICAL_SERVICES:
+            result[service] = self.critical_process_status(service)
+        return result
+
     def get_crm_resources(self):
         """
         @summary: Run the "crm show resources all" command and parse its output
@@ -257,7 +307,7 @@ def get_pmon_daemon_states(self):
 
         daemons = self.shell('docker exec pmon supervisorctl status')['stdout_lines']
 
-        daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ] 
+        daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ]
 
         daemon_ctl_key_prefix = 'skip_'
         daemon_config_file_path = os.path.join('/usr/share/sonic/device', self.facts["platform"], 'pmon_daemon_control.json')
@@ -294,7 +344,7 @@ def num_npus(self):
         return the number of NPUs on the DUT
         """
         return self.facts["num_npu"]
-    
+
     def get_syncd_docker_names(self):
         """
         @summary: get the list of syncd dockers names for the number of NPUs present on the DUT
@@ -454,10 +504,10 @@ def get_fanout_os(self):
 
     def get_fanout_type(self):
         return self.type
-    
+
     def shutdown(self, interface_name):
         return self.host.shutdown(interface_name)[self.hostname]
-    
+
     def no_shutdown(self, interface_name):
         return self.host.no_shutdown(interface_name)[self.hostname]
 
@@ -466,7 +516,7 @@ def command(self, cmd):
 
     def __str__(self):
         return "{ os: '%s', hostname: '%s', device_type: '%s' }" % (self.os, self.hostname, self.type)
-    
+
     def __repr__(self):
         return self.__str__()
 
diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py
index ba2bd687ba..272d112d63 100644
--- a/tests/common/plugins/sanity_check/checks.py
+++ b/tests/common/plugins/sanity_check/checks.py
@@ -97,6 +97,8 @@ def check_interfaces(dut):
     return check_result
 
 def check_dbmemory(dut):
+    logger.info("Checking database memory...")
+
     total_omem = 0
     re_omem = re.compile("omem=(\d+)")
     res = dut.command("/usr/bin/redis-cli client list")
@@ -115,6 +117,42 @@ def check_dbmemory(dut):
     logger.info("Done checking database memory")
     return check_result
 
+def check_processes(dut):
+    logger.info("Checking process status...")
+
+    networking_uptime = dut.get_networking_uptime().seconds
+    timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0)
+    interval = 20
+    logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \
+                (networking_uptime, timeout, interval))
+
+    check_result = {"failed": False, "check_item": "processes"}
+    if timeout == 0:    # Check processes status, do not retry.
+        processes_status = dut.all_critical_process_status()
+        check_result["processes_status"] = processes_status
+        for k, v in processes_status.items():
+            if v['service'] == False or len(v['exited_critical_process']) > 0:
+                check_result['failed'] = True
+    else:               # Retry checking processes status
+        start = time.time()
+        elapsed = 0
+        while elapsed < timeout:
+            processes_status = dut.all_critical_process_status()
+            check_result["processes_status"] = processes_status
+            for k, v in processes_status.items():
+                if v['service'] == False or len(v['exited_critical_process']) > 0:
+                    check_result['failed'] = True
+
+            if check_result["failed"]:
+                wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \
+                    (interval, int(timeout - elapsed), str(check_result["processes_status"])))
+                elapsed = time.time() - start
+            else:
+                break
+
+    logger.info("Done checking processes status.")
+    return check_result
+
 def do_checks(dut, check_items):
     results = []
     for item in check_items:
@@ -124,6 +162,8 @@ def do_checks(dut, check_items):
             results.append(check_interfaces(dut))
         elif item == "dbmemory":
             results.append(check_dbmemory(dut))
+        elif item == "processes":
+            results.append(check_processes(dut))
 
     return results
 
diff --git a/tests/common/plugins/sanity_check/constants.py b/tests/common/plugins/sanity_check/constants.py
index 1f8b6e5d36..fa0e4abfaa 100644
--- a/tests/common/plugins/sanity_check/constants.py
+++ b/tests/common/plugins/sanity_check/constants.py
@@ -20,4 +20,4 @@
     "adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30},
 }   # All supported recover methods
 
-SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory"]  # Supported checks
+SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes"]  # Supported checks

From 2f12279b4a835232c004b538cef913e56b5c49ff Mon Sep 17 00:00:00 2001
From: Guohan Lu
Date: Sun, 26 Apr 2020 21:46:13 +0000
Subject: [PATCH 2/3] add comment

Signed-off-by: Guohan Lu
---
 tests/common/devices.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/common/devices.py b/tests/common/devices.py
index 848d7c7c56..738abf5926 100644
--- a/tests/common/devices.py
+++ b/tests/common/devices.py
@@ -226,6 +226,7 @@ def critical_process_status(self, service):
         result['running_critical_process'] = []
         critical_process_list = []
 
+        # return false if the service is not started
         service_status = self.is_service_fully_started(service)
         if service_status == False:
             result['service'] = False

From 925927616a4d379d3ce1fd6caaae7390c3543ddf Mon Sep 17 00:00:00 2001
From: Guohan Lu
Date: Sun, 26 Apr 2020 23:40:56 +0000
Subject: [PATCH 3/3] add auto_recover support if process check failed

---
 tests/common/devices.py                      | 7 +++----
 tests/common/plugins/sanity_check/checks.py  | 8 ++++++--
 tests/common/plugins/sanity_check/recover.py | 2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/common/devices.py b/tests/common/devices.py
index 738abf5926..3ede6e9474 100644
--- a/tests/common/devices.py
+++ b/tests/common/devices.py
@@ -221,7 +221,7 @@ def critical_process_status(self, service):
 
         @param service: Name of the SONiC service
         """
-        result = {}
+        result = {'status': True}
         result['exited_critical_process'] = []
         result['running_critical_process'] = []
         critical_process_list = []
@@ -229,10 +229,8 @@ def critical_process_status(self, service):
         # return false if the service is not started
         service_status = self.is_service_fully_started(service)
         if service_status == False:
-            result['service'] = False
+            result['status'] = False
             return result
-        else:
-            result['service'] = True
 
         # get critical process list for the service
         output = self.command("docker exec {} bash -c '[ -f /etc/supervisor/critical_processes ] && cat /etc/supervisor/critical_processes'".format(service), module_ignore_errors=True)
@@ -250,6 +248,7 @@ def critical_process_status(self, service):
             if status != "RUNNING":
                 if pname in critical_process_list:
                     result['exited_critical_process'].append(pname)
+                    result['status'] = False
             else:
                 if pname in critical_process_list:
                     result['running_critical_process'].append(pname)

diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py
index 272d112d63..0c322d2901 100644
--- a/tests/common/plugins/sanity_check/checks.py
+++ b/tests/common/plugins/sanity_check/checks.py
@@ -130,18 +130,22 @@ def check_processes(dut):
     if timeout == 0:    # Check processes status, do not retry.
         processes_status = dut.all_critical_process_status()
         check_result["processes_status"] = processes_status
+        check_result["services_status"] = {}
         for k, v in processes_status.items():
-            if v['service'] == False or len(v['exited_critical_process']) > 0:
+            if v['status'] == False or len(v['exited_critical_process']) > 0:
                 check_result['failed'] = True
+            check_result["services_status"].update({k: v['status']})
     else:               # Retry checking processes status
         start = time.time()
         elapsed = 0
         while elapsed < timeout:
             processes_status = dut.all_critical_process_status()
             check_result["processes_status"] = processes_status
+            check_result["services_status"] = {}
             for k, v in processes_status.items():
-                if v['service'] == False or len(v['exited_critical_process']) > 0:
+                if v['status'] == False or len(v['exited_critical_process']) > 0:
                     check_result['failed'] = True
+                check_result["services_status"].update({k: v['status']})
 
             if check_result["failed"]:
                 wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \
diff --git a/tests/common/plugins/sanity_check/recover.py b/tests/common/plugins/sanity_check/recover.py
index 98fe73754e..0abacd5818 100644
--- a/tests/common/plugins/sanity_check/recover.py
+++ b/tests/common/plugins/sanity_check/recover.py
@@ -58,7 +58,7 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time):
             logging.info("Restoring {}".format(result))
             if result['check_item'] == 'interfaces':
                 __recover_interfaces(dut, fanouthosts, result, wait_time)
-            elif result['check_item'] == 'services':
+            elif result['check_item'] in ['services', 'processes']:
                 action = __recover_services(dut, result)
             # Only allow outstanding_action be overridden when it is
             # None. In case the outstanding_action has already been
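
Editor's note: the minimal sketch below illustrates the parsing step that the new
critical_process_status() method applies to "supervisorctl status" output: split each
line into name, state, and details, then bucket only the critical processes by whether
they are RUNNING, clearing the 'status' flag when one has exited (the behavior added in
PATCH 3). The sample output and critical-process list are invented for illustration; on
a real device they come from "docker exec <service> supervisorctl status" and from
/etc/supervisor/critical_processes inside the container.

import re

# Hypothetical supervisorctl output and critical-process list, for illustration only.
SAMPLE_SUPERVISORCTL_OUTPUT = """\
snmpd                            RUNNING   pid 35, uptime 1:23:45
snmp-subagent                    EXITED    Apr 26 11:02 PM
supervisor-proc-exit-listener    RUNNING   pid 1, uptime 1:23:50
"""

SAMPLE_CRITICAL_PROCESSES = ["snmpd", "snmp-subagent"]


def classify_processes(supervisorctl_output, critical_processes):
    """Bucket critical processes by whether supervisord reports them RUNNING,
    mirroring the classification done by critical_process_status() in the patch."""
    result = {'status': True,
              'exited_critical_process': [],
              'running_critical_process': []}
    for line in supervisorctl_output.splitlines():
        if not line.strip():
            continue
        # Each status line is "<name>  <state>  <details...>".
        pname, state, _info = re.split(r"\s+", line, 2)
        if pname not in critical_processes:
            continue
        if state != "RUNNING":
            result['exited_critical_process'].append(pname)
            result['status'] = False
        else:
            result['running_critical_process'].append(pname)
    return result


if __name__ == "__main__":
    # Prints: {'status': False, 'exited_critical_process': ['snmp-subagent'],
    #          'running_critical_process': ['snmpd']}
    print(classify_processes(SAMPLE_SUPERVISORCTL_OUTPUT, SAMPLE_CRITICAL_PROCESSES))

With the sample data, snmp-subagent is reported as an exited critical process; in the
sanity check this is what sets check_result['failed'] = True in check_processes() and,
after PATCH 3, routes the failure through the same recovery path as the 'services'
check in adaptive_recover().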