Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[sanity_checks]: add critical process check in sanity checks #1617

Merged
merged 3 commits into from
Apr 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 56 additions & 6 deletions tests/common/devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import json
import logging
import os
import re
from multiprocessing.pool import ThreadPool
from datetime import datetime

Expand Down Expand Up @@ -91,7 +92,7 @@ class SonicHost(AnsibleHostBase):
For running ansible module on the SONiC switch
"""
CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp"]
CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp", "snmp"]

def __init__(self, ansible_adhoc, hostname, gather_facts=False):
AnsibleHostBase.__init__(self, ansible_adhoc, hostname)
Expand Down Expand Up @@ -214,6 +215,55 @@ def critical_services_fully_started(self):
logging.debug("Status of critical services: %s" % str(result))
return all(result.values())

def critical_process_status(self, service):
    """
    @summary: Check the status of the critical processes of a service.
    @param service: Name of the SONiC service. It is also used as the name of
                    the docker container the commands are executed in.
    @return: A dict with the keys:
             'status': False if the service is not fully started or any
                       critical process is not in RUNNING state, else True.
             'exited_critical_process': names of critical processes that are
                                        not in RUNNING state.
             'running_critical_process': names of critical processes that are
                                         in RUNNING state.
    """
    result = {
        'status': True,
        'exited_critical_process': [],
        'running_critical_process': [],
    }

    # return false if the service is not started
    if not self.is_service_fully_started(service):
        result['status'] = False
        return result

    # get critical process list for the service; errors are ignored because
    # the file is optional (the shell test fails when it does not exist)
    output = self.command(
        "docker exec {} bash -c '[ -f /etc/supervisor/critical_processes ] && cat /etc/supervisor/critical_processes'".format(service),
        module_ignore_errors=True)
    critical_processes = set(output['stdout'].split())
    if not critical_processes:
        # nothing is declared critical, so there is nothing more to verify
        return result

    # get process status for the service
    output = self.command("docker exec {} supervisorctl status".format(service))
    logging.info("====== supervisor process status for service {} ======".format(service))

    for line in output['stdout_lines']:
        # Expected layout: "<name> <STATE> <free-form info>". Only the first
        # two columns are needed; lines without them (e.g. blank lines) are
        # skipped instead of aborting the whole check.
        fields = line.split(None, 2)
        if len(fields) < 2:
            continue
        pname, status = fields[0], fields[1]
        if pname not in critical_processes:
            continue
        if status == "RUNNING":
            result['running_critical_process'].append(pname)
        else:
            result['exited_critical_process'].append(pname)
            result['status'] = False

    return result

def all_critical_process_status(self):
    """
    @summary: Collect the critical process status of every critical service.
    @return: A dict keyed by each name in CRITICAL_SERVICES whose value is the
             result of critical_process_status() for that service.
    """
    return {
        name: self.critical_process_status(name)
        for name in self.CRITICAL_SERVICES
    }

def get_crm_resources(self):
"""
@summary: Run the "crm show resources all" command and parse its output
Expand Down Expand Up @@ -257,7 +307,7 @@ def get_pmon_daemon_states(self):

daemons = self.shell('docker exec pmon supervisorctl status')['stdout_lines']

daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ]
daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ]

daemon_ctl_key_prefix = 'skip_'
daemon_config_file_path = os.path.join('/usr/share/sonic/device', self.facts["platform"], 'pmon_daemon_control.json')
Expand Down Expand Up @@ -294,7 +344,7 @@ def num_npus(self):
return the number of NPUs on the DUT
"""
return self.facts["num_npu"]

def get_syncd_docker_names(self):
"""
@summary: get the list of syncd dockers names for the number of NPUs present on the DUT
Expand Down Expand Up @@ -454,10 +504,10 @@ def get_fanout_os(self):

def get_fanout_type(self):
    """Return the value stored in this object's ``type`` attribute."""
    fanout_type = self.type
    return fanout_type

def shutdown(self, interface_name):
    """
    Call ``shutdown`` on the wrapped host for ``interface_name`` and return
    the entry of the result that is keyed by this object's hostname.
    """
    per_host_result = self.host.shutdown(interface_name)
    return per_host_result[self.hostname]

def no_shutdown(self, interface_name):
    """
    Call ``no_shutdown`` on the wrapped host for ``interface_name`` and return
    the entry of the result that is keyed by this object's hostname.
    """
    per_host_result = self.host.no_shutdown(interface_name)
    return per_host_result[self.hostname]

Expand All @@ -466,7 +516,7 @@ def command(self, cmd):

def __str__(self):
    # Human-readable summary built from the os, hostname and type attributes.
    return "{ os: '%s', hostname: '%s', device_type: '%s' }" % (self.os, self.hostname, self.type)

def __repr__(self):
    # Reuse the __str__ text so both representations are identical.
    return self.__str__()

Expand Down
44 changes: 44 additions & 0 deletions tests/common/plugins/sanity_check/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ def check_interfaces(dut):
return check_result

def check_dbmemory(dut):
logger.info("Checking database memory...")

total_omem = 0
re_omem = re.compile("omem=(\d+)")
res = dut.command("/usr/bin/redis-cli client list")
Expand All @@ -115,6 +117,46 @@ def check_dbmemory(dut):
logger.info("Done checking database memory")
return check_result

def check_processes(dut):
    """
    Check the critical process status of all critical services on the DUT.

    The check runs at least once. While it fails and the stabilization window
    (SYSTEM_STABILIZE_MAX_TIME minus the networking uptime) has not elapsed,
    it is retried every 20 seconds.

    @param dut: DUT host object providing get_networking_uptime() and
                all_critical_process_status().
    @return: A dict with the keys:
             'check_item': always "processes".
             'failed': True if, on the last evaluation, any service reported
                       status False or had exited critical processes.
             'processes_status': the last all_critical_process_status() result.
             'services_status': service name -> 'status' of that service.
    """
    logger.info("Checking process status...")

    # NOTE(review): `.seconds` is kept from the original. If
    # get_networking_uptime() returns a datetime.timedelta this drops whole
    # days - confirm whether total_seconds() is intended.
    networking_uptime = dut.get_networking_uptime().seconds
    timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0)
    interval = 20
    logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \
                (networking_uptime, timeout, interval))

    check_result = {"failed": False, "check_item": "processes"}
    start = time.time()
    while True:
        processes_status = dut.all_critical_process_status()
        check_result["processes_status"] = processes_status
        check_result["services_status"] = {
            service: status['status'] for service, status in processes_status.items()
        }
        # Recompute the verdict on every pass so that a recovery during the
        # retry window is reflected in the result; a stale True from an
        # earlier pass must not stick.
        check_result["failed"] = any(
            not status['status'] or len(status['exited_critical_process']) > 0
            for status in processes_status.values()
        )

        elapsed = time.time() - start
        # With timeout == 0 this exits after the single check (no retry).
        if not check_result["failed"] or elapsed >= timeout:
            break

        wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \
             (interval, int(timeout - elapsed), str(check_result["processes_status"])))

    logger.info("Done checking processes status.")
    return check_result

def do_checks(dut, check_items):
results = []
for item in check_items:
Expand All @@ -124,6 +166,8 @@ def do_checks(dut, check_items):
results.append(check_interfaces(dut))
elif item == "dbmemory":
results.append(check_dbmemory(dut))
elif item == "processes":
results.append(check_processes(dut))

return results

Expand Down
2 changes: 1 addition & 1 deletion tests/common/plugins/sanity_check/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
"adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30},
} # All supported recover methods

SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory"] # Supported checks
SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes"] # Supported checks
2 changes: 1 addition & 1 deletion tests/common/plugins/sanity_check/recover.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time):
logging.info("Restoring {}".format(result))
if result['check_item'] == 'interfaces':
__recover_interfaces(dut, fanouthosts, result, wait_time)
elif result['check_item'] == 'services':
elif result['check_item'] in ['services', 'processes']:
action = __recover_services(dut, result)
# Only allow outstanding_action be overridden when it is
# None. In case the outstanding_action has already been
Expand Down