Skip to content

Commit

Permalink
[test_system_health.py] Adjust test cases of system health (#4562)
Browse files Browse the repository at this point in the history
What is the motivation for this PR?
The command `monit summary -B` can no longer display the status of each critical process, so system-health should not depend on it and needs another way to monitor the status of critical processes. This PR adjusts the test cases to cover that change.

How did you do it?
Adjust test case to check critical process without monit
Stop a critical process and check system-health status

How did you verify/test it?
Run the regression
  • Loading branch information
Junchao-Mellanox authored Nov 17, 2021
1 parent 7a09d22 commit 69daa5f
Showing 1 changed file with 55 additions and 19 deletions.
74 changes: 55 additions & 19 deletions tests/system_health/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import logging
import os
import pytest
import random
import time
from pkg_resources import parse_version
from tests.common import config_reload
from tests.common.utilities import wait_until
from tests.common.helpers.assertions import pytest_require
from tests.platform_tests.thermal_control_test_helper import disable_thermal_policy
Expand Down Expand Up @@ -63,27 +65,22 @@ def check_image_version(duthost):
yield


@pytest.fixture(scope='module', autouse=True)
def config_reload_after_tests(duthost):
    """
    Module-level teardown fixture: reload the DUT configuration once the
    tests of this module have finished.

    Nothing is done before the tests run; the code after ``yield`` is the
    teardown step.

    :param duthost: DUT host object passed on to ``config_reload``
    """
    yield
    config_reload(duthost)


def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname):
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
wait_system_health_boot_up(duthost)
with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)):
cmd = "monit summary -B"
logger.info('Getting output for command {}'.format(cmd))
output = duthost.shell(cmd)
content = output['stdout'].strip()
lines = content.splitlines()
status_begin = lines[1].find('Status')
type_begin = lines[1].find('Type')
processes_status = duthost.all_critical_process_status()
expect_error_dict = {}
logger.info('Getting service status')
for line in lines[2:]:
service_name = line[0:status_begin].strip()
status = line[status_begin:type_begin].strip()
service_type = line[type_begin:].strip()
assert service_type in SERVICE_EXPECT_STATUS_DICT, 'Unknown service type {}'.format(service_type)
expect_status = SERVICE_EXPECT_STATUS_DICT[service_type]
if expect_status != status:
expect_error_dict[service_name] = '{} is not {}'.format(service_name, expect_status)
for container_name, processes in processes_status.items():
if processes["status"] is False or len(processes["exited_critical_process"]) > 0:
for process_name in processes["exited_critical_process"]:
expect_error_dict[process_name] = '{}:{} is not running'.format(container_name, process_name)

logger.info('Waiting {} seconds for healthd to work'.format(DEFAULT_INTERVAL))
time.sleep(DEFAULT_INTERVAL)
Expand All @@ -98,6 +95,30 @@ def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname):
assert summary == expect_summary, 'Expect summary {}, got {}'.format(expect_summary, summary)


@pytest.mark.disable_loganalyzer
def test_service_checker_with_process_exit(duthosts, enum_rand_one_per_hwsku_hostname):
    """
    Stop one running critical process and verify that system-health reports it.

    A container is picked at random among those that have at least one running
    critical process, one of its critical processes is stopped, and after
    DEFAULT_INTERVAL seconds the health table in STATE_DB is expected to hold
    "'<process>' is not running" under the key "<container>:<process>" and a
    summary equal to SUMMARY_NOT_OK. The process is started again when the
    ProcessExitContext block is left.

    The test is skipped when no eligible container has a running critical
    process, instead of passing without having verified anything.

    :param duthosts: collection of DUT host objects, indexed by hostname
    :param enum_rand_one_per_hwsku_hostname: hostname of the DUT under test
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]
    wait_system_health_boot_up(duthost)
    with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)):
        processes_status = duthost.all_critical_process_status()
        # NOTE(review): syncd and database are never used as targets, presumably
        # because stopping a process there would disrupt the DUT itself - confirm.
        containers = [name for name in processes_status if name not in ("syncd", "database")]
        logger.info('Test containers: {}'.format(containers))
        random.shuffle(containers)
        for container in containers:
            running_critical_process = processes_status[container]['running_critical_process']
            if not running_critical_process:
                continue

            # list() keeps the choice valid for any non-empty collection type.
            critical_process = random.choice(list(running_critical_process))
            with ProcessExitContext(duthost, container, critical_process):
                # Give healthd one full check interval to notice the stopped process.
                time.sleep(DEFAULT_INTERVAL)
                value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME,
                                              '{}:{}'.format(container, critical_process))
                assert value == "'{}' is not running".format(critical_process), 'Got value {}'.format(value)
                summary = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'summary')
                assert summary == SUMMARY_NOT_OK, \
                    'Expect summary {}, got {}'.format(SUMMARY_NOT_OK, summary)
            # One stopped process is enough for this test.
            break
        else:
            # The loop ended without testing anything: make that visible.
            pytest.skip('No container with a running critical process was found')


@pytest.mark.disable_loganalyzer
def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocker_factory, disable_thermal_policy):
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
Expand Down Expand Up @@ -139,7 +160,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
time.sleep(THERMAL_CHECK_INTERVAL)
value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
assert not value or fan_expect_value not in value, 'Mock fan valid speed, expect {}, but it still report invalid speed'

value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ASIC')
assert not value or asic_expect_value not in value, 'Mock ASIC normal temperature, but it is still overheated'

Expand All @@ -158,7 +179,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
assert value and value == fan_expect_value, 'Mock fan absence, expect {}, but got {}'.format(fan_expect_value,
value)

value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name)
assert value and psu_expect_value == value, 'Mock PSU no power, expect {}, but got {}'.format(psu_expect_value,
value)
Expand All @@ -173,7 +194,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
assert not value or value != fan_expect_value, 'Mock fan presence, but it still report absence'


time.sleep(PSU_CHECK_INTERVAL)
value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name)
assert not value or psu_expect_value != value, 'Mock PSU power good, but it is still out of power'
Expand Down Expand Up @@ -348,3 +369,18 @@ def __exit__(self, exc_type, exc_val, exc_tb):
:return:
"""
self.dut.command('mv -f {} {}'.format(self.backup_config, self.origin_config))


class ProcessExitContext:
    """
    Context manager that keeps one supervisor-managed process of a container
    stopped for the duration of a ``with`` block.

    On entry the process is stopped with ``supervisorctl stop`` inside the
    container; on exit it is started again with ``supervisorctl start``.
    """

    def __init__(self, dut, container_name, process_name):
        """
        :param dut: DUT object; its ``command`` method is used to run the docker commands
        :param container_name: name of the container the process runs in
        :param process_name: supervisor name of the process to stop and restart
        """
        self.dut = dut
        self.container_name = container_name
        self.process_name = process_name

    def _supervisorctl(self, action):
        """
        Run ``supervisorctl <action> <process_name>`` inside the container.

        :param action: supervisorctl sub-command, 'stop' or 'start'
        :return: whatever ``dut.command`` returns
        """
        return self.dut.command('docker exec -it {} bash -c "supervisorctl {} {}"'.format(
            self.container_name, action, self.process_name))

    def __enter__(self):
        """
        Stop the process.

        :return: this context object, so ``with ... as ctx`` binds it
        """
        logging.info('Stopping {}:{}'.format(self.container_name, self.process_name))
        self._supervisorctl('stop')
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Start the process again.

        :param exc_type: exception type raised in the block, if any
        :param exc_val: exception value raised in the block, if any
        :param exc_tb: exception traceback, if any
        :return: False, so an exception raised in the block is not suppressed
        """
        logging.info('Starting {}:{}'.format(self.container_name, self.process_name))
        self._supervisorctl('start')
        return False

0 comments on commit 69daa5f

Please sign in to comment.