From 69daa5feafc0e7ae1bb42d8e286ca194d3d8adff Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Wed, 17 Nov 2021 16:21:03 +0800 Subject: [PATCH] [test_system_health.py] Adjust test cases of system health (#4562) What is the motivation for this PR? Command monit summary -B can no longer display the status for each critical process, system-health should not depend on it and need find a way to monitor the status of critical processes. The PR is to adjust the test case and cover the change. How did you do it? Adjust test case to check critical process without monit Stop a critical process and check system-health status How did you verify/test it? Run the regression --- tests/system_health/test_system_health.py | 74 +++++++++++++++++------ 1 file changed, 55 insertions(+), 19 deletions(-) diff --git a/tests/system_health/test_system_health.py b/tests/system_health/test_system_health.py index 44903fbed13..a8c43e4cc98 100644 --- a/tests/system_health/test_system_health.py +++ b/tests/system_health/test_system_health.py @@ -2,8 +2,10 @@ import logging import os import pytest +import random import time from pkg_resources import parse_version +from tests.common import config_reload from tests.common.utilities import wait_until from tests.common.helpers.assertions import pytest_require from tests.platform_tests.thermal_control_test_helper import disable_thermal_policy @@ -63,27 +65,22 @@ def check_image_version(duthost): yield +@pytest.fixture(autouse=True, scope='module') +def config_reload_after_tests(duthost): + yield + config_reload(duthost) + + def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname): duthost = duthosts[enum_rand_one_per_hwsku_hostname] wait_system_health_boot_up(duthost) with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)): - cmd = "monit summary -B" - logger.info('Getting output for command {}'.format(cmd)) - output = duthost.shell(cmd) - content = output['stdout'].strip() - lines = content.splitlines() - status_begin = lines[1].find('Status') - type_begin = lines[1].find('Type') + processes_status = duthost.all_critical_process_status() expect_error_dict = {} - logger.info('Getting service status') - for line in lines[2:]: - service_name = line[0:status_begin].strip() - status = line[status_begin:type_begin].strip() - service_type = line[type_begin:].strip() - assert service_type in SERVICE_EXPECT_STATUS_DICT, 'Unknown service type {}'.format(service_type) - expect_status = SERVICE_EXPECT_STATUS_DICT[service_type] - if expect_status != status: - expect_error_dict[service_name] = '{} is not {}'.format(service_name, expect_status) + for container_name, processes in processes_status.items(): + if processes["status"] is False or len(processes["exited_critical_process"]) > 0: + for process_name in processes["exited_critical_process"]: + expect_error_dict[process_name] = '{}:{} is not running'.format(container_name, process_name) logger.info('Waiting {} seconds for healthd to work'.format(DEFAULT_INTERVAL)) time.sleep(DEFAULT_INTERVAL) @@ -98,6 +95,30 @@ def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname): assert summary == expect_summary, 'Expect summary {}, got {}'.format(expect_summary, summary) +@pytest.mark.disable_loganalyzer +def test_service_checker_with_process_exit(duthosts, enum_rand_one_per_hwsku_hostname): + duthost = duthosts[enum_rand_one_per_hwsku_hostname] + wait_system_health_boot_up(duthost) + with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)): + processes_status = duthost.all_critical_process_status() + containers = [x for x in list(processes_status.keys()) if x != "syncd" and x !="database"] + logging.info('Test containers: {}'.format(containers)) + random.shuffle(containers) + for container in containers: + running_critical_process = processes_status[container]['running_critical_process'] + if not running_critical_process: + continue + + critical_process = random.sample(running_critical_process, 1)[0] + with ProcessExitContext(duthost, container, critical_process): + time.sleep(DEFAULT_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, '{}:{}'.format(container, critical_process)) + assert value == "'{}' is not running".format(critical_process), 'Got value {}'.format(value) + summary = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'summary') + assert summary == SUMMARY_NOT_OK + break + + @pytest.mark.disable_loganalyzer def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocker_factory, disable_thermal_policy): duthost = duthosts[enum_rand_one_per_hwsku_hostname] @@ -139,7 +160,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke time.sleep(THERMAL_CHECK_INTERVAL) value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) assert not value or fan_expect_value not in value, 'Mock fan valid speed, expect {}, but it still report invalid speed' - + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ASIC') assert not value or asic_expect_value not in value, 'Mock ASIC normal temperature, but it is still overheated' @@ -158,7 +179,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) assert value and value == fan_expect_value, 'Mock fan absence, expect {}, but got {}'.format(fan_expect_value, value) - + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) assert value and psu_expect_value == value, 'Mock PSU no power, expect {}, but got {}'.format(psu_expect_value, value) @@ -173,7 +194,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) assert not value or value != fan_expect_value, 'Mock fan presence, but it still report absence' - + time.sleep(PSU_CHECK_INTERVAL) value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) assert not value or psu_expect_value != value, 'Mock PSU power good, but it is still out of power' @@ -348,3 +369,18 @@ def __exit__(self, exc_type, exc_val, exc_tb): :return: """ self.dut.command('mv -f {} {}'.format(self.backup_config, self.origin_config)) + + +class ProcessExitContext: + def __init__(self, dut, container_name, process_name): + self.dut = dut + self.container_name = container_name + self.process_name = process_name + + def __enter__(self): + logging.info('Stopping {}:{}'.format(self.container_name, self.process_name)) + self.dut.command('docker exec -it {} bash -c "supervisorctl stop {}"'.format(self.container_name, self.process_name)) + + def __exit__(self, exc_type, exc_val, exc_tb): + logging.info('Starting {}:{}'.format(self.container_name, self.process_name)) + self.dut.command('docker exec -it {} bash -c "supervisorctl start {}"'.format(self.container_name, self.process_name))