From 51712aa1667c513306ecb4e2e886d12ad83c986a Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Tue, 28 Jun 2022 23:48:10 +0800 Subject: [PATCH] =?UTF-8?q?[system-health]=20Fix=20error=20log=20system=5F?= =?UTF-8?q?service'state'=20while=20doing=20confi=E2=80=A6=20(#11225)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Why I did it While doing config reload, FEATURE table may be removed and re-add. During this process, updating FEATURE table is not atomic. It could be that the FEATURE table has entry, but each entry has no field. This PR introduces a retry mechanism to avoid this. - How I did it Introduces a retry mechanism to avoid this. - How to verify it New unit test added to verify the flow as well as running some manual test. --- .../health_checker/sysmonitor.py | 43 ++++++++++++++++--- src/system-health/tests/test_system_health.py | 20 +++++++++ 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py index a4058f8c09d3..e69d289fc537 100755 --- a/src/system-health/health_checker/sysmonitor.py +++ b/src/system-health/health_checker/sysmonitor.py @@ -2,6 +2,7 @@ import os import sys +import time import glob import multiprocessing from datetime import datetime @@ -145,12 +146,7 @@ def get_all_service_list(self): dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))] #add the enabled docker services from config db feature table - feature_table = self.config_db.get_table("FEATURE") - for srv in feature_table.keys(): - if feature_table[srv]["state"] not in ["disabled", "always_disabled"]: - srvext = srv + ".service" - if srvext not in dir_list: - dir_list.append(srvext) + self.get_service_from_feature_table(dir_list) self.config.load_config() if self.config and self.config.ignore_services: @@ -161,6 +157,41 @@ def get_all_service_list(self): dir_list.sort() return dir_list + def get_service_from_feature_table(self, dir_list): + """Get service from CONFIG DB FEATURE table. During "config reload" command, filling FEATURE table + is not an atomic operation, sonic-cfggen do it with two steps: + 1. Add an empty table entry to CONFIG DB + 2. Add all fields to the table + + So, if system health read db on middle of step 1 and step 2, it might read invalid data. A retry + mechanism is here to avoid such issue. + + Args: + dir_list (list): service list + """ + max_retry = 3 + retry_delay = 1 + success = True + + while max_retry > 0: + success = True + feature_table = self.config_db.get_table("FEATURE") + for srv, fields in feature_table.items(): + if 'state' not in fields: + success = False + logger.log_warning("FEATURE table is not fully ready: {}, retrying".format(feature_table)) + break + if fields["state"] not in ["disabled", "always_disabled"]: + srvext = srv + ".service" + if srvext not in dir_list: + dir_list.append(srvext) + if not success: + max_retry -= 1 + time.sleep(retry_delay) + else: + break + if not success: + logger.log_error("FEATURE table is not fully ready: {}, max retry reached".format(feature_table)) #Checks FEATURE table from config db for the service' check_up_status flag #if marked to true, then read the service up_status from FEATURE table of state db. diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index 76f3ceea5d3f..d58c69bececa 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -720,3 +720,23 @@ def test_system_service(): sysmon.task_run() assert sysmon._task_process is not None sysmon.task_stop() + + +def test_get_service_from_feature_table(): + sysmon = Sysmonitor() + sysmon.config_db = MagicMock() + sysmon.config_db.get_table = MagicMock() + sysmon.config_db.get_table.side_effect = [ + { + 'bgp': {}, + 'swss': {} + }, + { + 'bgp': {'state': 'enabled'}, + 'swss': {'state': 'disabled'} + } + ] + dir_list = [] + sysmon.get_service_from_feature_table(dir_list) + assert 'bgp.service' in dir_list + assert 'swss.service' not in dir_list