Skip to content

Commit

Permalink
[system-health] Fix error log system_service'state' while doing confi… (
Browse files Browse the repository at this point in the history
sonic-net#11225)

- Why I did it
While doing config reload, FEATURE table may be removed and re-add. During this process, updating FEATURE table is not atomic. It could be that the FEATURE table has entry, but each entry has no field. This PR introduces a retry mechanism to avoid this.

- How I did it
Introduces a retry mechanism to avoid this.

- How to verify it
New unit test added to verify the flow as well as running some manual test.
  • Loading branch information
Junchao-Mellanox authored and skbarista committed Aug 17, 2022
1 parent d13bc1e commit ca031ce
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 6 deletions.
43 changes: 37 additions & 6 deletions src/system-health/health_checker/sysmonitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import sys
import time
import glob
import multiprocessing
from datetime import datetime
Expand Down Expand Up @@ -145,12 +146,7 @@ def get_all_service_list(self):
dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))]

#add the enabled docker services from config db feature table
feature_table = self.config_db.get_table("FEATURE")
for srv in feature_table.keys():
if feature_table[srv]["state"] not in ["disabled", "always_disabled"]:
srvext = srv + ".service"
if srvext not in dir_list:
dir_list.append(srvext)
self.get_service_from_feature_table(dir_list)

self.config.load_config()
if self.config and self.config.ignore_services:
Expand All @@ -161,6 +157,41 @@ def get_all_service_list(self):
dir_list.sort()
return dir_list

def get_service_from_feature_table(self, dir_list):
"""Get service from CONFIG DB FEATURE table. During "config reload" command, filling FEATURE table
is not an atomic operation, sonic-cfggen do it with two steps:
1. Add an empty table entry to CONFIG DB
2. Add all fields to the table
So, if system health read db on middle of step 1 and step 2, it might read invalid data. A retry
mechanism is here to avoid such issue.
Args:
dir_list (list): service list
"""
max_retry = 3
retry_delay = 1
success = True

while max_retry > 0:
success = True
feature_table = self.config_db.get_table("FEATURE")
for srv, fields in feature_table.items():
if 'state' not in fields:
success = False
logger.log_warning("FEATURE table is not fully ready: {}, retrying".format(feature_table))
break
if fields["state"] not in ["disabled", "always_disabled"]:
srvext = srv + ".service"
if srvext not in dir_list:
dir_list.append(srvext)
if not success:
max_retry -= 1
time.sleep(retry_delay)
else:
break
if not success:
logger.log_error("FEATURE table is not fully ready: {}, max retry reached".format(feature_table))

#Checks FEATURE table from config db for the service' check_up_status flag
#if marked to true, then read the service up_status from FEATURE table of state db.
Expand Down
20 changes: 20 additions & 0 deletions src/system-health/tests/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,3 +720,23 @@ def test_system_service():
sysmon.task_run()
assert sysmon._task_process is not None
sysmon.task_stop()


def test_get_service_from_feature_table():
sysmon = Sysmonitor()
sysmon.config_db = MagicMock()
sysmon.config_db.get_table = MagicMock()
sysmon.config_db.get_table.side_effect = [
{
'bgp': {},
'swss': {}
},
{
'bgp': {'state': 'enabled'},
'swss': {'state': 'disabled'}
}
]
dir_list = []
sysmon.get_service_from_feature_table(dir_list)
assert 'bgp.service' in dir_list
assert 'swss.service' not in dir_list

0 comments on commit ca031ce

Please sign in to comment.