Skip to content

Commit

Permalink
[system-health] Make check interval more accurate (#14085)
Browse files Browse the repository at this point in the history
- Why I did it

Healthd check system status every 60 seconds. However, running checker may take several seconds. Say checker takes X seconds, healthd takes (60 + X) seconds to finish one iteration. This implementation makes sonic-mgmt test case not so stable because the value X is hard to predict and different among different platforms. This PR introduces an interval
compensation mechanism to healthd main loop.

- How I did it

Introduces an interval compensation mechanism to healthd main loop: healthd should wait (60 - X) seconds for next iteration

- How to verify it

Manual test
Unit test
  • Loading branch information
Junchao-Mellanox authored and mssonicbld committed Mar 15, 2023
1 parent 7bba702 commit 5df167b
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 7 deletions.
2 changes: 1 addition & 1 deletion src/system-health/pytest.ini
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[pytest]
addopts = --cov=health_checker --cov-report html --cov-report term --cov-report xml
addopts = --cov=health_checker --cov=healthd --cov-report html --cov-report term --cov-report xml
22 changes: 16 additions & 6 deletions src/system-health/scripts/healthd
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import signal
import threading
import time

from sonic_py_common.daemon_base import DaemonBase
from swsscommon.swsscommon import SonicV2Connector
Expand Down Expand Up @@ -79,18 +80,27 @@ class HealthDaemon(DaemonBase):
return
sysmon = Sysmonitor()
sysmon.task_run()
while 1:
stat = manager.check(chassis)
self._process_stat(chassis, manager.config, stat)

if self.stop_event.wait(manager.config.interval):
break
while self._run_checker(manager, chassis):
pass
except ImportError:
self.log_warning("sonic_platform package not installed. Cannot start system-health daemon")

self.deinit()
sysmon.task_stop()

def _run_checker(self, manager, chassis):
begin = time.time()
stat = manager.check(chassis)
self._process_stat(chassis, manager.config, stat)
elapse = time.time() - begin
sleep_time_in_sec = manager.config.interval - elapse
if sleep_time_in_sec < 0:
self.log_notice(f'System health takes {elapse} seconds for one iteration')
sleep_time_in_sec = 1
if self.stop_event.wait(sleep_time_in_sec):
return False
return True

def _process_stat(self, chassis, config, stat):
from health_checker.health_checker import HealthChecker
self._clear_system_health_table()
Expand Down
29 changes: 29 additions & 0 deletions src/system-health/tests/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import copy
import os
import sys
from imp import load_source
from swsscommon import swsscommon

from mock import Mock, MagicMock, patch
Expand All @@ -23,7 +24,9 @@

test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
scripts_path = os.path.join(modules_path, 'scripts')
sys.path.insert(0, modules_path)
sys.path.insert(0, scripts_path)
from health_checker import utils
from health_checker.config import Config
from health_checker.hardware_checker import HardwareChecker
Expand All @@ -35,6 +38,9 @@
from health_checker.sysmonitor import MonitorStateDbTask
from health_checker.sysmonitor import MonitorSystemBusTask

load_source('healthd', os.path.join(scripts_path, 'healthd'))
from healthd import HealthDaemon

mock_supervisorctl_output = """
snmpd RUNNING pid 67, uptime 1:03:56
snmp-subagent EXITED Oct 19 01:53 AM
Expand Down Expand Up @@ -740,3 +746,26 @@ def test_get_service_from_feature_table():
sysmon.get_service_from_feature_table(dir_list)
assert 'bgp.service' in dir_list
assert 'swss.service' not in dir_list


@patch('healthd.time.time')
def test_healthd_check_interval(mock_time):
daemon = HealthDaemon()
manager = MagicMock()
manager.check = MagicMock()
manager.config = MagicMock()
chassis = MagicMock()
daemon._process_stat = MagicMock()
daemon.stop_event = MagicMock()
daemon.stop_event.wait = MagicMock()

daemon.stop_event.wait.return_value = False
manager.config.interval = 60
mock_time.side_effect = [0, 3, 0, 61, 0, 1]
assert daemon._run_checker(manager, chassis)
daemon.stop_event.wait.assert_called_with(57)
assert daemon._run_checker(manager, chassis)
daemon.stop_event.wait.assert_called_with(1)

daemon.stop_event.wait.return_value = True
assert not daemon._run_checker(manager, chassis)

0 comments on commit 5df167b

Please sign in to comment.