From a5c920705088bd69ae98ec9e41a26bbb78c67970 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Tue, 11 May 2021 00:46:17 +0800 Subject: [PATCH] [thermalctld] Enable stopping thermal manager (#180) Make thermalctld exit as soon as possible. Depends on https://github.com/Azure/sonic-platform-common/pull/187 During config reload flow, supervisord sends SIGTERM to thermalctld. However, if thermalctld cannot exit in 10 seconds, supervisord sends SITKILL to thermalctld. In this case, sub process of thermalctld might still stay in memory as a zombie process. This PR is aimed to fix this. --- sonic-thermalctld/scripts/thermalctld | 28 +++++++++-- sonic-thermalctld/tests/test_thermalctld.py | 52 ++++++++++++--------- 2 files changed, 55 insertions(+), 25 deletions(-) diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 5c1449984..b98cd5c4a 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -191,7 +191,7 @@ class FanUpdater(logger.Logger): FAN_INFO_TABLE_NAME = 'FAN_INFO' FAN_DRAWER_INFO_TABLE_NAME = 'FAN_DRAWER_INFO' - def __init__(self, chassis): + def __init__(self, chassis, task_stopping_event): """ Initializer for FanUpdater :param chassis: Object representing a platform chassis @@ -199,6 +199,7 @@ class FanUpdater(logger.Logger): super(FanUpdater, self).__init__(SYSLOG_IDENTIFIER) self.chassis = chassis + self.task_stopping_event = task_stopping_event self.fan_status_dict = {} state_db = daemon_base.db_connect("STATE_DB") self.table = swsscommon.Table(state_db, FanUpdater.FAN_INFO_TABLE_NAME) @@ -236,8 +237,12 @@ class FanUpdater(logger.Logger): FanStatus.reset_fan_counter() for drawer_index, drawer in enumerate(self.chassis.get_all_fan_drawers()): + if self.task_stopping_event.is_set(): + return self._refresh_fan_drawer_status(drawer, drawer_index) for fan_index, fan in enumerate(drawer.get_all_fans()): + if self.task_stopping_event.is_set(): + return try: self._refresh_fan_status(drawer, drawer_index, fan, fan_index) except Exception as e: @@ -245,6 +250,8 @@ class FanUpdater(logger.Logger): for psu_index, psu in enumerate(self.chassis.get_all_psus()): for fan_index, fan in enumerate(psu.get_all_fans()): + if self.task_stopping_event.is_set(): + return try: self._refresh_fan_status(psu, psu_index, fan, fan_index, True) except Exception as e: @@ -396,6 +403,8 @@ class FanUpdater(logger.Logger): def _update_led_color(self): for fan_name, fan_status in self.fan_status_dict.items(): + if self.task_stopping_event.is_set(): + return try: fvs = swsscommon.FieldValuePairs([ ('led_status', str(try_get(fan_status.fan.get_status_led))) @@ -408,6 +417,8 @@ class FanUpdater(logger.Logger): self.table.set(fan_name, fvs) for drawer in self.chassis.get_all_fan_drawers(): + if self.task_stopping_event.is_set(): + return drawer_name = try_get(drawer.get_name) if drawer_name == NOT_AVAILABLE: continue @@ -510,7 +521,7 @@ class TemperatureUpdater(logger.Logger): # Temperature information table name in database TEMPER_INFO_TABLE_NAME = 'TEMPERATURE_INFO' - def __init__(self, chassis): + def __init__(self, chassis, task_stopping_event): """ Initializer of TemperatureUpdater :param chassis: Object representing a platform chassis @@ -518,6 +529,7 @@ class TemperatureUpdater(logger.Logger): super(TemperatureUpdater, self).__init__(SYSLOG_IDENTIFIER) self.chassis = chassis + self.task_stopping_event = task_stopping_event self.temperature_status_dict = {} state_db = daemon_base.db_connect("STATE_DB") self.table = swsscommon.Table(state_db, TemperatureUpdater.TEMPER_INFO_TABLE_NAME) @@ -562,6 +574,8 @@ class TemperatureUpdater(logger.Logger): """ self.log_debug("Start temperature updating") for index, thermal in enumerate(self.chassis.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index) except Exception as e: @@ -570,6 +584,8 @@ class TemperatureUpdater(logger.Logger): for psu_index, psu in enumerate(self.chassis.get_all_psus()): parent_name = 'PSU {}'.format(psu_index + 1) for thermal_index, thermal in enumerate(psu.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(parent_name, thermal, thermal_index) except Exception as e: @@ -578,6 +594,8 @@ class TemperatureUpdater(logger.Logger): for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()): parent_name = 'SFP {}'.format(sfp_index + 1) for thermal_index, thermal in enumerate(sfp.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(parent_name, thermal, thermal_index) except Exception as e: @@ -686,8 +704,8 @@ class ThermalMonitor(ProcessTaskBase): # Set minimum logging level to INFO self.logger.set_min_log_priority_info() - self.fan_updater = FanUpdater(chassis) - self.temperature_updater = TemperatureUpdater(chassis) + self.fan_updater = FanUpdater(chassis, self.task_stopping_event) + self.temperature_updater = TemperatureUpdater(chassis, self.task_stopping_event) def main(self): begin = time.time() @@ -789,6 +807,8 @@ class ThermalControlDaemon(daemon_base.DaemonBase): if sig in FATAL_SIGNALS: self.log_info("Caught signal '{}' - exiting...".format(SIGNALS_TO_NAMES_DICT[sig])) exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us + self.thermal_monitor.task_stop() + self.thermal_manager.stop() self.stop_event.set() elif sig in NONFATAL_SIGNALS: self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index d2f647384..7c272826b 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -1,5 +1,6 @@ import os import sys +import multiprocessing from imp import load_source # TODO: Replace with importlib once we no longer need to support Python 2 # TODO: Clean this up once we no longer need to support Python 2 @@ -148,7 +149,7 @@ class TestFanUpdater(object): Test cases to cover functionality in FanUpdater class """ def test_deinit(self): - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) fan_updater.fan_status_dict = {'key1': 'value1', 'key2': 'value2'} fan_updater.table._del = mock.MagicMock() @@ -161,7 +162,7 @@ def test_deinit(self): @mock.patch('thermalctld.update_entity_info', mock.MagicMock()) def test_refresh_fan_drawer_status_fan_drawer_get_name_not_impl(self): # Test case where fan_drawer.get_name is not implemented - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) mock_fan_drawer = mock.MagicMock() fan_updater._refresh_fan_drawer_status(mock_fan_drawer, 1) assert thermalctld.update_entity_info.call_count == 0 @@ -175,7 +176,7 @@ def test_update_fan_with_exception(self): fan.make_over_speed() chassis.get_all_fans().append(fan) - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED assert fan_updater.log_warning.call_count == 1 @@ -192,7 +193,7 @@ def test_set_fan_led_exception(self): mock_fan = MockFan() mock_fan.set_status_led = mock.MagicMock(side_effect=NotImplementedError) - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) fan_updater._set_fan_led(mock_fan_drawer, mock_fan, 'Test Fan', fan_status) assert fan_updater.log_warning.call_count == 1 fan_updater.log_warning.assert_called_with('Failed to set status LED for fan Test Fan, set_status_led not implemented') @@ -200,7 +201,7 @@ def test_set_fan_led_exception(self): def test_fan_absent(self): chassis = MockChassis() chassis.make_absent_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -224,7 +225,7 @@ def test_fan_absent(self): def test_fan_faulty(self): chassis = MockChassis() chassis.make_faulty_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -248,7 +249,7 @@ def test_fan_faulty(self): def test_fan_under_speed(self): chassis = MockChassis() chassis.make_under_speed_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -264,7 +265,7 @@ def test_fan_under_speed(self): def test_fan_over_speed(self): chassis = MockChassis() chassis.make_over_speed_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -283,7 +284,7 @@ def test_update_psu_fans(self): mock_fan = MockFan() psu._fan_list.append(mock_fan) chassis._psu_list.append(psu) - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 0 @@ -331,7 +332,7 @@ def test_insufficient_fan_number(): chassis = MockChassis() chassis.make_absent_fan() chassis.make_faulty_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 3 expected_calls = [ @@ -415,7 +416,7 @@ class TestTemperatureUpdater(object): """ def test_deinit(self): chassis = MockChassis() - temp_updater = thermalctld.TemperatureUpdater(chassis) + temp_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temp_updater.temperature_status_dict = {'key1': 'value1', 'key2': 'value2'} temp_updater.table._del = mock.MagicMock() @@ -423,11 +424,12 @@ def test_deinit(self): assert temp_updater.table._del.call_count == 2 expected_calls = [mock.call('key1'), mock.call('key2')] temp_updater.table._del.assert_has_calls(expected_calls, any_order=True) + def test_over_temper(self): chassis = MockChassis() chassis.make_over_temper_thermal() - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() thermal_list = chassis.get_all_thermals() assert temperature_updater.log_warning.call_count == 1 @@ -441,7 +443,7 @@ def test_over_temper(self): def test_under_temper(self): chassis = MockChassis() chassis.make_under_temper_thermal() - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() thermal_list = chassis.get_all_thermals() assert temperature_updater.log_warning.call_count == 1 @@ -458,7 +460,7 @@ def test_update_psu_thermals(self): mock_thermal = MockThermal() psu._thermal_list.append(mock_thermal) chassis._psu_list.append(psu) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 @@ -478,7 +480,7 @@ def test_update_sfp_thermals(self): mock_thermal = MockThermal() sfp._thermal_list.append(mock_thermal) chassis._sfp_list.append(sfp) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 @@ -499,7 +501,7 @@ def test_update_thermal_with_exception(self): thermal.make_over_temper() chassis.get_all_thermals().append(thermal) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 2 @@ -524,17 +526,17 @@ def test_updater_thermal_check_modular_chassis(): chassis = MockChassis() assert chassis.is_modular_chassis() == False - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table == None chassis.set_modular_chassis(True) chassis.set_my_slot(-1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table == None my_slot = 1 chassis.set_my_slot(my_slot) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table != None assert temperature_updater.chassis_table.table_name == '{}_{}'.format(TEMPER_INFO_TABLE_NAME, str(my_slot)) @@ -547,7 +549,7 @@ def test_updater_thermal_check_chassis_table(): chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.chassis_table.get_size() == chassis.get_num_thermals() @@ -566,7 +568,7 @@ def test_updater_thermal_check_min_max(): chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() slot_dict = temperature_updater.chassis_table.get(thermal.get_name()) @@ -580,12 +582,14 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() daemon_thermalctld.signal_handler(thermalctld.signal.SIGHUP, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert assert daemon_thermalctld.log_info.call_count == 1 daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 0 + assert daemon_thermalctld.thermal_manager.stop.call_count == 0 assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN # Test SIGINT @@ -593,6 +597,7 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() test_signal = thermalctld.signal.SIGINT daemon_thermalctld.signal_handler(test_signal, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert @@ -600,6 +605,7 @@ def test_signal_handler(): daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGINT' - exiting...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 1 + assert daemon_thermalctld.thermal_manager.stop.call_count == 1 assert thermalctld.exit_code == (128 + test_signal) # Test SIGTERM @@ -608,6 +614,7 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() test_signal = thermalctld.signal.SIGTERM daemon_thermalctld.signal_handler(test_signal, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert @@ -615,6 +622,7 @@ def test_signal_handler(): daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGTERM' - exiting...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 1 + assert daemon_thermalctld.thermal_manager.stop.call_count == 1 assert thermalctld.exit_code == (128 + test_signal) # Test an unhandled signal @@ -623,12 +631,14 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() daemon_thermalctld.signal_handler(thermalctld.signal.SIGUSR1, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert assert daemon_thermalctld.log_warning.call_count == 1 daemon_thermalctld.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...") assert daemon_thermalctld.log_info.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 0 + assert daemon_thermalctld.thermal_manager.stop.call_count == 0 assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN