diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 61aca9218..c798fde96 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -538,11 +538,17 @@ class TemperatureUpdater(logger.Logger): self.is_chassis_system = chassis.is_modular_chassis() if self.is_chassis_system: + self.module_thermals = set() my_slot = try_get(chassis.get_my_slot, INVALID_SLOT) if my_slot != INVALID_SLOT: - table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot) - chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB") - self.chassis_table = swsscommon.Table(chassis_state_db, table_name) + try: + # Modular chassis does not have to have table CHASSIS_STATE_DB. + # So catch the exception here and ignore it. + table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot) + chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB") + self.chassis_table = swsscommon.Table(chassis_state_db, table_name) + except Exception as e: + self.chassis_table = None def deinit(self): """ @@ -576,31 +582,61 @@ class TemperatureUpdater(logger.Logger): for index, thermal in enumerate(self.chassis.get_all_thermals()): if self.task_stopping_event.is_set(): return - try: - self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index) - except Exception as e: - self.log_warning('Failed to update thermal status - {}'.format(repr(e))) + + self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index) for psu_index, psu in enumerate(self.chassis.get_all_psus()): parent_name = 'PSU {}'.format(psu_index + 1) for thermal_index, thermal in enumerate(psu.get_all_thermals()): if self.task_stopping_event.is_set(): return - try: - self._refresh_temperature_status(parent_name, thermal, thermal_index) - except Exception as e: - self.log_warning('Failed to update thermal status - {}'.format(repr(e))) + + self._refresh_temperature_status(parent_name, thermal, thermal_index) for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()): parent_name = 'SFP {}'.format(sfp_index + 1) for thermal_index, thermal in enumerate(sfp.get_all_thermals()): if self.task_stopping_event.is_set(): return - try: - self._refresh_temperature_status(parent_name, thermal, thermal_index) - except Exception as e: - self.log_warning('Failed to update thermal status - {}'.format(repr(e))) + self._refresh_temperature_status(parent_name, thermal, thermal_index) + + if self.is_chassis_system: + available_thermals = set() + for module_index, module in enumerate(self.chassis.get_all_modules()): + module_name = try_get(module.get_name, 'Module {}'.format(module_index + 1)) + + for thermal_index, thermal in enumerate(module.get_all_thermals()): + if self.task_stopping_event.is_set(): + return + + available_thermals.add((thermal, module_name, thermal_index)) + self._refresh_temperature_status(module_name, thermal, thermal_index) + + for sfp_index, sfp in enumerate(module.get_all_sfps()): + sfp_name = '{} SFP {}'.format(module_name, sfp_index + 1) + for thermal_index, thermal in enumerate(sfp.get_all_thermals()): + if self.task_stopping_event.is_set(): + return + + available_thermals.add((thermal, sfp_name, thermal_index)) + self._refresh_temperature_status(sfp_name, thermal, thermal_index) + + for psu_index, psu in enumerate(module.get_all_psus()): + psu_name = '{} PSU {}'.format(module_name, psu_index + 1) + for thermal_index, thermal in enumerate(psu.get_all_thermals()): + if self.task_stopping_event.is_set(): + return + + available_thermals.add((thermal, psu_name, thermal_index)) + self._refresh_temperature_status(psu_name, thermal, thermal_index) + + + thermals_to_remove = self.module_thermals - available_thermals + self.module_thermals = available_thermals + for thermal, parent_name, thermal_index in thermals_to_remove: + self._remove_thermal_from_db(thermal, parent_name, thermal_index) + self.log_debug("End temperature updating") def _refresh_temperature_status(self, parent_name, thermal, thermal_index): @@ -611,72 +647,82 @@ class TemperatureUpdater(logger.Logger): :param thermal_index: Index of the thermal object in platform chassis :return: """ - name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1)) + try: + name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1)) + + # Only save entity info for thermals that belong to chassis and PSU + # for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR + # and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent + # logical port + if 'SFP' not in parent_name: + update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1) + + if name not in self.temperature_status_dict: + self.temperature_status_dict[name] = TemperatureStatus() + + temperature_status = self.temperature_status_dict[name] + + high_threshold = NOT_AVAILABLE + low_threshold = NOT_AVAILABLE + high_critical_threshold = NOT_AVAILABLE + low_critical_threshold = NOT_AVAILABLE + maximum_temperature = NOT_AVAILABLE + minimum_temperature = NOT_AVAILABLE + temperature = try_get(thermal.get_temperature) + is_replaceable = try_get(thermal.is_replaceable, False) + if temperature != NOT_AVAILABLE: + temperature_status.set_temperature(name, temperature) + minimum_temperature = try_get(thermal.get_minimum_recorded) + maximum_temperature = try_get(thermal.get_maximum_recorded) + high_threshold = try_get(thermal.get_high_threshold) + low_threshold = try_get(thermal.get_low_threshold) + high_critical_threshold = try_get(thermal.get_high_critical_threshold) + low_critical_threshold = try_get(thermal.get_low_critical_threshold) + + warning = False + if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold): + self._log_on_status_changed(not temperature_status.over_temperature, + 'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'. + format(name, temperature, high_threshold), + 'High temperature warning: {} current temperature {}C, high threshold {}C'. + format(name, temperature, high_threshold) + ) + warning = warning | temperature_status.over_temperature + + if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold): + self._log_on_status_changed(not temperature_status.under_temperature, + 'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'. + format(name, temperature, low_threshold), + 'Low temperature warning: {} current temperature {}C, low threshold {}C'. + format(name, temperature, low_threshold) + ) + warning = warning | temperature_status.under_temperature + + fvs = swsscommon.FieldValuePairs( + [('temperature', str(temperature)), + ('minimum_temperature', str(minimum_temperature)), + ('maximum_temperature', str(maximum_temperature)), + ('high_threshold', str(high_threshold)), + ('low_threshold', str(low_threshold)), + ('warning_status', str(warning)), + ('critical_high_threshold', str(high_critical_threshold)), + ('critical_low_threshold', str(low_critical_threshold)), + ('is_replaceable', str(is_replaceable)), + ('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S')) + ]) - # Only save entity info for thermals that belong to chassis and PSU - # for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR - # and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent - # logical port - if 'SFP' not in parent_name: - update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1) - - if name not in self.temperature_status_dict: - self.temperature_status_dict[name] = TemperatureStatus() - - temperature_status = self.temperature_status_dict[name] - - high_threshold = NOT_AVAILABLE - low_threshold = NOT_AVAILABLE - high_critical_threshold = NOT_AVAILABLE - low_critical_threshold = NOT_AVAILABLE - maximum_temperature = NOT_AVAILABLE - minimum_temperature = NOT_AVAILABLE - temperature = try_get(thermal.get_temperature) - is_replaceable = try_get(thermal.is_replaceable, False) - if temperature != NOT_AVAILABLE: - temperature_status.set_temperature(name, temperature) - minimum_temperature = try_get(thermal.get_minimum_recorded) - maximum_temperature = try_get(thermal.get_maximum_recorded) - high_threshold = try_get(thermal.get_high_threshold) - low_threshold = try_get(thermal.get_low_threshold) - high_critical_threshold = try_get(thermal.get_high_critical_threshold) - low_critical_threshold = try_get(thermal.get_low_critical_threshold) - - warning = False - if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold): - self._log_on_status_changed(not temperature_status.over_temperature, - 'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'. - format(name, temperature, high_threshold), - 'High temperature warning: {} current temperature {}C, high threshold {}C'. - format(name, temperature, high_threshold) - ) - warning = warning | temperature_status.over_temperature - - if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold): - self._log_on_status_changed(not temperature_status.under_temperature, - 'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'. - format(name, temperature, low_threshold), - 'Low temperature warning: {} current temperature {}C, low threshold {}C'. - format(name, temperature, low_threshold) - ) - warning = warning | temperature_status.under_temperature + self.table.set(name, fvs) + if self.is_chassis_system and self.chassis_table is not None: + self.chassis_table.set(name, fvs) + except Exception as e: + self.log_warning('Failed to update thermal status for {} - {}'.format(name, repr(e))) - fvs = swsscommon.FieldValuePairs( - [('temperature', str(temperature)), - ('minimum_temperature', str(minimum_temperature)), - ('maximum_temperature', str(maximum_temperature)), - ('high_threshold', str(high_threshold)), - ('low_threshold', str(low_threshold)), - ('warning_status', str(warning)), - ('critical_high_threshold', str(high_critical_threshold)), - ('critical_low_threshold', str(low_critical_threshold)), - ('is_replaceable', str(is_replaceable)), - ('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S')) - ]) + def _remove_thermal_from_db(self, thermal, parent_name, thermal_index): + name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1)) + self.table._del(name) - self.table.set(name, fvs) - if self.is_chassis_system and self.chassis_table is not None: - self.chassis_table.set(name, fvs) + if self.chassis_table is not None: + self.chassis_table._del(name) class ThermalMonitor(ProcessTaskBase): diff --git a/sonic-thermalctld/tests/mock_platform.py b/sonic-thermalctld/tests/mock_platform.py index 117dcafcf..660903226 100644 --- a/sonic-thermalctld/tests/mock_platform.py +++ b/sonic-thermalctld/tests/mock_platform.py @@ -388,6 +388,17 @@ def make_error_thermal(self): thermal = MockErrorThermal() self._thermal_list.append(thermal) + def make_module_thermal(self): + module = MockModule() + self._module_list.append(module) + sfp = MockSfp() + sfp._thermal_list.append(MockThermal()) + psu = MockPsu() + psu._thermal_list.append(MockThermal()) + module._sfp_list.append(sfp) + module._psu_list.append(psu) + module._thermal_list.append(MockThermal()) + def is_modular_chassis(self): return self._is_chassis_system @@ -430,3 +441,8 @@ def get_position_in_parent(self): def is_replaceable(self): return self._replaceable + + +class MockModule(module_base.ModuleBase): + def __init__(self): + super(MockModule, self).__init__() diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index 9555b4c7a..834b35d33 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -470,15 +470,15 @@ def test_update_psu_thermals(self): temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 - temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message")) + mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message")) temperature_updater.update() assert temperature_updater.log_warning.call_count == 1 # TODO: Clean this up once we no longer need to support Python 2 if sys.version_info.major == 3: - temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')") + temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message')") else: - temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)") + temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message',)") def test_update_sfp_thermals(self): chassis = MockChassis() @@ -490,15 +490,15 @@ def test_update_sfp_thermals(self): temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 - temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message")) + mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message")) temperature_updater.update() assert temperature_updater.log_warning.call_count == 1 # TODO: Clean this up once we no longer need to support Python 2 if sys.version_info.major == 3: - temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')") + temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message')") else: - temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)") + temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message',)") def test_update_thermal_with_exception(self): chassis = MockChassis() @@ -514,16 +514,28 @@ def test_update_thermal_with_exception(self): # TODO: Clean this up once we no longer need to support Python 2 if sys.version_info.major == 3: expected_calls = [ - mock.call("Failed to update thermal status - Exception('Failed to get temperature')"), + mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature')"), mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C') ] else: expected_calls = [ - mock.call("Failed to update thermal status - Exception('Failed to get temperature',)"), + mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature',)"), mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C') ] assert temperature_updater.log_warning.mock_calls == expected_calls + def test_update_module_thermals(self): + chassis = MockChassis() + chassis.make_module_thermal() + chassis.set_modular_chassis(True) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater.update() + assert len(temperature_updater.module_thermals) == 3 + + chassis._module_list = [] + temperature_updater.update() + assert len(temperature_updater.module_thermals) == 0 + # Modular chassis-related tests