From f65991b7b03a272037b4c46790fc8c2d36d4b49a Mon Sep 17 00:00:00 2001 From: junchao Date: Mon, 21 Feb 2022 09:06:12 +0800 Subject: [PATCH] [Mellanox] Add CPU thermal control for SN4800 --- .../sonic_platform/cpu_thermal_control.py | 46 +++++++++++++++ .../sonic_platform/device_data.py | 20 ++++++- .../sonic_platform/thermal.py | 9 +++ .../sonic_platform/thermal_manager.py | 27 ++++++++- .../tests/test_cpu_thermal_control.py | 59 +++++++++++++++++++ 5 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/cpu_thermal_control.py create mode 100644 platform/mellanox/mlnx-platform-api/tests/test_cpu_thermal_control.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/cpu_thermal_control.py b/platform/mellanox/mlnx-platform-api/sonic_platform/cpu_thermal_control.py new file mode 100644 index 000000000000..7dbf061254d3 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/cpu_thermal_control.py @@ -0,0 +1,46 @@ +from sonic_py_common.task_base import ThreadTaskBase + +from . 
class CPUThermalControl(ThreadTaskBase):
    """Periodic task that adjusts the CPU cooling state from the CPU temperature.

    On every cycle the worker reads ``CPU_TEMP_FILE`` and writes
    ``CPU_COOLING_STATE`` according to this policy:

    * temperature below ``temp_low``  -> ``MIN_COOLING_STATE``
    * temperature above ``temp_high`` -> ``MAX_COOLING_STATE``
    * otherwise the current cooling state is moved one step in the direction
      of the temperature trend (up when rising, down when falling, unchanged
      when steady) and clamped to ``[MIN_COOLING_STATE, MAX_COOLING_STATE]``.

    ``temp_low`` / ``temp_high`` come from
    ``DeviceDataManager.get_cpu_thermal_threshold()``.
    """

    CPU_COOLING_STATE = '/var/run/hw-management/thermal/cooling2_cur_state'
    CPU_TEMP_FILE = '/var/run/hw-management/thermal/cpu_pack'
    MAX_COOLING_STATE = 10
    MIN_COOLING_STATE = 2
    # Timeout handed to task_stopping_event.wait() between two cycles
    # (presumably seconds, if the event is a threading.Event -- TODO confirm).
    INTERVAL = 3

    def __init__(self):
        super(CPUThermalControl, self).__init__()
        self.temp_low, self.temp_high = DeviceDataManager.get_cpu_thermal_threshold()

    def task_worker(self):
        """Run one control cycle per INTERVAL until the stopping event is set."""
        # NOTE(review): the trend is seeded with 0, so on the very first cycle
        # any positive in-band reading is treated as "rising".
        last_temp = 0
        while not self.task_stopping_event.wait(self.INTERVAL):
            last_temp = self.run(last_temp)

    def run(self, last_temp):
        """Execute a single control cycle.

        Args:
            last_temp: temperature returned by the previous cycle; used only
                to detect the trend while the reading is inside the
                [temp_low, temp_high] band.

        Returns:
            The temperature read during this cycle, to be passed back as
            ``last_temp`` on the next call.
        """
        current_temp = self.read_cpu_temp()

        if current_temp < self.temp_low:
            self.set_cooling_state(self.MIN_COOLING_STATE)
            return current_temp

        if current_temp > self.temp_high:
            self.set_cooling_state(self.MAX_COOLING_STATE)
            return current_temp

        # Inside the band: follow the trend one step at a time.
        cooling_state = self.get_cooling_state()
        if current_temp > last_temp:
            self.set_cooling_state(min(cooling_state + 1, self.MAX_COOLING_STATE))
        elif current_temp < last_temp:
            self.set_cooling_state(max(cooling_state - 1, self.MIN_COOLING_STATE))
        # Steady temperature: leave the cooling state untouched.
        return current_temp

    def set_cooling_state(self, state):
        """Write ``state`` to the CPU cooling state file."""
        utils.write_file(self.CPU_COOLING_STATE, state, log_func=None)

    def get_cooling_state(self):
        """Read the current cooling state; MAX_COOLING_STATE is passed as the default."""
        return utils.read_int_from_file(self.CPU_COOLING_STATE, default=self.MAX_COOLING_STATE, log_func=None)

    def read_cpu_temp(self):
        """Read the CPU temperature; ``temp_high`` is passed as the default.

        Values above 1000 are scaled down by 1000 (presumably the file may
        report millidegrees -- TODO confirm); smaller values are returned
        unchanged.
        """
        cpu_temp = utils.read_int_from_file(self.CPU_TEMP_FILE, default=self.temp_high, log_func=None)
        # Floor division keeps the arithmetic in integers instead of going
        # through a float and truncating it back.
        return cpu_temp if cpu_temp <= 1000 else cpu_temp // 1000
# --- sonic_platform/device_data.py : methods of DeviceDataManager ---

@classmethod
def is_cpu_thermal_control_supported(cls):
    """Return True when the current platform defines CPU thermal thresholds.

    A platform counts as unsupported when get_cpu_thermal_threshold()
    yields the ``(None, None)`` placeholder.
    """
    return cls.get_cpu_thermal_threshold() != (None, None)


@classmethod
@utils.read_only_cache()
def get_cpu_thermal_threshold(cls):
    """Return the ``cpu_threshold`` entry of this platform's thermal data.

    Returns:
        The value stored under ``DEVICE_DATA[platform]['thermal']['cpu_threshold']``
        (a ``(low, high)`` pair where defined), or ``(None, None)`` when the
        platform, its ``'thermal'`` section, or the ``'cpu_threshold'`` key
        is missing or empty.
    """
    # Missing or empty levels collapse to an empty dict so a single lookup
    # chain covers every "not configured" case.
    platform_data = DEVICE_DATA.get(cls.get_platform_name(), None) or {}
    thermal_data = platform_data.get('thermal', None) or {}
    return thermal_data.get('cpu_threshold', (None, None))


# --- sonic_platform/thermal.py : method of Thermal ---

@classmethod
def start_cpu_thermal_control(cls, chassis):
    """No-op hook kept only so existing callers keep working.

    The previous body looked up the platform name, returned early unless it
    was 'x86_64-nvidia_sn4800-r0', and then fell off the end of the function,
    so it did nothing on every platform. The dead, hard-coded platform check
    is removed; the observable behaviour (returns None, touches no state) is
    unchanged.

    Args:
        chassis: unused.
    """
    return None
# --- sonic_platform/thermal_manager.py : methods of ThermalManager ---
# NOTE: "algoritm" is misspelled, but the names are kept as-is for callers.

@classmethod
def start_cpu_thermal_control_algoritm(cls):
    """Start the CPU thermal control task if supported and not already started.

    Does nothing when a controller is already stored on the class or when
    DeviceDataManager reports that the platform has no CPU thermal
    thresholds.
    """
    if cls.cpu_thermal_control:
        return

    if not DeviceDataManager.is_cpu_thermal_control_supported():
        return

    # Publish the controller only after task_run() succeeded. Otherwise a
    # failed start would leave a never-started instance cached on the class
    # and every later call would return early without retrying.
    controller = CPUThermalControl()
    controller.task_run()
    cls.cpu_thermal_control = controller


@classmethod
def stop_cpu_thermal_control_algoritm(cls):
    """Stop the CPU thermal control task, if one is stored, and forget it."""
    if cls.cpu_thermal_control:
        cls.cpu_thermal_control.task_stop()
        cls.cpu_thermal_control = None
class TestCPUThermalControl:
    """Unit tests for CPUThermalControl.run() with all file access mocked."""

    @mock.patch('sonic_platform.device_data.DeviceDataManager.get_cpu_thermal_threshold', mock.MagicMock(return_value=(85, 95)))
    @mock.patch('sonic_platform.utils.read_int_from_file')
    @mock.patch('sonic_platform.utils.write_file')
    def test_run(self, mock_write_file, mock_read_file):
        instance = CPUThermalControl()
        # Fake "file system": path -> integer content returned by the read mock.
        file_content = {
            CPUThermalControl.CPU_COOLING_STATE: 5,
            CPUThermalControl.CPU_TEMP_FILE: instance.temp_high + 1
        }

        def read_file(file_path, **kwargs):
            return file_content[file_path]

        mock_read_file.side_effect = read_file

        # Current temp is higher than the high threshold -> max cooling state
        instance.run(0)
        mock_write_file.assert_called_with(CPUThermalControl.CPU_COOLING_STATE, CPUThermalControl.MAX_COOLING_STATE, log_func=None)

        # Current temp is lower than the low threshold -> min cooling state
        file_content[CPUThermalControl.CPU_TEMP_FILE] = instance.temp_low - 1
        instance.run(0)
        mock_write_file.assert_called_with(CPUThermalControl.CPU_COOLING_STATE, CPUThermalControl.MIN_COOLING_STATE, log_func=None)

        # In band and increasing -> one step up (writes are mocked, so the
        # cooling state read back is still 5)
        file_content[CPUThermalControl.CPU_TEMP_FILE] = instance.temp_low
        instance.run(0)
        mock_write_file.assert_called_with(CPUThermalControl.CPU_COOLING_STATE, 6, log_func=None)

        # In band and decreasing -> one step down
        instance.run(instance.temp_low + 1)
        mock_write_file.assert_called_with(CPUThermalControl.CPU_COOLING_STATE, 4, log_func=None)

        # Increasing while the cooling state is already the max -> stays at max
        file_content[CPUThermalControl.CPU_TEMP_FILE] = 85
        file_content[CPUThermalControl.CPU_COOLING_STATE] = CPUThermalControl.MAX_COOLING_STATE
        instance.run(84)
        mock_write_file.assert_called_with(CPUThermalControl.CPU_COOLING_STATE, CPUThermalControl.MAX_COOLING_STATE, log_func=None)

        # Decreasing while the cooling state is already the min -> stays at min
        file_content[CPUThermalControl.CPU_COOLING_STATE] = CPUThermalControl.MIN_COOLING_STATE
        instance.run(86)
        mock_write_file.assert_called_with(CPUThermalControl.CPU_COOLING_STATE, CPUThermalControl.MIN_COOLING_STATE, log_func=None)

        # In band and unchanged -> the cooling state must not be written at all
        mock_write_file.reset_mock()
        instance.run(85)
        mock_write_file.assert_not_called()