From 27fdea46aadd3546bea67040126934fda37260c3 Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 1 Apr 2020 19:29:38 +0800 Subject: [PATCH 1/7] Enable changing PSU fan speed --- .../mlnx-platform-api/sonic_platform/fan.py | 25 ++++++++++++++++--- .../sonic_platform/thermal.py | 4 +-- .../sonic_platform/thermal_actions.py | 16 ++++++++++-- .../sonic_platform/thermal_conditions.py | 14 +++++++++++ .../sonic_platform/thermal_manager.py | 13 +++++++--- .../tests/test_thermal_policy.py | 4 +-- 6 files changed, 63 insertions(+), 13 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 14bb873ee8ef..e078557b840e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -9,6 +9,7 @@ ############################################################################# import os.path +import subprocess try: from sonic_platform_base.fan_base import FanBase @@ -22,6 +23,7 @@ FAN_PATH = "/var/run/hw-management/thermal/" LED_PATH = "/var/run/hw-management/led/" +CONFIG_PATH = "/var/run/hw-management/config" # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches FAN_DIR = "/var/run/hw-management/system/fan_dir" COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" @@ -39,6 +41,9 @@ class Fan(FanBase): STATUS_LED_COLOR_ORANGE = "orange" min_cooling_level = 2 + # PSU fan speed vector + PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c', + '0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64'] def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): # API index is starting from 0, Mellanox platform index is starting from 1 @@ -60,6 +65,10 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None + self.psu_i2c_bus_path = join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) + self.psu_i2c_addr_path = join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) + self.psu_i2c_command_path = join(CONFIG_PATH, 'fan_command') + self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index) @@ -239,9 +248,19 @@ def set_speed(self, speed): status = True if self.is_psu_fan: - #PSU fan speed is not setable. - return False - + try: + with open(self.psu_i2c_bus_path, 'r') as f: + bus = f.read().strip() + with open(self.psu_i2c_addr_path, 'r') as f: + addr = f.read().strip() + with open(self.psu_i2c_command_path, 'r') as f: + command = f.read().strip() + speed = Fan.PSU_FAN_SPEED[int(speed / 10)] + subprocess.call("i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed), shell = True) + return True + except Exception as e: + return False + try: cooling_level = int(speed / 10) if cooling_level < self.min_cooling_level: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index a9a83daccd7c..66ff58f7ae07 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -549,8 +549,8 @@ def get_air_flow_direction(cls): port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) # if there is any exception, let it raise - fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path)) - port_ambient_temp = int(cls._read_generic_file(port_ambient_path)) + fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0)) + port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0)) if fan_ambient_temp > port_ambient_temp: return 'p2c', fan_ambient_temp elif fan_ambient_temp < port_ambient_temp: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 4bde98fcdc5b..3a364d594057 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -146,8 +146,8 @@ def execute(self, thermal_info_dict): for key, cooling_level in minimum_table.items(): temp_range = key.split(':') - temp_min = int(temp_range[0]) * 1000 - temp_max = int(temp_range[1]) * 1000 + temp_min = int(temp_range[0]) + temp_max = int(temp_range[1]) if temp_min <= temperature <= temp_max: Fan.min_cooling_level = cooling_level - 10 break @@ -155,3 +155,15 @@ def execute(self, thermal_info_dict): current_cooling_level = Fan.get_cooling_level() if current_cooling_level < Fan.min_cooling_level: Fan.set_cooling_level(Fan.min_cooling_level) + + +class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + from .thermal_conditions import CoolingLevelChangeCondition + from .thermal_infos import ChassisInfo + + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + cooling_level = CoolingLevelChangeCondition.cooling_level + for psu in chassis.get_all_psus(): + for psu_fan in psu.get_all_fans(): + psu_fan.set_speed(cooling_level * 10) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 8593b0f32881..bc5a1bb68276 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -86,6 +86,7 @@ def is_match(self, thermal_info_dict): trust_state = Thermal.check_module_temperature_trustable() air_flow_dir, temperature = Thermal.get_air_flow_direction() + temperature = temperature / 1000 change_cooling_level = False if trust_state != MinCoolingLevelChangeCondition.trust_state: @@ -101,3 +102,16 @@ def is_match(self, thermal_info_dict): change_cooling_level = True return change_cooling_level + + +class CoolingLevelChangeCondition(ThermalPolicyConditionBase): + cooling_level = None + + def is_match(self, thermal_info_dict): + from .fan import Fan + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level != cooling_level: + CoolingLevelChangeCondition.cooling_level = current_cooling_level + return True + else: + return False diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index efcfb154717f..d70a5bcd1a03 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -40,7 +40,12 @@ def stop_thermal_control_algorithm(cls): @classmethod def _add_private_thermal_policy(cls): - policy = ThermalPolicy() - policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() - policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() - cls._policy_dict['DynamicMinCoolingLevelPolicy'] = policy + dynamic_min_speed_policy = ThermalPolicy() + dynamic_min_speed_policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() + dynamic_min_speed_policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() + cls._policy_dict['DynamicMinCoolingLevelPolicy'] = dynamic_min_speed_policy + + update_psu_fan_speed_policy = ThermalPolicy() + update_psu_fan_speed_policy.conditions[CoolingLevelChangeCondition] = CoolingLevelChangeCondition() + update_psu_fan_speed_policy.actions[UpdatePsuFanSpeedAction] = UpdatePsuFanSpeedAction() + cls._policy_dict['UpdatePsuFanSpeedPolicy'] = update_psu_fan_speed_policy diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index a89cd75c6e3b..789e316bb540 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -486,7 +486,7 @@ def test_dynamic_minimum_policy(thermal_manager): assert condition.is_match(None) assert MinCoolingLevelChangeCondition.trust_state == 'trust' assert MinCoolingLevelChangeCondition.air_flow_dir == 'p2c' - assert MinCoolingLevelChangeCondition.temperature == 35000 + assert MinCoolingLevelChangeCondition.temperature == 35 assert not condition.is_match(None) Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust') @@ -499,7 +499,7 @@ def test_dynamic_minimum_policy(thermal_manager): Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 25000)) assert condition.is_match(None) - assert MinCoolingLevelChangeCondition.temperature == 25000 + assert MinCoolingLevelChangeCondition.temperature == 25 chassis = MockChassis() chassis.sku_name = 'invalid' From f61a3636f468578c6daa7d139448751a1fa604e2 Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 1 Apr 2020 19:37:46 +0800 Subject: [PATCH 2/7] install i2c-tool in pmon docker --- dockers/docker-platform-monitor/Dockerfile.j2 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dockers/docker-platform-monitor/Dockerfile.j2 b/dockers/docker-platform-monitor/Dockerfile.j2 index fd11f628559c..1763eb9bac0b 100755 --- a/dockers/docker-platform-monitor/Dockerfile.j2 +++ b/dockers/docker-platform-monitor/Dockerfile.j2 @@ -18,7 +18,8 @@ RUN apt-get update && \ rrdtool \ python-smbus \ ethtool \ - dmidecode + dmidecode \ + i2c-tools {% if docker_platform_monitor_debs.strip() -%} # Copy locally-built Debian package dependencies From b285bc84b25b2a65e5df3565fbadd61631e6588a Mon Sep 17 00:00:00 2001 From: junchao Date: Thu, 2 Apr 2020 09:55:22 +0800 Subject: [PATCH 3/7] fix issue found in manual test --- platform/mellanox/mlnx-platform-api/sonic_platform/fan.py | 6 +++--- .../mlnx-platform-api/sonic_platform/thermal_conditions.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index e078557b840e..15f5bdf5ed32 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -65,9 +65,9 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None - self.psu_i2c_bus_path = join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) - self.psu_i2c_addr_path = join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) - self.psu_i2c_command_path = join(CONFIG_PATH, 'fan_command') + self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) + self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) + self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command') self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index bc5a1bb68276..5d8a5821ad42 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -110,7 +110,7 @@ class CoolingLevelChangeCondition(ThermalPolicyConditionBase): def is_match(self, thermal_info_dict): from .fan import Fan current_cooling_level = Fan.get_cooling_level() - if current_cooling_level != cooling_level: + if current_cooling_level != CoolingLevelChangeCondition.cooling_level: CoolingLevelChangeCondition.cooling_level = current_cooling_level return True else: From de6ca46b9db7d6d7e5254579683a95a9c9195f2d Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 3 Apr 2020 14:56:43 +0800 Subject: [PATCH 4/7] Add logs to thermal actions --- .../mlnx-platform-api/sonic_platform/thermal_actions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 3a364d594057..eaed99eca9ed 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -1,5 +1,6 @@ from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object +from .thermal import logger class SetFanSpeedAction(ThermalPolicyActionBase): @@ -52,6 +53,7 @@ def execute(self, thermal_info_dict): fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] for fan in fan_info_obj.get_presence_fans(): fan.set_speed(self.speed) + logger.log_info('Set all system FAN speed to {}'.format(self.speed)) @thermal_json_object('fan.all.check_and_set_speed') @@ -126,6 +128,8 @@ def execute(self, thermal_info_dict): break fan.set_speed(Fan.min_cooling_level * 10) + logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) + class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): UNKNOWN_SKU_COOLING_LEVEL = 6 @@ -156,6 +160,8 @@ def execute(self, thermal_info_dict): if current_cooling_level < Fan.min_cooling_level: Fan.set_cooling_level(Fan.min_cooling_level) + logger.log_info('Changed minimum cooling level to {}'.format(Fan.min_cooling_level)) + class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): def execute(self, thermal_info_dict): @@ -167,3 +173,5 @@ def execute(self, thermal_info_dict): for psu in chassis.get_all_psus(): for psu_fan in psu.get_all_fans(): psu_fan.set_speed(cooling_level * 10) + + logger.log_info('Updated PSU FAN speed to {}%'.format(cooling_level * 10)) From c16bb1d8e7275f53695835f28158bc0f3c70de9b Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 3 Apr 2020 16:45:44 +0800 Subject: [PATCH 5/7] Update PSU fan speed whenever system fan speed or cooling level changed --- .../sonic_platform/thermal_actions.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index eaed99eca9ed..dbb2e2e54059 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -55,6 +55,20 @@ def execute(self, thermal_info_dict): fan.set_speed(self.speed) logger.log_info('Set all system FAN speed to {}'.format(self.speed)) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, self.speed) + + @classmethod + def set_psu_fan_speed(cls, thermal_info_dict, speed): + from .thermal_infos import ChassisInfo + if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo): + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + for psu in chassis.get_all_psus(): + for psu_fan in psu.get_all_fans(): + psu_fan.set_speed(speed) + + logger.log_info('Updated PSU FAN speed to {}%'.format(speed)) + + @thermal_json_object('fan.all.check_and_set_speed') class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): @@ -123,10 +137,16 @@ def execute(self, thermal_info_dict): # save power if Thermal.check_thermal_zone_temperature(): fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + update_psu_fan_speed = False + speed = Fan.min_cooling_level * 10 for fan in fan_info_obj.get_presence_fans(): if fan.get_target_speed() != 100: break - fan.set_speed(Fan.min_cooling_level * 10) + update_psu_fan_speed = True + fan.set_speed(speed) + + if update_psu_fan_speed: + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, speed) logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) @@ -159,6 +179,7 @@ def execute(self, thermal_info_dict): current_cooling_level = Fan.get_cooling_level() if current_cooling_level < Fan.min_cooling_level: Fan.set_cooling_level(Fan.min_cooling_level) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, Fan.min_cooling_level * 10) logger.log_info('Changed minimum cooling level to {}'.format(Fan.min_cooling_level)) @@ -166,12 +187,4 @@ def execute(self, thermal_info_dict): class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): def execute(self, thermal_info_dict): from .thermal_conditions import CoolingLevelChangeCondition - from .thermal_infos import ChassisInfo - - chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() - cooling_level = CoolingLevelChangeCondition.cooling_level - for psu in chassis.get_all_psus(): - for psu_fan in psu.get_all_fans(): - psu_fan.set_speed(cooling_level * 10) - - logger.log_info('Updated PSU FAN speed to {}%'.format(cooling_level * 10)) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, CoolingLevelChangeCondition.cooling_level * 10) From ab9132fa3b59432cb91221259125596006160721 Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 3 Apr 2020 16:49:31 +0800 Subject: [PATCH 6/7] fix unit test failure --- platform/mellanox/mlnx-platform-api/tests/mock_platform.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index ff8adb66ac91..c53480584889 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -28,6 +28,9 @@ def get_presence(self): def get_powergood_status(self): return self.powergood + def get_all_fans(self): + return [] + class MockChassis: def __init__(self): From af951c650a1642fac2a865c367f9e411aa9290c9 Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 8 Apr 2020 16:21:01 +0800 Subject: [PATCH 7/7] add some logs if set PSU FAN speed failed --- platform/mellanox/mlnx-platform-api/sonic_platform/fan.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 15f5bdf5ed32..347807e43f9a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -248,6 +248,7 @@ def set_speed(self, speed): status = True if self.is_psu_fan: + from .thermal import logger try: with open(self.psu_i2c_bus_path, 'r') as f: bus = f.read().strip() @@ -256,9 +257,14 @@ def set_speed(self, speed): with open(self.psu_i2c_command_path, 'r') as f: command = f.read().strip() speed = Fan.PSU_FAN_SPEED[int(speed / 10)] - subprocess.call("i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed), shell = True) + command = "i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed) + res = subprocess.check_call(command, shell = True) return True + except subprocess.CalledProcessError as ce: + logger.log_error('Failed to call command {}, return code={}, command output={}'.format(ce.cmd, ce.returncode, ce.output)) + return False except Exception as e: + logger.log_error('Failed to set PSU FAN speed - {}'.format(e)) return False try: