diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json index 054d797be951..f16f68dd002e 100644 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json @@ -1,6 +1,6 @@ { "thermal_control_algorithm": { - "run_at_boot_up": "false", + "run_at_boot_up": "true", "fan_speed_when_suspend": "60" }, "info_types": [ @@ -51,6 +51,24 @@ } ] }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, { "name": "all fan and psu presence", "conditions": [ @@ -59,12 +77,15 @@ }, { "type": "psu.all.presence" + }, + { + "type": "fan.all.good" } ], "actions": [ { - "type": "fan.all.set_speed", - "speed": "60" + "type": "thermal_control.control", + "status": "true" } ] } diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py new file mode 100644 index 000000000000..f12ae0d5f323 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -0,0 +1,134 @@ +DEVICE_DATA = { + 'ACS-MSN2700': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'LS-SN2700': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'ACS-MSN2740': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":13}, + "p2c_untrust": {"-127:35":13, "36:40":14 , "41:120":15}, + "c2p_trust": {"-127:120":13}, + "c2p_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + "unk_trust": {"-127:120":13}, + "unk_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + } + } + }, + 'ACS-MSN2410': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'Mellanox-SN2700': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'Mellanox-SN2700-D48C8': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'ACS-MSN2100': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":12}, + "p2c_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16}, + "c2p_trust": {"-127:40":12, "41:120":13}, + "c2p_untrust": {"-127:40":12, "41:120":13}, + "unk_trust": {"-127:40":12, "41:120":13}, + "unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16} + } + } + }, + 'ACS-MSN2010': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":12}, + "p2c_untrust": {"-127:15":12, "16:20":13, "21:30":14, "31:35":15, "36:120":16}, + "c2p_trust": {"-127:120":12}, + "c2p_untrust": {"-127:20":12, "21:25":13 , "26:30":14, "31:35":15, "36:120":16}, + "unk_trust": {"-127:120":12}, + "unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16} + } + } + }, + 'ACS-MSN3700': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + } + } + }, + 'ACS-MSN3800': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:35":12, "36:120":13}, + "p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + "c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + } + } + }, + 'Mellanox-SN3800-D112C8': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:35":12, "36:120":13}, + "p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + "c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + } + } + }, +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 9ce65d1e2f98..f513f1944b41 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -24,15 +24,18 @@ LED_PATH = "/var/run/hw-management/led/" # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches FAN_DIR = "/var/run/hw-management/system/fan_dir" +COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" # SKUs with unplugable FANs: # 1. don't have fanX_status and should be treated as always present hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100'] + class Fan(FanBase): """Platform-specific Fan class""" STATUS_LED_COLOR_ORANGE = "orange" + min_cooling_level = 2 def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): # API index is starting from 0, Mellanox platform index is starting from 1 @@ -231,13 +234,18 @@ def set_speed(self, speed): bool: True if set success, False if fail. """ status = True - pwm = int(round(PWM_MAX*speed/100.0)) if self.is_psu_fan: #PSU fan speed is not setable. return False try: + cooling_level = int(speed / 10) + if cooling_level < self.min_cooling_level: + cooling_level = self.min_cooling_level + speed = self.min_cooling_level * 10 + self.set_cooling_level(cooling_level) + pwm = int(round(PWM_MAX*speed/100.0)) with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm: fan_pwm.write(str(pwm)) except (ValueError, IOError): @@ -352,3 +360,36 @@ def get_speed_tolerance(self): """ # The tolerance value is fixed as 20% for all the Mellanox platform return 20 + + @classmethod + def set_cooling_level(cls, level): + """ + Change cooling level. The input level should be an integer value [1, 10]. + 1 means 10%, 2 means 20%, 10 means 100%. + """ + if not isinstance(level, int): + raise RuntimeError("Failed to set cooling level, input parameter must be integer") + + if level < 1 or level > 10: + raise RuntimeError("Failed to set cooling level, level value must be in range [1, 10], got {}".format(level)) + + try: + # reset FAN driver and change cooling state + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(str(level + 10)) + + # make cooling state display correct value + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(str(level)) + except (ValueError, IOError) as e: + raise RuntimeError("Failed to set cooling level - {}".format(e)) + + @classmethod + def get_cooling_level(cls): + try: + with open(COOLING_STATE_PATH, 'r') as cooling_state: + cooling_level = int(cooling_state.read()) + return cooling_level + except (ValueError, IOError) as e: + raise RuntimeError("Failed to get cooling level - {}".format(e)) + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index a5faa5ea793a..a9a83daccd7c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -42,6 +42,18 @@ HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/" +THERMAL_ZONE_ASIC_PATH = "/var/run/hw-management/thermal/mlxsw/" +THERMAL_ZONE_MODULE_PATH = "/var/run/hw-management/thermal/mlxsw-module{}/" +THERMAL_ZONE_GEARBOX_PATH = "/var/run/hw-management/thermal/mlxsw-gearbox{}/" +THERMAL_ZONE_MODE = "thermal_zone_mode" +THERMAL_ZONE_POLICY = "thermal_zone_policy" +THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp" +THERMAL_ZONE_NORMAL_TEMPERATURE = "temp_trip_norm" + +MODULE_COUNTER_PATH = "/var/run/hw-management/config/module_counter" +GEARBOX_COUNTER_PATH = "/var/run/hw-management/config/gearbox_counter" +MODULE_TEMPERATURE_FAULT_PATH = "/var/run/hw-management/thermal/module{}_temp_fault" + thermal_api_handler_cpu_core = { THERMAL_API_GET_TEMPERATURE:"cpu_core{}", THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max", @@ -262,6 +274,7 @@ def initialize_thermals(sku, thermal_list, psu_list): # create thermal objects for all categories of sensors tp_index = hwsku_dict_thermal[sku] thermal_profile = thermal_profile_list[tp_index] + Thermal.thermal_profile = thermal_profile for category in thermal_device_categories_all: if category == THERMAL_DEV_CATEGORY_AMBIENT: count, ambient_list = thermal_profile[category] @@ -290,6 +303,9 @@ def initialize_thermals(sku, thermal_list, psu_list): class Thermal(ThermalBase): + thermal_profile = None + thermal_algorithm_status = False + def __init__(self, category, index, has_index, dependency = None): """ index should be a string for category ambient and int for other categories @@ -321,7 +337,8 @@ def get_name(self): return self.name - def _read_generic_file(self, filename, len): + @classmethod + def _read_generic_file(cls, filename, len): """ Read a generic file, returns the contents of the file """ @@ -420,3 +437,123 @@ def get_high_critical_threshold(self): if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: return None return value_float / 1000.0 + + + @classmethod + def _write_generic_file(cls, filename, content): + """ + Write a generic file if content changed + """ + try: + with open(filename, 'w+') as file_obj: + origin_content = file_obj.read() + if origin_content != content: + file_obj.write(content) + except Exception as e: + logger.log_info("Fail to write file {} due to {}".format(filename, repr(e))) + + @classmethod + def set_thermal_algorithm_status(cls, status, force=True): + """ + Enable/disable kernel thermal algorithm + """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not force and cls.thermal_algorithm_status == status: + return + + cls.thermal_algorithm_status = status + content = "enabled" if status else "disabled" + policy = "step_wise" if status else "user_space" + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_POLICY), policy) + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + + if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + + @classmethod + def check_thermal_zone_temperature(cls): + """ + Check thermal zone current temperature with normal temperature + + Returns: + True if all thermal zones current temperature less or equal than normal temperature + """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_ASIC_PATH): + return False + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_MODULE_PATH.format(start + index)): + return False + + if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_GEARBOX_PATH.format(start + index)): + return False + + return True + + @classmethod + def _check_thermal_zone_temperature(cls, thermal_zone_path): + normal_temp_path = join(thermal_zone_path, THERMAL_ZONE_NORMAL_TEMPERATURE) + current_temp_path = join(thermal_zone_path, THERMAL_ZONE_TEMPERATURE) + normal = None + current = None + try: + with open(normal_temp_path, 'r') as file_obj: + normal = float(file_obj.read()) + + with open(current_temp_path, 'r') as file_obj: + current = float(file_obj.read()) + + return current <= normal + except Exception as e: + logger.log_info("Fail to check thermal zone temperature for file {} due to {}".format(thermal_zone_path, repr(e))) + + @classmethod + def check_module_temperature_trustable(cls): + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + for index in range(count): + fault_file_path = MODULE_TEMPERATURE_FAULT_PATH.format(index + start) + fault = cls._read_generic_file(fault_file_path, 0) + if fault.strip() != '0': + return 'untrust' + return 'trust' + + @classmethod + def get_air_flow_direction(cls): + fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT) + port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) + + # if there is any exception, let it raise + fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path)) + port_ambient_temp = int(cls._read_generic_file(port_ambient_path)) + if fan_ambient_temp > port_ambient_temp: + return 'p2c', fan_ambient_temp + elif fan_ambient_temp < port_ambient_temp: + return 'c2p', port_ambient_temp + else: + return 'unk', fan_ambient_temp diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 72729287d1c5..cb30dd956c02 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -54,6 +54,22 @@ def execute(self, thermal_info_dict): fan.set_speed(self.speed) +@thermal_json_object('fan.all.check_and_set_speed') +class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): + """ + Action to check thermal zone temperature and recover speed for all fans + """ + def execute(self, thermal_info_dict): + """ + Check thermal zone and set speed for all fans + :param thermal_info_dict: A dictionary stores all thermal information. + :return: + """ + from .thermal import Thermal + if Thermal.check_thermal_zone_temperature(): + SetAllFanSpeedAction.execute(self, thermal_info_dict) + + @thermal_json_object('thermal_control.control') class ControlThermalAlgoAction(ThermalPolicyActionBase): """ @@ -95,14 +111,47 @@ def execute(self, thermal_info_dict): :param thermal_info_dict: A dictionary stores all thermal information. :return: """ + from .thermal_infos import FanInfo + from .thermal import Thermal + from .fan import Fan + Thermal.set_thermal_algorithm_status(self.status, False) + if self.status: + # Check thermal zone temperature, if all thermal zone temperature + # back to normal and FAN speed is still 100%, set it to minimum allowed speed to + # save power + if Thermal.check_thermal_zone_temperature(): + fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + for fan in fan_info_obj.get_presence_fans(): + if fan.get_target_speed() != 100: + break + fan.set_speed(Fan.min_cooling_level * 10) + + +class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): + UNKNOWN_SKU_COOLING_LEVEL = 6 + def execute(self, thermal_info_dict): + from .device_data import DEVICE_DATA + from .fan import Fan from .thermal_infos import ChassisInfo - if ChassisInfo.INFO_NAME in thermal_info_dict: - chassis_info_obj = thermal_info_dict[ChassisInfo.INFO_NAME] - chassis = chassis_info_obj.get_chassis() - thermal_manager = chassis.get_thermal_manager() - if self.status: - thermal_manager.start_thermal_control_algorithm() - else: - thermal_manager.stop_thermal_control_algorithm() + from .thermal_conditions import MinCoolingLevelChangeCondition + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + if chassis.sku_name not in DEVICE_DATA or 'thermal' not in DEVICE_DATA[chassis.sku_name] or 'minimum_table' not in DEVICE_DATA[chassis.sku_name]['thermal']: + Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL + else: + air_flow_dir = MinCoolingLevelChangeCondition.air_flow_dir + trust_state = MinCoolingLevelChangeCondition.trust_state + temperature = MinCoolingLevelChangeCondition.temperature + minimum_table = DEVICE_DATA[chassis.sku_name]['thermal']['minimum_table']['{}_{}'.format(air_flow_dir, trust_state)] + for key, cooling_level in minimum_table.items(): + temp_range = key.split(':') + temp_min = int(temp_range[0]) * 1000 + temp_max = int(temp_range[1]) * 1000 + if temp_min <= temperature <= temp_max: + Fan.min_cooling_level = cooling_level - 10 + break + + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level < Fan.min_cooling_level: + Fan.set_cooling_level(Fan.min_cooling_level) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 2df59acc9bf1..8593b0f32881 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -32,6 +32,20 @@ def is_match(self, thermal_info_dict): return len(fan_info_obj.get_absence_fans()) == 0 if fan_info_obj else False +@thermal_json_object('fan.any.fault') +class AnyFanFaultCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) > 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.good') +class AllFanGoodCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) == 0 if fan_info_obj else False + + class PsuCondition(ThermalPolicyConditionBase): def get_psu_info(self, thermal_info_dict): from .thermal_infos import PsuInfo @@ -61,3 +75,29 @@ def is_match(self, thermal_info_dict): psu_info_obj = self.get_psu_info(thermal_info_dict) return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False + +class MinCoolingLevelChangeCondition(ThermalPolicyConditionBase): + trust_state = None + air_flow_dir = None + temperature = None + + def is_match(self, thermal_info_dict): + from .thermal import Thermal + + trust_state = Thermal.check_module_temperature_trustable() + air_flow_dir, temperature = Thermal.get_air_flow_direction() + + change_cooling_level = False + if trust_state != MinCoolingLevelChangeCondition.trust_state: + MinCoolingLevelChangeCondition.trust_state = trust_state + change_cooling_level = True + + if air_flow_dir != MinCoolingLevelChangeCondition.air_flow_dir: + MinCoolingLevelChangeCondition.air_flow_dir = air_flow_dir + change_cooling_level = True + + if temperature != MinCoolingLevelChangeCondition.temperature: + MinCoolingLevelChangeCondition.temperature = temperature + change_cooling_level = True + + return change_cooling_level diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py index 82c186495f5e..e810a5646456 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -14,6 +14,7 @@ class FanInfo(ThermalPolicyInfoBase): def __init__(self): self._absence_fans = set() self._presence_fans = set() + self._fault_fans = set() self._status_changed = False def collect(self, chassis): @@ -24,17 +25,27 @@ def collect(self, chassis): """ self._status_changed = False for fan in chassis.get_all_fans(): - if fan.get_presence() and fan not in self._presence_fans: + presence = fan.get_presence() + status = fan.get_status() + if presence and fan not in self._presence_fans: self._presence_fans.add(fan) self._status_changed = True if fan in self._absence_fans: self._absence_fans.remove(fan) - elif not fan.get_presence() and fan not in self._absence_fans: + elif not presence and fan not in self._absence_fans: self._absence_fans.add(fan) self._status_changed = True if fan in self._presence_fans: self._presence_fans.remove(fan) + if not status and fan not in self._fault_fans: + self._fault_fans.add(fan) + self._status_changed = True + elif status and fan in self._fault_fans: + self._fault_fans.remove(fan) + self._status_changed = True + + def get_absence_fans(self): """ Retrieves absence fans @@ -49,6 +60,13 @@ def get_presence_fans(self): """ return self._presence_fans + def get_fault_fans(self): + """ + Retrieves fault fans + :return: A set of fault fans + """ + return self._fault_fans + def is_status_changed(self): """ Retrieves if the status of fan information changed diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 133bb078ca20..efcfb154717f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -1,12 +1,20 @@ import os from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase +from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy from .thermal_actions import * from .thermal_conditions import * from .thermal_infos import * class ThermalManager(ThermalManagerBase): - THERMAL_ALGORITHM_CONTROL_PATH = '/var/run/hw-management/config/suspend' + @classmethod + def initialize(cls): + """ + Initialize thermal manager, including register thermal condition types and thermal action types + and any other vendor specific initialization. + :return: + """ + cls._add_private_thermal_policy() @classmethod def start_thermal_control_algorithm(cls): @@ -16,7 +24,8 @@ def start_thermal_control_algorithm(cls): Returns: bool: True if set success, False if fail. """ - cls._control_thermal_control_algorithm(False) + from .thermal import Thermal + Thermal.set_thermal_algorithm_status(True) @classmethod def stop_thermal_control_algorithm(cls): @@ -26,25 +35,12 @@ def stop_thermal_control_algorithm(cls): Returns: bool: True if set success, False if fail. """ - cls._control_thermal_control_algorithm(True) + from .thermal import Thermal + Thermal.set_thermal_algorithm_status(False) @classmethod - def _control_thermal_control_algorithm(cls, suspend): - """ - Control thermal control algorithm - - Args: - suspend: Bool, indicate suspend the algorithm or not - - Returns: - bool: True if set success, False if fail. - """ - status = True - write_value = 1 if suspend else 0 - try: - with open(cls.THERMAL_ALGORITHM_CONTROL_PATH, 'w') as control_file: - control_file.write(str(write_value)) - except (ValueError, IOError): - status = False - - return status + def _add_private_thermal_policy(cls): + policy = ThermalPolicy() + policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() + policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() + cls._policy_dict['DynamicMinCoolingLevelPolicy'] = policy diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index f34ace97968d..ff8adb66ac91 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -1,13 +1,20 @@ class MockFan: + speed = 60 def __init__(self): self.presence = True - self.speed = 60 + self.status = True def get_presence(self): return self.presence def set_speed(self, speed): - self.speed = speed + MockFan.speed = speed + + def get_status(self): + return self.status + + def get_target_speed(self): + return MockFan.speed class MockPsu: diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 843244e937fa..a89cd75c6e3b 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -11,6 +11,11 @@ from sonic_platform.thermal_manager import ThermalManager from sonic_platform.thermal_infos import FanInfo, PsuInfo +from sonic_platform.fan import Fan +from sonic_platform.thermal import Thermal + +Thermal.check_thermal_zone_temperature = MagicMock() +Thermal.set_thermal_algorithm_status = MagicMock() @pytest.fixture(scope='session', autouse=True) @@ -27,6 +32,7 @@ def test_load_policy(thermal_manager): assert 'any fan absence' in thermal_manager._policy_dict assert 'any psu absence' in thermal_manager._policy_dict + assert 'any fan broken' in thermal_manager._policy_dict assert 'all fan and psu presence' in thermal_manager._policy_dict assert thermal_manager._fan_speed_when_suspend == 60 @@ -40,6 +46,7 @@ def test_fan_info(): fan_info.collect(chassis) assert len(fan_info.get_absence_fans()) == 1 assert len(fan_info.get_presence_fans()) == 0 + assert len(fan_info.get_fault_fans()) == 0 assert fan_info.is_status_changed() fan_list = chassis.get_all_fans() @@ -47,8 +54,15 @@ def test_fan_info(): fan_info.collect(chassis) assert len(fan_info.get_absence_fans()) == 0 assert len(fan_info.get_presence_fans()) == 1 + assert len(fan_info.get_fault_fans()) == 0 assert fan_info.is_status_changed() + fan_list[0].status = False + fan_info.collect(chassis) + assert len(fan_info.get_absence_fans()) == 0 + assert len(fan_info.get_presence_fans()) == 1 + assert len(fan_info.get_fault_fans()) == 1 + assert fan_info.is_status_changed() def test_psu_info(): chassis = MockChassis() @@ -77,35 +91,47 @@ def test_fan_policy(thermal_manager): chassis = MockChassis() chassis.make_fan_absence() chassis.fan_list.append(MockFan()) - thermal_manager.start_thermal_control_algorithm = MagicMock() - thermal_manager.stop_thermal_control_algorithm = MagicMock() thermal_manager.run_policy(chassis) fan_list = chassis.get_all_fans() assert fan_list[1].speed == 100 - thermal_manager.stop_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) fan_list[0].presence = True + Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) + thermal_manager.run_policy(chassis) + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) + assert Thermal.check_thermal_zone_temperature.call_count == 2 + assert fan_list[0].speed == 60 + assert fan_list[1].speed == 60 + + fan_list[0].status = False + thermal_manager.run_policy(chassis) + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) + + fan_list[0].status = True + Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) thermal_manager.run_policy(chassis) - thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) + assert Thermal.check_thermal_zone_temperature.call_count == 2 + assert fan_list[0].speed == 100 + assert fan_list[1].speed == 100 def test_psu_policy(thermal_manager): chassis = MockChassis() chassis.make_psu_absence() chassis.fan_list.append(MockFan()) - thermal_manager.start_thermal_control_algorithm = MagicMock() - thermal_manager.stop_thermal_control_algorithm = MagicMock() thermal_manager.run_policy(chassis) fan_list = chassis.get_all_fans() assert fan_list[0].speed == 100 - thermal_manager.stop_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) psu_list = chassis.get_all_psus() psu_list[0].presence = True thermal_manager.run_policy(chassis) - thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) def test_any_fan_absence_condition(): @@ -159,6 +185,44 @@ def test_all_fan_presence_condition(): fan_info.collect(chassis) assert condition.is_match({'fan_info': fan_info}) +def test_any_fan_fault_condition(): + chassis = MockChassis() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fault_fan = MockFan() + fault_fan.status = False + fan_list.append(fault_fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AnyFanFaultCondition + condition = AnyFanFaultCondition() + assert condition.is_match({'fan_info': fan_info}) + + fault_fan.status = True + fan_info.collect(chassis) + assert not condition.is_match({'fan_info': fan_info}) + +def test_all_fan_good_condition(): + chassis = MockChassis() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fault_fan = MockFan() + fault_fan.status = False + fan_list.append(fault_fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AllFanGoodCondition + condition = AllFanGoodCondition() + assert not condition.is_match({'fan_info': fan_info}) + + fault_fan.status = True + fan_info.collect(chassis) + assert condition.is_match({'fan_info': fan_info}) + def test_any_psu_absence_condition(): chassis = MockChassis() @@ -275,6 +339,53 @@ def test_load_control_thermal_algo_action(): with pytest.raises(ValueError): action.load_from_json(json_obj) +def test_load_check_and_set_speed_action(): + from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction + action = CheckAndSetAllFanSpeedAction() + json_str = '{\"speed\": \"40\"}' + json_obj = json.loads(json_str) + action.load_from_json(json_obj) + assert action.speed == 40 + + json_str = '{\"speed\": \"-1\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"speed\": \"101\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"invalid\": \"60\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + +def test_execute_check_and_set_fan_speed_action(): + chassis = MockChassis() + fan_list = chassis.get_all_fans() + fan_list.append(MockFan()) + fan_list.append(MockFan()) + fan_info = FanInfo() + fan_info.collect(chassis) + Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) + + from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction + action = CheckAndSetAllFanSpeedAction() + action.speed = 99 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 99 + assert fan_list[1].speed == 99 + + Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) + fan_list[0].speed = 100 + fan_list[1].speed = 100 + action.speed = 60 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 100 + assert fan_list[1].speed == 100 + def test_load_duplicate_condition(): from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy with open(os.path.join(test_path, 'duplicate_condition.json')) as f: @@ -315,4 +426,94 @@ class MockThermalManager(ThermalManagerBase): with pytest.raises(Exception): MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json')) +def test_dynamic_minimum_table_data(): + from sonic_platform.device_data import DEVICE_DATA + for sku, sku_data in DEVICE_DATA.items(): + if 'thermal' in sku_data and 'minimum_table' in sku_data['thermal']: + minimum_table = sku_data['thermal']['minimum_table'] + check_minimum_table_data(sku, minimum_table) + +def check_minimum_table_data(sku, minimum_table): + valid_dir = ['p2c', 'c2p', 'unk'] + valid_trust_state = ['trust', 'untrust'] + + for category, data in minimum_table.items(): + key_data = category.split('_') + assert key_data[0] in valid_dir + assert key_data[1] in valid_trust_state + + data_list = [(value, key) for key, value in data.items()] + data_list.sort(key=lambda x : x[0]) + + previous_edge = None + previous_cooling_level = None + for item in data_list: + cooling_level = item[0] + range_str = item[1] + + ranges = range_str.split(':') + low = int(ranges[0]) + high = int(ranges[1]) + assert low < high + + if previous_edge is None: + assert low == -127 + else: + assert low - previous_edge == 1, '{}-{}-{} error, item={}'.format(sku, key_data[0], key_data[1], item) + previous_edge = high + + assert 10 <= cooling_level <= 20 + if previous_cooling_level is not None: + assert cooling_level > previous_cooling_level + previous_cooling_level = cooling_level + +def test_dynamic_minimum_policy(thermal_manager): + from sonic_platform.thermal_conditions import MinCoolingLevelChangeCondition + from sonic_platform.thermal_actions import ChangeMinCoolingLevelAction + from sonic_platform.thermal_infos import ChassisInfo + from sonic_platform.thermal import Thermal + from sonic_platform.fan import Fan + ThermalManager.initialize() + assert 'DynamicMinCoolingLevelPolicy' in thermal_manager._policy_dict + policy = thermal_manager._policy_dict['DynamicMinCoolingLevelPolicy'] + assert MinCoolingLevelChangeCondition in policy.conditions + assert ChangeMinCoolingLevelAction in policy.actions + + condition = policy.conditions[MinCoolingLevelChangeCondition] + action = policy.actions[ChangeMinCoolingLevelAction] + Thermal.check_module_temperature_trustable = MagicMock(return_value='trust') + Thermal.get_air_flow_direction = MagicMock(return_value=('p2c', 35000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'trust' + assert MinCoolingLevelChangeCondition.air_flow_dir == 'p2c' + assert MinCoolingLevelChangeCondition.temperature == 35000 + assert not condition.is_match(None) + + Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust') + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'untrust' + + Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 35000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.air_flow_dir == 'c2p' + + Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 25000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.temperature == 25000 + chassis = MockChassis() + chassis.sku_name = 'invalid' + info = ChassisInfo() + info._chassis = chassis + thermal_info_dict = {ChassisInfo.INFO_NAME: info} + Fan.get_cooling_level = MagicMock(return_value=5) + Fan.set_cooling_level = MagicMock() + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 6 + Fan.set_cooling_level.assert_called_with(6) + Fan.set_cooling_level.call_count = 0 + + chassis.sku_name = 'ACS-MSN2700' + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 4 + assert Fan.set_cooling_level.call_count == 0 diff --git a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json index 5d31b2abd875..413211b21220 100644 --- a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json +++ b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json @@ -51,6 +51,24 @@ } ] }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, { "name": "all fan and psu presence", "conditions": [ @@ -59,12 +77,19 @@ }, { "type": "psu.all.presence" + }, + { + "type": "fan.all.good" } ], "actions": [ { "type": "thermal_control.control", "status": "true" + }, + { + "type": "fan.all.check_and_set_speed", + "speed": "60" } ] }