Skip to content

Commit

Permalink
[thermalctld] add FAN led management in thermal control daemon (#54)
Browse files Browse the repository at this point in the history
  • Loading branch information
Junchao-Mellanox authored May 13, 2020
1 parent f1409e0 commit 6e975f5
Show file tree
Hide file tree
Showing 3 changed files with 190 additions and 20 deletions.
119 changes: 101 additions & 18 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -64,27 +64,61 @@ def log_on_status_changed(normal_status, normal_log, abnormal_log):


class FanStatus(object):
def __init__(self):
absence_fan_count = 0
fault_fan_count = 0
update_led_color = True

def __init__(self, fan=None, is_psu_fan=False):
"""
Constructor of FanStatus
"""
self.fan = fan
self.is_psu_fan = is_psu_fan
self.presence = True
self.status = True
self.under_speed = False
self.over_speed = False
self.invalid_direction = False

@classmethod
def get_bad_fan_count(cls):
return cls.absence_fan_count + cls.fault_fan_count

@classmethod
def reset_fan_counter(cls):
cls.absence_fan_count = 0
cls.fault_fan_count = 0

def set_presence(self, presence):
"""
Set and cache Fan presence status
:param presence: Fan presence status
:return: True if status changed else False
"""
if not presence and not self.is_psu_fan:
FanStatus.absence_fan_count += 1

if presence == self.presence:
return False

self.presence = presence
return True

def set_fault_status(self, status):
"""
Set and cache Fan fault status
:param status: Fan fault status, False indicate Fault
:return: True if status changed else False
"""
if not status:
FanStatus.fault_fan_count += 1

if status == self.status:
return False

self.status = status
return True

def _check_speed_value_available(self, speed, target_speed, tolerance, current_status):
if speed == NOT_AVAILABLE or target_speed == NOT_AVAILABLE or tolerance == NOT_AVAILABLE:
if tolerance > 100 or tolerance < 0:
Expand Down Expand Up @@ -142,7 +176,11 @@ class FanStatus(object):
Indicate the Fan works as expect
:return: True if Fan works normal else False
"""
return self.presence and not self.under_speed and not self.over_speed and not self.invalid_direction
return self.presence and \
self.status and \
not self.under_speed and \
not self.over_speed and \
not self.invalid_direction


#
Expand Down Expand Up @@ -176,33 +214,51 @@ class FanUpdater(object):
:return:
"""
logger.log_debug("Start fan updating")
for index, fan in enumerate(self.chassis.get_all_fans()):
try:
self._refresh_fan_status(fan, index)
except Exception as e:
logger.log_warning('Failed to update FAN status - {}'.format(e))
old_bad_fan_count = FanStatus.get_bad_fan_count()
FanStatus.reset_fan_counter()

fan_index = 0
for drawer in self.chassis.get_all_fan_drawers():
for fan in drawer.get_all_fans():
try:
self._refresh_fan_status(drawer, fan, fan_index)
except Exception as e:
logger.log_warning('Failed to update FAN status - {}'.format(e))
fan_index += 1

for psu_index, psu in enumerate(self.chassis.get_all_psus()):
psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index))
for fan_index, fan in enumerate(psu.get_all_fans()):
try:
self._refresh_fan_status(fan, fan_index, '{} FAN'.format(psu_name))
self._refresh_fan_status(None, fan, fan_index, '{} FAN'.format(psu_name), True)
except Exception as e:
logger.log_warning('Failed to update PSU FAN status - {}'.format(e))

self._update_led_color()

bad_fan_count = FanStatus.get_bad_fan_count()
if bad_fan_count > 0 and old_bad_fan_count != bad_fan_count:
logger.log_warning("Insufficient number of working fans warning: {} fans are not working.".format(
bad_fan_count
))
elif old_bad_fan_count > 0 and bad_fan_count == 0:
logger.log_notice("Insufficient number of working fans warning cleared: all fans are back to normal.")

logger.log_debug("End fan updating")

def _refresh_fan_status(self, fan, index, name_prefix='FAN'):
def _refresh_fan_status(self, fan_drawer, fan, index, name_prefix='FAN', is_psu_fan=False):
"""
Get Fan status by platform API and write to database for a given Fan
:param fan_drawer: Object representing a platform Fan drawer
:param fan: Object representing a platform Fan
:param index: Index of the Fan object in the platform
:param name_prefix: name prefix of Fan object if Fan.get_name not presented
:return:
"""
drawer_name = NOT_AVAILABLE if is_psu_fan else str(try_get(fan_drawer.get_name))
fan_name = try_get(fan.get_name, '{} {}'.format(name_prefix, index + 1))
if fan_name not in self.fan_status_dict:
self.fan_status_dict[fan_name] = FanStatus()
self.fan_status_dict[fan_name] = FanStatus(fan, is_psu_fan)

fan_status = self.fan_status_dict[fan_name]

Expand All @@ -228,45 +284,55 @@ class FanUpdater(object):
'the system, potential overheat hazard'.format(fan_name)
)

if presence and fan_status.set_fault_status(fan_fault_status):
set_led = True
log_on_status_changed(fan_status.status,
'Fan fault warning cleared: {} is back to normal.'.format(fan_name),
'Fan fault warning: {} is broken.'.format(fan_name)
)

if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance):
set_led = True
log_on_status_changed(not fan_status.under_speed,
'Fan under speed warning cleared: {} speed back to normal.'.format(fan_name),
'Fan under speed warning: {} current speed={}, target speed={}, tolerance={}.'.
'Fan low speed warning cleared: {} speed is back to normal.'.format(fan_name),
'Fan low speed warning: {} current speed={}, target speed={}, tolerance={}.'.
format(fan_name, speed, speed_target, speed_tolerance)
)

if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance):
set_led = True
log_on_status_changed(not fan_status.over_speed,
'Fan over speed warning cleared: {} speed back to normal.'.format(fan_name),
'Fan over speed warning: {} target speed={}, current speed={}, tolerance={}.'.
'Fan high speed warning cleared: {} speed is back to normal.'.format(fan_name),
'Fan high speed warning: {} target speed={}, current speed={}, tolerance={}.'.
format(fan_name, speed_target, speed, speed_tolerance)
)

# TODO: handle invalid fan direction

if set_led:
self._set_fan_led(fan, fan_name, fan_status)
# We don't set PSU led here, PSU led will be handled in psud
if set_led and not is_psu_fan:
self._set_fan_led(fan_drawer, fan, fan_name, fan_status)
FanStatus.update_led_color = True

fvs = swsscommon.FieldValuePairs(
[('presence', str(presence)),
('drawer_name', drawer_name),
('model', str(try_get(fan.get_model))),
('serial', str(try_get(fan.get_serial))),
('status', str(fan_fault_status)),
('direction', str(fan_direction)),
('speed', str(speed)),
('speed_tolerance', str(speed_tolerance)),
('speed_target', str(speed_target)),
('led_status', str(try_get(fan.get_status_led))),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])

self.table.set(fan_name, fvs)

def _set_fan_led(self, fan, fan_name, fan_status):
def _set_fan_led(self, fan_drawer, fan, fan_name, fan_status):
"""
Set fan led according to current status
:param fan_drawer: Object representing a platform Fan drawer or PSU
:param fan: Object representing a platform Fan
:param fan_name: Name of the Fan object in case any vendor not implement Fan.get_name
:param fan_status: Object representing the FanStatus
Expand All @@ -275,13 +341,30 @@ class FanUpdater(object):
try:
if fan_status.is_ok():
fan.set_status_led(fan.STATUS_LED_COLOR_GREEN)
fan_drawer.set_status_led(fan.STATUS_LED_COLOR_GREEN)
else:
# TODO: wait for Kebo to define the mapping of fan status to led color,
# just set it to red so far
fan.set_status_led(fan.STATUS_LED_COLOR_RED)
fan_drawer.set_status_led(fan.STATUS_LED_COLOR_RED)
except NotImplementedError as e:
logger.log_warning('Failed to set led to fan, set_status_led not implemented')

def _update_led_color(self):
    """
    Write the current 'led_status' of every cached fan to the database table.
    Does nothing unless FanStatus.update_led_color is set; the flag is
    cleared once all fans have been written.
    :return:
    """
    if not FanStatus.update_led_color:
        return

    for fan_name, fan_status in self.fan_status_dict.items():
        try:
            led_status = str(try_get(fan_status.fan.get_status_led))
        except Exception as e:
            # e.g. the cached fan object is None; name the fan and the cause
            # so the warning can be acted upon
            logger.log_warning('Failed to get led status for fan {} - {}'.format(fan_name, repr(e)))
            led_status = NOT_AVAILABLE

        fvs = swsscommon.FieldValuePairs([
            ('led_status', led_status)
        ])
        self.table.set(fan_name, fvs)

    FanStatus.update_led_color = False


class TemperatureStatus(object):
TEMPERATURE_DIFF_THRESHOLD = 10
Expand Down
43 changes: 43 additions & 0 deletions sonic-thermalctld/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,25 @@ def get_all_fans(self):
return self.fan_list


class MockFanDrawer(MockDevice):
    """
    Test double of a fan drawer: keeps a name, a list of fans and a LED state
    """

    def __init__(self):
        # LED starts out as 'red' until set_status_led is called
        self.led_status = 'red'
        self.fan_list = []
        self.name = 'FanDrawer'

    def get_name(self):
        """
        :return: Name of the drawer
        """
        return self.name

    def get_all_fans(self):
        """
        :return: List of fans held by the drawer
        """
        return self.fan_list

    def get_status_led(self):
        """
        :return: Last LED value stored on the drawer
        """
        return self.led_status

    def set_status_led(self, value):
        """
        Store the LED value on the drawer
        :param value: New LED value
        :return:
        """
        self.led_status = value


class MockThermal:
def __init__(self):
self.name = None
Expand Down Expand Up @@ -134,6 +153,7 @@ def __init__(self):
self.fan_list = []
self.psu_list = []
self.thermal_list = []
self.fan_drawer_list = []

def get_all_fans(self):
    """
    :return: The list object holding the fans registered with this mock
    """
    return self.fan_list
Expand All @@ -144,24 +164,47 @@ def get_all_psus(self):
def get_all_thermals(self):
    """
    :return: The list object holding the thermals registered with this mock
    """
    return self.thermal_list

def get_all_fan_drawers(self):
    """
    :return: The list object holding the fan drawers registered with this mock
    """
    return self.fan_drawer_list

def _add_fan_with_drawer(self, fan):
    """
    Put a fan into a new MockFanDrawer and register both with the chassis
    :param fan: Mock fan object to register
    :return:
    """
    fan_drawer = MockFanDrawer()
    fan_drawer.fan_list.append(fan)
    self.fan_list.append(fan)
    self.fan_drawer_list.append(fan_drawer)

def make_absence_fan(self):
    """
    Add a fan whose presence is False, in its own drawer
    """
    fan = MockFan()
    fan.presence = False
    self._add_fan_with_drawer(fan)

def make_fault_fan(self):
    """
    Add a fan whose status is False, in its own drawer
    """
    fan = MockFan()
    fan.status = False
    self._add_fan_with_drawer(fan)

def make_under_speed_fan(self):
    """
    Add a fan on which make_under_speed() was called, in its own drawer
    """
    fan = MockFan()
    fan.make_under_speed()
    self._add_fan_with_drawer(fan)

def make_over_speed_fan(self):
    """
    Add a fan on which make_over_speed() was called, in its own drawer
    """
    fan = MockFan()
    fan.make_over_speed()
    self._add_fan_with_drawer(fan)

def make_error_fan(self):
    """
    Add a MockErrorFan, in its own drawer
    """
    self._add_fan_with_drawer(MockErrorFan())

def make_over_temper_thermal(self):
thermal = MockThermal()
Expand Down
48 changes: 46 additions & 2 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,27 @@ def test_fanupdater_fan_absence():
fan_updater.update()
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
logger.log_warning.assert_called_once()
logger.log_warning.assert_called()

fan_list[0].presence = True
fan_updater.update()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_GREEN
logger.log_notice.assert_called_once()
logger.log_notice.assert_called()


def test_fanupdater_fan_fault():
    """
    A fan reporting a fault gets a red LED and a warning; once its status is
    restored the LED turns green and a notice is logged
    """
    mock_chassis = MockChassis()
    mock_chassis.make_fault_fan()
    updater = FanUpdater(mock_chassis)

    # First pass: the only fan reports a fault
    updater.update()
    faulty_fan = mock_chassis.get_all_fans()[0]
    assert faulty_fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED
    logger.log_warning.assert_called()

    # Second pass: the fan status is back to True
    faulty_fan.status = True
    updater.update()
    assert faulty_fan.get_status_led() == MockFan.STATUS_LED_COLOR_GREEN
    logger.log_notice.assert_called()


def test_fanupdater_fan_under_speed():
Expand Down Expand Up @@ -133,6 +148,35 @@ def test_fanupdater_fan_over_speed():
logger.log_notice.assert_called_once()


def test_insufficient_fan_number():
    """
    Check the class-level bad fan counters and the 'insufficient number of
    working fans' messages logged by FanUpdater.update()
    """
    # Counters: one absent fan plus one faulty fan
    absent_status = FanStatus()
    faulty_status = FanStatus()
    absent_status.set_presence(False)
    faulty_status.set_fault_status(False)
    assert FanStatus.get_bad_fan_count() == 2
    FanStatus.reset_fan_counter()
    assert FanStatus.get_bad_fan_count() == 0

    # Updater: both fans are bad on the first pass
    mock_chassis = MockChassis()
    mock_chassis.make_absence_fan()
    mock_chassis.make_fault_fan()
    updater = FanUpdater(mock_chassis)
    updater.update()
    assert logger.log_warning.call_count == 3
    logger.log_warning.assert_called_with('Insufficient number of working fans warning: 2 fans are not working.')

    # The absent fan is plugged back, one bad fan remains
    fans = mock_chassis.get_all_fans()
    fans[0].presence = True
    updater.update()
    assert logger.log_notice.call_count == 1
    logger.log_warning.assert_called_with('Insufficient number of working fans warning: 1 fans are not working.')

    # The faulty fan recovers, the warning is cleared
    fans[1].status = True
    updater.update()
    assert logger.log_notice.call_count == 3
    logger.log_notice.assert_called_with('Insufficient number of working fans warning cleared: all fans are back to normal.')


def test_temperature_status_set_over_temper():
temperatue_status = TemperatureStatus()
ret = temperatue_status.set_over_temperature(NOT_AVAILABLE, NOT_AVAILABLE)
Expand Down

0 comments on commit 6e975f5

Please sign in to comment.