Skip to content

Commit

Permalink
[Mellanox] implement state machine for always firmware control ports (s…
Browse files Browse the repository at this point in the history
…onic-net#19473)

- Why I did it
When module host management is enabled, all modules are managed by the same state machine, which initializes all modules/ports. However, on 5600 and 5400 there is a type of port that we call a "service port". These ports are always under firmware control, so they should be managed by a different state machine. This PR implements the state machine for always firmware control ports.

- How I did it
Implement a state machine for always firmware control ports.

- How to verify it
Manual test
unit test
  • Loading branch information
Junchao-Mellanox authored Jul 8, 2024
1 parent b151923 commit b1cb4e7
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 11 deletions.
19 changes: 19 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@
"comex_amb": False,
"pch_temp": True
}
},
'sfp': {
'fw_control_ports': [64, 65] # 0 based sfp index list
}
},
'x86_64-nvidia_sn5600-r0': {
Expand All @@ -134,6 +137,9 @@
"comex_amb": False,
"pch_temp": True
}
},
'sfp': {
'fw_control_ports': [64] # 0 based sfp index list
}
},
'x86_64-nvidia_sn4280_simx-r0': {
Expand Down Expand Up @@ -307,3 +313,16 @@ def get_watchdog_max_period(cls):
return DEFAULT_WD_PERIOD

return watchdog_data.get('max_period', None)

@classmethod
@utils.read_only_cache()
def get_always_fw_control_ports(cls):
    """Return the 'fw_control_ports' entry configured for the current platform.

    The value is looked up in DEVICE_DATA under the platform's 'sfp' section
    (annotated in the table as a 0 based sfp index list).

    Returns:
        The configured 'fw_control_ports' value, or None when the platform
        has no entry in DEVICE_DATA, has no 'sfp' section, or the section
        does not define 'fw_control_ports'.
    """
    platform_entry = DEVICE_DATA.get(cls.get_platform_name())
    # A missing or empty level short-circuits the lookup to None.
    sfp_section = platform_entry.get('sfp') if platform_entry else None
    return sfp_section.get('fw_control_ports') if sfp_section else None
53 changes: 42 additions & 11 deletions platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,14 @@
ACTION_ON_FW_CONTROL = 'On Firmware Control'
ACTION_ON_POWER_LIMIT_ERROR = 'On Power Limit Error'
ACTION_ON_CANCEL_WAIT = 'On Cancel Wait'

# States/actions for always firmware control ports
STATE_FCP_DOWN = 'Down(Firmware Control)'
STATE_FCP_INIT = 'Initializing(Firmware Control)'
STATE_FCP_NOT_PRESENT = 'Not Present(Firmware Control)'
STATE_FCP_PRESENT = 'Present(Firmware Control)'

ACTION_FCP_ON_START = 'On Start(Firmware Control)'
# Module host management definitions end

# SFP EEPROM limited bytes
Expand Down Expand Up @@ -463,7 +471,11 @@ def __init__(self, sfp_index, sfp_type=None, slot_id=0, linecard_port_count=0, l
self.slot_id = slot_id
self._sfp_type_str = None
# SFP state, only applicable for module host management
self.state = STATE_DOWN
fw_control_ports = DeviceDataManager.get_always_fw_control_ports()
if not fw_control_ports or self.sdk_index not in fw_control_ports:
self.state = STATE_DOWN
else:
self.state = STATE_FCP_DOWN

def __str__(self):
return f'SFP {self.sdk_index}'
Expand Down Expand Up @@ -1430,7 +1442,7 @@ def get_state_machine(cls):
sm.add_state(STATE_POWER_LIMIT_ERROR).set_entry_action(ACTION_ON_POWER_LIMIT_ERROR) \
.add_transition(EVENT_POWER_GOOD, STATE_POWERED_ON) \
.add_transition(EVENT_NOT_PRESENT, STATE_NOT_PRESENT)

cls.action_table = {}
cls.action_table[ACTION_ON_START] = cls.action_on_start
cls.action_table[ACTION_ON_RESET] = cls.action_on_reset
Expand All @@ -1440,6 +1452,16 @@ def get_state_machine(cls):
cls.action_table[ACTION_ON_CANCEL_WAIT] = cls.action_on_cancel_wait
cls.action_table[ACTION_ON_POWER_LIMIT_ERROR] = cls.action_on_power_limit_error

# For always firmware control ports
sm.add_state(STATE_FCP_DOWN).add_transition(EVENT_START, STATE_FCP_INIT)
sm.add_state(STATE_FCP_INIT).set_entry_action(ACTION_FCP_ON_START) \
.add_transition(EVENT_NOT_PRESENT, STATE_FCP_NOT_PRESENT) \
.add_transition(EVENT_PRESENT, STATE_FCP_PRESENT)
sm.add_state(STATE_FCP_NOT_PRESENT).add_transition(EVENT_PRESENT, STATE_FCP_PRESENT)
sm.add_state(STATE_FCP_PRESENT).add_transition(EVENT_NOT_PRESENT, STATE_FCP_NOT_PRESENT)

cls.action_table[ACTION_FCP_ON_START] = cls.action_fcp_on_start

cls.sm = sm

return cls.sm
Expand Down Expand Up @@ -1468,6 +1490,14 @@ def action_on_start(cls, sfp):
sfp.on_event(EVENT_RESET)
else:
sfp.on_event(EVENT_POWER_ON)

@classmethod
def action_fcp_on_start(cls, sfp):
    """Entry action for the always firmware control 'init' state.

    Reads the module's 'present' sysfs attribute and raises EVENT_PRESENT
    on the SFP when the value is truthy, otherwise EVENT_NOT_PRESENT.

    Args:
        sfp: SFP object whose sdk_index selects the sysfs module directory
            and which receives the resulting event.
    """
    present_path = f'/sys/module/sx_core/asic0/module{sfp.sdk_index}/present'
    event = EVENT_PRESENT if utils.read_int_from_file(present_path) else EVENT_NOT_PRESENT
    sfp.on_event(event)

@classmethod
def action_on_reset(cls, sfp):
Expand Down Expand Up @@ -1564,10 +1594,12 @@ def in_stable_state(self):
Returns:
bool: True if the module is in a stable state
"""
return self.state in (STATE_NOT_PRESENT, STATE_SW_CONTROL, STATE_FW_CONTROL, STATE_POWER_BAD, STATE_POWER_LIMIT_ERROR)
return self.state in (STATE_NOT_PRESENT, STATE_SW_CONTROL, STATE_FW_CONTROL,
STATE_POWER_BAD, STATE_POWER_LIMIT_ERROR, STATE_FCP_NOT_PRESENT,
STATE_FCP_PRESENT)

def get_fds_for_poling(self):
if self.state == STATE_FW_CONTROL:
if self.state == STATE_FW_CONTROL or self.state == STATE_FCP_NOT_PRESENT or self.state == STATE_FCP_PRESENT:
return {
'present': self.get_fd('present')
}
Expand All @@ -1583,11 +1615,9 @@ def fill_change_event(self, port_dict):
Args:
port_dict (dict): {<sfp_index>:<sfp_state>}
"""
if self.state == STATE_NOT_PRESENT:
if self.state == STATE_NOT_PRESENT or self.state == STATE_FCP_NOT_PRESENT:
port_dict[self.sdk_index + 1] = SFP_STATUS_REMOVED
elif self.state == STATE_SW_CONTROL:
port_dict[self.sdk_index + 1] = SFP_STATUS_INSERTED
elif self.state == STATE_FW_CONTROL:
elif self.state == STATE_SW_CONTROL or self.state == STATE_FW_CONTROL or self.state == STATE_FCP_PRESENT:
port_dict[self.sdk_index + 1] = SFP_STATUS_INSERTED
elif self.state == STATE_POWER_BAD or self.state == STATE_POWER_LIMIT_ERROR:
sfp_state = SFP.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED | SFP.SFP_STATUS_BIT_INSERTED
Expand All @@ -1606,7 +1636,7 @@ def refresh_poll_obj(self, poll_obj, all_registered_fds):
# find fds registered by this SFP
current_registered_fds = {item[2]: (fileno, item[1]) for fileno, item in all_registered_fds.items() if item[0] == self.sdk_index}
logger.log_debug(f'SFP {self.sdk_index} registered fds are: {current_registered_fds}')
if self.state == STATE_FW_CONTROL:
if self.state == STATE_FW_CONTROL or self.state == STATE_FCP_NOT_PRESENT or self.state == STATE_FCP_PRESENT:
target_poll_types = ['present']
else:
target_poll_types = ['hw_present', 'power_good']
Expand Down Expand Up @@ -1642,9 +1672,10 @@ def is_dummy_event(self, fd_type, fd_value):
"""
if fd_type == 'hw_present' or fd_type == 'present':
if fd_value == int(SFP_STATUS_INSERTED):
return self.state in (STATE_SW_CONTROL, STATE_FW_CONTROL, STATE_POWER_BAD, STATE_POWER_LIMIT_ERROR)
return self.state in (STATE_SW_CONTROL, STATE_FW_CONTROL, STATE_POWER_BAD,
STATE_POWER_LIMIT_ERROR, STATE_FCP_PRESENT)
elif fd_value == int(SFP_STATUS_REMOVED):
return self.state == STATE_NOT_PRESENT
return self.state in (STATE_NOT_PRESENT, STATE_FCP_NOT_PRESENT)
elif fd_type == 'power_good':
if fd_value == 1:
return self.state in (STATE_SW_CONTROL, STATE_NOT_PRESENT, STATE_RESETTING)
Expand Down
13 changes: 13 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_sfp_sm.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,16 @@ def test_sw_control(self):
s.disable_tx_for_sff_optics = mock.MagicMock()
s.on_event(sfp.EVENT_START)
assert s.get_state() == sfp.STATE_SW_CONTROL

@mock.patch('sonic_platform.device_data.DeviceDataManager.get_always_fw_control_ports', mock.MagicMock(return_value=[0]))
def test_fcp_state(self):
    """Check the state reached after EVENT_START on an always firmware control port.

    SFP index 0 is patched in as an always firmware control port. With the
    mocked 'present' value set to 1 the SFP must end in STATE_FCP_PRESENT;
    with 0 it must end in STATE_FCP_NOT_PRESENT.
    """
    expectations = (
        (1, sfp.STATE_FCP_PRESENT),
        (0, sfp.STATE_FCP_NOT_PRESENT),
    )
    for present_value, expected_state in expectations:
        self.mock_value('present', present_value)
        # A fresh SFP per case so each run starts from the initial state.
        module = sfp.SFP(0)
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == expected_state

0 comments on commit b1cb4e7

Please sign in to comment.