From b1cb4e70170bff8cd593e360f9abebe8cd0bfcc4 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Mon, 8 Jul 2024 18:35:44 +0800 Subject: [PATCH] [Mellanox] implement state machine for always firmware control ports (#19473) - Why I did it When module host management is enabled, all modules are managed by the same state machine which initializes all modules/ports. However, on 5600 and 5400, there is a type of port which we called "service port". Those ports are always under firmware control. In that case, those ports should be managed by a different state machine. This PR is to implement the state machine for always firmware control ports. - How I did it Implement a state machine for always firmware control ports. - How to verify it Manual test unit test --- .../sonic_platform/device_data.py | 19 +++++++ .../mlnx-platform-api/sonic_platform/sfp.py | 53 +++++++++++++++---- .../mlnx-platform-api/tests/test_sfp_sm.py | 13 +++++ 3 files changed, 74 insertions(+), 11 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py index 35c40476b93a..1f2b3164a64d 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -126,6 +126,9 @@ "comex_amb": False, "pch_temp": True } + }, + 'sfp': { + 'fw_control_ports': [64, 65] # 0 based sfp index list } }, 'x86_64-nvidia_sn5600-r0': { @@ -134,6 +137,9 @@ "comex_amb": False, "pch_temp": True } + }, + 'sfp': { + 'fw_control_ports': [64] # 0 based sfp index list } }, 'x86_64-nvidia_sn4280_simx-r0': { @@ -307,3 +313,16 @@ def get_watchdog_max_period(cls): return DEFAULT_WD_PERIOD return watchdog_data.get('max_period', None) + + @classmethod + @utils.read_only_cache() + def get_always_fw_control_ports(cls): + platform_data = DEVICE_DATA.get(cls.get_platform_name()) + if not platform_data: + 
return None + + sfp_data = platform_data.get('sfp') + if not sfp_data: + return None + + return sfp_data.get('fw_control_ports') diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py index a457b7458072..8ced4a08dad7 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py @@ -253,6 +253,14 @@ ACTION_ON_FW_CONTROL = 'On Firmware Control' ACTION_ON_POWER_LIMIT_ERROR = 'On Power Limit Error' ACTION_ON_CANCEL_WAIT = 'On Cancel Wait' + +# States/actions for always firmware control ports +STATE_FCP_DOWN = 'Down(Firmware Control)' +STATE_FCP_INIT = 'Initializing(Firmware Control)' +STATE_FCP_NOT_PRESENT = 'Not Present(Firmware Control)' +STATE_FCP_PRESENT = 'Present(Firmware Control)' + +ACTION_FCP_ON_START = 'On Start(Firmware Control)' # Module host management definitions end # SFP EEPROM limited bytes @@ -463,7 +471,11 @@ def __init__(self, sfp_index, sfp_type=None, slot_id=0, linecard_port_count=0, l self.slot_id = slot_id self._sfp_type_str = None # SFP state, only applicable for module host management - self.state = STATE_DOWN + fw_control_ports = DeviceDataManager.get_always_fw_control_ports() + if not fw_control_ports or self.sdk_index not in fw_control_ports: + self.state = STATE_DOWN + else: + self.state = STATE_FCP_DOWN def __str__(self): return f'SFP {self.sdk_index}' @@ -1430,7 +1442,7 @@ def get_state_machine(cls): sm.add_state(STATE_POWER_LIMIT_ERROR).set_entry_action(ACTION_ON_POWER_LIMIT_ERROR) \ .add_transition(EVENT_POWER_GOOD, STATE_POWERED_ON) \ .add_transition(EVENT_NOT_PRESENT, STATE_NOT_PRESENT) - + cls.action_table = {} cls.action_table[ACTION_ON_START] = cls.action_on_start cls.action_table[ACTION_ON_RESET] = cls.action_on_reset @@ -1440,6 +1452,16 @@ def get_state_machine(cls): cls.action_table[ACTION_ON_CANCEL_WAIT] = cls.action_on_cancel_wait cls.action_table[ACTION_ON_POWER_LIMIT_ERROR] = 
cls.action_on_power_limit_error + # For always firmware control ports + sm.add_state(STATE_FCP_DOWN).add_transition(EVENT_START, STATE_FCP_INIT) + sm.add_state(STATE_FCP_INIT).set_entry_action(ACTION_FCP_ON_START) \ + .add_transition(EVENT_NOT_PRESENT, STATE_FCP_NOT_PRESENT) \ + .add_transition(EVENT_PRESENT, STATE_FCP_PRESENT) + sm.add_state(STATE_FCP_NOT_PRESENT).add_transition(EVENT_PRESENT, STATE_FCP_PRESENT) + sm.add_state(STATE_FCP_PRESENT).add_transition(EVENT_NOT_PRESENT, STATE_FCP_NOT_PRESENT) + + cls.action_table[ACTION_FCP_ON_START] = cls.action_fcp_on_start + cls.sm = sm return cls.sm @@ -1468,6 +1490,14 @@ def action_on_start(cls, sfp): sfp.on_event(EVENT_RESET) else: sfp.on_event(EVENT_POWER_ON) + + @classmethod + def action_fcp_on_start(cls, sfp): + present = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{sfp.sdk_index}/present') + if present: + sfp.on_event(EVENT_PRESENT) + else: + sfp.on_event(EVENT_NOT_PRESENT) @classmethod def action_on_reset(cls, sfp): @@ -1564,10 +1594,12 @@ def in_stable_state(self): Returns: bool: True if the module is in a stable state """ - return self.state in (STATE_NOT_PRESENT, STATE_SW_CONTROL, STATE_FW_CONTROL, STATE_POWER_BAD, STATE_POWER_LIMIT_ERROR) + return self.state in (STATE_NOT_PRESENT, STATE_SW_CONTROL, STATE_FW_CONTROL, + STATE_POWER_BAD, STATE_POWER_LIMIT_ERROR, STATE_FCP_NOT_PRESENT, + STATE_FCP_PRESENT) def get_fds_for_poling(self): - if self.state == STATE_FW_CONTROL: + if self.state == STATE_FW_CONTROL or self.state == STATE_FCP_NOT_PRESENT or self.state == STATE_FCP_PRESENT: return { 'present': self.get_fd('present') } @@ -1583,11 +1615,9 @@ def fill_change_event(self, port_dict): Args: port_dict (dict): {:} """ - if self.state == STATE_NOT_PRESENT: + if self.state == STATE_NOT_PRESENT or self.state == STATE_FCP_NOT_PRESENT: port_dict[self.sdk_index + 1] = SFP_STATUS_REMOVED - elif self.state == STATE_SW_CONTROL: - port_dict[self.sdk_index + 1] = SFP_STATUS_INSERTED - elif self.state == 
STATE_FW_CONTROL: + elif self.state == STATE_SW_CONTROL or self.state == STATE_FW_CONTROL or self.state == STATE_FCP_PRESENT: port_dict[self.sdk_index + 1] = SFP_STATUS_INSERTED elif self.state == STATE_POWER_BAD or self.state == STATE_POWER_LIMIT_ERROR: sfp_state = SFP.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED | SFP.SFP_STATUS_BIT_INSERTED @@ -1606,7 +1636,7 @@ def refresh_poll_obj(self, poll_obj, all_registered_fds): # find fds registered by this SFP current_registered_fds = {item[2]: (fileno, item[1]) for fileno, item in all_registered_fds.items() if item[0] == self.sdk_index} logger.log_debug(f'SFP {self.sdk_index} registered fds are: {current_registered_fds}') - if self.state == STATE_FW_CONTROL: + if self.state == STATE_FW_CONTROL or self.state == STATE_FCP_NOT_PRESENT or self.state == STATE_FCP_PRESENT: target_poll_types = ['present'] else: target_poll_types = ['hw_present', 'power_good'] @@ -1642,9 +1672,10 @@ def is_dummy_event(self, fd_type, fd_value): """ if fd_type == 'hw_present' or fd_type == 'present': if fd_value == int(SFP_STATUS_INSERTED): - return self.state in (STATE_SW_CONTROL, STATE_FW_CONTROL, STATE_POWER_BAD, STATE_POWER_LIMIT_ERROR) + return self.state in (STATE_SW_CONTROL, STATE_FW_CONTROL, STATE_POWER_BAD, + STATE_POWER_LIMIT_ERROR, STATE_FCP_PRESENT) elif fd_value == int(SFP_STATUS_REMOVED): - return self.state == STATE_NOT_PRESENT + return self.state in (STATE_NOT_PRESENT, STATE_FCP_NOT_PRESENT) elif fd_type == 'power_good': if fd_value == 1: return self.state in (STATE_SW_CONTROL, STATE_NOT_PRESENT, STATE_RESETTING) diff --git a/platform/mellanox/mlnx-platform-api/tests/test_sfp_sm.py b/platform/mellanox/mlnx-platform-api/tests/test_sfp_sm.py index 9f2154173d32..a1e4a0280037 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_sfp_sm.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_sfp_sm.py @@ -168,3 +168,16 @@ def test_sw_control(self): s.disable_tx_for_sff_optics = mock.MagicMock() s.on_event(sfp.EVENT_START) assert 
s.get_state() == sfp.STATE_SW_CONTROL + + @mock.patch('sonic_platform.device_data.DeviceDataManager.get_always_fw_control_ports', mock.MagicMock(return_value=[0])) + def test_fcp_state(self): + self.mock_value('present', 1) + s = sfp.SFP(0) + s.on_event(sfp.EVENT_START) + assert s.get_state() == sfp.STATE_FCP_PRESENT + + self.mock_value('present', 0) + s = sfp.SFP(0) + s.on_event(sfp.EVENT_START) + assert s.get_state() == sfp.STATE_FCP_NOT_PRESENT +