From e6e34c137c75f354b60dc3c1cb71b23afacc09ed Mon Sep 17 00:00:00 2001 From: mihirpat1 <112018033+mihirpat1@users.noreply.github.com> Date: Wed, 20 Mar 2024 18:50:57 -0700 Subject: [PATCH] Disable periodic polling of port in DomInfoUpdateTask thread during CMIS init (#449) (#450) Cherry-pick for #449 Description We currently need to disable periodic DOM polling of a port through the DomInfoUpdateTask thread during CMIS initialization. Motivation and Context Disabling DOM polling during CMIS initialization is primarily needed to stop the DomInfoUpdateTask thread from sending CDB commands to read the FW version during CMIS initialization. For transceivers which do not support CDB background mode, any EEPROM access to the module can fail if a CDB command is executed at the same time. In order to disable DOM polling during CMIS initialization, the cmis_state from the CmisManagerTask thread is now being updated in the TRANSCEIVER_STATUS table of STATE_DB. If the current cmis_state does not belong to CMIS_TERMINAL_STATES, DomInfoUpdateTask will disable DOM polling for the port to allow the CmisManagerTask thread to complete CMIS initialization if required. For platforms with CmisManagerTask disabled, the function is_port_in_cmis_initialization_process will always return False to allow DOM polling. In case of device boot-up or transceiver insertion, the DomInfoUpdateTask thread will wait for the port to be in one of the CMIS_TERMINAL_STATES before proceeding with DOM polling. For non-CMIS transceivers, the expected cmis_state is CMIS_STATE_READY. Hence, once the corresponding port reaches the CMIS_STATE_READY state, DOM polling will be enabled for that port. Also, the cmis_state is not planned to be modified by the DomInfoUpdateTask thread at any time, to prevent a race condition with CmisManagerTask. How Has This Been Tested? 
Following is the summary of tests performed 1 Ensure DOM thread polls for non-CMIS transceivers 2 Ensure DOM thread polls for platform with CMIS manager disabled 3 Ensure DOM thread waits during CMIS initialization 4 Ensure DOM polling is resumed after transceiver insertion 4.1 After removal 4.2 After insertion 5. Ensured `show interface transceiver status` CLI works with the current changes Redis-db dump snippet to show `cmis_state` field in TRANSCEIVER_STATUS table root@sonic:/home/admin# redis-cli -n 6 hgetall "TRANSCEIVER_STATUS|Ethernet0" 1) "cmis_state" 2) "READY" Additional Information (Optional) MSFT ADO - 26993372 --- sonic-xcvrd/tests/test_xcvrd.py | 91 ++++++++++++++++--- sonic-xcvrd/xcvrd/xcvrd.py | 155 +++++++++++++++++++++----------- 2 files changed, 185 insertions(+), 61 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index dda908b5f..8c7d64939 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -77,7 +77,8 @@ def test_CmisManagerTask_task_run_with_exception(self): def test_DomInfoUpdateTask_task_run_with_exception(self): port_mapping = PortMapping() stop_event = threading.Event() - dom_info_update = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + mock_cmis_manager = MagicMock() + dom_info_update = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event, mock_cmis_manager) exception_received = None trace = None try: @@ -230,6 +231,16 @@ def test_del_port_sfp_dom_info_from_db(self): firmware_info_tbl = Table("STATE_DB", TRANSCEIVER_FIRMWARE_INFO_TABLE) del_port_sfp_dom_info_from_db(logical_port_name, port_mapping, init_tbl, dom_tbl, dom_threshold_tbl, pm_tbl, firmware_info_tbl) + @pytest.mark.parametrize("mock_found, mock_status_dict, expected_cmis_state", [ + (True, {'cmis_state': CMIS_STATE_INSERTED}, CMIS_STATE_INSERTED), + (False, {}, CMIS_STATE_UNKNOWN), + (True, {'other_key': 'some_value'}, CMIS_STATE_UNKNOWN) + ]) + def 
test_get_cmis_state_from_state_db(self, mock_found, mock_status_dict, expected_cmis_state): + status_tbl = MagicMock() + status_tbl.get.return_value = (mock_found, mock_status_dict) + assert get_cmis_state_from_state_db("Ethernet0", status_tbl) == expected_cmis_state + @patch('xcvrd.xcvrd.get_physical_port_name_dict', MagicMock(return_value={0: 'Ethernet0'})) @patch('xcvrd.xcvrd._wrapper_get_presence', MagicMock(return_value=True)) @patch('xcvrd.xcvrd._wrapper_get_transceiver_status', MagicMock(return_value={'module_state': 'ModuleReady', @@ -649,6 +660,22 @@ def test_DaemonXcvrd_run(self, mock_task_stop1, mock_task_stop2, mock_task_run1, assert mock_deinit.call_count == 1 assert mock_init.call_count == 1 + def test_CmisManagerTask_update_port_transceiver_status_table_sw_cmis_state(self): + port_mapping = PortMapping() + stop_event = threading.Event() + task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET) + task.on_port_update_event(port_change_event) + + task.xcvr_table_helper.get_status_tbl = MagicMock(return_value=None) + task.update_port_transceiver_status_table_sw_cmis_state("Ethernet0", CMIS_STATE_INSERTED) + + mock_get_status_tbl = MagicMock() + mock_get_status_tbl.set = MagicMock() + task.xcvr_table_helper.get_status_tbl.return_value = mock_get_status_tbl + task.update_port_transceiver_status_table_sw_cmis_state("Ethernet0", CMIS_STATE_INSERTED) + assert mock_get_status_tbl.set.call_count == 1 + @patch('xcvrd.xcvrd._wrapper_get_sfp_type', MagicMock(return_value='QSFP_DD')) def test_CmisManagerTask_handle_port_change_event(self): port_mapping = PortMapping() @@ -903,12 +930,14 @@ def test_CmisManagerTask_post_port_active_apsel_to_db(self): ret = task.post_port_active_apsel_to_db(mock_xcvr_api, lport, host_lanes_mask) assert int_tbl.getKeys() == [] + @patch('xcvrd.xcvrd.XcvrTableHelper.get_status_tbl') @patch('xcvrd.xcvrd.platform_chassis') 
@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(return_value=(None, None))) @patch('xcvrd.xcvrd_utilities.port_mapping.handle_port_update_event', MagicMock()) @patch('xcvrd.xcvrd._wrapper_get_sfp_type', MagicMock(return_value='QSFP_DD')) @patch('xcvrd.xcvrd.CmisManagerTask.wait_for_port_config_done', MagicMock()) - def test_CmisManagerTask_task_worker(self, mock_chassis): + def test_CmisManagerTask_task_worker(self, mock_chassis, mock_get_status_tbl): + mock_get_status_tbl = Table("STATE_DB", TRANSCEIVER_STATUS_TABLE) mock_xcvr_api = MagicMock() mock_xcvr_api.set_datapath_deinit = MagicMock(return_value=True) mock_xcvr_api.set_datapath_init = MagicMock(return_value=True) @@ -1005,7 +1034,13 @@ def test_CmisManagerTask_task_worker(self, mock_chassis): port_mapping = PortMapping() stop_event = threading.Event() task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + task.port_mapping.logical_port_list = ['Ethernet0'] + task.xcvr_table_helper.get_status_tbl.return_value = mock_get_status_tbl + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.task_worker() + assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_UNKNOWN + task.port_mapping.logical_port_list = MagicMock() port_change_event = PortChangeEvent('PortConfigDone', -1, 0, PortChangeEvent.PORT_SET) task.on_port_update_event(port_change_event) assert task.isPortConfigDone @@ -1014,6 +1049,7 @@ def test_CmisManagerTask_task_worker(self, mock_chassis): {'speed':'400000', 'lanes':'1,2,3,4,5,6,7,8'}) task.on_port_update_event(port_change_event) assert len(task.port_dict) == 1 + assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_INSERTED task.get_host_tx_status = MagicMock(return_value='true') 
task.get_port_admin_status = MagicMock(return_value='up') @@ -1025,31 +1061,38 @@ def test_CmisManagerTask_task_worker(self, mock_chassis): # Case 1: Module Inserted --> DP_DEINIT task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) task.task_worker() - assert task.port_dict['Ethernet0']['cmis_state'] == 'DP_DEINIT' + assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_DP_DEINIT task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) task.task_worker() assert mock_xcvr_api.set_datapath_deinit.call_count == 1 assert mock_xcvr_api.tx_disable_channel.call_count == 1 assert mock_xcvr_api.set_lpmode.call_count == 1 - assert task.port_dict['Ethernet0']['cmis_state'] == 'AP_CONFIGURED' + assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_AP_CONF # Case 2: DP_DEINIT --> AP Configured task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) task.task_worker() assert mock_xcvr_api.set_application.call_count == 1 - assert task.port_dict['Ethernet0']['cmis_state'] == 'DP_INIT' + assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_DP_INIT # Case 3: AP Configured --> DP_INIT task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) task.task_worker() assert mock_xcvr_api.set_datapath_init.call_count == 1 - assert task.port_dict['Ethernet0']['cmis_state'] == 'DP_TXON' + assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_DP_TXON # Case 4: DP_INIT --> DP_TXON task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) task.task_worker() 
assert mock_xcvr_api.tx_disable_channel.call_count == 2 - assert task.port_dict['Ethernet0']['cmis_state'] == 'DP_ACTIVATION' + assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_DP_ACTIVATE + + # Case 5: DP_TXON --> DP_ACTIVATION + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.post_port_active_apsel_to_db = MagicMock() + task.task_worker() + assert task.post_port_active_apsel_to_db.call_count == 1 + assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_READY @pytest.mark.parametrize("lport, expected_dom_polling", [ ('Ethernet0', 'disabled'), @@ -1071,7 +1114,8 @@ def mock_get(key): port_mapping = PortMapping() stop_event = threading.Event() - task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + mock_cmis_manager = MagicMock() + task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event, mock_cmis_manager) task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE) task.port_mapping.handle_port_change_event(PortChangeEvent('Ethernet4', 1, 0, PortChangeEvent.PORT_ADD)) task.port_mapping.handle_port_change_event(PortChangeEvent('Ethernet12', 1, 0, PortChangeEvent.PORT_ADD)) @@ -1084,12 +1128,34 @@ def mock_get(key): assert task.get_dom_polling_from_config_db(lport) == expected_dom_polling + @pytest.mark.parametrize("skip_cmis_manager, is_asic_index_none, mock_cmis_state, expected_result", [ + (True, False, None, False), + (False, False, CMIS_STATE_INSERTED, True), + (False, False, CMIS_STATE_READY, False), + (False, False, CMIS_STATE_UNKNOWN, True), + (False, True, None, False), + ]) + @patch('xcvrd.xcvrd.get_cmis_state_from_state_db') + def test_DomInfoUpdateTask_is_port_in_cmis_initialization_process(self, mock_get_cmis_state_from_state_db, skip_cmis_manager, is_asic_index_none, 
mock_cmis_state, expected_result): + port_mapping = PortMapping() + lport = 'Ethernet0' + port_change_event = PortChangeEvent(lport, 1, 0, PortChangeEvent.PORT_ADD) + stop_event = threading.Event() + task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event, skip_cmis_manager) + task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE) + task.on_port_config_change(port_change_event) + mock_get_cmis_state_from_state_db.return_value = mock_cmis_state + if is_asic_index_none: + lport='INVALID_PORT' + assert task.is_port_in_cmis_initialization_process(lport) == expected_result + @patch('xcvrd.xcvrd.XcvrTableHelper', MagicMock()) @patch('xcvrd.xcvrd.delete_port_from_status_table_hw') def test_DomInfoUpdateTask_handle_port_change_event(self, mock_del_status_tbl_hw): port_mapping = PortMapping() stop_event = threading.Event() - task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + mock_cmis_manager = MagicMock() + task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event, mock_cmis_manager) task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE) port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_ADD) task.on_port_config_change(port_change_event) @@ -1112,7 +1178,8 @@ def test_DomInfoUpdateTask_handle_port_change_event(self, mock_del_status_tbl_hw def test_DomInfoUpdateTask_task_run_stop(self): port_mapping = PortMapping() stop_event = threading.Event() - task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + mock_cmis_manager = MagicMock() + task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event, mock_cmis_manager) task.start() task.join() assert not task.is_alive() @@ -1137,10 +1204,12 @@ def test_DomInfoUpdateTask_task_worker(self, mock_post_pm_info, mock_update_stat port_mapping = PortMapping() stop_event = threading.Event() - task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + mock_cmis_manager = MagicMock() + task = 
DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event, mock_cmis_manager) task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE) task.task_stopping_event.wait = MagicMock(side_effect=[False, True]) task.get_dom_polling_from_config_db = MagicMock(return_value='enabled') + task.is_port_in_cmis_terminal_state = MagicMock(return_value=False) mock_detect_error.return_value = True task.task_worker() assert task.port_mapping.logical_port_list.count('Ethernet0') diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index dacb7f8e3..61a01acef 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -48,7 +48,24 @@ TRANSCEIVER_STATUS_TABLE = 'TRANSCEIVER_STATUS' TRANSCEIVER_PM_TABLE = 'TRANSCEIVER_PM' -TRANSCEIVER_STATUS_TABLE_SW_FIELDS = ["status", "error"] +TRANSCEIVER_STATUS_TABLE_SW_FIELDS = ["status", "error", "cmis_state"] + +CMIS_STATE_UNKNOWN = 'UNKNOWN' +CMIS_STATE_INSERTED = 'INSERTED' +CMIS_STATE_DP_DEINIT = 'DP_DEINIT' +CMIS_STATE_AP_CONF = 'AP_CONFIGURED' +CMIS_STATE_DP_ACTIVATE = 'DP_ACTIVATION' +CMIS_STATE_DP_INIT = 'DP_INIT' +CMIS_STATE_DP_TXON = 'DP_TXON' +CMIS_STATE_READY = 'READY' +CMIS_STATE_REMOVED = 'REMOVED' +CMIS_STATE_FAILED = 'FAILED' + +CMIS_TERMINAL_STATES = { + CMIS_STATE_FAILED, + CMIS_STATE_READY, + CMIS_STATE_REMOVED + } # Mgminit time required as per CMIS spec MGMT_INIT_TIME_DELAY_SECS = 2 @@ -843,6 +860,14 @@ def update_port_transceiver_status_table_sw(logical_port_name, status_tbl, statu fvs = swsscommon.FieldValuePairs([('status', status), ('error', error_descriptions)]) status_tbl.set(logical_port_name, fvs) +def get_cmis_state_from_state_db(lport, status_tbl): + found, transceiver_status_dict = status_tbl.get(lport) + if found and 'cmis_state' in dict(transceiver_status_dict): + return dict(transceiver_status_dict)['cmis_state'] + else: + return CMIS_STATE_UNKNOWN + + # Update port SFP status table for HW fields @@ -911,17 +936,6 @@ class CmisManagerTask(threading.Thread): CMIS_MODULE_TYPES 
= ['QSFP-DD', 'QSFP_DD', 'OSFP', 'QSFP+C'] CMIS_MAX_HOST_LANES = 8 - CMIS_STATE_UNKNOWN = 'UNKNOWN' - CMIS_STATE_INSERTED = 'INSERTED' - CMIS_STATE_DP_DEINIT = 'DP_DEINIT' - CMIS_STATE_AP_CONF = 'AP_CONFIGURED' - CMIS_STATE_DP_ACTIVATE = 'DP_ACTIVATION' - CMIS_STATE_DP_INIT = 'DP_INIT' - CMIS_STATE_DP_TXON = 'DP_TXON' - CMIS_STATE_READY = 'READY' - CMIS_STATE_REMOVED = 'REMOVED' - CMIS_STATE_FAILED = 'FAILED' - def __init__(self, namespaces, port_mapping, main_thread_stop_event, skip_cmis_mgr=False): threading.Thread.__init__(self) self.name = "CmisManagerTask" @@ -942,6 +956,17 @@ def log_notice(self, message): def log_error(self, message): helper_logger.log_error("CMIS: {}".format(message)) + def update_port_transceiver_status_table_sw_cmis_state(self, lport, cmis_state_to_set): + asic_index = self.port_mapping.get_asic_id_for_logical_port(lport) + status_table = self.xcvr_table_helper.get_status_tbl(asic_index) + if status_table is None: + helper_logger.log_error("status_table is None while updating " + "sw CMIS state for lport {}".format(lport)) + return + + fvs = swsscommon.FieldValuePairs([('cmis_state', cmis_state_to_set)]) + status_table.set(lport, fvs) + def on_port_update_event(self, port_change_event): if port_change_event.event_type not in [port_change_event.PORT_SET, port_change_event.PORT_DEL]: return @@ -993,7 +1018,7 @@ def on_port_update_event(self, port_change_event): self.force_cmis_reinit(lport, 0) else: - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_REMOVED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_REMOVED) def get_interface_speed(self, ifname): """ @@ -1204,7 +1229,7 @@ def force_cmis_reinit(self, lport, retries=0): """ Try to force the restart of CMIS state machine """ - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_INSERTED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_INSERTED) self.port_dict[lport]['cmis_retries'] = retries 
self.port_dict[lport]['cmis_expired'] = None # No expiration @@ -1433,6 +1458,10 @@ def task_worker(self): for namespace in self.namespaces: self.wait_for_port_config_done(namespace) + logical_port_list = self.port_mapping.logical_port_list + for lport in logical_port_list: + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_UNKNOWN) + # APPL_DB for CONFIG updates, and STATE_DB for insertion/removal sel, asic_context = port_mapping.subscribe_port_update_event(self.namespaces, helper_logger) while not self.task_stopping_event.is_set(): @@ -1450,12 +1479,9 @@ def task_worker(self): if lport not in self.port_dict: continue - state = self.port_dict[lport].get('cmis_state', self.CMIS_STATE_UNKNOWN) - if state in [self.CMIS_STATE_UNKNOWN, - self.CMIS_STATE_FAILED, - self.CMIS_STATE_READY, - self.CMIS_STATE_REMOVED]: - if state != self.CMIS_STATE_READY: + state = get_cmis_state_from_state_db(lport, self.xcvr_table_helper.get_status_tbl(self.port_mapping.get_asic_id_for_logical_port(lport))) + if state in CMIS_TERMINAL_STATES or state == CMIS_STATE_UNKNOWN: + if state != CMIS_STATE_READY: self.port_dict[lport]['appl'] = 0 self.port_dict[lport]['host_lanes_mask'] = 0 continue @@ -1482,7 +1508,7 @@ def task_worker(self): # double-check the HW presence before moving forward sfp = platform_chassis.get_sfp(pport) if not sfp.get_presence(): - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_REMOVED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_REMOVED) continue try: @@ -1490,19 +1516,19 @@ def task_worker(self): api = sfp.get_xcvr_api() if api is None: self.log_error("{}: skipping CMIS state machine since no xcvr api!!!".format(lport)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY) continue # Skip if it's not a paged memory device if api.is_flat_memory(): self.log_notice("{}: skipping CMIS state machine for flat memory 
xcvr".format(lport)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY) continue # Skip if it's not a CMIS module type = api.get_module_type_abbreviation() if (type is None) or (type not in self.CMIS_MODULE_TYPES): - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY) continue if api.is_coherent_module(): @@ -1512,7 +1538,7 @@ def task_worker(self): self.port_dict[lport]['laser_freq'] = self.get_configured_laser_freq_from_db(lport) except AttributeError: # Skip if these essential routines are not available - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY) continue # CMIS expiration and retries @@ -1525,10 +1551,10 @@ def task_worker(self): retries = self.port_dict[lport].get('cmis_retries', 0) host_lanes_mask = self.port_dict[lport].get('host_lanes_mask', 0) appl = self.port_dict[lport].get('appl', 0) - if state != self.CMIS_STATE_INSERTED and (host_lanes_mask <= 0 or appl < 1): + if state != CMIS_STATE_INSERTED and (host_lanes_mask <= 0 or appl < 1): self.log_error("{}: Unexpected value for host_lanes_mask {} or appl {} in " "{} state".format(lport, host_lanes_mask, appl, state)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED) continue self.log_notice("{}: {}G, lanemask=0x{:x}, state={}, appl {} host_lane_count {} " @@ -1536,18 +1562,18 @@ def task_worker(self): state, appl, host_lane_count, retries)) if retries > self.CMIS_MAX_RETRIES: self.log_error("{}: FAILED".format(lport)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED) continue try: # CMIS state transitions - if state == 
self.CMIS_STATE_INSERTED: + if state == CMIS_STATE_INSERTED: self.port_dict[lport]['appl'] = self.get_cmis_application_desired(api, host_lane_count, host_speed) if self.port_dict[lport]['appl'] < 1: self.log_error("{}: no suitable app for the port appl {} host_lane_count {} " "host_speed {}".format(lport, appl, host_lane_count, host_speed)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED) continue appl = self.port_dict[lport]['appl'] self.log_notice("{}: Setting appl={}".format(lport, appl)) @@ -1557,7 +1583,7 @@ def task_worker(self): if self.port_dict[lport]['host_lanes_mask'] <= 0: self.log_error("{}: Invalid lane mask received - host_lane_count {} subport {} " "appl {}!".format(lport, host_lane_count, subport, appl)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED) continue host_lanes_mask = self.port_dict[lport]['host_lanes_mask'] self.log_notice("{}: Setting host_lanemask=0x{:x}".format(lport, host_lanes_mask)) @@ -1572,7 +1598,7 @@ def task_worker(self): self.log_error("{}: Invalid media lane mask received - media_lane_count {} " "media_lane_assignment_options {} lport{} subport {}" " appl {}!".format(media_lane_count,media_lane_assignment_options,lport,subport,appl)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED) continue media_lanes_mask = self.port_dict[lport]['media_lanes_mask'] self.log_notice("{}: Setting media_lanemask=0x{:x}".format(lport, media_lanes_mask)) @@ -1582,7 +1608,7 @@ def task_worker(self): self.log_notice("{} Forcing Tx laser OFF".format(lport)) # Force DataPath re-init api.tx_disable_channel(media_lanes_mask, True) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + self.update_port_transceiver_status_table_sw_cmis_state(lport, 
CMIS_STATE_READY) continue # Configure the target output power if ZR module if api.is_coherent_module(): @@ -1607,11 +1633,11 @@ def task_worker(self): if not need_update: # No application updates self.log_notice("{}: no CMIS application update required...READY".format(lport)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY) continue self.log_notice("{}: force Datapath reinit".format(lport)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_DP_DEINIT - elif state == self.CMIS_STATE_DP_DEINIT: + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_DP_DEINIT) + elif state == CMIS_STATE_DP_DEINIT: # D.2.2 Software Deinitialization api.set_datapath_deinit(host_lanes_mask) @@ -1624,13 +1650,13 @@ def task_worker(self): #Sets module to high power mode and doesn't impact datapath if module is already in high power mode api.set_lpmode(False) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_AP_CONF + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_AP_CONF) dpDeinitDuration = self.get_cmis_dp_deinit_duration_secs(api) modulePwrUpDuration = self.get_cmis_module_power_up_duration_secs(api) self.log_notice("{}: DpDeinit duration {} secs, modulePwrUp duration {} secs".format(lport, dpDeinitDuration, modulePwrUpDuration)) self.port_dict[lport]['cmis_expired'] = now + datetime.timedelta(seconds = max(modulePwrUpDuration, dpDeinitDuration)) - elif state == self.CMIS_STATE_AP_CONF: + elif state == CMIS_STATE_AP_CONF: # Explicit control bit to apply custom Host SI settings. 
# It will be set to 1 and applied via set_application if # custom SI settings is applicable @@ -1683,8 +1709,8 @@ def task_worker(self): self.force_cmis_reinit(lport, retries + 1) continue - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_DP_INIT - elif state == self.CMIS_STATE_DP_INIT: + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_DP_INIT) + elif state == CMIS_STATE_DP_INIT: if not self.check_config_error(api, host_lanes_mask, ['ConfigSuccess']): if (expired is not None) and (expired <= now): self.log_notice("{}: timeout for 'ConfigSuccess'".format(lport)) @@ -1713,8 +1739,8 @@ def task_worker(self): dpInitDuration = self.get_cmis_dp_init_duration_secs(api) self.log_notice("{}: DpInit duration {} secs".format(lport, dpInitDuration)) self.port_dict[lport]['cmis_expired'] = now + datetime.timedelta(seconds=dpInitDuration) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_DP_TXON - elif state == self.CMIS_STATE_DP_TXON: + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_DP_TXON) + elif state == CMIS_STATE_DP_TXON: if not self.check_datapath_state(api, host_lanes_mask, ['DataPathInitialized']): if (expired is not None) and (expired <= now): self.log_notice("{}: timeout for 'DataPathInitialized'".format(lport)) @@ -1725,8 +1751,8 @@ def task_worker(self): media_lanes_mask = self.port_dict[lport]['media_lanes_mask'] api.tx_disable_channel(media_lanes_mask, False) self.log_notice("{}: Turning ON tx power".format(lport)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_DP_ACTIVATE - elif state == self.CMIS_STATE_DP_ACTIVATE: + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_DP_ACTIVATE) + elif state == CMIS_STATE_DP_ACTIVATE: if not self.check_datapath_state(api, host_lanes_mask, ['DataPathActivated']): if (expired is not None) and (expired <= now): self.log_notice("{}: timeout for 'DataPathActivated'".format(lport)) @@ -1734,12 +1760,12 @@ def task_worker(self): continue 
self.log_notice("{}: READY".format(lport)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY) self.post_port_active_apsel_to_db(api, lport, host_lanes_mask) except (NotImplementedError, AttributeError) as e: self.log_error("{}: internal errors due to {}".format(lport, e)) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED + self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED) self.log_notice("Stopped") @@ -1775,7 +1801,7 @@ def join(self): class DomInfoUpdateTask(threading.Thread): - def __init__(self, namespaces, port_mapping, main_thread_stop_event): + def __init__(self, namespaces, port_mapping, main_thread_stop_event, skip_cmis_mgr): threading.Thread.__init__(self) self.name = "DomInfoUpdateTask" self.exc = None @@ -1783,6 +1809,7 @@ def __init__(self, namespaces, port_mapping, main_thread_stop_event): self.main_thread_stop_event = main_thread_stop_event self.port_mapping = copy.deepcopy(port_mapping) self.namespaces = namespaces + self.skip_cmis_mgr = skip_cmis_mgr def get_dom_polling_from_config_db(self, lport): """ @@ -1820,8 +1847,35 @@ def get_dom_polling_from_config_db(self, lport): return dom_polling + """ + Checks if the port is going through CMIS initialization process + This API assumes CMIS_STATE_UNKNOWN as a transitional state since it is the + first state after starting CMIS state machine. + This assumption allows the DomInfoUpdateTask thread to skip polling on the port + to allow CMIS initialization to complete if needed. 
+ Returns: + True if the port is in CMIS initialization process, + otherwise False + """ + def is_port_in_cmis_initialization_process(self, logical_port_name): + # If CMIS manager is not available for the platform, return False + if self.skip_cmis_mgr: + return False + + asic_index = self.port_mapping.get_asic_id_for_logical_port(logical_port_name) + if asic_index is None: + helper_logger.log_warning("Got invalid asic index for {} while checking cmis init status".format(logical_port_name)) + return False + + cmis_state = get_cmis_state_from_state_db(logical_port_name, self.xcvr_table_helper.get_status_tbl(asic_index)) + if cmis_state not in CMIS_TERMINAL_STATES: + return True + else: + return False + def is_port_dom_monitoring_disabled(self, logical_port_name): - return self.get_dom_polling_from_config_db(logical_port_name) == 'disabled' + return self.get_dom_polling_from_config_db(logical_port_name) == 'disabled' or \ + self.is_port_in_cmis_initialization_process(logical_port_name) def task_worker(self): self.xcvr_table_helper = XcvrTableHelper(self.namespaces) @@ -2614,13 +2668,14 @@ def run(self): port_mapping_data = self.init() # Start the CMIS manager - cmis_manager = CmisManagerTask(self.namespaces, port_mapping_data, self.stop_event, self.skip_cmis_mgr) + cmis_manager = None if not self.skip_cmis_mgr: + cmis_manager = CmisManagerTask(self.namespaces, port_mapping_data, self.stop_event, self.skip_cmis_mgr) cmis_manager.start() self.threads.append(cmis_manager) # Start the dom sensor info update thread - dom_info_update = DomInfoUpdateTask(self.namespaces, port_mapping_data, self.stop_event) + dom_info_update = DomInfoUpdateTask(self.namespaces, port_mapping_data, self.stop_event, self.skip_cmis_mgr) dom_info_update.start() self.threads.append(dom_info_update)