From ca812b0df105ecb98b90f2b9ef6b7aa24592222b Mon Sep 17 00:00:00 2001 From: mihirpat1 <112018033+mihirpat1@users.noreply.github.com> Date: Mon, 7 Oct 2024 19:41:27 -0700 Subject: [PATCH] Xcvrd crash and restart should not cause link flap on platforms needing custom NPU SI settings (#541) * Xcvrd crash and restart should not cause link flap on platforms needing custom SI settings Signed-off-by: Mihir Patel * Improved code coverage --------- Signed-off-by: Mihir Patel --- sonic-xcvrd/tests/test_xcvrd.py | 51 +++++++++++++- sonic-xcvrd/xcvrd/xcvrd.py | 46 +++++++++++-- .../xcvrd_utilities/media_settings_parser.py | 23 ++++++- .../xcvrd_utilities/xcvr_table_helper.py | 69 ++++++++++++++++++- 4 files changed, 177 insertions(+), 12 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index b21b3d22a..d259862c5 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -387,6 +387,33 @@ def test_is_cmis_api(self, mock_class, expected_return_value): mock_xcvr_api.__class__ = mock_class assert is_cmis_api(mock_xcvr_api) == expected_return_value + def test_get_state_db_port_table_val_by_key(self): + xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE) + port_mapping = PortMapping() + + assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", None, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == None + + port_mapping.get_asic_id_for_logical_port = MagicMock(return_value=0) + xcvr_table_helper.get_state_port_tbl = MagicMock(return_value=None) + assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == None + + mock_state_port_table = MagicMock() + xcvr_table_helper.get_state_port_tbl = MagicMock(return_value=mock_state_port_table) + mock_state_port_table.get = MagicMock(return_value=(None, None)) + assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == None + + mock_state_port_table.get = MagicMock(return_value=(True, {'A' : 'B'})) + assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == None + mock_state_port_table.get = MagicMock(return_value=(True, {NPU_SI_SETTINGS_SYNC_STATUS_KEY : NPU_SI_SETTINGS_DEFAULT_VALUE})) + assert xcvr_table_helper.get_state_db_port_table_val_by_key("Ethernet0", port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) == NPU_SI_SETTINGS_DEFAULT_VALUE + + def test_is_npu_si_settings_update_required(self): + xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE) + port_mapping = PortMapping() + xcvr_table_helper.get_state_db_port_table_val_by_key = MagicMock(side_effect=[None, NPU_SI_SETTINGS_NOTIFIED_VALUE]) + assert xcvr_table_helper.is_npu_si_settings_update_required("Ethernet0", port_mapping) + assert not xcvr_table_helper.is_npu_si_settings_update_required("Ethernet0", port_mapping) + @patch('xcvrd.xcvrd._wrapper_get_sfp_type') @patch('xcvrd.xcvrd_utilities.port_event_helper.PortMapping.logical_port_name_to_physical_port_list', MagicMock(return_value=[0])) @patch('xcvrd.xcvrd._wrapper_get_presence', MagicMock(return_value=True)) @@ -986,10 +1013,12 @@ def _check_notify_media_setting(self, index, expected_found=False, expected_valu } } if xcvr_info_dict is None else xcvr_info_dict app_port_tbl = Table("APPL_DB", 'PORT_TABLE') + xcvr_table_helper.get_app_port_tbl = MagicMock(return_value=app_port_tbl) + xcvr_table_helper.is_npu_si_settings_update_required = MagicMock(return_value=True) port_mapping = PortMapping() port_change_event = PortChangeEvent('Ethernet0', index, 0, PortChangeEvent.PORT_ADD) port_mapping.handle_port_change_event(port_change_event) - media_settings_parser.notify_media_setting(logical_port_name, xcvr_info_dict, app_port_tbl, mock_cfg_table, port_mapping) + media_settings_parser.notify_media_setting(logical_port_name, xcvr_info_dict, xcvr_table_helper, port_mapping) found, result = app_port_tbl.get(logical_port_name) result_dict = dict(result) if result else None assert found == expected_found @@ -1335,6 +1364,24 @@ def test_DaemonXcvrd_wait_for_port_config_done(self, mock_select, mock_sub_table xcvrd.wait_for_port_config_done('') assert swsscommon.Select.select.call_count == 2 + def test_DaemonXcvrd_initialize_port_init_control_fields_in_port_table(self): + port_mapping = PortMapping() + xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) + + port_mapping.logical_port_list = ['Ethernet0'] + port_mapping.get_asic_id_for_logical_port = MagicMock(return_value=0) + mock_xcvrd_table_helper = MagicMock() + mock_xcvrd_table_helper.get_state_port_tbl = MagicMock(return_value=None) + xcvrd.xcvr_table_helper = mock_xcvrd_table_helper + xcvrd.initialize_port_init_control_fields_in_port_table(port_mapping) + + mock_state_db = MagicMock() + mock_xcvrd_table_helper.get_state_port_tbl = MagicMock(return_value=mock_state_db) + mock_state_db.get = MagicMock(return_value=(False, {})) + + xcvrd.initialize_port_init_control_fields_in_port_table(port_mapping) + mock_state_db.set.call_count = 2 + @patch('xcvrd.xcvrd.DaemonXcvrd.init') @patch('xcvrd.xcvrd.DaemonXcvrd.deinit') @patch('xcvrd.xcvrd.DomInfoUpdateTask.start') @@ -3200,6 +3247,7 @@ def test_DaemonXcvrd_init_deinit_fastboot_enabled(self): xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) with patch("subprocess.check_output") as mock_run: mock_run.return_value = "true" + xcvrd.initialize_port_init_control_fields_in_port_table = MagicMock() xcvrd.init() @@ -3226,6 +3274,7 @@ def test_DaemonXcvrd_init_deinit_cold(self): xcvrdaemon = DaemonXcvrd(SYSLOG_IDENTIFIER) with patch("subprocess.check_output") as mock_run: mock_run.return_value = "false" + xcvrdaemon.initialize_port_init_control_fields_in_port_table = MagicMock() xcvrdaemon.init() diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index f07da1572..fbb31269f 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -28,7 +28,7 @@ from .xcvrd_utilities import sfp_status_helper from .sff_mgr import SffManagerTask - from .xcvrd_utilities.xcvr_table_helper import XcvrTableHelper + from .xcvrd_utilities.xcvr_table_helper import * from .xcvrd_utilities import port_event_helper from .xcvrd_utilities.port_event_helper import PortChangeObserver from .xcvrd_utilities import media_settings_parser @@ -1934,7 +1934,7 @@ def _post_port_sfp_info_and_dom_thr_to_db_once(self, port_mapping, xcvr_table_he # Do not notify media settings during warm reboot to avoid dataplane traffic impact if is_warm_start == False: - media_settings_parser.notify_media_setting(logical_port_name, transceiver_dict, xcvr_table_helper.get_app_port_tbl(asic_index), xcvr_table_helper.get_cfg_port_tbl(asic_index), port_mapping) + media_settings_parser.notify_media_setting(logical_port_name, transceiver_dict, xcvr_table_helper, port_mapping) transceiver_dict.clear() else: retry_eeprom_set.add(logical_port_name) @@ -2156,10 +2156,12 @@ def task_worker(self, stopping_event, sfp_error_event): if rc != SFP_EEPROM_NOT_READY: post_port_dom_threshold_info_to_db(logical_port, self.port_mapping, self.xcvr_table_helper.get_dom_threshold_tbl(asic_index)) - media_settings_parser.notify_media_setting(logical_port, transceiver_dict, self.xcvr_table_helper.get_app_port_tbl(asic_index), self.xcvr_table_helper.get_cfg_port_tbl(asic_index), self.port_mapping) + media_settings_parser.notify_media_setting(logical_port, transceiver_dict, self.xcvr_table_helper, self.port_mapping) transceiver_dict.clear() elif value == sfp_status_helper.SFP_STATUS_REMOVED: helper_logger.log_notice("{}: Got SFP removed event".format(logical_port)) + state_port_table = self.xcvr_table_helper.get_state_port_tbl(asic_index) + state_port_table.set(logical_port, [(NPU_SI_SETTINGS_SYNC_STATUS_KEY, NPU_SI_SETTINGS_DEFAULT_VALUE)]) update_port_transceiver_status_table_sw( logical_port, self.xcvr_table_helper.get_status_tbl(asic_index), sfp_status_helper.SFP_STATUS_REMOVED) helper_logger.log_notice("{}: received plug out and update port sfp status table.".format(logical_port)) @@ -2354,7 +2356,7 @@ def on_add_logical_port(self, port_change_event): self.retry_eeprom_set.add(port_change_event.port_name) else: post_port_dom_threshold_info_to_db(port_change_event.port_name, self.port_mapping, dom_threshold_tbl) - media_settings_parser.notify_media_setting(port_change_event.port_name, transceiver_dict, self.xcvr_table_helper.get_app_port_tbl(port_change_event.asic_id), self.xcvr_table_helper.get_cfg_port_tbl(port_change_event.asic_id), self.port_mapping) + media_settings_parser.notify_media_setting(port_change_event.port_name, transceiver_dict, self.xcvr_table_helper, self.port_mapping) else: status = sfp_status_helper.SFP_STATUS_REMOVED if not status else status update_port_transceiver_status_table_sw(port_change_event.port_name, status_tbl, status, error_description) @@ -2380,7 +2382,7 @@ def retry_eeprom_reading(self): rc = post_port_sfp_info_to_db(logical_port, self.port_mapping, self.xcvr_table_helper.get_intf_tbl(asic_index), transceiver_dict) if rc != SFP_EEPROM_NOT_READY: post_port_dom_threshold_info_to_db(logical_port, self.port_mapping, self.xcvr_table_helper.get_dom_threshold_tbl(asic_index)) - media_settings_parser.notify_media_setting(logical_port, transceiver_dict, self.xcvr_table_helper.get_app_port_tbl(asic_index), self.xcvr_table_helper.get_cfg_port_tbl(asic_index), self.port_mapping) + media_settings_parser.notify_media_setting(logical_port, transceiver_dict, self.xcvr_table_helper, self.port_mapping) transceiver_dict.clear() retry_success_set.add(logical_port) # Update retry EEPROM set @@ -2437,6 +2439,30 @@ def wait_for_port_config_done(self, namespace): if key in ["PortConfigDone", "PortInitDone"]: break + """ + Initialize NPU_SI_SETTINGS_SYNC_STATUS_KEY field in STATE_DB PORT_TABLE| + if not already present for a port. + """ + def initialize_port_init_control_fields_in_port_table(self, port_mapping_data): + logical_port_list = port_mapping_data.logical_port_list + for lport in logical_port_list: + asic_index = port_mapping_data.get_asic_id_for_logical_port(lport) + state_port_table = self.xcvr_table_helper.get_state_port_tbl(asic_index) + if state_port_table is None: + helper_logger.log_error("Port init control: state_port_tbl is None for lport {}".format(lport)) + continue + + found, state_port_table_fvs = state_port_table.get(lport) + if not found: + self.log_notice("Port init control: Creating STATE_DB PORT_TABLE as unable to find for lport {}".format(lport)) + state_port_table_fvs = [] + state_port_table_fvs_dict = dict(state_port_table_fvs) + if NPU_SI_SETTINGS_SYNC_STATUS_KEY not in state_port_table_fvs_dict: + state_port_table.set(lport, [(NPU_SI_SETTINGS_SYNC_STATUS_KEY, + NPU_SI_SETTINGS_DEFAULT_VALUE)]) + self.log_notice("Port init control: Initialized NPU_SI_SETTINGS_SYNC_STATUS for lport {}".format(lport)) + + self.log_notice("XCVRD INIT: Port init control fields initialized in STATE_DB PORT_TABLE") # Initialize daemon def init(self): @@ -2490,7 +2516,11 @@ def init(self): self.wait_for_port_config_done(namespace) self.log_notice("XCVRD INIT: After port config is done") - return port_event_helper.get_port_mapping(self.namespaces) + port_mapping_data = port_event_helper.get_port_mapping(self.namespaces) + + self.initialize_port_init_control_fields_in_port_table(port_mapping_data) + + return port_mapping_data # Deinitialize daemon def deinit(self): @@ -2508,7 +2538,9 @@ def deinit(self): helper_logger.log_warning("Got invalid asic index for {}, ignored".format(logical_port_name)) continue - intf_tbl = self.xcvr_table_helper.get_intf_tbl(asic_index) if not is_warm_fast_reboot else None + # Skip deleting intf_tbl for avoiding OA to trigger Tx disable signal + # due to TRANSCEIVER_INFO table deletion during xcvrd shutdown/crash + intf_tbl = None del_port_sfp_dom_info_from_db(logical_port_name, port_mapping_data, intf_tbl, diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/media_settings_parser.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/media_settings_parser.py index 2dd79d5ea..4242e8571 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/media_settings_parser.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/media_settings_parser.py @@ -11,6 +11,7 @@ from sonic_py_common import device_info, logger from swsscommon import swsscommon from xcvrd import xcvrd +from .xcvr_table_helper import * g_dict = {} @@ -24,6 +25,7 @@ SYSLOG_IDENTIFIER = "xcvrd" helper_logger = logger.Logger(SYSLOG_IDENTIFIER) +PHYSICAL_PORT_NOT_EXIST = -1 def load_media_settings(): global g_dict @@ -303,12 +305,24 @@ def get_speed_lane_count_and_subport(port, cfg_port_tbl): def notify_media_setting(logical_port_name, transceiver_dict, - app_port_tbl, cfg_port_tbl, port_mapping): + xcvr_table_helper, port_mapping): if not media_settings_present(): return - port_speed, lane_count, subport_num = get_speed_lane_count_and_subport(logical_port_name, cfg_port_tbl) + if not xcvr_table_helper: + helper_logger.log_error("Notify media setting: xcvr_table_helper " + "not initialized for lport {}".format(logical_port_name)) + return + + if not xcvr_table_helper.is_npu_si_settings_update_required(logical_port_name, port_mapping): + helper_logger.log_notice("Notify media setting: Media settings already " + "notified for lport {}".format(logical_port_name)) + return + + asic_index = port_mapping.get_asic_id_for_logical_port(logical_port_name) + + port_speed, lane_count, subport_num = get_speed_lane_count_and_subport(logical_port_name, xcvr_table_helper.get_cfg_port_tbl(asic_index)) ganged_port = False ganged_member_num = 1 @@ -354,4 +368,7 @@ def notify_media_setting(logical_port_name, transceiver_dict, fvs[index] = (str(media_key), str(val_str)) index += 1 - app_port_tbl.set(port_name, fvs) + xcvr_table_helper.get_app_port_tbl(asic_index).set(port_name, fvs) + xcvr_table_helper.get_state_port_tbl(asic_index).set(logical_port_name, [(NPU_SI_SETTINGS_SYNC_STATUS_KEY, NPU_SI_SETTINGS_NOTIFIED_VALUE)]) + helper_logger.log_notice("Notify media setting: Published ASIC-side SI setting " + "for lport {} in APP_DB".format(logical_port_name)) diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py index 2f372a76f..863eda5b5 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py @@ -1,10 +1,13 @@ try: - from sonic_py_common import daemon_base + from sonic_py_common import daemon_base, logger from sonic_py_common import multi_asic from swsscommon import swsscommon except ImportError as e: raise ImportError(str(e) + " - required module not found") +SYSLOG_IDENTIFIER = "xcvrd" +helper_logger = logger.Logger(SYSLOG_IDENTIFIER) + TRANSCEIVER_INFO_TABLE = 'TRANSCEIVER_INFO' TRANSCEIVER_FIRMWARE_INFO_TABLE = 'TRANSCEIVER_FIRMWARE_INFO' TRANSCEIVER_DOM_SENSOR_TABLE = 'TRANSCEIVER_DOM_SENSOR' @@ -12,6 +15,10 @@ TRANSCEIVER_STATUS_TABLE = 'TRANSCEIVER_STATUS' TRANSCEIVER_PM_TABLE = 'TRANSCEIVER_PM' +NPU_SI_SETTINGS_SYNC_STATUS_KEY = 'NPU_SI_SETTINGS_SYNC_STATUS' +NPU_SI_SETTINGS_DEFAULT_VALUE = 'NPU_SI_SETTINGS_DEFAULT' +NPU_SI_SETTINGS_NOTIFIED_VALUE = 'NPU_SI_SETTINGS_NOTIFIED' + class XcvrTableHelper: def __init__(self, namespaces): self.int_tbl, self.dom_tbl, self.dom_threshold_tbl, self.status_tbl, self.app_port_tbl, \ @@ -62,3 +69,63 @@ def get_cfg_port_tbl(self, asic_id): def get_state_port_tbl(self, asic_id): return self.state_port_tbl[asic_id] + + def get_state_db_port_table_val_by_key(self, lport, port_mapping, key): + """ + Retrieves the value of a key from STATE_DB PORT_TABLE| for the given logical port + Args: + lport: + logical port name + port_mapping: + A PortMapping object + key: + key for the corresponding value to be retrieved + Returns: + The value of the key if the key is found in STATE_DB PORT_TABLE| + None otherwise + """ + + if port_mapping is None: + helper_logger.log_error("Get value by key from STATE_DB: port_mapping is None " + "for lport {}".format(lport)) + return None + + asic_index = port_mapping.get_asic_id_for_logical_port(lport) + state_port_table = self.get_state_port_tbl(asic_index) + if state_port_table is None: + helper_logger.log_error("Get value by key from STATE_DB: state_db is None with asic_index {} " + "for lport {}".format(asic_index, lport)) + return None + + found, state_port_table_fvs = state_port_table.get(lport) + if not found: + helper_logger.log_error("Get value by key from STATE_DB: Unable to find lport {}".format(lport)) + return None + + state_port_table_fvs_dict = dict(state_port_table_fvs) + if key not in state_port_table_fvs_dict: + helper_logger.log_error("Get value by key from STATE_DB: Unable to find key {} " + "state_port_table_fvs_dict {} for lport {}".format(key, state_port_table_fvs_dict, lport)) + return None + + return state_port_table_fvs_dict[key] + + def is_npu_si_settings_update_required(self, lport, port_mapping): + """ + Checks if NPU SI settings update is required for a module + Args: + lport: + logical port name + port_mapping: + A PortMapping object + Returns: + True if NPU_SI_SETTINGS_SYNC_STATUS_KEY is + - not present/accessible from STATE_DB or + - set to NPU_SI_SETTINGS_DEFAULT_VALUE + False otherwise + """ + npu_si_settings_sync_val = self.get_state_db_port_table_val_by_key(lport, + port_mapping, NPU_SI_SETTINGS_SYNC_STATUS_KEY) + + # If npu_si_settings_sync_val is None, it can also mean that the key is not present in the table + return npu_si_settings_sync_val is None or npu_si_settings_sync_val == NPU_SI_SETTINGS_DEFAULT_VALUE