From 42f64d8f61b83fdef07c6ac24b2d67f820e1e745 Mon Sep 17 00:00:00 2001 From: Stephen Sun <5379172+stephenxs@users.noreply.github.com> Date: Wed, 31 Jul 2019 00:46:43 +0800 Subject: [PATCH] [xcvrd] recovery SFP modules (backport PR#30 to 201811) (#34) * [xcvrd] 1. periodically check whether all SFPs' information is in the database and insert it if not. 2. retry for 5 times when a newly inserted SFP module isn't "present" * [xcvrd] adjust the period in which recovery is called --- sonic-xcvrd/scripts/xcvrd | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index 777719011..7e850f632 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -49,6 +49,8 @@ SELECT_TIMEOUT_MSECS = 1000 DOM_INFO_UPDATE_PERIOD_SECS = 60 TIME_FOR_SFP_READY_SECS = 1 +RETRIES_FOR_SPF_READY = 5 +XCVRD_MAIN_THREAD_SLEEP_MSECS = 60000 SFP_STATUS_INSERTED = '1' SFP_STATUS_REMOVED = '0' @@ -233,7 +235,7 @@ def post_port_sfp_info_to_db(logical_port_name, table): for physical_port in physical_port_list: if not platform_sfputil.get_presence(physical_port): - continue + return SFP_EEPROM_NOT_READY port_name = get_physical_port_name(logical_port_name, ganged_member_num, ganged_port) ganged_member_num += 1 @@ -325,6 +327,15 @@ def del_port_sfp_dom_info_to_db(logical_port_name, int_tbl, dom_tbl): log_error("This functionality is currently not implemented for this platform") sys.exit(3) +# recover missing sfp table entries if any +def recover_missing_sfp_table_entries(sfp_util, int_tbl): + keys = int_tbl.getKeys() + logical_port_list = sfp_util.logical + for logical_port_name in logical_port_list: + if logical_port_name not in keys: + post_port_sfp_info_to_db(logical_port_name, int_tbl) + log_info("Port {} has been recovered".format(logical_port_name)) + # Timer thread wrapper class to update dom info to DB periodically class dom_info_update_task: def __init__(self, table): @@ -416,8 +427,9 @@ def main(): # Start main loop to listen to the SFP change event. log_info("Start main loop") + time_last_recovery_run = time.time() while True: - status, port_dict = platform_sfputil.get_transceiver_change_event() + status, port_dict = platform_sfputil.get_transceiver_change_event(XCVRD_MAIN_THREAD_SLEEP_MSECS) if status: for key, value in port_dict.iteritems(): logical_port_list = platform_sfputil.get_physical_to_logical(int(key)) @@ -426,8 +438,18 @@ def main(): rc = post_port_sfp_info_to_db(logical_port, int_tbl) # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again. if rc == SFP_EEPROM_NOT_READY: - time.sleep(TIME_FOR_SFP_READY_SECS) - post_port_sfp_info_to_db(logical_port, int_tbl) + log_info("Port {} isn't present when got SFP insert event".format(logical_port)) + retry = 0 + while retry <= RETRIES_FOR_SPF_READY: + time.sleep(TIME_FOR_SFP_READY_SECS) + rc = post_port_sfp_info_to_db(logical_port, int_tbl) + if rc == SFP_EEPROM_NOT_READY: + log_info("Port {} isn't present when got SFP insert event, retry {}".format(logical_port, retry)) + retry = retry + 1 + else: + break + else: + log_info("get sfp info successfully {}, push to db".format(logical_port)) post_port_dom_info_to_db(logical_port, dom_tbl) elif value == SFP_STATUS_REMOVED: @@ -440,6 +462,13 @@ def main(): # TODO: next step need to define more error types to handle accordingly. break + time_now = time.time() + time_diff = time_now - time_last_recovery_run + # time.time() returns value in unit of seconds while XCVRD_MAIN_THREAD_SLEEP_MSECS is defined in unit of milliseconds + if time_diff * 1000 > XCVRD_MAIN_THREAD_SLEEP_MSECS: + recover_missing_sfp_table_entries(platform_sfputil, int_tbl) + time_last_recovery_run = time_now + # Stop the dom info update timer dom_info_update.task_stop()