Skip to content

Commit

Permalink
[xcvrd] recovery SFP modules (backport PR#30 to 201811) (#34)
Browse files Browse the repository at this point in the history
* [xcvrd]
1. Periodically check whether every SFP's information is present in the database, and insert it if it is missing.
2. Retry up to 5 times when a newly inserted SFP module is not yet reported as "present".

* [xcvrd] adjust the period in which recovery is called
  • Loading branch information
stephenxs authored and lguohan committed Jul 30, 2019
1 parent bc23ab0 commit 42f64d8
Showing 1 changed file with 33 additions and 4 deletions.
37 changes: 33 additions & 4 deletions sonic-xcvrd/scripts/xcvrd
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ SELECT_TIMEOUT_MSECS = 1000

# Period, in seconds, for refreshing DOM info (presumably consumed by the
# dom_info_update_task timer thread -- confirm against its implementation).
DOM_INFO_UPDATE_PERIOD_SECS = 60
# Delay, in seconds, slept before each re-read of a newly inserted SFP whose
# info could not be posted yet.
TIME_FOR_SFP_READY_SECS = 1
# Upper bound used by the insert-event retry loop in main().
# NOTE(review): "SPF" looks like a typo for "SFP"; the name is kept because
# main() references it. The loop compares with `<=` starting from 0, so it can
# run one more time than this value -- confirm whether that is intended.
RETRIES_FOR_SPF_READY = 5
# Timeout, in milliseconds, passed to get_transceiver_change_event() in the
# main loop; also the minimum interval between runs of
# recover_missing_sfp_table_entries().
XCVRD_MAIN_THREAD_SLEEP_MSECS = 60000

SFP_STATUS_INSERTED = '1'
SFP_STATUS_REMOVED = '0'
Expand Down Expand Up @@ -233,7 +235,7 @@ def post_port_sfp_info_to_db(logical_port_name, table):

for physical_port in physical_port_list:
if not platform_sfputil.get_presence(physical_port):
continue
return SFP_EEPROM_NOT_READY

port_name = get_physical_port_name(logical_port_name, ganged_member_num, ganged_port)
ganged_member_num += 1
Expand Down Expand Up @@ -325,6 +327,15 @@ def del_port_sfp_dom_info_to_db(logical_port_name, int_tbl, dom_tbl):
log_error("This functionality is currently not implemented for this platform")
sys.exit(3)

# Recover missing SFP table entries, if any.
def recover_missing_sfp_table_entries(sfp_util, int_tbl):
    """Re-post SFP info for every logical port that has no entry in int_tbl.

    Args:
        sfp_util: platform SFP utility object; its ``logical`` attribute is
            iterated as the collection of logical port names.
        int_tbl: table object whose ``getKeys()`` lists the port names already
            stored; it is also handed to post_port_sfp_info_to_db() as the
            destination table.

    Returns:
        None.
    """
    # Snapshot the keys into a set once so each membership test is O(1)
    # instead of a linear scan per port.
    existing_keys = set(int_tbl.getKeys())

    for logical_port_name in sfp_util.logical:
        if logical_port_name in existing_keys:
            continue

        rc = post_port_sfp_info_to_db(logical_port_name, int_tbl)
        # post_port_sfp_info_to_db() returns SFP_EEPROM_NOT_READY when the
        # module is absent or its EEPROM cannot be read yet. In that case
        # nothing was written, so do not claim the port was recovered. Stay
        # silent rather than log on every periodic pass for an empty cage.
        if rc != SFP_EEPROM_NOT_READY:
            log_info("Port {} has been recovered".format(logical_port_name))

# Timer thread wrapper class to update dom info to DB periodically
class dom_info_update_task:
def __init__(self, table):
Expand Down Expand Up @@ -416,8 +427,9 @@ def main():

# Start main loop to listen to the SFP change event.
log_info("Start main loop")
time_last_recovery_run = time.time()
while True:
status, port_dict = platform_sfputil.get_transceiver_change_event()
status, port_dict = platform_sfputil.get_transceiver_change_event(XCVRD_MAIN_THREAD_SLEEP_MSECS)
if status:
for key, value in port_dict.iteritems():
logical_port_list = platform_sfputil.get_physical_to_logical(int(key))
Expand All @@ -426,8 +438,18 @@ def main():
rc = post_port_sfp_info_to_db(logical_port, int_tbl)
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
if rc == SFP_EEPROM_NOT_READY:
time.sleep(TIME_FOR_SFP_READY_SECS)
post_port_sfp_info_to_db(logical_port, int_tbl)
log_info("Port {} isn't present when got SFP insert event".format(logical_port))
retry = 0
while retry <= RETRIES_FOR_SPF_READY:
time.sleep(TIME_FOR_SFP_READY_SECS)
rc = post_port_sfp_info_to_db(logical_port, int_tbl)
if rc == SFP_EEPROM_NOT_READY:
log_info("Port {} isn't present when got SFP insert event, retry {}".format(logical_port, retry))
retry = retry + 1
else:
break
else:
log_info("get sfp info successfully {}, push to db".format(logical_port))
post_port_dom_info_to_db(logical_port, dom_tbl)

elif value == SFP_STATUS_REMOVED:
Expand All @@ -440,6 +462,13 @@ def main():
# TODO: next step need to define more error types to handle accordingly.
break

time_now = time.time()
time_diff = time_now - time_last_recovery_run
# time.time() returns value in unit of seconds while XCVRD_MAIN_THREAD_SLEEP_MSECS is defined in unit of milliseconds
if time_diff * 1000 > XCVRD_MAIN_THREAD_SLEEP_MSECS:
recover_missing_sfp_table_entries(platform_sfputil, int_tbl)
time_last_recovery_run = time_now

# Stop the dom info update timer
dom_info_update.task_stop()

Expand Down

0 comments on commit 42f64d8

Please sign in to comment.