-
Notifications
You must be signed in to change notification settings - Fork 163
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[xcvrd] Enhance xcvrd to handle new system level event/error #39
Changes from 10 commits
94583f5
15fe0c9
515e1e2
d26b6a8
64f7780
8d72aba
4b8bddd
aac7671
522311c
db8c991
ff98af3
5790499
9071ea1
346f420
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,13 +43,27 @@ XCVRD_MAIN_THREAD_SLEEP_SECS = 60 | |
SFP_STATUS_INSERTED = '1' | ||
SFP_STATUS_REMOVED = '0' | ||
|
||
EVENT_ON_ALL_SFP = '-1' | ||
# events definition | ||
SYSTEM_NOT_READY = 'system_not_ready' | ||
SYSTEM_BECOME_READY = 'system_become_ready' | ||
SYSTEM_FAIL = 'system_fail' | ||
NORMAL_EVENT = 'normal' | ||
# states definition | ||
STATE_INIT = 'INIT STATE' | ||
STATE_NORMAL = 'NORMAL STATE' | ||
STATE_EXIT = 'EXIT STATE' | ||
|
||
PHYSICAL_PORT_NOT_EXIST = -1 | ||
SFP_EEPROM_NOT_READY = -2 | ||
|
||
SFPUTIL_LOAD_ERROR = 1 | ||
PORT_CONFIG_LOAD_ERROR = 2 | ||
NOT_IMPLEMENTED_ERROR = 3 | ||
|
||
RETRY_TIMES_FOR_SYSTEM_READY = 24 | ||
RETRY_PERIOD_FOR_SYSTEM_READY_MSECS = 5000 | ||
|
||
TEMP_UNIT = 'C' | ||
VOLT_UNIT = 'Volts' | ||
POWER_UNIT = 'dBm' | ||
|
@@ -613,6 +627,30 @@ class sfp_state_update_task: | |
self.task_process = None | ||
self.task_stopping_event = multiprocessing.Event() | ||
|
||
def _mapping_event_from_change_event(self, status, port_dict): | ||
""" | ||
mapping from what get_transceiver_change_event returns to event defined in the state machine | ||
the logic is pretty straightforword | ||
""" | ||
if status: | ||
if bool(port_dict): | ||
event = NORMAL_EVENT | ||
else: | ||
event = SYSTEM_BECOME_READY | ||
# here, a simple timeout event whose port_dict is empty is mapped | ||
# into a SYSTEM_BECOME_READY event so that it can be handled | ||
port_dict[EVENT_ON_ALL_SFP] = SYSTEM_BECOME_READY | ||
else: | ||
if EVENT_ON_ALL_SFP in port_dict.keys(): | ||
event = port_dict[EVENT_ON_ALL_SFP] | ||
else: | ||
# this should not happen. just for protection | ||
event = SYSTEM_FAIL | ||
port_dict[EVENT_ON_ALL_SFP] = SYSTEM_FAIL | ||
|
||
logger.log_debug("mapping from {} {} to {}".format(status, port_dict, event)) | ||
return event | ||
|
||
def task_worker(self, stopping_event): | ||
logger.log_info("Start SFP monitoring loop") | ||
|
||
|
@@ -628,36 +666,163 @@ class sfp_state_update_task: | |
swsscommon.APP_PORT_TABLE_NAME) | ||
|
||
# Start loop to listen to the sfp change event | ||
# The state migrating sequence: | ||
# 1. When the system starts, it is in "INIT" state, calling get_transceiver_change_event | ||
# with RETRY_PERIOD_FOR_SYSTEM_READY_MSECS as timeout for as many as RETRY_TIMES_FOR_SYSTEM_READY | ||
# times | ||
# 2. Once 'system_become_ready' returned, the system enters "SYSTEM_READY" state and starts to monitor | ||
# the insertion/removal event of all the SFP modules. | ||
# In this state, receiving any system level event will be treated as an unrecoverable error and cause | ||
# the daemon exit | ||
|
||
# states definition | ||
# - Initial state: INIT, before received system ready or a normal event | ||
# - Final state: EXIT | ||
# - other state: NORMAL, after has received system-ready or a normal event | ||
|
||
# events definition | ||
# - SYSTEM_NOT_READY | ||
# - SYSTEM_BECOME_READY | ||
# - | ||
# - NORMAL_EVENT | ||
# - sfp insertion/removal | ||
# - timeout returned by sfputil.get_change_event with status = true | ||
# - SYSTEM_FAIL | ||
|
||
# State transmit: | ||
# 1. SYSTEM_NOT_READY | ||
# - INIT | ||
# - retry < RETRY_TIMES_FOR_SYSTEM_READY | ||
# retry ++ | ||
# - else | ||
# max retry reached, treat as fatal, exit | ||
# - NORMAL | ||
# Treat as a fatal error, exit | ||
# 2. SYSTEM_BECOME_READY | ||
# - INIT | ||
# transmit to NORMAL | ||
# - NORMAL | ||
# log the event | ||
# nop | ||
# 3. NORMAL_EVENT | ||
# - INIT (for the vendors who don't implement SYSTEM_BECOME_READY) | ||
# transmit to NORMAL | ||
# handle the event normally | ||
# - NORMAL | ||
# handle the event normally | ||
# 4. SYSTEM_FAIL | ||
# treat as a fatal error | ||
|
||
# State event next state | ||
# INIT SYSTEM NOT READY INIT / EXIT | ||
# INIT SYSTEM BECOME READY NORMAL | ||
# NORMAL SYSTEM BECOME READY NORMAL | ||
# INIT/NORMAL SYSTEM FAIL EXIT | ||
# INIT/NORMAL NORMAL EVENT NORMAL | ||
# NORMAL SYSTEM NOT READY EXIT | ||
# EXIT - | ||
|
||
retry = 0 | ||
timeout = RETRY_PERIOD_FOR_SYSTEM_READY_MSECS | ||
state = STATE_INIT | ||
unexpected_state = False | ||
while not stopping_event.is_set(): | ||
status, port_dict = platform_sfputil.get_transceiver_change_event() | ||
if status: | ||
for key, value in port_dict.iteritems(): | ||
logical_port_list = platform_sfputil.get_physical_to_logical(int(key)) | ||
for logical_port in logical_port_list: | ||
if value == SFP_STATUS_INSERTED: | ||
logger.log_info("Got SFP inserted event") | ||
rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) | ||
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again. | ||
if rc == SFP_EEPROM_NOT_READY: | ||
logger.log_warning("SFP EEPROM is not ready. One more try...") | ||
time.sleep(TIME_FOR_SFP_READY_SECS) | ||
post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) | ||
post_port_dom_info_to_db(logical_port, dom_tbl) | ||
post_port_dom_threshold_info_to_db(logical_port, dom_tbl) | ||
notify_media_setting(logical_port, transceiver_dict, app_port_tbl) | ||
transceiver_dict.clear() | ||
elif value == SFP_STATUS_REMOVED: | ||
logger.log_info("Got SFP removed event") | ||
del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl) | ||
else: | ||
# TODO, SFP return error code, need handle accordingly. | ||
continue | ||
else: | ||
# If get_transceiver_change_event() return error, will clean up the DB and then exit | ||
# TODO: next step need to define more error types to handle accordingly. | ||
logger.log_error("Failed to get transceiver change event. Exiting...") | ||
next_state = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
You may initialize it with state, next_state will never be None. #Closed |
||
time_start = time.time() | ||
status, port_dict = platform_sfputil.get_transceiver_change_event(timeout) | ||
logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state)) | ||
event = self._mapping_event_from_change_event(status, port_dict) | ||
for key, value in port_dict.iteritems(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Why you put the for-loop outside the event/state branches? Can you move it inside a smaller branch? #Closed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. You are correct. |
||
if event == SYSTEM_NOT_READY: | ||
if state == STATE_INIT: | ||
# system not ready, wait and retry | ||
if retry >= RETRY_TIMES_FOR_SYSTEM_READY: | ||
logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY)) | ||
next_state = STATE_EXIT | ||
break | ||
retry = retry + 1 | ||
|
||
# get_transceiver_change_event may return immediately, | ||
# we want the retry expired in expected time period, | ||
# So need to calc the time diff, | ||
# if time diff less that the pre-defined waiting time, | ||
# use sleep() to complete the time. | ||
time_now = time.time() | ||
time_diff = time_now - time_start | ||
if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000: | ||
time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff) | ||
elif state == STATE_NORMAL: | ||
logger.log_error("Got system_not_ready in normal state, treat as fatal. Exiting...") | ||
next_state = STATE_EXIT | ||
break | ||
else: | ||
unexpected_state = True | ||
next_state = STATE_EXIT | ||
break | ||
elif event == SYSTEM_BECOME_READY: | ||
if state == STATE_INIT: | ||
next_state = STATE_NORMAL | ||
logger.log_info("Got system_become_ready in init state, transmit to normal state") | ||
elif state == STATE_NORMAL: | ||
logger.log_info("Got system_become_ready in normal state, ignored") | ||
else: | ||
unexpected_state = True | ||
next_state = STATE_EXIT | ||
break | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. repeated code, you can extract function. #Closed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. now, only next_state is set when facing unexpected state. in this sense, no need to extract function. |
||
elif event == NORMAL_EVENT: | ||
if state == STATE_NORMAL or state == STATE_INIT: | ||
if state == STATE_INIT: | ||
next_state = STATE_NORMAL | ||
# this is the originally logic that handled the transceiver change event | ||
# this can be reached in two cases: | ||
# 1. the state has been normal before got the event | ||
# 2. the state was init and is transmitted to normal after got the event. | ||
# this is for the vendors who don't implement "system_not_ready/system_becom_ready" logic | ||
logical_port_list = platform_sfputil.get_physical_to_logical(int(key)) | ||
for logical_port in logical_port_list: | ||
if value == SFP_STATUS_INSERTED: | ||
logger.log_info("Got SFP inserted event") | ||
rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) | ||
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again. | ||
if rc == SFP_EEPROM_NOT_READY: | ||
logger.log_warning("SFP EEPROM is not ready. One more try...") | ||
time.sleep(TIME_FOR_SFP_READY_SECS) | ||
post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) | ||
post_port_dom_info_to_db(logical_port, dom_tbl) | ||
post_port_dom_threshold_info_to_db(logical_port, dom_tbl) | ||
notify_media_setting(logical_port, transceiver_dict, app_port_tbl) | ||
transceiver_dict.clear() | ||
elif value == SFP_STATUS_REMOVED: | ||
logger.log_info("Got SFP removed event") | ||
del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl) | ||
else: | ||
# TODO, SFP return error code, need handle accordingly. | ||
logger.log_warning("Got unknown event {}, ignored".format(value)) | ||
continue | ||
else: | ||
unexpected_state = True | ||
next_state = STATE_EXIT | ||
break | ||
elif event == SYSTEM_FAIL: | ||
# no matter which state current it is, it's fatal | ||
next_state = STATE_EXIT | ||
logger.log_error("Got system_fail event on state {}, exiting".format(state)) | ||
else: | ||
logger.log_warning("Got unknown event {} on state {}.".format(event, state)) | ||
|
||
if unexpected_state: | ||
next_state = STATE_EXIT | ||
logger.log_error("Facing unexpected state {}, exiting...".format(state)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can remove unexpected_state. This is redundant logic. #Closed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is only for log. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I decide to remove this. It is very unlikely to enter an unexpected state since all states are handled for each event. |
||
|
||
if next_state is not None: | ||
logger.log_debug("State transmitted from {} to {}".format(state, next_state)) | ||
state = next_state | ||
|
||
if next_state == STATE_EXIT: | ||
os.kill(os.getppid(), signal.SIGTERM) | ||
break | ||
elif next_state == STATE_NORMAL: | ||
timeout = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can move this part into event handling branches. #Closed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's better to set it here since it can ensure that wherever the state reaches STATE_NORMAL the timeout can be properly set. |
||
|
||
logger.log_info("Stop SFP monitoring loop") | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can use integer as value, don't rely on python to optimize the string comparison. #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think to use string is better since it makes logging information more human-readable.
Even though a dictionary can be adopted to map from integer value to string, the case that the state is not in the key set of the dictionary has to be handled. That's too complicated.
BTW, I'm not sure whether to use string address to compare the string is part of python's language specification or just optimization that depends on python interpreter's implementation?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
states have been updated to integer value.
The events, like SYSTEM_FAIL, are already used by the plugins. In this sence, we'd better not to change them. It seems that to use string has already been a common practice in plugins, like the normal plugin/out events which are represented by '1', '0' respectively.