From d26b6a87ad6cedce80e9b69884464904467106a8 Mon Sep 17 00:00:00 2001 From: keboliu Date: Wed, 31 Jul 2019 11:23:42 +0300 Subject: [PATCH 1/9] make xcvrd wait when system not ready --- sonic-xcvrd/scripts/xcvrd | 88 +++++++++++++++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index 8daeb079c..aa00f2d92 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -43,6 +43,11 @@ XCVRD_MAIN_THREAD_SLEEP_SECS = 60 SFP_STATUS_INSERTED = '1' SFP_STATUS_REMOVED = '0' +EVENT_ON_ALL_SFP = '-1' +SYSTEM_NOT_READY = 'system_not_ready' +SYSTEM_READY = 'system_become_ready' +SYSTEM_FAIL = 'system_fail' + PHYSICAL_PORT_NOT_EXIST = -1 SFP_EEPROM_NOT_READY = -2 @@ -50,6 +55,9 @@ SFPUTIL_LOAD_ERROR = 1 PORT_CONFIG_LOAD_ERROR = 2 NOT_IMPLEMENTED_ERROR = 3 +RETRY_TIMES_FOR_SYSTEM_READY = 24 +RETRY_PERIOD_FOR_SYSTEM_READY_MSECS = 5000 + TEMP_UNIT = 'C' VOLT_UNIT = 'Volts' POWER_UNIT = 'dBm' @@ -613,6 +621,42 @@ class sfp_state_update_task: self.task_process = None self.task_stopping_event = multiprocessing.Event() + def _retry_for_system_ready(self): + retry = 0 + while retry < RETRY_TIMES_FOR_SYSTEM_READY: + #time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_SECS) + time_start = time.time() + rc, event_dict = platform_sfputil.get_transceiver_change_event(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS) + if not rc and bool(event_dict) and EVENT_ON_ALL_SFP in event_dict: + value = event_dict[EVENT_ON_ALL_SFP] + if value == SYSTEM_NOT_READY: + retry = retry + 1 + # get_transceiver_change_event may return immediately, + # we want the retry expired in expected time period, + # So need to calc the time diff, + # if time diff less that the pre-defined waiting time, + # use sleep() to complete the time. + time_now = time.time() + time_diff = time_now - time_start + if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000: + time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff) + + logger.log_info("receive system not ready event, retried for {} times".format(retry)) + continue + elif value == SYSTEM_FAIL: + logger.log_info("receive system fail event, stop retry.") + return False + elif value == SYSTEM_READY: + logger.log_info("receive system ready event, stop retry.") + return True + else: + logger.log_error("receive unknown error event: {}, stop retry.".format(value)) + return False + else: + logger.log_error("error in received event, rc: {}, dict: {}".format(rc, event_dict)) + return False + return False + def task_worker(self, stopping_event): logger.log_info("Start SFP monitoring loop") @@ -631,11 +675,16 @@ class sfp_state_update_task: while not stopping_event.is_set(): status, port_dict = platform_sfputil.get_transceiver_change_event() if status: + # Receive event on single SFP for key, value in port_dict.iteritems(): + if key == EVENT_ON_ALL_SFP: + logger.log_error("Receive event with unexpected content, key: {}, value: {}".format(key, value)) + os.kill(os.getppid(), signal.SIGTERM) + break logical_port_list = platform_sfputil.get_physical_to_logical(int(key)) for logical_port in logical_port_list: if value == SFP_STATUS_INSERTED: - logger.log_info("Got SFP inserted event") + logger.log_info("Receive SFP inserted event") rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again. if rc == SFP_EEPROM_NOT_READY: @@ -647,17 +696,42 @@ class sfp_state_update_task: notify_media_setting(logical_port, transceiver_dict, app_port_tbl) transceiver_dict.clear() elif value == SFP_STATUS_REMOVED: - logger.log_info("Got SFP removed event") + logger.log_info("Receive SFP removed event") del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl) else: # TODO, SFP return error code, need handle accordingly. continue else: - # If get_transceiver_change_event() return error, will clean up the DB and then exit - # TODO: next step need to define more error types to handle accordingly. - logger.log_error("Failed to get transceiver change event. Exiting...") - os.kill(os.getppid(), signal.SIGTERM) - break + # Receive system level error/event + if bool(port_dict): + for key, value in port_dict.iteritems(): + if key == EVENT_ON_ALL_SFP: + if value == SYSTEM_NOT_READY: + # system not ready, wait and retry + if self._retry_for_system_ready(): + continue + else: + logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY)) + os.kill(os.getppid(), signal.SIGTERM) + break + elif value == SYSTEM_FAIL: + logger.log_error("Receive system fail event. Exiting...") + os.kill(os.getppid(), signal.SIGTERM) + break + elif value == SYSTEM_READY: + logger.log_info("Receive system ready event. Continue...") + continue + else: + logger.log_error("Receive unknown event, key: {}, value: {}. Exiting...".format(key, value)) + os.kill(os.getppid(), signal.SIGTERM) + break + else: + logger.log_error("Receive unknown error, key: {}, value: {}. Exiting...".format(key, value)) + else: + # Receive event with error dict, consider as system fail + logger.log_error("Failed to get transceiver change event. Exiting...") + os.kill(os.getppid(), signal.SIGTERM) + break logger.log_info("Stop SFP monitoring loop") From 64f778089e040ad66f8f606a1128dbb3a0b45716 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Fri, 2 Aug 2019 13:45:05 +0300 Subject: [PATCH 2/9] [xcvrd] fix indents and tense error. --- sonic-xcvrd/scripts/xcvrd | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index aa00f2d92..040b9afb1 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -678,13 +678,13 @@ class sfp_state_update_task: # Receive event on single SFP for key, value in port_dict.iteritems(): if key == EVENT_ON_ALL_SFP: - logger.log_error("Receive event with unexpected content, key: {}, value: {}".format(key, value)) + logger.log_error("Got event with unexpected content, key: {}, value: {}".format(key, value)) os.kill(os.getppid(), signal.SIGTERM) break logical_port_list = platform_sfputil.get_physical_to_logical(int(key)) for logical_port in logical_port_list: if value == SFP_STATUS_INSERTED: - logger.log_info("Receive SFP inserted event") + logger.log_info("Got SFP inserted event") rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again. if rc == SFP_EEPROM_NOT_READY: @@ -696,7 +696,7 @@ class sfp_state_update_task: notify_media_setting(logical_port, transceiver_dict, app_port_tbl) transceiver_dict.clear() elif value == SFP_STATUS_REMOVED: - logger.log_info("Receive SFP removed event") + logger.log_info("Got SFP removed event") del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl) else: # TODO, SFP return error code, need handle accordingly. @@ -708,25 +708,25 @@ class sfp_state_update_task: if key == EVENT_ON_ALL_SFP: if value == SYSTEM_NOT_READY: # system not ready, wait and retry - if self._retry_for_system_ready(): + if self._retry_for_system_ready(): continue - else: + else: logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY)) os.kill(os.getppid(), signal.SIGTERM) break elif value == SYSTEM_FAIL: - logger.log_error("Receive system fail event. Exiting...") + logger.log_error("Got system fail event. Exiting...") os.kill(os.getppid(), signal.SIGTERM) break elif value == SYSTEM_READY: - logger.log_info("Receive system ready event. Continue...") + logger.log_info("Got system ready event. Continue...") continue else: - logger.log_error("Receive unknown event, key: {}, value: {}. Exiting...".format(key, value)) + logger.log_error("Got unknown event, key: {}, value: {}. Exiting...".format(key, value)) os.kill(os.getppid(), signal.SIGTERM) break else: - logger.log_error("Receive unknown error, key: {}, value: {}. Exiting...".format(key, value)) + logger.log_error("Got unknown error, key: {}, value: {}. Exiting...".format(key, value)) else: # Receive event with error dict, consider as system fail logger.log_error("Failed to get transceiver change event. Exiting...") From 8d72abad3ed799ff44544ed3603443117d194652 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Sat, 3 Aug 2019 12:44:21 +0300 Subject: [PATCH 3/9] [xcvrd]update retry-until-ready logic in state-machine approach --- sonic-xcvrd/scripts/xcvrd | 72 +++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 40 deletions(-) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index 040b9afb1..01e282613 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -621,42 +621,6 @@ class sfp_state_update_task: self.task_process = None self.task_stopping_event = multiprocessing.Event() - def _retry_for_system_ready(self): - retry = 0 - while retry < RETRY_TIMES_FOR_SYSTEM_READY: - #time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_SECS) - time_start = time.time() - rc, event_dict = platform_sfputil.get_transceiver_change_event(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS) - if not rc and bool(event_dict) and EVENT_ON_ALL_SFP in event_dict: - value = event_dict[EVENT_ON_ALL_SFP] - if value == SYSTEM_NOT_READY: - retry = retry + 1 - # get_transceiver_change_event may return immediately, - # we want the retry expired in expected time period, - # So need to calc the time diff, - # if time diff less that the pre-defined waiting time, - # use sleep() to complete the time. - time_now = time.time() - time_diff = time_now - time_start - if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000: - time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff) - - logger.log_info("receive system not ready event, retried for {} times".format(retry)) - continue - elif value == SYSTEM_FAIL: - logger.log_info("receive system fail event, stop retry.") - return False - elif value == SYSTEM_READY: - logger.log_info("receive system ready event, stop retry.") - return True - else: - logger.log_error("receive unknown error event: {}, stop retry.".format(value)) - return False - else: - logger.log_error("error in received event, rc: {}, dict: {}".format(rc, event_dict)) - return False - return False - def task_worker(self, stopping_event): logger.log_info("Start SFP monitoring loop") @@ -672,8 +636,20 @@ class sfp_state_update_task: swsscommon.APP_PORT_TABLE_NAME) # Start loop to listen to the sfp change event + # The state migrating sequence: + # 1. When the system starts, it is in "SYSTEM_NOT_READY" state, calling get_transceiver_change_event + # with RETRY_PERIOD_FOR_SYSTEM_READY_MSECS as timeout for as many as RETRY_TIMES_FOR_SYSTEM_READY + # times + # 2. Once 'system_become_ready' returned, the system enters "SYSTEM_READY" state and starts to monitor + # the insertion/removal event of all the SFP modules. + # In this state, receiving any system level event will be treated as an unrecoverable error and cause + # the daemon exit + retry = 0 + timeout = RETRY_PERIOD_FOR_SYSTEM_READY_MSECS + state = SYSTEM_NOT_READY while not stopping_event.is_set(): - status, port_dict = platform_sfputil.get_transceiver_change_event() + time_start = time.time() + status, port_dict = platform_sfputil.get_transceiver_change_event(timeout) if status: # Receive event on single SFP for key, value in port_dict.iteritems(): @@ -707,18 +683,34 @@ class sfp_state_update_task: for key, value in port_dict.iteritems(): if key == EVENT_ON_ALL_SFP: if value == SYSTEM_NOT_READY: + # receiving this event under SYSTEM_READY state should treated as an error + if state != SYSTEM_NOT_READY: + logger.log_error("Got system become ready event in {} state. Exiting...".format(state)) + os.kill(os.getppid(), signal.SIGTERM) + break # system not ready, wait and retry - if self._retry_for_system_ready(): - continue - else: + if retry >= RETRY_TIMES_FOR_SYSTEM_READY: logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY)) os.kill(os.getppid(), signal.SIGTERM) break + retry = retry + 1 + + # get_transceiver_change_event may return immediately, + # we want the retry expired in expected time period, + # So need to calc the time diff, + # if time diff less that the pre-defined waiting time, + # use sleep() to complete the time. + time_now = time.time() + time_diff = time_now - time_start + if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000: + time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff) elif value == SYSTEM_FAIL: logger.log_error("Got system fail event. Exiting...") os.kill(os.getppid(), signal.SIGTERM) break elif value == SYSTEM_READY: + timeout = 0 + state = SYSTEM_READY logger.log_info("Got system ready event. Continue...") continue else: From 4b8bddd4e6aa275eba75452f76b93c0aa74fd02d Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Tue, 6 Aug 2019 08:50:22 +0300 Subject: [PATCH 4/9] [xcvrd] receiving a normal event when "SYSTEM_NOT_READY", transmit the state to "SYSTEM_READY" --- sonic-xcvrd/scripts/xcvrd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index 01e282613..7cd3d8d7d 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -652,6 +652,10 @@ class sfp_state_update_task: status, port_dict = platform_sfputil.get_transceiver_change_event(timeout) if status: # Receive event on single SFP + # 1. Transit to SYSTEM_READY state, since some vendors don't implement system level events + if state == SYSTEM_NOT_READY: + state = SYSTEM_READY + # 2. Handle the event for key, value in port_dict.iteritems(): if key == EVENT_ON_ALL_SFP: logger.log_error("Got event with unexpected content, key: {}, value: {}".format(key, value)) From aac76715f0e66aff0afd48ff8c0b2b5b8393abe5 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Tue, 6 Aug 2019 13:04:29 +0300 Subject: [PATCH 5/9] [xcvrd]set timeout = 0 when enters SYSTEM_READY state. --- sonic-xcvrd/scripts/xcvrd | 1 + 1 file changed, 1 insertion(+) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index 7cd3d8d7d..976e55118 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -654,6 +654,7 @@ class sfp_state_update_task: # Receive event on single SFP # 1. Transit to SYSTEM_READY state, since some vendors don't implement system level events if state == SYSTEM_NOT_READY: + timeout = 0 state = SYSTEM_READY # 2. Handle the event for key, value in port_dict.iteritems(): From 522311cd15cb2ab0efb6a08fda253e19be8cd986 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Thu, 8 Aug 2019 11:54:11 +0300 Subject: [PATCH 6/9] [xcvrd] update the logic of sfp_state_update_task in a state machine approach the definition of state machine is in front of sfp_state_update_task.task_worker --- sonic-xcvrd/scripts/xcvrd | 252 ++++++++++++++++++++++++++------------ 1 file changed, 173 insertions(+), 79 deletions(-) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index 976e55118..5ca8ff78d 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -44,9 +44,15 @@ SFP_STATUS_INSERTED = '1' SFP_STATUS_REMOVED = '0' EVENT_ON_ALL_SFP = '-1' +# events definition SYSTEM_NOT_READY = 'system_not_ready' -SYSTEM_READY = 'system_become_ready' +SYSTEM_BECOME_READY = 'system_become_ready' SYSTEM_FAIL = 'system_fail' +NORMAL_EVENT = 'normal' +# states definition +STATE_INIT = 'INIT STATE' +STATE_NORMAL = 'NORMAL STATE' +STATE_EXIT = 'EXIT STATE' PHYSICAL_PORT_NOT_EXIST = -1 SFP_EEPROM_NOT_READY = -2 @@ -621,6 +627,30 @@ class sfp_state_update_task: self.task_process = None self.task_stopping_event = multiprocessing.Event() + def _mapping_event_from_change_event(self, status, port_dict): + """ + mapping from what get_transceiver_change_event returns to event defined in the state machine + the logic is pretty straightforword + """ + if status: + if bool(port_dict): + event = NORMAL_EVENT + else: + event = SYSTEM_BECOME_READY + # here, a simple timeout event whose port_dict is empty is mapped + # into a SYSTEM_BECOME_READY event so that it can be handled + port_dict[EVENT_ON_ALL_SFP] = SYSTEM_BECOME_READY + else: + if EVENT_ON_ALL_SFP in port_dict.keys(): + event = port_dict[EVENT_ON_ALL_SFP] + else: + # this should not happen. just for protection + event = SYSTEM_FAIL + port_dict[EVENT_ON_ALL_SFP] = SYSTEM_FAIL + + logger.log_debug("mapping from {} {} to {}".format(status, port_dict, event)) + return event + def task_worker(self, stopping_event): logger.log_info("Start SFP monitoring loop") @@ -637,98 +667,162 @@ class sfp_state_update_task: # Start loop to listen to the sfp change event # The state migrating sequence: - # 1. When the system starts, it is in "SYSTEM_NOT_READY" state, calling get_transceiver_change_event + # 1. When the system starts, it is in "INIT" state, calling get_transceiver_change_event # with RETRY_PERIOD_FOR_SYSTEM_READY_MSECS as timeout for as many as RETRY_TIMES_FOR_SYSTEM_READY # times # 2. Once 'system_become_ready' returned, the system enters "SYSTEM_READY" state and starts to monitor # the insertion/removal event of all the SFP modules. # In this state, receiving any system level event will be treated as an unrecoverable error and cause # the daemon exit + + # states definition + # - Initial state: INIT, before received system ready or a normal event + # - Final state: EXIT + # - other state: NORMAL, after has received system-ready or a normal event + + # events definition + # - SYSTEM_NOT_READY + # - SYSTEM_BECOME_READY + # - + # - NORMAL_EVENT + # - sfp insertion/removal + # - timeout returned by sfputil.get_change_event with status = true + # - SYSTEM_FAIL + + # State transmit: + # 1. SYSTEM_NOT_READY + # - INIT + # - retry < RETRY_TIMES_FOR_SYSTEM_READY + # retry ++ + # - else + # max retry reached, treat as fatal, exit + # - NORMAL + # Treat as a fatal error, exit + # 2. SYSTEM_BECOME_READY + # - INIT + # transmit to NORMAL + # - NORMAL + # log the event + # nop + # 3. NORMAL_EVENT + # - INIT (for the vendors who don't implement SYSTEM_BECOME_READY) + # transmit to NORMAL + # handle the event normally + # - NORMAL + # handle the event normally + # 4. SYSTEM_FAIL + # treat as a fatal error + + # State event next state + # INIT SYSTEM NOT READY INIT / EXIT + # INIT SYSTEM BECOME READY NORMAL + # NORMAL SYSTEM BECOME READY NORMAL + # INIT/NORMAL SYSTEM FAIL EXIT + # INIT/NORMAL NORMAL EVENT NORMAL + # NORMAL SYSTEM NOT READY EXIT + # EXIT - + retry = 0 timeout = RETRY_PERIOD_FOR_SYSTEM_READY_MSECS - state = SYSTEM_NOT_READY + state = STATE_INIT + unexpected_state = False while not stopping_event.is_set(): + next_state = None time_start = time.time() status, port_dict = platform_sfputil.get_transceiver_change_event(timeout) - if status: - # Receive event on single SFP - # 1. Transit to SYSTEM_READY state, since some vendors don't implement system level events - if state == SYSTEM_NOT_READY: - timeout = 0 - state = SYSTEM_READY - # 2. Handle the event - for key, value in port_dict.iteritems(): - if key == EVENT_ON_ALL_SFP: - logger.log_error("Got event with unexpected content, key: {}, value: {}".format(key, value)) - os.kill(os.getppid(), signal.SIGTERM) + logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state)) + event = self._mapping_event_from_change_event(status, port_dict) + for key, value in port_dict.iteritems(): + if event == SYSTEM_NOT_READY: + if state == STATE_INIT: + # system not ready, wait and retry + if retry >= RETRY_TIMES_FOR_SYSTEM_READY: + logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY)) + next_state = STATE_EXIT + break + retry = retry + 1 + + # get_transceiver_change_event may return immediately, + # we want the retry expired in expected time period, + # So need to calc the time diff, + # if time diff less that the pre-defined waiting time, + # use sleep() to complete the time. + time_now = time.time() + time_diff = time_now - time_start + if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000: + time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff) + elif state == STATE_NORMAL: + logger.log_error("Got system_not_ready in normal state, treat as fatal. Exiting...") + next_state = STATE_EXIT break - logical_port_list = platform_sfputil.get_physical_to_logical(int(key)) - for logical_port in logical_port_list: - if value == SFP_STATUS_INSERTED: - logger.log_info("Got SFP inserted event") - rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) - # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again. - if rc == SFP_EEPROM_NOT_READY: - logger.log_warning("SFP EEPROM is not ready. One more try...") - time.sleep(TIME_FOR_SFP_READY_SECS) - post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) - post_port_dom_info_to_db(logical_port, dom_tbl) - post_port_dom_threshold_info_to_db(logical_port, dom_tbl) - notify_media_setting(logical_port, transceiver_dict, app_port_tbl) - transceiver_dict.clear() - elif value == SFP_STATUS_REMOVED: - logger.log_info("Got SFP removed event") - del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl) - else: - # TODO, SFP return error code, need handle accordingly. - continue - else: - # Receive system level error/event - if bool(port_dict): - for key, value in port_dict.iteritems(): - if key == EVENT_ON_ALL_SFP: - if value == SYSTEM_NOT_READY: - # receiving this event under SYSTEM_READY state should treated as an error - if state != SYSTEM_NOT_READY: - logger.log_error("Got system become ready event in {} state. Exiting...".format(state)) - os.kill(os.getppid(), signal.SIGTERM) - break - # system not ready, wait and retry - if retry >= RETRY_TIMES_FOR_SYSTEM_READY: - logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY)) - os.kill(os.getppid(), signal.SIGTERM) - break - retry = retry + 1 - - # get_transceiver_change_event may return immediately, - # we want the retry expired in expected time period, - # So need to calc the time diff, - # if time diff less that the pre-defined waiting time, - # use sleep() to complete the time. - time_now = time.time() - time_diff = time_now - time_start - if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000: - time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff) - elif value == SYSTEM_FAIL: - logger.log_error("Got system fail event. Exiting...") - os.kill(os.getppid(), signal.SIGTERM) - break - elif value == SYSTEM_READY: - timeout = 0 - state = SYSTEM_READY - logger.log_info("Got system ready event. Continue...") - continue + else: + unexpected_state = True + next_state = STATE_EXIT + break + elif event == SYSTEM_BECOME_READY: + if state == STATE_INIT: + next_state = STATE_NORMAL + logger.log_info("Got system_become_ready in init state, transmit to normal state") + elif state == STATE_NORMAL: + logger.log_info("Got system_become_ready in normal state, ignored") + else: + unexpected_state = True + next_state = STATE_EXIT + break + elif event == NORMAL_EVENT: + if state == STATE_NORMAL or state == STATE_INIT: + if state == STATE_INIT: + next_state = STATE_NORMAL + # this is the originally logic that handled the transceiver change event + # this can be reached in two cases: + # 1. the state has been normal before got the event + # 2. the state was init and is transmitted to normal after got the event. + # this is for the vendors who don't implement "system_not_ready/system_becom_ready" logic + logical_port_list = platform_sfputil.get_physical_to_logical(int(key)) + for logical_port in logical_port_list: + if value == SFP_STATUS_INSERTED: + logger.log_info("Got SFP inserted event") + rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) + # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again. + if rc == SFP_EEPROM_NOT_READY: + logger.log_warning("SFP EEPROM is not ready. One more try...") + time.sleep(TIME_FOR_SFP_READY_SECS) + post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict) + post_port_dom_info_to_db(logical_port, dom_tbl) + post_port_dom_threshold_info_to_db(logical_port, dom_tbl) + notify_media_setting(logical_port, transceiver_dict, app_port_tbl) + transceiver_dict.clear() + elif value == SFP_STATUS_REMOVED: + logger.log_info("Got SFP removed event") + del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl) else: - logger.log_error("Got unknown event, key: {}, value: {}. Exiting...".format(key, value)) - os.kill(os.getppid(), signal.SIGTERM) - break - else: - logger.log_error("Got unknown error, key: {}, value: {}. Exiting...".format(key, value)) + # TODO, SFP return error code, need handle accordingly. + logger.log_warning("Got unknown event {}, ignored".format(value)) + continue + else: + unexpected_state = True + next_state = STATE_EXIT + break + elif event == SYSTEM_FAIL: + # no matter which state current it is, it's fatal + next_state = STATE_EXIT + logger.log_error("Got system_fail event on state {}, exiting".format(state)) else: - # Receive event with error dict, consider as system fail - logger.log_error("Failed to get transceiver change event. Exiting...") - os.kill(os.getppid(), signal.SIGTERM) - break + logger.log_warning("Got unknown event {} on state {}.".format(event, state)) + + if unexpected_state: + next_state = STATE_EXIT + logger.log_error("Facing unexpected state {}, exiting...".format(state)) + + if next_state is not None: + logger.log_debug("State transmitted from {} to {}".format(state, next_state)) + state = next_state + + if next_state == STATE_EXIT: + os.kill(os.getppid(), signal.SIGTERM) + break + elif next_state == STATE_NORMAL: + timeout = 0 logger.log_info("Stop SFP monitoring loop") From 57904997694ac49d1bb9207241e5508e5f3202b3 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Fri, 9 Aug 2019 10:05:08 +0300 Subject: [PATCH 7/9] [xcvrd]address review comments --- sonic-xcvrd/scripts/xcvrd | 90 +++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 51 deletions(-) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index f222b8c57..8123951e3 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -50,9 +50,9 @@ SYSTEM_BECOME_READY = 'system_become_ready' SYSTEM_FAIL = 'system_fail' NORMAL_EVENT = 'normal' # states definition -STATE_INIT = 'INIT STATE' -STATE_NORMAL = 'NORMAL STATE' -STATE_EXIT = 'EXIT STATE' +STATE_INIT = 0 +STATE_NORMAL = 1 +STATE_EXIT = 2 PHYSICAL_PORT_NOT_EXIST = -1 SFP_EEPROM_NOT_READY = -2 @@ -769,21 +769,19 @@ class sfp_state_update_task: retry = 0 timeout = RETRY_PERIOD_FOR_SYSTEM_READY_MSECS state = STATE_INIT - unexpected_state = False while not stopping_event.is_set(): - next_state = None + next_state = state time_start = time.time() status, port_dict = platform_sfputil.get_transceiver_change_event(timeout) logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state)) event = self._mapping_event_from_change_event(status, port_dict) - for key, value in port_dict.iteritems(): - if event == SYSTEM_NOT_READY: - if state == STATE_INIT: - # system not ready, wait and retry - if retry >= RETRY_TIMES_FOR_SYSTEM_READY: - logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY)) - next_state = STATE_EXIT - break + if event == SYSTEM_NOT_READY: + if state == STATE_INIT: + # system not ready, wait and retry + if retry >= RETRY_TIMES_FOR_SYSTEM_READY: + logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY)) + next_state = STATE_EXIT + else: retry = retry + 1 # get_transceiver_change_event may return immediately, @@ -795,33 +793,29 @@ class sfp_state_update_task: time_diff = time_now - time_start if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000: time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff) - elif state == STATE_NORMAL: - logger.log_error("Got system_not_ready in normal state, treat as fatal. Exiting...") - next_state = STATE_EXIT - break - else: - unexpected_state = True - next_state = STATE_EXIT - break - elif event == SYSTEM_BECOME_READY: + elif state == STATE_NORMAL: + logger.log_error("Got system_not_ready in normal state, treat as fatal. Exiting...") + next_state = STATE_EXIT + else: + next_state = STATE_EXIT + elif event == SYSTEM_BECOME_READY: + if state == STATE_INIT: + next_state = STATE_NORMAL + logger.log_info("Got system_become_ready in init state, transmit to normal state") + elif state == STATE_NORMAL: + logger.log_info("Got system_become_ready in normal state, ignored") + else: + next_state = STATE_EXIT + elif event == NORMAL_EVENT: + if state == STATE_NORMAL or state == STATE_INIT: if state == STATE_INIT: next_state = STATE_NORMAL - logger.log_info("Got system_become_ready in init state, transmit to normal state") - elif state == STATE_NORMAL: - logger.log_info("Got system_become_ready in normal state, ignored") - else: - unexpected_state = True - next_state = STATE_EXIT - break - elif event == NORMAL_EVENT: - if state == STATE_NORMAL or state == STATE_INIT: - if state == STATE_INIT: - next_state = STATE_NORMAL - # this is the originally logic that handled the transceiver change event - # this can be reached in two cases: - # 1. the state has been normal before got the event - # 2. the state was init and is transmitted to normal after got the event. - # this is for the vendors who don't implement "system_not_ready/system_becom_ready" logic + # this is the originally logic that handled the transceiver change event + # this can be reached in two cases: + # 1. the state has been normal before got the event + # 2. the state was init and is transmitted to normal after got the event. + # this is for the vendors who don't implement "system_not_ready/system_becom_ready" logic + for key, value in port_dict.iteritems(): logical_port_list = platform_sfputil.get_physical_to_logical(int(key)) for logical_port in logical_port_list: if value == SFP_STATUS_INSERTED: @@ -843,22 +837,16 @@ class sfp_state_update_task: # TODO, SFP return error code, need handle accordingly. logger.log_warning("Got unknown event {}, ignored".format(value)) continue - else: - unexpected_state = True - next_state = STATE_EXIT - break - elif event == SYSTEM_FAIL: - # no matter which state current it is, it's fatal - next_state = STATE_EXIT - logger.log_error("Got system_fail event on state {}, exiting".format(state)) else: - logger.log_warning("Got unknown event {} on state {}.".format(event, state)) - - if unexpected_state: + next_state = STATE_EXIT + elif event == SYSTEM_FAIL: + # no matter which state current it is, it's fatal next_state = STATE_EXIT - logger.log_error("Facing unexpected state {}, exiting...".format(state)) + logger.log_error("Got system_fail event on state {}, exiting".format(state)) + else: + logger.log_warning("Got unknown event {} on state {}.".format(event, state)) - if next_state is not None: + if not next_state == state: logger.log_debug("State transmitted from {} to {}".format(state, next_state)) state = next_state From 9071ea1a98ae2f6424fefc1429a577e9db9a0449 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Fri, 9 Aug 2019 12:19:29 +0300 Subject: [PATCH 8/9] [xcvrd] fix merging confilcts --- sonic-xcvrd/scripts/xcvrd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index 8123951e3..1da25e8da 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -144,15 +144,15 @@ def _wrapper_get_transceiver_dom_threshold_info(physical_port): return platform_sfputil.get_transceiver_dom_threshold_info_dict(physical_port) -def _wrapper_get_transceiver_change_event(): +def _wrapper_get_transceiver_change_event(timeout): if platform_chassis is not None: try: - status, events = platform_chassis.get_change_event() + status, events = platform_chassis.get_change_event(timeout) sfp_events = events['sfp'] return status, sfp_events except NotImplementedError: pass - return platform_sfputil.get_transceiver_change_event() + return platform_sfputil.get_transceiver_change_event(timeout) # Remove unnecessary unit from the raw data def beautify_dom_info_dict(dom_info_dict): @@ -772,7 +772,7 @@ class sfp_state_update_task: while not stopping_event.is_set(): next_state = state time_start = time.time() - status, port_dict = platform_sfputil.get_transceiver_change_event(timeout) + status, port_dict = _wrapper_get_transceiver_change_event(timeout) logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state)) event = self._mapping_event_from_change_event(status, port_dict) if event == SYSTEM_NOT_READY: From 346f420c78a62f4db4db4cbbb89ade8c108c9513 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Sat, 10 Aug 2019 03:55:51 +0300 Subject: [PATCH 9/9] [xcvrd] improve readability. --- sonic-xcvrd/scripts/xcvrd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sonic-xcvrd/scripts/xcvrd b/sonic-xcvrd/scripts/xcvrd index 1da25e8da..0fd8a7e03 100644 --- a/sonic-xcvrd/scripts/xcvrd +++ b/sonic-xcvrd/scripts/xcvrd @@ -846,7 +846,7 @@ class sfp_state_update_task: else: logger.log_warning("Got unknown event {} on state {}.".format(event, state)) - if not next_state == state: + if next_state != state: logger.log_debug("State transmitted from {} to {}".format(state, next_state)) state = next_state