From 9ac12bfb406416b0fb3b03a084d8179bdc53f3a6 Mon Sep 17 00:00:00 2001 From: judyjoseph <53951155+judyjoseph@users.noreply.github.com> Date: Thu, 31 Mar 2022 08:56:14 -0700 Subject: [PATCH] Fix platform daemon chassisd to handle auto restart on fail (#247) Description Add signal handlers correctly and fix clean up so that the auto restart works ok on signals. Motivation and Context Add signal handlers correctly and fix clean up so that the auto restart works ok on signals. How Has This Been Tested? Verified by running again chassis LC. --- sonic-chassisd/scripts/chassisd | 33 ++++++++++------ sonic-chassisd/tests/test_chassisd.py | 55 +++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 12 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index 78ada79e0be4..5fb8644706f5 100644 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -30,6 +30,9 @@ except ImportError as e: # Constants ==================================================================== # +SIGNALS_TO_NAMES_DICT = dict((getattr(signal, n), n) + for n in dir(signal) if n.startswith('SIG') and '_' not in n) + SYSLOG_IDENTIFIER = "chassisd" CHASSIS_CFG_TABLE = 'CHASSIS_MODULE' @@ -75,6 +78,10 @@ INVALID_IP = '0.0.0.0' MODULE_ADMIN_DOWN = 0 MODULE_ADMIN_UP = 1 +# This daemon should return non-zero exit code so that supervisord will +# restart it automatically. +exit_code = 0 + # # Helper functions ============================================================= # @@ -366,18 +373,21 @@ class ChassisdDaemon(daemon_base.DaemonBase): self.stop = threading.Event() - # Signal handler + # Override signal handler from DaemonBase def signal_handler(self, sig, frame): - if sig == signal.SIGHUP: - self.log_info("Caught SIGHUP - ignoring...") - elif sig == signal.SIGINT: - self.log_info("Caught SIGINT - exiting...") - self.stop.set() - elif sig == signal.SIGTERM: - self.log_info("Caught SIGTERM - exiting...") + FATAL_SIGNALS = [signal.SIGINT, signal.SIGTERM] + NONFATAL_SIGNALS = [signal.SIGHUP] + + global exit_code + + if sig in FATAL_SIGNALS: + exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us + self.log_info("Caught {} signal '{}' - exiting...".format(exit_code,SIGNALS_TO_NAMES_DICT[sig])) self.stop.set() + elif sig in NONFATAL_SIGNALS: + self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) else: - self.log_warning("Caught unhandled signal '" + sig + "'") + self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) # Run daemon def run(self): @@ -421,9 +431,6 @@ class ChassisdDaemon(daemon_base.DaemonBase): self.log_info("Stop daemon main loop") - if config_manager is not None: - config_manager.task_stop() - # Delete all the information from DB and then exit self.module_updater.deinit() @@ -435,9 +442,11 @@ class ChassisdDaemon(daemon_base.DaemonBase): def main(): + global exit_code chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER) chassisd.run() + sys.exit(exit_code) if __name__ == '__main__': main() diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index 2950abd81f23..f3f36c3c34b8 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -441,3 +441,58 @@ def verify_fabric_asic(asic_name, asic_pci_address, module_name, asic_id_in_modu assert fvs == None fvs = fabric_asic_table.get("asic5") assert fvs == None + +def test_signal_handler(): + exit_code = 0 + daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER) + daemon_chassisd.stop.set = MagicMock() + daemon_chassisd.log_info = MagicMock() + daemon_chassisd.log_warning = MagicMock() + + # Test SIGHUP + daemon_chassisd.signal_handler(signal.SIGHUP, None) + assert daemon_chassisd.log_info.call_count == 1 + daemon_chassisd.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...") + assert daemon_chassisd.log_warning.call_count == 0 + assert daemon_chassisd.stop.set.call_count == 0 + assert exit_code == 0 + + # Reset + daemon_chassisd.log_info.reset_mock() + daemon_chassisd.log_warning.reset_mock() + daemon_chassisd.stop.set.reset_mock() + + # Test SIGINT + test_signal = signal.SIGINT + daemon_chassisd.signal_handler(test_signal, None) + assert daemon_chassisd.log_info.call_count == 1 + daemon_chassisd.log_info.assert_called_with("Caught {} signal 'SIGINT' - exiting...".format(128 + test_signal)) + assert daemon_chassisd.log_warning.call_count == 0 + assert daemon_chassisd.stop.set.call_count == 1 + + # Reset + daemon_chassisd.log_info.reset_mock() + daemon_chassisd.log_warning.reset_mock() + daemon_chassisd.stop.set.reset_mock() + + # Test SIGTERM + test_signal = signal.SIGTERM + daemon_chassisd.signal_handler(test_signal, None) + assert daemon_chassisd.log_info.call_count == 1 + daemon_chassisd.log_info.assert_called_with("Caught {} signal 'SIGTERM' - exiting...".format(128 + test_signal)) + assert daemon_chassisd.log_warning.call_count == 0 + assert daemon_chassisd.stop.set.call_count == 1 + + # Reset + daemon_chassisd.log_info.reset_mock() + daemon_chassisd.log_warning.reset_mock() + daemon_chassisd.stop.set.reset_mock() + exit_code = 0 + + # Test an unhandled signal + daemon_chassisd.signal_handler(signal.SIGUSR1, None) + assert daemon_chassisd.log_warning.call_count == 1 + daemon_chassisd.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...") + assert daemon_chassisd.log_info.call_count == 0 + assert daemon_chassisd.stop.set.call_count == 0 + assert exit_code == 0