From 56c07d2336f295a44ae01ece67f49986b8770dc9 Mon Sep 17 00:00:00 2001 From: Stepan Blyshchak <38952541+stepanblyschak@users.noreply.github.com> Date: Mon, 2 Dec 2024 08:45:43 +0200 Subject: [PATCH] [healthd] fix healthd shutdown race (#19504) #### Why I did it To fix errors that happen when writing to the queue: ``` Jun 5 23:04:41.798613 r-leopard-56 NOTICE healthd: Caught SIGTERM - exiting... Jun 5 23:04:41.798985 r-leopard-56 NOTICE healthd: Caught SIGTERM - exiting... Jun 5 23:04:41.799535 r-leopard-56 NOTICE healthd: Caught SIGTERM - exiting... Jun 5 23:04:41.806010 r-leopard-56 NOTICE healthd: Caught SIGTERM - exiting... Jun 5 23:04:41.814075 r-leopard-56 ERR healthd: system_service[Errno 104] Connection reset by peer Jun 5 23:04:41.824135 r-leopard-56 ERR healthd: Traceback (most recent call last):#012 File "/usr/local/lib/python3.9/dist-packages/health_checker/sysmonitor.py", line 484, in system_service#012 msg = self.myQ.get(timeout=QUEUE_TIMEOUT)#012 File "", line 2, in get#012 File "/usr/lib/python3.9/multiprocessing/managers.py", line 809, in _callmethod#012 kind, result = conn.recv()#012 File "/usr/lib/python3.9/multiprocessing/connection.py", line 255, in recv#012 buf = self._recv_bytes()#012 File "/usr/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes#012 buf = self._recv(4)#012 File "/usr/lib/python3.9/multiprocessing/connection.py", line 384, in _recv#012 chunk = read(handle, remaining)#012ConnectionResetError: [Errno 104] Connection reset by peer Jun 5 23:04:41.826489 r-leopard-56 INFO healthd[8494]: ERROR:dbus.connection:Exception in handler for D-Bus signal: Jun 5 23:04:41.826591 r-leopard-56 INFO healthd[8494]: Traceback (most recent call last): Jun 5 23:04:41.826640 r-leopard-56 INFO healthd[8494]: File "/usr/lib/python3/dist-packages/dbus/connection.py", line 232, in maybe_handle_message Jun 5 23:04:41.826686 r-leopard-56 INFO healthd[8494]: self._handler(*args, **kwargs) Jun 5 23:04:41.826738 r-leopard-56 INFO healthd[8494]: File "/usr/local/lib/python3.9/dist-packages/health_checker/sysmonitor.py", line 82, in on_job_removed Jun 5 23:04:41.826785 r-leopard-56 INFO healthd[8494]: self.task_notify(msg) Jun 5 23:04:41.826831 r-leopard-56 INFO healthd[8494]: File "/usr/local/lib/python3.9/dist-packages/health_checker/sysmonitor.py", line 110, in task_notify Jun 5 23:04:41.826877 r-leopard-56 INFO healthd[8494]: self.task_queue.put(msg) Jun 5 23:04:41.826923 r-leopard-56 INFO healthd[8494]: File "", line 2, in put Jun 5 23:04:41.826973 r-leopard-56 INFO healthd[8494]: File "/usr/lib/python3.9/multiprocessing/managers.py", line 808, in _callmethod Jun 5 23:04:41.827018 r-leopard-56 INFO healthd[8494]: conn.send((self._id, methodname, args, kwds)) Jun 5 23:04:41.827065 r-leopard-56 INFO healthd[8494]: File "/usr/lib/python3.9/multiprocessing/connection.py", line 211, in send Jun 5 23:04:41.827115 r-leopard-56 INFO healthd[8494]: self._send_bytes(_ForkingPickler.dumps(obj)) Jun 5 23:04:41.827158 r-leopard-56 INFO healthd[8494]: File "/usr/lib/python3.9/multiprocessing/connection.py", line 416, in _send_bytes Jun 5 23:04:41.827199 r-leopard-56 INFO healthd[8494]: self._send(header + buf) Jun 5 23:04:41.827254 r-leopard-56 INFO healthd[8494]: File "/usr/lib/python3.9/multiprocessing/connection.py", line 373, in _send Jun 5 23:04:41.827322 r-leopard-56 INFO healthd[8494]: n = write(self._handle, buf) Jun 5 23:04:41.827368 r-leopard-56 INFO healthd[8494]: BrokenPipeError: [Errno 32] Broken pipe Jun 5 23:04:42.800216 r-leopard-56 NOTICE healthd: Caught SIGTERM - exiting... ``` When the multiprocessing.Manager is shutdown the queue will raise the above errors. This happens during shutdown - fast-reboot, warm-reboot. With the fix, system-health service does not hang: ``` root@sonic:/home/admin# sudo systemctl start system-health ; sleep 10; echo "$(date): Stopping..."; sudo systemctl stop system-health; echo "$(date): Stopped" Thu Oct 17 01:07:56 PM IDT 2024: Stopping... Thu Oct 17 01:07:58 PM IDT 2024: Stopped root@sonic:/home/admin# sudo systemctl start system-health ; sleep 10; echo "$(date): Stopping..."; sudo systemctl stop system-health; echo "$(date): Stopped" Thu Oct 17 01:08:13 PM IDT 2024: Stopping... Thu Oct 17 01:08:14 PM IDT 2024: Stopped root@sonic:/home/admin# sudo systemctl start system-health ; sleep 10; echo "$(date): Stopping..."; sudo systemctl stop system-health; echo "$(date): Stopped" Thu Oct 17 01:09:05 PM IDT 2024: Stopping... Thu Oct 17 01:09:06 PM IDT 2024: Stopped ``` ##### Work item tracking - Microsoft ADO **(number only)**: #### How I did it Remove the call to shutdown, the cleanup will happen automatically when GC runs as per documentation - https://docs.python.org/3/library/multiprocessing.html #### How to verify it Run warm-reboot, fast-reboot multiple times and verify no errors in the log. #### Which release branch to backport (provide reason below if selected) - [ ] 201811 - [ ] 201911 - [ ] 202006 - [ ] 202012 - [ ] 202106 - [ ] 202111 - [x] 202205 - [x] 202311 - [x] 202405 #### Tested branch (Please provide the tested image version) - [ ] - [ ] #### Description for the changelog #### Link to config_db schema for YANG module changes #### A picture of a cute animal (not mandatory but encouraged) --- .../health_checker/sysmonitor.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py index c07b58b5489b..1f95006befcf 100755 --- a/src/system-health/health_checker/sysmonitor.py +++ b/src/system-health/health_checker/sysmonitor.py @@ -104,6 +104,11 @@ def task_worker(self): logger.log_info("Start Listening to systemd bus (pid {0})".format(os.getpid())) self.subscribe_sysbus() + def task_stop(self): + # FIXME: Gracefully stop `loop.run()`. + self._task_process.kill() + return True + def task_notify(self, msg): if self.task_stopping_event.is_set(): return @@ -481,9 +486,11 @@ def system_service(self): from queue import Empty # Queue to receive the STATEDB and Systemd state change event - while not self.task_stopping_event.is_set(): + while True: try: msg = self.myQ.get(timeout=QUEUE_TIMEOUT) + if msg == "stop": + break event = msg["unit"] event_src = msg["evt_src"] event_time = msg["time"] @@ -503,15 +510,10 @@ def system_service(self): monitor_statedb_table.task_stop() def task_worker(self): - if self.task_stopping_event.is_set(): - return self.system_service() def task_stop(self): - # Signal the process to stop - self.task_stopping_event.set() - #Clear the resources of mpmgr- Queue - self.mpmgr.shutdown() + self.myQ.put("stop") # Wait for the process to exit self._task_process.join(self._stop_timeout_secs) @@ -519,12 +521,8 @@ def task_stop(self): # If the process didn't exit, attempt to kill it if self._task_process.is_alive(): logger.log_notice("Attempting to kill sysmon main process with pid {}".format(self._task_process.pid)) - os.kill(self._task_process.pid, signal.SIGKILL) - - if self._task_process.is_alive(): - logger.log_error("Sysmon main process with pid {} could not be killed".format(self._task_process.pid)) + self._task_process.kill() + self._task_process.join() return False return True - -