From 643684ff30dd264d8496b83fbf46c26645f59aa4 Mon Sep 17 00:00:00 2001 From: Tim Stumbaugh Date: Wed, 12 Jan 2022 13:35:13 -0700 Subject: [PATCH 1/3] subprocess - Fix a bug where trio isn't notified about pidfd closes If you have 1. Task A blocks in `Process.wait` 2. The child process exits 3. Task B calls `Process.returncode` (3) will close the pidfd, but nothing will ever wake up Task A, since trio wasn't notified. I noticed this when I wrote a `deliver_cancel` function that was slow enough that the child actually exited before the call to `Process.returncode`. In the default `_posix_deliver_cancel`, everything is so fast that the usual ordering is that (3) comes before (2), and there isn't an early call to `_pidfd_close`. This fixes the bug by ensuring we notify trio about closing the pidfd when we do it! --- trio/_subprocess.py | 9 +++++++- trio/tests/test_subprocess.py | 41 ++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/trio/_subprocess.py b/trio/_subprocess.py index 154a940df8..aef7bcbedb 100644 --- a/trio/_subprocess.py +++ b/trio/_subprocess.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING from ._abc import AsyncResource, SendStream, ReceiveStream +from ._core import ClosedResourceError from ._highlevel_generic import StapledStream from ._sync import Lock from ._subprocess_platform import ( @@ -217,6 +218,7 @@ async def aclose(self): def _close_pidfd(self): if self._pidfd is not None: + trio.lowlevel.notify_closing(self._pidfd.fileno()) self._pidfd.close() self._pidfd = None @@ -229,7 +231,12 @@ async def wait(self): async with self._wait_lock: if self.poll() is None: if self._pidfd is not None: - await trio.lowlevel.wait_readable(self._pidfd) + try: + await trio.lowlevel.wait_readable(self._pidfd) + except ClosedResourceError: + # something else (probably a call to poll) already closed the + # pidfd + pass else: await wait_child_exiting(self) # We have to use .wait() here, not .poll(), because on macOS diff --git a/trio/tests/test_subprocess.py b/trio/tests/test_subprocess.py index 45e6c035e4..c2e17cf0da 100644 --- a/trio/tests/test_subprocess.py +++ b/trio/tests/test_subprocess.py @@ -1,27 +1,27 @@ import os +import random import signal import subprocess import sys +from functools import partial from pathlib import Path as SyncPath import pytest -import random -from functools import partial from async_generator import asynccontextmanager from .. import ( + ClosedResourceError, + Process, _core, - move_on_after, fail_after, + move_on_after, + run_process, sleep, sleep_forever, - Process, - run_process, - ClosedResourceError, ) +from .._core.tests.tutil import skip_if_fbsd_pipes_broken, slow from ..lowlevel import open_process -from .._core.tests.tutil import slow, skip_if_fbsd_pipes_broken -from ..testing import wait_all_tasks_blocked +from ..testing import assert_no_checkpoints, wait_all_tasks_blocked posix = os.name == "posix" if posix: @@ -570,3 +570,28 @@ async def test_for_leaking_fds(): with pytest.raises(PermissionError): await run_process(["/dev/fd/0"]) assert set(SyncPath("/dev/fd").iterdir()) == starting_fds + + +async def test_subprocess_pidfd_unnotified(): + noticed_exit = None + + async def wait_and_tell(proc) -> None: + nonlocal noticed_exit + noticed_exit = False + await proc.wait() + noticed_exit = True + + proc = await open_process(SLEEP(9999)) + async with _core.open_nursery() as nursery: + nursery.start_soon(wait_and_tell, proc) + await wait_all_tasks_blocked() + assert noticed_exit is False + proc.terminate() + # without giving trio a chance to do so, + with assert_no_checkpoints(): + # wait until the process has actually exited; + proc._proc.wait() + # force a call to poll (that closes the pidfd on linux) + proc.poll() + await wait_all_tasks_blocked() + assert noticed_exit, "child task wasn't woken after poll, DEADLOCK" From bb02d15fe9128b4f821c35fbc9f5d1d0b712a2f5 Mon Sep 17 00:00:00 2001 From: Tim Stumbaugh Date: Wed, 12 Jan 2022 14:29:07 -0700 Subject: [PATCH 2/3] Add news and comment --- newsfragments/2209.bugfix.rst | 3 +++ trio/tests/test_subprocess.py | 2 ++ 2 files changed, 5 insertions(+) create mode 100644 newsfragments/2209.bugfix.rst diff --git a/newsfragments/2209.bugfix.rst b/newsfragments/2209.bugfix.rst new file mode 100644 index 0000000000..8ea6094f15 --- /dev/null +++ b/newsfragments/2209.bugfix.rst @@ -0,0 +1,3 @@ +Fix a bug that could cause `Process.wait` to hang on Linux systems using pidfds, if +another task were to access `Process.returncode` after the process exited but before +``wait`` woke up diff --git a/trio/tests/test_subprocess.py b/trio/tests/test_subprocess.py index c2e17cf0da..2627521380 100644 --- a/trio/tests/test_subprocess.py +++ b/trio/tests/test_subprocess.py @@ -572,6 +572,8 @@ async def test_for_leaking_fds(): assert set(SyncPath("/dev/fd").iterdir()) == starting_fds +# regression test for #2209 +@pytest.mark.skipif(not posix, reason="Regression test for Linux-only bug") async def test_subprocess_pidfd_unnotified(): noticed_exit = None From 18a850c186e23afa5d9ffdfcf3d4f536bb60ae77 Mon Sep 17 00:00:00 2001 From: Tim Stumbaugh Date: Wed, 12 Jan 2022 14:37:39 -0700 Subject: [PATCH 3/3] Use an event instead --- trio/tests/test_subprocess.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/trio/tests/test_subprocess.py b/trio/tests/test_subprocess.py index 2627521380..65abe13e50 100644 --- a/trio/tests/test_subprocess.py +++ b/trio/tests/test_subprocess.py @@ -11,6 +11,7 @@ from .. import ( ClosedResourceError, + Event, Process, _core, fail_after, @@ -573,21 +574,20 @@ async def test_for_leaking_fds(): # regression test for #2209 -@pytest.mark.skipif(not posix, reason="Regression test for Linux-only bug") async def test_subprocess_pidfd_unnotified(): noticed_exit = None async def wait_and_tell(proc) -> None: nonlocal noticed_exit - noticed_exit = False + noticed_exit = Event() await proc.wait() - noticed_exit = True + noticed_exit.set() proc = await open_process(SLEEP(9999)) async with _core.open_nursery() as nursery: nursery.start_soon(wait_and_tell, proc) await wait_all_tasks_blocked() - assert noticed_exit is False + assert isinstance(noticed_exit, Event) proc.terminate() # without giving trio a chance to do so, with assert_no_checkpoints(): @@ -595,5 +595,8 @@ async def wait_and_tell(proc) -> None: proc._proc.wait() # force a call to poll (that closes the pidfd on linux) proc.poll() - await wait_all_tasks_blocked() - assert noticed_exit, "child task wasn't woken after poll, DEADLOCK" + with move_on_after(5): + # Some platforms use threads to wait for exit, so it might take a bit + # for everything to notice + await noticed_exit.wait() + assert noticed_exit.is_set(), "child task wasn't woken after poll, DEADLOCK"