Fix a potential deadlock in await_asynchronously with nested locks #503

Merged 6 commits on Aug 29, 2022
17 changes: 16 additions & 1 deletion hivemind/utils/asyncio.py
@@ -1,5 +1,7 @@
import asyncio
import concurrent.futures
import multiprocessing as mp
import os
from concurrent.futures import ThreadPoolExecutor
from contextlib import AbstractAsyncContextManager, AbstractContextManager, asynccontextmanager
from typing import AsyncIterable, AsyncIterator, Awaitable, Callable, Iterable, Optional, Tuple, TypeVar, Union
@@ -167,12 +169,25 @@ async def attach_event_on_finished(iterable: AsyncIterable[T], event: asyncio.Ev
class _AsyncContextWrapper(AbstractAsyncContextManager):
"""Wrapper for a non-async context manager that allows entering and exiting it in EventLoop-friendly manner"""

EXECUTOR_PID = None
CONTEXT_EXECUTOR = None
EXECUTOR_LOCK = mp.Lock()

def __init__(self, context: AbstractContextManager):
self._context = context

@classmethod
def get_process_wide_executor(cls):
if os.getpid() != cls.EXECUTOR_PID:
with cls.EXECUTOR_LOCK:
if os.getpid() != cls.EXECUTOR_PID:
cls.CONTEXT_EXECUTOR = ThreadPoolExecutor(max_workers=float("inf"))
cls.EXECUTOR_PID = os.getpid()
return cls.CONTEXT_EXECUTOR

async def __aenter__(self):
loop = asyncio.get_event_loop()
return await loop.run_in_executor(None, self._context.__enter__)
return await loop.run_in_executor(self.get_process_wide_executor(), self._context.__enter__)

async def __aexit__(self, exc_type, exc_value, traceback):
return self._context.__exit__(exc_type, exc_value, traceback)
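A brief usage note, not part of the diff: with this change, _AsyncContextWrapper runs the blocking __enter__ call in a lazily created, process-wide ThreadPoolExecutor with an unbounded worker count, guarded by double-checked locking and re-created after a fork (the PID check), instead of the event loop's default executor whose worker count is limited. A minimal sketch of how the wrapper is reached in practice, assuming enter_asynchronously is exported from hivemind.utils.asyncio as the test below suggests (the asyncio.run driver is illustrative):

import asyncio
import multiprocessing as mp

from hivemind.utils.asyncio import enter_asynchronously

lock = mp.Lock()


async def critical_section() -> None:
    # the blocking lock.__enter__ now runs in the process-wide unbounded executor,
    # so many concurrent waiters can no longer exhaust a fixed-size thread pool
    async with enter_asynchronously(lock):
        await asyncio.sleep(1e-2)


async def main() -> None:
    await asyncio.gather(*(critical_section() for _ in range(100)))


if __name__ == "__main__":
    asyncio.run(main())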
26 changes: 26 additions & 0 deletions tests/test_util_modules.py
@@ -507,6 +507,32 @@ async def coro2():
# running this without enter_asynchronously would deadlock the event loop


@pytest.mark.asyncio
async def test_async_context_flooding():
"""
    Test for a possible deadlock that occurs when many coroutines await a lock and overwhelm the underlying ThreadPoolExecutor.

    Here's how the test below works: suppose that the thread pool has at most N workers.
    If at least N + 1 coroutines await lock1 concurrently, N of them occupy workers and the rest are waiting for a worker.
    When the first of the N workers acquires lock1, it lets coroutine A inside lock1 and into await sleep(1e-2).
    During that sleep, one of the worker-less coroutines takes over the worker freed by coroutine A.
    Finally, coroutine A finishes sleeping and immediately gets stuck at lock2, because there are no free workers left.
    Thus, every single coroutine is either awaiting an already acquired lock or waiting for a free worker in the executor.

"""
lock1, lock2 = mp.Lock(), mp.Lock()

async def coro():
async with enter_asynchronously(lock1):
await asyncio.sleep(1e-2)
async with enter_asynchronously(lock2):
await asyncio.sleep(1e-2)

num_coros = max(100, mp.cpu_count() * 5 + 1)
# note: if we deprecate py3.7, this can be reduced to max(33, cpu + 5); see https://bugs.python.org/issue35279
await asyncio.wait({coro() for _ in range(num_coros)})


def test_batch_tensor_descriptor_msgpack():
tensor_descr = BatchTensorDescriptor.from_tensor(torch.ones(1, 3, 3, 7))
tensor_descr_roundtrip = MSGPackSerializer.loads(MSGPackSerializer.dumps(tensor_descr))
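A closing note on the test's num_coros choice and the bugs.python.org/issue35279 reference: the flooding only kicks in once the number of concurrent coroutines exceeds the executor's worker limit, and that default changed in Python 3.8. The snippet below is illustrative only (not part of the PR) and assumes the usual CPython ThreadPoolExecutor defaults:

import os

cpu = os.cpu_count() or 1
py37_default = cpu * 5           # Python 3.5-3.7 default: cpu_count() * 5 workers
py38_default = min(32, cpu + 4)  # Python 3.8+ default (bpo-35279): min(32, cpu_count() + 4)

num_coros = max(100, cpu * 5 + 1)  # exceeds both defaults, as used in the test
reduced_bound = max(33, cpu + 5)   # would still exceed the 3.8+ default once py3.7 is dropped
assert num_coros > py37_default and num_coros > py38_default
assert reduced_bound > py38_default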