Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix race condition with async kernel management #5875

Merged
merged 1 commit into from
Dec 23, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions notebook/services/kernels/kernelmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,15 +294,19 @@ def shutdown_kernel(self, kernel_id, now=False, restart=False):
kernel._activity_stream.close()
kernel._activity_stream = None
self.stop_buffering(kernel_id)
self._kernel_connections.pop(kernel_id, None)

# Decrease the metric of number of kernels
# running for the relevant kernel type by 1
KERNEL_CURRENTLY_RUNNING_TOTAL.labels(
type=self._kernels[kernel_id].kernel_name
).dec()

return self.pinned_superclass.shutdown_kernel(self, kernel_id, now=now, restart=restart)
self.pinned_superclass.shutdown_kernel(self, kernel_id, now=now, restart=restart)
# Unlike its async sibling method in AsyncMappingKernelManager, removing the kernel_id
# from the connections dictionary isn't as problematic before the shutdown since the
# method is synchronous. However, we'll keep the relative call orders the same from
# a maintenance perspective.
self._kernel_connections.pop(kernel_id, None)

async def restart_kernel(self, kernel_id, now=False):
"""Restart a kernel by kernel_id"""
Expand Down Expand Up @@ -376,8 +380,11 @@ def list_kernels(self):
kernels = []
kernel_ids = self.pinned_superclass.list_kernel_ids(self)
for kernel_id in kernel_ids:
model = self.kernel_model(kernel_id)
kernels.append(model)
try:
model = self.kernel_model(kernel_id)
kernels.append(model)
except (web.HTTPError, KeyError):
pass # Probably due to a (now) non-existent kernel, continue building the list
return kernels

# override _check_kernel_id to raise 404 instead of KeyError
Expand Down Expand Up @@ -498,12 +505,16 @@ async def shutdown_kernel(self, kernel_id, now=False, restart=False):
kernel._activity_stream.close()
kernel._activity_stream = None
self.stop_buffering(kernel_id)
self._kernel_connections.pop(kernel_id, None)

# Decrease the metric of number of kernels
# running for the relevant kernel type by 1
KERNEL_CURRENTLY_RUNNING_TOTAL.labels(
type=self._kernels[kernel_id].kernel_name
).dec()

return await self.pinned_superclass.shutdown_kernel(self, kernel_id, now=now, restart=restart)
await self.pinned_superclass.shutdown_kernel(self, kernel_id, now=now, restart=restart)
# Remove kernel_id from the connections dictionary only after kernel has been shutdown,
# otherwise a race condition can occur since the shutdown may take a while - allowing
# list/fetch kernel operations to access _kernel_connections for a non-existent key
# (kernel_id) while "awaiting" the result of the shutdown.
self._kernel_connections.pop(kernel_id, None)