Skip to content

Commit

Permalink
Deal with stale daemon PID files during stop attempts (#2795)
Browse files Browse the repository at this point in the history
In some circumstances, the circus PID file, which stores the PID of the
running circus process, can become corrupted or may not be deleted properly,
causing calls to `is_daemon_running` to return a false positive.

This situation can arise if a system is shut down suddenly and so
the process is killed but the PID file is not deleted in time.
Alternatively, another process or the user may have meddled with the
PID file in some way, corrupting it.

Here, we implement a function that checks for stale PID files by
checking whether the PID contained in the PID file matches a valid
running `verdi` process. If it does not, a warning is emitted and the
PID file is deleted. This function is called when `verdi daemon stop` is
called, making it function as a hard reset in a situation of a stale PID.

Co-authored-by: Giovanni Pizzi <[email protected]>
  • Loading branch information
2 people authored and sphuber committed May 30, 2019
1 parent 875f3db commit 04e80d2
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
14 changes: 10 additions & 4 deletions aiida/cmdline/commands/cmd_daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
from aiida.cmdline.commands.cmd_verdi import verdi
from aiida.cmdline.utils import decorators, echo
from aiida.cmdline.utils.common import get_env_with_venv_bin
from aiida.cmdline.utils.daemon import get_daemon_status, print_client_response_status
from aiida.cmdline.utils.daemon import get_daemon_status, \
print_client_response_status, delete_stale_pid_file, _START_CIRCUS_COMMAND
from aiida.manage.configuration import get_config


Expand All @@ -46,9 +47,9 @@ def start(foreground):
echo.echo('Starting the daemon... ', nl=False)

if foreground:
command = ['verdi', '-p', client.profile.name, 'daemon', 'start-circus', '--foreground']
command = ['verdi', '-p', client.profile.name, 'daemon', _START_CIRCUS_COMMAND, '--foreground']
else:
command = ['verdi', '-p', client.profile.name, 'daemon', 'start-circus']
command = ['verdi', '-p', client.profile.name, 'daemon', _START_CIRCUS_COMMAND]

try:
currenv = get_env_with_venv_bin()
Expand Down Expand Up @@ -159,6 +160,8 @@ def stop(no_wait, all_profiles):
echo.echo('Daemon was not running')
continue

delete_stale_pid_file(client)

wait = not no_wait

if wait:
Expand All @@ -169,7 +172,10 @@ def stop(no_wait, all_profiles):
response = client.stop_daemon(wait)

if wait:
print_client_response_status(response)
if response['status'] == client.DAEMON_ERROR_NOT_RUNNING:
click.echo('The daemon was not running.')
else:
print_client_response_status(response)


@verdi_daemon.command()
Expand Down
37 changes: 37 additions & 0 deletions aiida/cmdline/utils/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,15 @@
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

import click
from tabulate import tabulate

from aiida.cmdline.utils import echo
from aiida.cmdline.utils.common import format_local_time

# Name of the `verdi daemon` subcommand used to launch circus. It is also the token looked for in a
# process' command line to decide whether a PID belongs to a running daemon.
_START_CIRCUS_COMMAND = 'start-circus'


def print_client_response_status(response):
"""
Expand Down Expand Up @@ -89,3 +93,36 @@ def get_daemon_status(client):
'Use verdi daemon [incr | decr] [num] to increase / decrease the amount of workers')

return template.format(**info)


def delete_stale_pid_file(client):
    """Delete a potentially stale daemon PID file.

    Checks if the PID contained in the circus PID file (circus-{PROFILE_NAME}.pid) matches a valid running `verdi`
    process. If it does not, the PID file is considered stale and is removed after emitting a warning.

    This situation can arise if a system is shut down suddenly, so that the process is killed but the PID file is not
    deleted in time. When the `get_daemon_pid()` method is then called, an incorrect PID is returned. Alternatively,
    another process or the user may have meddled with the PID file in some way, corrupting it.

    The PID file is treated as stale when any of the following holds:

        * no process with the recorded PID exists (`psutil.NoSuchProcess`)
        * the process cannot be inspected (`psutil.AccessDenied`)
        * the process exists but its command line does not contain the `start-circus` subcommand

    If the client reports no PID at all, nothing is done.

    :param client: the `DaemonClient`
    """
    import os
    import psutil

    pid = client.get_daemon_pid()

    if pid is None:
        return

    try:
        command_line = psutil.Process(pid).cmdline()
    except (psutil.AccessDenied, psutil.NoSuchProcess):
        # The process is gone or cannot be inspected: it cannot be confirmed to be the daemon.
        is_stale = True
    else:
        # The PID may have been recycled by an unrelated process, so check that it really is the circus daemon.
        is_stale = _START_CIRCUS_COMMAND not in command_line

    if not is_stale:
        return

    echo.echo_warning(
        'Deleted apparently stale daemon PID file as its associated process<{}> does not exist anymore'.format(pid))

    if os.path.isfile(client.circus_pid_file):
        os.remove(client.circus_pid_file)

0 comments on commit 04e80d2

Please sign in to comment.