Skip to content

Commit

Permalink
Merge pull request #30 from JoshKarpel/force-stop
Browse files Browse the repository at this point in the history
Add a --force option to jupyter stop
  • Loading branch information
JoshKarpel authored Jul 23, 2020
2 parents de86828 + 0db078e commit 574899d
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 24 deletions.
120 changes: 98 additions & 22 deletions dask_chtc/cli.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
import functools
import getpass
import logging
import os
import re
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from pprint import pformat
from typing import List, Mapping, Optional
from typing import List, Mapping, Optional, Tuple

import classad
import click
import dask
import htcondor
import humanize
import psutil
from click_didyoumean import DYMGroup
from watchdog import events
from watchdog.observers import Observer
Expand Down Expand Up @@ -218,11 +222,24 @@ def start(jupyter_args):


@jupyter.command()
def stop():
@click.option(
"--force",
"-f",
is_flag=True,
default=False,
help="Stop your notebook server without giving it a chance to clean up.",
)
def stop(force):
"""
Stop a Jupyter notebook server that was started via "start".
If the --force option is given, the notebook server will be killed without
giving it time to shut down cleanly. We recommend always trying a normal stop
first, then stopping it again with --force only if it is stuck in the
REMOVED state for more than a few minutes
(use the "status" subcommand to see its current state).
"""
JupyterJobManager().connect().stop()
JupyterJobManager().connect().stop(force=force)


@jupyter.command()
Expand All @@ -238,6 +255,9 @@ def status(raw):
If you have started a Jupyter notebook server in the past and need to
find its address again, use this command.
If you are trying to shut down your notebook server job and it is stuck in
the REMOVED state, try running "dask-chtc jupyter stop --force".
"""
now = datetime.utcnow().replace(tzinfo=timezone.utc)

Expand Down Expand Up @@ -370,7 +390,8 @@ def on_modified(self, event: events.FileSystemEvent):
click.secho(line.rstrip(), fg=self.color, err=True)


MARKER = "IsDaskCHTCJupyterNotebookServer"
MARKER_KEY = "IsDaskCHTCJupyterNotebookServer"
MARKER_VALUE = "true"


class JupyterJobManager:
Expand All @@ -389,7 +410,9 @@ def __init__(self, logs_dir: Optional[Path] = None):
def discover(cls) -> Job:
schedd = htcondor.Schedd()

query = schedd.query(constraint=f"Owner == {classad.quote(getpass.getuser())} && {MARKER}",)
query = schedd.query(
constraint=f"Owner == {classad.quote(getpass.getuser())} && {MARKER_KEY}",
)
if len(query) == 0:
raise click.ClickException(
"Was not able to find a running Jupyter notebook server job!"
Expand Down Expand Up @@ -422,7 +445,9 @@ def contact_address(self) -> str:
contact_addresses.add(match.group(0))

if len(contact_addresses) == 0:
raise Exception("Could not find contact address for Jupyter notebook server from logs")
raise click.ClickException(
"Could not find contact address for Jupyter notebook server from logs; wait a few seconds and try again."
)

# TODO: this choice is extremely arbitrary...
return sorted(contact_addresses)[0]
Expand All @@ -449,9 +474,14 @@ def start(self, jupyter_args: List[str]) -> "JupyterJobManager":
"stream_output": "true",
"stream_error": "true",
"getenv": "true",
"environment": f"{MARKER_KEY}={MARKER_VALUE}",
"transfer_executable": "false",
"transfer_output_files": '""',
f"My.{MARKER}": "true",
# job_max_vacate_time doesn't actually work in local universe,
# but might some day:
# https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=7746
"job_max_vacate_time": "60",
f"My.{MARKER_KEY}": MARKER_VALUE,
}
)

Expand All @@ -471,24 +501,24 @@ def watch_events(self) -> None:

for event in self.events:
text = str(event).rstrip()
if event.type in (htcondor.JobEventType.JOB_HELD, htcondor.JobEventType.JOB_TERMINATED):
click.secho(text, err=True, fg="red")
elif event.type is htcondor.JobEventType.JOB_ABORTED:
click.secho(text, err=True, fg="white")
click.secho(text, err=True, fg=JOB_EVENT_TO_COLOR.get(event.type, "white"))
if event.type in BREAK_ON_JOB_EVENTS:
break
else:
click.secho(text, err=True, fg="white")

def remove_job(self) -> None:
def remove_job(self, force: bool = False) -> None:
try:
schedd = htcondor.Schedd()
schedd.act(
htcondor.JobAction.Remove,
[f"{self.cluster_id}.0"],
"Shut down Jupyter notebook server",
)
if not force:
schedd = htcondor.Schedd()
schedd.act(
htcondor.JobAction.Remove,
[f"{self.cluster_id}.0"],
"Shut down Jupyter notebook server",
)
else:
kill_proc_tree(find_notebook_server_process())
except Exception:
logger.exception(f"Failed to remove Jupyter notebook server job!")
raise

def start_echoing(self) -> None:
if self.observer is not None:
Expand Down Expand Up @@ -527,9 +557,9 @@ def rotate_files(self) -> int:

return stamp

def stop(self) -> None:
def stop(self, force: bool = False) -> None:
self.start_echoing()
self.remove_job()
self.remove_job(force=force)
self.watch_events()
self.stop_echoing()
self.rotate_files()
Expand All @@ -540,3 +570,49 @@ def __enter__(self) -> "JupyterJobManager":

def __exit__(self, exc_type, exc_val, exc_tb):
self.stop()


# Map from job event type to the color used when echoing that event to the
# terminal; events not listed here are shown in the default color ("white").
JOB_EVENT_TO_COLOR = {
    htcondor.JobEventType.JOB_HELD: "red",
    htcondor.JobEventType.JOB_TERMINATED: "red",
    htcondor.JobEventType.JOB_ABORTED: "green",
}
# Job event types that indicate the job has reached a terminal state, so the
# event-watching loop should stop after echoing one of them.
BREAK_ON_JOB_EVENTS = {
    htcondor.JobEventType.JOB_HELD,
    htcondor.JobEventType.JOB_TERMINATED,
    htcondor.JobEventType.JOB_ABORTED,
}


def find_notebook_server_process() -> psutil.Process:
    """
    Return the current user's running Jupyter notebook server process.

    The process is identified by two conditions: it is owned by the current
    user, and its environment contains the marker variable that "start" sets
    when launching the server.

    Raises an exception if no matching process exists.
    """
    me = getpass.getuser()

    for candidate in psutil.process_iter(attrs=["username", "environ"]):
        info = candidate.info
        environment = info["environ"] or {}
        if info["username"] == me and environment.get(MARKER_KEY) == MARKER_VALUE:
            return candidate

    raise Exception("Couldn't find Jupyter notebook server process for current user.")


def kill_proc_tree(
    process: psutil.Process, signal: signal.Signals = signal.SIGKILL, timeout: Optional[int] = None
) -> Tuple[List[psutil.Process], List[psutil.Process]]:
    """
    Send a signal to a process and all of its descendants
    and return a (gone, still_alive) pair of lists after the timeout has expired.

    Adapted from https://psutil.readthedocs.io/en/latest/#kill-process-tree

    Parameters
    ----------
    process
        The root of the process tree to kill.
    signal
        The signal to send to each process in the tree
        (named ``signal`` for backwards compatibility, even though it
        shadows the ``signal`` module inside this function).
    timeout
        How long (in seconds) to wait for the processes to exit;
        ``None`` means wait forever.

    Returns
    -------
    (gone, alive)
        The lists returned by :func:`psutil.wait_procs`: processes that
        exited within the timeout, and those still alive afterwards.
    """
    to_kill = [process] + process.children(recursive=True)
    for p in to_kill:
        try:
            p.send_signal(signal)
        except psutil.NoSuchProcess:
            # The process exited between discovery and signalling; nothing to do.
            pass
    gone, alive = psutil.wait_procs(to_kill, timeout=timeout)
    return gone, alive
20 changes: 18 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ development environment to use Dask in.
`GitHub repository <https://github.com/JoshKarpel/dask-chtc>`_
to keep track of new releases, and upgrading promptly when they occur.

These pages will walk you through the various aspects of using Dask-CHTC:
These pages will get you started with Dask-CHTC:

:doc:`installation`
How to install Python and Dask-CHTC on a CHTC submit node.
Expand All @@ -64,9 +64,18 @@ These pages will walk you through the various aspects of using Dask-CHTC:
showing how to start up a :class:`CHTCCluster`
and use it to perform some calculations.


These pages have information for troubleshooting problems and handling
specific use cases:

:doc:`troubleshooting`
Solutions and advice for tackling specific problems that might arise
while using Dask-CHTC.

:doc:`docker`
Information on how to build Docker images for use with Dask-CHTC.


Detailed information on the Python API
and the associated command line tool
can be found on these pages:
Expand All @@ -93,12 +102,19 @@ can be found on these pages:
jupyter
networking
example

.. toctree::
:maxdepth: 2
:hidden:
:caption: Getting It Working

troubleshooting
docker

.. toctree::
:maxdepth: 2
:hidden:
:caption: Reference
:caption: Getting the Details

api
cli
58 changes: 58 additions & 0 deletions docs/source/troubleshooting.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
.. _troubleshooting:

.. py:currentmodule:: dask_chtc
Troubleshooting
===============


Jupyter
-------

Jupyter notebook server is stuck in the ``REMOVED`` state
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

If something goes wrong during a normal ``dask-chtc jupyter stop``, you may
find that your notebook server will refuse to shut down.
The notebook server status will get stuck in ``REMOVED``, like this:

.. code-block:: console
$ dask-chtc jupyter status
█ REMOVED jupyter lab
├─ Contact Address: http://127.0.0.1:8888/?token=d1717bce73ebc0e54ebeb16eeeef70811ead8eaae23e213c
├─ Python Executable: /home/karpel/miniconda3/bin/python
├─ Working Directory: /home/karpel
├─ Job ID: 8138911.0
├─ Last status change at: 2020-07-19 21:34:02+00:00 UTC (23 minutes ago)
├─ Originally started at: 2020-07-19 18:57:07+00:00 UTC (3 hours ago)
├─ Output: /home/karpel/.dask-chtc/jupyter-logs/current.out
├─ Error: /home/karpel/.dask-chtc/jupyter-logs/current.err
└─ Events: /home/karpel/.dask-chtc/jupyter-logs/current.events
Because you can only run one notebook server at a time, this will prevent you
from launching a new notebook server.
To resolve this issue, you should run ``dask-chtc jupyter stop --force``:

.. code-block:: console
$ dask-chtc jupyter stop --force
000 (16453.000.000) 2020-07-21 11:58:25 Job submitted from host: <10.0.1.43:40415?addrs=10.0.1.43-40415+[2600-6c44-1180-1661-99fa-fc04-10e3-fd8d]-40415&alias=JKARPEL&noUDP&sock=schedd_20423_5f31>
001 (16453.000.000) 2020-07-21 11:58:27 Job executing on host: <10.0.1.43:40415?addrs=10.0.1.43-40415+[2600-6c44-1180-1661-99fa-fc04-10e3-fd8d]-40415&alias=JKARPEL&noUDP&sock=starter_20464_7d39_11>
005 (16453.000.000) 2020-07-21 11:58:30 Job terminated.
(0) Abnormal termination (signal 9)
(0) No core file
Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage
Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage
Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage
Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage
0 - Run Bytes Sent By Job
0 - Run Bytes Received By Job
0 - Total Bytes Sent By Job
0 - Total Bytes Received By Job
Always try stopping your notebook server with a plain ``stop`` command before
trying ``stop --force``;
``--force`` does not give the notebook server a chance
to shut down cleanly, so your Jupyter kernels may be interrupted while in the
middle of an operation.

0 comments on commit 574899d

Please sign in to comment.