Skip to content

Commit

Permalink
Merge pull request #30 from JoshKarpel/force-stop
Browse files Browse the repository at this point in the history
Add a --force option to jupyter stop
  • Loading branch information
JoshKarpel authored Jul 23, 2020
2 parents de86828 + 0db078e commit 574899d
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 24 deletions.
120 changes: 98 additions & 22 deletions dask_chtc/cli.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
import functools
import getpass
import logging
import os
import re
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from pprint import pformat
from typing import List, Mapping, Optional
from typing import List, Mapping, Optional, Tuple

import classad
import click
import dask
import htcondor
import humanize
import psutil
from click_didyoumean import DYMGroup
from watchdog import events
from watchdog.observers import Observer
Expand Down Expand Up @@ -218,11 +222,24 @@ def start(jupyter_args):


@jupyter.command()
def stop():
@click.option(
"--force",
"-f",
is_flag=True,
default=False,
help="Stop your notebook server without giving it a chance to clean up.",
)
def stop(force):
"""
Stop a Jupyter notebook server that was started via "start".
If the --force option is given, the notebook server will be killed without
giving it time to shut down cleanly. We recommend always trying a normal stop
first, then stopping it again with --force only if it is stuck in the
REMOVED state for more than a few minutes
(use the "status" subcommand to see its current state).
"""
JupyterJobManager().connect().stop()
JupyterJobManager().connect().stop(force=force)


@jupyter.command()
Expand All @@ -238,6 +255,9 @@ def status(raw):
If you have started a Jupyter notebook server in the past and need to
find its address again, use this command.
If you are trying to shut down your notebook server job and it is stuck in
the REMOVED state, try running "dask-chtc jupyter stop --force".
"""
now = datetime.utcnow().replace(tzinfo=timezone.utc)

Expand Down Expand Up @@ -370,7 +390,8 @@ def on_modified(self, event: events.FileSystemEvent):
click.secho(line.rstrip(), fg=self.color, err=True)


MARKER = "IsDaskCHTCJupyterNotebookServer"
MARKER_KEY = "IsDaskCHTCJupyterNotebookServer"
MARKER_VALUE = "true"


class JupyterJobManager:
Expand All @@ -389,7 +410,9 @@ def __init__(self, logs_dir: Optional[Path] = None):
def discover(cls) -> Job:
schedd = htcondor.Schedd()

query = schedd.query(constraint=f"Owner == {classad.quote(getpass.getuser())} && {MARKER}",)
query = schedd.query(
constraint=f"Owner == {classad.quote(getpass.getuser())} && {MARKER_KEY}",
)
if len(query) == 0:
raise click.ClickException(
"Was not able to find a running Jupyter notebook server job!"
Expand Down Expand Up @@ -422,7 +445,9 @@ def contact_address(self) -> str:
contact_addresses.add(match.group(0))

if len(contact_addresses) == 0:
raise Exception("Could not find contact address for Jupyter notebook server from logs")
raise click.ClickException(
"Could not find contact address for Jupyter notebook server from logs; wait a few seconds and try again."
)

# TODO: this choice is extremely arbitrary...
return sorted(contact_addresses)[0]
Expand All @@ -449,9 +474,14 @@ def start(self, jupyter_args: List[str]) -> "JupyterJobManager":
"stream_output": "true",
"stream_error": "true",
"getenv": "true",
"environment": f"{MARKER_KEY}={MARKER_VALUE}",
"transfer_executable": "false",
"transfer_output_files": '""',
f"My.{MARKER}": "true",
# job_max_vacate_time doesn't actually work in local universe,
# but might some day:
# https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=7746
"job_max_vacate_time": "60",
f"My.{MARKER_KEY}": MARKER_VALUE,
}
)

Expand All @@ -471,24 +501,24 @@ def watch_events(self) -> None:

for event in self.events:
text = str(event).rstrip()
if event.type in (htcondor.JobEventType.JOB_HELD, htcondor.JobEventType.JOB_TERMINATED):
click.secho(text, err=True, fg="red")
elif event.type is htcondor.JobEventType.JOB_ABORTED:
click.secho(text, err=True, fg="white")
click.secho(text, err=True, fg=JOB_EVENT_TO_COLOR.get(event.type, "white"))
if event.type in BREAK_ON_JOB_EVENTS:
break
else:
click.secho(text, err=True, fg="white")

def remove_job(self) -> None:
def remove_job(self, force: bool = False) -> None:
try:
schedd = htcondor.Schedd()
schedd.act(
htcondor.JobAction.Remove,
[f"{self.cluster_id}.0"],
"Shut down Jupyter notebook server",
)
if not force:
schedd = htcondor.Schedd()
schedd.act(
htcondor.JobAction.Remove,
[f"{self.cluster_id}.0"],
"Shut down Jupyter notebook server",
)
else:
kill_proc_tree(find_notebook_server_process())
except Exception:
logger.exception(f"Failed to remove Jupyter notebook server job!")
raise

def start_echoing(self) -> None:
if self.observer is not None:
Expand Down Expand Up @@ -527,9 +557,9 @@ def rotate_files(self) -> int:

return stamp

def stop(self) -> None:
def stop(self, force: bool = False) -> None:
self.start_echoing()
self.remove_job()
self.remove_job(force=force)
self.watch_events()
self.stop_echoing()
self.rotate_files()
Expand All @@ -540,3 +570,49 @@ def __enter__(self) -> "JupyterJobManager":

def __exit__(self, exc_type, exc_val, exc_tb):
self.stop()


# Map from job event type to the color used when echoing that event to the
# terminal; events not listed here are shown in the default color ("white").
JOB_EVENT_TO_COLOR = {
    htcondor.JobEventType.JOB_HELD: "red",
    htcondor.JobEventType.JOB_TERMINATED: "red",
    htcondor.JobEventType.JOB_ABORTED: "green",
}
# Job event types that indicate the job has reached a terminal state, so the
# event-watching loop should stop after echoing one of them.
BREAK_ON_JOB_EVENTS = {
    htcondor.JobEventType.JOB_HELD,
    htcondor.JobEventType.JOB_TERMINATED,
    htcondor.JobEventType.JOB_ABORTED,
}


def find_notebook_server_process() -> psutil.Process:
    """
    Return the current user's running Jupyter notebook server process.

    The process is identified by two conditions: it is owned by the current
    user, and its environment contains the marker variable that "start" sets
    when launching the server.

    Raises an exception if no matching process exists.
    """
    me = getpass.getuser()

    for candidate in psutil.process_iter(attrs=["username", "environ"]):
        info = candidate.info
        environment = info["environ"] or {}
        if info["username"] == me and environment.get(MARKER_KEY) == MARKER_VALUE:
            return candidate

    raise Exception("Couldn't find Jupyter notebook server process for current user.")


def kill_proc_tree(
    process: psutil.Process, signal: signal.Signals = signal.SIGKILL, timeout: Optional[int] = None
) -> Tuple[List[psutil.Process], List[psutil.Process]]:
    """
    Send a signal to a process and all of its descendants
    and return a (gone, still_alive) pair of lists after the timeout has expired.

    Adapted from https://psutil.readthedocs.io/en/latest/#kill-process-tree

    Parameters
    ----------
    process
        The root of the process tree to kill.
    signal
        The signal to send to each process in the tree
        (named ``signal`` for backwards compatibility, even though it
        shadows the ``signal`` module inside this function).
    timeout
        How long (in seconds) to wait for the processes to exit;
        ``None`` means wait forever.

    Returns
    -------
    (gone, alive)
        The lists returned by :func:`psutil.wait_procs`: processes that
        exited within the timeout, and those still alive afterwards.
    """
    to_kill = [process] + process.children(recursive=True)
    for p in to_kill:
        try:
            p.send_signal(signal)
        except psutil.NoSuchProcess:
            # The process exited between discovery and signalling; nothing to do.
            pass
    gone, alive = psutil.wait_procs(to_kill, timeout=timeout)
    return gone, alive
20 changes: 18 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ development environment to use Dask in.
`GitHub repository <https://github.com/JoshKarpel/dask-chtc>`_
to keep track of new releases, and upgrading promptly when they occur.

These pages will walk you through the various aspects of using Dask-CHTC:
These pages will get you started with Dask-CHTC:

:doc:`installation`
How to install Python and Dask-CHTC on a CHTC submit node.
Expand All @@ -64,9 +64,18 @@ These pages will walk you through the various aspects of using Dask-CHTC:
showing how to start up a :class:`CHTCCluster`
and use it to perform some calculations.


These pages have information for troubleshooting problems and handling
specific use cases:

:doc:`troubleshooting`
Solutions and advice for tackling specific problems that might arise
while using Dask-CHTC.

:doc:`docker`
Information on how to build Docker images for use with Dask-CHTC.


Detailed information on the Python API
and the associated command line tool
can be found on these pages:
Expand All @@ -93,12 +102,19 @@ can be found on these pages:
jupyter
networking
example

.. toctree::
:maxdepth: 2
:hidden:
:caption: Getting It Working

troubleshooting
docker

.. toctree::
:maxdepth: 2
:hidden:
:caption: Reference
:caption: Getting the Details

api
cli
58 changes: 58 additions & 0 deletions docs/source/troubleshooting.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
.. _troubleshooting:

.. py:currentmodule:: dask_chtc
Troubleshooting
===============


Jupyter
-------

Jupyter notebook server is stuck in the ``REMOVED`` state
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

If something goes wrong during a normal ``dask-chtc jupyter stop``, you may
find that your notebook server will refuse to shut down.
The notebook server status will get stuck in ``REMOVED``, like this:

.. code-block:: console
$ dask-chtc jupyter status
█ REMOVED jupyter lab
├─ Contact Address: http://127.0.0.1:8888/?token=d1717bce73ebc0e54ebeb16eeeef70811ead8eaae23e213c
├─ Python Executable: /home/karpel/miniconda3/bin/python
├─ Working Directory: /home/karpel
├─ Job ID: 8138911.0
├─ Last status change at: 2020-07-19 21:34:02+00:00 UTC (23 minutes ago)
├─ Originally started at: 2020-07-19 18:57:07+00:00 UTC (3 hours ago)
├─ Output: /home/karpel/.dask-chtc/jupyter-logs/current.out
├─ Error: /home/karpel/.dask-chtc/jupyter-logs/current.err
└─ Events: /home/karpel/.dask-chtc/jupyter-logs/current.events
Because you can only run one notebook server at a time, this will prevent you
from launching a new notebook server.
To resolve this issue, you should run ``dask-chtc jupyter stop --force``:

.. code-block:: console
$ dask-chtc jupyter stop --force
000 (16453.000.000) 2020-07-21 11:58:25 Job submitted from host: <10.0.1.43:40415?addrs=10.0.1.43-40415+[2600-6c44-1180-1661-99fa-fc04-10e3-fd8d]-40415&alias=JKARPEL&noUDP&sock=schedd_20423_5f31>
001 (16453.000.000) 2020-07-21 11:58:27 Job executing on host: <10.0.1.43:40415?addrs=10.0.1.43-40415+[2600-6c44-1180-1661-99fa-fc04-10e3-fd8d]-40415&alias=JKARPEL&noUDP&sock=starter_20464_7d39_11>
005 (16453.000.000) 2020-07-21 11:58:30 Job terminated.
(0) Abnormal termination (signal 9)
(0) No core file
Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage
Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage
Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage
Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage
0 - Run Bytes Sent By Job
0 - Run Bytes Received By Job
0 - Total Bytes Sent By Job
0 - Total Bytes Received By Job
Always try stopping your notebook server with a plain ``stop`` command before
trying ``stop --force``;
``--force`` does not give the notebook server a chance
to shut down cleanly, so your Jupyter kernels may be interrupted while in the
middle of an operation.

0 comments on commit 574899d

Please sign in to comment.