From 0896c1522f0cf2703c65aab9095719cc30372666 Mon Sep 17 00:00:00 2001 From: Josh Karpel Date: Tue, 21 Jul 2020 12:08:20 -0500 Subject: [PATCH 1/4] add a --force option to jupyter stop (resolves #29) A force stop looks for the notebook server process tree and sends it a kill signal. This is a yucky, low-level hack that we will probably need to revisit later. --- dask_chtc/cli.py | 118 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 22 deletions(-) diff --git a/dask_chtc/cli.py b/dask_chtc/cli.py index 80934c6..cbf7c2e 100644 --- a/dask_chtc/cli.py +++ b/dask_chtc/cli.py @@ -1,18 +1,22 @@ +import functools import getpass import logging +import os import re +import signal import sys import time from datetime import datetime, timezone from pathlib import Path from pprint import pformat -from typing import List, Mapping, Optional +from typing import List, Mapping, Optional, Tuple import classad import click import dask import htcondor import humanize +import psutil from click_didyoumean import DYMGroup from watchdog import events from watchdog.observers import Observer @@ -218,11 +222,24 @@ def start(jupyter_args): @jupyter.command() -def stop(): +@click.option( + "--force", + "-f", + is_flag=True, + default=False, + help="Stop your notebook server without giving it a chance to clean up.", +) +def stop(force): """ Stop a Jupyter notebook server that was started via "start". + + If the --force option is given, the notebook server will be killed without + giving it time to shutdown cleanly. We recommend always trying a normal stop + first, then stopping it again with --force only if it is stuck in the + REMOVED state for more than a few minutes + (use the "status" subcommand to see its current state). """ - JupyterJobManager().connect().stop() + JupyterJobManager().connect().stop(force=force) @jupyter.command() @@ -238,6 +255,9 @@ def status(raw): If you have started a Jupyter notebook server in the past and need to find it's address again, use this command. + + If you are trying to shut down your notebook server job and it is stuck in + the REMOVED state, try running "dask-chtc jupyter stop --force". """ now = datetime.utcnow().replace(tzinfo=timezone.utc) @@ -370,7 +390,8 @@ def on_modified(self, event: events.FileSystemEvent): click.secho(line.rstrip(), fg=self.color, err=True) -MARKER = "IsDaskCHTCJupyterNotebookServer" +MARKER_KEY = "IsDaskCHTCJupyterNotebookServer" +MARKER_VALUE = "true" class JupyterJobManager: @@ -389,7 +410,9 @@ def __init__(self, logs_dir: Optional[Path] = None): def discover(cls) -> Job: schedd = htcondor.Schedd() - query = schedd.query(constraint=f"Owner == {classad.quote(getpass.getuser())} && {MARKER}",) + query = schedd.query( + constraint=f"Owner == {classad.quote(getpass.getuser())} && {MARKER_KEY}", + ) if len(query) == 0: raise click.ClickException( "Was not able to find a running Jupyter notebook server job!" @@ -422,7 +445,9 @@ def contact_address(self) -> str: contact_addresses.add(match.group(0)) if len(contact_addresses) == 0: - raise Exception("Could not find contact address for Jupyter notebook server from logs") + raise click.ClickException( + "Could not find contact address for Jupyter notebook server from logs; wait a few seconds and try again." + ) # TODO: this choice is extremely arbitrary... return sorted(contact_addresses)[0] @@ -449,9 +474,14 @@ def start(self, jupyter_args: List[str]) -> "JupyterJobManager": "stream_output": "true", "stream_error": "true", "getenv": "true", + "environment": f"{MARKER_KEY}={MARKER_VALUE}", "transfer_executable": "false", "transfer_output_files": '""', - f"My.{MARKER}": "true", + # job_max_vacate_time doesn't actually work in local universe, + # but might some day: + # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=7746 + "job_max_vacate_time": "60", + f"My.{MARKER_KEY}": MARKER_VALUE, } ) @@ -471,24 +501,25 @@ def watch_events(self) -> None: for event in self.events: text = str(event).rstrip() - if event.type in (htcondor.JobEventType.JOB_HELD, htcondor.JobEventType.JOB_TERMINATED): - click.secho(text, err=True, fg="red") - elif event.type is htcondor.JobEventType.JOB_ABORTED: - click.secho(text, err=True, fg="white") + click.secho(text, err=True, fg=JOB_EVENT_TO_COLOR.get(event.type, "white")) + if event.type in BREAK_ON_JOB_EVENTS: break - else: - click.secho(text, err=True, fg="white") - def remove_job(self) -> None: + def remove_job(self, force: bool = False) -> None: try: - schedd = htcondor.Schedd() - schedd.act( - htcondor.JobAction.Remove, - [f"{self.cluster_id}.0"], - "Shut down Jupyter notebook server", - ) + if not force: + schedd = htcondor.Schedd() + schedd.act( + htcondor.JobAction.Remove, + [f"{self.cluster_id}.0"], + "Shut down Jupyter notebook server", + ) + else: + kill_proc_tree(find_notebook_server_process()) + except Exception: logger.exception(f"Failed to remove Jupyter notebook server job!") + raise def start_echoing(self) -> None: if self.observer is not None: @@ -527,9 +558,9 @@ def rotate_files(self) -> int: return stamp - def stop(self) -> None: + def stop(self, force: bool = False) -> None: self.start_echoing() - self.remove_job() + self.remove_job(force=force) self.watch_events() self.stop_echoing() self.rotate_files() @@ -540,3 +571,46 @@ def __enter__(self) -> "JupyterJobManager": def __exit__(self, exc_type, exc_val, exc_tb): self.stop() + + +JOB_EVENT_TO_COLOR = { + htcondor.JobEventType.JOB_HELD: "red", + htcondor.JobEventType.JOB_TERMINATED: "red", + htcondor.JobEventType.JOB_ABORTED: "green", +} +BREAK_ON_JOB_EVENTS = { + htcondor.JobEventType.JOB_HELD, + htcondor.JobEventType.JOB_TERMINATED, + htcondor.JobEventType.JOB_ABORTED, +} + + +def find_notebook_server_process() -> Optional[psutil.Process]: + """ + Find the current user's running notebook server process by looking for a + marker environment variable (and matching against their username). + """ + me = getpass.getuser() + for proc in psutil.process_iter(attrs=["username", "environ"]): + if ( + proc.info["username"] == me + and (proc.info["environ"] or {}).get(MARKER_KEY) == MARKER_VALUE + ): + return proc + return None + + +def kill_proc_tree( + process: psutil.Process, signal: signal.Signals = signal.SIGKILL, timeout=None +) -> Tuple[Tuple[psutil.Process, ...], Tuple[psutil.Process, ...]]: + """ + Kill a process tree + and return a (gone, still_alive) tuple after the timeout has expired. + + Adapted from https://psutil.readthedocs.io/en/latest/#kill-process-tree + """ + to_kill = [process] + process.children(recursive=True) + for p in to_kill: + p.send_signal(signal) + gone, alive = psutil.wait_procs(to_kill, timeout=timeout) + return gone, alive From dfb6f3915a6c7acf1f57fbc4eb911e16f296d917 Mon Sep 17 00:00:00 2001 From: Josh Karpel Date: Thu, 23 Jul 2020 11:13:56 -0500 Subject: [PATCH 2/4] clean up jupyter stop --force and add troubleshooting.rst --- dask_chtc/cli.py | 16 +++++---- docs/source/index.rst | 7 +++- docs/source/troubleshooting.rst | 58 +++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 8 deletions(-) create mode 100644 docs/source/troubleshooting.rst diff --git a/dask_chtc/cli.py b/dask_chtc/cli.py index cbf7c2e..9af28da 100644 --- a/dask_chtc/cli.py +++ b/dask_chtc/cli.py @@ -516,7 +516,6 @@ def remove_job(self, force: bool = False) -> None: ) else: kill_proc_tree(find_notebook_server_process()) - except Exception: logger.exception(f"Failed to remove Jupyter notebook server job!") raise @@ -585,23 +584,26 @@ def __exit__(self, exc_type, exc_val, exc_tb): } -def find_notebook_server_process() -> Optional[psutil.Process]: +def find_notebook_server_process() -> psutil.Process: """ Find the current user's running notebook server process by looking for a - marker environment variable (and matching against their username). + marker environment variable and matching against their username. + + Raises an exception if there were no matches. """ - me = getpass.getuser() + username = getpass.getuser() for proc in psutil.process_iter(attrs=["username", "environ"]): if ( - proc.info["username"] == me + proc.info["username"] == username and (proc.info["environ"] or {}).get(MARKER_KEY) == MARKER_VALUE ): return proc - return None + + raise Exception("Couldn't find Jupyter notebook server process for current user.") def kill_proc_tree( - process: psutil.Process, signal: signal.Signals = signal.SIGKILL, timeout=None + process: psutil.Process, signal: signal.Signals = signal.SIGKILL, timeout: Optional[int] = None ) -> Tuple[Tuple[psutil.Process, ...], Tuple[psutil.Process, ...]]: """ Kill a process tree diff --git a/docs/source/index.rst b/docs/source/index.rst index ea3c562..716dbdb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -67,6 +67,10 @@ These pages will walk you through the various aspects of using Dask-CHTC: :doc:`docker` Information on how to build Docker images for use with Dask-CHTC. +:doc:`troubleshooting` + Solutions and advice for tackling specific problems that might arise + while using Dask-CHTC. + Detailed information on the Python API and the associated command line tool can be found on these pages: @@ -94,11 +98,12 @@ can be found on these pages: networking example docker + troubleshooting .. toctree:: :maxdepth: 2 :hidden: - :caption: Reference + :caption: Interfaces api cli diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst new file mode 100644 index 0000000..9f5dfd2 --- /dev/null +++ b/docs/source/troubleshooting.rst @@ -0,0 +1,58 @@ +.. _troubleshooting: + +.. py:currentmodule:: dask_chtc + +Troubleshooting +=============== + + +Jupyter +------- + +My Jupyter notebook server is stuck in the ``REMOVED`` state +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +If something goes wrong during a normal ``dask-chtc jupyter stop``, you may +find that your notebook server will refuse to shut down. +The notebook server status will get stuck in ``REMOVED``, like this: + +.. code-block:: console + + $ dask-chtc jupyter status + █ REMOVED jupyter lab + ├─ Contact Address: http://127.0.0.1:8888/?token=d1717bce73ebc0e54ebeb16eeeef70811ead8eaae23e213c + ├─ Python Executable: /home/karpel/miniconda3/bin/python + ├─ Working Directory: /home/karpel + ├─ Job ID: 8138911.0 + ├─ Last status change at: 2020-07-19 21:34:02+00:00 UTC (23 minutes ago) + ├─ Originally started at: 2020-07-19 18:57:07+00:00 UTC (3 hours ago) + ├─ Output: /home/karpel/.dask-chtc/jupyter-logs/current.out + ├─ Error: /home/karpel/.dask-chtc/jupyter-logs/current.err + └─ Events: /home/karpel/.dask-chtc/jupyter-logs/current.events + +Because you can only run one notebook server at a time, this will prevent you +from launching a new notebook server. +To resolve this issue, you should run ``dask-chtc jupyter stop --force``: + +.. code-block:: console + + $ dask-chtc jupyter stop --force + 000 (16453.000.000) 2020-07-21 11:58:25 Job submitted from host: <10.0.1.43:40415?addrs=10.0.1.43-40415+[2600-6c44-1180-1661-99fa-fc04-10e3-fd8d]-40415&alias=JKARPEL&noUDP&sock=schedd_20423_5f31> + 001 (16453.000.000) 2020-07-21 11:58:27 Job executing on host: <10.0.1.43:40415?addrs=10.0.1.43-40415+[2600-6c44-1180-1661-99fa-fc04-10e3-fd8d]-40415&alias=JKARPEL&noUDP&sock=starter_20464_7d39_11> + 005 (16453.000.000) 2020-07-21 11:58:30 Job terminated. + (0) Abnormal termination (signal 9) + (0) No core file + Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage + Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage + 0 - Run Bytes Sent By Job + 0 - Run Bytes Received By Job + 0 - Total Bytes Sent By Job + 0 - Total Bytes Received By Job + +Always try stopping your notebook server with a plain ``stop`` command before +trying ``stop --force``; +``--force`` does not give the notebook server a chance +to shut down cleanly, so your Jupyter kernels may be interrupted while in the +middle of an operation. From e207749d253fce71340cf7e10f7a6d737591dfaf Mon Sep 17 00:00:00 2001 From: Josh Karpel Date: Thu, 23 Jul 2020 11:29:43 -0500 Subject: [PATCH 3/4] reorganize docs (add a section for topics past "getting started") --- docs/source/index.rst | 10 ++++++++-- docs/source/troubleshooting.rst | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 716dbdb..775d19f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -97,13 +97,19 @@ can be found on these pages: jupyter networking example - docker + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Getting It Working + troubleshooting + docker .. toctree:: :maxdepth: 2 :hidden: - :caption: Interfaces + :caption: Getting the Details api cli diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index 9f5dfd2..c77e8d8 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -9,8 +9,8 @@ Troubleshooting Jupyter ------- -My Jupyter notebook server is stuck in the ``REMOVED`` state -++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Jupyter notebook server is stuck in the ``REMOVED`` state ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ If something goes wrong during a normal ``dask-chtc jupyter stop``, you may find that your notebook server will refuse to shut down. From 0db078e877830eb9005088727bb67470397aff05 Mon Sep 17 00:00:00 2001 From: Josh Karpel Date: Thu, 23 Jul 2020 11:37:31 -0500 Subject: [PATCH 4/4] rewrite index page --- docs/source/index.rst | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 775d19f..02e8ae8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -45,7 +45,7 @@ development environment to use Dask in. `GitHub repository `_ to keep track of new releases, and upgrading promptly when they occur. -These pages will walk you through the various aspects of using Dask-CHTC: +These pages will get you started with Dask-CHTC: :doc:`installation` How to install Python and Dask-CHTC on a CHTC submit node. @@ -64,13 +64,18 @@ These pages will walk you through the various aspects of using Dask-CHTC: showing how to start up a :class:`CHTCCluster` and use it to perform some calculations. -:doc:`docker` - Information on how to build Docker images for use with Dask-CHTC. + +These pages have information for troubleshooting problems and handling +specific use cases: :doc:`troubleshooting` Solutions and advice for tackling specific problems that might arise while using Dask-CHTC. +:doc:`docker` + Information on how to build Docker images for use with Dask-CHTC. + + Detailed information on the Python API and the associated command line tool can be found on these pages: