From 7a27b95c77b40689fb37680f85df3671db1e1f25 Mon Sep 17 00:00:00 2001 From: Vishaal Kapoor Date: Tue, 7 Aug 2018 10:20:25 -0700 Subject: [PATCH 1/2] [MXAPPS-805] Notebook execution failures in CI. * Add a retry policy when starting a notebook executor to handle the failure to start a notebook executor (due to a port collision, kernel taking too long to start, etc.). * Change logging level for tests to INFO so that we have more informative test output. --- .../straight_dope/test_notebooks_multi_gpu.py | 2 ++ .../test_notebooks_single_gpu.py | 3 ++- tests/utils/notebook_test/__init__.py | 23 +++++++++++++++---- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py index 2038ada3a8b2..ef07550bdf78 100644 --- a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py +++ b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py @@ -20,6 +20,7 @@ This file tests that the notebooks requiring multi GPUs run without warning or exception. """ +import logging import unittest from straight_dope_test_utils import _test_notebook from straight_dope_test_utils import _download_straight_dope_notebooks @@ -27,6 +28,7 @@ class StraightDopeMultiGpuTests(unittest.TestCase): @classmethod def setUpClass(self): + logging.basicConfig(level=logging.INFO) assert _download_straight_dope_notebooks() # Chapter 7 diff --git a/tests/nightly/straight_dope/test_notebooks_single_gpu.py b/tests/nightly/straight_dope/test_notebooks_single_gpu.py index ee7c94c80afc..a9db85398bf8 100644 --- a/tests/nightly/straight_dope/test_notebooks_single_gpu.py +++ b/tests/nightly/straight_dope/test_notebooks_single_gpu.py @@ -21,6 +21,7 @@ warning or exception. """ import glob +import logging import re import os import unittest @@ -51,9 +52,9 @@ class StraightDopeSingleGpuTests(unittest.TestCase): @classmethod def setUpClass(self): + logging.basicConfig(level=logging.INFO) assert _download_straight_dope_notebooks() - def test_completeness(self): """ Make sure that every tutorial that isn't in the whitelist is considered for testing by this diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py index 2cdb6134a604..6d6d79c5dce6 100644 --- a/tests/utils/notebook_test/__init__.py +++ b/tests/utils/notebook_test/__init__.py @@ -32,6 +32,13 @@ IPYTHON_VERSION = 4 # Pin to ipython version 4. TIME_OUT = 10*60 # Maximum 10 mins/test. Reaching timeout causes test failure. +RETRIES = 10 + +try: + TimeoutError +except NameError: + # py2 + TimeoutError = RuntimeError def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='tmp_notebook'): """Run tutorial Jupyter notebook to catch any execution error. @@ -72,15 +79,21 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir=' os.makedirs(working_dir) try: notebook = nbformat.read(notebook_path + '.ipynb', as_version=IPYTHON_VERSION) - # Adding a small delay to allow time for sockets to be freed - # stop-gap measure to battle the 1000ms linger of socket hard coded - # in the kernel API code - time.sleep(1.1) if kernel is not None: eprocessor = ExecutePreprocessor(timeout=TIME_OUT, kernel_name=kernel) else: eprocessor = ExecutePreprocessor(timeout=TIME_OUT) - nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}}) + + # There is a low (< 1%) chance that starting a notebook executor will fail due to the kernel + # taking to long to start, or a port collision, etc. + for i in range(RETRIES): + try: + nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}}) + except (RuntimeError, TimeoutError) as rte: + logging.info("Error starting preprocessor: {}. Attempt {}/{}".format(str(rte), i+1, RETRIES)) + time.sleep(1) + continue + break except Exception as err: err_msg = str(err) errors.append(err_msg) From 5c0ebbc960a66908c5006f90d8349231c5326a2b Mon Sep 17 00:00:00 2001 From: Vishaal Kapoor Date: Wed, 8 Aug 2018 16:11:42 -0700 Subject: [PATCH 2/2] Make retry logic for Jupyter notebook execution specific to the error message we are looking for to prevent false positives in the retry logic. --- tests/utils/notebook_test/__init__.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py index 6d6d79c5dce6..25e96ab0fc55 100644 --- a/tests/utils/notebook_test/__init__.py +++ b/tests/utils/notebook_test/__init__.py @@ -32,13 +32,9 @@ IPYTHON_VERSION = 4 # Pin to ipython version 4. TIME_OUT = 10*60 # Maximum 10 mins/test. Reaching timeout causes test failure. -RETRIES = 10 +RETRIES = 8 +KERNEL_ERROR_MSG = 'Kernel died before replying to kernel_info' -try: - TimeoutError -except NameError: - # py2 - TimeoutError = RuntimeError def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='tmp_notebook'): """Run tutorial Jupyter notebook to catch any execution error. @@ -89,7 +85,14 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir=' for i in range(RETRIES): try: nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}}) - except (RuntimeError, TimeoutError) as rte: + except RuntimeError as rte: + # We check if the exception has to do with the Jupyter kernel failing to start. If + # not, we rethrow to prevent the notebook from erring RETRIES times. It is not ideal + # to inspect the exception message, but necessary for retry logic, as Jupyter client + # throws the generic RuntimeError that can be confused with other Runtime errors. + if str(rte) != KERNEL_ERROR_MSG: + raise rte + logging.info("Error starting preprocessor: {}. Attempt {}/{}".format(str(rte), i+1, RETRIES)) time.sleep(1) continue