From 7a27b95c77b40689fb37680f85df3671db1e1f25 Mon Sep 17 00:00:00 2001
From: Vishaal Kapoor <vishaalk@amazon.com>
Date: Tue, 7 Aug 2018 10:20:25 -0700
Subject: [PATCH 1/2] [MXAPPS-805] Notebook execution failures in CI.

* Add a retry policy when starting a notebook executor to handle the failure to
 start a notebook executor (due to a port collision, kernel taking too
 long to start, etc.).

* Change logging level for tests to INFO so that we have more
 informative test output.
---
 .../straight_dope/test_notebooks_multi_gpu.py |  2 ++
 .../test_notebooks_single_gpu.py              |  3 ++-
 tests/utils/notebook_test/__init__.py         | 23 +++++++++++++++----
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
index 2038ada3a8b2..ef07550bdf78 100644
--- a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
+++ b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
@@ -20,6 +20,7 @@
     This file tests that the notebooks requiring multi GPUs run without
     warning or exception.
 """
+import logging
 import unittest
 from straight_dope_test_utils import _test_notebook
 from straight_dope_test_utils import _download_straight_dope_notebooks
@@ -27,6 +28,7 @@
 class StraightDopeMultiGpuTests(unittest.TestCase):
     @classmethod
     def setUpClass(self):
+        logging.basicConfig(level=logging.INFO)
         assert _download_straight_dope_notebooks()
 
     # Chapter 7
diff --git a/tests/nightly/straight_dope/test_notebooks_single_gpu.py b/tests/nightly/straight_dope/test_notebooks_single_gpu.py
index ee7c94c80afc..a9db85398bf8 100644
--- a/tests/nightly/straight_dope/test_notebooks_single_gpu.py
+++ b/tests/nightly/straight_dope/test_notebooks_single_gpu.py
@@ -21,6 +21,7 @@
     warning or exception.
 """
 import glob
+import logging
 import re
 import os
 import unittest
@@ -51,9 +52,9 @@
 class StraightDopeSingleGpuTests(unittest.TestCase):
     @classmethod
     def setUpClass(self):
+        logging.basicConfig(level=logging.INFO)
         assert _download_straight_dope_notebooks()
 
-
     def test_completeness(self):
         """
         Make sure that every tutorial that isn't in the whitelist is considered for testing by this
diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py
index 2cdb6134a604..6d6d79c5dce6 100644
--- a/tests/utils/notebook_test/__init__.py
+++ b/tests/utils/notebook_test/__init__.py
@@ -32,6 +32,13 @@
 
 IPYTHON_VERSION = 4  # Pin to ipython version 4.
 TIME_OUT = 10*60  # Maximum 10 mins/test. Reaching timeout causes test failure.
+RETRIES = 10
+
+try:
+    TimeoutError
+except NameError:
+    # py2
+    TimeoutError = RuntimeError
 
 def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='tmp_notebook'):
     """Run tutorial Jupyter notebook to catch any execution error.
@@ -72,15 +79,21 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='
         os.makedirs(working_dir)
     try:
         notebook = nbformat.read(notebook_path + '.ipynb', as_version=IPYTHON_VERSION)
-        # Adding a small delay to allow time for sockets to be freed
-        # stop-gap measure to battle the 1000ms linger of socket hard coded
-        # in the kernel API code
-        time.sleep(1.1)
         if kernel is not None:
             eprocessor = ExecutePreprocessor(timeout=TIME_OUT, kernel_name=kernel)
         else:
             eprocessor = ExecutePreprocessor(timeout=TIME_OUT)
-        nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
+
+        # There is a low (< 1%) chance that starting a notebook executor will fail due to the kernel
+        # taking to long to start, or a port collision, etc.
+        for i in range(RETRIES):
+            try:
+                nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
+            except (RuntimeError, TimeoutError) as rte:
+                logging.info("Error starting preprocessor: {}. Attempt {}/{}".format(str(rte), i+1, RETRIES))
+                time.sleep(1)
+                continue
+            break
     except Exception as err:
         err_msg = str(err)
         errors.append(err_msg)

From 5c0ebbc960a66908c5006f90d8349231c5326a2b Mon Sep 17 00:00:00 2001
From: Vishaal Kapoor <vishaalk@amazon.com>
Date: Wed, 8 Aug 2018 16:11:42 -0700
Subject: [PATCH 2/2] Make retry logic for Jupyter notebook execution specific
 to the error message we are looking for to prevent false positives in the
 retry logic.

---
 tests/utils/notebook_test/__init__.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py
index 6d6d79c5dce6..25e96ab0fc55 100644
--- a/tests/utils/notebook_test/__init__.py
+++ b/tests/utils/notebook_test/__init__.py
@@ -32,13 +32,9 @@
 
 IPYTHON_VERSION = 4  # Pin to ipython version 4.
 TIME_OUT = 10*60  # Maximum 10 mins/test. Reaching timeout causes test failure.
-RETRIES = 10
+RETRIES = 8
+KERNEL_ERROR_MSG = 'Kernel died before replying to kernel_info'
 
-try:
-    TimeoutError
-except NameError:
-    # py2
-    TimeoutError = RuntimeError
 
 def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='tmp_notebook'):
     """Run tutorial Jupyter notebook to catch any execution error.
@@ -89,7 +85,14 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='
         for i in range(RETRIES):
             try:
                 nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
-            except (RuntimeError, TimeoutError) as rte:
+            except RuntimeError as rte:
+                # We check if the exception has to do with the Jupyter kernel failing to start. If
+                # not, we rethrow to prevent the notebook from erring RETRIES times. It is not ideal
+                # to inspect the exception message, but necessary for retry logic, as Jupyter client
+                # throws the generic RuntimeError that can be confused with other Runtime errors.
+                if str(rte) != KERNEL_ERROR_MSG:
+                    raise rte
+
                 logging.info("Error starting preprocessor: {}. Attempt {}/{}".format(str(rte), i+1, RETRIES))
                 time.sleep(1)
                 continue