From 6770450e37b8b2679362c302f9f8768c70fe5aa7 Mon Sep 17 00:00:00 2001
From: David Roberts <dave.roberts@elastic.co>
Date: Sun, 15 Dec 2019 21:42:20 +0000
Subject: [PATCH] Change process kill order for testclusters shutdown (#50175)

The testclusters shutdown code was killing child processes
of the ES JVM before the ES JVM.  This causes any running
ML jobs to be recorded as failed, as the ES JVM notices that
they have disconnected from it without being told to stop,
as they would if they crashed.  In many test suites this
doesn't matter because the test cluster will never be
restarted, but in the case of upgrade tests it makes it
impossible to test what happens when an ML job is running
at the time of the upgrade.

This change reverses the order of killing the ES process
tree such that the parent processes are killed before their
children.  A list of children is stored before killing the
parent so that they can subsequently be killed (if they
don't exit by themselves as a side effect of the parent
dying).

Fixes #46262
---
 .../testclusters/ElasticsearchNode.java       | 51 ++++++++++---------
 .../upgrades/MlMappingsUpgradeIT.java         |  1 -
 2 files changed, 28 insertions(+), 24 deletions(-)
diff --git a/buildSrc/src/main/java/org/elasticsearch/gradle/testclusters/ElasticsearchNode.java b/buildSrc/src/main/java/org/elasticsearch/gradle/testclusters/ElasticsearchNode.java
index 2f258733e7575..f43f28190da1a 100644
--- a/buildSrc/src/main/java/org/elasticsearch/gradle/testclusters/ElasticsearchNode.java
+++ b/buildSrc/src/main/java/org/elasticsearch/gradle/testclusters/ElasticsearchNode.java
@@ -807,6 +807,7 @@ public synchronized void stop(boolean tailLogs) {
         requireNonNull(esProcess, "Can't stop `" + this + "` as it was not started or already stopped.");
         // Test clusters are not reused, don't spend time on a graceful shutdown
         stopHandle(esProcess.toHandle(), true);
+        reaper.unregister(toString());
         if (tailLogs) {
             logFileContents("Standard output of node", esStdoutFile);
             logFileContents("Standard error of node", esStderrFile);
@@ -831,39 +832,43 @@ public void setNameCustomization(Function<String, String> nameCustomizer) {
     }
 
     private void stopHandle(ProcessHandle processHandle, boolean forcibly) {
-        // Stop all children first, ES could actually be a child when there's some wrapper process like on Windows.
+        // No-op if the process has already exited by itself.
         if (processHandle.isAlive() == false) {
             LOGGER.info("Process was not running when we tried to terminate it.");
             return;
         }
 
-        // Stop all children first, ES could actually be a child when there's some wrapper process like on Windows.
-        processHandle.children().forEach(each -> stopHandle(each, forcibly));
+        // Stop all children last - if the ML processes are killed before the ES JVM then
+        // they'll be recorded as having failed and won't restart when the cluster restarts.
+        // ES could actually be a child when there's some wrapper process like on Windows,
+        // and in that case the ML processes will be grandchildren of the wrapper.
+        List<ProcessHandle> children = processHandle.children().collect(Collectors.toList());
+        try {
+            logProcessInfo(
+                "Terminating elasticsearch process" + (forcibly ? " forcibly " : "gracefully") + ":",
+                processHandle.info()
+            );
 
-        logProcessInfo(
-            "Terminating elasticsearch process" + (forcibly ? " forcibly " : "gracefully") + ":",
-            processHandle.info()
-        );
+            if (forcibly) {
+                processHandle.destroyForcibly();
+            } else {
+                processHandle.destroy();
+                waitForProcessToExit(processHandle);
+                if (processHandle.isAlive() == false) {
+                    return;
+                }
+                LOGGER.info("process did not terminate after {} {}, stopping it forcefully",
+                    ES_DESTROY_TIMEOUT, ES_DESTROY_TIMEOUT_UNIT);
+                processHandle.destroyForcibly();
+            }
 
-        if (forcibly) {
-            processHandle.destroyForcibly();
-        } else {
-            processHandle.destroy();
             waitForProcessToExit(processHandle);
-            if (processHandle.isAlive() == false) {
-                return;
+            if (processHandle.isAlive()) {
+                throw new TestClustersException("Was not able to terminate elasticsearch process for " + this);
             }
-            LOGGER.info("process did not terminate after {} {}, stopping it forcefully",
-                ES_DESTROY_TIMEOUT, ES_DESTROY_TIMEOUT_UNIT);
-            processHandle.destroyForcibly();
-        }
-
-        waitForProcessToExit(processHandle);
-        if (processHandle.isAlive()) {
-            throw new TestClustersException("Was not able to terminate elasticsearch process for " + this);
+        } finally {
+            children.forEach(each -> stopHandle(each, forcibly));
         }
-
-        reaper.unregister(toString());
     }
 
     private void logProcessInfo(String prefix, ProcessHandle.Info info) {
diff --git a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MlMappingsUpgradeIT.java b/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MlMappingsUpgradeIT.java
index 8099812c8ea1b..13ed2dafc5f31 100644
--- a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MlMappingsUpgradeIT.java
+++ b/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MlMappingsUpgradeIT.java
@@ -38,7 +38,6 @@ protected Collection<String> templatesToWaitFor() {
      * The purpose of this test is to ensure that when a job is open through a rolling upgrade we upgrade the results
      * index mappings when it is assigned to an upgraded node even if no other ML endpoint is called after the upgrade
      */
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/46262")
     public void testMappingsUpgrade() throws Exception {
 
         switch (CLUSTER_TYPE) {