Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] clear job size estimate cache when feature is reset #74494

Original file line number Diff line number Diff line change
Expand Up @@ -1312,9 +1312,12 @@ public String getFeatureDescription() {
public void cleanUpFeature(
ClusterService clusterService,
Client client,
ActionListener<ResetFeatureStateResponse.ResetFeatureStateStatus> finalListener) {
ActionListener<ResetFeatureStateResponse.ResetFeatureStateStatus> finalListener
) {
logger.info("Starting machine learning feature reset");

final Map<String, Boolean> results = new ConcurrentHashMap<>();

ActionListener<ResetFeatureStateResponse.ResetFeatureStateStatus> unsetResetModeListener = ActionListener.wrap(
success -> client.execute(SetResetModeAction.INSTANCE, SetResetModeActionRequest.disabled(true), ActionListener.wrap(
resetSuccess -> finalListener.onResponse(success),
Expand All @@ -1337,25 +1340,43 @@ public void cleanUpFeature(
)
);

Map<String, Boolean> results = new ConcurrentHashMap<>();
ActionListener<ResetFeatureStateResponse.ResetFeatureStateStatus> cleanedUpIndicesListener = ActionListener.wrap(
success -> {
if (memoryTracker.get() != null) {
memoryTracker.get().awaitAndClear(ActionListener.wrap(
cacheCleared -> unsetResetModeListener.onResponse(success),
clearFailed -> {
logger.error("failed to clear memory tracker cache via machine learning reset feature API", clearFailed);
unsetResetModeListener.onResponse(success);
}
));
return;
}
unsetResetModeListener.onResponse(success);
},
failure -> {
logger.error("failed to clear .ml-* indices via reset feature API", failure);
unsetResetModeListener.onFailure(failure);
}
);

ActionListener<ListTasksResponse> afterWaitingForTasks = ActionListener.wrap(
listTasksResponse -> {
listTasksResponse.rethrowFailures("Waiting for indexing requests for .ml-* indices");
if (results.values().stream().allMatch(b -> b)) {
// Call into the original listener to clean up the indices
SystemIndexPlugin.super.cleanUpFeature(clusterService, client, unsetResetModeListener);
// Call into the original listener to clean up the indices and then clear ml memory cache
SystemIndexPlugin.super.cleanUpFeature(clusterService, client, cleanedUpIndicesListener);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the memory tracker cleanup waits for refreshes to finish, I would do this index cleanup after clearing the memory tracker. It should avoid logging of spurious errors from refreshes that fail because the indices they're accessing get deleted.

Or was there a good reason for clearing the memory tracker last?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@droberts195, I figured if the jobs potentially still existed, it would be good to keep around their estimates. But, since all jobs should be closed by this point, clearing the tracker earlier is probably ok.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the jobs should be closed, and banned from reopening by the reset-in-progress cluster setting. I think flipping the order may avoid log spam.

} else {
final List<String> failedComponents = results.entrySet().stream()
.filter(result -> result.getValue() == false)
.map(Map.Entry::getKey)
.collect(Collectors.toList());
unsetResetModeListener.onFailure(
cleanedUpIndicesListener.onFailure(
new RuntimeException("Some machine learning components failed to reset: " + failedComponents)
);
}
},
unsetResetModeListener::onFailure
cleanedUpIndicesListener::onFailure
);

ActionListener<StopDataFrameAnalyticsAction.Response> afterDataframesStopped = ActionListener.wrap(dataFrameStopResponse -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Phaser;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -72,6 +73,7 @@ public class MlMemoryTracker implements LocalNodeMasterListener {
private final JobResultsProvider jobResultsProvider;
private final DataFrameAnalyticsConfigProvider configProvider;
private final Phaser stopPhaser;
private volatile AtomicInteger phase = new AtomicInteger(0);
private volatile boolean isMaster;
private volatile Instant lastUpdateTime;
private volatile Duration reassignmentRecheckInterval;
Expand Down Expand Up @@ -115,6 +117,37 @@ public void onMaster() {
/**
 * Callback invoked when this node stops being the elected master.
 * Flips the {@code isMaster} flag off and wipes the cached contents via
 * {@link #clear()} — presumably because the cache is only maintained while
 * this node is master and would otherwise go stale (TODO confirm against
 * the onMaster/refresh logic).
 */
public void offMaster() {
    isMaster = false;
    logger.trace("ML memory tracker off master");
    clear();
}

/**
 * Waits for all currently in-flight memory refreshes to complete and then
 * clears the tracker's cached contents. Used when the ML feature is reset so
 * that stale job size estimates do not survive the reset.
 * <p>
 * The wait happens on the ML utility thread pool, so the calling thread is
 * never blocked; the listener is completed from that pool.
 *
 * @param listener notified with {@code null} once the cache has been cleared,
 *                 or with the failure if waiting for refreshes to finish threw
 */
public void awaitAndClear(ActionListener<Void> listener) {
    // We never terminate the phaser
    assert stopPhaser.isTerminated() == false;
    // If there are no registered parties or no unarrived parties then there is a flaw
    // in the register/arrive/unregister logic in another method that uses the phaser
    assert stopPhaser.getRegisteredParties() > 0;
    assert stopPhaser.getUnarrivedParties() > 0;
    threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME).execute(
        () -> {
            try {
                // We await all current refreshes to complete, this increments the "current phase" and prevents
                // further interaction while we clear contents
                int newPhase = stopPhaser.arriveAndAwaitAdvance();
                assert newPhase > 0;
                clear();
                // Keep our `phase` counter in step with the phaser's advanced phase so that
                // in-flight registration checks elsewhere compare against the new phase
                // rather than treating the advance as a permanent stop
                // NOTE(review): assumes `phase` mirrors the Phaser's internal phase number
                // one-for-one — confirm against refreshAnomalyDetectorJobMemory
                phase.incrementAndGet();
                listener.onResponse(null);
            } catch (Exception e) {
                // arriveAndAwaitAdvance can throw if the phaser is misused; surface it
                // rather than leaving the caller hanging
                logger.warn("failed to wait for all refresh requests to complete", e);
                listener.onFailure(e);
            }
        }
    );

}

private void clear() {
logger.trace("clearing ML Memory tracker contents");
for (Map<String, Long> memoryRequirementByJob : memoryRequirementByTaskName.values()) {
memoryRequirementByJob.clear();
}
Expand Down Expand Up @@ -401,8 +434,9 @@ public void refreshAnomalyDetectorJobMemory(String jobId, ActionListener<Long> l
}

// The phaser prevents searches being started after the memory tracker's stop() method has returned
if (stopPhaser.register() != 0) {
// Phases above 0 mean we've been stopped, so don't do any operations that involve external interaction
// Note: `phase` is incremented if cache is reset via the feature reset API
if (stopPhaser.register() != phase.get()) {
// Phases not equal to `phase` mean we've been stopped, so don't do any operations that involve external interaction
stopPhaser.arriveAndDeregister();
listener.onFailure(new EsRejectedExecutionException("Couldn't run ML memory update - node is shutting down"));
return;
Expand Down