From 7f52ac4d282766ac8c117b7a9188fff4c109b4d7 Mon Sep 17 00:00:00 2001
From: Andrei Dan
Date: Tue, 27 Jul 2021 11:40:42 +0100
Subject: [PATCH] ILM ClusterStateWaitThresholdBreachTests cycles due to
 `shrunk-shards-allocated` (#75695)

ClusterStateWaitThresholdBreachTests is meant to simulate a shrink action
failure that lasts past the configured threshold, such that ILM rewinds,
deletes the attempted shrunk index, and retries (successfully the second
time).

We used to simulate this failure by configuring a shrink action with a
number of shards higher than the index's number of shards. We're now
adding a step that validates against this misconfiguration, so we needed
a new way to integration-test this shrink/retry cycle.

This makes the test use a high number of replicas for the managed index,
blocking it in the `shrunk-shards-allocated` step instead of failing in
the `shrink` step as before.

(cherry picked from commit 3b2973d02433b94208d48ba510d256a9935cddcd)
Signed-off-by: Andrei Dan

# Conflicts:
#	x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/ClusterStateWaitThresholdBreachTests.java
---
 .../ClusterStateWaitThresholdBreachTests.java | 65 ++++++-------
 1 file changed, 20 insertions(+), 45 deletions(-)
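For quick orientation, the new test flow condenses to the sketch below. It
is assembled only from calls that appear in the diff, with `policy` and
`managedIndex` assumed from the existing test class scaffolding, so treat
it as illustrative rather than as code from this patch:

    // shrink to 1 shard: always valid, so the `shrink` step itself succeeds
    Phase warmPhase = new Phase("warm", TimeValue.ZERO,
        Map.of(MigrateAction.NAME, new MigrateAction(false),
            ShrinkAction.NAME, new ShrinkAction(1, null)));
    assertAcked(client().execute(PutLifecycleAction.INSTANCE,
        new PutLifecycleAction.Request(new LifecyclePolicy(policy, Map.of("warm", warmPhase)))).get());

    // 42 replicas can never all be allocated on a small test cluster, so the
    // shrunk index stays partially unassigned and ILM waits in the
    // `shrunk-shards-allocated` step until the 1h threshold (the minimum
    // allowed value) is breached, at which point ILM rewinds, deletes the
    // first shrunk index and retries
    Settings settings = Settings.builder()
        .put(SETTING_NUMBER_OF_SHARDS, 2)
        .put(SETTING_NUMBER_OF_REPLICAS, 42)
        .put(LifecycleSettings.LIFECYCLE_NAME, policy)
        .put(LifecycleSettings.LIFECYCLE_STEP_WAIT_TIME_THRESHOLD, "1h")
        .build();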
Phase("warm", TimeValue.ZERO, Map.of(MigrateAction.NAME, new MigrateAction(false), ShrinkAction.NAME, + new ShrinkAction(1, null))); + LifecyclePolicy lifecyclePolicy = new LifecyclePolicy(policy, Map.of("warm", warmPhase)); + PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy); + assertAcked(client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get()); + + // we're configuring a very high number of replicas. this will make ths shrunk index unable to allocate successfully, so ILM will + // wait in the `shrunk-shards-allocated` step (we don't wait for the original index to be GREEN before) Settings settings = Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, numShards) - .put(SETTING_NUMBER_OF_REPLICAS, 0).put(LifecycleSettings.LIFECYCLE_NAME, policy) + .put(SETTING_NUMBER_OF_REPLICAS, 42).put(LifecycleSettings.LIFECYCLE_NAME, policy) // configuring the threshold to the minimum value .put(LifecycleSettings.LIFECYCLE_STEP_WAIT_TIME_THRESHOLD, "1h") .build(); @@ -123,36 +120,16 @@ public void testWaitInShrunkShardsAllocatedExceedsThreshold() throws Exception { assertTrue(res.isAcknowledged()); String[] firstAttemptShrinkIndexName = new String[1]; - // ILM will retry the shrink step because the number of shards to shrink to is gt the current number of shards assertBusy(() -> { ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, explainRequest).get(); IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); - assertThat(indexLifecycleExplainResponse.getFailedStepRetryCount(), greaterThanOrEqualTo(1)); - firstAttemptShrinkIndexName[0] = indexLifecycleExplainResponse.getShrinkIndexName(); assertThat(firstAttemptShrinkIndexName[0], is(notNullValue())); }, 30, TimeUnit.SECONDS); - - // we're manually shrinking the index but configuring a very high number of replicas and waiting for all active shards - // this will make ths shrunk index unable to allocate successfully, so ILM will wait in the `shrunk-shards-allocated` step - ResizeRequest resizeRequest = new ResizeRequest(firstAttemptShrinkIndexName[0], managedIndex); - Settings.Builder builder = Settings.builder(); - // a very high number of replicas, coupled with an `all` wait for active shards configuration will block the shrink action in the - // `shrunk-shards-allocated` step. - builder.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 42) - .put("index.write.wait_for_active_shards", "all") - .put(LifecycleSettings.LIFECYCLE_NAME, policy) - .put(IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_SETTING.getKey() + "_id", (String) null) - .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1); - Settings relevantTargetSettings = builder.build(); - resizeRequest.getTargetIndexRequest().settings(relevantTargetSettings); - client().admin().indices().resizeIndex(resizeRequest).get(); - ensureYellow(firstAttemptShrinkIndexName[0]); - // let's check ILM for the managed index is waiting in the `shrunk-shards-allocated` step assertBusy(() -> { ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); @@ -197,25 +174,23 @@ public void testWaitInShrunkShardsAllocatedExceedsThreshold() throws Exception { // the shrink index generated in the first attempt must've been deleted! 
         assertBusy(() -> assertFalse(indexExists(firstAttemptShrinkIndexName[0])));
-        // at this point, the manged index is looping into the `shrink` step as the action is trying to shrink to a higher number of
-        // shards than the source index has. we'll update the policy to shrink to 1 shard and this should unblock the policy and it
-        // should successfully shrink the managed index to the second cycle shrink index name
-        {
-            Phase warmPhase = new Phase("warm", TimeValue.ZERO, Map.of(MigrateAction.NAME,
-                new MigrateAction(false), ShrinkAction.NAME, new ShrinkAction(1, null))
-            );
-            LifecyclePolicy lifecyclePolicy = new LifecyclePolicy(policy, Map.of("warm", warmPhase));
-            PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy);
-            assertAcked(client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get());
-        }
-
         assertBusy(() -> assertTrue(indexExists(secondCycleShrinkIndexName[0])), 30, TimeUnit.SECONDS);
+
+        // at this point, the second shrink attempt was executed and the managed index is looping in the `shrunk-shards-allocated`
+        // step, waiting for the huge number of replicas of the shrunk index to allocate. this will never happen, so let's unblock
+        // this situation and allow the shrink to complete by reducing the number of replicas of the shrunk index to 0
+        Settings.Builder zeroReplicasSetting = Settings.builder().put(INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0);
+        assertAcked(
+            client().admin().indices().prepareUpdateSettings(secondCycleShrinkIndexName[0]).setSettings(zeroReplicasSetting)
+        );
+
         assertBusy(() -> {
             ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(secondCycleShrinkIndexName[0]);
             ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, explainRequest).get();
             IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses()
                 .get(secondCycleShrinkIndexName[0]);
+            assertThat(indexLifecycleExplainResponse.getPhase(), equalTo("warm"));
             assertThat(indexLifecycleExplainResponse.getStep(), equalTo(PhaseCompleteStep.NAME));
         }, 30, TimeUnit.SECONDS);
     }
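All of the waiting in this test is the same pattern: assertBusy polling of
the ILM explain API until the index reports an expected step. For review
purposes, here is that pattern distilled into a hypothetical helper; the
name `awaitIlmStep` is illustrative and not part of this patch, and it
reuses only request/response types the test already imports:

    // poll the ILM explain API until `index` reports `expectedStepName`,
    // failing if that does not happen within 30 seconds
    private void awaitIlmStep(String index, String expectedStepName) throws Exception {
        assertBusy(() -> {
            ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(index);
            ExplainLifecycleResponse explainResponse =
                client().execute(ExplainLifecycleAction.INSTANCE, explainRequest).get();
            IndexLifecycleExplainResponse explain = explainResponse.getIndexResponses().get(index);
            assertThat(explain.getStep(), equalTo(expectedStepName));
        }, 30, TimeUnit.SECONDS);
    }

With such a helper, the final verification essentially reduces to
`awaitIlmStep(secondCycleShrinkIndexName[0], PhaseCompleteStep.NAME)` once
the shrunk index's replica count has been dropped to 0.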