From 4e57fa77589ba4560d131ad309b1c2c1ddee7796 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Wed, 10 Apr 2019 11:52:09 +0530 Subject: [PATCH 1/7] Reset max_retries counter before executing routing commands --- .../routing/allocation/AllocationService.java | 15 ++-- .../RetryFailedAllocationTests.java | 82 +++++++++++++++++++ 2 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java index c688a120a8b6a..b7297a8a544db 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java @@ -296,7 +296,7 @@ private void removeDelayMarkers(RoutingAllocation allocation) { /** * Reset failed allocation counter for unassigned shards */ - private void resetFailedAllocationCounter(RoutingAllocation allocation) { + private RoutingAllocation resetFailedAllocationCounter(ClusterState oldState, RoutingAllocation allocation) { final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = allocation.routingNodes().unassigned().iterator(); while (unassignedIterator.hasNext()) { ShardRouting shardRouting = unassignedIterator.next(); @@ -307,6 +307,9 @@ private void resetFailedAllocationCounter(RoutingAllocation allocation) { unassignedInfo.getUnassignedTimeInMillis(), unassignedInfo.isDelayed(), unassignedInfo.getLastAllocationStatus()), shardRouting.recoverySource(), allocation.changes()); } + ClusterState newState = buildResult(oldState, allocation); + return new RoutingAllocation(allocationDeciders, getMutableRoutingNodes(newState), newState, + clusterInfoService.getClusterInfo(), allocation.getCurrentNanoTime()); } /** @@ -337,16 +340,16 @@ public CommandsResult reroute(final ClusterState clusterState, AllocationCommand allocation.debugDecision(true); // we ignore disable allocation, because commands are explicit allocation.ignoreDisable(true); + + if (retryFailed) { + allocation = resetFailedAllocationCounter(clusterState, allocation); + } + RoutingExplanations explanations = commands.execute(allocation, explain); // we revert the ignore disable flag, since when rerouting, we want the original setting to take place allocation.ignoreDisable(false); // the assumption is that commands will move / act on shards (or fail through exceptions) // so, there will always be shard "movements", so no need to check on reroute - - if (retryFailed) { - resetFailedAllocationCounter(allocation); - } - reroute(allocation); return new CommandsResult(explanations, buildResultAndLogHealthChange(clusterState, allocation, "reroute commands")); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java new file mode 100644 index 0000000000000..73c2d03149b43 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java @@ -0,0 +1,82 @@ +package org.elasticsearch.cluster.routing.allocation; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.ClusterName; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.ESAllocationTestCase; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.cluster.metadata.MetaData; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.routing.RoutingTable; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.ShardRoutingState; +import org.elasticsearch.cluster.routing.allocation.command.AllocateReplicaAllocationCommand; +import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands; +import org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider; +import org.elasticsearch.common.settings.Settings; + +import java.util.Collections; +import java.util.List; + +public class RetryFailedAllocationTests extends ESAllocationTestCase { + + private MockAllocationService strategy; + private ClusterState clusterState; + private final String INDEX_NAME = "index"; + + @Override + public void setUp() throws Exception { + super.setUp(); + MetaData metaData = MetaData.builder().put(IndexMetaData.builder(INDEX_NAME) + .settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1)).build(); + RoutingTable routingTable = RoutingTable.builder().addAsNew(metaData.index(INDEX_NAME)).build(); + clusterState = ClusterState.builder(ClusterName.DEFAULT).metaData(metaData).routingTable(routingTable) + .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))).build(); + strategy = createAllocationService(Settings.EMPTY); + } + + private ShardRouting getPrimary() { + for (ShardRouting shard: clusterState.getRoutingTable().allShards()) { + if (shard.getIndexName().equals(INDEX_NAME) && shard.primary()) { + return shard; + } + } + throw new IllegalArgumentException("No primary found for index: " + INDEX_NAME); + } + + private ShardRouting getReplica() { + for (ShardRouting shard: clusterState.getRoutingTable().allShards()) { + if (shard.getIndexName().equals(INDEX_NAME) && !shard.primary()) { + return shard; + } + } + throw new IllegalArgumentException("No replica found for index: " + INDEX_NAME); + } + + public void testRetryFailedResetForAllocationCommands() { + final int retries = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY); + clusterState = strategy.reroute(clusterState, "initial allocation"); + clusterState = strategy.applyStartedShards(clusterState, Collections.singletonList(getPrimary())); + + // Exhaust all replica allocation attempts with shard failures + for (int i = 0; i < retries; i++) { + List failedShards = Collections.singletonList( + new FailedShard(getReplica(), "failing-shard::attempt-" + i, + new UnsupportedOperationException(), randomBoolean())); + clusterState = strategy.applyFailedShards(clusterState, failedShards); + clusterState = strategy.reroute(clusterState, "allocation retry attempt-" + i); + } + + // Now allocate replica with retry_failed flag set + AllocationService.CommandsResult result = strategy.reroute(clusterState, + new AllocationCommands(new AllocateReplicaAllocationCommand(INDEX_NAME, 0, + getPrimary().currentNodeId().equals("node1") ? "node2" : "node1")), + false, true); + clusterState = result.getClusterState(); + + assertEquals(ShardRoutingState.INITIALIZING, getReplica().state()); + clusterState = strategy.applyStartedShards(clusterState, Collections.singletonList(getReplica())); + assertEquals(ShardRoutingState.STARTED, getReplica().state()); + assertFalse(clusterState.getRoutingNodes().hasUnassignedShards()); + } +} From a07ebb03263066e55494ef8895aeea00b650c860 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Thu, 11 Apr 2019 22:40:42 +0530 Subject: [PATCH 2/7] Add license header --- .../RetryFailedAllocationTests.java | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java index 73c2d03149b43..f2bc145f7a448 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java @@ -1,3 +1,22 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; From 9c30d543d95941a5eef62f0c51e782db902e60ee Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Wed, 29 May 2019 11:00:30 +0530 Subject: [PATCH 3/7] Avoid creating RoutingAllocation on each counter reset --- .../cluster/routing/allocation/AllocationService.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java index b7297a8a544db..d0a3a6ea63bbb 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java @@ -296,7 +296,7 @@ private void removeDelayMarkers(RoutingAllocation allocation) { /** * Reset failed allocation counter for unassigned shards */ - private RoutingAllocation resetFailedAllocationCounter(ClusterState oldState, RoutingAllocation allocation) { + private void resetFailedAllocationCounter(RoutingAllocation allocation) { final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = allocation.routingNodes().unassigned().iterator(); while (unassignedIterator.hasNext()) { ShardRouting shardRouting = unassignedIterator.next(); @@ -307,9 +307,6 @@ private RoutingAllocation resetFailedAllocationCounter(ClusterState oldState, Ro unassignedInfo.getUnassignedTimeInMillis(), unassignedInfo.isDelayed(), unassignedInfo.getLastAllocationStatus()), shardRouting.recoverySource(), allocation.changes()); } - ClusterState newState = buildResult(oldState, allocation); - return new RoutingAllocation(allocationDeciders, getMutableRoutingNodes(newState), newState, - clusterInfoService.getClusterInfo(), allocation.getCurrentNanoTime()); } /** @@ -342,7 +339,7 @@ public CommandsResult reroute(final ClusterState clusterState, AllocationCommand allocation.ignoreDisable(true); if (retryFailed) { - allocation = resetFailedAllocationCounter(clusterState, allocation); + resetFailedAllocationCounter(allocation); } RoutingExplanations explanations = commands.execute(allocation, explain); From 3edd4579736eb5b423cfcb900ffcb1e859098881 Mon Sep 17 00:00:00 2001 From: David Turner Date: Fri, 31 May 2019 08:54:15 +0100 Subject: [PATCH 4/7] Reformat --- .../routing/allocation/RetryFailedAllocationTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java index f2bc145f7a448..744a5c4df4929 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java @@ -55,7 +55,7 @@ public void setUp() throws Exception { } private ShardRouting getPrimary() { - for (ShardRouting shard: clusterState.getRoutingTable().allShards()) { + for (ShardRouting shard : clusterState.getRoutingTable().allShards()) { if (shard.getIndexName().equals(INDEX_NAME) && shard.primary()) { return shard; } @@ -64,7 +64,7 @@ private ShardRouting getPrimary() { } private ShardRouting getReplica() { - for (ShardRouting shard: clusterState.getRoutingTable().allShards()) { + for (ShardRouting shard : clusterState.getRoutingTable().allShards()) { if (shard.getIndexName().equals(INDEX_NAME) && !shard.primary()) { return shard; } From c45ab04db61f22f10cf8bc86150e84024b53bccb Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 31 May 2019 14:10:37 +0530 Subject: [PATCH 5/7] Apply suggestions from code review Co-Authored-By: David Turner --- .../routing/allocation/RetryFailedAllocationTests.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java index 744a5c4df4929..7278108165505 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java @@ -81,10 +81,12 @@ public void testRetryFailedResetForAllocationCommands() { for (int i = 0; i < retries; i++) { List failedShards = Collections.singletonList( new FailedShard(getReplica(), "failing-shard::attempt-" + i, - new UnsupportedOperationException(), randomBoolean())); + new ElasticsearchException("simulated"), randomBoolean())); clusterState = strategy.applyFailedShards(clusterState, failedShards); clusterState = strategy.reroute(clusterState, "allocation retry attempt-" + i); } + assertThat("replica should not be assigned", getReplica().state(), equalTo(ShardRoutingState.UNASSIGNED)); + assertThat("reroute should be a no-op", strategy.reroute(clusterState, "test"), sameInstance(clusterState)); // Now allocate replica with retry_failed flag set AllocationService.CommandsResult result = strategy.reroute(clusterState, From c936a2a3f429c8133297e23df78ad70df8980439 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 31 May 2019 14:19:24 +0530 Subject: [PATCH 6/7] Minor CR suggestions --- .../allocation/RetryFailedAllocationTests.java | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java index 7278108165505..1a7348ec3bb5d 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java @@ -55,21 +55,11 @@ public void setUp() throws Exception { } private ShardRouting getPrimary() { - for (ShardRouting shard : clusterState.getRoutingTable().allShards()) { - if (shard.getIndexName().equals(INDEX_NAME) && shard.primary()) { - return shard; - } - } - throw new IllegalArgumentException("No primary found for index: " + INDEX_NAME); + return clusterState.getRoutingTable().index(INDEX_NAME).shard(0).primaryShard(); } private ShardRouting getReplica() { - for (ShardRouting shard : clusterState.getRoutingTable().allShards()) { - if (shard.getIndexName().equals(INDEX_NAME) && !shard.primary()) { - return shard; - } - } - throw new IllegalArgumentException("No replica found for index: " + INDEX_NAME); + return clusterState.getRoutingTable().index(INDEX_NAME).shard(0).replicaShards().get(0); } public void testRetryFailedResetForAllocationCommands() { From e641401ecd3432dbd2ea630e6806b3f9a2a33e6e Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 31 May 2019 15:06:51 +0530 Subject: [PATCH 7/7] Modify test as per review suggestions --- .../routing/allocation/RetryFailedAllocationTests.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java index 1a7348ec3bb5d..1ca516da26286 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.cluster.routing.allocation; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; @@ -37,6 +38,9 @@ import java.util.Collections; import java.util.List; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.sameInstance; + public class RetryFailedAllocationTests extends ESAllocationTestCase { private MockAllocationService strategy;