diff --git a/server/src/main/java/org/elasticsearch/indices/store/TransportNodesListShardStoreMetaData.java b/server/src/main/java/org/elasticsearch/indices/store/TransportNodesListShardStoreMetaData.java index add27eac68af8..1ad52605bf97e 100644 --- a/server/src/main/java/org/elasticsearch/indices/store/TransportNodesListShardStoreMetaData.java +++ b/server/src/main/java/org/elasticsearch/indices/store/TransportNodesListShardStoreMetaData.java @@ -19,6 +19,7 @@ package org.elasticsearch.indices.store; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.FailedNodeException; @@ -125,8 +126,17 @@ private StoreFilesMetaData listStoreMetaData(ShardId shardId) throws IOException if (indexService != null) { IndexShard indexShard = indexService.getShardOrNull(shardId.id()); if (indexShard != null) { - exists = true; - return new StoreFilesMetaData(shardId, indexShard.snapshotStoreMetadata()); + try { + final StoreFilesMetaData storeFilesMetaData = new StoreFilesMetaData(shardId, indexShard.snapshotStoreMetadata()); + exists = true; + return storeFilesMetaData; + } catch (org.apache.lucene.index.IndexNotFoundException e) { + logger.trace(new ParameterizedMessage("[{}] node is missing index, responding with empty", shardId), e); + return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY); + } catch (IOException e) { + logger.warn(new ParameterizedMessage("[{}] can't read metadata from store, responding with empty", shardId), e); + return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY); + } } } // try and see if we an list unallocated diff --git a/server/src/test/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java b/server/src/test/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java index 9b2bacdd7fb4c..88019e9f452d1 100644 --- a/server/src/test/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java +++ b/server/src/test/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java @@ -76,6 +76,7 @@ import java.util.Map; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; +import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; @@ -90,6 +91,7 @@ import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.lessThanOrEqualTo; import static org.hamcrest.Matchers.not; +import static org.junit.Assert.assertFalse; @ClusterScope(scope = Scope.TEST, numDataNodes = 0) public class IndexRecoveryIT extends ESIntegTestCase { @@ -789,4 +791,48 @@ public void sendRequest(Transport.Connection connection, long requestId, String assertHitCount(client().prepareSearch(indexName).get(), numDocs); } } + + /** Makes sure the new master does not repeatedly fetch index metadata from recovering replicas */ + public void testOngoingRecoveryAndMasterFailOver() throws Exception { + String indexName = "test"; + internalCluster().startNodes(2); + String nodeWithPrimary = internalCluster().startDataOnlyNode(); + assertAcked(client().admin().indices().prepareCreate(indexName) + .setSettings(Settings.builder() + .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0) + .put("index.routing.allocation.include._name", nodeWithPrimary))); + MockTransportService transport = (MockTransportService) internalCluster().getInstance(TransportService.class, nodeWithPrimary); + CountDownLatch phase1ReadyBlocked = new CountDownLatch(1); + CountDownLatch allowToCompletePhase1Latch = new CountDownLatch(1); + Semaphore blockRecovery = new Semaphore(1); + transport.addSendBehavior((connection, requestId, action, request, options) -> { + if (PeerRecoveryTargetService.Actions.CLEAN_FILES.equals(action) && blockRecovery.tryAcquire()) { + phase1ReadyBlocked.countDown(); + try { + allowToCompletePhase1Latch.await(); + } catch (InterruptedException e) { + throw new AssertionError(e); + } + } + connection.sendRequest(requestId, action, request, options); + }); + try { + String nodeWithReplica = internalCluster().startDataOnlyNode(); + assertAcked(client().admin().indices().prepareUpdateSettings(indexName).setSettings(Settings.builder() + .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1) + .put("index.routing.allocation.include._name", nodeWithPrimary + "," + nodeWithReplica))); + phase1ReadyBlocked.await(); + internalCluster().restartNode(clusterService().state().nodes().getMasterNode().getName(), + new InternalTestCluster.RestartCallback()); + internalCluster().ensureAtLeastNumDataNodes(3); + assertAcked(client().admin().indices().prepareUpdateSettings(indexName).setSettings(Settings.builder() + .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2) + .putNull("index.routing.allocation.include._name"))); + assertFalse(client().admin().cluster().prepareHealth(indexName).setWaitForActiveShards(2).get().isTimedOut()); + } finally { + allowToCompletePhase1Latch.countDown(); + } + ensureGreen(indexName); + } }