From aee520a42e4270638615192a9ec2e7b09d82a45c Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Mon, 2 Aug 2021 18:24:36 +0200 Subject: [PATCH] Fix Bug Causing Queued Snapshots of Deleted Indices to Never Finalize (#75942) We have to run the loop checking for completed snapshots if we see an index disappearing. Otherwise, we never get around to finalizing a queued snapshot stuck after a clone if the index is deleted during cloning. --- .../snapshots/ConcurrentSnapshotsIT.java | 51 +++++++++++++++++++ .../cluster/SnapshotsInProgress.java | 1 + .../snapshots/SnapshotsService.java | 21 ++++++-- 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java index ee420c9361540..def40a995c844 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java @@ -1611,6 +1611,57 @@ public void testOutOfOrderCloneFinalization() throws Exception { ); } + public void testIndexDeletedWhileSnapshotQueuedAfterClone() throws Exception { + final String master = internalCluster().startMasterOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS); + internalCluster().startDataOnlyNode(); + final String index1 = "index-1"; + final String index2 = "index-2"; + createIndexWithContent(index1); + createIndexWithContent(index2); + + final String repository = "test-repo"; + createRepository(repository, "mock"); + + final String sourceSnapshot = "source-snapshot"; + createFullSnapshot(repository, sourceSnapshot); + + final IndexId index1Id = getRepositoryData(repository).resolveIndexId(index1); + blockMasterOnShardLevelSnapshotFile(repository, index1Id.getId()); + + final String cloneTarget = "target-snapshot"; + final ActionFuture cloneSnapshot = clusterAdmin().prepareCloneSnapshot( + repository, + sourceSnapshot, + cloneTarget + ).setIndices(index1, index2).execute(); + awaitNumberOfSnapshotsInProgress(1); + waitForBlock(master, repository); + + final ActionFuture snapshot3 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-3") + .setIndices(index1, index2) + .setWaitForCompletion(true) + .setPartial(true) + .execute(); + final ActionFuture snapshot2 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-2") + .setIndices(index2) + .setWaitForCompletion(true) + .execute(); + assertSuccessful(snapshot2); + awaitNumberOfSnapshotsInProgress(2); + assertFalse(snapshot3.isDone()); + assertAcked(admin().indices().prepareDelete(index1).get()); + assertSuccessful(snapshot3); + unblockNode(repository, master); + + assertAcked(cloneSnapshot.get()); + assertAcked(startDeleteSnapshot(repository, cloneTarget).get()); + + assertThat( + clusterAdmin().prepareSnapshotStatus().setSnapshots("snapshot-2", "snapshot-3").setRepository(repository).get().getSnapshots(), + hasSize(2) + ); + } + public void testQueuedAfterFailedShardSnapshot() throws Exception { internalCluster().startMasterOnlyNode(); final String dataNode = internalCluster().startDataOnlyNode(); diff --git a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java index cf8b28eb2d14c..aa864df0423fe 100644 --- a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java +++ b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java @@ -756,6 +756,7 @@ public ImmutableOpenMap shardsByRepoShar } public Index indexByName(String name) { + assert isClone() == false : "tried to get routing index for clone entry [" + this + "]"; return snapshotIndices.get(name); } diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index c9cccb351a53b..77a3a662e87e1 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -1229,8 +1229,18 @@ private static ImmutableOpenMap processWaitingShar // this shard snapshot is waiting for a previous snapshot to finish execution for this shard final ShardSnapshotStatus knownFailure = knownFailures.get(shardId); if (knownFailure == null) { - // if no failure is known for the shard we keep waiting - shards.put(shardId, shardStatus); + final IndexRoutingTable indexShardRoutingTable = routingTable.index(shardId.getIndex()); + if (indexShardRoutingTable == null) { + // shard became unassigned while queued so we fail as missing here + assert entry.partial(); + snapshotChanged = true; + logger.debug("failing snapshot of shard [{}] because index got deleted", shardId); + shards.put(shardId, ShardSnapshotStatus.MISSING); + knownFailures.put(shardId, ShardSnapshotStatus.MISSING); + } else { + // if no failure is known for the shard we keep waiting + shards.put(shardId, shardStatus); + } } else { // If a failure is known for an execution we waited on for this shard then we fail with the same exception here // as well @@ -1298,9 +1308,10 @@ private static ImmutableOpenMap processWaitingShar private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snapshotsInProgress, ClusterChangedEvent event) { for (SnapshotsInProgress.Entry entry : snapshotsInProgress.entries()) { - if (entry.state() == State.STARTED) { + if (entry.state() == State.STARTED && entry.isClone() == false) { for (ObjectObjectCursor shardStatus : entry.shardsByRepoShardId()) { - if (shardStatus.value.state() != ShardState.WAITING) { + final ShardState state = shardStatus.value.state(); + if (state != ShardState.WAITING && state != ShardState.QUEUED) { continue; } final RepositoryShardId shardId = shardStatus.key; @@ -1309,7 +1320,7 @@ private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snap .getRoutingTable() .index(entry.indexByName(shardId.indexName())); if (indexShardRoutingTable == null) { - // index got removed concurrently and we have to fail WAITING state shards + // index got removed concurrently and we have to fail WAITING or QUEUED state shards return true; } ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.shardId()).primaryShard();