Skip to content

Commit

Permalink
Fix Bug Causing Queued Snapshots of Deleted Indices to Never Finalize (
Browse files Browse the repository at this point in the history
…elastic#75942)

We have to run the loop checking for completed snapshots if we see an index disappearing.
Otherwise, we never get around to finalizing a queued snapshot stuck after a clone if the
index is deleted during cloning.
  • Loading branch information
original-brownbear authored and Adam Locke committed Aug 3, 2021
1 parent 3d5c51e commit aee520a
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1611,6 +1611,57 @@ public void testOutOfOrderCloneFinalization() throws Exception {
);
}

public void testIndexDeletedWhileSnapshotQueuedAfterClone() throws Exception {
final String master = internalCluster().startMasterOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS);
internalCluster().startDataOnlyNode();
final String index1 = "index-1";
final String index2 = "index-2";
createIndexWithContent(index1);
createIndexWithContent(index2);

final String repository = "test-repo";
createRepository(repository, "mock");

final String sourceSnapshot = "source-snapshot";
createFullSnapshot(repository, sourceSnapshot);

final IndexId index1Id = getRepositoryData(repository).resolveIndexId(index1);
blockMasterOnShardLevelSnapshotFile(repository, index1Id.getId());

final String cloneTarget = "target-snapshot";
final ActionFuture<AcknowledgedResponse> cloneSnapshot = clusterAdmin().prepareCloneSnapshot(
repository,
sourceSnapshot,
cloneTarget
).setIndices(index1, index2).execute();
awaitNumberOfSnapshotsInProgress(1);
waitForBlock(master, repository);

final ActionFuture<CreateSnapshotResponse> snapshot3 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-3")
.setIndices(index1, index2)
.setWaitForCompletion(true)
.setPartial(true)
.execute();
final ActionFuture<CreateSnapshotResponse> snapshot2 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-2")
.setIndices(index2)
.setWaitForCompletion(true)
.execute();
assertSuccessful(snapshot2);
awaitNumberOfSnapshotsInProgress(2);
assertFalse(snapshot3.isDone());
assertAcked(admin().indices().prepareDelete(index1).get());
assertSuccessful(snapshot3);
unblockNode(repository, master);

assertAcked(cloneSnapshot.get());
assertAcked(startDeleteSnapshot(repository, cloneTarget).get());

assertThat(
clusterAdmin().prepareSnapshotStatus().setSnapshots("snapshot-2", "snapshot-3").setRepository(repository).get().getSnapshots(),
hasSize(2)
);
}

public void testQueuedAfterFailedShardSnapshot() throws Exception {
internalCluster().startMasterOnlyNode();
final String dataNode = internalCluster().startDataOnlyNode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,7 @@ public ImmutableOpenMap<RepositoryShardId, ShardSnapshotStatus> shardsByRepoShar
}

public Index indexByName(String name) {
assert isClone() == false : "tried to get routing index for clone entry [" + this + "]";
return snapshotIndices.get(name);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1229,8 +1229,18 @@ private static ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShar
// this shard snapshot is waiting for a previous snapshot to finish execution for this shard
final ShardSnapshotStatus knownFailure = knownFailures.get(shardId);
if (knownFailure == null) {
// if no failure is known for the shard we keep waiting
shards.put(shardId, shardStatus);
final IndexRoutingTable indexShardRoutingTable = routingTable.index(shardId.getIndex());
if (indexShardRoutingTable == null) {
// shard became unassigned while queued so we fail as missing here
assert entry.partial();
snapshotChanged = true;
logger.debug("failing snapshot of shard [{}] because index got deleted", shardId);
shards.put(shardId, ShardSnapshotStatus.MISSING);
knownFailures.put(shardId, ShardSnapshotStatus.MISSING);
} else {
// if no failure is known for the shard we keep waiting
shards.put(shardId, shardStatus);
}
} else {
// If a failure is known for an execution we waited on for this shard then we fail with the same exception here
// as well
Expand Down Expand Up @@ -1298,9 +1308,10 @@ private static ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShar

private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snapshotsInProgress, ClusterChangedEvent event) {
for (SnapshotsInProgress.Entry entry : snapshotsInProgress.entries()) {
if (entry.state() == State.STARTED) {
if (entry.state() == State.STARTED && entry.isClone() == false) {
for (ObjectObjectCursor<RepositoryShardId, ShardSnapshotStatus> shardStatus : entry.shardsByRepoShardId()) {
if (shardStatus.value.state() != ShardState.WAITING) {
final ShardState state = shardStatus.value.state();
if (state != ShardState.WAITING && state != ShardState.QUEUED) {
continue;
}
final RepositoryShardId shardId = shardStatus.key;
Expand All @@ -1309,7 +1320,7 @@ private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snap
.getRoutingTable()
.index(entry.indexByName(shardId.indexName()));
if (indexShardRoutingTable == null) {
// index got removed concurrently and we have to fail WAITING state shards
// index got removed concurrently and we have to fail WAITING or QUEUED state shards
return true;
}
ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.shardId()).primaryShard();
Expand Down

0 comments on commit aee520a

Please sign in to comment.