Fix Bug Causing Queued Snapshots of Deleted Indices to Never Finalize (#75942) #76575

Merged: 1 commit, Aug 16, 2021
@@ -1611,6 +1611,57 @@ public void testOutOfOrderCloneFinalization() throws Exception {
         );
     }
 
+    public void testIndexDeletedWhileSnapshotQueuedAfterClone() throws Exception {
+        final String master = internalCluster().startMasterOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS);
+        internalCluster().startDataOnlyNode();
+        final String index1 = "index-1";
+        final String index2 = "index-2";
+        createIndexWithContent(index1);
+        createIndexWithContent(index2);
+
+        final String repository = "test-repo";
+        createRepository(repository, "mock");
+
+        final String sourceSnapshot = "source-snapshot";
+        createFullSnapshot(repository, sourceSnapshot);
+
+        final IndexId index1Id = getRepositoryData(repository).resolveIndexId(index1);
+        blockMasterOnShardLevelSnapshotFile(repository, index1Id.getId());
+
+        final String cloneTarget = "target-snapshot";
+        final ActionFuture<AcknowledgedResponse> cloneSnapshot = clusterAdmin().prepareCloneSnapshot(
+            repository,
+            sourceSnapshot,
+            cloneTarget
+        ).setIndices(index1, index2).execute();
+        awaitNumberOfSnapshotsInProgress(1);
+        waitForBlock(master, repository);
+
+        final ActionFuture<CreateSnapshotResponse> snapshot3 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-3")
+            .setIndices(index1, index2)
+            .setWaitForCompletion(true)
+            .setPartial(true)
+            .execute();
+        final ActionFuture<CreateSnapshotResponse> snapshot2 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-2")
+            .setIndices(index2)
+            .setWaitForCompletion(true)
+            .execute();
+        assertSuccessful(snapshot2);
+        awaitNumberOfSnapshotsInProgress(2);
+        assertFalse(snapshot3.isDone());
+        assertAcked(admin().indices().prepareDelete(index1).get());
+        assertSuccessful(snapshot3);
+        unblockNode(repository, master);
+
+        assertAcked(cloneSnapshot.get());
+        assertAcked(startDeleteSnapshot(repository, cloneTarget).get());
+
+        assertThat(
+            clusterAdmin().prepareSnapshotStatus().setSnapshots("snapshot-2", "snapshot-3").setRepository(repository).get().getSnapshots(),
+            hasSize(2)
+        );
+    }
+
     public void testQueuedAfterFailedShardSnapshot() throws Exception {
         internalCluster().startMasterOnlyNode();
         final String dataNode = internalCluster().startDataOnlyNode();
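Note on the new test: it blocks the master's shard-level metadata write for index-1 so the clone stalls, queues the partial snapshot-3 of both indices behind that clone, and then deletes index-1. The key assertion is that assertSuccessful(snapshot3) passes before unblockNode releases the clone: with the fix, deleting the index fails snapshot-3's queued shard as missing and lets the partial snapshot finalize while the clone is still blocked. Without the fix, per the bug in the title (#75942), the queued shard would never be resolved and snapshot-3 would never finalize.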
@@ -824,6 +824,7 @@ public ImmutableOpenMap<RepositoryShardId, ShardSnapshotStatus> shardsByRepoShardId()
         }
 
         public Index indexByName(String name) {
+            assert isClone() == false : "tried to get routing index for clone entry [" + this + "]";
             return snapshotIndices.get(name);
         }
 
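The assertion added to indexByName documents the invariant the next file relies on: clone entries operate on data that already lives in the repository and have no routing-table index to resolve, so indexByName must only be called for non-clone entries. The entry.isClone() == false check added to waitingShardsStartedOrUnassigned below keeps that caller on the right side of this invariant.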
@@ -1681,8 +1681,18 @@ private static ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShards(
                     // this shard snapshot is waiting for a previous snapshot to finish execution for this shard
                     final ShardSnapshotStatus knownFailure = knownFailures.get(shardId);
                     if (knownFailure == null) {
-                        // if no failure is known for the shard we keep waiting
-                        shards.put(shardId, shardStatus);
+                        final IndexRoutingTable indexShardRoutingTable = routingTable.index(shardId.getIndex());
+                        if (indexShardRoutingTable == null) {
+                            // shard became unassigned while queued so we fail as missing here
+                            assert entry.partial();
+                            snapshotChanged = true;
+                            logger.debug("failing snapshot of shard [{}] because index got deleted", shardId);
+                            shards.put(shardId, ShardSnapshotStatus.MISSING);
+                            knownFailures.put(shardId, ShardSnapshotStatus.MISSING);
+                        } else {
+                            // if no failure is known for the shard we keep waiting
+                            shards.put(shardId, shardStatus);
+                        }
                     } else {
                         // If a failure is known for an execution we waited on for this shard then we fail with the same exception here
                         // as well
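The rule this hunk adds to the queued-shard branch condenses to a small decision. The sketch below is a self-contained model with illustrative names, not the real Elasticsearch types:

    // Illustrative model of the queued-shard handling above.
    enum ModelShardState { QUEUED, MISSING }

    final class QueuedShardModel {
        // A QUEUED shard whose index has lost its routing table can never be
        // assigned, so a partial snapshot fails it as MISSING instead of
        // waiting forever; otherwise it keeps waiting as before.
        static ModelShardState nextState(boolean indexRoutingTableExists, boolean partialSnapshot) {
            if (indexRoutingTableExists == false) {
                assert partialSnapshot; // non-partial snapshots block index deletion
                return ModelShardState.MISSING;
            }
            return ModelShardState.QUEUED;
        }
    }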
@@ -1750,9 +1760,10 @@ private static ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShards(

     private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snapshotsInProgress, ClusterChangedEvent event) {
         for (SnapshotsInProgress.Entry entry : snapshotsInProgress.entries()) {
-            if (entry.state() == State.STARTED) {
+            if (entry.state() == State.STARTED && entry.isClone() == false) {
                 for (ObjectObjectCursor<RepositoryShardId, ShardSnapshotStatus> shardStatus : entry.shardsByRepoShardId()) {
-                    if (shardStatus.value.state() != ShardState.WAITING) {
+                    final ShardState state = shardStatus.value.state();
+                    if (state != ShardState.WAITING && state != ShardState.QUEUED) {
                         continue;
                     }
                     final RepositoryShardId shardId = shardStatus.key;
@@ -1761,7 +1772,7 @@ private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snapshotsInProgress, ClusterChangedEvent event)
                             .getRoutingTable()
                             .index(entry.indexByName(shardId.indexName()));
                         if (indexShardRoutingTable == null) {
-                            // index got removed concurrently and we have to fail WAITING state shards
+                            // index got removed concurrently and we have to fail WAITING or QUEUED state shards
                             return true;
                         }
                         ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.shardId()).primaryShard();
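Taken together, the two changes in waitingShardsStartedOrUnassigned close the loop on the fix: the method now reacts to routing-table changes for QUEUED shards as well as WAITING ones, so deleting an index re-triggers processWaitingShards, which fails the queued shard as MISSING; and clone entries are skipped outright, since their shards never wait on routing and, per the new assertion above, must not be passed to indexByName.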