From 57b88dbde28fcbf2829246ddd5b34c3bf8cf47b5 Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 29 May 2024 09:43:51 +0100 Subject: [PATCH] Fix double-pausing shard snapshot Closes #109143 --- .../snapshots/SnapshotsService.java | 5 ++ .../snapshots/SnapshotsServiceTests.java | 64 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index dd8ddcffd5fe3..2b46442a6e876 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -3354,6 +3354,11 @@ private void executeShardSnapshotUpdate( updatedState = updateSnapshotState.updatedState; } + if (updatedState.state() == ShardState.PAUSED_FOR_NODE_REMOVAL) { + // leave subsequent entries for this shard alone until this one is unpaused + iterator.remove(); + } + logger.trace("[{}] Updating shard [{}] with status [{}]", updateSnapshotState.snapshot, updatedShard, updatedState.state()); changedCount++; newStates.get().put(updatedShard, updatedState); diff --git a/server/src/test/java/org/elasticsearch/snapshots/SnapshotsServiceTests.java b/server/src/test/java/org/elasticsearch/snapshots/SnapshotsServiceTests.java index 56a28b11edfe7..bcc7a23bbec53 100644 --- a/server/src/test/java/org/elasticsearch/snapshots/SnapshotsServiceTests.java +++ b/server/src/test/java/org/elasticsearch/snapshots/SnapshotsServiceTests.java @@ -401,6 +401,70 @@ public void testCompletedCloneStartsNextClone() throws Exception { assertIsNoop(updatedClusterState, completeShardClone); } + public void testPauseForNodeRemovalWithQueuedShards() throws Exception { + final var repoName = "test-repo"; + final var snapshot1 = snapshot(repoName, "snap-1"); + final var snapshot2 = snapshot(repoName, "snap-2"); + final var indexName = "index-1"; + final var shardId = new ShardId(index(indexName), 0); + final var repositoryShardId = new RepositoryShardId(indexId(indexName), 0); + final var nodeId = uuid(); + + final var runningEntry = snapshotEntry( + snapshot1, + Collections.singletonMap(indexName, repositoryShardId.index()), + Map.of(shardId, initShardStatus(nodeId)) + ); + + final var queuedEntry = snapshotEntry( + snapshot2, + Collections.singletonMap(indexName, repositoryShardId.index()), + Map.of(shardId, SnapshotsInProgress.ShardSnapshotStatus.UNASSIGNED_QUEUED) + ); + + final var initialState = stateWithSnapshots( + ClusterState.builder(ClusterState.EMPTY_STATE) + .nodes(DiscoveryNodes.builder().add(DiscoveryNodeUtils.create(nodeId)).localNodeId(nodeId).masterNodeId(nodeId).build()) + .routingTable( + RoutingTable.builder() + .add( + IndexRoutingTable.builder(shardId.getIndex()) + .addShard(TestShardRouting.newShardRouting(shardId, nodeId, true, ShardRoutingState.STARTED)) + ) + .build() + ) + .build(), + repoName, + runningEntry, + queuedEntry + ); + + final var updatedState = applyUpdates( + initialState, + new SnapshotsService.ShardSnapshotUpdate( + snapshot1, + shardId, + null, + new SnapshotsInProgress.ShardSnapshotStatus( + nodeId, + SnapshotsInProgress.ShardState.PAUSED_FOR_NODE_REMOVAL, + runningEntry.shards().get(shardId).generation() + ), + ActionTestUtils.assertNoFailureListener(t -> {}) + ) + ); + + assertEquals( + SnapshotsInProgress.ShardState.PAUSED_FOR_NODE_REMOVAL, + SnapshotsInProgress.get(updatedState).snapshot(snapshot1).shards().get(shardId).state() + ); + + assertEquals( + SnapshotsInProgress.ShardState.QUEUED, + SnapshotsInProgress.get(updatedState).snapshot(snapshot2).shards().get(shardId).state() + ); + } + public void testSnapshottingIndicesExcludesClones() { final String repoName = "test-repo"; final String indexName = "index";