From 38dea2323b9e8b0e7c2795ab61cf863703ed6aaa Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Mon, 2 Aug 2021 22:04:25 +0200 Subject: [PATCH] Fix Finalizing Failed Shard Snapshots (#75953) We must never write generations extracted out of failed shard snapshot status values to the repository as these can not be trusted in all cases. Instead we must always put a `null` for these into the generations object to write to the repo so that the existing generations are not changed. --- .../snapshots/ConcurrentSnapshotsIT.java | 34 +++++++++++++++++++ .../repositories/ShardGenerations.java | 6 ++++ .../snapshots/SnapshotsService.java | 4 +-- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java index f5db347c2d55b..d2e6c635f6708 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java @@ -1611,6 +1611,40 @@ public void testOutOfOrderCloneFinalization() throws Exception { ); } + public void testCorrectlyFinalizeOutOfOrderPartialFailures() throws Exception { + internalCluster().startMasterOnlyNode(); + final String dataNode1 = internalCluster().startDataOnlyNode(); + final String dataNode2 = internalCluster().startDataOnlyNode(); + final String index1 = "index-1"; + final String index2 = "index-2"; + createIndexWithContent(index1, dataNode1, dataNode2); + createIndexWithContent(index2, dataNode2, dataNode1); + + final String repository = "test-repo"; + createRepository(repository, "mock"); + + createFullSnapshot(repository, "snapshot-1"); + index(index1, "_doc", "some_doc", org.elasticsearch.core.Map.of("foo", "bar")); + index(index2, "_doc", "some_doc", org.elasticsearch.core.Map.of("foo", "bar")); + blockAndFailDataNode(repository, dataNode1); + blockDataNode(repository, dataNode2); + final ActionFuture snapshotBlocked = startFullSnapshot(repository, "snapshot-2"); + waitForBlock(dataNode1, repository); + waitForBlock(dataNode2, repository); + + unblockNode(repository, dataNode1); + assertAcked(clusterAdmin().prepareCloneSnapshot(repository, "snapshot-1", "target-1").setIndices(index1).get()); + unblockNode(repository, dataNode2); + snapshotBlocked.get(); + + assertThat( + clusterAdmin().prepareSnapshotStatus().setSnapshots("target-1").setRepository(repository).get().getSnapshots(), + hasSize(1) + ); + + createFullSnapshot(repository, "snapshot-3"); + } + public void testIndexDeletedWhileSnapshotQueuedAfterClone() throws Exception { final String master = internalCluster().startMasterOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS); internalCluster().startDataOnlyNode(); diff --git a/server/src/main/java/org/elasticsearch/repositories/ShardGenerations.java b/server/src/main/java/org/elasticsearch/repositories/ShardGenerations.java index 842c8bbebca36..b903f13b90067 100644 --- a/server/src/main/java/org/elasticsearch/repositories/ShardGenerations.java +++ b/server/src/main/java/org/elasticsearch/repositories/ShardGenerations.java @@ -8,6 +8,7 @@ package org.elasticsearch.repositories; +import org.elasticsearch.cluster.SnapshotsInProgress; import org.elasticsearch.core.Nullable; import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus; @@ -212,6 +213,11 @@ public Builder putAll(ShardGenerations shardGenerations) { return this; } + public Builder put(IndexId indexId, int shardId, SnapshotsInProgress.ShardSnapshotStatus status) { + // only track generations for successful shard status values + return put(indexId, shardId, status.state().failed() ? null : status.generation()); + } + public Builder put(IndexId indexId, int shardId, ShardGeneration generation) { ShardGeneration existingGeneration = generations.computeIfAbsent(indexId, i -> new HashMap<>()).put(shardId, generation); assert generation != null || existingGeneration == null diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index 12a37c0eaddb4..9dbf5ff4f44b2 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -1293,7 +1293,7 @@ public void onNoLongerMaster() { private static ShardGenerations buildGenerations(SnapshotsInProgress.Entry snapshot, Metadata metadata) { ShardGenerations.Builder builder = ShardGenerations.builder(); if (snapshot.isClone()) { - snapshot.shardsByRepoShardId().forEach(c -> builder.put(c.key.index(), c.key.shardId(), c.value.generation())); + snapshot.shardsByRepoShardId().forEach(c -> builder.put(c.key.index(), c.key.shardId(), c.value)); } else { snapshot.shardsByRepoShardId().forEach(c -> { final Index index = snapshot.indexByName(c.key.indexName()); @@ -1301,7 +1301,7 @@ private static ShardGenerations buildGenerations(SnapshotsInProgress.Entry snaps assert snapshot.partial() : "Index [" + index + "] was deleted during a snapshot but snapshot was not partial."; return; } - builder.put(c.key.index(), c.key.shardId(), c.value.generation()); + builder.put(c.key.index(), c.key.shardId(), c.value); }); } return builder.build();