Finalize all snapshots completed by shard snapshot updates #105245

6 changes: 6 additions & 0 deletions docs/changelog/105245.yaml
@@ -0,0 +1,6 @@
+pr: 105245
+summary: Finalize all snapshots completed by shard snapshot updates
+area: Snapshot/Restore
+type: bug
+issues:
+ - 104939
106 changes: 76 additions & 30 deletions server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
@@ -23,6 +23,7 @@
import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.ContextPreservingActionListener;
import org.elasticsearch.action.support.GroupedActionListener;
+import org.elasticsearch.action.support.RefCountingRunnable;
import org.elasticsearch.action.support.master.TransportMasterNodeAction;
import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterState;
@@ -70,7 +71,6 @@
import org.elasticsearch.common.util.Maps;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
-import org.elasticsearch.common.util.concurrent.RunOnce;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.SuppressForbidden;
import org.elasticsearch.core.Tuple;
@@ -188,6 +188,8 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement

private final MasterServiceTaskQueue<SnapshotTask> masterServiceTaskQueue;

+private final ShardSnapshotUpdateCompletionHandler shardSnapshotUpdateCompletionHandler;
+
/**
* Setting that specifies the maximum number of allowed concurrent snapshot create and delete operations in the
* cluster state. The number of concurrent operations in a cluster state is defined as the sum of
@@ -243,6 +245,8 @@ public SnapshotsService(
Priority.NORMAL,
UpdateNodeIdsForRemovalTask::executeBatch
);

+this.shardSnapshotUpdateCompletionHandler = this::handleShardSnapshotUpdateCompletion;
}

/**
@@ -3129,17 +3133,23 @@ static final class SnapshotShardsUpdateContext {
// updates that were used to update an existing in-progress shard snapshot
private final Set<ShardSnapshotUpdate> executedUpdates = new HashSet<>();

-// enqueues a reroute because some shard snapshots finished
-private final Runnable rerouteRunnable;
+// handles the completion of some shard-snapshot updates, performing the next possible actions
+private final ShardSnapshotUpdateCompletionHandler completionHandler;
+
+// whether to execute a reroute on completion
+private boolean needsReroute;
+
+// entries that became complete due to this batch of updates
+private final List<SnapshotsInProgress.Entry> newlyCompletedEntries = new ArrayList<>();

SnapshotShardsUpdateContext(
ClusterStateTaskExecutor.BatchExecutionContext<SnapshotTask> batchExecutionContext,
-Runnable rerouteRunnable
+ShardSnapshotUpdateCompletionHandler completionHandler
) {
this.batchExecutionContext = batchExecutionContext;
this.initialState = batchExecutionContext.initialState();
this.nodeIdRemovalPredicate = SnapshotsInProgress.get(initialState)::isNodeIdForRemoval;
-this.rerouteRunnable = new RunOnce(rerouteRunnable); // RunOnce to avoid enqueueing O(#shards) listeners
+this.completionHandler = completionHandler;
this.updatesByRepo = new HashMap<>();
for (final var taskContext : batchExecutionContext.taskContexts()) {
if (taskContext.getTask() instanceof ShardSnapshotUpdate task) {
@@ -3159,7 +3169,11 @@ SnapshotsInProgress computeUpdatedState() {
}
final List<SnapshotsInProgress.Entry> newEntries = new ArrayList<>(oldEntries.size());
for (SnapshotsInProgress.Entry entry : oldEntries) {
-newEntries.add(applyToEntry(entry, updates.getValue()));
+final var newEntry = applyToEntry(entry, updates.getValue());
+newEntries.add(newEntry);
+if (newEntry != entry && newEntry.state().completed()) {
+newlyCompletedEntries.add(newEntry);

Comment on lines +3170 to +3172 (Member):

I could use help to better understand two pieces of information.

  1. How come a shard update for one snapshot ends up updating the state of another snapshot?
  2. How come a ShardSnapshotUpdate processed unassigned shards before SnapshotsService#processExternalChanges did?

Using this particular failure as an example, I think the answer to the first question is the following sequence of calls when processing shard updates for snapshot-clone-11:

tryStartNextTaskAfterCloneUpdated(update.repoShardId, update.updatedState);
tryStartSnapshotAfterCloneFinish(repoShardId, updatedState.generation());
startShardSnapshot(repoShardId, generation);
final ShardSnapshotStatus shardSnapshotStatus = initShardSnapshotStatus(generation, shardRouting, nodeIdRemovalPredicate);

In initShardSnapshotStatus, we found out that the shard is unassigned and marked its state as MISSING, which in turn leads to snapshot-partial-12 being marked as SUCCESS.

Is this correct?

Assuming the above is more or less correct, now for question 2: I don't understand how the unassigned shard was not processed by processExternalChanges, which is called inside SnapshotsService#applyClusterState. This is where my lack of knowledge of cluster coordination hurts ... I thought cluster state update and apply run strictly one after the other, i.e. after a cluster state update, the apply runs before computing another update. In the regular case, node disconnection is handled by processExternalChanges, which ends snapshots based on the missing shards. How come ShardSnapshotUpdate sees the missing shard first in this case? You said

> while that update was in flight the node left the cluster so the other shard snapshots moved to state MISSING in the same update

This seems to say that while ShardSnapshotUpdate was running, the cluster state changed under it? Shard updates use a dedicated master task queue, so I assume they cannot be processed in the same batch as a node-left event? Sorry, this all sounds pretty silly. I must have some serious/key misunderstanding somewhere. I'd appreciate it if you could help clarify the sequence/order of events here. Thanks!

Comment (Contributor Author):

> In initShardSnapshotStatus, we found out that the shard is unassigned and marked its state as MISSING, which in turn leads to snapshot-partial-12 being marked as SUCCESS.

Correct.

> I don't understand how the unassigned shard was not processed by processExternalChanges

Because when processExternalChanges was called, [index-0][1] was still in state INIT in snapshot-clone-11, and clones don't run on the data nodes so they don't get cancelled on a node-left event, which means that the subsequent operations on [index-0][1] also remain unchanged (in state QUEUED). But then when that shard clone completes it moves to SUCCESS, and the later QUEUED states all move to MISSING via the process you describe.

Comment (Member):

We synced in a separate channel and it helped me understand why snapshot-partial-12 did not get finished during processExternalChanges: (1) the shard did not get marked as a knownFailure since it was part of a clone, and (2) snapshot-partial-12 at that time still had the shard as UNASSIGNED_QUEUED, which does not have a nodeId and hence does not respond to the node leaving.

+}
}
updated = updated.withUpdatedEntriesForRepo(repoName, newEntries);
}
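The discussion above turns on the identity check `newEntry != entry`: `applyToEntry` returns the same instance when a batch of updates leaves an entry untouched, so reference inequality plus `state().completed()` cheaply collects exactly the entries that this batch finished. A minimal, self-contained sketch of the idiom, using hypothetical stand-in types rather than the real SnapshotsInProgress classes:

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical stand-in for SnapshotsInProgress.Entry: immutable, and an
// update that changes nothing returns the identical instance.
record Entry(String snapshot, boolean completed) {
    Entry applyUpdate(String completedSnapshot) {
        return snapshot.equals(completedSnapshot) ? new Entry(snapshot, true) : this;
    }
}

class NewlyCompletedDemo {
    public static void main(String[] args) {
        List<Entry> oldEntries = List.of(new Entry("snap-1", false), new Entry("snap-2", false));
        List<Entry> newEntries = new ArrayList<>();
        List<Entry> newlyCompleted = new ArrayList<>();
        for (Entry entry : oldEntries) {
            Entry newEntry = entry.applyUpdate("snap-2");
            newEntries.add(newEntry);
            // Reference inequality: only entries this batch actually rebuilt qualify,
            // so entries that were already complete beforehand are not finalized twice.
            if (newEntry != entry && newEntry.completed()) {
                newlyCompleted.add(newEntry);
            }
        }
        System.out.println(newlyCompleted); // [Entry[snapshot=snap-2, completed=true]]
    }
}
```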
@@ -3178,12 +3192,21 @@
void completeWithUpdatedState(SnapshotsInProgress snapshotsInProgress) {
if (updatesByRepo.isEmpty() == false) {
final var result = new ShardSnapshotUpdateResult(initialState.metadata(), snapshotsInProgress);
-for (final var taskContext : batchExecutionContext.taskContexts()) {
-if (taskContext.getTask() instanceof ShardSnapshotUpdate task) {
-taskContext.success(() -> {
-rerouteRunnable.run();
-task.listener.onResponse(result);
-});
+try (
+var onCompletionRefs = new RefCountingRunnable(
+() -> completionHandler.handleCompletion(result, needsReroute, newlyCompletedEntries, updatesByRepo.keySet())
+)
+) {
+for (final var taskContext : batchExecutionContext.taskContexts()) {
+if (taskContext.getTask() instanceof ShardSnapshotUpdate task) {
+final var ref = onCompletionRefs.acquire();
+needsReroute = true;

Comment (Member):
Nit: I think needsReroute must be true now since this is inside the updatesByRepo.isEmpty() == false check. We may not even need it, since completionHandler is only called when there is some update.

Comment (Contributor Author):

Ah nice, good point. I found that flag irksome indeed, dropped in a0db49e.

+taskContext.success(() -> {
+try (ref) {
+task.listener.onResponse(result);
+}
+});
+}
}
}
}
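The try-with-resources block above is the heart of the fix: each task listener holds a ref, and the completion handler runs exactly once, only after every listener in the batch has been invoked. Below is a rough sketch of the ref-counting idea, as an illustrative re-implementation under assumed semantics rather than Elasticsearch's actual RefCountingRunnable:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

// A Closeable that does not throw, so it works cleanly in try-with-resources.
interface Releasable extends AutoCloseable {
    @Override
    void close();
}

// Illustrative stand-in for org.elasticsearch.action.support.RefCountingRunnable.
class RefCounting implements Releasable {
    private final AtomicInteger refs = new AtomicInteger(1); // one ref held by the opener
    private final Runnable onCompletion;

    RefCounting(Runnable onCompletion) {
        this.onCompletion = onCompletion;
    }

    Releasable acquire() {
        refs.incrementAndGet();
        return this::release;
    }

    private void release() {
        if (refs.decrementAndGet() == 0) {
            onCompletion.run(); // runs exactly once, after the opener and every acquirer release
        }
    }

    @Override
    public void close() {
        release();
    }
}

class DeferredCompletionDemo {
    public static void main(String[] args) {
        List<Runnable> listeners = new ArrayList<>();
        try (RefCounting refCounting = new RefCounting(() -> System.out.println("completion handler"))) {
            for (int i = 0; i < 3; i++) {
                Releasable ref = refCounting.acquire(); // one ref per task listener
                int task = i;
                listeners.add(() -> {
                    try (ref) {
                        System.out.println("listener " + task);
                    }
                });
            }
        } // the opener's ref is released here, but three listener refs remain
        listeners.forEach(Runnable::run); // "completion handler" prints after listener 2
    }
}
```

This also shows why the old RunOnce-based reroute could be dropped: instead of enqueueing one runnable per shard update and deduplicating, the whole batch shares a single completion action that fires after the last listener releases its ref.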
@@ -3432,6 +3455,39 @@ private ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shardsBuilder() {
*/
record ShardSnapshotUpdateResult(Metadata metadata, SnapshotsInProgress snapshotsInProgress) {}

+interface ShardSnapshotUpdateCompletionHandler {
+void handleCompletion(
+ShardSnapshotUpdateResult shardSnapshotUpdateResult,
+boolean needsReroute,
+List<SnapshotsInProgress.Entry> newlyCompletedEntries,
+Set<String> updatedRepositories
+);
+}
+
+private void handleShardSnapshotUpdateCompletion(
+ShardSnapshotUpdateResult shardSnapshotUpdateResult,
+boolean needsReroute,
+List<SnapshotsInProgress.Entry> newlyCompletedEntries,
+Set<String> updatedRepositories
+) {
+// Maybe this state update completed one or more snapshots. If we are not already ending them because of some earlier update, end
+// them now.
+final var snapshotsInProgress = shardSnapshotUpdateResult.snapshotsInProgress();
+for (final var newlyCompletedEntry : newlyCompletedEntries) {
+if (endingSnapshots.contains(newlyCompletedEntry.snapshot()) == false) {
+endSnapshot(newlyCompletedEntry, shardSnapshotUpdateResult.metadata(), null);
+}
+}
+// Likewise this state update may enable some new shard clones on any affected repository, so check them all.
+for (final var updatedRepository : updatedRepositories) {
+startExecutableClones(snapshotsInProgress, updatedRepository);
+}
+// Also shard snapshot completions may free up some shards to move to other nodes so we must trigger a reroute.
+if (needsReroute) {
+rerouteService.reroute("after shards snapshot update", Priority.NORMAL, ActionListener.noop());
+}
+}

/**
* An update to the snapshot state of a shard.
*
@@ -3511,23 +3567,13 @@ private void innerUpdateSnapshotState(
ShardSnapshotStatus updatedState,
ActionListener<Void> listener
) {
-var update = new ShardSnapshotUpdate(snapshot, shardId, repoShardId, updatedState, listener.delegateFailure((delegate, result) -> {
-try {
-delegate.onResponse(null);
-} finally {
-// Maybe this state update completed the snapshot. If we are not already ending it because of a concurrent
-// state update we check if its state is completed and end it if it is.
-final SnapshotsInProgress snapshotsInProgress = result.snapshotsInProgress();
-if (endingSnapshots.contains(snapshot) == false) {
-final SnapshotsInProgress.Entry updatedEntry = snapshotsInProgress.snapshot(snapshot);
-// If the entry is still in the cluster state and is completed, try finalizing the snapshot in the repo
-if (updatedEntry != null && updatedEntry.state().completed()) {
-endSnapshot(updatedEntry, result.metadata(), null);
-}
-}
-startExecutableClones(snapshotsInProgress, snapshot.getRepository());
-}
-}));
+var update = new ShardSnapshotUpdate(
+snapshot,
+shardId,
+repoShardId,
+updatedState,
+listener.delegateFailure((delegate, result) -> delegate.onResponse(null))
+);
logger.trace("received updated snapshot restore state [{}]", update);
masterServiceTaskQueue.submitTask("update snapshot state", update, null);
}
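With the completion logic moved to the batch-level handler, the per-update listener only acknowledges the update. For readers unfamiliar with the delegateFailure idiom, here is a rough sketch of its contract on a hypothetical minimal listener type (the real ActionListener API has more to it): failures pass through to the wrapped listener unchanged, while responses are handed to the given consumer together with that listener.

```java
import java.util.function.BiConsumer;

// Hypothetical minimal stand-in for org.elasticsearch.action.ActionListener.
interface Listener<R> {
    void onResponse(R response);

    void onFailure(Exception e);

    // Wraps this listener: successes go through the consumer, failures pass straight through.
    default <T> Listener<T> delegateFailure(BiConsumer<Listener<R>, T> onSuccess) {
        Listener<R> delegate = this;
        return new Listener<T>() {
            @Override
            public void onResponse(T response) {
                onSuccess.accept(delegate, response);
            }

            @Override
            public void onFailure(Exception e) {
                delegate.onFailure(e);
            }
        };
    }
}

class DelegateDemo {
    public static void main(String[] args) {
        Listener<Void> ack = new Listener<Void>() {
            @Override
            public void onResponse(Void response) {
                System.out.println("acknowledged");
            }

            @Override
            public void onFailure(Exception e) {
                System.out.println("failed: " + e.getMessage());
            }
        };
        // Adapts a String-typed result into the Void acknowledgement, discarding the result.
        Listener<String> update = ack.delegateFailure((delegate, result) -> delegate.onResponse(null));
        update.onResponse("some-result"); // prints "acknowledged"
    }
}
```

So `listener.delegateFailure((delegate, result) -> delegate.onResponse(null))` adapts the `ShardSnapshotUpdateResult` produced by the task executor into the plain `Void` acknowledgement the caller expects, discarding the result itself.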
@@ -3807,7 +3853,7 @@ public ClusterState execute(BatchExecutionContext<SnapshotTask> batchExecutionCo
final ClusterState state = batchExecutionContext.initialState();
final SnapshotShardsUpdateContext shardsUpdateContext = new SnapshotShardsUpdateContext(
batchExecutionContext,
-() -> rerouteService.reroute("after shards snapshot update", Priority.NORMAL, ActionListener.noop())
+shardSnapshotUpdateCompletionHandler
);
final SnapshotsInProgress initialSnapshots = SnapshotsInProgress.get(state);
SnapshotsInProgress snapshotsInProgress = shardsUpdateContext.computeUpdatedState();
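Putting the pieces together: each batch of ShardSnapshotUpdate tasks now produces one cluster-state update and one handler invocation covering finalization, clone starts, and the reroute. A small sketch of that wiring, with hypothetical names standing in for the master-service plumbing:

```java
import java.util.List;
import java.util.Set;

// Hypothetical functional interface mirroring ShardSnapshotUpdateCompletionHandler.
@FunctionalInterface
interface CompletionHandler {
    void handleCompletion(List<String> newlyCompletedSnapshots, Set<String> updatedRepositories, boolean needsReroute);
}

class BatchWiringDemo {
    // Bound once to a method reference, like the new shardSnapshotUpdateCompletionHandler field.
    private final CompletionHandler handler = this::handleCompletion;

    private void handleCompletion(List<String> newlyCompletedSnapshots, Set<String> updatedRepositories, boolean needsReroute) {
        newlyCompletedSnapshots.forEach(s -> System.out.println("finalize " + s));
        updatedRepositories.forEach(r -> System.out.println("start executable clones in " + r));
        if (needsReroute) {
            System.out.println("reroute once for the whole batch");
        }
    }

    void afterBatch() {
        // One call per task batch, rather than one runnable per shard update.
        handler.handleCompletion(List.of("snap-2"), Set.of("repo-1"), true);
    }

    public static void main(String[] args) {
        new BatchWiringDemo().afterBatch();
    }
}
```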