Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wait for prewarm when relocating searchable snapshot shards #65531

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
74eae5c
Wait for Prewarm when Relocating Searchable Snapshot Shards
original-brownbear Nov 26, 2020
e688e74
Merge remote-tracking branch 'elastic/master' into wait-for-prewarm
original-brownbear Nov 29, 2020
e57db07
Merge remote-tracking branch 'elastic/master' into wait-for-prewarm
original-brownbear Nov 30, 2020
f489058
add test
original-brownbear Nov 30, 2020
95dadb2
better test
original-brownbear Nov 30, 2020
d5ee3f3
way better test
original-brownbear Nov 30, 2020
fa8dea6
reformat nicer
original-brownbear Nov 30, 2020
eccb1b4
Merge remote-tracking branch 'elastic/master' into wait-for-prewarm
original-brownbear Nov 30, 2020
a428a8b
start
original-brownbear Nov 30, 2020
472fa1e
Merge remote-tracking branch 'elastic/master' into wait-for-prewarm-o…
original-brownbear Nov 30, 2020
fc01ad4
Merge remote-tracking branch 'elastic/master' into wait-for-prewarm-o…
original-brownbear Dec 1, 2020
48c55f5
bck
original-brownbear Dec 1, 2020
40e18cd
works nicely
original-brownbear Dec 1, 2020
9427179
Merge remote-tracking branch 'elastic/master' into wait-for-prewarm-o…
original-brownbear Dec 2, 2020
ed4e8c9
fixes
original-brownbear Dec 2, 2020
59ab566
fix liveness check disabling
original-brownbear Dec 2, 2020
566884e
fix comment
original-brownbear Dec 2, 2020
3b41f93
much simpler
original-brownbear Dec 2, 2020
43816f5
cs
original-brownbear Dec 2, 2020
714e6c1
Merge remote-tracking branch 'elastic/master' into wait-for-prewarm
original-brownbear Dec 2, 2020
86e9ebb
adjust broken test
original-brownbear Dec 2, 2020
d44b120
shorter
original-brownbear Dec 2, 2020
ef5032e
Merge remote-tracking branch 'elastic/master' into wait-for-prewarm
original-brownbear Dec 9, 2020
b9063b5
adjust tests
original-brownbear Dec 9, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 88 additions & 5 deletions server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.concurrent.AbstractRefCounted;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.AsyncIOProcessor;
import org.elasticsearch.common.util.concurrent.RunOnce;
Expand Down Expand Up @@ -376,6 +377,9 @@ public boolean shouldCache(Query query) {
persistMetadata(path, indexSettings, shardRouting, null, logger);
this.useRetentionLeasesInPeerRecovery = replicationTracker.hasAllPeerRecoveryRetentionLeases();
this.refreshPendingLocationListener = new RefreshPendingLocationListener();
if (shardRouting.isRelocationTarget()) {
relocationCondition = new RelocationCondition(shardRouting);
}
}

public ThreadPool getThreadPool() {
Expand Down Expand Up @@ -457,6 +461,38 @@ public QueryCachingPolicy getQueryCachingPolicy() {
return cachingPolicy;
}

/**
* A ref counter that can be used to delay primary relocation handoff via {@link #createRelocationDependency()}.
*/
private final class RelocationCondition extends AbstractRefCounted {

private Runnable asyncActivation;

RelocationCondition(ShardRouting routing) {
super("relocation condition for [" + routing.shardId() + "][" + routing.allocationId() + "]");
}

@Override
protected void closeInternal() {
synchronized (this) {
if (asyncActivation != null) {
threadPool.generic().execute(asyncActivation);
}
}
}

// Set the relocation context when receiving it and execute the handoff right away if no more conditions are waiting or create a
// Runnable to execute once all conditions have finished
void receivePrimaryContext(ReplicationTracker.PrimaryContext primaryContext, ActionListener<Void> listener) {
synchronized (this) {
if (decRef()) {
doActivateWithPrimaryContext(primaryContext, listener);
original-brownbear marked this conversation as resolved.
Show resolved Hide resolved
} else {
asyncActivation = () -> doActivateWithPrimaryContext(primaryContext, listener);
}
}
}
}

@Override
public void updateShardState(final ShardRouting newRouting,
Expand Down Expand Up @@ -604,6 +640,14 @@ public void onFailure(Exception e) {
}, null);
}
}
if (newRouting.isRelocationTarget()) {
if (currentRouting.isRelocationTarget() == false) {
assert relocationCondition == null : "Found relocation condition even though there shouldn't be one";
relocationCondition = new RelocationCondition(newRouting);
}
} else {
relocationCondition = null;
}
// set this last, once we finished updating all internal state.
this.shardRouting = newRouting;

Expand Down Expand Up @@ -2409,23 +2453,62 @@ assert state() != IndexShardState.POST_RECOVERY && state() != IndexShardState.ST
replicationTracker.updateGlobalCheckpointOnReplica(globalCheckpoint, reason);
}

private RelocationCondition relocationCondition;

/**
 * Registers a condition that must be released before an incoming primary relocation to this shard can complete.
 * The hand-off from the relocation source, delivered via
 * {@code activateWithPrimaryContext}, is deferred until the returned {@link Runnable} has been executed.
 *
 * NOTE(review): the previous javadoc linked {@code activateThrottling()} here, which looks like a copy-paste
 * mistake — the hand-off is gated through {@code activateWithPrimaryContext}, not throttling. Please confirm.
 *
 * @return listener that must be resolved before primary relocation to this shard can complete
 */
public Runnable createRelocationDependency() {
    assert assertRelocationTarget();
    logger.trace("adding relocation condition for [{}]", shardRouting);
    // read the current condition under the mutex; the ref-count operations then happen outside the lock
    final RelocationCondition condition;
    synchronized (mutex) {
        condition = this.relocationCondition;
    }
    condition.incRef();
    return condition::decRef;
}

/**
* Updates the known allocation IDs and the local checkpoints for the corresponding allocations from a primary relocation source.
*
* @param primaryContext the sequence number context
*/
public void activateWithPrimaryContext(final ReplicationTracker.PrimaryContext primaryContext) {
assert shardRouting.primary() && shardRouting.isRelocationTarget() :
"only primary relocation target can update allocation IDs from primary context: " + shardRouting;
public void activateWithPrimaryContext(ReplicationTracker.PrimaryContext primaryContext, ActionListener<Void> listener) {
assert assertRelocationTarget();
assert primaryContext.getCheckpointStates().containsKey(routingEntry().allocationId().getId()) :
"primary context [" + primaryContext + "] does not contain relocation target [" + routingEntry() + "]";
assert getLocalCheckpoint() == primaryContext.getCheckpointStates().get(routingEntry().allocationId().getId())
.getLocalCheckpoint() || indexSettings().getTranslogDurability() == Translog.Durability.ASYNC :
"local checkpoint [" + getLocalCheckpoint() + "] does not match checkpoint from primary context [" + primaryContext + "]";
final RelocationCondition condition;
synchronized (mutex) {
replicationTracker.activateWithPrimaryContext(primaryContext); // make changes to primaryMode flag only under mutex
condition = relocationCondition;
}
ensurePeerRecoveryRetentionLeasesExist();
condition.receivePrimaryContext(primaryContext, listener);
}

// Performs the actual hand-off: activates the replication tracker with the primary context received from the
// relocation source and ensures peer recovery retention leases exist, notifying the listener on completion or failure.
private void doActivateWithPrimaryContext(ReplicationTracker.PrimaryContext primaryContext, ActionListener<Void> listener) {
    try {
        synchronized (mutex) {
            // make changes to primaryMode flag only under mutex
            replicationTracker.activateWithPrimaryContext(primaryContext);
        }
        ensurePeerRecoveryRetentionLeasesExist();
    } catch (Exception e) {
        listener.onFailure(e);
        return;
    }
    listener.onResponse(null);
}

// Asserts that this shard is currently the target of a primary relocation; always returns true so the helper can be
// invoked from within an assert statement.
private boolean assertRelocationTarget() {
    final boolean primaryRelocationTarget = shardRouting.primary() && shardRouting.isRelocationTarget();
    assert primaryRelocationTarget :
        "only primary relocation target can update allocation IDs from primary context: " + shardRouting;
    return true;
}

private void ensurePeerRecoveryRetentionLeasesExist() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import org.elasticsearch.common.CheckedFunction;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
Expand Down Expand Up @@ -68,6 +70,8 @@
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;

Expand Down Expand Up @@ -312,16 +316,23 @@ class HandoffPrimaryContextRequestHandler implements TransportRequestHandler<Rec
public void messageReceived(final RecoveryHandoffPrimaryContextRequest request, final TransportChannel channel,
Task task) throws Exception {
final RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId());
final List<Releasable> toRelease = new ArrayList<>(2);
toRelease.add(recoveryRef::close);
boolean success = false;
try {
// Due to relocation conditions on the shard it could take a while for the hand-off to complete so we disable the recovery
// monitor since we don't expect any transport messages from master for the duration of the handoff and activate it again
// after the handoff.
final Releasable disabledMonitor = recoveryRef.target().disableRecoveryMonitor();
Copy link
Member Author

@original-brownbear original-brownbear Nov 26, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit of a BwC issue I guess. If the hand-off request comes from 7.10 and doesn't wait indefinitely yet then it could timeout on the primary I guess but maybe we can just ignore it since it's so fringe?

toRelease.add(disabledMonitor);
recoveryRef.target().handoffPrimaryContext(request.primaryContext(),
ActionListener.runBefore(ActionListener.map(
new ChannelActionListener<>(channel, Actions.HANDOFF_PRIMARY_CONTEXT, request),
v -> TransportResponse.Empty.INSTANCE), recoveryRef::close));
v -> TransportResponse.Empty.INSTANCE), () -> Releasables.close(toRelease)));
success = true;
} finally {
if (success == false) {
recoveryRef.close();
Releasables.close(toRelease);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -275,16 +275,20 @@ protected void doRun() throws Exception {
logger.trace("[monitor] no status found for [{}], shutting down", recoveryId);
return;
}
long accessTime = status.lastAccessTime();
if (accessTime == lastSeenAccessTime) {
String message = "no activity after [" + checkInterval + "]";
failRecovery(recoveryId,
new RecoveryFailedException(status.state(), message, new ElasticsearchTimeoutException(message)),
true // to be safe, we don't know what go stuck
);
return;
if (status.isRecoveryMonitorEnabled()) {
long accessTime = status.lastAccessTime();
if (accessTime == lastSeenAccessTime) {
String message = "no activity after [" + checkInterval + "]";
failRecovery(recoveryId,
new RecoveryFailedException(status.state(), message, new ElasticsearchTimeoutException(message)),
true // to be safe, we don't know what got stuck
);
return;
}
lastSeenAccessTime = accessTime;
} else {
lastSeenAccessTime = System.nanoTime();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it might be simpler to just fake the progress inside RecoveryTarget by returning System.nanoTime()?

}
lastSeenAccessTime = accessTime;
logger.trace("[monitor] rescheduling check for [{}]. last access time is [{}]", recoveryId, lastSeenAccessTime);
threadPool.schedule(this, checkInterval, ThreadPool.Names.GENERIC);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.util.CancellableThreads;
Expand Down Expand Up @@ -84,6 +85,8 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
// last time this status was accessed
private volatile long lastAccessTime = System.nanoTime();

private volatile boolean recoveryMonitorEnabled = true;

// latch that can be used to blockingly wait for RecoveryTarget to be closed
private final CountDownLatch closedLatch = new CountDownLatch(1);

Expand Down Expand Up @@ -161,6 +164,26 @@ public void setLastAccessTime() {
lastAccessTime = System.nanoTime();
}

/**
* Set flag to signal to {@link org.elasticsearch.indices.recovery.RecoveriesCollection.RecoveryMonitor} that it must not cancel this
* recovery temporarily. This is used by the primary relocation mechanism to avoid recovery failure in case a long running relocation
* condition was added to the shard via {@link IndexShard#createRelocationDependency()}.
*
* @return releasable that once closed will re-enable liveness checks by the recovery monitor
*/
public Releasable disableRecoveryMonitor() {
assert recoveryMonitorEnabled : "recovery monitor already disabled";
recoveryMonitorEnabled = false;
return () -> {
setLastAccessTime();
recoveryMonitorEnabled = true;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a little low-tech relative to the tricky ref-counting in the IndexShard. I figured this was ok here since the hand-off request only comes in once (at least judging by the assertions we have in IndexShard) while the other API has a more general, multi-caller feel to it and there are no hard guarantees on the index shard state listener only being invoked once (though the "loaded" flag on the directory effectively guarantees we only add one condition for now) and it wasn't that much extra effort since the API was supposed to be non-blocking anyway.

};
}

// Whether the RecoveryMonitor may currently fail this recovery for inactivity; see #disableRecoveryMonitor().
public boolean isRecoveryMonitorEnabled() {
    return recoveryMonitorEnabled;
}

public Store store() {
ensureRefCount();
return store;
Expand Down Expand Up @@ -332,10 +355,7 @@ private boolean hasUncommittedOperations() throws IOException {

@Override
public void handoffPrimaryContext(final ReplicationTracker.PrimaryContext primaryContext, ActionListener<Void> listener) {
ActionListener.completeWith(listener, () -> {
indexShard.activateWithPrimaryContext(primaryContext);
return null;
});
indexShard.activateWithPrimaryContext(primaryContext, listener);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ protected final boolean assertCurrentThreadMayLoadSnapshot() {
*
* @return true if the snapshot was loaded by executing this method, false otherwise
*/
public boolean loadSnapshot(RecoveryState recoveryState) {
public boolean loadSnapshot(RecoveryState recoveryState, ActionListener<Void> preWarmListener) {
assert recoveryState != null;
assert recoveryState instanceof SearchableSnapshotRecoveryState;
assert recoveryState.getRecoverySource().getType() == RecoverySource.Type.SNAPSHOT
Expand All @@ -214,7 +214,7 @@ public boolean loadSnapshot(RecoveryState recoveryState) {
this.loaded = true;
cleanExistingRegularShardFiles();
this.recoveryState = (SearchableSnapshotRecoveryState) recoveryState;
prewarmCache();
prewarmCache(preWarmListener);
}
}
}
Expand Down Expand Up @@ -414,19 +414,20 @@ private void cleanExistingRegularShardFiles() {
}
}

private void prewarmCache() {
private void prewarmCache(ActionListener<Void> listener) {
if (prewarmCache == false) {
recoveryState.setPreWarmComplete();
listener.onResponse(null);
return;
}

final BlockingQueue<Tuple<ActionListener<Void>, CheckedRunnable<Exception>>> queue = new LinkedBlockingQueue<>();
final Executor executor = prewarmExecutor();

final GroupedActionListener<Void> completionListener = new GroupedActionListener<>(
ActionListener.wrap(voids -> recoveryState.setPreWarmComplete(), e -> {}), // Ignore pre-warm errors
snapshot().totalFileCount()
);
final GroupedActionListener<Void> completionListener = new GroupedActionListener<>(ActionListener.wrap(voids -> {
recoveryState.setPreWarmComplete();
listener.onResponse(null);
}, listener::onFailure), snapshot().totalFileCount());

for (BlobStoreIndexShardSnapshot.FileInfo file : snapshot().indexFiles()) {
if (file.metadata().hashEqualsContents() || isExcludedFromCache(file.physicalName())) {
Expand All @@ -448,11 +449,11 @@ private void prewarmCache() {
fileCompletionListener.whenComplete(voids -> input.close(), e -> IOUtils.closeWhileHandlingException(input));
fileCompletionListener.whenComplete(voids -> completionListener.onResponse(null), completionListener::onFailure);

final GroupedActionListener<Void> listener = new GroupedActionListener<>(fileCompletionListener, numberOfParts);
final GroupedActionListener<Void> partsListener = new GroupedActionListener<>(fileCompletionListener, numberOfParts);

for (int p = 0; p < numberOfParts; p++) {
final int part = p;
queue.add(Tuple.tuple(listener, () -> {
queue.add(Tuple.tuple(partsListener, () -> {
ensureOpen();

logger.trace("{} warming cache for [{}] part [{}/{}]", shardId, file.physicalName(), part + 1, numberOfParts);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@
*/
package org.elasticsearch.xpack.searchablesnapshots;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.index.SegmentInfos;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.cluster.routing.RecoverySource;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.shard.IndexEventListener;
Expand All @@ -23,6 +28,8 @@

public class SearchableSnapshotIndexEventListener implements IndexEventListener {

private static final Logger logger = LogManager.getLogger(SearchableSnapshotIndexEventListener.class);

@Override
public void beforeIndexShardRecovery(IndexShard indexShard, IndexSettings indexSettings) {
assert Thread.currentThread().getName().contains(ThreadPool.Names.GENERIC);
Expand All @@ -33,8 +40,23 @@ public void beforeIndexShardRecovery(IndexShard indexShard, IndexSettings indexS
private static void ensureSnapshotIsLoaded(IndexShard indexShard) {
final SearchableSnapshotDirectory directory = SearchableSnapshotDirectory.unwrapDirectory(indexShard.store().directory());
assert directory != null;

final boolean success = directory.loadSnapshot(indexShard.recoveryState());
final StepListener<Void> preWarmListener = new StepListener<>();
final boolean success = directory.loadSnapshot(indexShard.recoveryState(), preWarmListener);
final ShardRouting shardRouting = indexShard.routingEntry();
if (success && shardRouting.isRelocationTarget()) {
final Runnable preWarmCondition = indexShard.createRelocationDependency();
preWarmListener.whenComplete(v -> preWarmCondition.run(), e -> {
logger.warn(
new ParameterizedMessage(
"pre-warm operation failed for [{}] while it was the target of primary relocation [{}]",
shardRouting.shardId(),
shardRouting
),
e
);
preWarmCondition.run();
});
}
assert directory.listAll().length > 0 : "expecting directory listing to be non-empty";
assert success
|| indexShard.routingEntry()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.elasticsearch.Version;
import org.elasticsearch.action.support.PlainActionFuture;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.RecoverySource;
import org.elasticsearch.cluster.routing.ShardRouting;
Expand Down Expand Up @@ -654,7 +655,9 @@ protected IndexInputStats createIndexInputStats(long fileLength) {
);
DiscoveryNode targetNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), Version.CURRENT);
RecoveryState recoveryState = new SearchableSnapshotRecoveryState(shardRouting, targetNode, null);
final boolean loaded = directory.loadSnapshot(recoveryState);
final PlainActionFuture<Void> future = PlainActionFuture.newFuture();
final boolean loaded = directory.loadSnapshot(recoveryState, future);
future.get();
assertThat("Failed to load snapshot", loaded, is(true));
assertThat("Snapshot should be loaded", directory.snapshot(), notNullValue());
assertThat("BlobContainer should be loaded", directory.blobContainer(), notNullValue());
Expand Down
Loading