elastic · bleskes · Nov 30, 2017 · Nov 28, 2017 · Nov 28, 2017 · Nov 28, 2017
diff --git a/core/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/core/src/main/java/org/elasticsearch/index/shard/IndexShard.java
@@ -422,7 +422,13 @@ public void updateShardState(final ShardRouting newRouting,
                     final DiscoveryNode recoverySourceNode = recoveryState.getSourceNode();
                     if (currentRouting.isRelocationTarget() == false || recoverySourceNode.getVersion().before(Version.V_6_0_0_alpha1)) {
                         // there was no primary context hand-off in < 6.0.0, need to manually activate the shard
-                        getEngine().seqNoService().activatePrimaryMode(getEngine().seqNoService().getLocalCheckpoint());
+                        final Engine engine = getEngine();
+                        engine.seqNoService().activatePrimaryMode(getEngine().seqNoService().getLocalCheckpoint());
+                        // Flush the translog as it may contain operations with no sequence numbers. We want to make sure those
+                        // operations will never be replayed as part of peer recovery to avoid an arbitrary mixture of operations with seq#
+                        // (due to active indexing) and operations without a seq# coming from the translog. We therefore flush
+                        // to create a lucene commit point to an empty translog file.
+                        engine.flush(false, true);
                     }
                 }
 
@@ -487,15 +493,26 @@ public void updateShardState(final ShardRouting newRouting,
                                  * subsequently fails before the primary/replica re-sync completes successfully and we are now being
                                  * promoted, the local checkpoint tracker here could be left in a state where it would re-issue sequence
                                  * numbers. To ensure that this is not the case, we restore the state of the local checkpoint tracker by
-                                 * replaying the translog and marking any operations there are completed. Rolling the translog generation is
-                                 * not strictly needed here (as we will never have collisions between sequence numbers in a translog
-                                 * generation in a new primary as it takes the last known sequence number as a starting point), but it
-                                 * simplifies reasoning about the relationship between primary terms and translog generations.
+                                 * replaying the translog and marking any operations there as completed.
                                  */
-                                getEngine().rollTranslogGeneration();
-                                getEngine().restoreLocalCheckpointFromTranslog();
-                                getEngine().fillSeqNoGaps(newPrimaryTerm);
-                                getEngine().seqNoService().updateLocalCheckpointForShard(currentRouting.allocationId().getId(),
+                                final Engine engine = getEngine();
+                                engine.restoreLocalCheckpointFromTranslog();
+                                if (indexSettings.getIndexVersionCreated().onOrBefore(Version.V_6_0_0_alpha1)) {
+                                    // an index that was created before sequence numbers were introduced may contain operations in its
+                                    // translog that do not have a sequence number. We want to make sure those operations will never
+                                    // be replayed as part of peer recovery to avoid an arbitrary mixture of operations with seq# (due
+                                    // to active indexing) and operations without a seq# coming from the translog. We therefore flush
+                                    // to create a lucene commit point to an empty translog file.
+                                    engine.flush(false, true);
+                                }
+                                /* Rolling the translog generation is not strictly needed here (as we will never have collisions between
+                                 * sequence numbers in a translog generation in a new primary as it takes the last known sequence number
+                                 * as a starting point), but it simplifies reasoning about the relationship between primary terms and
+                                 * translog generations.
+                                 */
+                                engine.rollTranslogGeneration();
+                                engine.fillSeqNoGaps(newPrimaryTerm);
+                                engine.seqNoService().updateLocalCheckpointForShard(currentRouting.allocationId().getId(),
                                     getEngine().seqNoService().getLocalCheckpoint());
                                 primaryReplicaSyncer.accept(this, new ActionListener<ResyncTask>() {
                                     @Override

diff --git a/core/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java b/core/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java
@@ -148,23 +148,26 @@ public RecoveryResponse recoverToTarget() throws IOException {
             final Translog translog = shard.getTranslog();
 
             final long startingSeqNo;
+            final long requiredSeqNoRangeStart;
             final boolean isSequenceNumberBasedRecoveryPossible = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO &&
                 isTargetSameHistory() && isTranslogReadyForSequenceNumberBasedRecovery();
-
             if (isSequenceNumberBasedRecoveryPossible) {
                 logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
                 startingSeqNo = request.startingSeqNo();
+                requiredSeqNoRangeStart = startingSeqNo;
             } else {
                 final Engine.IndexCommitRef phase1Snapshot;
                 try {
                     phase1Snapshot = shard.acquireIndexCommit(false);
                 } catch (final Exception e) {
                     throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
                 }
-                // we set this to unassigned to create a translog roughly according to the retention policy
-                // on the target
-                startingSeqNo = SequenceNumbers.UNASSIGNED_SEQ_NO;
-
+                // we set this to 0 to create a translog roughly according to the retention policy
+                // on the target. Note that it will still filter out legacy operations with no sequence numbers
+                startingSeqNo = 0;
+                // but we must have everything above the local checkpoint in the commit
+                requiredSeqNoRangeStart =
+                    Long.parseLong(phase1Snapshot.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1;
                 try {
                     phase1(phase1Snapshot.getIndexCommit(), translog::totalOperations);
                 } catch (final Exception e) {
@@ -177,6 +180,9 @@ public RecoveryResponse recoverToTarget() throws IOException {
                     }
                 }
             }
+            assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
+            assert requiredSeqNoRangeStart >= startingSeqNo : "requiredSeqNoRangeStart [" + requiredSeqNoRangeStart + "] is lower than ["
+                + startingSeqNo + "]";
 
             runUnderPrimaryPermit(() -> shard.initiateTracking(request.targetAllocationId()));
 
@@ -186,10 +192,19 @@ public RecoveryResponse recoverToTarget() throws IOException {
                 throw new RecoveryEngineException(shard.shardId(), 1, "prepare target for translog failed", e);
             }
 
+            final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
+            /*
+             * We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all
+             * operations in the required range will be available for replaying from the translog of the source.
+             */
+            cancellableThreads.execute(() -> shard.waitForOpsToComplete(endingSeqNo));
+
+            logger.trace("all operations up to [{}] completed, which will be used as an ending sequence number", endingSeqNo);
+
             logger.trace("snapshot translog for recovery; current size is [{}]", translog.estimateTotalOperationsFromMinSeq(startingSeqNo));
             final long targetLocalCheckpoint;
             try(Translog.Snapshot snapshot = translog.newSnapshotFromMinSeqNo(startingSeqNo)) {
-                targetLocalCheckpoint = phase2(startingSeqNo, snapshot);
+                targetLocalCheckpoint = phase2(startingSeqNo, requiredSeqNoRangeStart, endingSeqNo, snapshot);
             } catch (Exception e) {
                 throw new RecoveryEngineException(shard.shardId(), 2, "phase2 failed", e);
             }
@@ -223,26 +238,19 @@ private void runUnderPrimaryPermit(CancellableThreads.Interruptable runnable) {
 
     /**
      * Determines if the source translog is ready for a sequence-number-based peer recovery. The main condition here is that the source
-     * translog contains all operations between the local checkpoint on the target and the current maximum sequence number on the source.
+     * translog contains all operations above the local checkpoint on the target. We already know that the translog contains or will contain
+     * all ops above the source local checkpoint, so we can stop checking there.
      *
      * @return {@code true} if the source is ready for a sequence-number-based recovery
      * @throws IOException if an I/O exception occurred reading the translog snapshot
      */
     boolean isTranslogReadyForSequenceNumberBasedRecovery() throws IOException {
         final long startingSeqNo = request.startingSeqNo();
         assert startingSeqNo >= 0;
-        final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
-        logger.trace("testing sequence numbers in range: [{}, {}]", startingSeqNo, endingSeqNo);
+        final long localCheckpoint = shard.getLocalCheckpoint();
+        logger.trace("testing sequence numbers in range: [{}, {}]", startingSeqNo, localCheckpoint);
         // the start recovery request is initialized with the starting sequence number set to the target shard's local checkpoint plus one
-        if (startingSeqNo - 1 <= endingSeqNo) {
-            /*
-             * We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all
-             * operations in the required range will be available for replaying from the translog of the source.
-             */
-            cancellableThreads.execute(() -> shard.waitForOpsToComplete(endingSeqNo));
-
-            logger.trace("all operations up to [{}] completed, checking translog content", endingSeqNo);
-
+        if (startingSeqNo - 1 <= localCheckpoint) {
             final LocalCheckpointTracker tracker = new LocalCheckpointTracker(startingSeqNo, startingSeqNo - 1);
             try (Translog.Snapshot snapshot = shard.getTranslog().newSnapshotFromMinSeqNo(startingSeqNo)) {
                 Translog.Operation operation;
@@ -252,7 +260,7 @@ boolean isTranslogReadyForSequenceNumberBasedRecovery() throws IOException {
                     }
                 }
             }
-            return tracker.getCheckpoint() >= endingSeqNo;
+            return tracker.getCheckpoint() >= localCheckpoint;
         } else {
             return false;
         }
@@ -434,22 +442,25 @@ void prepareTargetForTranslog(final int totalTranslogOps) throws IOException {
      *
      * @param startingSeqNo the sequence number to start recovery from, or {@link SequenceNumbers#UNASSIGNED_SEQ_NO} if all
      *                      ops should be sent
+     * @param requiredSeqNoRangeStart the lower sequence number of the required range (ending with endingSeqNo)
+     * @param endingSeqNo   the highest sequence number that should be sent
      * @param snapshot      a snapshot of the translog
-     *
      * @return the local checkpoint on the target
      */
-    long phase2(final long startingSeqNo, final Translog.Snapshot snapshot) throws IOException {
+    long phase2(final long startingSeqNo, long requiredSeqNoRangeStart, long endingSeqNo, final Translog.Snapshot snapshot)
+        throws IOException {
         if (shard.state() == IndexShardState.CLOSED) {
             throw new IndexShardClosedException(request.shardId());
         }
         cancellableThreads.checkForCancel();
 
         final StopWatch stopWatch = new StopWatch().start();
 
-        logger.trace("recovery [phase2]: sending transaction log operations");
+        logger.trace("recovery [phase2]: sending transaction log operations (seq# from [" +  startingSeqNo  + "], " +
+            "required [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "]");
 
         // send all the snapshot's translog operations to the target
-        final SendSnapshotResult result = sendSnapshot(startingSeqNo, snapshot);
+        final SendSnapshotResult result = sendSnapshot(startingSeqNo, requiredSeqNoRangeStart, endingSeqNo, snapshot);
 
         stopWatch.stop();
         logger.trace("recovery [phase2]: took [{}]", stopWatch.totalTime());
@@ -511,17 +522,25 @@ static class SendSnapshotResult {
      * Operations are bulked into a single request depending on an operation count limit or size-in-bytes limit.
      *
      * @param startingSeqNo the sequence number for which only operations with a sequence number greater than this will be sent
-     * @param snapshot      the translog snapshot to replay operations from
-     * @return the local checkpoint on the target and the total number of operations sent
+     * @param requiredSeqNoRangeStart the lower sequence number of the required range
+     * @param endingSeqNo   the upper bound of the sequence number range to be sent (inclusive)
+     * @param snapshot      the translog snapshot to replay operations from
+     * @return the local checkpoint on the target and the total number of operations sent
      * @throws IOException if an I/O exception occurred reading the translog snapshot
      */
-    protected SendSnapshotResult sendSnapshot(final long startingSeqNo, final Translog.Snapshot snapshot) throws IOException {
+    protected SendSnapshotResult sendSnapshot(final long startingSeqNo, long requiredSeqNoRangeStart, long endingSeqNo,
+                                              final Translog.Snapshot snapshot) throws IOException {
+        assert requiredSeqNoRangeStart <= endingSeqNo + 1:
+            "requiredSeqNoRangeStart " + requiredSeqNoRangeStart + " is larger than endingSeqNo " + endingSeqNo;
+        assert startingSeqNo <= requiredSeqNoRangeStart :
+            "startingSeqNo " + startingSeqNo + " is larger than requiredSeqNoRangeStart " + requiredSeqNoRangeStart;
         int ops = 0;
         long size = 0;
         int skippedOps = 0;
         int totalSentOps = 0;
         final AtomicLong targetLocalCheckpoint = new AtomicLong(SequenceNumbers.UNASSIGNED_SEQ_NO);
         final List<Translog.Operation> operations = new ArrayList<>();
+        final LocalCheckpointTracker requiredOpsTracker = new LocalCheckpointTracker(endingSeqNo, requiredSeqNoRangeStart - 1);
 
         final int expectedTotalOps = snapshot.totalOperations();
         if (expectedTotalOps == 0) {
@@ -538,19 +557,17 @@ protected SendSnapshotResult sendSnapshot(final long startingSeqNo, final Transl
                 throw new IndexShardClosedException(request.shardId());
             }
             cancellableThreads.checkForCancel();
-            /*
-             * If we are doing a sequence-number-based recovery, we have to skip older ops for which no sequence number was assigned, and
-             * any ops before the starting sequence number.
-             */
+
             final long seqNo = operation.seqNo();
-            if (startingSeqNo >= 0 && (seqNo == SequenceNumbers.UNASSIGNED_SEQ_NO || seqNo < startingSeqNo)) {
+            if (seqNo < startingSeqNo || seqNo > endingSeqNo) {
                 skippedOps++;
                 continue;
             }
             operations.add(operation);
             ops++;
             size += operation.estimateSize();
             totalSentOps++;
+            requiredOpsTracker.markSeqNoAsCompleted(seqNo);
 
             // check if this request is past bytes threshold, and if so, send it off
             if (size >= chunkSizeInBytes) {
@@ -567,6 +584,12 @@ protected SendSnapshotResult sendSnapshot(final long startingSeqNo, final Transl
             cancellableThreads.executeIO(sendBatch);
         }
 
+        if (requiredOpsTracker.getCheckpoint() < endingSeqNo) {
+            throw new IllegalStateException("translog replay failed to covered required sequence numbers" +
+                " (required range [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "). first missing op is ["
+                + (requiredOpsTracker.getCheckpoint() + 1) + "]");
+        }
+
         assert expectedTotalOps == skippedOps + totalSentOps
                 : "expected total [" + expectedTotalOps + "], skipped [" + skippedOps + "], total sent [" + totalSentOps + "]";
 

diff --git a/core/src/test/java/org/elasticsearch/index/replication/RecoveryDuringReplicationTests.java b/core/src/test/java/org/elasticsearch/index/replication/RecoveryDuringReplicationTests.java
@@ -374,15 +374,15 @@ protected EngineFactory getEngineFactory(ShardRouting routing) {
             IndexShard newReplica = shards.addReplicaWithExistingPath(replica.shardPath(), replica.routingEntry().currentNodeId());
 
             CountDownLatch recoveryStart = new CountDownLatch(1);
-            AtomicBoolean preparedForTranslog = new AtomicBoolean(false);
+            AtomicBoolean opsSent = new AtomicBoolean(false);
             final Future<Void> recoveryFuture = shards.asyncRecoverReplica(newReplica, (indexShard, node) -> {
                 recoveryStart.countDown();
                 return new RecoveryTarget(indexShard, node, recoveryListener, l -> {
                 }) {
                     @Override
-                    public void prepareForTranslogOperations(int totalTranslogOps) throws IOException {
-                        preparedForTranslog.set(true);
-                        super.prepareForTranslogOperations(totalTranslogOps);
+                    public long indexTranslogOperations(List<Translog.Operation> operations, int totalTranslogOps) throws IOException {
+                        opsSent.set(true);
+                        return super.indexTranslogOperations(operations, totalTranslogOps);
                     }
                 };
             });
@@ -392,7 +392,7 @@ public void prepareForTranslogOperations(int totalTranslogOps) throws IOExceptio
             // index some more
             docs += shards.indexDocs(randomInt(5));
 
-            assertFalse("recovery should wait on pending docs", preparedForTranslog.get());
+            assertFalse("recovery should wait on pending docs", opsSent.get());
 
             primaryEngineFactory.releaseLatchedIndexers();
             pendingDocsDone.await();