Reset replica engine to global checkpoint on promotion (#33473)
When a replica starts following a newly promoted primary, it may have
operations that do not exist on the new primary. We need to discard
those operations to align the replica with the new primary. This is
done by first resetting the engine to the safe commit, then replaying
the local translog up to the global checkpoint.

Relates #32867
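
Below is a minimal, self-contained sketch (not Elasticsearch code; the class and field names are invented) of the decision this commit adds to IndexShard#acquireReplicaOperationPermit: on a primary-term bump, a replica holding operations above the global checkpoint resets its engine, otherwise it only rolls the translog generation.

class PromotionSketch {
    long globalCheckpoint; // highest seq# known to be safely replicated to all copies
    long maxSeqNo;         // highest seq# this replica has processed locally

    void onNewPrimaryTerm() throws java.io.IOException {
        if (globalCheckpoint < maxSeqNo) {
            // This replica may hold operations the new primary does not have:
            // roll back to the safe commit and replay the local translog,
            // but only up to the global checkpoint.
            resetEngineToGlobalCheckpoint();
        } else {
            // Nothing to discard; just start a new translog generation.
            rollTranslogGeneration();
        }
    }

    void resetEngineToGlobalCheckpoint() throws java.io.IOException { /* see IndexShard diff below */ }
    void rollTranslogGeneration() { /* see IndexShard diff below */ }
}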
dnhatn committed Sep 12, 2018
1 parent fa301ad commit b0587bc
Showing 16 changed files with 274 additions and 121 deletions.
13 changes: 6 additions & 7 deletions server/src/main/java/org/elasticsearch/index/engine/Engine.java
@@ -678,12 +678,6 @@ public final CommitStats commitStats() {
*/
public abstract void waitForOpsToComplete(long seqNo) throws InterruptedException;

- /**
-  * Reset the local checkpoint in the tracker to the given local checkpoint
-  * @param localCheckpoint the new checkpoint to be set
-  */
- public abstract void resetLocalCheckpoint(long localCheckpoint);
-
/**
* @return a {@link SeqNoStats} object, using local state and the supplied global checkpoint
*/
@@ -1165,11 +1159,16 @@ public enum Origin {
PRIMARY,
REPLICA,
PEER_RECOVERY,
- LOCAL_TRANSLOG_RECOVERY;
+ LOCAL_TRANSLOG_RECOVERY,
+ LOCAL_RESET;

public boolean isRecovery() {
return this == PEER_RECOVERY || this == LOCAL_TRANSLOG_RECOVERY;
}

+ boolean isFromTranslog() {
+     return this == LOCAL_TRANSLOG_RECOVERY || this == LOCAL_RESET;
+ }
}

public Origin origin() {
server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java
@@ -755,6 +755,7 @@ private boolean canOptimizeAddDocument(Index index) {
: "version: " + index.version() + " type: " + index.versionType();
return true;
case LOCAL_TRANSLOG_RECOVERY:
+ case LOCAL_RESET:
assert index.isRetry();
return true; // allow to optimize in order to update the max safe time stamp
default:
@@ -881,7 +882,7 @@ public IndexResult index(Index index) throws IOException {
indexResult = new IndexResult(
plan.versionForIndexing, getPrimaryTerm(), plan.seqNoForIndexing, plan.currentNotFoundOrDeleted);
}
- if (index.origin() != Operation.Origin.LOCAL_TRANSLOG_RECOVERY) {
+ if (index.origin().isFromTranslog() == false) {
final Translog.Location location;
if (indexResult.getResultType() == Result.Type.SUCCESS) {
location = translog.add(new Translog.Index(index, indexResult));
@@ -1234,7 +1235,7 @@ public DeleteResult delete(Delete delete) throws IOException {
deleteResult = new DeleteResult(
plan.versionOfDeletion, getPrimaryTerm(), plan.seqNoOfDeletion, plan.currentlyDeleted == false);
}
- if (delete.origin() != Operation.Origin.LOCAL_TRANSLOG_RECOVERY) {
+ if (delete.origin().isFromTranslog() == false) {
final Translog.Location location;
if (deleteResult.getResultType() == Result.Type.SUCCESS) {
location = translog.add(new Translog.Delete(delete, deleteResult));
@@ -1485,7 +1486,7 @@ private NoOpResult innerNoOp(final NoOp noOp) throws IOException {
}
}
final NoOpResult noOpResult = failure != null ? new NoOpResult(getPrimaryTerm(), noOp.seqNo(), failure) : new NoOpResult(getPrimaryTerm(), noOp.seqNo());
- if (noOp.origin() != Operation.Origin.LOCAL_TRANSLOG_RECOVERY) {
+ if (noOp.origin().isFromTranslog() == false) {
final Translog.Location location = translog.add(new Translog.NoOp(noOp.seqNo(), noOp.primaryTerm(), noOp.reason()));
noOpResult.setTranslogLocation(location);
}
@@ -2404,11 +2405,6 @@ public void waitForOpsToComplete(long seqNo) throws InterruptedException {
localCheckpointTracker.waitForOpsToComplete(seqNo);
}

- @Override
- public void resetLocalCheckpoint(long localCheckpoint) {
-     localCheckpointTracker.resetCheckpoint(localCheckpoint);
- }
-
@Override
public SeqNoStats getSeqNoStats(long globalCheckpoint) {
return localCheckpointTracker.getStats(globalCheckpoint);
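
The index, delete, and no-op call sites above all apply the same rule: append an operation to the translog only if it did not come out of one. A toy, self-contained illustration of that gate (stub types and a String translog, not the real engine classes):

import java.util.ArrayList;
import java.util.List;

class TranslogGateSketch {
    enum Origin {
        PRIMARY, REPLICA, PEER_RECOVERY, LOCAL_TRANSLOG_RECOVERY, LOCAL_RESET;
        boolean isFromTranslog() {
            return this == LOCAL_TRANSLOG_RECOVERY || this == LOCAL_RESET;
        }
    }

    final List<String> translog = new ArrayList<>();

    void apply(String op, Origin origin) {
        // ... apply the operation to the index ...
        if (origin.isFromTranslog() == false) {
            // LOCAL_TRANSLOG_RECOVERY and LOCAL_RESET ops are replayed from an
            // existing translog; appending them again would duplicate history.
            translog.add(op);
        }
    }

    public static void main(String[] args) {
        TranslogGateSketch engine = new TranslogGateSketch();
        engine.apply("index doc1", Origin.REPLICA);      // appended
        engine.apply("index doc1", Origin.LOCAL_RESET);  // replayed: skipped
        System.out.println(engine.translog);             // [index doc1]
    }
}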
server/src/main/java/org/elasticsearch/index/engine/ReadOnlyEngine.java
@@ -257,10 +257,6 @@ public long getLocalCheckpoint() {
public void waitForOpsToComplete(long seqNo) {
}

- @Override
- public void resetLocalCheckpoint(long newCheckpoint) {
- }
-
@Override
public SeqNoStats getSeqNoStats(long globalCheckpoint) {
return new SeqNoStats(seqNoStats.getMaxSeqNo(), seqNoStats.getLocalCheckpoint(), globalCheckpoint);
server/src/main/java/org/elasticsearch/index/seqno/LocalCheckpointTracker.java
@@ -109,6 +109,7 @@ public synchronized void markSeqNoAsCompleted(final long seqNo) {
* @param checkpoint the local checkpoint to reset this tracker to
*/
public synchronized void resetCheckpoint(final long checkpoint) {
+ // TODO: remove this method once we restore the local history on promotion.
assert checkpoint != SequenceNumbers.UNASSIGNED_SEQ_NO;
assert checkpoint <= this.checkpoint;
processedSeqNo.clear();
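
resetCheckpoint() only ever rewinds the checkpoint, and it forgets every completed seq# above the new value; those are re-marked as the translog is replayed. A simplified model of that contract (a HashSet stands in for the tracker's bitset buckets; not the real class):

import java.util.HashSet;
import java.util.Set;

class CheckpointTrackerSketch {
    long checkpoint = -1; // NO_OPS_PERFORMED
    final Set<Long> processedSeqNo = new HashSet<>();

    synchronized void markSeqNoAsCompleted(long seqNo) {
        processedSeqNo.add(seqNo);
        while (processedSeqNo.contains(checkpoint + 1)) {
            checkpoint++; // advance over contiguous completed seq#s
        }
    }

    synchronized void resetCheckpoint(long newCheckpoint) {
        assert newCheckpoint <= checkpoint : "reset may only rewind";
        processedSeqNo.clear(); // pending ops will be re-marked during replay
        checkpoint = newCheckpoint;
    }
}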
84 changes: 56 additions & 28 deletions server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
@@ -164,7 +164,6 @@
import java.util.stream.StreamSupport;

import static org.elasticsearch.index.mapper.SourceToParse.source;
- import static org.elasticsearch.index.seqno.SequenceNumbers.NO_OPS_PERFORMED;
import static org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_SEQ_NO;

public class IndexShard extends AbstractIndexShardComponent implements IndicesClusterStateService.Shard {
@@ -1307,16 +1306,18 @@ public Engine.Result applyTranslogOperation(Translog.Operation operation, Engine
return result;
}

- // package-private for testing
- int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot) throws IOException {
-     recoveryState.getTranslog().totalOperations(snapshot.totalOperations());
-     recoveryState.getTranslog().totalOperationsOnStart(snapshot.totalOperations());
+ /**
+  * Replays translog operations from the provided translog {@code snapshot} to the current engine using the given {@code origin}.
+  * The callback {@code onOperationRecovered} is notified after each translog operation is replayed successfully.
+  */
+ int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot, Engine.Operation.Origin origin,
+                         Runnable onOperationRecovered) throws IOException {
int opsRecovered = 0;
Translog.Operation operation;
while ((operation = snapshot.next()) != null) {
try {
logger.trace("[translog] recover op {}", operation);
- Engine.Result result = applyTranslogOperation(operation, Engine.Operation.Origin.LOCAL_TRANSLOG_RECOVERY);
+ Engine.Result result = applyTranslogOperation(operation, origin);
switch (result.getResultType()) {
case FAILURE:
throw result.getFailure();
@@ -1329,7 +1330,7 @@ int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot) throws IOExce
}

opsRecovered++;
- recoveryState.getTranslog().incrementRecoveredOperations();
+ onOperationRecovered.run();
} catch (Exception e) {
if (ExceptionsHelper.status(e) == RestStatus.BAD_REQUEST) {
// mainly for MapperParsingException and Failure to detect xcontent
@@ -1347,8 +1348,15 @@ int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot) throws IOExce
* Operations from the translog will be replayed to bring lucene up to date.
**/
public void openEngineAndRecoverFromTranslog() throws IOException {
+ final RecoveryState.Translog translogRecoveryStats = recoveryState.getTranslog();
+ final Engine.TranslogRecoveryRunner translogRecoveryRunner = (engine, snapshot) -> {
+     translogRecoveryStats.totalOperations(snapshot.totalOperations());
+     translogRecoveryStats.totalOperationsOnStart(snapshot.totalOperations());
+     return runTranslogRecovery(engine, snapshot, Engine.Operation.Origin.LOCAL_TRANSLOG_RECOVERY,
+         translogRecoveryStats::incrementRecoveredOperations);
+ };
  innerOpenEngineAndTranslog();
- getEngine().recoverFromTranslog(this::runTranslogRecovery, Long.MAX_VALUE);
+ getEngine().recoverFromTranslog(translogRecoveryRunner, Long.MAX_VALUE);
}

/**
@@ -1386,11 +1394,7 @@ private void innerOpenEngineAndTranslog() throws IOException {
final String translogUUID = store.readLastCommittedSegmentsInfo().getUserData().get(Translog.TRANSLOG_UUID_KEY);
final long globalCheckpoint = Translog.readGlobalCheckpoint(translogConfig.getTranslogPath(), translogUUID);
replicationTracker.updateGlobalCheckpointOnReplica(globalCheckpoint, "read from translog checkpoint");

- assertMaxUnsafeAutoIdInCommit();
-
- final long minRetainedTranslogGen = Translog.readMinTranslogGeneration(translogConfig.getTranslogPath(), translogUUID);
- store.trimUnsafeCommits(globalCheckpoint, minRetainedTranslogGen, config.getIndexSettings().getIndexVersionCreated());
+ trimUnsafeCommits();

createNewEngine(config);
verifyNotClosed();
@@ -1401,6 +1405,15 @@ private void innerOpenEngineAndTranslog() throws IOException {
assert recoveryState.getStage() == RecoveryState.Stage.TRANSLOG : "TRANSLOG stage expected but was: " + recoveryState.getStage();
}

+ private void trimUnsafeCommits() throws IOException {
+     assert currentEngineReference.get() == null : "engine is running";
+     final String translogUUID = store.readLastCommittedSegmentsInfo().getUserData().get(Translog.TRANSLOG_UUID_KEY);
+     final long globalCheckpoint = Translog.readGlobalCheckpoint(translogConfig.getTranslogPath(), translogUUID);
+     final long minRetainedTranslogGen = Translog.readMinTranslogGeneration(translogConfig.getTranslogPath(), translogUUID);
+     assertMaxUnsafeAutoIdInCommit();
+     store.trimUnsafeCommits(globalCheckpoint, minRetainedTranslogGen, indexSettings.getIndexVersionCreated());
+ }

private boolean assertSequenceNumbersInCommit() throws IOException {
final Map<String, String> userData = SegmentInfos.readLatestCommit(store.directory()).getUserData();
assert userData.containsKey(SequenceNumbers.LOCAL_CHECKPOINT_KEY) : "commit point doesn't contain a local checkpoint";
@@ -1501,7 +1514,7 @@ private void ensureWriteAllowed(Engine.Operation.Origin origin) throws IllegalIn
if (origin == Engine.Operation.Origin.PRIMARY) {
assert assertPrimaryMode();
} else {
- assert origin == Engine.Operation.Origin.REPLICA;
+ assert origin == Engine.Operation.Origin.REPLICA || origin == Engine.Operation.Origin.LOCAL_RESET;
assert assertReplicationTarget();
}
if (writeAllowedStates.contains(state) == false) {
@@ -2207,9 +2220,7 @@ public void onFailedEngine(String reason, @Nullable Exception failure) {

private Engine createNewEngine(EngineConfig config) {
synchronized (mutex) {
- if (state == IndexShardState.CLOSED) {
-     throw new AlreadyClosedException(shardId + " can't create engine - shard is closed");
- }
+ verifyNotClosed();
assert this.currentEngineReference.get() == null;
Engine engine = newEngine(config);
onNewEngine(engine); // call this before we pass the memory barrier otherwise actions that happen
@@ -2355,19 +2366,14 @@ public void acquireReplicaOperationPermit(final long opPrimaryTerm, final long g
bumpPrimaryTerm(opPrimaryTerm, () -> {
updateGlobalCheckpointOnReplica(globalCheckpoint, "primary term transition");
final long currentGlobalCheckpoint = getGlobalCheckpoint();
- final long localCheckpoint;
- if (currentGlobalCheckpoint == UNASSIGNED_SEQ_NO) {
-     localCheckpoint = NO_OPS_PERFORMED;
+ final long maxSeqNo = seqNoStats().getMaxSeqNo();
+ logger.info("detected new primary with primary term [{}], global checkpoint [{}], max_seq_no [{}]",
+     opPrimaryTerm, currentGlobalCheckpoint, maxSeqNo);
+ if (currentGlobalCheckpoint < maxSeqNo) {
+     resetEngineToGlobalCheckpoint();
  } else {
-     localCheckpoint = currentGlobalCheckpoint;
+     getEngine().rollTranslogGeneration();
  }
- logger.trace(
-     "detected new primary with primary term [{}], resetting local checkpoint from [{}] to [{}]",
-     opPrimaryTerm,
-     getLocalCheckpoint(),
-     localCheckpoint);
- getEngine().resetLocalCheckpoint(localCheckpoint);
- getEngine().rollTranslogGeneration();
});
}
}
@@ -2663,4 +2669,26 @@ public ParsedDocument newNoopTombstoneDoc(String reason) {
}
};
}

+ /**
+  * Rolls back the current engine to the safe commit, then replays the local translog up to the global checkpoint.
+  */
+ void resetEngineToGlobalCheckpoint() throws IOException {
+     assert getActiveOperationsCount() == 0 : "Ongoing writes [" + getActiveOperations() + "]";
+     sync(); // persist the global checkpoint to disk
+     final long globalCheckpoint = getGlobalCheckpoint();
+     final Engine newEngine;
+     synchronized (mutex) {
+         verifyNotClosed();
+         IOUtils.close(currentEngineReference.getAndSet(null));
+         trimUnsafeCommits();
+         newEngine = createNewEngine(newEngineConfig());
+         active.set(true);
+     }
+     final Engine.TranslogRecoveryRunner translogRunner = (engine, snapshot) -> runTranslogRecovery(
+         engine, snapshot, Engine.Operation.Origin.LOCAL_RESET, () -> {
+             // TODO: add dedicated recovery stats for the reset translog
+         });
+     newEngine.recoverFromTranslog(translogRunner, globalCheckpoint);
+ }
}
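
A toy walk-through of the reset's effect under made-up numbers (stand-in types; the real sequence above is: sync the global checkpoint, close the engine under the mutex, trim unsafe commits, open a new engine on the safe commit, then replay the translog capped at the global checkpoint):

import java.util.ArrayList;
import java.util.List;

class EngineResetWalkthrough {
    public static void main(String[] args) {
        long maxSeqNo = 9;          // replica has ops 0..9 in its translog
        long globalCheckpoint = 6;  // ops 7..9 may not exist on the new primary

        List<Long> replayed = new ArrayList<>();
        for (long seqNo = 0; seqNo <= maxSeqNo; seqNo++) {
            if (seqNo <= globalCheckpoint) {
                replayed.add(seqNo); // applied with origin LOCAL_RESET
            }
        }
        // Ops 7..9 are discarded; the replica re-receives the promoted primary's
        // history above seq# 6 through normal replication.
        System.out.println(replayed); // [0, 1, 2, 3, 4, 5, 6]
    }
}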
@@ -111,6 +111,7 @@ protected void beforeIndexDeletion() throws Exception {
super.beforeIndexDeletion();
internalCluster().assertConsistentHistoryBetweenTranslogAndLuceneIndex();
assertSeqNos();
+ assertSameDocIdsOnShards();
}
}

server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java
@@ -4105,7 +4105,7 @@ public void markSeqNoAsCompleted(long seqNo) {
final long currentLocalCheckpoint = actualEngine.getLocalCheckpoint();
final long resetLocalCheckpoint =
randomIntBetween(Math.toIntExact(SequenceNumbers.NO_OPS_PERFORMED), Math.toIntExact(currentLocalCheckpoint));
- actualEngine.resetLocalCheckpoint(resetLocalCheckpoint);
+ actualEngine.getLocalCheckpointTracker().resetCheckpoint(resetLocalCheckpoint);
completedSeqNos.clear();
actualEngine.restoreLocalCheckpointFromTranslog();
final Set<Long> intersection = new HashSet<>(expectedCompletedSeqNos);
server/src/test/java/org/elasticsearch/index/engine/ReadOnlyEngineTests.java
@@ -28,7 +28,7 @@
import org.elasticsearch.index.store.Store;

import java.io.IOException;
- import java.util.Set;
+ import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;

@@ -44,7 +44,7 @@ public void testReadOnlyEngine() throws Exception {
EngineConfig config = config(defaultSettings, store, createTempDir(), newMergePolicy(), null, null, globalCheckpoint::get);
int numDocs = scaledRandomIntBetween(10, 1000);
final SeqNoStats lastSeqNoStats;
- final Set<String> lastDocIds;
+ final List<DocIdSeqNoAndTerm> lastDocIds;
try (InternalEngine engine = createEngine(config)) {
Engine.Get get = null;
for (int i = 0; i < numDocs; i++) {
@@ -522,18 +522,14 @@ public void testSeqNoCollision() throws Exception {
shards.promoteReplicaToPrimary(replica2).get();
logger.info("--> Recover replica3 from replica2");
recoverReplica(replica3, replica2, true);
- try (Translog.Snapshot snapshot = getTranslog(replica3).newSnapshot()) {
+ try (Translog.Snapshot snapshot = replica3.getHistoryOperations("test", 0)) {
assertThat(snapshot.totalOperations(), equalTo(initDocs + 1));
final List<Translog.Operation> expectedOps = new ArrayList<>(initOperations);
expectedOps.add(op2);
assertThat(snapshot, containsOperationsInAnyOrder(expectedOps));
assertThat("Peer-recovery should not send overridden operations", snapshot.skippedOperations(), equalTo(0));
}
- // TODO: We should assert the content of shards in the ReplicationGroup.
- // Without rollback replicas(current implementation), we don't have the same content across shards:
- // - replica1 has {doc1}
- // - replica2 has {doc1, doc2}
- // - replica3 can have either {doc2} only if operation-based recovery or {doc1, doc2} if file-based recovery
+ shards.assertAllEqual(initDocs + 1);
}
}
