Skip to content

Commit

Permalink
Fix Inconsistent Shard Failure Count in Failed Snapshots (#51416) (#54480)

Browse files Browse the repository at this point in the history

* Fix Inconsistent Shard Failure Count in Failed Snapshots

This fix was necessary to allow for the below test enhancement:
We were not adding shard failure entries to a failed snapshot for those
shards that were never attempted, because the snapshot failed during
the init stage and was not partial. This caused the never-attempted shards
to be counted towards the successful shard count, which seems wrong and
broke repository consistency tests.

Also, this change adjusts snapshot resiliency tests to run another snapshot
at the end of each test run to guarantee a correct `index.latest` blob exists
after each run.

Closes #47550
  • Loading branch information
original-brownbear authored Mar 31, 2020
1 parent ec60ee5 commit bb43fc0
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1084,8 +1084,13 @@ protected void doRun() {
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardStatus : entry.shards()) {
ShardId shardId = shardStatus.key;
ShardSnapshotStatus status = shardStatus.value;
if (status.state().failed()) {
final ShardState state = status.state();
if (state.failed()) {
shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, status.reason()));
} else if (state.completed() == false) {
shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, "skipped"));
} else {
assert state == ShardState.SUCCESS;
}
}
final ShardGenerations shardGenerations = buildGenerations(entry, metaData);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,6 @@
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.seqno.GlobalCheckpointSyncAction;
import org.elasticsearch.index.seqno.RetentionLeaseSyncer;
import org.elasticsearch.index.seqno.RetentionLeaseSyncer;
import org.elasticsearch.index.seqno.RetentionLeaseSyncer;
import org.elasticsearch.index.shard.PrimaryReplicaSyncer;
import org.elasticsearch.indices.IndicesModule;
import org.elasticsearch.indices.IndicesService;
Expand Down Expand Up @@ -216,6 +214,7 @@
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.iterableWithSize;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.mockito.Mockito.mock;

Expand Down Expand Up @@ -249,8 +248,19 @@ public void verifyReposThenStopServices() {
clearDisruptionsAndAwaitSync();

final StepListener<CleanupRepositoryResponse> cleanupResponse = new StepListener<>();
client().admin().cluster().cleanupRepository(
new CleanupRepositoryRequest("repo"), cleanupResponse);
final StepListener<CreateSnapshotResponse> createSnapshotResponse = new StepListener<>();
// Create another snapshot and then clean up the repository to verify that the repository works correctly no matter the
// failures seen during the previous test.
client().admin().cluster().prepareCreateSnapshot("repo", "last-snapshot")
.setWaitForCompletion(true).execute(createSnapshotResponse);
continueOrDie(createSnapshotResponse, r -> {
final SnapshotInfo snapshotInfo = r.getSnapshotInfo();
// Snapshot can fail because some tests leave indices in a red state because data nodes were stopped
assertThat(snapshotInfo.state(), either(is(SnapshotState.SUCCESS)).or(is(SnapshotState.FAILED)));
assertThat(snapshotInfo.shardFailures(), iterableWithSize(snapshotInfo.failedShards()));
assertThat(snapshotInfo.successfulShards(), is(snapshotInfo.totalShards() - snapshotInfo.failedShards()));
client().admin().cluster().cleanupRepository(new CleanupRepositoryRequest("repo"), cleanupResponse);
});
final AtomicBoolean cleanedUp = new AtomicBoolean(false);
continueOrDie(cleanupResponse, r -> cleanedUp.set(true));

Expand Down Expand Up @@ -340,7 +350,6 @@ public void testSuccessfulSnapshotAndRestore() {
assertEquals(0, snapshotInfo.failedShards());
}

@AwaitsFix(bugUrl="https://github.com/elastic/elasticsearch/issues/54459")
public void testSnapshotWithNodeDisconnects() {
final int dataNodes = randomIntBetween(2, 10);
final int masterNodes = randomFrom(1, 3, 5);
Expand Down

0 comments on commit bb43fc0

Please sign in to comment.