From f0d8c785e324d3d61057f17ea0bac0a45fd92b41 Mon Sep 17 00:00:00 2001
From: Armin Braun
Date: Fri, 24 Jan 2020 18:20:47 +0100
Subject: [PATCH] Fix Inconsistent Shard Failure Count in Failed Snapshots
 (#51416) (#51426)

* Fix Inconsistent Shard Failure Count in Failed Snapshots

This fix was necessary to allow for the test enhancement below:
We were not adding shard failure entries to a failed snapshot for those
shard snapshot entries that were never attempted because the snapshot
failed during the init stage and wasn't partial. This caused the
never-attempted shards to be counted towards the successful shard count,
which is wrong and broke the repository consistency tests.
Also, this change adjusts the snapshot resiliency tests to run another
snapshot at the end of each test run to guarantee a correct `index.latest`
blob exists after each run.

Closes #47550
---
 .../snapshots/SnapshotsService.java        |  7 ++++++-
 .../snapshots/SnapshotResiliencyTests.java | 16 ++++++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
index 56a4c0f1e6d83..735d7d55b0d52 100644
--- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
+++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
@@ -1090,8 +1090,13 @@ protected void doRun() {
                 for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardStatus : entry.shards()) {
                     ShardId shardId = shardStatus.key;
                     ShardSnapshotStatus status = shardStatus.value;
-                    if (status.state().failed()) {
+                    final ShardState state = status.state();
+                    if (state.failed()) {
                         shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, status.reason()));
+                    } else if (state.completed() == false) {
+                        shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, "skipped"));
+                    } else {
+                        assert state == ShardState.SUCCESS;
                     }
                 }
                 final ShardGenerations shardGenerations = buildGenerations(entry, metaData);
diff --git a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java
index b9ee29514dab7..f09b373189a64 100644
--- a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java
+++ b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java
@@ -216,6 +216,7 @@
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.iterableWithSize;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 
 import static org.mockito.Mockito.mock;
@@ -249,8 +250,19 @@ public void verifyReposThenStopServices() {
         clearDisruptionsAndAwaitSync();
 
         final StepListener<CleanupRepositoryResponse> cleanupResponse = new StepListener<>();
-        client().admin().cluster().cleanupRepository(
-            new CleanupRepositoryRequest("repo"), cleanupResponse);
+        final StepListener<CreateSnapshotResponse> createSnapshotResponse = new StepListener<>();
+        // Create another snapshot and then clean up the repository to verify that the repository works correctly no matter the
+        // failures seen during the previous test.
+        client().admin().cluster().prepareCreateSnapshot("repo", "last-snapshot")
+            .setWaitForCompletion(true).execute(createSnapshotResponse);
+        continueOrDie(createSnapshotResponse, r -> {
+            final SnapshotInfo snapshotInfo = r.getSnapshotInfo();
+            // Snapshot can fail because some tests leave indices in a red state because data nodes were stopped
+            assertThat(snapshotInfo.state(), either(is(SnapshotState.SUCCESS)).or(is(SnapshotState.FAILED)));
+            assertThat(snapshotInfo.shardFailures(), iterableWithSize(snapshotInfo.failedShards()));
+            assertThat(snapshotInfo.successfulShards(), is(snapshotInfo.totalShards() - snapshotInfo.failedShards()));
+            client().admin().cluster().cleanupRepository(new CleanupRepositoryRequest("repo"), cleanupResponse);
+        });
 
         final AtomicBoolean cleanedUp = new AtomicBoolean(false);
         continueOrDie(cleanupResponse, r -> cleanedUp.set(true));
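
For illustration, here is a minimal, self-contained sketch of the classification rule that the SnapshotsService hunk above introduces. ShardFailureCountSketch and its ShardState enum are hypothetical stand-ins for the Elasticsearch types, not the actual implementation; the point is only that a shard which never completed must end up in the failure list, so that successfulShards stays equal to totalShards minus shardFailures.size(), which is what the new resiliency-test assertions check.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;

    public class ShardFailureCountSketch {

        // Simplified stand-in for the relevant shard states.
        enum ShardState {
            SUCCESS(true, false), FAILED(true, true), INIT(false, false);

            private final boolean completed;
            private final boolean failed;

            ShardState(boolean completed, boolean failed) {
                this.completed = completed;
                this.failed = failed;
            }

            boolean completed() { return completed; }
            boolean failed() { return failed; }
        }

        // Classify every shard of a finished snapshot: failed shards are recorded with a
        // failure entry, shards that never completed (e.g. never attempted because the
        // snapshot failed during init) are recorded as "skipped", and only genuinely
        // successful shards are left out of the failure list.
        static List<String> shardFailures(Map<String, ShardState> shards) {
            List<String> failures = new ArrayList<>();
            for (Map.Entry<String, ShardState> entry : shards.entrySet()) {
                ShardState state = entry.getValue();
                if (state.failed()) {
                    failures.add(entry.getKey() + ": failed");
                } else if (state.completed() == false) {
                    failures.add(entry.getKey() + ": skipped");
                } else {
                    assert state == ShardState.SUCCESS;
                }
            }
            return failures;
        }

        public static void main(String[] args) {
            Map<String, ShardState> shards = Map.of(
                "[test][0]", ShardState.SUCCESS,
                "[test][1]", ShardState.FAILED,
                "[test][2]", ShardState.INIT);
            List<String> failures = shardFailures(shards);
            int totalShards = shards.size();
            int failedShards = failures.size();
            int successfulShards = totalShards - failedShards;
            // Invariant the resiliency test now asserts:
            // successfulShards == totalShards - shardFailures.size()
            System.out.println("total=" + totalShards + ", failed=" + failedShards + ", successful=" + successfulShards);
            // prints: total=3, failed=2, successful=1
        }
    }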