From 18fba4b1df0d13556838053c3e96e7ce1bc74ec2 Mon Sep 17 00:00:00 2001
From: Armin Braun
Date: Mon, 7 Oct 2019 10:38:44 +0200
Subject: [PATCH] Add IT for Snapshot Issue in 47552 (#47627) (#47635)

* Add IT for Snapshot Issue in 47552 (#47627)

Adding a specific integration test that reproduces the problem fixed in
#47552. The issue fixed there otherwise only reproduces in the snapshot
resiliency tests, which are not available in 6.8, to which the fix is
also being backported.
---
 .../DedicatedClusterSnapshotRestoreIT.java    | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java b/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java
index b924d1eccced1..c2368d9420139 100644
--- a/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java
+++ b/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java
@@ -1237,6 +1237,55 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
         }, 60L, TimeUnit.SECONDS);
     }
 
+    public void testDataNodeRestartAfterShardSnapshotFailure() throws Exception {
+        logger.info("--> starting a master node and two data nodes");
+        internalCluster().startMasterOnlyNode();
+        final List<String> dataNodes = internalCluster().startDataOnlyNodes(2);
+        logger.info("--> creating repository");
+        assertAcked(client().admin().cluster().preparePutRepository("test-repo")
+            .setType("mock").setSettings(Settings.builder()
+                .put("location", randomRepoPath())
+                .put("compress", randomBoolean())
+                .put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));
+        assertAcked(prepareCreate("test-idx", 0, Settings.builder()
+            .put("number_of_shards", 2).put("number_of_replicas", 0)));
+        ensureGreen();
+        logger.info("--> indexing some data");
+        final int numdocs = randomIntBetween(50, 100);
+        IndexRequestBuilder[] builders = new IndexRequestBuilder[numdocs];
+        for (int i = 0; i < builders.length; i++) {
+            builders[i] = client().prepareIndex("test-idx", "type1",
+                Integer.toString(i)).setSource("field1", "bar " + i);
+        }
+        indexRandom(true, builders);
+        flushAndRefresh();
+        blockAllDataNodes("test-repo");
+        logger.info("--> snapshot");
+        client(internalCluster().getMasterName()).admin().cluster()
+            .prepareCreateSnapshot("test-repo", "test-snap").setWaitForCompletion(false).setIndices("test-idx").get();
+        logger.info("--> restarting first data node, which should cause the primary shard on it to be failed");
+        internalCluster().restartNode(dataNodes.get(0), InternalTestCluster.EMPTY_CALLBACK);
+
+        logger.info("--> wait for shard snapshot of first primary to show as failed");
+        assertBusy(() -> assertThat(
+            client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
+                .get(0).getShardsStats().getFailedShards(), is(1)), 60L, TimeUnit.SECONDS);
+
+        logger.info("--> restarting second data node, which should cause the primary shard on it to be failed");
+        internalCluster().restartNode(dataNodes.get(1), InternalTestCluster.EMPTY_CALLBACK);
+
+        // check that snapshot completes with both failed shards being accounted for in the snapshot result
+        assertBusy(() -> {
+            GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
+                .prepareGetSnapshots("test-repo").setSnapshots("test-snap").setIgnoreUnavailable(true).get();
+            assertEquals(1, snapshotsStatusResponse.getSnapshots().size());
+            SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
+            assertTrue(snapshotInfo.state().toString(), snapshotInfo.state().completed());
+            assertThat(snapshotInfo.totalShards(), is(2));
+            assertThat(snapshotInfo.shardFailures(), hasSize(2));
+        }, 60L, TimeUnit.SECONDS);
+    }
+
     public void testRetentionLeasesClearedOnRestore() throws Exception {
         final String repoName = "test-repo-retention-leases";
         assertAcked(client().admin().cluster().preparePutRepository(repoName)