Skip to content

Commit

Permalink
Fix Broken Index Shard Snapshot File Preventing Snapshot Creation (el…
Browse files Browse the repository at this point in the history
…astic#41310)


* The problem here is that if we run into a corrupted index-N file, instead of generating a new index-(N+1) file, we instead set the newest index generation to -1 and thus tried to create `index-0`
   * If `index-0` is corrupt, this prevents us from ever creating a new snapshot using the broken shard, because we are unable to create `index-0` since it already exists
   * Fixed by still using the index generation for naming the next index file, even if it was a broken index file
* Added test that makes sure restoring as well as snapshotting on top of the broken shard index file work as expected
* closes elastic#41304
  • Loading branch information
original-brownbear authored and Gurkan Kaymak committed May 27, 2019
1 parent 77e2b84 commit ada4164
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
import static java.util.Collections.unmodifiableMap;

/**
* Contains information about all snapshot for the given shard in repository
* Contains information about all snapshots for the given shard in repository
* <p>
* This class is used to find files that were already snapshotted and clear out files that no longer referenced by any
* snapshots
* snapshots.
*/
public class BlobStoreIndexShardSnapshots implements Iterable<SnapshotFiles>, ToXContentFragment {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,27 +126,28 @@
* <pre>
* {@code
* STORE_ROOT
* |- index-N - list of all snapshot ids and the indices belonging to each snapshot, N is the generation of the file
* |- index-N - JSON serialized {@link RepositoryData} containing a list of all snapshot ids and the indices belonging to
* | each snapshot, N is the generation of the file
* |- index.latest - contains the numeric value of the latest generation of the index file (i.e. N from above)
* |- incompatible-snapshots - list of all snapshot ids that are no longer compatible with the current version of the cluster
* |- snap-20131010.dat - JSON serialized Snapshot for snapshot "20131010"
* |- meta-20131010.dat - JSON serialized MetaData for snapshot "20131010" (includes only global metadata)
* |- snap-20131011.dat - JSON serialized Snapshot for snapshot "20131011"
* |- meta-20131011.dat - JSON serialized MetaData for snapshot "20131011"
* |- snap-20131010.dat - SMILE serialized {@link SnapshotInfo} for snapshot "20131010"
* |- meta-20131010.dat - SMILE serialized {@link MetaData} for snapshot "20131010" (includes only global metadata)
* |- snap-20131011.dat - SMILE serialized {@link SnapshotInfo} for snapshot "20131011"
* |- meta-20131011.dat - SMILE serialized {@link MetaData} for snapshot "20131011"
* .....
* |- indices/ - data for all indices
* |- Ac1342-B_x/ - data for index "foo" which was assigned the unique id of Ac1342-B_x in the repository
* | |- meta-20131010.dat - JSON Serialized IndexMetaData for index "foo"
* | |- meta-20131010.dat - JSON Serialized {@link IndexMetaData} for index "foo"
* | |- 0/ - data for shard "0" of index "foo"
* | | |- __1 \ (files with numeric names were created by older ES versions)
* | | |- __2 |
* | | |- __VPO5oDMVT5y4Akv8T_AO_A |- files from different segments see snap-* for their mappings to real segment files
* | | |- __1gbJy18wS_2kv1qI7FgKuQ |
* | | |- __R8JvZAHlSMyMXyZc2SS8Zg /
* | | .....
* | | |- snap-20131010.dat - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131010"
* | | |- snap-20131011.dat - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131011"
* | | |- list-123 - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131011"
* | | |- snap-20131010.dat - SMILE serialized {@link BlobStoreIndexShardSnapshot} for snapshot "20131010"
* | | |- snap-20131011.dat - SMILE serialized {@link BlobStoreIndexShardSnapshot} for snapshot "20131011"
* | | |- index-123 - SMILE serialized {@link BlobStoreIndexShardSnapshots} for the shard
* | |
* | |- 1/ - data for shard "1" of index "foo"
* | | |- __1
Expand Down Expand Up @@ -1086,7 +1087,7 @@ protected Tuple<BlobStoreIndexShardSnapshots, Integer> buildBlobStoreIndexShardS
logger.warn(() -> new ParameterizedMessage("failed to read commit point [{}]", name), e);
}
}
return new Tuple<>(new BlobStoreIndexShardSnapshots(snapshots), -1);
return new Tuple<>(new BlobStoreIndexShardSnapshots(snapshots), latest);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2961,6 +2961,108 @@ public void testRestoreSnapshotWithCorruptedIndexMetadata() throws Exception {
assertAcked(client().admin().cluster().prepareDeleteSnapshot("test-repo", snapshotInfo.snapshotId().getName()).get());
}

/**
* Tests that a shard snapshot with a corrupted shard index file can still be used for restore and incremental snapshots.
*/
public void testSnapshotWithCorruptedShardIndexFile() throws Exception {
final Client client = client();
final Path repo = randomRepoPath();
final String indexName = "test-idx";
final int nDocs = randomIntBetween(1, 10);

logger.info("--> creating index [{}] with [{}] documents in it", indexName, nDocs);
assertAcked(prepareCreate(indexName).setSettings(Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 1).put(SETTING_NUMBER_OF_REPLICAS, 0)));

final IndexRequestBuilder[] documents = new IndexRequestBuilder[nDocs];
for (int j = 0; j < nDocs; j++) {
documents[j] = client.prepareIndex(indexName, "_doc").setSource("foo", "bar");
}
indexRandom(true, documents);
flushAndRefresh();

logger.info("--> creating repository");
assertAcked(client().admin().cluster().preparePutRepository("test-repo")
.setType("fs")
.setSettings(Settings.builder()
.put("location", repo)));

final String snapshot1 = "test-snap-1";
logger.info("--> creating snapshot [{}]", snapshot1);
final SnapshotInfo snapshotInfo = client().admin().cluster().prepareCreateSnapshot("test-repo", snapshot1)
.setWaitForCompletion(true)
.get()
.getSnapshotInfo();
assertThat(snapshotInfo.failedShards(), equalTo(0));
assertThat(snapshotInfo.successfulShards(), equalTo(snapshotInfo.totalShards()));
assertThat(snapshotInfo.indices(), hasSize(1));

RepositoriesService service = internalCluster().getInstance(RepositoriesService.class, internalCluster().getMasterName());
Repository repository = service.repository("test-repo");

final RepositoryData repositoryData = getRepositoryData(repository);
final Map<String, IndexId> indexIds = repositoryData.getIndices();
assertThat(indexIds.size(), equalTo(1));

final IndexId corruptedIndex = indexIds.get(indexName);
final Path shardIndexFile = repo.resolve("indices")
.resolve(corruptedIndex.getId()).resolve("0")
.resolve("index-0");

logger.info("--> truncating shard index file [{}]", shardIndexFile);
try (SeekableByteChannel outChan = Files.newByteChannel(shardIndexFile, StandardOpenOption.WRITE)) {
outChan.truncate(randomInt(10));
}

logger.info("--> verifying snapshot state for [{}]", snapshot1);
List<SnapshotInfo> snapshotInfos = client().admin().cluster().prepareGetSnapshots("test-repo").get().getSnapshots();
assertThat(snapshotInfos.size(), equalTo(1));
assertThat(snapshotInfos.get(0).state(), equalTo(SnapshotState.SUCCESS));
assertThat(snapshotInfos.get(0).snapshotId().getName(), equalTo(snapshot1));

logger.info("--> deleting index [{}]", indexName);
assertAcked(client().admin().indices().prepareDelete(indexName));

logger.info("--> restoring snapshot [{}]", snapshot1);
client().admin().cluster().prepareRestoreSnapshot("test-repo", snapshot1)
.setRestoreGlobalState(randomBoolean())
.setWaitForCompletion(true)
.get();
ensureGreen();

assertHitCount(client().prepareSearch(indexName).setSize(0).get(), nDocs);

logger.info("--> indexing [{}] more documents into [{}]", nDocs, indexName);
for (int j = 0; j < nDocs; j++) {
documents[j] = client.prepareIndex(indexName, "_doc").setSource("foo2", "bar2");
}
indexRandom(true, documents);

final String snapshot2 = "test-snap-2";
logger.info("--> creating snapshot [{}]", snapshot2);
final SnapshotInfo snapshotInfo2 = client().admin().cluster().prepareCreateSnapshot("test-repo", snapshot2)
.setWaitForCompletion(true)
.get()
.getSnapshotInfo();
assertThat(snapshotInfo2.state(), equalTo(SnapshotState.SUCCESS));
assertThat(snapshotInfo2.failedShards(), equalTo(0));
assertThat(snapshotInfo2.successfulShards(), equalTo(snapshotInfo.totalShards()));
assertThat(snapshotInfo2.indices(), hasSize(1));

logger.info("--> deleting index [{}]", indexName);
assertAcked(client().admin().indices().prepareDelete(indexName));

logger.info("--> restoring snapshot [{}]", snapshot2);
client().admin().cluster().prepareRestoreSnapshot("test-repo", snapshot2)
.setRestoreGlobalState(randomBoolean())
.setWaitForCompletion(true)
.get();

ensureGreen();

assertHitCount(client().prepareSearch(indexName).setSize(0).get(), 2 * nDocs);
}

public void testCannotCreateSnapshotsWithSameName() throws Exception {
final String repositoryName = "test-repo";
final String snapshotName = "test-snap";
Expand Down

0 comments on commit ada4164

Please sign in to comment.