diff --git a/server/src/internalClusterTest/java/org/elasticsearch/repositories/blobstore/BlobStoreCorruptionIT.java b/server/src/internalClusterTest/java/org/elasticsearch/repositories/blobstore/BlobStoreCorruptionIT.java new file mode 100644 index 0000000000000..422696d6b61c6 --- /dev/null +++ b/server/src/internalClusterTest/java/org/elasticsearch/repositories/blobstore/BlobStoreCorruptionIT.java @@ -0,0 +1,186 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.repositories.blobstore; + +import org.apache.lucene.tests.mockfile.ExtrasFS; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse; +import org.elasticsearch.action.support.ActionTestUtils; +import org.elasticsearch.action.support.SubscribableListener; +import org.elasticsearch.action.support.master.AcknowledgedResponse; +import org.elasticsearch.common.Strings; +import org.elasticsearch.core.CheckedConsumer; +import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshotsIntegritySuppressor; +import org.elasticsearch.logging.LogManager; +import org.elasticsearch.logging.Logger; +import org.elasticsearch.repositories.fs.FsRepository; +import org.elasticsearch.snapshots.AbstractSnapshotIntegTestCase; +import org.elasticsearch.snapshots.SnapshotState; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.hamcrest.ElasticsearchAssertions; +import org.junit.Before; + +import java.io.IOException; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.Base64; +import java.util.List; + +public class BlobStoreCorruptionIT extends AbstractSnapshotIntegTestCase { + + private static final Logger logger = LogManager.getLogger(BlobStoreCorruptionIT.class); + + @Before + public void suppressConsistencyCheck() { + disableRepoConsistencyCheck("testing corruption detection involves breaking the repo"); + } + + public void testCorruptionDetection() throws Exception { + final var repositoryName = randomIdentifier(); + final var indexName = randomIdentifier(); + final var snapshotName = randomIdentifier(); + final var repositoryRootPath = randomRepoPath(); + + createRepository(repositoryName, FsRepository.TYPE, repositoryRootPath); + createIndexWithRandomDocs(indexName, between(1, 100)); + flushAndRefresh(indexName); + createSnapshot(repositoryName, snapshotName, List.of(indexName)); + + final var corruptedFile = corruptRandomFile(repositoryRootPath); + final var corruptedFileType = RepositoryFileType.getRepositoryFileType(repositoryRootPath, corruptedFile); + final var corruptionDetectors = new ArrayList, ?>>(); + + // detect corruption by listing the snapshots + if (corruptedFileType == RepositoryFileType.SNAPSHOT_INFO) { + corruptionDetectors.add(exceptionListener -> { + logger.info("--> listing snapshots"); + client().admin() + .cluster() + .prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repositoryName) + .execute(ActionTestUtils.assertNoSuccessListener(exceptionListener::onResponse)); + }); + } + + // detect corruption by taking another snapshot + if (corruptedFileType == RepositoryFileType.SHARD_GENERATION) { + corruptionDetectors.add(exceptionListener -> { + logger.info("--> taking another snapshot"); + client().admin() + .cluster() + .prepareCreateSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, randomIdentifier()) + .setWaitForCompletion(true) + .execute(exceptionListener.map(createSnapshotResponse -> { + assertNotEquals(SnapshotState.SUCCESS, createSnapshotResponse.getSnapshotInfo().state()); + return new ElasticsearchException("create-snapshot failed as expected"); + })); + }); + } + + // detect corruption by restoring the snapshot + switch (corruptedFileType) { + case SNAPSHOT_INFO, GLOBAL_METADATA, INDEX_METADATA -> corruptionDetectors.add(exceptionListener -> { + logger.info("--> restoring snapshot"); + client().admin() + .cluster() + .prepareRestoreSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, snapshotName) + .setRestoreGlobalState(corruptedFileType == RepositoryFileType.GLOBAL_METADATA || randomBoolean()) + .setWaitForCompletion(true) + .execute(ActionTestUtils.assertNoSuccessListener(exceptionListener::onResponse)); + }); + case SHARD_SNAPSHOT_INFO, SHARD_DATA -> corruptionDetectors.add(exceptionListener -> { + logger.info("--> restoring snapshot and checking for failed shards"); + SubscribableListener + // if shard-level data is corrupted then the overall restore succeeds but the shard recoveries fail + .newForked(l -> client().admin().indices().prepareDelete(indexName).execute(l)) + .andThenAccept(ElasticsearchAssertions::assertAcked) + + .andThen( + l -> client().admin() + .cluster() + .prepareRestoreSnapshot(TEST_REQUEST_TIMEOUT, repositoryName, snapshotName) + .setRestoreGlobalState(randomBoolean()) + .setWaitForCompletion(true) + .execute(l) + ) + + .addListener(exceptionListener.map(restoreSnapshotResponse -> { + assertNotEquals(0, restoreSnapshotResponse.getRestoreInfo().failedShards()); + return new ElasticsearchException("post-restore recoveries failed as expected"); + })); + }); + } + + try (var ignored = new BlobStoreIndexShardSnapshotsIntegritySuppressor()) { + final var exception = safeAwait(randomFrom(corruptionDetectors)); + logger.info(Strings.format("--> corrupted [%s] and caught exception", corruptedFile), exception); + } + } + + private static Path corruptRandomFile(Path repositoryRootPath) throws IOException { + final var corruptedFileType = getRandomCorruptibleFileType(); + final var corruptedFile = getRandomFileToCorrupt(repositoryRootPath, corruptedFileType); + if (randomBoolean()) { + logger.info("--> deleting [{}]", corruptedFile); + Files.delete(corruptedFile); + } else { + corruptFileContents(corruptedFile); + } + return corruptedFile; + } + + private static void corruptFileContents(Path fileToCorrupt) throws IOException { + final var oldFileContents = Files.readAllBytes(fileToCorrupt); + logger.info("--> contents of [{}] before corruption: [{}]", fileToCorrupt, Base64.getEncoder().encodeToString(oldFileContents)); + final byte[] newFileContents = new byte[randomBoolean() ? oldFileContents.length : between(0, oldFileContents.length)]; + System.arraycopy(oldFileContents, 0, newFileContents, 0, newFileContents.length); + if (newFileContents.length == oldFileContents.length) { + final var corruptionPosition = between(0, newFileContents.length - 1); + newFileContents[corruptionPosition] = randomValueOtherThan(oldFileContents[corruptionPosition], ESTestCase::randomByte); + logger.info( + "--> updating byte at position [{}] from [{}] to [{}]", + corruptionPosition, + oldFileContents[corruptionPosition], + newFileContents[corruptionPosition] + ); + } else { + logger.info("--> truncating file from length [{}] to length [{}]", oldFileContents.length, newFileContents.length); + } + Files.write(fileToCorrupt, newFileContents); + logger.info("--> contents of [{}] after corruption: [{}]", fileToCorrupt, Base64.getEncoder().encodeToString(newFileContents)); + } + + private static RepositoryFileType getRandomCorruptibleFileType() { + return randomValueOtherThanMany( + // these blob types do not have reliable corruption detection, so we must skip them + t -> t == RepositoryFileType.ROOT_INDEX_N || t == RepositoryFileType.ROOT_INDEX_LATEST, + () -> randomFrom(RepositoryFileType.values()) + ); + } + + private static Path getRandomFileToCorrupt(Path repositoryRootPath, RepositoryFileType corruptedFileType) throws IOException { + final var corruptibleFiles = new ArrayList(); + Files.walkFileTree(repositoryRootPath, new SimpleFileVisitor<>() { + @Override + public FileVisitResult visitFile(Path filePath, BasicFileAttributes attrs) throws IOException { + if (ExtrasFS.isExtra(filePath.getFileName().toString()) == false + && RepositoryFileType.getRepositoryFileType(repositoryRootPath, filePath) == corruptedFileType) { + corruptibleFiles.add(filePath); + } + return super.visitFile(filePath, attrs); + } + }); + return randomFrom(corruptibleFiles); + } + +} diff --git a/server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshot.java b/server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshot.java index 2a8fe96151c11..817ecb4601d59 100644 --- a/server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshot.java +++ b/server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshot.java @@ -17,6 +17,7 @@ import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.xcontent.XContentParserUtils; import org.elasticsearch.core.Nullable; +import org.elasticsearch.gateway.CorruptStateException; import org.elasticsearch.index.store.StoreFileMetadata; import org.elasticsearch.xcontent.ParseField; import org.elasticsearch.xcontent.ToXContentFragment; @@ -318,7 +319,11 @@ public static FileInfo fromXContent(XContentParser parser) throws IOException { } case WRITER_UUID -> { writerUuid = new BytesRef(parser.binaryValue()); - assert writerUuid.length > 0; + assert BlobStoreIndexShardSnapshots.INTEGRITY_ASSERTIONS_ENABLED == false || writerUuid.length > 0; + if (writerUuid.length == 0) { + // we never write UNAVAILABLE_WRITER_UUID, so this must be due to corruption + throw new ElasticsearchParseException("invalid (empty) writer uuid"); + } } default -> XContentParserUtils.throwUnknownField(currentFieldName, parser); } @@ -336,6 +341,12 @@ public static FileInfo fromXContent(XContentParser parser) throws IOException { } else if (checksum == null) { throw new ElasticsearchParseException("missing checksum for name [" + name + "]"); } + try { + // check for corruption before asserting writtenBy is parseable in the StoreFileMetadata constructor + org.apache.lucene.util.Version.parse(writtenBy); + } catch (Exception e) { + throw new ElasticsearchParseException("invalid written_by [" + writtenBy + "]"); + } return new FileInfo(name, new StoreFileMetadata(physicalName, length, checksum, writtenBy, metaHash, writerUuid), partSize); } @@ -566,6 +577,11 @@ public static BlobStoreIndexShardSnapshot fromXContent(XContentParser parser) th } } + // check for corruption before asserting snapshot != null in the BlobStoreIndexShardSnapshot ctor + if (snapshot == null) { + throw new CorruptStateException("snapshot missing"); + } + return new BlobStoreIndexShardSnapshot( snapshot, indexFiles == null ? List.of() : indexFiles, diff --git a/server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshots.java b/server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshots.java index b17545a4cbeb6..30fbbba5ed095 100644 --- a/server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshots.java +++ b/server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshots.java @@ -264,6 +264,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws return builder; } + static volatile boolean INTEGRITY_ASSERTIONS_ENABLED = true; + public static BlobStoreIndexShardSnapshots fromXContent(XContentParser parser) throws IOException { XContentParser.Token token = parser.currentToken(); if (token == null) { // New parser @@ -317,7 +319,12 @@ public static BlobStoreIndexShardSnapshots fromXContent(XContentParser parser) t List fileInfosBuilder = new ArrayList<>(); for (String file : entry.v2()) { FileInfo fileInfo = files.get(file); - assert fileInfo != null; + if (fileInfo == null) { + // could happen in production if the repo contents are corrupted + final var exception = new IllegalStateException("shard index inconsistent at file [" + file + "]"); + assert INTEGRITY_ASSERTIONS_ENABLED == false : exception; + throw exception; + } fileInfosBuilder.add(fileInfo); } snapshots.add(new SnapshotFiles(entry.v1(), Collections.unmodifiableList(fileInfosBuilder), historyUUIDs.get(entry.v1()))); diff --git a/test/framework/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshotsIntegritySuppressor.java b/test/framework/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshotsIntegritySuppressor.java new file mode 100644 index 0000000000000..511116d9b2125 --- /dev/null +++ b/test/framework/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshotsIntegritySuppressor.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.index.snapshots.blobstore; + +import org.elasticsearch.core.Releasable; + +/** + * Test utility class to suppress assertions about the integrity of the contents of a blobstore repository, in order to verify the + * production behaviour on encountering invalid data. + */ +public class BlobStoreIndexShardSnapshotsIntegritySuppressor implements Releasable { + + public BlobStoreIndexShardSnapshotsIntegritySuppressor() { + BlobStoreIndexShardSnapshots.INTEGRITY_ASSERTIONS_ENABLED = false; + } + + @Override + public void close() { + BlobStoreIndexShardSnapshots.INTEGRITY_ASSERTIONS_ENABLED = true; + } +} diff --git a/test/framework/src/main/java/org/elasticsearch/repositories/blobstore/RepositoryFileType.java b/test/framework/src/main/java/org/elasticsearch/repositories/blobstore/RepositoryFileType.java new file mode 100644 index 0000000000000..014cbcd2bcc3a --- /dev/null +++ b/test/framework/src/main/java/org/elasticsearch/repositories/blobstore/RepositoryFileType.java @@ -0,0 +1,60 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.repositories.blobstore; + +import org.elasticsearch.common.Strings; + +import java.nio.file.Path; +import java.util.regex.Pattern; + +/** + * The types of blobs in a {@link BlobStoreRepository}. + */ +public enum RepositoryFileType { + + ROOT_INDEX_N("index-NUM"), + ROOT_INDEX_LATEST("index.latest"), + SNAPSHOT_INFO("snap-UUID.dat"), + GLOBAL_METADATA("meta-UUID.dat"), + INDEX_METADATA("indices/UUID/meta-SHORTUUID.dat"), + SHARD_GENERATION("indices/UUID/NUM/index-UUID"), + SHARD_SNAPSHOT_INFO("indices/UUID/NUM/snap-UUID.dat"), + SHARD_DATA("indices/UUID/NUM/__UUID"), + // NB no support for legacy names (yet) + ; + + private final Pattern pattern; + + RepositoryFileType(String regex) { + pattern = Pattern.compile( + "^(" + + regex + // decimal numbers + .replace("NUM", "(0|[1-9][0-9]*)") + // 15-byte UUIDS from TimeBasedUUIDGenerator + .replace("SHORTUUID", "[0-9a-zA-Z_-]{20}") + // 16-byte UUIDs from RandomBasedUUIDGenerator + .replace("UUID", "[0-9a-zA-Z_-]{22}") + + ")$" + ); + } + + public static RepositoryFileType getRepositoryFileType(Path repositoryRoot, Path blobPath) { + final var relativePath = repositoryRoot.relativize(blobPath).toString().replace(repositoryRoot.getFileSystem().getSeparator(), "/"); + for (final var repositoryFileType : RepositoryFileType.values()) { + if (repositoryFileType.pattern.matcher(relativePath).matches()) { + return repositoryFileType; + } + } + throw new IllegalArgumentException( + Strings.format("[%s] is not the path of a known blob type within [%s]", relativePath, repositoryRoot) + ); + } + +}