Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update WAL corruption test so that it fails without fix #9942

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 205 additions & 38 deletions db/corruption_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ class CorruptionTest : public testing::Test {
uint64_t new_size = old_size - bytes_to_truncate;
// If bytes_to_truncate == 0, it will do full truncation.
if (bytes_to_truncate == 0) {
new_size = old_size;
new_size = 0;
}
ASSERT_OK(test::TruncateFile(env_, path, new_size));
}
Expand Down Expand Up @@ -1049,16 +1049,23 @@ INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest,
// "Column family inconsistency: SST file contains data beyond the point of
// corruption" error will be hit, causing recovery to fail.
//
// After adding the fix, corrupted WALs whose numbers are larger than the
// corrupted wal and smaller than the new WAL are moved to a separate folder.
// Only after new WAL is synced, RocksDB persist a new MANIFEST with column
// families to ensure RocksDB is in consistent state.
// After adding the fix, only after new WAL is synced, RocksDB persist a new
// MANIFEST with column families to ensure RocksDB is in consistent state.
// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
// synced immediately afterwards. The sequence number of the sentinel
// WriteBatch will be the next sequence number immediately after the largest
// sequence number recovered from previous WALs and MANIFEST because of which DB
// will be in consistent state.
TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
// If a future recovery starts from the new MANIFEST, then it means the new WAL
// is successfully synced. Due to the sentinel empty write batch at the
// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
// If future recovery starts from the old MANIFEST, it means the writing the new
// MANIFEST failed. It won't have the "SST ahead of WAL" error.
//
// The combination of corrupting a WAL and injecting an error during subsequent
// re-open exposes the bug of prematurely persisting a new MANIFEST with
// advanced ColumnFamilyData::log_number.
TEST_P(CrashDuringRecoveryWithCorruptionTest, DISABLED_CrashDuringRecovery) {
CloseDb();
Options options;
options.track_and_verify_wals_in_manifest =
Expand All @@ -1068,7 +1075,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
options.env = env_;
ASSERT_OK(DestroyDB(dbname_, options));
options.create_if_missing = true;
options.max_write_buffer_number = 3;
options.max_write_buffer_number = 8;

Reopen(&options);
Status s;
Expand All @@ -1094,13 +1101,15 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {

// Write one key to test_cf.
ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
ASSERT_OK(db_->Flush(FlushOptions(), handles[1]));

// Write to default_cf and flush this cf several times to advance wal
// number.
// number. TEST_SwitchMemtable makes sure WALs are not synced and test can
// corrupt un-sync WAL.
for (int i = 0; i < 2; ++i) {
ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value"));
ASSERT_OK(dbimpl->TEST_SwitchMemtable());
}
ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));

for (auto* h : handles) {
delete h;
Expand All @@ -1109,12 +1118,13 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
CloseDb();
}

// 2. Corrupt second last wal file to emulate power reset which caused the DB
// to lose the un-synced WAL.
// 2. Corrupt second last un-syned wal file to emulate power reset which
// caused the DB to lose the un-synced WAL.
{
std::vector<uint64_t> file_nums;
GetSortedWalFiles(file_nums);
size_t size = file_nums.size();
assert(size >= 2);
uint64_t log_num = file_nums[size - 2];
CorruptFileWithTruncation(FileType::kWalFile, log_num,
/*bytes_to_truncate=*/8);
Expand All @@ -1131,25 +1141,31 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
// Case2: If avoid_flush_during_recovery = false, all column families have
// flushed their data from WAL to L0 during recovery, and none of them will
// ever need to read the WALs again.

// 4. Fault is injected to fail the recovery.
{
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
auto* tmp_s = reinterpret_cast<Status*>(arg);
assert(tmp_s);
*tmp_s = Status::IOError("Injected");
});
SyncPoint::GetInstance()->EnableProcessing();

handles.clear();
options.avoid_flush_during_recovery = true;
s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
ASSERT_OK(s);
ASSERT_TRUE(s.IsIOError());
ASSERT_EQ("IO error: Injected", s.ToString());
for (auto* h : handles) {
delete h;
}
handles.clear();
CloseDb();
}

// 4. Corrupt max_wal_num to emulate second power reset which caused the
// DB to again lose the un-synced WAL.
{
std::vector<uint64_t> file_nums;
GetSortedWalFiles(file_nums);
size_t size = file_nums.size();
uint64_t log_num = file_nums[size - 1];
CorruptFileWithTruncation(FileType::kWalFile, log_num);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}

// 5. After second crash reopen the db with second corruption. Default family
Expand Down Expand Up @@ -1187,16 +1203,24 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
// file contains data beyond the point of corruption" error will be hit, causing
// recovery to fail.
//
// After adding the fix, corrupted WALs whose numbers are larger than the
// corrupted wal and smaller than the new WAL are moved to a separate folder.
// Only after new WAL is synced, RocksDB persist a new MANIFEST with column
// families to ensure RocksDB is in consistent state.
// After adding the fix, only after new WAL is synced, RocksDB persist a new
// MANIFEST with column families to ensure RocksDB is in consistent state.
// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
// synced immediately afterwards. The sequence number of the sentinel
// WriteBatch will be the next sequence number immediately after the largest
// sequence number recovered from previous WALs and MANIFEST because of which DB
// will be in consistent state.
TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
// If a future recovery starts from the new MANIFEST, then it means the new WAL
// is successfully synced. Due to the sentinel empty write batch at the
// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
// If future recovery starts from the old MANIFEST, it means the writing the new
// MANIFEST failed. It won't have the "SST ahead of WAL" error.
//
// The combination of corrupting a WAL and injecting an error during subsequent
// re-open exposes the bug of prematurely persisting a new MANIFEST with
// advanced ColumnFamilyData::log_number.
TEST_P(CrashDuringRecoveryWithCorruptionTest,
DISABLED_TxnDbCrashDuringRecovery) {
CloseDb();
Options options;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
Expand Down Expand Up @@ -1229,7 +1253,6 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
// advance wal number so that some column families have advanced log_number
// while other don't.
{
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
&handles, &txn_db));

Expand All @@ -1238,6 +1261,8 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
ASSERT_OK(txn->Put(handles[1], "foo", "value"));
ASSERT_OK(txn->SetName("txn0"));
ASSERT_OK(txn->Prepare());
ASSERT_OK(txn_db->Flush(FlushOptions()));

delete txn;
txn = nullptr;

Expand All @@ -1257,6 +1282,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {

delete txn;
txn = nullptr;

for (auto* h : handles) {
delete h;
}
Expand All @@ -1270,6 +1296,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
std::vector<uint64_t> file_nums;
GetSortedWalFiles(file_nums);
size_t size = file_nums.size();
assert(size >= 2);
uint64_t log_num = file_nums[size - 2];
CorruptFileWithTruncation(FileType::kWalFile, log_num,
/*bytes_to_truncate=*/8);
Expand All @@ -1279,24 +1306,38 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
// family has higher log number than corrupted wal number. There may be old
// WAL files that it must not delete because they can contain data of
// uncommitted transactions. As a result, min_log_number_to_keep won't change.

{
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
&handles, &txn_db));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::Open::BeforeSyncWAL", [&](void* arg) {
auto* tmp_s = reinterpret_cast<Status*>(arg);
assert(tmp_s);
*tmp_s = Status::IOError("Injected");
});
SyncPoint::GetInstance()->EnableProcessing();

handles.clear();
s = TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles,
&txn_db);
ASSERT_TRUE(s.IsIOError());
ASSERT_EQ("IO error: Injected", s.ToString());
for (auto* h : handles) {
delete h;
}
handles.clear();
delete txn_db;
CloseDb();

SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}

// 4. Corrupt max_wal_num to emulate second power reset which caused the
// DB to again lose the un-synced WAL.
// 4. Corrupt max_wal_num.
{
std::vector<uint64_t> file_nums;
GetSortedWalFiles(file_nums);
size_t size = file_nums.size();
assert(size >= 2);
uint64_t log_num = file_nums[size - 1];
CorruptFileWithTruncation(FileType::kWalFile, log_num);
}
Expand All @@ -1311,8 +1352,6 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
// recovered from previous WALs and MANIFEST, db will be in consistent state
// and opens successfully.
{
options.avoid_flush_during_recovery = false;

ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
&handles, &txn_db));
for (auto* h : handles) {
Expand All @@ -1322,6 +1361,134 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
}
}

// This test is similar to
// CrashDuringRecoveryWithCorruptionTest.CrashDuringRecovery except it calls
// flush and corrupts Last WAL. It calls flush to sync some of the WALs and
// remaining are unsyned one of which is then corrupted to simulate crash.
//
// In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB
// won't flush the data from WAL to L0 for all column families if possible. As a
// result, not all column families can increase their log_numbers, and
// min_log_number_to_keep won't change.
// It may prematurely persist a new MANIFEST even before we can declare the DB
// is in consistent state after recovery (this is when the new WAL is synced)
// and advances log_numbers for some column families.
//
// If there is power failure before we sync the new WAL, we will end up in
// a situation in which after persisting the MANIFEST, RocksDB will see some
// column families' log_numbers larger than the corrupted wal, and
// "Column family inconsistency: SST file contains data beyond the point of
// corruption" error will be hit, causing recovery to fail.
//
// After adding the fix, only after new WAL is synced, RocksDB persist a new
// MANIFEST with column families to ensure RocksDB is in consistent state.
// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
// synced immediately afterwards. The sequence number of the sentinel
// WriteBatch will be the next sequence number immediately after the largest
// sequence number recovered from previous WALs and MANIFEST because of which DB
// will be in consistent state.
// If a future recovery starts from the new MANIFEST, then it means the new WAL
// is successfully synced. Due to the sentinel empty write batch at the
// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
// If future recovery starts from the old MANIFEST, it means the writing the new
// MANIFEST failed. It won't have the "SST ahead of WAL" error.

// The combination of corrupting a WAL and injecting an error during subsequent
// re-open exposes the bug of prematurely persisting a new MANIFEST with
// advanced ColumnFamilyData::log_number.
TEST_P(CrashDuringRecoveryWithCorruptionTest,
DISABLED_CrashDuringRecoveryWithFlush) {
CloseDb();
Options options;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
options.avoid_flush_during_recovery = false;
options.env = env_;
options.create_if_missing = true;

ASSERT_OK(DestroyDB(dbname_, options));
Reopen(&options);

ColumnFamilyHandle* cfh = nullptr;
const std::string test_cf_name = "test_cf";
Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
ASSERT_OK(s);
delete cfh;

CloseDb();

std::vector<ColumnFamilyDescriptor> cf_descs;
cf_descs.emplace_back(kDefaultColumnFamilyName, options);
cf_descs.emplace_back(test_cf_name, options);
std::vector<ColumnFamilyHandle*> handles;

{
ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));

// Write one key to test_cf.
ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));

// Write to default_cf and flush this cf several times to advance wal
// number.
for (int i = 0; i < 2; ++i) {
ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value"));
ASSERT_OK(db_->Flush(FlushOptions()));
}

ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));
for (auto* h : handles) {
delete h;
}
handles.clear();
CloseDb();
}

// Corrupt second last un-syned wal file to emulate power reset which
// caused the DB to lose the un-synced WAL.
{
std::vector<uint64_t> file_nums;
GetSortedWalFiles(file_nums);
size_t size = file_nums.size();
uint64_t log_num = file_nums[size - 1];
CorruptFileWithTruncation(FileType::kWalFile, log_num,
/*bytes_to_truncate=*/8);
}

// Fault is injected to fail the recovery.
{
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
auto* tmp_s = reinterpret_cast<Status*>(arg);
assert(tmp_s);
*tmp_s = Status::IOError("Injected");
});
SyncPoint::GetInstance()->EnableProcessing();

handles.clear();
options.avoid_flush_during_recovery = true;
s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
ASSERT_TRUE(s.IsIOError());
ASSERT_EQ("IO error: Injected", s.ToString());
for (auto* h : handles) {
delete h;
}
CloseDb();

SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}

// Reopen db again
{
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
for (auto* h : handles) {
delete h;
}
}
}

} // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
Expand Down
2 changes: 2 additions & 0 deletions db/db_impl/db_impl_open.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1326,6 +1326,7 @@ Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
Status s;
// This gets the appear size of the wals, not including preallocated space.
s = env_->GetFileSize(fname, &log.size);
TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s);
if (s.ok() && truncate) {
std::unique_ptr<FSWritableFile> last_log;
Status truncate_status = fs_->ReopenWritableFile(
Expand Down Expand Up @@ -1825,6 +1826,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
if (s.ok()) {
// Need to fsync, otherwise it might get lost after a power reset.
s = impl->FlushWAL(false);
TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s);
if (s.ok()) {
s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
}
Expand Down