facebook · akankshamahajan15 · May 8, 2022 · May 11, 2022 · May 11, 2022
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
@@ -308,7 +308,7 @@ class CorruptionTest : public testing::Test {
     uint64_t new_size = old_size - bytes_to_truncate;
     // If bytes_to_truncate == 0, it will do full truncation.
     if (bytes_to_truncate == 0) {
-      new_size = old_size;
+      new_size = 0;
     }
     ASSERT_OK(test::TruncateFile(env_, path, new_size));
   }
@@ -1049,16 +1049,23 @@ INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest,
 // "Column family inconsistency: SST file contains data beyond the point of
 // corruption" error will be hit, causing recovery to fail.
 //
-// After adding the fix, corrupted WALs whose numbers are larger than the
-// corrupted wal and smaller than the new WAL are moved to a separate folder.
-// Only after new WAL is synced, RocksDB persist a new MANIFEST with column
-// families to ensure RocksDB is in consistent state.
+// After adding the fix, only after new WAL is synced, RocksDB persist a new
+// MANIFEST with column families to ensure RocksDB is in consistent state.
 // RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
 // synced immediately afterwards. The sequence number of the sentinel
 // WriteBatch will be the next sequence number immediately after the largest
 // sequence number recovered from previous WALs and MANIFEST because of which DB
 // will be in consistent state.
-TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
+// If a future recovery starts from the new MANIFEST, then it means the new WAL
+// is successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
+// If future recovery starts from the old MANIFEST, it means the writing the new
+// MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, DISABLED_CrashDuringRecovery) {
   CloseDb();
   Options options;
   options.track_and_verify_wals_in_manifest =
@@ -1068,7 +1075,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
   options.env = env_;
   ASSERT_OK(DestroyDB(dbname_, options));
   options.create_if_missing = true;
-  options.max_write_buffer_number = 3;
+  options.max_write_buffer_number = 8;
 
   Reopen(&options);
   Status s;
@@ -1094,13 +1101,15 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
 
     // Write one key to test_cf.
     ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
+    ASSERT_OK(db_->Flush(FlushOptions(), handles[1]));
+
     // Write to default_cf and flush this cf several times to advance wal
-    // number.
+    // number. TEST_SwitchMemtable makes sure WALs are not synced and test can
+    // corrupt un-sync WAL.
     for (int i = 0; i < 2; ++i) {
       ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value"));
       ASSERT_OK(dbimpl->TEST_SwitchMemtable());
     }
-    ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));
 
     for (auto* h : handles) {
       delete h;
@@ -1109,12 +1118,13 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
     CloseDb();
   }
 
-  // 2. Corrupt second last wal file to emulate power reset which caused the DB
-  // to lose the un-synced WAL.
+  // 2. Corrupt second last un-syned wal file to emulate power reset which
+  // caused the DB to lose the un-synced WAL.
   {
     std::vector<uint64_t> file_nums;
     GetSortedWalFiles(file_nums);
     size_t size = file_nums.size();
+    assert(size >= 2);
     uint64_t log_num = file_nums[size - 2];
     CorruptFileWithTruncation(FileType::kWalFile, log_num,
                               /*bytes_to_truncate=*/8);
@@ -1131,25 +1141,31 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
   // Case2: If avoid_flush_during_recovery = false, all column families have
   // flushed their data from WAL to L0 during recovery, and none of them will
   // ever need to read the WALs again.
+
+  // 4. Fault is injected to fail the recovery.
   {
-    options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+          auto* tmp_s = reinterpret_cast<Status*>(arg);
+          assert(tmp_s);
+          *tmp_s = Status::IOError("Injected");
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    handles.clear();
+    options.avoid_flush_during_recovery = true;
     s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
-    ASSERT_OK(s);
+    ASSERT_TRUE(s.IsIOError());
+    ASSERT_EQ("IO error: Injected", s.ToString());
     for (auto* h : handles) {
       delete h;
     }
-    handles.clear();
     CloseDb();
-  }
 
-  // 4. Corrupt max_wal_num to emulate second power reset which caused the
-  // DB to again lose the un-synced WAL.
-  {
-    std::vector<uint64_t> file_nums;
-    GetSortedWalFiles(file_nums);
-    size_t size = file_nums.size();
-    uint64_t log_num = file_nums[size - 1];
-    CorruptFileWithTruncation(FileType::kWalFile, log_num);
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
   }
 
   // 5. After second crash reopen the db with second corruption. Default family
@@ -1187,16 +1203,24 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
 // file contains data beyond the point of corruption" error will be hit, causing
 // recovery to fail.
 //
-// After adding the fix, corrupted WALs whose numbers are larger than the
-// corrupted wal and smaller than the new WAL are moved to a separate folder.
-// Only after new WAL is synced, RocksDB persist a new MANIFEST with column
-// families to ensure RocksDB is in consistent state.
+// After adding the fix, only after new WAL is synced, RocksDB persist a new
+// MANIFEST with column families to ensure RocksDB is in consistent state.
 // RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
 // synced immediately afterwards. The sequence number of the sentinel
 // WriteBatch will be the next sequence number immediately after the largest
 // sequence number recovered from previous WALs and MANIFEST because of which DB
 // will be in consistent state.
-TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
+// If a future recovery starts from the new MANIFEST, then it means the new WAL
+// is successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
+// If future recovery starts from the old MANIFEST, it means the writing the new
+// MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest,
+       DISABLED_TxnDbCrashDuringRecovery) {
   CloseDb();
   Options options;
   options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
@@ -1229,7 +1253,6 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
   // advance wal number so that some column families have advanced log_number
   // while other don't.
   {
-    options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
                                   &handles, &txn_db));
 
@@ -1238,6 +1261,8 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
     ASSERT_OK(txn->Put(handles[1], "foo", "value"));
     ASSERT_OK(txn->SetName("txn0"));
     ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn_db->Flush(FlushOptions()));
+
     delete txn;
     txn = nullptr;
 
@@ -1257,6 +1282,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
 
     delete txn;
     txn = nullptr;
+
     for (auto* h : handles) {
       delete h;
     }
@@ -1270,6 +1296,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
     std::vector<uint64_t> file_nums;
     GetSortedWalFiles(file_nums);
     size_t size = file_nums.size();
+    assert(size >= 2);
     uint64_t log_num = file_nums[size - 2];
     CorruptFileWithTruncation(FileType::kWalFile, log_num,
                               /*bytes_to_truncate=*/8);
@@ -1279,24 +1306,38 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
   // family has higher log number than corrupted wal number. There may be old
   // WAL files that it must not delete because they can contain data of
   // uncommitted transactions. As a result, min_log_number_to_keep won't change.
+
   {
-    options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
-    ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
-                                  &handles, &txn_db));
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::Open::BeforeSyncWAL", [&](void* arg) {
+          auto* tmp_s = reinterpret_cast<Status*>(arg);
+          assert(tmp_s);
+          *tmp_s = Status::IOError("Injected");
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
 
+    handles.clear();
+    s = TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles,
+                            &txn_db);
+    ASSERT_TRUE(s.IsIOError());
+    ASSERT_EQ("IO error: Injected", s.ToString());
     for (auto* h : handles) {
       delete h;
     }
-    handles.clear();
-    delete txn_db;
+    CloseDb();
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
   }
 
-  // 4. Corrupt max_wal_num to emulate second power reset which caused the
-  // DB to again lose the un-synced WAL.
+  // 4. Corrupt max_wal_num.
   {
     std::vector<uint64_t> file_nums;
     GetSortedWalFiles(file_nums);
     size_t size = file_nums.size();
+    assert(size >= 2);
     uint64_t log_num = file_nums[size - 1];
     CorruptFileWithTruncation(FileType::kWalFile, log_num);
   }
@@ -1311,8 +1352,6 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
   // recovered from previous WALs and MANIFEST, db will be in consistent state
   // and opens successfully.
   {
-    options.avoid_flush_during_recovery = false;
-
     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
                                   &handles, &txn_db));
     for (auto* h : handles) {
@@ -1322,6 +1361,134 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
   }
 }
 
+// This test is similar to
+// CrashDuringRecoveryWithCorruptionTest.CrashDuringRecovery except it calls
+// flush and corrupts Last WAL. It calls flush to sync some of the WALs and
+// remaining are unsyned one of which is then corrupted to simulate crash.
+//
+// In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB
+// won't flush the data from WAL to L0 for all column families if possible. As a
+// result, not all column families can increase their log_numbers, and
+// min_log_number_to_keep won't change.
+// It may prematurely persist a new MANIFEST even before we can declare the DB
+// is in consistent state after recovery (this is when the new WAL is synced)
+// and advances log_numbers for some column families.
+//
+// If there is power failure before we sync the new WAL, we will end up in
+// a situation in which after persisting the MANIFEST, RocksDB will see some
+// column families' log_numbers larger than the corrupted wal, and
+// "Column family inconsistency: SST file contains data beyond the point of
+// corruption" error will be hit, causing recovery to fail.
+//
+// After adding the fix, only after new WAL is synced, RocksDB persist a new
+// MANIFEST with column families to ensure RocksDB is in consistent state.
+// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
+// synced immediately afterwards. The sequence number of the sentinel
+// WriteBatch will be the next sequence number immediately after the largest
+// sequence number recovered from previous WALs and MANIFEST because of which DB
+// will be in consistent state.
+// If a future recovery starts from the new MANIFEST, then it means the new WAL
+// is successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
+// If future recovery starts from the old MANIFEST, it means the writing the new
+// MANIFEST failed. It won't have the "SST ahead of WAL" error.
+
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest,
+       DISABLED_CrashDuringRecoveryWithFlush) {
+  CloseDb();
+  Options options;
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  options.avoid_flush_during_recovery = false;
+  options.env = env_;
+  options.create_if_missing = true;
+
+  ASSERT_OK(DestroyDB(dbname_, options));
+  Reopen(&options);
+
+  ColumnFamilyHandle* cfh = nullptr;
+  const std::string test_cf_name = "test_cf";
+  Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+  ASSERT_OK(s);
+  delete cfh;
+
+  CloseDb();
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, options);
+  std::vector<ColumnFamilyHandle*> handles;
+
+  {
+    ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+    // Write one key to test_cf.
+    ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
+
+    // Write to default_cf and flush this cf several times to advance wal
+    // number.
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value"));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+
+    ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));
+    for (auto* h : handles) {
+      delete h;
+    }
+    handles.clear();
+    CloseDb();
+  }
+
+  // Corrupt second last un-syned wal file to emulate power reset which
+  // caused the DB to lose the un-synced WAL.
+  {
+    std::vector<uint64_t> file_nums;
+    GetSortedWalFiles(file_nums);
+    size_t size = file_nums.size();
+    uint64_t log_num = file_nums[size - 1];
+    CorruptFileWithTruncation(FileType::kWalFile, log_num,
+                              /*bytes_to_truncate=*/8);
+  }
+
+  // Fault is injected to fail the recovery.
+  {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+          auto* tmp_s = reinterpret_cast<Status*>(arg);
+          assert(tmp_s);
+          *tmp_s = Status::IOError("Injected");
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    handles.clear();
+    options.avoid_flush_during_recovery = true;
+    s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
+    ASSERT_TRUE(s.IsIOError());
+    ASSERT_EQ("IO error: Injected", s.ToString());
+    for (auto* h : handles) {
+      delete h;
+    }
+    CloseDb();
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
+  // Reopen db again
+  {
+    options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+    ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+    for (auto* h : handles) {
+      delete h;
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
@@ -1326,6 +1326,7 @@ Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
   Status s;
   // This gets the appear size of the wals, not including preallocated space.
   s = env_->GetFileSize(fname, &log.size);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s);
   if (s.ok() && truncate) {
     std::unique_ptr<FSWritableFile> last_log;
     Status truncate_status = fs_->ReopenWritableFile(
@@ -1825,6 +1826,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
         if (s.ok()) {
           // Need to fsync, otherwise it might get lost after a power reset.
           s = impl->FlushWAL(false);
+          TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s);
           if (s.ok()) {
             s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
           }