From 2bcb47a62c68218f640de8dbf04a491a5955f08c Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 24 May 2022 21:56:46 +0800 Subject: [PATCH] PageStorage: Fix the WAL snapshot dumped into file may contains invalid "being_ref_count" (#4987) (#4992) close pingcap/tiflash#4957, close pingcap/tiflash#4986 --- dbms/src/Encryption/FileProvider.cpp | 6 +- dbms/src/Encryption/FileProvider.h | 1 + dbms/src/Encryption/MockKeyManager.cpp | 6 ++ dbms/src/Encryption/MockKeyManager.h | 11 ++- dbms/src/Storages/DeltaMerge/StoragePool.cpp | 2 +- dbms/src/Storages/Page/PageStorage.h | 3 +- dbms/src/Storages/Page/V3/PageDirectory.cpp | 22 ++++- dbms/src/Storages/Page/V3/PageDirectory.h | 2 +- .../Storages/Page/V3/PageDirectoryFactory.cpp | 9 +- .../Storages/Page/V3/PageDirectoryFactory.h | 3 + dbms/src/Storages/Page/V3/PageStorageImpl.cpp | 2 +- dbms/src/Storages/Page/V3/PageStorageImpl.h | 4 +- dbms/src/Storages/Page/V3/WALStore.cpp | 35 +++---- dbms/src/Storages/Page/V3/WALStore.h | 79 ++++++++-------- .../Storages/Page/V3/tests/entries_helper.h | 5 +- .../Page/V3/tests/gtest_page_directory.cpp | 91 ++++++++++++++++++- .../Page/V3/tests/gtest_page_storage.cpp | 29 ++++++ .../Page/V3/tests/gtest_wal_store.cpp | 10 +- dbms/src/Storages/Page/WALRecoveryMode.h | 61 +++++++++++++ 19 files changed, 294 insertions(+), 87 deletions(-) create mode 100644 dbms/src/Storages/Page/WALRecoveryMode.h diff --git a/dbms/src/Encryption/FileProvider.cpp b/dbms/src/Encryption/FileProvider.cpp index b4666cf6a71..f2f96fa8568 100644 --- a/dbms/src/Encryption/FileProvider.cpp +++ b/dbms/src/Encryption/FileProvider.cpp @@ -142,8 +142,12 @@ void FileProvider::deleteRegularFile(const String & file_path_, const Encryption { throw DB::TiFlashException("File: " + data_file.path() + " is not a regular file", Errors::Encryption::Internal); } - key_manager->deleteFile(encryption_path_.full_path, true); + // Remove the file on disk before removing the encryption key. Or we may leave an encrypted file without the encryption key + // and the encrypted file can not be read. + // In the worst case that TiFlash crash between removing the file on disk and removing the encryption key, we may leave + // the encryption key not deleted. However, this is a rare case and won't cause serious problem. data_file.remove(false); + key_manager->deleteFile(encryption_path_.full_path, true); } } diff --git a/dbms/src/Encryption/FileProvider.h b/dbms/src/Encryption/FileProvider.h index 79eec8f632b..1d23af7a2db 100644 --- a/dbms/src/Encryption/FileProvider.h +++ b/dbms/src/Encryption/FileProvider.h @@ -67,6 +67,7 @@ class FileProvider // If dir_path_as_encryption_path is true, use dir_path_ as EncryptionPath // If false, use every file's path inside dir_path_ as EncryptionPath + // Note this method is not atomic, and after calling it, the files in dir_path_ cannot be read again. void deleteDirectory( const String & dir_path_, bool dir_path_as_encryption_path = false, diff --git a/dbms/src/Encryption/MockKeyManager.cpp b/dbms/src/Encryption/MockKeyManager.cpp index d125961fd06..bbaeb37848a 100644 --- a/dbms/src/Encryption/MockKeyManager.cpp +++ b/dbms/src/Encryption/MockKeyManager.cpp @@ -13,8 +13,10 @@ // limitations under the License. #include +#include #include #include +#include #include #include @@ -40,12 +42,14 @@ MockKeyManager::MockKeyManager(EncryptionMethod method_, const String & key_, co , key{key_} , iv{iv} , encryption_enabled{encryption_enabled_} + , logger(DB::Logger::get("MockKeyManager")) {} FileEncryptionInfo MockKeyManager::newFile(const String & fname) { if (encryption_enabled) { + LOG_FMT_TRACE(logger, "Create mock encryption [file={}]", fname); files.emplace_back(fname); } return getFile(fname); @@ -64,6 +68,7 @@ void MockKeyManager::deleteFile(const String & fname, bool throw_on_error) { if (*iter == fname) { + LOG_FMT_TRACE(logger, "Delete mock encryption [file={}]", fname); files.erase(iter); break; } @@ -80,6 +85,7 @@ void MockKeyManager::linkFile(const String & src_fname, const String & dst_fname { throw DB::Exception(fmt::format("Can't find file which name is {}", src_fname), DB::ErrorCodes::LOGICAL_ERROR); } + LOG_FMT_TRACE(logger, "Link mock encryption file [src_file={}] [dst_file={}]", src_fname, dst_fname); files.emplace_back(dst_fname); } } diff --git a/dbms/src/Encryption/MockKeyManager.h b/dbms/src/Encryption/MockKeyManager.h index 914e6ab1fe4..268bb00d129 100644 --- a/dbms/src/Encryption/MockKeyManager.h +++ b/dbms/src/Encryption/MockKeyManager.h @@ -20,12 +20,15 @@ namespace DB { -class MockKeyManager : public KeyManager +class Logger; +using LoggerPtr = std::shared_ptr; + +class MockKeyManager final : public KeyManager { public: - ~MockKeyManager() = default; + ~MockKeyManager() override = default; - MockKeyManager(bool encryption_enabled_ = true); + explicit MockKeyManager(bool encryption_enabled_ = true); MockKeyManager(EncryptionMethod method_, const String & key_, const String & iv, bool encryption_enabled_ = true); @@ -50,5 +53,7 @@ class MockKeyManager : public KeyManager String key; String iv; bool encryption_enabled; + + LoggerPtr logger; }; } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/StoragePool.cpp b/dbms/src/Storages/DeltaMerge/StoragePool.cpp index a040c5b6c6a..fa765cd9b1d 100644 --- a/dbms/src/Storages/DeltaMerge/StoragePool.cpp +++ b/dbms/src/Storages/DeltaMerge/StoragePool.cpp @@ -130,7 +130,7 @@ void GlobalStoragePool::restore() bool GlobalStoragePool::gc() { - return gc(Settings(), true, DELTA_MERGE_GC_PERIOD); + return gc(global_context.getSettingsRef(), true, DELTA_MERGE_GC_PERIOD); } bool GlobalStoragePool::gc(const Settings & settings, bool immediately, const Seconds & try_gc_period) diff --git a/dbms/src/Storages/Page/PageStorage.h b/dbms/src/Storages/Page/PageStorage.h index 479c368a585..78d800dc3af 100644 --- a/dbms/src/Storages/Page/PageStorage.h +++ b/dbms/src/Storages/Page/PageStorage.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -140,7 +141,7 @@ class PageStorage : private boost::noncopyable SettingUInt64 blob_block_alignment_bytes = 0; SettingUInt64 wal_roll_size = PAGE_META_ROLL_SIZE; - SettingUInt64 wal_recover_mode = 0; + SettingUInt64 wal_recover_mode = static_cast(WALRecoveryMode::TolerateCorruptedTailRecords); SettingUInt64 wal_max_persisted_log_files = MAX_PERSISTED_LOG_FILES; void reload(const Config & rhs) diff --git a/dbms/src/Storages/Page/V3/PageDirectory.cpp b/dbms/src/Storages/Page/V3/PageDirectory.cpp index 64a3fead674..d944fe422d3 100644 --- a/dbms/src/Storages/Page/V3/PageDirectory.cpp +++ b/dbms/src/Storages/Page/V3/PageDirectory.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1191,16 +1192,31 @@ PageDirectory::getEntriesByBlobIds(const std::vector & blob_ids) con return std::make_pair(std::move(blob_versioned_entries), total_page_size); } -bool PageDirectory::tryDumpSnapshot(const WriteLimiterPtr & write_limiter) +bool PageDirectory::tryDumpSnapshot(const ReadLimiterPtr & read_limiter, const WriteLimiterPtr & write_limiter) { bool done_any_io = false; // In order not to make read amplification too high, only apply compact logs when ... auto files_snap = wal->getFilesSnapshot(); if (files_snap.needSave(max_persisted_log_files)) { + // To prevent writes from affecting dumping snapshot (and vice versa), old log files + // are read from disk and a temporary PageDirectory is generated for dumping snapshot. + // The main reason write affect dumping snapshot is that we can not get a read-only + // `being_ref_count` by the function `createSnapshot()`. + assert(!files_snap.persisted_log_files.empty()); // should not be empty when `needSave` return true + auto log_num = files_snap.persisted_log_files.rbegin()->log_num; + auto identifier = fmt::format("{}_dump_{}", wal->name(), log_num); + auto snapshot_reader = wal->createReaderForFiles(identifier, files_snap.persisted_log_files, read_limiter); + PageDirectoryFactory factory; + // we just use the `collapsed_dir` to dump edit of the snapshot, should never call functions like `apply` that + // persist new logs into disk. So we pass `nullptr` as `wal` to the factory. + PageDirectoryPtr collapsed_dir = factory.createFromReader( + identifier, + std::move(snapshot_reader), + /*wal=*/nullptr); // The records persisted in `files_snap` is older than or equal to all records in `edit` - auto edit = dumpSnapshotToEdit(); - done_any_io = wal->saveSnapshot(std::move(files_snap), std::move(edit), write_limiter); + auto edit_from_disk = collapsed_dir->dumpSnapshotToEdit(); + done_any_io = wal->saveSnapshot(std::move(files_snap), std::move(edit_from_disk), write_limiter); } return done_any_io; } diff --git a/dbms/src/Storages/Page/V3/PageDirectory.h b/dbms/src/Storages/Page/V3/PageDirectory.h index a3c6b079fee..39b5a05a40a 100644 --- a/dbms/src/Storages/Page/V3/PageDirectory.h +++ b/dbms/src/Storages/Page/V3/PageDirectory.h @@ -358,7 +358,7 @@ class PageDirectory void gcApply(PageEntriesEdit && migrated_edit, const WriteLimiterPtr & write_limiter = nullptr); - bool tryDumpSnapshot(const WriteLimiterPtr & write_limiter = nullptr); + bool tryDumpSnapshot(const ReadLimiterPtr & read_limiter = nullptr, const WriteLimiterPtr & write_limiter = nullptr); PageEntriesV3 gcInMemEntries(); diff --git a/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp b/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp index 9d20e0a64ab..483c5073ab5 100644 --- a/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp +++ b/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp @@ -31,7 +31,12 @@ namespace PS::V3 PageDirectoryPtr PageDirectoryFactory::create(String storage_name, FileProviderPtr & file_provider, PSDiskDelegatorPtr & delegator, WALStore::Config config) { auto [wal, reader] = WALStore::create(storage_name, file_provider, delegator, config); - PageDirectoryPtr dir = std::make_unique(std::move(storage_name), std::move(wal), config.max_persisted_log_files); + return createFromReader(storage_name, reader, std::move(wal)); +} + +PageDirectoryPtr PageDirectoryFactory::createFromReader(String storage_name, WALStoreReaderPtr reader, WALStorePtr wal) +{ + PageDirectoryPtr dir = std::make_unique(storage_name, std::move(wal)); loadFromDisk(dir, std::move(reader)); // Reset the `sequence` to the maximum of persisted. @@ -40,7 +45,7 @@ PageDirectoryPtr PageDirectoryFactory::create(String storage_name, FileProviderP // After restoring from the disk, we need cleanup all invalid entries in memory, or it will // try to run GC again on some entries that are already marked as invalid in BlobStore. dir->gcInMemEntries(); - LOG_FMT_INFO(DB::Logger::get("PageDirectoryFactory"), "PageDirectory restored [max_page_id={}] [max_applied_ver={}]", dir->getMaxId(), dir->sequence); + LOG_FMT_INFO(DB::Logger::get("PageDirectoryFactory", storage_name), "PageDirectory restored [max_page_id={}] [max_applied_ver={}]", dir->getMaxId(), dir->sequence); if (blob_stats) { diff --git a/dbms/src/Storages/Page/V3/PageDirectoryFactory.h b/dbms/src/Storages/Page/V3/PageDirectoryFactory.h index 185e8fd19a5..a922db3b497 100644 --- a/dbms/src/Storages/Page/V3/PageDirectoryFactory.h +++ b/dbms/src/Storages/Page/V3/PageDirectoryFactory.h @@ -17,6 +17,7 @@ #include #include #include +#include namespace DB { @@ -47,6 +48,8 @@ class PageDirectoryFactory PageDirectoryPtr create(String storage_name, FileProviderPtr & file_provider, PSDiskDelegatorPtr & delegator, WALStore::Config config); + PageDirectoryPtr createFromReader(String storage_name, WALStoreReaderPtr reader, WALStorePtr wal); + // just for test PageDirectoryPtr createFromEdit(String storage_name, FileProviderPtr & file_provider, PSDiskDelegatorPtr & delegator, const PageEntriesEdit & edit); diff --git a/dbms/src/Storages/Page/V3/PageStorageImpl.cpp b/dbms/src/Storages/Page/V3/PageStorageImpl.cpp index 58fe4b4dd4c..cfa07199637 100644 --- a/dbms/src/Storages/Page/V3/PageStorageImpl.cpp +++ b/dbms/src/Storages/Page/V3/PageStorageImpl.cpp @@ -289,7 +289,7 @@ bool PageStorageImpl::gcImpl(bool /*not_skip*/, const WriteLimiterPtr & write_li // 1. Do the MVCC gc, clean up expired snapshot. // And get the expired entries. - if (page_directory->tryDumpSnapshot(write_limiter)) + if (page_directory->tryDumpSnapshot(read_limiter, write_limiter)) { GET_METRIC(tiflash_storage_page_gc_count, type_v3_mvcc_dumped).Increment(); } diff --git a/dbms/src/Storages/Page/V3/PageStorageImpl.h b/dbms/src/Storages/Page/V3/PageStorageImpl.h index 082adb8df34..50d160e81da 100644 --- a/dbms/src/Storages/Page/V3/PageStorageImpl.h +++ b/dbms/src/Storages/Page/V3/PageStorageImpl.h @@ -34,7 +34,7 @@ class PageStorageImpl : public DB::PageStorage const Config & config_, const FileProviderPtr & file_provider_); - ~PageStorageImpl(); + ~PageStorageImpl() override; static BlobStore::Config parseBlobConfig(const Config & config) { @@ -54,8 +54,8 @@ class PageStorageImpl : public DB::PageStorage WALStore::Config wal_config; wal_config.roll_size = config.wal_roll_size; - wal_config.wal_recover_mode = config.wal_recover_mode; wal_config.max_persisted_log_files = config.wal_max_persisted_log_files; + wal_config.setRecoverMode(config.wal_recover_mode); return wal_config; } diff --git a/dbms/src/Storages/Page/V3/WALStore.cpp b/dbms/src/Storages/Page/V3/WALStore.cpp index 1f1eaf3bc33..c7f11ee8b3c 100644 --- a/dbms/src/Storages/Page/V3/WALStore.cpp +++ b/dbms/src/Storages/Page/V3/WALStore.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -46,7 +47,7 @@ std::pair WALStore::create( auto reader = WALStoreReader::create(storage_name, provider, delegator, - static_cast(config.wal_recover_mode.get())); + config.getRecoverMode()); // Create a new LogFile for writing new logs auto last_log_num = reader->lastLogNum() + 1; // TODO reuse old file return { @@ -54,17 +55,23 @@ std::pair WALStore::create( reader}; } +WALStoreReaderPtr WALStore::createReaderForFiles(const String & identifier, const LogFilenameSet & log_filenames, const ReadLimiterPtr & read_limiter) +{ + return WALStoreReader::create(identifier, provider, log_filenames, config.getRecoverMode(), read_limiter); +} + WALStore::WALStore( - String storage_name, + String storage_name_, const PSDiskDelegatorPtr & delegator_, const FileProviderPtr & provider_, Format::LogNumberType last_log_num_, WALStore::Config config_) - : delegator(delegator_) + : storage_name(std::move(storage_name_)) + , delegator(delegator_) , provider(provider_) , last_log_num(last_log_num_) , wal_paths_index(0) - , logger(Logger::get("WALStore", std::move(storage_name))) + , logger(Logger::get("WALStore", storage_name)) , config(config_) { } @@ -186,7 +193,7 @@ bool WALStore::saveSnapshot(FilesSnapshot && files_snap, PageEntriesEdit && dire LOG_FMT_INFO(logger, "Saving directory snapshot"); - // Use {largest_log_num + 1, 1} to save the `edit` + // Use {largest_log_num, 1} to save the `edit` const auto log_num = files_snap.persisted_log_files.rbegin()->log_num; // Create a temporary file for saving directory snapshot auto [compact_log, log_filename] = createLogWriter({log_num, 1}, /*manual_flush*/ true); @@ -212,25 +219,11 @@ bool WALStore::saveSnapshot(FilesSnapshot && files_snap, PageEntriesEdit && dire true); LOG_FMT_INFO(logger, "Rename log file to normal done [fullname={}]", normal_fullname); - // #define ARCHIVE_COMPACTED_LOGS // keep for debug - // Remove compacted log files. for (const auto & filename : files_snap.persisted_log_files) { - if (auto f = Poco::File(filename.fullname(LogFileStage::Normal)); f.exists()) - { -#ifndef ARCHIVE_COMPACTED_LOGS - f.remove(); -#else - const Poco::Path archive_path(delegator->defaultPath(), "archive"); - Poco::File archive_dir(archive_path); - if (!archive_dir.exists()) - archive_dir.createDirectory(); - auto dest = archive_path.toString() + "/" + filename.filename(LogFileStage::Normal); - f.moveTo(dest); - LOG_FMT_INFO(logger, "archive {} to {}", filename.fullname(LogFileStage::Normal), dest); -#endif - } + const auto log_fullname = filename.fullname(LogFileStage::Normal); + provider->deleteRegularFile(log_fullname, EncryptionPath(log_fullname, "")); } FmtBuffer fmt_buf; diff --git a/dbms/src/Storages/Page/V3/WALStore.h b/dbms/src/Storages/Page/V3/WALStore.h index 039903a8608..f1ea00d3562 100644 --- a/dbms/src/Storages/Page/V3/WALStore.h +++ b/dbms/src/Storages/Page/V3/WALStore.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -34,45 +35,7 @@ class PSDiskDelegator; using PSDiskDelegatorPtr = std::shared_ptr; namespace PS::V3 { -enum class WALRecoveryMode : UInt8 -{ - // Original levelDB recovery - // - // We tolerate the last record in any log to be incomplete due to a crash - // while writing it. Zeroed bytes from preallocation are also tolerated in the - // trailing data of any log. - // - // Use case: Applications for which updates, once applied, must not be rolled - // back even after a crash-recovery. In this recovery mode, RocksDB guarantees - // this as long as `WritableFile::Append()` writes are durable. In case the - // user needs the guarantee in more situations (e.g., when - // `WritableFile::Append()` writes to page cache, but the user desires this - // guarantee in face of power-loss crash-recovery), RocksDB offers various - // mechanisms to additionally invoke `WritableFile::Sync()` in order to - // strengthen the guarantee. - // - // This differs from `kPointInTimeRecovery` in that, in case a corruption is - // detected during recovery, this mode will refuse to open the DB. Whereas, - // `kPointInTimeRecovery` will stop recovery just before the corruption since - // that is a valid point-in-time to which to recover. - TolerateCorruptedTailRecords = 0x00, - // Recover from clean shutdown - // We don't expect to find any corruption in the WAL - // Use case : This is ideal for unit tests and rare applications that - // can require high consistency guarantee - AbsoluteConsistency = 0x01, - // Recover to point-in-time consistency (default) - // We stop the WAL playback on discovering WAL inconsistency - // Use case : Ideal for systems that have disk controller cache like - // hard disk, SSD without super capacitor that store related data - PointInTimeRecovery = 0x02, - // Recovery after a disaster - // We ignore any corruption in the WAL and try to salvage as much data as - // possible - // Use case : Ideal for last ditch effort to recover data or systems that - // operate with low grade unrelated data - SkipAnyCorruptedRecords = 0x03, -}; + class WALStore; using WALStorePtr = std::unique_ptr; @@ -86,30 +49,56 @@ class WALStore struct Config { SettingUInt64 roll_size = PAGE_META_ROLL_SIZE; - SettingUInt64 wal_recover_mode = 0; SettingUInt64 max_persisted_log_files = MAX_PERSISTED_LOG_FILES; + + private: + SettingUInt64 wal_recover_mode = 0; + + public: + void setRecoverMode(UInt64 recover_mode) + { + if (unlikely(recover_mode != static_cast(WALRecoveryMode::TolerateCorruptedTailRecords) + && recover_mode != static_cast(WALRecoveryMode::AbsoluteConsistency) + && recover_mode != static_cast(WALRecoveryMode::PointInTimeRecovery) + && recover_mode != static_cast(WALRecoveryMode::SkipAnyCorruptedRecords))) + { + throw Exception("Unknow recover mode [num={}]", recover_mode); + } + wal_recover_mode = recover_mode; + } + + WALRecoveryMode getRecoverMode() + { + return static_cast(wal_recover_mode.get()); + } }; constexpr static const char * wal_folder_prefix = "/wal"; static std::pair create( - String storage_name, + String storage_name_, FileProviderPtr & provider, PSDiskDelegatorPtr & delegator, WALStore::Config config); + WALStoreReaderPtr createReaderForFiles(const String & identifier, const LogFilenameSet & log_filenames, const ReadLimiterPtr & read_limiter); + void apply(PageEntriesEdit & edit, const PageVersion & version, const WriteLimiterPtr & write_limiter = nullptr); void apply(const PageEntriesEdit & edit, const WriteLimiterPtr & write_limiter = nullptr); struct FilesSnapshot { Format::LogNumberType current_writting_log_num; + // The log files to generate snapshot from. Sorted by . + // If the WAL log file is not inited, it is an empty set. LogFilenameSet persisted_log_files; - bool needSave(const size_t & max_size) const + // Note that persisted_log_files should not be empty for needSave() == true, + // cause we get the largest log num from persisted_log_files as the new + // file name. + bool needSave(const size_t max_size) const { - // TODO: Make it configurable and check the reasonable of this number return persisted_log_files.size() > max_size; } }; @@ -121,6 +110,8 @@ class WALStore PageEntriesEdit && directory_snap, const WriteLimiterPtr & write_limiter = nullptr); + const String & name() { return storage_name; } + private: WALStore( String storage_name, @@ -134,6 +125,8 @@ class WALStore const std::pair & new_log_lvl, bool manual_flush); +private: + const String storage_name; PSDiskDelegatorPtr delegator; FileProviderPtr provider; mutable std::mutex log_file_mutex; diff --git a/dbms/src/Storages/Page/V3/tests/entries_helper.h b/dbms/src/Storages/Page/V3/tests/entries_helper.h index cce59919ec8..19e42755dae 100644 --- a/dbms/src/Storages/Page/V3/tests/entries_helper.h +++ b/dbms/src/Storages/Page/V3/tests/entries_helper.h @@ -25,6 +25,7 @@ #include #include #include +#include namespace DB { @@ -221,7 +222,9 @@ inline ::testing::AssertionResult getEntryNotExist( String error; try { - auto id_entry = dir->get(page_id, snap); + auto id_entry = dir->getOrNull(page_id, snap); + if (!id_entry.second.isValid()) + return ::testing::AssertionSuccess(); error = fmt::format( "Expect entry [id={}] from {} with snap{} not exist, but got <{}.{}, {}>", page_id_expr, diff --git a/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp b/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp index a75ae056023..6e2b0efa1ea 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp @@ -50,12 +50,17 @@ class PageDirectoryTest : public DB::base::TiFlashStorageTestBasic { auto path = getTemporaryPath(); dropDataOnDisk(path); + dir = restoreFromDisk(); + } + static PageDirectoryPtr restoreFromDisk() + { + auto path = getTemporaryPath(); auto ctx = DB::tests::TiFlashTestEnv::getContext(); FileProviderPtr provider = ctx.getFileProvider(); PSDiskDelegatorPtr delegator = std::make_shared(path); PageDirectoryFactory factory; - dir = factory.create("PageDirectoryTest", provider, delegator, WALStore::Config()); + return factory.create("PageDirectoryTest", provider, delegator, WALStore::Config()); } protected: @@ -1286,6 +1291,60 @@ class PageDirectoryGCTest : public PageDirectoryTest dir->apply(std::move(edit)); \ } +TEST_F(PageDirectoryGCTest, ManyEditsAndDumpSnapshot) +{ + PageId page_id0 = 50; + PageId page_id1 = 51; + PageId page_id2 = 52; + PageId page_id3 = 53; + + PageEntryV3 last_entry_for_0; + constexpr size_t num_edits_test = 50000; + for (size_t i = 0; i < num_edits_test; ++i) + { + { + INSERT_ENTRY(page_id0, i); + last_entry_for_0 = entry_vi; + } + { + INSERT_ENTRY(page_id1, i); + } + } + INSERT_DELETE(page_id1); + EXPECT_TRUE(dir->tryDumpSnapshot()); + dir.reset(); + + dir = restoreFromDisk(); + { + auto snap = dir->createSnapshot(); + ASSERT_SAME_ENTRY(dir->get(page_id0, snap).second, last_entry_for_0); + EXPECT_ENTRY_NOT_EXIST(dir, page_id1, snap); + } + + PageEntryV3 last_entry_for_2; + for (size_t i = 0; i < num_edits_test; ++i) + { + { + INSERT_ENTRY(page_id2, i); + last_entry_for_2 = entry_vi; + } + { + INSERT_ENTRY(page_id3, i); + } + } + INSERT_DELETE(page_id3); + EXPECT_TRUE(dir->tryDumpSnapshot()); + + dir = restoreFromDisk(); + { + auto snap = dir->createSnapshot(); + ASSERT_SAME_ENTRY(dir->get(page_id0, snap).second, last_entry_for_0); + EXPECT_ENTRY_NOT_EXIST(dir, page_id1, snap); + ASSERT_SAME_ENTRY(dir->get(page_id2, snap).second, last_entry_for_2); + EXPECT_ENTRY_NOT_EXIST(dir, page_id3, snap); + } +} + TEST_F(PageDirectoryGCTest, GCPushForward) try { @@ -1931,7 +1990,6 @@ try auto s0 = dir->createSnapshot(); auto edit = dir->dumpSnapshotToEdit(s0); - edit.size(); auto restore_from_edit = [](const PageEntriesEdit & edit) { auto deseri_edit = DB::PS::V3::ser::deserializeFrom(DB::PS::V3::ser::serializeTo(edit)); auto ctx = DB::tests::TiFlashTestEnv::getContext(); @@ -2214,6 +2272,35 @@ try } CATCH +TEST_F(PageDirectoryGCTest, CleanAfterDecreaseRef) +try +{ + PageEntryV3 entry_50_1{.file_id = 1, .size = 7890, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_50_2{.file_id = 2, .size = 7890, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + + auto restore_from_edit = [](const PageEntriesEdit & edit) { + auto ctx = ::DB::tests::TiFlashTestEnv::getContext(); + auto provider = ctx.getFileProvider(); + auto path = getTemporaryPath(); + PSDiskDelegatorPtr delegator = std::make_shared(path); + PageDirectoryFactory factory; + auto d = factory.createFromEdit(getCurrentTestName(), provider, delegator, edit); + return d; + }; + + { + PageEntriesEdit edit; + edit.put(50, entry_50_1); + edit.put(50, entry_50_2); + edit.ref(51, 50); + edit.del(50); + edit.del(51); + auto restored_dir = restore_from_edit(edit); + auto page_ids = restored_dir->getAllPageIds(); + ASSERT_EQ(page_ids.size(), 0); + } +} +CATCH #undef INSERT_ENTRY_TO #undef INSERT_ENTRY diff --git a/dbms/src/Storages/Page/V3/tests/gtest_page_storage.cpp b/dbms/src/Storages/Page/V3/tests/gtest_page_storage.cpp index 498fd4124e5..ce2ba0adaf4 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_page_storage.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_page_storage.cpp @@ -1380,5 +1380,34 @@ try } CATCH +TEST_F(PageStorageTest, CleanAfterDecreaseRef) +try +{ + // Make it in log_1_0 + { + WriteBatch batch; + batch.putExternal(1, 0); + page_storage->write(std::move(batch)); + } + + page_storage = reopenWithConfig(config); + + // Make it in log_2_0 + { + WriteBatch batch; + batch.putExternal(1, 0); + batch.putRefPage(2, 1); + batch.delPage(1); + batch.delPage(2); + page_storage->write(std::move(batch)); + } + page_storage = reopenWithConfig(config); + + auto alive_ids = page_storage->getAliveExternalPageIds(TEST_NAMESPACE_ID); + ASSERT_EQ(alive_ids.size(), 0); +} +CATCH + + } // namespace PS::V3::tests } // namespace DB diff --git a/dbms/src/Storages/Page/V3/tests/gtest_wal_store.cpp b/dbms/src/Storages/Page/V3/tests/gtest_wal_store.cpp index 89c4e54f7e7..6d47adabbc5 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_wal_store.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_wal_store.cpp @@ -596,11 +596,12 @@ TEST_P(WALStoreTest, ManyEdits) try { auto ctx = DB::tests::TiFlashTestEnv::getContext(); - auto provider = ctx.getFileProvider(); + auto enc_key_manager = std::make_shared(/*encryption_enabled_=*/true); + auto enc_provider = std::make_shared(enc_key_manager, true); auto path = getTemporaryPath(); // Stage 1. empty - auto [wal, reader] = WALStore::create(getCurrentTestName(), provider, delegator, config); + auto [wal, reader] = WALStore::create(getCurrentTestName(), enc_provider, delegator, config); ASSERT_NE(wal, nullptr); std::mt19937 rd; @@ -633,7 +634,7 @@ try size_t num_edits_read = 0; size_t num_pages_read = 0; - std::tie(wal, reader) = WALStore::create(getCurrentTestName(), provider, delegator, config); + std::tie(wal, reader) = WALStore::create(getCurrentTestName(), enc_provider, delegator, config); while (reader->remained()) { auto [ok, edit] = reader->next(); @@ -653,8 +654,7 @@ try LOG_FMT_INFO(&Poco::Logger::get("WALStoreTest"), "Done test for {} persist pages in {} edits", num_pages_read, num_edits_test); // Test for save snapshot (with encryption) - auto enc_key_manager = std::make_shared(/*encryption_enabled_=*/true); - auto enc_provider = std::make_shared(enc_key_manager, true); + LogFilenameSet persisted_log_files = WALStoreReader::listAllFiles(delegator, log); WALStore::FilesSnapshot file_snap{.current_writting_log_num = 100, // just a fake value .persisted_log_files = persisted_log_files}; diff --git a/dbms/src/Storages/Page/WALRecoveryMode.h b/dbms/src/Storages/Page/WALRecoveryMode.h new file mode 100644 index 00000000000..740c9ed37a5 --- /dev/null +++ b/dbms/src/Storages/Page/WALRecoveryMode.h @@ -0,0 +1,61 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace DB +{ + +enum class WALRecoveryMode : UInt8 +{ + // Original levelDB recovery + // + // We tolerate the last record in any log to be incomplete due to a crash + // while writing it. Zeroed bytes from preallocation are also tolerated in the + // trailing data of any log. + // + // Use case: Applications for which updates, once applied, must not be rolled + // back even after a crash-recovery. In this recovery mode, RocksDB guarantees + // this as long as `WritableFile::Append()` writes are durable. In case the + // user needs the guarantee in more situations (e.g., when + // `WritableFile::Append()` writes to page cache, but the user desires this + // guarantee in face of power-loss crash-recovery), RocksDB offers various + // mechanisms to additionally invoke `WritableFile::Sync()` in order to + // strengthen the guarantee. + // + // This differs from `kPointInTimeRecovery` in that, in case a corruption is + // detected during recovery, this mode will refuse to open the DB. Whereas, + // `kPointInTimeRecovery` will stop recovery just before the corruption since + // that is a valid point-in-time to which to recover. + TolerateCorruptedTailRecords = 0x00, + // Recover from clean shutdown + // We don't expect to find any corruption in the WAL + // Use case : This is ideal for unit tests and rare applications that + // can require high consistency guarantee + AbsoluteConsistency = 0x01, + // Recover to point-in-time consistency (default) + // We stop the WAL playback on discovering WAL inconsistency + // Use case : Ideal for systems that have disk controller cache like + // hard disk, SSD without super capacitor that store related data + PointInTimeRecovery = 0x02, + // Recovery after a disaster + // We ignore any corruption in the WAL and try to salvage as much data as + // possible + // Use case : Ideal for last ditch effort to recover data or systems that + // operate with low grade unrelated data + SkipAnyCorruptedRecords = 0x03, +}; + +} // namespace DB