From 2df8905531677fa41c0fa4190b3d9bee6e0c55ec Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Mon, 11 Oct 2021 16:22:10 -0700 Subject: [PATCH] Protect existing files in `FaultInjectionTest{Env,FS}::ReopenWritableFile()` (#8995) Summary: `FaultInjectionTest{Env,FS}::ReopenWritableFile()` functions were accidentally deleting WALs from previous `db_stress` runs causing verification to fail. They were operating under the assumption that `ReopenWritableFile()` would delete any existing file. It was a reasonable assumption considering the `{Env,FileSystem}::ReopenWritableFile()` documentation stated that would happen. The only problem was neither the implementations we offer nor the "real" clients in RocksDB code followed that contract. So, this PR updates the contract as well as fixing the fault injection client usage. The fault injection change exposed that `ExternalSSTFileBasicTest.SyncFailure` was relying on a fault injection `Env` dropping unsynced data written by a regular `Env`. I changed that test to make its `SstFileWriter` use fault injection `Env`, and also implemented `LinkFile()` in fault injection so the unsynced data is tracked under the new name. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8995 Test Plan: - Verified it fixes the following failure: ``` $ ./db_stress --clear_column_family_one_in=0 --column_families=1 --db=/dev/shm/rocksdb_crashtest_whitebox --delpercent=5 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --iterpercent=0 --key_len_percent_dist=1,30,69 --max_key=100000 --max_key_len=3 --nooverwritepercent=1 --ops_per_thread=1000 --prefixpercent=0 --readpercent=60 --reopen=0 --target_file_size_base=1048576 --test_batches_snapshots=0 --write_buffer_size=1048576 --writepercent=35 --value_size_mult=33 -threads=1 ... $ ./db_stress --avoid_flush_during_recovery=1 --clear_column_family_one_in=0 --column_families=1 --db=/dev/shm/rocksdb_crashtest_whitebox --delpercent=5 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --iterpercent=10 --key_len_percent_dist=1,30,69 --max_bytes_for_level_base=4194304 --max_key=100000 --max_key_len=3 --nooverwritepercent=1 --open_files=-1 --open_metadata_write_fault_one_in=8 --open_write_fault_one_in=16 --ops_per_thread=1000 --prefix_size=-1 --prefixpercent=0 --readpercent=50 --sync=1 --target_file_size_base=1048576 --test_batches_snapshots=0 --write_buffer_size=1048576 --writepercent=35 --value_size_mult=33 -threads=1 ... Verification failed for column family 0 key 000000000000001300000000000000857878787878 (1143): Value not found: NotFound: Crash-recovery verification failed :( ... ``` - `make check -j48` Reviewed By: ltamasi Differential Revision: D31495388 Pulled By: ajkr fbshipit-source-id: 7886ccb6a07cb8b78ad7b6c1c341ccf40bb68385 --- HISTORY.md | 1 + db/external_sst_file_basic_test.cc | 2 +- db_stress_tool/db_stress_gflags.cc | 11 ++- include/rocksdb/env.h | 11 +-- include/rocksdb/file_system.h | 11 +-- tools/db_crashtest.py | 3 + utilities/fault_injection_env.cc | 92 +++++++++++++++++++----- utilities/fault_injection_env.h | 6 +- utilities/fault_injection_fs.cc | 110 ++++++++++++++++++++++++----- utilities/fault_injection_fs.h | 8 ++- 10 files changed, 203 insertions(+), 52 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 0e55919775a..e8542b878c3 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## Unreleased ### Bug Fixes * Fix `DisableManualCompaction()` to cancel compactions even when they are waiting on automatic compactions to drain due to `CompactRangeOptions::exclusive_manual_compactions == true`. +* Fix contract of `Env::ReopenWritableFile()` and `FileSystem::ReopenWritableFile()` to specify any existing file must not be deleted or truncated. ## 6.25.1 (2021-09-28) ### Bug Fixes diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index a006a817da1..bbede382aa9 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -1147,7 +1147,7 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) { } Options sst_file_writer_options; - sst_file_writer_options.env = env_; + sst_file_writer_options.env = fault_injection_test_env_.get(); std::unique_ptr sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 602a0712551..4dd39d2a851 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -19,7 +19,10 @@ static bool ValidateUint32Range(const char* flagname, uint64_t value) { return true; } -DEFINE_uint64(seed, 2341234, "Seed for PRNG"); +DEFINE_uint64(seed, 2341234, + "Seed for PRNG. When --nooverwritepercent is " + "nonzero and --expected_values_dir is nonempty, this value " + "must be fixed across invocations."); static const bool FLAGS_seed_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); @@ -453,7 +456,8 @@ DEFINE_string( "provided and non-empty, the DB state will be verified against these " "values after recovery. --max_key and --column_family must be kept the " "same across invocations of this program that use the same " - "--expected_values_path."); + "--expected_values_path. See --seed and --nooverwritepercent for further " + "requirements."); DEFINE_bool(verify_checksum, false, "Verify checksum for every block read from storage"); @@ -644,7 +648,8 @@ static const bool FLAGS_delrangepercent_dummy __attribute__((__unused__)) = DEFINE_int32(nooverwritepercent, 60, "Ratio of keys without overwrite to total workload (expressed as " - " a percentage)"); + "a percentage). When --expected_values_dir is nonempty, must " + "keep this value constant across invocations."); static const bool FLAGS_nooverwritepercent_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_nooverwritepercent, &ValidateInt32Percent); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 880046e5a0b..ce9827efbb7 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -266,11 +266,12 @@ class Env { std::unique_ptr* result, const EnvOptions& options) = 0; - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. + // Create an object that writes to a file with the specified name. + // `WritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. // // The returned file will only be accessed by one thread at a time. virtual Status ReopenWritableFile(const std::string& /*fname*/, diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 2c73f720ad9..4dd74e8ef45 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -309,11 +309,12 @@ class FileSystem { std::unique_ptr* result, IODebugContext* dbg) = 0; - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. + // Create an object that writes to a file with the specified name. + // `FSWritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. // // The returned file will only be accessed by one thread at a time. virtual IOStatus ReopenWritableFile( diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 3a96079bd66..1d0b52723e9 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -78,6 +78,9 @@ "max_key": 100000000, "max_write_buffer_number": 3, "mmap_read": lambda: random.randint(0, 1), + # Setting `nooverwritepercent > 0` is only possible because we do not vary + # the random seed, so the same keys are chosen by every run for disallowing + # overwrites. "nooverwritepercent": 1, "open_files": lambda : random.choice([-1, -1, 100, 500000]), "optimize_filters_for_memory": lambda: random.randint(0, 1), diff --git a/utilities/fault_injection_env.cc b/utilities/fault_injection_env.cc index 650655f2d9e..5ef206aec70 100644 --- a/utilities/fault_injection_env.cc +++ b/utilities/fault_injection_env.cc @@ -288,7 +288,7 @@ Status FaultInjectionTestEnv::NewWritableFile( // again then it will be truncated - so forget our saved state. UntrackFile(fname); MutexLock l(&mutex_); - open_files_.insert(fname); + open_managed_files_.insert(fname); auto dir_and_name = GetDirAndName(fname); auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; list.insert(dir_and_name.second); @@ -302,17 +302,49 @@ Status FaultInjectionTestEnv::ReopenWritableFile( if (!IsFilesystemActive()) { return GetError(); } - Status s = target()->ReopenWritableFile(fname, result, soptions); + + bool exists; + Status s, exists_s = target()->FileExists(fname); + if (exists_s.IsNotFound()) { + exists = false; + } else if (exists_s.ok()) { + exists = true; + } else { + s = exists_s; + exists = false; + } + if (s.ok()) { - result->reset(new TestWritableFile(fname, std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. - UntrackFile(fname); - MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = GetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list.insert(dir_and_name.second); + s = target()->ReopenWritableFile(fname, result, soptions); + } + + // Only track files we created. Files created outside of this + // `FaultInjectionTestEnv` are not eligible for tracking/data dropping + // (for example, they may contain data a previous db_stress run expects to + // be recovered). This could be extended to track/drop data appended once + // the file is under `FaultInjectionTestEnv`'s control. + if (s.ok()) { + bool should_track; + { + MutexLock l(&mutex_); + if (db_file_state_.find(fname) != db_file_state_.end()) { + // It was written by this `Env` earlier. + assert(exists); + should_track = true; + } else if (!exists) { + // It was created by this `Env` just now. + should_track = true; + open_managed_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } else { + should_track = false; + } + } + if (should_track) { + result->reset(new TestWritableFile(fname, std::move(*result), this)); + } } return s; } @@ -330,7 +362,7 @@ Status FaultInjectionTestEnv::NewRandomRWFile( // again then it will be truncated - so forget our saved state. UntrackFile(fname); MutexLock l(&mutex_); - open_files_.insert(fname); + open_managed_files_.insert(fname); auto dir_and_name = GetDirAndName(fname); auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; list.insert(dir_and_name.second); @@ -394,17 +426,43 @@ Status FaultInjectionTestEnv::RenameFile(const std::string& s, return ret; } +Status FaultInjectionTestEnv::LinkFile(const std::string& s, + const std::string& t) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status ret = EnvWrapper::LinkFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + } + + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) != + dir_to_new_files_since_last_sync_[sdn.first].end()) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); + } + } + + return ret; +} + void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { db_file_state_[state.filename_] = state; - open_files_.erase(state.filename_); + open_managed_files_.erase(state.filename_); } } void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) { MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { if (db_file_state_.find(state.filename_) == db_file_state_.end()) { db_file_state_.insert(std::make_pair(state.filename_, state)); } else { @@ -415,7 +473,7 @@ void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) { void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) { MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { if (db_file_state_.find(state.filename_) == db_file_state_.end()) { db_file_state_.insert(std::make_pair(state.filename_, state)); } else { @@ -485,6 +543,6 @@ void FaultInjectionTestEnv::UntrackFile(const std::string& f) { dir_to_new_files_since_last_sync_[dir_and_name.first].erase( dir_and_name.second); db_file_state_.erase(f); - open_files_.erase(f); + open_managed_files_.erase(f); } } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/fault_injection_env.h b/utilities/fault_injection_env.h index fa1fa0d64a9..b82b45237f5 100644 --- a/utilities/fault_injection_env.h +++ b/utilities/fault_injection_env.h @@ -175,6 +175,8 @@ class FaultInjectionTestEnv : public EnvWrapper { virtual Status RenameFile(const std::string& s, const std::string& t) override; + virtual Status LinkFile(const std::string& s, const std::string& t) override; + // Undef to eliminate clash on Windows #undef GetFreeSpace virtual Status GetFreeSpace(const std::string& path, @@ -237,13 +239,13 @@ class FaultInjectionTestEnv : public EnvWrapper { SetFilesystemActiveNoLock(active, error); error.PermitUncheckedError(); } - void AssertNoOpenFile() { assert(open_files_.empty()); } + void AssertNoOpenFile() { assert(open_managed_files_.empty()); } Status GetError() { return error_; } private: port::Mutex mutex_; std::map db_file_state_; - std::set open_files_; + std::set open_managed_files_; std::unordered_map> dir_to_new_files_since_last_sync_; bool filesystem_active_; // Record flushes, syncs, writes diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index 45399f24fda..9babdd5e55d 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -443,7 +443,7 @@ IOStatus FaultInjectionTestFS::NewWritableFile( UntrackFile(fname); { MutexLock l(&mutex_); - open_files_.insert(fname); + open_managed_files_.insert(fname); auto dir_and_name = TestFSGetDirAndName(fname); auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; // The new file could overwrite an old one. Here we simplify @@ -476,19 +476,50 @@ IOStatus FaultInjectionTestFS::ReopenWritableFile( return in_s; } } - IOStatus io_s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + + bool exists; + IOStatus io_s, + exists_s = target()->FileExists(fname, IOOptions(), nullptr /* dbg */); + if (exists_s.IsNotFound()) { + exists = false; + } else if (exists_s.ok()) { + exists = true; + } else { + io_s = exists_s; + exists = false; + } + if (io_s.ok()) { - result->reset( - new TestFSWritableFile(fname, file_opts, std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. - UntrackFile(fname); + io_s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + } + + // Only track files we created. Files created outside of this + // `FaultInjectionTestFS` are not eligible for tracking/data dropping + // (for example, they may contain data a previous db_stress run expects to + // be recovered). This could be extended to track/drop data appended once + // the file is under `FaultInjectionTestFS`'s control. + if (io_s.ok()) { + bool should_track; { MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = TestFSGetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list[dir_and_name.second] = kNewFileNoOverwrite; + if (db_file_state_.find(fname) != db_file_state_.end()) { + // It was written by this `FileSystem` earlier. + assert(exists); + should_track = true; + } else if (!exists) { + // It was created by this `FileSystem` just now. + should_track = true; + open_managed_files_.insert(fname); + auto dir_and_name = TestFSGetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list[dir_and_name.second] = kNewFileNoOverwrite; + } else { + should_track = false; + } + } + if (should_track) { + result->reset( + new TestFSWritableFile(fname, file_opts, std::move(*result), this)); } { IOStatus in_s = InjectMetadataWriteError(); @@ -523,7 +554,7 @@ IOStatus FaultInjectionTestFS::NewRandomRWFile( UntrackFile(fname); { MutexLock l(&mutex_); - open_files_.insert(fname); + open_managed_files_.insert(fname); auto dir_and_name = TestFSGetDirAndName(fname); auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; // It could be overwriting an old file, but we simplify the @@ -655,17 +686,62 @@ IOStatus FaultInjectionTestFS::RenameFile(const std::string& s, return io_s; } +IOStatus FaultInjectionTestFS::LinkFile(const std::string& s, + const std::string& t, + const IOOptions& options, + IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + // Using the value in `dir_to_new_files_since_last_sync_` for the source file + // may be a more reasonable choice. + std::string previous_contents = kNewFileNoOverwrite; + + IOStatus io_s = FileSystemWrapper::LinkFile(s, t, options, dbg); + + if (io_s.ok()) { + { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + } + + auto sdn = TestFSGetDirAndName(s); + auto tdn = TestFSGetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) != + dir_to_new_files_since_last_sync_[sdn.first].end()) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist[tdn.second] = previous_contents; + } + } + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + return io_s; +} + void FaultInjectionTestFS::WritableFileClosed(const FSFileState& state) { MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { db_file_state_[state.filename_] = state; - open_files_.erase(state.filename_); + open_managed_files_.erase(state.filename_); } } void FaultInjectionTestFS::WritableFileSynced(const FSFileState& state) { MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { if (db_file_state_.find(state.filename_) == db_file_state_.end()) { db_file_state_.insert(std::make_pair(state.filename_, state)); } else { @@ -676,7 +752,7 @@ void FaultInjectionTestFS::WritableFileSynced(const FSFileState& state) { void FaultInjectionTestFS::WritableFileAppended(const FSFileState& state) { MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { if (db_file_state_.find(state.filename_) == db_file_state_.end()) { db_file_state_.insert(std::make_pair(state.filename_, state)); } else { @@ -755,7 +831,7 @@ void FaultInjectionTestFS::UntrackFile(const std::string& f) { dir_to_new_files_since_last_sync_[dir_and_name.first].erase( dir_and_name.second); db_file_state_.erase(f); - open_files_.erase(f); + open_managed_files_.erase(f); } IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h index 2ed2b5c016d..c160e2c40d0 100644 --- a/utilities/fault_injection_fs.h +++ b/utilities/fault_injection_fs.h @@ -236,6 +236,10 @@ class FaultInjectionTestFS : public FileSystemWrapper { const IOOptions& options, IODebugContext* dbg) override; + virtual IOStatus LinkFile(const std::string& src, const std::string& target, + const IOOptions& options, + IODebugContext* dbg) override; + // Undef to eliminate clash on Windows #undef GetFreeSpace virtual IOStatus GetFreeSpace(const std::string& path, @@ -321,7 +325,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { MutexLock l(&mutex_); filesystem_writable_ = writable; } - void AssertNoOpenFile() { assert(open_files_.empty()); } + void AssertNoOpenFile() { assert(open_managed_files_.empty()); } IOStatus GetError() { return error_; } @@ -500,7 +504,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { private: port::Mutex mutex_; std::map db_file_state_; - std::set open_files_; + std::set open_managed_files_; // directory -> (file name -> file contents to recover) // When data is recovered from unsyned parent directory, the files with // empty file contents to recover is deleted. Those with non-empty ones