From ccd52b74cc256f81d896372d9627658160e79471 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 3 Dec 2018 17:58:20 -0800 Subject: [PATCH 01/33] Add PathNotFound subcode to IOError Summary: As titled. Returning error about non-existing path can help user better handle them. Test Plan: ``` $make clean && make -j32 all check ``` Reviewers: Subscribers: Tasks: Tags: --- HISTORY.md | 1 + env/env_hdfs.cc | 8 +++++--- env/io_posix.h | 3 +++ include/rocksdb/status.h | 14 ++++++++++++++ port/win/io_win.h | 12 +++++++----- util/status.cc | 3 ++- 6 files changed, 32 insertions(+), 9 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ab576e5071a..6e25bbbe998 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -33,6 +33,7 @@ * For all users of dictionary compression, we now generate a separate dictionary for compressing each bottom-level SST file. Previously we reused a single dictionary for a whole compaction to bottom level. The new approach achieves better compression ratios; however, it uses more memory and CPU for buffering/sampling data blocks and training dictionaries. * Add whole key bloom filter support in memtable. * Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`. +* Introduce a new IOError subcode, PathNotFound, to indicate trying to open a nonexistent file or directory for read. ### Public API Change * Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped. 
diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 14fb902f0d4..7c0e14fe23e 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -36,9 +36,11 @@ namespace { // Log error message static Status IOError(const std::string& context, int err_number) { - return (err_number == ENOSPC) ? - Status::NoSpace(context, strerror(err_number)) : - Status::IOError(context, strerror(err_number)); + return (err_number == ENOSPC) + ? Status::NoSpace(context, strerror(err_number)) + : (err_number == ENOENT) + ? Status::PathNotFound(context, strerror(err_number)) + : Status::IOError(context, strerror(err_number)); } // assume that there is one global logger for now. It is not thread-safe, diff --git a/env/io_posix.h b/env/io_posix.h index 106f6df6507..e6824d3e870 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -41,6 +41,9 @@ static Status IOError(const std::string& context, const std::string& file_name, strerror(err_number)); case ESTALE: return Status::IOError(Status::kStaleFile); + case ENOENT: + return Status::PathNotFound(IOErrorMsg(context, file_name), + strerror(err_number)); default: return Status::IOError(IOErrorMsg(context, file_name), strerror(err_number)); diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 40b374ecf6e..f8f66bf4226 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -73,6 +73,7 @@ class Status { kStaleFile = 6, kMemoryLimit = 7, kSpaceLimit = 8, + kPathNotFound = 9, kMaxSubCode }; @@ -198,6 +199,11 @@ class Status { return Status(kIOError, kSpaceLimit, msg, msg2); } + static Status PathNotFound() { return Status(kIOError, kPathNotFound); } + static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kPathNotFound, msg, msg2); + } + // Returns true iff the status indicates success. 
bool ok() const { return code() == kOk; } @@ -266,6 +272,14 @@ class Status { return (code() == kAborted) && (subcode() == kMemoryLimit); } + // Returns true iff the status indicates a PathNotFound error + // This is caused by an I/O error returning the specific "no such file or + // directory" error condition. A PathNotFound error is an I/O error with + // a specific subcode, enabling users to take appropriate action if necessary + bool IsPathNotFound() const { + return (code() == kIOError) && (subcode() == kPathNotFound); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; diff --git a/port/win/io_win.h b/port/win/io_win.h index c46876b8c0c..1c9d803b13f 100644 --- a/port/win/io_win.h +++ b/port/win/io_win.h @@ -27,7 +27,9 @@ std::string GetWindowsErrSz(DWORD err); inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) ? Status::NoSpace(context, GetWindowsErrSz(err)) - : Status::IOError(context, GetWindowsErrSz(err)); + : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) + ? Status::PathNotFound(context, GetWindowsErrSz(err)) + : Status::IOError(context, GetWindowsErrSz(err)); } inline Status IOErrorFromLastWindowsError(const std::string& context) { @@ -37,7 +39,9 @@ inline Status IOErrorFromLastWindowsError(const std::string& context) { inline Status IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) ? Status::NoSpace(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + : (err_number == ENOENT) + ? 
Status::PathNotFound(context, strerror(err_number)) + : Status::IOError(context, strerror(err_number)); } class WinFileData; @@ -426,9 +430,7 @@ class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { class WinDirectory : public Directory { HANDLE handle_; public: - explicit - WinDirectory(HANDLE h) noexcept : - handle_(h) { + explicit WinDirectory(HANDLE h) noexcept : handle_(h) { assert(handle_ != INVALID_HANDLE_VALUE); } ~WinDirectory() { diff --git a/util/status.cc b/util/status.cc index 5b3dcf8e92e..c66bf6f8e16 100644 --- a/util/status.cc +++ b/util/status.cc @@ -41,7 +41,8 @@ static const char* msgs[static_cast(Status::kMaxSubCode)] = { "Deadlock", // kDeadlock "Stale file handle", // kStaleFile "Memory limit reached", // kMemoryLimit - "Space limit reached" // kSpaceLimit + "Space limit reached", // kSpaceLimit + "No such file or directory", // kPathNotFound }; Status::Status(Code _code, SubCode _subcode, const Slice& msg, From cb5754deeb9240488a7e9d832271488efb952fae Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 29 Oct 2018 10:53:36 -0700 Subject: [PATCH 02/33] Support non-blocking, retryable record reading Add TryReadRecord method to log::Reader so that caller can call `TryReadRecord` multiple times until a complete record is read. When a complete record is read, `TryReadRecord` returns `true`; when a record has remaining part yet to be read, or an error has occurred, `TryReadRecord` returns false. The caller can implement different retry *policies*. Also add unit test for non-blocking record read. Example usage: ``` log::Reader reader(...); Slice record; std::string scratch; while (reader.TryReadRecord(&record, &scratch)) { // process record } if (reader.reader_error_) { // handle error } else { // consider retry } ``` Note that `reader_error_` is not exposed at the moment. Furthermore, we need finer-grained error handling according to error type that is not exposed either. Update log test and rebase. 
Test plan: ``` $make clean && make -j32 all check $./log_test $./log_test --gtest_filter=bool/RetriableLogTest.NonBlockingReadFullRecord/* ``` --- db/db_impl_open.cc | 3 +- db/log_reader.cc | 282 +++++++++++++++++++++++++++++++++---- db/log_reader.h | 24 +++- db/log_test.cc | 226 +++++++++++++++++++---------- db/repair.cc | 3 +- db/transaction_log_impl.cc | 3 +- db/version_set.cc | 9 +- db/wal_manager.cc | 2 +- tools/ldb_cmd.cc | 5 +- 9 files changed, 437 insertions(+), 120 deletions(-) diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 52ee537486d..99c27f45d57 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -629,8 +629,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // to be skipped instead of propagating bad information (like overly // large sequence numbers). log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), - &reporter, true /*checksum*/, log_number, - false /* retry_after_eof */); + &reporter, true /*checksum*/, log_number); // Determine if we should tolerate incomplete records at the tail end of the // Read all the records and add to a memtable diff --git a/db/log_reader.cc b/db/log_reader.cc index 2c57cde5d59..237fd192948 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -24,8 +24,7 @@ Reader::Reporter::~Reporter() { Reader::Reader(std::shared_ptr info_log, std::unique_ptr&& _file, - Reporter* reporter, bool checksum, uint64_t log_num, - bool retry_after_eof) + Reporter* reporter, bool checksum, uint64_t log_num) : info_log_(info_log), file_(std::move(_file)), reporter_(reporter), @@ -39,7 +38,8 @@ Reader::Reader(std::shared_ptr info_log, end_of_buffer_offset_(0), log_number_(log_num), recycled_(false), - retry_after_eof_(retry_after_eof) {} + fragments_(), + in_fragmented_record_(false) {} Reader::~Reader() { delete[] backing_store_; @@ -199,6 +199,118 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, return false; } +// return true if a complete record has been read 
successfully. +bool Reader::TryReadRecord(Slice* record, std::string* scratch) { + assert(record != nullptr); + assert(scratch != nullptr); + record->clear(); + scratch->clear(); + + uint64_t prospective_record_offset = 0; + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + size_t drop_size = 0; + unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy + Slice fragment; + while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) { + switch (fragment_type_or_err) { + case kFullType: + case kRecyclableFullType: + if (in_fragmented_record_ && !fragments_.empty()) { + ReportCorruption(fragments_.size(), "partial record without end(1)"); + } + fragments_.clear(); + *record = fragment; + prospective_record_offset = physical_record_offset; + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + return true; + + case kFirstType: + case kRecyclableFirstType: + if (in_fragmented_record_ && !fragments_.empty()) { + ReportCorruption(fragments_.size(), "partial record without end(2)"); + } + prospective_record_offset = physical_record_offset; + fragments_.assign(fragment.data(), fragment.size()); + in_fragmented_record_ = true; + break; + + case kMiddleType: + case kRecyclableMiddleType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + case kRecyclableLastType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + scratch->assign(fragments_.data(), fragments_.size()); + fragments_.clear(); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + return true; + } + break; + + case kBadHeader: + case kEof: + case kOldRecord: + if 
(in_fragmented_record_) { + fragments_.clear(); + } + return false; + + case kBadRecord: + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + case kBadRecordLen: + case kBadRecordChecksum: + if (recycled_) { + fragments_.clear(); + return false; + } + if (fragment_type_or_err == kBadRecordLen) { + ReportCorruption(drop_size, "bad record length"); + } else { + ReportCorruption(drop_size, "checksum mismatch"); + } + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", + fragment_type_or_err); + ReportCorruption( + fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0), + buf); + in_fragmented_record_ = false; + fragments_.clear(); + break; + } + } + } + return false; +} + uint64_t Reader::LastRecordOffset() { return last_record_offset_; } @@ -207,14 +319,22 @@ void Reader::UnmarkEOF() { if (read_error_) { return; } - eof_ = false; + if (eof_offset_ == 0) { + return; + } + UnmarkEOFInternal(); +} - // If retry_after_eof_ is true, we have to proceed to read anyway. 
- if (!retry_after_eof_ && eof_offset_ == 0) { +void Reader::ForceUnmarkEOF() { + if (read_error_) { return; } + eof_ = false; + UnmarkEOFInternal(); +} +void Reader::UnmarkEOFInternal() { // If the EOF was in the middle of a block (a partial block was read) we have // to read the rest of the block as ReadPhysicalRecord can only read full // blocks and expects the file position indicator to be aligned to the start @@ -292,12 +412,8 @@ bool Reader::ReadMore(size_t* drop_size, int *error) { } else if (buffer_.size() < static_cast(kBlockSize)) { eof_ = true; eof_offset_ = buffer_.size(); - TEST_SYNC_POINT("LogReader::ReadMore:FirstEOF"); } return true; - } else if (retry_after_eof_ && !read_error_) { - UnmarkEOF(); - return !read_error_; } else { // Note that if buffer_ is non-empty, we have a truncated header at the // end of the file, which can be caused by the writer crashing in the @@ -355,24 +471,16 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { } } if (header_size + length > buffer_.size()) { - if (!retry_after_eof_) { - *drop_size = buffer_.size(); - buffer_.clear(); - if (!eof_) { - return kBadRecordLen; - } - // If the end of the file has been reached without reading |length| - // bytes of payload, assume the writer died in the middle of writing the - // record. Don't report a corruption unless requested. - if (*drop_size) { - return kBadHeader; - } - } else { - int r = kEof; - if (!ReadMore(drop_size, &r)) { - return r; - } - continue; + *drop_size = buffer_.size(); + buffer_.clear(); + if (!eof_) { + return kBadRecordLen; + } + // If the end of the file has been reached without reading |length| + // bytes of payload, assume the writer died in the middle of writing the + // record. Don't report a corruption unless requested. 
+ if (*drop_size) { + return kBadHeader; } return kEof; } @@ -409,5 +517,123 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { } } +bool Reader::TryReadMore(size_t* drop_size, int* error) { + if (!eof_ && !read_error_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); + if (!status.ok()) { + buffer_.clear(); + ReportDrop(kBlockSize, status); + read_error_ = true; + *error = kEof; + return false; + } else if (buffer_.size() < static_cast(kBlockSize)) { + eof_ = true; + eof_offset_ = buffer_.size(); + TEST_SYNC_POINT_CALLBACK("LogReader::TryReadMore:FirstEOF", nullptr); + } + return true; + } else if (!read_error_) { + ForceUnmarkEOF(); + return !read_error_; + } else { + // Note that if buffer_ is non-empty, we have a truncated header at the + // end of the file, which can be caused by the writer crashing in the + // middle of writing the header. Unless explicitly requested we don't + // consider this an error, just report EOF. + if (buffer_.size()) { + *drop_size = buffer_.size(); + buffer_.clear(); + *error = kBadHeader; + return false; + } + buffer_.clear(); + *error = kEof; + return false; + } +} + +// return true if the caller should process the fragment_type_or_err. 
+bool Reader::TryReadFragment(Slice* fragment, size_t* drop_size, + unsigned int* fragment_type_or_err) { + assert(fragment != nullptr); + assert(drop_size != nullptr); + assert(fragment_type_or_err != nullptr); + + while (buffer_.size() < static_cast(kHeaderSize)) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + const char* header = buffer_.data(); + const uint32_t a = static_cast(header[4]) & 0xff; + const uint32_t b = static_cast(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + int header_size = kHeaderSize; + if (type >= kRecyclableFullType && type <= kRecyclableLastType) { + if (end_of_buffer_offset_ - buffer_.size() == 0) { + recycled_ = true; + } + header_size = kRecyclableHeaderSize; + while (buffer_.size() < static_cast(kRecyclableHeaderSize)) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + const uint32_t log_num = DecodeFixed32(header + 7); + if (log_num != log_number_) { + *fragment_type_or_err = kOldRecord; + return true; + } + } + + while (header_size + length > buffer_.size()) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + + if (type == kZeroType && length == 0) { + buffer_.clear(); + *fragment_type_or_err = kBadRecord; + return true; + } + + if (checksum_) { + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6); + if (actual_crc != expected_crc) { + *drop_size = buffer_.size(); + buffer_.clear(); + 
*fragment_type_or_err = kBadRecordChecksum; + return true; + } + } + + buffer_.remove_prefix(header_size + length); + + *fragment = Slice(header + header_size, length); + *fragment_type_or_err = type; + return true; +} + } // namespace log } // namespace rocksdb diff --git a/db/log_reader.h b/db/log_reader.h index 2c4f4f05990..bd5f8f54883 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -53,7 +53,7 @@ class Reader { Reader(std::shared_ptr info_log, // @lint-ignore TXT2 T25377293 Grandfathered in std::unique_ptr&& file, Reporter* reporter, - bool checksum, uint64_t log_num, bool retry_after_eof); + bool checksum, uint64_t log_num); ~Reader(); @@ -66,6 +66,8 @@ class Reader { WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords); + bool TryReadRecord(Slice* record, std::string* scratch); + // Returns the physical offset of the last record returned by ReadRecord. // // Undefined before the first call to ReadRecord. @@ -76,6 +78,9 @@ class Reader { return eof_; } + // returns true if the reader has encountered read error. + bool hasReadError() const { return read_error_; } + // when we know more data has been written to the file. we can use this // function to force the reader to look again in the file. // Also aligns the file position indicator to the start of the next block @@ -83,6 +88,8 @@ class Reader { // block that was partially read. void UnmarkEOF(); + void ForceUnmarkEOF(); + SequentialFileReader* file() { return file_.get(); } private: @@ -91,6 +98,8 @@ class Reader { Reporter* const reporter_; bool const checksum_; char* const backing_store_; + + // Internal state variables used for reading records Slice buffer_; bool eof_; // Last Read() indicated EOF by returning < kBlockSize bool read_error_; // Error occurred while reading from file @@ -110,10 +119,8 @@ class Reader { // Whether this is a recycled log file bool recycled_; - // Whether retry after encountering EOF - // TODO (yanqin) add support for retry policy, e.g. 
sleep, max retry limit, - // etc. - const bool retry_after_eof_; + std::string fragments_; + bool in_fragmented_record_; // Extend record types with the following special values enum { @@ -136,9 +143,16 @@ class Reader { // Return type, or one of the preceding special values unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size); + bool TryReadFragment(Slice* result, size_t* drop_size, + unsigned int* fragment_type_or_err); + // Read some more bool ReadMore(size_t* drop_size, int *error); + bool TryReadMore(size_t* drop_size, int* error); + + void UnmarkEOFInternal(); + // Reports dropped bytes to the reporter. // buffer_ must be updated to remove the dropped bytes prior to invocation. void ReportCorruption(size_t bytes, const char* reason); diff --git a/db/log_test.cc b/db/log_test.cc index 834dec7cd82..c79ffd82c65 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -43,7 +43,10 @@ static std::string RandomSkewedString(int i, Random* rnd) { return BigString(NumberString(i), rnd->Skewed(17)); } -class LogTest : public ::testing::TestWithParam { +// Param type is tuple +// get<0>(tuple): non-zero if recycling log, zero if regular log +// get<1>(tuple): true if allow retry after read EOF, false otherwise +class LogTest : public ::testing::TestWithParam> { private: class StringSource : public SequentialFile { public: @@ -53,16 +56,20 @@ class LogTest : public ::testing::TestWithParam { bool force_eof_; size_t force_eof_position_; bool returned_partial_; - explicit StringSource(Slice& contents) : - contents_(contents), - force_error_(false), - force_error_position_(0), - force_eof_(false), - force_eof_position_(0), - returned_partial_(false) { } + bool fail_after_read_partial_; + explicit StringSource(Slice& contents, bool fail_after_read_partial) + : contents_(contents), + force_error_(false), + force_error_position_(0), + force_eof_(false), + force_eof_position_(0), + returned_partial_(false), + fail_after_read_partial_(fail_after_read_partial) {} Status 
Read(size_t n, Slice* result, char* scratch) override { - EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + if (fail_after_read_partial_) { + EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + } if (force_error_) { if (force_error_position_ >= n) { @@ -151,9 +158,8 @@ class LogTest : public ::testing::TestWithParam { Writer writer_; Reader reader_; - // Record metadata for testing initial offset functionality - static size_t initial_offset_record_sizes_[]; - uint64_t initial_offset_last_record_offsets_[4]; + protected: + bool allow_retry_read_; public: LogTest() @@ -161,19 +167,12 @@ class LogTest : public ::testing::TestWithParam { dest_holder_(test::GetWritableFileWriter( new test::StringSink(&reader_contents_), "" /* don't care */)), source_holder_(test::GetSequentialFileReader( - new StringSource(reader_contents_), "" /* file name */)), - writer_(std::move(dest_holder_), 123, GetParam()), + new StringSource(reader_contents_, !std::get<1>(GetParam())), + "" /* file name */)), + writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())), reader_(nullptr, std::move(source_holder_), &report_, - true /* checksum */, 123 /* log_number */, - false /* retry_after_eof */) { - int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; - initial_offset_last_record_offsets_[0] = 0; - initial_offset_last_record_offsets_[1] = header_size + 10000; - initial_offset_last_record_offsets_[2] = 2 * (header_size + 10000); - initial_offset_last_record_offsets_[3] = 2 * (header_size + 10000) + - (2 * log::kBlockSize - 1000) + - 3 * header_size; - } + true /* checksum */, 123 /* log_number */), + allow_retry_read_(std::get<1>(GetParam())) {} Slice* get_reader_contents() { return &reader_contents_; } @@ -189,7 +188,13 @@ class LogTest : public ::testing::TestWithParam { WALRecoveryMode::kTolerateCorruptedTailRecords) { std::string scratch; Slice record; - if (reader_.ReadRecord(&record, &scratch, wal_recovery_mode)) { + bool ret = false; + if (allow_retry_read_) { + ret = reader_.TryReadRecord(&record, &scratch); + } else { + ret = reader_.ReadRecord(&record, &scratch, wal_recovery_mode); + } + if (ret) { return record.ToString(); } else { return "EOF"; @@ -258,23 +263,8 @@ class LogTest : public ::testing::TestWithParam { return "OK"; } } - - void WriteInitialOffsetLog() { - for (int i = 0; i < 4; i++) { - std::string record(initial_offset_record_sizes_[i], - static_cast('a' + i)); - Write(record); - } - } - }; -size_t LogTest::initial_offset_record_sizes_[] = - {10000, // Two sizable records in first block - 10000, - 2 * log::kBlockSize - 1000, // Span three blocks - 1}; - TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } TEST_P(LogTest, ReadWrite) { @@ -312,7 +302,8 @@ TEST_P(LogTest, Fragmentation) { TEST_P(LogTest, MarginalTrailer) { // Make a trailer that is exactly the same length as an empty record. - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + int header_size = + std::get<0>(GetParam()) ? 
kRecyclableHeaderSize : kHeaderSize; const int n = kBlockSize - 2 * header_size; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); @@ -326,7 +317,8 @@ TEST_P(LogTest, MarginalTrailer) { TEST_P(LogTest, MarginalTrailer2) { // Make a trailer that is exactly the same length as an empty record. - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; const int n = kBlockSize - 2 * header_size; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); @@ -339,7 +331,8 @@ TEST_P(LogTest, MarginalTrailer2) { } TEST_P(LogTest, ShortTrailer) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; const int n = kBlockSize - 2 * header_size + 4; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); @@ -352,7 +345,8 @@ TEST_P(LogTest, ShortTrailer) { } TEST_P(LogTest, AlignedEof) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; const int n = kBlockSize - 2 * header_size + 4; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); @@ -403,6 +397,11 @@ TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) { } TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } Write("foo"); ShrinkSize(4); // Drop all payload as well as a header byte ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); @@ -412,13 +411,20 @@ TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) { } TEST_P(LogTest, BadLength) { - int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. + return; + } + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize; const int kPayloadSize = kBlockSize - header_size; Write(BigString("bar", kPayloadSize)); Write("foo"); // Least significant size byte is stored in header[4]. IncrementByte(4, 1); - if (!GetParam()) { + if (!recyclable_log) { ASSERT_EQ("foo", Read()); ASSERT_EQ(kBlockSize, DroppedBytes()); ASSERT_EQ("OK", MatchError("bad record length")); @@ -428,6 +434,12 @@ TEST_P(LogTest, BadLength) { } TEST_P(LogTest, BadLengthAtEndIsIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. + return; + } Write("foo"); ShrinkSize(1); ASSERT_EQ("EOF", Read()); @@ -436,6 +448,12 @@ TEST_P(LogTest, BadLengthAtEndIsIgnored) { } TEST_P(LogTest, BadLengthAtEndIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. 
+ return; + } Write("foo"); ShrinkSize(1); ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); @@ -447,7 +465,8 @@ TEST_P(LogTest, ChecksumMismatch) { Write("foooooo"); IncrementByte(0, 14); ASSERT_EQ("EOF", Read()); - if (!GetParam()) { + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { ASSERT_EQ(14U, DroppedBytes()); ASSERT_EQ("OK", MatchError("checksum mismatch")); } else { @@ -458,8 +477,10 @@ TEST_P(LogTest, ChecksumMismatch) { TEST_P(LogTest, UnexpectedMiddleType) { Write("foo"); - SetByte(6, static_cast(GetParam() ? kRecyclableMiddleType : kMiddleType)); - FixChecksum(0, 3, !!GetParam()); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte(6, static_cast(recyclable_log ? kRecyclableMiddleType + : kMiddleType)); + FixChecksum(0, 3, !!recyclable_log); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("missing start")); @@ -467,8 +488,10 @@ TEST_P(LogTest, UnexpectedMiddleType) { TEST_P(LogTest, UnexpectedLastType) { Write("foo"); - SetByte(6, static_cast(GetParam() ? kRecyclableLastType : kLastType)); - FixChecksum(0, 3, !!GetParam()); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte(6, + static_cast(recyclable_log ? kRecyclableLastType : kLastType)); + FixChecksum(0, 3, !!recyclable_log); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("missing start")); @@ -477,8 +500,10 @@ TEST_P(LogTest, UnexpectedLastType) { TEST_P(LogTest, UnexpectedFullType) { Write("foo"); Write("bar"); - SetByte(6, static_cast(GetParam() ? kRecyclableFirstType : kFirstType)); - FixChecksum(0, 3, !!GetParam()); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte( + 6, static_cast(recyclable_log ? 
kRecyclableFirstType : kFirstType)); + FixChecksum(0, 3, !!recyclable_log); ASSERT_EQ("bar", Read()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); @@ -488,8 +513,10 @@ TEST_P(LogTest, UnexpectedFullType) { TEST_P(LogTest, UnexpectedFirstType) { Write("foo"); Write(BigString("bar", 100000)); - SetByte(6, static_cast(GetParam() ? kRecyclableFirstType : kFirstType)); - FixChecksum(0, 3, !!GetParam()); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte( + 6, static_cast(recyclable_log ? kRecyclableFirstType : kFirstType)); + FixChecksum(0, 3, !!recyclable_log); ASSERT_EQ(BigString("bar", 100000), Read()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); @@ -506,6 +533,11 @@ TEST_P(LogTest, MissingLastIsIgnored) { } TEST_P(LogTest, MissingLastIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } Write(BigString("bar", kBlockSize)); // Remove the LAST block, including header. ShrinkSize(14); @@ -524,6 +556,11 @@ TEST_P(LogTest, PartialLastIsIgnored) { } TEST_P(LogTest, PartialLastIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } Write(BigString("bar", kBlockSize)); // Cause a bad record length in the LAST block. ShrinkSize(1); @@ -550,7 +587,8 @@ TEST_P(LogTest, ErrorJoinsRecords) { SetByte(offset, 'x'); } - if (!GetParam()) { + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { ASSERT_EQ("correct", Read()); ASSERT_EQ("EOF", Read()); size_t dropped = DroppedBytes(); @@ -564,7 +602,8 @@ TEST_P(LogTest, ErrorJoinsRecords) { TEST_P(LogTest, ClearEofSingleBlock) { Write("foo"); Write("bar"); - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? 
kRecyclableHeaderSize : kHeaderSize; ForceEOF(3 + header_size + 2); ASSERT_EQ("foo", Read()); UnmarkEOF(); @@ -579,7 +618,8 @@ TEST_P(LogTest, ClearEofSingleBlock) { TEST_P(LogTest, ClearEofMultiBlock) { size_t num_full_blocks = 5; - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize; size_t n = (kBlockSize - header_size) * num_full_blocks + 25; Write(BigString("foo", n)); Write(BigString("bar", n)); @@ -628,7 +668,8 @@ TEST_P(LogTest, ClearEofError2) { } TEST_P(LogTest, Recycle) { - if (!GetParam()) { + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { return; // test is only valid for recycled logs } Write("foo"); @@ -651,7 +692,11 @@ TEST_P(LogTest, Recycle) { ASSERT_EQ("EOF", Read()); } -INSTANTIATE_TEST_CASE_P(bool, LogTest, ::testing::Values(0, 2)); +INSTANTIATE_TEST_CASE_P(bool, LogTest, + ::testing::Values(std::make_tuple(0, false), + std::make_tuple(0, true), + std::make_tuple(1, false), + std::make_tuple(1, true))); class RetriableLogTest : public ::testing::TestWithParam { private: @@ -717,8 +762,7 @@ class RetriableLogTest : public ::testing::TestWithParam { reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_)); assert(reader_ != nullptr); log_reader_.reset(new Reader(nullptr, std::move(reader_), &report_, - true /* checksum */, 123 /* log_number */, - true /* retry_after_eof */)); + true /* checksum */, 123 /* log_number */)); assert(log_reader_ != nullptr); } return s; @@ -738,14 +782,17 @@ class RetriableLogTest : public ::testing::TestWithParam { writer_->Sync(true); } - std::string Read() { - auto wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; + bool TryRead(std::string* result) { + assert(result != nullptr); + result->clear(); std::string scratch; Slice record; - if (log_reader_->ReadRecord(&record, &scratch, wal_recovery_mode)) { - 
return record.ToString(); + bool r = log_reader_->TryReadRecord(&record, &scratch); + if (r) { + result->assign(record.data(), record.size()); + return true; } else { - return "Read error"; + return false; } } }; @@ -754,12 +801,16 @@ TEST_P(RetriableLogTest, TailLog_PartialHeader) { ASSERT_OK(SetupTestEnv()); std::vector remaining_bytes_in_last_record; size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + bool eof = false; SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->LoadDependency( {{"RetriableLogTest::TailLog:AfterPart1", "RetriableLogTest::TailLog:BeforeReadRecord"}, - {"LogReader::ReadMore:FirstEOF", + {"LogReader::TryReadMore:FirstEOF", "RetriableLogTest::TailLog:BeforePart2"}}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack("LogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); SyncPoint::GetInstance()->EnableProcessing(); size_t delta = header_size - 1; @@ -779,23 +830,29 @@ TEST_P(RetriableLogTest, TailLog_PartialHeader) { std::string record; port::Thread log_reader_thread([&]() { TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord"); - record = Read(); + while (!TryRead(&record)) { + } }); log_reader_thread.join(); log_writer_thread.join(); ASSERT_EQ("foo", record); + ASSERT_TRUE(eof); } TEST_P(RetriableLogTest, TailLog_FullHeader) { ASSERT_OK(SetupTestEnv()); std::vector remaining_bytes_in_last_record; size_t header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + bool eof = false; SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->LoadDependency( {{"RetriableLogTest::TailLog:AfterPart1", "RetriableLogTest::TailLog:BeforeReadRecord"}, - {"LogReader::ReadMore:FirstEOF", + {"LogReader::TryReadMore:FirstEOF", "RetriableLogTest::TailLog:BeforePart2"}}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack("LogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); SyncPoint::GetInstance()->EnableProcessing(); size_t delta = header_size + 1; @@ -810,18 +867,45 @@ TEST_P(RetriableLogTest, TailLog_FullHeader) { TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1"); TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2"); Write(Slice(part2)); + ASSERT_TRUE(eof); }); std::string record; port::Thread log_reader_thread([&]() { TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord"); - record = Read(); + while (!TryRead(&record)) { + } }); log_reader_thread.join(); log_writer_thread.join(); ASSERT_EQ("foo", record); } +TEST_P(RetriableLogTest, NonBlockingReadFullRecord) { + // Clear all sync point callbacks even if this test does not use sync point. + // It is necessary, otherwise the execute of this test may hit a sync point + // with which a callback is registered. The registered callback may access + // some dead variable, causing segfault. + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(SetupTestEnv()); + size_t header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + size_t delta = header_size - 1; + size_t old_sz = contents().size(); + Encode("foo-bar"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + std::string record; + ASSERT_FALSE(TryRead(&record)); + ASSERT_TRUE(record.empty()); + Write(Slice(part2)); + ASSERT_TRUE(TryRead(&record)); + ASSERT_EQ("foo-bar", record); +} + INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2)); } // namespace log diff --git a/db/repair.cc b/db/repair.cc index ae74e578c2c..7b9409a229e 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -364,8 +364,7 @@ class Repairer { // propagating bad information (like overly large sequence // numbers). log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter, - true /*enable checksum*/, log, - false /* retry_after_eof */); + true /*enable checksum*/, log); // Initialize per-column family memtables for (auto* cfd : *vset_.GetColumnFamilySet()) { diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 4d6671ef66d..4f55a30d30a 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -315,8 +315,7 @@ Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { assert(file); currentLogReader_.reset( new log::Reader(options_->info_log, std::move(file), &reporter_, - read_options_.verify_checksums_, logFile->LogNumber(), - false /* retry_after_eof */)); + read_options_.verify_checksums_, logFile->LogNumber())); return Status::OK(); } } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 12c7754b198..6a1ef5a8400 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3565,8 +3565,7 @@ Status VersionSet::Recover( VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, - true /* checksum */, 
0 /* log_number */, - false /* retry_after_eof */); + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; std::vector replay_buffer; @@ -3781,8 +3780,7 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, - true /* checksum */, 0 /* log_number */, - false /* retry_after_eof */); + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -3942,8 +3940,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, - true /* checksum */, 0 /* log_number */, - false /* retry_after_eof */); + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 44676a77a7f..b306df710e1 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -457,7 +457,7 @@ Status WalManager::ReadFirstLine(const std::string& fname, reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, - true /*checksum*/, number, false /* retry_after_eof */); + true /*checksum*/, number); std::string scratch; Slice record; diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index c82369777df..c071c0bdfad 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -2006,9 +2006,8 @@ void DumpWalFile(Options options, std::string wal_file, bool print_header, // bogus input, carry on as best we can log_number = 0; } - log::Reader reader(options.info_log, std::move(wal_file_reader), &reporter, - true /* checksum */, log_number, - false /* retry_after_eof */); + log::Reader reader(options.info_log, std::move(wal_file_reader), 
+ &reporter, true /* checksum */, log_number); std::string scratch; WriteBatch batch; Slice record; From 6264e08448b58d39fd4ad397fb1628cd0f3fd5e1 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sat, 20 Oct 2018 18:15:34 -0700 Subject: [PATCH 03/33] Add support for MANIFEST tailing Test plan (to be updated) ``` $make clean && make -j32 all check ``` All tests must pass. --- db/db_impl.h | 13 ++ db/db_impl_open.cc | 2 +- db/db_impl_readonly.cc | 2 +- db/db_impl_secondary.cc | 368 ++++++++++++++++++++++++++++++ db/db_impl_secondary.h | 140 ++++++++++++ db/log_reader.h | 2 + db/version_builder.cc | 44 ++-- db/version_builder.h | 8 +- db/version_set.cc | 493 +++++++++++++++++++++++++++++++++++++--- db/version_set.h | 35 ++- include/rocksdb/db.h | 9 + src.mk | 1 + 12 files changed, 1061 insertions(+), 56 deletions(-) create mode 100644 db/db_impl_secondary.cc create mode 100644 db/db_impl_secondary.h diff --git a/db/db_impl.h b/db/db_impl.h index f7ead885b10..cfe053ad318 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -1215,6 +1215,8 @@ class DBImpl : public DB { // and log_empty_. Refer to the definition of each variable below for more // details. InstrumentedMutex log_write_mutex_; + + protected: // State below is protected by mutex_ // With two_write_queues enabled, some of the variables that accessed during // WriteToWAL need different synchronization: log_empty_, alive_log_files_, @@ -1222,6 +1224,7 @@ class DBImpl : public DB { // more description. mutable InstrumentedMutex mutex_; + private: std::atomic shutting_down_; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 @@ -1252,8 +1255,12 @@ class DBImpl : public DB { // read and writes are protected by log_write_mutex_ instead. This is to avoid // expesnive mutex_ lock during WAL write, which update log_empty_. 
bool log_empty_; + + protected: ColumnFamilyHandleImpl* default_cf_handle_; InternalStats* default_cf_internal_stats_; + + private: std::unique_ptr column_family_memtables_; struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) @@ -1320,12 +1327,16 @@ class DBImpl : public DB { WriteBatch cached_recoverable_state_; std::atomic cached_recoverable_state_empty_ = {true}; std::atomic total_log_size_; + + protected: // only used for dynamically adjusting max_total_wal_size. it is a sum of // [write_buffer_size * max_write_buffer_number] over all column families uint64_t max_total_in_memory_state_; // If true, we have only one (default) column family. We use this to optimize // some code-paths bool single_column_family_mode_; + + private: // If this is non-empty, we need to delete these log files in background // threads. Protected by db mutex. autovector logs_to_free_; @@ -1544,12 +1555,14 @@ class DBImpl : public DB { std::string db_absolute_path_; + protected: // The options to access storage files const EnvOptions env_options_; // Additonal options for compaction and flush EnvOptions env_options_for_compaction_; + private: // Number of running IngestExternalFile() calls. 
// REQUIRES: mutex held int num_running_ingest_file_; diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 99c27f45d57..9c8937bf2f0 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -405,7 +405,6 @@ Status DBImpl::Recover( } if (s.ok()) { - SequenceNumber next_sequence(kMaxSequenceNumber); default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); @@ -468,6 +467,7 @@ Status DBImpl::Recover( if (!logs.empty()) { // Recover in the order in which the logs were generated std::sort(logs.begin(), logs.end()); + SequenceNumber next_sequence(kMaxSequenceNumber); s = RecoverLogFiles(logs, &next_sequence, read_only); if (!s.ok()) { // Clear memtables if recovery failed diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 5d7515c28e2..46b2c61b5d5 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -159,7 +159,6 @@ Status DB::OpenForReadOnly( *dbptr = nullptr; handles->clear(); - SuperVersionContext sv_context(/* create_superversion */ true); DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); impl->mutex_.Lock(); Status s = impl->Recover(column_families, true /* read only */, @@ -176,6 +175,7 @@ Status DB::OpenForReadOnly( handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); } } + SuperVersionContext sv_context(/* create_superversion */ true); if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { sv_context.NewSuperVersion(); diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc new file mode 100644 index 00000000000..205339a00a2 --- /dev/null +++ b/db/db_impl_secondary.cc @@ -0,0 +1,368 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/db_impl_secondary.h" +#include "db/db_impl.h" +#include "db/db_iter.h" +#include "db/forward_iterator.h" +#include "db/merge_context.h" +#include "db/range_del_aggregator.h" +#include "monitoring/perf_context_imp.h" +#include "util/auto_roll_logger.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE + +DBImplSecondary::DBImplSecondary(const DBOptions& db_options, + const std::string& dbname) + : DBImpl(db_options, dbname) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in secondary mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplSecondary::~DBImplSecondary() {} + +Status DBImplSecondary::Recover( + const std::vector& column_families) { + mutex_.AssertHeld(); + + Status s; + s = versions_->RecoverAsSecondary(column_families, &manifest_reader_, + &manifest_reporter_, + &manifest_reader_status_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // may check this value to decide whether to flush. + max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + if (s.ok()) { + default_cf_handle_ = new ColumnFamilyHandleImpl( + versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); + default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + single_column_family_mode_ = + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; + } + + // TODO: attempt to recover from WAL files. 
+ return s; +} + +// Implementation of the DB interface +Status DBImplSecondary::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + return GetImpl(read_options, column_family, key, value); +} + +Status DBImplSecondary::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); + StopWatch sw(env_, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); + + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + // Acquire SuperVersion + SuperVersion* super_version = GetAndRefSuperVersion(cfd); + SequenceNumber snapshot = versions_->LastSequence(); + ; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + Status s; + LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); + + bool done = false; + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + super_version->imm->Get( + lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, super_version); + return s; + } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, pinnable_val, &s, + &merge_context, &max_covering_tombstone_seq); + RecordTick(stats_, MEMTABLE_MISS); + } + { + PERF_TIMER_GUARD(get_post_process_time); + ReturnAndCleanupSuperVersion(cfd, 
super_version); + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + MeasureTime(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + return s; +} + +Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.managed) { + return NewErrorIterator( + Status::NotSupported("Managed iterator is not supported anymore.")); + } + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); + } + Iterator* result = nullptr; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + ReadCallback* read_callback = nullptr; // No read callback provided. + if (read_options.tailing) { + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, super_version); + result = NewDBIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + cfd->user_comparator(), iter, kMaxSequenceNumber, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + read_callback, this, cfd); + } else { + auto snapshot = read_options.snapshot != nullptr + ? 
read_options.snapshot->GetSequenceNumber() + : versions_->LastSequence(); + result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + } + return result; +} + +ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( + const ReadOptions& read_options, ColumnFamilyData* cfd, + SequenceNumber snapshot, ReadCallback* read_callback) { + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + auto db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + snapshot, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number, read_callback); + auto internal_iter = + NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), snapshot); + db_iter->SetIterUnderDBIter(internal_iter); + return db_iter; +} + +Status DBImplSecondary::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + ReadCallback* read_callback = nullptr; // No read callback provided. 
+ if (iterators == nullptr) { + return Status::InvalidArgument("iterators not allowed to be nullptr"); + } + iterators->clear(); + iterators->reserve(column_families.size()); + if (read_options.tailing) { + for (auto cfh : column_families) { + auto cfd = reinterpret_cast(cfh)->cfd(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, super_version); + iterators->push_back(NewDBIterator( + env_, read_options, *cfd->ioptions(), + super_version->mutable_cf_options, cfd->user_comparator(), iter, + kMaxSequenceNumber, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + read_callback, this, cfd)); + } + } else { + SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; + + for (auto cfh : column_families) { + auto* cfd = reinterpret_cast(cfh)->cfd(); + iterators->push_back( + NewIteratorImpl(read_options, cfd, read_seq, read_callback)); + } + } + + return Status::OK(); +} + +Status DBImplSecondary::TryCatchUpWithPrimary() { + assert(versions_.get() != nullptr); + assert(manifest_reader_.get() != nullptr); + Status s; + std::unordered_set cfds_changed; + InstrumentedMutexLock lock_guard(mutex()); + s = versions_->ReadAndApply(mutex(), &manifest_reader_, &cfds_changed); + if (s.ok()) { + SuperVersionContext sv_context(true /* create_superversion */); + for (auto cfd : cfds_changed) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, mutex()); + } + sv_context.Clean(); + } + return s; +} + +Status DB::OpenAsSecondary(const Options& options, const std::string& dbname, + const std::string& secondary_dbname, DB** dbptr) { + *dbptr = nullptr; + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.emplace_back(kDefaultColumnFamilyName, 
cf_options); + std::vector handles; + + Status s = DB::OpenAsSecondary(db_options, dbname, secondary_dbname, + column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + delete handles[0]; + } + return s; +} + +Status DB::OpenAsSecondary( + const DBOptions& db_options, const std::string& dbname, + const std::string& secondary_dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + *dbptr = nullptr; + if (db_options.max_open_files != -1) { + // TODO (yanqin) maybe support max_open_files != -1 by creating hard links + // on SST files so that db secondary can still have access to old SSTs + // while primary instance may delete original. + return Status::InvalidArgument("require max_open_files to be -1"); + } + + DBOptions tmp_opts(db_options); + if (nullptr == tmp_opts.info_log) { + Env* env = tmp_opts.env; + assert(env != nullptr); + std::string secondary_db_abs_path; + env->GetAbsolutePath(secondary_dbname, &secondary_db_abs_path); + std::string fname = InfoLogFileName(secondary_dbname, secondary_db_abs_path, + tmp_opts.db_log_dir); + + env->CreateDirIfMissing(secondary_dbname); + if (tmp_opts.log_file_time_to_roll > 0 || tmp_opts.max_log_file_size > 0) { + AutoRollLogger* result = new AutoRollLogger( + env, secondary_dbname, tmp_opts.db_log_dir, + tmp_opts.max_log_file_size, tmp_opts.log_file_time_to_roll, + tmp_opts.info_log_level); + Status s = result->GetStatus(); + if (!s.ok()) { + delete result; + } else { + tmp_opts.info_log.reset(result); + } + } + if (nullptr == tmp_opts.info_log) { + env->RenameFile(fname, OldInfoLogFileName( + secondary_dbname, env->NowMicros(), + secondary_db_abs_path, tmp_opts.db_log_dir)); + Status s = env->NewLogger(fname, &(tmp_opts.info_log)); + if (tmp_opts.info_log != nullptr) { + tmp_opts.info_log->SetInfoLogLevel(tmp_opts.info_log_level); + } + } + } + + assert(tmp_opts.info_log != nullptr); + + handles->clear(); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, 
dbname); + impl->mutex_.Lock(); + Status s = impl->Recover(column_families); + if (s.ok()) { + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (nullptr == cfd) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + SuperVersionContext sv_context(true /* create_superversion */); + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); + } + } + impl->mutex_.Unlock(); + sv_context.Clean(); + if (s.ok()) { + *dbptr = impl; + for (auto h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} +#else // !ROCKSDB_LITE + +Status DB::OpenAsSecondary(const Options& /*options*/, + const std::string& /*name*/, + const std::string& /*secondary_name*/, + DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenAsSecondary( + const DBOptions& /*db_options*/, const std::string& /*dbname*/, + const std::string& /*secondary_name*/, + const std::vector& /*column_families*/, + std::vector* /*handles*/, DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE + +} // namespace rocksdb diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h new file mode 100644 index 00000000000..6b7570c414d --- /dev/null +++ b/db/db_impl_secondary.h @@ -0,0 +1,140 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include "db/db_impl.h" + +namespace rocksdb { + +class DBImplSecondary : public DBImpl { + public: + DBImplSecondary(const DBOptions& options, const std::string& dbname); + virtual ~DBImplSecondary(); + + Status Recover(const std::vector& column_families); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value); + + using DBImpl::NewIterator; + virtual Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family) override; + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback); + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::SingleDelete; + virtual Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + 
return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector&, + uint64_t* /*manifest_file_size*/, + bool /*flush_memtable*/ = true) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::SyncWAL; + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, 
+ const IngestExternalFileOptions& /*ingestion_options*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + Status TryCatchUpWithPrimary(); + + private: + friend class DB; + + // No copying allowed + DBImplSecondary(const DBImplSecondary&); + void operator=(const DBImplSecondary&); + + std::unique_ptr manifest_reader_; + std::unique_ptr manifest_reporter_; + std::unique_ptr manifest_reader_status_; +}; +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/db/log_reader.h b/db/log_reader.h index bd5f8f54883..83d05ddcb11 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -92,6 +92,8 @@ class Reader { SequentialFileReader* file() { return file_.get(); } + Reporter* GetReporter() const { return reporter_; } + private: std::shared_ptr info_log_; const std::unique_ptr file_; diff --git a/db/version_builder.cc b/db/version_builder.cc index 7b45347c124..a920e28d651 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -364,10 +364,10 @@ class VersionBuilder::Rep { CheckConsistency(vstorage); } - void LoadTableHandlers(InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load, - const SliceTransform* prefix_extractor) { + Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load, + const SliceTransform* prefix_extractor) { assert(table_cache_ != nullptr); size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity(); @@ -394,7 +394,8 @@ class VersionBuilder::Rep { size_t table_cache_usage = table_cache_->get_cache()->GetUsage(); if (table_cache_usage >= load_limit) { - return; + // TODO (yanqin) find a suitable status code. 
+ return Status::OK(); } else { max_load = load_limit - table_cache_usage; } @@ -402,11 +403,15 @@ class VersionBuilder::Rep { // std::vector> files_meta; + std::vector statuses; for (int level = 0; level < num_levels_; level++) { for (auto& file_meta_pair : levels_[level].added_files) { auto* file_meta = file_meta_pair.second; - assert(!file_meta->table_reader_handle); - files_meta.emplace_back(file_meta, level); + // If the file has been opened before, just skip it. + if (!file_meta->table_reader_handle) { + files_meta.emplace_back(file_meta, level); + statuses.emplace_back(Status::OK()); + } if (files_meta.size() >= max_load) { break; } @@ -426,7 +431,7 @@ class VersionBuilder::Rep { auto* file_meta = files_meta[file_idx].first; int level = files_meta[file_idx].second; - table_cache_->FindTable( + statuses[file_idx] = table_cache_->FindTable( env_options_, *(base_vstorage_->InternalComparator()), file_meta->fd, &file_meta->table_reader_handle, prefix_extractor, false /*no_io */, true /* record_read_stats */, @@ -448,6 +453,12 @@ class VersionBuilder::Rep { for (auto& t : threads) { t.join(); } + for (const auto& s : statuses) { + if (!s.ok()) { + return s; + } + } + return Status::OK(); } void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { @@ -487,14 +498,15 @@ void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { rep_->SaveTo(vstorage); } -void VersionBuilder::LoadTableHandlers(InternalStats* internal_stats, - int max_threads, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load, - const SliceTransform* prefix_extractor) { - rep_->LoadTableHandlers(internal_stats, max_threads, - prefetch_index_and_filter_in_cache, is_initial_load, - prefix_extractor); +Status VersionBuilder::LoadTableHandlers( + InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load, + const SliceTransform* prefix_extractor) { + return rep_->LoadTableHandlers(internal_stats, max_threads, + 
prefetch_index_and_filter_in_cache, + is_initial_load, + prefix_extractor); } void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, diff --git a/db/version_builder.h b/db/version_builder.h index d6ee37e08ff..168301fdd61 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -33,10 +33,10 @@ class VersionBuilder { bool CheckConsistencyForNumLevels(); void Apply(VersionEdit* edit); void SaveTo(VersionStorageInfo* vstorage); - void LoadTableHandlers(InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load, - const SliceTransform* prefix_extractor); + Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load, + const SliceTransform* prefix_extractor); void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f); private: diff --git a/db/version_set.cc b/db/version_set.cc index 6a1ef5a8400..54ecc2abe92 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -712,6 +712,7 @@ void LevelIterator::InitFileIterator(size_t new_file_index) { } } } +} // anonymous namespace // A wrapper of version builder which references the current version in // constructor and unref it in the destructor. 
@@ -735,7 +736,6 @@ class BaseReferencedVersionBuilder { VersionBuilder* version_builder_; Version* version_; }; -} // anonymous namespace Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, @@ -2936,7 +2936,7 @@ Status VersionSet::ProcessManifestWrites( } else if (group_start != std::numeric_limits::max()) { group_start = std::numeric_limits::max(); } - LogAndApplyHelper(last_writer->cfd, builder, version, e, mu); + LogAndApplyHelper(last_writer->cfd, builder, e, mu); batch_edits.push_back(e); } } @@ -2990,6 +2990,7 @@ Status VersionSet::ProcessManifestWrites( assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || manifest_file_size_ > db_options_->max_manifest_file_size) { + TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest"); pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_.load()); new_descriptor_log = true; @@ -3098,6 +3099,7 @@ Status VersionSet::ProcessManifestWrites( if (s.ok() && new_descriptor_log) { s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, db_directory); + TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest"); } if (s.ok()) { @@ -3225,7 +3227,7 @@ Status VersionSet::ProcessManifestWrites( return s; } -// 'datas' is gramatically incorrect. We still use this notation is to indicate +// 'datas' is gramatically incorrect. We still use this notation to indicate // that this variable represents a collection of column_family_data. 
Status VersionSet::LogAndApply( const autovector& column_family_datas, @@ -3307,6 +3309,133 @@ Status VersionSet::LogAndApply( new_cf_options); } +Status VersionSet::ReadAndApply( + InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + std::unordered_set* cfds_changed) { + assert(manifest_reader != nullptr); + assert(cfds_changed != nullptr); + mu->AssertHeld(); + + Status s; + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t previous_log_number = 0; + uint32_t max_column_family = 0; + uint64_t min_log_number_to_keep = 0; + + while (s.ok()) { + Slice record; + std::string scratch; + bool read_success = false; // Make lint happy + log::Reader* reader = manifest_reader->get(); + std::string old_manifest_path = reader->file()->file_name(); + while ((read_success = reader->TryReadRecord(&record, &scratch))) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + auto cfd = column_family_set_->GetColumnFamily(edit.column_family_); + if (active_version_builders_.find(edit.column_family_) == + active_version_builders_.end()) { + std::unique_ptr builder_guard( + new BaseReferencedVersionBuilder(cfd)); + active_version_builders_.insert( + std::make_pair(edit.column_family_, std::move(builder_guard))); + } + s = ApplyOneVersionEditToBuilder( + edit, &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + if (!s.ok()) { + break; + } + if (column_family_set_->get_table_cache()->GetCapacity() == + TableCache::kInfiniteCapacity) { + // Unlimited table cache. Pre-load table handle now so that the table + // files are still accessible to us after the primary unlinks them. 
+ auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + if (!s.ok() && !s.IsPathNotFound()) { + break; + } else if (s.IsPathNotFound()) { + s = Status::OK(); + // TODO (yanqin) release file descriptors already opened, or modify + // LoadTableHandlers so that opened files are not re-opened. + } else { // s.ok() == true + auto version = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(version->storage_info()); + version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); + AppendVersion(cfd, version); + active_version_builders_.erase(builder_iter); + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + } + } + if (have_next_file) { + next_file_number_.store(next_file + 1); + } + if (have_last_sequence) { + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; + last_sequence_ = last_sequence; + } + if (have_prev_log_number) { + prev_log_number_ = previous_log_number; + MarkFileNumberUsed(previous_log_number); + } + if (have_log_number) { + MarkFileNumberUsed(log_number); + } + column_family_set_->UpdateMaxColumnFamily(max_column_family); + MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + } + if (s.ok() && !read_success) { + // It's possible that we have finished reading the current MANIFEST, and + // the primary has created a new MANIFEST. 
+ log::Reader::Reporter* reporter = reader->GetReporter(); + s = MaybeSwitchManifest(reporter, manifest_reader); + reader = manifest_reader->get(); + } + if (s.ok() && reader->file()->file_name() == old_manifest_path) { + break; + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + auto builder_iter = active_version_builders_.find(cfd->GetID()); + if (builder_iter == active_version_builders_.end()) { + continue; + } + auto builder = builder_iter->second->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + + return s; +} + void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { assert(edit->IsColumnFamilyManipulation()); edit->SetNextFile(next_file_number_.load()); @@ -3325,8 +3454,8 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { } void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, - VersionBuilder* builder, Version* /*v*/, - VersionEdit* edit, InstrumentedMutex* mu) { + VersionBuilder* builder, VersionEdit* edit, + InstrumentedMutex* mu) { #ifdef NDEBUG (void)cfd; #endif @@ -3353,7 +3482,7 @@ void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, builder->Apply(edit); } -Status VersionSet::ApplyOneVersionEdit( +Status VersionSet::ApplyOneVersionEditToBuilder( VersionEdit& edit, const std::unordered_map& name_to_options, std::unordered_map& column_families_not_found, @@ -3480,6 +3609,152 @@ Status VersionSet::ApplyOneVersionEdit( return Status::OK(); } +Status VersionSet::ApplyOneVersionEditToBuilder( + VersionEdit& edit, bool* have_log_number, uint64_t* /* log_number */, + bool* have_prev_log_number, uint64_t* previous_log_number, + bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, + SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, + uint32_t* max_column_family) { + ColumnFamilyData* cfd = nullptr; + Status status; + if (edit.is_column_family_add_) { + // TODO (yanqin) for 
now the secondary ignores column families created + // after Open. This also simplifies handling of switching to a new MANIFEST + // and processing the snapshot of the system at the beginning of the + // MANIFEST. + return Status::OK(); + } else if (edit.is_column_family_drop_) { + // Drop the column family by setting it to be 'dropped' without destroying + // the column family handle. + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // Drop a CF created after Open? Then ignore + if (cfd == nullptr) { + return Status::OK(); + } + cfd->SetDropped(); + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } + } else { + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // Operation on a CF created after Open? Then ignore + if (cfd == nullptr) { + return Status::OK(); + } + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + builder->Apply(&edit); + } + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + // TODO (yanqin) use a separate info log for secondary instance. 
+ } else { + cfd->SetLogNumber(edit.log_number_); + *have_log_number = true; + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + return Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } + } + + if (edit.has_prev_log_number_) { + *previous_log_number = edit.prev_log_number_; + *have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + *next_file = edit.next_file_number_; + *have_next_file = true; + } + + if (edit.has_max_column_family_) { + *max_column_family = edit.max_column_family_; + } + + if (edit.has_min_log_number_to_keep_) { + *min_log_number_to_keep = + std::max(*min_log_number_to_keep, edit.min_log_number_to_keep_); + } + + if (edit.has_last_sequence_) { + *last_sequence = edit.last_sequence_; + *have_last_sequence = true; + } + return status; +} + +Status VersionSet::MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader) { + assert(manifest_reader != nullptr); + Status s; + do { + std::string manifest_path; + s = GetCurrentManifestPath(&manifest_path); + std::unique_ptr manifest_file; + if (s.ok()) { + if (nullptr == manifest_reader->get() || + manifest_reader->get()->file()->file_name() != manifest_path) { + TEST_SYNC_POINT( + "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0"); + TEST_SYNC_POINT( + "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:1"); + s = env_->NewSequentialFile( + manifest_path, &manifest_file, + env_->OptimizeForManifestRead(env_options_)); + } else { + // No need to switch manifest. + break; + } + } + std::unique_ptr manifest_file_reader; + if (s.ok()) { + manifest_file_reader.reset( + new SequentialFileReader(std::move(manifest_file), manifest_path)); + // TODO(yanqin) secondary instance needs a separate info log file. 
+ manifest_reader->reset( + new log::Reader(nullptr, std::move(manifest_file_reader), reporter, + true /* checksum */, 0 /* log_number */)); + ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", + manifest_path.c_str()); + } + } while (s.IsPathNotFound()); + return s; +} + +Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { + assert(manifest_path != nullptr); + std::string fname; + Status s = ReadFileToString(env_, CurrentFileName(dbname_), &fname); + if (!s.ok()) { + return s; + } + if (fname.empty() || fname.back() != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + // remove the trailing '\n' + fname.resize(fname.size() - 1); + FileType type; + bool parse_ok = ParseFileName(fname, &manifest_file_number_, &type); + if (!parse_ok || type != kDescriptorFile) { + return Status::Corruption("CURRENT file corrupted"); + } + *manifest_path = dbname_; + if (dbname_.back() != '/') { + manifest_path->push_back('/'); + } + *manifest_path += fname; + return Status::OK(); +} + Status VersionSet::Recover( const std::vector& column_families, bool read_only) { @@ -3493,43 +3768,28 @@ Status VersionSet::Recover( std::unordered_map column_families_not_found; // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string manifest_filename; - Status s = ReadFileToString( - env_, CurrentFileName(dbname_), &manifest_filename - ); + std::string manifest_path; + Status s = GetCurrentManifestPath(&manifest_path); if (!s.ok()) { return s; } - if (manifest_filename.empty() || - manifest_filename.back() != '\n') { - return Status::Corruption("CURRENT file does not end with newline"); - } - // remove the trailing '\n' - manifest_filename.resize(manifest_filename.size() - 1); - FileType type; - bool parse_ok = - ParseFileName(manifest_filename, &manifest_file_number_, &type); - if (!parse_ok || type != kDescriptorFile) { - return Status::Corruption("CURRENT file corrupted"); - } 
ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n", - manifest_filename.c_str()); + manifest_path.c_str()); - manifest_filename = dbname_ + "/" + manifest_filename; std::unique_ptr manifest_file_reader; { std::unique_ptr manifest_file; - s = env_->NewSequentialFile(manifest_filename, &manifest_file, + s = env_->NewSequentialFile(manifest_path, &manifest_file, env_->OptimizeForManifestRead(env_options_)); if (!s.ok()) { return s; } manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_filename)); + new SequentialFileReader(std::move(manifest_file), manifest_path)); } uint64_t current_manifest_file_size; - s = env_->GetFileSize(manifest_filename, ¤t_manifest_file_size); + s = env_->GetFileSize(manifest_path, ¤t_manifest_file_size); if (!s.ok()) { return s; } @@ -3596,7 +3856,7 @@ Status VersionSet::Recover( TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:LastInAtomicGroup", &edit); for (auto& e : replay_buffer) { - s = ApplyOneVersionEdit( + s = ApplyOneVersionEditToBuilder( e, cf_name_to_options, column_families_not_found, builders, &have_log_number, &log_number, &have_prev_log_number, &previous_log_number, &have_next_file, &next_file, @@ -3617,7 +3877,7 @@ Status VersionSet::Recover( s = Status::Corruption("corrupted atomic group"); break; } - s = ApplyOneVersionEdit( + s = ApplyOneVersionEditToBuilder( edit, cf_name_to_options, column_families_not_found, builders, &have_log_number, &log_number, &have_prev_log_number, &previous_log_number, &have_next_file, &next_file, @@ -3724,7 +3984,7 @@ Status VersionSet::Recover( "prev_log_number is %lu," "max_column_family is %u," "min_log_number_to_keep is %lu\n", - manifest_filename.c_str(), (unsigned long)manifest_file_number_, + manifest_path.c_str(), (unsigned long)manifest_file_number_, (unsigned long)next_file_number_.load(), (unsigned long)last_sequence_, (unsigned long)log_number, (unsigned long)prev_log_number_, column_family_set_->GetMaxColumnFamily(), 
min_log_number_to_keep_2pc()); @@ -3746,6 +4006,179 @@ Status VersionSet::Recover( return s; } +Status VersionSet::RecoverAsSecondary( + const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status) { + assert(manifest_reader != nullptr); + assert(manifest_reporter != nullptr); + assert(manifest_reader_status != nullptr); + + std::unordered_map cf_name_to_options; + for (const auto& cf : column_families) { + cf_name_to_options.insert({cf.name, cf.options}); + } + + // add default column family + auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); + if (default_cf_iter == cf_name_to_options.end()) { + return Status::InvalidArgument("Default column family not specified"); + } + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(default_cf_iter->second, &default_cf_edit); + // In recovery, nobody else can access it, so it's fine to set it to be + // initialized earlier. 
+ default_cfd->set_initialized(); + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t previous_log_number = 0; + uint32_t max_column_family = 0; + uint64_t min_log_number_to_keep = 0; + std::unordered_map builders; + std::unordered_map column_families_not_found; + builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); + + manifest_reader_status->reset(new Status()); + manifest_reporter->reset(new LogReporter()); + static_cast(manifest_reporter->get())->status = + manifest_reader_status->get(); + Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); + log::Reader* reader = manifest_reader->get(); + + while (s.ok()) { + assert(reader != nullptr); + Slice record; + std::string scratch; + while (s.ok() && reader->TryReadRecord(&record, &scratch)) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + s = ApplyOneVersionEditToBuilder( + edit, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + } + if (s.ok()) { + bool enough = have_next_file && have_log_number && have_last_sequence; + if (enough) { + for (const auto& cf : column_families) { + auto cfd = column_family_set_->GetColumnFamily(cf.name); + if (cfd == nullptr) { + enough = false; + break; + } + } + } + if (enough && column_family_set_->get_table_cache()->GetCapacity() == + TableCache::kInfiniteCapacity) { + for (const auto& cf : column_families) { + auto cfd = column_family_set_->GetColumnFamily(cf.name); + assert(cfd != nullptr); + if (!cfd->IsDropped()) { + auto builder_iter = builders.find(cfd->GetID()); + assert(builder_iter != builders.end()); + auto builder = 
builder_iter->second->version_builder(); + assert(builder != nullptr); + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + if (!s.ok()) { + enough = false; + if (s.IsPathNotFound()) { + s = Status::OK(); + } + break; + } + } + } + if (!enough) { + // TODO (yanqin) release table handlers if any of the files are not + // found. + } + } + if (enough) { + break; + } + } + } + + if (s.ok()) { + if (!have_prev_log_number) { + previous_log_number = 0; + } + column_family_set_->UpdateMaxColumnFamily(max_column_family); + + MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + MarkFileNumberUsed(previous_log_number); + MarkFileNumberUsed(log_number); + + for (auto cfd : *column_family_set_) { + assert(builders.count(cfd->GetID()) > 0); + auto builder = builders[cfd->GetID()]->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + auto builders_iter = builders.find(cfd->GetID()); + assert(builders_iter != builders.end()); + auto* builder = builders_iter->second->version_builder(); + + Version* v = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(v->storage_info()); + + // Install recovered version + v->PrepareApply(*cfd->GetLatestMutableCFOptions(), + !(db_options_->skip_stats_update_on_db_open)); + AppendVersion(cfd, v); + } + next_file_number_.store(next_file + 1); + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; + last_sequence_ = last_sequence; + prev_log_number_ = previous_log_number; + for (auto cfd : *column_family_set_) { + if 
(cfd->IsDropped()) { + continue; + } + ROCKS_LOG_INFO(db_options_->info_log, + "Column family [%s] (ID %u), log number is %" PRIu64 "\n", + cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); + } + } + for (auto& builder : builders) { + delete builder.second; + } + return s; +} + Status VersionSet::ListColumnFamilies(std::vector* column_families, const std::string& dbname, Env* env) { // these are just for performance reasons, not correcntes, diff --git a/db/version_set.h b/db/version_set.h index b50f653ba43..c4c1b2445b1 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -735,9 +735,7 @@ struct ObsoleteFileInfo { } }; -namespace { class BaseReferencedVersionBuilder; -} class VersionSet { public: @@ -799,12 +797,24 @@ class VersionSet { bool new_descriptor_log = false, const ColumnFamilyOptions* new_cf_options = nullptr); + Status ReadAndApply(InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + std::unordered_set* cfds_changed); + + Status GetCurrentManifestPath(std::string* manifest_filename); + // Recover the last saved descriptor from persistent storage. // If read_only == true, Recover() will not complain if some column families // are not opened Status Recover(const std::vector& column_families, bool read_only = false); + Status RecoverAsSecondary( + const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status); + // Reads a manifest file and returns a list of column families in // column_families. 
static Status ListColumnFamilies(std::vector* column_families, @@ -984,6 +994,7 @@ class VersionSet { friend class Version; friend class DBImpl; + friend class DBImplReadOnly; struct LogReporter : public log::Reader::Reporter { Status* status; @@ -1007,7 +1018,8 @@ class VersionSet { ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); - Status ApplyOneVersionEdit( + // REQUIRES db mutex + Status ApplyOneVersionEditToBuilder( VersionEdit& edit, const std::unordered_map& name_to_opts, std::unordered_map& column_families_not_found, @@ -1017,6 +1029,18 @@ class VersionSet { bool* have_last_sequence, SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + // REQUIRES db mutex + Status ApplyOneVersionEditToBuilder( + VersionEdit& edit, bool* have_log_number, uint64_t* log_number, + bool* have_prev_log_number, uint64_t* previous_log_number, + bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, + SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, + uint32_t* max_column_family); + + Status MaybeSwitchManifest(log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader); + + // REQUIRES db mutex at beginning. 
may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, @@ -1070,12 +1094,15 @@ class VersionSet { // env options for all reads and writes except compactions EnvOptions env_options_; + std::unordered_map> + active_version_builders_; + // No copying allowed VersionSet(const VersionSet&); void operator=(const VersionSet&); void LogAndApplyCFHelper(VersionEdit* edit); - void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v, + void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, VersionEdit* edit, InstrumentedMutex* mu); }; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 9d5316546b2..25e8bdf9a5a 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -162,6 +162,15 @@ class DB { std::vector* handles, DB** dbptr, bool error_if_log_file_exist = false); + static Status OpenAsSecondary(const Options& options, const std::string& name, + const std::string& secondary_name, DB** dbptr); + + static Status OpenAsSecondary( + const DBOptions& db_options, const std::string& name, + const std::string& secondary_name, + const std::vector& column_families, + std::vector* handles, DB** dbptr); + // Open DB with column families. // db_options specify database specific options // column_families is the vector of all column families in the database, diff --git a/src.mk b/src.mk index 7283329051f..5b696431cbf 100644 --- a/src.mk +++ b/src.mk @@ -22,6 +22,7 @@ LIB_SOURCES = \ db/db_impl_files.cc \ db/db_impl_open.cc \ db/db_impl_readonly.cc \ + db/db_impl_secondary.cc \ db/db_impl_write.cc \ db/db_info_dumper.cc \ db/db_iter.cc \ From 5207df365c33c00ae754dc14bc102ccdbb98ceca Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 26 Dec 2018 10:33:25 -0800 Subject: [PATCH 04/33] Add unit test for DBImplSecondary Summary: as title Test Plan: ``` $make clean && make -j32 db_secondary_test $./db_secondary_test ``` All tests must pass. 
--- CMakeLists.txt | 1 + Makefile | 5 + db/db_secondary_test.cc | 294 ++++++++++++++++++++++++++++++++++++++++ db/version_builder.cc | 6 +- db/version_set.cc | 5 +- src.mk | 1 + tools/ldb_cmd.cc | 4 +- 7 files changed, 307 insertions(+), 9 deletions(-) create mode 100644 db/db_secondary_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cabf3b388c..d4980c98ed2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -873,6 +873,7 @@ if(WITH_TESTS) db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc + db/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc diff --git a/Makefile b/Makefile index 7030eb48d9e..eee0f9fba02 100644 --- a/Makefile +++ b/Makefile @@ -443,6 +443,7 @@ TESTS = \ db_merge_operator_test \ db_options_test \ db_range_del_test \ + db_secondary_test \ db_sst_test \ db_tailing_iter_test \ db_io_failure_test \ @@ -547,6 +548,7 @@ TESTS = \ range_tombstone_fragmenter_test \ range_del_aggregator_test \ sst_file_reader_test \ + db_secondary_test \ PARALLEL_TEST = \ backupable_db_test \ @@ -1571,6 +1573,9 @@ range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_secondary_test: db/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc new file mode 100644 index 00000000000..d960247d68c --- /dev/null +++ b/db/db_secondary_test.cc @@ -0,0 +1,294 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl_secondary.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "util/fault_injection_test_env.h" +#include "util/sync_point.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +class DBSecondaryTest : public DBTestBase { + public: + DBSecondaryTest() : DBTestBase("/db_secondary_test"), secondary_dbname_() { + secondary_dbname_ = + test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); + } + + ~DBSecondaryTest() { + if (getenv("KEEP_DB") != nullptr) { + fprintf(stdout, "Secondary DB is still at %s\n", + secondary_dbname_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(secondary_dbname_, options)); + } + } + + protected: + Status ReopenAsSecondary(const Options& options) { + return DB::OpenAsSecondary(options, dbname_, secondary_dbname_, &db_); + } + + std::string secondary_dbname_; +}; + +TEST_F(DBSecondaryTest, ReopenAsSecondary) { + Options options; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Put("bar", "bar_value")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + Close(); + + ASSERT_OK(ReopenAsSecondary(options)); + ASSERT_EQ("foo_value", Get("foo")); + ASSERT_EQ("bar_value", Get("bar")); + ReadOptions ropts; + ropts.verify_checksums = true; + auto db1 = static_cast(db_); + ASSERT_NE(nullptr, db1); + Iterator* iter = db1->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + if (0 == count) { + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value", iter->value().ToString()); + } else if (1 == count) { + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value", iter->value().ToString()); + } + ++count; + } + delete iter; + ASSERT_EQ(2, count); + Close(); +} + +TEST_F(DBSecondaryTest, 
OpenAsSecondary) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + DB* db_secondary = nullptr; + Options options1; + options1.env = env_; + options1.max_open_files = -1; + Status s = + DB::OpenAsSecondary(options1, dbname_, secondary_dbname_, &db_secondary); + ASSERT_OK(s); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ReadOptions ropts; + ropts.verify_checksums = true; + const auto verify_db_func = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ASSERT_OK(db_secondary->Get(ropts, "foo", &value)); + ASSERT_EQ(foo_val, value); + ASSERT_OK(db_secondary->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = db_secondary->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + ASSERT_OK(Flush()); + + ASSERT_OK( + static_cast(db_secondary)->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); + + delete db_secondary; + Close(); +} + +TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { + Options options; + options.env = env_; + Reopen(options); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", + "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, + {"VersionSet::ProcessManifestWrites:AfterNewManifest", + "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // Make sure db calls RecoverLogFiles so as to trigger a manifest write, + // which causes the db to switch to a new MANIFEST upon start. + port::Thread ro_db_thread([&]() { + DB* db_secondary = nullptr; + Options options1; + options1.env = env_; + options1.max_open_files = -1; + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_dbname_, + &db_secondary); + ASSERT_OK(s); + delete db_secondary; + }); + Reopen(options); + ro_db_thread.join(); + Close(); +} + +TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + DB* db1 = nullptr; + Options options1; + options1.env = env_; + options1.max_open_files = -1; + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_dbname_, &db1); + ASSERT_OK(s); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db1->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db1->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db1->NewIterator(ropts); + ASSERT_NE(nullptr, 
iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + delete db1; + Close(); +} + +TEST_F(DBSecondaryTest, MissingTableFile) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + DB* db1 = nullptr; + Options options1; + options1.env = env_; + options1.max_open_files = -1; + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_dbname_, &db1); + ASSERT_OK(s); + + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + auto db_secondary = static_cast(db1); + ASSERT_NE(nullptr, db_secondary); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_NOK(db_secondary->Get(ropts, "foo", &value)); + ASSERT_NOK(db_secondary->Get(ropts, "bar", &value)); + + ASSERT_OK(db_secondary->TryCatchUpWithPrimary()); + ASSERT_OK(db_secondary->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db1->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + 
iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + delete db1; + Close(); +} +#endif //! ROCKSDB_LITE + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/version_builder.cc b/db/version_builder.cc index a920e28d651..84e4dc6579a 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -500,13 +500,11 @@ void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { Status VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load, + bool prefetch_index_and_filter_in_cache, bool is_initial_load, const SliceTransform* prefix_extractor) { return rep_->LoadTableHandlers(internal_stats, max_threads, prefetch_index_and_filter_in_cache, - is_initial_load, - prefix_extractor); + is_initial_load, prefix_extractor); } void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, diff --git a/db/version_set.cc b/db/version_set.cc index 54ecc2abe92..4ee91060ebb 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3331,10 +3331,9 @@ Status VersionSet::ReadAndApply( while (s.ok()) { Slice record; std::string scratch; - bool read_success = false; // Make lint happy log::Reader* reader = manifest_reader->get(); std::string old_manifest_path = reader->file()->file_name(); - while ((read_success = reader->TryReadRecord(&record, 
&scratch))) { + while (reader->TryReadRecord(&record, &scratch)) { VersionEdit edit; s = edit.DecodeFrom(record); if (!s.ok()) { @@ -3406,7 +3405,7 @@ Status VersionSet::ReadAndApply( column_family_set_->UpdateMaxColumnFamily(max_column_family); MarkMinLogNumberToKeep2PC(min_log_number_to_keep); } - if (s.ok() && !read_success) { + if (s.ok()) { // It's possible that we have finished reading the current MANIFEST, and // the primary has created a new MANIFEST. log::Reader::Reporter* reporter = reader->GetReporter(); diff --git a/src.mk b/src.mk index 5b696431cbf..55b4e3427c6 100644 --- a/src.mk +++ b/src.mk @@ -280,6 +280,7 @@ MAIN_SOURCES = \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ + db/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index c071c0bdfad..dd923366517 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -2006,8 +2006,8 @@ void DumpWalFile(Options options, std::string wal_file, bool print_header, // bogus input, carry on as best we can log_number = 0; } - log::Reader reader(options.info_log, std::move(wal_file_reader), - &reporter, true /* checksum */, log_number); + log::Reader reader(options.info_log, std::move(wal_file_reader), &reporter, + true /* checksum */, log_number); std::string scratch; WriteBatch batch; Slice record; From 92b38038970583d08cc96f21a833a39cdf28b0df Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 18 Jan 2019 17:52:06 -0800 Subject: [PATCH 05/33] Add new file to TARGETS and CMake --- CMakeLists.txt | 1 + TARGETS | 1 + 2 files changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d4980c98ed2..132d3b04e96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -489,6 +489,7 @@ set(SOURCES db/db_impl_debug.cc db/db_impl_experimental.cc db/db_impl_readonly.cc + db/db_impl_secondary.cc db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc diff --git a/TARGETS b/TARGETS index 
4590560f1a8..590ae480899 100644 --- a/TARGETS +++ b/TARGETS @@ -98,6 +98,7 @@ cpp_library( "db/db_impl_files.cc", "db/db_impl_open.cc", "db/db_impl_readonly.cc", + "db/db_impl_secondary.cc", "db/db_impl_write.cc", "db/db_info_dumper.cc", "db/db_iter.cc", From 8f3c4155c423b413be91874fed838a3b0a2cce26 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 1 Feb 2019 15:46:40 -0800 Subject: [PATCH 06/33] Address review comments --- db/db_impl_secondary.cc | 52 ++++++++++++++--------------------------- db/version_set.cc | 3 ++- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 205339a00a2..a5b082caf02 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -40,8 +40,7 @@ Status DBImplSecondary::Recover( if (immutable_db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } - // Initial max_total_in_memory_state_ before recovery logs. Log recovery - // may check this value to decide whether to flush. + // Initial max_total_in_memory_state_ before recovery logs. 
max_total_in_memory_state_ = 0; for (auto cfd : *versions_->GetColumnFamilySet()) { auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); @@ -75,8 +74,8 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); + auto cfh = static_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { @@ -86,7 +85,6 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, // Acquire SuperVersion SuperVersion* super_version = GetAndRefSuperVersion(cfd); SequenceNumber snapshot = versions_->LastSequence(); - ; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; Status s; @@ -144,17 +142,14 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, auto cfd = cfh->cfd(); ReadCallback* read_callback = nullptr; // No read callback provided. if (read_options.tailing) { - SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); - auto iter = new ForwardIterator(this, read_options, cfd, super_version); - result = NewDBIterator( - env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, - super_version->mutable_cf_options.max_sequential_skip_in_iterations, - read_callback, this, cfd); + return NewErrorIterator(Status::NotSupported( + "tailing iterator not supported in secondary mode")); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. + return NewErrorIterator( + Status::NotSupported("snapshot not supported in secondary mode")); } else { - auto snapshot = read_options.snapshot != nullptr - ? 
read_options.snapshot->GetSequenceNumber() - : versions_->LastSequence(); + auto snapshot = versions_->LastSequence(); result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); } return result; @@ -194,32 +189,19 @@ Status DBImplSecondary::NewIterators( iterators->clear(); iterators->reserve(column_families.size()); if (read_options.tailing) { - for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); - auto iter = new ForwardIterator(this, read_options, cfd, super_version); - iterators->push_back(NewDBIterator( - env_, read_options, *cfd->ioptions(), - super_version->mutable_cf_options, cfd->user_comparator(), iter, - kMaxSequenceNumber, - super_version->mutable_cf_options.max_sequential_skip_in_iterations, - read_callback, this, cfd)); - } + return Status::NotSupported( + "tailing iterator not supported in secondary mode"); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. + return Status::NotSupported("snapshot not supported in secondary mode"); } else { - SequenceNumber latest_snapshot = versions_->LastSequence(); - SequenceNumber read_seq = - read_options.snapshot != nullptr - ? 
reinterpret_cast(read_options.snapshot) - ->number_ - : latest_snapshot; - + SequenceNumber read_seq = versions_->LastSequence(); for (auto cfh : column_families) { - auto* cfd = reinterpret_cast(cfh)->cfd(); + ColumnFamilyData* cfd = static_cast(cfh)->cfd(); iterators->push_back( NewIteratorImpl(read_options, cfd, read_seq, read_callback)); } } - return Status::OK(); } diff --git a/db/version_set.cc b/db/version_set.cc index 4ee91060ebb..329c28a7403 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3339,7 +3339,8 @@ Status VersionSet::ReadAndApply( if (!s.ok()) { break; } - auto cfd = column_family_set_->GetColumnFamily(edit.column_family_); + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); if (active_version_builders_.find(edit.column_family_) == active_version_builders_.end()) { std::unique_ptr builder_guard( From 9b2d270be1abab7a94171fb06c3903b3f6afb99a Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 1 Feb 2019 17:48:02 -0800 Subject: [PATCH 07/33] Address more review comments 1. Rename secondary_dbname to secondary_path because it's not a database name. It's just a directory to keep files owned by the secondary. 2. Search for next MANIFEST when necessary. Rename the variable because the directory path is not a db, thus should not be called secondary_dbname. 
--- db/db_impl_secondary.cc | 24 ++++++++++++------------ db/db_impl_secondary.h | 2 ++ db/db_secondary_test.cc | 20 ++++++++++---------- db/version_set.cc | 16 ++++++++-------- include/rocksdb/db.h | 4 ++-- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index a5b082caf02..ec8cf7df521 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -224,7 +224,7 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { } Status DB::OpenAsSecondary(const Options& options, const std::string& dbname, - const std::string& secondary_dbname, DB** dbptr) { + const std::string& secondary_path, DB** dbptr) { *dbptr = nullptr; DBOptions db_options(options); @@ -233,7 +233,7 @@ Status DB::OpenAsSecondary(const Options& options, const std::string& dbname, column_families.emplace_back(kDefaultColumnFamilyName, cf_options); std::vector handles; - Status s = DB::OpenAsSecondary(db_options, dbname, secondary_dbname, + Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path, column_families, &handles, dbptr); if (s.ok()) { assert(handles.size() == 1); @@ -244,7 +244,7 @@ Status DB::OpenAsSecondary(const Options& options, const std::string& dbname, Status DB::OpenAsSecondary( const DBOptions& db_options, const std::string& dbname, - const std::string& secondary_dbname, + const std::string& secondary_path, const std::vector& column_families, std::vector* handles, DB** dbptr) { *dbptr = nullptr; @@ -259,15 +259,15 @@ Status DB::OpenAsSecondary( if (nullptr == tmp_opts.info_log) { Env* env = tmp_opts.env; assert(env != nullptr); - std::string secondary_db_abs_path; - env->GetAbsolutePath(secondary_dbname, &secondary_db_abs_path); - std::string fname = InfoLogFileName(secondary_dbname, secondary_db_abs_path, + std::string secondary_abs_path; + env->GetAbsolutePath(secondary_path, &secondary_abs_path); + std::string fname = InfoLogFileName(secondary_path, secondary_abs_path, tmp_opts.db_log_dir); - 
env->CreateDirIfMissing(secondary_dbname); + env->CreateDirIfMissing(secondary_path); if (tmp_opts.log_file_time_to_roll > 0 || tmp_opts.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( - env, secondary_dbname, tmp_opts.db_log_dir, + env, secondary_path, tmp_opts.db_log_dir, tmp_opts.max_log_file_size, tmp_opts.log_file_time_to_roll, tmp_opts.info_log_level); Status s = result->GetStatus(); @@ -279,8 +279,8 @@ Status DB::OpenAsSecondary( } if (nullptr == tmp_opts.info_log) { env->RenameFile(fname, OldInfoLogFileName( - secondary_dbname, env->NowMicros(), - secondary_db_abs_path, tmp_opts.db_log_dir)); + secondary_path, env->NowMicros(), + secondary_abs_path, tmp_opts.db_log_dir)); Status s = env->NewLogger(fname, &(tmp_opts.info_log)); if (tmp_opts.info_log != nullptr) { tmp_opts.info_log->SetInfoLogLevel(tmp_opts.info_log_level); @@ -333,14 +333,14 @@ Status DB::OpenAsSecondary( Status DB::OpenAsSecondary(const Options& /*options*/, const std::string& /*name*/, - const std::string& /*secondary_name*/, + const std::string& /*secondary_path*/, DB** /*dbptr*/) { return Status::NotSupported("Not supported in ROCKSDB_LITE."); } Status DB::OpenAsSecondary( const DBOptions& /*db_options*/, const std::string& /*dbname*/, - const std::string& /*secondary_name*/, + const std::string& /*secondary_path*/, const std::vector& /*column_families*/, std::vector* /*handles*/, DB** /*dbptr*/) { return Status::NotSupported("Not supported in ROCKSDB_LITE."); diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 6b7570c414d..b149f66e399 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -122,6 +122,8 @@ class DBImplSecondary : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + // Try to catch up with the primary by reading as much as possible from the + // log files until there is nothing more to read or encounters an error. 
Status TryCatchUpWithPrimary(); private: diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index d960247d68c..f64ff490ba6 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -18,28 +18,28 @@ namespace rocksdb { #ifndef ROCKSDB_LITE class DBSecondaryTest : public DBTestBase { public: - DBSecondaryTest() : DBTestBase("/db_secondary_test"), secondary_dbname_() { - secondary_dbname_ = + DBSecondaryTest() : DBTestBase("/db_secondary_test"), secondary_path_() { + secondary_path_ = test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); } ~DBSecondaryTest() { if (getenv("KEEP_DB") != nullptr) { fprintf(stdout, "Secondary DB is still at %s\n", - secondary_dbname_.c_str()); + secondary_path_.c_str()); } else { Options options; options.env = env_; - EXPECT_OK(DestroyDB(secondary_dbname_, options)); + EXPECT_OK(DestroyDB(secondary_path_, options)); } } protected: Status ReopenAsSecondary(const Options& options) { - return DB::OpenAsSecondary(options, dbname_, secondary_dbname_, &db_); + return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); } - std::string secondary_dbname_; + std::string secondary_path_; }; TEST_F(DBSecondaryTest, ReopenAsSecondary) { @@ -91,7 +91,7 @@ TEST_F(DBSecondaryTest, OpenAsSecondary) { options1.env = env_; options1.max_open_files = -1; Status s = - DB::OpenAsSecondary(options1, dbname_, secondary_dbname_, &db_secondary); + DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db_secondary); ASSERT_OK(s); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -159,7 +159,7 @@ TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { Options options1; options1.env = env_; options1.max_open_files = -1; - Status s = DB::OpenAsSecondary(options1, dbname_, secondary_dbname_, + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db_secondary); ASSERT_OK(s); delete db_secondary; @@ -185,7 +185,7 @@ TEST_F(DBSecondaryTest, 
MissingTableFileDuringOpen) { Options options1; options1.env = env_; options1.max_open_files = -1; - Status s = DB::OpenAsSecondary(options1, dbname_, secondary_dbname_, &db1); + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db1); ASSERT_OK(s); ReadOptions ropts; ropts.verify_checksums = true; @@ -232,7 +232,7 @@ TEST_F(DBSecondaryTest, MissingTableFile) { Options options1; options1.env = env_; options1.max_open_files = -1; - Status s = DB::OpenAsSecondary(options1, dbname_, secondary_dbname_, &db1); + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db1); ASSERT_OK(s); for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { diff --git a/db/version_set.cc b/db/version_set.cc index 329c28a7403..a5ca356c3ac 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3406,13 +3406,14 @@ Status VersionSet::ReadAndApply( column_family_set_->UpdateMaxColumnFamily(max_column_family); MarkMinLogNumberToKeep2PC(min_log_number_to_keep); } - if (s.ok()) { - // It's possible that we have finished reading the current MANIFEST, and - // the primary has created a new MANIFEST. - log::Reader::Reporter* reporter = reader->GetReporter(); - s = MaybeSwitchManifest(reporter, manifest_reader); - reader = manifest_reader->get(); - } + // It's possible that: + // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. + // 2) we have finished reading the current MANIFEST. + // 3) we have encountered an IOError reading the current MANIFEST. + // We need to look for the next MANIFEST and start from there. If we cannot + // find the next MANIFEST, we should exit the loop. 
+ s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); + reader = manifest_reader->get(); if (s.ok() && reader->file()->file_name() == old_manifest_path) { break; } @@ -3432,7 +3433,6 @@ Status VersionSet::ReadAndApply( } } } - return s; } diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 25e8bdf9a5a..e551c50417e 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -163,11 +163,11 @@ class DB { bool error_if_log_file_exist = false); static Status OpenAsSecondary(const Options& options, const std::string& name, - const std::string& secondary_name, DB** dbptr); + const std::string& secondary_path, DB** dbptr); static Status OpenAsSecondary( const DBOptions& db_options, const std::string& name, - const std::string& secondary_name, + const std::string& secondary_path, const std::vector& column_families, std::vector* handles, DB** dbptr); From fc5fb971919614dd4bdb9ef150fa4a78fd166885 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 14 Feb 2019 15:34:35 -0800 Subject: [PATCH 08/33] Uninteresting change --- db/db_impl_secondary.h | 34 +++++++++++++++++----------------- db/db_secondary_test.cc | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index b149f66e399..b562c53ed0e 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -16,13 +16,13 @@ namespace rocksdb { class DBImplSecondary : public DBImpl { public: DBImplSecondary(const DBOptions& options, const std::string& dbname); - virtual ~DBImplSecondary(); + ~DBImplSecondary(); Status Recover(const std::vector& column_families); // Implementations of the DB interface using DB::Get; - virtual Status Get(const ReadOptions& options, + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; @@ -30,7 +30,7 @@ class DBImplSecondary : public DBImpl { const Slice& key, PinnableSlice* value); using DBImpl::NewIterator; - virtual Iterator* 
NewIterator(const ReadOptions&, + Iterator* NewIterator(const ReadOptions&, ColumnFamilyHandle* column_family) override; ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, @@ -38,41 +38,41 @@ class DBImplSecondary : public DBImpl { SequenceNumber snapshot, ReadCallback* read_callback); - virtual Status NewIterators( + Status NewIterators( const ReadOptions& options, const std::vector& column_families, std::vector* iterators) override; using DBImpl::Put; - virtual Status Put(const WriteOptions& /*options*/, + Status Put(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Merge; - virtual Status Merge(const WriteOptions& /*options*/, + Status Merge(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Delete; - virtual Status Delete(const WriteOptions& /*options*/, + Status Delete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::SingleDelete; - virtual Status SingleDelete(const WriteOptions& /*options*/, + Status SingleDelete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status Write(const WriteOptions& /*options*/, + Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& /*options*/, + Status CompactRange(const CompactRangeOptions& /*options*/, 
ColumnFamilyHandle* /*column_family*/, const Slice* /*begin*/, const Slice* /*end*/) override { @@ -80,7 +80,7 @@ class DBImplSecondary : public DBImpl { } using DBImpl::CompactFiles; - virtual Status CompactFiles( + Status CompactFiles( const CompactionOptions& /*compact_options*/, ColumnFamilyHandle* /*column_family*/, const std::vector& /*input_file_names*/, @@ -90,32 +90,32 @@ class DBImplSecondary : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status DisableFileDeletions() override { + Status DisableFileDeletions() override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status EnableFileDeletions(bool /*force*/) override { + Status EnableFileDeletions(bool /*force*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status GetLiveFiles(std::vector&, + Status GetLiveFiles(std::vector&, uint64_t* /*manifest_file_size*/, bool /*flush_memtable*/ = true) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Flush; - virtual Status Flush(const FlushOptions& /*options*/, + Status Flush(const FlushOptions& /*options*/, ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::SyncWAL; - virtual Status SyncWAL() override { + Status SyncWAL() override { return Status::NotSupported("Not supported operation in read only mode."); } using DB::IngestExternalFile; - virtual Status IngestExternalFile( + Status IngestExternalFile( ColumnFamilyHandle* /*column_family*/, const std::vector& /*external_files*/, const IngestExternalFileOptions& /*ingestion_options*/) override { diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index f64ff490ba6..92932fecbe1 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -23,7 +23,7 @@ class DBSecondaryTest : public DBTestBase { 
test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); } - ~DBSecondaryTest() { + ~DBSecondaryTest() override { if (getenv("KEEP_DB") != nullptr) { fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); From 4e76ada54579a927802c7d6a1b3ee6a2239ae690 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 19 Feb 2019 10:16:50 -0800 Subject: [PATCH 09/33] Rebase and adjust format --- db/db_impl_secondary.cc | 11 +++++------ db/db_impl_secondary.h | 43 +++++++++++++++++++---------------------- db/db_secondary_test.cc | 7 +++---- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index ec8cf7df521..f50da15dbef 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -267,9 +267,8 @@ Status DB::OpenAsSecondary( env->CreateDirIfMissing(secondary_path); if (tmp_opts.log_file_time_to_roll > 0 || tmp_opts.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( - env, secondary_path, tmp_opts.db_log_dir, - tmp_opts.max_log_file_size, tmp_opts.log_file_time_to_roll, - tmp_opts.info_log_level); + env, secondary_path, tmp_opts.db_log_dir, tmp_opts.max_log_file_size, + tmp_opts.log_file_time_to_roll, tmp_opts.info_log_level); Status s = result->GetStatus(); if (!s.ok()) { delete result; @@ -278,9 +277,9 @@ Status DB::OpenAsSecondary( } } if (nullptr == tmp_opts.info_log) { - env->RenameFile(fname, OldInfoLogFileName( - secondary_path, env->NowMicros(), - secondary_abs_path, tmp_opts.db_log_dir)); + env->RenameFile( + fname, OldInfoLogFileName(secondary_path, env->NowMicros(), + secondary_abs_path, tmp_opts.db_log_dir)); Status s = env->NewLogger(fname, &(tmp_opts.info_log)); if (tmp_opts.info_log != nullptr) { tmp_opts.info_log->SetInfoLogLevel(tmp_opts.info_log_level); diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index b562c53ed0e..9cbf6ce3f8b 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -22,60 +22,57 @@ class DBImplSecondary : 
public DBImpl { // Implementations of the DB interface using DB::Get; - Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value); using DBImpl::NewIterator; Iterator* NewIterator(const ReadOptions&, - ColumnFamilyHandle* column_family) override; + ColumnFamilyHandle* column_family) override; ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback); - Status NewIterators( - const ReadOptions& options, - const std::vector& column_families, - std::vector* iterators) override; + Status NewIterators(const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; using DBImpl::Put; Status Put(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Merge; Status Merge(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Delete; Status Delete(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/) override { + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using 
DBImpl::SingleDelete; Status SingleDelete(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/) override { + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + WriteBatch* /*updates*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactRange; Status CompactRange(const CompactRangeOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice* /*begin*/, - const Slice* /*end*/) override { + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, const Slice* /*end*/) override { return Status::NotSupported("Not supported operation in read only mode."); } @@ -98,14 +95,14 @@ class DBImplSecondary : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } Status GetLiveFiles(std::vector&, - uint64_t* /*manifest_file_size*/, - bool /*flush_memtable*/ = true) override { + uint64_t* /*manifest_file_size*/, + bool /*flush_memtable*/ = true) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Flush; Status Flush(const FlushOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/) override { + ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported operation in read only mode."); } diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 92932fecbe1..5a6295058e0 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -25,8 +25,7 @@ class DBSecondaryTest : public DBTestBase { ~DBSecondaryTest() override { if (getenv("KEEP_DB") != nullptr) { - fprintf(stdout, "Secondary DB is still at %s\n", - secondary_path_.c_str()); + fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); } else { Options options; 
options.env = env_; @@ -159,8 +158,8 @@ TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { Options options1; options1.env = env_; options1.max_open_files = -1; - Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, - &db_secondary); + Status s = + DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db_secondary); ASSERT_OK(s); delete db_secondary; }); From f8d47145113277cbbbeb8b4c632f7175c51200a6 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 20 Feb 2019 09:44:34 -0800 Subject: [PATCH 10/33] Add db_secondary_test to TARGETS --- TARGETS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TARGETS b/TARGETS index 590ae480899..073c977e5ad 100644 --- a/TARGETS +++ b/TARGETS @@ -606,6 +606,11 @@ ROCKS_TESTS = [ "db/db_range_del_test.cc", "serial", ], + [ + "db_secondary_test", + "db/db_secondary_test.cc", + "serial", + ], [ "db_sst_test", "db/db_sst_test.cc", From ce46b8c82e4420cc55804d70bc991f3f64a3fc40 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 28 Feb 2019 11:41:15 -0800 Subject: [PATCH 11/33] Minor change and update HISTORY --- HISTORY.md | 13 ++----------- db/log_reader.cc | 2 +- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6e25bbbe998..e467aa45354 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,22 +2,14 @@ ## Unreleased ### New Features * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. +* Added a feature to perform data-block sampling for compressibility, and report stats to user. * Add support for trace filtering. - ### Public API Change * statistics.stats_level_ becomes atomic. It is preferred to use statistics.set_stats_level() and statistics.get_stats_level() to access it. - +* Introduce a new IOError subcode, PathNotFound, to indicate trying to open a nonexistent file or directory for read. ### Bug Fixes * Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms. 
- -## Unreleased -### New Features -* Added a feature to perform data-block sampling for compressibility, and report stats to user. -### Public API Change -### Bug fixes - - ## 6.0.0 (2/19/2019) ### New Features * Enabled checkpoint on readonly db (DBImplReadOnly). @@ -33,7 +25,6 @@ * For all users of dictionary compression, we now generate a separate dictionary for compressing each bottom-level SST file. Previously we reused a single dictionary for a whole compaction to bottom level. The new approach achieves better compression ratios; however, it uses more memory and CPU for buffering/sampling data blocks and training dictionaries. * Add whole key bloom filter support in memtable. * Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`. -* Introduce a new IOError subcode, PathNotFound, to indicate trying to open a nonexistent file or directory for read. ### Public API Change * Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped. 
diff --git a/db/log_reader.cc b/db/log_reader.cc index 237fd192948..5b2379bfea1 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -227,7 +227,7 @@ bool Reader::TryReadRecord(Slice* record, std::string* scratch) { case kFirstType: case kRecyclableFirstType: - if (in_fragmented_record_ && !fragments_.empty()) { + if (in_fragmented_record_ || !fragments_.empty()) { ReportCorruption(fragments_.size(), "partial record without end(2)"); } prospective_record_offset = physical_record_offset; From 590ab93f2487400b4d5aa3de219177a7bca7c34f Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 28 Feb 2019 14:42:58 -0800 Subject: [PATCH 12/33] Rebase and adjust --- db/db_impl_secondary.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index f50da15dbef..8cbd3788f64 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -121,7 +121,7 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, RecordTick(stats_, NUMBER_KEYS_READ); size_t size = pinnable_val->size(); RecordTick(stats_, BYTES_READ, size); - MeasureTime(stats_, BYTES_PER_READ, size); + RecordTimeToHistogram(stats_, BYTES_PER_READ, size); PERF_COUNTER_ADD(get_read_bytes, size); } return s; From c95eea8754d89a147bdc1a6a6937bdf9658d507d Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 28 Feb 2019 14:54:15 -0800 Subject: [PATCH 13/33] Refactor some log reader code --- db/log_reader.cc | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/db/log_reader.cc b/db/log_reader.cc index 5b2379bfea1..46b9dfd2308 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -262,14 +262,9 @@ bool Reader::TryReadRecord(Slice* record, std::string* scratch) { break; case kBadHeader: + case kBadRecord: case kEof: case kOldRecord: - if (in_fragmented_record_) { - fragments_.clear(); - } - return false; - - case kBadRecord: if (in_fragmented_record_) { ReportCorruption(fragments_.size(), 
"error in middle of record"); in_fragmented_record_ = false; @@ -277,17 +272,12 @@ bool Reader::TryReadRecord(Slice* record, std::string* scratch) { } break; - case kBadRecordLen: case kBadRecordChecksum: if (recycled_) { fragments_.clear(); return false; } - if (fragment_type_or_err == kBadRecordLen) { - ReportCorruption(drop_size, "bad record length"); - } else { - ReportCorruption(drop_size, "checksum mismatch"); - } + ReportCorruption(drop_size, "checksum mismatch"); if (in_fragmented_record_) { ReportCorruption(fragments_.size(), "error in middle of record"); in_fragmented_record_ = false; @@ -537,22 +527,17 @@ bool Reader::TryReadMore(size_t* drop_size, int* error) { return true; } else if (!read_error_) { ForceUnmarkEOF(); - return !read_error_; - } else { - // Note that if buffer_ is non-empty, we have a truncated header at the - // end of the file, which can be caused by the writer crashing in the - // middle of writing the header. Unless explicitly requested we don't - // considering this an error, just report EOF. - if (buffer_.size()) { - *drop_size = buffer_.size(); - buffer_.clear(); - *error = kBadHeader; - return false; - } - buffer_.clear(); - *error = kEof; - return false; } + if (!read_error_) { + return true; + } + *error = kEof; + *drop_size = buffer_.size(); + if (buffer_.size() > 0) { + *error = kBadHeader; + } + buffer_.clear(); + return false; } // return true if the caller should process the fragment_type_or_err. 
From ae515741de10173e3fc3ce47ee26646824882cf3 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 28 Feb 2019 17:59:19 -0800 Subject: [PATCH 14/33] Refactor log reader --- db/db_impl_secondary.h | 2 +- db/log_reader.cc | 235 ++++++++++++++++++++--------------------- db/log_reader.h | 54 ++++++---- db/log_test.cc | 58 +++++----- db/version_set.cc | 17 +-- db/version_set.h | 14 +-- 6 files changed, 202 insertions(+), 178 deletions(-) diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 9cbf6ce3f8b..60da065e124 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -130,7 +130,7 @@ class DBImplSecondary : public DBImpl { DBImplSecondary(const DBImplSecondary&); void operator=(const DBImplSecondary&); - std::unique_ptr manifest_reader_; + std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; }; diff --git a/db/log_reader.cc b/db/log_reader.cc index 46b9dfd2308..e734e9d6c88 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -37,9 +37,7 @@ Reader::Reader(std::shared_ptr info_log, last_record_offset_(0), end_of_buffer_offset_(0), log_number_(log_num), - recycled_(false), - fragments_(), - in_fragmented_record_(false) {} + recycled_(false) {} Reader::~Reader() { delete[] backing_store_; @@ -199,108 +197,6 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, return false; } -// return true if a complete record has been read successfully. 
-bool Reader::TryReadRecord(Slice* record, std::string* scratch) { - assert(record != nullptr); - assert(scratch != nullptr); - record->clear(); - scratch->clear(); - - uint64_t prospective_record_offset = 0; - uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); - size_t drop_size = 0; - unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy - Slice fragment; - while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) { - switch (fragment_type_or_err) { - case kFullType: - case kRecyclableFullType: - if (in_fragmented_record_ && !fragments_.empty()) { - ReportCorruption(fragments_.size(), "partial record without end(1)"); - } - fragments_.clear(); - *record = fragment; - prospective_record_offset = physical_record_offset; - last_record_offset_ = prospective_record_offset; - in_fragmented_record_ = false; - return true; - - case kFirstType: - case kRecyclableFirstType: - if (in_fragmented_record_ || !fragments_.empty()) { - ReportCorruption(fragments_.size(), "partial record without end(2)"); - } - prospective_record_offset = physical_record_offset; - fragments_.assign(fragment.data(), fragment.size()); - in_fragmented_record_ = true; - break; - - case kMiddleType: - case kRecyclableMiddleType: - if (!in_fragmented_record_) { - ReportCorruption(fragment.size(), - "missing start of fragmented record(1)"); - } else { - fragments_.append(fragment.data(), fragment.size()); - } - break; - - case kLastType: - case kRecyclableLastType: - if (!in_fragmented_record_) { - ReportCorruption(fragment.size(), - "missing start of fragmented record(2)"); - } else { - fragments_.append(fragment.data(), fragment.size()); - scratch->assign(fragments_.data(), fragments_.size()); - fragments_.clear(); - *record = Slice(*scratch); - last_record_offset_ = prospective_record_offset; - in_fragmented_record_ = false; - return true; - } - break; - - case kBadHeader: - case kBadRecord: - case kEof: - case kOldRecord: - if 
(in_fragmented_record_) { - ReportCorruption(fragments_.size(), "error in middle of record"); - in_fragmented_record_ = false; - fragments_.clear(); - } - break; - - case kBadRecordChecksum: - if (recycled_) { - fragments_.clear(); - return false; - } - ReportCorruption(drop_size, "checksum mismatch"); - if (in_fragmented_record_) { - ReportCorruption(fragments_.size(), "error in middle of record"); - in_fragmented_record_ = false; - fragments_.clear(); - } - break; - - default: { - char buf[40]; - snprintf(buf, sizeof(buf), "unknown record type %u", - fragment_type_or_err); - ReportCorruption( - fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0), - buf); - in_fragmented_record_ = false; - fragments_.clear(); - break; - } - } - } - return false; -} - uint64_t Reader::LastRecordOffset() { return last_record_offset_; } @@ -316,14 +212,6 @@ void Reader::UnmarkEOF() { UnmarkEOFInternal(); } -void Reader::ForceUnmarkEOF() { - if (read_error_) { - return; - } - eof_ = false; - UnmarkEOFInternal(); -} - void Reader::UnmarkEOFInternal() { // If the EOF was in the middle of a block (a partial block was read) we have // to read the rest of the block as ReadPhysicalRecord can only read full @@ -507,7 +395,117 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { } } -bool Reader::TryReadMore(size_t* drop_size, int* error) { +bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode /*unused*/) { + assert(record != nullptr); + assert(scratch != nullptr); + record->clear(); + scratch->clear(); + + uint64_t prospective_record_offset = 0; + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + size_t drop_size = 0; + unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy + Slice fragment; + while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) { + switch (fragment_type_or_err) { + case kFullType: + case kRecyclableFullType: + if 
(in_fragmented_record_ && !fragments_.empty()) { + ReportCorruption(fragments_.size(), "partial record without end(1)"); + } + fragments_.clear(); + *record = fragment; + prospective_record_offset = physical_record_offset; + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + return true; + + case kFirstType: + case kRecyclableFirstType: + if (in_fragmented_record_ || !fragments_.empty()) { + ReportCorruption(fragments_.size(), "partial record without end(2)"); + } + prospective_record_offset = physical_record_offset; + fragments_.assign(fragment.data(), fragment.size()); + in_fragmented_record_ = true; + break; + + case kMiddleType: + case kRecyclableMiddleType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + case kRecyclableLastType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + scratch->assign(fragments_.data(), fragments_.size()); + fragments_.clear(); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + return true; + } + break; + + case kBadHeader: + case kBadRecord: + case kEof: + case kOldRecord: + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + case kBadRecordChecksum: + if (recycled_) { + fragments_.clear(); + return false; + } + ReportCorruption(drop_size, "checksum mismatch"); + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", + fragment_type_or_err); 
+ ReportCorruption( + fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0), + buf); + in_fragmented_record_ = false; + fragments_.clear(); + break; + } + } + } + return false; +} + +void FragmentBufferedReader::UnmarkEOF() { + if (read_error_) { + return; + } + eof_ = false; + UnmarkEOFInternal(); +} + +bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) { if (!eof_ && !read_error_) { // Last read was a full read, so this is a trailer to skip buffer_.clear(); @@ -522,11 +520,12 @@ bool Reader::TryReadMore(size_t* drop_size, int* error) { } else if (buffer_.size() < static_cast(kBlockSize)) { eof_ = true; eof_offset_ = buffer_.size(); - TEST_SYNC_POINT_CALLBACK("LogReader::TryReadMore:FirstEOF", nullptr); + TEST_SYNC_POINT_CALLBACK( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr); } return true; } else if (!read_error_) { - ForceUnmarkEOF(); + UnmarkEOF(); } if (!read_error_) { return true; @@ -541,8 +540,8 @@ bool Reader::TryReadMore(size_t* drop_size, int* error) { } // return true if the caller should process the fragment_type_or_err. -bool Reader::TryReadFragment(Slice* fragment, size_t* drop_size, - unsigned int* fragment_type_or_err) { +bool FragmentBufferedReader::TryReadFragment( + Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) { assert(fragment != nullptr); assert(drop_size != nullptr); assert(fragment_type_or_err != nullptr); diff --git a/db/log_reader.h b/db/log_reader.h index 83d05ddcb11..63777d6daa7 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -55,18 +55,16 @@ class Reader { std::unique_ptr&& file, Reporter* reporter, bool checksum, uint64_t log_num); - ~Reader(); + virtual ~Reader(); // Read the next record into *record. Returns true if read // successfully, false if we hit end of the input. May use // "*scratch" as temporary storage. The contents filled in *record // will only be valid until the next mutating operation on this // reader or the next mutation to *scratch. 
- bool ReadRecord(Slice* record, std::string* scratch, - WALRecoveryMode wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords); - - bool TryReadRecord(Slice* record, std::string* scratch); + virtual bool ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords); // Returns the physical offset of the last record returned by ReadRecord. // @@ -86,15 +84,13 @@ class Reader { // Also aligns the file position indicator to the start of the next block // by reading the rest of the data from the EOF position to the end of the // block that was partially read. - void UnmarkEOF(); - - void ForceUnmarkEOF(); + virtual void UnmarkEOF(); SequentialFileReader* file() { return file_.get(); } Reporter* GetReporter() const { return reporter_; } - private: + protected: std::shared_ptr info_log_; const std::unique_ptr file_; Reporter* const reporter_; @@ -121,9 +117,6 @@ class Reader { // Whether this is a recycled log file bool recycled_; - std::string fragments_; - bool in_fragmented_record_; - // Extend record types with the following special values enum { kEof = kMaxRecordType + 1, @@ -145,14 +138,9 @@ class Reader { // Return type, or one of the preceding special values unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size); - bool TryReadFragment(Slice* result, size_t* drop_size, - unsigned int* fragment_type_or_err); - // Read some more bool ReadMore(size_t* drop_size, int *error); - bool TryReadMore(size_t* drop_size, int* error); - void UnmarkEOFInternal(); // Reports dropped bytes to the reporter. 
@@ -160,10 +148,40 @@ class Reader { void ReportCorruption(size_t bytes, const char* reason); void ReportDrop(size_t bytes, const Status& reason); + private: // No copying allowed Reader(const Reader&); void operator=(const Reader&); }; +class FragmentBufferedReader : public Reader { + public: + FragmentBufferedReader(std::shared_ptr info_log, + // @lint-ignore TXT2 T25377293 Grandfathered in + std::unique_ptr&& _file, + Reporter* reporter, bool checksum, uint64_t log_num) + : Reader(info_log, std::move(_file), reporter, checksum, log_num), + fragments_(), + in_fragmented_record_(false) {} + ~FragmentBufferedReader() override {} + bool ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords) override; + void UnmarkEOF() override; + + private: + std::string fragments_; + bool in_fragmented_record_; + + bool TryReadFragment(Slice* result, size_t* drop_size, + unsigned int* fragment_type_or_err); + + bool TryReadMore(size_t* drop_size, int* error); + + // No copy allowed + FragmentBufferedReader(const FragmentBufferedReader&); + void operator=(const FragmentBufferedReader&); +}; + } // namespace log } // namespace rocksdb diff --git a/db/log_test.cc b/db/log_test.cc index c79ffd82c65..fd237b030e7 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -146,7 +146,7 @@ class LogTest : public ::testing::TestWithParam> { } void reset_source_contents() { - auto src = dynamic_cast(reader_.file()->file()); + auto src = dynamic_cast(reader_->file()->file()); assert(src); src->contents_ = dest_contents(); } @@ -156,7 +156,7 @@ class LogTest : public ::testing::TestWithParam> { std::unique_ptr source_holder_; ReportCollector report_; Writer writer_; - Reader reader_; + std::unique_ptr reader_; protected: bool allow_retry_read_; @@ -170,9 +170,16 @@ class LogTest : public ::testing::TestWithParam> { new StringSource(reader_contents_, !std::get<1>(GetParam())), "" /* file name */)), 
writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())), - reader_(nullptr, std::move(source_holder_), &report_, - true /* checksum */, 123 /* log_number */), - allow_retry_read_(std::get<1>(GetParam())) {} + allow_retry_read_(std::get<1>(GetParam())) { + if (allow_retry_read_) { + reader_.reset(new FragmentBufferedReader( + nullptr, std::move(source_holder_), &report_, true /* checksum */, + 123 /* log_number */)); + } else { + reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_, + true /* checksum */, 123 /* log_number */)); + } + } Slice* get_reader_contents() { return &reader_contents_; } @@ -189,11 +196,7 @@ class LogTest : public ::testing::TestWithParam> { std::string scratch; Slice record; bool ret = false; - if (allow_retry_read_) { - ret = reader_.TryReadRecord(&record, &scratch); - } else { - ret = reader_.ReadRecord(&record, &scratch, wal_recovery_mode); - } + ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode); if (ret) { return record.ToString(); } else { @@ -226,7 +229,7 @@ class LogTest : public ::testing::TestWithParam> { } void ForceError(size_t position = 0) { - auto src = dynamic_cast(reader_.file()->file()); + auto src = dynamic_cast(reader_->file()->file()); src->force_error_ = true; src->force_error_position_ = position; } @@ -240,20 +243,18 @@ class LogTest : public ::testing::TestWithParam> { } void ForceEOF(size_t position = 0) { - auto src = dynamic_cast(reader_.file()->file()); + auto src = dynamic_cast(reader_->file()->file()); src->force_eof_ = true; src->force_eof_position_ = position; } void UnmarkEOF() { - auto src = dynamic_cast(reader_.file()->file()); + auto src = dynamic_cast(reader_->file()->file()); src->returned_partial_ = false; - reader_.UnmarkEOF(); + reader_->UnmarkEOF(); } - bool IsEOF() { - return reader_.IsEOF(); - } + bool IsEOF() { return reader_->IsEOF(); } // Returns OK iff recorded error message contains "msg" std::string MatchError(const std::string& msg) const { @@ -722,7 
+723,7 @@ class RetriableLogTest : public ::testing::TestWithParam { std::unique_ptr writer_; std::unique_ptr reader_; ReportCollector report_; - std::unique_ptr log_reader_; + std::unique_ptr log_reader_; public: RetriableLogTest() @@ -761,8 +762,9 @@ class RetriableLogTest : public ::testing::TestWithParam { if (s.ok()) { reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_)); assert(reader_ != nullptr); - log_reader_.reset(new Reader(nullptr, std::move(reader_), &report_, - true /* checksum */, 123 /* log_number */)); + log_reader_.reset(new FragmentBufferedReader( + nullptr, std::move(reader_), &report_, true /* checksum */, + 123 /* log_number */)); assert(log_reader_ != nullptr); } return s; @@ -787,7 +789,7 @@ class RetriableLogTest : public ::testing::TestWithParam { result->clear(); std::string scratch; Slice record; - bool r = log_reader_->TryReadRecord(&record, &scratch); + bool r = log_reader_->ReadRecord(&record, &scratch); if (r) { result->assign(record.data(), record.size()); return true; @@ -806,11 +808,12 @@ TEST_P(RetriableLogTest, TailLog_PartialHeader) { SyncPoint::GetInstance()->LoadDependency( {{"RetriableLogTest::TailLog:AfterPart1", "RetriableLogTest::TailLog:BeforeReadRecord"}, - {"LogReader::TryReadMore:FirstEOF", + {"FragmentBufferedLogReader::TryReadMore:FirstEOF", "RetriableLogTest::TailLog:BeforePart2"}}); SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack("LogReader::TryReadMore:FirstEOF", - [&](void* /*arg*/) { eof = true; }); + SyncPoint::GetInstance()->SetCallBack( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); SyncPoint::GetInstance()->EnableProcessing(); size_t delta = header_size - 1; @@ -848,11 +851,12 @@ TEST_P(RetriableLogTest, TailLog_FullHeader) { SyncPoint::GetInstance()->LoadDependency( {{"RetriableLogTest::TailLog:AfterPart1", "RetriableLogTest::TailLog:BeforeReadRecord"}, - {"LogReader::TryReadMore:FirstEOF", + 
{"FragmentBufferedLogReader::TryReadMore:FirstEOF", "RetriableLogTest::TailLog:BeforePart2"}}); SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack("LogReader::TryReadMore:FirstEOF", - [&](void* /*arg*/) { eof = true; }); + SyncPoint::GetInstance()->SetCallBack( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); SyncPoint::GetInstance()->EnableProcessing(); size_t delta = header_size + 1; diff --git a/db/version_set.cc b/db/version_set.cc index a5ca356c3ac..dd322ed919b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3310,7 +3310,8 @@ Status VersionSet::LogAndApply( } Status VersionSet::ReadAndApply( - InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, std::unordered_set* cfds_changed) { assert(manifest_reader != nullptr); assert(cfds_changed != nullptr); @@ -3333,7 +3334,7 @@ Status VersionSet::ReadAndApply( std::string scratch; log::Reader* reader = manifest_reader->get(); std::string old_manifest_path = reader->file()->file_name(); - while (reader->TryReadRecord(&record, &scratch)) { + while (reader->ReadRecord(&record, &scratch)) { VersionEdit edit; s = edit.DecodeFrom(record); if (!s.ok()) { @@ -3693,7 +3694,7 @@ Status VersionSet::ApplyOneVersionEditToBuilder( Status VersionSet::MaybeSwitchManifest( log::Reader::Reporter* reporter, - std::unique_ptr* manifest_reader) { + std::unique_ptr* manifest_reader) { assert(manifest_reader != nullptr); Status s; do { @@ -3720,9 +3721,9 @@ Status VersionSet::MaybeSwitchManifest( manifest_file_reader.reset( new SequentialFileReader(std::move(manifest_file), manifest_path)); // TODO(yanqin) secondary instance needs a separate info log file. 
- manifest_reader->reset( - new log::Reader(nullptr, std::move(manifest_file_reader), reporter, - true /* checksum */, 0 /* log_number */)); + manifest_reader->reset(new log::FragmentBufferedReader( + nullptr, std::move(manifest_file_reader), reporter, + true /* checksum */, 0 /* log_number */)); ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", manifest_path.c_str()); } @@ -4008,7 +4009,7 @@ Status VersionSet::Recover( Status VersionSet::RecoverAsSecondary( const std::vector& column_families, - std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reader, std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status) { assert(manifest_reader != nullptr); @@ -4059,7 +4060,7 @@ Status VersionSet::RecoverAsSecondary( assert(reader != nullptr); Slice record; std::string scratch; - while (s.ok() && reader->TryReadRecord(&record, &scratch)) { + while (s.ok() && reader->ReadRecord(&record, &scratch)) { VersionEdit edit; s = edit.DecodeFrom(record); if (!s.ok()) { diff --git a/db/version_set.h b/db/version_set.h index c4c1b2445b1..a3ef19e1780 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -797,9 +797,10 @@ class VersionSet { bool new_descriptor_log = false, const ColumnFamilyOptions* new_cf_options = nullptr); - Status ReadAndApply(InstrumentedMutex* mu, - std::unique_ptr* manifest_reader, - std::unordered_set* cfds_changed); + Status ReadAndApply( + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + std::unordered_set* cfds_changed); Status GetCurrentManifestPath(std::string* manifest_filename); @@ -811,7 +812,7 @@ class VersionSet { Status RecoverAsSecondary( const std::vector& column_families, - std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reader, std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); @@ -1037,8 +1038,9 @@ class VersionSet { SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, uint32_t* max_column_family); - Status 
MaybeSwitchManifest(log::Reader::Reporter* reporter, - std::unique_ptr* manifest_reader); + Status MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader); // REQUIRES db mutex at beginning. may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, From ac2fb37c7259d2c4f016046b6d1cbd1f18cffbe1 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 1 Mar 2019 13:34:16 -0800 Subject: [PATCH 15/33] Remove unrelated changes to reduce number of files changed --- db/db_impl_open.cc | 2 +- db/db_impl_readonly.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 9c8937bf2f0..99c27f45d57 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -405,6 +405,7 @@ Status DBImpl::Recover( } if (s.ok()) { + SequenceNumber next_sequence(kMaxSequenceNumber); default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); @@ -467,7 +468,6 @@ Status DBImpl::Recover( if (!logs.empty()) { // Recover in the order in which the logs were generated std::sort(logs.begin(), logs.end()); - SequenceNumber next_sequence(kMaxSequenceNumber); s = RecoverLogFiles(logs, &next_sequence, read_only); if (!s.ok()) { // Clear memtables if recovery failed diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 46b2c61b5d5..5d7515c28e2 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -159,6 +159,7 @@ Status DB::OpenForReadOnly( *dbptr = nullptr; handles->clear(); + SuperVersionContext sv_context(/* create_superversion */ true); DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); impl->mutex_.Lock(); Status s = impl->Recover(column_families, true /* read only */, @@ -175,7 +176,6 @@ Status DB::OpenForReadOnly( handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); } } - SuperVersionContext 
sv_context(/* create_superversion */ true); if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { sv_context.NewSuperVersion(); From 00fa7853c1eef7aedf0178f202ee6e1ec10a6baf Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 1 Mar 2019 23:01:08 -0800 Subject: [PATCH 16/33] Format --- db/db_impl_secondary.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 60da065e124..96856006365 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -47,28 +47,33 @@ class DBImplSecondary : public DBImpl { const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } + using DBImpl::Merge; Status Merge(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } + using DBImpl::Delete; Status Delete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } + using DBImpl::SingleDelete; Status SingleDelete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } + Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/) override { return Status::NotSupported("Not supported operation in read only mode."); } + using DBImpl::CompactRange; Status CompactRange(const CompactRangeOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, @@ -94,6 +99,7 @@ class DBImplSecondary : public DBImpl { Status EnableFileDeletions(bool /*force*/) override { return Status::NotSupported("Not supported operation in read only mode."); } + Status GetLiveFiles(std::vector&, uint64_t* /*manifest_file_size*/, bool /*flush_memtable*/ = true) override { From 
2aadea1360c787f78c186dad24309f707c89b4fa Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sat, 2 Mar 2019 11:48:06 -0800 Subject: [PATCH 17/33] Rearrange member variables --- db/db_impl.cc | 12 +++---- db/db_impl.h | 69 ++++++++++++++++++----------------------- db/db_impl_secondary.cc | 6 ++-- db/db_impl_secondary.h | 6 +++- 4 files changed, 46 insertions(+), 47 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 558b76082a9..e24672c86fe 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -148,18 +148,21 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, immutable_db_options_(initial_db_options_), mutable_db_options_(initial_db_options_), stats_(immutable_db_options_.statistics.get()), - db_lock_(nullptr), mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, immutable_db_options_.use_adaptive_mutex), + default_cf_handle_(nullptr), + max_total_in_memory_state_(0), + env_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), + env_options_for_compaction_(env_->OptimizeForCompactionTableWrite( + env_options_, immutable_db_options_)), + db_lock_(nullptr), shutting_down_(false), bg_cv_(&mutex_), logfile_number_(0), log_dir_synced_(false), log_empty_(true), - default_cf_handle_(nullptr), log_sync_cv_(&mutex_), total_log_size_(0), - max_total_in_memory_state_(0), is_snapshot_supported_(true), write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()), write_thread_(immutable_db_options_), @@ -186,9 +189,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, next_job_id_(1), has_unpersisted_data_(false), unable_to_release_oldest_log_(false), - env_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), - env_options_for_compaction_(env_->OptimizeForCompactionTableWrite( - env_options_, immutable_db_options_)), num_running_ingest_file_(0), #ifndef ROCKSDB_LITE wal_manager_(immutable_db_options_, env_options_, seq_per_batch), diff --git a/db/db_impl.h b/db/db_impl.h index cfe053ad318..bedb8f54290 
100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -758,6 +758,29 @@ class DBImpl : public DB { std::unique_ptr tracer_; InstrumentedMutex trace_mutex_; + // State below is protected by mutex_ + // With two_write_queues enabled, some of the variables that accessed during + // WriteToWAL need different synchronization: log_empty_, alive_log_files_, + // logs_, logfile_number_. Refer to the definition of each variable below for + // more description. + mutable InstrumentedMutex mutex_; + + ColumnFamilyHandleImpl* default_cf_handle_; + InternalStats* default_cf_internal_stats_; + + // only used for dynamically adjusting max_total_wal_size. it is a sum of + // [write_buffer_size * max_write_buffer_number] over all column families + uint64_t max_total_in_memory_state_; + // If true, we have only one (default) column family. We use this to optimize + // some code-paths + bool single_column_family_mode_; + + // The options to access storage files + const EnvOptions env_options_; + + // Additonal options for compaction and flush + EnvOptions env_options_for_compaction_; + // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. // If need_mutex_lock = false, the method will lock DB mutex. @@ -845,6 +868,14 @@ class DBImpl : public DB { // Actual implementation of Close() Status CloseImpl(); + // Recover the descriptor from persistent storage. May do a significant + // amount of work to recover recently logged updates. Any changes to + // be made to the descriptor are added to *edit. + virtual Status Recover( + const std::vector& column_families, + bool read_only = false, bool error_if_log_file_exist = false, + bool error_if_data_exists_in_logs = false); + private: friend class DB; friend class ErrorHandler; @@ -892,13 +923,6 @@ class DBImpl : public DB { struct PrepickedCompaction; struct PurgeFileInfo; - // Recover the descriptor from persistent storage. May do a significant - // amount of work to recover recently logged updates. 
Any changes to - // be made to the descriptor are added to *edit. - Status Recover(const std::vector& column_families, - bool read_only = false, bool error_if_log_file_exist = false, - bool error_if_data_exists_in_logs = false); - Status ResumeImpl(); void MaybeIgnoreError(Status* s) const; @@ -1216,15 +1240,6 @@ class DBImpl : public DB { // details. InstrumentedMutex log_write_mutex_; - protected: - // State below is protected by mutex_ - // With two_write_queues enabled, some of the variables that accessed during - // WriteToWAL need different synchronization: log_empty_, alive_log_files_, - // logs_, logfile_number_. Refer to the definition of each variable below for - // more description. - mutable InstrumentedMutex mutex_; - - private: std::atomic shutting_down_; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 @@ -1256,11 +1271,6 @@ class DBImpl : public DB { // expesnive mutex_ lock during WAL write, which update log_empty_. bool log_empty_; - protected: - ColumnFamilyHandleImpl* default_cf_handle_; - InternalStats* default_cf_internal_stats_; - - private: std::unique_ptr column_family_memtables_; struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) @@ -1328,15 +1338,6 @@ class DBImpl : public DB { std::atomic cached_recoverable_state_empty_ = {true}; std::atomic total_log_size_; - protected: - // only used for dynamically adjusting max_total_wal_size. it is a sum of - // [write_buffer_size * max_write_buffer_number] over all column families - uint64_t max_total_in_memory_state_; - // If true, we have only one (default) column family. We use this to optimize - // some code-paths - bool single_column_family_mode_; - - private: // If this is non-empty, we need to delete these log files in background // threads. Protected by db mutex. 
autovector logs_to_free_; @@ -1555,14 +1556,6 @@ class DBImpl : public DB { std::string db_absolute_path_; - protected: - // The options to access storage files - const EnvOptions env_options_; - - // Additonal options for compaction and flush - EnvOptions env_options_for_compaction_; - - private: // Number of running IngestExternalFile() calls. // REQUIRES: mutex held int num_running_ingest_file_; diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 8cbd3788f64..f2d9b0dfb6e 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -27,7 +27,9 @@ DBImplSecondary::DBImplSecondary(const DBOptions& db_options, DBImplSecondary::~DBImplSecondary() {} Status DBImplSecondary::Recover( - const std::vector& column_families) { + const std::vector& column_families, + bool /*readonly*/, bool /*error_if_log_file_exist*/, + bool /*error_if_data_exists_in_logs*/) { mutex_.AssertHeld(); Status s; @@ -292,7 +294,7 @@ Status DB::OpenAsSecondary( handles->clear(); DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); impl->mutex_.Lock(); - Status s = impl->Recover(column_families); + Status s = impl->Recover(column_families, true, false, false); if (s.ok()) { for (auto cf : column_families) { auto cfd = diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 96856006365..49510d126b0 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -18,7 +18,9 @@ class DBImplSecondary : public DBImpl { DBImplSecondary(const DBOptions& options, const std::string& dbname); ~DBImplSecondary(); - Status Recover(const std::vector& column_families); + Status Recover(const std::vector& column_families, + bool read_only, bool error_if_log_file_exist, + bool error_if_data_exists_in_logs) override; // Implementations of the DB interface using DB::Get; @@ -136,6 +138,8 @@ class DBImplSecondary : public DBImpl { DBImplSecondary(const DBImplSecondary&); void operator=(const DBImplSecondary&); + using DBImpl::Recover; + std::unique_ptr 
manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; From f43d73b28c51044cc972a3a572fe64e1cc998d76 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sat, 2 Mar 2019 12:23:57 -0800 Subject: [PATCH 18/33] Introduce state/mode to prevent unexpected function calls --- db/version_set.cc | 10 +++++++++- db/version_set.h | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/db/version_set.cc b/db/version_set.cc index dd322ed919b..c7e21a6de23 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2785,7 +2785,9 @@ VersionSet::VersionSet(const std::string& dbname, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(storage_options) {} + env_options_(storage_options) { + state_ = State::INITIALIZED; +} void CloseTables(void* ptr, size_t) { TableReader* table_reader = reinterpret_cast(ptr); @@ -3235,6 +3237,7 @@ Status VersionSet::LogAndApply( const autovector>& edit_lists, InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options) { + assert(State::PRIMARY == state_); mu->AssertHeld(); int num_edits = 0; for (const auto& elist : edit_lists) { @@ -3313,6 +3316,7 @@ Status VersionSet::ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, std::unordered_set* cfds_changed) { + assert(State::SECONDARY == state_); assert(manifest_reader != nullptr); assert(cfds_changed != nullptr); mu->AssertHeld(); @@ -3759,6 +3763,8 @@ Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { Status VersionSet::Recover( const std::vector& column_families, bool read_only) { + assert(State::INITIALIZED == state_); + state_ = State::PRIMARY; std::unordered_map cf_name_to_options; for (auto cf : column_families) { cf_name_to_options.insert({cf.name, cf.options}); @@ -4012,6 +4018,8 @@ Status VersionSet::RecoverAsSecondary( std::unique_ptr* manifest_reader, std::unique_ptr* manifest_reporter, std::unique_ptr* 
manifest_reader_status) { + assert(State::INITIALIZED == state_); + state_ = State::SECONDARY; assert(manifest_reader != nullptr); assert(manifest_reporter != nullptr); assert(manifest_reader_status != nullptr); diff --git a/db/version_set.h b/db/version_set.h index a3ef19e1780..6501d445878 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1096,6 +1096,14 @@ class VersionSet { // env options for all reads and writes except compactions EnvOptions env_options_; + enum class State { + INITIALIZED, + PRIMARY, + SECONDARY, + }; + + State state_; + std::unordered_map> active_version_builders_; From 8d237a4a24ba1b01104a87c8070e0cd290a20b83 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sat, 2 Mar 2019 21:35:14 -0800 Subject: [PATCH 19/33] Refactor, format and print --- db/db_impl_secondary.cc | 3 -- db/version_set.cc | 61 ++++++++++++++++++++--------------------- db/version_set.h | 3 +- 3 files changed, 32 insertions(+), 35 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index f2d9b0dfb6e..dbb594ff803 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -4,11 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "db/db_impl_secondary.h" -#include "db/db_impl.h" #include "db/db_iter.h" -#include "db/forward_iterator.h" #include "db/merge_context.h" -#include "db/range_del_aggregator.h" #include "monitoring/perf_context_imp.h" #include "util/auto_roll_logger.h" diff --git a/db/version_set.cc b/db/version_set.cc index c7e21a6de23..1bfd1e63ab5 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -727,13 +727,12 @@ class BaseReferencedVersionBuilder { version_->Ref(); } ~BaseReferencedVersionBuilder() { - delete version_builder_; version_->Unref(); } - VersionBuilder* version_builder() { return version_builder_; } + VersionBuilder* version_builder() { return version_builder_.get(); } private: - VersionBuilder* version_builder_; + std::unique_ptr version_builder_; Version* version_; }; @@ -3491,7 +3490,8 @@ Status VersionSet::ApplyOneVersionEditToBuilder( VersionEdit& edit, const std::unordered_map& name_to_options, std::unordered_map& column_families_not_found, - std::unordered_map& builders, + std::unordered_map>& + builders, bool* have_log_number, uint64_t* /* log_number */, bool* have_prev_log_number, uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, @@ -3526,14 +3526,14 @@ Status VersionSet::ApplyOneVersionEditToBuilder( } else { cfd = CreateColumnFamily(cf_options->second, &edit); cfd->set_initialized(); - builders.insert( - {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); + builders.insert(std::make_pair( + edit.column_family_, std::unique_ptr( + new BaseReferencedVersionBuilder(cfd)))); } } else if (edit.is_column_family_drop_) { if (cf_in_builders) { auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - delete builder->second; builders.erase(builder); cfd = column_family_set_->GetColumnFamily(edit.column_family_); assert(cfd != nullptr); @@ -3656,7 +3656,10 @@ Status VersionSet::ApplyOneVersionEditToBuilder( if (cfd != nullptr) { if (edit.has_log_number_) { if 
(cfd->GetLogNumber() > edit.log_number_) { - // TODO (yanqin) use a separate info log for secondary instance. + ROCKS_LOG_WARN( + db_options_->info_log, + "MANIFEST corruption detected, but ignored - log numbers in " + "records NOT monotonically increasing"); } else { cfd->SetLogNumber(edit.log_number_); *have_log_number = true; @@ -3724,7 +3727,6 @@ Status VersionSet::MaybeSwitchManifest( if (s.ok()) { manifest_file_reader.reset( new SequentialFileReader(std::move(manifest_file), manifest_path)); - // TODO(yanqin) secondary instance needs a separate info log file. manifest_reader->reset(new log::FragmentBufferedReader( nullptr, std::move(manifest_file_reader), reporter, true /* checksum */, 0 /* log_number */)); @@ -3811,7 +3813,8 @@ Status VersionSet::Recover( uint64_t previous_log_number = 0; uint32_t max_column_family = 0; uint64_t min_log_number_to_keep = 0; - std::unordered_map builders; + std::unordered_map> + builders; // add default column family auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); @@ -3826,7 +3829,9 @@ Status VersionSet::Recover( // In recovery, nobody else can access it, so it's fine to set it to be // initialized earlier. default_cfd->set_initialized(); - builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); { VersionSet::LogReporter reporter; @@ -3955,7 +3960,7 @@ Status VersionSet::Recover( assert(cfd->initialized()); auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto* builder = builders_iter->second->version_builder(); + auto builder = builders_iter->second->version_builder(); // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. 
@@ -4006,10 +4011,6 @@ Status VersionSet::Recover( } } - for (auto& builder : builders) { - delete builder.second; - } - return s; } @@ -4053,9 +4054,12 @@ Status VersionSet::RecoverAsSecondary( uint64_t previous_log_number = 0; uint32_t max_column_family = 0; uint64_t min_log_number_to_keep = 0; - std::unordered_map builders; + std::unordered_map> + builders; std::unordered_map column_families_not_found; - builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); manifest_reader_status->reset(new Status()); manifest_reporter->reset(new LogReporter()); @@ -4182,9 +4186,6 @@ Status VersionSet::RecoverAsSecondary( cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); } } - for (auto& builder : builders) { - delete builder.second; - } return s; } @@ -4368,7 +4369,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, uint64_t previous_log_number = 0; int count = 0; std::unordered_map comparators; - std::unordered_map builders; + std::unordered_map> + builders; // add default column family VersionEdit default_cf_edit; @@ -4376,7 +4378,9 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, default_cf_edit.SetColumnFamily(0); ColumnFamilyData* default_cfd = CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); - builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); { VersionSet::LogReporter reporter; @@ -4417,8 +4421,9 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); cfd->set_initialized(); - builders.insert( - {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); + builders.insert(std::make_pair( + edit.column_family_, std::unique_ptr( + new 
BaseReferencedVersionBuilder(cfd)))); } else if (edit.is_column_family_drop_) { if (!cf_in_builders) { s = Status::Corruption( @@ -4426,7 +4431,6 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, break; } auto builder_iter = builders.find(edit.column_family_); - delete builder_iter->second; builders.erase(builder_iter); comparators.erase(edit.column_family_); cfd = column_family_set_->GetColumnFamily(edit.column_family_); @@ -4526,11 +4530,6 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, delete v; } - // Free builders - for (auto& builder : builders) { - delete builder.second; - } - next_file_number_.store(next_file + 1); last_allocated_sequence_ = last_sequence; last_published_sequence_ = last_sequence; diff --git a/db/version_set.h b/db/version_set.h index 6501d445878..5d29c8d48a7 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1024,7 +1024,8 @@ class VersionSet { VersionEdit& edit, const std::unordered_map& name_to_opts, std::unordered_map& column_families_not_found, - std::unordered_map& builders, + std::unordered_map< + uint32_t, std::unique_ptr>& builders, bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, SequenceNumber* last_sequence, From ff2a70d14b8d0cb00b976b99d9b52d9ceb64766c Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sun, 3 Mar 2019 17:31:01 -0800 Subject: [PATCH 20/33] Refactor VersionSet code for secondary instance --- db/db_impl_secondary.cc | 15 +- db/log_reader.h | 2 +- db/version_edit.h | 1 + db/version_set.cc | 850 +++++++++++++++++++--------------------- db/version_set.h | 116 ++++-- 5 files changed, 499 insertions(+), 485 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index dbb594ff803..31f112e4341 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -30,9 +30,9 @@ Status DBImplSecondary::Recover( 
mutex_.AssertHeld(); Status s; - s = versions_->RecoverAsSecondary(column_families, &manifest_reader_, - &manifest_reporter_, - &manifest_reader_status_); + s = static_cast(versions_.get()) + ->Recover(column_families, &manifest_reader_, &manifest_reporter_, + &manifest_reader_status_); if (!s.ok()) { return s; } @@ -210,7 +210,8 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { Status s; std::unordered_set cfds_changed; InstrumentedMutexLock lock_guard(mutex()); - s = versions_->ReadAndApply(mutex(), &manifest_reader_, &cfds_changed); + s = static_cast(versions_.get()) + ->ReadAndApply(mutex(), &manifest_reader_, &cfds_changed); if (s.ok()) { SuperVersionContext sv_context(true /* create_superversion */); for (auto cfd : cfds_changed) { @@ -290,6 +291,12 @@ Status DB::OpenAsSecondary( handles->clear(); DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); + impl->versions_.reset(new ReactiveVersionSet( + dbname, &impl->immutable_db_options_, impl->env_options_, + impl->table_cache_.get(), impl->write_buffer_manager_, + &impl->write_controller_)); + impl->column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); impl->mutex_.Lock(); Status s = impl->Recover(column_families, true, false, false); if (s.ok()) { diff --git a/db/log_reader.h b/db/log_reader.h index 63777d6daa7..130c1d597ec 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -163,7 +163,7 @@ class FragmentBufferedReader : public Reader { : Reader(info_log, std::move(_file), reporter, checksum, log_num), fragments_(), in_fragmented_record_(false) {} - ~FragmentBufferedReader() override {} + ~FragmentBufferedReader() {} bool ReadRecord(Slice* record, std::string* scratch, WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords) override; diff --git a/db/version_edit.h b/db/version_edit.h index 229531792ba..ee6499cdc3b 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -313,6 +313,7 @@ class VersionEdit { 
std::string DebugJSON(int edit_num, bool hex_key = false) const; private: + friend class ReactiveVersionSet; friend class VersionSet; friend class Version; diff --git a/db/version_set.cc b/db/version_set.cc index 1bfd1e63ab5..99beb453165 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2784,9 +2784,7 @@ VersionSet::VersionSet(const std::string& dbname, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(storage_options) { - state_ = State::INITIALIZED; -} + env_options_(storage_options) {} void CloseTables(void* ptr, size_t) { TableReader* table_reader = reinterpret_cast(ptr); @@ -3236,7 +3234,6 @@ Status VersionSet::LogAndApply( const autovector>& edit_lists, InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options) { - assert(State::PRIMARY == state_); mu->AssertHeld(); int num_edits = 0; for (const auto& elist : edit_lists) { @@ -3311,135 +3308,6 @@ Status VersionSet::LogAndApply( new_cf_options); } -Status VersionSet::ReadAndApply( - InstrumentedMutex* mu, - std::unique_ptr* manifest_reader, - std::unordered_set* cfds_changed) { - assert(State::SECONDARY == state_); - assert(manifest_reader != nullptr); - assert(cfds_changed != nullptr); - mu->AssertHeld(); - - Status s; - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t previous_log_number = 0; - uint32_t max_column_family = 0; - uint64_t min_log_number_to_keep = 0; - - while (s.ok()) { - Slice record; - std::string scratch; - log::Reader* reader = manifest_reader->get(); - std::string old_manifest_path = reader->file()->file_name(); - while (reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - ColumnFamilyData* cfd = - 
column_family_set_->GetColumnFamily(edit.column_family_); - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - s = ApplyOneVersionEditToBuilder( - edit, &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - if (!s.ok()) { - break; - } - if (column_family_set_->get_table_cache()->GetCapacity() == - TableCache::kInfiniteCapacity) { - // Unlimited table cache. Pre-load table handle now so that the table - // files are still accessible to us after the primary unlinks them. - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - if (!s.ok() && !s.IsPathNotFound()) { - break; - } else if (s.IsPathNotFound()) { - s = Status::OK(); - // TODO (yanqin) release file descriptors already opened, or modify - // LoadTableHandlers so that opened files are not re-opened. 
- } else { // s.ok() == true - auto version = new Version(cfd, this, env_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(version->storage_info()); - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); - } - } - } - if (have_next_file) { - next_file_number_.store(next_file + 1); - } - if (have_last_sequence) { - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - } - if (have_prev_log_number) { - prev_log_number_ = previous_log_number; - MarkFileNumberUsed(previous_log_number); - } - if (have_log_number) { - MarkFileNumberUsed(log_number); - } - column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); - } - // It's possible that: - // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. - // 2) we have finished reading the current MANIFEST. - // 3) we have encountered an IOError reading the current MANIFEST. - // We need to look for the next MANIFEST and start from there. If we cannot - // find the next MANIFEST, we should exit the loop. 
- s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); - reader = manifest_reader->get(); - if (s.ok() && reader->file()->file_name() == old_manifest_path) { - break; - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - auto builder_iter = active_version_builders_.find(cfd->GetID()); - if (builder_iter == active_version_builders_.end()) { - continue; - } - auto builder = builder_iter->second->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } - } - return s; -} - void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { assert(edit->IsColumnFamilyManipulation()); edit->SetNextFile(next_file_number_.load()); @@ -3492,11 +3360,10 @@ Status VersionSet::ApplyOneVersionEditToBuilder( std::unordered_map& column_families_not_found, std::unordered_map>& builders, - bool* have_log_number, uint64_t* /* log_number */, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family) { + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { // Not found means that user didn't supply that column // family option AND we encountered column family add // record. 
Once we encounter column family drop record, @@ -3567,7 +3434,18 @@ Status VersionSet::ApplyOneVersionEditToBuilder( assert(builder != builders.end()); builder->second->version_builder()->Apply(&edit); } + return ExtractInfoFromVersionEdit( + cfd, edit, have_log_number, log_number, have_prev_log_number, + previous_log_number, have_next_file, next_file, have_last_sequence, + last_sequence, min_log_number_to_keep, max_column_family); +} +Status VersionSet::ExtractInfoFromVersionEdit( + ColumnFamilyData* cfd, const VersionEdit& edit, bool* have_log_number, + uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { if (cfd != nullptr) { if (edit.has_log_number_) { if (cfd->GetLogNumber() > edit.log_number_) { @@ -3578,6 +3456,7 @@ Status VersionSet::ApplyOneVersionEditToBuilder( } else { cfd->SetLogNumber(edit.log_number_); *have_log_number = true; + *log_number = edit.log_number_; } } if (edit.has_comparator_ && @@ -3614,129 +3493,6 @@ Status VersionSet::ApplyOneVersionEditToBuilder( return Status::OK(); } -Status VersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* /* log_number */, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family) { - ColumnFamilyData* cfd = nullptr; - Status status; - if (edit.is_column_family_add_) { - // TODO (yanqin) for now the secondary ignores column families created - // after Open. This also simplifies handling of switching to a new MANIFEST - // and processing the snapshot of the system at the beginning of the - // MANIFEST. 
- return Status::OK(); - } else if (edit.is_column_family_drop_) { - // Drop the column family by setting it to be 'dropped' without destroying - // the column family handle. - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Drop a CF created after Open? Then ignore - if (cfd == nullptr) { - return Status::OK(); - } - cfd->SetDropped(); - if (cfd->Unref()) { - delete cfd; - cfd = nullptr; - } - } else { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Operation on a CF created after Open? Then ignore - if (cfd == nullptr) { - return Status::OK(); - } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - builder->Apply(&edit); - } - if (cfd != nullptr) { - if (edit.has_log_number_) { - if (cfd->GetLogNumber() > edit.log_number_) { - ROCKS_LOG_WARN( - db_options_->info_log, - "MANIFEST corruption detected, but ignored - log numbers in " - "records NOT monotonically increasing"); - } else { - cfd->SetLogNumber(edit.log_number_); - *have_log_number = true; - } - } - if (edit.has_comparator_ && - edit.comparator_ != cfd->user_comparator()->Name()) { - return Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); - } - } - - if (edit.has_prev_log_number_) { - *previous_log_number = edit.prev_log_number_; - *have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - *next_file = edit.next_file_number_; - *have_next_file = true; - } - - if (edit.has_max_column_family_) { - *max_column_family = edit.max_column_family_; - } - - if (edit.has_min_log_number_to_keep_) { - *min_log_number_to_keep = - std::max(*min_log_number_to_keep, edit.min_log_number_to_keep_); - } - - if (edit.has_last_sequence_) { - *last_sequence = edit.last_sequence_; - *have_last_sequence = true; - } - return status; -} 
- -Status VersionSet::MaybeSwitchManifest( - log::Reader::Reporter* reporter, - std::unique_ptr* manifest_reader) { - assert(manifest_reader != nullptr); - Status s; - do { - std::string manifest_path; - s = GetCurrentManifestPath(&manifest_path); - std::unique_ptr manifest_file; - if (s.ok()) { - if (nullptr == manifest_reader->get() || - manifest_reader->get()->file()->file_name() != manifest_path) { - TEST_SYNC_POINT( - "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0"); - TEST_SYNC_POINT( - "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:1"); - s = env_->NewSequentialFile( - manifest_path, &manifest_file, - env_->OptimizeForManifestRead(env_options_)); - } else { - // No need to switch manifest. - break; - } - } - std::unique_ptr manifest_file_reader; - if (s.ok()) { - manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path)); - manifest_reader->reset(new log::FragmentBufferedReader( - nullptr, std::move(manifest_file_reader), reporter, - true /* checksum */, 0 /* log_number */)); - ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", - manifest_path.c_str()); - } - } while (s.IsPathNotFound()); - return s; -} - Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { assert(manifest_path != nullptr); std::string fname; @@ -3765,8 +3521,6 @@ Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { Status VersionSet::Recover( const std::vector& column_families, bool read_only) { - assert(State::INITIALIZED == state_); - state_ = State::PRIMARY; std::unordered_map cf_name_to_options; for (auto cf : column_families) { cf_name_to_options.insert({cf.name, cf.options}); @@ -4014,181 +3768,6 @@ Status VersionSet::Recover( return s; } -Status VersionSet::RecoverAsSecondary( - const std::vector& column_families, - std::unique_ptr* manifest_reader, - std::unique_ptr* manifest_reporter, - std::unique_ptr* manifest_reader_status) { - assert(State::INITIALIZED 
== state_); - state_ = State::SECONDARY; - assert(manifest_reader != nullptr); - assert(manifest_reporter != nullptr); - assert(manifest_reader_status != nullptr); - - std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.insert({cf.name, cf.options}); - } - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. - default_cfd->set_initialized(); - - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t previous_log_number = 0; - uint32_t max_column_family = 0; - uint64_t min_log_number_to_keep = 0; - std::unordered_map> - builders; - std::unordered_map column_families_not_found; - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - - manifest_reader_status->reset(new Status()); - manifest_reporter->reset(new LogReporter()); - static_cast(manifest_reporter->get())->status = - manifest_reader_status->get(); - Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); - log::Reader* reader = manifest_reader->get(); - - while (s.ok()) { - assert(reader != nullptr); - Slice record; - std::string scratch; - while (s.ok() && reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - s = ApplyOneVersionEditToBuilder( - 
edit, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } - if (s.ok()) { - bool enough = have_next_file && have_log_number && have_last_sequence; - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - if (cfd == nullptr) { - enough = false; - break; - } - } - } - if (enough && column_family_set_->get_table_cache()->GetCapacity() == - TableCache::kInfiniteCapacity) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - assert(cfd != nullptr); - if (!cfd->IsDropped()) { - auto builder_iter = builders.find(cfd->GetID()); - assert(builder_iter != builders.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - if (!s.ok()) { - enough = false; - if (s.IsPathNotFound()) { - s = Status::OK(); - } - break; - } - } - } - if (!enough) { - // TODO (yanqin) release table handlers if any of the files are not - // found. 
- } - } - if (enough) { - break; - } - } - } - - if (s.ok()) { - if (!have_prev_log_number) { - previous_log_number = 0; - } - column_family_set_->UpdateMaxColumnFamily(max_column_family); - - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); - MarkFileNumberUsed(previous_log_number); - MarkFileNumberUsed(log_number); - - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto* builder = builders_iter->second->version_builder(); - - Version* v = new Version(cfd, this, env_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(v->storage_info()); - - // Install recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - ROCKS_LOG_INFO(db_options_->info_log, - "Column family [%s] (ID %u), log number is %" PRIu64 "\n", - cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); - } - } - return s; -} - Status VersionSet::ListColumnFamilies(std::vector* column_families, const std::string& dbname, Env* env) { // these are just for performance reasons, not correcntes, @@ -5021,4 +4600,399 @@ uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) { return total_files_size; } 
+ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, + const ImmutableDBOptions* _db_options, + const EnvOptions& _env_options, + Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller) + : VersionSet(dbname, _db_options, _env_options, table_cache, + write_buffer_manager, write_controller) {} + +ReactiveVersionSet::~ReactiveVersionSet() {} + +Status ReactiveVersionSet::Recover( + const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status) { + assert(manifest_reader != nullptr); + assert(manifest_reporter != nullptr); + assert(manifest_reader_status != nullptr); + + std::unordered_map cf_name_to_options; + for (const auto& cf : column_families) { + cf_name_to_options.insert({cf.name, cf.options}); + } + + // add default column family + auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); + if (default_cf_iter == cf_name_to_options.end()) { + return Status::InvalidArgument("Default column family not specified"); + } + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(default_cf_iter->second, &default_cf_edit); + // In recovery, nobody else can access it, so it's fine to set it to be + // initialized earlier. 
+ default_cfd->set_initialized(); + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t previous_log_number = 0; + uint32_t max_column_family = 0; + uint64_t min_log_number_to_keep = 0; + std::unordered_map> + builders; + std::unordered_map column_families_not_found; + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); + + manifest_reader_status->reset(new Status()); + manifest_reporter->reset(new LogReporter()); + static_cast(manifest_reporter->get())->status = + manifest_reader_status->get(); + Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); + log::Reader* reader = manifest_reader->get(); + + while (s.ok()) { + assert(reader != nullptr); + Slice record; + std::string scratch; + while (s.ok() && reader->ReadRecord(&record, &scratch)) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + s = ApplyOneVersionEditToBuilder( + edit, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + } + if (s.ok()) { + bool enough = have_next_file && have_log_number && have_last_sequence; + if (enough) { + for (const auto& cf : column_families) { + auto cfd = column_family_set_->GetColumnFamily(cf.name); + if (cfd == nullptr) { + enough = false; + break; + } + } + } + if (enough && column_family_set_->get_table_cache()->GetCapacity() == + TableCache::kInfiniteCapacity) { + for (const auto& cf : column_families) { + auto cfd = column_family_set_->GetColumnFamily(cf.name); + assert(cfd != nullptr); + if (!cfd->IsDropped()) { + auto builder_iter = builders.find(cfd->GetID()); + assert(builder_iter != 
builders.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + if (!s.ok()) { + enough = false; + if (s.IsPathNotFound()) { + s = Status::OK(); + } + break; + } + } + } + if (!enough) { + // TODO (yanqin) release table handlers if any of the files are not + // found. + } + } + if (enough) { + break; + } + } + } + + if (s.ok()) { + if (!have_prev_log_number) { + previous_log_number = 0; + } + column_family_set_->UpdateMaxColumnFamily(max_column_family); + + MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + MarkFileNumberUsed(previous_log_number); + MarkFileNumberUsed(log_number); + + for (auto cfd : *column_family_set_) { + assert(builders.count(cfd->GetID()) > 0); + auto builder = builders[cfd->GetID()]->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + auto builders_iter = builders.find(cfd->GetID()); + assert(builders_iter != builders.end()); + auto* builder = builders_iter->second->version_builder(); + + Version* v = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(v->storage_info()); + + // Install recovered version + v->PrepareApply(*cfd->GetLatestMutableCFOptions(), + !(db_options_->skip_stats_update_on_db_open)); + AppendVersion(cfd, v); + } + next_file_number_.store(next_file + 1); + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; + last_sequence_ = last_sequence; + prev_log_number_ = previous_log_number; + for (auto 
cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + ROCKS_LOG_INFO(db_options_->info_log, + "Column family [%s] (ID %u), log number is %" PRIu64 "\n", + cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); + } + } + return s; +} + +Status ReactiveVersionSet::ReadAndApply( + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + std::unordered_set* cfds_changed) { + assert(manifest_reader != nullptr); + assert(cfds_changed != nullptr); + mu->AssertHeld(); + + Status s; + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t previous_log_number = 0; + uint32_t max_column_family = 0; + uint64_t min_log_number_to_keep = 0; + + while (s.ok()) { + Slice record; + std::string scratch; + log::Reader* reader = manifest_reader->get(); + std::string old_manifest_path = reader->file()->file_name(); + while (reader->ReadRecord(&record, &scratch)) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + if (active_version_builders_.find(edit.column_family_) == + active_version_builders_.end()) { + std::unique_ptr builder_guard( + new BaseReferencedVersionBuilder(cfd)); + active_version_builders_.insert( + std::make_pair(edit.column_family_, std::move(builder_guard))); + } + s = ApplyOneVersionEditToBuilder( + edit, &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + if (!s.ok()) { + break; + } + if (column_family_set_->get_table_cache()->GetCapacity() == + TableCache::kInfiniteCapacity) { + // Unlimited table cache. 
Pre-load table handle now so that the table + // files are still accessible to us after the primary unlinks them. + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + if (!s.ok() && !s.IsPathNotFound()) { + break; + } else if (s.IsPathNotFound()) { + s = Status::OK(); + // TODO (yanqin) release file descriptors already opened, or modify + // LoadTableHandlers so that opened files are not re-opened. + } else { // s.ok() == true + auto version = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(version->storage_info()); + version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); + AppendVersion(cfd, version); + active_version_builders_.erase(builder_iter); + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + } + } + if (have_next_file) { + next_file_number_.store(next_file + 1); + } + if (have_last_sequence) { + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; + last_sequence_ = last_sequence; + } + if (have_prev_log_number) { + prev_log_number_ = previous_log_number; + MarkFileNumberUsed(previous_log_number); + } + if (have_log_number) { + MarkFileNumberUsed(log_number); + } + column_family_set_->UpdateMaxColumnFamily(max_column_family); + MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + } + // It's possible that: + // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. + // 2) we have finished reading the current MANIFEST. + // 3) we have encountered an IOError reading the current MANIFEST. 
+ // We need to look for the next MANIFEST and start from there. If we cannot + // find the next MANIFEST, we should exit the loop. + s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); + reader = manifest_reader->get(); + if (s.ok() && reader->file()->file_name() == old_manifest_path) { + break; + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + auto builder_iter = active_version_builders_.find(cfd->GetID()); + if (builder_iter == active_version_builders_.end()) { + continue; + } + auto builder = builder_iter->second->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + return s; +} + +Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( + VersionEdit& edit, bool* have_log_number, uint64_t* log_number, + bool* have_prev_log_number, uint64_t* previous_log_number, + bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, + SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, + uint32_t* max_column_family) { + ColumnFamilyData* cfd = nullptr; + Status status; + if (edit.is_column_family_add_) { + // TODO (yanqin) for now the secondary ignores column families created + // after Open. This also simplifies handling of switching to a new MANIFEST + // and processing the snapshot of the system at the beginning of the + // MANIFEST. + return Status::OK(); + } else if (edit.is_column_family_drop_) { + // Drop the column family by setting it to be 'dropped' without destroying + // the column family handle. + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // Drop a CF created after Open? Then ignore + if (cfd == nullptr) { + return Status::OK(); + } + cfd->SetDropped(); + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } + } else { + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // Operation on a CF created after Open? 
Then ignore + if (cfd == nullptr) { + return Status::OK(); + } + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + builder->Apply(&edit); + } + return ExtractInfoFromVersionEdit( + cfd, edit, have_log_number, log_number, have_prev_log_number, + previous_log_number, have_next_file, next_file, have_last_sequence, + last_sequence, min_log_number_to_keep, max_column_family); +} + +Status ReactiveVersionSet::MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader) { + assert(manifest_reader != nullptr); + Status s; + do { + std::string manifest_path; + s = GetCurrentManifestPath(&manifest_path); + std::unique_ptr manifest_file; + if (s.ok()) { + if (nullptr == manifest_reader->get() || + manifest_reader->get()->file()->file_name() != manifest_path) { + TEST_SYNC_POINT( + "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0"); + TEST_SYNC_POINT( + "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:1"); + s = env_->NewSequentialFile( + manifest_path, &manifest_file, + env_->OptimizeForManifestRead(env_options_)); + } else { + // No need to switch manifest. 
+ break; + } + } + std::unique_ptr manifest_file_reader; + if (s.ok()) { + manifest_file_reader.reset( + new SequentialFileReader(std::move(manifest_file), manifest_path)); + manifest_reader->reset(new log::FragmentBufferedReader( + nullptr, std::move(manifest_file_reader), reporter, + true /* checksum */, 0 /* log_number */)); + ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", + manifest_path.c_str()); + } + } while (s.IsPathNotFound()); + return s; +} + } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index 5d29c8d48a7..00d048ab40d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -644,6 +644,7 @@ class Version { private: Env* env_; + friend class ReactiveVersionSet; friend class VersionSet; const InternalKeyComparator* internal_comparator() const { @@ -743,7 +744,7 @@ class VersionSet { const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller); - ~VersionSet(); + virtual ~VersionSet(); // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new @@ -789,7 +790,7 @@ class VersionSet { // The across-multi-cf batch version. If edit_lists contain more than // 1 version edits, caller must ensure that no edit in the []list is column // family manipulation. - Status LogAndApply( + virtual Status LogAndApply( const autovector& cfds, const autovector& mutable_cf_options_list, const autovector>& edit_lists, @@ -797,11 +798,6 @@ class VersionSet { bool new_descriptor_log = false, const ColumnFamilyOptions* new_cf_options = nullptr); - Status ReadAndApply( - InstrumentedMutex* mu, - std::unique_ptr* manifest_reader, - std::unordered_set* cfds_changed); - Status GetCurrentManifestPath(std::string* manifest_filename); // Recover the last saved descriptor from persistent storage. 
@@ -810,12 +806,6 @@ class VersionSet { Status Recover(const std::vector& column_families, bool read_only = false); - Status RecoverAsSecondary( - const std::vector& column_families, - std::unique_ptr* manifest_reader, - std::unique_ptr* manifest_reporter, - std::unique_ptr* manifest_reader_status); - // Reads a manifest file and returns a list of column families in // column_families. static Status ListColumnFamilies(std::vector* column_families, @@ -990,7 +980,7 @@ class VersionSet { static uint64_t GetTotalSstFilesSize(Version* dummy_versions); - private: + protected: struct ManifestWriter; friend class Version; @@ -1031,23 +1021,12 @@ class VersionSet { bool* have_last_sequence, SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, uint32_t* max_column_family); - // REQUIRES db mutex - Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family); - - Status MaybeSwitchManifest( - log::Reader::Reporter* reporter, - std::unique_ptr* manifest_reader); - - // REQUIRES db mutex at beginning. 
may release and re-acquire db mutex - Status ProcessManifestWrites(std::deque& writers, - InstrumentedMutex* mu, Directory* db_directory, - bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options); + Status ExtractInfoFromVersionEdit( + ColumnFamilyData* cfd, const VersionEdit& edit, bool* have_log_number, + uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); std::unique_ptr column_family_set_; @@ -1097,24 +1076,77 @@ class VersionSet { // env options for all reads and writes except compactions EnvOptions env_options_; - enum class State { - INITIALIZED, - PRIMARY, - SECONDARY, - }; - - State state_; - - std::unordered_map> - active_version_builders_; - + private: // No copying allowed VersionSet(const VersionSet&); void operator=(const VersionSet&); + // REQUIRES db mutex at beginning. 
may release and re-acquire db mutex + Status ProcessManifestWrites(std::deque& writers, + InstrumentedMutex* mu, Directory* db_directory, + bool new_descriptor_log, + const ColumnFamilyOptions* new_cf_options); + void LogAndApplyCFHelper(VersionEdit* edit); void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, VersionEdit* edit, InstrumentedMutex* mu); }; +class ReactiveVersionSet : public VersionSet { + public: + ReactiveVersionSet(const std::string& dbname, + const ImmutableDBOptions* _db_options, + const EnvOptions& _env_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller); + + ~ReactiveVersionSet(); + + Status ReadAndApply( + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + std::unordered_set* cfds_changed); + + Status Recover(const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status); + + protected: + using VersionSet::ApplyOneVersionEditToBuilder; + + // REQUIRES db mutex + Status ApplyOneVersionEditToBuilder( + VersionEdit& edit, bool* have_log_number, uint64_t* log_number, + bool* have_prev_log_number, uint64_t* previous_log_number, + bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, + SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, + uint32_t* max_column_family); + + Status MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader); + + private: + std::unordered_map> + active_version_builders_; + + using VersionSet::LogAndApply; + using VersionSet::Recover; + + Status LogAndApply( + const autovector& /*cfds*/, + const autovector& /*mutable_cf_options_list*/, + const autovector>& /*edit_lists*/, + InstrumentedMutex* /*mu*/, Directory* /*db_directory*/, + bool /*new_descriptor_log*/, + const ColumnFamilyOptions* /*new_cf_option*/) override { + return Status::NotSupported("not supported in reactive mode"); + } + + // No 
copy allowed + ReactiveVersionSet(const ReactiveVersionSet&); + ReactiveVersionSet& operator=(const ReactiveVersionSet&); +}; + } // namespace rocksdb From 54079048c3d34ead7a9be11e66ae20778e200914 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 18 Mar 2019 15:57:08 -0700 Subject: [PATCH 21/33] Address lint warnings --- db/db_impl_secondary.cc | 1 + db/db_impl_secondary.h | 2 +- db/log_reader.h | 2 +- db/version_set.h | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 31f112e4341..d55aaed09a5 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -157,6 +157,7 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( const ReadOptions& read_options, ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback) { + assert(nullptr != cfd); SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 49510d126b0..64c622ccc7c 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -16,7 +16,7 @@ namespace rocksdb { class DBImplSecondary : public DBImpl { public: DBImplSecondary(const DBOptions& options, const std::string& dbname); - ~DBImplSecondary(); + ~DBImplSecondary() override; Status Recover(const std::vector& column_families, bool read_only, bool error_if_log_file_exist, diff --git a/db/log_reader.h b/db/log_reader.h index 130c1d597ec..63777d6daa7 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -163,7 +163,7 @@ class FragmentBufferedReader : public Reader { : Reader(info_log, std::move(_file), reporter, checksum, log_num), fragments_(), in_fragmented_record_(false) {} - ~FragmentBufferedReader() {} + ~FragmentBufferedReader() override {} bool ReadRecord(Slice* record, std::string* 
scratch, WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords) override; diff --git a/db/version_set.h b/db/version_set.h index 00d048ab40d..30405ad92b8 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1100,7 +1100,7 @@ class ReactiveVersionSet : public VersionSet { WriteBufferManager* write_buffer_manager, WriteController* write_controller); - ~ReactiveVersionSet(); + ~ReactiveVersionSet() override; Status ReadAndApply( InstrumentedMutex* mu, From b00cdc0db7b7268a4822cdf7bfdb8b5fca23104e Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 19 Mar 2019 11:23:06 -0700 Subject: [PATCH 22/33] Rename test sync points and update tests --- db/db_secondary_test.cc | 5 +++-- db/version_set.cc | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 5a6295058e0..504a1c7d73d 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -145,10 +145,11 @@ TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->LoadDependency( - {{"VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", + {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, {"VersionSet::ProcessManifestWrites:AfterNewManifest", - "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:1"}}); + "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" + "1"}}); SyncPoint::GetInstance()->EnableProcessing(); // Make sure db calls RecoverLogFiles so as to trigger a manifest write, diff --git a/db/version_set.cc b/db/version_set.cc index 99beb453165..124ee68d44b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4970,9 +4970,11 @@ Status ReactiveVersionSet::MaybeSwitchManifest( if (nullptr == manifest_reader->get() || manifest_reader->get()->file()->file_name() != 
manifest_path) { TEST_SYNC_POINT( - "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0"); + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:0"); TEST_SYNC_POINT( - "VersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:1"); + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:1"); s = env_->NewSequentialFile( manifest_path, &manifest_file, env_->OptimizeForManifestRead(env_options_)); From 4fdea6248b78402ea6b30e80f002a2e2cf4239ee Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 19 Mar 2019 12:14:16 -0700 Subject: [PATCH 23/33] Update HISTORY --- HISTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.md b/HISTORY.md index e467aa45354..270f57311a1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ ### Public API Change * statistics.stats_level_ becomes atomic. It is preferred to use statistics.set_stats_level() and statistics.get_stats_level() to access it. * Introduce a new IOError subcode, PathNotFound, to indicate trying to open a nonexistent file or directory for read. +* Add initial support for multiple db instances sharing the same data in single-writer, multi-reader mode. ### Bug Fixes * Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms. 
From 7b5ab34368fe625927ece828318c0467e0b06e30 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 19 Mar 2019 13:44:57 -0700 Subject: [PATCH 24/33] Update test for handling non-existing SSTs --- db/db_secondary_test.cc | 15 +++++++++++++++ db/version_set.cc | 2 ++ 2 files changed, 17 insertions(+) diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 504a1c7d73d..ef57d4fd970 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -223,6 +223,20 @@ TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { } TEST_F(DBSecondaryTest, MissingTableFile) { + int table_files_not_exist = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", + [&](void* arg) { + Status s = *reinterpret_cast(arg); + if (s.IsPathNotFound()) { + ++table_files_not_exist; + } else if (!s.ok()) { + assert(false); // Should not reach here + } + }); + SyncPoint::GetInstance()->EnableProcessing(); Options options; options.env = env_; options.level0_file_num_compaction_trigger = 4; @@ -252,6 +266,7 @@ TEST_F(DBSecondaryTest, MissingTableFile) { ASSERT_NOK(db_secondary->Get(ropts, "bar", &value)); ASSERT_OK(db_secondary->TryCatchUpWithPrimary()); + ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); ASSERT_OK(db_secondary->Get(ropts, "foo", &value)); ASSERT_EQ("foo_value" + std::to_string(options.level0_file_num_compaction_trigger - 1), diff --git a/db/version_set.cc b/db/version_set.cc index 124ee68d44b..06f8d2fa6d8 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4845,6 +4845,8 @@ Status ReactiveVersionSet::ReadAndApply( false /* prefetch_index_and_filter_in_cache */, false /* is_initial_load */, cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + TEST_SYNC_POINT_CALLBACK( + "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", &s); if (!s.ok() && !s.IsPathNotFound()) { break; } 
else if (s.IsPathNotFound()) { From 702171fcfb4a7c4fa57569d696df19735aa80285 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 21 Mar 2019 21:04:02 -0700 Subject: [PATCH 25/33] Use retry counter to prevent infinite loop --- db/version_set.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 06f8d2fa6d8..c4ae0ddad52 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4663,7 +4663,8 @@ Status ReactiveVersionSet::Recover( Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); log::Reader* reader = manifest_reader->get(); - while (s.ok()) { + int retry = 0; + while (s.ok() && retry < 1) { assert(reader != nullptr); Slice record; std::string scratch; @@ -4704,7 +4705,7 @@ Status ReactiveVersionSet::Recover( s = builder->LoadTableHandlers( cfd->internal_stats(), db_options_->max_file_opening_threads, false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, + true /* is_initial_load */, cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); if (!s.ok()) { enough = false; @@ -4715,15 +4716,12 @@ Status ReactiveVersionSet::Recover( } } } - if (!enough) { - // TODO (yanqin) release table handlers if any of the files are not - // found. - } } if (enough) { break; } } + ++retry; } if (s.ok()) { @@ -4851,8 +4849,6 @@ Status ReactiveVersionSet::ReadAndApply( break; } else if (s.IsPathNotFound()) { s = Status::OK(); - // TODO (yanqin) release file descriptors already opened, or modify - // LoadTableHandlers so that opened files are not re-opened. 
} else { // s.ok() == true auto version = new Version(cfd, this, env_options_, *cfd->GetLatestMutableCFOptions(), From a4006e4cd4b87b2bd1e0f2d724606b607e850cd0 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 22 Mar 2019 11:44:45 -0700 Subject: [PATCH 26/33] Address review comments --- db/db_impl_secondary.cc | 6 ++--- db/version_set.cc | 58 ++++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index d55aaed09a5..acc952524b2 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -210,14 +210,14 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { assert(manifest_reader_.get() != nullptr); Status s; std::unordered_set cfds_changed; - InstrumentedMutexLock lock_guard(mutex()); + InstrumentedMutexLock lock_guard(&mutex_); s = static_cast(versions_.get()) - ->ReadAndApply(mutex(), &manifest_reader_, &cfds_changed); + ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); if (s.ok()) { SuperVersionContext sv_context(true /* create_superversion */); for (auto cfd : cfds_changed) { sv_context.NewSuperVersion(); - cfd->InstallSuperVersion(&sv_context, mutex()); + cfd->InstallSuperVersion(&sv_context, &mutex_); } sv_context.Clean(); } diff --git a/db/version_set.cc b/db/version_set.cc index c4ae0ddad52..2bb647c9f4e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4692,8 +4692,7 @@ Status ReactiveVersionSet::Recover( } } } - if (enough && column_family_set_->get_table_cache()->GetCapacity() == - TableCache::kInfiniteCapacity) { + if (enough) { for (const auto& cf : column_families) { auto cfd = column_family_set_->GetColumnFamily(cf.name); assert(cfd != nullptr); @@ -4830,36 +4829,31 @@ Status ReactiveVersionSet::ReadAndApply( if (!s.ok()) { break; } - if (column_family_set_->get_table_cache()->GetCapacity() == - TableCache::kInfiniteCapacity) { - // Unlimited table cache. 
Pre-load table handle now so that the table - // files are still accessible to us after the primary unlinks them. - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", &s); - if (!s.ok() && !s.IsPathNotFound()) { - break; - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } else { // s.ok() == true - auto version = new Version(cfd, this, env_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(version->storage_info()); - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); - } + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + TEST_SYNC_POINT_CALLBACK( + "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", &s); + if (!s.ok() && !s.IsPathNotFound()) { + break; + } else if (s.IsPathNotFound()) { + s = Status::OK(); + } else { // s.ok() == true + auto version = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + 
builder->SaveTo(version->storage_info()); + version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); + AppendVersion(cfd, version); + active_version_builders_.erase(builder_iter); + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); } } if (have_next_file) { From 78dc13881b90c444d5359c7411930eebc38a4f13 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 22 Mar 2019 15:27:04 -0700 Subject: [PATCH 27/33] Construct versions from scratch upon MANIFEST switch --- db/version_set.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/db/version_set.cc b/db/version_set.cc index 2bb647c9f4e..287ffbdcc6e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4984,6 +4984,12 @@ Status ReactiveVersionSet::MaybeSwitchManifest( true /* checksum */, 0 /* log_number */)); ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", manifest_path.c_str()); + // TODO (yanqin) every time we switch to a new MANIFEST, we clear the + // active_version_builders_ map because we choose to construct the + // versions from scratch, thanks to the first part of each MANIFEST + // written by VersionSet::WriteSnapshot. This is not necessary, but we + // choose this at present for the sake of simplicity. + active_version_builders_.clear(); } } while (s.IsPathNotFound()); return s; From 6680f18e755f2ccd4abfe80e0e744a0093157e72 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 22 Mar 2019 16:27:43 -0700 Subject: [PATCH 28/33] Expose TryCatchUpWithPrimary --- db/db_impl_secondary.h | 2 +- include/rocksdb/db.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 64c622ccc7c..386d7d3e9dd 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -129,7 +129,7 @@ class DBImplSecondary : public DBImpl { // Try to catch up with the primary by reading as much as possible from the // log files until there is nothing more to read or encounters an error. 
- Status TryCatchUpWithPrimary(); + Status TryCatchUpWithPrimary() override; private: friend class DB; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index e551c50417e..d03be65a3d0 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1244,6 +1244,10 @@ class DB { return Status::NotSupported("GetStatsHistory() is not implemented."); } + virtual Status TryCatchUpWithPrimary() { + return Status::NotSupported("Supported only by secondary instance"); + } + private: // No copying allowed DB(const DB&); From 164a57dfb18a37a0663f1b229b104419f3730d12 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 22 Mar 2019 14:19:08 -0700 Subject: [PATCH 29/33] Refactor unit test for db secondary --- db/db_secondary_test.cc | 255 ++++++++++++++++++++++++++++++++++------ db/version_set.cc | 6 +- 2 files changed, 219 insertions(+), 42 deletions(-) diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index ef57d4fd970..1019f28f4df 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -18,12 +18,17 @@ namespace rocksdb { #ifndef ROCKSDB_LITE class DBSecondaryTest : public DBTestBase { public: - DBSecondaryTest() : DBTestBase("/db_secondary_test"), secondary_path_() { + DBSecondaryTest() + : DBTestBase("/db_secondary_test"), + secondary_path_(), + handles_secondary_(), + db_secondary_(nullptr) { secondary_path_ = test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); } ~DBSecondaryTest() override { + CloseSecondary(); if (getenv("KEEP_DB") != nullptr) { fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); } else { @@ -38,9 +43,71 @@ class DBSecondaryTest : public DBTestBase { return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); } + void OpenSecondary(const Options& options); + + void OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options); + + void CloseSecondary() { + for (auto h : handles_secondary_) { + db_secondary_->DestroyColumnFamilyHandle(h); + } + 
handles_secondary_.clear(); + delete db_secondary_; + db_secondary_ = nullptr; + } + + DBImplSecondary* db_secondary_full() { + return static_cast(db_secondary_); + } + + void CheckFileTypeCounts(const std::string& dir, int expected_log, + int expected_sst, int expected_manifest) const; + std::string secondary_path_; + std::vector handles_secondary_; + DB* db_secondary_; }; +void DBSecondaryTest::OpenSecondary(const Options& options) { + Status s = + DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTest::OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options) { + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + for (const auto& cf_name : column_families) { + cf_descs.emplace_back(cf_name, options); + } + Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, + int expected_log, int expected_sst, + int expected_manifest) const { + std::vector filenames; + env_->GetChildren(dir, &filenames); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kLogFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(expected_log, log_cnt); + ASSERT_EQ(expected_sst, sst_cnt); + ASSERT_EQ(expected_manifest, manifest_cnt); +} + TEST_F(DBSecondaryTest, ReopenAsSecondary) { Options options; options.env = env_; @@ -72,7 +139,6 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) { } delete iter; ASSERT_EQ(2, count); - Close(); } TEST_F(DBSecondaryTest, OpenAsSecondary) { @@ -85,13 +151,10 @@ TEST_F(DBSecondaryTest, OpenAsSecondary) { ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); ASSERT_OK(Flush()); } - DB* db_secondary = 
nullptr; Options options1; options1.env = env_; options1.max_open_files = -1; - Status s = - DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db_secondary); - ASSERT_OK(s); + OpenSecondary(options1); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -100,11 +163,11 @@ TEST_F(DBSecondaryTest, OpenAsSecondary) { const auto verify_db_func = [&](const std::string& foo_val, const std::string& bar_val) { std::string value; - ASSERT_OK(db_secondary->Get(ropts, "foo", &value)); + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); ASSERT_EQ(foo_val, value); - ASSERT_OK(db_secondary->Get(ropts, "bar", &value)); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); ASSERT_EQ(bar_val, value); - Iterator* iter = db_secondary->NewIterator(ropts); + Iterator* iter = db_secondary_->NewIterator(ropts); ASSERT_NE(nullptr, iter); iter->Seek("foo"); ASSERT_TRUE(iter->Valid()); @@ -128,12 +191,49 @@ TEST_F(DBSecondaryTest, OpenAsSecondary) { ASSERT_OK(Put("bar", "new_bar_value")); ASSERT_OK(Flush()); - ASSERT_OK( - static_cast(db_secondary)->TryCatchUpWithPrimary()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); verify_db_func("new_foo_value", "new_bar_value"); - delete db_secondary; - Close(); + CloseSecondary(); +} + +TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options1); + cf_descs.emplace_back("pikachu", options1); + cf_descs.emplace_back("eevee", options1); + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_NOK(s); +} + +TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + 
Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_EQ(0, handles_secondary_.size()); + ASSERT_NE(nullptr, db_secondary_); + + ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Flush(0 /*cf*/)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value", value); + CloseSecondary(); } TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { @@ -155,18 +255,14 @@ TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { // Make sure db calls RecoverLogFiles so as to trigger a manifest write, // which causes the db to switch to a new MANIFEST upon start. port::Thread ro_db_thread([&]() { - DB* db_secondary = nullptr; Options options1; options1.env = env_; options1.max_open_files = -1; - Status s = - DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db_secondary); - ASSERT_OK(s); - delete db_secondary; + OpenSecondary(options1); + CloseSecondary(); }); Reopen(options); ro_db_thread.join(); - Close(); } TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { @@ -181,24 +277,22 @@ TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { } ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); - DB* db1 = nullptr; Options options1; options1.env = env_; options1.max_open_files = -1; - Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db1); - ASSERT_OK(s); + OpenSecondary(options1); ReadOptions ropts; ropts.verify_checksums = true; std::string value; - ASSERT_OK(db1->Get(ropts, "foo", &value)); + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); ASSERT_EQ("foo_value" + std::to_string(options.level0_file_num_compaction_trigger - 1), value); - ASSERT_OK(db1->Get(ropts, "bar", &value)); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); ASSERT_EQ("bar_value" + 
std::to_string(options.level0_file_num_compaction_trigger - 1), value); - Iterator* iter = db1->NewIterator(ropts); + Iterator* iter = db_secondary_->NewIterator(ropts); ASSERT_NE(nullptr, iter); iter->Seek("bar"); ASSERT_TRUE(iter->Valid()); @@ -218,8 +312,7 @@ TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { } ASSERT_EQ(2, count); delete iter; - delete db1; - Close(); + CloseSecondary(); } TEST_F(DBSecondaryTest, MissingTableFile) { @@ -242,12 +335,10 @@ TEST_F(DBSecondaryTest, MissingTableFile) { options.level0_file_num_compaction_trigger = 4; Reopen(options); - DB* db1 = nullptr; Options options1; options1.env = env_; options1.max_open_files = -1; - Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, &db1); - ASSERT_OK(s); + OpenSecondary(options1); for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); @@ -257,25 +348,24 @@ TEST_F(DBSecondaryTest, MissingTableFile) { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); - auto db_secondary = static_cast(db1); - ASSERT_NE(nullptr, db_secondary); + ASSERT_NE(nullptr, db_secondary_full()); ReadOptions ropts; ropts.verify_checksums = true; std::string value; - ASSERT_NOK(db_secondary->Get(ropts, "foo", &value)); - ASSERT_NOK(db_secondary->Get(ropts, "bar", &value)); + ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_OK(db_secondary->TryCatchUpWithPrimary()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); - ASSERT_OK(db_secondary->Get(ropts, "foo", &value)); + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); ASSERT_EQ("foo_value" + std::to_string(options.level0_file_num_compaction_trigger - 1), value); - ASSERT_OK(db_secondary->Get(ropts, "bar", &value)); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); 
ASSERT_EQ("bar_value" + std::to_string(options.level0_file_num_compaction_trigger - 1), value); - Iterator* iter = db1->NewIterator(ropts); + Iterator* iter = db_secondary_->NewIterator(ropts); ASSERT_NE(nullptr, iter); iter->Seek("bar"); ASSERT_TRUE(iter->Valid()); @@ -295,8 +385,95 @@ TEST_F(DBSecondaryTest, MissingTableFile) { } ASSERT_EQ(2, count); delete iter; - delete db1; + CloseSecondary(); +} + +TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { + Options options; + options.env = env_; + const std::string kCfName1 = "pikachu"; + CreateAndReopenWithCF({kCfName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCfName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1")); + ASSERT_OK(Flush(1 /*cf*/)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); + + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); Close(); + CheckFileTypeCounts(dbname_, 1, 0, 1); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + value.clear(); + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); + + CloseSecondary(); +} + +TEST_F(DBSecondaryTest, SwitchManifest) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const int kNumFiles = options.level0_file_num_compaction_trigger - 1; + // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1, + // ..., 9. 
+ const int kNumKeys = 10; + // Create two sst + for (int i = 0; i != kNumFiles; ++i) { + for (int j = 0; j != kNumKeys; ++j) { + ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + const auto& range_scan_db = [&]() { + ReadOptions tmp_ropts; + tmp_ropts.total_order_seek = true; + tmp_ropts.verify_checksums = true; + std::unique_ptr iter(db_secondary_->NewIterator(tmp_ropts)); + int cnt = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) { + ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString()); + ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), + iter->value().ToString()); + } + }; + + range_scan_db(); + + // While secondary instance still keeps old MANIFEST open, we close primary, + // restart primary, performs full compaction, close again, restart again so + // that next time secondary tries to catch up with primary, the secondary + // will skip the MANIFEST in middle. + Reopen(options); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + range_scan_db(); + CloseSecondary(); } #endif //! ROCKSDB_LITE diff --git a/db/version_set.cc b/db/version_set.cc index 287ffbdcc6e..006ef5569d8 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4919,13 +4919,13 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( // MANIFEST. return Status::OK(); } else if (edit.is_column_family_drop_) { - // Drop the column family by setting it to be 'dropped' without destroying - // the column family handle. cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Drop a CF created after Open? Then ignore + // Drop a CF created by primary after secondary starts? 
Then ignore if (cfd == nullptr) { return Status::OK(); } + // Drop the column family by setting it to be 'dropped' without destroying + // the column family handle. cfd->SetDropped(); if (cfd->Unref()) { delete cfd; From a24be882ef244186cc162c86cabd07727b74bc57 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 22 Mar 2019 17:17:12 -0700 Subject: [PATCH 30/33] Add comments to explain the usage of new API --- db/db_secondary_test.cc | 8 -------- include/rocksdb/db.h | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 1019f28f4df..f061696cdb1 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -193,8 +193,6 @@ TEST_F(DBSecondaryTest, OpenAsSecondary) { ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); verify_db_func("new_foo_value", "new_bar_value"); - - CloseSecondary(); } TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { @@ -233,7 +231,6 @@ TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { std::string value; ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); ASSERT_EQ("foo_value", value); - CloseSecondary(); } TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { @@ -312,7 +309,6 @@ TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { } ASSERT_EQ(2, count); delete iter; - CloseSecondary(); } TEST_F(DBSecondaryTest, MissingTableFile) { @@ -385,7 +381,6 @@ TEST_F(DBSecondaryTest, MissingTableFile) { } ASSERT_EQ(2, count); delete iter; - CloseSecondary(); } TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { @@ -417,8 +412,6 @@ TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { value.clear(); ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); ASSERT_EQ("foo_val_1", value); - - CloseSecondary(); } TEST_F(DBSecondaryTest, SwitchManifest) { @@ -473,7 +466,6 @@ TEST_F(DBSecondaryTest, SwitchManifest) { ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); range_scan_db(); - CloseSecondary(); } #endif 
//! ROCKSDB_LITE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index d03be65a3d0..0ecc76339ac 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -162,9 +162,33 @@ class DB { std::vector* handles, DB** dbptr, bool error_if_log_file_exist = false); + // The following OpenAsSecondary functions create a secondary instance that + // can dynamically tail the MANIFEST of a primary that must have already been + // created. + // + // The options argument specifies the options to open the secondary instance. + // The name argument specifies the name of the primary db that you have used + // to open the primary instance. + // The secondary_path argument points to a directory where the secondary + // instance stores its info log. + // The dbptr is an out-arg corresponding to the opened secondary instance. + // Open DB as secondary instance with only the default column family. static Status OpenAsSecondary(const Options& options, const std::string& name, const std::string& secondary_path, DB** dbptr); + // Open DB as secondary instance with column families. + // The db_options specify the database specific options. + // The name argument specifies the name of the primary db that you have used + // to open the primary instance. + // The secondary_path argument points to a directory where the secondary + // instance stores its info log. + // The column_families argument specifieds a list of column families to open. + // If any of the column families does not exist, the function returns non-OK + // status. + // The handles is an out-arg corresponding to the opened database column + // familiy handles. + // The dbptr is an out-arg corresponding to the opened secondary instance. + // Open DB as secondary instance with only the default column family. 
static Status OpenAsSecondary( const DBOptions& db_options, const std::string& name, const std::string& secondary_path, @@ -1244,9 +1268,22 @@ class DB { return Status::NotSupported("GetStatsHistory() is not implemented."); } +#ifndef ROCKSDB_LITE + // Make the secondary instance catch up with the primary by tailing and + // replaying the MANIFEST and WAL of the primary. + // Column families created by the primary after the secondary instance starts + // will be ignored unless the secondary instance closes and restarts with the + // newly created column families. + // Column families that exist before secondary instance starts and dropped by + // the primary afterwards will be marked as dropped. However, as long as the + // secondary instance does not delete the corresponding column family + // handles, the data of the column family is still accessible to the + // secondary. + // TODO: we will support WAL tailing soon. virtual Status TryCatchUpWithPrimary() { return Status::NotSupported("Supported only by secondary instance"); } +#endif // !ROCKSDB_LITE private: // No copying allowed From 79d773bac6c8e83fa6a1d1d9e09516ef3f87e4e9 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sun, 24 Mar 2019 19:50:39 -0700 Subject: [PATCH 31/33] Allow secondary to open a strict subset of column families --- db/db_secondary_test.cc | 2 ++ db/version_set.cc | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index f061696cdb1..478a7cec972 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -224,7 +224,9 @@ TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { ASSERT_NE(nullptr, db_secondary_); ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); ASSERT_OK(Flush(0 /*cf*/)); + ASSERT_OK(Flush(1 /*cf*/)); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); ReadOptions ropts; ropts.verify_checksums = true; diff --git a/db/version_set.cc b/db/version_set.cc index 
006ef5569d8..5241608df85 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4814,6 +4814,12 @@ Status ReactiveVersionSet::ReadAndApply( } ColumnFamilyData* cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // If we cannot find this column family in our column family set, then it + // may be a new column family created by the primary after the secondary + // starts. Ignore it for now. + if (nullptr == cfd) { + continue; + } if (active_version_builders_.find(edit.column_family_) == active_version_builders_.end()) { std::unique_ptr builder_guard( From d21071d25f5108e8d905efd3e6eb2bf7d59ee8fb Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sun, 24 Mar 2019 11:56:52 -0700 Subject: [PATCH 32/33] Add an example --- examples/.gitignore | 1 + examples/Makefile | 5 +- examples/multi_processes_example.cc | 395 ++++++++++++++++++++++++++++ 3 files changed, 400 insertions(+), 1 deletion(-) create mode 100644 examples/multi_processes_example.cc diff --git a/examples/.gitignore b/examples/.gitignore index b5a05e44a27..823664ae1f5 100644 --- a/examples/.gitignore +++ b/examples/.gitignore @@ -2,6 +2,7 @@ c_simple_example column_families_example compact_files_example compaction_filter_example +multi_processes_example optimistic_transaction_example options_file_example simple_example diff --git a/examples/Makefile b/examples/Makefile index 57cd1a75a1c..27a6f0f421a 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -43,8 +43,11 @@ transaction_example: librocksdb transaction_example.cc options_file_example: librocksdb options_file_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +multi_processes_example: librocksdb multi_processes_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + clean: - rm -rf ./simple_example ./column_families_example ./compact_files_example 
./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example + rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example librocksdb: cd .. && $(MAKE) static_lib diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc new file mode 100644 index 00000000000..b1c1d02ba25 --- /dev/null +++ b/examples/multi_processes_example.cc @@ -0,0 +1,395 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// How to use this example +// Open two terminals, in one of them, run `./multi_processes_example 0` to +// start a process running the primary instance. This will create a new DB in +// kDBPath. The process will run for a while inserting keys to the normal +// RocksDB database. +// Next, go to the other terminal and run `./multi_processes_example 1` to +// start a process running the secondary instance. This will create a secondary +// instance following the aforementioned primary instance. This process will +// run for a while, tailing the logs of the primary. After process with primary +// instance exits, this process will keep running until you hit 'CTRL+C'. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(OS_LINUX) +#include +#include +#include +#include +#include +#include +#endif // !OS_LINUX + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" + +using rocksdb::ColumnFamilyDescriptor; +using rocksdb::ColumnFamilyHandle; +using rocksdb::ColumnFamilyOptions; +using rocksdb::DB; +using rocksdb::FlushOptions; +using rocksdb::Iterator; +using rocksdb::Options; +using rocksdb::ReadOptions; +using rocksdb::Slice; +using rocksdb::Status; +using rocksdb::WriteOptions; + +const std::string kDBPath = "/tmp/rocksdb_multi_processes_example"; +const std::string kPrimaryStatusFile = + "/tmp/rocksdb_multi_processes_example_primary_status"; +const uint64_t kMaxKey = 600000; +const size_t kMaxValueLength = 256; +const size_t kNumKeysPerFlush = 1000; + +const std::vector& GetColumnFamilyNames() { + static std::vector column_family_names = { + rocksdb::kDefaultColumnFamilyName, "pikachu"}; + return column_family_names; +} + +inline bool IsLittleEndian() { + uint32_t x = 1; + return *reinterpret_cast(&x) != 0; +} + +static std::atomic& ShouldSecondaryWait() { + static std::atomic should_secondary_wait{1}; + return should_secondary_wait; +} + +static std::string Key(uint64_t k) { + std::string ret; + if (IsLittleEndian()) { + ret.append(reinterpret_cast(&k), sizeof(k)); + } else { + char buf[sizeof(k)]; + buf[0] = k & 0xff; + buf[1] = (k >> 8) & 0xff; + buf[2] = (k >> 16) & 0xff; + buf[3] = (k >> 24) & 0xff; + buf[4] = (k >> 32) & 0xff; + buf[5] = (k >> 40) & 0xff; + buf[6] = (k >> 48) & 0xff; + buf[7] = (k >> 56) & 0xff; + ret.append(buf, sizeof(k)); + } + size_t i = 0, j = ret.size() - 1; + while (i < j) { + char tmp = ret[i]; + ret[i] = ret[j]; + ret[j] = tmp; + ++i; + --j; + } + return ret; +} + +static uint64_t Key(std::string key) { + assert(key.size() == sizeof(uint64_t)); + size_t i = 0, j = key.size() - 1; + while (i < j) { + char tmp = key[i]; + 
key[i] = key[j]; + key[j] = tmp; + ++i; + --j; + } + uint64_t ret = 0; + if (IsLittleEndian()) { + memcpy(&ret, key.c_str(), sizeof(uint64_t)); + } else { + const char* buf = key.c_str(); + ret |= static_cast(buf[0]); + ret |= (static_cast(buf[1]) << 8); + ret |= (static_cast(buf[2]) << 16); + ret |= (static_cast(buf[3]) << 24); + ret |= (static_cast(buf[4]) << 32); + ret |= (static_cast(buf[5]) << 40); + ret |= (static_cast(buf[6]) << 48); + ret |= (static_cast(buf[7]) << 56); + } + return ret; +} + +static Slice GenerateRandomValue(const size_t max_length, char scratch[]) { + size_t sz = 1 + (std::rand() % max_length); + int rnd = std::rand(); + for (size_t i = 0; i != sz; ++i) { + scratch[i] = static_cast(rnd ^ i); + } + return Slice(scratch, sz); +} + +static bool ShouldCloseDB() { return true; } + +// TODO: port this example to other systems. It should be straightforward for +// POSIX-compliant systems. +#if defined(OS_LINUX) +void CreateDB() { + long my_pid = static_cast(getpid()); + Options options; + Status s = rocksdb::DestroyDB(kDBPath, options); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to destroy DB: %s\n", my_pid, + s.ToString().c_str()); + assert(false); + } + options.create_if_missing = true; + DB* db = nullptr; + s = DB::Open(options, kDBPath, &db); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid, + s.ToString().c_str()); + assert(false); + } + std::vector handles; + ColumnFamilyOptions cf_opts(options); + for (const auto& cf_name : GetColumnFamilyNames()) { + if (rocksdb::kDefaultColumnFamilyName != cf_name) { + ColumnFamilyHandle* handle = nullptr; + s = db->CreateColumnFamily(cf_opts, cf_name, &handle); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to create CF %s: %s\n", my_pid, + cf_name.c_str(), s.ToString().c_str()); + assert(false); + } + handles.push_back(handle); + } + } + fprintf(stdout, "[process %ld] Column families created\n", my_pid); + for (auto h : handles) { + delete h; + 
} + handles.clear(); + delete db; +} + +void RunPrimary() { + long my_pid = static_cast(getpid()); + fprintf(stdout, "[process %ld] Primary instance starts\n", my_pid); + CreateDB(); + std::srand(time(nullptr)); + DB* db = nullptr; + Options options; + options.create_if_missing = false; + std::vector column_families; + for (const auto& cf_name : GetColumnFamilyNames()) { + column_families.push_back(ColumnFamilyDescriptor(cf_name, options)); + } + std::vector handles; + WriteOptions write_opts; + char val_buf[kMaxValueLength] = {0}; + uint64_t curr_key = 0; + while (curr_key < kMaxKey) { + Status s; + if (nullptr == db) { + s = DB::Open(options, kDBPath, column_families, &handles, &db); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid, + s.ToString().c_str()); + assert(false); + } + } + assert(nullptr != db); + assert(handles.size() == GetColumnFamilyNames().size()); + for (auto h : handles) { + assert(nullptr != h); + for (size_t i = 0; i != kNumKeysPerFlush; ++i) { + Slice key = Key(curr_key + static_cast(i)); + Slice value = GenerateRandomValue(kMaxValueLength, val_buf); + s = db->Put(write_opts, h, key, value); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to insert\n", my_pid); + assert(false); + } + } + s = db->Flush(FlushOptions(), h); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to flush\n", my_pid); + assert(false); + } + } + curr_key += static_cast(kNumKeysPerFlush); + if (ShouldCloseDB()) { + for (auto h : handles) { + delete h; + } + handles.clear(); + delete db; + db = nullptr; + } + } + if (nullptr != db) { + for (auto h : handles) { + delete h; + } + handles.clear(); + delete db; + db = nullptr; + } + fprintf(stdout, "[process %ld] Finished adding keys\n", my_pid); +} + +void secondary_instance_sigint_handler(int signal) { + ShouldSecondaryWait().store(0, std::memory_order_relaxed); + fprintf(stdout, "\n"); + fflush(stdout); +}; + +void RunSecondary() { + ::signal(SIGINT, 
secondary_instance_sigint_handler); + long my_pid = static_cast(getpid()); + const std::string kSecondaryPath = + "/tmp/rocksdb_multi_processes_example_secondary"; + // Create directory if necessary + if (nullptr == opendir(kSecondaryPath.c_str())) { + int ret = + mkdir(kSecondaryPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + if (ret < 0) { + perror("failed to create directory for secondary instance"); + exit(0); + } + } + DB* db = nullptr; + Options options; + options.create_if_missing = false; + options.max_open_files = -1; + Status s = DB::OpenAsSecondary(options, kDBPath, kSecondaryPath, &db); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to open in secondary mode: %s\n", + my_pid, s.ToString().c_str()); + assert(false); + } else { + fprintf(stdout, "[process %ld] Secondary instance starts\n", my_pid); + } + + ReadOptions ropts; + ropts.verify_checksums = true; + ropts.total_order_seek = true; + + std::vector test_threads; + test_threads.emplace_back([&]() { + while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) { + std::unique_ptr iter(db->NewIterator(ropts)); + iter->SeekToFirst(); + size_t count = 0; + for (; iter->Valid(); iter->Next()) { + ++count; + } + } + fprintf(stdout, "[process %ld] Range_scan thread finished\n", my_pid); + }); + + test_threads.emplace_back([&]() { + std::srand(time(nullptr)); + while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) { + Slice key = Key(std::rand() % kMaxKey); + std::string value; + db->Get(ropts, key, &value); + } + fprintf(stdout, "[process %ld] Point lookup thread finished\n", my_pid); + }); + + uint64_t curr_key = 0; + while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) { + s = db->TryCatchUpWithPrimary(); + if (!s.ok()) { + fprintf(stderr, + "[process %ld] error while trying to catch up with " + "primary %s\n", + my_pid, s.ToString().c_str()); + assert(false); + } + { + std::unique_ptr iter(db->NewIterator(ropts)); + if (!iter) { + fprintf(stderr, "[process 
%ld] Failed to create iterator\n", my_pid); + assert(false); + } + iter->SeekToLast(); + if (iter->Valid()) { + uint64_t curr_max_key = Key(iter->key().ToString()); + if (curr_max_key != curr_key) { + fprintf(stdout, "[process %ld] Observed key %" PRIu64 "\n", my_pid, + curr_key); + curr_key = curr_max_key; + } + } + } + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + s = db->TryCatchUpWithPrimary(); + if (!s.ok()) { + fprintf(stderr, + "[process %ld] error while trying to catch up with " + "primary %s\n", + my_pid, s.ToString().c_str()); + assert(false); + } + + std::vector column_families; + for (const auto& cf_name : GetColumnFamilyNames()) { + column_families.push_back(ColumnFamilyDescriptor(cf_name, options)); + } + std::vector handles; + DB* verification_db = nullptr; + s = DB::OpenForReadOnly(options, kDBPath, column_families, &handles, + &verification_db); + assert(s.ok()); + Iterator* iter1 = verification_db->NewIterator(ropts); + iter1->SeekToFirst(); + + Iterator* iter = db->NewIterator(ropts); + iter->SeekToFirst(); + for (; iter->Valid() && iter1->Valid(); iter->Next(), iter1->Next()) { + if (iter->key().ToString() != iter1->key().ToString()) { + fprintf(stderr, "%" PRIu64 " != %" PRIu64 "\n", + Key(iter->key().ToString()), Key(iter1->key().ToString())); + assert(false); + } else if (iter->value().ToString() != iter1->value().ToString()) { + fprintf(stderr, "Value mismatch\n"); + assert(false); + } + } + fprintf(stdout, "[process %ld] Verification succeeded\n", my_pid); + for (auto& thr : test_threads) { + thr.join(); + } + delete iter; + delete iter1; + delete db; + delete verification_db; +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "%s <0 for primary, 1 for secondary>\n", argv[0]); + return 0; + } + if (atoi(argv[1]) == 0) { + RunPrimary(); + } else { + RunSecondary(); + } + return 0; +} +#else // OS_LINUX +int main() { + fprintf(stderr, "Not implemented.\n"); + return 0; +} +#endif // !OS_LINUX From 
d46f9c5e2a3b904085362f331c56201565b6fac8 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 25 Mar 2019 17:58:19 -0700 Subject: [PATCH 33/33] Add more comments --- db/db_impl_secondary.h | 4 +++- include/rocksdb/db.h | 21 ++++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 386d7d3e9dd..1b6746f7e44 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -128,7 +128,9 @@ class DBImplSecondary : public DBImpl { } // Try to catch up with the primary by reading as much as possible from the - // log files until there is nothing more to read or encounters an error. + // log files until there is nothing more to read or encounters an error. If + // the amount of information in the log files to process is huge, this + // method can take long time due to all the I/O and CPU costs. Status TryCatchUpWithPrimary() override; private: diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 0ecc76339ac..b9e72747921 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -164,7 +164,15 @@ class DB { // The following OpenAsSecondary functions create a secondary instance that // can dynamically tail the MANIFEST of a primary that must have already been - // created. + // created. User can call TryCatchUpWithPrimary to make the secondary + // instance catch up with primary (WAL tailing is NOT supported now) whenever + // the user feels necessary. Column families created by the primary after the + // secondary instance starts are currently ignored by the secondary instance. + // Column families opened by secondary and dropped by the primary will be + // dropped by secondary as well. However the user of the secondary instance + // can still access the data of such dropped column family as long as they + // do not destroy the corresponding column family handle. + // WAL tailing is not supported at present, but will arrive soon. 
// // The options argument specifies the options to open the secondary instance. // The name argument specifies the name of the primary db that you have used @@ -172,11 +180,15 @@ class DB { // The secondary_path argument points to a directory where the secondary // instance stores its info log. // The dbptr is an out-arg corresponding to the opened secondary instance. + // The pointer points to a heap-allocated database, and the user should + // delete it after use. // Open DB as secondary instance with only the default column family. + // Return OK on success, non-OK on failures. static Status OpenAsSecondary(const Options& options, const std::string& name, const std::string& secondary_path, DB** dbptr); - // Open DB as secondary instance with column families. + // Open DB as secondary instance with column families. You can open a subset + // of column families in secondary mode. // The db_options specify the database specific options. // The name argument specifies the name of the primary db that you have used // to open the primary instance. @@ -188,7 +200,10 @@ class DB { // The handles is an out-arg corresponding to the opened database column // familiy handles. // The dbptr is an out-arg corresponding to the opened secondary instance. - // Open DB as secondary instance with only the default column family. + // The pointer points to a heap-allocated database, and the caller should + // delete it after use. Before deleting the dbptr, the user should also + // delete the pointers stored in handles vector. + // Return OK on success, non-OK on failures. static Status OpenAsSecondary( const DBOptions& db_options, const std::string& name, const std::string& secondary_path,