From 8bd78920ff0e928e574ea8b93a5b615434ec1242 Mon Sep 17 00:00:00 2001
From: Hui Xiao
Date: Fri, 5 May 2023 15:29:07 -0700
Subject: [PATCH] Address comment

---
 HISTORY.md                                    |  2 +-
 db/builder.cc                                 |  2 +-
 db/compaction/compaction_outputs.cc           |  2 +-
 db/db_impl/db_impl_compaction_flush.cc        |  4 ++--
 db/db_impl/db_impl_experimental.cc            |  2 +-
 db/db_impl/db_impl_open.cc                    |  4 ++--
 db/experimental.cc                            |  2 +-
 db/external_sst_file_ingestion_job.cc         | 13 +++++++++--
 db/flush_job.cc                               |  2 +-
 db/import_column_family_job.cc                | 13 +++++++++--
 db/repair.cc                                  | 13 ++++++++---
 db/table_cache.cc                             |  6 +----
 db/version_edit.cc                            | 20 ++++++++--------
 db/version_edit.h                             | 14 +++++------
 db/version_set.cc                             | 16 ++++++-------
 file/prefetch_test.cc                         |  1 +
 .../block_based/block_based_table_builder.cc  | 15 +++++++-----
 table/block_based/block_based_table_builder.h |  4 ++--
 .../block_based/block_based_table_factory.cc  |  3 +--
 table/block_based/block_based_table_factory.h |  2 +-
 table/block_based/block_based_table_reader.cc | 23 ++++++-------------
 table/block_based/block_based_table_reader.h  |  6 ++---
 table/block_fetcher_test.cc                   |  8 +++----
 table/table_builder.h                         | 11 ++++-----
 24 files changed, 99 insertions(+), 89 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 5c59c661861..96170b41ba9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -11,7 +11,7 @@
 * Delete an empty WAL file on DB open if the log number is less than the min log number to keep

 ### Performance Improvements
-* Record the starting offset of block-based table's tail (i.e, all blocks after data blocks till the end) in manifest and use it to prefetch the tail more accurately during table open instead of relying on heuristics (#11406).Heuristics will still be used for now as a fallback for backward compatibility.
+* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406).

 ## 8.2.0 (04/24/2023)
 ### Public API Changes
diff --git a/db/builder.cc b/db/builder.cc
index c2e99fb6640..a99bb57e8c2 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -291,7 +291,7 @@ Status BuildTable(
   if (s.ok() && !empty) {
     uint64_t file_size = builder->FileSize();
     meta->fd.file_size = file_size;
-    meta->tail_start_offset = builder->GetTailStartOffset();
+    meta->tail_size = builder->GetTailSize();
     meta->marked_for_compaction = builder->NeedCompact();
     assert(meta->fd.GetFileSize() > 0);
     tp = builder
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index e1d54ecb3fc..f6e0dbd7d43 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -43,7 +43,7 @@ Status CompactionOutputs::Finish(const Status& intput_status,
   const uint64_t current_bytes = builder_->FileSize();
   if (s.ok()) {
     meta->fd.file_size = current_bytes;
-    meta->tail_start_offset = builder_->GetTailStartOffset();
+    meta->tail_size = builder_->GetTailSize();
     meta->marked_for_compaction = builder_->NeedCompact();
   }
   current_output().finished = true;
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index dfe41ded830..39a8339199e 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1758,7 +1758,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
           f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
           f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
           f->file_checksum, f->file_checksum_func_name, f->unique_id,
-          f->compensated_range_deletion_size, f->tail_start_offset);
+          f->compensated_range_deletion_size, f->tail_size);
     }
     ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
                     "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@@ -3457,7 +3457,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           f->oldest_blob_file_number, f->oldest_ancester_time,
           f->file_creation_time, f->epoch_number, f->file_checksum,
           f->file_checksum_func_name, f->unique_id,
-          f->compensated_range_deletion_size, f->tail_start_offset);
+          f->compensated_range_deletion_size, f->tail_size);

       ROCKS_LOG_BUFFER(
           log_buffer,
diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc
index 9354e22f520..8d958ffc17d 100644
--- a/db/db_impl/db_impl_experimental.cc
+++ b/db/db_impl/db_impl_experimental.cc
@@ -138,7 +138,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
         f->oldest_blob_file_number, f->oldest_ancester_time,
         f->file_creation_time, f->epoch_number, f->file_checksum,
         f->file_checksum_func_name, f->unique_id,
-        f->compensated_range_deletion_size, f->tail_start_offset);
+        f->compensated_range_deletion_size, f->tail_size);
   }

   status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index ee51c41578f..5bad1e12bdf 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -617,7 +617,7 @@ Status DBImpl::Recover(
                        f->file_creation_time, f->epoch_number,
                        f->file_checksum, f->file_checksum_func_name,
                        f->unique_id, f->compensated_range_deletion_size,
-                       f->tail_start_offset);
+                       f->tail_size);
           ROCKS_LOG_WARN(immutable_db_options_.info_log,
                          "[%s] Moving #%" PRIu64
                          " from from_level-%d to from_level-%d %" PRIu64
@@ -1681,7 +1681,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
                   meta.oldest_blob_file_number, meta.oldest_ancester_time,
                   meta.file_creation_time,
                  meta.epoch_number, meta.file_checksum,
                   meta.file_checksum_func_name, meta.unique_id,
-                  meta.compensated_range_deletion_size, meta.tail_start_offset);
+                  meta.compensated_range_deletion_size, meta.tail_size);

     for (const auto& blob : blob_file_additions) {
       edit->AddBlobFile(blob);
diff --git a/db/experimental.cc b/db/experimental.cc
index f2b8854a374..cb5fb317945 100644
--- a/db/experimental.cc
+++ b/db/experimental.cc
@@ -102,7 +102,7 @@ Status UpdateManifestForFilesState(
               lf->oldest_blob_file_number, lf->oldest_ancester_time,
               lf->file_creation_time, lf->epoch_number, lf->file_checksum,
               lf->file_checksum_func_name, lf->unique_id,
-              lf->compensated_range_deletion_size, lf->tail_start_offset);
+              lf->compensated_range_deletion_size, lf->tail_size);
         }
       }
     } else {
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index f77727845cd..f8716e5f487 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -464,6 +464,16 @@ Status ExternalSstFileIngestionJob::Run() {
       current_time = oldest_ancester_time =
          static_cast<uint64_t>(temp_current_time);
     }
+    uint64_t tail_size = 0;
+    bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
+                                  (f.table_properties.num_entries ==
+                                   f.table_properties.num_range_deletions);
+    if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
+      uint64_t file_size = f.fd.GetFileSize();
+      assert(f.table_properties.tail_start_offset <= file_size);
+      tail_size = file_size - f.table_properties.tail_start_offset;
+    }
+
     FileMetaData f_metadata(
         f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
         f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
@@ -472,8 +482,7 @@ Status ExternalSstFileIngestionJob::Run() {
         ingestion_options_.ingest_behind
             ? kReservedEpochNumberForFileIngestedBehind
             : cfd_->NewEpochNumber(),
-        f.file_checksum, f.file_checksum_func_name, f.unique_id, 0,
-        f.table_properties.tail_start_offset);
+        f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size);
     f_metadata.temperature = f.file_temperature;
     edit_.AddFile(f.picked_level, f_metadata);
   }
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 4c687a9e42b..8b152ee3e0a 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -1008,7 +1008,7 @@ Status FlushJob::WriteLevel0Table() {
                     meta_.file_creation_time, meta_.epoch_number,
                     meta_.file_checksum, meta_.file_checksum_func_name,
                     meta_.unique_id, meta_.compensated_range_deletion_size,
-                    meta_.tail_start_offset);
+                    meta_.tail_size);
     edit_->SetBlobFileAdditions(std::move(blob_file_additions));
   }
   // Piggyback FlushJobInfo on the first first flushed memtable.
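Note: the tail_size derivation added above in ExternalSstFileIngestionJob::Run() is duplicated in ImportColumnFamilyJob::Run() and in the Repairer (both below). A condensed restatement of the shared rule, written as a free function with a hypothetical name that is not part of this patch:

    #include <cassert>
    #include <cstdint>

    // A tail_start_offset of 0 is ambiguous: either the file predates the
    // property (tail size unknown), or the file contains no data blocks at
    // all, in which case the tail legitimately starts at offset 0. A file
    // whose entries are exclusively range deletions has no data blocks,
    // which is what the num_entries == num_range_deletions check detects.
    uint64_t DeriveTailSize(uint64_t file_size, uint64_t tail_start_offset,
                            uint64_t num_entries,
                            uint64_t num_range_deletions) {
      const bool contain_no_data_blocks =
          num_entries > 0 && (num_entries == num_range_deletions);
      if (tail_start_offset > 0 || contain_no_data_blocks) {
        assert(tail_start_offset <= file_size);
        return file_size - tail_start_offset;
      }
      return 0;  // Unknown; table open falls back to heuristics.
    }

Routing the three copies through one such helper would keep the ambiguity handling in a single place.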
diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc
index 5428d7631de..f76d0ac1b90 100644
--- a/db/import_column_family_job.cc
+++ b/db/import_column_family_job.cc
@@ -138,6 +138,16 @@ Status ImportColumnFamilyJob::Run() {
     const auto& f = files_to_import_[i];
     const auto& file_metadata = metadata_[i];

+    uint64_t tail_size = 0;
+    bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
+                                  (f.table_properties.num_entries ==
+                                   f.table_properties.num_range_deletions);
+    if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
+      uint64_t file_size = f.fd.GetFileSize();
+      assert(f.table_properties.tail_start_offset <= file_size);
+      tail_size = file_size - f.table_properties.tail_start_offset;
+    }
+
     VersionEdit dummy_version_edit;
     dummy_version_edit.AddFile(
         file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
@@ -145,8 +155,7 @@ Status ImportColumnFamilyJob::Run() {
         file_metadata.smallest_seqno, file_metadata.largest_seqno, false,
         file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time,
         current_time, file_metadata.epoch_number, kUnknownFileChecksum,
-        kUnknownFileChecksumFuncName, f.unique_id, 0,
-        f.table_properties.tail_start_offset);
+        kUnknownFileChecksumFuncName, f.unique_id, 0, tail_size);
     s = dummy_version_builder.Apply(&dummy_version_edit);
   }
   if (s.ok()) {
diff --git a/db/repair.cc b/db/repair.cc
index 8dbacb41e4c..67513cacc4c 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -552,7 +552,15 @@ class Repairer {
       t->meta.oldest_ancester_time = props->creation_time;
     }
     if (status.ok()) {
-      t->meta.tail_start_offset = props->tail_start_offset;
+      uint64_t tail_size = 0;
+      bool contain_no_data_blocks =
+          props->num_entries > 0 &&
+          (props->num_entries == props->num_range_deletions);
+      if (props->tail_start_offset > 0 || contain_no_data_blocks) {
+        assert(props->tail_start_offset <= file_size);
+        tail_size = file_size - props->tail_start_offset;
+      }
+      t->meta.tail_size = tail_size;
     }
     ColumnFamilyData* cfd = nullptr;
     if (status.ok()) {
@@ -685,8 +693,7 @@ class Repairer {
             table->meta.oldest_ancester_time, table->meta.file_creation_time,
             table->meta.epoch_number, table->meta.file_checksum,
             table->meta.file_checksum_func_name, table->meta.unique_id,
-            table->meta.compensated_range_deletion_size,
-            table->meta.tail_start_offset);
+            table->meta.compensated_range_deletion_size, table->meta.tail_size);
       }
       s = dummy_version_builder.Apply(&dummy_edit);
       if (s.ok()) {
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 560f253847c..73fa07c6d54 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -146,11 +146,7 @@ Status TableCache::GetTableReader(
           false /* force_direct_prefetch */, level, block_cache_tracer_,
           max_file_size_for_l0_meta_pin, db_session_id_,
           file_meta.fd.GetNumber(), expected_unique_id,
-          file_meta.fd.largest_seqno, file_meta.tail_start_offset,
-          file_meta.num_entries > 0 &&
-                  (file_meta.num_entries == file_meta.num_range_deletions)
-              ? true
-              : false /* contain_no_data_block */),
+          file_meta.fd.largest_seqno, file_meta.tail_size),
       std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
       prefetch_index_and_filter_in_cache);
   TEST_SYNC_POINT("TableCache::GetTableReader:0");
diff --git a/db/version_edit.cc b/db/version_edit.cc
index c371228aa59..4f1ae80d21e 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -238,11 +238,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
                   f.compensated_range_deletion_size);
       PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
     }
-    if (f.tail_start_offset) {
-      PutVarint32(dst, NewFileCustomTag::kTailStartOffset);
-      std::string varint_tail_start_offset;
-      PutVarint64(&varint_tail_start_offset, f.tail_start_offset);
-      PutLengthPrefixedSlice(dst, Slice(varint_tail_start_offset));
+    if (f.tail_size) {
+      PutVarint32(dst, NewFileCustomTag::kTailSize);
+      std::string varint_tail_size;
+      PutVarint64(&varint_tail_size, f.tail_size);
+      PutLengthPrefixedSlice(dst, Slice(varint_tail_size));
     }

     TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
@@ -422,8 +422,8 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
            return "Invalid compensated range deletion size";
          }
          break;
-        case kTailStartOffset:
-          if (!GetVarint64(&field, &f.tail_start_offset)) {
+        case kTailSize:
+          if (!GetVarint64(&field, &f.tail_size)) {
            return "invalid tail start offset";
          }
          break;
@@ -862,8 +862,8 @@ std::string VersionEdit::DebugString(bool hex_key) const {
         InternalUniqueIdToExternal(&id);
         r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id)));
       }
-      r.append(" tail start offset:");
-      AppendNumberTo(&r, f.tail_start_offset);
+      r.append(" tail size:");
+      AppendNumberTo(&r, f.tail_size);
     }

     for (const auto& blob_file_addition : blob_file_additions_) {
@@ -979,7 +979,7 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
       // permanent
       jw << "Temperature" << static_cast<int>(f.temperature);
     }
-    jw << "TailStartOffset" << f.tail_start_offset;
+    jw << "TailSize" << f.tail_size;
     jw.EndArrayedObject();
   }

diff --git a/db/version_edit.h b/db/version_edit.h
index d017ef9be24..07e8f37742c 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -90,7 +90,7 @@ enum NewFileCustomTag : uint32_t {
   kUniqueId = 12,
   kEpochNumber = 13,
   kCompensatedRangeDeletionSize = 14,
-  kTailStartOffset = 15,
+  kTailSize = 15,

   // If this bit for the custom tag is set, opening DB should fail if
   // we don't know this field.
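Note: kTailSize above slots into the NewFile4 custom-field scheme, which is what keeps the manifest change backward compatible: each custom field is encoded as a varint32 tag followed by a length-prefixed payload, so older binaries can skip tags they do not understand (only tags carrying the "cannot safely ignore" mask bit fail the open, per the comment above). A sketch of the round trip using the same coding helpers the patch uses; the surrounding scaffolding is illustrative, not code from this patch:

    // Encode, as VersionEdit::EncodeTo does for f.tail_size:
    // <varint32 tag><length-prefixed varint64 payload>.
    std::string dst;
    PutVarint32(&dst, NewFileCustomTag::kTailSize);
    std::string varint_tail_size;
    PutVarint64(&varint_tail_size, tail_size);
    PutLengthPrefixedSlice(&dst, Slice(varint_tail_size));

    // Decode, as VersionEdit::DecodeNewFile4From does: read the tag,
    // slice out the payload, then read the varint64 from it.
    Slice input(dst);
    uint32_t custom_tag = 0;
    Slice field;
    uint64_t decoded_tail_size = 0;
    bool ok = GetVarint32(&input, &custom_tag) &&
              custom_tag == NewFileCustomTag::kTailSize &&
              GetLengthPrefixedSlice(&input, &field) &&
              GetVarint64(&field, &decoded_tail_size);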
@@ -239,9 +239,9 @@ struct FileMetaData {
   // SST unique id
   UniqueId64x2 unique_id{};

-  // Offset where the "tail" part of SST file starts
+  // Size of the "tail" part of a SST file
   // "Tail" refers to all blocks after data blocks till the end of the SST file
-  uint64_t tail_start_offset = 0;
+  uint64_t tail_size = 0;

   FileMetaData() = default;

@@ -255,7 +255,7 @@ struct FileMetaData {
                const std::string& _file_checksum_func_name,
                UniqueId64x2 _unique_id,
                const uint64_t _compensated_range_deletion_size,
-               uint64_t _tail_start_offset)
+               uint64_t _tail_size)
       : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
         smallest(smallest_key),
         largest(largest_key),
@@ -269,7 +269,7 @@ struct FileMetaData {
         file_checksum(_file_checksum),
         file_checksum_func_name(_file_checksum_func_name),
         unique_id(std::move(_unique_id)),
-        tail_start_offset(_tail_start_offset) {
+        tail_size(_tail_size) {
     TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
   }

@@ -454,7 +454,7 @@ class VersionEdit {
               const std::string& file_checksum_func_name,
               const UniqueId64x2& unique_id,
               const uint64_t compensated_range_deletion_size,
-              uint64_t tail_start_offset) {
+              uint64_t tail_size) {
     assert(smallest_seqno <= largest_seqno);
     new_files_.emplace_back(
         level,
@@ -463,7 +463,7 @@ class VersionEdit {
                      temperature, oldest_blob_file_number, oldest_ancester_time,
                      file_creation_time, epoch_number, file_checksum,
                      file_checksum_func_name, unique_id,
-                     compensated_range_deletion_size, tail_start_offset));
+                     compensated_range_deletion_size, tail_size));
     if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
       SetLastSequence(largest_seqno);
     }
diff --git a/db/version_set.cc b/db/version_set.cc
index 392081535aa..436389b81b1 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -6346,14 +6346,14 @@ Status VersionSet::WriteCurrentStateToManifest(
       for (const auto& f : level_files) {
         assert(f);

-        edit.AddFile(
-            level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
-            f->smallest, f->largest, f->fd.smallest_seqno,
-            f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
-            f->oldest_blob_file_number, f->oldest_ancester_time,
-            f->file_creation_time, f->epoch_number, f->file_checksum,
-            f->file_checksum_func_name, f->unique_id,
-            f->compensated_range_deletion_size, f->tail_start_offset);
+        edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
+                     f->fd.GetFileSize(), f->smallest, f->largest,
+                     f->fd.smallest_seqno, f->fd.largest_seqno,
+                     f->marked_for_compaction, f->temperature,
+                     f->oldest_blob_file_number, f->oldest_ancester_time,
+                     f->file_creation_time, f->epoch_number, f->file_checksum,
+                     f->file_checksum_func_name, f->unique_id,
+                     f->compensated_range_deletion_size, f->tail_size);
       }
     }
diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc
index b1caccc67fc..053caacd09e 100644
--- a/file/prefetch_test.cc
+++ b/file/prefetch_test.cc
@@ -261,6 +261,7 @@ TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) {
   Status s = TryReopen(options);
   if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
     // If direct IO is not supported, skip the test
+    ROCKSDB_GTEST_BYPASS("Direct IO is not supported");
     return;
   } else {
     ASSERT_OK(s);
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 309f392a004..3d7a08e7755 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -341,6 +341,10 @@ struct BlockBasedTableBuilder::Rep {
   std::unique_ptr<ParallelCompressionRep> pc_rep;
   BlockCreateContext create_context;

+  // The size of the "tail" part of a SST file. "Tail" refers to
+  // all blocks after data blocks till the end of the SST file.
+  uint64_t tail_size;
+
   uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
   void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }

@@ -456,6 +460,7 @@ struct BlockBasedTableBuilder::Rep {
                       !use_delta_encoding_for_index_values,
                       table_opt.index_type ==
                           BlockBasedTableOptions::kBinarySearchWithFirstKey),
+        tail_size(0),
         status_ok(true),
         io_status_ok(true) {
     if (tbo.target_file_size == 0) {
@@ -1908,9 +1913,8 @@ Status BlockBasedTableBuilder::Finish() {
     }
   }

-  if (ok()) {
-    r->props.tail_start_offset = r->offset;
-  }
+  r->props.tail_start_offset = r->offset;
+
   // Write meta blocks, metaindex block and footer in the following order.
   //    1. [meta block: filter]
   //    2. [meta block: index]
@@ -1938,6 +1942,7 @@ Status BlockBasedTableBuilder::Finish() {
   r->SetStatus(r->CopyIOStatus());
   Status ret_status = r->CopyStatus();
   assert(!ret_status.ok() || io_status().ok());
+  r->tail_size = r->offset - r->props.tail_start_offset;
   return ret_status;
 }

@@ -1971,9 +1976,7 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
     }
   }

-uint64_t BlockBasedTableBuilder::GetTailStartOffset() const {
-  return rep_->props.tail_start_offset;
-}
+uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; }

 bool BlockBasedTableBuilder::NeedCompact() const {
   for (const auto& collector : rep_->table_properties_collectors) {
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index f5bf48b8736..3949474c580 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -91,9 +91,9 @@ class BlockBasedTableBuilder : public TableBuilder {
   // is enabled.
   uint64_t EstimatedFileSize() const override;

-  // Get the offset where the "tail" part of a SST file starts. "Tail" refers to
+  // Get the size of the "tail" part of a SST file. "Tail" refers to
   // all blocks after data blocks till the end of the SST file.
-  uint64_t GetTailStartOffset() const override;
+  uint64_t GetTailSize() const override;

   bool NeedCompact() const override;
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 1302ad29fbe..653c222d5e4 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -568,8 +568,7 @@ Status BlockBasedTableFactory::NewTableReader(
       ro, table_reader_options.ioptions, table_reader_options.env_options,
       table_options_, table_reader_options.internal_comparator, std::move(file),
       file_size, table_reader_options.block_protection_bytes_per_key,
-      table_reader, table_reader_options.tail_start_offset,
-      table_reader_options.contain_no_data_block, table_reader_cache_res_mgr_,
+      table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_,
       table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
       table_reader_options.skip_filters, table_reader_options.level,
       table_reader_options.immortal, table_reader_options.largest_seqno,
diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h
index f43183caf28..1f787697772 100644
--- a/table/block_based/block_based_table_factory.h
+++ b/table/block_based/block_based_table_factory.h
@@ -29,7 +29,7 @@ class RandomAccessFileReader;
 class WritableFileWriter;

 // TODO: deprecate this class as it can be replaced with
-// `FileMetaData::tail_start_offset`
+// `FileMetaData::tail_size`
 //
 // A class used to track actual bytes written from the tail in the recent SST
 // file opens, and provide a suggestion for following open.
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 1a53a77d07d..a5e8e701467 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -561,8 +561,7 @@ Status BlockBasedTable::Open(
     const InternalKeyComparator& internal_comparator,
     std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
     uint8_t block_protection_bytes_per_key,
-    std::unique_ptr<TableReader>* table_reader, uint64_t tail_start_offset,
-    bool contain_no_data_block,
+    std::unique_ptr<TableReader>* table_reader, uint64_t tail_size,
     std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
     const std::shared_ptr<const SliceTransform>& prefix_extractor,
     const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
@@ -594,8 +593,8 @@ Status BlockBasedTable::Open(
   if (!ioptions.allow_mmap_reads) {
     s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch,
                      tail_prefetch_stats, prefetch_all, preload_all,
-                     &prefetch_buffer, ioptions.stats, tail_start_offset,
-                     contain_no_data_block, ioptions.logger);
+                     &prefetch_buffer, ioptions.stats, tail_size,
+                     ioptions.logger);
     // Return error in prefetch path to users.
    if (!s.ok()) {
       return s;
     }
@@ -810,20 +809,12 @@ Status BlockBasedTable::PrefetchTail(
     bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
     const bool prefetch_all, const bool preload_all,
     std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats,
-    uint64_t tail_start_offset, bool contain_no_data_block,
-    Logger* const logger) {
-  assert(tail_start_offset <= file_size);
+    uint64_t tail_size, Logger* const logger) {
+  assert(tail_size <= file_size);

   size_t tail_prefetch_size = 0;
-  if (tail_start_offset != 0) {
-    tail_prefetch_size = file_size - tail_start_offset;
-  } else if (contain_no_data_block) {
-    tail_prefetch_size = file_size - tail_start_offset;
-    ROCKS_LOG_WARN(logger,
-                   "Tail prefetch size %zu is calculated based on tail start "
-                   "offset being 0 and the "
-                   "file does not contain any data blocks",
-                   tail_prefetch_size);
+  if (tail_size != 0) {
+    tail_prefetch_size = tail_size;
   } else {
     if (tail_prefetch_stats != nullptr) {
       // Multiple threads may get a 0 (no history) when running in parallel,
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 71c8a8fd3ec..0f14e05dd14 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -99,8 +99,7 @@ class BlockBasedTable : public TableReader {
       const InternalKeyComparator& internal_key_comparator,
       std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
       uint8_t block_protection_bytes_per_key,
-      std::unique_ptr<TableReader>* table_reader, uint64_t tail_start_offset,
-      bool contain_no_data_block,
+      std::unique_ptr<TableReader>* table_reader, uint64_t tail_size,
       std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
           nullptr,
       const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
@@ -461,8 +460,7 @@ class BlockBasedTable : public TableReader {
       bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
       const bool prefetch_all, const bool preload_all,
       std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats,
-      uint64_t tail_start_offset, bool contain_no_data_block,
-      Logger* const logger);
+      uint64_t tail_size, Logger* const logger);
   Status ReadMetaIndexBlock(const ReadOptions& ro,
                             FilePrefetchBuffer* prefetch_buffer,
                             std::unique_ptr<Block>* metaindex_block,
diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc
index 9d09b2ae540..c2f6552cc4b 100644
--- a/table/block_fetcher_test.cc
+++ b/table/block_fetcher_test.cc
@@ -266,10 +266,10 @@ class BlockFetcherTest : public testing::Test {
     const auto* table_options = table_factory_.GetOptions<BlockBasedTableOptions>();
     ASSERT_NE(table_options, nullptr);
-    ASSERT_OK(BlockBasedTable::Open(
-        ro, ioptions, EnvOptions(), *table_options, comparator, std::move(file),
-        file_size, 0 /* block_protection_bytes_per_key */, &table_reader,
-        0 /* tail_start_offset */, false /* contain_no_data_block */));
+    ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options,
+                                    comparator, std::move(file), file_size,
+                                    0 /* block_protection_bytes_per_key */,
+                                    &table_reader, 0 /* tail_size */));

     table->reset(reinterpret_cast<BlockBasedTable*>(table_reader.release()));
   }
diff --git a/table/table_builder.h b/table/table_builder.h
index d8a62df7b1a..6d98bce7078 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -43,7 +43,7 @@ struct TableReaderOptions {
       size_t _max_file_size_for_l0_meta_pin = 0,
       const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0,
       UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0,
-      uint64_t _tail_start_offset = 0, bool _contain_no_data_block = false)
+      uint64_t _tail_size = 0)
       : ioptions(_ioptions),
        prefix_extractor(_prefix_extractor),
         env_options(_env_options),
@@ -59,8 +59,7 @@ struct TableReaderOptions {
         cur_file_num(_cur_file_num),
         unique_id(_unique_id),
         block_protection_bytes_per_key(_block_protection_bytes_per_key),
-        tail_start_offset(_tail_start_offset),
-        contain_no_data_block(_contain_no_data_block) {}
+        tail_size(_tail_size) {}

   const ImmutableOptions& ioptions;
   const std::shared_ptr<const SliceTransform>& prefix_extractor;
@@ -93,9 +92,7 @@ struct TableReaderOptions {

   uint8_t block_protection_bytes_per_key;

-  uint64_t tail_start_offset;
-
-  bool contain_no_data_block;
+  uint64_t tail_size;
 };

 struct TableBuilderOptions {
@@ -207,7 +204,7 @@ class TableBuilder {
   // is enabled.
   virtual uint64_t EstimatedFileSize() const { return FileSize(); }

-  virtual uint64_t GetTailStartOffset() const { return 0; }
+  virtual uint64_t GetTailSize() const { return 0; }

   // If the user defined table properties collector suggest the file to
   // be further compacted.
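Note on the net effect, reader side: BlockBasedTable::PrefetchTail now receives an exact tail_size for files whose manifest entry carries the new field, and only files written by older versions (tail_size == 0) fall through to the TailPrefetchStats heuristic. A condensed sketch of the resulting decision; the fallback constants and GetSuggestedPrefetchSize() mirror the pre-existing heuristic path rather than anything introduced by this patch:

    // Returns the number of bytes to prefetch from the end of the file
    // when opening a block-based table.
    size_t ChooseTailPrefetchSize(uint64_t tail_size, bool prefetch_all,
                                  bool preload_all,
                                  TailPrefetchStats* tail_prefetch_stats) {
      if (tail_size != 0) {
        // Exact size recorded at build time: everything from the first
        // byte past the data blocks to the end of the file.
        return static_cast<size_t>(tail_size);
      }
      // Legacy fallback: a suggestion learned from recent file opens...
      size_t suggested = 0;
      if (tail_prefetch_stats != nullptr) {
        suggested = tail_prefetch_stats->GetSuggestedPrefetchSize();
      }
      if (suggested == 0) {
        // ...else a static default, reading more when index/filter blocks
        // will be loaded as part of this open.
        suggested = (prefetch_all || preload_all) ? 512 * 1024 : 4 * 1024;
      }
      return suggested;
    }

The win is that an exact size avoids both under-prefetch (a second I/O to finish reading the metadata) and over-prefetch (wasted read bandwidth), either of which the heuristic could produce.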