diff --git a/HISTORY.md b/HISTORY.md index 9b5e6027992..96170b41ba9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,6 +10,9 @@ ### Bug Fixes * Delete an empty WAL file on DB open if the log number is less than the min log number to keep +### Performance Improvements +* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406). + ## 8.2.0 (04/24/2023) ### Public API Changes * `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. diff --git a/db/builder.cc b/db/builder.cc index eadc315c9aa..a99bb57e8c2 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -291,6 +291,7 @@ Status BuildTable( if (s.ok() && !empty) { uint64_t file_size = builder->FileSize(); meta->fd.file_size = file_size; + meta->tail_size = builder->GetTailSize(); meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); tp = builder diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 3f7966404f7..a7cf65f0131 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -386,7 +386,7 @@ class CompactionJobTestBase : public testing::Test { kUnknownFileCreationTime, versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(), kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0); + 0, 0); mutex_.Lock(); EXPECT_OK(versions_->LogAndApply( diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index cf5105e4108..f6e0dbd7d43 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -43,6 +43,7 @@ Status CompactionOutputs::Finish(const Status& intput_status, const uint64_t current_bytes = builder_->FileSize(); if (s.ok()) { meta->fd.file_size = current_bytes; + meta->tail_size = builder_->GetTailSize(); meta->marked_for_compaction = builder_->NeedCompact(); } current_output().finished = true; diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index c9cbb09ed37..a5a14ac21a7 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -148,7 +148,7 @@ class CompactionPickerTestBase : public testing::Test { smallest_seq, largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; f->oldest_ancester_time = oldest_ancestor_time; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index b8ab5eb7e7a..39a8339199e 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1758,7 +1758,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -3457,7 +3457,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); ROCKS_LOG_BUFFER( log_buffer, diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 5e3b7ba6155..8d958ffc17d 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -138,7 +138,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index a9b42d62ab4..5bad1e12bdf 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -616,7 +616,8 @@ Status DBImpl::Recover( f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, - f->unique_id, f->compensated_range_deletion_size); + f->unique_id, f->compensated_range_deletion_size, + f->tail_size); ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] Moving #%" PRIu64 " from from_level-%d to from_level-%d %" PRIu64 @@ -1673,14 +1674,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, constexpr int level = 0; if (s.ok() && has_output) { - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.fd.smallest_seqno, meta.fd.largest_seqno, - meta.marked_for_compaction, meta.temperature, - meta.oldest_blob_file_number, meta.oldest_ancester_time, - meta.file_creation_time, meta.epoch_number, - meta.file_checksum, meta.file_checksum_func_name, - meta.unique_id, meta.compensated_range_deletion_size); + edit->AddFile( + level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.epoch_number, meta.file_checksum, + meta.file_checksum_func_name, meta.unique_id, + meta.compensated_range_deletion_size, meta.tail_size); for (const auto& blob : blob_file_additions) { edit->AddBlobFile(blob); diff --git a/db/db_test2.cc b/db/db_test2.cc index 544d9b299d6..43aea21813e 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5298,88 +5298,6 @@ TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBTest2, TestBBTTailPrefetch) { - std::atomic called(false); - size_t expected_lower_bound = 512 * 1024; - size_t expected_higher_bound = 512 * 1024; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { - size_t* prefetch_size = static_cast(arg); - EXPECT_LE(expected_lower_bound, *prefetch_size); - EXPECT_GE(expected_higher_bound, *prefetch_size); - called = true; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - expected_lower_bound = 0; - expected_higher_bound = 8 * 1024; - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - // Full compaction to make sure there is no L0 file after the open. - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - ASSERT_TRUE(called.load()); - called = false; - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - - std::atomic first_call(true); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { - size_t* prefetch_size = static_cast(arg); - if (first_call) { - EXPECT_EQ(4 * 1024, *prefetch_size); - first_call = false; - } else { - EXPECT_GE(4 * 1024, *prefetch_size); - } - called = true; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Options options = CurrentOptions(); - options.max_file_opening_threads = 1; // one thread - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.max_open_files = -1; - Reopen(options); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_TRUE(called.load()); - called = false; - - // Parallel loading SST files - options.max_file_opening_threads = 16; - Reopen(options); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - ASSERT_TRUE(called.load()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -} - TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { // Setup sync point dependency to reproduce the race condition of // DBImpl::GetColumnFamilyHandleUnlocked diff --git a/db/experimental.cc b/db/experimental.cc index c2dce7fde46..cb5fb317945 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -102,7 +102,7 @@ Status UpdateManifestForFilesState( lf->oldest_blob_file_number, lf->oldest_ancester_time, lf->file_creation_time, lf->epoch_number, lf->file_checksum, lf->file_checksum_func_name, lf->unique_id, - lf->compensated_range_deletion_size); + lf->compensated_range_deletion_size, lf->tail_size); } } } else { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index ca9b6fb9bbe..f8716e5f487 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -464,6 +464,16 @@ Status ExternalSstFileIngestionJob::Run() { current_time = oldest_ancester_time = static_cast(temp_current_time); } + uint64_t tail_size = 0; + bool contain_no_data_blocks = f.table_properties.num_entries > 0 && + (f.table_properties.num_entries == + f.table_properties.num_range_deletions); + if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { + uint64_t file_size = f.fd.GetFileSize(); + assert(f.table_properties.tail_start_offset <= file_size); + tail_size = file_size - f.table_properties.tail_start_offset; + } + FileMetaData f_metadata( f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, @@ -472,7 +482,7 @@ Status ExternalSstFileIngestionJob::Run() { ingestion_options_.ingest_behind ? kReservedEpochNumberForFileIngestedBehind : cfd_->NewEpochNumber(), - f.file_checksum, f.file_checksum_func_name, f.unique_id, 0); + f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size); f_metadata.temperature = f.file_temperature; edit_.AddFile(f.picked_level, f_metadata); } diff --git a/db/flush_job.cc b/db/flush_job.cc index a3ffc270714..8b152ee3e0a 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -1007,7 +1007,8 @@ Status FlushJob::WriteLevel0Table() { meta_.oldest_blob_file_number, meta_.oldest_ancester_time, meta_.file_creation_time, meta_.epoch_number, meta_.file_checksum, meta_.file_checksum_func_name, - meta_.unique_id, meta_.compensated_range_deletion_size); + meta_.unique_id, meta_.compensated_range_deletion_size, + meta_.tail_size); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } // Piggyback FlushJobInfo on the first first flushed memtable. diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 9a8b48dd054..f76d0ac1b90 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -138,6 +138,16 @@ Status ImportColumnFamilyJob::Run() { const auto& f = files_to_import_[i]; const auto& file_metadata = metadata_[i]; + uint64_t tail_size = 0; + bool contain_no_data_blocks = f.table_properties.num_entries > 0 && + (f.table_properties.num_entries == + f.table_properties.num_range_deletions); + if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { + uint64_t file_size = f.fd.GetFileSize(); + assert(f.table_properties.tail_start_offset <= file_size); + tail_size = file_size - f.table_properties.tail_start_offset; + } + VersionEdit dummy_version_edit; dummy_version_edit.AddFile( file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), @@ -145,7 +155,7 @@ Status ImportColumnFamilyJob::Run() { file_metadata.smallest_seqno, file_metadata.largest_seqno, false, file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time, current_time, file_metadata.epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, f.unique_id, 0); + kUnknownFileChecksumFuncName, f.unique_id, 0, tail_size); s = dummy_version_builder.Apply(&dummy_version_edit); } if (s.ok()) { diff --git a/db/repair.cc b/db/repair.cc index 633c348a5c3..67513cacc4c 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -551,6 +551,17 @@ class Repairer { } t->meta.oldest_ancester_time = props->creation_time; } + if (status.ok()) { + uint64_t tail_size = 0; + bool contain_no_data_blocks = + props->num_entries > 0 && + (props->num_entries == props->num_range_deletions); + if (props->tail_start_offset > 0 || contain_no_data_blocks) { + assert(props->tail_start_offset <= file_size); + tail_size = file_size - props->tail_start_offset; + } + t->meta.tail_size = tail_size; + } ColumnFamilyData* cfd = nullptr; if (status.ok()) { cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id); @@ -682,7 +693,7 @@ class Repairer { table->meta.oldest_ancester_time, table->meta.file_creation_time, table->meta.epoch_number, table->meta.file_checksum, table->meta.file_checksum_func_name, table->meta.unique_id, - table->meta.compensated_range_deletion_size); + table->meta.compensated_range_deletion_size, table->meta.tail_size); } s = dummy_version_builder.Apply(&dummy_edit); if (s.ok()) { diff --git a/db/table_cache.cc b/db/table_cache.cc index c288ec8c7fd..73fa07c6d54 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -140,13 +140,13 @@ Status TableCache::GetTableReader( } s = ioptions_.table_factory->NewTableReader( ro, - TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, block_protection_bytes_per_key, - skip_filters, immortal_tables_, - false /* force_direct_prefetch */, level, - block_cache_tracer_, max_file_size_for_l0_meta_pin, - db_session_id_, file_meta.fd.GetNumber(), - expected_unique_id, file_meta.fd.largest_seqno), + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, + block_protection_bytes_per_key, skip_filters, immortal_tables_, + false /* force_direct_prefetch */, level, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, + file_meta.fd.GetNumber(), expected_unique_id, + file_meta.fd.largest_seqno, file_meta.tail_size), std::move(file_reader), file_meta.fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 611dee774b0..af41f00f6c8 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -73,7 +73,7 @@ class VersionBuilderTest : public testing::Test { /* marked_for_compact */ false, Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -136,7 +136,7 @@ class VersionBuilderTest : public testing::Test { Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); } void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) { @@ -183,11 +183,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -230,11 +231,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -281,11 +283,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -317,31 +320,36 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); EnvOptions env_options; constexpr TableCache* table_cache = nullptr; @@ -376,46 +384,53 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { kCompactionStyleLevel, nullptr, false); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(version_builder.Apply(&version_edit)); VersionEdit version_edit2; - version_edit.AddFile( - 2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), + GetInternalKey("950"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); - version_edit.AddFile( - 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), + GetInternalKey("850"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(version_builder.Apply(&version_edit2)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -520,13 +535,14 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { constexpr bool marked_for_compaction = false; - addition.AddFile( - level, file_number, path_id, file_size, - GetInternalKey(smallest, smallest_seq), - GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&addition)); @@ -570,12 +586,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { constexpr SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; - edit.AddFile( - new_level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); const Status s = builder.Apply(&edit); ASSERT_TRUE(s.IsCorruption()); @@ -606,12 +623,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { constexpr SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; - edit.AddFile( - level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&edit)); @@ -619,12 +637,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { constexpr int new_level = 2; - other_edit.AddFile( - new_level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + other_edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); const Status s = builder.Apply(&other_edit); ASSERT_TRUE(s.IsCorruption()); @@ -655,12 +674,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { VersionEdit addition; - addition.AddFile( - level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&addition)); @@ -1233,7 +1253,7 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /*epoch_number*/, - checksum_value, checksum_method, kNullUniqueId64x2, 0); + checksum_value, checksum_method, kNullUniqueId64x2, 0, 0); edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, checksum_value); @@ -1321,7 +1341,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("801"), @@ -1331,7 +1351,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, /* total_blob_bytes */ 200000, /* checksum_method */ std::string(), @@ -1552,7 +1572,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Add an SST that does not reference any blob files. edit.AddFile( @@ -1562,7 +1582,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 2200, /* marked_for_compaction */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Delete a file that references a blob file. edit.DeleteFile(/* level */ 1, /* file_number */ 6); @@ -1585,7 +1605,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); // Trivially move a file that does not reference any blob files. edit.DeleteFile(/* level */ 1, /* file_number */ 13); @@ -1597,7 +1617,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Add one more SST file that references a blob file, then promptly // delete it in a second version edit before the new version gets saved. @@ -1611,7 +1631,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); VersionEdit edit2; @@ -1712,7 +1732,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); version_edit_1.AddFile( /* level */ 0, /* file_number */ 2U, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), @@ -1722,7 +1742,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); VersionBuilder version_builder_1(EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, @@ -1749,7 +1769,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); version_edit_2.AddFile( /* level */ 0, /* file_number */ 2U, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), @@ -1759,7 +1779,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); VersionBuilder version_builder_2(EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, diff --git a/db/version_edit.cc b/db/version_edit.cc index ecddaa49ccd..4f1ae80d21e 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -238,6 +238,12 @@ bool VersionEdit::EncodeTo(std::string* dst) const { f.compensated_range_deletion_size); PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size)); } + if (f.tail_size) { + PutVarint32(dst, NewFileCustomTag::kTailSize); + std::string varint_tail_size; + PutVarint64(&varint_tail_size, f.tail_size); + PutLengthPrefixedSlice(dst, Slice(varint_tail_size)); + } TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); @@ -416,6 +422,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "Invalid compensated range deletion size"; } break; + case kTailSize: + if (!GetVarint64(&field, &f.tail_size)) { + return "invalid tail start offset"; + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -851,6 +862,8 @@ std::string VersionEdit::DebugString(bool hex_key) const { InternalUniqueIdToExternal(&id); r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id))); } + r.append(" tail size:"); + AppendNumberTo(&r, f.tail_size); } for (const auto& blob_file_addition : blob_file_additions_) { @@ -966,6 +979,7 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { // permanent jw << "Temperature" << static_cast(f.temperature); } + jw << "TailSize" << f.tail_size; jw.EndArrayedObject(); } diff --git a/db/version_edit.h b/db/version_edit.h index 65c7fc43ab7..07e8f37742c 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -90,6 +90,7 @@ enum NewFileCustomTag : uint32_t { kUniqueId = 12, kEpochNumber = 13, kCompensatedRangeDeletionSize = 14, + kTailSize = 15, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. @@ -238,6 +239,10 @@ struct FileMetaData { // SST unique id UniqueId64x2 unique_id{}; + // Size of the "tail" part of a SST file + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_size = 0; + FileMetaData() = default; FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, @@ -249,7 +254,8 @@ struct FileMetaData { uint64_t _epoch_number, const std::string& _file_checksum, const std::string& _file_checksum_func_name, UniqueId64x2 _unique_id, - const uint64_t _compensated_range_deletion_size) + const uint64_t _compensated_range_deletion_size, + uint64_t _tail_size) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), @@ -262,7 +268,8 @@ struct FileMetaData { epoch_number(_epoch_number), file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name), - unique_id(std::move(_unique_id)) { + unique_id(std::move(_unique_id)), + tail_size(_tail_size) { TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); } @@ -446,7 +453,8 @@ class VersionEdit { uint64_t epoch_number, const std::string& file_checksum, const std::string& file_checksum_func_name, const UniqueId64x2& unique_id, - const uint64_t compensated_range_deletion_size) { + const uint64_t compensated_range_deletion_size, + uint64_t tail_size) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( level, @@ -455,7 +463,7 @@ class VersionEdit { temperature, oldest_blob_file_number, oldest_ancester_time, file_creation_time, epoch_number, file_checksum, file_checksum_func_name, unique_id, - compensated_range_deletion_size)); + compensated_range_deletion_size, tail_size)); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); } diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 1fa6c005497..da1a85999c1 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -45,7 +45,7 @@ TEST_F(VersionEditTest, EncodeDecode) { kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, kInvalidBlobFileNumber, 888, 678, kBig + 300 + i /* epoch_number */, "234", "crc32c", - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, kBig + 700 + i); } @@ -65,24 +65,24 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 301 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, 666, 888, 302 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, kBig + 603, true, Temperature::kUnknown, 1001, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 303 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, 700); @@ -123,12 +123,12 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, 686, 868, 301 /* epoch_number */, "234", "crc32c", - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -177,7 +177,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -208,7 +208,7 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /*epoch_number*/, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } diff --git a/db/version_set.cc b/db/version_set.cc index cfe8d617366..436389b81b1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -6353,7 +6353,7 @@ Status VersionSet::WriteCurrentStateToManifest( f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 481dd46d90e..c0f6d1340a4 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -51,7 +51,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { largest_seq, /* marked_for_compact */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); files_.push_back(f); } @@ -163,7 +163,7 @@ class VersionStorageInfoTestBase : public testing::Test { Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, compensated_range_deletion_size); + kNullUniqueId64x2, compensated_range_deletion_size, 0); vstorage_.AddFile(level, f); } @@ -3296,7 +3296,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0); + 0, 0); } } @@ -3353,7 +3353,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3414,7 +3414,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index 304d1c3bb26..77286ddcfa4 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -180,6 +180,8 @@ class FilePrefetchBuffer { RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded); } + bool Enabled() const { return enable_; } + // Load data into the buffer from a file. // reader : the file reader. // offset : the file offset to start reading from. diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index fb31144498c..053caacd09e 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -236,6 +236,93 @@ TEST_P(PrefetchTest, Basic) { Close(); } +TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) { + const bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + // Second param is if directIO is enabled or not + const bool use_direct_io = std::get<1>(GetParam()); + const bool use_file_prefetch_buffer = !support_prefetch || use_direct_io; + + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options; + SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); + + BlockBasedTableOptions bbto; + bbto.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + bbto.partition_filters = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + ROCKSDB_GTEST_BYPASS("Direct IO is not supported"); + return; + } else { + ASSERT_OK(s); + } + + ASSERT_OK(Put("k1", "v1")); + + HistogramData pre_flush_file_read; + options.statistics->histogramData(FILE_READ_FLUSH_MICROS, + &pre_flush_file_read); + ASSERT_OK(Flush()); + HistogramData post_flush_file_read; + options.statistics->histogramData(FILE_READ_FLUSH_MICROS, + &post_flush_file_read); + if (use_file_prefetch_buffer) { + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // should read from the prefetched tail in file prefetch buffer instead of + // initiating extra SST reads. Therefore `BlockBasedTable::PrefetchTail()` + // should be the only SST read in table verification during flush. + ASSERT_EQ(post_flush_file_read.count - pre_flush_file_read.count, 1); + } else { + // Without the prefetched tail in file prefetch buffer, + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // will initiate extra SST reads + ASSERT_GT(post_flush_file_read.count - pre_flush_file_read.count, 1); + } + ASSERT_OK(Put("k1", "v2")); + ASSERT_OK(Put("k2", "v2")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + HistogramData pre_compaction_file_read; + options.statistics->histogramData(FILE_READ_COMPACTION_MICROS, + &pre_compaction_file_read); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + HistogramData post_compaction_file_read; + options.statistics->histogramData(FILE_READ_COMPACTION_MICROS, + &post_compaction_file_read); + if (use_file_prefetch_buffer) { + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // should read from the prefetched tail in file prefetch buffer instead of + // initiating extra SST reads. + // + // Therefore the 3 reads are + // (1) `ProcessKeyValueCompaction()` of input file 1 + // (2) `ProcessKeyValueCompaction()` of input file 2 + // (3) `BlockBasedTable::PrefetchTail()` of output file during table + // verification in compaction + ASSERT_EQ(post_compaction_file_read.count - pre_compaction_file_read.count, + 3); + } else { + // Without the prefetched tail in file prefetch buffer, + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // as well as reading other parts of the tail (e.g, footer, table + // properties..) will initiate extra SST reads + ASSERT_GT(post_compaction_file_read.count - pre_compaction_file_read.count, + 3); + } + Close(); +} + // This test verifies BlockBasedTableOptions.max_auto_readahead_size is // configured dynamically. TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index cbe87fa3af1..ab259f930be 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -70,6 +70,7 @@ struct TablePropertiesNames { static const std::string kSlowCompressionEstimatedDataSize; static const std::string kFastCompressionEstimatedDataSize; static const std::string kSequenceNumberTimeMapping; + static const std::string kTailStartOffset; }; // `TablePropertiesCollector` provides the mechanism for users to collect @@ -239,6 +240,10 @@ struct TableProperties { // 0 means not exists. uint64_t external_sst_file_global_seqno_offset = 0; + // Offset where the "tail" part of SST file starts + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_start_offset = 0; + // DB identity // db_id is an identifier generated the first time the DB is created // If DB identity is unset or unassigned, `db_id` will be an empty string. diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 5121d1b43d7..3d7a08e7755 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -341,6 +341,10 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr pc_rep; BlockCreateContext create_context; + // The size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. + uint64_t tail_size; + uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } @@ -456,6 +460,7 @@ struct BlockBasedTableBuilder::Rep { !use_delta_encoding_for_index_values, table_opt.index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey), + tail_size(0), status_ok(true), io_status_ok(true) { if (tbo.target_file_size == 0) { @@ -1908,6 +1913,8 @@ Status BlockBasedTableBuilder::Finish() { } } + r->props.tail_start_offset = r->offset; + // Write meta blocks, metaindex block and footer in the following order. // 1. [meta block: filter] // 2. [meta block: index] @@ -1935,6 +1942,7 @@ Status BlockBasedTableBuilder::Finish() { r->SetStatus(r->CopyIOStatus()); Status ret_status = r->CopyStatus(); assert(!ret_status.ok() || io_status().ok()); + r->tail_size = r->offset - r->props.tail_start_offset; return ret_status; } @@ -1968,6 +1976,8 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { } } +uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; } + bool BlockBasedTableBuilder::NeedCompact() const { for (const auto& collector : rep_->table_properties_collectors) { if (collector->NeedCompact()) { diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 32ca30d1b4c..3949474c580 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -91,6 +91,10 @@ class BlockBasedTableBuilder : public TableBuilder { // is enabled. uint64_t EstimatedFileSize() const override; + // Get the size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. + uint64_t GetTailSize() const override; + bool NeedCompact() const override; // Get table properties diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 2bca0703327..653c222d5e4 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -568,7 +568,7 @@ Status BlockBasedTableFactory::NewTableReader( ro, table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader_options.block_protection_bytes_per_key, - table_reader, table_reader_cache_res_mgr_, + table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, table_reader_options.largest_seqno, diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h index 1bf870ba6f9..1f787697772 100644 --- a/table/block_based/block_based_table_factory.h +++ b/table/block_based/block_based_table_factory.h @@ -28,6 +28,9 @@ class BlockBasedTableBuilder; class RandomAccessFileReader; class WritableFileWriter; +// TODO: deprecate this class as it can be replaced with +// `FileMetaData::tail_size` +// // A class used to track actual bytes written from the tail in the recent SST // file opens, and provide a suggestion for following open. class TailPrefetchStats { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index b5144166198..a5e8e701467 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -561,7 +561,7 @@ Status BlockBasedTable::Open( const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, uint8_t block_protection_bytes_per_key, - std::unique_ptr* table_reader, + std::unique_ptr* table_reader, uint64_t tail_size, std::shared_ptr table_reader_cache_res_mgr, const std::shared_ptr& prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, @@ -593,7 +593,8 @@ Status BlockBasedTable::Open( if (!ioptions.allow_mmap_reads) { s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, tail_prefetch_stats, prefetch_all, preload_all, - &prefetch_buffer, ioptions.stats); + &prefetch_buffer, ioptions.stats, tail_size, + ioptions.logger); // Return error in prefetch path to users. if (!s.ok()) { return s; @@ -807,21 +808,40 @@ Status BlockBasedTable::PrefetchTail( const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer, Statistics* stats) { + std::unique_ptr* prefetch_buffer, Statistics* stats, + uint64_t tail_size, Logger* const logger) { + assert(tail_size <= file_size); + size_t tail_prefetch_size = 0; - if (tail_prefetch_stats != nullptr) { - // Multiple threads may get a 0 (no history) when running in parallel, - // but it will get cleared after the first of them finishes. - tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); - } - if (tail_prefetch_size == 0) { - // Before read footer, readahead backwards to prefetch data. Do more - // readahead if we're going to read index/filter. - // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. - // That's because we need to issue readahead before we read the properties, - // at which point we don't yet know the index type. - tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + if (tail_size != 0) { + tail_prefetch_size = tail_size; + } else { + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the + // properties, at which point we don't yet know the index type. + tail_prefetch_size = + prefetch_all || preload_all + ? static_cast(4 * 1024 + 0.01 * file_size) + : 4 * 1024; + + ROCKS_LOG_WARN(logger, + "Tail prefetch size %zu is calculated based on heuristics", + tail_prefetch_size); + } else { + ROCKS_LOG_WARN( + logger, + "Tail prefetch size %zu is calculated based on TailPrefetchStats", + tail_prefetch_size); + } } size_t prefetch_off; size_t prefetch_len; @@ -1140,7 +1160,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks if (prefetch_all || pin_partition) { - s = rep_->index_reader->CacheDependencies(ro, pin_partition); + s = rep_->index_reader->CacheDependencies(ro, pin_partition, + prefetch_buffer); } if (!s.ok()) { return s; @@ -1164,7 +1185,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( if (filter) { // Refer to the comment above about paritioned indexes always being cached if (prefetch_all || pin_partition) { - s = filter->CacheDependencies(ro, pin_partition); + s = filter->CacheDependencies(ro, pin_partition, prefetch_buffer); if (!s.ok()) { return s; } @@ -1984,8 +2005,8 @@ Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options, // `CacheDependencies()` brings all the blocks into cache using one I/O. That // way the full index scan usually finds the index data it is looking for in // cache rather than doing an I/O for each "dependency" (partition). - Status s = - rep_->index_reader->CacheDependencies(read_options, false /* pin */); + Status s = rep_->index_reader->CacheDependencies( + read_options, false /* pin */, nullptr /* prefetch_buffer */); if (!s.ok()) { return s; } diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index dafaa4ebf85..0f14e05dd14 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -99,7 +99,7 @@ class BlockBasedTable : public TableReader { const InternalKeyComparator& internal_key_comparator, std::unique_ptr&& file, uint64_t file_size, uint8_t block_protection_bytes_per_key, - std::unique_ptr* table_reader, + std::unique_ptr* table_reader, uint64_t tail_size, std::shared_ptr table_reader_cache_res_mgr = nullptr, const std::shared_ptr& prefix_extractor = nullptr, @@ -225,8 +225,9 @@ class BlockBasedTable : public TableReader { virtual size_t ApproximateMemoryUsage() const = 0; // Cache the dependencies of the index reader (e.g. the partitions // of a partitioned index). - virtual Status CacheDependencies(const ReadOptions& /*ro*/, - bool /* pin */) { + virtual Status CacheDependencies( + const ReadOptions& /*ro*/, bool /* pin */, + FilePrefetchBuffer* /* tail_prefetch_buffer */) { return Status::OK(); } }; @@ -458,7 +459,8 @@ class BlockBasedTable : public TableReader { const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer, Statistics* stats); + std::unique_ptr* prefetch_buffer, Statistics* stats, + uint64_t tail_size, Logger* const logger); Status ReadMetaIndexBlock(const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block, diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 957413d1067..b14858c0209 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -163,7 +163,9 @@ class FilterBlockReader { return error_msg; } - virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) { + virtual Status CacheDependencies( + const ReadOptions& /*ro*/, bool /*pin*/, + FilePrefetchBuffer* /* tail_prefetch_buffer */) { return Status::OK(); } diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index 569b58fef06..9c2b61e8761 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -439,8 +439,8 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { } // TODO(myabandeh): merge this with the same function in IndexReader -Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, - bool pin) { +Status PartitionedFilterBlockReader::CacheDependencies( + const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) { assert(table()); const BlockBasedTable::Rep* const rep = table()->get_rep(); @@ -484,21 +484,22 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer( - 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, - 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); - - IOOptions opts; - s = rep->file->PrepareIOOptions(ro, opts); - if (s.ok()) { - s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, - static_cast(prefetch_len), - ro.rate_limiter_priority); - } - if (!s.ok()) { - return s; + if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled()) { + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); + + IOOptions opts; + s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len), + ro.rate_limiter_priority); + } + if (!s.ok()) { + return s; + } } - // After prefetch, read the partitions one by one for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { handle = biter.value().handle; @@ -507,7 +508,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, // TODO: Support counter batch update for partitioned index and // filter blocks s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro, + handle, UncompressionDict::GetEmptyDict(), /* for_compaction */ false, &block, nullptr /* get_context */, &lookup_context, nullptr /* contents */, false); if (!s.ok()) { diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 9f7a7be7b91..0e3b4a5b34d 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -166,7 +166,8 @@ class PartitionedFilterBlockReader BlockCacheLookupContext* lookup_context, const ReadOptions& read_options, FilterManyFunction filter_function) const; - Status CacheDependencies(const ReadOptions& ro, bool pin) override; + Status CacheDependencies(const ReadOptions& ro, bool pin, + FilePrefetchBuffer* tail_prefetch_buffer) override; const InternalKeyComparator* internal_comparator() const; bool index_key_includes_seq() const; diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index d1cc88834ea..f322cc910d7 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -112,8 +112,8 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( // the first level iter is always on heap and will attempt to delete it // in its destructor. } -Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, - bool pin) { +Status PartitionIndexReader::CacheDependencies( + const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) { if (!partition_map_.empty()) { // The dependencies are already cached since `partition_map_` is filled in // an all-or-nothing manner. @@ -162,22 +162,23 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle); uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer( - 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, - 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); - IOOptions opts; - { - Status s = rep->file->PrepareIOOptions(ro, opts); - if (s.ok()) { - s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, - static_cast(prefetch_len), - ro.rate_limiter_priority); - } - if (!s.ok()) { - return s; + if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled()) { + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); + IOOptions opts; + { + Status s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len), + ro.rate_limiter_priority); + } + if (!s.ok()) { + return s; + } } } - // For saving "all or nothing" to partition_map_ UnorderedMap> map_in_progress; @@ -191,7 +192,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, // TODO: Support counter batch update for partitioned index and // filter blocks Status s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro, + handle, UncompressionDict::GetEmptyDict(), /*for_compaction=*/false, &block.As(), /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr, /*async_read=*/false); diff --git a/table/block_based/partitioned_index_reader.h b/table/block_based/partitioned_index_reader.h index 58a7877ab5d..9482fd6b44c 100644 --- a/table/block_based/partitioned_index_reader.h +++ b/table/block_based/partitioned_index_reader.h @@ -30,7 +30,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; - Status CacheDependencies(const ReadOptions& ro, bool pin) override; + Status CacheDependencies(const ReadOptions& ro, bool pin, + FilePrefetchBuffer* tail_prefetch_buffer) override; size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index 9696a509dce..c2f6552cc4b 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -266,9 +266,10 @@ class BlockFetcherTest : public testing::Test { const auto* table_options = table_factory_.GetOptions(); ASSERT_NE(table_options, nullptr); - ASSERT_OK(BlockBasedTable::Open( - ro, ioptions, EnvOptions(), *table_options, comparator, std::move(file), - file_size, 0 /* block_protection_bytes_per_key */, &table_reader)); + ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, + comparator, std::move(file), file_size, + 0 /* block_protection_bytes_per_key */, + &table_reader, 0 /* tail_size */)); table->reset(reinterpret_cast(table_reader.release())); } diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 2c58ff9c7d9..6fea536d624 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -115,6 +115,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kFastCompressionEstimatedDataSize, props.fast_compression_estimated_data_size); } + Add(TablePropertiesNames::kTailStartOffset, props.tail_start_offset); if (!props.db_id.empty()) { Add(TablePropertiesNames::kDbId, props.db_id); } @@ -307,6 +308,8 @@ Status ReadTablePropertiesHelper( &new_table_properties->slow_compression_estimated_data_size}, {TablePropertiesNames::kFastCompressionEstimatedDataSize, &new_table_properties->fast_compression_estimated_data_size}, + {TablePropertiesNames::kTailStartOffset, + &new_table_properties->tail_start_offset}, }; std::string last_key; diff --git a/table/table_builder.h b/table/table_builder.h index e1bb4b55747..6d98bce7078 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -42,7 +42,8 @@ struct TableReaderOptions { int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr, size_t _max_file_size_for_l0_meta_pin = 0, const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0, - UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0) + UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0, + uint64_t _tail_size = 0) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), @@ -57,7 +58,8 @@ struct TableReaderOptions { cur_db_session_id(_cur_db_session_id), cur_file_num(_cur_file_num), unique_id(_unique_id), - block_protection_bytes_per_key(_block_protection_bytes_per_key) {} + block_protection_bytes_per_key(_block_protection_bytes_per_key), + tail_size(_tail_size) {} const ImmutableOptions& ioptions; const std::shared_ptr& prefix_extractor; @@ -89,6 +91,8 @@ struct TableReaderOptions { UniqueId64x2 unique_id; uint8_t block_protection_bytes_per_key; + + uint64_t tail_size; }; struct TableBuilderOptions { @@ -200,6 +204,8 @@ class TableBuilder { // is enabled. virtual uint64_t EstimatedFileSize() const { return FileSize(); } + virtual uint64_t GetTailSize() const { return 0; } + // If the user defined table properties collector suggest the file to // be further compacted. virtual bool NeedCompact() const { return false; } diff --git a/table/table_properties.cc b/table/table_properties.cc index b382281f857..819d4da2ad8 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -303,6 +303,8 @@ const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = "rocksdb.sample_for_compression.fast.data.size"; const std::string TablePropertiesNames::kSequenceNumberTimeMapping = "rocksdb.seqno.time.map"; +const std::string TablePropertiesNames::kTailStartOffset = + "rocksdb.tail.start.offset"; #ifndef NDEBUG // WARNING: TEST_SetRandomTableProperties assumes the following layout of