Address comment
hx235 committed May 5, 2023
1 parent 9c88bee commit 8bd7892
Showing 24 changed files with 99 additions and 89 deletions.
2 changes: 1 addition & 1 deletion HISTORY.md
@@ -11,7 +11,7 @@
* Delete an empty WAL file on DB open if the log number is less than the min log number to keep

### Performance Improvements
-* Record the starting offset of block-based table's tail (i.e., all blocks after data blocks till the end) in manifest and use it to prefetch the tail more accurately during table open instead of relying on heuristics (#11406). Heuristics will still be used for now as a fallback for backward compatibility.
+* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406).

## 8.2.0 (04/24/2023)
### Public API Changes
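For context on the feature this entry describes: a minimal sketch of how a table open could use the recorded tail size to prefetch in a single read. This is illustrative only; PrefetchTail is a hypothetical helper, not RocksDB's actual open path, which lives in the table-reader prefetch logic.

#include <string>

#include "rocksdb/env.h"  // RandomAccessFile
#include "rocksdb/slice.h"

using ROCKSDB_NAMESPACE::RandomAccessFile;
using ROCKSDB_NAMESPACE::Slice;

// Hypothetical sketch: prefetch the last tail_size bytes (filter, index,
// properties, footer) in one sequential read. tail_size == 0 means the
// manifest recorded nothing (older file), so the caller would fall back to
// heuristic prefetching instead.
void PrefetchTail(RandomAccessFile* file, uint64_t file_size,
                  uint64_t tail_size, std::string* scratch) {
  if (tail_size == 0 || tail_size > file_size) {
    return;  // unknown or implausible; let heuristics handle it
  }
  scratch->resize(tail_size);
  Slice result;
  // One read covering the whole tail, instead of several guessed reads.
  file->Read(file_size - tail_size, tail_size, &result, &(*scratch)[0])
      .PermitUncheckedError();
}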
2 changes: 1 addition & 1 deletion db/builder.cc
@@ -291,7 +291,7 @@ Status BuildTable(
if (s.ok() && !empty) {
uint64_t file_size = builder->FileSize();
meta->fd.file_size = file_size;
-meta->tail_start_offset = builder->GetTailStartOffset();
+meta->tail_size = builder->GetTailSize();
meta->marked_for_compaction = builder->NeedCompact();
assert(meta->fd.GetFileSize() > 0);
tp = builder
2 changes: 1 addition & 1 deletion db/compaction/compaction_outputs.cc
@@ -43,7 +43,7 @@ Status CompactionOutputs::Finish(const Status& intput_status,
const uint64_t current_bytes = builder_->FileSize();
if (s.ok()) {
meta->fd.file_size = current_bytes;
-meta->tail_start_offset = builder_->GetTailStartOffset();
+meta->tail_size = builder_->GetTailSize();
meta->marked_for_compaction = builder_->NeedCompact();
}
current_output().finished = true;
4 changes: 2 additions & 2 deletions db/db_impl/db_impl_compaction_flush.cc
@@ -1758,7 +1758,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
f->file_checksum, f->file_checksum_func_name, f->unique_id,
-f->compensated_range_deletion_size, f->tail_start_offset);
+f->compensated_range_deletion_size, f->tail_size);
}
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@@ -3457,7 +3457,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->epoch_number, f->file_checksum,
f->file_checksum_func_name, f->unique_id,
-f->compensated_range_deletion_size, f->tail_start_offset);
+f->compensated_range_deletion_size, f->tail_size);

ROCKS_LOG_BUFFER(
log_buffer,
2 changes: 1 addition & 1 deletion db/db_impl/db_impl_experimental.cc
@@ -138,7 +138,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->epoch_number, f->file_checksum,
f->file_checksum_func_name, f->unique_id,
-f->compensated_range_deletion_size, f->tail_start_offset);
+f->compensated_range_deletion_size, f->tail_size);
}

status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
4 changes: 2 additions & 2 deletions db/db_impl/db_impl_open.cc
@@ -617,7 +617,7 @@ Status DBImpl::Recover(
f->file_creation_time, f->epoch_number,
f->file_checksum, f->file_checksum_func_name,
f->unique_id, f->compensated_range_deletion_size,
-f->tail_start_offset);
+f->tail_size);
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"[%s] Moving #%" PRIu64
" from from_level-%d to from_level-%d %" PRIu64
@@ -1681,7 +1681,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
meta.oldest_blob_file_number, meta.oldest_ancester_time,
meta.file_creation_time, meta.epoch_number, meta.file_checksum,
meta.file_checksum_func_name, meta.unique_id,
-meta.compensated_range_deletion_size, meta.tail_start_offset);
+meta.compensated_range_deletion_size, meta.tail_size);

for (const auto& blob : blob_file_additions) {
edit->AddBlobFile(blob);
2 changes: 1 addition & 1 deletion db/experimental.cc
@@ -102,7 +102,7 @@ Status UpdateManifestForFilesState(
lf->oldest_blob_file_number, lf->oldest_ancester_time,
lf->file_creation_time, lf->epoch_number, lf->file_checksum,
lf->file_checksum_func_name, lf->unique_id,
-lf->compensated_range_deletion_size, lf->tail_start_offset);
+lf->compensated_range_deletion_size, lf->tail_size);
}
}
} else {
13 changes: 11 additions & 2 deletions db/external_sst_file_ingestion_job.cc
@@ -464,6 +464,16 @@ Status ExternalSstFileIngestionJob::Run() {
current_time = oldest_ancester_time =
static_cast<uint64_t>(temp_current_time);
}
+uint64_t tail_size = 0;
+bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
+                              (f.table_properties.num_entries ==
+                               f.table_properties.num_range_deletions);
+if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
+  uint64_t file_size = f.fd.GetFileSize();
+  assert(f.table_properties.tail_start_offset <= file_size);
+  tail_size = file_size - f.table_properties.tail_start_offset;
+}

FileMetaData f_metadata(
f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
@@ -472,8 +482,7 @@
ingestion_options_.ingest_behind
? kReservedEpochNumberForFileIngestedBehind
: cfd_->NewEpochNumber(),
-f.file_checksum, f.file_checksum_func_name, f.unique_id, 0,
-f.table_properties.tail_start_offset);
+f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size);
f_metadata.temperature = f.file_temperature;
edit_.AddFile(f.picked_level, f_metadata);
}
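The tail-size derivation added above is repeated verbatim in db/import_column_family_job.cc and db/repair.cc below. As a reading aid, here is the shared logic factored into a standalone helper; the CalculateTailSize name is hypothetical, since the commit inlines the logic at each call site:

#include <cassert>
#include <cstdint>

#include "rocksdb/table_properties.h"

using ROCKSDB_NAMESPACE::TableProperties;

// Hypothetical helper mirroring the inlined logic. A tail_start_offset of 0
// is ambiguous: either an older file that never recorded the property, or a
// file whose tail starts at offset 0 because it has no data blocks (e.g. it
// holds only range deletions, which live in a meta block, not a data block).
// The contain_no_data_blocks check disambiguates the second case.
uint64_t CalculateTailSize(const TableProperties& props, uint64_t file_size) {
  const bool contain_no_data_blocks =
      props.num_entries > 0 &&
      (props.num_entries == props.num_range_deletions);
  if (props.tail_start_offset > 0 || contain_no_data_blocks) {
    assert(props.tail_start_offset <= file_size);
    return file_size - props.tail_start_offset;
  }
  return 0;  // unknown; readers fall back to heuristic prefetching
}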
2 changes: 1 addition & 1 deletion db/flush_job.cc
@@ -1008,7 +1008,7 @@ Status FlushJob::WriteLevel0Table() {
meta_.file_creation_time, meta_.epoch_number,
meta_.file_checksum, meta_.file_checksum_func_name,
meta_.unique_id, meta_.compensated_range_deletion_size,
-meta_.tail_start_offset);
+meta_.tail_size);
edit_->SetBlobFileAdditions(std::move(blob_file_additions));
}
// Piggyback FlushJobInfo on the first flushed memtable.
13 changes: 11 additions & 2 deletions db/import_column_family_job.cc
@@ -138,15 +138,24 @@ Status ImportColumnFamilyJob::Run() {
const auto& f = files_to_import_[i];
const auto& file_metadata = metadata_[i];

+uint64_t tail_size = 0;
+bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
+                              (f.table_properties.num_entries ==
+                               f.table_properties.num_range_deletions);
+if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
+  uint64_t file_size = f.fd.GetFileSize();
+  assert(f.table_properties.tail_start_offset <= file_size);
+  tail_size = file_size - f.table_properties.tail_start_offset;
+}

VersionEdit dummy_version_edit;
dummy_version_edit.AddFile(
file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key,
file_metadata.smallest_seqno, file_metadata.largest_seqno, false,
file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time,
current_time, file_metadata.epoch_number, kUnknownFileChecksum,
-kUnknownFileChecksumFuncName, f.unique_id, 0,
-f.table_properties.tail_start_offset);
+kUnknownFileChecksumFuncName, f.unique_id, 0, tail_size);
s = dummy_version_builder.Apply(&dummy_version_edit);
}
if (s.ok()) {
13 changes: 10 additions & 3 deletions db/repair.cc
@@ -552,7 +552,15 @@ class Repairer {
t->meta.oldest_ancester_time = props->creation_time;
}
if (status.ok()) {
-t->meta.tail_start_offset = props->tail_start_offset;
+uint64_t tail_size = 0;
+bool contain_no_data_blocks =
+    props->num_entries > 0 &&
+    (props->num_entries == props->num_range_deletions);
+if (props->tail_start_offset > 0 || contain_no_data_blocks) {
+  assert(props->tail_start_offset <= file_size);
+  tail_size = file_size - props->tail_start_offset;
+}
+t->meta.tail_size = tail_size;
}
ColumnFamilyData* cfd = nullptr;
if (status.ok()) {
@@ -685,8 +693,7 @@ class Repairer {
table->meta.oldest_ancester_time, table->meta.file_creation_time,
table->meta.epoch_number, table->meta.file_checksum,
table->meta.file_checksum_func_name, table->meta.unique_id,
-table->meta.compensated_range_deletion_size,
-table->meta.tail_start_offset);
+table->meta.compensated_range_deletion_size, table->meta.tail_size);
}
s = dummy_version_builder.Apply(&dummy_edit);
if (s.ok()) {
6 changes: 1 addition & 5 deletions db/table_cache.cc
@@ -146,11 +146,7 @@ Status TableCache::GetTableReader(
false /* force_direct_prefetch */, level, block_cache_tracer_,
max_file_size_for_l0_meta_pin, db_session_id_,
file_meta.fd.GetNumber(), expected_unique_id,
-file_meta.fd.largest_seqno, file_meta.tail_start_offset,
-file_meta.num_entries > 0 &&
-        (file_meta.num_entries == file_meta.num_range_deletions)
-    ? true
-    : false /* contain_no_data_block */),
+file_meta.fd.largest_seqno, file_meta.tail_size),
std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
prefetch_index_and_filter_in_cache);
TEST_SYNC_POINT("TableCache::GetTableReader:0");
20 changes: 10 additions & 10 deletions db/version_edit.cc
@@ -238,11 +238,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
f.compensated_range_deletion_size);
PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
}
-if (f.tail_start_offset) {
-  PutVarint32(dst, NewFileCustomTag::kTailStartOffset);
-  std::string varint_tail_start_offset;
-  PutVarint64(&varint_tail_start_offset, f.tail_start_offset);
-  PutLengthPrefixedSlice(dst, Slice(varint_tail_start_offset));
+if (f.tail_size) {
+  PutVarint32(dst, NewFileCustomTag::kTailSize);
+  std::string varint_tail_size;
+  PutVarint64(&varint_tail_size, f.tail_size);
+  PutLengthPrefixedSlice(dst, Slice(varint_tail_size));
}

TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
@@ -422,8 +422,8 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
return "Invalid compensated range deletion size";
}
break;
-case kTailStartOffset:
-  if (!GetVarint64(&field, &f.tail_start_offset)) {
+case kTailSize:
+  if (!GetVarint64(&field, &f.tail_size)) {
return "invalid tail start offset";
}
break;
@@ -862,8 +862,8 @@ std::string VersionEdit::DebugString(bool hex_key) const {
InternalUniqueIdToExternal(&id);
r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id)));
}
r.append(" tail start offset:");
AppendNumberTo(&r, f.tail_start_offset);
r.append(" tail size:");
AppendNumberTo(&r, f.tail_size);
}

for (const auto& blob_file_addition : blob_file_additions_) {
@@ -979,7 +979,7 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
// permanent
jw << "Temperature" << static_cast<int>(f.temperature);
}
jw << "TailStartOffset" << f.tail_start_offset;
jw << "TailSize" << f.tail_size;
jw.EndArrayedObject();
}

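The manifest encoding above follows the NewFile4 custom-field convention: a varint32 tag, then the value serialized inside a length-prefixed slice, so readers that do not understand the tag can still skip the field. A small round-trip sketch using the same util/coding.h helpers that appear in the diff (standalone test-style code, not part of the commit; it includes an internal header for NewFileCustomTag):

#include <cassert>
#include <cstdint>
#include <string>

#include "db/version_edit.h"  // NewFileCustomTag
#include "util/coding.h"      // PutVarint*/GetVarint* helpers

using namespace ROCKSDB_NAMESPACE;

// Encode: varint32 tag, then tail_size as a varint64 wrapped in a
// length-prefixed slice. Decode: read the tag, extract the field, then
// decode the varint64. The round trip should reproduce the input.
void TailSizeRoundTrip(uint64_t tail_size) {
  std::string dst;
  PutVarint32(&dst, NewFileCustomTag::kTailSize);
  std::string payload;
  PutVarint64(&payload, tail_size);
  PutLengthPrefixedSlice(&dst, Slice(payload));

  Slice input(dst);
  uint32_t tag = 0;
  Slice field;
  uint64_t decoded = 0;
  bool ok = GetVarint32(&input, &tag) &&
            GetLengthPrefixedSlice(&input, &field) &&
            GetVarint64(&field, &decoded);
  assert(ok && tag == NewFileCustomTag::kTailSize && decoded == tail_size);
  (void)ok;
}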
14 changes: 7 additions & 7 deletions db/version_edit.h
@@ -90,7 +90,7 @@ enum NewFileCustomTag : uint32_t {
kUniqueId = 12,
kEpochNumber = 13,
kCompensatedRangeDeletionSize = 14,
-kTailStartOffset = 15,
+kTailSize = 15,

// If this bit for the custom tag is set, opening DB should fail if
// we don't know this field.
@@ -239,9 +239,9 @@ struct FileMetaData {
// SST unique id
UniqueId64x2 unique_id{};

-// Offset where the "tail" part of SST file starts
+// Size of the "tail" part of a SST file
 // "Tail" refers to all blocks after data blocks till the end of the SST file
-uint64_t tail_start_offset = 0;
+uint64_t tail_size = 0;

FileMetaData() = default;

@@ -255,7 +255,7 @@
const std::string& _file_checksum_func_name,
UniqueId64x2 _unique_id,
const uint64_t _compensated_range_deletion_size,
-uint64_t _tail_start_offset)
+uint64_t _tail_size)
: fd(file, file_path_id, file_size, smallest_seq, largest_seq),
smallest(smallest_key),
largest(largest_key),
@@ -269,7 +269,7 @@
file_checksum(_file_checksum),
file_checksum_func_name(_file_checksum_func_name),
unique_id(std::move(_unique_id)),
-tail_start_offset(_tail_start_offset) {
+tail_size(_tail_size) {
TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
}

@@ -454,7 +454,7 @@ class VersionEdit {
const std::string& file_checksum_func_name,
const UniqueId64x2& unique_id,
const uint64_t compensated_range_deletion_size,
-uint64_t tail_start_offset) {
+uint64_t tail_size) {
assert(smallest_seqno <= largest_seqno);
new_files_.emplace_back(
level,
@@ -463,7 +463,7 @@
temperature, oldest_blob_file_number, oldest_ancester_time,
file_creation_time, epoch_number, file_checksum,
file_checksum_func_name, unique_id,
-compensated_range_deletion_size, tail_start_offset));
+compensated_range_deletion_size, tail_size));
if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
SetLastSequence(largest_seqno);
}
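For readers unfamiliar with the block-based table format, a sketch of the file layout that the tail_size field describes. Block order follows the write order in BlockBasedTableBuilder::Finish() shown further below; the exact set of meta blocks varies with table options:

// <beginning_of_file>
//   [data block 1] ... [data block N]     <- not part of the tail
//   [meta block: filter]                  <- tail_start_offset points here
//   [meta block: index]
//   [meta block: compression dictionary]
//   [meta block: range deletion]
//   [meta block: properties]
//   [metaindex block]
//   [Footer]
// <end_of_file>
//
// tail_size == file_size - tail_start_offset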
16 changes: 8 additions & 8 deletions db/version_set.cc
@@ -6346,14 +6346,14 @@ Status VersionSet::WriteCurrentStateToManifest(
for (const auto& f : level_files) {
assert(f);

-edit.AddFile(
-    level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
-    f->smallest, f->largest, f->fd.smallest_seqno,
-    f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
-    f->oldest_blob_file_number, f->oldest_ancester_time,
-    f->file_creation_time, f->epoch_number, f->file_checksum,
-    f->file_checksum_func_name, f->unique_id,
-    f->compensated_range_deletion_size, f->tail_start_offset);
+edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
+             f->fd.GetFileSize(), f->smallest, f->largest,
+             f->fd.smallest_seqno, f->fd.largest_seqno,
+             f->marked_for_compaction, f->temperature,
+             f->oldest_blob_file_number, f->oldest_ancester_time,
+             f->file_creation_time, f->epoch_number, f->file_checksum,
+             f->file_checksum_func_name, f->unique_id,
+             f->compensated_range_deletion_size, f->tail_size);
}
}

1 change: 1 addition & 0 deletions file/prefetch_test.cc
@@ -261,6 +261,7 @@ TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) {
Status s = TryReopen(options);
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
// If direct IO is not supported, skip the test
+ROCKSDB_GTEST_BYPASS("Direct IO is not supported");
return;
} else {
ASSERT_OK(s);
15 changes: 9 additions & 6 deletions table/block_based/block_based_table_builder.cc
@@ -341,6 +341,10 @@ struct BlockBasedTableBuilder::Rep {
std::unique_ptr<ParallelCompressionRep> pc_rep;
BlockCreateContext create_context;

+// The size of the "tail" part of a SST file. "Tail" refers to
+// all blocks after data blocks till the end of the SST file.
+uint64_t tail_size;

uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }

@@ -456,6 +460,7 @@ struct BlockBasedTableBuilder::Rep {
!use_delta_encoding_for_index_values,
table_opt.index_type ==
BlockBasedTableOptions::kBinarySearchWithFirstKey),
+tail_size(0),
status_ok(true),
io_status_ok(true) {
if (tbo.target_file_size == 0) {
@@ -1908,9 +1913,8 @@ Status BlockBasedTableBuilder::Finish() {
}
}

-if (ok()) {
-  r->props.tail_start_offset = r->offset;
-}
+r->props.tail_start_offset = r->offset;
+
// Write meta blocks, metaindex block and footer in the following order.
// 1. [meta block: filter]
// 2. [meta block: index]
@@ -1938,6 +1942,7 @@
r->SetStatus(r->CopyIOStatus());
Status ret_status = r->CopyStatus();
assert(!ret_status.ok() || io_status().ok());
+r->tail_size = r->offset - r->props.tail_start_offset;
return ret_status;
}

@@ -1971,9 +1976,7 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
}
}

-uint64_t BlockBasedTableBuilder::GetTailStartOffset() const {
-  return rep_->props.tail_start_offset;
-}
+uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; }

bool BlockBasedTableBuilder::NeedCompact() const {
for (const auto& collector : rep_->table_properties_collectors) {
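Putting the two changed lines in Finish() together: the tail is measured by bracketing the metadata writes. A condensed sketch of the ordering, not the full method; error handling and the individual Write* steps are elided:

// Everything written after the last data block is the tail, so its size is
// the offset delta across the meta-block and footer writes.
Status BlockBasedTableBuilder::Finish() {
  Rep* r = rep_;
  // ... flush any remaining data blocks ...
  r->props.tail_start_offset = r->get_offset();  // tail begins here
  // ... write filter, index, compression dict, range-del, properties,
  // metaindex and footer blocks, each advancing r->offset ...
  r->tail_size = r->get_offset() - r->props.tail_start_offset;
  return r->CopyStatus();
}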
4 changes: 2 additions & 2 deletions table/block_based/block_based_table_builder.h
@@ -91,9 +91,9 @@ class BlockBasedTableBuilder : public TableBuilder {
// is enabled.
uint64_t EstimatedFileSize() const override;

-// Get the offset where the "tail" part of a SST file starts. "Tail" refers to
+// Get the size of the "tail" part of a SST file. "Tail" refers to
 // all blocks after data blocks till the end of the SST file.
-uint64_t GetTailStartOffset() const override;
+uint64_t GetTailSize() const override;

bool NeedCompact() const override;

3 changes: 1 addition & 2 deletions table/block_based/block_based_table_factory.cc
@@ -568,8 +568,7 @@ Status BlockBasedTableFactory::NewTableReader(
ro, table_reader_options.ioptions, table_reader_options.env_options,
table_options_, table_reader_options.internal_comparator, std::move(file),
file_size, table_reader_options.block_protection_bytes_per_key,
-table_reader, table_reader_options.tail_start_offset,
-table_reader_options.contain_no_data_block, table_reader_cache_res_mgr_,
+table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_,
table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
table_reader_options.skip_filters, table_reader_options.level,
table_reader_options.immortal, table_reader_options.largest_seqno,