Skip to content

Commit

Permalink
Address comment
Browse files Browse the repository at this point in the history
  • Loading branch information
hx235 committed May 5, 2023
1 parent 9c88bee commit 5d05423
Show file tree
Hide file tree
Showing 27 changed files with 73 additions and 94 deletions.
2 changes: 1 addition & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* Delete an empty WAL file on DB open if the log number is less than the min log number to keep

### Performance Improvements
* Record the starting offset of block-based table's tail (i.e., all blocks after data blocks till the end) in manifest and use it to prefetch the tail more accurately during table open instead of relying on heuristics (#11406). Heuristics will still be used for now as a fallback for backward compatibility.
* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406).

## 8.2.0 (04/24/2023)
### Public API Changes
Expand Down
2 changes: 1 addition & 1 deletion db/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ Status BuildTable(
if (s.ok() && !empty) {
uint64_t file_size = builder->FileSize();
meta->fd.file_size = file_size;
meta->tail_start_offset = builder->GetTailStartOffset();
meta->tail_size = builder->GetTailSize();
meta->marked_for_compaction = builder->NeedCompact();
assert(meta->fd.GetFileSize() > 0);
tp = builder
Expand Down
2 changes: 1 addition & 1 deletion db/compaction/compaction_outputs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Status CompactionOutputs::Finish(const Status& intput_status,
const uint64_t current_bytes = builder_->FileSize();
if (s.ok()) {
meta->fd.file_size = current_bytes;
meta->tail_start_offset = builder_->GetTailStartOffset();
meta->tail_size = builder_->GetTailSize();
meta->marked_for_compaction = builder_->NeedCompact();
}
current_output().finished = true;
Expand Down
4 changes: 2 additions & 2 deletions db/db_impl/db_impl_compaction_flush.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1758,7 +1758,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
f->file_checksum, f->file_checksum_func_name, f->unique_id,
f->compensated_range_deletion_size, f->tail_start_offset);
f->compensated_range_deletion_size, f->tail_size);
}
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
Expand Down Expand Up @@ -3457,7 +3457,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->epoch_number, f->file_checksum,
f->file_checksum_func_name, f->unique_id,
f->compensated_range_deletion_size, f->tail_start_offset);
f->compensated_range_deletion_size, f->tail_size);

ROCKS_LOG_BUFFER(
log_buffer,
Expand Down
2 changes: 1 addition & 1 deletion db/db_impl/db_impl_experimental.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->epoch_number, f->file_checksum,
f->file_checksum_func_name, f->unique_id,
f->compensated_range_deletion_size, f->tail_start_offset);
f->compensated_range_deletion_size, f->tail_size);
}

status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
Expand Down
4 changes: 2 additions & 2 deletions db/db_impl/db_impl_open.cc
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ Status DBImpl::Recover(
f->file_creation_time, f->epoch_number,
f->file_checksum, f->file_checksum_func_name,
f->unique_id, f->compensated_range_deletion_size,
f->tail_start_offset);
f->tail_size);
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"[%s] Moving #%" PRIu64
" from from_level-%d to from_level-%d %" PRIu64
Expand Down Expand Up @@ -1681,7 +1681,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
meta.oldest_blob_file_number, meta.oldest_ancester_time,
meta.file_creation_time, meta.epoch_number, meta.file_checksum,
meta.file_checksum_func_name, meta.unique_id,
meta.compensated_range_deletion_size, meta.tail_start_offset);
meta.compensated_range_deletion_size, meta.tail_size);

for (const auto& blob : blob_file_additions) {
edit->AddBlobFile(blob);
Expand Down
2 changes: 1 addition & 1 deletion db/experimental.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ Status UpdateManifestForFilesState(
lf->oldest_blob_file_number, lf->oldest_ancester_time,
lf->file_creation_time, lf->epoch_number, lf->file_checksum,
lf->file_checksum_func_name, lf->unique_id,
lf->compensated_range_deletion_size, lf->tail_start_offset);
lf->compensated_range_deletion_size, lf->tail_size);
}
}
} else {
Expand Down
2 changes: 1 addition & 1 deletion db/external_sst_file_ingestion_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ Status ExternalSstFileIngestionJob::Run() {
? kReservedEpochNumberForFileIngestedBehind
: cfd_->NewEpochNumber(),
f.file_checksum, f.file_checksum_func_name, f.unique_id, 0,
f.table_properties.tail_start_offset);
f.table_properties.tail_size);
f_metadata.temperature = f.file_temperature;
edit_.AddFile(f.picked_level, f_metadata);
}
Expand Down
2 changes: 1 addition & 1 deletion db/flush_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1008,7 +1008,7 @@ Status FlushJob::WriteLevel0Table() {
meta_.file_creation_time, meta_.epoch_number,
meta_.file_checksum, meta_.file_checksum_func_name,
meta_.unique_id, meta_.compensated_range_deletion_size,
meta_.tail_start_offset);
meta_.tail_size);
edit_->SetBlobFileAdditions(std::move(blob_file_additions));
}
// Piggyback FlushJobInfo on the first flushed memtable.
Expand Down
2 changes: 1 addition & 1 deletion db/import_column_family_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ Status ImportColumnFamilyJob::Run() {
file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time,
current_time, file_metadata.epoch_number, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, f.unique_id, 0,
f.table_properties.tail_start_offset);
f.table_properties.tail_size);
s = dummy_version_builder.Apply(&dummy_version_edit);
}
if (s.ok()) {
Expand Down
5 changes: 2 additions & 3 deletions db/repair.cc
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ class Repairer {
t->meta.oldest_ancester_time = props->creation_time;
}
if (status.ok()) {
t->meta.tail_start_offset = props->tail_start_offset;
t->meta.tail_size = props->tail_size;
}
ColumnFamilyData* cfd = nullptr;
if (status.ok()) {
Expand Down Expand Up @@ -685,8 +685,7 @@ class Repairer {
table->meta.oldest_ancester_time, table->meta.file_creation_time,
table->meta.epoch_number, table->meta.file_checksum,
table->meta.file_checksum_func_name, table->meta.unique_id,
table->meta.compensated_range_deletion_size,
table->meta.tail_start_offset);
table->meta.compensated_range_deletion_size, table->meta.tail_size);
}
s = dummy_version_builder.Apply(&dummy_edit);
if (s.ok()) {
Expand Down
6 changes: 1 addition & 5 deletions db/table_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,7 @@ Status TableCache::GetTableReader(
false /* force_direct_prefetch */, level, block_cache_tracer_,
max_file_size_for_l0_meta_pin, db_session_id_,
file_meta.fd.GetNumber(), expected_unique_id,
file_meta.fd.largest_seqno, file_meta.tail_start_offset,
file_meta.num_entries > 0 &&
(file_meta.num_entries == file_meta.num_range_deletions)
? true
: false /* contain_no_data_block */),
file_meta.fd.largest_seqno, file_meta.tail_size),
std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
prefetch_index_and_filter_in_cache);
TEST_SYNC_POINT("TableCache::GetTableReader:0");
Expand Down
20 changes: 10 additions & 10 deletions db/version_edit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
f.compensated_range_deletion_size);
PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
}
if (f.tail_start_offset) {
PutVarint32(dst, NewFileCustomTag::kTailStartOffset);
std::string varint_tail_start_offset;
PutVarint64(&varint_tail_start_offset, f.tail_start_offset);
PutLengthPrefixedSlice(dst, Slice(varint_tail_start_offset));
if (f.tail_size) {
PutVarint32(dst, NewFileCustomTag::kTailSize);
std::string varint_tail_size;
PutVarint64(&varint_tail_size, f.tail_size);
PutLengthPrefixedSlice(dst, Slice(varint_tail_size));
}

TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
Expand Down Expand Up @@ -422,8 +422,8 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
return "Invalid compensated range deletion size";
}
break;
case kTailStartOffset:
if (!GetVarint64(&field, &f.tail_start_offset)) {
case kTailSize:
if (!GetVarint64(&field, &f.tail_size)) {
return "invalid tail start offset";
}
break;
Expand Down Expand Up @@ -862,8 +862,8 @@ std::string VersionEdit::DebugString(bool hex_key) const {
InternalUniqueIdToExternal(&id);
r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id)));
}
r.append(" tail start offset:");
AppendNumberTo(&r, f.tail_start_offset);
r.append(" tail size:");
AppendNumberTo(&r, f.tail_size);
}

for (const auto& blob_file_addition : blob_file_additions_) {
Expand Down Expand Up @@ -979,7 +979,7 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
// permanent
jw << "Temperature" << static_cast<int>(f.temperature);
}
jw << "TailStartOffset" << f.tail_start_offset;
jw << "TailSize" << f.tail_size;
jw.EndArrayedObject();
}

Expand Down
14 changes: 7 additions & 7 deletions db/version_edit.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ enum NewFileCustomTag : uint32_t {
kUniqueId = 12,
kEpochNumber = 13,
kCompensatedRangeDeletionSize = 14,
kTailStartOffset = 15,
kTailSize = 15,

// If this bit for the custom tag is set, opening DB should fail if
// we don't know this field.
Expand Down Expand Up @@ -239,9 +239,9 @@ struct FileMetaData {
// SST unique id
UniqueId64x2 unique_id{};

// Offset where the "tail" part of SST file starts
// Size of the "tail" part of a SST file
// "Tail" refers to all blocks after data blocks till the end of the SST file
uint64_t tail_start_offset = 0;
uint64_t tail_size = 0;

FileMetaData() = default;

Expand All @@ -255,7 +255,7 @@ struct FileMetaData {
const std::string& _file_checksum_func_name,
UniqueId64x2 _unique_id,
const uint64_t _compensated_range_deletion_size,
uint64_t _tail_start_offset)
uint64_t _tail_size)
: fd(file, file_path_id, file_size, smallest_seq, largest_seq),
smallest(smallest_key),
largest(largest_key),
Expand All @@ -269,7 +269,7 @@ struct FileMetaData {
file_checksum(_file_checksum),
file_checksum_func_name(_file_checksum_func_name),
unique_id(std::move(_unique_id)),
tail_start_offset(_tail_start_offset) {
tail_size(_tail_size) {
TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
}

Expand Down Expand Up @@ -454,7 +454,7 @@ class VersionEdit {
const std::string& file_checksum_func_name,
const UniqueId64x2& unique_id,
const uint64_t compensated_range_deletion_size,
uint64_t tail_start_offset) {
uint64_t tail_size) {
assert(smallest_seqno <= largest_seqno);
new_files_.emplace_back(
level,
Expand All @@ -463,7 +463,7 @@ class VersionEdit {
temperature, oldest_blob_file_number, oldest_ancester_time,
file_creation_time, epoch_number, file_checksum,
file_checksum_func_name, unique_id,
compensated_range_deletion_size, tail_start_offset));
compensated_range_deletion_size, tail_size));
if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
SetLastSequence(largest_seqno);
}
Expand Down
16 changes: 8 additions & 8 deletions db/version_set.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6346,14 +6346,14 @@ Status VersionSet::WriteCurrentStateToManifest(
for (const auto& f : level_files) {
assert(f);

edit.AddFile(
level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
f->smallest, f->largest, f->fd.smallest_seqno,
f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->epoch_number, f->file_checksum,
f->file_checksum_func_name, f->unique_id,
f->compensated_range_deletion_size, f->tail_start_offset);
edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
f->fd.GetFileSize(), f->smallest, f->largest,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->epoch_number, f->file_checksum,
f->file_checksum_func_name, f->unique_id,
f->compensated_range_deletion_size, f->tail_size);
}
}

Expand Down
1 change: 1 addition & 0 deletions file/prefetch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) {
Status s = TryReopen(options);
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
// If direct IO is not supported, skip the test
ROCKSDB_GTEST_BYPASS("Direct IO is not supported");
return;
} else {
ASSERT_OK(s);
Expand Down
6 changes: 3 additions & 3 deletions include/rocksdb/table_properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ struct TablePropertiesNames {
static const std::string kSlowCompressionEstimatedDataSize;
static const std::string kFastCompressionEstimatedDataSize;
static const std::string kSequenceNumberTimeMapping;
static const std::string kTailStartOffset;
static const std::string kTailSize;
};

// `TablePropertiesCollector` provides the mechanism for users to collect
Expand Down Expand Up @@ -240,9 +240,9 @@ struct TableProperties {
// 0 means not exists.
uint64_t external_sst_file_global_seqno_offset = 0;

// Offset where the "tail" part of SST file starts
// Size of the "tail" part of a SST file
// "Tail" refers to all blocks after data blocks till the end of the SST file
uint64_t tail_start_offset = 0;
uint64_t tail_size = 0;

// DB identity
// db_id is an identifier generated the first time the DB is created
Expand Down
10 changes: 5 additions & 5 deletions table/block_based/block_based_table_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1908,9 +1908,8 @@ Status BlockBasedTableBuilder::Finish() {
}
}

if (ok()) {
r->props.tail_start_offset = r->offset;
}
uint64_t tail_start_offset = r->offset;

// Write meta blocks, metaindex block and footer in the following order.
// 1. [meta block: filter]
// 2. [meta block: index]
Expand Down Expand Up @@ -1938,6 +1937,7 @@ Status BlockBasedTableBuilder::Finish() {
r->SetStatus(r->CopyIOStatus());
Status ret_status = r->CopyStatus();
assert(!ret_status.ok() || io_status().ok());
r->props.tail_size = r->offset - tail_start_offset;
return ret_status;
}

Expand Down Expand Up @@ -1971,8 +1971,8 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
}
}

uint64_t BlockBasedTableBuilder::GetTailStartOffset() const {
return rep_->props.tail_start_offset;
// Returns the size in bytes of the SST file's "tail" (all blocks after the
// data blocks through the end of the file), as recorded in the table
// properties. NOTE(review): presumably only meaningful after Finish() has
// computed props.tail_size — confirm callers respect that ordering.
uint64_t BlockBasedTableBuilder::GetTailSize() const {
  return rep_->props.tail_size;
}

bool BlockBasedTableBuilder::NeedCompact() const {
Expand Down
4 changes: 2 additions & 2 deletions table/block_based/block_based_table_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ class BlockBasedTableBuilder : public TableBuilder {
// is enabled.
uint64_t EstimatedFileSize() const override;

// Get the offset where the "tail" part of a SST file starts. "Tail" refers to
// Get the size of the "tail" part of a SST file. "Tail" refers to
// all blocks after data blocks till the end of the SST file.
uint64_t GetTailStartOffset() const override;
uint64_t GetTailSize() const override;

bool NeedCompact() const override;

Expand Down
3 changes: 1 addition & 2 deletions table/block_based/block_based_table_factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,7 @@ Status BlockBasedTableFactory::NewTableReader(
ro, table_reader_options.ioptions, table_reader_options.env_options,
table_options_, table_reader_options.internal_comparator, std::move(file),
file_size, table_reader_options.block_protection_bytes_per_key,
table_reader, table_reader_options.tail_start_offset,
table_reader_options.contain_no_data_block, table_reader_cache_res_mgr_,
table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_,
table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
table_reader_options.skip_filters, table_reader_options.level,
table_reader_options.immortal, table_reader_options.largest_seqno,
Expand Down
2 changes: 1 addition & 1 deletion table/block_based/block_based_table_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class RandomAccessFileReader;
class WritableFileWriter;

// TODO: deprecate this class as it can be replaced with
// `FileMetaData::tail_start_offset`
// `FileMetaData::tail_size`
//
// A class used to track actual bytes written from the tail in the recent SST
// file opens, and provide a suggestion for following open.
Expand Down
Loading

0 comments on commit 5d05423

Please sign in to comment.