From c73c1e524d9de50802eb1fca975c6fbdd6732fb2 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 18 Jul 2018 15:21:45 -0700 Subject: [PATCH 1/3] BlockBasedTableReader: automatically adjust tail prefetch size Summary: Right now we use one hard-coded prefetch size to prefetch data from the tail of the SST files. However, this may waste reads for some use cases, while not being efficient for others. Introduce a way to adjust this prefetch size by tracking the 32 most recent effective sizes, and picking a value with which the wasted read is less than 1/8 of the data read. Test Plan: Add some unit tests for functionality correctness. Run strace against db_bench to verify it works end to end. Reviewers: fix Fix a bug Add comments and fix the test --- db/db_test2.cc | 87 ++++++++++++++++++- table/block_based_table_factory.cc | 133 ++++++++++++++++++++++++++++- table/block_based_table_factory.h | 17 ++++ table/block_based_table_reader.cc | 44 +++++++--- table/block_based_table_reader.h | 4 +- table/table_test.cc | 44 ++++++++++ util/file_reader_writer.cc | 5 +- util/file_reader_writer.h | 21 ++++- 8 files changed, 334 insertions(+), 21 deletions(-) diff --git a/db/db_test2.cc b/db/db_test2.cc index 3cb453359f1..3a4c3c4a5d9 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2321,9 +2321,9 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW); // Include the explicit prefetch of the footer in direct I/O case. size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0; - ASSERT_GE(rate_limited_bytes, - static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files + - direct_io_extra)); + ASSERT_GE( + rate_limited_bytes, + static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files)); ASSERT_LT( rate_limited_bytes, static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files + @@ -2547,6 +2547,87 @@ TEST_F(DBTest2, PinnableSliceAndMmapReads) { #endif } +TEST_F(DBTest2, TestBBTTailPrefetch) { + std::atomic<bool> called(false); + size_t expected_lower_bound = 512 * 1024; + size_t expected_higher_bound = 512 * 1024; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast<size_t*>(arg); + EXPECT_LE(expected_lower_bound, *prefetch_size); + EXPECT_GE(expected_higher_bound, *prefetch_size); + called = true; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + expected_lower_bound = 0; + expected_higher_bound = 8 * 1024; + + Put("1", "1"); + Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + ASSERT_TRUE(called.load()); + called = false; + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::atomic<bool> first_call(true); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast<size_t*>(arg); + if (first_call) { + EXPECT_EQ(8 * 1024, *prefetch_size); + first_call = false; + } else { + EXPECT_GE(4 * 1024, *prefetch_size); + } + called = true; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.max_file_opening_threads = 1; // one thread + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.max_open_files = -1; + Reopen(options); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + Put("1", "1"); +
Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + ASSERT_TRUE(called.load()); + called = false; + + // Parallel loading SST files + options.max_file_opening_threads = 16; + Reopen(options); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_TRUE(called.load()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 2f0d41ae931..33906fbe7b3 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -27,10 +27,140 @@ #include "table/block_based_table_builder.h" #include "table/block_based_table_reader.h" #include "table/format.h" +#include "util/mutexlock.h" #include "util/string_util.h" namespace rocksdb { +void TailPrefetchStats::RecordEffectiveSize(size_t len) { + MutexLock l(&mutex_); + if (num_records_ < kNumTracked) { + num_records_++; + } + records_[next_++] = len; + if (next_ == kNumTracked) { + next_ = 0; + } +} + +size_t TailPrefetchStats::GetSuggestedPrefetchSize() { + std::vector sorted; + { + MutexLock l(&mutex_); + + if (num_records_ == 0) { + return 0; + } + sorted.assign(records_, records_ + num_records_); + } + + // Of the historic size, we find the maximum one that satisifis the condtiion + // that if prefetching all, less than 1/8 will be wasted. + std::sort(sorted.begin(), sorted.end()); + + // Assuming we have 5 data points, and after sorting it looks like this: + // + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // +---+ | | | | + // | | | | | | + // +---+ | | | | | | + // | | | | | | | | + // +---+ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // and we use every of the value as a candidate, and estimate how much we + // wasted, compared to read. For example, when we use the 3rd record + // as candiate. This area is what we read: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ***+ *** *** *** *** ** + // * | | | | | | + // +---+ | | | | | * + // * | | | | | | | | + // +---+ | | | | | | | * + // * | | | | X | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // *** *** ***-*** ***--*** ***--*** +**** + // which is (size of the record) X (number of records). + // + // While wasted is this area: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ****---+ | | | | + // * * | | | | | + // * *-*** *** | | | | | + // * * | | | | | | | + // *--** *** | | | | | | | + // | | | | | X | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // Which can be calculated iteratively. 
+ // The difference in waste between using the 4th and the 3rd record will + // be the following area: + // +---+ + // +--+ +-+ ++ +-+ +-+ +---+ | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // | xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // +-+ +-+ +-+ ++ +---+ +--+ | | | + // | | | | | | | + // +---+ ++ | | | | | | + // | | | | | | X | | | + // +---+ ++ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // which will be the size difference between the 4th and the 3rd record, + // times 3, which is the number of records before the 4th. + // Here we assume that all data within the prefetch range will be useful. In + // reality, it may not be the case when a partial block is inside the range, + // or there is data in the middle that is not read. We ignore those cases + // for simplicity. + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) : table_options_(_table_options) { @@ -71,7 +201,8 @@ Status BlockBasedTableFactory::NewTableReader( table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, - table_reader_options.level, table_reader_options.immortal); + table_reader_options.level, table_reader_options.immortal, + &tail_prefetch_stats_); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 47e7d923b54..6f05d604082 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -26,6 +26,22 @@ struct EnvOptions; using std::unique_ptr; class BlockBasedTableBuilder; +// A class used to track the actual bytes read from the tail in recent SST +// file opens, and provide a suggestion for the following opens. +class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information is available to make a suggestion.
+ size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + class BlockBasedTableFactory : public TableFactory { public: explicit BlockBasedTableFactory( @@ -64,6 +80,7 @@ class BlockBasedTableFactory : public TableFactory { private: BlockBasedTableOptions table_options_; + mutable TailPrefetchStats tail_prefetch_stats_; }; extern const std::string kHashIndexPrefixesBlock; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index bf0fc66137d..d2fdba35000 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -731,7 +731,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, const SliceTransform* prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, const int level, - const bool immortal_table) { + const bool immortal_table, + TailPrefetchStats* tail_prefetch_stats) { table_reader->reset(); Footer footer; @@ -741,29 +742,40 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; const bool preload_all = !table_options.cache_index_and_filter_blocks; - // Before read footer, readahead backwards to prefetch data. Do more readahead - // if we're going to read index/filter. - // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. That's - // because we need to issue readahead before we read the properties, at which - // point we don't yet know the index type. - const size_t kTailPrefetchSize = - prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more readahead + // if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. That's + // because we need to issue readahead before we read the properties, at which + // point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } size_t prefetch_off; size_t prefetch_len; - if (file_size < kTailPrefetchSize) { + if (file_size < tail_prefetch_size) { prefetch_off = 0; prefetch_len = static_cast(file_size); } else { - prefetch_off = static_cast(file_size - kTailPrefetchSize); - prefetch_len = kTailPrefetchSize; + prefetch_off = static_cast(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); Status s; // TODO should not have this special logic in the future. 
if (!file->use_direct_io()) { + prefetch_buffer.reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); s = file->Prefetch(prefetch_off, prefetch_len); } else { - prefetch_buffer.reset(new FilePrefetchBuffer()); + prefetch_buffer.reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); } s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, @@ -1060,6 +1072,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, } if (s.ok()) { + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + file_size - prefetch_buffer->min_offset_read()); + } *table_reader = std::move(new_table); } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index faa1fa82de4..98a81e9869a 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -23,6 +23,7 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "table/block.h" +#include "table/block_based_table_factory.h" #include "table/filter_block.h" #include "table/format.h" #include "table/persistent_cache_helper.h" @@ -93,7 +94,8 @@ class BlockBasedTable : public TableReader { const SliceTransform* prefix_extractor = nullptr, bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false, int level = -1, - const bool immortal_table = false); + const bool immortal_table = false, + TailPrefetchStats* tail_prefetch_stats = nullptr); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, diff --git a/table/table_test.cc b/table/table_test.cc index 356dc28a7aa..82427b4e75a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1075,6 +1075,7 @@ class BlockBasedTableTest }; class PlainTableTest : public TableTest {}; class TablePropertyTest : public testing::Test {}; +class BBTTailPrefetchTest : public TableTest {}; INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest, testing::Values(test::kDefaultFormatVersion)); @@ -3514,6 +3515,49 @@ TEST_P(BlockBasedTableTest, BadOptions) { ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); } +TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) { + TailPrefetchStats tpstats; + ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize()); + tpstats.RecordEffectiveSize(size_t{1000}); + tpstats.RecordEffectiveSize(size_t{1005}); + tpstats.RecordEffectiveSize(size_t{1002}); + ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize()); + + // One single super large value shouldn't influence much + tpstats.RecordEffectiveSize(size_t{1002000}); + tpstats.RecordEffectiveSize(size_t{999}); + ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize()); + + // Only history of 32 is kept + for (int i = 0; i < 32; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize()); + + // 16 large values and 16 small values. The result should be closer + // to the small value as the algorithm. 
+ for (int i = 0; i < 16; i++) { + tpstats.RecordEffectiveSize(size_t{1000}); + } + tpstats.RecordEffectiveSize(size_t{10}); + tpstats.RecordEffectiveSize(size_t{20}); + for (int i = 0; i < 6; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize()); +} + +TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { + TailPrefetchStats tpstats; + FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); + buffer.TryReadFromCache(500, 10, nullptr); + buffer.TryReadFromCache(480, 10, nullptr); + buffer.TryReadFromCache(490, 10, nullptr); + ASSERT_EQ(480, buffer.min_offset_read()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index ae1e6671d33..54d431d70e4 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -714,7 +714,10 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, Slice* result) { - if (offset < buffer_offset_) { + if (track_min_offset_ && offset < min_offset_read_) { + min_offset_read_ = offset; + } + if (!enable_ || offset < buffer_offset_) { return false; } diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 90a71c45982..d0b21d473f8 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -213,21 +213,38 @@ class WritableFileWriter { // readahead_size will be doubled on every IO, until max_readahead_size. class FilePrefetchBuffer { public: + // If `track_min_offset` is true, track minimum offset ever read. FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, - size_t readadhead_size = 0, size_t max_readahead_size = 0) + size_t readadhead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false) : buffer_offset_(0), file_reader_(file_reader), readahead_size_(readadhead_size), - max_readahead_size_(max_readahead_size) {} + max_readahead_size_(max_readahead_size), + min_offset_read_(port::kMaxSizet), + enable_(enable), + track_min_offset_(track_min_offset) {} Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n); bool TryReadFromCache(uint64_t offset, size_t n, Slice* result); + // The minimum `offset` ever passed to TryReadFromCache(). Only be tracked + // if track_min_offset = true. + size_t min_offset_read() const { return min_offset_read_; } + private: AlignedBuffer buffer_; uint64_t buffer_offset_; RandomAccessFileReader* file_reader_; size_t readahead_size_; size_t max_readahead_size_; + // The minimum `offset` ever passed to TryReadFromCache(). + size_t min_offset_read_; + // if false, TryReadFromCache() always return false, and we only take stats + // for track_min_offset_ if track_min_offset_ = true + bool enable_; + // If true, track minimum `offset` ever passed to TryReadFromCache(), which + // can be fetched from min_offset_read(). 
+ bool track_min_offset_; }; extern Status NewWritableFile(Env* env, const std::string& fname, From b66c75217fb7039fdd72b30b5cfcc94abd89c77c Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 19 Jul 2018 18:10:12 -0700 Subject: [PATCH 2/3] Fix --- db/db_test2.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/db/db_test2.cc b/db/db_test2.cc index 3a4c3c4a5d9..b30a64900b6 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2575,6 +2575,9 @@ TEST_F(DBTest2, TestBBTTailPrefetch) { Put("9", "1"); Flush(); + // Full compaction to make sure there is no L0 file after the open. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_TRUE(called.load()); called = false; @@ -2586,7 +2589,7 @@ TEST_F(DBTest2, TestBBTTailPrefetch) { "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { size_t* prefetch_size = static_cast(arg); if (first_call) { - EXPECT_EQ(8 * 1024, *prefetch_size); + EXPECT_EQ(4 * 1024, *prefetch_size); first_call = false; } else { EXPECT_GE(4 * 1024, *prefetch_size); @@ -2603,8 +2606,6 @@ TEST_F(DBTest2, TestBBTTailPrefetch) { options.max_open_files = -1; Reopen(options); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - Put("1", "1"); Put("9", "1"); Flush(); From 35e82e591e286ef1adf7548629d4c79e18ffbc5c Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 20 Jul 2018 11:08:09 -0700 Subject: [PATCH 3/3] Fix --- db/db_test2.cc | 2 +- table/block_based_table_factory.cc | 1 + table/block_based_table_reader.cc | 10 +++++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/db/db_test2.cc b/db/db_test2.cc index b30a64900b6..79cb605ebdb 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2577,7 +2577,7 @@ TEST_F(DBTest2, TestBBTTailPrefetch) { // Full compaction to make sure there is no L0 file after the open. ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - + ASSERT_TRUE(called.load()); called = false; diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 33906fbe7b3..e56dcf644de 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -146,6 +146,7 @@ size_t TailPrefetchStats::GetSuggestedPrefetchSize() { // reality, it may not be the case when a partial block is inside the range, // or there are data in the middle that is not read. We ignore those cases // for simplicity. + assert(!sorted.empty()); size_t prev_size = sorted[0]; size_t max_qualified_size = sorted[0]; size_t wasted = 0; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index d2fdba35000..e70235c06ee 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -750,12 +750,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); } if (tail_prefetch_size == 0) { - // Before read footer, readahead backwards to prefetch data. Do more readahead - // if we're going to read index/filter. + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. That's - // because we need to issue readahead before we read the properties, at which - // point we don't yet know the index type. + // index/filter is enabled and top-level partition pinning is enabled. 
+ // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; } size_t prefetch_off;
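For reference, the selection rule implemented by TailPrefetchStats::GetSuggestedPrefetchSize() in the patch above can be read in isolation as follows: sort the recently recorded effective tail sizes, walk them from smallest to largest, and keep the largest candidate whose accumulated over-read stays within 1/8 of the bytes that would be read, capped at 512KB. The sketch below is illustrative only; SuggestTailPrefetchSize and its scaffolding are stand-in names, not the patch code.

#include <algorithm>
#include <cstddef>
#include <vector>

// Standalone sketch of the suggestion rule; it mirrors the loop in
// TailPrefetchStats::GetSuggestedPrefetchSize() but is not the patch code.
size_t SuggestTailPrefetchSize(std::vector<size_t> recent_sizes) {
  if (recent_sizes.empty()) {
    return 0;  // No history; the caller falls back to the hard-coded defaults.
  }
  std::sort(recent_sizes.begin(), recent_sizes.end());
  size_t prev = recent_sizes[0];
  size_t best = recent_sizes[0];
  size_t wasted = 0;
  for (size_t i = 1; i < recent_sizes.size(); ++i) {
    // If every open prefetched recent_sizes[i], the total bytes read would be
    // candidate * count. Raising the candidate from prev adds
    // (candidate - prev) wasted bytes for each of the i smaller records.
    size_t read = recent_sizes[i] * recent_sizes.size();
    wasted += (recent_sizes[i] - prev) * i;
    if (wasted <= read / 8) {
      best = recent_sizes[i];
    }
    prev = recent_sizes[i];
  }
  const size_t kMaxPrefetchSize = 512 * 1024;  // Never exceed 512KB.
  return std::min(kMaxPrefetchSize, best);
}

Worked through the first case of the TestTailPrefetchStats unit test: with recorded sizes {1000, 1005, 1002}, the sorted list is {1000, 1002, 1005}; candidate 1002 adds 2 wasted bytes against 3006 read, and candidate 1005 adds 6 more (8 in total) against 3015 read, so both qualify and the suggestion is 1005, which is what the test asserts.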
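The other half of the feedback loop is measuring how much of the tail was actually needed. The patch does this by constructing the FilePrefetchBuffer with track_min_offset = true, letting TryReadFromCache() remember the smallest offset ever requested, and recording file_size - min_offset_read() into TailPrefetchStats once the open succeeds. A minimal stand-in for that bookkeeping, with illustrative names (MinOffsetTracker and OnRead are not RocksDB APIs), could look like this:

#include <cstddef>
#include <cstdint>
#include <limits>

// Simplified stand-in for the tracking side of FilePrefetchBuffer; the real
// class also prefetches data and serves reads out of its buffer.
class MinOffsetTracker {
 public:
  // Called for every read request, as TryReadFromCache() is in the patch.
  void OnRead(uint64_t offset) {
    if (static_cast<size_t>(offset) < min_offset_read_) {
      min_offset_read_ = static_cast<size_t>(offset);
    }
  }
  // Smallest offset ever requested; stays at the maximum value until the
  // first read, so it should only be consulted after a successful open.
  size_t min_offset_read() const { return min_offset_read_; }

 private:
  size_t min_offset_read_ = std::numeric_limits<size_t>::max();
};

The FilePrefetchBufferMinOffset test above exercises exactly this behavior: reads at offsets 500, 480 and 490 leave min_offset_read() at 480, and in BlockBasedTable::Open() the recorded effective tail size is then file_size minus that minimum.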