From 9460832de6ca37b41251d04d95ef3ffdff90dadc Mon Sep 17 00:00:00 2001
From: Peter Mattis
Date: Fri, 8 Jun 2018 09:06:15 -0400
Subject: [PATCH] Skip sstables that are entirely covered by a range tombstone

During iteration, skip sstables that are entirely covered by a range
tombstone. The check to do this is loose: we check that an sstable's
[smallest_key,largest_key] range at largest_seqno is covered by a range
tombstone.
---
 db/db_range_del_test.cc         | 19 ++++++++++
 db/range_del_aggregator.cc      | 59 +++++++++++++++++++++++++++++++
 db/range_del_aggregator.h       |  9 +++++
 db/range_del_aggregator_test.cc | 61 +++++++++++++++++++++++++++++++++
 db/version_set.cc               | 25 +++++++++++++-
 5 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index 3aea0b75e24..cba20cbceb7 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -813,6 +813,25 @@ TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
   db_->ReleaseSnapshot(snapshot);
 }
 
+TEST_F(DBRangeDelTest, IteratorCoveredSst) {
+  // TODO(peter): generate multiple sstables with some being covered
+  // by tombstones and some that aren't.
+  db_->Put(WriteOptions(), "key", "val");
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "a", "z"));
+
+  ReadOptions read_opts;
+  auto* iter = db_->NewIterator(read_opts);
+
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ++count;
+  }
+  ASSERT_EQ(0, count);
+  delete iter;
+}
+
 #ifndef ROCKSDB_UBSAN_RUN
 TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
   db_->Put(WriteOptions(), "key", "val");
diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc
index c99d9da022d..7b0ab931c9a 100644
--- a/db/range_del_aggregator.cc
+++ b/db/range_del_aggregator.cc
@@ -65,6 +65,15 @@ class UncollapsedRangeDelMap : public RangeDelMap {
     return false;
   }
 
+  bool ShouldDeleteRange(const Slice& start, const Slice& end,
+                         SequenceNumber seqno) {
+    // Unimplemented because the only client of this method, iteration, uses
+    // collapsed maps.
+    fprintf(stderr, "UncollapsedRangeDelMap::ShouldDeleteRange unimplemented");
+    abort();
+    return false;
+  }
+
   bool IsRangeOverlapped(const Slice& start, const Slice& end) {
     for (const auto& tombstone : rep_) {
       if (ucmp_->Compare(start, tombstone.end_key_) < 0 &&
@@ -224,6 +233,44 @@ class CollapsedRangeDelMap : public RangeDelMap {
     return parsed.sequence < iter_->second;
   }
 
+  bool ShouldDeleteRange(const Slice& start, const Slice& end,
+                         SequenceNumber seqno) {
+    ParsedInternalKey parsed_start;
+    if (!ParseInternalKey(start, &parsed_start)) {
+      assert(false);
+    }
+    ParsedInternalKey parsed_end;
+    if (!ParseInternalKey(end, &parsed_end)) {
+      assert(false);
+    }
+    if (ucmp_->Compare(parsed_start.user_key, parsed_end.user_key) > 0) {
+      return false;
+    }
+
+    auto iter = rep_.upper_bound(parsed_start.user_key);
+    if (iter == rep_.begin()) {
+      // Before the start of the deletion intervals.
+      return false;
+    }
+    --iter;
+    if (ucmp_->Compare(parsed_start.user_key, iter->first) < 0) {
+      return false;
+    }
+    // Loop looking for a tombstone that is older than the range
+    // sequence number, or until we determine that our range is
+    // completely covered by newer tombstones.
+    for (; iter != rep_.end(); ++iter) {
+      if (ucmp_->Compare(parsed_end.user_key, iter->first) < 0) {
+        return true;
+      }
+      if (seqno >= iter->second) {
+        // Tombstone is older than the range sequence number.
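For intuition, the CollapsedRangeDelMap check above can be read as a walk over
half-open tombstone intervals. The following is a simplified standalone sketch,
not the code the patch adds: it assumes the collapsed map is a plain std::map
keyed by user key whose value is the seqno of the tombstone starting at that
key (0 marking a gap), and it ignores internal-key parsing and the user
comparator. The hypothetical RangeCovered helper mirrors the loop structure of
ShouldDeleteRange:

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>

    // Returns true iff every interval overlapping [start,end] (inclusive)
    // carries a tombstone seqno strictly greater than `seqno`.
    bool RangeCovered(const std::map<std::string, uint64_t>& collapsed,
                      const std::string& start, const std::string& end,
                      uint64_t seqno) {
      if (start > end) return false;
      auto iter = collapsed.upper_bound(start);
      if (iter == collapsed.begin()) return false;  // before the first tombstone
      --iter;  // now at the interval containing `start`
      for (; iter != collapsed.end(); ++iter) {
        if (end < iter->first) return true;       // walked past `end`: covered
        if (seqno >= iter->second) return false;  // interval too old, or a gap
      }
      return false;  // `end` extends beyond the last tombstone
    }

    int main() {
      // A tombstone ["a","c") at seqno 10 collapses to {a -> 10, c -> 0}.
      std::map<std::string, uint64_t> m{{"a", 10}, {"c", 0}};
      assert(RangeCovered(m, "a", "b", 9));    // older data, fully inside
      assert(!RangeCovered(m, "a", "b", 10));  // data as new as the tombstone
      assert(!RangeCovered(m, "a", "c", 9));   // inclusive end "c" is outside
      return 0;
    }
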
+        return false;
+      }
+    }
+    return false;
+  }
+
   bool IsRangeOverlapped(const Slice&, const Slice&) {
     // Unimplemented because the only client of this method, file ingestion,
     // uses uncollapsed maps.
@@ -409,6 +456,18 @@ bool RangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
   return tombstone_map.ShouldDelete(parsed, mode);
 }
 
+bool RangeDelAggregator::ShouldDeleteRange(
+    const Slice& start, const Slice& end, SequenceNumber seqno) {
+  if (rep_ == nullptr) {
+    return false;
+  }
+  auto& tombstone_map = GetRangeDelMap(seqno);
+  if (tombstone_map.IsEmpty()) {
+    return false;
+  }
+  return tombstone_map.ShouldDeleteRange(start, end, seqno);
+}
+
 bool RangeDelAggregator::IsRangeOverlapped(const Slice& start,
                                            const Slice& end) {
   // Unimplemented because the only client of this method, file ingestion,
diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h
index 5ba85814357..f28b9ffe3c6 100644
--- a/db/range_del_aggregator.h
+++ b/db/range_del_aggregator.h
@@ -61,6 +61,8 @@ class RangeDelMap {
   virtual bool ShouldDelete(const ParsedInternalKey& parsed,
                             RangeDelPositioningMode mode) = 0;
+  virtual bool ShouldDeleteRange(const Slice& start, const Slice& end,
+                                 SequenceNumber seqno) = 0;
   virtual bool IsRangeOverlapped(const Slice& start, const Slice& end) = 0;
 
   virtual void InvalidatePosition() = 0;
@@ -127,6 +129,13 @@ class RangeDelAggregator {
   bool ShouldDeleteImpl(const Slice& internal_key,
                         RangeDelPositioningMode mode);
 
+  // Returns true if one or more tombstones that are newer than seqno fully
+  // cover the range [start,end] (both endpoints are inclusive). Beware the
+  // inclusive endpoint, which differs from most other key ranges but matches
+  // the largest_key metadata for an sstable.
+  bool ShouldDeleteRange(const Slice& start, const Slice& end,
+                         SequenceNumber seqno);
+
   // Checks whether range deletions cover any keys between `start` and `end`,
   // inclusive.
   //
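Concretely, given a single tombstone ["a","c") at seqno 10: a file whose keys
span ["a","b"] at largest_seqno 9 is fully covered; a file spanning ["a","b"]
at seqno 10 is not, because the tombstone must be strictly newer; and a file
spanning ["a","c"] at seqno 9 is not either, because the inclusive largest key
"c" sits at the tombstone's exclusive end. The new unit tests below exercise
exactly these cases.
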
diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc
index 85db17b1679..964ebebb18b 100644
--- a/db/range_del_aggregator_test.cc
+++ b/db/range_del_aggregator_test.cc
@@ -21,6 +21,12 @@ struct ExpectedPoint {
   SequenceNumber seq;
 };
 
+struct ExpectedRange {
+  Slice begin;
+  Slice end;
+  SequenceNumber seq;
+};
+
 enum Direction {
   kForward,
   kReverse,
@@ -121,6 +127,25 @@ void VerifyRangeDels(
   }
 }
 
+bool ShouldDeleteRange(const std::vector<RangeTombstone>& range_dels,
+                       const ExpectedRange& expected_range) {
+  RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, true);
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  std::unique_ptr<test::VectorIterator> range_del_iter(
+      new test::VectorIterator(keys, values));
+  range_del_agg.AddTombstones(std::move(range_del_iter));
+
+  std::string begin, end;
+  AppendInternalKey(&begin, {expected_range.begin, expected_range.seq, kTypeValue});
+  AppendInternalKey(&end, {expected_range.end, expected_range.seq, kTypeValue});
+  return range_del_agg.ShouldDeleteRange(begin, end, expected_range.seq);
+}
+
 }  // anonymous namespace
 
 TEST_F(RangeDelAggregatorTest, Empty) { VerifyRangeDels({}, {{"a", 0}}, {}); }
@@ -275,6 +300,42 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) {
                   {{"c", "d", 20}, {"e", "f", 20}, {"f", "g", 10}});
 }
 
+TEST_F(RangeDelAggregatorTest, ShouldDeleteRange) {
+  ASSERT_TRUE(ShouldDeleteRange(
+      {{"a", "c", 10}},
+      {"a", "b", 9}));
+  ASSERT_TRUE(ShouldDeleteRange(
+      {{"a", "c", 10}},
+      {"a", "a", 9}));
+  ASSERT_FALSE(ShouldDeleteRange(
+      {{"a", "c", 10}},
+      {"b", "a", 9}));
+  ASSERT_FALSE(ShouldDeleteRange(
+      {{"a", "c", 10}},
+      {"a", "b", 10}));
+  ASSERT_FALSE(ShouldDeleteRange(
+      {{"a", "c", 10}},
+      {"a", "c", 9}));
+  ASSERT_FALSE(ShouldDeleteRange(
+      {{"b", "c", 10}},
+      {"a", "b", 9}));
+  ASSERT_TRUE(ShouldDeleteRange(
+      {{"a", "b", 10}, {"b", "d", 20}},
+      {"a", "c", 9}));
+  ASSERT_FALSE(ShouldDeleteRange(
+      {{"a", "b", 10}, {"b", "d", 20}},
+      {"a", "c", 15}));
+  ASSERT_FALSE(ShouldDeleteRange(
+      {{"a", "b", 10}, {"c", "e", 20}},
+      {"a", "d", 9}));
+  ASSERT_TRUE(ShouldDeleteRange(
+      {{"a", "b", 10}, {"c", "e", 20}},
+      {"c", "d", 15}));
+  ASSERT_FALSE(ShouldDeleteRange(
+      {{"a", "b", 10}, {"c", "e", 20}},
+      {"c", "d", 20}));
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
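The version_set.cc hunks below wire the check into LevelIterator: when a
file's whole key range at its largest_seqno is covered, NewFileIterator()
reports no iterator and the existing SkipEmptyFile{Forward,Backward} logic
advances past the file. A rough conceptual sketch of that skipping, using
hypothetical FileMeta/FirstVisibleFile names rather than the actual
LevelIterator code:

    #include <cstdint>
    #include <string>
    #include <vector>

    struct FileMeta {
      std::string smallest_key;
      std::string largest_key;
      uint64_t largest_seqno;
    };

    // Returns the index of the first file at or after `start` that may still
    // contain visible keys, or -1 if every remaining file is shadowed by
    // newer range tombstones. `Agg` stands in for RangeDelAggregator.
    template <typename Agg>
    int FirstVisibleFile(const std::vector<FileMeta>& files, Agg& agg,
                         int start) {
      for (int i = start; i < static_cast<int>(files.size()); ++i) {
        const FileMeta& f = files[i];
        if (!agg.ShouldDeleteRange(f.smallest_key, f.largest_key,
                                   f.largest_seqno)) {
          return i;  // this file's iterator is worth opening
        }
      }
      return -1;
    }
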
diff --git a/db/version_set.cc b/db/version_set.cc
index b7a62d5e7dd..41b23837bcc 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -540,6 +540,14 @@ class LevelIterator final : public InternalIterator {
   InternalIterator* NewFileIterator() {
     assert(file_index_ < flevel_->num_files);
     auto file_meta = flevel_->files[file_index_];
+    // Check to see if every key in the sstable is covered by a range
+    // tombstone. SkipEmptyFile{Forward,Backward} will take care of
+    // skipping over an "empty" file if we return null.
+    if (range_del_agg_->ShouldDeleteRange(
+            file_meta.smallest_key, file_meta.largest_key,
+            file_meta.file_metadata->largest_seqno)) {
+      return nullptr;
+    }
+
     if (should_sample_) {
       sample_file_read_inc(file_meta.file_metadata);
     }
@@ -574,6 +582,21 @@ void LevelIterator::Seek(const Slice& target) {
 
   InitFileIterator(new_file_index);
   if (file_iter_.iter() != nullptr) {
+    // TODO(peter): Rather than seeking to target, we could use
+    // RangeDelAggregator to find the first key within the file that
+    // is not covered by a range tombstone. Something like:
+    //
+    //   first_non_deleted = range_del_agg_->SeekForward(
+    //       smallest_key, largest_key, largest_seqno);
+    //   if (target < first_non_deleted) {
+    //     target = first_non_deleted;
+    //   }
+    //
+    // This optimization applies to SeekForPrev, Next and Prev as
+    // well. For Next and Prev we'd want some way to keep track of the
+    // current tombstone. Perhaps a RangeDelAggregator::Iterator,
+    // though that would need to be stable in the presence of
+    // modifications to RangeDelAggregator tombstone_maps.
     file_iter_.Seek(target);
   }
   SkipEmptyFileForward();
@@ -588,8 +611,8 @@ void LevelIterator::SeekForPrev(const Slice& target) {
   InitFileIterator(new_file_index);
   if (file_iter_.iter() != nullptr) {
     file_iter_.SeekForPrev(target);
-    SkipEmptyFileBackward();
   }
+  SkipEmptyFileBackward();
 }
 
 void LevelIterator::SeekToFirst() {
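One subtlety in the final SeekForPrev() hunk: SkipEmptyFileBackward() moves
outside the null check because, with NewFileIterator() now returning nullptr
for a fully covered file, InitFileIterator() can leave file_iter_ empty and
the backward skip still has to run in that case. Seek() already called
SkipEmptyFileForward() unconditionally, so only SeekForPrev() needed this
adjustment.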