From f502dd67e6ca22ba6e127d6476f1f1f39ce0f493 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Mon, 1 Oct 2018 13:35:08 -0700 Subject: [PATCH 1/6] Truncate RangeTombstones by leveraging InternalKeys Summary: To more accurately truncate range tombstones at SST boundaries, we now represent them in RangeDelAggregator using InternalKeys, which remain end-key-exclusive as they were before this change. During compaction, "atomic compaction unit boundaries" (the range of keys contained in neighbouring and overlapping SSTs) are propagated down to RangeDelAggregator to truncate range tombstones at those boundaries instead. See https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and https://github.com/facebook/rocksdb/pull/4432#discussion_r221138683 for motivating examples. Test Plan: make -j64 check --- db/compaction.cc | 51 ++++++++- db/compaction.h | 25 +++++ db/range_del_aggregator.cc | 174 ++++++++++++++++++----------- db/range_del_aggregator.h | 29 ++++- db/range_del_aggregator_test.cc | 189 ++++++++++++++++++++++++-------- db/table_cache.cc | 19 +++- db/table_cache.h | 4 +- db/version_set.cc | 23 +++- 8 files changed, 390 insertions(+), 124 deletions(-) diff --git a/db/compaction.cc b/db/compaction.cc index 4ea92d5cc78..65d39ec63d3 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -81,6 +81,55 @@ void Compaction::GetBoundaryKeys( } } +std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) { + const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].level == 0 || inputs[i].files.empty()) { + continue; + } + inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size()); + AtomicCompactionUnitBoundary cur_boundary; + uint64_t cur_end_key_footer = 0; + size_t first_atomic_idx = 0; + auto add_unit_boundary = [&](size_t to) { + if (first_atomic_idx == to) return; + for (size_t k = first_atomic_idx; k < to; k++) { + inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary); + } + first_atomic_idx = to; + }; + for (size_t j = 0; j < inputs[i].files.size(); j++) { + const auto* f = inputs[i].files[j]; + const Slice& start_user_key = f->smallest.user_key(); + const Slice& end_user_key = f->largest.user_key(); + if (first_atomic_idx == j) { + // First file in an atomic compaction unit. + cur_boundary.smallest = start_user_key; + cur_boundary.largest = end_user_key; + } else if (ucmp->Compare(cur_boundary.largest, start_user_key) == 0 && + cur_end_key_footer != + PackSequenceAndType(kMaxSequenceNumber, + kTypeRangeDeletion)) { + // SSTs overlap but the end key of the previous file was not + // artificially extended by a range tombstone. Extend the current + // boundary. + cur_boundary.largest = end_user_key; + } else { + // Atomic compaction unit has ended.
+ add_unit_boundary(j); + cur_boundary.smallest = start_user_key; + cur_boundary.largest = end_user_key; + } + cur_end_key_footer = ExtractInternalKeyFooter(f->largest.Encode()); + } + add_unit_boundary(inputs[i].files.size()); + assert(inputs[i].files.size() == + inputs[i].atomic_compaction_unit_boundaries.size()); + } + return std::move(inputs); +} + // helper function to determine if compaction is creating files at the // bottommost level bool Compaction::IsBottommostLevel( @@ -155,7 +204,7 @@ Compaction::Compaction(VersionStorageInfo* vstorage, output_compression_(_compression), output_compression_opts_(_compression_opts), deletion_compaction_(_deletion_compaction), - inputs_(std::move(_inputs)), + inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), grandparents_(std::move(_grandparents)), score_(_score), bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), diff --git a/db/compaction.h b/db/compaction.h index f1d78771696..58793c65a03 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -15,11 +15,23 @@ namespace rocksdb { +// An AtomicCompactionUnitBoundary represents a range of keys [smallest, +// largest] that exactly spans one or more neighbouring SSTs on the same +// level. Every pair of SSTs in this range "overlaps" (i.e., the largest +// user key of one file is the smallest user key of the next file). These +// boundaries are propagated down to RangeDelAggregator during compaction +// to provide safe truncation boundaries for range tombstones. +struct AtomicCompactionUnitBoundary { + Slice smallest; + Slice largest; +}; + // The structure that manages compaction input files associated // with the same physical level. struct CompactionInputFiles { int level; std::vector<FileMetaData*> files; + std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries; inline bool empty() const { return files.empty(); } inline size_t size() const { return files.size(); } inline void clear() { files.clear(); } @@ -96,6 +108,12 @@ class Compaction { return inputs_[compaction_input_level][i]; } + const std::vector<AtomicCompactionUnitBoundary>* boundaries( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries; + } + // Returns the list of file meta data of the specified compaction // input level. // REQUIREMENT: "compaction_input_level" must be >= 0 and @@ -262,6 +280,13 @@ class Compaction { const std::vector<CompactionInputFiles>& inputs, Slice* smallest_key, Slice* largest_key); + // Get the atomic file boundaries for all files in the compaction. Necessary + // in order to avoid the scenario described in + // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and to + // plumb down appropriate key boundaries to RangeDelAggregator during compaction.
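For illustration only (not part of the patch; FileBounds and GroupAtomicUnits are invented names, and the range-tombstone-sentinel refinement is deliberately omitted), the grouping rule that PopulateWithAtomicBoundaries applies can be sketched standalone:

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    // Illustrative stand-in for an SST file's smallest/largest user keys.
    struct FileBounds {
      std::string smallest;
      std::string largest;
    };

    // Groups consecutive files into atomic compaction units: a unit keeps
    // growing while each file's smallest user key touches the previous
    // file's largest user key, and a gap starts a new unit. This mirrors
    // the spirit of PopulateWithAtomicBoundaries, minus the sentinel check
    // on the previous file's end key.
    std::vector<std::pair<std::string, std::string>> GroupAtomicUnits(
        const std::vector<FileBounds>& files) {
      std::vector<std::pair<std::string, std::string>> units;
      for (const FileBounds& f : files) {
        if (!units.empty() && units.back().second == f.smallest) {
          units.back().second = f.largest;  // overlapping: extend the unit
        } else {
          units.emplace_back(f.smallest, f.largest);  // gap: new unit
        }
      }
      return units;
    }

    int main() {
      // [a,c] and [c,e] share user key "c", so they form one unit; [f,g]
      // is separated by a gap and starts a fresh unit.
      auto units = GroupAtomicUnits({{"a", "c"}, {"c", "e"}, {"f", "g"}});
      assert(units.size() == 2);
      assert(units[0] == std::make_pair(std::string("a"), std::string("e")));
      assert(units[1] == std::make_pair(std::string("f"), std::string("g")));
      return 0;
    }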
+ static std::vector PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector inputs); + // helper function to determine if compaction with inputs and storage is // bottommost static bool IsBottommostLevel( diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index d1885603c6c..787b0d1b198 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -11,19 +11,29 @@ namespace rocksdb { struct TombstoneStartKeyComparator { - TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {} + TombstoneStartKeyComparator(const InternalKeyComparator* c) : cmp(c) {} - bool operator()(const RangeTombstone& a, const RangeTombstone& b) const { + bool operator()(const TruncatedRangeTombstone& a, const TruncatedRangeTombstone& b) const { return cmp->Compare(a.start_key_, b.start_key_) < 0; } - const Comparator* cmp; + const InternalKeyComparator* cmp; +}; + +struct ParsedInternalKeyComparator { + ParsedInternalKeyComparator(const InternalKeyComparator* c) : cmp(c) {} + + bool operator()(const ParsedInternalKey& a, const ParsedInternalKey& b) const { + return cmp->Compare(a, b) < 0; + } + + const InternalKeyComparator* cmp; }; // An UncollapsedRangeDelMap is quick to create but slow to answer ShouldDelete // queries. class UncollapsedRangeDelMap : public RangeDelMap { - typedef std::multiset Rep; + typedef std::multiset Rep; class Iterator : public RangeDelIterator { const Rep& rep_; @@ -35,48 +45,55 @@ class UncollapsedRangeDelMap : public RangeDelMap { void Next() override { iter_++; } void Seek(const Slice&) override { - fprintf(stderr, "UncollapsedRangeDelMap::Iterator::Seek unimplemented\n"); + fprintf(stderr, "UncollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n"); abort(); } - RangeTombstone Tombstone() const override { return *iter_; } + void Seek(const ParsedInternalKey&) override { + fprintf(stderr, "UncollapsedRangeDelMap::Iterator::Seek(ParsedInternalKey&) unimplemented\n"); + abort(); + } + + RangeTombstone Tombstone() const override { return iter_->Tombstone(); } }; Rep rep_; - const Comparator* ucmp_; + const InternalKeyComparator* icmp_; public: - UncollapsedRangeDelMap(const Comparator* ucmp) - : rep_(TombstoneStartKeyComparator(ucmp)), ucmp_(ucmp) {} + UncollapsedRangeDelMap(const InternalKeyComparator* icmp) + : rep_(TombstoneStartKeyComparator(icmp)), icmp_(icmp) {} bool ShouldDelete(const ParsedInternalKey& parsed, RangeDelPositioningMode mode) override { (void)mode; assert(mode == RangeDelPositioningMode::kFullScan); for (const auto& tombstone : rep_) { - if (ucmp_->Compare(parsed.user_key, tombstone.start_key_) < 0) { + if (icmp_->Compare(parsed, tombstone.start_key_) < 0) { break; } if (parsed.sequence < tombstone.seq_ && - ucmp_->Compare(parsed.user_key, tombstone.end_key_) < 0) { + icmp_->Compare(parsed, tombstone.end_key_) < 0) { return true; + } else { } } return false; } - bool IsRangeOverlapped(const Slice& start, const Slice& end) override { + bool IsRangeOverlapped(const ParsedInternalKey& start, + const ParsedInternalKey& end) override { for (const auto& tombstone : rep_) { - if (ucmp_->Compare(start, tombstone.end_key_) < 0 && - ucmp_->Compare(tombstone.start_key_, end) <= 0 && - ucmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) { + if (icmp_->Compare(start, tombstone.end_key_) < 0 && + icmp_->Compare(tombstone.start_key_, end) <= 0 && + icmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) { return true; } } return false; } - void AddTombstone(RangeTombstone tombstone) override { + void 
AddTombstone(TruncatedRangeTombstone tombstone) override { rep_.emplace(tombstone); } @@ -126,7 +143,9 @@ class UncollapsedRangeDelMap : public RangeDelMap { // compared against the map entry g → 3 and determined to be uncovered. By // contrast, the key h @ 2 would be determined to be covered. class CollapsedRangeDelMap : public RangeDelMap { - typedef std::map Rep; + typedef std::map + Rep; class Iterator : public RangeDelIterator { void MaybeSeekPastSentinel() { @@ -148,7 +167,12 @@ class CollapsedRangeDelMap : public RangeDelMap { MaybeSeekPastSentinel(); } - void Seek(const Slice& target) override { + void Seek(const Slice&) override { + fprintf(stderr, "CollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n"); + abort(); + } + + void Seek(const ParsedInternalKey& target) override { iter_ = rep_.upper_bound(target); if (iter_ != rep_.begin()) { iter_--; @@ -161,8 +185,8 @@ class CollapsedRangeDelMap : public RangeDelMap { assert(std::next(iter_) != rep_.end()); assert(iter_->second != 0); RangeTombstone tombstone; - tombstone.start_key_ = iter_->first; - tombstone.end_key_ = std::next(iter_)->first; + tombstone.start_key_ = iter_->first.user_key; + tombstone.end_key_ = std::next(iter_)->first.user_key; tombstone.seq_ = iter_->second; return tombstone; } @@ -170,12 +194,12 @@ class CollapsedRangeDelMap : public RangeDelMap { Rep rep_; Rep::iterator iter_; - const Comparator* ucmp_; + const InternalKeyComparator* icmp_; public: - explicit CollapsedRangeDelMap(const Comparator* ucmp) - : rep_(stl_wrappers::LessOfComparator(ucmp)), - ucmp_(ucmp) { + explicit CollapsedRangeDelMap(const InternalKeyComparator* icmp) + : rep_(ParsedInternalKeyComparator(icmp)), + icmp_(icmp) { InvalidatePosition(); } @@ -194,29 +218,29 @@ class CollapsedRangeDelMap : public RangeDelMap { case RangeDelPositioningMode::kForwardTraversal: assert(iter_ != rep_.end()); if (iter_ == rep_.begin() && - ucmp_->Compare(parsed.user_key, iter_->first) < 0) { + icmp_->Compare(parsed, iter_->first) < 0) { // before start of deletion intervals return false; } while (std::next(iter_) != rep_.end() && - ucmp_->Compare(std::next(iter_)->first, parsed.user_key) <= 0) { + icmp_->Compare(std::next(iter_)->first, parsed) <= 0) { ++iter_; } break; case RangeDelPositioningMode::kBackwardTraversal: assert(iter_ != rep_.end()); while (iter_ != rep_.begin() && - ucmp_->Compare(parsed.user_key, iter_->first) < 0) { + icmp_->Compare(parsed, iter_->first) < 0) { --iter_; } if (iter_ == rep_.begin() && - ucmp_->Compare(parsed.user_key, iter_->first) < 0) { + icmp_->Compare(parsed, iter_->first) < 0) { // before start of deletion intervals return false; } break; case RangeDelPositioningMode::kBinarySearch: - iter_ = rep_.upper_bound(parsed.user_key); + iter_ = rep_.upper_bound(parsed); if (iter_ == rep_.begin()) { // before start of deletion intervals return false; @@ -225,21 +249,22 @@ class CollapsedRangeDelMap : public RangeDelMap { break; } assert(iter_ != rep_.end() && - ucmp_->Compare(iter_->first, parsed.user_key) <= 0); + icmp_->Compare(iter_->first, parsed) <= 0); assert(std::next(iter_) == rep_.end() || - ucmp_->Compare(parsed.user_key, std::next(iter_)->first) < 0); + icmp_->Compare(parsed, std::next(iter_)->first) < 0); return parsed.sequence < iter_->second; } - bool IsRangeOverlapped(const Slice&, const Slice&) override { + bool IsRangeOverlapped(const ParsedInternalKey&, + const ParsedInternalKey&) override { // Unimplemented because the only client of this method, file ingestion, // uses uncollapsed maps. 
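As an aside, the transition-map representation described above is easy to exercise in isolation. A minimal sketch, assuming plain string keys instead of internal keys (CollapsedMap and covered are invented helpers, not RocksDB API), reproduces the g -> 3 example: each entry key -> seqno applies until the next entry, and a value of 0 ends coverage:

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>

    // A collapsed map: entry k -> s means user keys in [k, next entry) are
    // deleted at sequence numbers below s; s == 0 means no tombstone here.
    using CollapsedMap = std::map<std::string, uint64_t>;

    bool covered(const CollapsedMap& m, const std::string& key, uint64_t seq) {
      auto it = m.upper_bound(key);       // first transition strictly after key
      if (it == m.begin()) return false;  // before all tombstones
      --it;                               // transition governing this interval
      return seq < it->second;
    }

    int main() {
      // Tombstones [c, g) @ 8 and [g, i) @ 3, collapsed into transitions.
      CollapsedMap m{{"c", 8}, {"g", 3}, {"i", 0}};
      assert(covered(m, "e", 5));   // e @ 5 is under [c, g) @ 8
      assert(!covered(m, "g", 5));  // g @ 5 is above the entry g -> 3
      assert(covered(m, "h", 2));   // h @ 2 is under the entry g -> 3
      assert(!covered(m, "j", 1));  // past the final transition i -> 0
      return 0;
    }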
fprintf(stderr, "CollapsedRangeDelMap::IsRangeOverlapped unimplemented"); abort(); } - void AddTombstone(RangeTombstone t) override { - if (ucmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) { + void AddTombstone(TruncatedRangeTombstone t) override { + if (icmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) { // The tombstone covers no keys. Nothing to do. return; } @@ -272,7 +297,8 @@ class CollapsedRangeDelMap : public RangeDelMap { end_seq = prev_seq(); Rep::iterator pit; if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() && - ucmp_->Compare(pit->first, t.start_key_) == 0 && std::prev(pit)->second == t.seq_) { + icmp_->Compare(pit->first, t.start_key_) == 0 && + std::prev(pit)->second == t.seq_) { // The new tombstone starts at the end of an existing tombstone with an // identical seqno: // @@ -297,7 +323,7 @@ class CollapsedRangeDelMap : public RangeDelMap { } // Look at all the existing transitions that overlap the new tombstone. - while (it != rep_.end() && ucmp_->Compare(it->first, t.end_key_) < 0) { + while (it != rep_.end() && icmp_->Compare(it->first, t.end_key_) < 0) { if (t.seq_ >= it->second) { // The transition is to an existing tombstone that the new tombstone // covers. Save the covered tombstone's seqno. We'll need to return to @@ -343,12 +369,14 @@ class CollapsedRangeDelMap : public RangeDelMap { if (t.seq_ == prev_seq()) { // The new tombstone is unterminated in the map. - if (it != rep_.end() && t.seq_ == it->second && ucmp_->Compare(it->first, t.end_key_) == 0) { + if (it != rep_.end() && t.seq_ == it->second && + icmp_->Compare(it->first, t.end_key_) == 0) { // The new tombstone ends at the start of another tombstone with an // identical seqno. Merge the tombstones by removing the existing // tombstone's start key. rep_.erase(it); - } else if (end_seq == prev_seq() || (it != rep_.end() && end_seq == it->second)) { + } else if (end_seq == prev_seq() || + (it != rep_.end() && end_seq == it->second)) { // The new tombstone is implicitly ended because its end point is // contained within an existing tombstone with the same seqno: // @@ -363,7 +391,8 @@ class CollapsedRangeDelMap : public RangeDelMap { // Install one that returns to the last seqno we covered. Because end // keys are exclusive, if there's an existing transition at t.end_key_, // it takes precedence over the transition that we install here. 
- rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry + rep_.emplace(t.end_key_, + end_seq); // emplace is a noop if existing entry } } else { // The new tombstone is implicitly ended because its end point is covered @@ -416,9 +445,9 @@ void RangeDelAggregator::InitRep(const std::vector& snapshots) { std::unique_ptr RangeDelAggregator::NewRangeDelMap() { RangeDelMap* tombstone_map; if (collapse_deletions_) { - tombstone_map = new CollapsedRangeDelMap(icmp_.user_comparator()); + tombstone_map = new CollapsedRangeDelMap(&icmp_); } else { - tombstone_map = new UncollapsedRangeDelMap(icmp_.user_comparator()); + tombstone_map = new UncollapsedRangeDelMap(&icmp_); } return std::unique_ptr(tombstone_map); } @@ -429,13 +458,13 @@ bool RangeDelAggregator::ShouldDeleteImpl(const Slice& internal_key, ParsedInternalKey parsed; if (!ParseInternalKey(internal_key, &parsed)) { assert(false); + return false; } - return ShouldDelete(parsed, mode); + return ShouldDeleteImpl(parsed, mode); } bool RangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { - assert(IsValueType(parsed.type)); assert(rep_ != nullptr); auto& tombstone_map = GetRangeDelMap(parsed.sequence); if (tombstone_map.IsEmpty()) { @@ -452,8 +481,10 @@ bool RangeDelAggregator::IsRangeOverlapped(const Slice& start, if (rep_ == nullptr) { return false; } + ParsedInternalKey start_ikey(start, kMaxSequenceNumber, kMaxValue); + ParsedInternalKey end_ikey(end, 0, static_cast(0)); for (const auto& stripe : rep_->stripe_map_) { - if (stripe.second->IsRangeOverlapped(start, end)) { + if (stripe.second->IsRangeOverlapped(start_ikey, end_ikey)) { return true; } } @@ -492,40 +523,49 @@ Status RangeDelAggregator::AddTombstones( if (!parsed) { return Status::Corruption("Unable to parse range tombstone InternalKey"); } - RangeTombstone tombstone; + Slice end_user_key; if (input->IsValuePinned()) { - tombstone = RangeTombstone(parsed_key, input->value()); + end_user_key = input->value(); } else { // The tombstone map holds slices into the iterator's memory. Make a // copy of the value if it is not pinned. rep_->pinned_slices_.emplace_back(input->value().data(), input->value().size()); - tombstone = RangeTombstone(parsed_key, rep_->pinned_slices_.back()); + end_user_key = rep_->pinned_slices_.back(); } + ParsedInternalKey start_key(parsed_key.user_key, kMaxSequenceNumber, + kMaxValue); + ParsedInternalKey end_key(end_user_key, kMaxSequenceNumber, kMaxValue); // Truncate the tombstone to the range [smallest, largest]. if (smallest != nullptr) { - if (icmp_.user_comparator()->Compare( - tombstone.start_key_, smallest->user_key()) < 0) { - tombstone.start_key_ = smallest->user_key(); + ParsedInternalKey parsed_smallest; + if (ParseInternalKey(smallest->Encode(), &parsed_smallest) && + icmp_.Compare(start_key, parsed_smallest) < 0) { + start_key.user_key = parsed_smallest.user_key; + start_key.sequence = parsed_smallest.sequence; } } if (largest != nullptr) { - // To safely truncate the range tombstone's end key, it must extend past - // the largest key in the sstable (which may have been extended to the - // smallest key in the next sstable), and largest must be a tombstone - // sentinel key. A range tombstone may straddle two sstables and not be - // the tombstone sentinel key in the first sstable if a user-key also - // straddles the sstables (possible if there is a snapshot between the - // two versions of the user-key), in which case we cannot truncate the - // range tombstone. 
- if (icmp_.user_comparator()->Compare(tombstone.end_key_, - largest->user_key()) > 0 && - GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber) { - tombstone.end_key_ = largest->user_key(); + ParsedInternalKey parsed_largest; + if (ParseInternalKey(largest->Encode(), &parsed_largest) && + icmp_.Compare(end_key, parsed_largest) > 0) { + end_key.user_key = parsed_largest.user_key; + if (parsed_largest.sequence != kMaxSequenceNumber) { + // The same user key straddles two adjacent sstables. To make sure we + // can truncate to a range that includes the largest range tombstone, + // set the tombstone end key's sequence number to 1 less than the + // largest key. + assert(parsed_largest.sequence != 0); + end_key.sequence = parsed_largest.sequence - 1; + } else { + // The SST file boundary was artificially extended by a range tombstone. + // We will not see any entries in this SST with this user key, so we + // can leave the seqnum at kMaxSequenceNumber. + } } } - auto seq = tombstone.seq_; - GetRangeDelMap(seq).AddTombstone(std::move(tombstone)); + TruncatedRangeTombstone tombstone(start_key, end_key, parsed_key.sequence); + GetRangeDelMap(parsed_key.sequence).AddTombstone(std::move(tombstone)); input->Next(); } if (!first_iter) { @@ -604,6 +644,11 @@ class MergingRangeDelIter : public RangeDelIterator { } void Seek(const Slice& target) override { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kMaxValue); + Seek(ikey); + } + + void Seek(const ParsedInternalKey& target) override { heap_.clear(); for (auto& iter : iters_) { iter->Seek(target); @@ -617,6 +662,7 @@ class MergingRangeDelIter : public RangeDelIterator { RangeTombstone Tombstone() const override { return current_->Tombstone(); } private: + // TODO: evaluate whether this is the best approach; maybe use truncated tombstones instead struct IterComparator { IterComparator(const Comparator* c) : cmp(c) {} diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index 66088b3d4eb..6968a9ee0dd 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -40,6 +40,28 @@ enum class RangeDelPositioningMode { kBinarySearch, }; +// TruncatedRangeTombstones are a slight generalization of regular +// RangeTombstones that can represent truncations caused by SST boundaries. +// Instead of using user keys to represent the start and end keys, they instead +// use internal keys, whose sequence number indicates the sequence number of +// the smallest/largest SST key (in the case where a tombstone is untruncated, +// the sequence numbers will be kMaxSequenceNumber for both start and end +// keys). Like RangeTombstones, TruncatedRangeTombstone are also +// end-key-exclusive. +struct TruncatedRangeTombstone { + TruncatedRangeTombstone(const ParsedInternalKey& sk, + const ParsedInternalKey& ek, SequenceNumber s) + : start_key_(sk), end_key_(ek), seq_(s) {} + + RangeTombstone Tombstone() const { + return RangeTombstone(start_key_.user_key, end_key_.user_key, seq_); + } + + ParsedInternalKey start_key_; + ParsedInternalKey end_key_; + SequenceNumber seq_; +}; + // A RangeDelIterator iterates over range deletion tombstones. class RangeDelIterator { public: @@ -47,7 +69,9 @@ class RangeDelIterator { virtual bool Valid() const = 0; virtual void Next() = 0; + // NOTE: the Slice passed to this method must be a user key. 
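The sequence-number adjustment above leans on internal-key ordering: equal user keys sort by descending seqno, and tombstone end keys are exclusive. A standalone sketch (InternalKeyLite and before are invented stand-ins, not RocksDB types) shows that an end key truncated to ("e", 2) keeps "e" @ 3 covered while excluding "e" @ 2 and below:

    #include <cassert>
    #include <cstdint>
    #include <string>

    // Illustrative internal key: user key plus sequence number.
    struct InternalKeyLite {
      std::string user_key;
      uint64_t seq;
    };

    // Internal-key order: by user key ascending, then by seqno descending.
    bool before(const InternalKeyLite& a, const InternalKeyLite& b) {
      if (a.user_key != b.user_key) return a.user_key < b.user_key;
      return a.seq > b.seq;
    }

    int main() {
      // Boundary largest = ("e", 3) is a point key shared with the next
      // SST, so the tombstone end key is truncated to ("e", 3 - 1).
      InternalKeyLite end{"e", 2};
      // End keys are exclusive: covered only if the key sorts before `end`.
      assert(before({"e", 3}, end));   // "e" @ 3 in this SST stays covered
      assert(!before({"e", 2}, end));  // the end key itself is excluded
      assert(!before({"e", 1}, end));  // lower seqnos sort after: excluded
      assert(!before({"f", 9}, end));  // later user keys: excluded
      return 0;
    }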
virtual void Seek(const Slice& target) = 0; + virtual void Seek(const ParsedInternalKey& target) = 0; virtual RangeTombstone Tombstone() const = 0; }; @@ -62,13 +86,14 @@ class RangeDelMap { virtual bool ShouldDelete(const ParsedInternalKey& parsed, RangeDelPositioningMode mode) = 0; - virtual bool IsRangeOverlapped(const Slice& start, const Slice& end) = 0; + virtual bool IsRangeOverlapped(const ParsedInternalKey& start, + const ParsedInternalKey& end) = 0; virtual void InvalidatePosition() = 0; virtual size_t Size() const = 0; bool IsEmpty() const { return Size() == 0; } - virtual void AddTombstone(RangeTombstone tombstone) = 0; + virtual void AddTombstone(TruncatedRangeTombstone tombstone) = 0; virtual std::unique_ptr NewIterator() = 0; }; diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index a5746df15f8..ee530fbf274 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -27,6 +27,12 @@ enum Direction { kReverse, }; +struct AddTombstonesArgs { + const std::vector tombstones; + const InternalKey* smallest; + const InternalKey* largest; +}; + static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator()); void AddTombstones(RangeDelAggregator* range_del_agg, @@ -54,8 +60,7 @@ void VerifyRangeDelIter( RangeDelIterator* range_del_iter, const std::vector& expected_range_dels) { size_t i = 0; - for (; range_del_iter->Valid() && i < expected_range_dels.size(); - range_del_iter->Next(), i++) { + for (; range_del_iter->Valid(); range_del_iter->Next(), i++) { VerifyTombstonesEq(expected_range_dels[i], range_del_iter->Tombstone()); } ASSERT_EQ(expected_range_dels.size(), i); @@ -63,22 +68,25 @@ void VerifyRangeDelIter( } void VerifyRangeDels( - const std::vector& range_dels_in, + const std::vector& all_args, const std::vector& expected_points, const std::vector& expected_collapsed_range_dels, - const InternalKey* smallest = nullptr, const InternalKey* largest = nullptr, const InternalKeyComparator& icmp = bytewise_icmp) { // Test same result regardless of which order the range deletions are added // and regardless of collapsed mode. 
for (bool collapsed : {false, true}) { for (Direction dir : {kForward, kReverse}) { RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, collapsed); + std::vector all_range_dels; - std::vector range_dels = range_dels_in; - if (dir == kReverse) { - std::reverse(range_dels.begin(), range_dels.end()); + for (const auto& args : all_args) { + std::vector range_dels = args.tombstones; + if (dir == kReverse) { + std::reverse(range_dels.begin(), range_dels.end()); + } + all_range_dels.insert(all_range_dels.end(), range_dels.begin(), range_dels.end()); + AddTombstones(&range_del_agg, range_dels, args.smallest, args.largest); } - AddTombstones(&range_del_agg, range_dels, smallest, largest); auto mode = RangeDelPositioningMode::kFullScan; if (collapsed) { @@ -90,38 +98,45 @@ void VerifyRangeDels( parsed_key.user_key = expected_point.begin; parsed_key.sequence = expected_point.seq; parsed_key.type = kTypeValue; - ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode)); + std::string ikey; + AppendInternalKey(&ikey, parsed_key); + ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode)); if (parsed_key.sequence > 0) { --parsed_key.sequence; + ikey.clear(); + AppendInternalKey(&ikey, parsed_key); if (expected_point.expectAlive) { - ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode)); + ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode)); } else { - ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key, mode)); + ASSERT_TRUE(range_del_agg.ShouldDelete(ikey, mode)); } } } if (collapsed) { - range_dels = expected_collapsed_range_dels; - VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels); - } else if (smallest == nullptr && largest == nullptr) { + all_range_dels = expected_collapsed_range_dels; + VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels); + } else if (all_args.size() == 1 && all_args[0].smallest == nullptr && + all_args[0].largest == nullptr) { // Tombstones in an uncollapsed map are presented in start key // order. Tombstones with the same start key are presented in // insertion order. We don't handle tombstone truncation here, so the // verification is only performed if no truncation was requested. 
- std::stable_sort(range_dels.begin(), range_dels.end(), + std::stable_sort(all_range_dels.begin(), all_range_dels.end(), [&](const RangeTombstone& a, const RangeTombstone& b) { return icmp.user_comparator()->Compare( a.start_key_, b.start_key_) < 0; }); - VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels); + VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels); } } } RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, false /* collapse_deletions */); - AddTombstones(&range_del_agg, range_dels_in); + for (const auto& args : all_args) { + AddTombstones(&range_del_agg, args.tombstones); + } for (size_t i = 1; i < expected_points.size(); ++i) { bool overlapped = range_del_agg.IsRangeOverlapped( expected_points[i - 1].begin, expected_points[i].begin); @@ -138,65 +153,64 @@ void VerifyRangeDels( TEST_F(RangeDelAggregatorTest, Empty) { VerifyRangeDels({}, {{"a", 0}}, {}); } TEST_F(RangeDelAggregatorTest, SameStartAndEnd) { - VerifyRangeDels({{"a", "a", 5}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {}); + VerifyRangeDels({{{{"a", "a", 5}}}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {}); } TEST_F(RangeDelAggregatorTest, Single) { - VerifyRangeDels({{"a", "b", 10}}, {{" ", 0}, {"a", 10}, {"b", 0}}, + VerifyRangeDels({{{{"a", "b", 10}}}}, {{" ", 0}, {"a", 10}, {"b", 0}}, {{"a", "b", 10}}); } TEST_F(RangeDelAggregatorTest, OverlapAboveLeft) { - VerifyRangeDels({{"a", "c", 10}, {"b", "d", 5}}, + VerifyRangeDels({{{{"a", "c", 10}, {"b", "d", 5}}}}, {{" ", 0}, {"a", 10}, {"c", 5}, {"d", 0}}, {{"a", "c", 10}, {"c", "d", 5}}); } TEST_F(RangeDelAggregatorTest, OverlapAboveRight) { - VerifyRangeDels({{"a", "c", 5}, {"b", "d", 10}}, + VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 10}}}}, {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}}, {{"a", "b", 5}, {"b", "d", 10}}); } TEST_F(RangeDelAggregatorTest, OverlapAboveMiddle) { - VerifyRangeDels({{"a", "d", 5}, {"b", "c", 10}}, + VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 10}}}}, {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 5}, {"d", 0}}, {{"a", "b", 5}, {"b", "c", 10}, {"c", "d", 5}}); } TEST_F(RangeDelAggregatorTest, OverlapAboveMiddleReverse) { - VerifyRangeDels({{"d", "a", 5}, {"c", "b", 10}}, + VerifyRangeDels({{{{"d", "a", 5}, {"c", "b", 10}}}}, {{"z", 0}, {"d", 5}, {"c", 10}, {"b", 5}, {"a", 0}}, {{"d", "c", 5}, {"c", "b", 10}, {"b", "a", 5}}, - nullptr /* smallest */, nullptr /* largest */, InternalKeyComparator(ReverseBytewiseComparator())); } TEST_F(RangeDelAggregatorTest, OverlapFully) { - VerifyRangeDels({{"a", "d", 10}, {"b", "c", 5}}, + VerifyRangeDels({{{{"a", "d", 10}, {"b", "c", 5}}}}, {{" ", 0}, {"a", 10}, {"d", 0}}, {{"a", "d", 10}}); } TEST_F(RangeDelAggregatorTest, OverlapPoint) { - VerifyRangeDels({{"a", "b", 5}, {"b", "c", 10}}, + VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 10}}}}, {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 0}}, {{"a", "b", 5}, {"b", "c", 10}}); } TEST_F(RangeDelAggregatorTest, SameStartKey) { - VerifyRangeDels({{"a", "c", 5}, {"a", "b", 10}}, + VerifyRangeDels({{{{"a", "c", 5}, {"a", "b", 10}}}}, {{" ", 0}, {"a", 10}, {"b", 5}, {"c", 0}}, {{"a", "b", 10}, {"b", "c", 5}}); } TEST_F(RangeDelAggregatorTest, SameEndKey) { - VerifyRangeDels({{"a", "d", 5}, {"b", "d", 10}}, + VerifyRangeDels({{{{"a", "d", 5}, {"b", "d", 10}}}}, {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}}, {{"a", "b", 5}, {"b", "d", 10}}); } TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { - VerifyRangeDels({{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}, + VerifyRangeDels({{{{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}}}, {{" ", 0}, {"a", 5}, {"b", 0}, 
@@ -209,25 +223,25 @@ TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { } TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) { - VerifyRangeDels({{"a", "b", 5}, {"a", "b", 5}}, + VerifyRangeDels({{{{"a", "b", 5}, {"a", "b", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 0}}, {{"a", "b", 5}}); } TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) { - VerifyRangeDels({{"a", "b", 5}, {"b", "c", 5}}, + VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}}, {{"a", "c", 5}}); } TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) { - VerifyRangeDels({{"a", "c", 5}, {"b", "d", 5}}, + VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, {{"a", "d", 5}}); } TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) { - VerifyRangeDels({{"a", "d", 5}, {"b", "c", 5}}, + VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, {{"a", "d", 5}}); } @@ -236,27 +250,27 @@ TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) { // larger one when VerifyRangeDels() runs them in reverse TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) { VerifyRangeDels( - {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}}, + {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}}}}, {{" ", 0}, {"a", 20}, {"f", 15}, {"g", 0}}, {{"a", "f", 20}, {"f", "g", 15}}); } TEST_F(RangeDelAggregatorTest, CoverMultipleFromRight) { VerifyRangeDels( - {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}}, + {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}}}}, {{" ", 0}, {"b", 5}, {"c", 20}, {"h", 0}}, {{"b", "c", 5}, {"c", "h", 20}}); } TEST_F(RangeDelAggregatorTest, CoverMultipleFully) { VerifyRangeDels( - {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}}, + {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}}}}, {{" ", 0}, {"a", 20}, {"h", 0}}, {{"a", "h", 20}}); } TEST_F(RangeDelAggregatorTest, AlternateMultipleAboveBelow) { VerifyRangeDels( - {{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}}, + {{{{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 15}, {"d", 10}, {"e", 20}, {"g", 5}, {"h", 0}}, {{"a", "b", 5}, {"b", "d", 15}, @@ -321,31 +335,112 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) { } TEST_F(RangeDelAggregatorTest, TruncateTombstones) { - const InternalKey smallest("b", 1, kTypeRangeDeletion); + const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion); VerifyRangeDels( - {{"a", "c", 10}, {"d", "f", 10}}, + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, {{"a", 10, true}, // truncated {"b", 10, false}, // not truncated {"d", 10, false}, // not truncated {"e", 10, true}}, // truncated - {{"b", "c", 10}, {"d", "e", 10}}, - &smallest, &largest); + {{"b", "c", 10}, {"d", "e", 10}}); } -TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateTombstones) { - const InternalKey smallest("b", 1, kTypeRangeDeletion); +TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateBelowTombstone) { + const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); const InternalKey largest( "e", 3, // could happen if "e" is in consecutive sstables kTypeValue); VerifyRangeDels( - {{"a", "c", 10}, {"d", "f", 10}}, + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, + {{"a", 10, true}, // truncated + {"b", 10, false}, // not truncated + {"d", 10, false}, // not truncated + {"e", 
10, false}, // not truncated + {"e", 2, true}}, // truncated here + {{"b", "c", 10}, {"d", "e", 10}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateAboveTombstone) { + const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest( + "e", 15, // could happen if "e" is in consecutive sstables + kTypeValue); + VerifyRangeDels( + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, + {{"a", 10, true}, // truncated + {"b", 10, false}, // not truncated + {"d", 10, false}, // not truncated + {"e", kMaxSequenceNumber, true}}, // truncated + {{"b", "c", 10}, {"d", "e", 10}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateBelowTombstone) { + const InternalKey smallest("b", 5, kTypeValue); + const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, {{"a", 10, true}, // truncated + {"b", 10, true}, // truncated + {"b", 6, false}, // not truncated; start boundary moved + {"d", 10, false}, // not truncated + {"e", kMaxSequenceNumber, true}}, // truncated + {{"b", "c", 10}, {"d", "e", 10}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateAboveTombstone) { + const InternalKey smallest("b", 15, kTypeValue); + const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, + {{"a", 10, true}, // truncated + {"b", 15, true}, // truncated {"b", 10, false}, // not truncated {"d", 10, false}, // not truncated - {"e", 10, false}}, // not truncated - {{"b", "c", 10}, {"d", "f", 10}}, - &smallest, &largest); + {"e", kMaxSequenceNumber, true}}, // truncated + {{"b", "c", 10}, {"d", "e", 10}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapAboveTombstone) { + const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest1("c", 20, kTypeValue); + const InternalKey smallest2("c", 10, kTypeValue); + const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"b", "d", 5}}, &smallest1, &largest1}, + {{{"b", "d", 5}}, &smallest2, &largest2}}, + {{"b", 5, false}, // not truncated + {"c", 5, false}}, // not truncated + {{"b", "c", 5}, {"c", "d", 5}}); // not collapsed due to boundaries +} + +TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapBelowTombstone) { + const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest1("c", 20, kTypeValue); + const InternalKey smallest2("c", 10, kTypeValue); + const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"b", "d", 30}}, &smallest1, &largest1}, + {{{"b", "d", 30}}, &smallest2, &largest2}}, + {{"b", 30, false}, // not truncated + {"c", 30, false}, // not truncated + {"c", 19, true}, // truncated here (keys in this range should not exist) + {"c", 11, false}}, // not truncated again + {{"b", "c", 30}, {"c", "d", 30}}); // not collapsed due to boundaries +} + +TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapContainsTombstone) { + const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest1("c", 20, kTypeValue); + const InternalKey smallest2("c", 10, kTypeValue); + const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"b", "d", 15}}, &smallest1, &largest1}, + {{{"b", "d", 15}}, &smallest2, &largest2}}, + {{"b", 15, false}, // not 
truncated + {"c", 15, true}, // truncated (keys in this range should not exist) + {"c", 11, false}}, // not truncated here + {{"b", "c", 15}, {"c", "d", 15}}); // not collapsed due to boundaries } } // namespace rocksdb diff --git a/db/table_cache.cc b/db/table_cache.cc index f374a68766d..1a51360ab90 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -183,7 +183,9 @@ InternalIterator* TableCache::NewIterator( const InternalKeyComparator& icomparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, - bool for_compaction, Arena* arena, bool skip_filters, int level) { + bool for_compaction, Arena* arena, bool skip_filters, int level, + const Slice* smallest_compaction_key, + const Slice* largest_compaction_key) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; @@ -266,10 +268,17 @@ InternalIterator* TableCache::NewIterator( s = range_del_iter->status(); } if (s.ok()) { - s = range_del_agg->AddTombstones( - std::move(range_del_iter), - &file_meta.smallest, - &file_meta.largest); + const InternalKey* smallest = &file_meta.smallest; + const InternalKey* largest = &file_meta.largest; + InternalKey smallest_compaction_ikey, largest_compaction_ikey; + if (smallest_compaction_key != nullptr) { + smallest_compaction_ikey.Set(*smallest_compaction_key, kMaxSequenceNumber, kTypeRangeDeletion); + } + if (largest_compaction_key != nullptr) { + largest_compaction_ikey.Set(*largest_compaction_key, kMaxSequenceNumber, kTypeRangeDeletion); + } + s = range_del_agg->AddTombstones(std::move(range_del_iter), smallest, + largest); } } } diff --git a/db/table_cache.h b/db/table_cache.h index 7e7f53cc1ff..73821139214 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -56,7 +56,9 @@ class TableCache { const SliceTransform* prefix_extractor = nullptr, TableReader** table_reader_ptr = nullptr, HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, - Arena* arena = nullptr, bool skip_filters = false, int level = -1); + Arena* arena = nullptr, bool skip_filters = false, int level = -1, + const Slice* smallest_compaction_key = nullptr, + const Slice* largest_compaction_key = nullptr); // If a seek to internal key "k" in specified file finds an entry, // call (*handle_result)(arg, found_key, found_value) repeatedly until diff --git a/db/version_set.cc b/db/version_set.cc index 704dabaf882..1be2bec1f6f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -458,6 +458,7 @@ bool SomeFileOverlapsRange( } namespace { + class LevelIterator final : public InternalIterator { public: LevelIterator(TableCache* table_cache, const ReadOptions& read_options, @@ -466,7 +467,9 @@ class LevelIterator final : public InternalIterator { const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor, bool should_sample, HistogramImpl* file_read_hist, bool for_compaction, - bool skip_filters, int level, RangeDelAggregator* range_del_agg) + bool skip_filters, int level, RangeDelAggregator* range_del_agg, + const std::vector<AtomicCompactionUnitBoundary>* + compaction_boundaries = nullptr) : table_cache_(table_cache), read_options_(read_options), env_options_(env_options), @@ -480,7 +483,8 @@ class LevelIterator final : public InternalIterator { file_index_(flevel_->num_files), level_(level), range_del_agg_(range_del_agg), - pinned_iters_mgr_(nullptr) { + pinned_iters_mgr_(nullptr), + compaction_boundaries_(compaction_boundaries) { // Empty level is not supported.
assert(flevel_ != nullptr && flevel_->num_files > 0); } @@ -547,12 +551,18 @@ class LevelIterator final : public InternalIterator { sample_file_read_inc(file_meta.file_metadata); } + const Slice* smallest_compaction_key = nullptr; + const Slice* largest_compaction_key = nullptr; + if (compaction_boundaries_ != nullptr) { + smallest_compaction_key = &(*compaction_boundaries_)[file_index_].smallest; + largest_compaction_key = &(*compaction_boundaries_)[file_index_].largest; + } return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, nullptr /* don't need reference to table */, file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_); + level_, smallest_compaction_key, largest_compaction_key); } TableCache* table_cache_; @@ -572,6 +582,10 @@ class LevelIterator final : public InternalIterator { RangeDelAggregator* range_del_agg_; IteratorWrapper file_iter_; // May be nullptr PinnedIteratorsManager* pinned_iters_mgr_; + + // To be propagated to RangeDelAggregator in order to safely truncate range + // tombstones. + const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_; }; void LevelIterator::Seek(const Slice& target) { @@ -4236,7 +4250,8 @@ InternalIterator* VersionSet::MakeInputIterator( false /* should_sample */, nullptr /* no per level latency histogram */, true /* for_compaction */, false /* skip_filters */, - static_cast<int>(which) /* level */, range_del_agg); + static_cast<int>(which) /* level */, range_del_agg, + c->boundaries(which)); } } } From 7923a120b28ad47ceff325f172cebd85afe28fd8 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Mon, 1 Oct 2018 15:07:34 -0700 Subject: [PATCH 2/6] Improve a comment, fix clang compile error --- db/compaction.cc | 2 +- db/range_del_aggregator.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/db/compaction.cc b/db/compaction.cc index 65d39ec63d3..b5fed0fc326 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -127,7 +127,7 @@ std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries( assert(inputs[i].files.size() == inputs[i].atomic_compaction_unit_boundaries.size()); } - return std::move(inputs); + return inputs; } // helper function to determine if compaction is creating files at the diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index 6968a9ee0dd..459de866db2 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -54,6 +54,13 @@ struct TruncatedRangeTombstone { : start_key_(sk), end_key_(ek), seq_(s) {} RangeTombstone Tombstone() const { + // The RangeTombstone returned here can cover less than the + // TruncatedRangeTombstone when its end key has a seqnum that is not + // kMaxSequenceNumber. Since this method is only used by RangeDelIterators + // (which in turn are only used during flush/compaction), we avoid this + // problem by using truncation boundaries spanning multiple SSTs, which + // are selected in a way that guarantees a clean break at the end key.
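To make the caveat in that comment concrete, here is a self-contained check (covered_internal and covered_user are invented helpers; real coverage additionally compares against the tombstone's own seqno): an internal end key ("e", 2) still covers "e" @ 3, but the user-key RangeTombstone this method returns ends exclusively at user key "e" and covers no version of "e" at all:

    #include <cassert>
    #include <cstdint>
    #include <string>

    // Coverage under an internal end key (eu, es), end-exclusive:
    // (k, s) sorts before (eu, es) iff k < eu, or k == eu and s > es.
    bool covered_internal(const std::string& k, uint64_t s,
                          const std::string& eu, uint64_t es) {
      return k < eu || (k == eu && s > es);
    }

    // Coverage under a plain user-key end, as in RangeTombstone.
    bool covered_user(const std::string& k, const std::string& eu) {
      return k < eu;
    }

    int main() {
      // The truncated internal end key ("e", 2) still covers "e" @ 3...
      assert(covered_internal("e", 3, "e", 2));
      // ...but a user-key tombstone ending at "e" covers no version of "e",
      // so the converted RangeTombstone covers strictly less.
      assert(!covered_user("e", "e"));
      return 0;
    }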
+ assert(end_key_.sequence == kMaxSequenceNumber); return RangeTombstone(start_key_.user_key, end_key_.user_key, seq_); } From ce5d47236717ee3b1bcad388733d900d74738ee2 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Tue, 9 Oct 2018 12:11:01 -0700 Subject: [PATCH 3/6] Address PR comments, actually use compaction boundaries --- db/compaction.cc | 61 ++++++++++++++++++++++++-------- db/compaction.h | 24 +++++++++++-- db/range_del_aggregator.cc | 25 +++++++------ db/range_del_aggregator_test.cc | 2 +- db/table_cache.cc | 9 +++-- db/table_cache.h | 4 +-- db/version_set.cc | 62 +++------------------------------ 7 files changed, 94 insertions(+), 93 deletions(-) diff --git a/db/compaction.cc b/db/compaction.cc index b5fed0fc326..26d1ad3c657 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -23,6 +23,43 @@ namespace rocksdb { +const uint64_t kRangeTombstoneSentinel = + PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b) { + auto c = user_cmp->Compare(a.user_key(), b.user_key()); + if (c != 0) { + return c; + } + auto a_footer = ExtractInternalKeyFooter(a.Encode()); + auto b_footer = ExtractInternalKeyFooter(b.Encode()); + if (a_footer == kRangeTombstoneSentinel) { + if (b_footer != kRangeTombstoneSentinel) { + return -1; + } + } else if (b_footer == kRangeTombstoneSentinel) { + return 1; + } + return 0; +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b) { + if (a == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, *a, b); +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b) { + if (b == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, a, *b); +} + uint64_t TotalFileSize(const std::vector& files) { uint64_t sum = 0; for (size_t i = 0; i < files.size() && files[i]; i++) { @@ -90,7 +127,6 @@ std::vector Compaction::PopulateWithAtomicBoundaries( } inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size()); AtomicCompactionUnitBoundary cur_boundary; - uint64_t cur_end_key_footer = 0; size_t first_atomic_idx = 0; auto add_unit_boundary = [&](size_t to) { if (first_atomic_idx == to) return; @@ -101,27 +137,22 @@ std::vector Compaction::PopulateWithAtomicBoundaries( }; for (size_t j = 0; j < inputs[i].files.size(); j++) { const auto* f = inputs[i].files[j]; - const Slice& start_user_key = f->smallest.user_key(); - const Slice& end_user_key = f->largest.user_key(); - if (first_atomic_idx == j) { - // First file in an atomic compaction unit. - cur_boundary.smallest = start_user_key; - cur_boundary.largest = end_user_key; - } else if (ucmp->Compare(cur_boundary.largest, start_user_key) == 0 && - cur_end_key_footer != - PackSequenceAndType(kMaxSequenceNumber, - kTypeRangeDeletion)) { + if (j == 0) { + // First file in a level. + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) == + 0) { // SSTs overlap but the end key of the previous file was not // artificially extended by a range tombstone. Extend the current // boundary. - cur_boundary.largest = end_user_key; + cur_boundary.largest = &f->largest; } else { // Atomic compaction unit has ended. 
add_unit_boundary(j); + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; } - cur_end_key_footer = ExtractInternalKeyFooter(f->largest.Encode()); } add_unit_boundary(inputs[i].files.size()); assert(inputs[i].files.size() == diff --git a/db/compaction.h b/db/compaction.h index 58793c65a03..2cf737b676a 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -15,6 +15,26 @@ namespace rocksdb { +// Utility for comparing sstable boundary keys. Returns -1 if either a or b is +// null which provides the property that a==null indicates a key that is less +// than any key and b==null indicates a key that is greater than any key. Note +// that the comparison is performed primarily on the user-key portion of the +// key. If the user-keys compare equal, an additional test is made to sort +// range tombstone sentinel keys before other keys with the same user-key. The +// result is that 2 user-keys will compare equal if they differ purely on +// their sequence number and value, but the range tombstone sentinel for that +// user-key will compare not equal. This is necessary because the range +// tombstone sentinel key is set as the largest key for an sstable even though +// that key never appears in the database. We don't want adjacent sstables to +// be considered overlapping if they are separated by the range tombstone +// sentinel. +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b); + // An AtomicCompactionUnitBoundary represents a range of keys [smallest, // largest] that exactly spans one or more neighbouring SSTs on the same // level. Every pair of SSTs in this range "overlaps" (i.e., the largest // user key of one file is the smallest user key of the next file). These // boundaries are propagated down to RangeDelAggregator during compaction // to provide safe truncation boundaries for range tombstones. struct AtomicCompactionUnitBoundary { - Slice smallest; - Slice largest; + const InternalKey* smallest = nullptr; + const InternalKey* largest = nullptr; }; // The structure that manages compaction input files associated diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 787b0d1b198..adf81d4e755 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -13,7 +13,8 @@ namespace rocksdb { struct TombstoneStartKeyComparator { TombstoneStartKeyComparator(const InternalKeyComparator* c) : cmp(c) {} - bool operator()(const TruncatedRangeTombstone& a, const TruncatedRangeTombstone& b) const { + bool operator()(const TruncatedRangeTombstone& a, + const TruncatedRangeTombstone& b) const { return cmp->Compare(a.start_key_, b.start_key_) < 0; } @@ -23,7 +24,8 @@ struct TombstoneStartKeyComparator { struct ParsedInternalKeyComparator { ParsedInternalKeyComparator(const InternalKeyComparator* c) : cmp(c) {} - bool operator()(const ParsedInternalKey& a, const ParsedInternalKey& b) const { + bool operator()(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { return cmp->Compare(a, b) < 0; } @@ -33,7 +35,8 @@ struct ParsedInternalKeyComparator { // An UncollapsedRangeDelMap is quick to create but slow to answer ShouldDelete // queries.
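A standalone rendering of the sstableKeyCompare tie-break introduced in this patch (BoundaryKey and is_sentinel are invented stand-ins for an InternalKey and its footer check against kRangeTombstoneSentinel):

    #include <cassert>
    #include <string>

    // Illustrative boundary key: a user key plus a flag marking the range
    // tombstone sentinel (kMaxSequenceNumber, kTypeRangeDeletion) footer.
    struct BoundaryKey {
      std::string user_key;
      bool is_sentinel;
    };

    // Mirrors sstableKeyCompare's tie-break: equal user keys compare equal
    // unless exactly one side is the sentinel, which sorts first.
    int boundary_compare(const BoundaryKey& a, const BoundaryKey& b) {
      if (a.user_key != b.user_key) return a.user_key < b.user_key ? -1 : 1;
      if (a.is_sentinel && !b.is_sentinel) return -1;
      if (!a.is_sentinel && b.is_sentinel) return 1;
      return 0;
    }

    int main() {
      // file1's end at "c" was artificially extended by a range tombstone
      // (sentinel); file2 starts at a real point key "c".
      BoundaryKey file1_largest{"c", true};
      BoundaryKey file2_smallest{"c", false};
      // The sentinel sorts first, so the comparison is not 0 and the files
      // are not merged into one atomic compaction unit.
      assert(boundary_compare(file1_largest, file2_smallest) < 0);
      // Ordinary boundary keys with equal user keys do compare equal.
      assert(boundary_compare({"c", false}, {"c", false}) == 0);
      return 0;
    }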
class UncollapsedRangeDelMap : public RangeDelMap { - typedef std::multiset Rep; + typedef std::multiset + Rep; class Iterator : public RangeDelIterator { const Rep& rep_; @@ -45,12 +48,15 @@ class UncollapsedRangeDelMap : public RangeDelMap { void Next() override { iter_++; } void Seek(const Slice&) override { - fprintf(stderr, "UncollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n"); + fprintf(stderr, + "UncollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n"); abort(); } void Seek(const ParsedInternalKey&) override { - fprintf(stderr, "UncollapsedRangeDelMap::Iterator::Seek(ParsedInternalKey&) unimplemented\n"); + fprintf(stderr, + "UncollapsedRangeDelMap::Iterator::Seek(ParsedInternalKey&) " + "unimplemented\n"); abort(); } @@ -75,7 +81,6 @@ class UncollapsedRangeDelMap : public RangeDelMap { if (parsed.sequence < tombstone.seq_ && icmp_->Compare(parsed, tombstone.end_key_) < 0) { return true; - } else { } } return false; @@ -465,6 +470,7 @@ bool RangeDelAggregator::ShouldDeleteImpl(const Slice& internal_key, bool RangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { + assert(IsValueType(parsed.type)); assert(rep_ != nullptr); auto& tombstone_map = GetRangeDelMap(parsed.sequence); if (tombstone_map.IsEmpty()) { @@ -552,9 +558,9 @@ Status RangeDelAggregator::AddTombstones( end_key.user_key = parsed_largest.user_key; if (parsed_largest.sequence != kMaxSequenceNumber) { // The same user key straddles two adjacent sstables. To make sure we - // can truncate to a range that includes the largest range tombstone, - // set the tombstone end key's sequence number to 1 less than the - // largest key. + // can truncate to a range that includes the largest point key in the + // first sstable, set the tombstone end key's sequence number to 1 + // less than the largest key. 
assert(parsed_largest.sequence != 0); end_key.sequence = parsed_largest.sequence - 1; } else { @@ -662,7 +668,6 @@ class MergingRangeDelIter : public RangeDelIterator { RangeTombstone Tombstone() const override { return current_->Tombstone(); } private: - // TODO: evaluate whether this is the best approach; maybe use truncated tombstones instead struct IterComparator { IterComparator(const Comparator* c) : cmp(c) {} diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index ee530fbf274..10a84a9ff9a 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -135,7 +135,7 @@ void VerifyRangeDels( RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, false /* collapse_deletions */); for (const auto& args : all_args) { - AddTombstones(&range_del_agg, args.tombstones); + AddTombstones(&range_del_agg, args.tombstones, args.smallest, args.largest); } for (size_t i = 1; i < expected_points.size(); ++i) { bool overlapped = range_del_agg.IsRangeOverlapped( diff --git a/db/table_cache.cc b/db/table_cache.cc index 1a51360ab90..e9a5ee9ab12 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -184,8 +184,8 @@ InternalIterator* TableCache::NewIterator( RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, bool for_compaction, Arena* arena, bool skip_filters, int level, - const Slice* smallest_compaction_key, - const Slice* largest_compaction_key) { + const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; @@ -270,12 +270,11 @@ InternalIterator* TableCache::NewIterator( if (s.ok()) { const InternalKey* smallest = &file_meta.smallest; const InternalKey* largest = &file_meta.largest; - InternalKey smallest_compaction_ikey, largest_compaction_ikey; if (smallest_compaction_key != nullptr) { - smallest_compaction_ikey.Set(*smallest_compaction_key, kMaxSequenceNumber, kTypeRangeDeletion); + smallest = smallest_compaction_key; } if (largest_compaction_key != nullptr) { - largest_compaction_ikey.Set(*largest_compaction_key, kMaxSequenceNumber, kTypeRangeDeletion); + largest = largest_compaction_key; } s = range_del_agg->AddTombstones(std::move(range_del_iter), smallest, largest); diff --git a/db/table_cache.h b/db/table_cache.h index 73821139214..61e1824013d 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -57,8 +57,8 @@ class TableCache { TableReader** table_reader_ptr = nullptr, HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, Arena* arena = nullptr, bool skip_filters = false, int level = -1, - const Slice* smallest_compaction_key = nullptr, - const Slice* largest_compaction_key = nullptr); + const InternalKey* smallest_compaction_key = nullptr, + const InternalKey* largest_compaction_key = nullptr); // If a seek to internal key "k" in specified file finds an entry, // call (*handle_result)(arg, found_key, found_value) repeatedly until diff --git a/db/version_set.cc b/db/version_set.cc index 1be2bec1f6f..b176c46a33b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -551,11 +551,11 @@ class LevelIterator final : public InternalIterator { sample_file_read_inc(file_meta.file_metadata); } - const Slice* smallest_compaction_key = nullptr; - const Slice* largest_compaction_key = nullptr; + const InternalKey* smallest_compaction_key = nullptr; + const InternalKey* largest_compaction_key = nullptr; if (compaction_boundaries_ != nullptr) { - 
smallest_compaction_key = &(*compaction_boundaries_)[file_index_].smallest; - largest_compaction_key = &(*compaction_boundaries_)[file_index_].largest; + smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; + largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, @@ -2136,60 +2136,6 @@ void VersionStorageInfo::GetCleanInputsWithinInterval( true /* within_interval */); } -namespace { - -const uint64_t kRangeTombstoneSentinel = - PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); - -// Utility for comparing sstable boundary keys. Returns -1 if either a or b is -// null which provides the property that a==null indicates a key that is less -// than any key and b==null indicates a key that is greater than any key. Note -// that the comparison is performed primarily on the user-key portion of the -// key. If the user-keys compare equal, an additional test is made to sort -// range tombstone sentinel keys before other keys with the same user-key. The -// result is that 2 user-keys will compare equal if they differ purely on -// their sequence number and value, but the range tombstone sentinel for that -// user-key will compare not equal. This is necessary because the range -// tombstone sentinel key is set as the largest key for an sstable even though -// that key never appears in the database. We don't want adjacent sstables to -// be considered overlapping if they are separated by the range tombstone -// sentinel. -int sstableKeyCompare(const Comparator* user_cmp, - const InternalKey& a, const InternalKey& b) { - auto c = user_cmp->Compare(a.user_key(), b.user_key()); - if (c != 0) { - return c; - } - auto a_footer = ExtractInternalKeyFooter(a.Encode()); - auto b_footer = ExtractInternalKeyFooter(b.Encode()); - if (a_footer == kRangeTombstoneSentinel) { - if (b_footer != kRangeTombstoneSentinel) { - return -1; - } - } else if (b_footer == kRangeTombstoneSentinel) { - return 1; - } - return 0; -} - -int sstableKeyCompare(const Comparator* user_cmp, - const InternalKey* a, const InternalKey& b) { - if (a == nullptr) { - return -1; - } - return sstableKeyCompare(user_cmp, *a, b); -} - -int sstableKeyCompare(const Comparator* user_cmp, - const InternalKey& a, const InternalKey* b) { - if (b == nullptr) { - return -1; - } - return sstableKeyCompare(user_cmp, a, *b); -} - -} // namespace - // Store in "*inputs" all files in "level" that overlap [begin,end] // Employ binary search to find at least one file that overlaps the // specified range. From that file, iterate backwards and From 7820f5e18f125ab8ee0675f8f633f966d4a11735 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Tue, 9 Oct 2018 13:12:59 -0700 Subject: [PATCH 4/6] Include tombstones that start at next SST's smallest key If SSTs overlap, we need these tombstones to cover keys in the current SST. RangeDelAggregator's truncation will deal with extraneous tombstones added this way by truncating them to be empty. 
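A toy model of the predicate this patch relaxes (include_in_current_file is an invented name; the real code compares Slices with the user comparator): under the old <= test, a tombstone starting exactly at upper_bound was deferred to the next table even though, with overlapping SSTs, upper_bound can live inside the current one:

    #include <cassert>
    #include <string>

    // Should a tombstone starting at `start` go into the current output
    // file, given the file's upper bound? The old test skipped it whenever
    // upper_bound <= start; the new test skips only when upper_bound < start.
    bool include_in_current_file(const std::string& start,
                                 const std::string& upper_bound,
                                 bool old_behavior) {
      return old_behavior ? upper_bound > start : upper_bound >= start;
    }

    int main() {
      // A tombstone starting exactly at the upper bound "m" used to be
      // deferred to the next table; now it is written to this one too.
      assert(!include_in_current_file("m", "m", /*old_behavior=*/true));
      assert(include_in_current_file("m", "m", /*old_behavior=*/false));
      // Tombstones starting strictly later still wait for the next table.
      assert(!include_in_current_file("n", "m", /*old_behavior=*/false));
      return 0;
    }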
---
 db/compaction_job.cc            |  9 +++++----
 db/range_del_aggregator_test.cc | 20 ++++++++++++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/db/compaction_job.cc b/db/compaction_job.cc
index b0a19ead40a..b62c3bf6950 100644
--- a/db/compaction_job.cc
+++ b/db/compaction_job.cc
@@ -1212,10 +1212,11 @@ Status CompactionJob::FinishCompactionOutputFile(
   for (; it->Valid(); it->Next()) {
     auto tombstone = it->Tombstone();
     if (upper_bound != nullptr &&
-        ucmp->Compare(*upper_bound, tombstone.start_key_) <= 0) {
-      // Tombstones starting at upper_bound or later only need to be included
-      // in the next table. Break because subsequent tombstones will start
-      // even later.
+        ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
+      // Tombstones starting after upper_bound only need to be included in the
+      // next table (if the SSTs overlap, then upper_bound is contained in
+      // this SST and hence must be covered). Break because subsequent
+      // tombstones will start even later.
       break;
     }
 
diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc
index 10a84a9ff9a..e089ca9027b 100644
--- a/db/range_del_aggregator_test.cc
+++ b/db/range_del_aggregator_test.cc
@@ -443,6 +443,26 @@ TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapContainsTombstone) {
                   {{"b", "c", 15}, {"c", "d", 15}});  // not collapsed due to boundaries
 }
 
+TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneAbove) {
+  const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest("a", 20, kTypeValue);
+  VerifyRangeDels(
+      {{{{"a", "b", 35}}, &smallest, &largest}},
+      {{"a", 40, true},    // not truncated
+       {"a", 35, false}},  // not truncated
+      {{"a", "a", 35}});  // empty tombstone, which can't occur during a compaction
+}
+
+TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneBelow) {
+  const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion);
+  const InternalKey largest("a", 20, kTypeValue);
+  VerifyRangeDels(
+      {{{{"a", "b", 15}}, &smallest, &largest}},
+      {{"a", 20, true},   // truncated here
+       {"a", 15, true}},  // truncated
+      {{"a", "a", 15}});  // empty tombstone, which can't occur during a compaction
+}
+
 } // namespace rocksdb
 
 int main(int argc, char** argv) {
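The two tests above pin down what truncation does when a file's key space is
a single user key: clamping a tombstone to the file's boundaries can leave an
empty range, which deletes nothing and is harmless. Below is a user-key-only
sketch of that clamping; Clamp and UserKeyRange are illustrative assumptions,
not the RangeDelAggregator API, and the real truncation operates at
internal-key granularity (as in the earlier sketch), so the largest boundary
key itself stays covered.

    // Minimal sketch: clamping a tombstone's user-key range to a file's
    // boundaries may yield an empty [x, x) range.
    #include <algorithm>
    #include <cassert>
    #include <string>

    struct UserKeyRange {
      std::string start;  // inclusive
      std::string end;    // exclusive
      bool Empty() const { return start >= end; }
    };

    UserKeyRange Clamp(UserKeyRange t, const std::string& file_smallest,
                       const std::string& file_largest) {
      t.start = std::max(t.start, file_smallest);
      t.end = std::min(t.end, file_largest);
      return t;
    }

    int main() {
      // File covers only user key "a"; tombstone ["a", "b") reaches past it.
      UserKeyRange clamped = Clamp({"a", "b"}, "a", "a");
      assert(clamped.Empty());  // becomes ["a", "a"): extraneous but harmless
      return 0;
    }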
From 45aa08e2a8ab5820811f23f86fc78033dfa2a6e8 Mon Sep 17 00:00:00 2001
From: Abhishek Madan
Date: Tue, 9 Oct 2018 13:38:37 -0700
Subject: [PATCH 5/6] Fix linter errors

---
 db/range_del_aggregator.cc      | 12 +++++++++---
 db/range_del_aggregator_test.cc |  7 ++++---
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc
index adf81d4e755..194a20ed0c0 100644
--- a/db/range_del_aggregator.cc
+++ b/db/range_del_aggregator.cc
@@ -10,8 +10,11 @@
 
 namespace rocksdb {
 
+namespace {
+
 struct TombstoneStartKeyComparator {
-  TombstoneStartKeyComparator(const InternalKeyComparator* c) : cmp(c) {}
+  explicit TombstoneStartKeyComparator(const InternalKeyComparator* c)
+      : cmp(c) {}
 
   bool operator()(const TruncatedRangeTombstone& a,
                   const TruncatedRangeTombstone& b) const {
@@ -22,7 +25,8 @@ struct TombstoneStartKeyComparator {
 };
 
 struct ParsedInternalKeyComparator {
-  ParsedInternalKeyComparator(const InternalKeyComparator* c) : cmp(c) {}
+  explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
+      : cmp(c) {}
 
   bool operator()(const ParsedInternalKey& a,
                   const ParsedInternalKey& b) const {
@@ -32,6 +36,8 @@ struct ParsedInternalKeyComparator {
   const InternalKeyComparator* cmp;
 };
 
+}  // namespace
+
 // An UncollapsedRangeDelMap is quick to create but slow to answer ShouldDelete
 // queries.
 class UncollapsedRangeDelMap : public RangeDelMap {
@@ -67,7 +73,7 @@ class UncollapsedRangeDelMap : public RangeDelMap {
   const InternalKeyComparator* icmp_;
 
  public:
-  UncollapsedRangeDelMap(const InternalKeyComparator* icmp)
+  explicit UncollapsedRangeDelMap(const InternalKeyComparator* icmp)
       : rep_(TombstoneStartKeyComparator(icmp)), icmp_(icmp) {}
 
   bool ShouldDelete(const ParsedInternalKey& parsed,
diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc
index e089ca9027b..a98d33c43a3 100644
--- a/db/range_del_aggregator_test.cc
+++ b/db/range_del_aggregator_test.cc
@@ -29,8 +29,8 @@ enum Direction {
 
 struct AddTombstonesArgs {
   const std::vector<RangeTombstone> tombstones;
-  const InternalKey* smallest;
-  const InternalKey* largest;
+  const InternalKey* smallest = nullptr;
+  const InternalKey* largest = nullptr;
 };
 
 static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
@@ -84,7 +84,8 @@ void VerifyRangeDels(
     if (dir == kReverse) {
       std::reverse(range_dels.begin(), range_dels.end());
    }
-    all_range_dels.insert(all_range_dels.end(), range_dels.begin(), range_dels.end());
+    all_range_dels.insert(all_range_dels.end(), range_dels.begin(),
+                          range_dels.end());
     AddTombstones(&range_del_agg, range_dels, args.smallest, args.largest);
   }
 
From e8d49e0e7a2e236a1bc315c4c07d770f8ed393d3 Mon Sep 17 00:00:00 2001
From: Abhishek Madan
Date: Tue, 9 Oct 2018 14:15:03 -0700
Subject: [PATCH 6/6] Remove default member initializers (they disallow
 initializer lists)

---
 db/range_del_aggregator_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc
index a98d33c43a3..2cfc6540ea3 100644
--- a/db/range_del_aggregator_test.cc
+++ b/db/range_del_aggregator_test.cc
@@ -29,8 +29,8 @@ enum Direction {
 
 struct AddTombstonesArgs {
   const std::vector<RangeTombstone> tombstones;
-  const InternalKey* smallest = nullptr;
-  const InternalKey* largest = nullptr;
+  const InternalKey* smallest;
+  const InternalKey* largest;
 };
 
 static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
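A closing note on patch 6, since the revert looks cosmetic but is not: under
C++11 (which, presumably, the build still targeted at the time of this
series), a default member initializer makes a struct a non-aggregate, so the
brace-initialized AddTombstonesArgs lists used throughout the tests stop
compiling; C++14 later relaxed this rule. A self-contained sketch of the
distinction, with made-up struct names:

    // Minimal sketch: default member initializers vs. aggregate
    // initialization in C++11. WithDefaults/WithoutDefaults are illustrative.
    #include <string>

    struct WithDefaults {
      std::string name;
      const int* value = nullptr;  // default member initializer
    };

    struct WithoutDefaults {
      std::string name;
      const int* value;  // plain member: still an aggregate in C++11
    };

    int main() {
      int x = 7;
      // WithDefaults bad{"a", &x};  // C++11 error: not an aggregate
      WithoutDefaults ok{"a", &x};   // fine: aggregate initialization
      (void)ok;
      return 0;
    }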