diff --git a/db/compaction.cc b/db/compaction.cc index 4ea92d5cc78..26d1ad3c657 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -23,6 +23,43 @@ namespace rocksdb { +const uint64_t kRangeTombstoneSentinel = + PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b) { + auto c = user_cmp->Compare(a.user_key(), b.user_key()); + if (c != 0) { + return c; + } + auto a_footer = ExtractInternalKeyFooter(a.Encode()); + auto b_footer = ExtractInternalKeyFooter(b.Encode()); + if (a_footer == kRangeTombstoneSentinel) { + if (b_footer != kRangeTombstoneSentinel) { + return -1; + } + } else if (b_footer == kRangeTombstoneSentinel) { + return 1; + } + return 0; +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b) { + if (a == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, *a, b); +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b) { + if (b == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, a, *b); +} + uint64_t TotalFileSize(const std::vector& files) { uint64_t sum = 0; for (size_t i = 0; i < files.size() && files[i]; i++) { @@ -81,6 +118,49 @@ void Compaction::GetBoundaryKeys( } } +std::vector Compaction::PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector inputs) { + const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].level == 0 || inputs[i].files.empty()) { + continue; + } + inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size()); + AtomicCompactionUnitBoundary cur_boundary; + size_t first_atomic_idx = 0; + auto add_unit_boundary = [&](size_t to) { + if (first_atomic_idx == to) return; + for (size_t k = first_atomic_idx; k < to; k++) { + inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary); + } + first_atomic_idx = to; + }; + for (size_t j = 0; j < inputs[i].files.size(); j++) { + const auto* f = inputs[i].files[j]; + if (j == 0) { + // First file in a level. + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) == + 0) { + // SSTs overlap but the end key of the previous file was not + // artificially extended by a range tombstone. Extend the current + // boundary. + cur_boundary.largest = &f->largest; + } else { + // Atomic compaction unit has ended. + add_unit_boundary(j); + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } + } + add_unit_boundary(inputs[i].files.size()); + assert(inputs[i].files.size() == + inputs[i].atomic_compaction_unit_boundaries.size()); + } + return inputs; +} + // helper function to determine if compaction is creating files at the // bottommost level bool Compaction::IsBottommostLevel( @@ -155,7 +235,7 @@ Compaction::Compaction(VersionStorageInfo* vstorage, output_compression_(_compression), output_compression_opts_(_compression_opts), deletion_compaction_(_deletion_compaction), - inputs_(std::move(_inputs)), + inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), grandparents_(std::move(_grandparents)), score_(_score), bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), diff --git a/db/compaction.h b/db/compaction.h index f1d78771696..2cf737b676a 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -15,11 +15,43 @@ namespace rocksdb { +// Utility for comparing sstable boundary keys. Returns -1 if either a or b is +// null which provides the property that a==null indicates a key that is less +// than any key and b==null indicates a key that is greater than any key. Note +// that the comparison is performed primarily on the user-key portion of the +// key. If the user-keys compare equal, an additional test is made to sort +// range tombstone sentinel keys before other keys with the same user-key. The +// result is that 2 user-keys will compare equal if they differ purely on +// their sequence number and value, but the range tombstone sentinel for that +// user-key will compare not equal. This is necessary because the range +// tombstone sentinel key is set as the largest key for an sstable even though +// that key never appears in the database. We don't want adjacent sstables to +// be considered overlapping if they are separated by the range tombstone +// sentinel. +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b); + +// An AtomicCompactionUnitBoundary represents a range of keys [smallest, +// largest] that exactly spans one ore more neighbouring SSTs on the same +// level. Every pair of SSTs in this range "overlap" (i.e., the largest +// user key of one file is the smallest user key of the next file). These +// boundaries are propagated down to RangeDelAggregator during compaction +// to provide safe truncation boundaries for range tombstones. +struct AtomicCompactionUnitBoundary { + const InternalKey* smallest = nullptr; + const InternalKey* largest = nullptr; +}; + // The structure that manages compaction input files associated // with the same physical level. struct CompactionInputFiles { int level; std::vector files; + std::vector atomic_compaction_unit_boundaries; inline bool empty() const { return files.empty(); } inline size_t size() const { return files.size(); } inline void clear() { files.clear(); } @@ -96,6 +128,12 @@ class Compaction { return inputs_[compaction_input_level][i]; } + const std::vector* boundaries( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries; + } + // Returns the list of file meta data of the specified compaction // input level. // REQUIREMENT: "compaction_input_level" must be >= 0 and @@ -262,6 +300,13 @@ class Compaction { const std::vector& inputs, Slice* smallest_key, Slice* largest_key); + // Get the atomic file boundaries for all files in the compaction. Necessary + // in order to avoid the scenario described in + // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and plumb + // down appropriate key boundaries to RangeDelAggregator during compaction. + static std::vector PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector inputs); + // helper function to determine if compaction with inputs and storage is // bottommost static bool IsBottommostLevel( diff --git a/db/compaction_job.cc b/db/compaction_job.cc index b0a19ead40a..b62c3bf6950 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1212,10 +1212,11 @@ Status CompactionJob::FinishCompactionOutputFile( for (; it->Valid(); it->Next()) { auto tombstone = it->Tombstone(); if (upper_bound != nullptr && - ucmp->Compare(*upper_bound, tombstone.start_key_) <= 0) { - // Tombstones starting at upper_bound or later only need to be included - // in the next table. Break because subsequent tombstones will start - // even later. + ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) { + // Tombstones starting after upper_bound only need to be included in the + // next table (if the SSTs overlap, then upper_bound is contained in + // this SST and hence must be covered). Break because subsequent + // tombstones will start even later. break; } diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index d1885603c6c..194a20ed0c0 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -10,20 +10,39 @@ namespace rocksdb { +namespace { + struct TombstoneStartKeyComparator { - TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {} + explicit TombstoneStartKeyComparator(const InternalKeyComparator* c) + : cmp(c) {} - bool operator()(const RangeTombstone& a, const RangeTombstone& b) const { + bool operator()(const TruncatedRangeTombstone& a, + const TruncatedRangeTombstone& b) const { return cmp->Compare(a.start_key_, b.start_key_) < 0; } - const Comparator* cmp; + const InternalKeyComparator* cmp; +}; + +struct ParsedInternalKeyComparator { + explicit ParsedInternalKeyComparator(const InternalKeyComparator* c) + : cmp(c) {} + + bool operator()(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + return cmp->Compare(a, b) < 0; + } + + const InternalKeyComparator* cmp; }; +} // namespace + // An UncollapsedRangeDelMap is quick to create but slow to answer ShouldDelete // queries. class UncollapsedRangeDelMap : public RangeDelMap { - typedef std::multiset Rep; + typedef std::multiset + Rep; class Iterator : public RangeDelIterator { const Rep& rep_; @@ -35,48 +54,57 @@ class UncollapsedRangeDelMap : public RangeDelMap { void Next() override { iter_++; } void Seek(const Slice&) override { - fprintf(stderr, "UncollapsedRangeDelMap::Iterator::Seek unimplemented\n"); + fprintf(stderr, + "UncollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n"); + abort(); + } + + void Seek(const ParsedInternalKey&) override { + fprintf(stderr, + "UncollapsedRangeDelMap::Iterator::Seek(ParsedInternalKey&) " + "unimplemented\n"); abort(); } - RangeTombstone Tombstone() const override { return *iter_; } + RangeTombstone Tombstone() const override { return iter_->Tombstone(); } }; Rep rep_; - const Comparator* ucmp_; + const InternalKeyComparator* icmp_; public: - UncollapsedRangeDelMap(const Comparator* ucmp) - : rep_(TombstoneStartKeyComparator(ucmp)), ucmp_(ucmp) {} + explicit UncollapsedRangeDelMap(const InternalKeyComparator* icmp) + : rep_(TombstoneStartKeyComparator(icmp)), icmp_(icmp) {} bool ShouldDelete(const ParsedInternalKey& parsed, RangeDelPositioningMode mode) override { (void)mode; assert(mode == RangeDelPositioningMode::kFullScan); for (const auto& tombstone : rep_) { - if (ucmp_->Compare(parsed.user_key, tombstone.start_key_) < 0) { + if (icmp_->Compare(parsed, tombstone.start_key_) < 0) { break; } if (parsed.sequence < tombstone.seq_ && - ucmp_->Compare(parsed.user_key, tombstone.end_key_) < 0) { + icmp_->Compare(parsed, tombstone.end_key_) < 0) { return true; } } return false; } - bool IsRangeOverlapped(const Slice& start, const Slice& end) override { + bool IsRangeOverlapped(const ParsedInternalKey& start, + const ParsedInternalKey& end) override { for (const auto& tombstone : rep_) { - if (ucmp_->Compare(start, tombstone.end_key_) < 0 && - ucmp_->Compare(tombstone.start_key_, end) <= 0 && - ucmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) { + if (icmp_->Compare(start, tombstone.end_key_) < 0 && + icmp_->Compare(tombstone.start_key_, end) <= 0 && + icmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) { return true; } } return false; } - void AddTombstone(RangeTombstone tombstone) override { + void AddTombstone(TruncatedRangeTombstone tombstone) override { rep_.emplace(tombstone); } @@ -126,7 +154,9 @@ class UncollapsedRangeDelMap : public RangeDelMap { // compared against the map entry g → 3 and determined to be uncovered. By // contrast, the key h @ 2 would be determined to be covered. class CollapsedRangeDelMap : public RangeDelMap { - typedef std::map Rep; + typedef std::map + Rep; class Iterator : public RangeDelIterator { void MaybeSeekPastSentinel() { @@ -148,7 +178,12 @@ class CollapsedRangeDelMap : public RangeDelMap { MaybeSeekPastSentinel(); } - void Seek(const Slice& target) override { + void Seek(const Slice&) override { + fprintf(stderr, "CollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n"); + abort(); + } + + void Seek(const ParsedInternalKey& target) override { iter_ = rep_.upper_bound(target); if (iter_ != rep_.begin()) { iter_--; @@ -161,8 +196,8 @@ class CollapsedRangeDelMap : public RangeDelMap { assert(std::next(iter_) != rep_.end()); assert(iter_->second != 0); RangeTombstone tombstone; - tombstone.start_key_ = iter_->first; - tombstone.end_key_ = std::next(iter_)->first; + tombstone.start_key_ = iter_->first.user_key; + tombstone.end_key_ = std::next(iter_)->first.user_key; tombstone.seq_ = iter_->second; return tombstone; } @@ -170,12 +205,12 @@ class CollapsedRangeDelMap : public RangeDelMap { Rep rep_; Rep::iterator iter_; - const Comparator* ucmp_; + const InternalKeyComparator* icmp_; public: - explicit CollapsedRangeDelMap(const Comparator* ucmp) - : rep_(stl_wrappers::LessOfComparator(ucmp)), - ucmp_(ucmp) { + explicit CollapsedRangeDelMap(const InternalKeyComparator* icmp) + : rep_(ParsedInternalKeyComparator(icmp)), + icmp_(icmp) { InvalidatePosition(); } @@ -194,29 +229,29 @@ class CollapsedRangeDelMap : public RangeDelMap { case RangeDelPositioningMode::kForwardTraversal: assert(iter_ != rep_.end()); if (iter_ == rep_.begin() && - ucmp_->Compare(parsed.user_key, iter_->first) < 0) { + icmp_->Compare(parsed, iter_->first) < 0) { // before start of deletion intervals return false; } while (std::next(iter_) != rep_.end() && - ucmp_->Compare(std::next(iter_)->first, parsed.user_key) <= 0) { + icmp_->Compare(std::next(iter_)->first, parsed) <= 0) { ++iter_; } break; case RangeDelPositioningMode::kBackwardTraversal: assert(iter_ != rep_.end()); while (iter_ != rep_.begin() && - ucmp_->Compare(parsed.user_key, iter_->first) < 0) { + icmp_->Compare(parsed, iter_->first) < 0) { --iter_; } if (iter_ == rep_.begin() && - ucmp_->Compare(parsed.user_key, iter_->first) < 0) { + icmp_->Compare(parsed, iter_->first) < 0) { // before start of deletion intervals return false; } break; case RangeDelPositioningMode::kBinarySearch: - iter_ = rep_.upper_bound(parsed.user_key); + iter_ = rep_.upper_bound(parsed); if (iter_ == rep_.begin()) { // before start of deletion intervals return false; @@ -225,21 +260,22 @@ class CollapsedRangeDelMap : public RangeDelMap { break; } assert(iter_ != rep_.end() && - ucmp_->Compare(iter_->first, parsed.user_key) <= 0); + icmp_->Compare(iter_->first, parsed) <= 0); assert(std::next(iter_) == rep_.end() || - ucmp_->Compare(parsed.user_key, std::next(iter_)->first) < 0); + icmp_->Compare(parsed, std::next(iter_)->first) < 0); return parsed.sequence < iter_->second; } - bool IsRangeOverlapped(const Slice&, const Slice&) override { + bool IsRangeOverlapped(const ParsedInternalKey&, + const ParsedInternalKey&) override { // Unimplemented because the only client of this method, file ingestion, // uses uncollapsed maps. fprintf(stderr, "CollapsedRangeDelMap::IsRangeOverlapped unimplemented"); abort(); } - void AddTombstone(RangeTombstone t) override { - if (ucmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) { + void AddTombstone(TruncatedRangeTombstone t) override { + if (icmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) { // The tombstone covers no keys. Nothing to do. return; } @@ -272,7 +308,8 @@ class CollapsedRangeDelMap : public RangeDelMap { end_seq = prev_seq(); Rep::iterator pit; if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() && - ucmp_->Compare(pit->first, t.start_key_) == 0 && std::prev(pit)->second == t.seq_) { + icmp_->Compare(pit->first, t.start_key_) == 0 && + std::prev(pit)->second == t.seq_) { // The new tombstone starts at the end of an existing tombstone with an // identical seqno: // @@ -297,7 +334,7 @@ class CollapsedRangeDelMap : public RangeDelMap { } // Look at all the existing transitions that overlap the new tombstone. - while (it != rep_.end() && ucmp_->Compare(it->first, t.end_key_) < 0) { + while (it != rep_.end() && icmp_->Compare(it->first, t.end_key_) < 0) { if (t.seq_ >= it->second) { // The transition is to an existing tombstone that the new tombstone // covers. Save the covered tombstone's seqno. We'll need to return to @@ -343,12 +380,14 @@ class CollapsedRangeDelMap : public RangeDelMap { if (t.seq_ == prev_seq()) { // The new tombstone is unterminated in the map. - if (it != rep_.end() && t.seq_ == it->second && ucmp_->Compare(it->first, t.end_key_) == 0) { + if (it != rep_.end() && t.seq_ == it->second && + icmp_->Compare(it->first, t.end_key_) == 0) { // The new tombstone ends at the start of another tombstone with an // identical seqno. Merge the tombstones by removing the existing // tombstone's start key. rep_.erase(it); - } else if (end_seq == prev_seq() || (it != rep_.end() && end_seq == it->second)) { + } else if (end_seq == prev_seq() || + (it != rep_.end() && end_seq == it->second)) { // The new tombstone is implicitly ended because its end point is // contained within an existing tombstone with the same seqno: // @@ -363,7 +402,8 @@ class CollapsedRangeDelMap : public RangeDelMap { // Install one that returns to the last seqno we covered. Because end // keys are exclusive, if there's an existing transition at t.end_key_, // it takes precedence over the transition that we install here. - rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry + rep_.emplace(t.end_key_, + end_seq); // emplace is a noop if existing entry } } else { // The new tombstone is implicitly ended because its end point is covered @@ -416,9 +456,9 @@ void RangeDelAggregator::InitRep(const std::vector& snapshots) { std::unique_ptr RangeDelAggregator::NewRangeDelMap() { RangeDelMap* tombstone_map; if (collapse_deletions_) { - tombstone_map = new CollapsedRangeDelMap(icmp_.user_comparator()); + tombstone_map = new CollapsedRangeDelMap(&icmp_); } else { - tombstone_map = new UncollapsedRangeDelMap(icmp_.user_comparator()); + tombstone_map = new UncollapsedRangeDelMap(&icmp_); } return std::unique_ptr(tombstone_map); } @@ -429,8 +469,9 @@ bool RangeDelAggregator::ShouldDeleteImpl(const Slice& internal_key, ParsedInternalKey parsed; if (!ParseInternalKey(internal_key, &parsed)) { assert(false); + return false; } - return ShouldDelete(parsed, mode); + return ShouldDeleteImpl(parsed, mode); } bool RangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed, @@ -452,8 +493,10 @@ bool RangeDelAggregator::IsRangeOverlapped(const Slice& start, if (rep_ == nullptr) { return false; } + ParsedInternalKey start_ikey(start, kMaxSequenceNumber, kMaxValue); + ParsedInternalKey end_ikey(end, 0, static_cast(0)); for (const auto& stripe : rep_->stripe_map_) { - if (stripe.second->IsRangeOverlapped(start, end)) { + if (stripe.second->IsRangeOverlapped(start_ikey, end_ikey)) { return true; } } @@ -492,40 +535,49 @@ Status RangeDelAggregator::AddTombstones( if (!parsed) { return Status::Corruption("Unable to parse range tombstone InternalKey"); } - RangeTombstone tombstone; + Slice end_user_key; if (input->IsValuePinned()) { - tombstone = RangeTombstone(parsed_key, input->value()); + end_user_key = input->value(); } else { // The tombstone map holds slices into the iterator's memory. Make a // copy of the value if it is not pinned. rep_->pinned_slices_.emplace_back(input->value().data(), input->value().size()); - tombstone = RangeTombstone(parsed_key, rep_->pinned_slices_.back()); + end_user_key = rep_->pinned_slices_.back(); } + ParsedInternalKey start_key(parsed_key.user_key, kMaxSequenceNumber, + kMaxValue); + ParsedInternalKey end_key(end_user_key, kMaxSequenceNumber, kMaxValue); // Truncate the tombstone to the range [smallest, largest]. if (smallest != nullptr) { - if (icmp_.user_comparator()->Compare( - tombstone.start_key_, smallest->user_key()) < 0) { - tombstone.start_key_ = smallest->user_key(); + ParsedInternalKey parsed_smallest; + if (ParseInternalKey(smallest->Encode(), &parsed_smallest) && + icmp_.Compare(start_key, parsed_smallest) < 0) { + start_key.user_key = parsed_smallest.user_key; + start_key.sequence = parsed_smallest.sequence; } } if (largest != nullptr) { - // To safely truncate the range tombstone's end key, it must extend past - // the largest key in the sstable (which may have been extended to the - // smallest key in the next sstable), and largest must be a tombstone - // sentinel key. A range tombstone may straddle two sstables and not be - // the tombstone sentinel key in the first sstable if a user-key also - // straddles the sstables (possible if there is a snapshot between the - // two versions of the user-key), in which case we cannot truncate the - // range tombstone. - if (icmp_.user_comparator()->Compare(tombstone.end_key_, - largest->user_key()) > 0 && - GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber) { - tombstone.end_key_ = largest->user_key(); + ParsedInternalKey parsed_largest; + if (ParseInternalKey(largest->Encode(), &parsed_largest) && + icmp_.Compare(end_key, parsed_largest) > 0) { + end_key.user_key = parsed_largest.user_key; + if (parsed_largest.sequence != kMaxSequenceNumber) { + // The same user key straddles two adjacent sstables. To make sure we + // can truncate to a range that includes the largest point key in the + // first sstable, set the tombstone end key's sequence number to 1 + // less than the largest key. + assert(parsed_largest.sequence != 0); + end_key.sequence = parsed_largest.sequence - 1; + } else { + // The SST file boundary was artificially extended by a range tombstone. + // We will not see any entries in this SST with this user key, so we + // can leave the seqnum at kMaxSequenceNumber. + } } } - auto seq = tombstone.seq_; - GetRangeDelMap(seq).AddTombstone(std::move(tombstone)); + TruncatedRangeTombstone tombstone(start_key, end_key, parsed_key.sequence); + GetRangeDelMap(parsed_key.sequence).AddTombstone(std::move(tombstone)); input->Next(); } if (!first_iter) { @@ -604,6 +656,11 @@ class MergingRangeDelIter : public RangeDelIterator { } void Seek(const Slice& target) override { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kMaxValue); + Seek(ikey); + } + + void Seek(const ParsedInternalKey& target) override { heap_.clear(); for (auto& iter : iters_) { iter->Seek(target); diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index 66088b3d4eb..459de866db2 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -40,6 +40,35 @@ enum class RangeDelPositioningMode { kBinarySearch, }; +// TruncatedRangeTombstones are a slight generalization of regular +// RangeTombstones that can represent truncations caused by SST boundaries. +// Instead of using user keys to represent the start and end keys, they instead +// use internal keys, whose sequence number indicates the sequence number of +// the smallest/largest SST key (in the case where a tombstone is untruncated, +// the sequence numbers will be kMaxSequenceNumber for both start and end +// keys). Like RangeTombstones, TruncatedRangeTombstone are also +// end-key-exclusive. +struct TruncatedRangeTombstone { + TruncatedRangeTombstone(const ParsedInternalKey& sk, + const ParsedInternalKey& ek, SequenceNumber s) + : start_key_(sk), end_key_(ek), seq_(s) {} + + RangeTombstone Tombstone() const { + // The RangeTombstone returned here can cover less than the + // TruncatedRangeTombstone when its end key has a seqnum that is not + // kMaxSequenceNumber. Since this method is only used by RangeDelIterators + // (which in turn are only used during flush/compaction), we avoid this + // problem by using truncation boundaries spanning multiple SSTs, which + // are selected in a way that guarantee a clean break at the end key. + assert(end_key_.sequence == kMaxSequenceNumber); + return RangeTombstone(start_key_.user_key, end_key_.user_key, seq_); + } + + ParsedInternalKey start_key_; + ParsedInternalKey end_key_; + SequenceNumber seq_; +}; + // A RangeDelIterator iterates over range deletion tombstones. class RangeDelIterator { public: @@ -47,7 +76,9 @@ class RangeDelIterator { virtual bool Valid() const = 0; virtual void Next() = 0; + // NOTE: the Slice passed to this method must be a user key. virtual void Seek(const Slice& target) = 0; + virtual void Seek(const ParsedInternalKey& target) = 0; virtual RangeTombstone Tombstone() const = 0; }; @@ -62,13 +93,14 @@ class RangeDelMap { virtual bool ShouldDelete(const ParsedInternalKey& parsed, RangeDelPositioningMode mode) = 0; - virtual bool IsRangeOverlapped(const Slice& start, const Slice& end) = 0; + virtual bool IsRangeOverlapped(const ParsedInternalKey& start, + const ParsedInternalKey& end) = 0; virtual void InvalidatePosition() = 0; virtual size_t Size() const = 0; bool IsEmpty() const { return Size() == 0; } - virtual void AddTombstone(RangeTombstone tombstone) = 0; + virtual void AddTombstone(TruncatedRangeTombstone tombstone) = 0; virtual std::unique_ptr NewIterator() = 0; }; diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index a5746df15f8..2cfc6540ea3 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -27,6 +27,12 @@ enum Direction { kReverse, }; +struct AddTombstonesArgs { + const std::vector tombstones; + const InternalKey* smallest; + const InternalKey* largest; +}; + static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator()); void AddTombstones(RangeDelAggregator* range_del_agg, @@ -54,8 +60,7 @@ void VerifyRangeDelIter( RangeDelIterator* range_del_iter, const std::vector& expected_range_dels) { size_t i = 0; - for (; range_del_iter->Valid() && i < expected_range_dels.size(); - range_del_iter->Next(), i++) { + for (; range_del_iter->Valid(); range_del_iter->Next(), i++) { VerifyTombstonesEq(expected_range_dels[i], range_del_iter->Tombstone()); } ASSERT_EQ(expected_range_dels.size(), i); @@ -63,22 +68,26 @@ void VerifyRangeDelIter( } void VerifyRangeDels( - const std::vector& range_dels_in, + const std::vector& all_args, const std::vector& expected_points, const std::vector& expected_collapsed_range_dels, - const InternalKey* smallest = nullptr, const InternalKey* largest = nullptr, const InternalKeyComparator& icmp = bytewise_icmp) { // Test same result regardless of which order the range deletions are added // and regardless of collapsed mode. for (bool collapsed : {false, true}) { for (Direction dir : {kForward, kReverse}) { RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, collapsed); + std::vector all_range_dels; - std::vector range_dels = range_dels_in; - if (dir == kReverse) { - std::reverse(range_dels.begin(), range_dels.end()); + for (const auto& args : all_args) { + std::vector range_dels = args.tombstones; + if (dir == kReverse) { + std::reverse(range_dels.begin(), range_dels.end()); + } + all_range_dels.insert(all_range_dels.end(), range_dels.begin(), + range_dels.end()); + AddTombstones(&range_del_agg, range_dels, args.smallest, args.largest); } - AddTombstones(&range_del_agg, range_dels, smallest, largest); auto mode = RangeDelPositioningMode::kFullScan; if (collapsed) { @@ -90,38 +99,45 @@ void VerifyRangeDels( parsed_key.user_key = expected_point.begin; parsed_key.sequence = expected_point.seq; parsed_key.type = kTypeValue; - ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode)); + std::string ikey; + AppendInternalKey(&ikey, parsed_key); + ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode)); if (parsed_key.sequence > 0) { --parsed_key.sequence; + ikey.clear(); + AppendInternalKey(&ikey, parsed_key); if (expected_point.expectAlive) { - ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode)); + ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode)); } else { - ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key, mode)); + ASSERT_TRUE(range_del_agg.ShouldDelete(ikey, mode)); } } } if (collapsed) { - range_dels = expected_collapsed_range_dels; - VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels); - } else if (smallest == nullptr && largest == nullptr) { + all_range_dels = expected_collapsed_range_dels; + VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels); + } else if (all_args.size() == 1 && all_args[0].smallest == nullptr && + all_args[0].largest == nullptr) { // Tombstones in an uncollapsed map are presented in start key // order. Tombstones with the same start key are presented in // insertion order. We don't handle tombstone truncation here, so the // verification is only performed if no truncation was requested. - std::stable_sort(range_dels.begin(), range_dels.end(), + std::stable_sort(all_range_dels.begin(), all_range_dels.end(), [&](const RangeTombstone& a, const RangeTombstone& b) { return icmp.user_comparator()->Compare( a.start_key_, b.start_key_) < 0; }); - VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels); + VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels); } } } RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, false /* collapse_deletions */); - AddTombstones(&range_del_agg, range_dels_in); + for (const auto& args : all_args) { + AddTombstones(&range_del_agg, args.tombstones, args.smallest, args.largest); + } for (size_t i = 1; i < expected_points.size(); ++i) { bool overlapped = range_del_agg.IsRangeOverlapped( expected_points[i - 1].begin, expected_points[i].begin); @@ -138,65 +154,64 @@ void VerifyRangeDels( TEST_F(RangeDelAggregatorTest, Empty) { VerifyRangeDels({}, {{"a", 0}}, {}); } TEST_F(RangeDelAggregatorTest, SameStartAndEnd) { - VerifyRangeDels({{"a", "a", 5}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {}); + VerifyRangeDels({{{{"a", "a", 5}}}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {}); } TEST_F(RangeDelAggregatorTest, Single) { - VerifyRangeDels({{"a", "b", 10}}, {{" ", 0}, {"a", 10}, {"b", 0}}, + VerifyRangeDels({{{{"a", "b", 10}}}}, {{" ", 0}, {"a", 10}, {"b", 0}}, {{"a", "b", 10}}); } TEST_F(RangeDelAggregatorTest, OverlapAboveLeft) { - VerifyRangeDels({{"a", "c", 10}, {"b", "d", 5}}, + VerifyRangeDels({{{{"a", "c", 10}, {"b", "d", 5}}}}, {{" ", 0}, {"a", 10}, {"c", 5}, {"d", 0}}, {{"a", "c", 10}, {"c", "d", 5}}); } TEST_F(RangeDelAggregatorTest, OverlapAboveRight) { - VerifyRangeDels({{"a", "c", 5}, {"b", "d", 10}}, + VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 10}}}}, {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}}, {{"a", "b", 5}, {"b", "d", 10}}); } TEST_F(RangeDelAggregatorTest, OverlapAboveMiddle) { - VerifyRangeDels({{"a", "d", 5}, {"b", "c", 10}}, + VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 10}}}}, {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 5}, {"d", 0}}, {{"a", "b", 5}, {"b", "c", 10}, {"c", "d", 5}}); } TEST_F(RangeDelAggregatorTest, OverlapAboveMiddleReverse) { - VerifyRangeDels({{"d", "a", 5}, {"c", "b", 10}}, + VerifyRangeDels({{{{"d", "a", 5}, {"c", "b", 10}}}}, {{"z", 0}, {"d", 5}, {"c", 10}, {"b", 5}, {"a", 0}}, {{"d", "c", 5}, {"c", "b", 10}, {"b", "a", 5}}, - nullptr /* smallest */, nullptr /* largest */, InternalKeyComparator(ReverseBytewiseComparator())); } TEST_F(RangeDelAggregatorTest, OverlapFully) { - VerifyRangeDels({{"a", "d", 10}, {"b", "c", 5}}, + VerifyRangeDels({{{{"a", "d", 10}, {"b", "c", 5}}}}, {{" ", 0}, {"a", 10}, {"d", 0}}, {{"a", "d", 10}}); } TEST_F(RangeDelAggregatorTest, OverlapPoint) { - VerifyRangeDels({{"a", "b", 5}, {"b", "c", 10}}, + VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 10}}}}, {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 0}}, {{"a", "b", 5}, {"b", "c", 10}}); } TEST_F(RangeDelAggregatorTest, SameStartKey) { - VerifyRangeDels({{"a", "c", 5}, {"a", "b", 10}}, + VerifyRangeDels({{{{"a", "c", 5}, {"a", "b", 10}}}}, {{" ", 0}, {"a", 10}, {"b", 5}, {"c", 0}}, {{"a", "b", 10}, {"b", "c", 5}}); } TEST_F(RangeDelAggregatorTest, SameEndKey) { - VerifyRangeDels({{"a", "d", 5}, {"b", "d", 10}}, + VerifyRangeDels({{{{"a", "d", 5}, {"b", "d", 10}}}}, {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}}, {{"a", "b", 5}, {"b", "d", 10}}); } TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { - VerifyRangeDels({{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}, + VerifyRangeDels({{{{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}}}, {{" ", 0}, {"a", 5}, {"b", 0}, @@ -209,25 +224,25 @@ TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { } TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) { - VerifyRangeDels({{"a", "b", 5}, {"a", "b", 5}}, + VerifyRangeDels({{{{"a", "b", 5}, {"a", "b", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 0}}, {{"a", "b", 5}}); } TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) { - VerifyRangeDels({{"a", "b", 5}, {"b", "c", 5}}, + VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}}, {{"a", "c", 5}}); } TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) { - VerifyRangeDels({{"a", "c", 5}, {"b", "d", 5}}, + VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, {{"a", "d", 5}}); } TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) { - VerifyRangeDels({{"a", "d", 5}, {"b", "c", 5}}, + VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, {{"a", "d", 5}}); } @@ -236,27 +251,27 @@ TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) { // larger one when VerifyRangeDels() runs them in reverse TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) { VerifyRangeDels( - {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}}, + {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}}}}, {{" ", 0}, {"a", 20}, {"f", 15}, {"g", 0}}, {{"a", "f", 20}, {"f", "g", 15}}); } TEST_F(RangeDelAggregatorTest, CoverMultipleFromRight) { VerifyRangeDels( - {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}}, + {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}}}}, {{" ", 0}, {"b", 5}, {"c", 20}, {"h", 0}}, {{"b", "c", 5}, {"c", "h", 20}}); } TEST_F(RangeDelAggregatorTest, CoverMultipleFully) { VerifyRangeDels( - {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}}, + {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}}}}, {{" ", 0}, {"a", 20}, {"h", 0}}, {{"a", "h", 20}}); } TEST_F(RangeDelAggregatorTest, AlternateMultipleAboveBelow) { VerifyRangeDels( - {{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}}, + {{{{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}}}}, {{" ", 0}, {"a", 5}, {"b", 15}, {"d", 10}, {"e", 20}, {"g", 5}, {"h", 0}}, {{"a", "b", 5}, {"b", "d", 15}, @@ -321,31 +336,132 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) { } TEST_F(RangeDelAggregatorTest, TruncateTombstones) { - const InternalKey smallest("b", 1, kTypeRangeDeletion); + const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion); VerifyRangeDels( - {{"a", "c", 10}, {"d", "f", 10}}, + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, {{"a", 10, true}, // truncated {"b", 10, false}, // not truncated {"d", 10, false}, // not truncated {"e", 10, true}}, // truncated - {{"b", "c", 10}, {"d", "e", 10}}, - &smallest, &largest); + {{"b", "c", 10}, {"d", "e", 10}}); } -TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateTombstones) { - const InternalKey smallest("b", 1, kTypeRangeDeletion); +TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateBelowTombstone) { + const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); const InternalKey largest( "e", 3, // could happen if "e" is in consecutive sstables kTypeValue); VerifyRangeDels( - {{"a", "c", 10}, {"d", "f", 10}}, + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, + {{"a", 10, true}, // truncated + {"b", 10, false}, // not truncated + {"d", 10, false}, // not truncated + {"e", 10, false}, // not truncated + {"e", 2, true}}, // truncated here + {{"b", "c", 10}, {"d", "e", 10}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateAboveTombstone) { + const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest( + "e", 15, // could happen if "e" is in consecutive sstables + kTypeValue); + VerifyRangeDels( + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, + {{"a", 10, true}, // truncated + {"b", 10, false}, // not truncated + {"d", 10, false}, // not truncated + {"e", kMaxSequenceNumber, true}}, // truncated + {{"b", "c", 10}, {"d", "e", 10}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateBelowTombstone) { + const InternalKey smallest("b", 5, kTypeValue); + const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, + {{"a", 10, true}, // truncated + {"b", 10, true}, // truncated + {"b", 6, false}, // not truncated; start boundary moved + {"d", 10, false}, // not truncated + {"e", kMaxSequenceNumber, true}}, // truncated + {{"b", "c", 10}, {"d", "e", 10}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateAboveTombstone) { + const InternalKey smallest("b", 15, kTypeValue); + const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, {{"a", 10, true}, // truncated + {"b", 15, true}, // truncated {"b", 10, false}, // not truncated {"d", 10, false}, // not truncated - {"e", 10, false}}, // not truncated - {{"b", "c", 10}, {"d", "f", 10}}, - &smallest, &largest); + {"e", kMaxSequenceNumber, true}}, // truncated + {{"b", "c", 10}, {"d", "e", 10}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapAboveTombstone) { + const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest1("c", 20, kTypeValue); + const InternalKey smallest2("c", 10, kTypeValue); + const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"b", "d", 5}}, &smallest1, &largest1}, + {{{"b", "d", 5}}, &smallest2, &largest2}}, + {{"b", 5, false}, // not truncated + {"c", 5, false}}, // not truncated + {{"b", "c", 5}, {"c", "d", 5}}); // not collapsed due to boundaries +} + +TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapBelowTombstone) { + const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest1("c", 20, kTypeValue); + const InternalKey smallest2("c", 10, kTypeValue); + const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"b", "d", 30}}, &smallest1, &largest1}, + {{{"b", "d", 30}}, &smallest2, &largest2}}, + {{"b", 30, false}, // not truncated + {"c", 30, false}, // not truncated + {"c", 19, true}, // truncated here (keys in this range should not exist) + {"c", 11, false}}, // not truncated again + {{"b", "c", 30}, {"c", "d", 30}}); // not collapsed due to boundaries +} + +TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapContainsTombstone) { + const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest1("c", 20, kTypeValue); + const InternalKey smallest2("c", 10, kTypeValue); + const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); + VerifyRangeDels( + {{{{"b", "d", 15}}, &smallest1, &largest1}, + {{{"b", "d", 15}}, &smallest2, &largest2}}, + {{"b", 15, false}, // not truncated + {"c", 15, true}, // truncated (keys in this range should not exist) + {"c", 11, false}}, // not truncated here + {{"b", "c", 15}, {"c", "d", 15}}); // not collapsed due to boundaries +} + +TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneAbove) { + const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest("a", 20, kTypeValue); + VerifyRangeDels( + {{{{"a", "b", 35}}, &smallest, &largest}}, + {{"a", 40, true}, // not truncated + {"a", 35, false}}, // not truncated + {{"a", "a", 35}}); // empty tombstone but can't occur during a compaction +} + +TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneBelow) { + const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion); + const InternalKey largest("a", 20, kTypeValue); + VerifyRangeDels( + {{{{"a", "b", 15}}, &smallest, &largest}}, + {{"a", 20, true}, // truncated here + {"a", 15, true}}, // truncated + {{"a", "a", 15}}); // empty tombstone but can't occur during a compaction } } // namespace rocksdb diff --git a/db/table_cache.cc b/db/table_cache.cc index f374a68766d..e9a5ee9ab12 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -183,7 +183,9 @@ InternalIterator* TableCache::NewIterator( const InternalKeyComparator& icomparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, - bool for_compaction, Arena* arena, bool skip_filters, int level) { + bool for_compaction, Arena* arena, bool skip_filters, int level, + const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; @@ -266,10 +268,16 @@ InternalIterator* TableCache::NewIterator( s = range_del_iter->status(); } if (s.ok()) { - s = range_del_agg->AddTombstones( - std::move(range_del_iter), - &file_meta.smallest, - &file_meta.largest); + const InternalKey* smallest = &file_meta.smallest; + const InternalKey* largest = &file_meta.largest; + if (smallest_compaction_key != nullptr) { + smallest = smallest_compaction_key; + } + if (largest_compaction_key != nullptr) { + largest = largest_compaction_key; + } + s = range_del_agg->AddTombstones(std::move(range_del_iter), smallest, + largest); } } } diff --git a/db/table_cache.h b/db/table_cache.h index 7e7f53cc1ff..61e1824013d 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -56,7 +56,9 @@ class TableCache { const SliceTransform* prefix_extractor = nullptr, TableReader** table_reader_ptr = nullptr, HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, - Arena* arena = nullptr, bool skip_filters = false, int level = -1); + Arena* arena = nullptr, bool skip_filters = false, int level = -1, + const InternalKey* smallest_compaction_key = nullptr, + const InternalKey* largest_compaction_key = nullptr); // If a seek to internal key "k" in specified file finds an entry, // call (*handle_result)(arg, found_key, found_value) repeatedly until diff --git a/db/version_set.cc b/db/version_set.cc index 434d791e74d..dd1d38e42b6 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -451,6 +451,7 @@ bool SomeFileOverlapsRange( } namespace { + class LevelIterator final : public InternalIterator { public: LevelIterator(TableCache* table_cache, const ReadOptions& read_options, @@ -459,7 +460,9 @@ class LevelIterator final : public InternalIterator { const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor, bool should_sample, HistogramImpl* file_read_hist, bool for_compaction, - bool skip_filters, int level, RangeDelAggregator* range_del_agg) + bool skip_filters, int level, RangeDelAggregator* range_del_agg, + const std::vector* + compaction_boundaries = nullptr) : table_cache_(table_cache), read_options_(read_options), env_options_(env_options), @@ -473,7 +476,8 @@ class LevelIterator final : public InternalIterator { file_index_(flevel_->num_files), level_(level), range_del_agg_(range_del_agg), - pinned_iters_mgr_(nullptr) { + pinned_iters_mgr_(nullptr), + compaction_boundaries_(compaction_boundaries) { // Empty level is not supported. assert(flevel_ != nullptr && flevel_->num_files > 0); } @@ -540,12 +544,18 @@ class LevelIterator final : public InternalIterator { sample_file_read_inc(file_meta.file_metadata); } + const InternalKey* smallest_compaction_key = nullptr; + const InternalKey* largest_compaction_key = nullptr; + if (compaction_boundaries_ != nullptr) { + smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; + largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; + } return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, nullptr /* don't need reference to table */, file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_); + level_, smallest_compaction_key, largest_compaction_key); } TableCache* table_cache_; @@ -565,6 +575,10 @@ class LevelIterator final : public InternalIterator { RangeDelAggregator* range_del_agg_; IteratorWrapper file_iter_; // May be nullptr PinnedIteratorsManager* pinned_iters_mgr_; + + // To be propagated to RangeDelAggregator in order to safely truncate range + // tombstones. + const std::vector* compaction_boundaries_; }; void LevelIterator::Seek(const Slice& target) { @@ -2140,60 +2154,6 @@ void VersionStorageInfo::GetCleanInputsWithinInterval( true /* within_interval */); } -namespace { - -const uint64_t kRangeTombstoneSentinel = - PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); - -// Utility for comparing sstable boundary keys. Returns -1 if either a or b is -// null which provides the property that a==null indicates a key that is less -// than any key and b==null indicates a key that is greater than any key. Note -// that the comparison is performed primarily on the user-key portion of the -// key. If the user-keys compare equal, an additional test is made to sort -// range tombstone sentinel keys before other keys with the same user-key. The -// result is that 2 user-keys will compare equal if they differ purely on -// their sequence number and value, but the range tombstone sentinel for that -// user-key will compare not equal. This is necessary because the range -// tombstone sentinel key is set as the largest key for an sstable even though -// that key never appears in the database. We don't want adjacent sstables to -// be considered overlapping if they are separated by the range tombstone -// sentinel. -int sstableKeyCompare(const Comparator* user_cmp, - const InternalKey& a, const InternalKey& b) { - auto c = user_cmp->Compare(a.user_key(), b.user_key()); - if (c != 0) { - return c; - } - auto a_footer = ExtractInternalKeyFooter(a.Encode()); - auto b_footer = ExtractInternalKeyFooter(b.Encode()); - if (a_footer == kRangeTombstoneSentinel) { - if (b_footer != kRangeTombstoneSentinel) { - return -1; - } - } else if (b_footer == kRangeTombstoneSentinel) { - return 1; - } - return 0; -} - -int sstableKeyCompare(const Comparator* user_cmp, - const InternalKey* a, const InternalKey& b) { - if (a == nullptr) { - return -1; - } - return sstableKeyCompare(user_cmp, *a, b); -} - -int sstableKeyCompare(const Comparator* user_cmp, - const InternalKey& a, const InternalKey* b) { - if (b == nullptr) { - return -1; - } - return sstableKeyCompare(user_cmp, a, *b); -} - -} // namespace - // Store in "*inputs" all files in "level" that overlap [begin,end] // Employ binary search to find at least one file that overlaps the // specified range. From that file, iterate backwards and @@ -4266,7 +4226,8 @@ InternalIterator* VersionSet::MakeInputIterator( false /* should_sample */, nullptr /* no per level latency histogram */, true /* for_compaction */, false /* skip_filters */, - static_cast(which) /* level */, range_del_agg); + static_cast(which) /* level */, range_del_agg, + c->boundaries(which)); } } }