From 182f9366a128eda2a4a632a54b009d0335146ade Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Tue, 7 May 2024 17:05:21 -0400 Subject: [PATCH 1/2] internal/keyspan: add special SPANSTART, SPANEND key kinds --- batch.go | 2 +- batchrepr/reader.go | 2 +- ingest.go | 6 +-- internal/base/internal.go | 17 ++++++- internal/base/internal_test.go | 2 +- internal/keyspan/interleaving_iter.go | 17 ++++++- internal/keyspan/interleaving_iter_test.go | 1 + internal/keyspan/testdata/interleaving_iter | 52 +++++++++++++++++++++ 8 files changed, 89 insertions(+), 10 deletions(-) diff --git a/batch.go b/batch.go index c77896d25a..ea6b12aee7 100644 --- a/batch.go +++ b/batch.go @@ -2284,7 +2284,7 @@ func (i *flushableBatchIter) extractValue() base.LazyValue { return base.LazyValue{} } kind := InternalKeyKind(p[0]) - if kind > InternalKeyKindMax { + if kind > base.InternalKeyKindDurableMax { i.err = base.CorruptionErrorf("corrupted batch") return base.LazyValue{} } diff --git a/batchrepr/reader.go b/batchrepr/reader.go index ebb14d7819..735f69e6ea 100644 --- a/batchrepr/reader.go +++ b/batchrepr/reader.go @@ -87,7 +87,7 @@ func (r *Reader) Next() (kind base.InternalKeyKind, ukey []byte, value []byte, o return 0, nil, nil, false, nil } kind = base.InternalKeyKind((*r)[0]) - if kind > base.InternalKeyKindMax { + if kind > base.InternalKeyKindDurableMax { return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0]) } *r, ukey, ok = DecodeStr((*r)[1:]) diff --git a/ingest.go b/ingest.go index c12591b2be..4c9e697d9e 100644 --- a/ingest.go +++ b/ingest.go @@ -139,7 +139,7 @@ func ingestSynthesizeShared( // a.RANGEDEL.100, with a.RANGEDEL.100 being the smallest key. To create a // correct bound, we just use the maximum key kind (which sorts first). // Similarly, we use the smallest key kind for the largest key. - smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, base.InternalKeyKindMax) + smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, base.InternalKeyKindDurableMax) largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, 0) if sm.LargestPointKey.IsExclusiveSentinel() { largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey) @@ -220,12 +220,12 @@ func ingestLoad1External( if e.EndKeyIsInclusive { meta.ExtendPointKeyBounds( opts.Comparer.Compare, - base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax), + base.MakeInternalKey(smallestCopy, 0, base.InternalKeyKindDurableMax), base.MakeInternalKey(largestCopy, 0, 0)) } else { meta.ExtendPointKeyBounds( opts.Comparer.Compare, - base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax), + base.MakeInternalKey(smallestCopy, 0, base.InternalKeyKindDurableMax), base.MakeRangeDeleteSentinelKey(largestCopy)) } } diff --git a/internal/base/internal.go b/internal/base/internal.go index b16d5d6b92..ae1c6512bb 100644 --- a/internal/base/internal.go +++ b/internal/base/internal.go @@ -96,6 +96,9 @@ const ( // heuristics, but is not required to be accurate for correctness. InternalKeyKindDeleteSized InternalKeyKind = 23 + InternalKeyKindSpanStart InternalKeyKind = 24 + InternalKeyKindSpanEnd InternalKeyKind = 25 + // This maximum value isn't part of the file format. Future extensions may // increase this value. // @@ -105,7 +108,12 @@ const ( // which sorts 'less than or equal to' any other valid internalKeyKind, when // searching for any kind of internal key formed by a certain user key and // seqNum. 
- InternalKeyKindMax InternalKeyKind = 23 + InternalKeyKindMax InternalKeyKind = 25 + + // NB: This is less than InternalKeyKindSpanStart and InternalKeyKindSpanEnd + // because those key kinds are never used in durable formats; only as + // special in-memory indicators. + InternalKeyKindDurableMax InternalKeyKind = 23 // Internal to the sstable format. Not exposed by any sstable iterator. // Declared here to prevent definition of valid key kinds that set this bit. @@ -157,6 +165,8 @@ var internalKeyKindNames = []string{ InternalKeyKindRangeKeyDelete: "RANGEKEYDEL", InternalKeyKindIngestSST: "INGESTSST", InternalKeyKindDeleteSized: "DELSIZED", + InternalKeyKindSpanStart: "SPANSTART", + InternalKeyKindSpanEnd: "SPANEND", InternalKeyKindInvalid: "INVALID", } @@ -249,6 +259,8 @@ var kindsMap = map[string]InternalKeyKind{ "RANGEKEYDEL": InternalKeyKindRangeKeyDelete, "INGESTSST": InternalKeyKindIngestSST, "DELSIZED": InternalKeyKindDeleteSized, + "SPANSTART": InternalKeyKindSpanStart, + "SPANEND": InternalKeyKindSpanEnd, } // ParseInternalKey parses the string representation of an internal key. The @@ -476,7 +488,8 @@ func (k InternalKey) IsExclusiveSentinel() bool { } switch kind := k.Kind(); kind { case InternalKeyKindRangeDelete, InternalKeyKindRangeKeyDelete, - InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet: + InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet, + InternalKeyKindSpanStart, InternalKeyKindSpanEnd: return true default: return false diff --git a/internal/base/internal_test.go b/internal/base/internal_test.go index 39466cd97d..172a550d38 100644 --- a/internal/base/internal_test.go +++ b/internal/base/internal_test.go @@ -41,7 +41,7 @@ func TestInvalidInternalKey(t *testing.T) { "\x01\x02\x03\x04\x05\x06\x07", "foo", "foo\x08\x07\x06\x05\x04\x03\x02", - "foo\x18\x07\x06\x05\x04\x03\x02\x01", + "foo\x1a\x07\x06\x05\x04\x03\x02\x01", } for _, tc := range testCases { k := DecodeInternalKey([]byte(tc)) diff --git a/internal/keyspan/interleaving_iter.go b/internal/keyspan/interleaving_iter.go index a6eee31eae..f7a649c560 100644 --- a/internal/keyspan/interleaving_iter.go +++ b/internal/keyspan/interleaving_iter.go @@ -199,6 +199,10 @@ type InterleavingIterOpts struct { // end keys of spans (in addition to the start keys, which are always // interleaved). InterleaveEndKeys bool + // UseBoundaryKeyKinds configures the interleaving iterator to interleave + // keys using SPANSTART and SPANEND key kinds for start and end boundaries + // respectively, rather than the key kind of the first key in the Span. + UseBoundaryKeyKinds bool } // Init initializes the InterleavingIter to interleave point keys from pointIter @@ -979,7 +983,12 @@ func (i *InterleavingIter) yieldPointKey() *base.InternalKV { func (i *InterleavingIter) yieldSyntheticSpanStartMarker(lowerBound []byte) *base.InternalKV { i.spanMarker.K.UserKey = i.startKey() - i.spanMarker.K.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, i.span.Keys[0].Kind()) + + kind := base.InternalKeyKindSpanStart + if !i.opts.UseBoundaryKeyKinds { + kind = i.span.Keys[0].Kind() + } + i.spanMarker.K.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, kind) // Truncate the key we return to our lower bound if we have one. Note that // we use the lowerBound function parameter, not i.lower. 
The lowerBound @@ -1015,8 +1024,12 @@ func (i *InterleavingIter) yieldSyntheticSpanStartMarker(lowerBound []byte) *bas } func (i *InterleavingIter) yieldSyntheticSpanEndMarker() *base.InternalKV { + kind := base.InternalKeyKindSpanEnd + if !i.opts.UseBoundaryKeyKinds { + kind = i.span.Keys[0].Kind() + } i.spanMarker.K.UserKey = i.endKey() - i.spanMarker.K.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, i.span.Keys[0].Kind()) + i.spanMarker.K.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, kind) return i.verify(&i.spanMarker) } diff --git a/internal/keyspan/interleaving_iter_test.go b/internal/keyspan/interleaving_iter_test.go index f9d4df4674..bdffeccfa2 100644 --- a/internal/keyspan/interleaving_iter_test.go +++ b/internal/keyspan/interleaving_iter_test.go @@ -123,6 +123,7 @@ func runInterleavingIterTest(t *testing.T, filename string) { hooks.threshold = []byte(strings.Join(cmdArg.Vals, "")) } opts.InterleaveEndKeys = td.HasArg("interleave-end-keys") + opts.UseBoundaryKeyKinds = td.HasArg("use-boundary-key-kinds") iter.Init(testkeys.Comparer, &pointIter, keyspanIter, opts) // Clear any previous bounds. pointIter.SetBounds(nil, nil) diff --git a/internal/keyspan/testdata/interleaving_iter b/internal/keyspan/testdata/interleaving_iter index 4d66462000..63e7c2539b 100644 --- a/internal/keyspan/testdata/interleaving_iter +++ b/internal/keyspan/testdata/interleaving_iter @@ -1152,6 +1152,58 @@ Span: f-g:{(#6,RANGEDEL)} -- SpanChanged(nil) . +iter interleave-end-keys use-boundary-key-kinds +first +next +next +next +next +next +next +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: a@4#8,SET +Span: +- +-- SpanChanged(b-e:{(#5,RANGEDEL)}) +PointKey: b#72057594037927935,SPANSTART +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: c@11#8,SET +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: c@3#8,SET +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: c@1#4,SET +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: d@5#3,SET +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: e#72057594037927935,SPANEND +Span: b-e:{(#5,RANGEDEL)} +- +-- SpanChanged(nil) +PointKey: e@9#2,SET +Span: +- +-- SpanChanged(f-g:{(#6,RANGEDEL)}) +PointKey: f#72057594037927935,SPANSTART +Span: f-g:{(#6,RANGEDEL)} +- +PointKey: g#72057594037927935,SPANEND +Span: f-g:{(#6,RANGEDEL)} +- +-- SpanChanged(nil) +. + + iter interleave-end-keys last prev From 154aa2a3fe3312dfe788e818bfb9e47eac8c3033 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Wed, 8 May 2024 11:52:58 -0400 Subject: [PATCH 2/2] db: rework merging iterator range deletion handling Rework the merging iterator's handling of range deletions to use the interleaved range deletion boundary keys to determine when the iterator is positioned within a level's range deletion. This removes the direct manipulation of a range deletion keyspan.FragmentIterator from the mergingIter, delegating that to the child iterator's keyspan.InterleavingIter. This factoring is a little cleaner and decouples the mergingIter from the details of range deletion iterators, and in particular, the levelIter's individual sstables. It also should reduce key comparisons, especially during scans, by avoiding unnecessary key comparisons with range deletions that are loaded but outside the keyspace being iterated over. Close #2863. 
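For illustration only, and not part of the change itself: the contract the merging iterator now relies on can be sketched as a consumer of a single level's interleaved stream. Here getTombstone stands for the accessor returned by rangedel.Interleave (or levelIter.getTombstone), while iter and covering are hypothetical names used just for this sketch:

    // Forward iteration over one level's interleaved stream. The tombstone
    // covering the current position is tracked purely from the interleaved
    // SPANSTART/SPANEND boundary keys; no separate FragmentIterator is
    // repositioned.
    var covering *keyspan.Span
    for kv := iter.First(); kv != nil; kv = iter.Next() {
        switch kv.K.Kind() {
        case base.InternalKeyKindSpanStart:
            covering = getTombstone() // stepping into a range deletion
        case base.InternalKeyKindSpanEnd:
            covering = nil // stepping out of the range deletion
        default:
            // kv is a point key. If covering is non-nil, it is a span whose
            // [Start, End) bounds contain kv.K.UserKey, and covering.CoversAt
            // decides whether kv is shadowed at a given snapshot.
        }
    }

Because the boundary keys participate in the merging heap's ordering, a level's tombstone only needs to be consulted while the heap root lies between that tombstone's interleaved start and end keys, which is what avoids the unnecessary comparisons described above.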
--- db.go | 20 +- external_iterator.go | 10 +- internal/base/internal.go | 11 + internal/keyspan/span.go | 7 +- internal/rangedel/rangedel.go | 48 +++ iterator.go | 4 +- level_checker.go | 57 ++- level_iter.go | 89 ++--- level_iter_test.go | 95 +---- merging_iter.go | 624 ++++++++++++++------------------- merging_iter_test.go | 9 +- scan_internal.go | 4 +- testdata/ingest | 2 +- testdata/level_iter_boundaries | 72 ++-- testdata/level_iter_seek | 70 ++-- testdata/merging_iter | 18 +- testdata/metrics | 18 +- 17 files changed, 500 insertions(+), 658 deletions(-) diff --git a/db.go b/db.go index b4b9acd0a4..2d6cb05344 100644 --- a/db.go +++ b/db.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/manual" + "github.com/cockroachdb/pebble/internal/rangedel" "github.com/cockroachdb/pebble/objstorage" "github.com/cockroachdb/pebble/objstorage/remote" "github.com/cockroachdb/pebble/rangekey" @@ -1431,19 +1432,16 @@ func (i *Iterator) constructPointIter( } else { i.batch.initInternalIter(&i.opts, &i.batchPointIter) i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, i.batchSeqNum) + mil := mergingIterLevel{iter: &i.batchPointIter, getTombstone: nil} // Only include the batch's rangedel iterator if it's non-empty. // This requires some subtle logic in the case a rangedel is later // written to the batch and the view of the batch is refreshed // during a call to SetOptions—in this case, we need to reconstruct // the point iterator to add the batch rangedel iterator. - var rangeDelIter keyspan.FragmentIterator if i.batchRangeDelIter.Count() > 0 { - rangeDelIter = &i.batchRangeDelIter + mil.iter, mil.getTombstone = rangedel.Interleave(&i.comparer, &i.batchPointIter, &i.batchRangeDelIter) } - mlevels = append(mlevels, mergingIterLevel{ - iter: &i.batchPointIter, - rangeDelIter: rangeDelIter, - }) + mlevels = append(mlevels, mil) } } @@ -1451,10 +1449,9 @@ func (i *Iterator) constructPointIter( // Next are the memtables. for j := len(memtables) - 1; j >= 0; j-- { mem := memtables[j] - mlevels = append(mlevels, mergingIterLevel{ - iter: mem.newIter(&i.opts), - rangeDelIter: mem.newRangeDelIter(&i.opts), - }) + mil := mergingIterLevel{} + mil.iter, mil.getTombstone = rangedel.Interleave(&i.comparer, mem.newIter(&i.opts), mem.newRangeDelIter(&i.opts)) + mlevels = append(mlevels, mil) } // Next are the file levels: L0 sub-levels followed by lower levels. 
@@ -1467,10 +1464,11 @@ li := &levels[levelsIndex] li.init(ctx, i.opts, &i.comparer, i.newIters, files, level, internalOpts) - li.initRangeDel(&mlevels[mlevelsIndex].rangeDelIter) + li.interleaveRangeDeletions = true li.initCombinedIterState(&i.lazyCombinedIter.combinedIterState) mlevels[mlevelsIndex].levelIter = li mlevels[mlevelsIndex].iter = invalidating.MaybeWrapIfInvariants(li) + mlevels[mlevelsIndex].getTombstone = li.getTombstone levelsIndex++ mlevelsIndex++ diff --git a/external_iterator.go b/external_iterator.go index 1d8f0d5217..92dd29e5d0 100644 --- a/external_iterator.go +++ b/external_iterator.go @@ -11,6 +11,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/rangedel" "github.com/cockroachdb/pebble/sstable" ) @@ -212,10 +213,11 @@ func createExternalPointIter(ctx context.Context, it *Iterator) (topLevelIterato if err != nil { return nil, err } - mlevels = append(mlevels, mergingIterLevel{ - iter: pointIter, - rangeDelIter: rangeDelIter, - }) + mil := mergingIterLevel{iter: pointIter, getTombstone: nil} + if rangeDelIter != nil { + mil.iter, mil.getTombstone = rangedel.Interleave(&it.comparer, mil.iter, rangeDelIter) + } + mlevels = append(mlevels, mil) } } diff --git a/internal/base/internal.go b/internal/base/internal.go index ae1c6512bb..442d4ddb51 100644 --- a/internal/base/internal.go +++ b/internal/base/internal.go @@ -574,3 +574,14 @@ func (kv *InternalKV) Visible(snapshot, batchSnapshot uint64) bool { func (kv *InternalKV) IsExclusiveSentinel() bool { return kv.K.IsExclusiveSentinel() } + +// String returns a string representation of the kv pair. +func (kv *InternalKV) String() string { + if kv == nil { + return "<nil>" + } + if kv.V.Fetcher != nil { + return fmt.Sprintf("%s=<lazy-value: %x>", kv.K, kv.V.ValueOrHandle) + } + return fmt.Sprintf("%s:%s", kv.K, FormatBytes(kv.V.ValueOrHandle)) +} diff --git a/internal/keyspan/span.go b/internal/keyspan/span.go index b78cc44ed7..c25bf9eef7 100644 --- a/internal/keyspan/span.go +++ b/internal/keyspan/span.go @@ -298,10 +298,11 @@ func (s Span) Visible(snapshot uint64) Span { // VisibleAt requires the Span's keys be in ByTrailerDesc order. It panics if // the span's keys are sorted in a different order. func (s *Span) VisibleAt(snapshot uint64) bool { - if s.KeysOrder != ByTrailerDesc { + if s == nil { + return false + } else if s.KeysOrder != ByTrailerDesc { panic("pebble: span's keys unexpectedly not in trailer order") - } - if len(s.Keys) == 0 { + } else if len(s.Keys) == 0 { return false } else if first := s.Keys[0].SeqNum(); first&base.InternalKeySeqNumBatch != 0 { // Only visible batch keys are included when an Iterator's batch spans diff --git a/internal/rangedel/rangedel.go b/internal/rangedel/rangedel.go index f8504bb7b2..4c846d12b1 100644 --- a/internal/rangedel/rangedel.go +++ b/internal/rangedel/rangedel.go @@ -5,6 +5,8 @@ package rangedel import ( + "sync" + "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/keyspan" ) @@ -41,3 +43,49 @@ func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) keyspan.Span { }), } } + +// Interleave takes a point iterator and a range deletion iterator, returning an +// iterator that interleaves the range deletions' boundary keys among the stream +// of point keys, marking them with the special SPANSTART and SPANEND key kinds +// at the maximal sequence number.
+// +// In addition, Interleave returns a function that may be used to retrieve the +// range tombstone overlapping the current iterator position, if any. +// +// The returned iterator must only be closed once. +func Interleave( + comparer *base.Comparer, iter base.InternalIterator, rangeDelIter keyspan.FragmentIterator, +) (base.InternalIterator, func() *keyspan.Span) { + // If there is no range deletion iterator, don't bother using an interleaving + // iterator. We can return iter verbatim and a nil getTombstone func. + if rangeDelIter == nil { + return iter, nil + } + + ii := interleavingIterPool.Get().(*interleavingIter) + ii.Init(comparer, iter, rangeDelIter, keyspan.InterleavingIterOpts{ + InterleaveEndKeys: true, + UseBoundaryKeyKinds: true, + }) + return ii, ii.Span +} + +var interleavingIterPool = sync.Pool{ + New: func() interface{} { + return &interleavingIter{} + }, +} + +type interleavingIter struct { + keyspan.InterleavingIter +} + +// Close closes the interleaving iterator and returns the interleaving iterator +// to the pool. +func (i *interleavingIter) Close() error { + err := i.InterleavingIter.Close() + *i = interleavingIter{} + interleavingIterPool.Put(i) + return err +} diff --git a/iterator.go b/iterator.go index bba2a131bd..19187cd16d 100644 --- a/iterator.go +++ b/iterator.go @@ -1820,8 +1820,8 @@ func (i *Iterator) nextPrefix() IterValidityState { return i.iterValidityState } if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) { - i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q", - i.iterKV.K.UserKey, i.key) + panic(errors.AssertionFailedf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q", + i.iterKV.K.UserKey, i.key)) } } // The internal iterator is now positioned at i.key. Advance to the next diff --git a/level_checker.go b/level_checker.go index b2f0936b1c..7b5ca57390 100644 --- a/level_checker.go +++ b/level_checker.go @@ -14,6 +14,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/rangedel" ) // This file implements DB.CheckLevels() which checks that every entry in the @@ -47,11 +48,10 @@ import ( // The per-level structure used by simpleMergingIter. type simpleMergingIterLevel struct { - iter internalIterator - rangeDelIter keyspan.FragmentIterator - - iterKV *base.InternalKV - tombstone *keyspan.Span + iter internalIterator + getTombstone func() *keyspan.Span + iterKV *base.InternalKV + iterTombstone *keyspan.Span } type simpleMergingIter struct { @@ -102,22 +102,6 @@ func (m *simpleMergingIter) init( if m.heap.len() == 0 { return } - m.positionRangeDels() -} - -// Positions all the rangedel iterators at or past the current top of the -// heap, using SeekGE(). -func (m *simpleMergingIter) positionRangeDels() { - item := &m.heap.items[0] - for i := range m.levels { - l := &m.levels[i] - if l.rangeDelIter == nil { - continue - } - t, err := l.rangeDelIter.SeekGE(item.key.UserKey) - m.err = firstError(m.err, err) - l.tombstone = t - } } // Returns true if not yet done. @@ -128,7 +112,12 @@ func (m *simpleMergingIter) step() bool { item := &m.heap.items[0] l := &m.levels[item.index] // Sentinels are not relevant for this point checking.
- if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { + if item.key.IsExclusiveSentinel() { + l.iterTombstone = nil + if item.key.Kind() != base.InternalKeyKindSpanEnd { + l.iterTombstone = l.getTombstone() + } + } else if item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { // This is a visible point key. if !m.handleVisiblePoint(item, l) { return false @@ -187,7 +176,6 @@ func (m *simpleMergingIter) step() bool { } return false } - m.positionRangeDels() return true } @@ -269,12 +257,12 @@ func (m *simpleMergingIter) handleVisiblePoint( // iterators must be positioned at a key > item.key. for level := item.index + 1; level < len(m.levels); level++ { lvl := &m.levels[level] - if lvl.rangeDelIter == nil || lvl.tombstone.Empty() { + if lvl.iterTombstone.Empty() { continue } - if lvl.tombstone.Contains(m.heap.cmp, item.key.UserKey) && lvl.tombstone.CoversAt(m.snapshot, item.key.SeqNum()) { + if lvl.iterTombstone.Contains(m.heap.cmp, item.key.UserKey) && lvl.iterTombstone.CoversAt(m.snapshot, item.key.SeqNum()) { m.err = errors.Errorf("tombstone %s in %s deletes key %s in %s", - lvl.tombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey), + lvl.iterTombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey), l.iter) return false } @@ -593,20 +581,15 @@ func checkLevelsInternal(c *checkConfig) (err error) { err = firstError(err, l.iter.Close()) l.iter = nil } - if l.rangeDelIter != nil { - err = firstError(err, l.rangeDelIter.Close()) - l.rangeDelIter = nil - } } }() memtables := c.readState.memtables for i := len(memtables) - 1; i >= 0; i-- { mem := memtables[i] - mlevels = append(mlevels, simpleMergingIterLevel{ - iter: mem.newIter(nil), - rangeDelIter: mem.newRangeDelIter(nil), - }) + var smil simpleMergingIterLevel + smil.iter, smil.getTombstone = rangedel.Interleave(c.comparer, mem.newIter(nil), mem.newRangeDelIter(nil)) + mlevels = append(mlevels, smil) } current := c.readState.current @@ -636,8 +619,9 @@ func checkLevelsInternal(c *checkConfig) (err error) { li := &levelIter{} li.init(context.Background(), iterOpts, c.comparer, c.newIters, manifestIter, manifest.L0Sublevel(sublevel), internalIterOpts{}) - li.initRangeDel(&mlevelAlloc[0].rangeDelIter) + li.interleaveRangeDeletions = true mlevelAlloc[0].iter = li + mlevelAlloc[0].getTombstone = li.getTombstone mlevelAlloc = mlevelAlloc[1:] } for level := 1; level < len(current.Levels); level++ { @@ -649,8 +633,9 @@ func checkLevelsInternal(c *checkConfig) (err error) { li := &levelIter{} li.init(context.Background(), iterOpts, c.comparer, c.newIters, current.Levels[level].Iter(), manifest.Level(level), internalIterOpts{}) - li.initRangeDel(&mlevelAlloc[0].rangeDelIter) + li.interleaveRangeDeletions = true mlevelAlloc[0].iter = li + mlevelAlloc[0].getTombstone = li.getTombstone mlevelAlloc = mlevelAlloc[1:] } diff --git a/level_iter.go b/level_iter.go index 00af0887d1..3ead730aa4 100644 --- a/level_iter.go +++ b/level_iter.go @@ -80,19 +80,8 @@ type levelIter struct { // iterFile holds the current file. It is always equal to l.files.Current(). iterFile *fileMetadata newIters tableNewIters - // When rangeDelIterPtr != nil, the caller requires that *rangeDelIterPtr must - // point to a range del iterator corresponding to the current file. When this - // iterator returns nil, *rangeDelIterPtr should also be set to nil. Whenever - // a non-nil internalIterator is placed in rangeDelIterPtr, a copy is placed - // in rangeDelIterCopy. 
This is done for the following special case: - // when this iterator returns nil because of exceeding the bounds, we don't - // close iter and *rangeDelIterPtr since we could reuse it in the next seek. But - // we need to set *rangeDelIterPtr to nil because of the aforementioned contract. - // This copy is used to revive the *rangeDelIterPtr in the case of reuse. - rangeDelIterPtr *keyspan.FragmentIterator - rangeDelIterCopy keyspan.FragmentIterator - files manifest.LevelIterator - err error + files manifest.LevelIterator + err error // internalOpts holds the internal iterator options to pass to the table // cache when constructing new table iterators. @@ -118,6 +107,9 @@ type levelIter struct { // first or last key within iteration bounds. exhaustedDir int8 + // interleaveRangeDeletions configures the levelIter to interleave the + // boundary keys of each file's range deletions among the file's point keys, + // exposing the tombstone overlapping the current position via getTombstone. + interleaveRangeDeletions bool + // Disable invariant checks even if they are otherwise enabled. Used by tests // which construct "impossible" situations (e.g. seeking to a key before the // lower bound). @@ -176,10 +168,7 @@ func (l *levelIter) init( l.files = files l.exhaustedDir = 0 l.internalOpts = internalOpts -} - -func (l *levelIter) initRangeDel(rangeDelIter *keyspan.FragmentIterator) { - l.rangeDelIterPtr = rangeDelIter + l.interleaveRangeDeletions = false } func (l *levelIter) initCombinedIterState(state *combinedIterState) { @@ -488,13 +477,6 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato return noFileLoaded } if l.iter != nil { - // We don't bother comparing the file bounds with the iteration bounds when we have - // an already open iterator. It is possible that the iter may not be relevant given the - // current iteration bounds, but it knows those bounds, so it will enforce them. - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = l.rangeDelIterCopy - } - // There are a few reasons we might not have triggered combined // iteration yet, even though we already had `file` open. // 1. If the bounds changed, we might have previously avoided @@ -511,10 +493,8 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato // have changed. We handle that below. } - // Close both iter and rangeDelIterPtr. While mergingIter knows about - // rangeDelIterPtr, it can't call Close() on it because it does not know - // when the levelIter will switch it. Note that levelIter.Close() can be - // called multiple times. + // Close the iterator. Note that levelIter.Close() can be called multiple + // times. if err := l.Close(); err != nil { return noFileLoaded } @@ -556,7 +536,7 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato } iterKinds := iterPointKeys - if l.rangeDelIterPtr != nil { + if l.interleaveRangeDeletions { iterKinds |= iterRangeDeletions } @@ -566,32 +546,17 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato return noFileLoaded } l.iter = iters.Point() - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = iters.rangeDeletion - l.rangeDelIterCopy = iters.rangeDeletion - + if l.interleaveRangeDeletions && iters.rangeDeletion != nil { // If this file has range deletions, interleave the bounds of the // range deletions among the point keys. When used with a // mergingIter, this ensures we don't move beyond a file with range // deletions until its range deletions are no longer relevant. - // - // For now, we open a second range deletion iterator.
Future work - // will avoid the need to open a second range deletion iterator, and - // avoid surfacing the file's range deletion iterator directly to - // the caller. - var itersForBounds iterSet - itersForBounds, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts, iterRangeDeletions) - l.interleaving.Init(l.comparer, l.iter, itersForBounds.RangeDeletion(), keyspan.InterleavingIterOpts{ - LowerBound: l.tableOpts.LowerBound, - UpperBound: l.tableOpts.UpperBound, - InterleaveEndKeys: true, + l.interleaving.Init(l.comparer, l.iter, iters.rangeDeletion, keyspan.InterleavingIterOpts{ + LowerBound: l.tableOpts.LowerBound, + UpperBound: l.tableOpts.UpperBound, + InterleaveEndKeys: true, + UseBoundaryKeyKinds: true, }) - if l.err != nil { - l.iter = nil - *l.rangeDelIterPtr = nil - l.err = errors.CombineErrors(l.err, iters.CloseAll()) - return noFileLoaded - } l.iter = &l.interleaving } return newFileLoaded @@ -884,16 +849,23 @@ func (l *levelIter) skipEmptyFileBackward() *base.InternalKV { func (l *levelIter) exhaustedForward() { l.exhaustedDir = +1 - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil - } } func (l *levelIter) exhaustedBackward() { l.exhaustedDir = -1 - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil +} + +// getTombstone retrieves the range tombstone covering the current iterator +// position. If there is none, or if the iterator is not configured to +// interleave range deletions, getTombstone returns nil. +// +// The returned Span's memory is guaranteed to be valid until the iterator is +// moved beyond the Span's interleaved boundary keys. +func (l *levelIter) getTombstone() *keyspan.Span { + if l.iter != &l.interleaving { + return nil } + return l.interleaving.Span() } func (l *levelIter) Error() error { @@ -908,13 +880,6 @@ func (l *levelIter) Close() error { l.err = l.iter.Close() l.iter = nil } - if l.rangeDelIterPtr != nil { - if t := l.rangeDelIterCopy; t != nil { - l.err = firstError(l.err, t.Close()) - } - *l.rangeDelIterPtr = nil - l.rangeDelIterCopy = nil - } return l.err } diff --git a/level_iter_test.go b/level_iter_test.go index 7303c691eb..b1263e9d58 100644 --- a/level_iter_test.go +++ b/level_iter_test.go @@ -89,8 +89,7 @@ func TestLevelIter(t *testing.T) { iter := newLevelIter(context.Background(), opts, testkeys.Comparer, newIters, files.Iter(), manifest.Level(level), internalIterOpts{}) defer iter.Close() - // Fake up the range deletion initialization. - iter.initRangeDel(new(keyspan.FragmentIterator)) + iter.interleaveRangeDeletions = true iter.disableInvariants = true return itertest.RunInternalIterCmd(t, d, iter, itertest.Verbose) @@ -333,8 +332,7 @@ func TestLevelIterBoundaries(t *testing.T) { if iter == nil { slice := manifest.NewLevelSliceKeySorted(lt.cmp.Compare, lt.metas) iter = newLevelIter(context.Background(), IterOptions{}, testkeys.Comparer, lt.newIters, slice.Iter(), manifest.Level(level), internalIterOpts{}) - // Fake up the range deletion initialization. - iter.initRangeDel(new(keyspan.FragmentIterator)) + iter.interleaveRangeDeletions = true } if !save { defer func() { @@ -363,81 +361,6 @@ func TestLevelIterBoundaries(t *testing.T) { }) } -// levelIterTestIter allows a datadriven test to use runInternalIterCmd and -// perform parallel operations on both both a levelIter and rangeDelIter. -type levelIterTestIter struct { - *levelIter - rangeDelIter keyspan.FragmentIterator - // rangeDel is only set on seeks: SeekGE, SeekLT, SeekPrefixGE. - // TODO(jackson): Clean this up when #2863 is resolved. 
- rangeDel *keyspan.Span -} - -func must(err error) { - if err != nil { - panic(err) - } -} - -func (i *levelIterTestIter) getRangeDel() *keyspan.Span { - return i.rangeDel -} - -func (i *levelIterTestIter) rangeDelSeek( - key []byte, kv *base.InternalKV, dir int, -) *base.InternalKV { - i.rangeDel = nil - if i.rangeDelIter != nil { - var t *keyspan.Span - var err error - if dir < 0 { - t, err = keyspan.SeekLE(i.levelIter.cmp, i.rangeDelIter, key) - } else { - t, err = i.rangeDelIter.SeekGE(key) - } - // TODO(jackson): Clean this up when the InternalIterator interface - // is refactored to return an error return value from all - // positioning methods. - must(err) - if t != nil { - i.rangeDel = new(keyspan.Span) - *i.rangeDel = t.Visible(1000) - } - } - return kv -} - -func (i *levelIterTestIter) String() string { - return "level-iter-test" -} - -func (i *levelIterTestIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { - kv := i.levelIter.SeekGE(key, flags) - return i.rangeDelSeek(key, kv, 1) -} - -func (i *levelIterTestIter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) *base.InternalKV { - kv := i.levelIter.SeekPrefixGE(prefix, key, flags) - return i.rangeDelSeek(key, kv, 1) -} - -func (i *levelIterTestIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { - kv := i.levelIter.SeekLT(key, flags) - return i.rangeDelSeek(key, kv, -1) -} - -func (i *levelIterTestIter) Next() *base.InternalKV { - i.rangeDel = nil - return i.levelIter.Next() -} - -func (i *levelIterTestIter) Prev() *base.InternalKV { - i.rangeDel = nil - return i.levelIter.Prev() -} - func TestLevelIterSeek(t *testing.T) { lt := newLevelIterTest() defer lt.runClear() @@ -453,12 +376,12 @@ func TestLevelIterSeek(t *testing.T) { case "iter": var stats base.InternalIteratorStats slice := manifest.NewLevelSliceKeySorted(lt.cmp.Compare, lt.metas) - iter := &levelIterTestIter{levelIter: &levelIter{}} + iter := &levelIter{} iter.init(context.Background(), IterOptions{}, testkeys.Comparer, lt.newIters, slice.Iter(), manifest.Level(level), internalIterOpts{stats: &stats}) defer iter.Close() - iter.initRangeDel(&iter.rangeDelIter) - return itertest.RunInternalIterCmd(t, d, iter, itertest.Verbose, itertest.WithSpan(iter.getRangeDel), itertest.WithStats(&stats)) + iter.interleaveRangeDeletions = true + return itertest.RunInternalIterCmd(t, d, iter, itertest.Verbose, itertest.WithSpan(iter.getTombstone), itertest.WithStats(&stats)) case "iters-created": return fmt.Sprintf("%d", lt.itersCreated) @@ -605,9 +528,7 @@ func BenchmarkLevelIterSeqSeekGEWithBounds(b *testing.B) { return iterSet{point: iter}, err } l := newLevelIter(context.Background(), IterOptions{}, DefaultComparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) - // Fake up the range deletion initialization, to resemble the usage - // in a mergingIter. - l.initRangeDel(new(keyspan.FragmentIterator)) + l.interleaveRangeDeletions = true keyCount := len(keys) b.ResetTimer() for i := 0; i < b.N; i++ { @@ -653,9 +574,7 @@ func BenchmarkLevelIterSeqSeekPrefixGE(b *testing.B) { b.Run(fmt.Sprintf("skip=%d/use-next=%t", skip, useNext), func(b *testing.B) { l := newLevelIter(context.Background(), IterOptions{}, testkeys.Comparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) - // Fake up the range deletion initialization, to resemble the usage - // in a mergingIter. 
- l.initRangeDel(new(keyspan.FragmentIterator)) + l.interleaveRangeDeletions = true keyCount := len(keys) pos := 0 l.SeekPrefixGE(keys[pos], keys[pos], base.SeekGEFlagsNone) diff --git a/merging_iter.go b/merging_iter.go index e04df79c90..8161b3891c 100644 --- a/merging_iter.go +++ b/merging_iter.go @@ -20,25 +20,71 @@ import ( type mergingIterLevel struct { index int iter internalIterator - // rangeDelIter is set to the range-deletion iterator for the level. When - // configured with a levelIter, this pointer changes as sstable boundaries - // are crossed. See levelIter.initRangeDel and the Range Deletions comment - // below. - rangeDelIter keyspan.FragmentIterator + // getTombstone, if non-nil, returns the current range deletion tombstone + // covering the level's current key, if any. This is typically a pointer to + // levelIter.getTombstone if using a levelIter, or a func returned by + // rangedel.Interleave otherwise. + getTombstone func() *keyspan.Span // iterKV caches the current key-value pair iter points to. iterKV *base.InternalKV + // tombstone holds a Span containing a visible range deletion tombstone that + // covers the mergingIter's current heap root, if any. It's updated when an + // interleaved range deletion boundary rises to the top of the heap, and the + // merging iterator steps over it. Care must be taken to update it + // appropriately when the heap is switching directions. See + // updateTombstone{Forward,Backward}. + tombstone *keyspan.Span // levelIter is non-nil if this level's iter is ultimately backed by a // *levelIter. The handle in iter may have wrapped the levelIter with // intermediary internalIterator implementations. levelIter *levelIter +} - // tombstone caches the tombstone rangeDelIter is currently pointed at. If - // tombstone is nil, there are no further tombstones within the - // current sstable in the current iterator direction. The cached tombstone is - // only valid for the levels in the range [0,heap[0].index]. This avoids - // positioning tombstones at lower levels which cannot possibly shadow the - // current key. - tombstone *keyspan.Span +func (l *mergingIterLevel) String() string { + return fmt.Sprintf("[index:%d iter:%s iterKV:%v tombstone:%s]", l.index, l.iter, l.iterKV, l.tombstone) +} + +// updateTombstoneForward updates the value of l.tombstone to reflect the +// current range deletion span covering the iterator position, if any, while +// oriented in the forward direction. It's called while re-orienting the +// direction of the heap in switchToMinHeap. +func (l *mergingIterLevel) updateTombstoneForward(snapshot uint64) { + switch { + case l.iterKV == nil: + l.tombstone = nil + case l.iterKV.K.Kind() == base.InternalKeyKindSpanStart: + // If we're positioned at a span start, the span lies ahead of us, and + // we're not yet positioned within the span. + l.tombstone = nil + default: + l.tombstone = l.getVisibleTombstone(snapshot) + } +} + +// updateTombstoneBackward updates the value of l.tombstone to reflect the +// current range deletion span covering the iterator position, if any, while +// oriented in the reverse direction. It's called while re-orienting the +// direction of the heap in switchToMaxHeap. +func (l *mergingIterLevel) updateTombstoneBackward(snapshot uint64) { + switch { + case l.iterKV == nil: + l.tombstone = nil + case l.iterKV.K.Kind() == base.InternalKeyKindSpanEnd: + // If we're positioned at a span end, the span lies ahead of us (in the + // reverse direction), and we're not yet positioned within the span.
+ l.tombstone = nil + default: + l.tombstone = l.getVisibleTombstone(snapshot) + } +} + +func (l *mergingIterLevel) getVisibleTombstone(snapshot uint64) *keyspan.Span { + if l.getTombstone != nil { + if t := l.getTombstone(); t != nil && t.VisibleAt(snapshot) { + return t + } + } + return nil } // mergingIter provides a merged view of multiple iterators from different @@ -273,6 +319,7 @@ func newMergingIter( levels := make([]mergingIterLevel, len(iters)) for i := range levels { levels[i].iter = iters[i] + levels[i].getTombstone = nil } m.init(&IterOptions{logger: logger}, stats, cmp, split, levels...) return m @@ -317,69 +364,16 @@ func (m *mergingIter) initHeap() { m.heap.init() } -func (m *mergingIter) initMinHeap() error { +func (m *mergingIter) initMinHeap() { m.dir = 1 m.heap.reverse = false m.initHeap() - return m.initMinRangeDelIters(-1) } -// The level of the previous top element was oldTopLevel. Note that all range delete -// iterators < oldTopLevel are positioned past the key of the previous top element and -// the range delete iterator == oldTopLevel is positioned at or past the key of the -// previous top element. We need to position the range delete iterators from oldTopLevel + 1 -// to the level of the current top element. -func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) error { - if m.heap.len() == 0 { - return nil - } - - // Position the range-del iterators at levels <= m.heap.items[0].index. - item := m.heap.items[0] - for level := oldTopLevel + 1; level <= item.index; level++ { - l := &m.levels[level] - if l.rangeDelIter == nil { - continue - } - var err error - l.tombstone, err = l.rangeDelIter.SeekGE(item.iterKV.K.UserKey) - if err != nil { - return err - } - } - return nil -} - -func (m *mergingIter) initMaxHeap() error { +func (m *mergingIter) initMaxHeap() { m.dir = -1 m.heap.reverse = true m.initHeap() - return m.initMaxRangeDelIters(-1) -} - -// The level of the previous top element was oldTopLevel. Note that all range delete -// iterators < oldTopLevel are positioned before the key of the previous top element and -// the range delete iterator == oldTopLevel is positioned at or before the key of the -// previous top element. We need to position the range delete iterators from oldTopLevel + 1 -// to the level of the current top element. -func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) error { - if m.heap.len() == 0 { - return nil - } - // Position the range-del iterators at levels <= m.heap.items[0].index. - item := m.heap.items[0] - for level := oldTopLevel + 1; level <= item.index; level++ { - l := &m.levels[level] - if l.rangeDelIter == nil { - continue - } - tomb, err := keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKV.K.UserKey) - if err != nil { - return err - } - l.tombstone = tomb - } - return nil } func (m *mergingIter) switchToMinHeap() error { @@ -416,7 +410,9 @@ func (m *mergingIter) switchToMinHeap() error { break } // key >= iter-key + l.updateTombstoneForward(m.snapshot) } + l.updateTombstoneForward(m.snapshot) if l.iterKV == nil { if err := l.iter.Error(); err != nil { return err @@ -427,12 +423,14 @@ func (m *mergingIter) switchToMinHeap() error { // Special handling for the current iterator because we were using its key // above. 
cur.iterKV = cur.iter.Next() + cur.updateTombstoneForward(m.snapshot) if cur.iterKV == nil { if err := cur.iter.Error(); err != nil { return err } } - return m.initMinHeap() + m.initMinHeap() + return nil } func (m *mergingIter) switchToMaxHeap() error { @@ -468,8 +466,10 @@ func (m *mergingIter) switchToMaxHeap() error { // key > iter-key break } + l.updateTombstoneBackward(m.snapshot) // key <= iter-key } + l.updateTombstoneBackward(m.snapshot) if l.iterKV == nil { if err := l.iter.Error(); err != nil { return err } } @@ -480,12 +480,14 @@ func (m *mergingIter) switchToMaxHeap() error { // Special handling for the current iterator because we were using its key // above. cur.iterKV = cur.iter.Prev() + cur.updateTombstoneBackward(m.snapshot) if cur.iterKV == nil { if err := cur.iter.Error(); err != nil { return err } } - return m.initMaxHeap() + m.initMaxHeap() + return nil } // nextEntry unconditionally steps to the next entry. item is the current top @@ -518,8 +520,14 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { } } - oldTopLevel := l.index - oldRangeDelIter := l.rangeDelIter + // Before we advance, consider whether we're moving into or out of a range + // tombstone within the level. + switch l.iterKV.K.Kind() { + case base.InternalKeyKindSpanStart: + l.tombstone = l.getVisibleTombstone(m.snapshot) + case base.InternalKeyKindSpanEnd: + l.tombstone = nil + } if succKey == nil { l.iterKV = l.iter.Next() @@ -531,6 +539,7 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { if err := l.iter.Error(); err != nil { return err } + l.tombstone = nil m.heap.pop() } else { if m.prefix != nil && !bytes.Equal(m.prefix, l.iterKV.K.UserKey[:m.split(l.iterKV.K.UserKey)]) { @@ -541,17 +550,9 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { } else if m.heap.len() > 1 { m.heap.fix(0) } - if l.rangeDelIter != oldRangeDelIter { - // The rangeDelIter changed which indicates that the l.iter moved to the - // next sstable. We have to update the tombstone for oldTopLevel as well. - oldTopLevel-- - } } - // The cached tombstones are only valid for the levels - // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range - // [oldTopLevel+1,heap[0].index]. - return m.initMinRangeDelIters(oldTopLevel) + return nil } // isNextEntryDeleted starts from the current entry (as the next entry) and if @@ -564,111 +565,102 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { // during prefix-iteration mode. func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) (bool, error) { // Look for a range deletion tombstone containing item.iterKV at higher - // levels (level < item.index). If we find such a range tombstone we know - // it deletes the key in the current level. Also look for a range - // deletion at the current level (level == item.index). If we find such a - // range deletion we need to check whether it is newer than the current - // entry. + // levels (level < item.index). If we find such a range tombstone we know it + // deletes the key in the current level. Also look for a range deletion at + // the current level (level == item.index). If we find such a range deletion + // we need to check whether it is newer than the current entry.
for level := 0; level <= item.index; level++ { l := &m.levels[level] - if l.rangeDelIter == nil || l.tombstone == nil { - // If l.tombstone is nil, there are no further tombstones - // in the current sstable in the current (forward) iteration - // direction. + if l.tombstone == nil { + // This level does not contain a tombstone overlapping the current + // heap root. continue } - if m.heap.cmp(l.tombstone.End, item.iterKV.K.UserKey) <= 0 { - // The current key is at or past the tombstone end key. - // - // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that - // the levelIter must be positioned at a key >= item.iterKV. So it is sufficient to seek the - // current l.rangeDelIter (since any range del iterators that will be provided by the - // levelIter in the future cannot contain item.iterKV). Also, it is possible that we - // will encounter parts of the range delete that should be ignored -- we handle that - // below. - var err error - l.tombstone, err = l.rangeDelIter.SeekGE(item.iterKV.K.UserKey) - if err != nil { - return false, err + // Heap invariants must ensure that l.tombstone always contains the + // current heap root within its span boundaries. Ensure this holds in + // invariant builds. + if invariants.Enabled { + if !l.tombstone.Valid() { + panic(errors.AssertionFailedf("tombstone from level %s is not valid", l)) + } else if !l.tombstone.VisibleAt(m.snapshot) { + panic(errors.AssertionFailedf("tombstone from level %s is not visible at snapshot %d", l, m.snapshot)) + } else if !l.tombstone.Contains(m.heap.cmp, item.iterKV.K.UserKey) { + panic(errors.AssertionFailedf("tombstone from level %s does not contain key %s", l, item.iterKV.K)) } } - if l.tombstone == nil { - continue - } + if level < item.index { + // We could also do m.seekGE(..., level + 1). The levels from [level + // + 1, item.index) are already after item.iterKV so seeking them + // may be wasteful. - if l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.Start, item.iterKV.K.UserKey) <= 0 { - if level < item.index { - // We could also do m.seekGE(..., level + 1). The levels from - // [level + 1, item.index) are already after item.iterKV so seeking them may be - // wasteful. - - // We can seek up to tombstone.End. - // - // Progress argument: Since this file is at a higher level than item.iterKV we know - // that the iterator in this file must be positioned within its bounds and at a key - // X > item.iterKV (otherwise it would be the min of the heap). It is not - // possible for X.UserKey == item.iterKV.UserKey, since it is incompatible with - // X > item.iterKV (a lower version cannot be in a higher sstable), so it must be that - // X.UserKey > item.iterKV.UserKey. Which means l.largestUserKey > item.key.UserKey. - // We also know that l.tombstone.End > item.iterKV.UserKey. So the min of these, - // seekKey, computed below, is > item.iterKV.UserKey, so the call to seekGE() will - // make forward progress. - seekKey := l.tombstone.End - // This seek is not directly due to a SeekGE call, so we don't know - // enough about the underlying iterator positions, and so we keep the - // try-seek-using-next optimization disabled. Additionally, if we're in - // prefix-seek mode and a re-seek would have moved us past the original - // prefix, we can remove all merging iter levels below the rangedel - // tombstone's level and return immediately instead of re-seeking. This - // is correct since those levels cannot provide a key that matches the - // prefix, and is also visible. 
Additionally, this is important to make - // subsequent `TrySeekUsingNext` work correctly, as a re-seek on a - // different prefix could have resulted in this iterator skipping visible - // keys at prefixes in between m.prefix and seekKey, that are currently - // not in the heap due to a bloom filter mismatch. - // - // Additionally, we set the relative-seek flag. This is - // important when iterating with lazy combined iteration. If - // there's a range key between this level's current file and the - // file the seek will land on, we need to detect it in order to - // trigger construction of the combined iterator. - if m.prefix != nil { - if n := m.split(seekKey); !bytes.Equal(m.prefix, seekKey[:n]) { - for i := item.index; i < len(m.levels); i++ { - // Remove this level from the heap. Setting iterKV - // to nil should be sufficient for initMinHeap to - // not re-initialize the heap with them in it. Other - // fields in mergingIterLevel can remain as-is; the - // iter/rangeDelIter needs to stay intact for future - // trySeekUsingNexts to work, the level iter - // boundary context is owned by the levelIter which - // is not being repositioned, and any tombstones in - // these levels will be irrelevant for us anyway. - m.levels[i].iterKV = nil - } - // TODO(bilal): Consider a more efficient way of removing levels from - // the heap without reinitializing all of it. This would likely - // necessitate tracking the heap positions of each mergingIterHeap - // item in the mergingIterLevel, and then swapping that item in the - // heap with the last-positioned heap item, and shrinking the heap by - // one. - if err := m.initMinHeap(); err != nil { - return false, err - } - return true, nil - } - } - if err := m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()); err != nil { - return false, err - } - return true, nil - } - if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) { - if err := m.nextEntry(item, nil /* succKey */); err != nil { - return false, err - } - return true, nil - } + // We can seek up to tombstone.End. + // + // Progress argument: Since this file is at a higher level than + // item.iterKV we know that the iterator in this file must be + // positioned within its bounds and at a key X > item.iterKV + // (otherwise it would be the min of the heap). It is not possible + // for X.UserKey == item.iterKV.UserKey, since it is incompatible + // with X > item.iterKV (a lower version cannot be in a higher + // sstable), so it must be that X.UserKey > item.iterKV.UserKey. We + // know that l.tombstone.End > item.iterKV.UserKey, so the call + // to seekGE() will make forward progress. + seekKey := l.tombstone.End + // This seek is not directly due to a SeekGE call, so we don't know + // enough about the underlying iterator positions, and so we keep + // the try-seek-using-next optimization disabled. Additionally, if + // we're in prefix-seek mode and a re-seek would have moved us past + // the original prefix, we can remove all merging iter levels below + // the rangedel tombstone's level and return immediately instead of + // re-seeking. This is correct since those levels cannot provide a + // key that matches the prefix, and is also visible. Additionally, + // this is important to make subsequent `TrySeekUsingNext` work + // correctly, as a re-seek on a different prefix could have resulted + // in this iterator skipping visible keys at prefixes in between + // m.prefix and seekKey, that are currently not in the heap due to a + // bloom filter mismatch. + // + // Additionally, we set the relative-seek flag. This is important + // when iterating with lazy combined iteration. If there's a range + // key between this level's current file and the file the seek will + // land on, we need to detect it in order to trigger construction of + // the combined iterator.
+ if m.prefix != nil { + if n := m.split(seekKey); !bytes.Equal(m.prefix, seekKey[:n]) { + for i := item.index; i < len(m.levels); i++ { + // Remove this level from the heap. Setting iterKV to + // nil should be sufficient for initMinHeap to not + // re-initialize the heap with them in it. Other fields + // in mergingIterLevel can remain as-is; the + // iter/rangeDelIter needs to stay intact for future + // trySeekUsingNexts to work, the level iter boundary + // context is owned by the levelIter which is not being + // repositioned, and any tombstones in these levels will + // be irrelevant for us anyway. + m.levels[i].iterKV = nil + } + // TODO(bilal): Consider a more efficient way of removing + // levels from the heap without reinitializing all of it. + // This would likely necessitate tracking the heap positions + // of each mergingIterHeap item in the mergingIterLevel, and + // then swapping that item in the heap with the + // last-positioned heap item, and shrinking the heap by one. + m.initMinHeap() + return true, nil + } + } + if err := m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()); err != nil { + return false, err + } + return true, nil + } + // Otherwise level == item.index; since the tombstone and the iterator + // are in the same level, the tombstone may or may not delete the point + // key depending on their sequence numbers. + if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) { + if err := m.nextEntry(item, nil /* succKey */); err != nil { + return false, err + } + return true, nil + } } return false, nil } @@ -684,26 +676,9 @@ func (m *mergingIter) findNextEntry() *base.InternalKV { // The levelIter internal iterator will interleave exclusive sentinel // keys to keep files open until their range deletions are no longer - // necessary. Sometimes these are interleaved with the user key of a - // file's largest key, in which case they may simply be stepped over to - // move to the next file in the forward direction. Other times they're - // interleaved at the user key of the user-iteration boundary, if that - // falls within the bounds of a file. In the latter case, there are no - // more keys < m.upper, and we can stop iterating. - // - // We perform a key comparison to differentiate between these two cases. - // This key comparison is considered okay because it only happens for - // sentinel keys. It may be eliminated after #2863. - if m.levels[item.index].iterKV.K.IsExclusiveSentinel() { - if m.upper != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.upper) >= 0 { - break - } - // This key is the largest boundary of a file and can be skipped now - // that the file's range deletions are no longer relevant. + // necessary and to inform us when we step into or out of a range tombstone. + if item.iterKV.K.IsExclusiveSentinel() { m.err = m.nextEntry(item, nil /* succKey */) - if m.err != nil { - return nil - } continue } @@ -739,99 +714,88 @@ // Steps to the prev entry. item is the current top item in the heap.
func (m *mergingIter) prevEntry(l *mergingIterLevel) error { - oldTopLevel := l.index - oldRangeDelIter := l.rangeDelIter + // Before we advance, consider whether we're moving into or out of a range + // tombstone within the level. + switch l.iterKV.K.Kind() { + case base.InternalKeyKindSpanStart: + l.tombstone = nil + case base.InternalKeyKindSpanEnd: + l.tombstone = l.getVisibleTombstone(m.snapshot) + } + if l.iterKV = l.iter.Prev(); l.iterKV != nil { if m.heap.len() > 1 { m.heap.fix(0) } - if l.rangeDelIter != oldRangeDelIter && l.rangeDelIter != nil { - // The rangeDelIter changed which indicates that the l.iter moved to the - // previous sstable. We have to update the tombstone for oldTopLevel as - // well. - oldTopLevel-- - } } else { + l.tombstone = nil if err := l.iter.Error(); err != nil { return err } m.heap.pop() } - - // The cached tombstones are only valid for the levels - // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range - // [oldTopLevel+1,heap[0].index]. - return m.initMaxRangeDelIters(oldTopLevel) + return nil } -// isPrevEntryDeleted() starts from the current entry (as the prev entry) and if it is deleted, -// moves the iterators backward as needed and returns true, else it returns false. item is the top -// item in the heap. +// isPrevEntryDeleted() starts from the current entry (as the prev entry) and if +// it is deleted, moves the iterators backward as needed and returns true, else +// it returns false. item is the top item in the heap. func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) (bool, error) { // Look for a range deletion tombstone containing item.iterKV at higher - // levels (level < item.index). If we find such a range tombstone we know - // it deletes the key in the current level. Also look for a range - // deletion at the current level (level == item.index). If we find such a - // range deletion we need to check whether it is newer than the current - // entry. + // levels (level < item.index). If we find such a range tombstone we know it + // deletes the key in the current level. Also look for a range deletion at + // the current level (level == item.index). If we find such a range deletion + // we need to check whether it is newer than the current entry. for level := 0; level <= item.index; level++ { l := &m.levels[level] - if l.rangeDelIter == nil || l.tombstone == nil { - // If l.tombstone is nil, there are no further tombstones - // in the current sstable in the current (reverse) iteration - // direction. + if l.tombstone == nil { + // If l.tombstone is nil, there are no range tombstones in this + // level overlapping the current heap root. continue } - if m.heap.cmp(item.iterKV.K.UserKey, l.tombstone.Start) < 0 { - // The current key is before the tombstone start key. - // - // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that - // the levelIter must be positioned at a key < item.iterKV. So it is sufficient to seek the - // current l.rangeDelIter (since any range del iterators that will be provided by the - // levelIter in the future cannot contain item.iterKV). Also, it is it is possible that we - // will encounter parts of the range delete that should be ignored -- we handle that - // below.
-
-			tomb, err := keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKV.K.UserKey)
-			if err != nil {
-				return false, err
+		// The heap invariants ensure that l.tombstone is always visible at the
+		// merging iterator's snapshot and contains the current heap root within
+		// its span boundaries. Verify this in invariant builds.
+		if invariants.Enabled {
+			if !l.tombstone.Valid() {
+				panic(errors.AssertionFailedf("tombstone from level %s is not valid", l))
+			} else if !l.tombstone.VisibleAt(m.snapshot) {
+				panic(errors.AssertionFailedf("tombstone from level %s is not visible at snapshot %d", l, m.snapshot))
+			} else if !l.tombstone.Contains(m.heap.cmp, item.iterKV.K.UserKey) {
+				panic(errors.AssertionFailedf("tombstone from level %s does not contain key %s", l, item.iterKV.K))
 			}
-			l.tombstone = tomb
-		}
-		if l.tombstone == nil {
-			continue
 		}
-		if l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.End, item.iterKV.K.UserKey) > 0 {
-			if level < item.index {
-				// We could also do m.seekLT(..., level + 1). The levels from
-				// [level + 1, item.index) are already before item.iterKV so seeking them may be
-				// wasteful.
+		if level < item.index {
+			// We could also do m.seekLT(..., level + 1). The levels from
+			// [level + 1, item.index) are already before item.iterKV so
+			// seeking them may be wasteful.
 
-				// We can seek up to tombstone.Start.UserKey.
-				//
-				// Progress argument: We know that the iterator in this file is positioned within
-				// its bounds and at a key X < item.iterKV (otherwise it would be the max of the heap).
-				// So smallestUserKey <= item.iterKV.UserKey and we already know that
-				// l.tombstone.Start.UserKey <= item.iterKV.UserKey. So the seekKey computed below
-				// is <= item.iterKV.UserKey, and since we do a seekLT() we will make backwards
-				// progress.
-				seekKey := l.tombstone.Start
-				// We set the relative-seek flag. This is important when
-				// iterating with lazy combined iteration. If there's a range
-				// key between this level's current file and the file the seek
-				// will land on, we need to detect it in order to trigger
-				// construction of the combined iterator.
-				if err := m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()); err != nil {
-					return false, err
-				}
-				return true, nil
+			// We can seek up to tombstone.Start.
+			//
+			// Progress argument: We know that the iterator in this file is
+			// positioned within its bounds and at a key X < item.iterKV
+			// (otherwise it would be the max of the heap). We already know that
+			// l.tombstone.Start <= item.iterKV.K.UserKey, so the SeekLT will
+			// make backwards progress.
+			seekKey := l.tombstone.Start
+			// We set the relative-seek flag. This is important when iterating
+			// with lazy combined iteration. If there's a range key between this
+			// level's current file and the file the seek will land on, we need
+			// to detect it in order to trigger construction of the combined
+			// iterator.
+			if err := m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()); err != nil {
+				return false, err
 			}
-			if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) {
-				if err := m.prevEntry(item); err != nil {
-					return false, err
-				}
-				return true, nil
+			return true, nil
+		}
+		// Otherwise level == item.index; since the tombstone and the iterator
+		// are in the same level, the tombstone may or may not delete the point
+		// key depending on their sequence numbers.
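+		// For example (illustrative): at snapshot 10, a tombstone [a,c)#8
+		// covers the point a#5 (the tombstone is visible and 8 > 5), but it
+		// does not cover a#9, and at snapshot 8 or below the tombstone itself
+		// is not visible and covers nothing.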
+		if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) {
+			if err := m.prevEntry(item); err != nil {
+				return false, err
 			}
+			return true, nil
 		}
 	}
 	return false, nil
@@ -847,26 +811,10 @@ func (m *mergingIter) findPrevEntry() *base.InternalKV {
 
 		// The levelIter internal iterator will interleave exclusive sentinel
 		// keys to keep files open until their range deletions are no longer
-		// necessary. Sometimes these are interleaved with the user key of a
-		// file's smallest key, in which case they may simply be stepped over to
-		// move to the next file in the backward direction. Other times they're
-		// interleaved at the user key of the user-iteration boundary, if that
-		// falls within the bounds of a file. In the latter case, there are no
-		// more keys ≥ m.lower, and we can stop iterating.
-		//
-		// We perform a key comparison to differentiate between these two cases.
-		// This key comparison is considered okay because it only happens for
-		// sentinel keys. It may be eliminated after #2863.
-		if m.levels[item.index].iterKV.K.IsExclusiveSentinel() {
-			if m.lower != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.lower) <= 0 {
-				break
-			}
-			// This key is the smallest boundary of a file and can be skipped
-			// now that the file's range deletions are no longer relevant.
+		// necessary and to inform us when we're stepping into or out of a
+		// range tombstone.
+		if item.iterKV.K.IsExclusiveSentinel() {
 			m.err = m.prevEntry(item)
-			if m.err != nil {
-				return nil
-			}
 			continue
 		}
 
@@ -886,7 +834,8 @@ func (m *mergingIter) findPrevEntry() *base.InternalKV {
 	return nil
 }
 
-// Seeks levels >= level to >= key. Additionally uses range tombstones to extend the seeks.
+// Seeks levels >= level to >= key. Additionally uses range tombstones to extend
+// the seeks.
 //
 // If an error occurs, seekGE returns the error without setting m.err.
 func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) error {
@@ -948,6 +897,7 @@ func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) erro
 	}
 
 	l := &m.levels[level]
+	l.tombstone = nil
 	if m.prefix != nil {
 		l.iterKV = l.iter.SeekPrefixGE(m.prefix, key, flags)
 		if l.iterKV != nil {
@@ -964,37 +914,29 @@ func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) erro
 		if err := l.iter.Error(); err != nil {
 			return err
 		}
-	}
-
-	// If this level contains overlapping range tombstones, alter the seek
-	// key accordingly. Caveat: If we're performing lazy-combined iteration,
-	// we cannot alter the seek key: Range tombstones don't delete range
-	// keys, and there might exist live range keys within the range
-	// tombstone's span that need to be observed to trigger a switch to
-	// combined iteration.
-	if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
+	} else if l.iterKV.K.IsExclusiveSentinel() &&
 		(m.combinedIterState == nil || m.combinedIterState.initialized) {
-		// The level has a range-del iterator. Find the tombstone containing
-		// the search key.
-		var err error
-		l.tombstone, err = rangeDelIter.SeekGE(key)
-		if err != nil {
-			return err
-		}
-		if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.Start, key) <= 0 {
-			// Based on the containment condition tombstone.End > key, so
-			// the assignment to key results in a monotonically
-			// non-decreasing key across iterations of this loop.
+		// If this level contains an overlapping range tombstone, alter the
+		// seek key accordingly. Caveat: If we're performing lazy-combined
+		// iteration, we cannot alter the seek key: Range tombstones don't
+		// delete range keys, and there might exist live range keys within
+		// the range tombstone's span that need to be observed to trigger a
+		// switch to combined iteration.
+		if t := l.getVisibleTombstone(m.snapshot); t != nil && m.heap.cmp(t.Start, key) <= 0 {
+			// Based on SeekGE semantics, we know t.End > key, so the
+			// assignment to key results in a monotonically non-decreasing
+			// key across iterations of this loop.
 			//
 			// The adjustment of key here can only move it to a larger key.
 			// Since the caller of seekGE guaranteed that the original key
 			// was greater than or equal to m.lower, the new key will
 			// continue to be greater than or equal to m.lower.
-			key = l.tombstone.End
+			key = t.End
 		}
 	}
 	}
-
-	return m.initMinHeap()
+	m.initMinHeap()
+	return nil
 }
 
 func (m *mergingIter) String() string {
@@ -1038,7 +980,8 @@ func (m *mergingIter) SeekPrefixGEStrict(
 	return iterKV
 }
 
-// Seeks levels >= level to < key. Additionally uses range tombstones to extend the seeks.
+// Seeks levels >= level to < key. Additionally uses range tombstones to extend
+// the seeks.
 func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) error {
 	// See the comment in seekGE regarding using tombstones to adjust the seek
 	// target per level.
@@ -1050,48 +993,34 @@ func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) erro
 		l := &m.levels[level]
 		l.iterKV = l.iter.SeekLT(key, flags)
+		l.tombstone = nil
 		if l.iterKV == nil {
 			if err := l.iter.Error(); err != nil {
 				return err
 			}
-		}
-
-		// If this level contains overlapping range tombstones, alter the seek
-		// key accordingly. Caveat: If we're performing lazy-combined iteration,
-		// we cannot alter the seek key: Range tombstones don't delete range
-		// keys, and there might exist live range keys within the range
-		// tombstone's span that need to be observed to trigger a switch to
-		// combined iteration.
-		if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
+		} else if l.iterKV.K.IsExclusiveSentinel() &&
 			(m.combinedIterState == nil || m.combinedIterState.initialized) {
-			// The level has a range-del iterator. Find the tombstone containing
-			// the search key.
-			tomb, err := keyspan.SeekLE(m.heap.cmp, rangeDelIter, key)
-			if err != nil {
-				return err
-			}
-			l.tombstone = tomb
-			// Since SeekLT is exclusive on `key` and a tombstone's end key is
-			// also exclusive, a seek key equal to a tombstone's end key still
-			// enables the seek optimization (Note this is different than the
-			// check performed by (*keyspan.Span).Contains).
-			if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) &&
-				m.heap.cmp(key, l.tombstone.End) <= 0 {
-				// NB: Based on the containment condition
-				// tombstone.Start.UserKey <= key, so the assignment to key
-				// results in a monotonically non-increasing key across
-				// iterations of this loop.
+			// If this level contains an overlapping range tombstone, alter the
+			// seek key accordingly. Caveat: If we're performing lazy-combined
+			// iteration, we cannot alter the seek key: Range tombstones don't
+			// delete range keys, and there might exist live range keys within
+			// the range tombstone's span that need to be observed to trigger a
+			// switch to combined iteration.
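+			// For example (illustrative): when seeking LT "m" and this
+			// level's sentinel carries a visible tombstone [c,p), every key
+			// in [c,m) at the remaining (older) levels is deleted, so key
+			// becomes "c" below and those levels seek there directly.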
+			if t := l.getVisibleTombstone(m.snapshot); t != nil && m.heap.cmp(key, t.End) <= 0 {
+				// Based on SeekLT semantics, we know t.Start < key, so the
+				// assignment to key results in a monotonically
+				// non-increasing key across iterations of this loop.
 				//
 				// The adjustment of key here can only move it to a smaller key.
 				// Since the caller of seekLT guaranteed that the original key
 				// was less than or equal to m.upper, the new key will continue
 				// to be less than or equal to m.upper.
-				key = l.tombstone.Start
+				key = t.Start
 			}
 		}
 	}
-
-	return m.initMaxHeap()
+	m.initMaxHeap()
+	return nil
 }
 
 // SeekLT implements base.InternalIterator.SeekLT. Note that SeekLT only checks
@@ -1116,15 +1045,14 @@ func (m *mergingIter) First() *base.InternalKV {
 	for i := range m.levels {
 		l := &m.levels[i]
 		l.iterKV = l.iter.First()
+		l.tombstone = nil
 		if l.iterKV == nil {
 			if m.err = l.iter.Error(); m.err != nil {
 				return nil
 			}
 		}
 	}
-	if m.err = m.initMinHeap(); m.err != nil {
-		return nil
-	}
+	m.initMinHeap()
 	return m.findNextEntry()
 }
 
@@ -1137,15 +1065,14 @@ func (m *mergingIter) Last() *base.InternalKV {
 	for i := range m.levels {
 		l := &m.levels[i]
 		l.iterKV = l.iter.Last()
+		l.tombstone = nil
 		if l.iterKV == nil {
 			if m.err = l.iter.Error(); m.err != nil {
 				return nil
 			}
 		}
 	}
-	if m.err = m.initMaxHeap(); m.err != nil {
-		return nil
-	}
+	m.initMaxHeap()
 	return m.findPrevEntry()
 }
 
@@ -1153,14 +1080,12 @@ func (m *mergingIter) Next() *base.InternalKV {
 	if m.err != nil {
 		return nil
 	}
-
 	if m.dir != 1 {
 		if m.err = m.switchToMinHeap(); m.err != nil {
 			return nil
 		}
 		return m.findNextEntry()
 	}
-
 	if m.heap.len() == 0 {
 		return nil
 	}
@@ -1172,7 +1097,6 @@ func (m *mergingIter) Next() *base.InternalKV {
 	if m.err = m.nextEntry(m.heap.items[0], nil /* succKey */); m.err != nil {
 		return nil
 	}
-
 	iterKV := m.findNextEntry()
 	if invariants.Enabled && m.prefix != nil && iterKV != nil {
 		if n := m.split(iterKV.K.UserKey); !bytes.Equal(m.prefix, iterKV.K.UserKey[:n]) {
@@ -1271,7 +1195,8 @@ func (m *mergingIter) Prev() *base.InternalKV {
 	if m.heap.len() == 0 {
 		return nil
 	}
-	if m.err = m.prevEntry(m.heap.items[0]); m.err != nil {
+	root := m.heap.items[0]
+	if m.err = m.prevEntry(root); m.err != nil {
 		return nil
 	}
 	return m.findPrevEntry()
@@ -1290,11 +1215,6 @@ func (m *mergingIter) Close() error {
 		if err := iter.Close(); err != nil && m.err == nil {
 			m.err = err
 		}
-		if rangeDelIter := m.levels[i].rangeDelIter; rangeDelIter != nil {
-			if err := rangeDelIter.Close(); err != nil && m.err == nil {
-				m.err = err
-			}
-		}
 	}
 	m.levels = nil
 	m.heap.items = m.heap.items[:0]
@@ -1327,9 +1247,9 @@ func (m *mergingIter) DebugString() string {
 	}
 	var err error
 	if m.dir == 1 {
-		err = m.initMinHeap()
+		m.initMinHeap()
 	} else {
-		err = m.initMaxHeap()
+		m.initMaxHeap()
 	}
 	if err != nil {
 		fmt.Fprintf(&buf, "err=<%s>", err)
diff --git a/merging_iter_test.go b/merging_iter_test.go
index 4a141af3cd..23018d456c 100644
--- a/merging_iter_test.go
+++ b/merging_iter_test.go
@@ -303,10 +303,8 @@ func TestMergingIterCornerCases(t *testing.T) {
 			li := &levelIter{}
 			li.init(context.Background(), IterOptions{}, testkeys.Comparer, newIters,
 				slice.Iter(), manifest.Level(i), internalIterOpts{stats: &stats})
-
-			i := len(levelIters)
-			levelIters = append(levelIters, mergingIterLevel{iter: li})
-			li.initRangeDel(&levelIters[i].rangeDelIter)
+			li.interleaveRangeDeletions = true
+			levelIters = append(levelIters, mergingIterLevel{iter: li, getTombstone: li.getTombstone})
 		}
 		miter := &mergingIter{}
 		miter.init(nil /* opts */, &stats, cmp, func(a []byte) int {
 			return len(a)
}, levelIters...) @@ -677,8 +675,9 @@ func buildMergingIter(readers [][]*sstable.Reader, levelSlices []manifest.LevelS l := newLevelIter( context.Background(), IterOptions{}, testkeys.Comparer, newIters, levelSlices[i].Iter(), manifest.Level(level), internalIterOpts{}) - l.initRangeDel(&mils[level].rangeDelIter) + l.interleaveRangeDeletions = true mils[level].iter = l + mils[level].getTombstone = l.getTombstone } var stats base.InternalIteratorStats m := &mergingIter{} diff --git a/scan_internal.go b/scan_internal.go index d6ee26618f..5f3c17cde3 100644 --- a/scan_internal.go +++ b/scan_internal.go @@ -874,7 +874,8 @@ func (i *scanInternalIterator) constructPointIter( for j := len(memtables) - 1; j >= 0; j-- { mem := memtables[j] mlevels = append(mlevels, mergingIterLevel{ - iter: mem.newIter(&i.opts.IterOptions), + iter: mem.newIter(&i.opts.IterOptions), + getTombstone: nil, }) i.iterLevels[mlevelsIndex] = IteratorLevel{ Kind: IteratorLevelFlushable, @@ -901,6 +902,7 @@ func (i *scanInternalIterator) constructPointIter( i.ctx, i.opts.IterOptions, i.comparer, i.newIters, files, level, internalIterOpts{}) mlevels[mlevelsIndex].iter = li + mlevels[mlevelsIndex].getTombstone = nil // range dels handled separately below rli.Init(keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters}, i.comparer.Compare, tableNewRangeDelIter(i.ctx, i.newIters), files, level, manifest.KeyTypePoint) diff --git a/testdata/ingest b/testdata/ingest index fc6a64bfa7..4be171bdf0 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -53,7 +53,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 569B Block cache: 6 entries (945B) hit rate: 30.8% -Table cache: 1 entries (760B) hit rate: 66.7% +Table cache: 1 entries (760B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 diff --git a/testdata/level_iter_boundaries b/testdata/level_iter_boundaries index dec8d90c8e..f3287f7c52 100644 --- a/testdata/level_iter_boundaries +++ b/testdata/level_iter_boundaries @@ -20,19 +20,19 @@ prev prev prev ---- -a#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -d#72057594037927935,RANGEDEL: +a#72057594037927935,SPANSTART: +b#72057594037927935,SPANEND: +b#72057594037927935,SPANSTART: +c#72057594037927935,SPANEND: +c#72057594037927935,SPANSTART: +d#72057594037927935,SPANEND: . -d#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +d#72057594037927935,SPANEND: +c#72057594037927935,SPANSTART: +c#72057594037927935,SPANEND: +b#72057594037927935,SPANSTART: +b#72057594037927935,SPANEND: +a#72057594037927935,SPANSTART: . iter @@ -41,10 +41,10 @@ seek-ge d seek-lt b prev ---- -c#72057594037927935,RANGEDEL: +c#72057594037927935,SPANSTART: . -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +b#72057594037927935,SPANEND: +a#72057594037927935,SPANSTART: iter seek-prefix-ge c @@ -52,10 +52,10 @@ seek-prefix-ge d seek-lt b prev ---- -c#72057594037927935,RANGEDEL: +c#72057594037927935,SPANSTART: . -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +b#72057594037927935,SPANEND: +a#72057594037927935,SPANSTART: iter seek-ge e @@ -100,8 +100,8 @@ next next ---- a#1,SET:a -b#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: +b#72057594037927935,SPANSTART: +c#72057594037927935,SPANEND: c#3,SET:c . 
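# (72057594037927935 is InternalKeySeqNumMax, 1<<56 - 1: boundary sentinels are
# interleaved at the maximum sequence number, so they sort ahead of any point
# key with the same user key.)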
@@ -113,8 +113,8 @@ prev prev ---- c#3,SET:c -c#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: +c#72057594037927935,SPANEND: +b#72057594037927935,SPANSTART: a#1,SET:a . @@ -134,8 +134,8 @@ next next ---- a#1,SET:b -b#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: +b#72057594037927935,SPANSTART: +c#72057594037927935,SPANEND: . iter @@ -144,8 +144,8 @@ prev prev prev ---- -c#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: +c#72057594037927935,SPANEND: +b#72057594037927935,SPANSTART: a#1,SET:b . @@ -164,8 +164,8 @@ next next next ---- -a#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: +a#72057594037927935,SPANSTART: +b#72057594037927935,SPANEND: c#2,SET:c . @@ -176,8 +176,8 @@ prev prev ---- c#2,SET:c -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +b#72057594037927935,SPANEND: +a#72057594037927935,SPANSTART: . # Regression test to check that Seek{GE,LT} work properly in certain @@ -203,15 +203,15 @@ seek-ge d prev seek-lt e ---- -d#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: d#3,SET:d -d#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: d#3,SET:d -e#72057594037927935,RANGEDEL: +e#72057594037927935,SPANEND: d#3,SET:d -d#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -e#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: +c#72057594037927935,SPANSTART: +e#72057594037927935,SPANEND: # Regression test to check that Seek{GE,LT}, First, and Last do not # have iteration bounds affected by SeekPrefixGE. Previously, diff --git a/testdata/level_iter_seek b/testdata/level_iter_seek index 74a4691e03..b95a38fdcc 100644 --- a/testdata/level_iter_seek +++ b/testdata/level_iter_seek @@ -36,8 +36,8 @@ seek-ge d next next ---- -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} -e#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} f#5,SET:f iter @@ -53,7 +53,7 @@ next prev next ---- -c#7,SET:c / d-e:{(#6,RANGEDEL)} +c#7,SET:c . c#7,SET:c . @@ -64,8 +64,8 @@ iter seek-prefix-ge d next ---- -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} -d\x00#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-d\x00:{(#6,RANGEDEL)} +d\x00#72057594037927935,SPANEND: / d-d\x00:{(#6,RANGEDEL)} # Tests a sequence of SeekPrefixGE with monotonically increasing keys, some of # which are present and some not (so fail the bloom filter match). The seek to @@ -81,9 +81,9 @@ seek-prefix-ge gg seek-prefix-ge h ---- . -c#7,SET:c / d-e:{(#6,RANGEDEL)} -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} -cc\x00#72057594037927935,RANGEDEL: +c#7,SET:c +d#72057594037927935,SPANSTART: / d-cc\x00:{(#6,RANGEDEL)} +cc\x00#72057594037927935,SPANEND: / d-cc\x00:{(#6,RANGEDEL)} f#5,SET:f g#4,SET:g . 
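# (Under SeekPrefixGE the interleaved span's end is reported clamped to the
# immediate successor of the seek prefix, so d-e surfaces as d-d\x00 and
# d-cc\x00 above, keeping the SPANEND sentinel within the prefix bounds.)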
@@ -125,9 +125,9 @@ b#8,SET:b {BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} c#7,SET:c {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} -d#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} -e#72057594037927935,RANGEDEL: +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} f#5,SET:f {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} @@ -154,11 +154,11 @@ prev prev prev ---- -f#5,SET:f / d-e:{(#6,RANGEDEL)} -e#72057594037927935,RANGEDEL: f#5,SET:f -e#72057594037927935,RANGEDEL: -d#72057594037927935,RANGEDEL: +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} +f#5,SET:f +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} . # Verify that First() in the presence of an upper-bound pauses at the @@ -177,7 +177,7 @@ iter set-bounds upper=f first ---- -d#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} # Verify that Last() in the presence of a lower-bound pauses at the # last range deletion's end key. @@ -197,8 +197,8 @@ last prev prev ---- -e#72057594037927935,RANGEDEL: -d#72057594037927935,RANGEDEL: +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} c#7,SET:c # Verify that a seek to a file with range tombstones as boundaries pauses on @@ -228,10 +228,10 @@ next next next ---- -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} c#7,SET:c -d#72057594037927935,RANGEDEL: -e#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} f#8,SET:f iter @@ -241,10 +241,10 @@ prev prev prev ---- -b#72057594037927935,RANGEDEL: / a-b:{(#5,RANGEDEL)} +b#72057594037927935,SPANEND: / a-b:{(#5,RANGEDEL)} c#7,SET:c -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +b#72057594037927935,SPANEND: / a-b:{(#5,RANGEDEL)} +a#72057594037927935,SPANSTART: / a-b:{(#5,RANGEDEL)} . 
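# (With end keys interleaved, First under an upper bound pauses at a span's
# SPANSTART and Last under a lower bound pauses at its SPANEND, and every
# sentinel is annotated with the span that owns it.)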
# Verify that prev when positioned at the start bound of the first range @@ -266,7 +266,7 @@ seek-ge d prev ---- b#1,SET:b -d#72057594037927935,RANGEDEL: / d-e:{(#2,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-e:{(#2,RANGEDEL)} b#1,SET:b # Verify that next when positioned at the start boundary of the first range @@ -288,7 +288,7 @@ seek-lt d next ---- d#2,SET:d -b#72057594037927935,RANGEDEL: / a-b:{(#1,RANGEDEL)} +b#72057594037927935,SPANEND: / a-b:{(#1,RANGEDEL)} d#2,SET:d # Verify SeekPrefixGE correctness with trySeekUsingNext=true @@ -342,12 +342,12 @@ seek-prefix-ge h true seek-prefix-ge i true seek-prefix-ge j true ---- -a#1,SET:a / c-e:{(#4,RANGEDEL)} -a#1,SET:a / c-e:{(#4,RANGEDEL)} -b#2,SET:b / c-e:{(#4,RANGEDEL)} -c#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: / c-e:{(#4,RANGEDEL)} -d#72057594037927935,RANGEDEL: / c-e:{(#4,RANGEDEL)} +a#1,SET:a +a#1,SET:a +b#2,SET:b +c#72057594037927935,SPANSTART: / c-b\x00:{(#4,RANGEDEL)} +c#72057594037927935,SPANSTART: / c-c\x00:{(#4,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-d\x00:{(#4,RANGEDEL)} f#5,SINGLEDEL: g#6,SET:g h#7,SINGLEDEL: @@ -361,7 +361,7 @@ seek-prefix-ge e true seek-prefix-ge i true seek-prefix-ge j true ---- -a#1,SET:a / c-e:{(#4,RANGEDEL)} +a#1,SET:a e#4,SET:e i#6,SET:i j#7,SET:j @@ -408,10 +408,8 @@ next f#5,SET:f i#4,SET:i -# The below count should be 4, as we skip over the rangekey-only file. -# TODO(jackson): When we stop opening range deletion iterators twice, this -# should be 2. +# The below count should be 2, as we skip over the rangekey-only file. iters-created ---- -4 +2 diff --git a/testdata/merging_iter b/testdata/merging_iter index 3595ac6232..4fd4d09a13 100644 --- a/testdata/merging_iter +++ b/testdata/merging_iter @@ -53,22 +53,16 @@ e#10,SET:10 g#20,SET:20 . 
-iter probe-rangedels=(000000,(Log "# 000000.rangeDelIter.")) probe-rangedels=(000001,(If (Equal SeekKey (Bytes "g")) ErrInjected noop),(Log "# 000001.rangeDelIter.")) +iter probe-rangedels=(000000,(Log "# 000000.rangeDelIter.")) probe-rangedels=(000001,(If opSpanNext ErrInjected noop),(Log "# 000001.rangeDelIter.")) seek-ge d next ---- # 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} -# 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} -# 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} # 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} -# 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} # 000000.rangeDelIter.opSpanNext() = nil # 000000.rangeDelIter.opSpanClose() = nil -# 000000.rangeDelIter.opSpanClose() = nil -# 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} e#10,SET:10 -# 000001.rangeDelIter.opSpanNext() = nil -# 000001.rangeDelIter.opSpanSeekGE("g") = nil +# 000001.rangeDelIter.opSpanNext() = nil err=injected error # isPrevEntryDeleted() should not allow the rangedel to act on the points in the lower sstable @@ -556,8 +550,8 @@ seek-lt jd next ---- # 000022.SeekLT("jd") = (iwoeionch#792,SET,"792") -# 000023.SeekLT("c") = nil -# 000024.SeekLT("c") = nil +# 000023.SeekLT("cz") = nil +# 000024.SeekLT("cz") = nil iwoeionch#792,SET:792 # 000023.SeekGE("cz") = nil err=injected error @@ -909,6 +903,6 @@ set-bounds lower=cz upper=jd seek-lt jd ---- # 000033.SeekLT("jd") = (iwoeionch#792,SET,"792") -# 000034.SeekLT("c") = nil -# 000035.SeekLT("c") = nil +# 000034.SeekLT("cz") = nil +# 000035.SeekLT("cz") = nil iwoeionch#792,SET:792 diff --git a/testdata/metrics b/testdata/metrics index 04e93ab50e..678d5180f6 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -73,7 +73,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 589B Block cache: 3 entries (484B) hit rate: 0.0% -Table cache: 1 entries (760B) hit rate: 50.0% +Table cache: 1 entries (760B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -128,7 +128,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 5 entries (946B) hit rate: 33.3% -Table cache: 2 entries (1.5KB) hit rate: 75.0% +Table cache: 2 entries (1.5KB) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 2 @@ -170,7 +170,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 5 entries (946B) hit rate: 33.3% -Table cache: 2 entries (1.5KB) hit rate: 75.0% +Table cache: 2 entries (1.5KB) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 2 @@ -209,7 +209,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 3 entries (484B) hit rate: 33.3% -Table cache: 1 entries (760B) hit rate: 75.0% +Table cache: 1 entries (760B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -252,7 +252,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 0 entries (0B) hit rate: 33.3% -Table cache: 0 entries (0B) hit rate: 75.0% +Table cache: 0 entries (0B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -322,7 +322,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 2.6KB Block cache: 0 entries (0B) hit rate: 33.3% -Table cache: 0 entries (0B) hit rate: 75.0% 
+Table cache: 0 entries (0B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -376,7 +376,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 2.0KB Block cache: 0 entries (0B) hit rate: 14.3% -Table cache: 0 entries (0B) hit rate: 64.3% +Table cache: 0 entries (0B) hit rate: 58.3% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -479,7 +479,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 4.3KB Block cache: 12 entries (1.9KB) hit rate: 16.7% -Table cache: 1 entries (760B) hit rate: 64.7% +Table cache: 1 entries (760B) hit rate: 60.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -541,7 +541,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 6.1KB Block cache: 12 entries (1.9KB) hit rate: 16.7% -Table cache: 1 entries (760B) hit rate: 64.7% +Table cache: 1 entries (760B) hit rate: 60.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0