diff --git a/batch.go b/batch.go index c77896d25a..ea6b12aee7 100644 --- a/batch.go +++ b/batch.go @@ -2284,7 +2284,7 @@ func (i *flushableBatchIter) extractValue() base.LazyValue { return base.LazyValue{} } kind := InternalKeyKind(p[0]) - if kind > InternalKeyKindMax { + if kind > base.InternalKeyKindDurableMax { i.err = base.CorruptionErrorf("corrupted batch") return base.LazyValue{} } diff --git a/batchrepr/reader.go b/batchrepr/reader.go index ebb14d7819..735f69e6ea 100644 --- a/batchrepr/reader.go +++ b/batchrepr/reader.go @@ -87,7 +87,7 @@ func (r *Reader) Next() (kind base.InternalKeyKind, ukey []byte, value []byte, o return 0, nil, nil, false, nil } kind = base.InternalKeyKind((*r)[0]) - if kind > base.InternalKeyKindMax { + if kind > base.InternalKeyKindDurableMax { return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0]) } *r, ukey, ok = DecodeStr((*r)[1:]) diff --git a/db.go b/db.go index b4b9acd0a4..2d6cb05344 100644 --- a/db.go +++ b/db.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/manual" + "github.com/cockroachdb/pebble/internal/rangedel" "github.com/cockroachdb/pebble/objstorage" "github.com/cockroachdb/pebble/objstorage/remote" "github.com/cockroachdb/pebble/rangekey" @@ -1431,19 +1432,16 @@ func (i *Iterator) constructPointIter( } else { i.batch.initInternalIter(&i.opts, &i.batchPointIter) i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, i.batchSeqNum) + mil := mergingIterLevel{iter: &i.batchPointIter, getTombstone: nil} // Only include the batch's rangedel iterator if it's non-empty. // This requires some subtle logic in the case a rangedel is later // written to the batch and the view of the batch is refreshed // during a call to SetOptions—in this case, we need to reconstruct // the point iterator to add the batch rangedel iterator. - var rangeDelIter keyspan.FragmentIterator if i.batchRangeDelIter.Count() > 0 { - rangeDelIter = &i.batchRangeDelIter + mil.iter, mil.getTombstone = rangedel.Interleave(&i.comparer, &i.batchPointIter, &i.batchRangeDelIter) } - mlevels = append(mlevels, mergingIterLevel{ - iter: &i.batchPointIter, - rangeDelIter: rangeDelIter, - }) + mlevels = append(mlevels, mil) } } @@ -1451,10 +1449,9 @@ func (i *Iterator) constructPointIter( // Next are the memtables. for j := len(memtables) - 1; j >= 0; j-- { mem := memtables[j] - mlevels = append(mlevels, mergingIterLevel{ - iter: mem.newIter(&i.opts), - rangeDelIter: mem.newRangeDelIter(&i.opts), - }) + mil := mergingIterLevel{} + mil.iter, mil.getTombstone = rangedel.Interleave(&i.comparer, mem.newIter(&i.opts), mem.newRangeDelIter(&i.opts)) + mlevels = append(mlevels, mil) } // Next are the file levels: L0 sub-levels followed by lower levels. 
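The construction pattern above recurs at every level: instead of stashing a rangeDelIter alongside the point iterator, callers wrap the point iterator once and keep only a tombstone accessor. A minimal, self-contained sketch of that contract, with toy types standing in for Pebble's internal iterators (the names below are illustrative, not the real API):

```go
package main

import "fmt"

// Toy stand-ins for base.InternalIterator, keyspan.FragmentIterator, and
// keyspan.Span; illustrative only.
type pointIter struct{ keys []string }
type fragIter struct{ spans [][2]string }
type span struct{ start, end string }

// interleave mimics the rangedel.Interleave contract introduced in this diff:
// with no range-del iterator it returns the point iterator verbatim and a nil
// accessor, so callers must nil-check getTombstone before invoking it.
func interleave(p *pointIter, r *fragIter) (*pointIter, func() *span) {
	if r == nil {
		return p, nil
	}
	// The real helper wraps p in a pooled keyspan.InterleavingIter with
	// InterleaveEndKeys and UseBoundaryKeyKinds set; elided here.
	cur := &span{start: r.spans[0][0], end: r.spans[0][1]}
	return p, func() *span { return cur }
}

// mergingLevel mirrors the mergingIterLevel fields populated above.
type mergingLevel struct {
	iter         *pointIter
	getTombstone func() *span
}

func main() {
	var mil mergingLevel
	mil.iter, mil.getTombstone = interleave(&pointIter{keys: []string{"a"}}, nil)
	fmt.Println(mil.getTombstone == nil) // true: this level has no range deletions
}
```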
@@ -1467,10 +1464,11 @@ func (i *Iterator) constructPointIter( li := &levels[levelsIndex] li.init(ctx, i.opts, &i.comparer, i.newIters, files, level, internalOpts) - li.initRangeDel(&mlevels[mlevelsIndex].rangeDelIter) + li.interleaveRangeDeletions = true li.initCombinedIterState(&i.lazyCombinedIter.combinedIterState) mlevels[mlevelsIndex].levelIter = li mlevels[mlevelsIndex].iter = invalidating.MaybeWrapIfInvariants(li) + mlevels[mlevelsIndex].getTombstone = li.getTombstone levelsIndex++ mlevelsIndex++ diff --git a/external_iterator.go b/external_iterator.go index 1d8f0d5217..92dd29e5d0 100644 --- a/external_iterator.go +++ b/external_iterator.go @@ -11,6 +11,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/rangedel" "github.com/cockroachdb/pebble/sstable" ) @@ -212,10 +213,11 @@ func createExternalPointIter(ctx context.Context, it *Iterator) (topLevelIterato if err != nil { return nil, err } - mlevels = append(mlevels, mergingIterLevel{ - iter: pointIter, - rangeDelIter: rangeDelIter, - }) + mil := mergingIterLevel{iter: pointIter, getTombstone: nil} + if rangeDelIter != nil { + mil.iter, mil.getTombstone = rangedel.Interleave(&it.comparer, mil.iter, rangeDelIter) + } + mlevels = append(mlevels, mil) } } diff --git a/ingest.go b/ingest.go index c12591b2be..4c9e697d9e 100644 --- a/ingest.go +++ b/ingest.go @@ -139,7 +139,7 @@ func ingestSynthesizeShared( // a.RANGEDEL.100, with a.RANGEDEL.100 being the smallest key. To create a // correct bound, we just use the maximum key kind (which sorts first). // Similarly, we use the smallest key kind for the largest key. - smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, base.InternalKeyKindMax) + smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, base.InternalKeyKindDurableMax) largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, 0) if sm.LargestPointKey.IsExclusiveSentinel() { largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey) @@ -220,12 +220,12 @@ func ingestLoad1External( if e.EndKeyIsInclusive { meta.ExtendPointKeyBounds( opts.Comparer.Compare, - base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax), + base.MakeInternalKey(smallestCopy, 0, base.InternalKeyKindDurableMax), base.MakeInternalKey(largestCopy, 0, 0)) } else { meta.ExtendPointKeyBounds( opts.Comparer.Compare, - base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax), + base.MakeInternalKey(smallestCopy, 0, base.InternalKeyKindDurableMax), base.MakeRangeDeleteSentinelKey(largestCopy)) } } diff --git a/internal/base/internal.go b/internal/base/internal.go index b16d5d6b92..442d4ddb51 100644 --- a/internal/base/internal.go +++ b/internal/base/internal.go @@ -96,6 +96,9 @@ const ( // heuristics, but is not required to be accurate for correctness. InternalKeyKindDeleteSized InternalKeyKind = 23 + InternalKeyKindSpanStart InternalKeyKind = 24 + InternalKeyKindSpanEnd InternalKeyKind = 25 + // This maximum value isn't part of the file format. Future extensions may // increase this value. // @@ -105,7 +108,12 @@ const ( // which sorts 'less than or equal to' any other valid internalKeyKind, when // searching for any kind of internal key formed by a certain user key and // seqNum. 
- InternalKeyKindMax InternalKeyKind = 23 + InternalKeyKindMax InternalKeyKind = 25 + + // NB: This is less than InternalKeyKindSpanStart and InternalKeyKindSpanEnd + // because those key kinds are never used in durable formats; only as + // special in-memory indicators. + InternalKeyKindDurableMax InternalKeyKind = 23 // Internal to the sstable format. Not exposed by any sstable iterator. // Declared here to prevent definition of valid key kinds that set this bit. @@ -157,6 +165,8 @@ var internalKeyKindNames = []string{ InternalKeyKindRangeKeyDelete: "RANGEKEYDEL", InternalKeyKindIngestSST: "INGESTSST", InternalKeyKindDeleteSized: "DELSIZED", + InternalKeyKindSpanStart: "SPANSTART", + InternalKeyKindSpanEnd: "SPANEND", InternalKeyKindInvalid: "INVALID", } @@ -249,6 +259,8 @@ var kindsMap = map[string]InternalKeyKind{ "RANGEKEYDEL": InternalKeyKindRangeKeyDelete, "INGESTSST": InternalKeyKindIngestSST, "DELSIZED": InternalKeyKindDeleteSized, + "SPANSTART": InternalKeyKindSpanStart, + "SPANEND": InternalKeyKindSpanEnd, } // ParseInternalKey parses the string representation of an internal key. The @@ -476,7 +488,8 @@ func (k InternalKey) IsExclusiveSentinel() bool { } switch kind := k.Kind(); kind { case InternalKeyKindRangeDelete, InternalKeyKindRangeKeyDelete, - InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet: + InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet, + InternalKeyKindSpanStart, InternalKeyKindSpanEnd: return true default: return false @@ -561,3 +574,14 @@ func (kv *InternalKV) Visible(snapshot, batchSnapshot uint64) bool { func (kv *InternalKV) IsExclusiveSentinel() bool { return kv.K.IsExclusiveSentinel() } + +// String returns a string representation of the kv pair. +func (kv *InternalKV) String() string { + if kv == nil { + return "<nil>" + } + if kv.V.Fetcher != nil { + return fmt.Sprintf("%s=<lazy-value: %s>", kv.K, FormatBytes(kv.V.ValueOrHandle)) + } + return fmt.Sprintf("%s:%s", kv.K, FormatBytes(kv.V.ValueOrHandle)) +} diff --git a/internal/base/internal_test.go b/internal/base/internal_test.go index 39466cd97d..172a550d38 100644 --- a/internal/base/internal_test.go +++ b/internal/base/internal_test.go @@ -41,7 +41,7 @@ func TestInvalidInternalKey(t *testing.T) { "\x01\x02\x03\x04\x05\x06\x07", "foo", "foo\x08\x07\x06\x05\x04\x03\x02", - "foo\x18\x07\x06\x05\x04\x03\x02\x01", + "foo\x1a\x07\x06\x05\x04\x03\x02\x01", } for _, tc := range testCases { k := DecodeInternalKey([]byte(tc)) diff --git a/internal/keyspan/interleaving_iter.go b/internal/keyspan/interleaving_iter.go index a6eee31eae..f7a649c560 100644 --- a/internal/keyspan/interleaving_iter.go +++ b/internal/keyspan/interleaving_iter.go @@ -199,6 +199,10 @@ type InterleavingIterOpts struct { // end keys of spans (in addition to the start keys, which are always // interleaved). InterleaveEndKeys bool + // UseBoundaryKeyKinds configures the interleaving iterator to interleave + // keys using SPANSTART and SPANEND key kinds for start and end boundaries + // respectively, rather than the key kind of the first key in the Span.
+ UseBoundaryKeyKinds bool } // Init initializes the InterleavingIter to interleave point keys from pointIter @@ -979,7 +983,12 @@ func (i *InterleavingIter) yieldPointKey() *base.InternalKV { func (i *InterleavingIter) yieldSyntheticSpanStartMarker(lowerBound []byte) *base.InternalKV { i.spanMarker.K.UserKey = i.startKey() - i.spanMarker.K.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, i.span.Keys[0].Kind()) + + kind := base.InternalKeyKindSpanStart + if !i.opts.UseBoundaryKeyKinds { + kind = i.span.Keys[0].Kind() + } + i.spanMarker.K.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, kind) // Truncate the key we return to our lower bound if we have one. Note that // we use the lowerBound function parameter, not i.lower. The lowerBound @@ -1015,8 +1024,12 @@ func (i *InterleavingIter) yieldSyntheticSpanStartMarker(lowerBound []byte) *bas } func (i *InterleavingIter) yieldSyntheticSpanEndMarker() *base.InternalKV { + kind := base.InternalKeyKindSpanEnd + if !i.opts.UseBoundaryKeyKinds { + kind = i.span.Keys[0].Kind() + } i.spanMarker.K.UserKey = i.endKey() - i.spanMarker.K.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, i.span.Keys[0].Kind()) + i.spanMarker.K.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, kind) return i.verify(&i.spanMarker) } diff --git a/internal/keyspan/interleaving_iter_test.go b/internal/keyspan/interleaving_iter_test.go index f9d4df4674..bdffeccfa2 100644 --- a/internal/keyspan/interleaving_iter_test.go +++ b/internal/keyspan/interleaving_iter_test.go @@ -123,6 +123,7 @@ func runInterleavingIterTest(t *testing.T, filename string) { hooks.threshold = []byte(strings.Join(cmdArg.Vals, "")) } opts.InterleaveEndKeys = td.HasArg("interleave-end-keys") + opts.UseBoundaryKeyKinds = td.HasArg("use-boundary-key-kinds") iter.Init(testkeys.Comparer, &pointIter, keyspanIter, opts) // Clear any previous bounds. pointIter.SetBounds(nil, nil) diff --git a/internal/keyspan/span.go b/internal/keyspan/span.go index b78cc44ed7..c25bf9eef7 100644 --- a/internal/keyspan/span.go +++ b/internal/keyspan/span.go @@ -298,10 +298,11 @@ func (s Span) Visible(snapshot uint64) Span { // VisibleAt requires the Span's keys be in ByTrailerDesc order. It panics if // the span's keys are sorted in a different order. func (s *Span) VisibleAt(snapshot uint64) bool { - if s.KeysOrder != ByTrailerDesc { + if s == nil { + return false + } else if s.KeysOrder != ByTrailerDesc { panic("pebble: span's keys unexpectedly not in trailer order") - } - if len(s.Keys) == 0 { + } else if len(s.Keys) == 0 { return false } else if first := s.Keys[0].SeqNum(); first&base.InternalKeySeqNumBatch != 0 { // Only visible batch keys are included when an Iterator's batch spans diff --git a/internal/keyspan/testdata/interleaving_iter b/internal/keyspan/testdata/interleaving_iter index 4d66462000..63e7c2539b 100644 --- a/internal/keyspan/testdata/interleaving_iter +++ b/internal/keyspan/testdata/interleaving_iter @@ -1152,6 +1152,58 @@ Span: f-g:{(#6,RANGEDEL)} -- SpanChanged(nil) . 
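For readers of the new test case below: the `#72057594037927935` on the SPANSTART and SPANEND markers is the maximal sequence number, (1<<56)-1, at which yieldSyntheticSpanStartMarker and yieldSyntheticSpanEndMarker surface the synthetic boundary keys. A quick, self-contained check of that arithmetic, assuming Pebble's usual trailer packing of (seqnum << 8) | kind:

```go
package main

import "fmt"

func main() {
	// The boundary markers in the testdata carry the maximal sequence
	// number, (1<<56)-1.
	const seqNumMax = uint64(1)<<56 - 1
	fmt.Println(seqNumMax) // 72057594037927935

	// Assuming trailers pack (seqnum << 8) | kind, a SPANSTART marker
	// (kind 24 in this diff) encodes as the trailer printed below.
	const kindSpanStart = 24
	fmt.Printf("%#x\n", seqNumMax<<8|kindSpanStart)
}
```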
+iter interleave-end-keys use-boundary-key-kinds +first +next +next +next +next +next +next +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: a@4#8,SET +Span: +- +-- SpanChanged(b-e:{(#5,RANGEDEL)}) +PointKey: b#72057594037927935,SPANSTART +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: c@11#8,SET +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: c@3#8,SET +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: c@1#4,SET +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: d@5#3,SET +Span: b-e:{(#5,RANGEDEL)} +- +PointKey: e#72057594037927935,SPANEND +Span: b-e:{(#5,RANGEDEL)} +- +-- SpanChanged(nil) +PointKey: e@9#2,SET +Span: +- +-- SpanChanged(f-g:{(#6,RANGEDEL)}) +PointKey: f#72057594037927935,SPANSTART +Span: f-g:{(#6,RANGEDEL)} +- +PointKey: g#72057594037927935,SPANEND +Span: f-g:{(#6,RANGEDEL)} +- +-- SpanChanged(nil) +. + + iter interleave-end-keys last prev diff --git a/internal/rangedel/rangedel.go b/internal/rangedel/rangedel.go index f8504bb7b2..4c846d12b1 100644 --- a/internal/rangedel/rangedel.go +++ b/internal/rangedel/rangedel.go @@ -5,6 +5,8 @@ package rangedel import ( + "sync" + "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/keyspan" ) @@ -41,3 +43,49 @@ func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) keyspan.Span { }), } } + +// Interleave takes a point iterator and a range deletion iterator, returning an +// iterator that interleaves range deletion boundary keys, encoded with the +// SPANSTART and SPANEND key kinds at the maximal sequence number, among the +// stream of point keys. +// +// In addition, Interleave returns a function that may be used to retrieve the +// range tombstone overlapping the current iterator position, if any. If +// rangeDelIter is nil, the returned function is nil. +// +// The returned iterator must only be closed once. +func Interleave( + comparer *base.Comparer, iter base.InternalIterator, rangeDelIter keyspan.FragmentIterator, +) (base.InternalIterator, func() *keyspan.Span) { + // If there is no range deletion iterator, don't bother using an interleaving + // iterator. We can return iter verbatim and a nil getTombstone func. + if rangeDelIter == nil { + return iter, nil + } + + ii := interleavingIterPool.Get().(*interleavingIter) + ii.Init(comparer, iter, rangeDelIter, keyspan.InterleavingIterOpts{ + InterleaveEndKeys: true, + UseBoundaryKeyKinds: true, + }) + return ii, ii.Span +} + +var interleavingIterPool = sync.Pool{ + New: func() interface{} { + return &interleavingIter{} + }, +} + +type interleavingIter struct { + keyspan.InterleavingIter +} + +// Close closes the interleaving iterator and returns the interleaving iterator +// to the pool. +func (i *interleavingIter) Close() error { + err := i.InterleavingIter.Close() + *i = interleavingIter{} + interleavingIterPool.Put(i) + return err +} diff --git a/iterator.go b/iterator.go index bba2a131bd..19187cd16d 100644 --- a/iterator.go +++ b/iterator.go @@ -1820,8 +1820,8 @@ func (i *Iterator) nextPrefix() IterValidityState { return i.iterValidityState } if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) { - i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q", - i.iterKV.K.UserKey, i.key) + panic(errors.AssertionFailedf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q", + i.iterKV.K.UserKey, i.key)) } } // The internal iterator is now positioned at i.key.
Advance to the next diff --git a/level_checker.go b/level_checker.go index b2f0936b1c..7b5ca57390 100644 --- a/level_checker.go +++ b/level_checker.go @@ -14,6 +14,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/rangedel" ) // This file implements DB.CheckLevels() which checks that every entry in the @@ -47,11 +48,10 @@ import ( // The per-level structure used by simpleMergingIter. type simpleMergingIterLevel struct { - iter internalIterator - rangeDelIter keyspan.FragmentIterator - - iterKV *base.InternalKV - tombstone *keyspan.Span + iter internalIterator + getTombstone func() *keyspan.Span + iterKV *base.InternalKV + iterTombstone *keyspan.Span } type simpleMergingIter struct { @@ -102,22 +102,6 @@ func (m *simpleMergingIter) init( if m.heap.len() == 0 { return } - m.positionRangeDels() -} - -// Positions all the rangedel iterators at or past the current top of the -// heap, using SeekGE(). -func (m *simpleMergingIter) positionRangeDels() { - item := &m.heap.items[0] - for i := range m.levels { - l := &m.levels[i] - if l.rangeDelIter == nil { - continue - } - t, err := l.rangeDelIter.SeekGE(item.key.UserKey) - m.err = firstError(m.err, err) - l.tombstone = t - } } // Returns true if not yet done. @@ -128,7 +112,12 @@ func (m *simpleMergingIter) step() bool { item := &m.heap.items[0] l := &m.levels[item.index] // Sentinels are not relevant for this point checking. - if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { + if item.key.IsExclusiveSentinel() { + l.iterTombstone = nil + if item.key.Kind() != base.InternalKeyKindSpanEnd { + l.iterTombstone = l.getTombstone() + } + } else if item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { // This is a visible point key. if !m.handleVisiblePoint(item, l) { return false @@ -187,7 +176,6 @@ func (m *simpleMergingIter) step() bool { } return false } - m.positionRangeDels() return true } @@ -269,12 +257,12 @@ func (m *simpleMergingIter) handleVisiblePoint( // iterators must be positioned at a key > item.key. 
for level := item.index + 1; level < len(m.levels); level++ { lvl := &m.levels[level] - if lvl.rangeDelIter == nil || lvl.tombstone.Empty() { + if lvl.iterTombstone.Empty() { continue } - if lvl.tombstone.Contains(m.heap.cmp, item.key.UserKey) && lvl.tombstone.CoversAt(m.snapshot, item.key.SeqNum()) { + if lvl.iterTombstone.Contains(m.heap.cmp, item.key.UserKey) && lvl.iterTombstone.CoversAt(m.snapshot, item.key.SeqNum()) { m.err = errors.Errorf("tombstone %s in %s deletes key %s in %s", - lvl.tombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey), + lvl.iterTombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey), l.iter) return false } @@ -593,20 +581,15 @@ func checkLevelsInternal(c *checkConfig) (err error) { err = firstError(err, l.iter.Close()) l.iter = nil } - if l.rangeDelIter != nil { - err = firstError(err, l.rangeDelIter.Close()) - l.rangeDelIter = nil - } } }() memtables := c.readState.memtables for i := len(memtables) - 1; i >= 0; i-- { mem := memtables[i] - mlevels = append(mlevels, simpleMergingIterLevel{ - iter: mem.newIter(nil), - rangeDelIter: mem.newRangeDelIter(nil), - }) + var smil simpleMergingIterLevel + smil.iter, smil.getTombstone = rangedel.Interleave(c.comparer, mem.newIter(nil), mem.newRangeDelIter(nil)) + mlevels = append(mlevels, smil) } current := c.readState.current @@ -636,8 +619,9 @@ func checkLevelsInternal(c *checkConfig) (err error) { li := &levelIter{} li.init(context.Background(), iterOpts, c.comparer, c.newIters, manifestIter, manifest.L0Sublevel(sublevel), internalIterOpts{}) - li.initRangeDel(&mlevelAlloc[0].rangeDelIter) + li.interleaveRangeDeletions = true mlevelAlloc[0].iter = li + mlevelAlloc[0].getTombstone = li.getTombstone mlevelAlloc = mlevelAlloc[1:] } for level := 1; level < len(current.Levels); level++ { @@ -649,8 +633,9 @@ func checkLevelsInternal(c *checkConfig) (err error) { li := &levelIter{} li.init(context.Background(), iterOpts, c.comparer, c.newIters, current.Levels[level].Iter(), manifest.Level(level), internalIterOpts{}) - li.initRangeDel(&mlevelAlloc[0].rangeDelIter) + li.interleaveRangeDeletions = true mlevelAlloc[0].iter = li + mlevelAlloc[0].getTombstone = li.getTombstone mlevelAlloc = mlevelAlloc[1:] } diff --git a/level_iter.go b/level_iter.go index 00af0887d1..3ead730aa4 100644 --- a/level_iter.go +++ b/level_iter.go @@ -80,19 +80,8 @@ type levelIter struct { // iterFile holds the current file. It is always equal to l.files.Current(). iterFile *fileMetadata newIters tableNewIters - // When rangeDelIterPtr != nil, the caller requires that *rangeDelIterPtr must - // point to a range del iterator corresponding to the current file. When this - // iterator returns nil, *rangeDelIterPtr should also be set to nil. Whenever - // a non-nil internalIterator is placed in rangeDelIterPtr, a copy is placed - // in rangeDelIterCopy. This is done for the following special case: - // when this iterator returns nil because of exceeding the bounds, we don't - // close iter and *rangeDelIterPtr since we could reuse it in the next seek. But - // we need to set *rangeDelIterPtr to nil because of the aforementioned contract. - // This copy is used to revive the *rangeDelIterPtr in the case of reuse. - rangeDelIterPtr *keyspan.FragmentIterator - rangeDelIterCopy keyspan.FragmentIterator - files manifest.LevelIterator - err error + files manifest.LevelIterator + err error // internalOpts holds the internal iterator options to pass to the table // cache when constructing new table iterators. 
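Note that handleVisiblePoint above now calls Empty on lvl.iterTombstone without first checking it for nil, and the VisibleAt hunk earlier in this diff adds an explicit nil-receiver guard for the same reason: "no tombstone" is represented by a nil *keyspan.Span. A small, self-contained sketch of the nil-safe pointer-receiver idiom this relies on (toy Span, not the real type):

```go
package main

import "fmt"

// Toy Span demonstrating the nil-safe pointer-receiver idiom this diff
// leans on: methods guard against a nil receiver so "no tombstone" (nil)
// needs no branching at call sites.
type Span struct{ seqNums []uint64 }

func (s *Span) Empty() bool { return s == nil || len(s.seqNums) == 0 }

// VisibleAt mirrors the shape of the keyspan change above: nil and empty
// spans are simply invisible. (The real logic compares sequence numbers
// against the snapshot.)
func (s *Span) VisibleAt(snapshot uint64) bool {
	if s == nil || len(s.seqNums) == 0 {
		return false
	}
	return s.seqNums[0] < snapshot
}

func main() {
	var t *Span // a level with no tombstone at the current position
	fmt.Println(t.Empty(), t.VisibleAt(10)) // true false, no panic
}
```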
@@ -118,6 +107,9 @@ type levelIter struct { // first or last key within iteration bounds. exhaustedDir int8 + // interleaveRangeDeletions configures the levelIter to interleave the + // boundary keys of the file's range deletions among its point keys, and to + // expose the tombstone overlapping the current position via getTombstone. + interleaveRangeDeletions bool + // Disable invariant checks even if they are otherwise enabled. Used by tests // which construct "impossible" situations (e.g. seeking to a key before the // lower bound). @@ -176,10 +168,7 @@ func (l *levelIter) init( l.files = files l.exhaustedDir = 0 l.internalOpts = internalOpts -} - -func (l *levelIter) initRangeDel(rangeDelIter *keyspan.FragmentIterator) { - l.rangeDelIterPtr = rangeDelIter + l.interleaveRangeDeletions = false } func (l *levelIter) initCombinedIterState(state *combinedIterState) { @@ -488,13 +477,6 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato return noFileLoaded } if l.iter != nil { - // We don't bother comparing the file bounds with the iteration bounds when we have - // an already open iterator. It is possible that the iter may not be relevant given the - // current iteration bounds, but it knows those bounds, so it will enforce them. - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = l.rangeDelIterCopy - } - // There are a few reasons we might not have triggered combined // iteration yet, even though we already had `file` open. // 1. If the bounds changed, we might have previously avoided @@ -511,10 +493,8 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato // have changed. We handle that below. } - // Close both iter and rangeDelIterPtr. While mergingIter knows about - // rangeDelIterPtr, it can't call Close() on it because it does not know - // when the levelIter will switch it. Note that levelIter.Close() can be - // called multiple times. + // Close the iterator. Note that levelIter.Close() can be called multiple + // times. if err := l.Close(); err != nil { return noFileLoaded } @@ -556,7 +536,7 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato } iterKinds := iterPointKeys - if l.rangeDelIterPtr != nil { + if l.interleaveRangeDeletions { iterKinds |= iterRangeDeletions } @@ -566,32 +546,17 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato return noFileLoaded } l.iter = iters.Point() - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = iters.rangeDeletion - l.rangeDelIterCopy = iters.rangeDeletion - + if l.interleaveRangeDeletions && iters.rangeDeletion != nil { // If this file has range deletions, interleave the bounds of the // range deletions among the point keys. When used with a // mergingIter, this ensures we don't move beyond a file with range // deletions until its range deletions are no longer relevant. - // - // For now, we open a second range deletion iterator. Future work - // will avoid the need to open a second range deletion iterator, and - // avoid surfacing the file's range deletion iterator directly to - // the caller.
- var itersForBounds iterSet - itersForBounds, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts, iterRangeDeletions) - l.interleaving.Init(l.comparer, l.iter, itersForBounds.RangeDeletion(), keyspan.InterleavingIterOpts{ - LowerBound: l.tableOpts.LowerBound, - UpperBound: l.tableOpts.UpperBound, - InterleaveEndKeys: true, + l.interleaving.Init(l.comparer, l.iter, iters.rangeDeletion, keyspan.InterleavingIterOpts{ + LowerBound: l.tableOpts.LowerBound, + UpperBound: l.tableOpts.UpperBound, + InterleaveEndKeys: true, + UseBoundaryKeyKinds: true, }) - if l.err != nil { - l.iter = nil - *l.rangeDelIterPtr = nil - l.err = errors.CombineErrors(l.err, iters.CloseAll()) - return noFileLoaded - } l.iter = &l.interleaving } return newFileLoaded @@ -884,16 +849,23 @@ func (l *levelIter) skipEmptyFileBackward() *base.InternalKV { func (l *levelIter) exhaustedForward() { l.exhaustedDir = +1 - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil - } } func (l *levelIter) exhaustedBackward() { l.exhaustedDir = -1 - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil +} + +// getTombstone retrieves the range tombstone covering the current iterator +// position. If there is none, or if the iterator is not configured to +// interleave range deletions, getTombstone returns nil. +// +// The returned Span's memory is guaranteed to be valid until the iterator is +// moved beyond the Span's interleaved boundary keys. +func (l *levelIter) getTombstone() *keyspan.Span { + if l.iter != &l.interleaving { + return nil } + return l.interleaving.Span() } func (l *levelIter) Error() error { @@ -908,13 +880,6 @@ func (l *levelIter) Close() error { l.err = l.iter.Close() l.iter = nil } - if l.rangeDelIterPtr != nil { - if t := l.rangeDelIterCopy; t != nil { - l.err = firstError(l.err, t.Close()) - } - *l.rangeDelIterPtr = nil - l.rangeDelIterCopy = nil - } return l.err } diff --git a/level_iter_test.go b/level_iter_test.go index 7303c691eb..b1263e9d58 100644 --- a/level_iter_test.go +++ b/level_iter_test.go @@ -89,8 +89,7 @@ func TestLevelIter(t *testing.T) { iter := newLevelIter(context.Background(), opts, testkeys.Comparer, newIters, files.Iter(), manifest.Level(level), internalIterOpts{}) defer iter.Close() - // Fake up the range deletion initialization. - iter.initRangeDel(new(keyspan.FragmentIterator)) + iter.interleaveRangeDeletions = true iter.disableInvariants = true return itertest.RunInternalIterCmd(t, d, iter, itertest.Verbose) @@ -333,8 +332,7 @@ func TestLevelIterBoundaries(t *testing.T) { if iter == nil { slice := manifest.NewLevelSliceKeySorted(lt.cmp.Compare, lt.metas) iter = newLevelIter(context.Background(), IterOptions{}, testkeys.Comparer, lt.newIters, slice.Iter(), manifest.Level(level), internalIterOpts{}) - // Fake up the range deletion initialization. - iter.initRangeDel(new(keyspan.FragmentIterator)) + iter.interleaveRangeDeletions = true } if !save { defer func() { @@ -363,81 +361,6 @@ func TestLevelIterBoundaries(t *testing.T) { }) } -// levelIterTestIter allows a datadriven test to use runInternalIterCmd and -// perform parallel operations on both both a levelIter and rangeDelIter. -type levelIterTestIter struct { - *levelIter - rangeDelIter keyspan.FragmentIterator - // rangeDel is only set on seeks: SeekGE, SeekLT, SeekPrefixGE. - // TODO(jackson): Clean this up when #2863 is resolved. 
- rangeDel *keyspan.Span -} - -func must(err error) { - if err != nil { - panic(err) - } -} - -func (i *levelIterTestIter) getRangeDel() *keyspan.Span { - return i.rangeDel -} - -func (i *levelIterTestIter) rangeDelSeek( - key []byte, kv *base.InternalKV, dir int, -) *base.InternalKV { - i.rangeDel = nil - if i.rangeDelIter != nil { - var t *keyspan.Span - var err error - if dir < 0 { - t, err = keyspan.SeekLE(i.levelIter.cmp, i.rangeDelIter, key) - } else { - t, err = i.rangeDelIter.SeekGE(key) - } - // TODO(jackson): Clean this up when the InternalIterator interface - // is refactored to return an error return value from all - // positioning methods. - must(err) - if t != nil { - i.rangeDel = new(keyspan.Span) - *i.rangeDel = t.Visible(1000) - } - } - return kv -} - -func (i *levelIterTestIter) String() string { - return "level-iter-test" -} - -func (i *levelIterTestIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { - kv := i.levelIter.SeekGE(key, flags) - return i.rangeDelSeek(key, kv, 1) -} - -func (i *levelIterTestIter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) *base.InternalKV { - kv := i.levelIter.SeekPrefixGE(prefix, key, flags) - return i.rangeDelSeek(key, kv, 1) -} - -func (i *levelIterTestIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { - kv := i.levelIter.SeekLT(key, flags) - return i.rangeDelSeek(key, kv, -1) -} - -func (i *levelIterTestIter) Next() *base.InternalKV { - i.rangeDel = nil - return i.levelIter.Next() -} - -func (i *levelIterTestIter) Prev() *base.InternalKV { - i.rangeDel = nil - return i.levelIter.Prev() -} - func TestLevelIterSeek(t *testing.T) { lt := newLevelIterTest() defer lt.runClear() @@ -453,12 +376,12 @@ func TestLevelIterSeek(t *testing.T) { case "iter": var stats base.InternalIteratorStats slice := manifest.NewLevelSliceKeySorted(lt.cmp.Compare, lt.metas) - iter := &levelIterTestIter{levelIter: &levelIter{}} + iter := &levelIter{} iter.init(context.Background(), IterOptions{}, testkeys.Comparer, lt.newIters, slice.Iter(), manifest.Level(level), internalIterOpts{stats: &stats}) defer iter.Close() - iter.initRangeDel(&iter.rangeDelIter) - return itertest.RunInternalIterCmd(t, d, iter, itertest.Verbose, itertest.WithSpan(iter.getRangeDel), itertest.WithStats(&stats)) + iter.interleaveRangeDeletions = true + return itertest.RunInternalIterCmd(t, d, iter, itertest.Verbose, itertest.WithSpan(iter.getTombstone), itertest.WithStats(&stats)) case "iters-created": return fmt.Sprintf("%d", lt.itersCreated) @@ -605,9 +528,7 @@ func BenchmarkLevelIterSeqSeekGEWithBounds(b *testing.B) { return iterSet{point: iter}, err } l := newLevelIter(context.Background(), IterOptions{}, DefaultComparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) - // Fake up the range deletion initialization, to resemble the usage - // in a mergingIter. - l.initRangeDel(new(keyspan.FragmentIterator)) + l.interleaveRangeDeletions = true keyCount := len(keys) b.ResetTimer() for i := 0; i < b.N; i++ { @@ -653,9 +574,7 @@ func BenchmarkLevelIterSeqSeekPrefixGE(b *testing.B) { b.Run(fmt.Sprintf("skip=%d/use-next=%t", skip, useNext), func(b *testing.B) { l := newLevelIter(context.Background(), IterOptions{}, testkeys.Comparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) - // Fake up the range deletion initialization, to resemble the usage - // in a mergingIter. 
- l.initRangeDel(new(keyspan.FragmentIterator)) + l.interleaveRangeDeletions = true keyCount := len(keys) pos := 0 l.SeekPrefixGE(keys[pos], keys[pos], base.SeekGEFlagsNone) diff --git a/merging_iter.go b/merging_iter.go index e04df79c90..8161b3891c 100644 --- a/merging_iter.go +++ b/merging_iter.go @@ -20,25 +20,71 @@ import ( type mergingIterLevel struct { index int iter internalIterator - // rangeDelIter is set to the range-deletion iterator for the level. When - // configured with a levelIter, this pointer changes as sstable boundaries - // are crossed. See levelIter.initRangeDel and the Range Deletions comment - // below. - rangeDelIter keyspan.FragmentIterator + // getTombstone, if non-nil, returns the range deletion tombstone covering + // the iterator's current key, if any. This is typically + // levelIter.getTombstone when using a levelIter, or a func returned by + // rangedel.Interleave otherwise. + getTombstone func() *keyspan.Span // iterKV caches the current key-value pair iter points to. iterKV *base.InternalKV + // tombstone holds a Span containing a visible range deletion tombstone that + // covers the current mergingIter's heap root, if any. It's updated when an + // interleaved range deletion boundary rises to the top of the heap, and the + // merging iterator steps over it. Care must be taken to update it + // appropriately when the heap is switching directions. See + // updateTombstone{Forward,Backward}. + tombstone *keyspan.Span // levelIter is non-nil if this level's iter is ultimately backed by a // *levelIter. The handle in iter may have wrapped the levelIter with // intermediary internalIterator implementations. levelIter *levelIter +} - // tombstone caches the tombstone rangeDelIter is currently pointed at. If - // tombstone is nil, there are no further tombstones within the - // current sstable in the current iterator direction. The cached tombstone is - // only valid for the levels in the range [0,heap[0].index]. This avoids - // positioning tombstones at lower levels which cannot possibly shadow the - // current key. - tombstone *keyspan.Span +func (l *mergingIterLevel) String() string { + return fmt.Sprintf("[index:%d iter:%s iterKV:%v tombstone:%s]", l.index, l.iter, l.iterKV, l.tombstone) +} + +// updateTombstoneForward updates the value of l.tombstone to reflect the +// current range deletion span covering the iterator position, if any, while +// oriented in the forward direction. It's called while re-orienting the +// direction of the heap in initMinHeap. +func (l *mergingIterLevel) updateTombstoneForward(snapshot uint64) { + switch { + case l.iterKV == nil: + l.tombstone = nil + case l.iterKV.K.Kind() == base.InternalKeyKindSpanStart: + // If we're positioned at a span start, the span lies before us, and + // we're not yet positioned within the span. + l.tombstone = nil + default: + l.tombstone = l.getVisibleTombstone(snapshot) + } +} + +// updateTombstoneBackward updates the value of l.tombstone to reflect the +// current range deletion span covering the iterator position, if any, while +// oriented in the reverse direction. It's called while re-orienting the +// direction of the heap in initMaxHeap. +func (l *mergingIterLevel) updateTombstoneBackward(snapshot uint64) { + switch { + case l.iterKV == nil: + l.tombstone = nil + case l.iterKV.K.Kind() == base.InternalKeyKindSpanEnd: + // If we're positioned at a span end, the span lies before us (in the + // reverse direction), and we're not yet positioned within the span.
+ l.tombstone = nil + default: + l.tombstone = l.getVisibleTombstone(snapshot) + } +} + +func (l *mergingIterLevel) getVisibleTombstone(snapshot uint64) *keyspan.Span { + if l.getTombstone != nil { + if t := l.getTombstone(); t != nil && t.VisibleAt(snapshot) { + return t + } + } + return nil } // mergingIter provides a merged view of multiple iterators from different @@ -273,6 +319,7 @@ func newMergingIter( levels := make([]mergingIterLevel, len(iters)) for i := range levels { levels[i].iter = iters[i] + levels[i].getTombstone = nil } m.init(&IterOptions{logger: logger}, stats, cmp, split, levels...) return m @@ -317,69 +364,16 @@ func (m *mergingIter) initHeap() { m.heap.init() } -func (m *mergingIter) initMinHeap() error { +func (m *mergingIter) initMinHeap() { m.dir = 1 m.heap.reverse = false m.initHeap() - return m.initMinRangeDelIters(-1) } -// The level of the previous top element was oldTopLevel. Note that all range delete -// iterators < oldTopLevel are positioned past the key of the previous top element and -// the range delete iterator == oldTopLevel is positioned at or past the key of the -// previous top element. We need to position the range delete iterators from oldTopLevel + 1 -// to the level of the current top element. -func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) error { - if m.heap.len() == 0 { - return nil - } - - // Position the range-del iterators at levels <= m.heap.items[0].index. - item := m.heap.items[0] - for level := oldTopLevel + 1; level <= item.index; level++ { - l := &m.levels[level] - if l.rangeDelIter == nil { - continue - } - var err error - l.tombstone, err = l.rangeDelIter.SeekGE(item.iterKV.K.UserKey) - if err != nil { - return err - } - } - return nil -} - -func (m *mergingIter) initMaxHeap() error { +func (m *mergingIter) initMaxHeap() { m.dir = -1 m.heap.reverse = true m.initHeap() - return m.initMaxRangeDelIters(-1) -} - -// The level of the previous top element was oldTopLevel. Note that all range delete -// iterators < oldTopLevel are positioned before the key of the previous top element and -// the range delete iterator == oldTopLevel is positioned at or before the key of the -// previous top element. We need to position the range delete iterators from oldTopLevel + 1 -// to the level of the current top element. -func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) error { - if m.heap.len() == 0 { - return nil - } - // Position the range-del iterators at levels <= m.heap.items[0].index. - item := m.heap.items[0] - for level := oldTopLevel + 1; level <= item.index; level++ { - l := &m.levels[level] - if l.rangeDelIter == nil { - continue - } - tomb, err := keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKV.K.UserKey) - if err != nil { - return err - } - l.tombstone = tomb - } - return nil } func (m *mergingIter) switchToMinHeap() error { @@ -416,7 +410,9 @@ func (m *mergingIter) switchToMinHeap() error { break } // key >= iter-key + l.updateTombstoneForward(m.snapshot) } + l.updateTombstoneForward(m.snapshot) if l.iterKV == nil { if err := l.iter.Error(); err != nil { return err @@ -427,12 +423,14 @@ func (m *mergingIter) switchToMinHeap() error { // Special handling for the current iterator because we were using its key // above. 
cur.iterKV = cur.iter.Next() + cur.updateTombstoneForward(m.snapshot) if cur.iterKV == nil { if err := cur.iter.Error(); err != nil { return err } } - return m.initMinHeap() + m.initMinHeap() + return nil } func (m *mergingIter) switchToMaxHeap() error { @@ -468,8 +466,10 @@ func (m *mergingIter) switchToMaxHeap() error { // key > iter-key break } + l.updateTombstoneBackward(m.snapshot) // key <= iter-key } + l.updateTombstoneBackward(m.snapshot) if l.iterKV == nil { if err := l.iter.Error(); err != nil { return err } @@ -480,12 +480,14 @@ func (m *mergingIter) switchToMaxHeap() error { // Special handling for the current iterator because we were using its key // above. cur.iterKV = cur.iter.Prev() + cur.updateTombstoneBackward(m.snapshot) if cur.iterKV == nil { if err := cur.iter.Error(); err != nil { return err } } - return m.initMaxHeap() + m.initMaxHeap() + return nil } // nextEntry unconditionally steps to the next entry. item is the current top @@ -518,8 +520,14 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { } } - oldTopLevel := l.index - oldRangeDelIter := l.rangeDelIter + // Before we advance, consider whether we're moving into or out of a range + // tombstone within the level. + switch l.iterKV.K.Kind() { + case base.InternalKeyKindSpanStart: + l.tombstone = l.getVisibleTombstone(m.snapshot) + case base.InternalKeyKindSpanEnd: + l.tombstone = nil + } if succKey == nil { l.iterKV = l.iter.Next() @@ -531,6 +539,7 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { if err := l.iter.Error(); err != nil { return err } + l.tombstone = nil m.heap.pop() } else { if m.prefix != nil && !bytes.Equal(m.prefix, l.iterKV.K.UserKey[:m.split(l.iterKV.K.UserKey)]) { @@ -541,17 +550,9 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { } else if m.heap.len() > 1 { m.heap.fix(0) } - if l.rangeDelIter != oldRangeDelIter { - // The rangeDelIter changed which indicates that the l.iter moved to the - // next sstable. We have to update the tombstone for oldTopLevel as well. - oldTopLevel-- - } } - // The cached tombstones are only valid for the levels - // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range - // [oldTopLevel+1,heap[0].index]. - return m.initMinRangeDelIters(oldTopLevel) + return nil } // isNextEntryDeleted starts from the current entry (as the next entry) and if @@ -564,111 +565,102 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { // during prefix-iteration mode. func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) (bool, error) { // Look for a range deletion tombstone containing item.iterKV at higher - // levels (level < item.index). If we find such a range tombstone we know - // it deletes the key in the current level. Also look for a range - // deletion at the current level (level == item.index). If we find such a - // range deletion we need to check whether it is newer than the current - // entry. + // levels (level < item.index). If we find such a range tombstone we know it + // deletes the key in the current level. Also look for a range deletion at + // the current level (level == item.index). If we find such a range deletion + // we need to check whether it is newer than the current entry.
for level := 0; level <= item.index; level++ { l := &m.levels[level] - if l.rangeDelIter == nil || l.tombstone == nil { - // If l.tombstone is nil, there are no further tombstones - // in the current sstable in the current (forward) iteration - // direction. + if l.tombstone == nil { + // This level does not contain a tombstone overlapping the current + // heap root. continue } - if m.heap.cmp(l.tombstone.End, item.iterKV.K.UserKey) <= 0 { - // The current key is at or past the tombstone end key. - // - // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that - // the levelIter must be positioned at a key >= item.iterKV. So it is sufficient to seek the - // current l.rangeDelIter (since any range del iterators that will be provided by the - // levelIter in the future cannot contain item.iterKV). Also, it is possible that we - // will encounter parts of the range delete that should be ignored -- we handle that - // below. - var err error - l.tombstone, err = l.rangeDelIter.SeekGE(item.iterKV.K.UserKey) - if err != nil { - return false, err + // Heap invariants must ensure that l.tombstone always contains the + // current heap root within its span boundaries. Ensure this holds in + // invariant builds. + if invariants.Enabled { + if !l.tombstone.Valid() { + panic(errors.AssertionFailedf("tombstone from level %s is not valid", l)) + } else if !l.tombstone.VisibleAt(m.snapshot) { + panic(errors.AssertionFailedf("tombstone from level %s is not visible at snapshot %d", l, m.snapshot)) + } else if !l.tombstone.Contains(m.heap.cmp, item.iterKV.K.UserKey) { + panic(errors.AssertionFailedf("tombstone from level %s does not contain key %s", l, item.iterKV.K)) } } - if l.tombstone == nil { - continue - } + if level < item.index { + // We could also do m.seekGE(..., level + 1). The levels from [level + // + 1, item.index) are already after item.iterKV so seeking them + // may be wasteful. - if l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.Start, item.iterKV.K.UserKey) <= 0 { - if level < item.index { - // We could also do m.seekGE(..., level + 1). The levels from - // [level + 1, item.index) are already after item.iterKV so seeking them may be - // wasteful. - - // We can seek up to tombstone.End. - // - // Progress argument: Since this file is at a higher level than item.iterKV we know - // that the iterator in this file must be positioned within its bounds and at a key - // X > item.iterKV (otherwise it would be the min of the heap). It is not - // possible for X.UserKey == item.iterKV.UserKey, since it is incompatible with - // X > item.iterKV (a lower version cannot be in a higher sstable), so it must be that - // X.UserKey > item.iterKV.UserKey. Which means l.largestUserKey > item.key.UserKey. - // We also know that l.tombstone.End > item.iterKV.UserKey. So the min of these, - // seekKey, computed below, is > item.iterKV.UserKey, so the call to seekGE() will - // make forward progress. - seekKey := l.tombstone.End - // This seek is not directly due to a SeekGE call, so we don't know - // enough about the underlying iterator positions, and so we keep the - // try-seek-using-next optimization disabled. Additionally, if we're in - // prefix-seek mode and a re-seek would have moved us past the original - // prefix, we can remove all merging iter levels below the rangedel - // tombstone's level and return immediately instead of re-seeking. This - // is correct since those levels cannot provide a key that matches the - // prefix, and is also visible. 
Additionally, this is important to make - // subsequent `TrySeekUsingNext` work correctly, as a re-seek on a - // different prefix could have resulted in this iterator skipping visible - // keys at prefixes in between m.prefix and seekKey, that are currently - // not in the heap due to a bloom filter mismatch. - // - // Additionally, we set the relative-seek flag. This is - // important when iterating with lazy combined iteration. If - // there's a range key between this level's current file and the - // file the seek will land on, we need to detect it in order to - // trigger construction of the combined iterator. - if m.prefix != nil { - if n := m.split(seekKey); !bytes.Equal(m.prefix, seekKey[:n]) { - for i := item.index; i < len(m.levels); i++ { - // Remove this level from the heap. Setting iterKV - // to nil should be sufficient for initMinHeap to - // not re-initialize the heap with them in it. Other - // fields in mergingIterLevel can remain as-is; the - // iter/rangeDelIter needs to stay intact for future - // trySeekUsingNexts to work, the level iter - // boundary context is owned by the levelIter which - // is not being repositioned, and any tombstones in - // these levels will be irrelevant for us anyway. - m.levels[i].iterKV = nil - } - // TODO(bilal): Consider a more efficient way of removing levels from - // the heap without reinitializing all of it. This would likely - // necessitate tracking the heap positions of each mergingIterHeap - // item in the mergingIterLevel, and then swapping that item in the - // heap with the last-positioned heap item, and shrinking the heap by - // one. - if err := m.initMinHeap(); err != nil { - return false, err - } - return true, nil + // We can seek up to tombstone.End. + // + // Progress argument: Since this file is at a higher level than + // item.iterKV we know that the iterator in this file must be + // positioned within its bounds and at a key X > item.iterKV + // (otherwise it would be the min of the heap). It is not possible + // for X.UserKey == item.iterKV.UserKey, since it is incompatible + // with X > item.iterKV (a lower version cannot be in a higher + // sstable), so it must be that X.UserKey > item.iterKV.UserKey. We + // know that l.tombstone.End > item.iterKV.UserKey, so the call + // to seekGE() will make forward progress. + seekKey := l.tombstone.End + // This seek is not directly due to a SeekGE call, so we don't know + // enough about the underlying iterator positions, and so we keep + // the try-seek-using-next optimization disabled. Additionally, if + // we're in prefix-seek mode and a re-seek would have moved us past + // the original prefix, we can remove all merging iter levels below + // the rangedel tombstone's level and return immediately instead of + // re-seeking. This is correct since those levels cannot provide a + // key that matches the prefix, and is also visible. Additionally, + // this is important to make subsequent `TrySeekUsingNext` work + // correctly, as a re-seek on a different prefix could have resulted + // in this iterator skipping visible keys at prefixes in between + // m.prefix and seekKey, that are currently not in the heap due to a + // bloom filter mismatch. + // + // Additionally, we set the relative-seek flag. This is important + // when iterating with lazy combined iteration. If there's a range + // key between this level's current file and the file the seek will + // land on, we need to detect it in order to trigger construction of + // the combined iterator.
+ if m.prefix != nil { + if n := m.split(seekKey); !bytes.Equal(m.prefix, seekKey[:n]) { + for i := item.index; i < len(m.levels); i++ { + // Remove this level from the heap. Setting iterKV to + // nil should be sufficient for initMinHeap to not + // re-initialize the heap with them in it. Other fields + // in mergingIterLevel can remain as-is; the + // iter/rangeDelIter needs to stay intact for future + // trySeekUsingNexts to work, the level iter boundary + // context is owned by the levelIter which is not being + // repositioned, and any tombstones in these levels will + // be irrelevant for us anyway. + m.levels[i].iterKV = nil } + // TODO(bilal): Consider a more efficient way of removing + // levels from the heap without reinitializing all of it. + // This would likely necessitate tracking the heap positions + // of each mergingIterHeap item in the mergingIterLevel, and + // then swapping that item in the heap with the + // last-positioned heap item, and shrinking the heap by one. + m.initMinHeap() + return true, nil } - if err := m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()); err != nil { - return false, err - } - return true, nil } - if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) { - if err := m.nextEntry(item, nil /* succKey */); err != nil { - return false, err - } - return true, nil + if err := m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()); err != nil { + return false, err } + return true, nil + } + // Otherwise level == item.index; since the tombstone and the iterator + // are in the same level, the tombstone may or may not delete the point + // key depending on their sequence numbers. + if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) { + if err := m.nextEntry(item, nil /* succKey */); err != nil { + return false, err + } + return true, nil } } return false, nil @@ -684,26 +676,9 @@ func (m *mergingIter) findNextEntry() *base.InternalKV { // The levelIter internal iterator will interleave exclusive sentinel // keys to keep files open until their range deletions are no longer - // necessary. Sometimes these are interleaved with the user key of a - // file's largest key, in which case they may simply be stepped over to - // move to the next file in the forward direction. Other times they're - // interleaved at the user key of the user-iteration boundary, if that - // falls within the bounds of a file. In the latter case, there are no - // more keys < m.upper, and we can stop iterating. - // - // We perform a key comparison to differentiate between these two cases. - // This key comparison is considered okay because it only happens for - // sentinel keys. It may be eliminated after #2863. - if m.levels[item.index].iterKV.K.IsExclusiveSentinel() { - if m.upper != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.upper) >= 0 { - break - } - // This key is the largest boundary of a file and can be skipped now - // that the file's range deletions are no longer relevant. + // necessary and to inform us when we step into or out of a range tombstone. + if item.iterKV.K.IsExclusiveSentinel() { m.err = m.nextEntry(item, nil /* succKey */) - if m.err != nil { - return nil - } continue } @@ -739,99 +714,88 @@ // Steps to the prev entry. item is the current top item in the heap.
func (m *mergingIter) prevEntry(l *mergingIterLevel) error { - oldTopLevel := l.index - oldRangeDelIter := l.rangeDelIter + // Before we advance, consider whether we're moving into or out of a range + // tombstone within the level. + switch l.iterKV.K.Kind() { + case base.InternalKeyKindSpanStart: + l.tombstone = nil + case base.InternalKeyKindSpanEnd: + l.tombstone = l.getVisibleTombstone(m.snapshot) + } + if l.iterKV = l.iter.Prev(); l.iterKV != nil { if m.heap.len() > 1 { m.heap.fix(0) } - if l.rangeDelIter != oldRangeDelIter && l.rangeDelIter != nil { - // The rangeDelIter changed which indicates that the l.iter moved to the - // previous sstable. We have to update the tombstone for oldTopLevel as - // well. - oldTopLevel-- - } } else { + l.tombstone = nil if err := l.iter.Error(); err != nil { return err } m.heap.pop() } - - // The cached tombstones are only valid for the levels - // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range - // [oldTopLevel+1,heap[0].index]. - return m.initMaxRangeDelIters(oldTopLevel) + return nil } -// isPrevEntryDeleted() starts from the current entry (as the prev entry) and if it is deleted, -// moves the iterators backward as needed and returns true, else it returns false. item is the top -// item in the heap. +// isPrevEntryDeleted() starts from the current entry (as the prev entry) and if +// it is deleted, moves the iterators backward as needed and returns true, else +// it returns false. item is the top item in the heap. func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) (bool, error) { // Look for a range deletion tombstone containing item.iterKV at higher - // levels (level < item.index). If we find such a range tombstone we know - // it deletes the key in the current level. Also look for a range - // deletion at the current level (level == item.index). If we find such a - // range deletion we need to check whether it is newer than the current - // entry. + // levels (level < item.index). If we find such a range tombstone we know it + // deletes the key in the current level. Also look for a range deletion at + // the current level (level == item.index). If we find such a range deletion + // we need to check whether it is newer than the current entry. for level := 0; level <= item.index; level++ { l := &m.levels[level] - if l.rangeDelIter == nil || l.tombstone == nil { - // If l.tombstone is nil, there are no further tombstones - // in the current sstable in the current (reverse) iteration - // direction. + if l.tombstone == nil { + // If l.tombstone is nil, there are no range tombstones in this + // level overlapping the current heap root. continue } - if m.heap.cmp(item.iterKV.K.UserKey, l.tombstone.Start) < 0 { - // The current key is before the tombstone start key. - // - // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that - // the levelIter must be positioned at a key < item.iterKV. So it is sufficient to seek the - // current l.rangeDelIter (since any range del iterators that will be provided by the - // levelIter in the future cannot contain item.iterKV). Also, it is it is possible that we - // will encounter parts of the range delete that should be ignored -- we handle that - // below.
- - tomb, err := keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKV.K.UserKey) - if err != nil { - return false, err + // Heap invariants must ensure that l.tombstone is always visible at the + // merging iterator's snapshot and contains the current heap root within + // its span boundaries. Ensure this holds in invariant builds. + if invariants.Enabled { + if !l.tombstone.Valid() { + panic(errors.AssertionFailedf("tombstone from level %s is not valid", l)) + } else if !l.tombstone.VisibleAt(m.snapshot) { + panic(errors.AssertionFailedf("tombstone from level %s is not visible at snapshot %d", l, m.snapshot)) + } else if !l.tombstone.Contains(m.heap.cmp, item.iterKV.K.UserKey) { + panic(errors.AssertionFailedf("tombstone from level %s does not contain key %s", l, item.iterKV.K)) } - l.tombstone = tomb - } - if l.tombstone == nil { - continue } - if l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.End, item.iterKV.K.UserKey) > 0 { - if level < item.index { - // We could also do m.seekLT(..., level + 1). The levels from - // [level + 1, item.index) are already before item.iterKV so seeking them may be - // wasteful. + if level < item.index { + // We could also do m.seekLT(..., level + 1). The levels from [level + // + 1, item.index) are already before item.iterKV so seeking them + // may be wasteful. - // We can seek up to tombstone.Start.UserKey. - // - // Progress argument: We know that the iterator in this file is positioned within - // its bounds and at a key X < item.iterKV (otherwise it would be the max of the heap). - // So smallestUserKey <= item.iterKV.UserKey and we already know that - // l.tombstone.Start.UserKey <= item.iterKV.UserKey. So the seekKey computed below - // is <= item.iterKV.UserKey, and since we do a seekLT() we will make backwards - // progress. - seekKey := l.tombstone.Start - // We set the relative-seek flag. This is important when - // iterating with lazy combined iteration. If there's a range - // key between this level's current file and the file the seek - // will land on, we need to detect it in order to trigger - // construction of the combined iterator. - if err := m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()); err != nil { - return false, err - } - return true, nil + // We can seek up to tombstone.Start. + // + // Progress argument: We know that the iterator in this file is + // positioned within its bounds and at a key X < item.iterKV + // (otherwise it would be the max of the heap). We already know that + // l.tombstone.Start <= item.iterKV.UserKey, so the SeekLT + // will make backwards progress. + seekKey := l.tombstone.Start + // We set the relative-seek flag. This is important when iterating + // with lazy combined iteration. If there's a range key between this + // level's current file and the file the seek will land on, we need + // to detect it in order to trigger construction of the combined + // iterator. + if err := m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()); err != nil { + return false, err } - if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) { - if err := m.prevEntry(item); err != nil { - return false, err - } - return true, nil + return true, nil + } + // Otherwise level == item.index; since the tombstone and the iterator + // are in the same level, the tombstone may or may not delete the point + // key depending on their sequence numbers.
+		if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) {
+			if err := m.prevEntry(item); err != nil {
+				return false, err
 			}
+			return true, nil
 		}
 	}
 	return false, nil
@@ -847,26 +811,10 @@ func (m *mergingIter) findPrevEntry() *base.InternalKV {
 
 		// The levelIter internal iterator will interleave exclusive sentinel
 		// keys to keep files open until their range deletions are no longer
-		// necessary. Sometimes these are interleaved with the user key of a
-		// file's smallest key, in which case they may simply be stepped over to
-		// move to the next file in the backward direction. Other times they're
-		// interleaved at the user key of the user-iteration boundary, if that
-		// falls within the bounds of a file. In the latter case, there are no
-		// more keys ≥ m.lower, and we can stop iterating.
-		//
-		// We perform a key comparison to differentiate between these two cases.
-		// This key comparison is considered okay because it only happens for
-		// sentinel keys. It may be eliminated after #2863.
-		if m.levels[item.index].iterKV.K.IsExclusiveSentinel() {
-			if m.lower != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.lower) <= 0 {
-				break
-			}
-			// This key is the smallest boundary of a file and can be skipped
-			// now that the file's range deletions are no longer relevant.
+		// necessary and to inform us when we're stepping into or out of a
+		// range tombstone.
+		if item.iterKV.K.IsExclusiveSentinel() {
 			m.err = m.prevEntry(item)
-			if m.err != nil {
-				return nil
-			}
 			continue
 		}
 
@@ -886,7 +834,8 @@ func (m *mergingIter) findPrevEntry() *base.InternalKV {
 	return nil
 }
 
-// Seeks levels >= level to >= key. Additionally uses range tombstones to extend the seeks.
+// Seeks levels >= level to >= key. Additionally uses range tombstones to extend
+// the seeks.
 //
 // If an error occurs, seekGE returns the error without setting m.err.
 func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) error {
@@ -948,6 +897,7 @@ func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) erro
 		}
 
 		l := &m.levels[level]
+		l.tombstone = nil
 		if m.prefix != nil {
 			l.iterKV = l.iter.SeekPrefixGE(m.prefix, key, flags)
 			if l.iterKV != nil {
@@ -964,37 +914,29 @@
 			if err := l.iter.Error(); err != nil {
 				return err
 			}
-		}
-
-		// If this level contains overlapping range tombstones, alter the seek
-		// key accordingly. Caveat: If we're performing lazy-combined iteration,
-		// we cannot alter the seek key: Range tombstones don't delete range
-		// keys, and there might exist live range keys within the range
-		// tombstone's span that need to be observed to trigger a switch to
-		// combined iteration.
-		if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
+		} else if l.iterKV.K.IsExclusiveSentinel() &&
 			(m.combinedIterState == nil || m.combinedIterState.initialized) {
-			// The level has a range-del iterator. Find the tombstone containing
-			// the search key.
-			var err error
-			l.tombstone, err = rangeDelIter.SeekGE(key)
-			if err != nil {
-				return err
-			}
-			if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.Start, key) <= 0 {
-				// Based on the containment condition tombstone.End > key, so
-				// the assignment to key results in a monotonically
-				// non-decreasing key across iterations of this loop.
+			// If this level contains an overlapping range tombstone, alter the
+			// seek key accordingly. Caveat: If we're performing lazy-combined
+			// iteration, we cannot alter the seek key: Range tombstones don't
+			// delete range keys, and there might exist live range keys within
+			// the range tombstone's span that need to be observed to trigger a
+			// switch to combined iteration.
+			if t := l.getVisibleTombstone(m.snapshot); t != nil && m.heap.cmp(t.Start, key) <= 0 {
+				// Based on SeekGE semantics, we know t.End > key, so the
+				// assignment to key results in a monotonically non-decreasing
+				// key across iterations of this loop.
 				//
 				// The adjustment of key here can only move it to a larger key.
 				// Since the caller of seekGE guaranteed that the original key
 				// was greater than or equal to m.lower, the new key will
 				// continue to be greater than or equal to m.lower.
-				key = l.tombstone.End
+				key = t.End
 			}
 		}
 	}
-	return m.initMinHeap()
+	m.initMinHeap()
+	return nil
 }
 
 func (m *mergingIter) String() string {
@@ -1038,7 +980,8 @@ func (m *mergingIter) SeekPrefixGEStrict(
 	return iterKV
 }
 
-// Seeks levels >= level to < key. Additionally uses range tombstones to extend the seeks.
+// Seeks levels >= level to < key. Additionally uses range tombstones to extend
+// the seeks.
 func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) error {
 	// See the comment in seekGE regarding using tombstones to adjust the seek
 	// target per level.
@@ -1050,48 +993,34 @@ func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) erro
 
 		l := &m.levels[level]
 		l.iterKV = l.iter.SeekLT(key, flags)
+		l.tombstone = nil
 		if l.iterKV == nil {
 			if err := l.iter.Error(); err != nil {
 				return err
 			}
-		}
-
-		// If this level contains overlapping range tombstones, alter the seek
-		// key accordingly. Caveat: If we're performing lazy-combined iteration,
-		// we cannot alter the seek key: Range tombstones don't delete range
-		// keys, and there might exist live range keys within the range
-		// tombstone's span that need to be observed to trigger a switch to
-		// combined iteration.
-		if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
+		} else if l.iterKV.K.IsExclusiveSentinel() &&
 			(m.combinedIterState == nil || m.combinedIterState.initialized) {
-			// The level has a range-del iterator. Find the tombstone containing
-			// the search key.
-			tomb, err := keyspan.SeekLE(m.heap.cmp, rangeDelIter, key)
-			if err != nil {
-				return err
-			}
-			l.tombstone = tomb
-			// Since SeekLT is exclusive on `key` and a tombstone's end key is
-			// also exclusive, a seek key equal to a tombstone's end key still
-			// enables the seek optimization (Note this is different than the
-			// check performed by (*keyspan.Span).Contains).
-			if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) &&
-				m.heap.cmp(key, l.tombstone.End) <= 0 {
-				// NB: Based on the containment condition
-				// tombstone.Start.UserKey <= key, so the assignment to key
-				// results in a monotonically non-increasing key across
-				// iterations of this loop.
+			// If this level contains an overlapping range tombstone, alter the
+			// seek key accordingly. Caveat: If we're performing lazy-combined
+			// iteration, we cannot alter the seek key: Range tombstones don't
+			// delete range keys, and there might exist live range keys within
+			// the range tombstone's span that need to be observed to trigger a
+			// switch to combined iteration.
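+			// Since SeekLT is exclusive on `key` and a tombstone's end key is
+			// also exclusive, a seek key equal to the tombstone's end key
+			// still enables the seek optimization, which is why the check
+			// below compares with <= rather than requiring strict containment
+			// (note this differs from the check performed by
+			// (*keyspan.Span).Contains).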
+			if t := l.getVisibleTombstone(m.snapshot); t != nil && m.heap.cmp(key, t.End) <= 0 {
+				// Based on SeekLT semantics we know t.Start < key, so the
+				// assignment to key results in a monotonically non-increasing
+				// key across iterations of this loop.
 				//
 				// The adjustment of key here can only move it to a smaller key.
 				// Since the caller of seekLT guaranteed that the original key
 				// was less than or equal to m.upper, the new key will continue
 				// to be less than or equal to m.upper.
-				key = l.tombstone.Start
+				key = t.Start
 			}
 		}
 	}
-
-	return m.initMaxHeap()
+	m.initMaxHeap()
+	return nil
 }
 
 // SeekLT implements base.InternalIterator.SeekLT. Note that SeekLT only checks
@@ -1116,15 +1045,14 @@ func (m *mergingIter) First() *base.InternalKV {
 	for i := range m.levels {
 		l := &m.levels[i]
 		l.iterKV = l.iter.First()
+		l.tombstone = nil
 		if l.iterKV == nil {
 			if m.err = l.iter.Error(); m.err != nil {
 				return nil
 			}
 		}
 	}
-	if m.err = m.initMinHeap(); m.err != nil {
-		return nil
-	}
+	m.initMinHeap()
 	return m.findNextEntry()
 }
 
@@ -1137,15 +1065,14 @@ func (m *mergingIter) Last() *base.InternalKV {
 	for i := range m.levels {
 		l := &m.levels[i]
 		l.iterKV = l.iter.Last()
+		l.tombstone = nil
 		if l.iterKV == nil {
 			if m.err = l.iter.Error(); m.err != nil {
 				return nil
 			}
 		}
 	}
-	if m.err = m.initMaxHeap(); m.err != nil {
-		return nil
-	}
+	m.initMaxHeap()
 	return m.findPrevEntry()
}
 
@@ -1153,14 +1080,12 @@ func (m *mergingIter) Next() *base.InternalKV {
 	if m.err != nil {
 		return nil
 	}
-
 	if m.dir != 1 {
 		if m.err = m.switchToMinHeap(); m.err != nil {
 			return nil
 		}
 		return m.findNextEntry()
 	}
-
 	if m.heap.len() == 0 {
 		return nil
 	}
@@ -1172,7 +1097,6 @@
 	if m.err = m.nextEntry(m.heap.items[0], nil /* succKey */); m.err != nil {
 		return nil
 	}
-
 	iterKV := m.findNextEntry()
 	if invariants.Enabled && m.prefix != nil && iterKV != nil {
 		if n := m.split(iterKV.K.UserKey); !bytes.Equal(m.prefix, iterKV.K.UserKey[:n]) {
@@ -1271,7 +1195,8 @@ func (m *mergingIter) Prev() *base.InternalKV {
 	if m.heap.len() == 0 {
 		return nil
 	}
-	if m.err = m.prevEntry(m.heap.items[0]); m.err != nil {
+	root := m.heap.items[0]
+	if m.err = m.prevEntry(root); m.err != nil {
 		return nil
 	}
 	return m.findPrevEntry()
@@ -1290,11 +1215,6 @@
 		if err := iter.Close(); err != nil && m.err == nil {
 			m.err = err
 		}
-		if rangeDelIter := m.levels[i].rangeDelIter; rangeDelIter != nil {
-			if err := rangeDelIter.Close(); err != nil && m.err == nil {
-				m.err = err
-			}
-		}
 	}
 	m.levels = nil
 	m.heap.items = m.heap.items[:0]
@@ -1327,9 +1247,9 @@ func (m *mergingIter) DebugString() string {
 	}
 	var err error
 	if m.dir == 1 {
-		err = m.initMinHeap()
+		m.initMinHeap()
 	} else {
-		err = m.initMaxHeap()
+		m.initMaxHeap()
 	}
 	if err != nil {
 		fmt.Fprintf(&buf, "err=<%s>", err)
diff --git a/merging_iter_test.go b/merging_iter_test.go
index 4a141af3cd..23018d456c 100644
--- a/merging_iter_test.go
+++ b/merging_iter_test.go
@@ -303,10 +303,8 @@ func TestMergingIterCornerCases(t *testing.T) {
 			li := &levelIter{}
 			li.init(context.Background(), IterOptions{}, testkeys.Comparer, newIters,
 				slice.Iter(), manifest.Level(i), internalIterOpts{stats: &stats})
-
-			i := len(levelIters)
-			levelIters = append(levelIters, mergingIterLevel{iter: li})
-			li.initRangeDel(&levelIters[i].rangeDelIter)
+			li.interleaveRangeDeletions = true
+			levelIters = append(levelIters, mergingIterLevel{iter: li, getTombstone: li.getTombstone})
 		}
 		miter := &mergingIter{}
 		miter.init(nil /* opts */, &stats, cmp, func(a []byte) int { return len(a)
}, levelIters...) @@ -677,8 +675,9 @@ func buildMergingIter(readers [][]*sstable.Reader, levelSlices []manifest.LevelS l := newLevelIter( context.Background(), IterOptions{}, testkeys.Comparer, newIters, levelSlices[i].Iter(), manifest.Level(level), internalIterOpts{}) - l.initRangeDel(&mils[level].rangeDelIter) + l.interleaveRangeDeletions = true mils[level].iter = l + mils[level].getTombstone = l.getTombstone } var stats base.InternalIteratorStats m := &mergingIter{} diff --git a/scan_internal.go b/scan_internal.go index d6ee26618f..5f3c17cde3 100644 --- a/scan_internal.go +++ b/scan_internal.go @@ -874,7 +874,8 @@ func (i *scanInternalIterator) constructPointIter( for j := len(memtables) - 1; j >= 0; j-- { mem := memtables[j] mlevels = append(mlevels, mergingIterLevel{ - iter: mem.newIter(&i.opts.IterOptions), + iter: mem.newIter(&i.opts.IterOptions), + getTombstone: nil, }) i.iterLevels[mlevelsIndex] = IteratorLevel{ Kind: IteratorLevelFlushable, @@ -901,6 +902,7 @@ func (i *scanInternalIterator) constructPointIter( i.ctx, i.opts.IterOptions, i.comparer, i.newIters, files, level, internalIterOpts{}) mlevels[mlevelsIndex].iter = li + mlevels[mlevelsIndex].getTombstone = nil // range dels handled separately below rli.Init(keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters}, i.comparer.Compare, tableNewRangeDelIter(i.ctx, i.newIters), files, level, manifest.KeyTypePoint) diff --git a/testdata/ingest b/testdata/ingest index fc6a64bfa7..4be171bdf0 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -53,7 +53,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 569B Block cache: 6 entries (945B) hit rate: 30.8% -Table cache: 1 entries (760B) hit rate: 66.7% +Table cache: 1 entries (760B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 diff --git a/testdata/level_iter_boundaries b/testdata/level_iter_boundaries index dec8d90c8e..f3287f7c52 100644 --- a/testdata/level_iter_boundaries +++ b/testdata/level_iter_boundaries @@ -20,19 +20,19 @@ prev prev prev ---- -a#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -d#72057594037927935,RANGEDEL: +a#72057594037927935,SPANSTART: +b#72057594037927935,SPANEND: +b#72057594037927935,SPANSTART: +c#72057594037927935,SPANEND: +c#72057594037927935,SPANSTART: +d#72057594037927935,SPANEND: . -d#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +d#72057594037927935,SPANEND: +c#72057594037927935,SPANSTART: +c#72057594037927935,SPANEND: +b#72057594037927935,SPANSTART: +b#72057594037927935,SPANEND: +a#72057594037927935,SPANSTART: . iter @@ -41,10 +41,10 @@ seek-ge d seek-lt b prev ---- -c#72057594037927935,RANGEDEL: +c#72057594037927935,SPANSTART: . -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +b#72057594037927935,SPANEND: +a#72057594037927935,SPANSTART: iter seek-prefix-ge c @@ -52,10 +52,10 @@ seek-prefix-ge d seek-lt b prev ---- -c#72057594037927935,RANGEDEL: +c#72057594037927935,SPANSTART: . -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +b#72057594037927935,SPANEND: +a#72057594037927935,SPANSTART: iter seek-ge e @@ -100,8 +100,8 @@ next next ---- a#1,SET:a -b#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: +b#72057594037927935,SPANSTART: +c#72057594037927935,SPANEND: c#3,SET:c . 
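For context, the merging iterator hunks above lean on l.getVisibleTombstone(m.snapshot), whose definition isn't shown in this patch. The sketch below is a plausible shape inferred only from the call sites and from the getTombstone callback wired up in the test changes above; it is illustrative, not the actual implementation.

```go
// Sketch only: a plausible getVisibleTombstone, assuming
// mergingIterLevel.getTombstone is a func() *keyspan.Span callback, as wired
// up via li.getTombstone in the test changes above.
func (l *mergingIterLevel) getVisibleTombstone(snapshot uint64) *keyspan.Span {
	if l.getTombstone == nil {
		// This level has no interleaved range deletions; scanInternal, for
		// example, wires getTombstone to nil and handles range dels itself.
		return nil
	}
	t := l.getTombstone()
	if t == nil || !t.VisibleAt(snapshot) {
		// Either no tombstone overlaps the iterator's current position, or
		// every key in the span is newer than the visibility snapshot.
		return nil
	}
	return t
}
```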
@@ -113,8 +113,8 @@ prev prev ---- c#3,SET:c -c#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: +c#72057594037927935,SPANEND: +b#72057594037927935,SPANSTART: a#1,SET:a . @@ -134,8 +134,8 @@ next next ---- a#1,SET:b -b#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: +b#72057594037927935,SPANSTART: +c#72057594037927935,SPANEND: . iter @@ -144,8 +144,8 @@ prev prev prev ---- -c#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: +c#72057594037927935,SPANEND: +b#72057594037927935,SPANSTART: a#1,SET:b . @@ -164,8 +164,8 @@ next next next ---- -a#72057594037927935,RANGEDEL: -b#72057594037927935,RANGEDEL: +a#72057594037927935,SPANSTART: +b#72057594037927935,SPANEND: c#2,SET:c . @@ -176,8 +176,8 @@ prev prev ---- c#2,SET:c -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +b#72057594037927935,SPANEND: +a#72057594037927935,SPANSTART: . # Regression test to check that Seek{GE,LT} work properly in certain @@ -203,15 +203,15 @@ seek-ge d prev seek-lt e ---- -d#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: d#3,SET:d -d#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: d#3,SET:d -e#72057594037927935,RANGEDEL: +e#72057594037927935,SPANEND: d#3,SET:d -d#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: -e#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: +c#72057594037927935,SPANSTART: +e#72057594037927935,SPANEND: # Regression test to check that Seek{GE,LT}, First, and Last do not # have iteration bounds affected by SeekPrefixGE. Previously, diff --git a/testdata/level_iter_seek b/testdata/level_iter_seek index 74a4691e03..b95a38fdcc 100644 --- a/testdata/level_iter_seek +++ b/testdata/level_iter_seek @@ -36,8 +36,8 @@ seek-ge d next next ---- -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} -e#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} f#5,SET:f iter @@ -53,7 +53,7 @@ next prev next ---- -c#7,SET:c / d-e:{(#6,RANGEDEL)} +c#7,SET:c . c#7,SET:c . @@ -64,8 +64,8 @@ iter seek-prefix-ge d next ---- -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} -d\x00#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-d\x00:{(#6,RANGEDEL)} +d\x00#72057594037927935,SPANEND: / d-d\x00:{(#6,RANGEDEL)} # Tests a sequence of SeekPrefixGE with monotonically increasing keys, some of # which are present and some not (so fail the bloom filter match). The seek to @@ -81,9 +81,9 @@ seek-prefix-ge gg seek-prefix-ge h ---- . -c#7,SET:c / d-e:{(#6,RANGEDEL)} -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} -cc\x00#72057594037927935,RANGEDEL: +c#7,SET:c +d#72057594037927935,SPANSTART: / d-cc\x00:{(#6,RANGEDEL)} +cc\x00#72057594037927935,SPANEND: / d-cc\x00:{(#6,RANGEDEL)} f#5,SET:f g#4,SET:g . 
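The expectations above show SPANSTART/SPANEND sentinels surfacing through the levelIter, each annotated with the span that produced it. Because the sentinels are exclusive sentinel keys, a consumer that wants only real point keys can step over them the same way findPrevEntry does in the merging_iter.go hunks. A minimal sketch against the base iterator interface; the helper name is hypothetical and not part of this patch.

```go
// collectPointKeys drains an interleaved iterator, keeping only real point
// keys and stepping over SPANSTART/SPANEND sentinels.
func collectPointKeys(it base.InternalIterator) ([]string, error) {
	var keys []string
	for kv := it.First(); kv != nil; kv = it.Next() {
		if kv.K.IsExclusiveSentinel() {
			// Interleaved span boundaries carry no point data; skip them.
			continue
		}
		keys = append(keys, kv.K.String())
	}
	return keys, it.Error()
}
```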
@@ -125,9 +125,9 @@ b#8,SET:b {BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} c#7,SET:c {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} -d#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} -e#72057594037927935,RANGEDEL: +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} f#5,SET:f {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} @@ -154,11 +154,11 @@ prev prev prev ---- -f#5,SET:f / d-e:{(#6,RANGEDEL)} -e#72057594037927935,RANGEDEL: f#5,SET:f -e#72057594037927935,RANGEDEL: -d#72057594037927935,RANGEDEL: +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} +f#5,SET:f +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} . # Verify that First() in the presence of an upper-bound pauses at the @@ -177,7 +177,7 @@ iter set-bounds upper=f first ---- -d#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} # Verify that Last() in the presence of a lower-bound pauses at the # last range deletion's end key. @@ -197,8 +197,8 @@ last prev prev ---- -e#72057594037927935,RANGEDEL: -d#72057594037927935,RANGEDEL: +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} c#7,SET:c # Verify that a seek to a file with range tombstones as boundaries pauses on @@ -228,10 +228,10 @@ next next next ---- -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} c#7,SET:c -d#72057594037927935,RANGEDEL: -e#72057594037927935,RANGEDEL: +d#72057594037927935,SPANSTART: / d-e:{(#6,RANGEDEL)} +e#72057594037927935,SPANEND: / d-e:{(#6,RANGEDEL)} f#8,SET:f iter @@ -241,10 +241,10 @@ prev prev prev ---- -b#72057594037927935,RANGEDEL: / a-b:{(#5,RANGEDEL)} +b#72057594037927935,SPANEND: / a-b:{(#5,RANGEDEL)} c#7,SET:c -b#72057594037927935,RANGEDEL: -a#72057594037927935,RANGEDEL: +b#72057594037927935,SPANEND: / a-b:{(#5,RANGEDEL)} +a#72057594037927935,SPANSTART: / a-b:{(#5,RANGEDEL)} . 
# Verify that prev when positioned at the start bound of the first range @@ -266,7 +266,7 @@ seek-ge d prev ---- b#1,SET:b -d#72057594037927935,RANGEDEL: / d-e:{(#2,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-e:{(#2,RANGEDEL)} b#1,SET:b # Verify that next when positioned at the start boundary of the first range @@ -288,7 +288,7 @@ seek-lt d next ---- d#2,SET:d -b#72057594037927935,RANGEDEL: / a-b:{(#1,RANGEDEL)} +b#72057594037927935,SPANEND: / a-b:{(#1,RANGEDEL)} d#2,SET:d # Verify SeekPrefixGE correctness with trySeekUsingNext=true @@ -342,12 +342,12 @@ seek-prefix-ge h true seek-prefix-ge i true seek-prefix-ge j true ---- -a#1,SET:a / c-e:{(#4,RANGEDEL)} -a#1,SET:a / c-e:{(#4,RANGEDEL)} -b#2,SET:b / c-e:{(#4,RANGEDEL)} -c#72057594037927935,RANGEDEL: -c#72057594037927935,RANGEDEL: / c-e:{(#4,RANGEDEL)} -d#72057594037927935,RANGEDEL: / c-e:{(#4,RANGEDEL)} +a#1,SET:a +a#1,SET:a +b#2,SET:b +c#72057594037927935,SPANSTART: / c-b\x00:{(#4,RANGEDEL)} +c#72057594037927935,SPANSTART: / c-c\x00:{(#4,RANGEDEL)} +d#72057594037927935,SPANSTART: / d-d\x00:{(#4,RANGEDEL)} f#5,SINGLEDEL: g#6,SET:g h#7,SINGLEDEL: @@ -361,7 +361,7 @@ seek-prefix-ge e true seek-prefix-ge i true seek-prefix-ge j true ---- -a#1,SET:a / c-e:{(#4,RANGEDEL)} +a#1,SET:a e#4,SET:e i#6,SET:i j#7,SET:j @@ -408,10 +408,8 @@ next f#5,SET:f i#4,SET:i -# The below count should be 4, as we skip over the rangekey-only file. -# TODO(jackson): When we stop opening range deletion iterators twice, this -# should be 2. +# The below count should be 2, as we skip over the rangekey-only file. iters-created ---- -4 +2 diff --git a/testdata/merging_iter b/testdata/merging_iter index 3595ac6232..4fd4d09a13 100644 --- a/testdata/merging_iter +++ b/testdata/merging_iter @@ -53,22 +53,16 @@ e#10,SET:10 g#20,SET:20 . 
-iter probe-rangedels=(000000,(Log "# 000000.rangeDelIter.")) probe-rangedels=(000001,(If (Equal SeekKey (Bytes "g")) ErrInjected noop),(Log "# 000001.rangeDelIter.")) +iter probe-rangedels=(000000,(Log "# 000000.rangeDelIter.")) probe-rangedels=(000001,(If opSpanNext ErrInjected noop),(Log "# 000001.rangeDelIter.")) seek-ge d next ---- # 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} -# 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} -# 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} # 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} -# 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} # 000000.rangeDelIter.opSpanNext() = nil # 000000.rangeDelIter.opSpanClose() = nil -# 000000.rangeDelIter.opSpanClose() = nil -# 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} e#10,SET:10 -# 000001.rangeDelIter.opSpanNext() = nil -# 000001.rangeDelIter.opSpanSeekGE("g") = nil +# 000001.rangeDelIter.opSpanNext() = nil err=injected error # isPrevEntryDeleted() should not allow the rangedel to act on the points in the lower sstable @@ -556,8 +550,8 @@ seek-lt jd next ---- # 000022.SeekLT("jd") = (iwoeionch#792,SET,"792") -# 000023.SeekLT("c") = nil -# 000024.SeekLT("c") = nil +# 000023.SeekLT("cz") = nil +# 000024.SeekLT("cz") = nil iwoeionch#792,SET:792 # 000023.SeekGE("cz") = nil err=injected error @@ -909,6 +903,6 @@ set-bounds lower=cz upper=jd seek-lt jd ---- # 000033.SeekLT("jd") = (iwoeionch#792,SET,"792") -# 000034.SeekLT("c") = nil -# 000035.SeekLT("c") = nil +# 000034.SeekLT("cz") = nil +# 000035.SeekLT("cz") = nil iwoeionch#792,SET:792 diff --git a/testdata/metrics b/testdata/metrics index 04e93ab50e..678d5180f6 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -73,7 +73,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 589B Block cache: 3 entries (484B) hit rate: 0.0% -Table cache: 1 entries (760B) hit rate: 50.0% +Table cache: 1 entries (760B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -128,7 +128,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 5 entries (946B) hit rate: 33.3% -Table cache: 2 entries (1.5KB) hit rate: 75.0% +Table cache: 2 entries (1.5KB) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 2 @@ -170,7 +170,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 5 entries (946B) hit rate: 33.3% -Table cache: 2 entries (1.5KB) hit rate: 75.0% +Table cache: 2 entries (1.5KB) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 2 @@ -209,7 +209,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 3 entries (484B) hit rate: 33.3% -Table cache: 1 entries (760B) hit rate: 75.0% +Table cache: 1 entries (760B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -252,7 +252,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 0 entries (0B) hit rate: 33.3% -Table cache: 0 entries (0B) hit rate: 75.0% +Table cache: 0 entries (0B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -322,7 +322,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 2.6KB Block cache: 0 entries (0B) hit rate: 33.3% -Table cache: 0 entries (0B) hit rate: 75.0% 
+Table cache: 0 entries (0B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -376,7 +376,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 2.0KB Block cache: 0 entries (0B) hit rate: 14.3% -Table cache: 0 entries (0B) hit rate: 64.3% +Table cache: 0 entries (0B) hit rate: 58.3% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -479,7 +479,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 4.3KB Block cache: 12 entries (1.9KB) hit rate: 16.7% -Table cache: 1 entries (760B) hit rate: 64.7% +Table cache: 1 entries (760B) hit rate: 60.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -541,7 +541,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 6.1KB Block cache: 12 entries (1.9KB) hit rate: 16.7% -Table cache: 1 entries (760B) hit rate: 64.7% +Table cache: 1 entries (760B) hit rate: 60.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0
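A final note on the testdata above: the interleaved sentinels all print with sequence number 72057594037927935, which is 1<<56-1, the maximum sequence number. Giving SPANSTART/SPANEND the maximum seqnum makes a span boundary sort before any real point key on the same user key, which is why d#72057594037927935,SPANSTART precedes d#3,SET:d in the outputs. A small illustrative snippet, buildable only inside this repository since it imports internal/base; the printed forms assume the new kinds are registered in base's kind-name table, as the testdata suggests.

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/pebble/internal/base"
)

func main() {
	// Sentinel keys like those in the testdata: maximum seqnum plus the new
	// span kinds added in this patch.
	start := base.MakeInternalKey([]byte("d"), base.InternalKeySeqNumMax, base.InternalKeyKindSpanStart)
	point := base.MakeInternalKey([]byte("d"), 3, base.InternalKeyKindSet)

	fmt.Println(base.InternalKeySeqNumMax) // 72057594037927935 (1<<56 - 1)
	fmt.Println(start)                     // d#72057594037927935,SPANSTART
	fmt.Println(point)                     // d#3,SET
}
```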