From c0ab2a51bb03ccda464e2499e480fded52f6bd53 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Wed, 24 Apr 2024 15:44:28 -0400 Subject: [PATCH] db: rework mergingIter and levelIter synthetic keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The merging iterator and the level iterator are more tightly coupled than desired. Most of this coupling is a consequence of range deletions. The merging iterator is responsible for applying range deletions across levels and requires that the level iterator not progress to a new file until the file's range deletions are no longer relevant to other levels. It does this by interleaving "synthetic" keys. When these keys reach the top of the heap, it signals that all other levels have progressed to a point where the current file's range deletions are no longer relevant. Previously, knowledge of which keys were interleaved "synthetic" keys was propagated indirectly—outside the internal iterator interface—through a pointer to a levelIterBoundaryContext struct with boolean fields. This commit instead always interleaves synthetic keys as range deletion exclusive sentinels. The merging iterator can infer which keys are synthetic by calling InternalKey.IsExclusiveSentinel. Propagating this information directly through the internal iterator interface is more direct and understandable. Beyond needing to know which keys are synthetic, the merging iterator also must know when user-imposed iteration bounds have been reached. Ordinarily when bounds have been reached, internal iterators become exhausted. Since the levelIter's file's range deletions may still be relevant, it cannot and it interleaves a "synthetic" key at the iteration bound. When this key reaches the top of the merging iterator's heap, there are no more keys within bounds and the merging iterator is exhausted. To make this determination, we add a simple key comparison. The use of a key comparison is expected to be acceptable, because it's only performed when a synthetic key reaches the top of the heap. This additional key comparison should ultimately be eliminated when #2863 is complete. Informs #2863. --- db.go | 1 - level_checker.go | 86 +++++++------- level_iter.go | 199 ++++++++------------------------- merging_iter.go | 81 +++++++------- merging_iter_test.go | 2 - scan_internal.go | 1 - testdata/level_iter_boundaries | 20 ++-- testdata/level_iter_seek | 14 +-- testdata/merging_iter | 6 +- 9 files changed, 147 insertions(+), 263 deletions(-) diff --git a/db.go b/db.go index 9db6f5923aa..765ac63a65b 100644 --- a/db.go +++ b/db.go @@ -1468,7 +1468,6 @@ func (i *Iterator) constructPointIter( li.init(ctx, i.opts, &i.comparer, i.newIters, files, level, internalOpts) li.initRangeDel(&mlevels[mlevelsIndex].rangeDelIter) - li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext) li.initCombinedIterState(&i.lazyCombinedIter.combinedIterState) mlevels[mlevelsIndex].levelIter = li mlevels[mlevelsIndex].iter = invalidating.MaybeWrapIfInvariants(li) diff --git a/level_checker.go b/level_checker.go index 69d55392190..7efcb5010d5 100644 --- a/level_checker.go +++ b/level_checker.go @@ -49,7 +49,6 @@ import ( type simpleMergingIterLevel struct { iter internalIterator rangeDelIter keyspan.FragmentIterator - levelIterBoundaryContext iterKV *base.InternalKV tombstone *keyspan.Span @@ -129,8 +128,7 @@ func (m *simpleMergingIter) step() bool { item := &m.heap.items[0] l := &m.levels[item.index] // Sentinels are not relevant for this point checking. - if !l.isIgnorableBoundaryKey && !item.key.IsExclusiveSentinel() && - item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { + if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { // This is a visible point key. if !m.handleVisiblePoint(item, l) { return false @@ -144,50 +142,50 @@ func (m *simpleMergingIter) step() bool { // Step to the next point. l.iterKV = l.iter.Next() - if !l.isIgnorableBoundaryKey { - if l.iterKV != nil { - // Check point keys in an sstable are ordered. Although not required, we check - // for memtables as well. A subtle check here is that successive sstables of - // L1 and higher levels are ordered. This happens when levelIter moves to the - // next sstable in the level, in which case item.key is previous sstable's - // last point key. - if base.InternalCompare(m.heap.cmp, item.key, l.iterKV.K) >= 0 { - m.err = errors.Errorf("out of order keys %s >= %s in %s", - item.key.Pretty(m.formatKey), l.iterKV.K.Pretty(m.formatKey), l.iter) - return false - } - item.key = base.InternalKey{ - Trailer: l.iterKV.K.Trailer, - UserKey: append(item.key.UserKey[:0], l.iterKV.K.UserKey...), - } - item.value = l.iterKV.V - if m.heap.len() > 1 { - m.heap.fix(0) - } - } else { - m.err = l.iter.Close() - l.iter = nil - m.heap.pop() - } - if m.err != nil { + if l.iterKV == nil { + m.err = errors.CombineErrors(l.iter.Error(), l.iter.Close()) + l.iter = nil + m.heap.pop() + } else if !l.iterKV.K.IsExclusiveSentinel() { + // Check point keys in an sstable are ordered. Although not required, we check + // for memtables as well. A subtle check here is that successive sstables of + // L1 and higher levels are ordered. This happens when levelIter moves to the + // next sstable in the level, in which case item.key is previous sstable's + // last point key. + if base.InternalCompare(m.heap.cmp, item.key, l.iterKV.K) >= 0 { + m.err = errors.Errorf("out of order keys %s >= %s in %s", + item.key.Pretty(m.formatKey), l.iterKV.K.Pretty(m.formatKey), l.iter) return false } - if m.heap.len() == 0 { - // Last record was a MERGE record. - if m.valueMerger != nil { - var closer io.Closer - _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) - if m.err == nil && closer != nil { - m.err = closer.Close() - } - if m.err != nil { - m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", - item.key.Pretty(m.formatKey), m.lastIterMsg) - } - m.valueMerger = nil + item.key = base.InternalKey{ + Trailer: l.iterKV.K.Trailer, + UserKey: append(item.key.UserKey[:0], l.iterKV.K.UserKey...), + } + item.value = l.iterKV.V + if m.heap.len() > 1 { + m.heap.fix(0) + } + } + if m.err != nil { + return false + } + if m.heap.len() == 0 { + // If m.valueMerger != nil, the last record was a MERGE record. + if m.valueMerger != nil { + var closer io.Closer + var err error + _, closer, err = m.valueMerger.Finish(true /* includesBase */) + if closer != nil { + err = errors.CombineErrors(err, closer.Close()) } - return false + if err != nil { + m.err = errors.CombineErrors(m.err, + errors.Wrapf(err, "merge processing error on key %s in %s", + item.key.Pretty(m.formatKey), m.lastIterMsg)) + } + m.valueMerger = nil } + return false } m.positionRangeDels() return true @@ -639,7 +637,6 @@ func checkLevelsInternal(c *checkConfig) (err error) { li.init(context.Background(), iterOpts, c.comparer, c.newIters, manifestIter, manifest.L0Sublevel(sublevel), internalIterOpts{}) li.initRangeDel(&mlevelAlloc[0].rangeDelIter) - li.initBoundaryContext(&mlevelAlloc[0].levelIterBoundaryContext) mlevelAlloc[0].iter = li mlevelAlloc = mlevelAlloc[1:] } @@ -653,7 +650,6 @@ func checkLevelsInternal(c *checkConfig) (err error) { li.init(context.Background(), iterOpts, c.comparer, c.newIters, current.Levels[level].Iter(), manifest.Level(level), internalIterOpts{}) li.initRangeDel(&mlevelAlloc[0].rangeDelIter) - li.initBoundaryContext(&mlevelAlloc[0].levelIterBoundaryContext) mlevelAlloc[0].iter = li mlevelAlloc = mlevelAlloc[1:] } diff --git a/level_iter.go b/level_iter.go index d8f5a6dc196..33d7723d729 100644 --- a/level_iter.go +++ b/level_iter.go @@ -84,9 +84,12 @@ type levelIter struct { // the levelIter passes over a file containing range keys. See the // lazyCombinedIter for more details. combinedIterState *combinedIterState - // A synthetic boundary key-value pair to return when SeekPrefixGE finds an - // sstable which doesn't contain the search key, but which does contain - // range tombstones. + // A synthetic boundary key-value pair to return when an sstable contains + // range tombstones that might be relevant but no more relevant point keys. + // The synthetic boundary key is always an exclusive range deletion sentinel + // key. When the user-imposed iteration bounds have been reached, the key's + // user key is the exceeded bound. Otherwise, it's the smallest/largest key + // in the file. syntheticBoundary base.InternalKV // The iter for the current file. It is nil under any of the following conditions: // - files.Current() == nil @@ -111,24 +114,6 @@ type levelIter struct { files manifest.LevelIterator err error - // Pointer into this level's mergingIterLevel.levelIterBoundaryContext. - // It's populated when the levelIter is in-use by a mergingIter. It's used - // to signal additional semantic meaning about the most recently returned - // key. It's currently used to pause at two different types of bounds: - // - // - isSyntheticIterBoundsKey is set to true when the iterator has - // user-imposed iteration bounds (l.{lower,upper}), and the levelIter - // reached the user-imposed bound. This signals that the underlying - // iterators are not necessarily exhausted, but iteration has paused to - // avoid unnecessarily loading sstables outside the user-imposed bounds. - // - isIgnorableBoundaryKey is set to true when the levelIter returns a - // fake key at one of the bounds of an sstable within the level. It does - // this only when the current sstable contains range deletions. It ensures - // the merging iterator does not move beyond the table until the table's - // range deletions are no longer necessary, even if the table contains - // no more relevant point keys. - boundaryContext *levelIterBoundaryContext - // internalOpts holds the internal iterator options to pass to the table // cache when constructing new table iterators. internalOpts internalIterOpts @@ -210,14 +195,18 @@ func (l *levelIter) initRangeDel(rangeDelIter *keyspan.FragmentIterator) { l.rangeDelIterPtr = rangeDelIter } -func (l *levelIter) initBoundaryContext(context *levelIterBoundaryContext) { - l.boundaryContext = context -} - func (l *levelIter) initCombinedIterState(state *combinedIterState) { l.combinedIterState = state } +func (l *levelIter) makeSyntheticBound(userKey []byte) *base.InternalKV { + l.syntheticBoundary = base.InternalKV{ + K: base.MakeRangeDeleteSentinelKey(userKey), + V: base.LazyValue{}, + } + return &l.syntheticBoundary +} + func (l *levelIter) maybeTriggerCombinedIteration(file *fileMetadata, dir int) { // If we encounter a file that contains range keys, we may need to // trigger a switch to combined range-key and point-key iteration, @@ -517,10 +506,6 @@ const ( func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicator { l.smallestBoundary = nil l.largestBoundary = nil - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } if l.iterFile == file { if l.err != nil { return noFileLoaded @@ -628,17 +613,6 @@ func (l *levelIter) verify(kv *base.InternalKV) *base.InternalKV { if l.upper != nil && kv != l.largestBoundary && l.cmp(kv.K.UserKey, l.upper) > 0 { l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.level, kv, l.upper, debug.Stack()) } - - if l.boundaryContext != nil && l.boundaryContext.isSyntheticIterBoundsKey { - switch { - case !kv.K.IsExclusiveSentinel(): - l.logger.Fatalf("levelIter %s: returning %s and isSyntheticIterBounds=true", l.level, kv) - case l.lower == nil && l.upper == nil: - // If we knew the direction of iteration, we could verify that - // specifically the corresponding bound is non-nil. - l.logger.Fatalf("levelIter %s: returning %s but has no bounds", l.level, kv) - } - } } return kv } @@ -650,10 +624,6 @@ func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV l.err = nil // clear cached iteration error l.exhaustedDir = 0 - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } // NB: the top-level Iterator has already adjusted key based on // IterOptions.LowerBound. loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) @@ -679,10 +649,6 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba l.err = nil // clear cached iteration error l.exhaustedDir = 0 - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } // NB: the top-level Iterator has already adjusted key based on // IterOptions.LowerBound. @@ -706,31 +672,16 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba // the sstable. All we know is that a key with prefix does not exist in the // current sstable. We do know that the key lies within the bounds of the // table as findFileGE found the table where key <= meta.Largest. We return - // the table's bound with isIgnorableBoundaryKey set. + // the table's bound as a synthetic key. if l.rangeDelIterPtr != nil && *l.rangeDelIterPtr != nil { if l.tableOpts.UpperBound != nil { - l.syntheticBoundary = base.MakeInternalKV(base.InternalKey{ - UserKey: l.tableOpts.UpperBound, - Trailer: InternalKeyRangeDeleteSentinel, - }, nil) - l.largestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = true - l.boundaryContext.isIgnorableBoundaryKey = false - } + l.largestBoundary = l.makeSyntheticBound(l.tableOpts.UpperBound) l.exhaustedDir = +1 return l.verify(l.largestBoundary) } // Return the file's largest bound, ensuring this file stays open until - // the mergingIter advances beyond the file's bounds. We set - // isIgnorableBoundaryKey to signal that the actual key returned should - // be ignored, and does not represent a real key in the database. - l.syntheticBoundary = base.MakeInternalKV(l.iterFile.LargestPointKey, nil) - l.largestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = true - } + // the mergingIter advances beyond the file's bounds. + l.largestBoundary = l.makeSyntheticBound(l.iterFile.LargestPointKey.UserKey) return l.verify(l.largestBoundary) } // It is possible that we are here because bloom filter matching failed. In @@ -756,10 +707,6 @@ func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV l.err = nil // clear cached iteration error l.exhaustedDir = 0 - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } // NB: the top-level Iterator has already adjusted key based on // IterOptions.UpperBound. @@ -780,10 +727,6 @@ func (l *levelIter) First() *base.InternalKV { l.err = nil // clear cached iteration error l.exhaustedDir = 0 - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is // set. @@ -804,10 +747,6 @@ func (l *levelIter) Last() *base.InternalKV { l.err = nil // clear cached iteration error l.exhaustedDir = 0 - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is // set. @@ -831,10 +770,6 @@ func (l *levelIter) Next() *base.InternalKV { if l.err != nil || l.iter == nil { return nil } - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } switch { case l.largestBoundary != nil: @@ -874,10 +809,6 @@ func (l *levelIter) NextPrefix(succKey []byte) *base.InternalKV { if l.err != nil || l.iter == nil { return nil } - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } switch { case l.largestBoundary != nil: @@ -934,10 +865,6 @@ func (l *levelIter) Prev() *base.InternalKV { if l.err != nil || l.iter == nil { return nil } - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } switch { case l.smallestBoundary != nil: @@ -996,27 +923,21 @@ func (l *levelIter) skipEmptyFileForward() *base.InternalKV { } if l.rangeDelIterPtr != nil { // We're being used as part of a mergingIter and we've exhausted the - // current sstable. If an upper bound is present and the upper bound lies - // within the current sstable, then we will have reached the upper bound - // rather than the end of the sstable. We need to return a synthetic - // boundary key so that mergingIter can use the range tombstone iterator - // until the other levels have reached this boundary. + // current sstable. If an upper bound is present and the upper bound + // lies within the current sstable, then we will have reached the + // upper bound rather than the end of the sstable. We need to return + // a synthetic boundary key so that mergingIter can use the range + // tombstone iterator until the other levels have reached this + // boundary. // // It is safe to set the boundary key to the UpperBound user key // with the RANGEDEL sentinel since it is the smallest InternalKey - // that matches the exclusive upper bound, and does not represent - // a real key. + // that matches the exclusive upper bound, and does not represent a + // real key. if l.tableOpts.UpperBound != nil { l.exhaustedDir = +1 if *l.rangeDelIterPtr != nil { - l.syntheticBoundary.K = base.InternalKey{ - UserKey: l.tableOpts.UpperBound, - Trailer: InternalKeyRangeDeleteSentinel, - } - l.largestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = true - } + l.largestBoundary = l.makeSyntheticBound(l.tableOpts.UpperBound) return l.largestBoundary } // Else there are no range deletions in this sstable. This @@ -1025,28 +946,17 @@ func (l *levelIter) skipEmptyFileForward() *base.InternalKV { // bounds. return nil } - // If the boundary is a range deletion tombstone, or the caller is - // accessing range dels through l.rangeDelIterPtr, pause at an - // ignorable boundary key to avoid advancing to the next file until - // other levels are caught up. + // If the caller is accessing range dels through l.rangeDelIterPtr, + // pause at an ignorable boundary key to avoid advancing to the next + // file until other levels are caught up. // // Note that even if the largest boundary is not a range deletion, - // there may still be range deletions beyong the last point key + // there may still be range deletions beyond the last point key // returned. When block-property filters are in use, the sstable // iterator may have transparently skipped a tail of the point keys - // in the file. If the last point key returned /was/ the largest - // key, then we'll return a key with the same user key and trailer - // twice. Returning it again is a violation of the strict - // monotonicity normally provided. The mergingIter's heap can - // tolerate this repeat key and in this case will keep the level at - // the top of the heap and immediately skip the entry, advancing to - // the next file. + // in the file. if *l.rangeDelIterPtr != nil { - l.syntheticBoundary = base.MakeInternalKV(l.iterFile.LargestPointKey, nil) - l.largestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isIgnorableBoundaryKey = true - } + l.largestBoundary = l.makeSyntheticBound(l.iterFile.LargestPointKey.UserKey) return l.largestBoundary } } @@ -1082,27 +992,21 @@ func (l *levelIter) skipEmptyFileBackward() *base.InternalKV { } if l.rangeDelIterPtr != nil { // We're being used as part of a mergingIter and we've exhausted the - // current sstable. If a lower bound is present and the lower bound lies - // within the current sstable, then we will have reached the lower bound - // rather than the beginning of the sstable. We need to return a - // synthetic boundary key so that mergingIter can use the range tombstone - // iterator until the other levels have reached this boundary. + // current sstable. If a lower bound is present and the lower bound + // lies within the current sstable, then we will have reached the + // lower bound rather than the beginning of the sstable. We need to + // return a synthetic boundary key so that mergingIter can use the + // range tombstone iterator until the other levels have reached this + // boundary. // // It is safe to set the boundary key to the LowerBound user key // with the RANGEDEL sentinel since it is the smallest InternalKey - // that is within the inclusive lower bound, and does not - // represent a real key. + // that is within the inclusive lower bound, and does not represent + // a real key. if l.tableOpts.LowerBound != nil { l.exhaustedDir = -1 if *l.rangeDelIterPtr != nil { - l.syntheticBoundary = base.MakeInternalKV(base.InternalKey{ - UserKey: l.tableOpts.LowerBound, - Trailer: InternalKeyRangeDeleteSentinel, - }, nil) - l.smallestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = true - } + l.smallestBoundary = l.makeSyntheticBound(l.tableOpts.LowerBound) return l.smallestBoundary } // Else there are no range deletions in this sstable. This @@ -1111,21 +1015,12 @@ func (l *levelIter) skipEmptyFileBackward() *base.InternalKV { // bounds. return nil } - // If the boundary could be a range deletion tombstone, return the - // smallest point key as a special ignorable key to avoid advancing to the - // next file. - // - // It's possible the SmallestPointKey was already returned. Returning it - // again is a violation of the strict monotonicity normally provided. The - // mergingIter's heap can tolerate this repeat key and in this case will - // keep the level at the top of the heap and immediately skip the entry, - // advancing to the next file. + // If the user is using the range deletion iterator, return a + // synthetic key with the smallest user key in the file. This file + // sorts before all other keys with the same user key, so + // monotonicity is maintained. if *l.rangeDelIterPtr != nil { - l.syntheticBoundary = base.MakeInternalKV(l.iterFile.SmallestPointKey, nil) - l.smallestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isIgnorableBoundaryKey = true - } + l.smallestBoundary = l.makeSyntheticBound(l.iterFile.SmallestPointKey.UserKey) return l.smallestBoundary } } diff --git a/merging_iter.go b/merging_iter.go index 26fb82729fb..2e2792d8cde 100644 --- a/merging_iter.go +++ b/merging_iter.go @@ -32,11 +32,6 @@ type mergingIterLevel struct { // intermediary internalIterator implementations. levelIter *levelIter - // levelIterBoundaryContext's fields are set when using levelIter, in order - // to surface sstable boundary keys and file-level context. See levelIter - // comment and the Range Deletions comment below. - levelIterBoundaryContext - // tombstone caches the tombstone rangeDelIter is currently pointed at. If // tombstone is nil, there are no further tombstones within the // current sstable in the current iterator direction. The cached tombstone is @@ -46,27 +41,6 @@ type mergingIterLevel struct { tombstone *keyspan.Span } -type levelIterBoundaryContext struct { - // isSyntheticIterBoundsKey is set to true iff the key returned by the level - // iterator is a synthetic key derived from the iterator bounds. This is used - // to prevent the mergingIter from being stuck at such a synthetic key if it - // becomes the top element of the heap. When used with a user-facing Iterator, - // the only range deletions exposed by this mergingIter should be those with - // `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`. - // - // When true, the coincident key is an exclusive sentinel, and the current - // direction of iteration has a user-imposed iteration bound. - isSyntheticIterBoundsKey bool - // isIgnorableBoundaryKey is set to true iff the key returned by the level - // iterator is a file boundary key that should be ignored when returning to - // the parent iterator. File boundary keys are used by the level iter to - // keep a levelIter file's range deletion iterator open as long as other - // levels within the merging iterator require it. When used with a user-facing - // Iterator, the only range deletions exposed by this mergingIter should be - // those with `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`. - isIgnorableBoundaryKey bool -} - // mergingIter provides a merged view of multiple iterators from different // levels of the LSM. // @@ -707,16 +681,25 @@ func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) (bool, error) { func (m *mergingIter) findNextEntry() *base.InternalKV { for m.heap.len() > 0 && m.err == nil { item := m.heap.items[0] - if m.levels[item.index].isSyntheticIterBoundsKey { - break - } - - m.addItemStats(item) - // Skip ignorable boundary keys. These are not real keys and exist to - // keep sstables open until we've surpassed their end boundaries so that - // their range deletions are visible. - if m.levels[item.index].isIgnorableBoundaryKey { + // The levelIter internal iterator will interleave exclusive sentinel + // keys to keep files open until their range deletions are no longer + // necessary. Sometimes these are interleaved with the user key of a + // file's largest key, in which case they may simply be stepped over to + // move to the next file in the forward direction. Other times they're + // interleaved at the user key of the user-iteration boundary, if that + // falls within the bounds of a file. In the latter case, there are no + // more keys < m.upper, and we can stop iterating. + // + // We perform a key comparison to differentiate between these two cases. + // This key comparison is considered okay because it only happens for + // sentinel keys. It may be eliminated after #2863. + if m.levels[item.index].iterKV.K.IsExclusiveSentinel() { + if m.upper != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.upper) >= 0 { + break + } + // This key is the largest boundary of a file and can be skipped now + // that the file's range deletions are no longer relevant. m.err = m.nextEntry(item, nil /* succKey */) if m.err != nil { return nil @@ -724,6 +707,8 @@ func (m *mergingIter) findNextEntry() *base.InternalKV { continue } + m.addItemStats(item) + // Check if the heap root key is deleted by a range tombstone in a // higher level. If it is, isNextEntryDeleted will advance the iterator // to a later key (through seeking or nexting). @@ -859,13 +844,25 @@ func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) (bool, error) { func (m *mergingIter) findPrevEntry() *base.InternalKV { for m.heap.len() > 0 && m.err == nil { item := m.heap.items[0] - if m.levels[item.index].isSyntheticIterBoundsKey { - break - } - // Skip ignorable boundary keys. These are not real keys and exist to - // keep sstables open until we've surpassed their end boundaries so that - // their range deletions are visible. - if m.levels[item.index].isIgnorableBoundaryKey { + + // The levelIter internal iterator will interleave exclusive sentinel + // keys to keep files open until their range deletions are no longer + // necessary. Sometimes these are interleaved with the user key of a + // file's smallest key, in which case they may simply be stepped over to + // move to the next file in the backward direction. Other times they're + // interleaved at the user key of the user-iteration boundary, if that + // falls within the bounds of a file. In the latter case, there are no + // more keys ≥ m.lower, and we can stop iterating. + // + // We perform a key comparison to differentiate between these two cases. + // This key comparison is considered okay because it only happens for + // sentinel keys. It may be eliminated after #2863. + if m.levels[item.index].iterKV.K.IsExclusiveSentinel() { + if m.lower != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.lower) <= 0 { + break + } + // This key is the smallest boundary of a file and can be skipped + // now that the file's range deletions are no longer relevant. m.err = m.prevEntry(item) if m.err != nil { return nil diff --git a/merging_iter_test.go b/merging_iter_test.go index 2f918713d51..4a141af3cdd 100644 --- a/merging_iter_test.go +++ b/merging_iter_test.go @@ -307,7 +307,6 @@ func TestMergingIterCornerCases(t *testing.T) { i := len(levelIters) levelIters = append(levelIters, mergingIterLevel{iter: li}) li.initRangeDel(&levelIters[i].rangeDelIter) - li.initBoundaryContext(&levelIters[i].levelIterBoundaryContext) } miter := &mergingIter{} miter.init(nil /* opts */, &stats, cmp, func(a []byte) int { return len(a) }, levelIters...) @@ -679,7 +678,6 @@ func buildMergingIter(readers [][]*sstable.Reader, levelSlices []manifest.LevelS context.Background(), IterOptions{}, testkeys.Comparer, newIters, levelSlices[i].Iter(), manifest.Level(level), internalIterOpts{}) l.initRangeDel(&mils[level].rangeDelIter) - l.initBoundaryContext(&mils[level].levelIterBoundaryContext) mils[level].iter = l } var stats base.InternalIteratorStats diff --git a/scan_internal.go b/scan_internal.go index e6a5f75c936..d6ee26618f5 100644 --- a/scan_internal.go +++ b/scan_internal.go @@ -900,7 +900,6 @@ func (i *scanInternalIterator) constructPointIter( li.init( i.ctx, i.opts.IterOptions, i.comparer, i.newIters, files, level, internalIterOpts{}) - li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext) mlevels[mlevelsIndex].iter = li rli.Init(keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters}, i.comparer.Compare, tableNewRangeDelIter(i.ctx, i.newIters), files, level, diff --git a/testdata/level_iter_boundaries b/testdata/level_iter_boundaries index 5a4edc025bf..413090bd37e 100644 --- a/testdata/level_iter_boundaries +++ b/testdata/level_iter_boundaries @@ -12,7 +12,7 @@ prev ---- d#72057594037927935,RANGEDEL: . -a#1,RANGEDEL: +a#72057594037927935,RANGEDEL: . iter @@ -23,7 +23,7 @@ prev ---- d#72057594037927935,RANGEDEL: . -a#1,RANGEDEL: +a#72057594037927935,RANGEDEL: . iter @@ -34,7 +34,7 @@ prev ---- d#72057594037927935,RANGEDEL: . -a#1,RANGEDEL: +a#72057594037927935,RANGEDEL: . iter @@ -90,7 +90,7 @@ prev prev ---- c#3,SET:c -b#2,RANGEDEL: +b#72057594037927935,RANGEDEL: a#1,SET:a . @@ -117,7 +117,7 @@ last prev ---- a#1,SET:b -a#1,SET: +a#72057594037927935,RANGEDEL: clear ---- @@ -134,7 +134,7 @@ next next ---- c#2,SET:c -c#2,SET: +c#72057594037927935,RANGEDEL: . iter @@ -143,7 +143,7 @@ prev prev ---- c#2,SET:c -a#1,RANGEDEL: +a#72057594037927935,RANGEDEL: . # Regression test to check that Seek{GE,LT} work properly in certain @@ -174,9 +174,9 @@ e#72057594037927935,RANGEDEL: d#3,SET:d e#72057594037927935,RANGEDEL: d#3,SET:d -c#2,RANGEDEL: +c#72057594037927935,RANGEDEL: d#3,SET:d -c#2,RANGEDEL: +c#72057594037927935,RANGEDEL: d#3,SET:d # Regression test to check that Seek{GE,LT}, First, and Last do not @@ -359,7 +359,7 @@ prev prev ---- e#5,SET:z -e#5,SET: +e#72057594037927935,RANGEDEL: . file-pos diff --git a/testdata/level_iter_seek b/testdata/level_iter_seek index 01181e4eea2..83a3ae399a8 100644 --- a/testdata/level_iter_seek +++ b/testdata/level_iter_seek @@ -61,7 +61,7 @@ d#72057594037927935,RANGEDEL: iter seek-prefix-ge d ---- -f#5,SET: / d-e:{(#6,RANGEDEL)} +f#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} # Tests a sequence of SeekPrefixGE with monotonically increasing keys, some of # which are present and some not (so fail the bloom filter match). The seek to @@ -77,7 +77,7 @@ seek-prefix-ge h ---- . c#7,SET:c / d-e:{(#6,RANGEDEL)} -f#5,SET: / d-e:{(#6,RANGEDEL)} +f#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} f#5,SET:f g#4,SET:g . @@ -119,7 +119,7 @@ c#7,SET:c {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} f#5,SET:f {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} -f#5,SET: +f#72057594037927935,RANGEDEL: {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} g#4,SET:g {BlockBytes:112 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} @@ -222,9 +222,9 @@ next prev prev ---- -a#5,RANGEDEL: / a-b:{(#5,RANGEDEL)} +a#72057594037927935,RANGEDEL: / a-b:{(#5,RANGEDEL)} c#7,SET:c -a#5,RANGEDEL: +a#72057594037927935,RANGEDEL: . # Verify that prev when positioned at the largest boundary returns the @@ -268,7 +268,7 @@ seek-lt d next ---- d#2,SET:d -a#1,RANGEDEL: / a-b:{(#1,RANGEDEL)} +a#72057594037927935,RANGEDEL: / a-b:{(#1,RANGEDEL)} d#2,SET:d # Verify SeekPrefixGE correctness with trySeekUsingNext=true @@ -387,7 +387,7 @@ next next ---- f#5,SET:f -f#5,SET: +f#72057594037927935,RANGEDEL: i#4,SET:i # The below count should be 2, as we skip over the rangekey-only file. diff --git a/testdata/merging_iter b/testdata/merging_iter index 643801bca5e..c24465e9759 100644 --- a/testdata/merging_iter +++ b/testdata/merging_iter @@ -40,7 +40,7 @@ c#27,SET:27 e#10,SET:10 g#20,SET:20 . -{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:6 ValueBytes:8 PointCount:6 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} +{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:4 ValueBytes:8 PointCount:4 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} {BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} # seekGE() should not allow the rangedel to act on points in the lower sstable that are after it. @@ -636,9 +636,9 @@ a#30,SET:30 f#21,SET:21 {BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:5 ValueBytes:10 PointCount:5 PointsCoveredByRangeTombstones:4 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} . -{BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:6 ValueBytes:10 PointCount:6 PointsCoveredByRangeTombstones:4 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} +{BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:5 ValueBytes:10 PointCount:5 PointsCoveredByRangeTombstones:4 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} . -{BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:6 ValueBytes:10 PointCount:6 PointsCoveredByRangeTombstones:4 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} +{BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:5 ValueBytes:10 PointCount:5 PointsCoveredByRangeTombstones:4 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} # Test a dead simple error handling case of a 1-level seek erroring.