From 919114cb50c57162ee8c7f2b9a41667f3ed09eef Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Mon, 29 Apr 2024 11:00:33 -0400 Subject: [PATCH] db: interleave range deletion boundaries in levelIter Previously, levelIter would sometimes interleave fake keys at file boundaries or user-provided iteration boundaries when a file contains range deletions. This commit reworks the levelIter to instead interleave the individual bounds of the range deletions themselves, using an interleaving iterator. This simplifies the levelIter. Because range deletions are now read in two places: once for interleaving bounds and once to expose a rangeDelIter to the mergingIter, this commit requires opening a file's range deletion iterator twice. This is an intermediary state until #2863 is completed when the mergingIter will consult the levelIter's interleaving iterator to determine which range deletions overlap the current iterator position. Informs #2863. --- internal/keyspan/interleaving_iter.go | 5 +- level_checker_test.go | 5 + level_iter.go | 344 +++++++------------------- merging_iter.go | 39 ++- testdata/ingest | 2 +- testdata/level_iter_boundaries | 60 +++-- testdata/level_iter_seek | 80 +++--- testdata/merging_iter | 13 +- testdata/metrics | 18 +- 9 files changed, 242 insertions(+), 324 deletions(-) diff --git a/internal/keyspan/interleaving_iter.go b/internal/keyspan/interleaving_iter.go index 63a556da0a..a6eee31eae 100644 --- a/internal/keyspan/interleaving_iter.go +++ b/internal/keyspan/interleaving_iter.go @@ -1036,9 +1036,10 @@ func (i *InterleavingIter) verify(kv *base.InternalKV) *base.InternalKV { switch { case i.dir == -1 && i.spanMarkerTruncated: panic("pebble: invariant violation: truncated span key in reverse iteration") - case kv != nil && i.opts.LowerBound != nil && i.cmp(kv.K.UserKey, i.opts.LowerBound) < 0: + case kv != nil && i.opts.LowerBound != nil && !kv.K.IsExclusiveSentinel() && + i.cmp(kv.K.UserKey, i.opts.LowerBound) < 0: panic("pebble: invariant violation: key < lower bound") - case kv != nil && i.opts.UpperBound != nil && + case kv != nil && i.opts.UpperBound != nil && !kv.K.IsExclusiveSentinel() && !base.UserKeyExclusive(i.opts.UpperBound).IsUpperBoundForInternalKey(i.comparer.Compare, kv.K): panic("pebble: invariant violation: key ≥ upper bound") case i.err != nil && kv != nil: diff --git a/level_checker_test.go b/level_checker_test.go index 947cc35e17..57b654921f 100644 --- a/level_checker_test.go +++ b/level_checker_test.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/invalidating" + "github.com/cockroachdb/pebble/internal/invariants" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/private" @@ -85,6 +86,10 @@ func (f *failMerger) Close() error { } func TestCheckLevelsCornerCases(t *testing.T) { + if invariants.Enabled { + t.Skip("disabled under invariants; relies on violating invariants to detect them") + } + memFS := vfs.NewMem() var levels [][]*fileMetadata formatKey := testkeys.Comparer.FormatKey diff --git a/level_iter.go b/level_iter.go index dceca63f18..b799dd6053 100644 --- a/level_iter.go +++ b/level_iter.go @@ -66,18 +66,6 @@ type levelIter struct { tableOpts IterOptions // The LSM level this levelIter is initialized for. level manifest.Level - // The keys to return when iterating past an sstable boundary and that - // boundary is a range deletion tombstone. The boundary could be smallest - // (i.e. arrived at with Prev), or largest (arrived at with Next). - smallestBoundary *base.InternalKV - largestBoundary *base.InternalKV - // A synthetic boundary key-value pair to return when an sstable contains - // range tombstones that might be relevant but no more relevant point keys. - // The synthetic boundary key is always an exclusive range deletion sentinel - // key. When the user-imposed iteration bounds have been reached, the key's - // user key is the exceeded bound. Otherwise, it's the smallest/largest key - // in the file. - syntheticBoundary base.InternalKV // combinedIterState may be set when a levelIter is used during user // iteration. Although levelIter only iterates over point keys, it's also // responsible for lazily constructing the combined range & point iterator @@ -120,6 +108,12 @@ type levelIter struct { // IterOptions.PointKeyFilters is declared. filtersBuf [1]BlockPropertyFilter + // interleaving is used when rangeDelIterPtr != nil to interleave the + // boundaries of range deletions among point keys. This ensures that we + // don't advance to a new file until the range deletions are no longer + // needed by other levels. + interleaving keyspan.InterleavingIter + // exhaustedDir is set to +1 or -1 when the levelIter has been exhausted in // the forward or backward direction respectively. It is set when the // underlying data is exhausted or when iteration has reached the upper or @@ -196,31 +190,6 @@ func (l *levelIter) initCombinedIterState(state *combinedIterState) { l.combinedIterState = state } -// emitSyntheticBoundaries indicates whether or not the iterator should emit -// synthetic boundary keys at table boundaries and user-imposed iteration bounds -// (l.{lower,upper}) for the current file. This behavior is used by the -// mergingIter specifically, when initRangeDel is called. See the comment in -// makeSynehticBoundary. -func (l *levelIter) emitSyntheticBoundaries() bool { - return l.rangeDelIterPtr != nil && *l.rangeDelIterPtr != nil -} - -func (l *levelIter) makeSyntheticBoundary(userKey []byte) *base.InternalKV { - // We only emit synthetic boundaries when used by the merging iterator and - // the file has range deletions that must be observed by the merging - // iterator. If we're emitting a synthetic boundary but are not configured - // to surface range deletions or the file doesn't have range deletions, - // something is wrong. - if invariants.Enabled && !l.emitSyntheticBoundaries() { - panic(errors.AssertionFailedf("levelIter: emitting synthetic boundary unnecessarily; no range deletions")) - } - l.syntheticBoundary = base.InternalKV{ - K: base.MakeRangeDeleteSentinelKey(userKey), - V: base.LazyValue{}, - } - return &l.syntheticBoundary -} - func (l *levelIter) maybeTriggerCombinedIteration(file *fileMetadata, dir int) { // If we encounter a file that contains range keys, we may need to // trigger a switch to combined range-key and point-key iteration, @@ -518,8 +487,6 @@ const ( ) func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicator { - l.smallestBoundary = nil - l.largestBoundary = nil if l.iterFile == file { if l.err != nil { return noFileLoaded @@ -606,6 +573,30 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato if l.rangeDelIterPtr != nil { *l.rangeDelIterPtr = iters.rangeDeletion l.rangeDelIterCopy = iters.rangeDeletion + + // If this file has range deletions, interleave the bounds of the + // range deletions among the point keys. When used with a + // mergingIter, this ensures we don't move beyond a file with range + // deletions until its range deletions are no longer relevant. + // + // For now, we open a second range deletion iterator. Future work + // will avoid the need to open a second range deletion iterator, and + // avoid surfacing the file's range deletion iterator directly to + // the caller. + var itersForBounds iterSet + itersForBounds, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts, iterRangeDeletions) + l.interleaving.Init(l.comparer, l.iter, itersForBounds.RangeDeletion(), keyspan.InterleavingIterOpts{ + LowerBound: l.tableOpts.LowerBound, + UpperBound: l.tableOpts.UpperBound, + InterleaveEndKeys: true, + }) + if l.err != nil { + l.iter = nil + *l.rangeDelIterPtr = nil + l.err = errors.CombineErrors(l.err, iters.CloseAll()) + return noFileLoaded + } + l.iter = &l.interleaving } return newFileLoaded } @@ -619,12 +610,12 @@ func (l *levelIter) verify(kv *base.InternalKV) *base.InternalKV { // eligible for inlining. Do not change this to use a variable. if invariants.Enabled && !l.disableInvariants && kv != nil { // We allow returning a boundary key that is outside of the lower/upper - // bounds as such keys are always range tombstones which will be skipped by - // the Iterator. - if l.lower != nil && kv != l.smallestBoundary && l.cmp(kv.K.UserKey, l.lower) < 0 { + // bounds as such keys are always range tombstones which will be skipped + // by the Iterator. + if l.lower != nil && kv != nil && !kv.K.IsExclusiveSentinel() && l.cmp(kv.K.UserKey, l.lower) < 0 { l.logger.Fatalf("levelIter %s: lower bound violation: %s < %s\n%s", l.level, kv, l.lower, debug.Stack()) } - if l.upper != nil && kv != l.largestBoundary && l.cmp(kv.K.UserKey, l.upper) > 0 { + if l.upper != nil && kv != nil && !kv.K.IsExclusiveSentinel() && l.cmp(kv.K.UserKey, l.upper) > 0 { l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.level, kv, l.upper, debug.Stack()) } } @@ -642,7 +633,7 @@ func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV // IterOptions.LowerBound. loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) if loadFileIndicator == noFileLoaded { - l.exhaustedDir = +1 + l.exhaustedForward() return nil } if loadFileIndicator == newFileLoaded { @@ -668,7 +659,7 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba // IterOptions.LowerBound. loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) if loadFileIndicator == noFileLoaded { - l.exhaustedDir = +1 + l.exhaustedForward() return nil } if loadFileIndicator == newFileLoaded { @@ -682,22 +673,6 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba if err := l.iter.Error(); err != nil { return nil } - // When SeekPrefixGE returns nil, we have not necessarily reached the end of - // the sstable. All we know is that a key with prefix does not exist in the - // current sstable. We do know that the key lies within the bounds of the - // table as findFileGE found the table where key <= meta.Largest. We return - // the table's bound as a synthetic key. - if l.emitSyntheticBoundaries() { - if l.tableOpts.UpperBound != nil { - l.largestBoundary = l.makeSyntheticBoundary(l.tableOpts.UpperBound) - l.exhaustedDir = +1 - return l.verify(l.largestBoundary) - } - // Return the file's largest bound, ensuring this file stays open until - // the mergingIter advances beyond the file's bounds. - l.largestBoundary = l.makeSyntheticBoundary(l.iterFile.LargestPointKey.UserKey) - return l.verify(l.largestBoundary) - } // It is possible that we are here because bloom filter matching failed. In // that case it is likely that all keys matching the prefix are wholly // within the current file and cannot be in the subsequent file. In that @@ -708,7 +683,7 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba // likely that the next key will also be contained in the current file. n := l.split(l.iterFile.LargestPointKey.UserKey) if l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 { - l.exhaustedDir = +1 + l.exhaustedForward() return nil } return l.verify(l.skipEmptyFileForward()) @@ -725,7 +700,7 @@ func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV // NB: the top-level Iterator has already adjusted key based on // IterOptions.UpperBound. if l.loadFile(l.findFileLT(key, flags), -1) == noFileLoaded { - l.exhaustedDir = -1 + l.exhaustedBackward() return nil } if kv := l.iter.SeekLT(key, flags); kv != nil { @@ -745,7 +720,7 @@ func (l *levelIter) First() *base.InternalKV { // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is // set. if l.loadFile(l.files.First(), +1) == noFileLoaded { - l.exhaustedDir = +1 + l.exhaustedForward() return nil } if kv := l.iter.First(); kv != nil { @@ -765,7 +740,7 @@ func (l *levelIter) Last() *base.InternalKV { // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is // set. if l.loadFile(l.files.Last(), -1) == noFileLoaded { - l.exhaustedDir = -1 + l.exhaustedBackward() return nil } if kv := l.iter.Last(); kv != nil { @@ -784,37 +759,8 @@ func (l *levelIter) Next() *base.InternalKV { if l.err != nil || l.iter == nil { return nil } - - switch { - case l.largestBoundary != nil: - if l.tableOpts.UpperBound != nil { - // The UpperBound was within this file, so don't load the next - // file. We leave the largestBoundary unchanged so that subsequent - // calls to Next() stay at this file. If a Seek/First/Last call is - // made and this file continues to be relevant, loadFile() will - // set the largestBoundary to nil. - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil - } - l.exhaustedDir = +1 - return nil - } - // We're stepping past the boundary key, so now we can load the next file. - if l.loadFile(l.files.Next(), +1) != noFileLoaded { - if kv := l.iter.First(); kv != nil { - return l.verify(kv) - } - return l.verify(l.skipEmptyFileForward()) - } - l.exhaustedDir = +1 - return nil - - default: - // Reset the smallest boundary since we're moving away from it. - l.smallestBoundary = nil - if kv := l.iter.Next(); kv != nil { - return l.verify(kv) - } + if kv := l.iter.Next(); kv != nil { + return l.verify(kv) } return l.verify(l.skipEmptyFileForward()) } @@ -824,33 +770,16 @@ func (l *levelIter) NextPrefix(succKey []byte) *base.InternalKV { return nil } - switch { - case l.largestBoundary != nil: - if l.tableOpts.UpperBound != nil { - // The UpperBound was within this file, so don't load the next - // file. We leave the largestBoundary unchanged so that subsequent - // calls to Next() stay at this file. If a Seek/First/Last call is - // made and this file continues to be relevant, loadFile() will - // set the largestBoundary to nil. - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil - } - return nil - } - // We're stepping past the boundary key, so we need to load a later - // file. - - default: - // Reset the smallest boundary since we're moving away from it. - l.smallestBoundary = nil - - if kv := l.iter.NextPrefix(succKey); kv != nil { - return l.verify(kv) - } - if l.iter.Error() != nil { - return nil - } - // Fall through to seeking. + if kv := l.iter.NextPrefix(succKey); kv != nil { + return l.verify(kv) + } + if l.iter.Error() != nil { + return nil + } + if l.tableOpts.UpperBound != nil { + // The UpperBound was within this file, so don't load the next file. + l.exhaustedForward() + return nil } // Seek the manifest level iterator using TrySeekUsingNext=true and @@ -865,7 +794,7 @@ func (l *levelIter) NextPrefix(succKey []byte) *base.InternalKV { } return l.verify(l.skipEmptyFileForward()) } - l.exhaustedDir = +1 + l.exhaustedForward() return nil } @@ -879,105 +808,36 @@ func (l *levelIter) Prev() *base.InternalKV { if l.err != nil || l.iter == nil { return nil } - - switch { - case l.smallestBoundary != nil: - if l.tableOpts.LowerBound != nil { - // The LowerBound was within this file, so don't load the previous - // file. We leave the smallestBoundary unchanged so that - // subsequent calls to Prev() stay at this file. If a - // Seek/First/Last call is made and this file continues to be - // relevant, loadFile() will set the smallestBoundary to nil. - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil - } - l.exhaustedDir = -1 - return nil - } - // We're stepping past the boundary key, so now we can load the prev file. - if l.loadFile(l.files.Prev(), -1) != noFileLoaded { - if kv := l.iter.Last(); kv != nil { - return l.verify(kv) - } - return l.verify(l.skipEmptyFileBackward()) - } - l.exhaustedDir = -1 - return nil - - default: - // Reset the largest boundary since we're moving away from it. - l.largestBoundary = nil - if kv := l.iter.Prev(); kv != nil { - return l.verify(kv) - } + if kv := l.iter.Prev(); kv != nil { + return l.verify(kv) } return l.verify(l.skipEmptyFileBackward()) } func (l *levelIter) skipEmptyFileForward() *base.InternalKV { var kv *base.InternalKV - // The first iteration of this loop starts with an already exhausted - // l.iter. The reason for the exhaustion is either that we iterated to the - // end of the sstable, or our iteration was terminated early due to the - // presence of an upper-bound or the use of SeekPrefixGE. If - // l.rangeDelIterPtr is non-nil, we may need to pretend the iterator is - // not exhausted to allow for the merging to finish consuming the - // l.rangeDelIterPtr before levelIter switches the rangeDelIter from - // under it. This pretense is done by either generating a synthetic - // boundary key or returning the largest key of the file, depending on the - // exhaustion reason. - + // The first iteration of this loop starts with an already exhausted l.iter. + // The reason for the exhaustion is either that we iterated to the end of + // the sstable, or our iteration was terminated early due to the presence of + // an upper-bound or the use of SeekPrefixGE. + // // Subsequent iterations will examine consecutive files such that the first // file that does not have an exhausted iterator causes the code to return - // that key, else the behavior described above if there is a corresponding - // rangeDelIterPtr. + // that key. for ; kv == nil; kv = l.iter.First() { if l.iter.Error() != nil { return nil } - if l.rangeDelIterPtr != nil { - // We're being used as part of a mergingIter and we've exhausted the - // current sstable. If an upper bound is present and the upper bound - // lies within the current sstable, then we will have reached the - // upper bound rather than the end of the sstable. We need to return - // a synthetic boundary key so that mergingIter can use the range - // tombstone iterator until the other levels have reached this - // boundary. - // - // It is safe to set the boundary key to the UpperBound user key - // with the RANGEDEL sentinel since it is the smallest InternalKey - // that matches the exclusive upper bound, and does not represent a - // real key. - if l.tableOpts.UpperBound != nil { - l.exhaustedDir = +1 - if *l.rangeDelIterPtr != nil { - l.largestBoundary = l.makeSyntheticBoundary(l.tableOpts.UpperBound) - return l.largestBoundary - } - // Else there are no range deletions in this sstable. This - // helps with performance when many levels are populated with - // sstables and most don't have any actual keys within the - // bounds. - return nil - } - // If the caller is accessing range dels through l.rangeDelIterPtr, - // pause at an ignorable boundary key to avoid advancing to the next - // file until other levels are caught up. - // - // Note that even if the largest boundary is not a range deletion, - // there may still be range deletions beyond the last point key - // returned. When block-property filters are in use, the sstable - // iterator may have transparently skipped a tail of the point keys - // in the file. - if *l.rangeDelIterPtr != nil { - l.largestBoundary = l.makeSyntheticBoundary(l.iterFile.LargestPointKey.UserKey) - return l.largestBoundary - } + // If an upper bound is present and the upper bound lies within the + // current sstable, then we will have reached the upper bound rather + // than the end of the sstable. + if l.tableOpts.UpperBound != nil { + l.exhaustedForward() + return nil } - // Current file was exhausted. Move to the next file. if l.loadFile(l.files.Next(), +1) == noFileLoaded { - l.exhaustedDir = +1 + l.exhaustedForward() return nil } } @@ -989,65 +849,45 @@ func (l *levelIter) skipEmptyFileBackward() *base.InternalKV { // The first iteration of this loop starts with an already exhausted // l.iter. The reason for the exhaustion is either that we iterated to the // end of the sstable, or our iteration was terminated early due to the - // presence of a lower-bound. If l.rangeDelIterPtr is non-nil, we may need - // to pretend the iterator is not exhausted to allow for the merging to - // finish consuming the l.rangeDelIterPtr before levelIter switches the - // rangeDelIter from under it. This pretense is done by either generating - // a synthetic boundary key or returning the smallest key of the file, - // depending on the exhaustion reason. - + // presence of a lower-bound. + // // Subsequent iterations will examine consecutive files such that the first // file that does not have an exhausted iterator causes the code to return - // that key, else the behavior described above if there is a corresponding - // rangeDelIterPtr. + // that key. for ; kv == nil; kv = l.iter.Last() { if l.iter.Error() != nil { return nil } - if l.rangeDelIterPtr != nil { - // We're being used as part of a mergingIter and we've exhausted the - // current sstable. If a lower bound is present and the lower bound - // lies within the current sstable, then we will have reached the - // lower bound rather than the beginning of the sstable. We need to - // return a synthetic boundary key so that mergingIter can use the - // range tombstone iterator until the other levels have reached this - // boundary. - // - // It is safe to set the boundary key to the LowerBound user key - // with the RANGEDEL sentinel since it is the smallest InternalKey - // that is within the inclusive lower bound, and does not represent - // a real key. - if l.tableOpts.LowerBound != nil { - l.exhaustedDir = -1 - if *l.rangeDelIterPtr != nil { - l.smallestBoundary = l.makeSyntheticBoundary(l.tableOpts.LowerBound) - return l.smallestBoundary - } - // Else there are no range deletions in this sstable. This - // helps with performance when many levels are populated with - // sstables and most don't have any actual keys within the - // bounds. - return nil - } - // If the user is using the range deletion iterator, return a - // synthetic key with the smallest user key in the file. This file - // sorts before all other keys with the same user key, so - // monotonicity is maintained. - if *l.rangeDelIterPtr != nil { - l.smallestBoundary = l.makeSyntheticBoundary(l.iterFile.SmallestPointKey.UserKey) - return l.smallestBoundary - } + // If a lower bound is present and the lower bound lies within the + // current sstable, then we will have reached the lowerr bound rather + // than the end of the sstable. + if l.tableOpts.LowerBound != nil { + l.exhaustedBackward() + return nil } - // Current file was exhausted. Move to the previous file. if l.loadFile(l.files.Prev(), -1) == noFileLoaded { - l.exhaustedDir = -1 + l.exhaustedBackward() return nil } } return kv } +func (l *levelIter) exhaustedForward() { + l.exhaustedDir = +1 + if l.rangeDelIterPtr != nil { + *l.rangeDelIterPtr = nil + } +} + +func (l *levelIter) exhaustedBackward() { + l.exhaustedDir = -1 + if l.rangeDelIterPtr != nil { + *l.rangeDelIterPtr = nil + } +} + func (l *levelIter) Error() error { if l.err != nil || l.iter == nil { return l.err diff --git a/merging_iter.go b/merging_iter.go index 2e2792d8cd..e04df79c90 100644 --- a/merging_iter.go +++ b/merging_iter.go @@ -1199,36 +1199,55 @@ func (m *mergingIter) NextPrefix(succKey []byte) *base.InternalKV { // The heap root necessarily must be positioned at a key < succKey, because // NextPrefix was invoked. - root := &m.heap.items[0] - m.levelsPositioned[(*root).index] = true + root := m.heap.items[0] if invariants.Enabled && m.heap.cmp((*root).iterKV.K.UserKey, succKey) >= 0 { m.logger.Fatalf("pebble: invariant violation: NextPrefix(%q) called on merging iterator already positioned at %q", succKey, (*root).iterKV) } - if m.err = m.nextEntry(*root, succKey); m.err != nil { + // NB: root is the heap root before we call nextEntry; nextEntry may change + // the heap root, so we must not `root` to still be the root of the heap, or + // even to be in the heap if the level's iterator becomes exhausted. + if m.err = m.nextEntry(root, succKey); m.err != nil { return nil } - // NB: root is a pointer to the heap root. nextEntry may have changed - // the heap root, so we must not expect root to still point to the same - // level (or to even be valid, if the heap is now exhaused). + // We only consider the level to be conclusively positioned at the next + // prefix if our call to nextEntry did not advance the level onto a range + // deletion's boundary. Range deletions may have bounds within the prefix + // that are still surfaced by NextPrefix. + m.levelsPositioned[root.index] = root.iterKV == nil || !root.iterKV.K.IsExclusiveSentinel() for m.heap.len() > 0 { - if m.levelsPositioned[(*root).index] { + root := m.heap.items[0] + if m.levelsPositioned[root.index] { // A level we've previously positioned is at the top of the heap, so // there are no other levels positioned at keys < succKey. We've // advanced as far as we need to. break } + // If the current heap root is a sentinel key, we need to skip it. + // Calling NextPrefix while positioned at a sentinel key is not + // supported. + if root.iterKV.K.IsExclusiveSentinel() { + if m.err = m.nextEntry(root, nil); m.err != nil { + return nil + } + continue + } + // Since this level was not the original heap root when NextPrefix was // called, we don't know whether this level's current key has the // previous prefix or a new one. - if m.heap.cmp((*root).iterKV.K.UserKey, succKey) >= 0 { + if m.heap.cmp(root.iterKV.K.UserKey, succKey) >= 0 { break } - m.levelsPositioned[(*root).index] = true - if m.err = m.nextEntry(*root, succKey); m.err != nil { + if m.err = m.nextEntry(root, succKey); m.err != nil { return nil } + // We only consider the level to be conclusively positioned at the next + // prefix if our call to nextEntry did not land onto a range deletion's + // boundary. Range deletions may have bounds within the prefix that are + // still surfaced by NextPrefix. + m.levelsPositioned[root.index] = root.iterKV == nil || !root.iterKV.K.IsExclusiveSentinel() } return m.findNextEntry() } diff --git a/testdata/ingest b/testdata/ingest index 24302a6b0a..c27b75fc91 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -53,7 +53,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 569B Block cache: 6 entries (945B) hit rate: 30.8% -Table cache: 1 entries (760B) hit rate: 50.0% +Table cache: 1 entries (760B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 diff --git a/testdata/level_iter_boundaries b/testdata/level_iter_boundaries index 413090bd37..dec8d90c8e 100644 --- a/testdata/level_iter_boundaries +++ b/testdata/level_iter_boundaries @@ -7,11 +7,31 @@ b.RANGEDEL.2:d iter first next +next +next +next +next +next last prev +prev +prev +prev +prev +prev ---- +a#72057594037927935,RANGEDEL: +b#72057594037927935,RANGEDEL: +b#72057594037927935,RANGEDEL: +c#72057594037927935,RANGEDEL: +c#72057594037927935,RANGEDEL: d#72057594037927935,RANGEDEL: . +d#72057594037927935,RANGEDEL: +c#72057594037927935,RANGEDEL: +c#72057594037927935,RANGEDEL: +b#72057594037927935,RANGEDEL: +b#72057594037927935,RANGEDEL: a#72057594037927935,RANGEDEL: . @@ -21,10 +41,10 @@ seek-ge d seek-lt b prev ---- -d#72057594037927935,RANGEDEL: +c#72057594037927935,RANGEDEL: . +b#72057594037927935,RANGEDEL: a#72057594037927935,RANGEDEL: -. iter seek-prefix-ge c @@ -32,10 +52,10 @@ seek-prefix-ge d seek-lt b prev ---- -d#72057594037927935,RANGEDEL: +c#72057594037927935,RANGEDEL: . +b#72057594037927935,RANGEDEL: a#72057594037927935,RANGEDEL: -. iter seek-ge e @@ -77,8 +97,10 @@ first next next next +next ---- a#1,SET:a +b#72057594037927935,RANGEDEL: c#72057594037927935,RANGEDEL: c#3,SET:c . @@ -88,8 +110,10 @@ last prev prev prev +prev ---- c#3,SET:c +c#72057594037927935,RANGEDEL: b#72057594037927935,RANGEDEL: a#1,SET:a . @@ -107,17 +131,23 @@ iter first next next +next ---- a#1,SET:b +b#72057594037927935,RANGEDEL: c#72057594037927935,RANGEDEL: . iter last prev +prev +prev ---- +c#72057594037927935,RANGEDEL: +b#72057594037927935,RANGEDEL: a#1,SET:b -a#72057594037927935,RANGEDEL: +. clear ---- @@ -132,17 +162,21 @@ iter first next next +next ---- +a#72057594037927935,RANGEDEL: +b#72057594037927935,RANGEDEL: c#2,SET:c -c#72057594037927935,RANGEDEL: . iter last prev prev +prev ---- c#2,SET:c +b#72057594037927935,RANGEDEL: a#72057594037927935,RANGEDEL: . @@ -169,15 +203,15 @@ seek-ge d prev seek-lt e ---- +d#72057594037927935,RANGEDEL: d#3,SET:d -e#72057594037927935,RANGEDEL: +d#72057594037927935,RANGEDEL: d#3,SET:d e#72057594037927935,RANGEDEL: d#3,SET:d +d#72057594037927935,RANGEDEL: c#72057594037927935,RANGEDEL: -d#3,SET:d -c#72057594037927935,RANGEDEL: -d#3,SET:d +e#72057594037927935,RANGEDEL: # Regression test to check that Seek{GE,LT}, First, and Last do not # have iteration bounds affected by SeekPrefixGE. Previously, @@ -343,10 +377,8 @@ iter save continue set-bounds lower=e upper=f seek-ge e next -next ---- e#5,SET:z -f#72057594037927935,RANGEDEL: . file-pos @@ -356,10 +388,8 @@ file 000001 [loaded] iter save continue seek-lt f prev -prev ---- e#5,SET:z -e#72057594037927935,RANGEDEL: . file-pos @@ -370,10 +400,8 @@ iter save continue set-bounds lower=f upper=g seek-lt g prev -prev ---- f#6,SET:z -f#72057594037927935,RANGEDEL: . file-pos diff --git a/testdata/level_iter_seek b/testdata/level_iter_seek index 83a3ae399a..74a4691e03 100644 --- a/testdata/level_iter_seek +++ b/testdata/level_iter_seek @@ -33,14 +33,18 @@ h.SET.3:h iter seek-ge d +next +next ---- -f#5,SET:f / d-e:{(#6,RANGEDEL)} +d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +e#72057594037927935,RANGEDEL: +f#5,SET:f iter set-bounds upper=d seek-ge d ---- -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +. iter set-bounds upper=d @@ -48,28 +52,29 @@ seek-ge c next prev next -next ---- c#7,SET:c / d-e:{(#6,RANGEDEL)} -d#72057594037927935,RANGEDEL: +. c#7,SET:c -d#72057594037927935,RANGEDEL: . # There is no point key with d, but since there is a rangedel, levelIter returns # the boundary key using the largest key, f, in the file. iter seek-prefix-ge d +next ---- -f#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +d\x00#72057594037927935,RANGEDEL: # Tests a sequence of SeekPrefixGE with monotonically increasing keys, some of # which are present and some not (so fail the bloom filter match). The seek to -# cc returns a boundary key. +# cc returns a range deletion tombstone's start key. iter seek-prefix-ge aa seek-prefix-ge c seek-prefix-ge cc +next seek-prefix-ge f seek-prefix-ge g seek-prefix-ge gg @@ -77,7 +82,8 @@ seek-prefix-ge h ---- . c#7,SET:c / d-e:{(#6,RANGEDEL)} -f#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +cc\x00#72057594037927935,RANGEDEL: f#5,SET:f g#4,SET:g . @@ -107,6 +113,8 @@ next stats next stats +next +stats reset-stats stats ---- @@ -117,9 +125,11 @@ b#8,SET:b {BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} c#7,SET:c {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} -f#5,SET:f +d#72057594037927935,RANGEDEL: {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} -f#72057594037927935,RANGEDEL: +e#72057594037927935,RANGEDEL: +{BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} +f#5,SET:f {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} g#4,SET:g {BlockBytes:112 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} @@ -133,7 +143,7 @@ iter set-bounds lower=d seek-lt d ---- -d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +. iter set-bounds lower=d @@ -142,15 +152,17 @@ prev next prev prev +prev ---- f#5,SET:f / d-e:{(#6,RANGEDEL)} -d#72057594037927935,RANGEDEL: +e#72057594037927935,RANGEDEL: f#5,SET:f +e#72057594037927935,RANGEDEL: d#72057594037927935,RANGEDEL: . # Verify that First() in the presence of an upper-bound pauses at the -# table containing the upper-bound. +# first range deletion's start key. clear ---- @@ -165,10 +177,10 @@ iter set-bounds upper=f first ---- -f#72057594037927935,RANGEDEL: +d#72057594037927935,RANGEDEL: # Verify that Last() in the presence of a lower-bound pauses at the -# table containing the lower-bound. +# last range deletion's end key. clear ---- @@ -182,7 +194,11 @@ d.RANGEDEL.6:e iter set-bounds lower=c last +prev +prev ---- +e#72057594037927935,RANGEDEL: +d#72057594037927935,RANGEDEL: c#7,SET:c # Verify that a seek to a file with range tombstones as boundaries pauses on @@ -210,9 +226,11 @@ seek-ge d prev next next +next ---- -e#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} +d#72057594037927935,RANGEDEL: / d-e:{(#6,RANGEDEL)} c#7,SET:c +d#72057594037927935,RANGEDEL: e#72057594037927935,RANGEDEL: f#8,SET:f @@ -221,14 +239,16 @@ seek-lt b next prev prev +prev ---- -a#72057594037927935,RANGEDEL: / a-b:{(#5,RANGEDEL)} +b#72057594037927935,RANGEDEL: / a-b:{(#5,RANGEDEL)} c#7,SET:c +b#72057594037927935,RANGEDEL: a#72057594037927935,RANGEDEL: . -# Verify that prev when positioned at the largest boundary returns the -# last key. +# Verify that prev when positioned at the start bound of the first range +# deletion returns the last key. clear ---- @@ -246,11 +266,11 @@ seek-ge d prev ---- b#1,SET:b -e#72057594037927935,RANGEDEL: / d-e:{(#2,RANGEDEL)} +d#72057594037927935,RANGEDEL: / d-e:{(#2,RANGEDEL)} b#1,SET:b -# Verify that next when positioned at the smallest boundary returns -# the first key. +# Verify that next when positioned at the start boundary of the first range +# deletion returns the first key. clear ---- @@ -268,7 +288,7 @@ seek-lt d next ---- d#2,SET:d -a#72057594037927935,RANGEDEL: / a-b:{(#1,RANGEDEL)} +b#72057594037927935,RANGEDEL: / a-b:{(#1,RANGEDEL)} d#2,SET:d # Verify SeekPrefixGE correctness with trySeekUsingNext=true @@ -325,9 +345,9 @@ seek-prefix-ge j true a#1,SET:a / c-e:{(#4,RANGEDEL)} a#1,SET:a / c-e:{(#4,RANGEDEL)} b#2,SET:b / c-e:{(#4,RANGEDEL)} -e#72057594037927935,RANGEDEL: -e#72057594037927935,RANGEDEL: / c-e:{(#4,RANGEDEL)} -e#72057594037927935,RANGEDEL: / c-e:{(#4,RANGEDEL)} +c#72057594037927935,RANGEDEL: +c#72057594037927935,RANGEDEL: / c-e:{(#4,RANGEDEL)} +d#72057594037927935,RANGEDEL: / c-e:{(#4,RANGEDEL)} f#5,SINGLEDEL: g#6,SET:g h#7,SINGLEDEL: @@ -384,14 +404,14 @@ j.SET.3:j iter seek-ge f next -next ---- f#5,SET:f -f#72057594037927935,RANGEDEL: i#4,SET:i -# The below count should be 2, as we skip over the rangekey-only file. +# The below count should be 4, as we skip over the rangekey-only file. +# TODO(jackson): When we stop opening range deletion iterators twice, this +# should be 2. iters-created ---- -2 +4 diff --git a/testdata/merging_iter b/testdata/merging_iter index c24465e975..3595ac6232 100644 --- a/testdata/merging_iter +++ b/testdata/merging_iter @@ -58,11 +58,16 @@ seek-ge d next ---- # 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} +# 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} +# 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} # 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} -# 000000.rangeDelIter.opSpanSeekGE("e") = nil +# 000000.rangeDelIter.opSpanSeekGE("d") = a-e:{(#8,RANGEDEL)} +# 000000.rangeDelIter.opSpanNext() = nil +# 000000.rangeDelIter.opSpanClose() = nil # 000000.rangeDelIter.opSpanClose() = nil # 000001.rangeDelIter.opSpanSeekGE("e") = e-g:{(#8,RANGEDEL)} e#10,SET:10 +# 000001.rangeDelIter.opSpanNext() = nil # 000001.rangeDelIter.opSpanSeekGE("g") = nil err=injected error @@ -820,16 +825,16 @@ seek-lt coo seek-prefix-ge b ---- # iter.First() = (a#30,SET,"30") -# rangedelIter.opSpanSeekGE("a") = nil +# rangedelIter.opSpanFirst() = nil err=injected error # iter.Last() = (f#21,SET,"21") -# rangedelIter.opSpanSeekGE("f") = nil +# rangedelIter.opSpanLast() = nil err=injected error # iter.SeekGE("boo") = (c#18,SET,"18") # rangedelIter.opSpanSeekGE("boo") = nil err=injected error # iter.SeekLT("coo") = (c#18,SET,"18") -# rangedelIter.opSpanSeekGE("coo") = nil +# rangedelIter.opSpanSeekLT("coo") = nil err=injected error # iter.SeekPrefixGE("b") = (b#19,SET,"19") # rangedelIter.opSpanSeekGE("b") = nil diff --git a/testdata/metrics b/testdata/metrics index 92161350d5..914700667f 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -73,7 +73,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 589B Block cache: 3 entries (484B) hit rate: 0.0% -Table cache: 1 entries (760B) hit rate: 0.0% +Table cache: 1 entries (760B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -128,7 +128,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 5 entries (946B) hit rate: 33.3% -Table cache: 2 entries (1.5KB) hit rate: 66.7% +Table cache: 2 entries (1.5KB) hit rate: 75.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 2 @@ -170,7 +170,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 5 entries (946B) hit rate: 33.3% -Table cache: 2 entries (1.5KB) hit rate: 66.7% +Table cache: 2 entries (1.5KB) hit rate: 75.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 2 @@ -209,7 +209,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 3 entries (484B) hit rate: 33.3% -Table cache: 1 entries (760B) hit rate: 66.7% +Table cache: 1 entries (760B) hit rate: 75.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -252,7 +252,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 595B Block cache: 0 entries (0B) hit rate: 33.3% -Table cache: 0 entries (0B) hit rate: 66.7% +Table cache: 0 entries (0B) hit rate: 75.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -322,7 +322,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 2.6KB Block cache: 0 entries (0B) hit rate: 33.3% -Table cache: 0 entries (0B) hit rate: 66.7% +Table cache: 0 entries (0B) hit rate: 75.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -376,7 +376,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 2.0KB Block cache: 0 entries (0B) hit rate: 14.3% -Table cache: 0 entries (0B) hit rate: 58.3% +Table cache: 0 entries (0B) hit rate: 64.3% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -479,7 +479,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 4.3KB Block cache: 12 entries (1.9KB) hit rate: 16.7% -Table cache: 1 entries (760B) hit rate: 60.0% +Table cache: 1 entries (760B) hit rate: 64.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -541,7 +541,7 @@ Backing tables: 0 (0B) Virtual tables: 0 (0B) Local tables size: 6.1KB Block cache: 12 entries (1.9KB) hit rate: 16.7% -Table cache: 1 entries (760B) hit rate: 60.0% +Table cache: 1 entries (760B) hit rate: 64.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0