From 906511708065db751a456a1d6e09999eb3356bcd Mon Sep 17 00:00:00 2001
From: sumeerbhola
Date: Thu, 18 Mar 2021 20:46:38 -0400
Subject: [PATCH] db: optimize levelIter for non-matching bloom filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When SeekPrefixGE on the underlying file returns false due to a bloom
filter non-match, levelIter would skip to the next file. This is
wasteful if the upper bound of the file is beyond the prefix.
Additionally, it defeats the optimization for sparse key spaces like
CockroachDB's lock table, where we try to reuse the current position of
the iterator -- by skipping to the next file the subsequent
SeekPrefixGE will again need to reload the previous file. This behavior
was first noticed when diagnosing tpcc slowness in CockroachDB, where
almost half the overhead of seeking in the lock table could be
attributed to this (see
https://github.com/cockroachdb/cockroach/issues/62078 for details).
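
The core of the fix is a new check in levelIter.SeekPrefixGE (see the
level_iter.go hunk below): before skipping to the next file, compare the
sought prefix against the prefix of the current file's largest key. A
condensed sketch of that comparison -- the function name and parameter
list here are illustrative, not part of the patch:

    // prefixFitsInCurrentFile reports whether all keys with the given
    // prefix must lie within the current file, i.e. the prefix sorts
    // strictly before the prefix of the file's largest user key. cmp
    // and split stand in for levelIter's l.cmp and l.split fields.
    func prefixFitsInCurrentFile(
        cmp func(a, b []byte) int, split func([]byte) int,
        prefix, largestUserKey []byte,
    ) bool {
        n := split(largestUserKey)
        return cmp(prefix, largestUserKey[:n]) < 0
    }

When this holds and the bloom filter did not match, SeekPrefixGE can
return "not found" immediately instead of loading and seeking the next
file, which keeps the iterator positioned in the current file for the
subsequent seek that uses trySeekUsingNext=true.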

The benchmark numbers for bloom=true/with-tombstone=false are the ones
intended to benefit from this change.

name                                                                        old time/op    new time/op    delta
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=false/with-tombstone=false-16   441ns ± 9%     445ns ± 7%       ~     (p=0.332 n=19+20)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=false/with-tombstone=true-16    299ns ± 3%     300ns ± 3%       ~     (p=0.455 n=20+20)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=true/with-tombstone=false-16   3.73µs ± 8%    0.82µs ± 2%    -78.02%  (p=0.000 n=20+16)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=true/with-tombstone=true-16    1.78µs ±73%    1.21µs ± 7%    -32.15%  (p=0.000 n=20+20)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=false/with-tombstone=false-16   484ns ±27%     427ns ± 2%    -11.83%  (p=0.000 n=19+19)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=false/with-tombstone=true-16    320ns ± 7%     300ns ± 3%     -6.11%  (p=0.000 n=16+19)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=true/with-tombstone=false-16   5.07µs ±41%    0.82µs ± 2%    -83.84%  (p=0.000 n=20+18)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=true/with-tombstone=true-16    1.76µs ±37%    1.21µs ± 9%    -30.92%  (p=0.000 n=20+20)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=false/with-tombstone=false-16   439ns ± 4%     436ns ± 6%       ~     (p=0.109 n=20+20)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=false/with-tombstone=true-16    435ns ±29%     307ns ± 5%    -29.40%  (p=0.000 n=20+19)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=true/with-tombstone=false-16   5.63µs ±19%    0.82µs ± 2%    -85.40%  (p=0.000 n=20+19)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=true/with-tombstone=true-16    1.87µs ±36%    1.24µs ± 8%    -33.66%  (p=0.000 n=20+20)

name                                                                        old alloc/op   new alloc/op   delta
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=false/with-tombstone=false-16   0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=false/with-tombstone=true-16    0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=true/with-tombstone=false-16    0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=true/with-tombstone=true-16      271B ± 0%      271B ± 0%       ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=false/with-tombstone=false-16   0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=false/with-tombstone=true-16    0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=true/with-tombstone=false-16    0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=true/with-tombstone=true-16      271B ± 0%      271B ± 0%       ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=false/with-tombstone=false-16   0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=false/with-tombstone=true-16    0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=true/with-tombstone=false-16    0.00B          0.00B            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=true/with-tombstone=true-16      271B ± 0%      271B ± 0%       ~     (all equal)

name                                                                        old allocs/op  new allocs/op  delta
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=false/with-tombstone=false-16    0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=false/with-tombstone=true-16     0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=true/with-tombstone=false-16     0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=1/bloom=true/with-tombstone=true-16      1.00 ± 0%      1.00 ± 0%       ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=false/with-tombstone=false-16    0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=false/with-tombstone=true-16     0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=true/with-tombstone=false-16     0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=2/bloom=true/with-tombstone=true-16      1.00 ± 0%      1.00 ± 0%       ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=false/with-tombstone=false-16    0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=false/with-tombstone=true-16     0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=true/with-tombstone=false-16     0.00           0.00            ~     (all equal)
IteratorSeqSeekPrefixGENotFound/skip=4/bloom=true/with-tombstone=true-16      1.00 ± 0%      1.00 ± 0%       ~     (all equal)
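
For reference, newLevelIter and levelIter.init now take a Split function
immediately after the comparator, and a nil split is permissible when the
caller never calls SeekPrefixGE. A sketch of the new call shape, mirroring
the test updates below (IterOptions{}, DefaultComparer, newIters, metas,
and level are stand-ins taken from the surrounding test code):

    l := newLevelIter(IterOptions{}, DefaultComparer.Compare,
        func(a []byte) int { return len(a) }, // trivial Split: the whole key is the prefix
        newIters, metas.Iter(), manifest.Level(level), nil /* bytesIterated */)
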
---
 compaction.go            |   4 +-
 db.go                    |   2 +-
 get_iter.go              |   5 +-
 ingest.go                |   3 +-
 iterator_test.go         | 119 +++++++++++++++++++++++----------------
 level_checker.go         |   5 +-
 level_iter.go            |  19 ++++++-
 level_iter_test.go       |  31 +++++-----
 merging_iter_test.go     |  27 +++++++--
 testdata/level_iter_seek |  22 ++++++++
 10 files changed, 158 insertions(+), 79 deletions(-)

diff --git a/compaction.go b/compaction.go
index f37e889bc1..dcd238dce5 100644
--- a/compaction.go
+++ b/compaction.go
@@ -1028,8 +1028,8 @@ func (c *compaction) newInputIter(newIters tableNewIters) (_ internalIterator, retErr error) {
 
 	iterOpts := IterOptions{logger: c.logger}
 	addItersForLevel := func(iters []internalIterator, level *compactionLevel) ([]internalIterator, error) {
-		iters = append(iters, newLevelIter(iterOpts, c.cmp, newIters, level.files.Iter(),
-			manifest.Level(level.level), &c.bytesIterated))
+		iters = append(iters, newLevelIter(iterOpts, c.cmp, nil /* split */, newIters,
+			level.files.Iter(), manifest.Level(level.level), &c.bytesIterated))
 		// Add the range deletion iterator for each file as an independent level
 		// in mergingIter, as opposed to making a levelIter out of those. This
 		// is safer as levelIter expects all keys coming from underlying
diff --git a/db.go b/db.go
index 5561e54348..218df6c585 100644
--- a/db.go
+++ b/db.go
@@ -793,7 +793,7 @@ func finishInitializingIter(buf *iterAlloc) *Iterator {
 			li = &levelIter{}
 		}
 
-		li.init(dbi.opts, dbi.cmp, dbi.newIters, files, level, nil)
+		li.init(dbi.opts, dbi.cmp, dbi.split, dbi.newIters, files, level, nil)
 		li.initRangeDel(&mlevels[0].rangeDelIter)
 		li.initSmallestLargestUserKey(&mlevels[0].smallestUserKey,
 			&mlevels[0].largestUserKey, &mlevels[0].isLargestUserKeyRangeDelSentinel)
diff --git a/get_iter.go b/get_iter.go
index 87b241bea1..eaec1baa50 100644
--- a/get_iter.go
+++ b/get_iter.go
@@ -145,7 +145,8 @@ func (g *getIter) Next() (*InternalKey, []byte) {
 			files := g.l0[n-1].Iter()
 			g.l0 = g.l0[:n-1]
 			iterOpts := IterOptions{logger: g.logger}
-			g.levelIter.init(iterOpts, g.cmp, g.newIters, files, manifest.L0Sublevel(n), nil)
+			g.levelIter.init(iterOpts, g.cmp, nil /* split */, g.newIters,
+				files, manifest.L0Sublevel(n), nil)
 			g.levelIter.initRangeDel(&g.rangeDelIter)
 			g.iter = &g.levelIter
 			g.iterKey, g.iterValue = g.iter.SeekGE(g.key)
@@ -163,7 +164,7 @@ func (g *getIter) Next() (*InternalKey, []byte) {
 			}
 
 			iterOpts := IterOptions{logger: g.logger}
-			g.levelIter.init(iterOpts, g.cmp, g.newIters,
+			g.levelIter.init(iterOpts, g.cmp, nil /* split */, g.newIters,
 				g.version.Levels[g.level].Iter(), manifest.Level(g.level), nil)
 			g.levelIter.initRangeDel(&g.rangeDelIter)
 			g.level++
diff --git a/ingest.go b/ingest.go
index a0bff27654..fe219d7ee9 100644
--- a/ingest.go
+++ b/ingest.go
@@ -417,7 +417,8 @@ func ingestTargetLevel(
 
 	level := baseLevel
 	for ; level < numLevels; level++ {
-		levelIter := newLevelIter(iterOps, cmp, newIters, v.Levels[level].Iter(), manifest.Level(level), nil)
+		levelIter := newLevelIter(iterOps, cmp, nil /* split */, newIters,
+			v.Levels[level].Iter(), manifest.Level(level), nil)
 		var rangeDelIter internalIterator
 		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE sets it up for the target file.
 		levelIter.initRangeDel(&rangeDelIter)
diff --git a/iterator_test.go b/iterator_test.go
index e37acaff17..797ac38f5d 100644
--- a/iterator_test.go
+++ b/iterator_test.go
@@ -16,6 +16,7 @@ import (
 	"github.com/cockroachdb/errors"
 	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/cockroachdb/pebble/internal/datadriven"
+	"github.com/cockroachdb/pebble/internal/manifest"
 	"github.com/cockroachdb/pebble/sstable"
 	"github.com/cockroachdb/pebble/vfs"
 	"github.com/stretchr/testify/require"
@@ -1003,10 +1004,31 @@ func BenchmarkIteratorSeqSeekPrefixGENotFound(b *testing.B) {
 	const restartInterval = 16
 	const levelCount = 5
 	const keyOffset = 100000
-	readers, levelSlices, _ := buildLevelsForMergingIterSeqSeek(
-		b, blockSize, restartInterval, levelCount, keyOffset, false)
-	readersWithTombstone, levelSlicesWithTombstone, _ := buildLevelsForMergingIterSeqSeek(
-		b, blockSize, restartInterval, 1, keyOffset, true)
+
+	var readers [4][][]*sstable.Reader
+	var levelSlices [4][]manifest.LevelSlice
+	indexFunc := func(bloom bool, withTombstone bool) int {
+		index := 0
+		if bloom {
+			index = 2
+		}
+		if withTombstone {
+			index++
+		}
+		return index
+	}
+	for _, bloom := range []bool{false, true} {
+		for _, withTombstone := range []bool{false, true} {
+			index := indexFunc(bloom, withTombstone)
+			levels := levelCount
+			if withTombstone {
+				levels = 1
+			}
+			readers[index], levelSlices[index], _ = buildLevelsForMergingIterSeqSeek(
+				b, blockSize, restartInterval, levels, keyOffset, withTombstone, bloom)
+
+		}
+	}
 	// We will not be seeking to the keys that were written but instead to
 	// keys before the written keys. This is to validate that the optimization
 	// to use Next still functions when mergingIter checks for the prefix
@@ -1018,53 +1040,52 @@ func BenchmarkIteratorSeqSeekPrefixGENotFound(b *testing.B) {
 		keys = append(keys, []byte(fmt.Sprintf("%08d", i)))
 	}
 	for _, skip := range []int{1, 2, 4} {
-		for _, withTombstone := range []bool{false, true} {
-			b.Run(fmt.Sprintf("skip=%d/with-tombstone=%t", skip, withTombstone),
-				func(b *testing.B) {
-					readers := readers
-					levelSlices := levelSlices
-					if withTombstone {
-						readers = readersWithTombstone
-						levelSlices = levelSlicesWithTombstone
-					}
-					m := buildMergingIter(readers, levelSlices)
-					iter := Iterator{
-						cmp:   DefaultComparer.Compare,
-						equal: DefaultComparer.Equal,
-						split: func(a []byte) int { return len(a) },
-						merge: DefaultMerger.Merge,
-						iter:  m,
-					}
-					pos := 0
-					b.ResetTimer()
-					for i := 0; i < b.N; i++ {
-						// When withTombstone=true, and prior to the
-						// optimization to stop early due to a range
-						// tombstone, the iteration would continue into the
-						// next file, and not be able to use Next at the lower
-						// level in the next SeekPrefixGE call. So we would
-						// incur the cost of iterating over all the deleted
-						// keys for every seek. Note that it is not possible
-						// to do a noop optimization in Iterator for the
-						// prefix case, unlike SeekGE/SeekLT, since we don't
-						// know if the iterators inside mergingIter are all
-						// appropriately positioned -- some may not be due to
-						// bloom filters not matching.
-						valid := iter.SeekPrefixGE(keys[pos])
-						if valid {
-							b.Fatalf("key should not be found")
+		for _, bloom := range []bool{false, true} {
+			for _, withTombstone := range []bool{false, true} {
+				b.Run(fmt.Sprintf("skip=%d/bloom=%t/with-tombstone=%t", skip, bloom, withTombstone),
+					func(b *testing.B) {
+						index := indexFunc(bloom, withTombstone)
+						readers := readers[index]
+						levelSlices := levelSlices[index]
+						m := buildMergingIter(readers, levelSlices)
+						iter := Iterator{
+							cmp:   DefaultComparer.Compare,
+							equal: DefaultComparer.Equal,
+							split: func(a []byte) int { return len(a) },
+							merge: DefaultMerger.Merge,
+							iter:  m,
 						}
-						pos += skip
-						if pos >= keyOffset {
-							pos = 0
+						pos := 0
+						b.ResetTimer()
+						for i := 0; i < b.N; i++ {
+							// When withTombstone=true, and prior to the
+							// optimization to stop early due to a range
+							// tombstone, the iteration would continue into the
+							// next file, and not be able to use Next at the lower
+							// level in the next SeekPrefixGE call. So we would
+							// incur the cost of iterating over all the deleted
+							// keys for every seek. Note that it is not possible
+							// to do a noop optimization in Iterator for the
+							// prefix case, unlike SeekGE/SeekLT, since we don't
+							// know if the iterators inside mergingIter are all
+							// appropriately positioned -- some may not be due to
+							// bloom filters not matching.
+							valid := iter.SeekPrefixGE(keys[pos])
+							if valid {
+								b.Fatalf("key should not be found")
+							}
+							pos += skip
+							if pos >= keyOffset {
+								pos = 0
+							}
 						}
-					}
-					b.StopTimer()
-					iter.Close()
-				})
+						b.StopTimer()
+						iter.Close()
+					})
+			}
 		}
 	}
-	for _, r := range [][][]*sstable.Reader{readers, readersWithTombstone} {
+	for _, r := range readers {
 		for i := range r {
 			for j := range r[i] {
 				r[i][j].Close()
@@ -1081,7 +1102,7 @@ func BenchmarkIteratorSeqSeekGEWithBounds(b *testing.B) {
 	const restartInterval = 16
 	const levelCount = 5
 	readers, levelSlices, keys := buildLevelsForMergingIterSeqSeek(
-		b, blockSize, restartInterval, levelCount, 0 /* keyOffset */, false)
+		b, blockSize, restartInterval, levelCount, 0 /* keyOffset */, false, false)
 	m := buildMergingIter(readers, levelSlices)
 	iter := Iterator{
 		cmp:   DefaultComparer.Compare,
@@ -1118,7 +1139,7 @@ func BenchmarkIteratorSeekGENoop(b *testing.B) {
 	const levelCount = 5
 	const keyOffset = 10000
 	readers, levelSlices, _ := buildLevelsForMergingIterSeqSeek(
-		b, blockSize, restartInterval, levelCount, keyOffset, false)
+		b, blockSize, restartInterval, levelCount, keyOffset, false, false)
 	var keys [][]byte
 	for i := 0; i < keyOffset; i++ {
 		keys = append(keys, []byte(fmt.Sprintf("%08d", i)))
diff --git a/level_checker.go b/level_checker.go
index 6a4cae2eb4..24cb34ae1c 100644
--- a/level_checker.go
+++ b/level_checker.go
@@ -636,7 +636,7 @@ func checkLevelsInternal(c *checkConfig) (err error) {
 		manifestIter := current.L0Sublevels.Levels[sublevel].Iter()
 		iterOpts := IterOptions{logger: c.logger}
 		li := &levelIter{}
-		li.init(iterOpts, c.cmp, c.newIters, manifestIter,
+		li.init(iterOpts, c.cmp, nil /* split */, c.newIters, manifestIter,
 			manifest.L0Sublevel(sublevel), nil)
 		li.initRangeDel(&mlevelAlloc[0].rangeDelIter)
 		li.initSmallestLargestUserKey(&mlevelAlloc[0].smallestUserKey, nil, nil)
@@ -650,7 +650,8 @@ func checkLevelsInternal(c *checkConfig) (err error) {
 
 		iterOpts := IterOptions{logger: c.logger}
 		li := &levelIter{}
-		li.init(iterOpts, c.cmp, c.newIters, current.Levels[level].Iter(), manifest.Level(level), nil)
+		li.init(iterOpts, c.cmp, nil /* split */, c.newIters,
+			current.Levels[level].Iter(), manifest.Level(level), nil)
 		li.initRangeDel(&mlevelAlloc[0].rangeDelIter)
 		li.initSmallestLargestUserKey(&mlevelAlloc[0].smallestUserKey, nil, nil)
 		mlevelAlloc[0].iter = li
diff --git a/level_iter.go b/level_iter.go
index cc599b42e5..35e8814ab9 100644
--- a/level_iter.go
+++ b/level_iter.go
@@ -45,6 +45,7 @@ type tableNewIters func(
 type levelIter struct {
 	logger Logger
 	cmp    Compare
+	split  Split
 	// The lower/upper bounds for iteration as specified at creation or the most
 	// recent call to SetBounds.
 	lower []byte
@@ -153,22 +154,26 @@ type levelIter struct {
 // levelIter implements the base.InternalIterator interface.
 var _ base.InternalIterator = (*levelIter)(nil)
 
+// newLevelIter returns a levelIter. It is permissible to pass a nil split
+// parameter if the caller is never going to call SeekPrefixGE.
 func newLevelIter(
 	opts IterOptions,
 	cmp Compare,
+	split Split,
 	newIters tableNewIters,
 	files manifest.LevelIterator,
 	level manifest.Level,
 	bytesIterated *uint64,
 ) *levelIter {
 	l := &levelIter{}
-	l.init(opts, cmp, newIters, files, level, bytesIterated)
+	l.init(opts, cmp, split, newIters, files, level, bytesIterated)
 	return l
 }
 
 func (l *levelIter) init(
 	opts IterOptions,
 	cmp Compare,
+	split Split,
 	newIters tableNewIters,
 	files manifest.LevelIterator,
 	level manifest.Level,
@@ -181,6 +186,7 @@ func (l *levelIter) init(
 	l.upper = opts.UpperBound
 	l.tableOpts.TableFilter = opts.TableFilter
 	l.cmp = cmp
+	l.split = split
 	l.iterFile = nil
 	l.newIters = newIters
 	l.files = files
@@ -435,6 +441,17 @@ func (l *levelIter) SeekPrefixGE(
 		}
 		return l.verify(l.largestBoundary, nil)
 	}
+	// It is possible that we are here because bloom filter matching failed.
+	// In that case it is likely that all keys matching the prefix are wholly
+	// within the current file and cannot be in the subsequent file. In that
+	// case we don't want to go to the next file, since loading and seeking in
+	// there has some cost. Additionally, for sparse key spaces, loading the
+	// next file will defeat the optimization for the next SeekPrefixGE that
+	// is called with trySeekUsingNext=true, since for sparse key spaces it is
+	// likely that the next key will also be contained in the current file.
+	if n := l.split(l.iterFile.Largest.UserKey); l.cmp(prefix, l.iterFile.Largest.UserKey[:n]) < 0 {
+		return nil, nil
+	}
 	return l.verify(l.skipEmptyFileForward())
 }
 
diff --git a/level_iter_test.go b/level_iter_test.go
index 75d7f05110..c9110feab2 100644
--- a/level_iter_test.go
+++ b/level_iter_test.go
@@ -81,7 +81,8 @@ func TestLevelIter(t *testing.T) {
 			}
 
 			iter := newLevelIter(opts, DefaultComparer.Compare,
-				newIters, files.Iter(), manifest.Level(level), nil)
+				func(a []byte) int { return len(a) }, newIters, files.Iter(), manifest.Level(level),
+				nil)
 			defer iter.Close()
 			// Fake up the range deletion initialization.
 			iter.initRangeDel(new(internalIterator))
@@ -122,7 +123,8 @@ func TestLevelIter(t *testing.T) {
 			}
 
 			iter := newLevelIter(opts, DefaultComparer.Compare,
-				newIters2, files.Iter(), manifest.Level(level), nil)
+				func(a []byte) int { return len(a) }, newIters2, files.Iter(),
+				manifest.Level(level), nil)
 			iter.SeekGE([]byte(key))
 			lower, upper := tableOpts.GetLowerBound(), tableOpts.GetUpperBound()
 			return fmt.Sprintf("[%s,%s]\n", lower, upper)
@@ -283,8 +285,9 @@ func TestLevelIterBoundaries(t *testing.T) {
 			}
 			if iter == nil {
 				slice := manifest.NewLevelSliceKeySorted(lt.cmp.Compare, lt.metas)
-				iter = newLevelIter(IterOptions{}, DefaultComparer.Compare, lt.newIters,
-					slice.Iter(), manifest.Level(level), nil)
+				iter = newLevelIter(IterOptions{}, DefaultComparer.Compare,
+					func(a []byte) int { return len(a) }, lt.newIters, slice.Iter(),
+					manifest.Level(level), nil)
 				// Fake up the range deletion initialization.
 				iter.initRangeDel(new(internalIterator))
 			}
@@ -377,8 +380,9 @@ func TestLevelIterSeek(t *testing.T) {
 		case "iter":
 			slice := manifest.NewLevelSliceKeySorted(lt.cmp.Compare, lt.metas)
 			iter := &levelIterTestIter{
-				levelIter: newLevelIter(IterOptions{}, DefaultComparer.Compare, lt.newIters,
-					slice.Iter(), manifest.Level(level), nil),
+				levelIter: newLevelIter(IterOptions{}, DefaultComparer.Compare,
+					func(a []byte) int { return len(a) }, lt.newIters, slice.Iter(),
+					manifest.Level(level), nil),
 			}
 			defer iter.Close()
 			iter.initRangeDel(&iter.rangeDelIter)
@@ -479,8 +483,7 @@ func BenchmarkLevelIterSeekGE(b *testing.B) {
 				iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */)
 				return iter, nil, err
 			}
-			l := newLevelIter(IterOptions{}, DefaultComparer.Compare,
-				newIters, metas.Iter(), manifest.Level(level), nil)
+			l := newLevelIter(IterOptions{}, DefaultComparer.Compare, nil, newIters, metas.Iter(), manifest.Level(level), nil)
 			rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
 
 			b.ResetTimer()
@@ -522,8 +525,7 @@ func BenchmarkLevelIterSeqSeekGEWithBounds(b *testing.B) {
 					opts.LowerBound, opts.UpperBound)
 				return iter, nil, err
 			}
-			l := newLevelIter(IterOptions{}, DefaultComparer.Compare,
-				newIters, metas.Iter(), manifest.Level(level), nil)
+			l := newLevelIter(IterOptions{}, DefaultComparer.Compare, nil, newIters, metas.Iter(), manifest.Level(level), nil)
 			// Fake up the range deletion initialization, to resemble the usage
 			// in a mergingIter.
 			l.initRangeDel(new(internalIterator))
@@ -571,7 +573,8 @@ func BenchmarkLevelIterSeqSeekPrefixGE(b *testing.B) {
 			b.Run(fmt.Sprintf("skip=%d/use-next=%t", skip, useNext),
 				func(b *testing.B) {
 					l := newLevelIter(IterOptions{}, DefaultComparer.Compare,
-						newIters, metas.Iter(), manifest.Level(level), nil)
+						func(a []byte) int { return len(a) }, newIters, metas.Iter(),
+						manifest.Level(level), nil)
 					// Fake up the range deletion initialization, to resemble the usage
 					// in a mergingIter.
 					l.initRangeDel(new(internalIterator))
@@ -613,8 +616,7 @@ func BenchmarkLevelIterNext(b *testing.B) {
 		iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */)
 		return iter, nil, err
 	}
-	l := newLevelIter(IterOptions{}, DefaultComparer.Compare,
-		newIters, metas.Iter(), manifest.Level(level), nil)
+	l := newLevelIter(IterOptions{}, DefaultComparer.Compare, nil, newIters, metas.Iter(), manifest.Level(level), nil)
 
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
@@ -648,8 +650,7 @@ func BenchmarkLevelIterPrev(b *testing.B) {
 		iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */)
 		return iter, nil, err
 	}
-	l := newLevelIter(IterOptions{}, DefaultComparer.Compare,
-		newIters, metas.Iter(), manifest.Level(level), nil)
+	l := newLevelIter(IterOptions{}, DefaultComparer.Compare, nil, newIters, metas.Iter(), manifest.Level(level), nil)
 
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
diff --git a/merging_iter_test.go b/merging_iter_test.go
index 37272cac7d..7ec1a31be6 100644
--- a/merging_iter_test.go
+++ b/merging_iter_test.go
@@ -10,6 +10,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/cockroachdb/pebble/bloom"
 	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/cockroachdb/pebble/internal/datadriven"
 	"github.com/cockroachdb/pebble/internal/manifest"
@@ -245,7 +246,8 @@ func TestMergingIterCornerCases(t *testing.T) {
 				continue
 			}
 			li := &levelIter{}
-			li.init(IterOptions{}, cmp, newIters, slice.Iter(), manifest.Level(i), nil)
+			li.init(IterOptions{}, cmp, func(a []byte) int { return len(a) }, newIters,
+				slice.Iter(), manifest.Level(i), nil)
 			i := len(levelIters)
 			levelIters = append(levelIters, mergingIterLevel{iter: li})
 			li.initRangeDel(&levelIters[i].rangeDelIter)
@@ -449,6 +451,7 @@ func buildLevelsForMergingIterSeqSeek(
 	blockSize, restartInterval, levelCount int,
 	keyOffset int,
 	writeRangeTombstoneToLowestLevel bool,
+	writeBloomFilters bool,
 ) ([][]*sstable.Reader, []manifest.LevelSlice, [][]byte) {
 	mem := vfs.NewMem()
 	if writeRangeTombstoneToLowestLevel && levelCount != 1 {
@@ -466,13 +469,20 @@ func buildLevelsForMergingIterSeqSeek(
 	}
 
 	writers := make([][]*sstable.Writer, levelCount)
+	// A policy unlikely to have false positives.
+	filterPolicy := bloom.FilterPolicy(100)
 	for i := range files {
 		for j := range files[i] {
-			writers[i] = append(writers[i], sstable.NewWriter(files[i][j], sstable.WriterOptions{
+			writerOptions := sstable.WriterOptions{
 				BlockRestartInterval: restartInterval,
 				BlockSize:            blockSize,
 				Compression:          NoCompression,
-			}))
+			}
+			if writeBloomFilters {
+				writerOptions.FilterPolicy = filterPolicy
+				writerOptions.FilterType = base.TableFilter
+			}
+			writers[i] = append(writers[i], sstable.NewWriter(files[i][j], writerOptions))
 		}
 	}
 
@@ -511,6 +521,10 @@ func buildLevelsForMergingIterSeqSeek(
 	}
 
 	opts := sstable.ReaderOptions{Cache: NewCache(128 << 20)}
+	if writeBloomFilters {
+		opts.Filters = make(map[string]FilterPolicy)
+		opts.Filters[filterPolicy.Name()] = filterPolicy
+	}
 	defer opts.Cache.Unref()
 
 	readers := make([][]*sstable.Reader, levelCount)
@@ -569,7 +583,8 @@ func buildMergingIter(readers [][]*sstable.Reader, levelSlices []manifest.LevelSlice) *mergingIter {
 			return iter, rdIter, err
 		}
 		l := newLevelIter(IterOptions{}, DefaultComparer.Compare,
-			newIters, levelSlices[i].Iter(), manifest.Level(level), nil)
+			func(a []byte) int { return len(a) }, newIters, levelSlices[i].Iter(),
+			manifest.Level(level), nil)
 		l.initRangeDel(&mils[level].rangeDelIter)
 		l.initSmallestLargestUserKey(
 			&mils[level].smallestUserKey, &mils[level].largestUserKey,
@@ -598,7 +613,7 @@ func BenchmarkMergingIterSeqSeekGEWithBounds(b *testing.B) {
 		b.Run(fmt.Sprintf("levelCount=%d", levelCount),
 			func(b *testing.B) {
 				readers, levelSlices, keys := buildLevelsForMergingIterSeqSeek(
-					b, blockSize, restartInterval, levelCount, 0 /* keyOffset */, false)
+					b, blockSize, restartInterval, levelCount, 0 /* keyOffset */, false, false)
 				m := buildMergingIter(readers, levelSlices)
 				keyCount := len(keys)
 				b.ResetTimer()
@@ -626,7 +641,7 @@ func BenchmarkMergingIterSeqSeekPrefixGE(b *testing.B) {
 	const restartInterval = 16
 	const levelCount = 5
 	readers, levelSlices, keys := buildLevelsForMergingIterSeqSeek(
-		b, blockSize, restartInterval, levelCount, 0 /* keyOffset */, false)
+		b, blockSize, restartInterval, levelCount, 0 /* keyOffset */, false, false)
 
 	for _, skip := range []int{1, 2, 4, 8, 16} {
 		for _, useNext := range []bool{false, true} {
diff --git a/testdata/level_iter_seek b/testdata/level_iter_seek
index 10a7e07f43..9d279ed0da 100644
--- a/testdata/level_iter_seek
+++ b/testdata/level_iter_seek
@@ -56,11 +56,33 @@ c#7,1:c
 d#72057594037927935,15:
 .
 
+# There is no point key with d, but since there is a rangedel, levelIter
+# returns a synthetic boundary key using the largest key, f, in the file.
 iter
 seek-prefix-ge d
 ----
 f/d-e#6#5,15:
 
+# Tests a sequence of SeekPrefixGE with monotonically increasing keys, some of
+# which are present and some not (so fail the bloom filter match). The seek to
+# cc returns a synthetic boundary key.
+iter
+seek-prefix-ge aa
+seek-prefix-ge c
+seek-prefix-ge cc
+seek-prefix-ge f
+seek-prefix-ge g
+seek-prefix-ge gg
+seek-prefix-ge h
+----
+./#0,0:
+c/d-e#6#7,1:c
+f/d-e#6#5,15:
+f/#5,1:f
+g/#4,1:g
+./#0,0:
+h/#3,1:h
+
 iter
 set-bounds lower=d
 seek-lt d