diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 358313330404..a85e0e1d3557 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -1072,8 +1072,17 @@ func (p *Pebble) NewBatch() Batch { // NewReadOnly implements the Engine interface. func (p *Pebble) NewReadOnly() ReadWriter { + // TODO(sumeer): a sync.Pool for pebbleReadOnly would save on allocations + // for the underlying pebbleIterators. return &pebbleReadOnly{ parent: p, + // Defensively set reusable=true. One has to be careful about this since + // an accidental false value would cause these iterators, that are value + // members of pebbleReadOnly, to be put in the pebbleIterPool. + prefixIter: pebbleIterator{reusable: true}, + normalIter: pebbleIterator{reusable: true}, + prefixEngineIter: pebbleIterator{reusable: true}, + normalEngineIter: pebbleIterator{reusable: true}, } } @@ -1368,14 +1377,14 @@ func (p *pebbleReadOnly) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions if iter.inuse { panic("iterator already in use") } + // Ensures no timestamp hints etc. + checkOptionsForIterReuse(opts) if iter.iter != nil { - iter.setOptions(opts) + iter.setBounds(opts.LowerBound, opts.UpperBound) } else { iter.init(p.parent.db, p.iter, opts) - // The timestamp hints should be empty given the earlier code, but we are - // being defensive. - if p.iter == nil && opts.MaxTimestampHint.IsEmpty() && opts.MinTimestampHint.IsEmpty() { + if p.iter == nil { // For future cloning. p.iter = iter.iter } @@ -1403,14 +1412,14 @@ func (p *pebbleReadOnly) NewEngineIterator(opts IterOptions) EngineIterator { if iter.inuse { panic("iterator already in use") } + // Ensures no timestamp hints etc. + checkOptionsForIterReuse(opts) if iter.iter != nil { - iter.setOptions(opts) + iter.setBounds(opts.LowerBound, opts.UpperBound) } else { iter.init(p.parent.db, p.iter, opts) - // The timestamp hints should be empty given this is an EngineIterator, - // but we are being defensive. - if p.iter == nil && opts.MaxTimestampHint.IsEmpty() && opts.MinTimestampHint.IsEmpty() { + if p.iter == nil { // For future cloning. p.iter = iter.iter } @@ -1421,6 +1430,18 @@ func (p *pebbleReadOnly) NewEngineIterator(opts IterOptions) EngineIterator { return iter } +// checkOptionsForIterReuse checks that the options are appropriate for +// iterators that are reusable, and panics if not. This includes disallowing +// any timestamp hints. +func checkOptionsForIterReuse(opts IterOptions) { + if !opts.MinTimestampHint.IsEmpty() || !opts.MaxTimestampHint.IsEmpty() { + panic("iterator with timestamp hints cannot be reused") + } + if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 { + panic("iterator must set prefix or upper bound or lower bound") + } +} + // ConsistentIterators implements the Engine interface. func (p *pebbleReadOnly) ConsistentIterators() bool { return true diff --git a/pkg/storage/pebble_batch.go b/pkg/storage/pebble_batch.go index 3e73b3d6256e..b33bc7e6c1d1 100644 --- a/pkg/storage/pebble_batch.go +++ b/pkg/storage/pebble_batch.go @@ -229,18 +229,18 @@ func (p *pebbleBatch) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions) M if iter.inuse { panic("iterator already in use") } + // Ensures no timestamp hints etc. + checkOptionsForIterReuse(opts) if iter.iter != nil { - iter.setOptions(opts) + iter.setBounds(opts.LowerBound, opts.UpperBound) } else { if p.batch.Indexed() { iter.init(p.batch, p.iter, opts) } else { iter.init(p.db, p.iter, opts) } - // The timestamp hints should be empty given the earlier code, but we are - // being defensive. - if p.iter == nil && opts.MaxTimestampHint.IsEmpty() && opts.MinTimestampHint.IsEmpty() { + if p.iter == nil { // For future cloning. p.iter = iter.iter } @@ -271,18 +271,18 @@ func (p *pebbleBatch) NewEngineIterator(opts IterOptions) EngineIterator { if iter.inuse { panic("iterator already in use") } + // Ensures no timestamp hints etc. + checkOptionsForIterReuse(opts) if iter.iter != nil { - iter.setOptions(opts) + iter.setBounds(opts.LowerBound, opts.UpperBound) } else { if p.batch.Indexed() { iter.init(p.batch, p.iter, opts) } else { iter.init(p.db, p.iter, opts) } - // The timestamp hints should be empty given this is an EngineIterator, - // but we are being defensive. - if p.iter == nil && opts.MaxTimestampHint.IsEmpty() && opts.MinTimestampHint.IsEmpty() { + if p.iter == nil { // For future cloning. p.iter = iter.iter } diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index f46917eb4ffd..dc328ac0d6f6 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -38,9 +38,11 @@ type pebbleIterator struct { // use two slices for each of the bounds since this caller should not change // the slice holding the current bounds, that the callee (pebble.MVCCIterator) // is currently using, until after the caller has made the SetBounds call. - lowerBoundBuf [2][]byte - upperBoundBuf [2][]byte - curBuf int + lowerBoundBuf [2][]byte + upperBoundBuf [2][]byte + curBuf int + testingSetBoundsListener testingSetBoundsListener + // Set to true to govern whether to call SeekPrefixGE or SeekGE. Skips // SSTables based on MVCC/Engine key when true. prefix bool @@ -75,11 +77,16 @@ type cloneableIter interface { Clone() (*pebble.Iterator, error) } +type testingSetBoundsListener interface { + postSetBounds(lower, upper []byte) +} + // Instantiates a new Pebble iterator, or gets one from the pool. func newPebbleIterator( handle pebble.Reader, iterToClone cloneableIter, opts IterOptions, ) *pebbleIterator { iter := pebbleIterPool.Get().(*pebbleIterator) + iter.reusable = false // defensive iter.init(handle, iterToClone, opts) return iter } @@ -169,38 +176,65 @@ func (p *pebbleIterator) init(handle pebble.Reader, iterToClone cloneableIter, o p.inuse = true } -func (p *pebbleIterator) setOptions(opts IterOptions) { - // Overwrite any stale options from last time. - p.options = pebble.IterOptions{} - - if !opts.MinTimestampHint.IsEmpty() || !opts.MaxTimestampHint.IsEmpty() { - panic("iterator with timestamp hints cannot be reused") +// setBounds is called to change the bounds on a pebbleIterator. Note that +// this is not the first time that bounds will be passed to the underlying +// pebble.Iterator. The existing bounds are in p.options. +func (p *pebbleIterator) setBounds(lowerBound, upperBound roachpb.Key) { + // If the roachpb.Key bound is nil, the corresponding bound for the + // pebble.Iterator will also be nil. p.options contains the current bounds + // known to the pebble.Iterator. + boundsChanged := ((lowerBound == nil) != (p.options.LowerBound == nil)) || + ((upperBound == nil) != (p.options.UpperBound == nil)) + if !boundsChanged { + // The nil-ness is the same but the values may be different. + if lowerBound != nil { + // Both must be non-nil. We know that we've appended 0x00 to + // p.options.LowerBound, which must be ignored for this comparison. + if !bytes.Equal(p.options.LowerBound[:len(p.options.LowerBound)-1], lowerBound) { + boundsChanged = true + } + } + // If the preceding if-block has not already set boundsChanged=true, see + // if the upper bound has changed. + if !boundsChanged && upperBound != nil { + // Both must be non-nil. We know that we've appended 0x00 to + // p.options.UpperBound, which must be ignored for this comparison. + if !bytes.Equal(p.options.UpperBound[:len(p.options.UpperBound)-1], upperBound) { + boundsChanged = true + } + } } - if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 { - panic("iterator must set prefix or upper bound or lower bound") + if !boundsChanged { + // This noop optimization helps the underlying pebble.Iterator to optimize + // seeks. + return } - - p.prefix = opts.Prefix + // Set the bounds to nil, before we selectively change them. + p.options.LowerBound = nil + p.options.UpperBound = nil p.curBuf = (p.curBuf + 1) % 2 i := p.curBuf - if opts.LowerBound != nil { + if lowerBound != nil { // This is the same as - // p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[i][:0], MVCCKey{Key: opts.LowerBound}) . - // or EngineKey{Key: opts.LowerBound}.EncodeToBuf(...). + // p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[i][:0], MVCCKey{Key: lowerBound}) . + // or EngineKey{Key: lowerBound}.EncodeToBuf(...). // Since we are encoding keys with an empty version anyway, we can just // append the NUL byte instead of calling the above encode functions which // will do the same thing. - p.lowerBoundBuf[i] = append(p.lowerBoundBuf[i][:0], opts.LowerBound...) + p.lowerBoundBuf[i] = append(p.lowerBoundBuf[i][:0], lowerBound...) p.lowerBoundBuf[i] = append(p.lowerBoundBuf[i], 0x00) p.options.LowerBound = p.lowerBoundBuf[i] } - if opts.UpperBound != nil { + if upperBound != nil { // Same as above. - p.upperBoundBuf[i] = append(p.upperBoundBuf[i][:0], opts.UpperBound...) + p.upperBoundBuf[i] = append(p.upperBoundBuf[i][:0], upperBound...) p.upperBoundBuf[i] = append(p.upperBoundBuf[i], 0x00) p.options.UpperBound = p.upperBoundBuf[i] } p.iter.SetBounds(p.options.LowerBound, p.options.UpperBound) + if p.testingSetBoundsListener != nil { + p.testingSetBoundsListener.postSetBounds(p.options.LowerBound, p.options.UpperBound) + } } // Close implements the MVCCIterator interface. @@ -624,11 +658,22 @@ func findSplitKeyUsingIterator( return bestSplitKey, nil } -// SetUpperBound implements the MVCCIterator interface. +// SetUpperBound implements the MVCCIterator interface. Note that this is not +// the first time that bounds will be passed to the underlying +// pebble.Iterator. The existing bounds are in p.options. func (p *pebbleIterator) SetUpperBound(upperBound roachpb.Key) { if upperBound == nil { panic("SetUpperBound must not use a nil key") } + if p.options.UpperBound != nil { + // We know that we've appended 0x00 to p.options.UpperBound, which must be + // ignored for this comparison. + if bytes.Equal(p.options.UpperBound[:len(p.options.UpperBound)-1], upperBound) { + // Nothing to do. This noop optimization helps the underlying + // pebble.Iterator to optimize seeks. + return + } + } p.curBuf = (p.curBuf + 1) % 2 i := p.curBuf if p.options.LowerBound != nil { @@ -639,6 +684,9 @@ func (p *pebbleIterator) SetUpperBound(upperBound roachpb.Key) { p.upperBoundBuf[i] = append(p.upperBoundBuf[i], 0x00) p.options.UpperBound = p.upperBoundBuf[i] p.iter.SetBounds(p.options.LowerBound, p.options.UpperBound) + if p.testingSetBoundsListener != nil { + p.testingSetBoundsListener.postSetBounds(p.options.LowerBound, p.options.UpperBound) + } } // Stats implements the MVCCIterator interface. diff --git a/pkg/storage/pebble_test.go b/pkg/storage/pebble_test.go index 864dd8da893d..8e8d4a2d9e2f 100644 --- a/pkg/storage/pebble_test.go +++ b/pkg/storage/pebble_test.go @@ -186,6 +186,140 @@ func TestPebbleIterReuse(t *testing.T) { iter2.Close() } +type iterBoundsChecker struct { + t *testing.T + expectSetBounds bool + boundsSlices [2][]byte + boundsSlicesCopied [2][]byte +} + +func (ibc *iterBoundsChecker) postSetBounds(lower, upper []byte) { + require.True(ibc.t, ibc.expectSetBounds) + ibc.expectSetBounds = false + // The slices passed in the second from last SetBounds call + // must still be the same. + for i := range ibc.boundsSlices { + if ibc.boundsSlices[i] != nil { + if !bytes.Equal(ibc.boundsSlices[i], ibc.boundsSlicesCopied[i]) { + ibc.t.Fatalf("bound slice changed: expected: %x, actual: %x", + ibc.boundsSlicesCopied[i], ibc.boundsSlices[i]) + } + } + } + // Stash the bounds for later checking. + for i, bound := range [][]byte{lower, upper} { + ibc.boundsSlices[i] = bound + if bound != nil { + ibc.boundsSlicesCopied[i] = append(ibc.boundsSlicesCopied[i][:0], bound...) + } else { + ibc.boundsSlicesCopied[i] = nil + } + } +} + +func TestPebbleIterBoundSliceStabilityAndNoop(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + eng := createTestPebbleEngine().(*Pebble) + defer eng.Close() + iter := newPebbleIterator(eng.db, nil, IterOptions{UpperBound: roachpb.Key("foo")}) + defer iter.Close() + checker := &iterBoundsChecker{t: t} + iter.testingSetBoundsListener = checker + + tc := []struct { + expectSetBounds bool + setUpperOnly bool + lb roachpb.Key + ub roachpb.Key + }{ + { + // [nil, www) + expectSetBounds: true, + ub: roachpb.Key("www"), + }, + { + // [nil, www) + expectSetBounds: false, + ub: roachpb.Key("www"), + }, + { + // [nil, www) + expectSetBounds: false, + setUpperOnly: true, + ub: roachpb.Key("www"), + }, + { + // [ddd, www) + expectSetBounds: true, + lb: roachpb.Key("ddd"), + ub: roachpb.Key("www"), + }, + { + // [ddd, www) + expectSetBounds: false, + setUpperOnly: true, + ub: roachpb.Key("www"), + }, + { + // [ddd, xxx) + expectSetBounds: true, + setUpperOnly: true, + ub: roachpb.Key("xxx"), + }, + { + // [aaa, bbb) + expectSetBounds: true, + lb: roachpb.Key("aaa"), + ub: roachpb.Key("bbb"), + }, + { + // [ccc, ddd) + expectSetBounds: true, + lb: roachpb.Key("ccc"), + ub: roachpb.Key("ddd"), + }, + { + // [ccc, nil) + expectSetBounds: true, + lb: roachpb.Key("ccc"), + }, + { + // [ccc, nil) + expectSetBounds: false, + lb: roachpb.Key("ccc"), + }, + } + var lb, ub roachpb.Key + for _, c := range tc { + t.Run(fmt.Sprintf("%v", c), func(t *testing.T) { + checker.expectSetBounds = c.expectSetBounds + checker.t = t + if c.setUpperOnly { + iter.SetUpperBound(c.ub) + ub = c.ub + } else { + iter.setBounds(c.lb, c.ub) + lb, ub = c.lb, c.ub + } + require.False(t, checker.expectSetBounds) + for i, bound := range [][]byte{lb, ub} { + if (bound == nil) != (checker.boundsSlicesCopied[i] == nil) { + t.Fatalf("inconsistent nil %d", i) + } + if bound != nil { + expected := append([]byte(nil), bound...) + expected = append(expected, 0x00) + if !bytes.Equal(expected, checker.boundsSlicesCopied[i]) { + t.Fatalf("expected: %x, actual: %x", expected, checker.boundsSlicesCopied[i]) + } + } + } + }) + } +} + func makeMVCCKey(a string) MVCCKey { return MVCCKey{Key: []byte(a)} }