Skip to content

Commit

Permalink
storage: add noop behavior for bounds setting in pebbleIterator
Browse files Browse the repository at this point in the history
This is to compensate for the bug fix in Pebble which removes
the noop behavior from Pebble
cockroachdb/pebble#1073

Added a test that checks that the noop behavior is working,
the contract regarding slice stability is followed, and that
the bounds are correct.

Additionally, there are some minor tweaks involving removal
and addition of some defensive code. The additions are
related to the pebbleIterator.reusable field.

Release justification: Fix for high-severity bug in existing
functionality. The Pebble bug fix that needs to be compensated
here could lead to incorrect results from iterators, even prior
to the latest seek optimization in Pebble (though our existing
tests were not failing prior to that latest seek optimization).

Release note: None
  • Loading branch information
sumeerbhola committed Feb 26, 2021
1 parent dac79a2 commit 02007ae
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 36 deletions.
37 changes: 29 additions & 8 deletions pkg/storage/pebble.go
Original file line number Diff line number Diff line change
Expand Up @@ -1072,8 +1072,17 @@ func (p *Pebble) NewBatch() Batch {

// NewReadOnly implements the Engine interface.
func (p *Pebble) NewReadOnly() ReadWriter {
// TODO(sumeer): a sync.Pool for pebbleReadOnly would save on allocations
// for the underlying pebbleIterators.
return &pebbleReadOnly{
parent: p,
// Defensively set reusable=true. One has to be careful about this since
// an accidental false value would cause these iterators, that are value
// members of pebbleReadOnly, to be put in the pebbleIterPool.
prefixIter: pebbleIterator{reusable: true},
normalIter: pebbleIterator{reusable: true},
prefixEngineIter: pebbleIterator{reusable: true},
normalEngineIter: pebbleIterator{reusable: true},
}
}

Expand Down Expand Up @@ -1368,14 +1377,14 @@ func (p *pebbleReadOnly) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions
if iter.inuse {
panic("iterator already in use")
}
// Ensures no timestamp hints etc.
checkOptionsForIterReuse(opts)

if iter.iter != nil {
iter.setOptions(opts)
iter.setBounds(opts.LowerBound, opts.UpperBound)
} else {
iter.init(p.parent.db, p.iter, opts)
// The timestamp hints should be empty given the earlier code, but we are
// being defensive.
if p.iter == nil && opts.MaxTimestampHint.IsEmpty() && opts.MinTimestampHint.IsEmpty() {
if p.iter == nil {
// For future cloning.
p.iter = iter.iter
}
Expand Down Expand Up @@ -1403,14 +1412,14 @@ func (p *pebbleReadOnly) NewEngineIterator(opts IterOptions) EngineIterator {
if iter.inuse {
panic("iterator already in use")
}
// Ensures no timestamp hints etc.
checkOptionsForIterReuse(opts)

if iter.iter != nil {
iter.setOptions(opts)
iter.setBounds(opts.LowerBound, opts.UpperBound)
} else {
iter.init(p.parent.db, p.iter, opts)
// The timestamp hints should be empty given this is an EngineIterator,
// but we are being defensive.
if p.iter == nil && opts.MaxTimestampHint.IsEmpty() && opts.MinTimestampHint.IsEmpty() {
if p.iter == nil {
// For future cloning.
p.iter = iter.iter
}
Expand All @@ -1421,6 +1430,18 @@ func (p *pebbleReadOnly) NewEngineIterator(opts IterOptions) EngineIterator {
return iter
}

// checkOptionsForIterReuse checks that the options are appropriate for
// iterators that are reusable, and panics if not. This includes disallowing
// any timestamp hints.
func checkOptionsForIterReuse(opts IterOptions) {
if !opts.MinTimestampHint.IsEmpty() || !opts.MaxTimestampHint.IsEmpty() {
panic("iterator with timestamp hints cannot be reused")
}
if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
panic("iterator must set prefix or upper bound or lower bound")
}
}

// ConsistentIterators implements the Engine interface.
func (p *pebbleReadOnly) ConsistentIterators() bool {
return true
Expand Down
16 changes: 8 additions & 8 deletions pkg/storage/pebble_batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,18 +229,18 @@ func (p *pebbleBatch) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions) M
if iter.inuse {
panic("iterator already in use")
}
// Ensures no timestamp hints etc.
checkOptionsForIterReuse(opts)

if iter.iter != nil {
iter.setOptions(opts)
iter.setBounds(opts.LowerBound, opts.UpperBound)
} else {
if p.batch.Indexed() {
iter.init(p.batch, p.iter, opts)
} else {
iter.init(p.db, p.iter, opts)
}
// The timestamp hints should be empty given the earlier code, but we are
// being defensive.
if p.iter == nil && opts.MaxTimestampHint.IsEmpty() && opts.MinTimestampHint.IsEmpty() {
if p.iter == nil {
// For future cloning.
p.iter = iter.iter
}
Expand Down Expand Up @@ -271,18 +271,18 @@ func (p *pebbleBatch) NewEngineIterator(opts IterOptions) EngineIterator {
if iter.inuse {
panic("iterator already in use")
}
// Ensures no timestamp hints etc.
checkOptionsForIterReuse(opts)

if iter.iter != nil {
iter.setOptions(opts)
iter.setBounds(opts.LowerBound, opts.UpperBound)
} else {
if p.batch.Indexed() {
iter.init(p.batch, p.iter, opts)
} else {
iter.init(p.db, p.iter, opts)
}
// The timestamp hints should be empty given this is an EngineIterator,
// but we are being defensive.
if p.iter == nil && opts.MaxTimestampHint.IsEmpty() && opts.MinTimestampHint.IsEmpty() {
if p.iter == nil {
// For future cloning.
p.iter = iter.iter
}
Expand Down
86 changes: 66 additions & 20 deletions pkg/storage/pebble_iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@ type pebbleIterator struct {
// use two slices for each of the bounds since this caller should not change
// the slice holding the current bounds, that the callee (pebble.MVCCIterator)
// is currently using, until after the caller has made the SetBounds call.
lowerBoundBuf [2][]byte
upperBoundBuf [2][]byte
curBuf int
lowerBoundBuf [2][]byte
upperBoundBuf [2][]byte
curBuf int
testingSetBoundsListener testingSetBoundsListener

// Set to true to govern whether to call SeekPrefixGE or SeekGE. Skips
// SSTables based on MVCC/Engine key when true.
prefix bool
Expand Down Expand Up @@ -75,11 +77,16 @@ type cloneableIter interface {
Clone() (*pebble.Iterator, error)
}

type testingSetBoundsListener interface {
postSetBounds(lower, upper []byte)
}

// Instantiates a new Pebble iterator, or gets one from the pool.
func newPebbleIterator(
handle pebble.Reader, iterToClone cloneableIter, opts IterOptions,
) *pebbleIterator {
iter := pebbleIterPool.Get().(*pebbleIterator)
iter.reusable = false // defensive
iter.init(handle, iterToClone, opts)
return iter
}
Expand Down Expand Up @@ -169,38 +176,63 @@ func (p *pebbleIterator) init(handle pebble.Reader, iterToClone cloneableIter, o
p.inuse = true
}

func (p *pebbleIterator) setOptions(opts IterOptions) {
// Overwrite any stale options from last time.
p.options = pebble.IterOptions{}

if !opts.MinTimestampHint.IsEmpty() || !opts.MaxTimestampHint.IsEmpty() {
panic("iterator with timestamp hints cannot be reused")
// setBounds is called to change the bounds on a pebbleIterator. Note that
// this is not the first time that bounds will be passed to the underlying
// pebble.Iterator. The existing bounds are in p.options.
func (p *pebbleIterator) setBounds(lowerBound, upperBound roachpb.Key) {
// If the roachpb.Key bound is nil, the corresponding bound for the
// pebble.Iterator will also be nil. p.options contains the current bounds
// known to the pebble.Iterator.
boundsChanged := ((lowerBound == nil) != (p.options.LowerBound == nil)) ||
((upperBound == nil) != (p.options.UpperBound == nil))
if !boundsChanged {
// The nil-ness is the same but the values may be different.
if lowerBound != nil {
// Both must be non-nil. We know that we've appended 0x00 to
// p.options.LowerBound, which must be ignored for this comparison.
if !bytes.Equal(p.options.LowerBound[:len(p.options.LowerBound)-1], lowerBound) {
boundsChanged = true
}
}
if !boundsChanged && upperBound != nil {
// Both must be non-nil. We know that we've appended 0x00 to
// p.options.UpperBound, which must be ignored for this comparison.
if !bytes.Equal(p.options.UpperBound[:len(p.options.UpperBound)-1], upperBound) {
boundsChanged = true
}
}
}
if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
panic("iterator must set prefix or upper bound or lower bound")
if !boundsChanged {
// This noop optimization helps the underlying pebble.Iterator to optimize
// seeks.
return
}

p.prefix = opts.Prefix
// Set the bounds to nil, before we selectively change them.
p.options.LowerBound = nil
p.options.UpperBound = nil
p.curBuf = (p.curBuf + 1) % 2
i := p.curBuf
if opts.LowerBound != nil {
if lowerBound != nil {
// This is the same as
// p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[i][:0], MVCCKey{Key: opts.LowerBound}) .
// or EngineKey{Key: opts.LowerBound}.EncodeToBuf(...).
// p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[i][:0], MVCCKey{Key: lowerBound}) .
// or EngineKey{Key: lowerBound}.EncodeToBuf(...).
// Since we are encoding keys with an empty version anyway, we can just
// append the NUL byte instead of calling the above encode functions which
// will do the same thing.
p.lowerBoundBuf[i] = append(p.lowerBoundBuf[i][:0], opts.LowerBound...)
p.lowerBoundBuf[i] = append(p.lowerBoundBuf[i][:0], lowerBound...)
p.lowerBoundBuf[i] = append(p.lowerBoundBuf[i], 0x00)
p.options.LowerBound = p.lowerBoundBuf[i]
}
if opts.UpperBound != nil {
if upperBound != nil {
// Same as above.
p.upperBoundBuf[i] = append(p.upperBoundBuf[i][:0], opts.UpperBound...)
p.upperBoundBuf[i] = append(p.upperBoundBuf[i][:0], upperBound...)
p.upperBoundBuf[i] = append(p.upperBoundBuf[i], 0x00)
p.options.UpperBound = p.upperBoundBuf[i]
}
p.iter.SetBounds(p.options.LowerBound, p.options.UpperBound)
if p.testingSetBoundsListener != nil {
p.testingSetBoundsListener.postSetBounds(p.options.LowerBound, p.options.UpperBound)
}
}

// Close implements the MVCCIterator interface.
Expand Down Expand Up @@ -624,11 +656,22 @@ func findSplitKeyUsingIterator(
return bestSplitKey, nil
}

// SetUpperBound implements the MVCCIterator interface.
// SetUpperBound implements the MVCCIterator interface. Note that this is not
// the first time that bounds will be passed to the underlying
// pebble.Iterator. The existing bounds are in p.options.
func (p *pebbleIterator) SetUpperBound(upperBound roachpb.Key) {
if upperBound == nil {
panic("SetUpperBound must not use a nil key")
}
if p.options.UpperBound != nil {
// We know that we've appended 0x00 to p.options.UpperBound, which must be
// ignored for this comparison.
if bytes.Equal(p.options.UpperBound[:len(p.options.UpperBound)-1], upperBound) {
// Nothing to do. This noop optimization helps the underlying
// pebble.Iterator to optimize seeks.
return
}
}
p.curBuf = (p.curBuf + 1) % 2
i := p.curBuf
if p.options.LowerBound != nil {
Expand All @@ -639,6 +682,9 @@ func (p *pebbleIterator) SetUpperBound(upperBound roachpb.Key) {
p.upperBoundBuf[i] = append(p.upperBoundBuf[i], 0x00)
p.options.UpperBound = p.upperBoundBuf[i]
p.iter.SetBounds(p.options.LowerBound, p.options.UpperBound)
if p.testingSetBoundsListener != nil {
p.testingSetBoundsListener.postSetBounds(p.options.LowerBound, p.options.UpperBound)
}
}

// Stats implements the MVCCIterator interface.
Expand Down
Loading

0 comments on commit 02007ae

Please sign in to comment.