db: optimize SeekGE/SeekLT for the noop case
A SeekGE to a key at or after the previous seek key, when the iterator
is already positioned at or past that key, is now a noop; SeekLT is
optimized symmetrically. This is especially useful for sparse parts of
the key space which may have large numbers of deleted versions, as
happens with the locks in CockroachDB.
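
To make the noop case concrete, here is a minimal sketch of the pattern
that benefits. The keys are hypothetical, and the snippet assumes Pebble's
public API as of this commit (NewIter returning an *Iterator directly); it
is an illustration, not code from the commit:

package main

import (
	"fmt"
	"log"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/vfs"
)

func main() {
	// In-memory store whose only live key sits past a large, conceptually
	// deleted span of the key space (hypothetical keys).
	db, err := pebble.Open("", &pebble.Options{FS: vfs.NewMem()})
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	if err := db.Set([]byte("lock/900"), []byte("v"), nil); err != nil {
		log.Fatal(err)
	}

	iter := db.NewIter(nil)
	defer iter.Close()

	// Real work: skips everything below lock/900 and lands there.
	iter.SeekGE([]byte("lock/100"))
	// Noops after this change: each seek key is >= the previous seek key
	// and <= the current position, so the iterator returns immediately
	// instead of re-iterating past the same deleted versions.
	iter.SeekGE([]byte("lock/200"))
	iter.SeekGE([]byte("lock/500"))
	fmt.Printf("positioned at %s\n", iter.Key()) // lock/900
}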

Results from a CockroachDB benchmark are included below. The
separated=true case is the main beneficiary. The raw new numbers are
listed after the benchstat comparison.
name                                                                      old time/op    new time/op    delta
ScanAllIntentDeleted/separated=false/versions=200/percent-flushed=0-16      63.8µs ± 0%    50.2µs ± 0%   -21.27%
ScanAllIntentDeleted/separated=false/versions=200/percent-flushed=50-16     24.4µs ± 0%    24.8µs ± 0%    +1.81%
ScanAllIntentDeleted/separated=false/versions=200/percent-flushed=90-16     5.60µs ± 0%    4.28µs ± 0%   -23.68%
ScanAllIntentDeleted/separated=false/versions=200/percent-flushed=100-16    1.59µs ± 0%    1.07µs ± 0%   -33.04%
ScanAllIntentDeleted/separated=true/versions=200/percent-flushed=0-16       51.7ms ± 0%     0.0ms ± 0%   -99.99%
ScanAllIntentDeleted/separated=true/versions=200/percent-flushed=50-16      14.7ms ± 0%     0.0ms ± 0%   -99.99%
ScanAllIntentDeleted/separated=true/versions=200/percent-flushed=90-16      1.04ms ± 0%    0.00ms ± 0%   -99.85%

BenchmarkScanAllIntentDeleted/separated=false/versions=200/percent-flushed=0-16         	   23816	     50222 ns/op	       0 B/op	       0 allocs/op
BenchmarkScanAllIntentDeleted/separated=false/versions=200/percent-flushed=50-16        	   46825	     24840 ns/op	       0 B/op	       0 allocs/op
BenchmarkScanAllIntentDeleted/separated=false/versions=200/percent-flushed=90-16        	  304212	      4277 ns/op	       0 B/op	       0 allocs/op
BenchmarkScanAllIntentDeleted/separated=false/versions=200/percent-flushed=100-16       	 1342129	      1066 ns/op	       0 B/op	       0 allocs/op
BenchmarkScanAllIntentDeleted/separated=true/versions=200/percent-flushed=0-16          	  197700	      5267 ns/op	       0 B/op	       0 allocs/op
BenchmarkScanAllIntentDeleted/separated=true/versions=200/percent-flushed=50-16         	  673788	      1696 ns/op	       0 B/op	       0 allocs/op
BenchmarkScanAllIntentDeleted/separated=true/versions=200/percent-flushed=90-16         	  798824	      1530 ns/op	       0 B/op	       0 allocs/op
BenchmarkScanAllIntentDeleted/separated=true/versions=200/percent-flushed=100-16        	 1256680	      1027 ns/op	       0 B/op	       0 allocs/op
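
The benchmark above lives in CockroachDB; a standalone Pebble
microbenchmark of the same shape might look like the following sketch. It
is hypothetical (names and sizes invented for illustration): every
iteration after the first seeks to a key the iterator has already passed,
which the new noop path answers without re-skipping the tombstones:

package noopseek_test

import (
	"fmt"
	"testing"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/vfs"
)

func BenchmarkSeekGENoop(b *testing.B) {
	db, err := pebble.Open("", &pebble.Options{FS: vfs.NewMem()})
	if err != nil {
		b.Fatal(err)
	}
	defer db.Close()
	// Create a swath of deleted versions, then one live key beyond it, so
	// a seek into the swath must skip every tombstone to find the key.
	for j := 0; j < 1000; j++ {
		key := []byte(fmt.Sprintf("lock/%04d", j))
		_ = db.Set(key, []byte("v"), nil)
		_ = db.Delete(key, nil)
	}
	_ = db.Set([]byte("lock/9999"), []byte("v"), nil)

	iter := db.NewIter(nil)
	defer iter.Close()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		// After the first iteration the iterator sits at lock/9999, so
		// this SeekGE satisfies the noop condition and returns at once.
		iter.SeekGE([]byte("lock/0000"))
	}
}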
sumeerbhola committed Feb 19, 2021
1 parent 444296c commit 959663f
Showing 1 changed file with 91 additions and 45 deletions: iterator.go
@@ -65,24 +65,24 @@ type IteratorMetrics struct {
 // Next, Prev) return without advancing if the iterator has an accumulated
 // error.
 type Iterator struct {
-	opts         IterOptions
-	cmp          Compare
-	equal        Equal
-	merge        Merge
-	split        Split
-	iter         internalIterator
-	readState    *readState
-	err          error
-	key          []byte
-	keyBuf       []byte
-	value        []byte
-	valueBuf     []byte
-	valueCloser  io.Closer
-	iterKey      *InternalKey
-	iterValue    []byte
-	alloc        *iterAlloc
-	prefix       []byte
-	readSampling readSampling
+	opts                IterOptions
+	cmp                 Compare
+	equal               Equal
+	merge               Merge
+	split               Split
+	iter                internalIterator
+	readState           *readState
+	err                 error
+	key                 []byte
+	keyBuf              []byte
+	value               []byte
+	valueBuf            []byte
+	valueCloser         io.Closer
+	iterKey             *InternalKey
+	iterValue           []byte
+	alloc               *iterAlloc
+	prefixOrFullSeekKey []byte
+	readSampling        readSampling

 	// Following fields are only used in Clone.
 	// Non-nil if this Iterator includes a Batch.
@@ -95,12 +95,22 @@ type Iterator struct {

 	valid bool
 	pos   iterPos
-	// Relates to the prefix field above.
+	// Relates to the prefixOrFullSeekKey field above.
 	hasPrefix bool
-	// Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext).
-	lastPositioningOpIsSeekPrefixGE bool
+	// Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext),
+	// and SeekGE/SeekLT optimizations
+	lastPositioningOp lastPositioningOpKind
 }

+type lastPositioningOpKind int8
+
+const (
+	unknownLastPositionOp lastPositioningOpKind = iota
+	seekPrefixGELastPositioningOp
+	seekGELastPositioningOp
+	seekLTLastPositioningOp
+)
+
 // readSampling stores variables used to sample a read to trigger a read
 // compaction
 type readSampling struct {
@@ -128,7 +138,7 @@ func (i *Iterator) findNextEntry() bool {
 		key := *i.iterKey

 		if i.hasPrefix {
-			if n := i.split(key.UserKey); !bytes.Equal(i.prefix, key.UserKey[:n]) {
+			if n := i.split(key.UserKey); !bytes.Equal(i.prefixOrFullSeekKey, key.UserKey[:n]) {
 				return false
 			}
 		}
@@ -425,18 +435,36 @@ func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) {
 // than or equal to the given key. Returns true if the iterator is pointing at
 // a valid entry and false otherwise.
 func (i *Iterator) SeekGE(key []byte) bool {
+	lastPositioningOp := i.lastPositioningOp
+	// Set it to unknown, since this operation may not succeed, in which case
+	// the SeekGE following this should not make any assumption about iterator
+	// position.
+	i.lastPositioningOp = unknownLastPositionOp
 	i.err = nil // clear cached iteration error
 	i.hasPrefix = false
-	i.lastPositioningOpIsSeekPrefixGE = false
 	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
 		key = lowerBound
 	} else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
 		key = upperBound
 	}

+	if lastPositioningOp == seekGELastPositioningOp && i.batch == nil {
+		cmp := i.cmp(i.prefixOrFullSeekKey, key)
+		// If this seek is to the same or later key, and the iterator is
+		// already positioned there, this is a noop. This can be helpful for
+		// sparse key spaces that have many deleted keys, where one can avoid
+		// the overhead of iterating past them again and again.
+		if cmp <= 0 && (!i.valid || i.cmp(key, i.key) <= 0) {
+			i.lastPositioningOp = seekGELastPositioningOp
+			return i.valid
+		}
+	}
 	i.iterKey, i.iterValue = i.iter.SeekGE(key)
 	valid := i.findNextEntry()
 	i.maybeSampleRead()
+	if i.Error() == nil {
+		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
+		i.lastPositioningOp = seekGELastPositioningOp
+	}
 	return valid
 }

@@ -484,11 +512,11 @@ func (i *Iterator) SeekGE(key []byte) bool {
 //
 // See Example_prefixiteration for a working example.
 func (i *Iterator) SeekPrefixGE(key []byte) bool {
-	lastPositioningOpIsSeekPrefixGE := i.lastPositioningOpIsSeekPrefixGE
-	// Set it to false, since this operation may not succeed, in which case
+	lastPositioningOp := i.lastPositioningOp
+	// Set it to unknown, since this operation may not succeed, in which case
 	// the SeekPrefixGE following this should not make any assumption about
 	// iterator position.
-	i.lastPositioningOpIsSeekPrefixGE = false
+	i.lastPositioningOp = unknownLastPositionOp
 	i.err = nil // clear cached iteration error

 	if i.split == nil {
@@ -498,7 +526,7 @@ func (i *Iterator) SeekPrefixGE(key []byte) bool {
 	prefixLen := i.split(key)
 	keyPrefix := key[:prefixLen]
 	trySeekUsingNext := false
-	if lastPositioningOpIsSeekPrefixGE {
+	if lastPositioningOp == seekPrefixGELastPositioningOp {
 		if !i.hasPrefix {
 			panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false")
 		}
@@ -511,7 +539,7 @@ func (i *Iterator) SeekPrefixGE(key []byte) bool {
 		// become de-optimizations (if one usually has to do all the next
 		// calls and then the seek). This SeekPrefixGE optimization
 		// specifically benefits CockroachDB.
-		cmp := i.cmp(i.prefix, keyPrefix)
+		cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix)
 		// cmp == 0 is not safe to optimize since
 		// - i.pos could be at iterPosNext, due to a merge.
 		// - Even if i.pos were at iterPosCurForward, we could have a DELETE,
@@ -528,33 +556,33 @@ func (i *Iterator) SeekPrefixGE(key []byte) bool {
 	}
 	// Make a copy of the prefix so that modifications to the key after
 	// SeekPrefixGE returns does not affect the stored prefix.
-	if cap(i.prefix) < prefixLen {
-		i.prefix = make([]byte, prefixLen)
+	if cap(i.prefixOrFullSeekKey) < prefixLen {
+		i.prefixOrFullSeekKey = make([]byte, prefixLen)
 	} else {
-		i.prefix = i.prefix[:prefixLen]
+		i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen]
 	}
 	i.hasPrefix = true
-	copy(i.prefix, keyPrefix)
+	copy(i.prefixOrFullSeekKey, keyPrefix)

 	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
-		if n := i.split(lowerBound); !bytes.Equal(i.prefix, lowerBound[:n]) {
+		if n := i.split(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, lowerBound[:n]) {
 			i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound")
 			return false
 		}
 		key = lowerBound
 	} else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
-		if n := i.split(upperBound); !bytes.Equal(i.prefix, upperBound[:n]) {
+		if n := i.split(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, upperBound[:n]) {
 			i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound")
 			return false
 		}
 		key = upperBound
 	}

-	i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefix, key, trySeekUsingNext)
+	i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, trySeekUsingNext)
 	valid := i.findNextEntry()
 	i.maybeSampleRead()
 	if i.Error() == nil {
-		i.lastPositioningOpIsSeekPrefixGE = true
+		i.lastPositioningOp = seekPrefixGELastPositioningOp
 	}
 	return valid
 }
@@ -572,18 +600,36 @@ func disableSeekOpt(key []byte, ptr uintptr) bool {
 // the given key. Returns true if the iterator is pointing at a valid entry and
 // false otherwise.
 func (i *Iterator) SeekLT(key []byte) bool {
+	lastPositioningOp := i.lastPositioningOp
+	// Set it to unknown, since this operation may not succeed, in which case
+	// the SeekLT following this should not make any assumption about iterator
+	// position.
+	i.lastPositioningOp = unknownLastPositionOp
 	i.err = nil // clear cached iteration error
 	i.hasPrefix = false
-	i.lastPositioningOpIsSeekPrefixGE = false
 	if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
 		key = upperBound
 	} else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
 		key = lowerBound
 	}

+	if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
+		cmp := i.cmp(key, i.prefixOrFullSeekKey)
+		// If this seek is to the same or earlier key, and the iterator is
+		// already positioned there, this is a noop. This can be helpful for
+		// sparse key spaces that have many deleted keys, where one can avoid
+		// the overhead of iterating past them again and again.
+		if cmp <= 0 && (!i.valid || i.cmp(i.key, key) <= 0) {
+			i.lastPositioningOp = seekLTLastPositioningOp
+			return i.valid
+		}
+	}
 	i.iterKey, i.iterValue = i.iter.SeekLT(key)
 	valid := i.findPrevEntry()
 	i.maybeSampleRead()
+	if i.Error() == nil {
+		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
+		i.lastPositioningOp = seekLTLastPositioningOp
+	}
 	return valid
 }

@@ -592,7 +638,7 @@ func (i *Iterator) SeekLT(key []byte) bool {
 func (i *Iterator) First() bool {
 	i.err = nil // clear cached iteration error
 	i.hasPrefix = false
-	i.lastPositioningOpIsSeekPrefixGE = false
+	i.lastPositioningOp = unknownLastPositionOp
 	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
 		i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound)
 	} else {
@@ -608,7 +654,7 @@ func (i *Iterator) First() bool {
 func (i *Iterator) Last() bool {
 	i.err = nil // clear cached iteration error
 	i.hasPrefix = false
-	i.lastPositioningOpIsSeekPrefixGE = false
+	i.lastPositioningOp = unknownLastPositionOp
 	if upperBound := i.opts.GetUpperBound(); upperBound != nil {
 		i.iterKey, i.iterValue = i.iter.SeekLT(upperBound)
 	} else {
@@ -625,7 +671,7 @@ func (i *Iterator) Next() bool {
 	if i.err != nil {
 		return false
 	}
-	i.lastPositioningOpIsSeekPrefixGE = false
+	i.lastPositioningOp = unknownLastPositionOp
 	switch i.pos {
 	case iterPosCurForward:
 		i.nextUserKey()
@@ -676,7 +722,7 @@ func (i *Iterator) Prev() bool {
 	if i.err != nil {
 		return false
 	}
-	i.lastPositioningOpIsSeekPrefixGE = false
+	i.lastPositioningOp = unknownLastPositionOp
 	if i.hasPrefix {
 		i.err = errReversePrefixIteration
 		return false
@@ -808,8 +854,8 @@ func (i *Iterator) SetBounds(lower, upper []byte) {
 		return
 	}
 	// Even though this is not a positioning operation, the alteration of the
-	// bounds means we cannot optimize SeekPrefixGE by using Next.
-	i.lastPositioningOpIsSeekPrefixGE = false
+	// bounds means we cannot optimize Seeks by using Next.
+	i.lastPositioningOp = unknownLastPositionOp
 	i.hasPrefix = false
 	i.iterKey = nil
 	i.iterValue = nil
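A design note on the diff as a whole: the noop path is deliberately
conservative. It applies only when the immediately preceding positioning
operation was a seek of the same kind and the iterator is not reading
through a batch, and every other positioning or bounds-changing call
(Next, Prev, First, Last, SetBounds) resets lastPositioningOp to
unknownLastPositionOp. Continuing the earlier sketch (hypothetical keys):

iter.SeekGE([]byte("a")) // real seek; records seekGELastPositioningOp
iter.SeekGE([]byte("a")) // noop: same seek key, iterator already positioned
iter.Next()              // resets lastPositioningOp to unknownLastPositionOp
iter.SeekGE([]byte("a")) // real seek again; the noop check no longer applies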
