Skip to content

Commit

Permalink
storage: implement iter-before-seek optimization for LockTableIterator
Browse files Browse the repository at this point in the history
Informs cockroachdb#100193.

This commit addresses a TODO left by cockroachdb#110319 to implement an "iter
before seek" optimization in the LockTableIterator, similar to the one
that exists in the pebbleMVCCScanner. The optimization places an upper
bound on the number of iterations that a LockTableIterator that is
configured to ignore some or all shared locks will perform across the
shared locks on a single user key before seeking past them. This is used
to avoid iterating over all shared locks on a key when not necessary.

The optimization achieves the goal of avoiding cases of O(ignored_locks)
work in the LockTableIterator, instead performing at most
O(matching_locks + locked_keys) work. This is important for iteration
over the lock table (e.g. intentInterleavingIter), lock acquisition
(MVCCAcquireLock), and lock release (mvccReleaseLockInternal). There is
a caveat to these complexity bounds, however, in that they do not
consider LSM tombstones. This is being discussed in cockroachdb#110324.

Release note: None
  • Loading branch information
nvanbenschoten authored and Thomas Hardy committed Oct 4, 2023
1 parent 00f1957 commit 2d2d638
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 54 deletions.
3 changes: 3 additions & 0 deletions pkg/storage/intent_interleaving_iter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,9 @@ func checkAndOutputIter(iter MVCCIterator, b *strings.Builder) {
func TestIntentInterleavingIter(t *testing.T) {
defer leaktest.AfterTest(t)()

// Disable the metamorphic value for deterministic iteration stats.
DisableMetamorphicLockTableItersBeforeSeek(t)

var eng Engine
defer func() {
if eng != nil {
Expand Down
177 changes: 167 additions & 10 deletions pkg/storage/lock_table_iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -64,6 +65,10 @@ type LockTableIterator struct {
// If set, return locks held by any transaction with this strength or
// stronger.
matchMinStr lock.Strength
// Used to avoid iterating over all shared locks on a key when not necessary,
// given the filtering criteria. See the comment about "skip past locks" above
// for details about why this is important.
itersBeforeSeek lockTableItersBeforeSeekHelper
}

var _ EngineIterator = &LockTableIterator{}
Expand Down Expand Up @@ -127,9 +132,10 @@ func NewLockTableIterator(
}
ltIter := lockTableIteratorPool.Get().(*LockTableIterator)
*ltIter = LockTableIterator{
iter: iter,
matchTxnID: opts.MatchTxnID,
matchMinStr: opts.MatchMinStr,
iter: iter,
matchTxnID: opts.MatchTxnID,
matchMinStr: opts.MatchMinStr,
itersBeforeSeek: ltIter.itersBeforeSeek,
}
return ltIter, nil
}
Expand Down Expand Up @@ -248,6 +254,7 @@ func (i *LockTableIterator) PrevEngineKeyWithLimit(
func (i *LockTableIterator) advanceToMatchingLock(
dir int, limit roachpb.Key,
) (state pebble.IterValidityState, err error) {
defer i.itersBeforeSeek.reset()
for {
engineKey, err := i.iter.UnsafeEngineKey()
if err != nil {
Expand All @@ -261,14 +268,75 @@ func (i *LockTableIterator) advanceToMatchingLock(
return pebble.IterValid, nil
}

// TODO(nvanbenschoten): implement a maxItersBeforeSeek-like algorithm
// to skip over ignored locks and bound the work performed by the
// iterator for ignored locks.
// We found a non-matching lock. Determine whether to step or seek past it.
// We only ever seek if we found a shared lock, because no other locking
// strength allows for multiple locks to be held by different transactions
// on the same key.
var seek bool
if str == lock.Shared {
seek = i.itersBeforeSeek.shouldSeek(engineKey.Key)
}

if dir < 0 {
state, err = i.iter.PrevEngineKeyWithLimit(limit)
// Advance to the next key, either by stepping or seeking.
if seek {
ltKey, ltKeyErr := engineKey.ToLockTableKey()
if ltKeyErr != nil {
return 0, ltKeyErr
}
seekKeyBuf := &i.itersBeforeSeek.seekKeyBuf
var seekKey EngineKey
if dir < 0 {
// If iterating backwards and searching for locks held by a specific
// transaction, determine whether we have yet to reach key/shared/txnID
// or have already passed it. If we have not yet passed it, seek to the
// specific version, remembering to offset the txn ID by 1 to account
// for the exclusive reverse seek. Otherwise, seek past the maximum
// (first) txn ID to the previous locking strength (exclusive).
// NOTE: Recall that txnIDs in the lock table key version are ordered in
// reverse lexicographical order.
if i.matchTxnID != uuid.Nil && bytes.Compare(txnID.GetBytes(), i.matchTxnID.GetBytes()) < 0 {
// The subtraction cannot underflow because matchTxnID cannot be the
// zero UUID if we are in this branch, with the iterator positioned
// after the matchTxnID. Assert for good measure.
if i.matchTxnID == uuid.Nil {
panic("matchTxnID is unexpectedly the zero UUID")
}
ltKey.TxnUUID = uuid.FromUint128(i.matchTxnID.ToUint128().Sub(1))
seekKey, *seekKeyBuf = ltKey.ToEngineKey(*seekKeyBuf)
} else {
ltKey.TxnUUID = uuid.Max
seekKey, *seekKeyBuf = ltKey.ToEngineKey(*seekKeyBuf)
}
state, err = i.iter.SeekEngineKeyLTWithLimit(seekKey, limit)
} else {
// If iterating forwards and searching for locks held by a specific
// transaction, determine whether we have yet to reach /key/shared/txnID
// or have already passed it. If we have not yet passed it, seek to the
// specific version. Otherwise, seek to the next key prefix.
// NOTE: Recall that txnIDs in the lock table key version are ordered in
// reverse lexicographical order.
// NOTE: Recall that shared locks are ordered last for a given key.
if i.matchTxnID != uuid.Nil && bytes.Compare(txnID.GetBytes(), i.matchTxnID.GetBytes()) > 0 {
ltKey.TxnUUID = i.matchTxnID
seekKey, *seekKeyBuf = ltKey.ToEngineKey(*seekKeyBuf)
} else {
// TODO(nvanbenschoten): for now, we call SeekEngineKeyGEWithLimit
// with the prefix of the next lock table key. If EngineIterator
// exposed an interface that called NextPrefix(), we could use that
// instead. This will require adding a NextPrefixWithLimit() method
// to pebble.
var seekKeyPrefix roachpb.Key
seekKeyPrefix, *seekKeyBuf = keys.LockTableSingleNextKey(ltKey.Key, *seekKeyBuf)
seekKey = EngineKey{Key: seekKeyPrefix}
}
state, err = i.iter.SeekEngineKeyGEWithLimit(seekKey, limit)
}
} else {
state, err = i.iter.NextEngineKeyWithLimit(limit)
if dir < 0 {
state, err = i.iter.PrevEngineKeyWithLimit(limit)
} else {
state, err = i.iter.NextEngineKeyWithLimit(limit)
}
}
if state != pebble.IterValid || err != nil {
return state, err
Expand All @@ -288,7 +356,9 @@ func (i *LockTableIterator) matchingLock(str lock.Strength, txnID uuid.UUID) boo
// Close implements the EngineIterator interface.
func (i *LockTableIterator) Close() {
i.iter.Close()
*i = LockTableIterator{}
*i = LockTableIterator{
itersBeforeSeek: i.itersBeforeSeek,
}
lockTableIteratorPool.Put(i)
}

Expand Down Expand Up @@ -408,3 +478,90 @@ func checkLockTableKeyOrNil(key roachpb.Key) error {
}
return checkLockTableKey(key)
}

// defaultLockTableItersBeforeSeek is the default value for the
// lockTableItersBeforeSeek metamorphic value.
const defaultLockTableItersBeforeSeek = 5

// lockTableItersBeforeSeek is the number of iterations to perform across the
// shared locks on a single user key before seeking past them. This is used to
// avoid iterating over all shared locks on a key when not necessary, given the
// filtering criteria.
var lockTableItersBeforeSeek = util.ConstantWithMetamorphicTestRange(
"lock-table-iters-before-seek",
defaultLockTableItersBeforeSeek, /* defaultValue */
0, /* min */
3, /* max */
)

// DisableMetamorphicLockTableItersBeforeSeek disables the metamorphic value for
// the duration of a test, resetting it at the end.
func DisableMetamorphicLockTableItersBeforeSeek(t interface {
Helper()
Cleanup(func())
}) {
t.Helper()
prev := lockTableItersBeforeSeek
lockTableItersBeforeSeek = defaultLockTableItersBeforeSeek
t.Cleanup(func() {
lockTableItersBeforeSeek = prev
})
}

// lockTableItersBeforeSeekHelper is a helper struct that keeps track of the
// number of iterations performed across the shared locks on a single user key
// while searching for matching locks in the lock table. It is used to determine
// when to seek past the shared locks to avoid O(ignored_locks) work.
//
// This is similar to the dynamic itersBeforeSeek algorithm that is used by
// pebbleMVCCScanner when scanning over mvcc versions for a key. However, we
// don't adaptively adjust the number of itersBeforeSeek as we go. Instead, we
// reset the iteration counter to lockTableItersBeforeSeek (default: 5) on each
// new key prefix. Doing something more sophisticated introduces complexity and
// it's not clear that this is worth it.
//
// The zero value is ready to use.
type lockTableItersBeforeSeekHelper struct {
curItersBeforeSeek int
curKeyPrefix roachpb.Key

// Buffers that avoids allocations.
keyPrefixBuf []byte
seekKeyBuf []byte
}

func (h *lockTableItersBeforeSeekHelper) reset() {
// Clearing the curKeyPrefix ensures that the next call to shouldSeek() will
// save the new key prefix and reset curItersBeforeSeek. This is why the zero
// value of the struct is ready to use.
h.curKeyPrefix = nil
}

func (h *lockTableItersBeforeSeekHelper) shouldSeek(keyPrefix roachpb.Key) bool {
if h.alwaysSeek() {
return true
}
if !h.curKeyPrefix.Equal(keyPrefix) {
// New key prefix (or curKeyPrefix was nil). Save it and reset the iteration
// count.
h.saveKeyPrefix(keyPrefix)
h.curItersBeforeSeek = lockTableItersBeforeSeek
} else {
// Same key prefix as before. Check if we should seek.
if h.curItersBeforeSeek == 0 {
return true
}
}
h.curItersBeforeSeek--
return false
}

func (h *lockTableItersBeforeSeekHelper) alwaysSeek() bool {
// Only returns true in tests when the metamorphic value is set to 0.
return lockTableItersBeforeSeek == 0
}

func (h *lockTableItersBeforeSeekHelper) saveKeyPrefix(keyPrefix roachpb.Key) {
h.keyPrefixBuf = append(h.keyPrefixBuf[:0], keyPrefix...)
h.curKeyPrefix = h.keyPrefixBuf
}
42 changes: 42 additions & 0 deletions pkg/storage/lock_table_iterator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,9 @@ func TestLockTableIterator(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

// Disable the metamorphic value for deterministic iteration stats.
DisableMetamorphicLockTableItersBeforeSeek(t)

var eng Engine
defer func() {
if eng != nil {
Expand Down Expand Up @@ -639,3 +642,42 @@ func TestLockTableIteratorEquivalence(t *testing.T) {

require.NoError(t, quick.CheckEqual(lockTableIter, preFilterIter, nil))
}

func TestLockTableItersBeforeSeekHelper(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

// Disable the metamorphic value.
DisableMetamorphicLockTableItersBeforeSeek(t)

// Check that the value is 5.
require.Equal(t, 5, lockTableItersBeforeSeek)

var h lockTableItersBeforeSeekHelper
keyA, keyB := roachpb.Key("a"), roachpb.Key("b")

// Seek to keyA. Should start stepping.
require.False(t, h.shouldSeek(keyA))
// Step. Same key. Should step again.
require.False(t, h.shouldSeek(keyA))
// Step. Same key. Should step again.
require.False(t, h.shouldSeek(keyA))
// Step. Same key. Should step again.
require.False(t, h.shouldSeek(keyA))
// Step. Same key. Should step again.
require.False(t, h.shouldSeek(keyA))
// Step. Same key. Should start seeking.
require.True(t, h.shouldSeek(keyA))
// Seek. Same key. Should keep seeking if not new key prefix.
require.True(t, h.shouldSeek(keyA))
// Seek. New key. Should start stepping again.
require.False(t, h.shouldSeek(keyB))

// Test that the key is copied and not referenced.
for i := 0; i < lockTableItersBeforeSeek; i++ {
keyUnstable := roachpb.Key("unstable")
require.False(t, h.shouldSeek(keyUnstable))
keyUnstable[0] = 'a'
}
require.True(t, h.shouldSeek(roachpb.Key("unstable")))
}
4 changes: 2 additions & 2 deletions pkg/storage/pebble_mvcc_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ type pebbleMVCCScanner struct {
// Stores any error returned. If non-nil, iteration short circuits.
err error
// Number of iterations to try before we do a Seek/SeekReverse. Stays within
// [0, maxItersBeforeSeek] and defaults to maxItersBeforeSeek/2 .
// [0, maxItersBeforeSeek] and defaults to maxItersBeforeSeek/2.
itersBeforeSeek int
// machine is the state machine for how the iterator should be advanced in
// order to handle scans and reverse scans.
Expand Down Expand Up @@ -711,7 +711,7 @@ func (p *pebbleMVCCScanner) afterScan() (*roachpb.Span, kvpb.ResumeReason, int64
return nil, 0, 0, nil
}

// Increments itersBeforeSeek while ensuring it stays <= maxItersBeforeSeek
// Increments itersBeforeSeek while ensuring it stays <= maxItersBeforeSeek.
func (p *pebbleMVCCScanner) incrementItersBeforeSeek() {
p.itersBeforeSeek++
if p.itersBeforeSeek > maxItersBeforeSeek {
Expand Down
8 changes: 4 additions & 4 deletions pkg/storage/testdata/intent_interleaving_iter/basic
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ next: output: value k=b ts=30.000000000,0 v=b30
next: output: meta k=c
next: output: value k=d ts=25.000000000,0 v=d25
next: output: .
stats: (interface (dir, seek, step): (fwd, 2, 19), (rev, 0, 0)), (internal (dir, seek, step): (fwd, 2, 15), (rev, 0, 0))
stats: (interface (dir, seek, step): (fwd, 3, 18), (rev, 0, 0)), (internal (dir, seek, step): (fwd, 3, 14), (rev, 0, 0))
prev: output: value k=d ts=25.000000000,0 v=d25
prev: output: meta k=c
prev: output: value k=b ts=30.000000000,0 v=b30
Expand All @@ -81,16 +81,16 @@ prev: output: value k=a ts=10.000000000,0 v=a10
prev: output: value k=a ts=20.000000000,0 v=a20
prev: output: meta k=a ts=20.000000000,0 txn=1
prev: output: .
stats: (interface (dir, seek, step): (fwd, 2, 19), (rev, 0, 19)), (internal (dir, seek, step): (fwd, 2, 15), (rev, 2, 15))
stats: (interface (dir, seek, step): (fwd, 3, 18), (rev, 1, 18)), (internal (dir, seek, step): (fwd, 3, 14), (rev, 3, 15))
seek-ge "b"/0,0: output: meta k=b ts=30.000000000,0 txn=2
stats: (interface (dir, seek, step): (fwd, 4, 19), (rev, 0, 19)), (internal (dir, seek, step): (fwd, 4, 15), (rev, 2, 15))
stats: (interface (dir, seek, step): (fwd, 5, 18), (rev, 1, 18)), (internal (dir, seek, step): (fwd, 5, 14), (rev, 3, 15))
next: output: value k=b ts=30.000000000,0 v=b30
next: output: meta k=c
prev: output: value k=b ts=30.000000000,0 v=b30
prev: output: meta k=b ts=30.000000000,0 txn=2
prev: output: value k=a ts=10.000000000,0 v=a10
seek-lt "b"/0,0: output: value k=a ts=10.000000000,0 v=a10
stats: (interface (dir, seek, step): (fwd, 4, 23), (rev, 2, 27)), (internal (dir, seek, step): (fwd, 4, 18), (rev, 4, 26))
stats: (interface (dir, seek, step): (fwd, 5, 22), (rev, 3, 26)), (internal (dir, seek, step): (fwd, 5, 17), (rev, 5, 26))
next: output: meta k=b ts=30.000000000,0 txn=2
prev: output: value k=a ts=10.000000000,0 v=a10
prev: output: value k=a ts=20.000000000,0 v=a20
Expand Down
Loading

0 comments on commit 2d2d638

Please sign in to comment.