storage: performance improvements for intent resolution

When intents are separated, the version suffix is the txn UUID, which bears no ordering relationship to preceding txns. In contrast, with interleaved intents, which reuse the same key, the latest intent has the highest Pebble seqnum. There is another way in which way separated intents differ: usually we use a SingleDelete to remove a previous intent written using a Set. When these meet as part of a flush or compaction, both disappear. In contrast, with interleaved intents, we write Delete to remove a previous intent. When they meet in a compaction/flush, the Set disappears, but the Delete will not typically disappear until it reaches L6 (if there are any overlapping files in lower levels). So if Delete/SingleDelete have been compacted, there are usally fewer obsolete key seqnums in Pebble with SingleDelete. We consider three kinds of reads when many intents have been written and resolved for a key, and compactions have not happened. Note, that this should not be common, especially in a node with write traffic spread across many keys and ranges that share the same LSM. But it has been observed in tests, and we may as well improve this behavior. - Read of a specific intent written by a txn: a Seek is used to find that intent. With interleaved intents, the highest seqnum for that key is that live intent. With separated intents, on average, that live intent will have half the deleted intents preceding it and half succeeding it. The current code for separated intents does not optimize, by using the txn UUID to skip past these deleted intents. This is fixed in this PR, and affects MVCCResolveWriteIntent. - Read of an intent followed by read of the latest version: This can happen on the read path. With interleaved intents, one does not need to traverse any Deletes/Sets before encountering the intent, but calling Next on the pebble.Iterator will cause it to traverse all these deleted intents. With separated intents, half of these will be traversed before the intent and half after. So the count that is traversed is the same. Additionally, any compaction/flush will help more with separated intents since both SingleDelete and Set disappear. - Read of all intents in a range: this happens with MVCCResolveWriteIntentRange. It uses Seeks to go from one intent to another. This is a bad case for separated intents since it needs to iterate over half the deleted intents for each live intent. I don't know of a way to optimize this, if indeed this is important to do, without changing the lock table key to prefix the txn UUID with the timestamp (which would make it longer and introduce expense). This PR adds 3 benchmarks that mimic the above scenarios: BenchmarkIntentResolution, BenchmarkIntentScan, BenchmarkIntentRangeResolution. These benchmarks vary the number of versions and how many are flushed. Note that for the 400 version case, some flushing happens even without the explicit flush. Also, these benchmarks don't populate lower levels so the tombstone elision code in Pebble will manage to elide even the Delete operations and not just the SingleDeletes when doing the flush. It adds the following optimizations: - MVCCIterator.SeekIntentGE can be used to seek to an intent for a particular txn UUID. It is used by MVCCResolveWriteIntent. - Avoid seeking twice for MVCCResolveWriteIntentRange since the iterator is already positioned correctly. - Memory allocation improvements to LockTableKey.ToEngineKey and intentInterleavingReader. Informs cockroachdb#41720 Release note: None
knz · Feb 18, 2021 · d1c91e0 · d1c91e0
1 parent 9171f18
commit d1c91e0
Show file tree

Hide file tree

Showing 12 changed files with 403 additions and 61 deletions.
diff --git a/pkg/keys/keys.go b/pkg/keys/keys.go
@@ -414,8 +414,9 @@ func QueueLastProcessedKey(key roachpb.RKey, queue string) roachpb.Key {
 }
 
 // LockTableSingleKey creates a key under which all single-key locks for the
-// given key can be found. buf is used as scratch-space to avoid allocations
-// -- its contents will be overwritten and not appended to.
+// given key can be found. buf is used as scratch-space, up to its capacity,
+// to avoid allocations -- its contents will be overwritten and not appended
+// to.
 // Note that there can be multiple locks for the given key, but those are
 // distinguished using the "version" which is not in scope of the keys
 // package.

diff --git a/pkg/kv/kvserver/spanset/batch.go b/pkg/kv/kvserver/spanset/batch.go
@@ -87,6 +87,12 @@ func (i *MVCCIterator) SeekGE(key storage.MVCCKey) {
 	i.checkAllowed(roachpb.Span{Key: key.Key}, true)
 }
 
+// SeekIntentGE is part of the storage.MVCCIterator interface.
+func (i *MVCCIterator) SeekIntentGE(key roachpb.Key, txnUUID uuid.UUID) {
+	i.i.SeekIntentGE(key, txnUUID)
+	i.checkAllowed(roachpb.Span{Key: key}, true)
+}
+
 // SeekLT is part of the storage.MVCCIterator interface.
 func (i *MVCCIterator) SeekLT(key storage.MVCCKey) {
 	i.i.SeekLT(key)

diff --git a/pkg/storage/bench_pebble_test.go b/pkg/storage/bench_pebble_test.go
@@ -18,6 +18,7 @@ import (
 
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
 	"github.com/cockroachdb/cockroach/pkg/util/encoding"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -52,6 +53,11 @@ func setupMVCCPebble(b testing.TB, dir string) Engine {
 }
 
 func setupMVCCInMemPebble(b testing.TB, loc string) Engine {
+	return setupMVCCInMemPebbleWithSettings(b, makeSettingsForSeparatedIntents(
+		false /* oldClusterVersion */, true /* enabled */))
+}
+
+func setupMVCCInMemPebbleWithSettings(b testing.TB, settings *cluster.Settings) Engine {
 	opts := DefaultPebbleOptions()
 	opts.FS = vfs.NewMem()
 	opts.Cache = pebble.NewCache(testCacheSize)
@@ -61,6 +67,9 @@ func setupMVCCInMemPebble(b testing.TB, loc string) Engine {
 		context.Background(),
 		PebbleConfig{
 			Opts: opts,
+			StorageConfig: base.StorageConfig{
+				Settings: settings,
+			},
 		})
 	if err != nil {
 		b.Fatalf("could not create new in-mem pebble instance: %+v", err)

diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go
@@ -33,7 +33,10 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/randutil"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
+	"github.com/cockroachdb/cockroach/pkg/util/uint128"
+	"github.com/cockroachdb/cockroach/pkg/util/uuid"
 	"github.com/cockroachdb/errors/oserror"
+	"github.com/stretchr/testify/require"
 )
 
 // Note: most benchmarks in this package have an engine-specific Benchmark
@@ -135,6 +138,218 @@ func BenchmarkExportToSst(b *testing.B) {
 	}
 }
 
+const numIntentKeys = 1000
+
+func setupKeysWithIntent(
+	b *testing.B, eng Engine, numVersions int, numFlushedVersions int,
+) roachpb.LockUpdate {
+	txnIDCount := 2 * numVersions
+	val := []byte("value")
+	var lockUpdate roachpb.LockUpdate
+	for i := 1; i <= numVersions; i++ {
+		// Assign txn IDs in a deterministic way that will mimic the end result of
+		// random assignment -- the live intent is centered between dead intents,
+		// when we have separated intents.
+		txnID := i
+		if i%2 == 0 {
+			txnID = txnIDCount - txnID
+		}
+		txnUUID := uuid.FromUint128(uint128.FromInts(0, uint64(txnID)))
+		ts := hlc.Timestamp{WallTime: int64(i)}
+		txn := roachpb.Transaction{
+			TxnMeta: enginepb.TxnMeta{
+				ID:             txnUUID,
+				Key:            []byte("foo"),
+				WriteTimestamp: ts,
+				MinTimestamp:   ts,
+			},
+			Status:                 roachpb.PENDING,
+			ReadTimestamp:          ts,
+			GlobalUncertaintyLimit: ts,
+		}
+		value := roachpb.Value{RawBytes: val}
+		batch := eng.NewBatch()
+		for j := 0; j < numIntentKeys; j++ {
+			key := makeKey(nil, j)
+			require.NoError(b, MVCCPut(context.Background(), batch, nil, key, ts, value, &txn))
+		}
+		require.NoError(b, batch.Commit(true))
+		batch.Close()
+		lockUpdate = roachpb.LockUpdate{
+			Txn:    txn.TxnMeta,
+			Status: roachpb.COMMITTED,
+		}
+		if i < numVersions {
+			batch := eng.NewBatch()
+			for j := 0; j < numIntentKeys; j++ {
+				key := makeKey(nil, j)
+				lockUpdate.Key = key
+				found, err := MVCCResolveWriteIntent(context.Background(), batch, nil, lockUpdate)
+				require.Equal(b, true, found)
+				require.NoError(b, err)
+			}
+			require.NoError(b, batch.Commit(true))
+			batch.Close()
+		}
+		if i == numFlushedVersions {
+			require.NoError(b, eng.Flush())
+		}
+	}
+	return lockUpdate
+}
+
+func BenchmarkIntentScan(b *testing.B) {
+	skip.UnderShort(b, "setting up unflushed data takes too long")
+	for _, sep := range []bool{false, true} {
+		b.Run(fmt.Sprintf("separated=%t", sep), func(b *testing.B) {
+			for _, numVersions := range []int{10, 100, 200, 400} {
+				b.Run(fmt.Sprintf("versions=%d", numVersions), func(b *testing.B) {
+					for _, percentFlushed := range []int{0, 50, 80, 90, 100} {
+						b.Run(fmt.Sprintf("percent-flushed=%d", percentFlushed), func(b *testing.B) {
+							eng := setupMVCCInMemPebbleWithSettings(
+								b, makeSettingsForSeparatedIntents(false, sep))
+							numFlushedVersions := (percentFlushed * numVersions) / 100
+							setupKeysWithIntent(b, eng, numVersions, numFlushedVersions)
+							lower := makeKey(nil, 0)
+							iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{
+								LowerBound: lower,
+								UpperBound: makeKey(nil, numIntentKeys),
+							})
+							iter.SeekGE(MVCCKey{Key: lower})
+							b.ResetTimer()
+							for i := 0; i < b.N; i++ {
+								valid, err := iter.Valid()
+								if err != nil {
+									b.Fatal(err)
+								}
+								if !valid {
+									iter.SeekGE(MVCCKey{Key: lower})
+								} else {
+									// Read intent.
+									k := iter.UnsafeKey()
+									if k.IsValue() {
+										b.Fatalf("expected intent %s", k.String())
+									}
+									// Read latest version.
+									//
+									// This Next dominates the cost of the benchmark when
+									// percent-flushed is < 100, since the pebble.Iterator has
+									// to iterate over all the Deletes/SingleDeletes/Sets
+									// corresponding to resolved intents.
+									iter.Next()
+									valid, err = iter.Valid()
+									if !valid || err != nil {
+										b.Fatalf("valid: %t, err: %s", valid, err)
+									}
+									k = iter.UnsafeKey()
+									if !k.IsValue() {
+										b.Fatalf("expected value")
+									}
+									// Skip to next key. This dominates the cost of the benchmark,
+									// when percent-flushed=100.
+									iter.NextKey()
+								}
+							}
+						})
+					}
+				})
+			}
+		})
+	}
+}
+
+func BenchmarkIntentResolution(b *testing.B) {
+	skip.UnderShort(b, "setting up unflushed data takes too long")
+	for _, sep := range []bool{false, true} {
+		b.Run(fmt.Sprintf("separated=%t", sep), func(b *testing.B) {
+			for _, numVersions := range []int{10, 100, 200, 400} {
+				b.Run(fmt.Sprintf("versions=%d", numVersions), func(b *testing.B) {
+					for _, percentFlushed := range []int{0, 50, 80, 90, 100} {
+						b.Run(fmt.Sprintf("percent-flushed=%d", percentFlushed), func(b *testing.B) {
+							eng := setupMVCCInMemPebbleWithSettings(
+								b, makeSettingsForSeparatedIntents(false, sep))
+							numFlushedVersions := (percentFlushed * numVersions) / 100
+							lockUpdate := setupKeysWithIntent(b, eng, numVersions, numFlushedVersions)
+							keys := make([]roachpb.Key, numIntentKeys)
+							for i := range keys {
+								keys[i] = makeKey(nil, i)
+							}
+							batch := eng.NewBatch()
+							b.ResetTimer()
+							for i := 0; i < b.N; i++ {
+								if i > 0 && i%numIntentKeys == 0 {
+									// Wrapped around.
+									b.StopTimer()
+									batch.Close()
+									batch = eng.NewBatch()
+									b.StartTimer()
+								}
+								lockUpdate.Key = keys[i%numIntentKeys]
+								found, err := MVCCResolveWriteIntent(context.Background(), batch, nil, lockUpdate)
+								if !found || err != nil {
+									b.Fatalf("intent not found or err %s", err)
+								}
+							}
+						})
+					}
+				})
+			}
+		})
+	}
+}
+
+func BenchmarkIntentRangeResolution(b *testing.B) {
+	skip.UnderShort(b, "setting up unflushed data takes too long")
+	for _, sep := range []bool{false, true} {
+		b.Run(fmt.Sprintf("separated=%t", sep), func(b *testing.B) {
+			for _, numVersions := range []int{10, 100, 200, 400} {
+				b.Run(fmt.Sprintf("versions=%d", numVersions), func(b *testing.B) {
+					for _, percentFlushed := range []int{0, 50, 80, 90, 100} {
+						b.Run(fmt.Sprintf("percent-flushed=%d", percentFlushed), func(b *testing.B) {
+							eng := setupMVCCInMemPebbleWithSettings(
+								b, makeSettingsForSeparatedIntents(false, sep))
+							numFlushedVersions := (percentFlushed * numVersions) / 100
+							lockUpdate := setupKeysWithIntent(b, eng, numVersions, numFlushedVersions)
+							keys := make([]roachpb.Key, numIntentKeys+1)
+							for i := range keys {
+								keys[i] = makeKey(nil, i)
+							}
+							keys[numIntentKeys] = makeKey(nil, numIntentKeys)
+							batch := eng.NewBatch()
+							numKeysPerRange := 100
+							numRanges := numIntentKeys / numKeysPerRange
+							b.ResetTimer()
+							for i := 0; i < b.N; i++ {
+								if i > 0 && i%numRanges == 0 {
+									// Wrapped around.
+									b.StopTimer()
+									batch.Close()
+									batch = eng.NewBatch()
+									b.StartTimer()
+								}
+								rangeNum := i % numRanges
+								lockUpdate.Key = keys[rangeNum*numKeysPerRange]
+								lockUpdate.EndKey = keys[(rangeNum+1)*numKeysPerRange]
+								resolved, span, err := MVCCResolveWriteIntentRange(
+									context.Background(), batch, nil, lockUpdate, 1000 /* max */)
+								if err != nil {
+									b.Fatal(err)
+								}
+								if resolved != int64(numKeysPerRange) {
+									b.Fatalf("expected to resolve %d, actual %d", numKeysPerRange, resolved)
+								}
+								if span != nil {
+									b.Fatal("unexpected resume span")
+								}
+							}
+						})
+					}
+				})
+			}
+		})
+	}
+}
+
 const overhead = 48 // Per key/value overhead (empirically determined)
 
 type engineMaker func(testing.TB, string) Engine

diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go
@@ -96,6 +96,14 @@ type MVCCIterator interface {
 	// in the iteration. After this call, Valid() will be true if the
 	// iterator was not positioned at the first key.
 	Prev()
+
+	// SeekIntentGE is a specialized version of SeekGE(MVCCKey{Key: key}), when
+	// the caller expects to find an intent, and additionally has the txnUUID
+	// for the intent it is looking for. When running with separated intents,
+	// this can optimize the behavior of the underlying Engine for write heavy
+	// keys by avoiding the need to iterate over many deleted intents.
+	SeekIntentGE(key roachpb.Key, txnUUID uuid.UUID)
+
 	// Key returns the current key.
 	Key() MVCCKey
 	// UnsafeRawKey returns the current raw key which could be an encoded

diff --git a/pkg/storage/engine_key.go b/pkg/storage/engine_key.go
@@ -259,8 +259,8 @@ func (lk LockTableKey) ToEngineKey(buf []byte) (EngineKey, []byte) {
 	estimatedLen :=
 		(len(keys.LocalRangeLockTablePrefix) + len(keys.LockTableSingleKeyInfix) + len(lk.Key) + 3) +
 			engineKeyVersionLockTableLen
-	if len(buf) < estimatedLen {
-		buf = make([]byte, estimatedLen)
+	if cap(buf) < estimatedLen {
+		buf = make([]byte, 0, estimatedLen)
 	}
 	ltKey, buf := keys.LockTableSingleKey(lk.Key, buf)
 	k := EngineKey{Key: ltKey}

diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go
@@ -16,10 +16,12 @@ import (
 	"sync"
 
 	"github.com/cockroachdb/cockroach/pkg/keys"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
 	"github.com/cockroachdb/cockroach/pkg/util"
 	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
+	"github.com/cockroachdb/cockroach/pkg/util/uuid"
 	"github.com/cockroachdb/errors"
 )
 
@@ -265,6 +267,32 @@ func (i *intentInterleavingIter) SeekGE(key MVCCKey) {
 	i.computePos()
 }
 
+func (i *intentInterleavingIter) SeekIntentGE(key roachpb.Key, txnUUID uuid.UUID) {
+	i.dir = +1
+	i.valid = true
+
+	if i.constraint != notConstrained {
+		i.checkConstraint(key, false)
+	}
+	var engineKey EngineKey
+	engineKey, i.intentKeyBuf = LockTableKey{
+		Key:      key,
+		Strength: lock.Exclusive,
+		TxnUUID:  txnUUID[:],
+	}.ToEngineKey(i.intentKeyBuf)
+	valid, err := i.intentIter.SeekEngineKeyGE(engineKey)
+	if err != nil {
+		i.err = err
+		i.valid = false
+		return
+	}
+	if err := i.tryDecodeLockKey(valid); err != nil {
+		return
+	}
+	i.iter.SeekGE(MVCCKey{Key: key})
+	i.computePos()
+}
+
 func (i *intentInterleavingIter) checkConstraint(k roachpb.Key, isExclusiveUpper bool) {
 	kConstraint := constrainedToGlobal
 	if isLocal(k) {