From b258e82af8f87af88c7af878e29acd79bc87a812 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 8 Mar 2022 22:42:51 -0500 Subject: [PATCH 1/3] storage: temporarily disable maxItersBeforeSeek This commit temporarily disables maxItersBeforeSeek by setting its value to 0. This reveals a bug that was hidden by `maxItersBeforeSeek` and is demonstrated by the change to `TestMVCCHistories/uncertainty_interval_with_local_uncertainty_limit`. The bug is due to incorrect key ordering of MVCC keys with synthetic timestamps. With the current encoding of MVCC key versions and the custom key comparator `EngineKeyCompare`, the inclusion of a synthetic bit in a key's verion timestamp causes the key's encoding to sort _before_ the same key/timestamp without a synthetic bit. This is because `EngineKeyCompare` considers the timestamp to be larger and it sorts versions in decreasing timestamp order. As a result, a seek to a version timestamp without a synthetic bit will seek past and fail to observe a value with that same timestamp but with a synthetic bit. In other words, a seek to timestamp `10,20` will fail to see a version stored at `10,20?`. This is unintended behavior, as the synthetic bit should not affect key ordering or key equality (see `MVCCKey.Equal`). This change will be mostly reverted in a later commit when the bug is fixed. --- pkg/storage/intent_interleaving_iter.go | 20 ++++++++++++---- pkg/storage/intent_interleaving_iter_test.go | 6 ++--- pkg/storage/pebble_mvcc_scanner.go | 15 ++++++++---- ...inty_interval_with_local_uncertainty_limit | 24 +++++++------------ 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index f2acc81583c5..07ec2b1d4e6e 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -737,12 +737,24 @@ func (i *intentInterleavingIter) SeekLT(key MVCCKey) { return } if i.constraint != notConstrained { - i.checkConstraint(key.Key, true) - if i.constraint == constrainedToLocal && bytes.Equal(key.Key, keys.LocalMax) { + // If the seek key of SeekLT is the boundary between the local and global + // keyspaces, iterators constrained in either direction are permitted. + // Iterators constrained to the local keyspace may be scanning from their + // upper bound. Iterators constrained to the global keyspace may have found + // a key on the boundary and may now be scanning before the key, using the + // boundary as an exclusive upper bound. + // NB: an iterator with bounds [L, U) is allowed to SeekLT over any key in + // [L, U]. For local keyspace iterators, U can be LocalMax and for global + // keyspace iterators L can be LocalMax. + localMax := bytes.Equal(key.Key, keys.LocalMax) + if !localMax { + i.checkConstraint(key.Key, true) + } + if localMax && i.constraint == constrainedToLocal { // Move it down to below the lock table so can iterate down cleanly into // the local key space. Note that we disallow anyone using a seek key - // that is a local key above the lock table, and there should no keys in - // the engine there either (at least not keys that we need to see using + // that is a local key above the lock table, and there should be no keys + // in the engine there either (at least not keys that we need to see using // an MVCCIterator). key.Key = keys.LocalRangeLockTablePrefix } diff --git a/pkg/storage/intent_interleaving_iter_test.go b/pkg/storage/intent_interleaving_iter_test.go index 52be98904e4f..cd586706b286 100644 --- a/pkg/storage/intent_interleaving_iter_test.go +++ b/pkg/storage/intent_interleaving_iter_test.go @@ -413,7 +413,7 @@ func TestIntentInterleavingIterBoundaries(t *testing.T) { defer iter.Close() iter.SeekLT(MVCCKey{Key: keys.MaxKey}) }) - // Boundary cases for constrainedToGlobal + // Boundary cases for constrainedToGlobal. func() { opts := IterOptions{LowerBound: keys.LocalMax} iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) @@ -427,13 +427,13 @@ func TestIntentInterleavingIterBoundaries(t *testing.T) { require.Equal(t, constrainedToGlobal, iter.constraint) iter.SetUpperBound(keys.LocalMax) }) - require.Panics(t, func() { + func() { opts := IterOptions{LowerBound: keys.LocalMax} iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) defer iter.Close() require.Equal(t, constrainedToGlobal, iter.constraint) iter.SeekLT(MVCCKey{Key: keys.LocalMax}) - }) + }() // Panics for using a local key that is above the lock table. require.Panics(t, func() { opts := IterOptions{UpperBound: keys.LocalMax} diff --git a/pkg/storage/pebble_mvcc_scanner.go b/pkg/storage/pebble_mvcc_scanner.go index b9f714db37f3..54b2ccd2c7c7 100644 --- a/pkg/storage/pebble_mvcc_scanner.go +++ b/pkg/storage/pebble_mvcc_scanner.go @@ -29,7 +29,7 @@ import ( ) const ( - maxItersBeforeSeek = 10 + maxItersBeforeSeek = 0 // Key value lengths take up 8 bytes (2 x Uint32). kvLenSize = 8 @@ -356,7 +356,7 @@ type pebbleMVCCScanner struct { // Stores any error returned. If non-nil, iteration short circuits. err error // Number of iterations to try before we do a Seek/SeekReverse. Stays within - // [1, maxItersBeforeSeek] and defaults to maxItersBeforeSeek/2 . + // [0, maxItersBeforeSeek] and defaults to maxItersBeforeSeek/2 . itersBeforeSeek int } @@ -483,8 +483,8 @@ func (p *pebbleMVCCScanner) incrementItersBeforeSeek() { // Decrements itersBeforeSeek while ensuring it stays positive. func (p *pebbleMVCCScanner) decrementItersBeforeSeek() { p.itersBeforeSeek-- - if p.itersBeforeSeek < 1 { - p.itersBeforeSeek = 1 + if p.itersBeforeSeek < 0 { + p.itersBeforeSeek = 0 } } @@ -971,6 +971,13 @@ func (p *pebbleMVCCScanner) addAndAdvance( func (p *pebbleMVCCScanner) seekVersion( ctx context.Context, seekTS hlc.Timestamp, uncertaintyCheck bool, ) bool { + if seekTS.IsEmpty() { + // If the seek timestamp is empty, we've already seen all versions of this + // key, so seek to the next key. Seeking to version zero of the current key + // would be incorrect, as version zero is stored before all other versions. + return p.advanceKey() + } + seekKey := MVCCKey{Key: p.curUnsafeKey.Key, Timestamp: seekTS} p.keyBuf = EncodeMVCCKeyToBuf(p.keyBuf[:0], seekKey) origKey := p.keyBuf[:len(p.curUnsafeKey.Key)] diff --git a/pkg/storage/testdata/mvcc_histories/uncertainty_interval_with_local_uncertainty_limit b/pkg/storage/testdata/mvcc_histories/uncertainty_interval_with_local_uncertainty_limit index 41f854262a02..5f1252ea8085 100644 --- a/pkg/storage/testdata/mvcc_histories/uncertainty_interval_with_local_uncertainty_limit +++ b/pkg/storage/testdata/mvcc_histories/uncertainty_interval_with_local_uncertainty_limit @@ -200,17 +200,15 @@ scan t=txn1 k=k1 localUncertaintyLimit=5,0 ---- scan: "k1"-"k1\x00" -> -run error +run ok get t=txn1 k=k2 localUncertaintyLimit=5,0 ---- get: "k2" -> -error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] -run error +run ok scan t=txn1 k=k2 localUncertaintyLimit=5,0 ---- scan: "k2"-"k2\x00" -> -error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] run ok get t=txn1 k=k3 localUncertaintyLimit=5,0 @@ -222,17 +220,15 @@ scan t=txn1 k=k3 localUncertaintyLimit=5,0 ---- scan: "k3"-"k3\x00" -> -run error +run ok get t=txn1 k=k4 localUncertaintyLimit=5,0 ---- get: "k4" -> -error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] -run error +run ok scan t=txn1 k=k4 localUncertaintyLimit=5,0 ---- scan: "k4"-"k4\x00" -> -error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] run ok get t=txn1 k=k5 localUncertaintyLimit=5,0 @@ -244,17 +240,15 @@ scan t=txn1 k=k5 localUncertaintyLimit=5,0 ---- scan: "k5"-"k5\x00" -> -run error +run ok get t=txn1 k=k6 localUncertaintyLimit=5,0 ---- get: "k6" -> -error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] -run error +run ok scan t=txn1 k=k6 localUncertaintyLimit=5,0 ---- scan: "k6"-"k6\x00" -> -error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] run ok get t=txn1 k=k7 localUncertaintyLimit=5,0 @@ -266,17 +260,15 @@ scan t=txn1 k=k7 localUncertaintyLimit=5,0 ---- scan: "k7"-"k7\x00" -> -run error +run ok get t=txn1 k=k8 localUncertaintyLimit=5,0 ---- get: "k8" -> -error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] -run error +run ok scan t=txn1 k=k8 localUncertaintyLimit=5,0 ---- scan: "k8"-"k8\x00" -> -error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] run ok From dbd892505643c47e15e7aacc2f2a4629ab091332 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Wed, 9 Mar 2022 00:23:57 -0500 Subject: [PATCH 2/3] storage: normalize MVCC version timestamps during key comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the bug revealed in the previous commit and sets the stage for generalized MVCC version normalization during key comparison, which will be needed for https://github.com/cockroachdb/cockroach/pull/77342. To do so, the commit adds a normalization step to EngineKeyCompare, the custom `Compare` function we provide to Pebble. This normalization pass currently strips the synthetic bit from version timestamps, which fixes the bug revealed in the previous commit. The normalization pass also strips zero-valued logical components, which are typically not encoded into MVCCKeys today, but may be in the future (for instance, see https://github.com/cockroachdb/pebble/pull/1314). In #77342, we can then extend this to strip the encoded logical timestamp, if present. In addition to updating the existing custom key comparator function passed to Pebble, the commit also introduces a new custom key equality function. This new function, called EngineKeyEqual, is provided to Pebble as its `Equal` function, replacing the default key equality function of `bytes.Equal`. EngineKeyEqual uses the same version normalization rules to strip portions of the key's version that should not affect ordering. The relationship between the different comparators is explored in a new property based unit test called `TestMVCCKeyCompareRandom`. The change allows us to say that for any two `MVCCKeys` `a` and `b`, the following identities hold: ``` a.Compare(b) = EngineKeyCompare(encode(a), encode(b)) a.Equal(b) = EngineKeyEqual(encode(a), encode(b)) (a.Compare(b) == 0) = a.Equal(b) (a.Compare(b) < 0) = a.Less(b) (a.Compare(b) > 0) = b.Less(a) ``` Care was taken to minimize the cost of this version normalization. With EngineKeyCompare, the normalization amounts to four new branches that should all be easy for a branch predictor to learn. With EngineKeyEqual, there is more of a concern that this change will regress performance because we switch from a direct call to `bytes.Equal` to a custom comparator. To minimize this cost, the change adds a fast-path to quickly defer to `bytes.Equal` when version normalization is not needed. Benchmarks show that with this fast-path and with an expected distribution of keys, the custom key equality function is about 2.5ns more expensive per call. This seems reasonable. ``` name time/op MVCCKeyCompare-10 12.2ns ± 1% MVCCKeyEqual-10 7.10ns ± 6% BytesEqual-10 4.72ns ± 2% ``` Release note: None. Release justification: None. Not intended for v22.1. --- pkg/storage/engine_key.go | 1 + pkg/storage/mvcc_key_test.go | 108 +++++++++++++++++- pkg/storage/pebble.go | 103 ++++++++++++++++- pkg/storage/pebble_test.go | 39 +++++-- ...inty_interval_with_local_uncertainty_limit | 24 ++-- pkg/testutils/lint/lint_test.go | 1 + 6 files changed, 247 insertions(+), 29 deletions(-) diff --git a/pkg/storage/engine_key.go b/pkg/storage/engine_key.go index 6fe0fccaae48..fcf3b89dbfc2 100644 --- a/pkg/storage/engine_key.go +++ b/pkg/storage/engine_key.go @@ -42,6 +42,7 @@ type EngineKey struct { // their particular use case, that demultiplex on the various lengths below. // If adding another length to this list, remember to search for code // referencing these lengths and fix it. +// TODO(nvanbenschoten): unify these constants with those in mvcc_key.go. const ( engineKeyNoVersion = 0 engineKeyVersionWallTimeLen = 8 diff --git a/pkg/storage/mvcc_key_test.go b/pkg/storage/mvcc_key_test.go index 093ee7e88101..4e8a2ea3d5b0 100644 --- a/pkg/storage/mvcc_key_test.go +++ b/pkg/storage/mvcc_key_test.go @@ -15,9 +15,11 @@ import ( "encoding/hex" "fmt" "math" + "math/rand" "reflect" "sort" "testing" + "testing/quick" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/hlc" @@ -71,6 +73,7 @@ func TestMVCCKeyCompare(t *testing.T) { b0 := MVCCKey{roachpb.Key("b"), hlc.Timestamp{Logical: 0}} b1 := MVCCKey{roachpb.Key("b"), hlc.Timestamp{Logical: 1}} b2 := MVCCKey{roachpb.Key("b"), hlc.Timestamp{Logical: 2}} + b2S := MVCCKey{roachpb.Key("b"), hlc.Timestamp{Logical: 2, Synthetic: true}} testcases := map[string]struct { a MVCCKey @@ -85,14 +88,76 @@ func TestMVCCKeyCompare(t *testing.T) { "empty time lt set": {b0, b1, -1}, // empty MVCC timestamps sort before non-empty "set time gt empty": {b1, b0, 1}, // empty MVCC timestamps sort before non-empty "key time precedence": {a1, b2, -1}, // a before b, but 2 before 1; key takes precedence + "synthetic equal": {b2, b2S, 0}, // synthetic bit does not affect ordering } for name, tc := range testcases { t.Run(name, func(t *testing.T) { require.Equal(t, tc.expect, tc.a.Compare(tc.b)) + require.Equal(t, tc.expect == 0, tc.a.Equal(tc.b)) + require.Equal(t, tc.expect < 0, tc.a.Less(tc.b)) + require.Equal(t, tc.expect > 0, tc.b.Less(tc.a)) + + // Comparators on encoded keys should be identical. + aEnc, bEnc := EncodeMVCCKey(tc.a), EncodeMVCCKey(tc.b) + require.Equal(t, tc.expect, EngineKeyCompare(aEnc, bEnc)) + require.Equal(t, tc.expect == 0, EngineKeyEqual(aEnc, bEnc)) }) } } +func TestMVCCKeyCompareRandom(t *testing.T) { + defer leaktest.AfterTest(t)() + + f := func(aGen, bGen randMVCCKey) bool { + a, b := MVCCKey(aGen), MVCCKey(bGen) + aEnc, bEnc := EncodeMVCCKey(a), EncodeMVCCKey(b) + + cmp := a.Compare(b) + cmpEnc := EngineKeyCompare(aEnc, bEnc) + eq := a.Equal(b) + eqEnc := EngineKeyEqual(aEnc, bEnc) + lessAB := a.Less(b) + lessBA := b.Less(a) + + if cmp != cmpEnc { + t.Logf("cmp (%v) != cmpEnc (%v)", cmp, cmpEnc) + return false + } + if eq != eqEnc { + t.Logf("eq (%v) != eqEnc (%v)", eq, eqEnc) + return false + } + if (cmp == 0) != eq { + t.Logf("(cmp == 0) (%v) != eq (%v)", cmp == 0, eq) + return false + } + if (cmp < 0) != lessAB { + t.Logf("(cmp < 0) (%v) != lessAB (%v)", cmp < 0, lessAB) + return false + } + if (cmp > 0) != lessBA { + t.Logf("(cmp > 0) (%v) != lessBA (%v)", cmp > 0, lessBA) + return false + } + return true + } + require.NoError(t, quick.Check(f, nil)) +} + +// randMVCCKey is a quick.Generator for MVCCKey. +type randMVCCKey MVCCKey + +func (k randMVCCKey) Generate(r *rand.Rand, size int) reflect.Value { + k.Key = []byte([...]string{"a", "b", "c"}[r.Intn(3)]) + k.Timestamp.WallTime = r.Int63n(5) + k.Timestamp.Logical = r.Int31n(5) + if !k.Timestamp.IsEmpty() { + // NB: the zero timestamp cannot be synthetic. + k.Timestamp.Synthetic = r.Intn(2) != 0 + } + return reflect.ValueOf(k) +} + func TestEncodeDecodeMVCCKeyAndTimestampWithLength(t *testing.T) { defer leaktest.AfterTest(t)() @@ -113,7 +178,6 @@ func TestEncodeDecodeMVCCKeyAndTimestampWithLength(t *testing.T) { "all": {"foo", hlc.Timestamp{WallTime: 1643550788737652545, Logical: 65535, Synthetic: true}, "666f6f0016cf10bc050557410000ffff010e"}, } for name, tc := range testcases { - tc := tc t.Run(name, func(t *testing.T) { // Test Encode/DecodeMVCCKey. @@ -172,6 +236,46 @@ func TestEncodeDecodeMVCCKeyAndTimestampWithLength(t *testing.T) { } } +func TestDecodeUnnormalizedMVCCKey(t *testing.T) { + defer leaktest.AfterTest(t)() + + testcases := map[string]struct { + encoded string // hex-encoded + expected MVCCKey + equalToNormal bool + }{ + "zero logical": { + encoded: "666f6f0016cf10bc05055741000000000d", + expected: MVCCKey{Key: []byte("foo"), Timestamp: hlc.Timestamp{WallTime: 1643550788737652545, Logical: 0}}, + equalToNormal: true, + }, + "zero walltime and logical": { + encoded: "666f6f000000000000000000000000000d", + expected: MVCCKey{Key: []byte("foo"), Timestamp: hlc.Timestamp{WallTime: 0, Logical: 0}}, + // We could normalize this form in EngineKeyEqual and EngineKeyCompare, + // but doing so is not worth losing the fast-path byte comparison between + // keys that only contain (at most) a walltime. + equalToNormal: false, + }, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + encoded, err := hex.DecodeString(tc.encoded) + require.NoError(t, err) + + decoded, err := DecodeMVCCKey(encoded) + require.NoError(t, err) + require.Equal(t, tc.expected, decoded) + + // Re-encode the key into its normal form. + reencoded := EncodeMVCCKey(decoded) + require.NotEqual(t, encoded, reencoded) + require.Equal(t, tc.equalToNormal, EngineKeyEqual(encoded, reencoded)) + require.Equal(t, tc.equalToNormal, EngineKeyCompare(encoded, reencoded) == 0) + }) + } +} + func TestDecodeMVCCKeyErrors(t *testing.T) { defer leaktest.AfterTest(t)() @@ -185,7 +289,6 @@ func TestDecodeMVCCKeyErrors(t *testing.T) { "invalid timestamp length suffix": {"ab00ffffffffffffffff0f", "invalid encoded mvcc key: ab00ffffffffffffffff0f"}, } for name, tc := range testcases { - tc := tc t.Run(name, func(t *testing.T) { encoded, err := hex.DecodeString(tc.encoded) require.NoError(t, err) @@ -208,7 +311,6 @@ func TestDecodeMVCCTimestampSuffixErrors(t *testing.T) { "invalid length suffix": {"ffffffffffffffff0f", "bad timestamp: found length suffix 15, actual length 9"}, } for name, tc := range testcases { - tc := tc t.Run(name, func(t *testing.T) { encoded, err := hex.DecodeString(tc.encoded) require.NoError(t, err) diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 2457a6798697..ccc54793d8ce 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -117,17 +117,106 @@ func EngineKeyCompare(a, b []byte) int { // Compare the version part of the key. Note that when the version is a // timestamp, the timestamp encoding causes byte comparison to be equivalent // to timestamp comparison. - aTS := a[aSep:aEnd] - bTS := b[bSep:bEnd] - if len(aTS) == 0 { - if len(bTS) == 0 { + aVer := a[aSep:aEnd] + bVer := b[bSep:bEnd] + if len(aVer) == 0 { + if len(bVer) == 0 { return 0 } return -1 - } else if len(bTS) == 0 { + } else if len(bVer) == 0 { return 1 } - return bytes.Compare(bTS, aTS) + aVer = normalizeEngineKeyVersionForCompare(aVer) + bVer = normalizeEngineKeyVersionForCompare(bVer) + return bytes.Compare(bVer, aVer) +} + +// EngineKeyEqual checks for equality of cockroach keys, including the version +// (which could be MVCC timestamps). +func EngineKeyEqual(a, b []byte) bool { + // NB: For performance, this routine manually splits the key into the + // user-key and version components rather than using DecodeEngineKey. In + // most situations, use DecodeEngineKey or GetKeyPartFromEngineKey or + // SplitMVCCKey instead of doing this. + aEnd := len(a) - 1 + bEnd := len(b) - 1 + if aEnd < 0 || bEnd < 0 { + // This should never happen unless there is some sort of corruption of + // the keys. + return bytes.Equal(a, b) + } + + // Last byte is the version length + 1 when there is a version, + // else it is 0. + aVerLen := int(a[aEnd]) + bVerLen := int(b[bEnd]) + + // Fast-path. If the key version is empty or contains only a walltime + // component then normalizeEngineKeyVersionForCompare is a no-op, so we don't + // need to split the "user key" from the version suffix before comparing to + // compute equality. Instead, we can check for byte equality immediately. + const withWall = mvccEncodedTimeSentinelLen + mvccEncodedTimeWallLen + if aVerLen <= withWall && bVerLen <= withWall { + return bytes.Equal(a, b) + } + + // Compute the index of the separator between the key and the version. If the + // separator is found to be at -1 for both keys, then we are comparing bare + // suffixes without a user key part. Pebble requires bare suffixes to be + // comparable with the same ordering as if they had a common user key. + aSep := aEnd - aVerLen + bSep := bEnd - bVerLen + if aSep == -1 && bSep == -1 { + aSep, bSep = 0, 0 // comparing bare suffixes + } + if aSep < 0 || bSep < 0 { + // This should never happen unless there is some sort of corruption of + // the keys. + return bytes.Equal(a, b) + } + + // Compare the "user key" part of the key. + if !bytes.Equal(a[:aSep], b[:bSep]) { + return false + } + + // Compare the version part of the key. + aVer := a[aSep:aEnd] + bVer := b[bSep:bEnd] + aVer = normalizeEngineKeyVersionForCompare(aVer) + bVer = normalizeEngineKeyVersionForCompare(bVer) + return bytes.Equal(aVer, bVer) +} + +var zeroLogical [mvccEncodedTimeLogicalLen]byte + +//gcassert:inline +func normalizeEngineKeyVersionForCompare(a []byte) []byte { + // In general, the version could also be a non-timestamp version, but we know + // that engineKeyVersionLockTableLen+mvccEncodedTimeSentinelLen is a different + // constant than the above, so there is no danger here of stripping parts from + // a non-timestamp version. + const withWall = mvccEncodedTimeSentinelLen + mvccEncodedTimeWallLen + const withLogical = withWall + mvccEncodedTimeLogicalLen + const withSynthetic = withLogical + mvccEncodedTimeSyntheticLen + if len(a) == withSynthetic { + // Strip the synthetic bit component from the timestamp version. The + // presence of the synthetic bit does not affect key ordering or equality. + a = a[:withLogical] + } + if len(a) == withLogical { + // If the timestamp version contains a logical timestamp component that is + // zero, strip the component. encodeMVCCTimestampToBuf will typically omit + // the entire logical component in these cases as an optimization, but it + // does not guarantee to never include a zero logical component. + // Additionally, we can fall into this case after stripping off other + // components of the key version earlier on in this function. + if bytes.Equal(a[withWall:], zeroLogical[:]) { + a = a[:withWall] + } + } + return a } // EngineComparer is a pebble.Comparer object that implements MVCC-specific @@ -135,6 +224,8 @@ func EngineKeyCompare(a, b []byte) int { var EngineComparer = &pebble.Comparer{ Compare: EngineKeyCompare, + Equal: EngineKeyEqual, + AbbreviatedKey: func(k []byte) uint64 { key, ok := GetKeyPartFromEngineKey(k) if !ok { diff --git a/pkg/storage/pebble_test.go b/pkg/storage/pebble_test.go index d087eb1715fe..44409107043d 100644 --- a/pkg/storage/pebble_test.go +++ b/pkg/storage/pebble_test.go @@ -14,7 +14,6 @@ import ( "bytes" "context" "fmt" - "io/ioutil" "math" "math/rand" "path/filepath" @@ -602,26 +601,42 @@ func TestPebbleIterConsistency(t *testing.T) { } func BenchmarkMVCCKeyCompare(b *testing.B) { + keys := makeRandEncodedKeys() + b.ResetTimer() + for i, j := 0, 0; i < b.N; i, j = i+1, j+3 { + _ = EngineKeyCompare(keys[i%len(keys)], keys[j%len(keys)]) + } +} + +func BenchmarkMVCCKeyEqual(b *testing.B) { + keys := makeRandEncodedKeys() + b.ResetTimer() + for i, j := 0, 0; i < b.N; i, j = i+1, j+3 { + _ = EngineKeyEqual(keys[i%len(keys)], keys[j%len(keys)]) + } +} + +func makeRandEncodedKeys() [][]byte { rng := rand.New(rand.NewSource(timeutil.Now().Unix())) keys := make([][]byte, 1000) for i := range keys { k := MVCCKey{ - Key: randutil.RandBytes(rng, 8), + Key: []byte("shared" + [...]string{"a", "b", "c"}[rng.Intn(3)]), Timestamp: hlc.Timestamp{ - WallTime: int64(rng.Intn(5)), + WallTime: rng.Int63n(5), }, } + if rng.Int31n(5) == 0 { + // 20% of keys have a logical component. + k.Timestamp.Logical = rng.Int31n(4) + 1 + } + if rng.Int31n(1000) == 0 && !k.Timestamp.IsEmpty() { + // 0.1% of keys have a synthetic component. + k.Timestamp.Synthetic = true + } keys[i] = EncodeMVCCKey(k) } - - b.ResetTimer() - var c int - for i, j := 0, 0; i < b.N; i, j = i+1, j+3 { - c = EngineKeyCompare(keys[i%len(keys)], keys[j%len(keys)]) - } - if testing.Verbose() { - fmt.Fprint(ioutil.Discard, c) - } + return keys } type testValue struct { diff --git a/pkg/storage/testdata/mvcc_histories/uncertainty_interval_with_local_uncertainty_limit b/pkg/storage/testdata/mvcc_histories/uncertainty_interval_with_local_uncertainty_limit index 5f1252ea8085..41f854262a02 100644 --- a/pkg/storage/testdata/mvcc_histories/uncertainty_interval_with_local_uncertainty_limit +++ b/pkg/storage/testdata/mvcc_histories/uncertainty_interval_with_local_uncertainty_limit @@ -200,15 +200,17 @@ scan t=txn1 k=k1 localUncertaintyLimit=5,0 ---- scan: "k1"-"k1\x00" -> -run ok +run error get t=txn1 k=k2 localUncertaintyLimit=5,0 ---- get: "k2" -> +error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] -run ok +run error scan t=txn1 k=k2 localUncertaintyLimit=5,0 ---- scan: "k2"-"k2\x00" -> +error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] run ok get t=txn1 k=k3 localUncertaintyLimit=5,0 @@ -220,15 +222,17 @@ scan t=txn1 k=k3 localUncertaintyLimit=5,0 ---- scan: "k3"-"k3\x00" -> -run ok +run error get t=txn1 k=k4 localUncertaintyLimit=5,0 ---- get: "k4" -> +error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] -run ok +run error scan t=txn1 k=k4 localUncertaintyLimit=5,0 ---- scan: "k4"-"k4\x00" -> +error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] run ok get t=txn1 k=k5 localUncertaintyLimit=5,0 @@ -240,15 +244,17 @@ scan t=txn1 k=k5 localUncertaintyLimit=5,0 ---- scan: "k5"-"k5\x00" -> -run ok +run error get t=txn1 k=k6 localUncertaintyLimit=5,0 ---- get: "k6" -> +error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] -run ok +run error scan t=txn1 k=k6 localUncertaintyLimit=5,0 ---- scan: "k6"-"k6\x00" -> +error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] run ok get t=txn1 k=k7 localUncertaintyLimit=5,0 @@ -260,15 +266,17 @@ scan t=txn1 k=k7 localUncertaintyLimit=5,0 ---- scan: "k7"-"k7\x00" -> -run ok +run error get t=txn1 k=k8 localUncertaintyLimit=5,0 ---- get: "k8" -> +error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] -run ok +run error scan t=txn1 k=k8 localUncertaintyLimit=5,0 ---- scan: "k8"-"k8\x00" -> +error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 5.000000000,0 encountered previous write with future timestamp 10.000000000,0? within uncertainty interval `t <= (local=5.000000000,0, global=10.000000000,0)`; observed timestamps: [] run ok diff --git a/pkg/testutils/lint/lint_test.go b/pkg/testutils/lint/lint_test.go index adebc98eb49f..8d0e18c98659 100644 --- a/pkg/testutils/lint/lint_test.go +++ b/pkg/testutils/lint/lint_test.go @@ -1996,6 +1996,7 @@ func TestLint(t *testing.T) { "../../sql/colfetcher", "../../sql/row", "../../kv/kvclient/rangecache", + "../../storage", ); err != nil { t.Fatal(err) } From 39fbf414a8b5147318e9d05a4986225c84108ef9 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Wed, 9 Mar 2022 00:26:12 -0500 Subject: [PATCH 3/3] storage: make maxItersBeforeSeek metamorphic To prevent maxItersBeforeSeek from hidding inconsistencies between seeking and iterating in the future, the commit makes the value metamorphic, with a range of [0,3). Going forward, unit tests will use different values for this heuristic. Release justification: None. Not intended for v22.1. --- pkg/storage/pebble_mvcc_scanner.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/storage/pebble_mvcc_scanner.go b/pkg/storage/pebble_mvcc_scanner.go index 54b2ccd2c7c7..b5a4ecb12e09 100644 --- a/pkg/storage/pebble_mvcc_scanner.go +++ b/pkg/storage/pebble_mvcc_scanner.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/uncertainty" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -28,11 +29,14 @@ import ( "github.com/cockroachdb/pebble" ) -const ( - maxItersBeforeSeek = 0 +// Key value lengths take up 8 bytes (2 x Uint32). +const kvLenSize = 8 - // Key value lengths take up 8 bytes (2 x Uint32). - kvLenSize = 8 +var maxItersBeforeSeek = util.ConstantWithMetamorphicTestRange( + "mvcc-max-iters-before-seek", + 10, /* defaultValue */ + 0, /* min */ + 3, /* max */ ) // Struct to store MVCCScan / MVCCGet in the same binary format as that