Skip to content

Commit

Permalink
storage: GC range keys by unsetting identical suffixes
Browse files Browse the repository at this point in the history
In CockroachDB's key encoding some keys have multiple logically equivalent but
physically distinct encodings. Most notably, in CockroachDB versions 23.2 and
earlier keys written to global tables encoded MVCC timestamps with a
'synthetic bit.' In cockroachdb#101938 CockroachDB stopped encoding and decoding this
synthetic bit, transparently ignoring it.

In cockroachdb#129592 we observed the existence of a bug in the CockroachDB comparator
when comparing two MVCC timestamp suffixes, specifically outside the context of
a full MVCC key. The comparator failed to consider a timestamp with the
synthetic bit and a timestamp without the synthetic bit as logically
equivalent. There are limited instances where Pebble uses the comparator to
compare "bare suffixes," and all instances are constrained to the
implementation of range keys.

In cockroachdb#129592 it was observed that the comparator bug could prevent the garbage
collection of MVCC delete range tombstones (the single use of range keys within
CockroachDB). A cluster running 23.2 or earlier may write a MVCC delete range
tombstone with a timestamp encoding the synthetic bit. If the cluster
subsequently upgraded to 24.1 or later, the code path to clear range keys
stopped understanding synthetic bits and wrote range key unset tombstones
without the synthetic bit set. Due to the comparator bug, Pebble did not
consider these timestamp suffixes equal and the unset was ineffective.

We initially attempted to fix this issue by fixing the comparator, but
inadvertently introduced the possibility of replica divergence cockroachdb#130533 by
changing the semantics of LSM state below raft.

This commit works around this comparator bug by adapting ClearMVCCRangeKey to
write range key unsets using the verbatim suffix that was read from the storage
engine. To avoid reverting cockroachdb#101938 and re-introducing knowledge of the
synthetic bit, the MVCCRangeKey data structures are adapted to retain a copy of
the encoded timestamp suffix when reading range keys from storage engine
iterators. If later an attempt is made to clear the range key through
ClearMVCCRangeKey, this encoded timestamp suffix is used instead of re-encoding
the timestamp. Through avoiding the decoding/encoding roundtrip,
ClearMVCCRangeKey ensures that the suffixes it writes are identical to the
range keys that exist on disk, even if they encode a synthetic bit.

Release note (bug fix): Fixes a bug that could result in the inability to
garbage collect a MVCC range tombstone within a global table.
Epic: none
Informs cockroachdb#129592.
  • Loading branch information
jbowens committed Sep 12, 2024
1 parent 67e99eb commit d2f157d
Show file tree
Hide file tree
Showing 16 changed files with 280 additions and 74 deletions.
8 changes: 5 additions & 3 deletions pkg/ccl/crosscluster/replicationutils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,11 @@ func ScanSST(
intersectedSpan := scanWithin.Intersect(rangeIter.RangeBounds())
mergeRangeKV(storage.MVCCRangeKeyValue{
RangeKey: storage.MVCCRangeKey{
StartKey: intersectedSpan.Key.Clone(),
EndKey: intersectedSpan.EndKey.Clone(),
Timestamp: rangeKeyVersion.Timestamp},
StartKey: intersectedSpan.Key.Clone(),
EndKey: intersectedSpan.EndKey.Clone(),
Timestamp: rangeKeyVersion.Timestamp,
EncodedTimestampSuffix: storage.EncodeMVCCTimestampSuffix(rangeKeyVersion.Timestamp),
},
Value: rangeKeyVersion.Value,
})
}
Expand Down
1 change: 1 addition & 0 deletions pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1266,6 +1266,7 @@ func TestEvalAddSSTable(t *testing.T) {
require.NoError(t, err)
v.LocalTimestamp.WallTime *= 1e9
kv.RangeKey.Timestamp.WallTime *= 1e9
kv.RangeKey.EncodedTimestampSuffix = storage.EncodeMVCCTimestampSuffix(kv.RangeKey.Timestamp)
vBytes, err := storage.EncodeMVCCValue(v)
require.NoError(t, err)
expect = append(expect, storage.MVCCRangeKeyValue{RangeKey: kv.RangeKey, Value: vBytes})
Expand Down
8 changes: 5 additions & 3 deletions pkg/kv/kvserver/batcheval/cmd_delete_range_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,12 @@ func TestDeleteRangeTombstone(t *testing.T) {

writeInitialData(t, ctx, engine)

ts := hlc.Timestamp{WallTime: tc.ts}
rangeKey := storage.MVCCRangeKey{
StartKey: roachpb.Key(tc.start),
EndKey: roachpb.Key(tc.end),
Timestamp: hlc.Timestamp{WallTime: tc.ts},
StartKey: roachpb.Key(tc.start),
EndKey: roachpb.Key(tc.end),
Timestamp: ts,
EncodedTimestampSuffix: storage.EncodeMVCCTimestampSuffix(ts),
}

// Prepare the request and environment.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ func TestKnobsUseRangeTombstonesForPointDeletes(t *testing.T) {
switch kv := kvI.(type) {
case storage.MVCCRangeKeyValue:
kv.RangeKey.Timestamp = hlc.Timestamp{}
kv.RangeKey.EncodedTimestampSuffix = nil
rangeTombstones = append(rangeTombstones, kv.RangeKey)

case storage.MVCCKeyValue:
Expand Down
7 changes: 4 additions & 3 deletions pkg/kv/kvserver/gc/gc_random_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -782,9 +782,10 @@ func mergeRanges(fragments [][]storage.MVCCRangeKeyValue) []storage.MVCCRangeKey
// tombstone types.
newPartial = append(newPartial, storage.MVCCRangeKeyValue{
RangeKey: storage.MVCCRangeKey{
StartKey: partialRangeKeys[j].RangeKey.StartKey,
EndKey: stack[i].RangeKey.EndKey,
Timestamp: partialRangeKeys[j].RangeKey.Timestamp,
StartKey: partialRangeKeys[j].RangeKey.StartKey,
EndKey: stack[i].RangeKey.EndKey,
Timestamp: partialRangeKeys[j].RangeKey.Timestamp,
EncodedTimestampSuffix: partialRangeKeys[j].RangeKey.EncodedTimestampSuffix,
},
Value: partialRangeKeys[j].Value,
})
Expand Down
7 changes: 4 additions & 3 deletions pkg/kv/kvserver/gc/gc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1512,9 +1512,10 @@ func engineData(t *testing.T, r storage.Reader, desc roachpb.RangeDescriptor) []
i++
case 0:
newPartial = append(newPartial, storage.MVCCRangeKey{
StartKey: partialRangeKeys[j].StartKey,
EndKey: newKeys[i].EndKey.Clone(),
Timestamp: partialRangeKeys[j].Timestamp,
StartKey: partialRangeKeys[j].StartKey,
EndKey: newKeys[i].EndKey.Clone(),
Timestamp: partialRangeKeys[j].Timestamp,
EncodedTimestampSuffix: partialRangeKeys[j].EncodedTimestampSuffix,
})
i++
j++
Expand Down
57 changes: 43 additions & 14 deletions pkg/storage/mvcc_history_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,22 +91,33 @@ var (
// check_for_acquire_lock t=<name> k=<key> str=<strength> [maxLockConflicts=<int>] [targetLockConflictBytes=<int>]
// acquire_lock t=<name> k=<key> str=<strength> [maxLockConflicts=<int>] [targetLockConflictBytes=<int>]
//
// cput [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] k=<key> v=<string> [raw] [cond=<string>]
// del [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] k=<key>
// del_range [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] k=<key> end=<key> [max=<max>] [returnKeys]
// del_range_ts [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] k=<key> end=<key> [idempotent] [noCoveredStats]
// del_range_pred [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] k=<key> end=<key> [startTime=<int>,max=<int>,maxBytes=<int>,rangeThreshold=<int>]
// increment [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] k=<key> [inc=<val>]
// initput [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>] k=<key> v=<string> [raw] [failOnTombstones]
// put [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>] k=<key> v=<string> [raw]
// cput [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>]
// [targetLockConflictBytes=<int>] k=<key> v=<string> [raw] [cond=<string>]
// del [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>]
// [targetLockConflictBytes=<int>] k=<key>
// del_range [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>]
// [targetLockConflictBytes=<int>] k=<key> end=<key> [max=<max>] [returnKeys]
// del_range_ts [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] k=<key> end=<key> [idempotent]
// [noCoveredStats]
// del_range_pred [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] k=<key> end=<key>
// [startTime=<int>,max=<int>,maxBytes=<int>,rangeThreshold=<int>]
// increment [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>]
// [targetLockConflictBytes=<int>] k=<key> [inc=<val>]
// initput [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>] k=<key> v=<string> [raw] [failOnTombstones]
// put [t=<name>] [ts=<int>[,<int>]] [localTs=<int>[,<int>]] [resolve [status=<txnstatus>]] [ambiguousReplay] [maxLockConflicts=<int>] k=<key> v=<string> [raw]
// put_rangekey ts=<int>[,<int>] [localTs=<int>[,<int>]] k=<key> end=<key>
// put_blind_inline k=<key> v=<string> [prev=<string>]
// get [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [inconsistent] [skipLocked] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=<int>[,<int>]] [globalUncertaintyLimit=<int>[,<int>]] [maxKeys=<int>] [targetBytes=<int>] [allowEmpty]
// scan [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [end=<key>] [inconsistent] [skipLocked] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=<int>[,<int>]] [globalUncertaintyLimit=<int>[,<int>]] [max=<max>] [targetbytes=<target>] [wholeRows[=<int>]] [allowEmpty]
// export [k=<key>] [end=<key>] [ts=<int>[,<int>]] [kTs=<int>[,<int>]] [startTs=<int>[,<int>]] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>] [allRevisions] [targetSize=<int>] [maxSize=<int>] [stopMidKey] [fingerprint]
// get [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [inconsistent] [skipLocked] [tombstones] [failOnMoreRecent]
// [localUncertaintyLimit=<int>[,<int>]] [globalUncertaintyLimit=<int>[,<int>]] [maxKeys=<int>] [targetBytes=<int>] [allowEmpty]
// scan [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [end=<key>] [inconsistent] [skipLocked] [tombstones] [reverse] [failOnMoreRecent]
// [localUncertaintyLimit=<int>[,<int>]] [globalUncertaintyLimit=<int>[,<int>]] [max=<max>] [targetbytes=<target>] [wholeRows[=<int>]] [allowEmpty]
// export [k=<key>] [end=<key>] [ts=<int>[,<int>]] [kTs=<int>[,<int>]] [startTs=<int>[,<int>]] [maxLockConflicts=<int>] [targetLockConflictBytes=<int>]
// [allRevisions] [targetSize=<int>] [maxSize=<int>] [stopMidKey] [fingerprint]
//
// iter_new [k=<key>] [end=<key>] [prefix] [kind=key|keyAndIntents] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [maskBelow=<int>[,<int>]] [minTimestamp=<int>[,<int>]] [maxTimestamp=<int>[,<int>]]
// iter_new_incremental [k=<key>] [end=<key>] [startTs=<int>[,<int>]] [endTs=<int>[,<int>]] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [maskBelow=<int>[,<int>]] [intents=error|aggregate|emit]
// iter_new [k=<key>] [end=<key>] [prefix] [kind=key|keyAndIntents] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [maskBelow=<int>[,<int>]]
// [minTimestamp=<int>[,<int>]] [maxTimestamp=<int>[,<int>]]
// iter_new_incremental [k=<key>] [end=<key>] [startTs=<int>[,<int>]] [endTs=<int>[,<int>]] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly]
// [maskBelow=<int>[,<int>]] [intents=error|aggregate|emit]
// iter_seek_ge k=<key> [ts=<int>[,<int>]]
// iter_seek_lt k=<key> [ts=<int>[,<int>]]
// iter_next
Expand All @@ -128,7 +139,7 @@ var (
// replace_point_tombstones_with_range_tombstones k=<key> [end=<key>]
//
// sst_put [ts=<int>[,<int>]] [localTs=<int>[,<int>]] k=<key> [v=<string>]
// sst_put_rangekey ts=<int>[,<int>] [localTS=<int>[,<int>]] k=<key> end=<key>
// sst_put_rangekey ts=<int>[,<int>] [localTS=<int>[,<int>]] k=<key> end=<key> [syntheticBit]
// sst_clear_range k=<key> end=<key>
// sst_clear_rangekey k=<key> end=<key> ts=<int>[,<int>]
// sst_finish
Expand Down Expand Up @@ -1917,6 +1928,24 @@ func cmdPutRangeKey(e *evalCtx) error {
var value storage.MVCCValue
value.MVCCValueHeader.LocalTimestamp = hlc.ClockTimestamp(e.getTsWithName("localTs"))

// If the syntheticBit arg is present, manually construct a MVCC timestamp
// that includes the synthetic bit. Cockroach stopped writing these keys
// beginning in version 24.1. It's not possible to commit such a key through
// the PutMVCCRangeKey API, so we also need to manually encode the MVCC
// value and use PutEngineRangeKey. We keep the non-synthetic-bit case
// as-is, using PutMVCCRangeKey, since that's the codepath ordinary MVCC
// range key writes will use and we want to exercise it. See #129592.
if e.hasArg("syntheticBit") {
return e.withWriter("put_rangekey", func(rw storage.ReadWriter) error {
suffix := storage.EncodeMVCCTimestampSuffixWithSyntheticBitForTesting(rangeKey.Timestamp)
valueRaw, err := storage.EncodeMVCCValue(value)
if err != nil {
return errors.Wrapf(err, "failed to encode MVCC value for range key %s", rangeKey)
}
return rw.PutEngineRangeKey(rangeKey.StartKey, rangeKey.EndKey, suffix, valueRaw)
})
}

return e.withWriter("put_rangekey", func(rw storage.ReadWriter) error {
return rw.PutMVCCRangeKey(rangeKey, value)
})
Expand Down
74 changes: 69 additions & 5 deletions pkg/storage/mvcc_key.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"bytes"
"encoding/binary"
"fmt"
"slices"
"sort"
"strings"

Expand Down Expand Up @@ -385,15 +386,30 @@ type MVCCRangeKey struct {
StartKey roachpb.Key
EndKey roachpb.Key
Timestamp hlc.Timestamp
// EncodedTimestampSuffix is an optional encoded representation of Timestamp
// as a Pebble "suffix". When reading range keys from the engine, the
// iterator copies the verbatim encoded timestamp here. There historically
// have been multiple representations of a timestamp that were intended to
// be logically equivalent. A bug in CockroachDB's pebble.Comparer
// implementation prevented some encodings from being considered equivalent.
// See #129592.
//
// To work around this wart within the comparer, we preserve a copy of the
// physical encoded timestamp we read off the engine. If a MVCCRangeKey with
// a non-empty EncodedTimestampSuffix is cleared via ClearMVCCRangeKey, the
// RangeKeyUnset tombstone is written with the verbatim
// EncodedTimestampSuffix.
EncodedTimestampSuffix []byte
}

// AsStack returns the range key as a range key stack with the given value.
func (k MVCCRangeKey) AsStack(valueRaw []byte) MVCCRangeKeyStack {
return MVCCRangeKeyStack{
Bounds: k.Bounds(),
Versions: MVCCRangeKeyVersions{{
Timestamp: k.Timestamp,
Value: valueRaw,
Timestamp: k.Timestamp,
Value: valueRaw,
EncodedTimestampSuffix: k.EncodedTimestampSuffix,
}},
}
}
Expand All @@ -408,6 +424,7 @@ func (k MVCCRangeKey) Clone() MVCCRangeKey {
// k is already a copy, but byte slices must be cloned.
k.StartKey = k.StartKey.Clone()
k.EndKey = k.EndKey.Clone()
k.EncodedTimestampSuffix = slices.Clone(k.EncodedTimestampSuffix)
return k
}

Expand Down Expand Up @@ -507,22 +524,38 @@ type MVCCRangeKeyVersions []MVCCRangeKeyVersion
type MVCCRangeKeyVersion struct {
Timestamp hlc.Timestamp
Value []byte
// EncodedTimestampSuffix is an optional encoded representation of Timestamp
// as a Pebble "suffix". When reading range keys from the engine, the
// iterator copies the verbatim encoded timestamp here. There historically
// have been multiple representations of a timestamp that were intended to
// be logically equivalent. A bug in CockroachDB's pebble.Comparer
// implementation prevented some encodings from being considered equivalent.
// See #129592.
//
// To work around this wart within the comparer, we preserve a copy of the
// physical encoded timestamp we read off the engine. If a MVCCRangeKey with
// a non-empty EncodedTimestampSuffix is cleared via ClearMVCCRangeKey, the
// RangeKeyUnset tombstone is written with the verbatim
// EncodedTimestampSuffix.
EncodedTimestampSuffix []byte
}

// CloneInto copies the version into the provided destination
// MVCCRangeKeyVersion, reusing and overwriting its value slice.
func (v MVCCRangeKeyVersion) CloneInto(dst *MVCCRangeKeyVersion) {
dst.Timestamp = v.Timestamp
dst.Value = append(dst.Value[:0], v.Value...)
dst.EncodedTimestampSuffix = append(dst.EncodedTimestampSuffix[:0], v.EncodedTimestampSuffix...)
}

// AsRangeKey returns an MVCCRangeKey for the given version. Byte slices
// are shared with the stack.
func (s MVCCRangeKeyStack) AsRangeKey(v MVCCRangeKeyVersion) MVCCRangeKey {
return MVCCRangeKey{
StartKey: s.Bounds.Key,
EndKey: s.Bounds.EndKey,
Timestamp: v.Timestamp,
StartKey: s.Bounds.Key,
EndKey: s.Bounds.EndKey,
Timestamp: v.Timestamp,
EncodedTimestampSuffix: v.EncodedTimestampSuffix,
}
}

Expand Down Expand Up @@ -707,6 +740,7 @@ func (v MVCCRangeKeyVersions) CloneInto(c *MVCCRangeKeyVersions) {
for i := range v {
(*c)[i].Timestamp = v[i].Timestamp
(*c)[i].Value = append((*c)[i].Value[:0], v[i].Value...)
(*c)[i].EncodedTimestampSuffix = append((*c)[i].EncodedTimestampSuffix[:0], v[i].EncodedTimestampSuffix...)
}
}

Expand Down Expand Up @@ -898,6 +932,9 @@ func (v MVCCRangeKeyVersion) Clone() MVCCRangeKeyVersion {
if v.Value != nil {
v.Value = append([]byte(nil), v.Value...)
}
if v.EncodedTimestampSuffix != nil {
v.EncodedTimestampSuffix = append([]byte(nil), v.EncodedTimestampSuffix...)
}
return v
}

Expand All @@ -910,3 +947,30 @@ func (v MVCCRangeKeyVersion) Equal(o MVCCRangeKeyVersion) bool {
func (v MVCCRangeKeyVersion) String() string {
return fmt.Sprintf("%s=%x", v.Timestamp, v.Value)
}

// EncodeMVCCTimestampSuffixWithSyntheticBitForTesting is a utility to encode
// the provided timestamp as a MVCC timestamp key suffix with the synthetic bit
// set. The synthetic bit is no longer encoded/decoded into the hlc.Timestamp
// but may exist in existing databases. This utility allows a test to construct
// a timestamp with the synthetic bit for testing appropriate handling of
// existing keys with the bit set. It should only be used in tests. See #129592.
//
// TODO(jackson): Remove this function when we've migrated all keys to unset the
// synthetic bit.
func EncodeMVCCTimestampSuffixWithSyntheticBitForTesting(ts hlc.Timestamp) []byte {
const mvccEncodedTimestampWithSyntheticBitLen = mvccEncodedTimeWallLen +
mvccEncodedTimeLogicalLen +
mvccEncodedTimeSyntheticLen +
mvccEncodedTimeLengthLen
suffix := make([]byte, mvccEncodedTimestampWithSyntheticBitLen)
encodeMVCCTimestampToBuf(suffix, ts)
suffix[len(suffix)-2] = 0x01 // Synthetic bit.
suffix[len(suffix)-1] = mvccEncodedTimestampWithSyntheticBitLen
if decodedTS, err := DecodeMVCCTimestampSuffix(suffix); err != nil {
panic(err)
} else if !ts.Equal(decodedTS) {
panic(errors.AssertionFailedf("manufactured MVCC timestamp with synthetic bit decoded to %s not %s",
ts, decodedTS))
}
return suffix
}
Loading

0 comments on commit d2f157d

Please sign in to comment.