diff --git a/pkg/kv/kvserver/rangefeed/task_test.go b/pkg/kv/kvserver/rangefeed/task_test.go index 87b695a7a228..edf833d47904 100644 --- a/pkg/kv/kvserver/rangefeed/task_test.go +++ b/pkg/kv/kvserver/rangefeed/task_test.go @@ -190,6 +190,21 @@ func (s *testIterator) curKV() storage.MVCCKeyValue { return s.kvs[s.cur] } +// HasPointAndRange implements SimpleMVCCIterator. +func (s *testIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (s *testIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeTombstones implements SimpleMVCCIterator. +func (s *testIterator) RangeKeys() []storage.MVCCRangeKeyValue { + panic("not implemented") +} + func TestInitResolvedTSScan(t *testing.T) { defer leaktest.AfterTest(t)() startKey := roachpb.RKey("d") diff --git a/pkg/kv/kvserver/spanset/batch.go b/pkg/kv/kvserver/spanset/batch.go index ae012cb87a87..72928d007369 100644 --- a/pkg/kv/kvserver/spanset/batch.go +++ b/pkg/kv/kvserver/spanset/batch.go @@ -176,6 +176,21 @@ func (i *MVCCIterator) UnsafeValue() []byte { return i.i.UnsafeValue() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *MVCCIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *MVCCIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *MVCCIterator) RangeKeys() []storage.MVCCRangeKeyValue { + panic("not implemented") +} + // ComputeStats is part of the storage.MVCCIterator interface. func (i *MVCCIterator) ComputeStats( start, end roachpb.Key, nowNanos int64, @@ -599,6 +614,22 @@ func (s spanSetWriter) ClearIterRange(iter storage.MVCCIterator, start, end roac return s.w.ClearIterRange(iter, start, end) } +func (s spanSetWriter) ExperimentalPutMVCCRangeKey( + rangeKey storage.MVCCRangeKey, value []byte, +) error { + if err := s.checkAllowedRange(rangeKey.StartKey, rangeKey.EndKey); err != nil { + return err + } + return s.w.ExperimentalPutMVCCRangeKey(rangeKey, value) +} + +func (s spanSetWriter) ExperimentalClearMVCCRangeKey(rangeKey storage.MVCCRangeKey) error { + if err := s.checkAllowedRange(rangeKey.StartKey, rangeKey.EndKey); err != nil { + return err + } + return s.w.ExperimentalClearMVCCRangeKey(rangeKey) +} + func (s spanSetWriter) Merge(key storage.MVCCKey, value []byte) error { if s.spansOnly { if err := s.spans.CheckAllowed(SpanReadWrite, roachpb.Span{Key: key.Key}); err != nil { diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel index ea1d295af0fc..0d1314e2a96f 100644 --- a/pkg/storage/BUILD.bazel +++ b/pkg/storage/BUILD.bazel @@ -21,6 +21,7 @@ go_library( "mvcc_incremental_iterator.go", "mvcc_key.go", "mvcc_logical_ops.go", + "mvcc_range_tombstone_iterator.go", "open.go", "pebble.go", "pebble_batch.go", @@ -108,6 +109,7 @@ go_test( "mvcc_incremental_iterator_test.go", "mvcc_key_test.go", "mvcc_logical_ops_test.go", + "mvcc_range_tombstone_iterator_test.go", "mvcc_stats_test.go", "mvcc_test.go", "pebble_file_registry_test.go", diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 5198a553ab36..bea62e2fb410 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -72,6 +72,15 @@ type SimpleMVCCIterator interface { // UnsafeValue returns the same value as Value, but the memory is // invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. UnsafeValue() []byte + // HasPointAndRange returns whether the current iterator position has a point + // key and/or a range key. Range keys are only emitted when requested via + // IterOptions.KeyTypes. + HasPointAndRange() (bool, bool) + // RangeBounds returns the range bounds for the current range key, if any. + RangeBounds() (roachpb.Key, roachpb.Key) + // RangeKeys returns the range key fragments at the current iterator position, + // if any. + RangeKeys() []MVCCRangeKeyValue } // IteratorStats is returned from {MVCCIterator,EngineIterator}.Stats. @@ -309,8 +318,26 @@ type IterOptions struct { // use such an iterator is to use it in concert with an iterator without // timestamp hints, as done by MVCCIncrementalIterator. MinTimestampHint, MaxTimestampHint hlc.Timestamp + // KeyTypes specifies the types of keys to surface: point and/or range keys. + // Use HasPointAndRange() to determine which key type is present at a given + // iterator position, and RangeBounds() and RangeKeys() to access range keys. + KeyTypes IterKeyType } +// IterKeyType configures which types of keys an iterator should surface. +// +// TODO(erikgrinaker): Combine this with MVCCIterKind somehow. +type IterKeyType = pebble.IterKeyType + +const ( + // IterKeyTypePointsOnly iterates over point keys only. + IterKeyTypePointsOnly = pebble.IterKeyTypePointsOnly + // IterKeyTypePointsAndRanges iterates over both point and range keys. + IterKeyTypePointsAndRanges = pebble.IterKeyTypePointsAndRanges + // IterKeyTypeRangesOnly iterates over only range keys. + IterKeyTypeRangesOnly = pebble.IterKeyTypeRangesOnly +) + // MVCCIterKind is used to inform Reader about the kind of iteration desired // by the caller. type MVCCIterKind int @@ -584,6 +611,38 @@ type Writer interface { // returns. ClearIterRange(iter MVCCIterator, start, end roachpb.Key) error + // ExperimentalClearMVCCRangeKey deletes an MVCC range key from start + // (inclusive) to end (exclusive) at the given timestamp. For any range key + // that straddles the start and end boundaries, only the segments within the + // boundaries will be cleared. Clears are idempotent. + // + // This method is primarily intented for MVCC garbage collection and similar + // internal use. It mutates MVCC history, and does not check for intents or + // other conflicts. + // + // This method is EXPERIMENTAL. Range keys are not supported throughout the + // MVCC API, and the on-disk format is unstable. + ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error + + // ExperimentalPutMVCCRangeKey writes a value to an MVCC range key. It is + // currently only used for range tombstones, which have a value of nil. + // Range keys exist separately from point keys in Pebble, and must be + // accessed via specialized iterator options and methods. See e.g. + // IterOptions.KeyTypes and SimpleMVCCIterator.RangeKeys(). + // + // A range key does not have a distinct identity, but should be considered a + // key continuum. They will be fragmented by Pebble such that all overlapping + // range keys between two fragment bounds form a "stack" of range key + // fragments. This fragmentation is non-deterministic, and will also depend on + // the internal SST structure (and thus on Pebble compactions) and the store's + // write history. They will also split and merge along with CRDB ranges, can + // be partially removed via ExperimentalClearMVCCRangeKey, and may be + // truncated by bounded scans or iterators. + // + // This function is EXPERIMENTAL. Range keys are not handled throughout the + // MVCC API, and the on-disk format is unstable. + ExperimentalPutMVCCRangeKey(MVCCRangeKey, []byte) error + // Merge is a high-performance write operation used for values which are // accumulated over several writes. Multiple values can be merged // sequentially into a single key; a subsequent read will return a "merged" diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index bf31221fbe93..834280186722 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -715,6 +715,21 @@ func (i *intentInterleavingIter) Value() []byte { return i.iter.Value() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *intentInterleavingIter) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *intentInterleavingIter) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *intentInterleavingIter) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + func (i *intentInterleavingIter) Close() { i.iter.Close() i.intentIter.Close() diff --git a/pkg/storage/multi_iterator.go b/pkg/storage/multi_iterator.go index 9838adf60ec7..6d2558e10c21 100644 --- a/pkg/storage/multi_iterator.go +++ b/pkg/storage/multi_iterator.go @@ -14,6 +14,7 @@ import ( "bytes" "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/roachpb" ) const invalidIdxSentinel = -1 @@ -92,6 +93,21 @@ func (f *multiIterator) UnsafeValue() []byte { return f.iters[f.currentIdx].UnsafeValue() } +// HasPointAndRange implements SimpleMVCCIterator. +func (f *multiIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (f *multiIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (f *multiIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + // Next advances the iterator to the next key/value in the iteration. After this // call, Valid() will be true if the iterator was not positioned at the last // key. diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 64b0d1abaef2..7a46008ac6ea 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -99,6 +99,12 @@ type MVCCKeyValue struct { Value []byte } +// MVCCRangeKeyValue represents a ranged key/value pair. +type MVCCRangeKeyValue struct { + Key MVCCRangeKey + Value []byte +} + // optionalValue represents an optional roachpb.Value. It is preferred // over a *roachpb.Value to avoid the forced heap allocation. type optionalValue struct { @@ -2202,6 +2208,33 @@ func MVCCDeleteRange( return keys, res.ResumeSpan, res.NumKeys, nil } +// ExperimentalMVCCDeleteRangeUsingTombstone deletes the given MVCC keyspan at +// the given timestamp using a range tombstone (rather than point tombstones). +// This operation is non-transactional, but will check for existing intents and +// return a WriteIntentError containing up to maxIntents intents. +// +// This function is EXPERIMENTAL. Range tombstones are not supported throughout +// the MVCC API, and the on-disk format is unstable. +// +// TODO(erikgrinaker): Needs conflict handling, e.g. WriteTooOldError. +// TODO(erikgrinaker): Needs MVCCStats handling. +func ExperimentalMVCCDeleteRangeUsingTombstone( + ctx context.Context, + rw ReadWriter, + ms *enginepb.MVCCStats, + startKey, endKey roachpb.Key, + timestamp hlc.Timestamp, + maxIntents int64, +) error { + if intents, err := ScanIntents(ctx, rw, startKey, endKey, maxIntents, 0); err != nil { + return err + } else if len(intents) > 0 { + return &roachpb.WriteIntentError{Intents: intents} + } + return rw.ExperimentalPutMVCCRangeKey(MVCCRangeKey{ + StartKey: startKey, EndKey: endKey, Timestamp: timestamp}, nil) +} + func recordIteratorStats(traceSpan *tracing.Span, iteratorStats IteratorStats) { stats := iteratorStats.Stats if traceSpan != nil { @@ -3929,3 +3962,30 @@ func ComputeStatsForRange( ms.LastUpdateNanos = nowNanos return ms, nil } + +// MVCCScanRangeTombstones returns a list of range tombstones across the given +// span at the given timestamp, in end,timestamp order rather that +// start,timestamp. Any tombstones that straddle the bounds will be truncated. +func MVCCScanRangeTombstones( + ctx context.Context, reader Reader, start, end roachpb.Key, ts hlc.Timestamp, +) ([]MVCCRangeKey, error) { + var tombstones []MVCCRangeKey + iter := NewMVCCRangeTombstoneIterator(reader, MVCCRangeTombstoneIterOptions{ + LowerBound: start, + UpperBound: end, + MaxTimestamp: ts, + }) + for { + if ok, err := iter.Valid(); err != nil { + return nil, err + } else if !ok { + break + } + if err := ctx.Err(); err != nil { + return nil, err + } + tombstones = append(tombstones, iter.Key()) + iter.Next() + } + return tombstones, nil +} diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index f8fa5f827f6c..e4d6e15827d5 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -37,6 +37,7 @@ import ( "github.com/cockroachdb/datadriven" "github.com/cockroachdb/errors" "github.com/cockroachdb/redact" + "github.com/stretchr/testify/require" ) // TestMVCCHistories verifies that sequences of MVCC reads and writes @@ -55,17 +56,20 @@ import ( // resolve_intent t= k= [status=] // check_intent k= [none] // -// cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] -// del [t=] [ts=[,]] [resolve [status=]] k= -// del_range [t=] [ts=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] -// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=[,]] -// increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] -// put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] -// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [max=] [targetbytes=] [avoidExcess] [allowEmpty] +// cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] +// del [t=] [ts=[,]] [resolve [status=]] k= +// del_range [t=] [ts=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] +// del_range_ts [ts=[,]] k= end= +// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=[,]] +// increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] +// put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] +// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [max=] [targetbytes=] [avoidExcess] [allowEmpty] +// scan_range_ts k= end= [ts=[,]] // // merge [ts=[,]] k= v= [raw] // // clear_range k= end= +// clear_range_ts k= end= [ts=[,]] // // Where `` can be a simple string, or a string // prefixed by the following characters: @@ -112,8 +116,15 @@ func TestMVCCHistories(t *testing.T) { defer engine.Close() reportDataEntries := func(buf *redact.StringBuilder) error { - hasData := false - err := engine.MVCCIterate(span.Key, span.EndKey, MVCCKeyAndIntentsIterKind, func(r MVCCKeyValue) error { + rangeTombstones, err := MVCCScanRangeTombstones( + ctx, engine, span.Key, span.EndKey, hlc.Timestamp{}) + require.NoError(t, err) + for _, rt := range rangeTombstones { + buf.Printf("range tombstone: %s\n", rt) + } + hasData := len(rangeTombstones) > 0 + + err = engine.MVCCIterate(span.Key, span.EndKey, MVCCKeyAndIntentsIterKind, func(r MVCCKeyValue) error { hasData = true if r.Key.Timestamp.IsEmpty() { // Meta is at timestamp zero. @@ -396,15 +407,18 @@ var commands = map[string]cmd{ // TODO(nvanbenschoten): test "resolve_intent_range". "check_intent": {typReadOnly, cmdCheckIntent}, - "clear_range": {typDataUpdate, cmdClearRange}, - "cput": {typDataUpdate, cmdCPut}, - "del": {typDataUpdate, cmdDelete}, - "del_range": {typDataUpdate, cmdDeleteRange}, - "get": {typReadOnly, cmdGet}, - "increment": {typDataUpdate, cmdIncrement}, - "merge": {typDataUpdate, cmdMerge}, - "put": {typDataUpdate, cmdPut}, - "scan": {typReadOnly, cmdScan}, + "clear_range": {typDataUpdate, cmdClearRange}, + "clear_range_ts": {typDataUpdate, cmdClearRangeTombstone}, + "cput": {typDataUpdate, cmdCPut}, + "del": {typDataUpdate, cmdDelete}, + "del_range": {typDataUpdate, cmdDeleteRange}, + "del_range_ts": {typDataUpdate, cmdDeleteRangeTombstone}, + "get": {typReadOnly, cmdGet}, + "increment": {typDataUpdate, cmdIncrement}, + "merge": {typDataUpdate, cmdMerge}, + "put": {typDataUpdate, cmdPut}, + "scan": {typReadOnly, cmdScan}, + "scan_range_ts": {typReadOnly, cmdScanRangeTombstone}, } func cmdTxnAdvance(e *evalCtx) error { @@ -584,6 +598,16 @@ func cmdClearRange(e *evalCtx) error { return e.engine.ClearMVCCRangeAndIntents(key, endKey) } +func cmdClearRangeTombstone(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + return e.engine.ExperimentalClearMVCCRangeKey(MVCCRangeKey{ + StartKey: key, + EndKey: endKey, + Timestamp: ts, + }) +} + func cmdCPut(e *evalCtx) error { txn := e.getTxn(optional) ts := e.getTs(txn) @@ -660,6 +684,24 @@ func cmdDeleteRange(e *evalCtx) error { }) } +func cmdDeleteRangeTombstone(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + + return e.withWriter("del_range", func(rw ReadWriter) error { + err := ExperimentalMVCCDeleteRangeUsingTombstone(e.ctx, rw, nil, key, endKey, ts, 0) + if err != nil { + return err + } + e.results.buf.Printf("del_range_ts: %s\n", MVCCRangeKey{ + StartKey: key, + EndKey: endKey, + Timestamp: ts, + }) + return nil + }) +} + func cmdGet(e *evalCtx) error { txn := e.getTxn(optional) key := e.getKey() @@ -824,6 +866,23 @@ func cmdScan(e *evalCtx) error { return err } +func cmdScanRangeTombstone(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + + tombstones, err := MVCCScanRangeTombstones(e.ctx, e.engine, key, endKey, ts) + if err != nil { + return err + } + for _, tombstone := range tombstones { + e.results.buf.Printf("scan_range_ts: %s\n", tombstone) + } + if len(tombstones) == 0 { + e.results.buf.Printf("scan_range_ts: %v-%v -> \n", key, endKey) + } + return nil +} + // evalCtx stored the current state of the environment of a running // script. type evalCtx struct { diff --git a/pkg/storage/mvcc_incremental_iterator.go b/pkg/storage/mvcc_incremental_iterator.go index 0beb1b464ed5..a199c7329cd9 100644 --- a/pkg/storage/mvcc_incremental_iterator.go +++ b/pkg/storage/mvcc_incremental_iterator.go @@ -485,6 +485,21 @@ func (i *MVCCIncrementalIterator) UnsafeKey() MVCCKey { return i.iter.UnsafeKey() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + // UnsafeValue returns the same value as Value, but the memory is invalidated on // the next call to {Next,Reset,Close}. func (i *MVCCIncrementalIterator) UnsafeValue() []byte { diff --git a/pkg/storage/mvcc_key.go b/pkg/storage/mvcc_key.go index 02cb4016ba92..988370d2c907 100644 --- a/pkg/storage/mvcc_key.go +++ b/pkg/storage/mvcc_key.go @@ -79,6 +79,11 @@ func (k MVCCKey) Equal(l MVCCKey) bool { return k.Key.Compare(l.Key) == 0 && k.Timestamp.EqOrdering(l.Timestamp) } +// IsEmpty returns true iff there is a non-zero key or timestamp. +func (k MVCCKey) IsEmpty() bool { + return k.Key == nil && k.Timestamp.IsEmpty() +} + // IsValue returns true iff the timestamp is non-zero. func (k MVCCKey) IsValue() bool { return !k.Timestamp.IsEmpty() @@ -287,3 +292,66 @@ func decodeMVCCTimestampSuffix(encodedTS []byte) (hlc.Timestamp, error) { } return decodeMVCCTimestamp(encodedTS[:encodedLen-1]) } + +// MVCCRangeKey is an MVCC key span at a timestamp. +type MVCCRangeKey struct { + StartKey roachpb.Key + EndKey roachpb.Key + Timestamp hlc.Timestamp +} + +// Clone returns a copy of the range key. +func (k MVCCRangeKey) Clone() MVCCRangeKey { + // k is already a copy, but needs key clones. + k.StartKey = k.StartKey.Clone() + k.EndKey = k.EndKey.Clone() + return k +} + +// Compare returns -1 if this key is less than the given key, 0 if they're +// equal, or 1 if the given key is greater than this. Comparison is by +// start,timestamp,end, where larger timestamps sort before smaller ones except +// empty ones which sort first (like elsewhere in MVCC). +func (k MVCCRangeKey) Compare(o MVCCRangeKey) int { + if c := k.StartKey.Compare(o.StartKey); c != 0 { + return c + } + if k.Timestamp.IsEmpty() && !o.Timestamp.IsEmpty() { + return -1 + } else if !k.Timestamp.IsEmpty() && o.Timestamp.IsEmpty() { + return 1 + } else if c := k.Timestamp.Compare(o.Timestamp); c != 0 { + return -c // MVCC sorts timestamps in reverse + } + return k.EndKey.Compare(o.EndKey) +} + +// String formats the range key. +func (k MVCCRangeKey) String() string { + s := roachpb.Span{Key: k.StartKey, EndKey: k.EndKey}.String() + if !k.Timestamp.IsEmpty() { + s += fmt.Sprintf("/%s", k.Timestamp) + } + return s +} + +// Validate returns an error if the range key is invalid. +func (k MVCCRangeKey) Validate() (err error) { + return errors.Wrapf(k.validate(), "invalid range key %s", k) +} + +func (k MVCCRangeKey) validate() error { + if k.StartKey == nil { + return errors.Errorf("no start key") + } + if k.EndKey == nil { + return errors.Errorf("no end key") + } + if k.Timestamp.IsEmpty() { + return errors.Errorf("no timestamp") + } + if k.StartKey.Compare(k.EndKey) > 0 { + return errors.Errorf("start key %s is after end key %s", k.StartKey, k.EndKey) + } + return nil +} diff --git a/pkg/storage/mvcc_key_test.go b/pkg/storage/mvcc_key_test.go index 63af9beda842..63ecfb7c5e14 100644 --- a/pkg/storage/mvcc_key_test.go +++ b/pkg/storage/mvcc_key_test.go @@ -238,3 +238,94 @@ func BenchmarkDecodeMVCCKey(b *testing.B) { } benchmarkDecodeMVCCKeyResult = mvccKey // avoid compiler optimizing away function call } + +func TestMVCCRangeKeyString(t *testing.T) { + defer leaktest.AfterTest(t)() + + testcases := map[string]struct { + rk MVCCRangeKey + expect string + }{ + "empty": {MVCCRangeKey{}, "/Min"}, + "only start": {MVCCRangeKey{StartKey: roachpb.Key("foo")}, "foo"}, + "only end": {MVCCRangeKey{EndKey: roachpb.Key("foo")}, "{/Min-foo}"}, + "only timestamp": {MVCCRangeKey{Timestamp: hlc.Timestamp{Logical: 1}}, "/Min/0,1"}, + "only span": {MVCCRangeKey{StartKey: roachpb.Key("a"), EndKey: roachpb.Key("z")}, "{a-z}"}, + "all": {MVCCRangeKey{StartKey: roachpb.Key("a"), EndKey: roachpb.Key("z"), Timestamp: hlc.Timestamp{Logical: 1}}, "{a-z}/0,1"}, + "all overlapping": {MVCCRangeKey{StartKey: roachpb.Key("ab"), EndKey: roachpb.Key("af"), Timestamp: hlc.Timestamp{Logical: 1}}, "a{b-f}/0,1"}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.rk.String()) + }) + } +} + +func TestMVCCRangeKeyCompare(t *testing.T) { + defer leaktest.AfterTest(t)() + + ab1 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("b"), hlc.Timestamp{Logical: 1}} + ac1 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("c"), hlc.Timestamp{Logical: 1}} + ac2 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("c"), hlc.Timestamp{Logical: 2}} + bc0 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 0}} + bc1 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 1}} + bc3 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 3}} + bd4 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("d"), hlc.Timestamp{Logical: 4}} + + testcases := map[string]struct { + a MVCCRangeKey + b MVCCRangeKey + expect int + }{ + "equal": {ac1, ac1, 0}, + "start lt": {ac1, bc1, -1}, + "start gt": {bc1, ac1, 1}, + "end lt": {ab1, ac1, -1}, + "end gt": {ac1, ab1, 1}, + "time lt": {ac2, ac1, -1}, // MVCC timestamps sort in reverse order + "time gt": {ac1, ac2, 1}, // MVCC timestamps sort in reverse order + "empty time lt set": {bc0, bc1, -1}, // empty MVCC timestamps sort before non-empty + "set time gt empty": {bc1, bc0, 1}, // empty MVCC timestamps sort before non-empty + "start time precedence": {ac2, bc3, -1}, + "time end precedence": {bd4, bc3, -1}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.a.Compare(tc.b)) + }) + } +} + +func TestMVCCRangeKeyValidate(t *testing.T) { + defer leaktest.AfterTest(t)() + + a := roachpb.Key("a") + b := roachpb.Key("b") + blank := roachpb.Key("") + ts1 := hlc.Timestamp{Logical: 1} + + testcases := map[string]struct { + rangeKey MVCCRangeKey + expectErr string // empty if no error + }{ + "valid": {MVCCRangeKey{StartKey: a, EndKey: b, Timestamp: ts1}, ""}, + "start at end": {MVCCRangeKey{StartKey: a, EndKey: b, Timestamp: ts1}, ""}, + "blank keys": {MVCCRangeKey{StartKey: blank, EndKey: blank, Timestamp: ts1}, ""}, // equivalent to MinKey + "no start": {MVCCRangeKey{EndKey: b, Timestamp: ts1}, "{/Min-b}/0,1: no start key"}, + "no end": {MVCCRangeKey{StartKey: a, Timestamp: ts1}, "a/0,1: no end key"}, + "no timestamp": {MVCCRangeKey{StartKey: a, EndKey: b}, "{a-b}: no timestamp"}, + "empty": {MVCCRangeKey{}, "/Min: no start key"}, + "end before start": {MVCCRangeKey{StartKey: b, EndKey: a, Timestamp: ts1}, `{b-a}/0,1: start key "b" is after end key "a"`}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + err := tc.rangeKey.Validate() + if tc.expectErr == "" { + require.NoError(t, err) + } else { + require.Error(t, err) + require.Contains(t, err.Error(), tc.expectErr) + } + }) + } +} diff --git a/pkg/storage/mvcc_range_tombstone_iterator.go b/pkg/storage/mvcc_range_tombstone_iterator.go new file mode 100644 index 000000000000..91a45502f69b --- /dev/null +++ b/pkg/storage/mvcc_range_tombstone_iterator.go @@ -0,0 +1,291 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage + +import ( + "bytes" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/errors" +) + +// MVCCRangeTombstoneIterOptions are options for an MVCCRangeTombstoneIterator. +type MVCCRangeTombstoneIterOptions struct { + // LowerBound sets the inclusive lower bound of the iterator. Tombstones that + // straddle the it will have their start key truncated to the lower bound. + // + // NB: It may be tempting to use an MVCCKey here and include a timestamp, but + // this would be useless: giving e.g. a@4 would skip a tombstone starting at + // a@5, but the tombstone would logically exist at the adjacent a.Next()@5 so + // it would be emitted almost immediately anyway. + LowerBound roachpb.Key + // UpperBound sets the exclusive upper bound of the iterator. Tombstones that + // straddle it will have their end key truncated to the upper bound. + UpperBound roachpb.Key + // MinTimestamp sets the inclusive lower timestamp bound for the iterator. + MinTimestamp hlc.Timestamp + // MaxTimestamp sets the inclusive upper timestamp bound for the iterator. + MaxTimestamp hlc.Timestamp + // Fragmented will emit tombstone fragments as they are stored in Pebble. + // Fragments typically begin and end where a tombstone bound overlaps with + // another tombstone, for all overlapping tombstones. However, fragmentation + // is non-deterministic as it also depends on Pebble's internal SST structure + // and mutation history. + // + // When enabled, this results in an iteration order of StartKey,Timestamp as + // opposed to the normal EndKey,Timestamp order for range tombstones. This may + // be useful for partial results and resumption, e.g. resume spans. + Fragmented bool +} + +// MVCCRangeTombstoneIterator iterates over range tombstones in an engine and +// defragments them into contiguous range tombstones. It does not support +// seeking or backtracking, see RangeTombstoneIterOptions for lower/upper bounds +// and other options. +// +// Iteration uses EndKey,Timestamp order rather than StartKey,Timestamp. For +// example, [a-z)@3 will be emitted after [c-e)@2, but before [x-z)@1. This is a +// memory optimization when defragmenting Pebble range keys, to allow emitting +// tombstones as soon as possible. Otherwise, a single tombstone across the the +// entire key span would require all other tombstones at other timestamps to be +// buffered in memory before they could be emitted. However, see the Fragmented +// option which emits non-deterministic fragments in StartKey,Timestamp order. +type MVCCRangeTombstoneIterator struct { + iter MVCCIterator + opts MVCCRangeTombstoneIterOptions + incomplete []*MVCCRangeKey // defragmentation buffer + complete []MVCCRangeKey // queued for emission + completeIdx int // current Key() + iterDone bool // TODO(erikgrinaker): remove this + err error +} + +// NewMVCCRangeTombstoneIterator sets up a new MVCRangeTombstoneIterator and +// seeks to the first range tombstone. The caller must call Close() when done. +func NewMVCCRangeTombstoneIterator( + r Reader, opts MVCCRangeTombstoneIterOptions, +) *MVCCRangeTombstoneIterator { + iter := &MVCCRangeTombstoneIterator{ + iter: r.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + KeyTypes: IterKeyTypeRangesOnly, + LowerBound: EncodeMVCCKey(MVCCKey{Key: opts.LowerBound}), + UpperBound: EncodeMVCCKey(MVCCKey{Key: opts.UpperBound}), + // TODO(erikgrinaker): We do not set Min/MaxTimestampHint here, because + // both are required and it's apparently not always safe to use. + }), + opts: opts, + incomplete: make([]*MVCCRangeKey, 0), + complete: make([]MVCCRangeKey, 0), + } + + // Seek the iterator to the lower bound and iterate until we've collected + // the first complete range tombstone (if any). + iter.iter.SeekGE(MVCCKey{Key: opts.LowerBound}) + iter.findCompleteTombstones() + + return iter +} + +// findCompleteTombstones processes range keys at the current iterator position +// and any subsequent iterator positions until it completes one or more +// tombstones, populating completeTombstones. Current completeTombstones are +// discarded. +func (p *MVCCRangeTombstoneIterator) findCompleteTombstones() { + p.complete = p.complete[:0] + p.completeIdx = 0 + p.updateTombstones() + + for len(p.complete) == 0 && !p.iterDone { + if ok, err := p.iter.Valid(); err != nil { + p.err = err + return + } else if !ok { + break + } + p.iter.Next() + // NB: We update tombstones even if Next() invalidates the iterator, because + // there may be incomplete tombstones that become complete when the iterator + // is exhausted. + p.updateTombstones() + } +} + +// updateTombstones inspects the range keys at the current Pebble iterator +// position, tracks tombstones in incompleteTombstones, and moves any +// completed tombstones into completeTombstones. +func (p *MVCCRangeTombstoneIterator) updateTombstones() { + var startKey, endKey roachpb.Key + var rangeKeys []MVCCRangeKeyValue + + // If the iterator is exhausted, we still want to complete any remaining + // incomplete tombstones. + if ok, err := p.iter.Valid(); err != nil { + p.err = err + return + } else if ok { + startKey, endKey = p.iter.RangeBounds() + rangeKeys = p.iter.RangeKeys() + + // TODO(erikgrinaker): Pebble does not yet truncate range keys to the + // LowerBound or UpperBound of the range, so we truncate them here. + if p.opts.LowerBound != nil && bytes.Compare(startKey, p.opts.LowerBound) < 0 { + startKey = p.opts.LowerBound + } + if p.opts.UpperBound != nil && bytes.Compare(endKey, p.opts.UpperBound) > 0 { + endKey = p.opts.UpperBound + } + } + + // TODO(erikgrinaker): Pebble does not yet respect UpperBound for range keys, + // and seems to go into an infinite loop if we try to exhaust the iterator + // here, so we use p.iterDone to mark it as done. + if len(p.opts.UpperBound) > 0 && bytes.Compare(startKey, p.opts.UpperBound) >= 0 { + p.iterDone = true + startKey, endKey, rangeKeys = nil, nil, nil + } + + // Both RangeKeys and incompleteTombstones are sorted in descending suffix + // order, so we iterate over them in lockstep and insert/update/delete + // incompleteTombstones as appropriate. + var tsIdx, rkIdx int + + for rkIdx < len(rangeKeys) { + rangeKey := rangeKeys[rkIdx] + + // Error on non-tombstone range keys. We expect all range keys to be range + // tombstones currently. + // + // TODO(erikgrinaker): Pebble returns []byte{}, even though we wrote nil. + if len(rangeKey.Value) != 0 { + p.err = errors.Errorf("unexpected value for range key %s, expected nil: %x", + rangeKey.Key, rangeKey.Value) + return + } + + // Filter rangekeys by suffix. + // + // TODO(erikgrinaker): This can be optimized by skipping unnecessary + // comparisons since rangeKeys is sorted by suffix. Maybe later. + if !p.opts.MinTimestamp.IsEmpty() && rangeKey.Key.Timestamp.Less(p.opts.MinTimestamp) { + rkIdx++ + continue + } + if !p.opts.MaxTimestamp.IsEmpty() && p.opts.MaxTimestamp.Less(rangeKey.Key.Timestamp) { + rkIdx++ + continue + } + + // If we're at the end of incompleteTombstones, this range tombstone must be new. + if tsIdx >= len(p.incomplete) { + p.incomplete = append(p.incomplete, &MVCCRangeKey{ + StartKey: append(make([]byte, 0, len(startKey)), startKey...), + EndKey: append(make([]byte, 0, len(endKey)), endKey...), + Timestamp: rangeKey.Key.Timestamp, + }) + rkIdx++ + tsIdx++ + continue + } + + incomplete := p.incomplete[tsIdx] + cmp := incomplete.Timestamp.Compare(rangeKey.Key.Timestamp) + switch { + // If the timestamps match and the key spans are adjacent or overlapping, + // this range key extends the incomplete tombstone. + case cmp == 0 && bytes.Compare(startKey, incomplete.EndKey) <= 0: + incomplete.EndKey = append(incomplete.EndKey[:0], endKey...) + tsIdx++ + rkIdx++ + + // This is a different tombstone at the same suffix: complete the existing + // tombstone and start a new one. + case cmp == 0: + p.complete = append(p.complete, *incomplete) + // NB: can't reuse slices, as they were placed in the completed tombstone. + incomplete.StartKey = append(make([]byte, 0, len(startKey)), startKey...) + incomplete.EndKey = append(make([]byte, 0, len(endKey)), endKey...) + + // This incomplete tombstone is not present at this range key: complete it + // and remove it from the list, then try again. + case cmp == 1: + p.complete = append(p.complete, *incomplete) + p.incomplete = append(p.incomplete[:tsIdx], p.incomplete[tsIdx+1:]...) + + // This range key is a new incomplete tombstone: start tracking it. + case cmp == -1: + p.incomplete = append(p.incomplete[:tsIdx+1], p.incomplete[tsIdx:]...) + p.incomplete[tsIdx] = &MVCCRangeKey{ + StartKey: append(make(roachpb.Key, 0, len(startKey)), startKey...), + EndKey: append(make(roachpb.Key, 0, len(endKey)), endKey...), + Timestamp: rangeKey.Key.Timestamp, + } + tsIdx++ + rkIdx++ + + default: + p.err = errors.Errorf("unexpected comparison result %d", cmp) + return + } + } + + // If the caller has requested tombstone fragments, we complete all tombstones + // we found during this iteration by resetting tsIdx to 0. The loop below will + // handle the rest. + if p.opts.Fragmented { + tsIdx = 0 + } + + // If there are any remaining incomplete tombstones, they must be complete: + // make them so. + for _, ts := range p.incomplete[tsIdx:] { + p.complete = append(p.complete, *ts) + } + p.incomplete = p.incomplete[:tsIdx] +} + +// Close frees up resources held by the iterator. +func (p *MVCCRangeTombstoneIterator) Close() { + p.iter.Close() + p.complete = nil + p.completeIdx = 0 +} + +// Next iterates to the next range tombstone. Note the unusual iteration +// order, see struct comment for details. +func (p *MVCCRangeTombstoneIterator) Next() { + p.completeIdx++ + if p.completeIdx >= len(p.complete) { + p.iter.Next() + // NB: Called even if Next() fails, because we may have incomplete + // tombstones that become complete when the iterator is exhausted. + p.findCompleteTombstones() + } +} + +// Key returns the current range tombstone. It will not be invalidated by the +// iterator, but will be shared by all callers. +func (p *MVCCRangeTombstoneIterator) Key() MVCCRangeKey { + return p.complete[p.completeIdx] +} + +// Valid returns (true, nil) if the iterator points to a valid key, (false, nil) +// if the iterator is exhausted, or (false, error) if an error occurred during +// iteration. +func (p *MVCCRangeTombstoneIterator) Valid() (bool, error) { + if p.err != nil { + return false, p.err + } + if _, err := p.iter.Valid(); err != nil { + return false, err + } + return p.completeIdx < len(p.complete), nil +} diff --git a/pkg/storage/mvcc_range_tombstone_iterator_test.go b/pkg/storage/mvcc_range_tombstone_iterator_test.go new file mode 100644 index 000000000000..1723d424ef21 --- /dev/null +++ b/pkg/storage/mvcc_range_tombstone_iterator_test.go @@ -0,0 +1,170 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage + +import ( + "context" + "testing" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/stretchr/testify/require" +) + +func TestMVCCRangeTombstoneIterator(t *testing.T) { + defer leaktest.AfterTest(t)() + + ctx := context.Background() + eng := NewDefaultInMemForTesting() + defer eng.Close() + + rangeKeys := []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("e", "g", 3), + rangeKey("d", "f", 5), + rangeKey("d", "f", 2), + rangeKey("a", "m", 4), + rangeKey("m", "z", 4), + } + for _, rk := range rangeKeys { + require.NoError(t, ExperimentalMVCCDeleteRangeUsingTombstone( + ctx, eng, nil, rk.StartKey, rk.EndKey, rk.Timestamp, 0)) + } + + testcases := map[string]struct { + opts MVCCRangeTombstoneIterOptions + expect []MVCCRangeKey + }{ + "all tombstones": { + MVCCRangeTombstoneIterOptions{}, + []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("d", "f", 5), + rangeKey("d", "f", 2), + rangeKey("e", "g", 3), + rangeKey("a", "z", 4), + }}, + "truncated tombstones": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("c"), + UpperBound: roachpb.Key("e"), + }, + []MVCCRangeKey{ + rangeKey("d", "e", 5), + rangeKey("c", "e", 4), + rangeKey("d", "e", 2), + }}, + "truncation between tombstone bounds": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("ccc"), + UpperBound: roachpb.Key("eee"), + }, + []MVCCRangeKey{ + rangeKey("d", "eee", 5), + rangeKey("ccc", "eee", 4), + rangeKey("e", "eee", 3), + rangeKey("d", "eee", 2), + }}, + "fragmented tombstones": { + MVCCRangeTombstoneIterOptions{ + Fragmented: true, + }, + []MVCCRangeKey{ + rangeKey("a", "b", 4), + rangeKey("b", "c", 4), + rangeKey("b", "c", 3), + rangeKey("c", "d", 4), + rangeKey("d", "e", 5), + rangeKey("d", "e", 4), + rangeKey("d", "e", 2), + rangeKey("e", "f", 5), + rangeKey("e", "f", 4), + rangeKey("e", "f", 3), + rangeKey("e", "f", 2), + rangeKey("f", "g", 4), + rangeKey("f", "g", 3), + rangeKey("g", "m", 4), + rangeKey("m", "z", 4), + }}, + "empty interval": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("A"), + UpperBound: roachpb.Key("Z"), + }, + nil}, + "zero-length interval": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("c"), + UpperBound: roachpb.Key("c"), + }, + nil}, + "end after start": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("e"), + UpperBound: roachpb.Key("d"), + }, + nil}, + "min timestamp": { + MVCCRangeTombstoneIterOptions{ + MinTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("d", "f", 5), + rangeKey("e", "g", 3), + rangeKey("a", "z", 4), + }}, + "max timestamp": { + MVCCRangeTombstoneIterOptions{ + MaxTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("d", "f", 2), + rangeKey("e", "g", 3), + }}, + "both timestamps": { + MVCCRangeTombstoneIterOptions{ + MinTimestamp: hlc.Timestamp{Logical: 3}, + MaxTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("e", "g", 3), + }}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + var tombstones []MVCCRangeKey + iter := NewMVCCRangeTombstoneIterator(eng, tc.opts) + defer iter.Close() + for { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + tombstones = append(tombstones, iter.Key()) + iter.Next() + } + require.Equal(t, tc.expect, tombstones) + }) + } +} + +func rangeKey(start, end string, ts int) MVCCRangeKey { + return MVCCRangeKey{ + StartKey: roachpb.Key(start), + EndKey: roachpb.Key(end), + Timestamp: hlc.Timestamp{Logical: int32(ts)}, + } +} diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 02e0bbdc8cbb..832b48bef5ed 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -481,6 +481,8 @@ func DefaultPebbleOptions() *pebble.Options { TablePropertyCollectors: PebbleTablePropertyCollectors, BlockPropertyCollectors: PebbleBlockPropertyCollectors, } + // Used for experimental MVCC range tombstones. + opts.Experimental.RangeKeys = new(pebble.RangeKeysArena) // Automatically flush 10s after the first range tombstone is added to a // memtable. This ensures that we can reclaim space even when there's no // activity on the database generating flushes. @@ -1108,6 +1110,24 @@ func (p *Pebble) ClearIterRange(iter MVCCIterator, start, end roachpb.Key) error return batch.Commit(true) } +// ExperimentalPutMVCCRangeKey implements the Engine interface. +func (p *Pebble) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeySet(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), value, pebble.Sync) +} + +// ExperimentalClearMVCCRangeKey implements the Engine interface. +func (p *Pebble) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeyUnset(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), pebble.Sync) +} + // Merge implements the Engine interface. func (p *Pebble) Merge(key MVCCKey, value []byte) error { if len(key.Key) == 0 { @@ -1882,6 +1902,14 @@ func (p *pebbleReadOnly) ClearIterRange(iter MVCCIterator, start, end roachpb.Ke panic("not implemented") } +func (p *pebbleReadOnly) ExperimentalPutMVCCRangeKey(_ MVCCRangeKey, _ []byte) error { + panic("not implemented") +} + +func (p *pebbleReadOnly) ExperimentalClearMVCCRangeKey(_ MVCCRangeKey) error { + panic("not implemented") +} + func (p *pebbleReadOnly) Merge(key MVCCKey, value []byte) error { panic("not implemented") } diff --git a/pkg/storage/pebble_batch.go b/pkg/storage/pebble_batch.go index 117c2995cfa8..5de38132298e 100644 --- a/pkg/storage/pebble_batch.go +++ b/pkg/storage/pebble_batch.go @@ -413,6 +413,24 @@ func (p *pebbleBatch) ClearIterRange(iter MVCCIterator, start, end roachpb.Key) return nil } +// ExperimentalDeleteMVCCRange implements the Batch interface. +func (p *pebbleBatch) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.batch.Experimental().RangeKeySet(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), value, nil) +} + +// ExperimentalClearMVCCRangeTombstone implements the Engine interface. +func (p *pebbleBatch) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeyUnset(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), nil) +} + // Merge implements the Batch interface. func (p *pebbleBatch) Merge(key MVCCKey, value []byte) error { if len(key.Key) == 0 { diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index cc4a5ebe502e..32d1fdd94c90 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -166,6 +166,8 @@ func (p *pebbleIterator) init(handle pebble.Reader, iterToClone cloneableIter, o panic("min timestamp hint set without max timestamp hint") } + p.options.KeyTypes = opts.KeyTypes + if doClone { var err error if p.iter, err = iterToClone.Clone(); err != nil { @@ -575,6 +577,39 @@ func (p *pebbleIterator) ValueProto(msg protoutil.Message) error { return protoutil.Unmarshal(value, msg) } +// HasPointAndRange implements the MVCCIterator interface. +func (p *pebbleIterator) HasPointAndRange() (bool, bool) { + return p.iter.HasPointAndRange() +} + +// RangeBounds implements the MVCCIterator interface. +func (p *pebbleIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + return p.iter.RangeBounds() +} + +// RangeKeys implements the MVCCIterator interface. +func (p *pebbleIterator) RangeKeys() []MVCCRangeKeyValue { + start, end := p.iter.RangeBounds() + rangeKeys := p.iter.RangeKeys() + rangeValues := make([]MVCCRangeKeyValue, 0, len(rangeKeys)) + + for _, rangeKey := range rangeKeys { + timestamp, err := decodeMVCCTimestampSuffix(rangeKey.Suffix) + if err != nil { + panic(err) // TODO(erikgrinaker): Don't panic. + } + rangeValues = append(rangeValues, MVCCRangeKeyValue{ + Key: MVCCRangeKey{ + StartKey: start, + EndKey: end, + Timestamp: timestamp, + }, + Value: rangeKey.Value, + }) + } + return rangeValues +} + // ComputeStats implements the MVCCIterator interface. func (p *pebbleIterator) ComputeStats( start, end roachpb.Key, nowNanos int64, diff --git a/pkg/storage/sst_iterator.go b/pkg/storage/sst_iterator.go index 9bd8b49b1b39..c5c81b756c37 100644 --- a/pkg/storage/sst_iterator.go +++ b/pkg/storage/sst_iterator.go @@ -158,3 +158,18 @@ func (r *sstIterator) UnsafeKey() MVCCKey { func (r *sstIterator) UnsafeValue() []byte { return r.value } + +// HasPointAndRange implements SimpleMVCCIterator. +func (r *sstIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (r *sstIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (r *sstIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} diff --git a/pkg/storage/sst_writer.go b/pkg/storage/sst_writer.go index 9b58c8afebc1..200c8aefac8c 100644 --- a/pkg/storage/sst_writer.go +++ b/pkg/storage/sst_writer.go @@ -118,6 +118,24 @@ func (fw *SSTWriter) ClearMVCCRange(start, end MVCCKey) error { return fw.clearRange(start, end) } +// ExperimentalPutMVCCRangeKey implements the Writer interface. +func (fw *SSTWriter) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return fw.fw.RangeKeySet(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), value) +} + +// ExperimentalClearMVCCRangeKey implements the Writer interface. +func (fw *SSTWriter) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return fw.fw.RangeKeyUnset(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp)) +} + func (fw *SSTWriter) clearRange(start, end MVCCKey) error { if fw.fw == nil { return errors.New("cannot call ClearRange on a closed writer") diff --git a/pkg/storage/testdata/mvcc_histories/delete_range_tombstone b/pkg/storage/testdata/mvcc_histories/delete_range_tombstone new file mode 100644 index 000000000000..df1121b79e21 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/delete_range_tombstone @@ -0,0 +1,111 @@ +# TODO(erikgrinaker): The MVCC API does not respect range tombstones yet, so +# we don't test point keys because they remain unaffected. +# TODO(erikgrinaker): This needs conflict tests, implement later. + +# Write some range tombstones. Some will abut and merge. +run ok +del_range_ts k=b end=c ts=3 +del_range_ts k=e end=g ts=3 +del_range_ts k=d end=f ts=5 +del_range_ts k=d end=f ts=2 +del_range_ts k=m end=z ts=1 +del_range_ts k=a end=m ts=4 +del_range_ts k=m end=z ts=4 +---- +del_range_ts: {b-c}/3.000000000,0 +del_range_ts: {e-g}/3.000000000,0 +del_range_ts: {d-f}/5.000000000,0 +del_range_ts: {d-f}/2.000000000,0 +del_range_ts: {m-z}/1.000000000,0 +del_range_ts: {a-m}/4.000000000,0 +del_range_ts: {m-z}/4.000000000,0 +>> at end: +range tombstone: {b-c}/3.000000000,0 +range tombstone: {d-f}/5.000000000,0 +range tombstone: {d-f}/2.000000000,0 +range tombstone: {e-g}/3.000000000,0 +range tombstone: {a-z}/4.000000000,0 +range tombstone: {m-z}/1.000000000,0 + +# Scan all tombstones. +run ok +scan_range_ts k=a end=z +---- +scan_range_ts: {b-c}/3.000000000,0 +scan_range_ts: {d-f}/5.000000000,0 +scan_range_ts: {d-f}/2.000000000,0 +scan_range_ts: {e-g}/3.000000000,0 +scan_range_ts: {a-z}/4.000000000,0 +scan_range_ts: {m-z}/1.000000000,0 + +# Scan truncates tombstones to scan bounds. +run ok +scan_range_ts k=c end=e +---- +scan_range_ts: {d-e}/5.000000000,0 +scan_range_ts: {c-e}/4.000000000,0 +scan_range_ts: {d-e}/2.000000000,0 + +# Scan truncates tombstones to scan bounds when not on tombstone bounds. +run ok +scan_range_ts k=ccc end=eee +---- +scan_range_ts: {d-eee}/5.000000000,0 +scan_range_ts: {ccc-eee}/4.000000000,0 +scan_range_ts: e{-ee}/3.000000000,0 +scan_range_ts: {d-eee}/2.000000000,0 + +# Scan at a timestamp. +run ok +scan_range_ts k=a end=z ts=3 +---- +scan_range_ts: {b-c}/3.000000000,0 +scan_range_ts: {d-f}/2.000000000,0 +scan_range_ts: {e-g}/3.000000000,0 +scan_range_ts: {m-z}/1.000000000,0 + +# Empty scans. +run ok +scan_range_ts k=A end=Z +scan_range_ts k=c end=c +scan_range_ts k=z end=a +---- +scan_range_ts: "A"-"Z" -> +scan_range_ts: "c"-"c" -> +scan_range_ts: "z"-"a" -> + +# Remove some tombstones, both a non-existant one and a span across two +# tombstones. +run ok +clear_range_ts k=a end=z ts=10 +clear_range_ts k=b end=g ts=3 +---- +>> at end: +range tombstone: {d-f}/5.000000000,0 +range tombstone: {d-f}/2.000000000,0 +range tombstone: {a-z}/4.000000000,0 +range tombstone: {m-z}/1.000000000,0 + +# Remove the middle section of the [a-z)@4 tombstone. Do it twice, to +# make sure clears are idempotent. +run ok +clear_range_ts k=k end=n ts=4 +clear_range_ts k=k end=n ts=4 +---- +>> at end: +range tombstone: {d-f}/5.000000000,0 +range tombstone: {d-f}/2.000000000,0 +range tombstone: {a-k}/4.000000000,0 +range tombstone: {n-z}/4.000000000,0 +range tombstone: {m-z}/1.000000000,0 + +# Remove portions of the [a-k)@4 and [n-z)@4 tombstones in one operation. +run ok +clear_range_ts k=eee end=ttt ts=4 +---- +>> at end: +range tombstone: {a-eee}/4.000000000,0 +range tombstone: {d-f}/5.000000000,0 +range tombstone: {d-f}/2.000000000,0 +range tombstone: {ttt-z}/4.000000000,0 +range tombstone: {m-z}/1.000000000,0 diff --git a/pkg/util/hlc/timestamp.go b/pkg/util/hlc/timestamp.go index 0263ac1d3946..b84b39207fa6 100644 --- a/pkg/util/hlc/timestamp.go +++ b/pkg/util/hlc/timestamp.go @@ -54,6 +54,22 @@ func (t Timestamp) LessEq(s Timestamp) bool { return t.WallTime < s.WallTime || (t.WallTime == s.WallTime && t.Logical <= s.Logical) } +// Compare returns -1 if this timestamp is lesser than the given timestamp, 1 if +// it is greater, and 0 if they are equal. +func (t Timestamp) Compare(s Timestamp) int { + if t.WallTime > s.WallTime { + return 1 + } else if t.WallTime < s.WallTime { + return -1 + } else if t.Logical > s.Logical { + return 1 + } else if t.Logical < s.Logical { + return -1 + } else { + return 0 + } +} + // String implements the fmt.Stringer interface. func (t Timestamp) String() string { // The following code was originally written as