Skip to content

Commit

Permalink
storage: add experimental MVCC range tombstone primitives
Browse files Browse the repository at this point in the history
This patch adds initial experimental primitives for MVCC range
tombstones and the range keys they build on, based on experimental
Pebble range keys.

* Data structures:
  * `MVCCRangeKey`
  * `MVCCRangeKeyValue`
  * `nil` value for range tombstones (as with point tombstones)

* Engine support for reading, writing, and clearing range keys:
  * `Engine.ExperimentalClearMVCCRangeKey()`
  * `Engine.ExperimentalPutMVCCRangeKey()`
  * `SimpleMVCCIterator.HasPointAndRange()`
  * `SimpleMVCCIterator.RangeBounds()`
  * `SimpleMVCCIterator.RangeKeys()`
  * `MVCCRangeKeyIterator`

* MVCC function for writing range tombstones:
  * `ExperimentalMVCCDeleteRangeUsingTombstone()`

Range tombstones do not have a distinct identity, and should instead be
considered a tombstone continuum: they will merge with abutting
tombstones, can be partially cleared, can split or merge along with
ranges, and so on. Bounded scans will truncate them to the scan bounds.

The generalized range keys that range tombstones build on are also
exposed via the `Engine` API. This is primarily for internal MVCC use.
Exposing this in terms of range key/value pairs rather than range
tombstones allows for additional use-cases such as ranged intents.

Range tombstones are not yet handled in the rest of the MVCC or KV API,
nor are they persisted to disk. Subsequent pull requests will extend
their functionality and integrate them with other components.

Release note: None
  • Loading branch information
erikgrinaker committed Feb 19, 2022
1 parent 9e2bf31 commit ebd1e0b
Show file tree
Hide file tree
Showing 21 changed files with 1,421 additions and 20 deletions.
15 changes: 15 additions & 0 deletions pkg/kv/kvserver/rangefeed/task_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,21 @@ func (s *testIterator) curKV() storage.MVCCKeyValue {
return s.kvs[s.cur]
}

// HasPointAndRange implements SimpleMVCCIterator. It is not implemented for
// testIterator and always panics.
func (s *testIterator) HasPointAndRange() (bool, bool) {
panic("not implemented")
}

// RangeBounds implements SimpleMVCCIterator. It is not implemented for
// testIterator and always panics.
func (s *testIterator) RangeBounds() (roachpb.Key, roachpb.Key) {
panic("not implemented")
}

// RangeKeys implements SimpleMVCCIterator. It is not implemented for
// testIterator and always panics.
func (s *testIterator) RangeKeys() []storage.MVCCRangeKeyValue {
panic("not implemented")
}

func TestInitResolvedTSScan(t *testing.T) {
defer leaktest.AfterTest(t)()
startKey := roachpb.RKey("d")
Expand Down
25 changes: 25 additions & 0 deletions pkg/kv/kvserver/spanset/batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,21 @@ func (i *MVCCIterator) UnsafeValue() []byte {
return i.i.UnsafeValue()
}

// HasPointAndRange implements SimpleMVCCIterator. It is not yet implemented
// for this iterator and always panics; nothing is forwarded to the wrapped
// iterator.
func (i *MVCCIterator) HasPointAndRange() (bool, bool) {
panic("not implemented")
}

// RangeBounds implements SimpleMVCCIterator. It is not yet implemented for
// this iterator and always panics; nothing is forwarded to the wrapped
// iterator.
func (i *MVCCIterator) RangeBounds() (roachpb.Key, roachpb.Key) {
panic("not implemented")
}

// RangeKeys implements SimpleMVCCIterator. It is not yet implemented for this
// iterator and always panics; nothing is forwarded to the wrapped iterator.
func (i *MVCCIterator) RangeKeys() []storage.MVCCRangeKeyValue {
panic("not implemented")
}

// ComputeStats is part of the storage.MVCCIterator interface.
func (i *MVCCIterator) ComputeStats(
start, end roachpb.Key, nowNanos int64,
Expand Down Expand Up @@ -599,6 +614,16 @@ func (s spanSetWriter) ClearIterRange(iter storage.MVCCIterator, start, end roac
return s.w.ClearIterRange(iter, start, end)
}

// ExperimentalPutMVCCRangeKey has the same signature as the storage.Writer
// method of that name. It is not yet implemented for spanSetWriter: it always
// panics, and neither checks spans nor forwards to the underlying writer.
func (s spanSetWriter) ExperimentalPutMVCCRangeKey(
rangeKey storage.MVCCRangeKey, value []byte,
) error {
panic("not implemented")
}

// ExperimentalClearMVCCRangeKey has the same signature as the storage.Writer
// method of that name. It is not yet implemented for spanSetWriter: it always
// panics, and neither checks spans nor forwards to the underlying writer.
func (s spanSetWriter) ExperimentalClearMVCCRangeKey(rangeKey storage.MVCCRangeKey) error {
panic("not implemented")
}

func (s spanSetWriter) Merge(key storage.MVCCKey, value []byte) error {
if s.spansOnly {
if err := s.spans.CheckAllowed(SpanReadWrite, roachpb.Span{Key: key.Key}); err != nil {
Expand Down
2 changes: 2 additions & 0 deletions pkg/storage/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ go_library(
"mvcc_incremental_iterator.go",
"mvcc_key.go",
"mvcc_logical_ops.go",
"mvcc_range_key_iterator.go",
"open.go",
"pebble.go",
"pebble_batch.go",
Expand Down Expand Up @@ -108,6 +109,7 @@ go_test(
"mvcc_incremental_iterator_test.go",
"mvcc_key_test.go",
"mvcc_logical_ops_test.go",
"mvcc_range_key_iterator_test.go",
"mvcc_stats_test.go",
"mvcc_test.go",
"pebble_file_registry_test.go",
Expand Down
105 changes: 104 additions & 1 deletion pkg/storage/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,57 @@ type SimpleMVCCIterator interface {
// reverse iteration to forward iteration.
NextKey()
// UnsafeKey returns the same value as Key, but the memory is invalidated on
// the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. If the iterator
// is on a range key only, this returns the start bound of the range key.
UnsafeKey() MVCCKey
// UnsafeValue returns the same value as Value, but the memory is
// invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}.
UnsafeValue() []byte
// HasPointAndRange returns whether the current iterator position has a point
// key and/or a range key. If Valid() returns true, at least one of these will
// be true. Range keys are only emitted when requested via IterOptions.KeyTypes.
//
// NOTE(review): presumably the first result refers to the point key and the
// second to the range key, following the method name -- confirm.
HasPointAndRange() (bool, bool)
// RangeBounds returns the range bounds for the current range key fragment, if
// any. See RangeKeys() for more info on range key fragments.
//
// NOTE(review): presumably returned as (start, end) with an exclusive end,
// matching the half-open notation used below -- confirm.
//
// NB: It is possible for this to return overlapping range bounds in some
// corner cases, e.g. first [b-c\0) and then [c-d). These overlapping bounds
// will always have the same values, and will always be at the very start of
// the second bounds, i.e. the overlap is [start, start.Next()).
//
// TODO(erikgrinaker): The overlapping range bounds will soon be resolved by
// Pebble defragmenting range keys that straddle SST boundaries. Remove this
// comment then.
RangeBounds() (roachpb.Key, roachpb.Key)
// RangeKeys returns all range key fragments (at different timestamps) at the
// current key position. If we are at a point key, it will return all range
// keys that overlap that point key at any timestamp.
//
// For defragmented iteration, use MVCCRangeKeyIterator instead. Fragmented
// iteration is primarily useful in two cases:
//
// - To iterate over point keys while accessing overlapping range keys
//   (e.g. to determine if a point key is hidden by a range tombstone).
//
// - For partial iteration with later resumption, e.g. Export requests with
//   byte limits that have to return point and range key data for a partial
//   key span and then resume from that point in a later request.
//
// Range keys are fragmented by Pebble such that all overlapping range keys
// between two fragment bounds form a "stack" of range key fragments at
// different timestamps. Fragmentation is desirable at the storage layer to
// store range keys across SSTs and CRDB ranges without incurring
// cross-SST/range access costs. Stacking is desirable to easily see all range
// keys that overlap with a given point, and to translate range keys from the
// 2D MVCC keyspan to the 1D Pebble keyspan.
//
// This fragmentation is non-deterministic, as it also depends on Pebble's
// internal SST structure (which changes with compactions) and the store's
// write history. Range keys will also split and merge along with CRDB ranges,
// can be partially removed by GC, and may be truncated by iterator bounds.
//
// TODO(erikgrinaker): Write a tech note on range keys and link it here.
RangeKeys() []MVCCRangeKeyValue
}

// IteratorStats is returned from {MVCCIterator,EngineIterator}.Stats.
Expand Down Expand Up @@ -309,8 +355,27 @@ type IterOptions struct {
// use such an iterator is to use it in concert with an iterator without
// timestamp hints, as done by MVCCIncrementalIterator.
MinTimestampHint, MaxTimestampHint hlc.Timestamp
// KeyTypes specifies the types of keys to surface: point and/or range keys.
// Use HasPointAndRange() to determine which key type is present at a given
// iterator position, and RangeBounds() and RangeKeys() to access range keys.
// Defaults to IterKeyTypePointsOnly. For more info, see RangeKeys().
KeyTypes IterKeyType
}

// IterKeyType configures which types of keys an iterator should surface. It is
// a type alias for pebble.IterKeyType, so the two are interchangeable.
//
// TODO(erikgrinaker): Combine this with MVCCIterKind somehow.
type IterKeyType = pebble.IterKeyType

// Values for IterOptions.KeyTypes. Each one re-exports the pebble constant of
// the same name.
const (
// IterKeyTypePointsOnly iterates over point keys only.
IterKeyTypePointsOnly = pebble.IterKeyTypePointsOnly
// IterKeyTypePointsAndRanges iterates over both point and range keys.
IterKeyTypePointsAndRanges = pebble.IterKeyTypePointsAndRanges
// IterKeyTypeRangesOnly iterates over range keys only.
IterKeyTypeRangesOnly = pebble.IterKeyTypeRangesOnly
)

// MVCCIterKind is used to inform Reader about the kind of iteration desired
// by the caller.
type MVCCIterKind int
Expand Down Expand Up @@ -584,6 +649,44 @@ type Writer interface {
// returns.
ClearIterRange(iter MVCCIterator, start, end roachpb.Key) error

// ExperimentalClearMVCCRangeKey deletes the MVCC range key described by
// rangeKey: from rangeKey.StartKey (inclusive) to rangeKey.EndKey (exclusive)
// at rangeKey.Timestamp. For any range key that straddles the start and end
// boundaries, only the segments within the boundaries will be cleared. Clears
// are idempotent.
//
// This method is primarily intended for MVCC garbage collection and similar
// internal use. It mutates MVCC history, and does not check for intents or
// other conflicts.
//
// TODO(erikgrinaker): We'll likely need another method that calls through to
// Pebble's RangeKeyDelete(), which removes all range keys in a span. This
// will be used e.g. when removing replicas.
//
// This method is EXPERIMENTAL: range keys are under active development, and
// have severe limitations including being ignored by all KV and MVCC APIs and
// only being stored in memory.
ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error

// ExperimentalPutMVCCRangeKey writes value to the MVCC range key rangeKey. It
// is currently only used for range tombstones, which have a value of nil.
// Range keys exist separately from point keys in Pebble, and must be accessed
// via specialized iterator options and methods -- see e.g.
// IterOptions.KeyTypes, SimpleMVCCIterator.RangeKeys(), and
// MVCCRangeKeyIterator.
//
// A range key does not have a distinct identity, but should be considered a
// key continuum. Range keys can be non-deterministically fragmented by Pebble,
// split/merged along with CRDB ranges, partially removed with
// ExperimentalClearMVCCRangeKey, and truncated during bounded iteration.
//
// TODO(erikgrinaker): Write a tech note on range keys and link it here.
// TODO(erikgrinaker): Pebble will soon defragment range keys that span SST
// boundaries, which will make them deterministic. Update the above then.
//
// This method is EXPERIMENTAL: range keys are under active development, and
// have severe limitations including being ignored by all KV and MVCC APIs and
// only being stored in memory.
//
// The parameters are named for documentation only, matching
// ExperimentalClearMVCCRangeKey; parameter names are not part of the method's
// type, so existing implementations are unaffected.
ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error

// Merge is a high-performance write operation used for values which are
// accumulated over several writes. Multiple values can be merged
// sequentially into a single key; a subsequent read will return a "merged"
Expand Down
15 changes: 15 additions & 0 deletions pkg/storage/intent_interleaving_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,21 @@ func (i *intentInterleavingIter) Value() []byte {
return i.iter.Value()
}

// HasPointAndRange implements SimpleMVCCIterator. It is not yet implemented
// for intentInterleavingIter and always panics.
func (i *intentInterleavingIter) HasPointAndRange() (bool, bool) {
panic("not implemented")
}

// RangeBounds implements SimpleMVCCIterator. It is not yet implemented for
// intentInterleavingIter and always panics.
func (i *intentInterleavingIter) RangeBounds() (roachpb.Key, roachpb.Key) {
panic("not implemented")
}

// RangeKeys implements SimpleMVCCIterator. It is not yet implemented for
// intentInterleavingIter and always panics.
func (i *intentInterleavingIter) RangeKeys() []MVCCRangeKeyValue {
panic("not implemented")
}

func (i *intentInterleavingIter) Close() {
i.iter.Close()
i.intentIter.Close()
Expand Down
16 changes: 16 additions & 0 deletions pkg/storage/multi_iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"bytes"

"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
)

const invalidIdxSentinel = -1
Expand Down Expand Up @@ -92,6 +93,21 @@ func (f *multiIterator) UnsafeValue() []byte {
return f.iters[f.currentIdx].UnsafeValue()
}

// HasPointAndRange implements SimpleMVCCIterator. It is not yet implemented
// for multiIterator and always panics.
func (f *multiIterator) HasPointAndRange() (bool, bool) {
panic("not implemented")
}

// RangeBounds implements SimpleMVCCIterator. It is not yet implemented for
// multiIterator and always panics.
func (f *multiIterator) RangeBounds() (roachpb.Key, roachpb.Key) {
panic("not implemented")
}

// RangeKeys implements SimpleMVCCIterator. It is not yet implemented for
// multiIterator and always panics.
func (f *multiIterator) RangeKeys() []MVCCRangeKeyValue {
panic("not implemented")
}

// Next advances the iterator to the next key/value in the iteration. After this
// call, Valid() will be true if the iterator was not positioned at the last
// key.
Expand Down
34 changes: 34 additions & 0 deletions pkg/storage/mvcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ type MVCCKeyValue struct {
Value []byte
}

// MVCCRangeKeyValue represents a ranged key/value pair.
type MVCCRangeKeyValue struct {
// Key is the MVCC range key that Value is stored under.
Key MVCCRangeKey
// Value is the value stored for the range key. Range tombstones use nil.
Value []byte
}

// optionalValue represents an optional roachpb.Value. It is preferred
// over a *roachpb.Value to avoid the forced heap allocation.
type optionalValue struct {
Expand Down Expand Up @@ -2202,6 +2208,34 @@ func MVCCDeleteRange(
return keys, res.ResumeSpan, res.NumKeys, nil
}

// ExperimentalMVCCDeleteRangeUsingTombstone writes a single MVCC range
// tombstone covering [startKey, endKey) at the given timestamp, instead of
// writing one point tombstone per key. The operation is non-transactional.
//
// Before writing, the span is scanned for intents. If any are found, nothing
// is written and a WriteIntentError carrying the intents found (at most
// maxIntents) is returned. A scan failure is returned as-is.
//
// The ms parameter is accepted for API symmetry but is not currently updated;
// see the TODO below.
//
// This method is EXPERIMENTAL: range keys are under active development, and
// have severe limitations including being ignored by all KV and MVCC APIs and
// only being stored in memory.
//
// TODO(erikgrinaker): Needs handling of conflicts (e.g. WriteTooOldError),
// MVCCStats, and tests.
func ExperimentalMVCCDeleteRangeUsingTombstone(
	ctx context.Context,
	rw ReadWriter,
	ms *enginepb.MVCCStats,
	startKey, endKey roachpb.Key,
	timestamp hlc.Timestamp,
	maxIntents int64,
) error {
	// Refuse to write over existing intents in the span.
	intents, err := ScanIntents(ctx, rw, startKey, endKey, maxIntents, 0)
	if err != nil {
		return err
	}
	if len(intents) > 0 {
		return &roachpb.WriteIntentError{Intents: intents}
	}

	// A range tombstone is a range key with a nil value.
	tombstone := MVCCRangeKey{
		StartKey:  startKey,
		EndKey:    endKey,
		Timestamp: timestamp,
	}
	return rw.ExperimentalPutMVCCRangeKey(tombstone, nil)
}

func recordIteratorStats(traceSpan *tracing.Span, iteratorStats IteratorStats) {
stats := iteratorStats.Stats
if traceSpan != nil {
Expand Down
Loading

0 comments on commit ebd1e0b

Please sign in to comment.