From 52a74cb023c570bd40399f079ac455d35919d45e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 6 Feb 2022 22:22:00 +0000 Subject: [PATCH] storage: add experimental MVCC range key primitives This patch adds initial experimental primitives for MVCC range keys, which will be the foundation for MVCC range tombstones. They are based on experimental Pebble range keys. * Data structures: * `MVCCRangeKey` * internal `nil` value for range tombstones (as with point tombstones) * `Engine` methods for mutating range keys: * `ExperimentalClearMVCCRangeKey()` * `ExperimentalClearAllMVCCRangeKeys()` * `ExperimentalPutMVCCRangeKey()` * `SupportsRangeKeys()` * `SimpleMVCCIterator` methods for accessing range keys: * `HasPointAndRange()` * `RangeBounds()` * `RangeKeys()` Range keys do not have a distinct identity, and should instead be considered a key continuum: they will merge with abutting keys of the same value, can be partially cleared, can split or merge along with ranges, and so on. Bounded scans will truncate them to the scan bounds. Range key support is implemented in `pebbleIterator` and `intentInterleavingIter`, but not in the rest of the MVCC or KV APIs. They are not persisted to disk either. Subsequent pull requests will extend their functionality and integrate them with other components. 
Release note: None --- pkg/kv/kvserver/rangefeed/task_test.go | 15 + pkg/kv/kvserver/spanset/batch.go | 32 + pkg/storage/engine.go | 151 +++- pkg/storage/engine_test.go | 481 ++++++++++++ pkg/storage/intent_interleaving_iter.go | 89 ++- pkg/storage/intent_reader_writer.go | 2 +- pkg/storage/multi_iterator.go | 16 + pkg/storage/mvcc_history_test.go | 290 +++++++- pkg/storage/mvcc_incremental_iterator.go | 15 + pkg/storage/mvcc_key.go | 68 ++ pkg/storage/mvcc_key_test.go | 110 +++ pkg/storage/pebble.go | 100 ++- pkg/storage/pebble_batch.go | 58 +- pkg/storage/pebble_iterator.go | 107 ++- pkg/storage/sst_iterator.go | 15 + pkg/storage/sst_writer.go | 15 + .../testdata/mvcc_histories/range_key_iter | 701 ++++++++++++++++++ .../testdata/mvcc_histories/range_key_put | 42 ++ 18 files changed, 2254 insertions(+), 53 deletions(-) create mode 100644 pkg/storage/testdata/mvcc_histories/range_key_iter create mode 100644 pkg/storage/testdata/mvcc_histories/range_key_put diff --git a/pkg/kv/kvserver/rangefeed/task_test.go b/pkg/kv/kvserver/rangefeed/task_test.go index 87b695a7a228..1e9ddb38a42e 100644 --- a/pkg/kv/kvserver/rangefeed/task_test.go +++ b/pkg/kv/kvserver/rangefeed/task_test.go @@ -190,6 +190,21 @@ func (s *testIterator) curKV() storage.MVCCKeyValue { return s.kvs[s.cur] } +// HasPointAndRange implements SimpleMVCCIterator. +func (s *testIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (s *testIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. 
+func (s *testIterator) RangeKeys() []storage.MVCCRangeKey { + panic("not implemented") +} + func TestInitResolvedTSScan(t *testing.T) { defer leaktest.AfterTest(t)() startKey := roachpb.RKey("d") diff --git a/pkg/kv/kvserver/spanset/batch.go b/pkg/kv/kvserver/spanset/batch.go index 109c045bacd4..5f26a98abcda 100644 --- a/pkg/kv/kvserver/spanset/batch.go +++ b/pkg/kv/kvserver/spanset/batch.go @@ -176,6 +176,21 @@ func (i *MVCCIterator) UnsafeValue() []byte { return i.i.UnsafeValue() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *MVCCIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *MVCCIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *MVCCIterator) RangeKeys() []storage.MVCCRangeKey { + panic("not implemented") +} + // ComputeStats is part of the storage.MVCCIterator interface. func (i *MVCCIterator) ComputeStats( start, end roachpb.Key, nowNanos int64, @@ -476,6 +491,11 @@ func (s spanSetReader) ConsistentIterators() bool { return s.r.ConsistentIterators() } +// SupportsRangeKeys implements the storage.Reader interface. +func (s spanSetReader) SupportsRangeKeys() bool { + return s.r.SupportsRangeKeys() +} + // PinEngineStateForIterators implements the storage.Reader interface. 
func (s spanSetReader) PinEngineStateForIterators() error { return s.r.PinEngineStateForIterators() @@ -589,6 +609,18 @@ func (s spanSetWriter) ClearIterRange(start, end roachpb.Key) error { return s.w.ClearIterRange(start, end) } +func (s spanSetWriter) ExperimentalPutMVCCRangeKey(rangeKey storage.MVCCRangeKey) error { + panic("not implemented") +} + +func (s spanSetWriter) ExperimentalClearMVCCRangeKey(rangeKey storage.MVCCRangeKey) error { + panic("not implemented") +} + +func (s spanSetWriter) ExperimentalClearAllMVCCRangeKeys(start, end roachpb.Key) error { + panic("not implemented") +} + func (s spanSetWriter) Merge(key storage.MVCCKey, value []byte) error { if s.spansOnly { if err := s.spans.CheckAllowed(SpanReadWrite, roachpb.Span{Key: key.Key}); err != nil { diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 7cfac155bb58..f2892bde4c0d 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -45,8 +45,11 @@ func init() { type SimpleMVCCIterator interface { // Close frees up resources held by the iterator. Close() - // SeekGE advances the iterator to the first key in the engine which - // is >= the provided key. + // SeekGE advances the iterator to the first key in the engine which is >= the + // provided key. If range keys are enabled and a range key straddles the seek + // point, it will be surfaced before any point keys unless seeking directly to + // a specific version that exists (including intents, which have version 0 and + // are considered colocated with the bare key). SeekGE(key MVCCKey) // Valid must be called after any call to Seek(), Next(), Prev(), or // similar methods. It returns (true, nil) if the iterator points to @@ -65,6 +68,10 @@ type SimpleMVCCIterator interface { // or the next key if the iterator is currently located at the last version // for a key. NextKey must not be used to switch iteration direction from // reverse iteration to forward iteration. 
+ // + // If range keys are enabled, range and point keys are treated separately, + // except for intents which are colocated with the start of a range key that + // are surfaced together since they have the same Pebble key position. NextKey() // UnsafeKey returns the same value as Key, but the memory is invalidated on // the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. @@ -72,6 +79,43 @@ type SimpleMVCCIterator interface { // UnsafeValue returns the same value as Value, but the memory is // invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. UnsafeValue() []byte + // HasPointAndRange returns whether the current iterator position has a point + // key and/or a range key. If Valid() returns true, one of these will be true, + // otherwise they are both false. Range keys are only emitted when requested + // via IterOptions.KeyTypes. + HasPointAndRange() (bool, bool) + // RangeBounds returns the range bounds for the current range key, or + // (nil, nil) if there are none. The returned keys are only valid until + // the next iterator call. See RangeKeys() for more info on range keys. + RangeBounds() (roachpb.Key, roachpb.Key) + // RangeKeys returns all range keys (for different timestamps) at the current + // key position, or an empty list if there are none. When at a point key, it + // will return all range keys overlapping that point key. The keys are only + // valid until the next iterator operation. Currently, all range keys are MVCC + // range tombstones with an implied value of nil, and the value is therefore + // not exposed. + // + // Range keys are fragmented by Pebble such that all overlapping range keys + // between two fragment bounds form a stack of range key fragments at + // different timestamps. 
For example, writing [a-e)@1 and [c-g)@2 will yield + // this fragment structure: + // + // 2: |---|---| + // 1: |---|---| + // a b c d e f g + // + // Fragmentation makes all range key properties local, which avoids incurring + // unnecessary access costs across SSTs and CRDB ranges. This fragmentation is + // deterministic on the current range key state, and does not depend on write + // history. Stacking allows easy access to all range keys that overlap a given + // point key. + // + // Range keys may merge or fragment due to other range keys, split and merge + // along with CRDB ranges, can be partially removed by GC, and may be + // truncated by iterator bounds. + // + // TODO(erikgrinaker): Write a tech note on range keys and link it here. + RangeKeys() []MVCCRangeKey } // IteratorStats is returned from {MVCCIterator,EngineIterator}.Stats. @@ -128,7 +172,9 @@ type MVCCIterator interface { // keys by avoiding the need to iterate over many deleted intents. SeekIntentGE(key roachpb.Key, txnUUID uuid.UUID) - // Key returns the current key. + // Key returns the current key. If the iterator is on a range key only, this + // returns its start key, or the seek key when calling SeekGE within the range + // key bounds. Key() MVCCKey // UnsafeRawKey returns the current raw key which could be an encoded // MVCCKey, or the more general EngineKey (for a lock table key). @@ -285,8 +331,32 @@ type IterOptions struct { // use such an iterator is to use it in concert with an iterator without // timestamp hints, as done by MVCCIncrementalIterator. MinTimestampHint, MaxTimestampHint hlc.Timestamp + // KeyTypes specifies the types of keys to surface: point and/or range keys. + // Use HasPointAndRange() to determine which key type is present at a given + // iterator position, and RangeBounds() and RangeKeys() to access range keys. + // Defaults to IterKeyTypePointsOnly. For more info, see RangeKeys(). 
+ // + // NB: range keys are only supported for use with MVCCIterators, but it is + // legal to enable them for EngineIterators in order to derive cloned + // MVCCIterators from them. Range key behavior for EngineIterators is + // undefined. + KeyTypes IterKeyType } +// IterKeyType configures which types of keys an iterator should surface. +// +// TODO(erikgrinaker): Combine this with MVCCIterKind somehow. +type IterKeyType = pebble.IterKeyType + +const ( + // IterKeyTypePointsOnly iterates over point keys only. + IterKeyTypePointsOnly = pebble.IterKeyTypePointsOnly + // IterKeyTypePointsAndRanges iterates over both point and range keys. + IterKeyTypePointsAndRanges = pebble.IterKeyTypePointsAndRanges + // IterKeyTypeRangesOnly iterates over only range keys. + IterKeyTypeRangesOnly = pebble.IterKeyTypeRangesOnly +) + // MVCCIterKind is used to inform Reader about the kind of iteration desired // by the caller. type MVCCIterKind int @@ -433,22 +503,28 @@ type Reader interface { // timestamp of the end key; all MVCCKeys at end.Key are considered excluded // in the iteration. MVCCIterate(start, end roachpb.Key, iterKind MVCCIterKind, f func(MVCCKeyValue) error) error - // NewMVCCIterator returns a new instance of an MVCCIterator over this - // engine. The caller must invoke MVCCIterator.Close() when finished - // with the iterator to free resources. + // NewMVCCIterator returns a new instance of an MVCCIterator over this engine. + // The iterator has a consistent view of the underlying reader, and will not + // see later writes -- a new iterator must be created to see these. The caller + // must invoke MVCCIterator.Close() when finished with the iterator to free + // resources. NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions) MVCCIterator // NewEngineIterator returns a new instance of an EngineIterator over this // engine. The caller must invoke EngineIterator.Close() when finished // with the iterator to free resources. 
The caller can change IterOptions - // after this function returns. + // after this function returns. EngineIterators do not support range keys. NewEngineIterator(opts IterOptions) EngineIterator // ConsistentIterators returns true if the Reader implementation guarantees // that the different iterators constructed by this Reader will see the same - // underlying Engine state. NB: this only applies to iterators without - // timestamp hints (see IterOptions), i.e., even if this returns true, those - // iterators can be "inconsistent" in terms of seeing a different engine - // state. The only exception to this is a Reader created using NewSnapshot. + // underlying Engine state. However, if a non-Engine Reader is also a Writer + // (read: a Batch), new iterators will see subsequent writes made to itself, + // but existing iterators won't. ConsistentIterators() bool + // SupportsRangeKeys returns true if the Reader implementation supports + // range keys. + // + // TODO(erikgrinaker): Remove this after 22.2. + SupportsRangeKeys() bool // PinEngineStateForIterators ensures that the state seen by iterators // without timestamp hints (see IterOptions) is pinned and will not see @@ -528,6 +604,8 @@ type Writer interface { // this method actually removes entries from the storage engine. // // It is safe to modify the contents of the arguments after it returns. + // + // TODO(erikgrinaker): This should clear range keys too. ClearMVCCRangeAndIntents(start, end roachpb.Key) error // ClearMVCCRange removes MVCC keys from start (inclusive) to end // (exclusive). It should not be expected to clear intents, though may clear @@ -543,8 +621,59 @@ type Writer interface { // iterate over point keys and remove them from the storage engine using // per-key storage tombstones (not MVCC tombstones). Any separated // intents/locks will also be cleared. + // + // TODO(erikgrinaker): This should clear range keys too. 
ClearIterRange(start, end roachpb.Key) error + // ExperimentalClearMVCCRangeKey deletes an MVCC range key from start + // (inclusive) to end (exclusive) at the given timestamp. For any range key + // that straddles the start and end boundaries, only the segments within the + // boundaries will be cleared. Range keys at other timestamps are unaffected. + // Clears are idempotent. + // + // This method is primarily intended for MVCC garbage collection and similar + // internal use. + // + // This method is EXPERIMENTAL: range keys are under active development, and + // have severe limitations including being ignored by all KV and MVCC APIs and + // only being stored in memory. + ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error + + // ExperimentalClearAllMVCCRangeKeys deletes all MVCC range keys (i.e. all + // versions) from start (inclusive) to end (exclusive). For any range key + // that straddles the start and end boundaries, only the segments within the + // boundaries will be cleared. Clears are idempotent. + // + // This method is primarily intended for MVCC garbage collection and similar + // internal use. + // + // This method is EXPERIMENTAL: range keys are under active development, and + // have severe limitations including being ignored by all KV and MVCC APIs and + // only being stored in memory. + ExperimentalClearAllMVCCRangeKeys(start, end roachpb.Key) error + + // ExperimentalPutMVCCRangeKey writes an MVCC range key. It will replace any + // existing keys, or any segments that it overlaps. This is currently only + // used for range tombstones, which have an implicit value of nil, and the + // Pebble value parameter is not exposed (adding this will need changes to + // MVCC stats, GC, scans/gets, and more). + // + // A range key does not have a distinct identity, but should be considered a + // key continuum. 
They can be fragmented or merged by overlapping range keys, + // split/merged along with CRDB ranges, partially removed or replaced, + // and truncated during bounded iteration. + // + // Range keys exist separately from point keys in Pebble, and must be accessed + // via special iterator options and methods such as IterOptions.KeyTypes and + // SimpleMVCCIterator.RangeKeys(). + // + // TODO(erikgrinaker): Write a tech note on range keys and link it here. + // + // This method is EXPERIMENTAL: range keys are under active development, and + // have severe limitations including being ignored by all KV and MVCC APIs and + // only being stored in memory. + ExperimentalPutMVCCRangeKey(MVCCRangeKey) error + // Merge is a high-performance write operation used for values which are // accumulated over several writes. Multiple values can be merged // sequentially into a single key; a subsequent read will return a "merged" diff --git a/pkg/storage/engine_test.go b/pkg/storage/engine_test.go index 68df6fb41245..a3314ceae4a7 100644 --- a/pkg/storage/engine_test.go +++ b/pkg/storage/engine_test.go @@ -26,6 +26,7 @@ import ( "testing" "time" + "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" @@ -1661,3 +1662,483 @@ func TestScanIntents(t *testing.T) { }) } } + +// TestEngineConsistentIterators checks iterator consistency for various readers. +// +// 1. All iterators have a consistent view of the reader as of the time of its +// creation. Subsequent writes are never visible to it. +// +// 2. Iterators on readers with ConsistentIterators=true all have a consistent +// view of the Engine as of the time of the first iterator creation or +// PinEngineStateForIterators call: newer Engine writes are never visible to new +// iterators. 
The opposite is true for ConsistentIterators=false: new iterators +// always see the most recent Engine state at the time of their creation. +// +// 3. New iterators on unindexed Batches never see batch writes. They do satisfy +// ConsistentIterators: they never see new Engine writes after the first +// iterator was created or a PinEngineStateForIterators call. +// +// 4. New iterators on indexed Batches see all batch writes that happened before +// the iterator was created. However, they satisfy ConsistentIterators and +// do not see new Engine writes after the first iterator was created or a +// PinEngineStateForIterators call. Via 1, they never see later batch writes. +func TestEngineConsistentIterators(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + testcases := map[string]struct { + makeReader func(Engine) Reader + expectConsistent bool + canWrite bool + readOwnWrites bool + }{ + "Engine": { + makeReader: func(e Engine) Reader { return e }, + expectConsistent: false, + canWrite: true, + readOwnWrites: true, + }, + "Batch": { + makeReader: func(e Engine) Reader { return e.NewBatch() }, + expectConsistent: true, + canWrite: true, + readOwnWrites: true, + }, + "UnindexedBatch": { + makeReader: func(e Engine) Reader { return e.NewUnindexedBatch(false) }, + expectConsistent: true, + canWrite: true, + readOwnWrites: false, + }, + "ReadOnly": { + makeReader: func(e Engine) Reader { return e.NewReadOnly(StandardDurability) }, + expectConsistent: true, + canWrite: false, + }, + "Snapshot": { + makeReader: func(e Engine) Reader { return e.NewSnapshot() }, + expectConsistent: true, + canWrite: false, + }, + } + keyKinds := []interface{}{MVCCKeyAndIntentsIterKind, MVCCKeyIterKind} + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + testutils.RunValues(t, "IterKind", keyKinds, func(t *testing.T, iterKindI interface{}) { + iterKind := iterKindI.(MVCCIterKind) + eng := NewDefaultInMemForTesting() + defer eng.Close() + + 
// Write initial point and range keys. + require.NoError(t, eng.PutMVCC(pointKey("a", 1), []byte("a1"))) + require.NoError(t, eng.ExperimentalPutMVCCRangeKey(rangeKey("b", "c", 1))) + + // Set up two readers: one regular and one which will be pinned. + r := tc.makeReader(eng) + defer r.Close() + rPinned := tc.makeReader(eng) + defer rPinned.Close() + + require.Equal(t, tc.expectConsistent, r.ConsistentIterators()) + + // Create an iterator. This will see the old engine state regardless + // of the type of reader. + opts := IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: keys.LocalMax, + UpperBound: keys.MaxKey, + } + iterOld := r.NewMVCCIterator(iterKind, opts) + defer iterOld.Close() + + // Pin the pinned reader, if it supports it. This should ensure later + // iterators see the current state. + if rPinned.ConsistentIterators() { + require.NoError(t, rPinned.PinEngineStateForIterators()) + } else { + require.Error(t, rPinned.PinEngineStateForIterators()) + } + + // Write new point and range keys to the engine, and set up the expected + // old and new results. + require.NoError(t, eng.PutMVCC(pointKey("a", 2), []byte("a2"))) + require.NoError(t, eng.ExperimentalPutMVCCRangeKey(rangeKey("b", "c", 2))) + + expectOld := []interface{}{ + pointKV("a", 1, "a1"), + rangeKey("b", "c", 1), + } + expectNew := []interface{}{ + pointKV("a", 2, "a2"), + pointKV("a", 1, "a1"), + rangeKey("b", "c", 2), + rangeKey("b", "c", 1), + } + + // Opened iterators should all see the old iterator state, regardless of + // reader type. + require.Equal(t, expectOld, scanIter(t, iterOld)) + + // Create new iterators from the pinned readers. Consistent iterators + // should see the old state, others should see the new state. 
+ iterPinned := rPinned.NewMVCCIterator(iterKind, opts) + defer iterPinned.Close() + if rPinned.ConsistentIterators() { + require.Equal(t, expectOld, scanIter(t, iterPinned)) + } else { + require.Equal(t, expectNew, scanIter(t, iterPinned)) + } + + // Create another iterator from the reader. Consistent iterators should + // see the old state, others should see the new state. + iterNew := r.NewMVCCIterator(iterKind, opts) + defer iterNew.Close() + if r.ConsistentIterators() { + require.Equal(t, expectOld, scanIter(t, iterNew)) + } else { + require.Equal(t, expectNew, scanIter(t, iterNew)) + } + + // If the reader is also a writer, check interactions with writes. + // In particular, a Batch should read its own writes for any new + // iterators, but not for any existing iterators. + if tc.canWrite { + w, ok := r.(Writer) + require.Equal(t, tc.canWrite, ok) + + require.NoError(t, w.PutMVCC(pointKey("a", 3), []byte("a3"))) + require.NoError(t, w.ExperimentalPutMVCCRangeKey(rangeKey("b", "c", 3))) + expectNewAndOwn := []interface{}{ + pointKV("a", 3, "a3"), + pointKV("a", 2, "a2"), + pointKV("a", 1, "a1"), + rangeKey("b", "c", 3), + rangeKey("b", "c", 2), + rangeKey("b", "c", 1), + } + expectOldAndOwn := []interface{}{ + pointKV("a", 3, "a3"), + pointKV("a", 1, "a1"), + rangeKey("b", "c", 3), + rangeKey("b", "c", 1), + } + + // TODO(erikgrinaker): existing batch iterators see new writes, and + // new batch iterators don't see new range keys. See: + // https://github.com/cockroachdb/pebble/issues/1638 + if name == "Batch" { + return + } + + // The existing iterators should see the same state as before these + // writes, because they always have a consistent view from when they + // were created. 
+ require.Equal(t, expectOld, scanIter(t, iterOld)) + if r.ConsistentIterators() { + require.Equal(t, expectOld, scanIter(t, iterPinned)) + require.Equal(t, expectOld, scanIter(t, iterNew)) + } else { + require.Equal(t, expectNew, scanIter(t, iterPinned)) + require.Equal(t, expectNew, scanIter(t, iterNew)) + } + + // A new iterator should read our own writes if the reader supports it, + // but consistent iterators should not see the changes to the underlying + // engine either way. + iterOwn := r.NewMVCCIterator(iterKind, opts) + defer iterOwn.Close() + if tc.readOwnWrites { + if r.ConsistentIterators() { + require.Equal(t, expectOldAndOwn, scanIter(t, iterOwn)) + } else { + require.Equal(t, expectNewAndOwn, scanIter(t, iterOwn)) + } + } else { + if r.ConsistentIterators() { + require.Equal(t, expectOld, scanIter(t, iterOwn)) + } else { + require.Equal(t, expectNew, scanIter(t, iterOwn)) + } + } + } + }) + }) + } +} + +// TestEngineRangeKeyMutations tests that range key mutations work as +// expected, both for the engine directly and for batches. +func TestEngineRangeKeyMutations(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + testutils.RunTrueAndFalse(t, "batch", func(t *testing.T, useBatch bool) { + eng := NewDefaultInMemForTesting() + defer eng.Close() + + rw := ReadWriter(eng) + if useBatch { + // TODO(erikgrinaker): Pebble batches do not read back range key writes + // after the iterator was created, which breaks the iterator reuse in + // storage.Batch. See: https://github.com/cockroachdb/pebble/issues/1638 + // + //rw = eng.NewBatch() + //defer rw.Close() + return + } + + require.True(t, rw.SupportsRangeKeys()) + + // Check errors for invalid, empty, and zero-length range keys. Not + // exhaustive, since we assume validation dispatches to + // MVCCRangeKey.Validate() which is tested separately. 
+ empty := MVCCRangeKey{} + invalid := rangeKey("b", "a", 1) + zeroLength := rangeKey("a", "a", 1) + + require.Error(t, rw.ExperimentalPutMVCCRangeKey(empty)) + require.Error(t, rw.ExperimentalPutMVCCRangeKey(invalid)) + require.Error(t, rw.ExperimentalPutMVCCRangeKey(zeroLength)) + + require.Error(t, rw.ExperimentalClearMVCCRangeKey(empty)) + require.Error(t, rw.ExperimentalClearMVCCRangeKey(invalid)) + require.Error(t, rw.ExperimentalClearMVCCRangeKey(zeroLength)) + + require.Error(t, rw.ExperimentalClearAllMVCCRangeKeys(empty.StartKey, empty.EndKey)) + require.Error(t, rw.ExperimentalClearAllMVCCRangeKeys(invalid.StartKey, invalid.EndKey)) + require.Error(t, rw.ExperimentalClearAllMVCCRangeKeys(zeroLength.StartKey, zeroLength.EndKey)) + + require.Empty(t, scanRangeKeys(t, rw)) + + // Write some range keys and read the fragmented keys back. + rangeKeys := []MVCCRangeKey{ + rangeKey("a", "d", 1), + rangeKey("f", "h", 1), + rangeKey("c", "g", 2), + } + for _, rangeKey := range rangeKeys { + require.NoError(t, rw.ExperimentalPutMVCCRangeKey(rangeKey)) + } + require.Equal(t, []MVCCRangeKey{ + rangeKey("a", "c", 1), + rangeKey("c", "d", 2), + rangeKey("c", "d", 1), + rangeKey("d", "f", 2), + rangeKey("f", "g", 2), + rangeKey("f", "g", 1), + rangeKey("g", "h", 1), + }, scanRangeKeys(t, rw)) + + // Clear the f-g portion of [f-h)@1, twice for idempotency. This should not + // affect any other range keys, apart from removing the fragment boundary + // at f for [d-g)@2. + require.NoError(t, rw.ExperimentalClearMVCCRangeKey(rangeKey("f", "g", 1))) + require.NoError(t, rw.ExperimentalClearMVCCRangeKey(rangeKey("f", "g", 1))) + require.Equal(t, []MVCCRangeKey{ + rangeKey("a", "c", 1), + rangeKey("c", "d", 2), + rangeKey("c", "d", 1), + rangeKey("d", "g", 2), + rangeKey("g", "h", 1), + }, scanRangeKeys(t, rw)) + + // Write [e-f)@2 on top of existing [d-g)@2. This should be a noop. 
+ require.NoError(t, rw.ExperimentalPutMVCCRangeKey(rangeKey("e", "f", 2))) + require.Equal(t, []MVCCRangeKey{ + rangeKey("a", "c", 1), + rangeKey("c", "d", 2), + rangeKey("c", "d", 1), + rangeKey("d", "g", 2), + rangeKey("g", "h", 1), + }, scanRangeKeys(t, rw)) + + // Clear all range keys in the [c-f) span. + require.NoError(t, rw.ExperimentalClearAllMVCCRangeKeys(roachpb.Key("c"), roachpb.Key("f"))) + require.Equal(t, []MVCCRangeKey{ + rangeKey("a", "c", 1), + rangeKey("f", "g", 2), + rangeKey("g", "h", 1), + }, scanRangeKeys(t, rw)) + + // Write another range key to bridge the [c-g)@1 gap. + require.NoError(t, rw.ExperimentalPutMVCCRangeKey(rangeKey("c", "g", 1))) + require.Equal(t, []MVCCRangeKey{ + rangeKey("a", "f", 1), + rangeKey("f", "g", 2), + rangeKey("f", "g", 1), + rangeKey("g", "h", 1), + }, scanRangeKeys(t, rw)) + + // If using a batch, make sure nothing has been written to the engine, then + // commit the batch and make sure it gets written to the engine. + if useBatch { + require.Empty(t, scanRangeKeys(t, eng)) + require.NoError(t, rw.(Batch).Commit(true)) + require.Equal(t, []MVCCRangeKey{ + rangeKey("a", "f", 1), + rangeKey("f", "g", 2), + rangeKey("f", "g", 1), + rangeKey("g", "h", 1), + }, scanRangeKeys(t, eng)) + } + }) +} + +// TestEngineRangeKeysUnsupported tests that engines without range key +// support behave as expected, i.e. writes fail but reads degrade gracefully. +func TestEngineRangeKeysUnsupported(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + // Set up an engine with a version that doesn't support range keys. 
+ version := clusterversion.ByKey(clusterversion.EnsurePebbleFormatVersionRangeKeys - 1) + st := cluster.MakeTestingClusterSettingsWithVersions(version, version, true) + + eng := NewDefaultInMemForTesting(Settings(st)) + defer eng.Close() + + require.NoError(t, eng.PutMVCC(pointKey("a", 1), []byte("a1"))) + + batch := eng.NewBatch() + defer batch.Close() + snapshot := eng.NewSnapshot() + defer snapshot.Close() + readOnly := eng.NewReadOnly(StandardDurability) + defer readOnly.Close() + + writers := map[string]Writer{ + "engine": eng, + "batch": batch, + } + readers := map[string]Reader{ + "engine": eng, + "batch": batch, + "snapshot": snapshot, + "readonly": readOnly, + } + + // Range key puts should error, but clears are noops (since old databases + // cannot contain range keys by definition). + for name, w := range writers { + t.Run(fmt.Sprintf("write/%s", name), func(t *testing.T) { + rangeKey := rangeKey("a", "b", 2) + err := w.ExperimentalPutMVCCRangeKey(rangeKey) + require.Error(t, err) + require.Contains(t, err.Error(), "range keys not supported") + require.NoError(t, w.ExperimentalClearMVCCRangeKey(rangeKey)) + require.NoError(t, w.ExperimentalClearAllMVCCRangeKeys(rangeKey.StartKey, rangeKey.EndKey)) + }) + } + + // All range key iterators should degrade gracefully to point key iterators, + // and be empty for IterKeyTypeRangesOnly. 
+ keyTypes := map[string]IterKeyType{ + "PointsOnly": IterKeyTypePointsOnly, + "PointsAndRanges": IterKeyTypePointsAndRanges, + "RangesOnly": IterKeyTypeRangesOnly, + } + for name, r := range readers { + for keyTypeName, keyType := range keyTypes { + t.Run(fmt.Sprintf("read/%s/%s", name, keyTypeName), func(t *testing.T) { + require.False(t, r.SupportsRangeKeys()) + + iter := r.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{ + KeyTypes: keyType, + UpperBound: keys.MaxKey, + }) + defer iter.Close() + + iter.SeekGE(pointKey("a", 0)) + + ok, err := iter.Valid() + require.NoError(t, err) + + if keyType == IterKeyTypeRangesOnly { + // With RangesOnly, the iterator must be empty. + require.False(t, ok) + hasPoint, hasRange := iter.HasPointAndRange() + require.False(t, hasPoint) + require.False(t, hasRange) + return + } + + require.True(t, ok) + require.Equal(t, pointKey("a", 1), iter.UnsafeKey()) + require.Equal(t, []byte("a1"), iter.UnsafeValue()) + + hasPoint, hasRange := iter.HasPointAndRange() + require.True(t, hasPoint) + require.False(t, hasRange) + rangeStart, rangeEnd := iter.RangeBounds() + require.Nil(t, rangeStart) + require.Nil(t, rangeEnd) + require.Empty(t, iter.RangeKeys()) + + // Exhaust the iterator. 
+ iter.Next() + ok, err = iter.Valid() + require.NoError(t, err) + require.False(t, ok) + }) + } + } +} + +func scanRangeKeys(t *testing.T, r Reader) []MVCCRangeKey { + t.Helper() + + iter := r.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + KeyTypes: IterKeyTypeRangesOnly, + LowerBound: keys.LocalMax, + UpperBound: keys.MaxKey, + }) + defer iter.Close() + iter.SeekGE(MVCCKey{Key: keys.LocalMax}) + + var rangeKeys []MVCCRangeKey + for { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + for _, rangeKey := range iter.RangeKeys() { + rangeKeys = append(rangeKeys, rangeKey.Clone()) + } + iter.Next() + } + return rangeKeys +} + +func scanIter(t *testing.T, iter MVCCIterator) []interface{} { + t.Helper() + + iter.SeekGE(MVCCKey{Key: keys.LocalMax}) + + var keys []interface{} + var prevRangeStart roachpb.Key + for { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + hasPoint, hasRange := iter.HasPointAndRange() + if hasRange { + if rangeStart, _ := iter.RangeBounds(); !rangeStart.Equal(prevRangeStart) { + for _, rk := range iter.RangeKeys() { + keys = append(keys, rk.Clone()) + } + prevRangeStart = rangeStart.Clone() + } + } + if hasPoint { + keys = append(keys, MVCCKeyValue{ + Key: iter.Key(), + Value: iter.Value(), + }) + } + iter.Next() + } + return keys +} diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index ba038a23be93..ef2da9b85597 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -233,7 +233,8 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator if reader.ConsistentIterators() { iter = reader.NewMVCCIterator(MVCCKeyIterKind, opts) } else { - iter = newPebbleIterator(nil, intentIter.GetRawIter(), opts, StandardDurability) + iter = newPebbleIterator( + nil, intentIter.GetRawIter(), opts, StandardDurability, reader.SupportsRangeKeys()) } *iiIter = intentInterleavingIter{ @@ -299,6 
+300,36 @@ func (i *intentInterleavingIter) makeLowerLimitKey() roachpb.Key { return i.intentLimitKeyBuf } +// maybeSkipIntentRangeKey will step iter once (forwards or backwards) if iter +// is positioned on a bare range key with the same key position (either start +// key or seek key) as the current intentIter intent. We consider an intent with +// timestamp 0 to be colocated with the bare key, such that seeking to the bare +// intent key will not surface the bare range key by itself first (similarly to +// seeking to an existing version of a point key). +// +// In the forward direction, this is necessary when intentIter lands on a new +// intent, to ensure iter is positioned on the provisional value instead of the +// bare range key. This must be done after positioning both iterators. +// +// In the reverse direction, this is necessary to skip over the bare range key +// when leaving the intent. This must be done before stepping either iterator. +// +// NB: This is called before computePos(), and can't rely on intentCmp. 
+func (i *intentInterleavingIter) maybeSkipIntentRangeKey() error { + if i.iterValid { + hasPoint, hasRange := i.iter.HasPointAndRange() + if hasRange && !hasPoint && i.iterKey.Key.Equal(i.intentKey) { + if i.dir == 1 { + i.iter.Next() + } else { + i.iter.Prev() + } + return i.tryDecodeKey() + } + } + return nil +} + func (i *intentInterleavingIter) SeekGE(key MVCCKey) { i.dir = +1 i.valid = true @@ -332,6 +363,9 @@ func (i *intentInterleavingIter) SeekGE(key MVCCKey) { if err = i.tryDecodeLockKey(iterState, err); err != nil { return } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } } i.computePos() } @@ -361,6 +395,9 @@ func (i *intentInterleavingIter) SeekIntentGE(key roachpb.Key, txnUUID uuid.UUID if err = i.tryDecodeLockKey(iterState, err); err != nil { return } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.computePos() } @@ -406,6 +443,8 @@ func (i *intentInterleavingIter) computePos() { } if i.intentKey == nil { i.intentCmp = i.dir + } else if hasPoint, _ := i.iter.HasPointAndRange(); !hasPoint { + i.intentCmp = 1 // bare range keys sort before intents } else { i.intentCmp = i.intentKey.Compare(i.iterKey.Key) } @@ -494,6 +533,9 @@ func (i *intentInterleavingIter) Next() { if err = i.tryDecodeLockKey(iterState, err); err != nil { return } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.computePos() return } @@ -512,6 +554,9 @@ func (i *intentInterleavingIter) Next() { if err := i.tryDecodeKey(); err != nil { return } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.intentCmp = 0 if !i.iterValid { i.err = errors.Errorf("intent has no provisional value") @@ -541,6 +586,9 @@ func (i *intentInterleavingIter) Next() { if err = i.tryDecodeLockKey(iterState, err); err != nil { return } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.intentCmp = +1 if util.RaceEnabled && iterState == pebble.IterValid { cmp := i.intentKey.Compare(i.iterKey.Key) @@ -579,6 +627,9 @@ 
func (i *intentInterleavingIter) Next() { if err = i.tryDecodeLockKey(iterState, err); err != nil { return } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.intentCmp = +1 if util.RaceEnabled && i.intentKey != nil { cmp := i.intentKey.Compare(i.iterKey.Key) @@ -605,6 +656,9 @@ func (i *intentInterleavingIter) Next() { return } } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.computePos() } } @@ -641,6 +695,9 @@ func (i *intentInterleavingIter) NextKey() { if err := i.tryDecodeLockKey(iterState, err); err != nil { return } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.computePos() return } @@ -658,6 +715,9 @@ func (i *intentInterleavingIter) NextKey() { return } } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.computePos() } @@ -715,6 +775,24 @@ func (i *intentInterleavingIter) Value() []byte { return i.iter.Value() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *intentInterleavingIter) HasPointAndRange() (bool, bool) { + // NB: We assume that EngineIterators, i.e. intentIter, cannot have range keys. + hasPoint, hasRange := i.iter.HasPointAndRange() + hasPoint = hasPoint || (i.valid && i.isCurAtIntentIter()) + return hasPoint, hasRange +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *intentInterleavingIter) RangeBounds() (roachpb.Key, roachpb.Key) { + return i.iter.RangeBounds() +} + +// RangeKeys implements SimpleMVCCIterator. 
+func (i *intentInterleavingIter) RangeKeys() []MVCCRangeKey { + return i.iter.RangeKeys() +} + func (i *intentInterleavingIter) Close() { i.iter.Close() i.intentIter.Close() @@ -830,6 +908,9 @@ func (i *intentInterleavingIter) Prev() { if err := i.tryDecodeKey(); err != nil { return } + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } i.intentCmp = +1 if util.RaceEnabled && i.iterValid { cmp := i.intentKey.Compare(i.iterKey.Key) @@ -862,7 +943,11 @@ func (i *intentInterleavingIter) Prev() { // The iterator is positioned at an intent in intentIter, and iter is // exhausted or positioned at a versioned value of a preceding key. // Stepping intentIter backward will ensure that intentKey is <= the key - // of iter (when neither is exhausted). + // of iter (when neither is exhausted). We also skip over the bare range + // key whose start key is colocated with the intent, if any. + if err := i.maybeSkipIntentRangeKey(); err != nil { + return + } var limitKey roachpb.Key if i.iterValid { limitKey = i.makeLowerLimitKey() diff --git a/pkg/storage/intent_reader_writer.go b/pkg/storage/intent_reader_writer.go index 3217153b8950..e2c908d4575c 100644 --- a/pkg/storage/intent_reader_writer.go +++ b/pkg/storage/intent_reader_writer.go @@ -162,7 +162,7 @@ func (imr *intentInterleavingReader) NewMVCCIterator( iterKind == MVCCKeyAndIntentsIterKind { panic("cannot ask for interleaved intents when specifying timestamp hints") } - if iterKind == MVCCKeyIterKind { + if iterKind == MVCCKeyIterKind || opts.KeyTypes == IterKeyTypeRangesOnly { return imr.wrappableReader.NewMVCCIterator(MVCCKeyIterKind, opts) } return newIntentInterleavingIterator(imr.wrappableReader, opts) diff --git a/pkg/storage/multi_iterator.go b/pkg/storage/multi_iterator.go index 9838adf60ec7..51bbb1f1e38d 100644 --- a/pkg/storage/multi_iterator.go +++ b/pkg/storage/multi_iterator.go @@ -14,6 +14,7 @@ import ( "bytes" "github.com/cockroachdb/cockroach/pkg/keys" + 
"github.com/cockroachdb/cockroach/pkg/roachpb" ) const invalidIdxSentinel = -1 @@ -92,6 +93,21 @@ func (f *multiIterator) UnsafeValue() []byte { return f.iters[f.currentIdx].UnsafeValue() } +// HasPointAndRange implements SimpleMVCCIterator. +func (f *multiIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (f *multiIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (f *multiIterator) RangeKeys() []MVCCRangeKey { + panic("not implemented") +} + // Next advances the iterator to the next key/value in the iteration. After this // call, Valid() will be true if the iterator was not positioned at the last // key. diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index cd3f571bf4e0..39664e26db66 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -13,6 +13,7 @@ package storage import ( "context" "fmt" + "path/filepath" "strconv" "strings" "testing" @@ -28,6 +29,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/skip" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -55,13 +57,23 @@ import ( // resolve_intent t= k= [status=] // check_intent k= [none] // -// cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] -// del [t=] [ts=[,]] [resolve [status=]] k= -// del_range [t=] [ts=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] -// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] -// increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] -// put [t=] [ts=[,]] [resolve [status=]] k= v= 
[raw] -// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] [max=] [targetbytes=] [avoidExcess] [allowEmpty] +// cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] +// del [t=] [ts=[,]] [resolve [status=]] k= +// del_range [t=] [ts=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] +// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] +// increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] +// put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] +// put_rangekey k= end= ts=[,] +// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] [max=] [targetbytes=] [avoidExcess] [allowEmpty] +// +// iter_new [k=] [end=] [kind=keys|keysAndIntents] [types=pointsOnly|pointsAndRanges|rangesOnly] +// iter_seek_ge k= [ts=[,]] +// iter_seek_lt k= [ts=[,]] +// iter_seek_intent_ge k= txn= +// iter_next +// iter_next_key +// iter_prev +// iter_scan [reverse] // // merge [ts=[,]] k= v= [raw] // @@ -112,8 +124,35 @@ func TestMVCCHistories(t *testing.T) { defer engine.Close() reportDataEntries := func(buf *redact.StringBuilder) error { - hasData := false - err := engine.MVCCIterate(span.Key, span.EndKey, MVCCKeyAndIntentsIterKind, func(r MVCCKeyValue) error { + var hasData bool + + iter := engine.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + KeyTypes: IterKeyTypeRangesOnly, + LowerBound: span.Key, + UpperBound: span.EndKey, + }) + defer iter.Close() + iter.SeekGE(MVCCKey{Key: span.Key}) + for { + if ok, err := iter.Valid(); err != nil { + return err + } else if !ok { + break + } + hasData = true + start, end := iter.RangeBounds() + buf.Printf("rangekey: %s/[", roachpb.Span{Key: start, EndKey: end}) + for i, rangeKey := range iter.RangeKeys() { + if i > 
0 { + buf.Printf(" ") + } + buf.Printf("%s", rangeKey.Timestamp) + } + buf.Printf("]\n") + iter.Next() + } + + err = engine.MVCCIterate(span.Key, span.EndKey, MVCCKeyAndIntentsIterKind, func(r MVCCKeyValue) error { hasData = true if r.Key.Timestamp.IsEmpty() { // Meta is at timestamp zero. @@ -135,6 +174,7 @@ func TestMVCCHistories(t *testing.T) { } e := newEvalCtx(ctx, engine) + defer e.close() datadriven.RunTest(t, path, func(t *testing.T, d *datadriven.TestData) string { // We'll be overriding cmd/cmdargs below, because the @@ -396,15 +436,25 @@ var commands = map[string]cmd{ // TODO(nvanbenschoten): test "resolve_intent_range". "check_intent": {typReadOnly, cmdCheckIntent}, - "clear_range": {typDataUpdate, cmdClearRange}, - "cput": {typDataUpdate, cmdCPut}, - "del": {typDataUpdate, cmdDelete}, - "del_range": {typDataUpdate, cmdDeleteRange}, - "get": {typReadOnly, cmdGet}, - "increment": {typDataUpdate, cmdIncrement}, - "merge": {typDataUpdate, cmdMerge}, - "put": {typDataUpdate, cmdPut}, - "scan": {typReadOnly, cmdScan}, + "clear_range": {typDataUpdate, cmdClearRange}, + "cput": {typDataUpdate, cmdCPut}, + "del": {typDataUpdate, cmdDelete}, + "del_range": {typDataUpdate, cmdDeleteRange}, + "get": {typReadOnly, cmdGet}, + "increment": {typDataUpdate, cmdIncrement}, + "merge": {typDataUpdate, cmdMerge}, + "put": {typDataUpdate, cmdPut}, + "put_rangekey": {typDataUpdate, cmdPutRangeKey}, + "scan": {typReadOnly, cmdScan}, + + "iter_new": {typReadOnly, cmdIterNew}, + "iter_seek_ge": {typReadOnly, cmdIterSeekGE}, + "iter_seek_lt": {typReadOnly, cmdIterSeekLT}, + "iter_seek_intent_ge": {typReadOnly, cmdIterSeekIntentGE}, + "iter_next": {typReadOnly, cmdIterNext}, + "iter_next_key": {typReadOnly, cmdIterNextKey}, + "iter_prev": {typReadOnly, cmdIterPrev}, + "iter_scan": {typReadOnly, cmdIterScan}, } func cmdTxnAdvance(e *evalCtx) error { @@ -832,6 +882,188 @@ func cmdScan(e *evalCtx) error { return err } +func cmdPutRangeKey(e *evalCtx) error { + var rangeKey 
MVCCRangeKey + rangeKey.StartKey, rangeKey.EndKey = e.getKeyRange() + rangeKey.Timestamp = e.getTs(nil) + + return e.withWriter("put_rangekey", func(rw ReadWriter) error { + return rw.ExperimentalPutMVCCRangeKey(rangeKey) + }) +} + +func cmdIterNew(e *evalCtx) error { + var opts IterOptions + if e.hasArg("k") { + opts.LowerBound, opts.UpperBound = e.getKeyRange() + } + if len(opts.UpperBound) == 0 { + opts.UpperBound = keys.MaxKey + } + kind := MVCCKeyAndIntentsIterKind + if e.hasArg("kind") { + var arg string + e.scanArg("kind", &arg) + switch arg { + case "keys": + kind = MVCCKeyIterKind + case "keysAndIntents": + kind = MVCCKeyAndIntentsIterKind + default: + return errors.Errorf("unknown iterator kind %s", arg) + } + } + if e.hasArg("types") { + var arg string + e.scanArg("types", &arg) + switch arg { + case "pointsOnly": + opts.KeyTypes = IterKeyTypePointsOnly + case "pointsAndRanges": + opts.KeyTypes = IterKeyTypePointsAndRanges + case "rangesOnly": + opts.KeyTypes = IterKeyTypeRangesOnly + default: + return errors.Errorf("unknown key type %s", arg) + } + } + + var r, closeReader Reader + rType := util.ConstantWithMetamorphicTestChoice( + fmt.Sprintf("iter-reader@%s", filepath.Base(e.td.Pos)), + "engine", "readonly", "batch", "snapshot").(string) + switch rType { + case "engine": + r = e.engine + case "readonly": + r = e.engine.NewReadOnly(StandardDurability) + case "batch": + r = e.engine.NewBatch() + closeReader = r + case "snapshot": + r = e.engine.NewSnapshot() + closeReader = r + default: + return errors.Errorf("unknown reader type %s", rType) + } + + if e.iter != nil { + e.iter.Close() + } + e.iter = &iterWithCloseReader{ + MVCCIterator: r.NewMVCCIterator(kind, opts), + closeReader: closeReader, + } + return nil +} + +func cmdIterSeekGE(e *evalCtx) error { + key := e.getKey() + ts := e.getTs(nil) + e.iter.SeekGE(MVCCKey{Key: key, Timestamp: ts}) + printIter(e) + return nil +} + +func cmdIterSeekIntentGE(e *evalCtx) error { + key := e.getKey() + var 
txnName string + e.scanArg("txn", &txnName) + txn := e.txns[txnName] + e.iter.SeekIntentGE(key, txn.ID) + printIter(e) + return nil +} + +func cmdIterSeekLT(e *evalCtx) error { + key := e.getKey() + ts := e.getTs(nil) + e.iter.SeekLT(MVCCKey{Key: key, Timestamp: ts}) + printIter(e) + return nil +} + +func cmdIterNext(e *evalCtx) error { + e.iter.Next() + printIter(e) + return nil +} + +func cmdIterNextKey(e *evalCtx) error { + e.iter.NextKey() + printIter(e) + return nil +} + +func cmdIterPrev(e *evalCtx) error { + e.iter.Prev() + printIter(e) + return nil +} + +func cmdIterScan(e *evalCtx) error { + reverse := e.hasArg("reverse") + for { + printIter(e) + if ok, err := e.iter.Valid(); err != nil { + e.Fatalf("%v", err) + } else if !ok { + return nil + } + if reverse { + e.iter.Prev() + } else { + e.iter.Next() + } + } +} + +func printIter(e *evalCtx) { + e.results.buf.Printf("%s:", e.td.Cmd) + defer e.results.buf.Printf("\n") + + hasPoint, hasRange := e.iter.HasPointAndRange() + ok, err := e.iter.Valid() + if err != nil { + e.results.buf.Printf(" err=%v", err) + return + } + if !ok { + if hasPoint || hasRange { + e.t.Fatalf("invalid iterator gave hasPoint=%t hasRange=%t", hasPoint, hasRange) + } + e.results.buf.Print(" .") + return + } + if !hasPoint && !hasRange { + e.t.Fatalf("valid iterator at %s without point nor range keys", e.iter.UnsafeKey()) + } + + if hasPoint { + if !e.iter.UnsafeKey().IsValue() { + meta := enginepb.MVCCMetadata{} + if err := protoutil.Unmarshal(e.iter.UnsafeValue(), &meta); err != nil { + e.Fatalf("%v", err) + } + e.results.buf.Printf(" %s=%+v", e.iter.UnsafeKey(), &meta) + } else { + e.results.buf.Printf(" %s=%s", + e.iter.UnsafeKey(), roachpb.Value{RawBytes: e.iter.UnsafeValue()}.PrettyPrint()) + } + } + if hasRange { + start, end := e.iter.RangeBounds() + e.results.buf.Printf(" %s/[", roachpb.Span{Key: start, EndKey: end}) + for i, rangeKey := range e.iter.RangeKeys() { + if i > 0 { + e.results.buf.Printf(" ") + } + 
e.results.buf.Printf("%s", rangeKey.Timestamp) + } + e.results.buf.Printf("]") + } +} + // evalCtx stored the current state of the environment of a running // script. type evalCtx struct { @@ -842,6 +1074,7 @@ type evalCtx struct { } ctx context.Context engine Engine + iter MVCCIterator t *testing.T td *datadriven.TestData txns map[string]*roachpb.Transaction @@ -857,6 +1090,13 @@ func newEvalCtx(ctx context.Context, engine Engine) *evalCtx { } } +func (e *evalCtx) close() { + if e.iter != nil { + e.iter.Close() + } + // engine is passed in, so it's the caller's responsibility to close it. +} + func (e *evalCtx) getTxnStatus() roachpb.TransactionStatus { status := roachpb.COMMITTED if e.hasArg("status") { @@ -1085,3 +1325,17 @@ func toKey(s string) roachpb.Key { return roachpb.Key(s) } } + +// iterWithCloseReader will close the underlying reader when the +// iterator is closed. +type iterWithCloseReader struct { + MVCCIterator + closeReader Reader +} + +func (i *iterWithCloseReader) Close() { + i.MVCCIterator.Close() + if i.closeReader != nil { + i.closeReader.Close() + } +} diff --git a/pkg/storage/mvcc_incremental_iterator.go b/pkg/storage/mvcc_incremental_iterator.go index 4004f3d84b58..f64774a085af 100644 --- a/pkg/storage/mvcc_incremental_iterator.go +++ b/pkg/storage/mvcc_incremental_iterator.go @@ -488,6 +488,21 @@ func (i *MVCCIncrementalIterator) UnsafeKey() MVCCKey { return i.iter.UnsafeKey() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) RangeKeys() []MVCCRangeKey { + panic("not implemented") +} + // UnsafeValue returns the same value as Value, but the memory is invalidated on // the next call to {Next,Reset,Close}. 
func (i *MVCCIncrementalIterator) UnsafeValue() []byte { diff --git a/pkg/storage/mvcc_key.go b/pkg/storage/mvcc_key.go index 40d75745af12..47a9d0ae0a05 100644 --- a/pkg/storage/mvcc_key.go +++ b/pkg/storage/mvcc_key.go @@ -327,3 +327,71 @@ func decodeMVCCTimestampSuffix(encodedTS []byte) (hlc.Timestamp, error) { } return decodeMVCCTimestamp(encodedTS[:encodedLen-1]) } + +// MVCCRangeKey is a versioned key span. +type MVCCRangeKey struct { + StartKey roachpb.Key + EndKey roachpb.Key + Timestamp hlc.Timestamp +} + +// Clone returns a copy of the range key. +func (k MVCCRangeKey) Clone() MVCCRangeKey { + // k is already a copy, but byte slices must be cloned. + k.StartKey = k.StartKey.Clone() + k.EndKey = k.EndKey.Clone() + return k +} + +// Compare returns -1 if this key is less than the given key, 0 if they're +// equal, or 1 if this is greater. Comparison is by start,timestamp,end, where +// larger timestamps sort before smaller ones except empty ones which sort first +// (like elsewhere in MVCC). +func (k MVCCRangeKey) Compare(o MVCCRangeKey) int { + if c := k.StartKey.Compare(o.StartKey); c != 0 { + return c + } + if k.Timestamp.IsEmpty() && !o.Timestamp.IsEmpty() { + return -1 + } else if !k.Timestamp.IsEmpty() && o.Timestamp.IsEmpty() { + return 1 + } else if c := k.Timestamp.Compare(o.Timestamp); c != 0 { + return -c // timestamps sort in reverse + } + return k.EndKey.Compare(o.EndKey) +} + +// String formats the range key. +func (k MVCCRangeKey) String() string { + s := roachpb.Span{Key: k.StartKey, EndKey: k.EndKey}.String() + if !k.Timestamp.IsEmpty() { + s += fmt.Sprintf("/%s", k.Timestamp) + } + return s +} + +// Validate returns an error if the range key is invalid. +// +// This validation is for writing range keys (or checking existing range keys), +// not for filters/bounds, so e.g. specifying an empty start key is invalid even +// though it would be valid to start a range key scan at an empty start key. 
+func (k MVCCRangeKey) Validate() (err error) { + defer func() { + err = errors.Wrapf(err, "invalid range key %s", k) + }() + + switch { + case len(k.StartKey) == 0: + // We don't allow an empty start key, because we don't allow writing point + // keys at the empty key. The first valid key is 0x00. + return errors.Errorf("no start key") + case len(k.EndKey) == 0: + return errors.Errorf("no end key") + case k.Timestamp.IsEmpty(): + return errors.Errorf("no timestamp") + case k.StartKey.Compare(k.EndKey) >= 0: + return errors.Errorf("start key %s is at or after end key %s", k.StartKey, k.EndKey) + default: + return nil + } +} diff --git a/pkg/storage/mvcc_key_test.go b/pkg/storage/mvcc_key_test.go index 4e8a2ea3d5b0..7679abcb658f 100644 --- a/pkg/storage/mvcc_key_test.go +++ b/pkg/storage/mvcc_key_test.go @@ -381,3 +381,113 @@ func BenchmarkDecodeMVCCKey(b *testing.B) { } benchmarkDecodeMVCCKeyResult = mvccKey // avoid compiler optimizing away function call } + +func TestMVCCRangeKeyString(t *testing.T) { + defer leaktest.AfterTest(t)() + + testcases := map[string]struct { + rk MVCCRangeKey + expect string + }{ + "empty": {MVCCRangeKey{}, "/Min"}, + "only start": {MVCCRangeKey{StartKey: roachpb.Key("foo")}, "foo"}, + "only end": {MVCCRangeKey{EndKey: roachpb.Key("foo")}, "{/Min-foo}"}, + "only timestamp": {MVCCRangeKey{Timestamp: hlc.Timestamp{Logical: 1}}, "/Min/0,1"}, + "only span": {MVCCRangeKey{StartKey: roachpb.Key("a"), EndKey: roachpb.Key("z")}, "{a-z}"}, + "all": {MVCCRangeKey{StartKey: roachpb.Key("a"), EndKey: roachpb.Key("z"), Timestamp: hlc.Timestamp{Logical: 1}}, "{a-z}/0,1"}, + "all overlapping": {MVCCRangeKey{StartKey: roachpb.Key("ab"), EndKey: roachpb.Key("af"), Timestamp: hlc.Timestamp{Logical: 1}}, "a{b-f}/0,1"}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.rk.String()) + }) + } +} + +func TestMVCCRangeKeyCompare(t *testing.T) { + defer leaktest.AfterTest(t)() + + ab1 := 
MVCCRangeKey{roachpb.Key("a"), roachpb.Key("b"), hlc.Timestamp{Logical: 1}} + ac1 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("c"), hlc.Timestamp{Logical: 1}} + ac2 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("c"), hlc.Timestamp{Logical: 2}} + bc0 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 0}} + bc1 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 1}} + bc3 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 3}} + bd4 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("d"), hlc.Timestamp{Logical: 4}} + + testcases := map[string]struct { + a MVCCRangeKey + b MVCCRangeKey + expect int + }{ + "equal": {ac1, ac1, 0}, + "start lt": {ac1, bc1, -1}, + "start gt": {bc1, ac1, 1}, + "end lt": {ab1, ac1, -1}, + "end gt": {ac1, ab1, 1}, + "time lt": {ac2, ac1, -1}, // MVCC timestamps sort in reverse order + "time gt": {ac1, ac2, 1}, // MVCC timestamps sort in reverse order + "empty time lt set": {bc0, bc1, -1}, // empty MVCC timestamps sort before non-empty + "set time gt empty": {bc1, bc0, 1}, // empty MVCC timestamps sort before non-empty + "start time precedence": {ac2, bc3, -1}, // a before b, but 3 before 2; key takes precedence + "time end precedence": {bd4, bc3, -1}, // c before d, but 4 before 3; time takes precedence + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.a.Compare(tc.b)) + }) + } +} + +func TestMVCCRangeKeyValidate(t *testing.T) { + defer leaktest.AfterTest(t)() + + a := roachpb.Key("a") + b := roachpb.Key("b") + blank := roachpb.Key("") + ts1 := hlc.Timestamp{Logical: 1} + + testcases := map[string]struct { + rangeKey MVCCRangeKey + expectErr string // empty if no error + }{ + "valid": {MVCCRangeKey{StartKey: a, EndKey: b, Timestamp: ts1}, ""}, + "empty": {MVCCRangeKey{}, "/Min: no start key"}, + "no start": {MVCCRangeKey{EndKey: b, Timestamp: ts1}, "{/Min-b}/0,1: no start key"}, + "no end": {MVCCRangeKey{StartKey: a, 
Timestamp: ts1}, "a/0,1: no end key"}, + "no timestamp": {MVCCRangeKey{StartKey: a, EndKey: b}, "{a-b}: no timestamp"}, + "blank start": {MVCCRangeKey{StartKey: blank, EndKey: b, Timestamp: ts1}, "{/Min-b}/0,1: no start key"}, + "end at start": {MVCCRangeKey{StartKey: a, EndKey: a, Timestamp: ts1}, `a{-}/0,1: start key "a" is at or after end key "a"`}, + "end before start": {MVCCRangeKey{StartKey: b, EndKey: a, Timestamp: ts1}, `{b-a}/0,1: start key "b" is at or after end key "a"`}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + err := tc.rangeKey.Validate() + if tc.expectErr == "" { + require.NoError(t, err) + } else { + require.Error(t, err) + require.Contains(t, err.Error(), tc.expectErr) + } + }) + } +} + +func pointKey(key string, ts int) MVCCKey { + return MVCCKey{Key: roachpb.Key(key), Timestamp: hlc.Timestamp{WallTime: int64(ts)}} +} + +func pointKV(key string, ts int, value string) MVCCKeyValue { + return MVCCKeyValue{ + Key: pointKey(key, ts), + Value: []byte(value), + } +} + +func rangeKey(start, end string, ts int) MVCCRangeKey { + return MVCCRangeKey{ + StartKey: roachpb.Key(start), + EndKey: roachpb.Key(end), + Timestamp: hlc.Timestamp{WallTime: int64(ts)}, + } +} diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index b13256022095..901afad311ae 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -589,6 +589,8 @@ func DefaultPebbleOptions() *pebble.Options { TablePropertyCollectors: PebbleTablePropertyCollectors, BlockPropertyCollectors: PebbleBlockPropertyCollectors, } + // Used for experimental MVCC range tombstones. + opts.Experimental.RangeKeys = new(pebble.RangeKeysArena) // Automatically flush 10s after the first range tombstone is added to a // memtable. This ensures that we can reclaim space even when there's no // activity on the database generating flushes. 
@@ -1111,7 +1113,7 @@ func (p *Pebble) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions) MVCCIt return iter } - iter := newPebbleIterator(p.db, nil, opts, StandardDurability) + iter := newPebbleIterator(p.db, nil, opts, StandardDurability, p.SupportsRangeKeys()) if iter == nil { panic("couldn't create a new iterator") } @@ -1123,7 +1125,7 @@ func (p *Pebble) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions) MVCCIt // NewEngineIterator implements the Engine interface. func (p *Pebble) NewEngineIterator(opts IterOptions) EngineIterator { - iter := newPebbleIterator(p.db, nil, opts, StandardDurability) + iter := newPebbleIterator(p.db, nil, opts, StandardDurability, p.SupportsRangeKeys()) if iter == nil { panic("couldn't create a new iterator") } @@ -1135,6 +1137,11 @@ func (p *Pebble) ConsistentIterators() bool { return false } +// SupportsRangeKeys implements the Engine interface. +func (p *Pebble) SupportsRangeKeys() bool { + return p.db.FormatMajorVersion() >= pebble.FormatRangeKeys +} + // PinEngineStateForIterators implements the Engine interface. func (p *Pebble) PinEngineStateForIterators() error { return errors.AssertionFailedf( @@ -1236,6 +1243,52 @@ func (p *Pebble) ClearIterRange(start, end roachpb.Key) error { return batch.Commit(true) } +// ExperimentalClearMVCCRangeKey implements the Engine interface. +func (p *Pebble) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if !p.SupportsRangeKeys() { + // These databases cannot contain range keys, so clearing is a noop. + return nil + } + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeyUnset( + EncodeMVCCKeyPrefix(rangeKey.StartKey), + EncodeMVCCKeyPrefix(rangeKey.EndKey), + EncodeMVCCTimestampSuffix(rangeKey.Timestamp), + pebble.Sync) +} + +// ExperimentalClearAllMVCCRangeKeys implements the Engine interface. 
+func (p *Pebble) ExperimentalClearAllMVCCRangeKeys(start, end roachpb.Key) error { + if !p.SupportsRangeKeys() { + return nil // noop + } + rangeKey := MVCCRangeKey{StartKey: start, EndKey: end, Timestamp: hlc.MinTimestamp} + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeyDelete( + EncodeMVCCKeyPrefix(start), EncodeMVCCKeyPrefix(end), pebble.Sync) +} + +// ExperimentalPutMVCCRangeKey implements the Engine interface. +func (p *Pebble) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey) error { + if !p.SupportsRangeKeys() { + return errors.Errorf("range keys not supported by Pebble database version %s", + p.db.FormatMajorVersion()) + } + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeySet( + EncodeMVCCKeyPrefix(rangeKey.StartKey), + EncodeMVCCKeyPrefix(rangeKey.EndKey), + EncodeMVCCTimestampSuffix(rangeKey.Timestamp), + nil, + pebble.Sync) +} + // Merge implements the Engine interface. func (p *Pebble) Merge(key MVCCKey, value []byte) error { if len(key.Key) == 0 { @@ -1543,7 +1596,7 @@ func (p *Pebble) NewUnindexedBatch(writeOnly bool) Batch { func (p *Pebble) NewSnapshot() Reader { return &pebbleSnapshot{ snapshot: p.db.NewSnapshot(), - settings: p.settings, + parent: p, } } @@ -1925,13 +1978,13 @@ func (p *pebbleReadOnly) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions iter = &p.prefixIter } if iter.inuse { - return newPebbleIterator(p.parent.db, p.iter, opts, p.durability) + return newPebbleIterator(p.parent.db, p.iter, opts, p.durability, p.SupportsRangeKeys()) } if iter.iter != nil { iter.setOptions(opts, p.durability) } else { - iter.init(p.parent.db, p.iter, p.iterUnused, opts, p.durability) + iter.init(p.parent.db, p.iter, p.iterUnused, opts, p.durability, p.SupportsRangeKeys()) if p.iter == nil { // For future cloning. 
p.iter = iter.iter @@ -1959,13 +2012,13 @@ func (p *pebbleReadOnly) NewEngineIterator(opts IterOptions) EngineIterator { iter = &p.prefixEngineIter } if iter.inuse { - return newPebbleIterator(p.parent.db, p.iter, opts, p.durability) + return newPebbleIterator(p.parent.db, p.iter, opts, p.durability, p.SupportsRangeKeys()) } if iter.iter != nil { iter.setOptions(opts, p.durability) } else { - iter.init(p.parent.db, p.iter, p.iterUnused, opts, p.durability) + iter.init(p.parent.db, p.iter, p.iterUnused, opts, p.durability, p.SupportsRangeKeys()) if p.iter == nil { // For future cloning. p.iter = iter.iter @@ -1983,6 +2036,11 @@ func (p *pebbleReadOnly) ConsistentIterators() bool { return true } +// SupportsRangeKeys implements the Engine interface. +func (p *pebbleReadOnly) SupportsRangeKeys() bool { + return p.parent.SupportsRangeKeys() +} + // PinEngineStateForIterators implements the Engine interface. func (p *pebbleReadOnly) PinEngineStateForIterators() error { if p.iter == nil { @@ -2045,6 +2103,18 @@ func (p *pebbleReadOnly) ClearIterRange(start, end roachpb.Key) error { panic("not implemented") } +func (p *pebbleReadOnly) ExperimentalPutMVCCRangeKey(_ MVCCRangeKey) error { + panic("not implemented") +} + +func (p *pebbleReadOnly) ExperimentalClearMVCCRangeKey(_ MVCCRangeKey) error { + panic("not implemented") +} + +func (p *pebbleReadOnly) ExperimentalClearAllMVCCRangeKeys(_, _ roachpb.Key) error { + panic("not implemented") +} + func (p *pebbleReadOnly) Merge(key MVCCKey, value []byte) error { panic("not implemented") } @@ -2078,7 +2148,7 @@ func (p *pebbleReadOnly) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalO // pebbleSnapshot represents a snapshot created using Pebble.NewSnapshot(). 
type pebbleSnapshot struct { snapshot *pebble.Snapshot - settings *cluster.Settings + parent *Pebble closed bool } @@ -2101,7 +2171,7 @@ func (p *pebbleSnapshot) ExportMVCCToSst( ) (roachpb.BulkOpSummary, roachpb.Key, hlc.Timestamp, error) { r := wrapReader(p) // Doing defer r.Free() does not inline. - summary, k, err := pebbleExportToSst(ctx, p.settings, r, exportOptions, dest) + summary, k, err := pebbleExportToSst(ctx, p.parent.settings, r, exportOptions, dest) r.Free() return summary, k.Key, k.Timestamp, err } @@ -2165,7 +2235,9 @@ func (p *pebbleSnapshot) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions } return iter } - iter := MVCCIterator(newPebbleIterator(p.snapshot, nil, opts, StandardDurability)) + + iter := MVCCIterator(newPebbleIterator( + p.snapshot, nil, opts, StandardDurability, p.SupportsRangeKeys())) if util.RaceEnabled { iter = wrapInUnsafeIter(iter) } @@ -2174,7 +2246,8 @@ func (p *pebbleSnapshot) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions // NewEngineIterator implements the Reader interface. func (p pebbleSnapshot) NewEngineIterator(opts IterOptions) EngineIterator { - return newPebbleIterator(p.snapshot, nil, opts, StandardDurability) + return newPebbleIterator( + p.snapshot, nil, opts, StandardDurability, p.SupportsRangeKeys()) } // ConsistentIterators implements the Reader interface. @@ -2182,6 +2255,11 @@ func (p pebbleSnapshot) ConsistentIterators() bool { return true } +// SupportsRangeKeys implements the Reader interface. +func (p *pebbleSnapshot) SupportsRangeKeys() bool { + return p.parent.SupportsRangeKeys() +} + // PinEngineStateForIterators implements the Reader interface. func (p *pebbleSnapshot) PinEngineStateForIterators() error { // Snapshot already pins state, so nothing to do. 
diff --git a/pkg/storage/pebble_batch.go b/pkg/storage/pebble_batch.go index 6bc4043fd66f..9dfdde3b210b 100644 --- a/pkg/storage/pebble_batch.go +++ b/pkg/storage/pebble_batch.go @@ -220,13 +220,13 @@ func (p *pebbleBatch) NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions) M handle = p.db } if iter.inuse { - return newPebbleIterator(handle, p.iter, opts, StandardDurability) + return newPebbleIterator(handle, p.iter, opts, StandardDurability, p.SupportsRangeKeys()) } if iter.iter != nil { iter.setOptions(opts, StandardDurability) } else { - iter.init(handle, p.iter, p.iterUnused, opts, StandardDurability) + iter.init(handle, p.iter, p.iterUnused, opts, StandardDurability, p.SupportsRangeKeys()) if p.iter == nil { // For future cloning. p.iter = iter.iter @@ -257,13 +257,13 @@ func (p *pebbleBatch) NewEngineIterator(opts IterOptions) EngineIterator { handle = p.db } if iter.inuse { - return newPebbleIterator(handle, p.iter, opts, StandardDurability) + return newPebbleIterator(handle, p.iter, opts, StandardDurability, p.SupportsRangeKeys()) } if iter.iter != nil { iter.setOptions(opts, StandardDurability) } else { - iter.init(handle, p.iter, p.iterUnused, opts, StandardDurability) + iter.init(handle, p.iter, p.iterUnused, opts, StandardDurability, p.SupportsRangeKeys()) if p.iter == nil { // For future cloning. p.iter = iter.iter @@ -280,6 +280,11 @@ func (p *pebbleBatch) ConsistentIterators() bool { return true } +// SupportsRangeKeys implements the Batch interface. +func (p *pebbleBatch) SupportsRangeKeys() bool { + return p.db.FormatMajorVersion() >= pebble.FormatRangeKeys +} + // PinEngineStateForIterators implements the Batch interface. func (p *pebbleBatch) PinEngineStateForIterators() error { if p.iter == nil { @@ -403,6 +408,51 @@ func (p *pebbleBatch) ClearIterRange(start, end roachpb.Key) error { return nil } +// ExperimentalClearMVCCRangeKey implements the Engine interface. 
+func (p *pebbleBatch) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if !p.SupportsRangeKeys() { + return nil // noop + } + if err := rangeKey.Validate(); err != nil { + return err + } + return p.batch.Experimental().RangeKeyUnset( + EncodeMVCCKeyPrefix(rangeKey.StartKey), + EncodeMVCCKeyPrefix(rangeKey.EndKey), + EncodeMVCCTimestampSuffix(rangeKey.Timestamp), + nil) +} + +// ExperimentalClearAllMVCCRangeKeys implements the Engine interface. +func (p *pebbleBatch) ExperimentalClearAllMVCCRangeKeys(start, end roachpb.Key) error { + if !p.SupportsRangeKeys() { + return nil // noop + } + rangeKey := MVCCRangeKey{StartKey: start, EndKey: end, Timestamp: hlc.MinTimestamp} + if err := rangeKey.Validate(); err != nil { + return err + } + return p.batch.Experimental().RangeKeyDelete( + EncodeMVCCKeyPrefix(start), EncodeMVCCKeyPrefix(end), nil) +} + +// ExperimentalPutMVCCRangeKey implements the Batch interface. +func (p *pebbleBatch) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey) error { + if !p.SupportsRangeKeys() { + return errors.Errorf("range keys not supported by Pebble database version %s", + p.db.FormatMajorVersion()) + } + if err := rangeKey.Validate(); err != nil { + return err + } + return p.batch.Experimental().RangeKeySet( + EncodeMVCCKeyPrefix(rangeKey.StartKey), + EncodeMVCCKeyPrefix(rangeKey.EndKey), + EncodeMVCCTimestampSuffix(rangeKey.Timestamp), + nil, + nil) +} + // Merge implements the Batch interface. func (p *pebbleBatch) Merge(key MVCCKey, value []byte) error { if len(key.Key) == 0 { diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index b7d0da09a496..4a1e21ab5d09 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -43,6 +43,10 @@ type pebbleIterator struct { upperBoundBuf [2][]byte curBuf int + // True if the iterator's underlying reader supports range keys. + // + // TODO(erikgrinaker): Remove after 22.2. 
+ supportsRangeKeys bool // Set to true to govern whether to call SeekPrefixGE or SeekGE. Skips // SSTables based on MVCC/Engine key when true. prefix bool @@ -84,10 +88,11 @@ func newPebbleIterator( iterToClone cloneableIter, opts IterOptions, durability DurabilityRequirement, + supportsRangeKeys bool, ) *pebbleIterator { iter := pebbleIterPool.Get().(*pebbleIterator) iter.reusable = false // defensive - iter.init(handle, iterToClone, false /* iterUnused */, opts, durability) + iter.init(handle, iterToClone, false /* iterUnused */, opts, durability, supportsRangeKeys) return iter } @@ -102,12 +107,14 @@ func (p *pebbleIterator) init( iterUnused bool, opts IterOptions, durability DurabilityRequirement, + supportsRangeKeys bool, // TODO(erikgrinaker): remove after 22.2. ) { *p = pebbleIterator{ - keyBuf: p.keyBuf, - lowerBoundBuf: p.lowerBoundBuf, - upperBoundBuf: p.upperBoundBuf, - reusable: p.reusable, + keyBuf: p.keyBuf, + lowerBoundBuf: p.lowerBoundBuf, + upperBoundBuf: p.upperBoundBuf, + reusable: p.reusable, + supportsRangeKeys: supportsRangeKeys, } if iterToClone != nil { @@ -142,12 +149,25 @@ func (p *pebbleIterator) setOptions(opts IterOptions, durability DurabilityRequi panic("min timestamp hint set without max timestamp hint") } + // If this Pebble database does not support range keys yet, fall back to + // only iterating over point keys to avoid panics. This is effectively the + // same, since a database without range key support contains no range keys, + // except in the case of RangesOnly where the iterator must always be empty. + if !p.supportsRangeKeys { + if opts.KeyTypes == IterKeyTypeRangesOnly { + opts.LowerBound = nil + opts.UpperBound = []byte{0} + } + opts.KeyTypes = IterKeyTypePointsOnly + } + // Generate new Pebble iterator options. // // NB: Make sure new options are accounted for in the optsChanged check below. // Otherwise, the option may not take effect. 
newOptions := pebble.IterOptions{ OnlyReadGuaranteedDurable: durability == GuaranteedDurability, + KeyTypes: opts.KeyTypes, } newBuf := 1 - p.curBuf @@ -213,6 +233,7 @@ func (p *pebbleIterator) setOptions(opts IterOptions, durability DurabilityRequi // won't match the zero value of a new iterator. optsChanged := opts.Prefix != p.prefix || newOptions.OnlyReadGuaranteedDurable != p.options.OnlyReadGuaranteedDurable || + newOptions.KeyTypes != p.options.KeyTypes || !bytes.Equal(newOptions.UpperBound, p.options.UpperBound) || !bytes.Equal(newOptions.LowerBound, p.options.LowerBound) || // We can't compare these filters, so if any existing or new filters are set @@ -397,14 +418,31 @@ func (p *pebbleIterator) NextKey() { if valid, err := p.Valid(); err != nil || !valid { return } + wasPoint, _ := p.HasPointAndRange() p.keyBuf = append(p.keyBuf[:0], p.UnsafeKey().Key...) if !p.iter.Next() { return } - if bytes.Equal(p.keyBuf, p.UnsafeKey().Key) { + isPoint, _ := p.HasPointAndRange() + + // NB: a range key and point key both starting at a given key are considered + // separate keys during iteration, so calling NextKey() at the range key + // should land on the point key. + if wasPoint && isPoint && bytes.Equal(p.keyBuf, p.UnsafeKey().Key) { // This is equivalent to: // p.iter.SeekGE(EncodeKey(MVCCKey{p.UnsafeKey().Key.Next(), hlc.Timestamp{}})) p.iter.SeekGE(append(p.keyBuf, 0, 0)) + // If there's a range key straddling the seek point (e.g. a-c when seeking + // to b), it will be surfaced first. In that case, we skip past it to the + // next key, which may be either a point or range key but one starting past + // the seek key. 
+ if isPoint, _ = p.HasPointAndRange(); !isPoint { + if rangeStart, _ := p.RangeBounds(); rangeStart.Compare(p.keyBuf) <= 0 { + if !p.iter.Next() { + return + } + } + } } } @@ -562,6 +600,63 @@ func (p *pebbleIterator) ValueProto(msg protoutil.Message) error { return protoutil.Unmarshal(value, msg) } +// HasPointAndRange implements the MVCCIterator interface. +func (p *pebbleIterator) HasPointAndRange() (bool, bool) { + // TODO(erikgrinaker): The MVCCIterator contract mandates returning false for + // an invalid iterator. We should improve pebbleIterator validity and error + // checking by doing it once per iterator operation and propagating errors. + if ok, err := p.Valid(); !ok || err != nil { + return false, false + } + return p.iter.HasPointAndRange() +} + +// RangeBounds implements the MVCCIterator interface. +func (p *pebbleIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + start, end := p.iter.RangeBounds() + + // Avoid decoding empty keys: DecodeMVCCKey() will return errors for these, + // which are expensive to construct. + if len(start) == 0 && len(end) == 0 { + return nil, nil + } + + // TODO(erikgrinaker): We should surface this error somehow, but for now we + // follow UnsafeKey()'s example and silently return empty bounds. + startKey, err := DecodeMVCCKey(start) + if err != nil { + return nil, nil + } + endKey, err := DecodeMVCCKey(end) + if err != nil { + return nil, nil + } + + return startKey.Key, endKey.Key +} + +// RangeKeys implements the MVCCIterator interface. +func (p *pebbleIterator) RangeKeys() []MVCCRangeKey { + startKey, endKey := p.RangeBounds() + rangeKeys := p.iter.RangeKeys() + rangeValues := make([]MVCCRangeKey, 0, len(rangeKeys)) + + for _, rangeKey := range rangeKeys { + timestamp, err := decodeMVCCTimestampSuffix(rangeKey.Suffix) + if err != nil { + // TODO(erikgrinaker): We should surface this error somehow, but for now + // we follow UnsafeKey()'s example and silently skip them. 
+ continue + } + rangeValues = append(rangeValues, MVCCRangeKey{ + StartKey: startKey, + EndKey: endKey, + Timestamp: timestamp, + }) + } + return rangeValues +} + // ComputeStats implements the MVCCIterator interface. func (p *pebbleIterator) ComputeStats( start, end roachpb.Key, nowNanos int64, diff --git a/pkg/storage/sst_iterator.go b/pkg/storage/sst_iterator.go index 9bd8b49b1b39..7a2c510e11e3 100644 --- a/pkg/storage/sst_iterator.go +++ b/pkg/storage/sst_iterator.go @@ -158,3 +158,18 @@ func (r *sstIterator) UnsafeKey() MVCCKey { func (r *sstIterator) UnsafeValue() []byte { return r.value } + +// HasPointAndRange implements SimpleMVCCIterator. +func (r *sstIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (r *sstIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (r *sstIterator) RangeKeys() []MVCCRangeKey { + panic("not implemented") +} diff --git a/pkg/storage/sst_writer.go b/pkg/storage/sst_writer.go index 38067b313dad..0938af17484a 100644 --- a/pkg/storage/sst_writer.go +++ b/pkg/storage/sst_writer.go @@ -142,6 +142,21 @@ func (fw *SSTWriter) ClearMVCCRange(start, end MVCCKey) error { return fw.clearRange(start, end) } +// ExperimentalPutMVCCRangeKey implements the Writer interface. +func (fw *SSTWriter) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey) error { + panic("not implemented") +} + +// ExperimentalClearMVCCRangeKey implements the Writer interface. +func (fw *SSTWriter) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + panic("not implemented") +} + +// ExperimentalClearAllMVCCRangeKeys implements the Writer interface. 
+func (fw *SSTWriter) ExperimentalClearAllMVCCRangeKeys(start, end roachpb.Key) error { + panic("not implemented") +} + func (fw *SSTWriter) clearRange(start, end MVCCKey) error { if fw.fw == nil { return errors.New("cannot call ClearRange on a closed writer") diff --git a/pkg/storage/testdata/mvcc_histories/range_key_iter b/pkg/storage/testdata/mvcc_histories/range_key_iter new file mode 100644 index 000000000000..e26ebe983d10 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/range_key_iter @@ -0,0 +1,701 @@ +# Tests range key handling in MVCC iterators. +# +# Sets up following dataset, where x is tombstone, o-o is range tombstone, [] is intent. +# +# T +# 7 [a7] [d7] [j7] [l7] +# 6 f6 +# 5 o---------------o k5 +# 4 x x d4 g4 x +# 3 o-------o e3 o-------oh3 +# 2 a2 g2 +# 1 o---------------------------------------o o---o +# a b c d e f g h i j k l m +# +run ok +put_rangekey k=a end=k ts=1 +put_rangekey k=l end=m ts=1 +put_rangekey k=b end=d ts=3 +put_rangekey k=f end=h ts=3 +put_rangekey k=c end=g ts=5 +put k=a ts=2 v=a2 +del k=a ts=4 +del k=b ts=4 +put k=d ts=4 v=d4 +put k=e ts=3 v=e3 +put k=f ts=6 v=f6 +put k=g ts=2 v=g2 +put k=g ts=4 v=g4 +put k=h ts=3 v=h3 +del k=h ts=4 +put k=k ts=5 v=k5 +with t=A + txn_begin ts=7 + put k=a v=a7 + put k=d v=d7 + put k=j v=j7 + put k=l v=l7 +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=7.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[1.000000000,0] +rangekey: {b-c}/[3.000000000,0 1.000000000,0] +rangekey: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +rangekey: {d-f}/[5.000000000,0 1.000000000,0] +rangekey: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +rangekey: {g-h}/[3.000000000,0 1.000000000,0] +rangekey: {h-k}/[1.000000000,0] +rangekey: {l-m}/[1.000000000,0] +meta: "a"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= 
txnDidNotUpdateMeta=true +data: "a"/7.000000000,0 -> /BYTES/a7 +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/4.000000000,0 -> / +meta: "d"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "d"/7.000000000,0 -> /BYTES/d7 +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "e"/3.000000000,0 -> /BYTES/e3 +data: "f"/6.000000000,0 -> /BYTES/f6 +data: "g"/4.000000000,0 -> /BYTES/g4 +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> / +data: "h"/3.000000000,0 -> /BYTES/h3 +meta: "j"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "j"/7.000000000,0 -> /BYTES/j7 +data: "k"/5.000000000,0 -> /BYTES/k5 +meta: "l"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "l"/7.000000000,0 -> /BYTES/l7 + +# Iterate across the entire span for all key types, and without intents. 
+run ok +iter_new types=pointsOnly +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: "a"/7.000000000,0=/BYTES/a7 +iter_scan: "a"/4.000000000,0=/ +iter_scan: "a"/2.000000000,0=/BYTES/a2 +iter_scan: "b"/4.000000000,0=/ +iter_scan: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: "d"/7.000000000,0=/BYTES/d7 +iter_scan: "d"/4.000000000,0=/BYTES/d4 +iter_scan: "e"/3.000000000,0=/BYTES/e3 +iter_scan: "f"/6.000000000,0=/BYTES/f6 +iter_scan: "g"/4.000000000,0=/BYTES/g4 +iter_scan: "g"/2.000000000,0=/BYTES/g2 +iter_scan: "h"/4.000000000,0=/ +iter_scan: "h"/3.000000000,0=/BYTES/h3 +iter_scan: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: "j"/7.000000000,0=/BYTES/j7 +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: "l"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: "l"/7.000000000,0=/BYTES/l7 +iter_scan: . 
+ +run ok +iter_new types=pointsAndRanges +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {a-b}/[1.000000000,0] +iter_scan: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {a-b}/[1.000000000,0] +iter_scan: "a"/7.000000000,0=/BYTES/a7 {a-b}/[1.000000000,0] +iter_scan: "a"/4.000000000,0=/ {a-b}/[1.000000000,0] +iter_scan: "a"/2.000000000,0=/BYTES/a2 {a-b}/[1.000000000,0] +iter_scan: {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: "b"/4.000000000,0=/ {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "f"/6.000000000,0=/BYTES/f6 {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: "g"/4.000000000,0=/BYTES/g4 {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: "g"/2.000000000,0=/BYTES/g2 {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: {h-k}/[1.000000000,0] +iter_scan: "h"/4.000000000,0=/ {h-k}/[1.000000000,0] +iter_scan: "h"/3.000000000,0=/BYTES/h3 {h-k}/[1.000000000,0] +iter_scan: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {h-k}/[1.000000000,0] +iter_scan: 
"j"/7.000000000,0=/BYTES/j7 {h-k}/[1.000000000,0] +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: "l"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {l-m}/[1.000000000,0] +iter_scan: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_scan: . + +run ok +iter_new types=rangesOnly +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: {a-b}/[1.000000000,0] +iter_scan: {a-b}/[1.000000000,0] +iter_scan: {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: {h-k}/[1.000000000,0] +iter_scan: {l-m}/[1.000000000,0] +iter_scan: . + +run ok +iter_new kind=keys types=pointsAndRanges +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: {a-b}/[1.000000000,0] +iter_scan: {a-b}/[1.000000000,0] +iter_scan: "a"/7.000000000,0=/BYTES/a7 {a-b}/[1.000000000,0] +iter_scan: "a"/4.000000000,0=/ {a-b}/[1.000000000,0] +iter_scan: "a"/2.000000000,0=/BYTES/a2 {a-b}/[1.000000000,0] +iter_scan: {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: "b"/4.000000000,0=/ {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "f"/6.000000000,0=/BYTES/f6 {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: "g"/4.000000000,0=/BYTES/g4 {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: "g"/2.000000000,0=/BYTES/g2 {g-h}/[3.000000000,0 1.000000000,0] 
+iter_scan: {h-k}/[1.000000000,0] +iter_scan: "h"/4.000000000,0=/ {h-k}/[1.000000000,0] +iter_scan: "h"/3.000000000,0=/BYTES/h3 {h-k}/[1.000000000,0] +iter_scan: "j"/7.000000000,0=/BYTES/j7 {h-k}/[1.000000000,0] +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: {l-m}/[1.000000000,0] +iter_scan: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_scan: . + +# And do the same in reverse. +run ok +iter_new types=pointsOnly +iter_seek_lt k=z +iter_scan reverse +---- +iter_seek_lt: "l"/7.000000000,0=/BYTES/l7 +iter_scan: "l"/7.000000000,0=/BYTES/l7 +iter_scan: "l"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: "j"/7.000000000,0=/BYTES/j7 +iter_scan: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: "h"/3.000000000,0=/BYTES/h3 +iter_scan: "h"/4.000000000,0=/ +iter_scan: "g"/2.000000000,0=/BYTES/g2 +iter_scan: "g"/4.000000000,0=/BYTES/g4 +iter_scan: "f"/6.000000000,0=/BYTES/f6 +iter_scan: "e"/3.000000000,0=/BYTES/e3 +iter_scan: "d"/4.000000000,0=/BYTES/d4 +iter_scan: "d"/7.000000000,0=/BYTES/d7 +iter_scan: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: "b"/4.000000000,0=/ +iter_scan: "a"/2.000000000,0=/BYTES/a2 +iter_scan: "a"/4.000000000,0=/ +iter_scan: "a"/7.000000000,0=/BYTES/a7 +iter_scan: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_scan: . 
+ +run ok +iter_new types=pointsAndRanges +iter_seek_lt k=z +iter_scan reverse +---- +iter_seek_lt: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_scan: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_scan: "l"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {l-m}/[1.000000000,0] +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: "j"/7.000000000,0=/BYTES/j7 {h-k}/[1.000000000,0] +iter_scan: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {h-k}/[1.000000000,0] +iter_scan: "h"/3.000000000,0=/BYTES/h3 {h-k}/[1.000000000,0] +iter_scan: "h"/4.000000000,0=/ {h-k}/[1.000000000,0] +iter_scan: {h-k}/[1.000000000,0] +iter_scan: "g"/2.000000000,0=/BYTES/g2 {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: "g"/4.000000000,0=/BYTES/g4 {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: "f"/6.000000000,0=/BYTES/f6 {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "b"/4.000000000,0=/ {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: "a"/2.000000000,0=/BYTES/a2 {a-b}/[1.000000000,0] +iter_scan: "a"/4.000000000,0=/ {a-b}/[1.000000000,0] +iter_scan: "a"/7.000000000,0=/BYTES/a7 
{a-b}/[1.000000000,0] +iter_scan: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {a-b}/[1.000000000,0] +iter_scan: . + +run ok +iter_new types=rangesOnly +iter_seek_lt k=z +iter_scan reverse +---- +iter_seek_lt: {l-m}/[1.000000000,0] +iter_scan: {l-m}/[1.000000000,0] +iter_scan: {h-k}/[1.000000000,0] +iter_scan: {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: {a-b}/[1.000000000,0] +iter_scan: . + +run ok +iter_new kind=keys types=pointsAndRanges +iter_seek_lt k=z +iter_scan reverse +---- +iter_seek_lt: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_scan: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_scan: {l-m}/[1.000000000,0] +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: "j"/7.000000000,0=/BYTES/j7 {h-k}/[1.000000000,0] +iter_scan: "h"/3.000000000,0=/BYTES/h3 {h-k}/[1.000000000,0] +iter_scan: "h"/4.000000000,0=/ {h-k}/[1.000000000,0] +iter_scan: {h-k}/[1.000000000,0] +iter_scan: "g"/2.000000000,0=/BYTES/g2 {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: "g"/4.000000000,0=/BYTES/g4 {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: {g-h}/[3.000000000,0 1.000000000,0] +iter_scan: "f"/6.000000000,0=/BYTES/f6 {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "b"/4.000000000,0=/ {b-c}/[3.000000000,0 1.000000000,0] 
+iter_scan: {b-c}/[3.000000000,0 1.000000000,0] +iter_scan: "a"/2.000000000,0=/BYTES/a2 {a-b}/[1.000000000,0] +iter_scan: "a"/4.000000000,0=/ {a-b}/[1.000000000,0] +iter_scan: "a"/7.000000000,0=/BYTES/a7 {a-b}/[1.000000000,0] +iter_scan: {a-b}/[1.000000000,0] +iter_scan: . + +# Bounded scans. +run ok +iter_new types=pointsAndRanges k=bbb end=fff +iter_seek_ge k=a +iter_scan +iter_seek_lt k=z +iter_scan reverse +---- +iter_seek_ge: {bbb-c}/[3.000000000,0 1.000000000,0] +iter_scan: {bbb-c}/[3.000000000,0 1.000000000,0] +iter_scan: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: f{-ff}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "f"/6.000000000,0=/BYTES/f6 f{-ff}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: . 
+iter_seek_lt: "f"/6.000000000,0=/BYTES/f6 f{-ff}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "f"/6.000000000,0=/BYTES/f6 f{-ff}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: f{-ff}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_scan: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_scan: {bbb-c}/[3.000000000,0 1.000000000,0] +iter_scan: . + +# Seek to d, iterate a few times, then reverse direction and iterate beyond seek point. +run ok +iter_new types=pointsAndRanges +iter_seek_ge k=d +iter_next +iter_next +iter_next +iter_next +iter_prev +iter_prev +iter_prev +iter_prev +iter_prev +iter_prev +---- +iter_seek_ge: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_next: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_prev: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true 
{d-f}/[5.000000000,0 1.000000000,0] +iter_prev: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_prev: "b"/4.000000000,0=/ {b-c}/[3.000000000,0 1.000000000,0] + +# Do a few seeks around an intent/point/range. +run ok +iter_new types=pointsAndRanges +iter_seek_ge k=d +iter_next +iter_seek_ge k=d ts=8 +iter_next +iter_seek_ge k=d ts=7 +iter_seek_ge k=d ts=5 +iter_next +iter_seek_ge k=d ts=4 +---- +iter_seek_ge: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_ge: {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_ge: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_ge: {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_ge: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] + +run ok +iter_new types=pointsAndRanges +iter_seek_lt k=e +iter_seek_lt k=d ts=4 +iter_seek_lt k=d ts=7 +iter_prev +iter_seek_lt k=d +---- +iter_seek_lt: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_lt: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_lt: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_seek_lt: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] + +# Do the same, but switch direction immediately. 
+run ok +iter_new types=pointsAndRanges +iter_seek_ge k=d +iter_prev +iter_seek_ge k=d ts=8 +iter_prev +iter_prev +iter_seek_ge k=d ts=7 +iter_seek_ge k=d ts=5 +iter_prev +iter_seek_ge k=d ts=4 +---- +iter_seek_ge: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_seek_ge: {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_seek_ge: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_ge: {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_ge: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] + +run ok +iter_new types=pointsAndRanges +iter_seek_lt k=e +iter_next +iter_seek_lt k=d ts=4 +iter_next +iter_seek_lt k=d ts=7 +iter_next +iter_seek_lt k=d +iter_next +---- +iter_seek_lt: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_lt: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/4.000000000,0=/BYTES/d4 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_lt: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_lt: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 
ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] + +# Check that switching direction past an intent will yield the right range key. +# We also reverse seek to the intent, which must surface the correct range key. +run ok +iter_new types=pointsAndRanges +iter_seek_ge k=d +iter_prev +iter_next +iter_next +iter_prev +iter_prev +---- +iter_seek_ge: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] + +run ok +iter_new types=pointsAndRanges +iter_seek_lt k=d ts=7 +iter_prev +iter_next +iter_seek_lt k=d ts=7 +iter_next +---- +iter_seek_lt: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_lt: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 
del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] + +# Seeking to keys with an intent will hit the intent immediately, both when +# it's at the start of a range key and in the middle of one. +run ok +iter_new types=pointsAndRanges +iter_seek_ge k=j +iter_next +iter_prev +iter_prev +iter_prev +iter_seek_ge k=d +iter_next +iter_prev +iter_prev +---- +iter_seek_ge: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {h-k}/[1.000000000,0] +iter_next: "j"/7.000000000,0=/BYTES/j7 {h-k}/[1.000000000,0] +iter_prev: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {h-k}/[1.000000000,0] +iter_prev: "h"/3.000000000,0=/BYTES/h3 {h-k}/[1.000000000,0] +iter_prev: "h"/4.000000000,0=/ {h-k}/[1.000000000,0] +iter_seek_ge: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_next: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_prev: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] + +# Exhaust iterators and then switch directions. 
+run ok +iter_new types=pointsAndRanges +iter_seek_ge k=a ts=4 +iter_prev +iter_prev +iter_prev +iter_next +iter_next +---- +iter_seek_ge: "a"/4.000000000,0=/ {a-b}/[1.000000000,0] +iter_prev: "a"/7.000000000,0=/BYTES/a7 {a-b}/[1.000000000,0] +iter_prev: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {a-b}/[1.000000000,0] +iter_prev: . +iter_next: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {a-b}/[1.000000000,0] +iter_next: "a"/7.000000000,0=/BYTES/a7 {a-b}/[1.000000000,0] + +run ok +iter_new types=pointsAndRanges +iter_seek_ge k=l +iter_next +iter_next +iter_prev +iter_prev +iter_prev +---- +iter_seek_ge: "l"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {l-m}/[1.000000000,0] +iter_next: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_next: . +iter_prev: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_prev: "l"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {l-m}/[1.000000000,0] +iter_prev: "k"/5.000000000,0=/BYTES/k5 + +# Test NextKey() without and with intents/range keys, and with some seeks. 
+run ok +iter_new kind=keys types=pointsAndRanges +iter_seek_ge k=a +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +---- +iter_seek_ge: {a-b}/[1.000000000,0] +iter_next_key: "a"/7.000000000,0=/BYTES/a7 {a-b}/[1.000000000,0] +iter_next_key: {b-c}/[3.000000000,0 1.000000000,0] +iter_next_key: "b"/4.000000000,0=/ {b-c}/[3.000000000,0 1.000000000,0] +iter_next_key: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next_key: {d-f}/[5.000000000,0 1.000000000,0] +iter_next_key: "d"/7.000000000,0=/BYTES/d7 {d-f}/[5.000000000,0 1.000000000,0] +iter_next_key: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_next_key: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next_key: "f"/6.000000000,0=/BYTES/f6 {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next_key: {g-h}/[3.000000000,0 1.000000000,0] +iter_next_key: "g"/4.000000000,0=/BYTES/g4 {g-h}/[3.000000000,0 1.000000000,0] +iter_next_key: {h-k}/[1.000000000,0] +iter_next_key: "h"/4.000000000,0=/ {h-k}/[1.000000000,0] +iter_next_key: "j"/7.000000000,0=/BYTES/j7 {h-k}/[1.000000000,0] +iter_next_key: "k"/5.000000000,0=/BYTES/k5 +iter_next_key: {l-m}/[1.000000000,0] +iter_next_key: "l"/7.000000000,0=/BYTES/l7 {l-m}/[1.000000000,0] +iter_next_key: . 
+ +run ok +iter_new types=pointsAndRanges +iter_seek_ge k=a +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +---- +iter_seek_ge: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {a-b}/[1.000000000,0] +iter_next_key: {b-c}/[3.000000000,0 1.000000000,0] +iter_next_key: "b"/4.000000000,0=/ {b-c}/[3.000000000,0 1.000000000,0] +iter_next_key: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next_key: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_next_key: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] +iter_next_key: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next_key: "f"/6.000000000,0=/BYTES/f6 {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next_key: {g-h}/[3.000000000,0 1.000000000,0] +iter_next_key: "g"/4.000000000,0=/BYTES/g4 {g-h}/[3.000000000,0 1.000000000,0] +iter_next_key: {h-k}/[1.000000000,0] +iter_next_key: "h"/4.000000000,0=/ {h-k}/[1.000000000,0] +iter_next_key: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {h-k}/[1.000000000,0] +iter_next_key: "k"/5.000000000,0=/BYTES/k5 +iter_next_key: "l"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {l-m}/[1.000000000,0] +iter_next_key: . 
+ +run ok +iter_new types=pointsOnly +iter_seek_ge k=a +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +---- +iter_seek_ge: "a"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_next_key: "b"/4.000000000,0=/ +iter_next_key: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_next_key: "e"/3.000000000,0=/BYTES/e3 +iter_next_key: "f"/6.000000000,0=/BYTES/f6 +iter_next_key: "g"/4.000000000,0=/BYTES/g4 +iter_next_key: "h"/4.000000000,0=/ +iter_next_key: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_next_key: "k"/5.000000000,0=/BYTES/k5 +iter_next_key: "l"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_next_key: . + +run ok +iter_new types=rangesOnly +iter_seek_ge k=a +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +iter_next_key +---- +iter_seek_ge: {a-b}/[1.000000000,0] +iter_next_key: {b-c}/[3.000000000,0 1.000000000,0] +iter_next_key: {c-d}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next_key: {d-f}/[5.000000000,0 1.000000000,0] +iter_next_key: {f-g}/[5.000000000,0 3.000000000,0 1.000000000,0] +iter_next_key: {g-h}/[3.000000000,0 1.000000000,0] +iter_next_key: {h-k}/[1.000000000,0] +iter_next_key: {l-m}/[1.000000000,0] +iter_next_key: . + +# Test NextKey() during seeks. 
+run ok +iter_new types=pointsAndRanges +iter_seek_ge k=g ts=2 +iter_next_key +iter_seek_ge k=d +iter_next_key +---- +iter_seek_ge: "g"/2.000000000,0=/BYTES/g2 {g-h}/[3.000000000,0 1.000000000,0] +iter_next_key: {h-k}/[1.000000000,0] +iter_seek_ge: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_next_key: "e"/3.000000000,0=/BYTES/e3 {d-f}/[5.000000000,0 1.000000000,0] + +# Test SeekIntentGE both with and without intents and range keys. +run ok +iter_new types=pointsAndRanges +iter_seek_intent_ge k=b txn=A +iter_seek_intent_ge k=d txn=A +iter_seek_intent_ge k=i txn=A +iter_seek_intent_ge k=j txn=A +iter_seek_intent_ge k=k txn=A +---- +iter_seek_intent_ge: {b-c}/[3.000000000,0 1.000000000,0] +iter_seek_intent_ge: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_intent_ge: {h-k}/[1.000000000,0] +iter_seek_intent_ge: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true {h-k}/[1.000000000,0] +iter_seek_intent_ge: "k"/5.000000000,0=/BYTES/k5 + +run ok +iter_new kind=keys types=pointsAndRanges +iter_seek_intent_ge k=b txn=A +iter_seek_intent_ge k=d txn=A +iter_seek_intent_ge k=i txn=A +iter_seek_intent_ge k=j txn=A +iter_seek_intent_ge k=k txn=A +---- +iter_seek_intent_ge: {b-c}/[3.000000000,0 1.000000000,0] +iter_seek_intent_ge: {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_intent_ge: {h-k}/[1.000000000,0] +iter_seek_intent_ge: {h-k}/[1.000000000,0] +iter_seek_intent_ge: "k"/5.000000000,0=/BYTES/k5 + +run ok +iter_new types=pointsOnly +iter_seek_intent_ge k=b txn=A +iter_seek_intent_ge k=d txn=A +iter_seek_intent_ge k=i txn=A +iter_seek_intent_ge k=j 
txn=A +iter_seek_intent_ge k=k txn=A +---- +iter_seek_intent_ge: "b"/4.000000000,0=/ +iter_seek_intent_ge: "d"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_seek_intent_ge: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_seek_intent_ge: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +iter_seek_intent_ge: "k"/5.000000000,0=/BYTES/k5 + +run ok +iter_new types=rangesOnly +iter_seek_intent_ge k=b txn=A +iter_seek_intent_ge k=d txn=A +iter_seek_intent_ge k=i txn=A +iter_seek_intent_ge k=j txn=A +iter_seek_intent_ge k=k txn=A +---- +iter_seek_intent_ge: {b-c}/[3.000000000,0 1.000000000,0] +iter_seek_intent_ge: {d-f}/[5.000000000,0 1.000000000,0] +iter_seek_intent_ge: {h-k}/[1.000000000,0] +iter_seek_intent_ge: {h-k}/[1.000000000,0] +iter_seek_intent_ge: {l-m}/[1.000000000,0] diff --git a/pkg/storage/testdata/mvcc_histories/range_key_put b/pkg/storage/testdata/mvcc_histories/range_key_put new file mode 100644 index 000000000000..ed9bee4e13d8 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/range_key_put @@ -0,0 +1,42 @@ +# Test basic MVCC range key mutations. + +run trace +# These three should merge. +put_rangekey k=c end=e ts=1 +put_rangekey k=a end=c ts=1 +put_rangekey k=e end=f ts=1 +# Write overlapping key causing fragmentation. +put_rangekey k=d end=k ts=2 +# Write key underneath that fragments in the middle. +put_rangekey k=g end=j ts=1 +# Merge keys below. +put_rangekey k=f end=g ts=1 +# Fill in the gaps to make one chunk. 
+put_rangekey k=a end=d ts=2 +put_rangekey k=j end=k ts=1 +---- +>> put_rangekey k=c end=e ts=1 +rangekey: {c-e}/[1.000000000,0] +>> put_rangekey k=a end=c ts=1 +rangekey: {a-e}/[1.000000000,0] +>> put_rangekey k=e end=f ts=1 +rangekey: {a-f}/[1.000000000,0] +>> put_rangekey k=d end=k ts=2 +rangekey: {a-d}/[1.000000000,0] +rangekey: {d-f}/[2.000000000,0 1.000000000,0] +rangekey: {f-k}/[2.000000000,0] +>> put_rangekey k=g end=j ts=1 +rangekey: {a-d}/[1.000000000,0] +rangekey: {d-f}/[2.000000000,0 1.000000000,0] +rangekey: {f-g}/[2.000000000,0] +rangekey: {g-j}/[2.000000000,0 1.000000000,0] +rangekey: {j-k}/[2.000000000,0] +>> put_rangekey k=f end=g ts=1 +rangekey: {a-d}/[1.000000000,0] +rangekey: {d-j}/[2.000000000,0 1.000000000,0] +rangekey: {j-k}/[2.000000000,0] +>> put_rangekey k=a end=d ts=2 +rangekey: {a-j}/[2.000000000,0 1.000000000,0] +rangekey: {j-k}/[2.000000000,0] +>> put_rangekey k=j end=k ts=1 +rangekey: {a-k}/[2.000000000,0 1.000000000,0]