From 341a77fed10a4bcddfa1a1d8c28ad20109742845 Mon Sep 17 00:00:00 2001 From: Michael Butler Date: Thu, 7 Jul 2022 22:44:59 -0400 Subject: [PATCH] storage: handle range keys in readAsOfIterator Previously, the readAsOfIterator used in RESTORE could not handle range keys. This PR implements the new SimpleMVCCIterator methods that handle range keys. Further, this patch ensures the readAsOfIterator skips over point keys shadowed by range keys at or below the caller's specified asOf timestamp. Next, Backup needs to be tought about RangeKeys. Informs #71155 Release note: none --- pkg/storage/mvcc_history_test.go | 24 +++ pkg/storage/read_as_of_iterator.go | 99 ++++++++--- .../mvcc_histories/range_key_iter_read_as_of | 168 ++++++++++++++++++ 3 files changed, 267 insertions(+), 24 deletions(-) create mode 100644 pkg/storage/testdata/mvcc_histories/range_key_iter_read_as_of diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index 15e52ec37beb..02229f264783 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -670,6 +670,7 @@ var commands = map[string]cmd{ "iter_new": {typReadOnly, cmdIterNew}, "iter_new_incremental": {typReadOnly, cmdIterNewIncremental}, // MVCCIncrementalIterator + "iter_new_read_as_of": {typReadOnly, cmdIterNewReadAsOf}, // readAsOfIterator "iter_seek_ge": {typReadOnly, cmdIterSeekGE}, "iter_seek_lt": {typReadOnly, cmdIterSeekLT}, "iter_seek_intent_ge": {typReadOnly, cmdIterSeekIntentGE}, @@ -1396,6 +1397,29 @@ func cmdIterNewIncremental(e *evalCtx) error { return nil } +func cmdIterNewReadAsOf(e *evalCtx) error { + if e.iter != nil { + e.iter.Close() + } + var asOf hlc.Timestamp + if e.hasArg("asOfTs") { + asOf = e.getTsWithName("asOfTs") + } + opts := IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + RangeKeyMaskingBelow: asOf} + if e.hasArg("k") { + opts.LowerBound, opts.UpperBound = e.getKeyRange() + } + if len(opts.UpperBound) == 0 { + opts.UpperBound = keys.MaxKey + } + r, closer := metamorphicReader(e, "iter-reader") + iter := &iterWithCloser{r.NewMVCCIterator(MVCCKeyIterKind, opts), closer} + e.iter = NewReadAsOfIterator(iter, asOf) + return nil +} + func cmdIterSeekGE(e *evalCtx) error { key := e.getKey() ts := e.getTs(nil) diff --git a/pkg/storage/read_as_of_iterator.go b/pkg/storage/read_as_of_iterator.go index ce97929a4426..9b51620fc0ba 100644 --- a/pkg/storage/read_as_of_iterator.go +++ b/pkg/storage/read_as_of_iterator.go @@ -16,15 +16,21 @@ import ( ) // ReadAsOfIterator wraps a SimpleMVCCIterator and only surfaces the latest -// valid key of a given MVCC key that is also below the asOf timestamp, if set. -// Further, the iterator does not surface delete tombstones, nor any MVCC keys -// shadowed by delete tombstones below the asOf timestamp, if set. The iterator -// assumes that it will not encounter any write intents. +// valid point key of a given MVCC key that is also below the asOf timestamp, if +// set. Further, the iterator does not surface point or range tombstones, nor +// any MVCC keys shadowed by tombstones below the asOf timestamp, if set. The +// iterator assumes that it will not encounter any write intents. type ReadAsOfIterator struct { iter SimpleMVCCIterator // asOf is the latest timestamp of a key surfaced by the iterator. asOf hlc.Timestamp + + // valid tracks if the current key is valid + valid bool + + // err tracks if iterating to the current key returned an error + err error } var _ SimpleMVCCIterator = &ReadAsOfIterator{} @@ -45,7 +51,7 @@ func (f *ReadAsOfIterator) SeekGE(originalKey MVCCKey) { synthetic := MVCCKey{Key: originalKey.Key, Timestamp: f.asOf} f.iter.SeekGE(synthetic) - if ok := f.advance(); ok && f.UnsafeKey().Less(originalKey) { + if f.advance(); f.valid && f.UnsafeKey().Less(originalKey) { // The following is true: // originalKey.Key == f.UnsafeKey && // f.asOf timestamp (if set) >= current timestamp > originalKey timestamp. @@ -59,7 +65,7 @@ func (f *ReadAsOfIterator) SeekGE(originalKey MVCCKey) { // Valid implements the simpleMVCCIterator. func (f *ReadAsOfIterator) Valid() (bool, error) { - return f.iter.Valid() + return f.valid, f.err } // Next advances the iterator to the next valid MVCC key obeying the iterator's @@ -94,42 +100,87 @@ func (f *ReadAsOfIterator) UnsafeValue() []byte { // HasPointAndRange implements SimpleMVCCIterator. func (f *ReadAsOfIterator) HasPointAndRange() (bool, bool) { - panic("not implemented") + // the ReadAsOfIterator only surfaces point keys; therefore hasPoint is always + // true, unless the iterator is invalid, and hasRange is always false. + return f.valid, false } -// RangeBounds implements SimpleMVCCIterator. +// RangeBounds always returns an empty span, since the iterator never surfaces +// rangekeys. func (f *ReadAsOfIterator) RangeBounds() roachpb.Span { - panic("not implemented") + return roachpb.Span{} } -// RangeKeys implements SimpleMVCCIterator. +// RangeKeys is always empty since this iterator never surfaces rangeKeys. func (f *ReadAsOfIterator) RangeKeys() []MVCCRangeKeyValue { - panic("not implemented") + return []MVCCRangeKeyValue{} +} + +// updateValid updates i.valid and i.err based on the underlying iterator, and +// returns true if valid. +func (f *ReadAsOfIterator) updateValid() bool { + f.valid, f.err = f.iter.Valid() + return f.valid } // advance moves past keys with timestamps later than f.asOf and skips MVCC keys -// whose latest value (subject to f.asOF) has been deleted. Note that advance -// moves past keys above asOF before evaluating tombstones, implying the -// iterator will never call f.iter.NextKey() on a tombstone with a timestamp -// later than f.asOF. -func (f *ReadAsOfIterator) advance() bool { +// whose latest value (subject to f.asOF) has been deleted by a point or range +// tombstone. +func (f *ReadAsOfIterator) advance() { for { - if ok, err := f.Valid(); err != nil || !ok { - // No valid keys found. - return false - } else if !f.asOf.IsEmpty() && f.asOf.Less(f.iter.UnsafeKey().Timestamp) { - // Skip keys above the asOf timestamp. + if ok := f.updateValid(); !ok { + return + } + + if !f.asOf.IsEmpty() && f.asOf.Less(f.iter.UnsafeKey().Timestamp) { + // Skip keys above the asOf timestamp, regardless of type of key (e.g. point or range) + f.iter.Next() + } else if hasPoint, hasRange := f.iter.HasPointAndRange(); !hasPoint && hasRange { + // Bare range keys get surfaced before the point key, even though the + // point key shadows it; thus, because we can infer range key information + // when the point key surfaces, skip over the bare range key, and reason + // about shadowed keys at the surfaced point key. + // + // E.g. Scanning the keys below: + // 2 a2 + // 1 o---o + // a b + // + // would result in two surfaced keys: + // {a-b}@1; + // a2, {a-b}@1 + f.iter.Next() } else if len(f.iter.UnsafeValue()) == 0 { - // Skip to the next MVCC key if we find a tombstone. + // Skip to the next MVCC key if we find a point tombstone. + f.iter.NextKey() + } else if !hasRange { + // On a valid key without a range key + return + } else if f.asOfRangeKeyShadows() { + // The latest range key, as of system time, shadows the latest point key. + // This key is therefore deleted as of system time. f.iter.NextKey() } else { - // On a valid key. - return true + // On a valid key that potentially shadows range key(s) + return } } } +// asOfRangeKeyShadows returns true if there exists a range key at or below the asOf timestamp +// that shadows the latest point key +// +// TODO (msbutler): ensure this function caches range key values (#84379) before +// the 22.2 branch cut, else we face a steep perf cliff for RESTORE with range keys. +func (f *ReadAsOfIterator) asOfRangeKeyShadows() (shadows bool) { + rangeKeys := f.iter.RangeKeys() + if f.asOf.IsEmpty() { + return f.iter.UnsafeKey().Timestamp.LessEq(rangeKeys[0].RangeKey.Timestamp) + } + return HasRangeKeyBetween(rangeKeys, f.asOf, f.iter.UnsafeKey().Timestamp) +} + // NewReadAsOfIterator constructs a ReadAsOfIterator. func NewReadAsOfIterator(iter SimpleMVCCIterator, asOf hlc.Timestamp) *ReadAsOfIterator { return &ReadAsOfIterator{iter: iter, asOf: asOf} diff --git a/pkg/storage/testdata/mvcc_histories/range_key_iter_read_as_of b/pkg/storage/testdata/mvcc_histories/range_key_iter_read_as_of new file mode 100644 index 000000000000..5b49bc68c726 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/range_key_iter_read_as_of @@ -0,0 +1,168 @@ +# Tests range key handling in ReadAsOfIterator. Note that the iterator assumes it will not see an +# intent. +# +# Sets up the following dataset, where x is tombstone, o-o is range tombstone +# +# 6 f6 +# 5 o---------------o k5 +# 4 x x d4 f4 g4 x +# 3 o-------o e3 o-------oh3 o---o +# 2 a2 f2 g2 +# 1 o---------------------------------------o +# a b c d e f g h i j k l m n o +# +run ok +put_rangekey k=a end=k ts=1 +put k=a ts=2 v=a2 +del k=a ts=4 +put_rangekey k=b end=d ts=3 +del k=b ts=4 +put k=d ts=4 v=d4 +put k=e ts=3 v=e3 +put k=f ts=2 v=f2 +put k=g ts=2 v=g2 +put_rangekey k=f end=h ts=3 +put k=f ts=4 v=f4 +put_rangekey k=c end=g ts=5 +put k=f ts=6 v=f6 +put k=g ts=4 v=g4 +put k=h ts=3 v=h3 +del k=h ts=4 +put k=k ts=5 v=k5 +put_rangekey k=m end=n ts=3 localTs=2 +---- +>> at end: +rangekey: {a-b}/[1.000000000,0=/] +rangekey: {b-c}/[3.000000000,0=/ 1.000000000,0=/] +rangekey: {c-d}/[5.000000000,0=/ 3.000000000,0=/ 1.000000000,0=/] +rangekey: {d-f}/[5.000000000,0=/ 1.000000000,0=/] +rangekey: {f-g}/[5.000000000,0=/ 3.000000000,0=/ 1.000000000,0=/] +rangekey: {g-h}/[3.000000000,0=/ 1.000000000,0=/] +rangekey: {h-k}/[1.000000000,0=/] +rangekey: {m-n}/[3.000000000,0={localTs=2.000000000,0}/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/4.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "e"/3.000000000,0 -> /BYTES/e3 +data: "f"/6.000000000,0 -> /BYTES/f6 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "f"/2.000000000,0 -> /BYTES/f2 +data: "g"/4.000000000,0 -> /BYTES/g4 +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> / +data: "h"/3.000000000,0 -> /BYTES/h3 +data: "k"/5.000000000,0 -> /BYTES/k5 + +# test range keys are ignored if above asOf, even with multiple range keys +run ok +iter_new_read_as_of asOfTs=2 +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "a"/2.000000000,0=/BYTES/a2 +iter_scan: "a"/2.000000000,0=/BYTES/a2 +iter_scan: "f"/2.000000000,0=/BYTES/f2 +iter_scan: "g"/2.000000000,0=/BYTES/g2 +iter_scan: . + +# test range key at or below asOf properly shadows keys +run ok +iter_new_read_as_of asOfTs=3 +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "a"/2.000000000,0=/BYTES/a2 +iter_scan: "a"/2.000000000,0=/BYTES/a2 +iter_scan: "e"/3.000000000,0=/BYTES/e3 +iter_scan: "h"/3.000000000,0=/BYTES/h3 +iter_scan: . + +# iterate over a few point tombstones at the asOf time +run ok +iter_new_read_as_of asOfTs=4 +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "d"/4.000000000,0=/BYTES/d4 +iter_scan: "d"/4.000000000,0=/BYTES/d4 +iter_scan: "e"/3.000000000,0=/BYTES/e3 +iter_scan: "f"/4.000000000,0=/BYTES/f4 +iter_scan: "g"/4.000000000,0=/BYTES/g4 +iter_scan: . + +# iterate over ts 5-7 because the test is cheap +run ok +iter_new_read_as_of asOfTs=5 +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "g"/4.000000000,0=/BYTES/g4 +iter_scan: "g"/4.000000000,0=/BYTES/g4 +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: . + +# iterate over ts 5-7 because the test is cheap +run ok +iter_new_read_as_of asOfTs=6 +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "f"/6.000000000,0=/BYTES/f6 +iter_scan: "f"/6.000000000,0=/BYTES/f6 +iter_scan: "g"/4.000000000,0=/BYTES/g4 +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: . + +# iterate over ts 5-7 for completeness +run ok +iter_new_read_as_of asOfTs=7 +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "f"/6.000000000,0=/BYTES/f6 +iter_scan: "f"/6.000000000,0=/BYTES/f6 +iter_scan: "g"/4.000000000,0=/BYTES/g4 +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: . + + +# test range key handling when asOf is empty +run ok +iter_new_read_as_of +iter_seek_ge k=a +iter_scan +---- +iter_seek_ge: "f"/6.000000000,0=/BYTES/f6 +iter_scan: "f"/6.000000000,0=/BYTES/f6 +iter_scan: "g"/4.000000000,0=/BYTES/g4 +iter_scan: "k"/5.000000000,0=/BYTES/k5 +iter_scan: . + +# seek to a point key shadowed by a range key +run ok +iter_new_read_as_of asOfTs=5 +iter_seek_ge k=d +---- +iter_seek_ge: "g"/4.000000000,0=/BYTES/g4 + +# seek to the start of a range key +run ok +iter_new_read_as_of asOfTs=5 +iter_seek_ge k=c +---- +iter_seek_ge: "g"/4.000000000,0=/BYTES/g4 + +# seek to the same point key, with AsOf empty +run ok +iter_new_read_as_of +iter_seek_ge k=d +---- +iter_seek_ge: "f"/6.000000000,0=/BYTES/f6 + +# attempt seek to the same point key, but ignore the range key because its above AsOf +run ok +iter_new_read_as_of asOfTs=4 +iter_seek_ge k=d +---- +iter_seek_ge: "d"/4.000000000,0=/BYTES/d4