Skip to content

Commit

Permalink
storage: handle range keys in readAsOfIterator
Browse files Browse the repository at this point in the history
Previously, the readAsOfIterator used in RESTORE could not handle range keys.
This PR implements the new SimpleMVCCIterator methods that handle range keys.
Further, this patch ensures the  readAsOfIterator skips over point keys
shadowed by range keys  at or below the caller's specified asOf timestamp.

Next, Backup needs to be tought about RangeKeys.

Informs cockroachdb#71155

Release note: none
  • Loading branch information
msbutler committed Jul 20, 2022
1 parent 37df70d commit 341a77f
Show file tree
Hide file tree
Showing 3 changed files with 267 additions and 24 deletions.
24 changes: 24 additions & 0 deletions pkg/storage/mvcc_history_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -670,6 +670,7 @@ var commands = map[string]cmd{

"iter_new": {typReadOnly, cmdIterNew},
"iter_new_incremental": {typReadOnly, cmdIterNewIncremental}, // MVCCIncrementalIterator
"iter_new_read_as_of": {typReadOnly, cmdIterNewReadAsOf}, // readAsOfIterator
"iter_seek_ge": {typReadOnly, cmdIterSeekGE},
"iter_seek_lt": {typReadOnly, cmdIterSeekLT},
"iter_seek_intent_ge": {typReadOnly, cmdIterSeekIntentGE},
Expand Down Expand Up @@ -1396,6 +1397,29 @@ func cmdIterNewIncremental(e *evalCtx) error {
return nil
}

func cmdIterNewReadAsOf(e *evalCtx) error {
if e.iter != nil {
e.iter.Close()
}
var asOf hlc.Timestamp
if e.hasArg("asOfTs") {
asOf = e.getTsWithName("asOfTs")
}
opts := IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
RangeKeyMaskingBelow: asOf}
if e.hasArg("k") {
opts.LowerBound, opts.UpperBound = e.getKeyRange()
}
if len(opts.UpperBound) == 0 {
opts.UpperBound = keys.MaxKey
}
r, closer := metamorphicReader(e, "iter-reader")
iter := &iterWithCloser{r.NewMVCCIterator(MVCCKeyIterKind, opts), closer}
e.iter = NewReadAsOfIterator(iter, asOf)
return nil
}

func cmdIterSeekGE(e *evalCtx) error {
key := e.getKey()
ts := e.getTs(nil)
Expand Down
99 changes: 75 additions & 24 deletions pkg/storage/read_as_of_iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@ import (
)

// ReadAsOfIterator wraps a SimpleMVCCIterator and only surfaces the latest
// valid key of a given MVCC key that is also below the asOf timestamp, if set.
// Further, the iterator does not surface delete tombstones, nor any MVCC keys
// shadowed by delete tombstones below the asOf timestamp, if set. The iterator
// assumes that it will not encounter any write intents.
// valid point key of a given MVCC key that is also below the asOf timestamp, if
// set. Further, the iterator does not surface point or range tombstones, nor
// any MVCC keys shadowed by tombstones below the asOf timestamp, if set. The
// iterator assumes that it will not encounter any write intents.
type ReadAsOfIterator struct {
iter SimpleMVCCIterator

// asOf is the latest timestamp of a key surfaced by the iterator.
asOf hlc.Timestamp

// valid tracks if the current key is valid
valid bool

// err tracks if iterating to the current key returned an error
err error
}

var _ SimpleMVCCIterator = &ReadAsOfIterator{}
Expand All @@ -45,7 +51,7 @@ func (f *ReadAsOfIterator) SeekGE(originalKey MVCCKey) {
synthetic := MVCCKey{Key: originalKey.Key, Timestamp: f.asOf}
f.iter.SeekGE(synthetic)

if ok := f.advance(); ok && f.UnsafeKey().Less(originalKey) {
if f.advance(); f.valid && f.UnsafeKey().Less(originalKey) {
// The following is true:
// originalKey.Key == f.UnsafeKey &&
// f.asOf timestamp (if set) >= current timestamp > originalKey timestamp.
Expand All @@ -59,7 +65,7 @@ func (f *ReadAsOfIterator) SeekGE(originalKey MVCCKey) {

// Valid implements the simpleMVCCIterator.
func (f *ReadAsOfIterator) Valid() (bool, error) {
return f.iter.Valid()
return f.valid, f.err
}

// Next advances the iterator to the next valid MVCC key obeying the iterator's
Expand Down Expand Up @@ -94,42 +100,87 @@ func (f *ReadAsOfIterator) UnsafeValue() []byte {

// HasPointAndRange implements SimpleMVCCIterator.
func (f *ReadAsOfIterator) HasPointAndRange() (bool, bool) {
panic("not implemented")
// the ReadAsOfIterator only surfaces point keys; therefore hasPoint is always
// true, unless the iterator is invalid, and hasRange is always false.
return f.valid, false
}

// RangeBounds implements SimpleMVCCIterator.
// RangeBounds always returns an empty span, since the iterator never surfaces
// rangekeys.
func (f *ReadAsOfIterator) RangeBounds() roachpb.Span {
panic("not implemented")
return roachpb.Span{}
}

// RangeKeys implements SimpleMVCCIterator.
// RangeKeys is always empty since this iterator never surfaces rangeKeys.
func (f *ReadAsOfIterator) RangeKeys() []MVCCRangeKeyValue {
panic("not implemented")
return []MVCCRangeKeyValue{}
}

// updateValid updates i.valid and i.err based on the underlying iterator, and
// returns true if valid.
func (f *ReadAsOfIterator) updateValid() bool {
f.valid, f.err = f.iter.Valid()
return f.valid
}

// advance moves past keys with timestamps later than f.asOf and skips MVCC keys
// whose latest value (subject to f.asOF) has been deleted. Note that advance
// moves past keys above asOF before evaluating tombstones, implying the
// iterator will never call f.iter.NextKey() on a tombstone with a timestamp
// later than f.asOF.
func (f *ReadAsOfIterator) advance() bool {
// whose latest value (subject to f.asOF) has been deleted by a point or range
// tombstone.
func (f *ReadAsOfIterator) advance() {
for {
if ok, err := f.Valid(); err != nil || !ok {
// No valid keys found.
return false
} else if !f.asOf.IsEmpty() && f.asOf.Less(f.iter.UnsafeKey().Timestamp) {
// Skip keys above the asOf timestamp.
if ok := f.updateValid(); !ok {
return
}

if !f.asOf.IsEmpty() && f.asOf.Less(f.iter.UnsafeKey().Timestamp) {
// Skip keys above the asOf timestamp, regardless of type of key (e.g. point or range)
f.iter.Next()
} else if hasPoint, hasRange := f.iter.HasPointAndRange(); !hasPoint && hasRange {
// Bare range keys get surfaced before the point key, even though the
// point key shadows it; thus, because we can infer range key information
// when the point key surfaces, skip over the bare range key, and reason
// about shadowed keys at the surfaced point key.
//
// E.g. Scanning the keys below:
// 2 a2
// 1 o---o
// a b
//
// would result in two surfaced keys:
// {a-b}@1;
// a2, {a-b}@1

f.iter.Next()
} else if len(f.iter.UnsafeValue()) == 0 {
// Skip to the next MVCC key if we find a tombstone.
// Skip to the next MVCC key if we find a point tombstone.
f.iter.NextKey()
} else if !hasRange {
// On a valid key without a range key
return
} else if f.asOfRangeKeyShadows() {
// The latest range key, as of system time, shadows the latest point key.
// This key is therefore deleted as of system time.
f.iter.NextKey()
} else {
// On a valid key.
return true
// On a valid key that potentially shadows range key(s)
return
}
}
}

// asOfRangeKeyShadows returns true if there exists a range key at or below the asOf timestamp
// that shadows the latest point key
//
// TODO (msbutler): ensure this function caches range key values (#84379) before
// the 22.2 branch cut, else we face a steep perf cliff for RESTORE with range keys.
func (f *ReadAsOfIterator) asOfRangeKeyShadows() (shadows bool) {
rangeKeys := f.iter.RangeKeys()
if f.asOf.IsEmpty() {
return f.iter.UnsafeKey().Timestamp.LessEq(rangeKeys[0].RangeKey.Timestamp)
}
return HasRangeKeyBetween(rangeKeys, f.asOf, f.iter.UnsafeKey().Timestamp)
}

// NewReadAsOfIterator constructs a ReadAsOfIterator.
func NewReadAsOfIterator(iter SimpleMVCCIterator, asOf hlc.Timestamp) *ReadAsOfIterator {
return &ReadAsOfIterator{iter: iter, asOf: asOf}
Expand Down
168 changes: 168 additions & 0 deletions pkg/storage/testdata/mvcc_histories/range_key_iter_read_as_of
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# Tests range key handling in ReadAsOfIterator. Note that the iterator assumes it will not see an
# intent.
#
# Sets up the following dataset, where x is tombstone, o-o is range tombstone
#
# 6 f6
# 5 o---------------o k5
# 4 x x d4 f4 g4 x
# 3 o-------o e3 o-------oh3 o---o
# 2 a2 f2 g2
# 1 o---------------------------------------o
# a b c d e f g h i j k l m n o
#
run ok
put_rangekey k=a end=k ts=1
put k=a ts=2 v=a2
del k=a ts=4
put_rangekey k=b end=d ts=3
del k=b ts=4
put k=d ts=4 v=d4
put k=e ts=3 v=e3
put k=f ts=2 v=f2
put k=g ts=2 v=g2
put_rangekey k=f end=h ts=3
put k=f ts=4 v=f4
put_rangekey k=c end=g ts=5
put k=f ts=6 v=f6
put k=g ts=4 v=g4
put k=h ts=3 v=h3
del k=h ts=4
put k=k ts=5 v=k5
put_rangekey k=m end=n ts=3 localTs=2
----
>> at end:
rangekey: {a-b}/[1.000000000,0=/<empty>]
rangekey: {b-c}/[3.000000000,0=/<empty> 1.000000000,0=/<empty>]
rangekey: {c-d}/[5.000000000,0=/<empty> 3.000000000,0=/<empty> 1.000000000,0=/<empty>]
rangekey: {d-f}/[5.000000000,0=/<empty> 1.000000000,0=/<empty>]
rangekey: {f-g}/[5.000000000,0=/<empty> 3.000000000,0=/<empty> 1.000000000,0=/<empty>]
rangekey: {g-h}/[3.000000000,0=/<empty> 1.000000000,0=/<empty>]
rangekey: {h-k}/[1.000000000,0=/<empty>]
rangekey: {m-n}/[3.000000000,0={localTs=2.000000000,0}/<empty>]
data: "a"/4.000000000,0 -> /<empty>
data: "a"/2.000000000,0 -> /BYTES/a2
data: "b"/4.000000000,0 -> /<empty>
data: "d"/4.000000000,0 -> /BYTES/d4
data: "e"/3.000000000,0 -> /BYTES/e3
data: "f"/6.000000000,0 -> /BYTES/f6
data: "f"/4.000000000,0 -> /BYTES/f4
data: "f"/2.000000000,0 -> /BYTES/f2
data: "g"/4.000000000,0 -> /BYTES/g4
data: "g"/2.000000000,0 -> /BYTES/g2
data: "h"/4.000000000,0 -> /<empty>
data: "h"/3.000000000,0 -> /BYTES/h3
data: "k"/5.000000000,0 -> /BYTES/k5

# test range keys are ignored if above asOf, even with multiple range keys
run ok
iter_new_read_as_of asOfTs=2
iter_seek_ge k=a
iter_scan
----
iter_seek_ge: "a"/2.000000000,0=/BYTES/a2
iter_scan: "a"/2.000000000,0=/BYTES/a2
iter_scan: "f"/2.000000000,0=/BYTES/f2
iter_scan: "g"/2.000000000,0=/BYTES/g2
iter_scan: .

# test range key at or below asOf properly shadows keys
run ok
iter_new_read_as_of asOfTs=3
iter_seek_ge k=a
iter_scan
----
iter_seek_ge: "a"/2.000000000,0=/BYTES/a2
iter_scan: "a"/2.000000000,0=/BYTES/a2
iter_scan: "e"/3.000000000,0=/BYTES/e3
iter_scan: "h"/3.000000000,0=/BYTES/h3
iter_scan: .

# iterate over a few point tombstones at the asOf time
run ok
iter_new_read_as_of asOfTs=4
iter_seek_ge k=a
iter_scan
----
iter_seek_ge: "d"/4.000000000,0=/BYTES/d4
iter_scan: "d"/4.000000000,0=/BYTES/d4
iter_scan: "e"/3.000000000,0=/BYTES/e3
iter_scan: "f"/4.000000000,0=/BYTES/f4
iter_scan: "g"/4.000000000,0=/BYTES/g4
iter_scan: .

# iterate over ts 5-7 because the test is cheap
run ok
iter_new_read_as_of asOfTs=5
iter_seek_ge k=a
iter_scan
----
iter_seek_ge: "g"/4.000000000,0=/BYTES/g4
iter_scan: "g"/4.000000000,0=/BYTES/g4
iter_scan: "k"/5.000000000,0=/BYTES/k5
iter_scan: .

# iterate over ts 5-7 because the test is cheap
run ok
iter_new_read_as_of asOfTs=6
iter_seek_ge k=a
iter_scan
----
iter_seek_ge: "f"/6.000000000,0=/BYTES/f6
iter_scan: "f"/6.000000000,0=/BYTES/f6
iter_scan: "g"/4.000000000,0=/BYTES/g4
iter_scan: "k"/5.000000000,0=/BYTES/k5
iter_scan: .

# iterate over ts 5-7 for completeness
run ok
iter_new_read_as_of asOfTs=7
iter_seek_ge k=a
iter_scan
----
iter_seek_ge: "f"/6.000000000,0=/BYTES/f6
iter_scan: "f"/6.000000000,0=/BYTES/f6
iter_scan: "g"/4.000000000,0=/BYTES/g4
iter_scan: "k"/5.000000000,0=/BYTES/k5
iter_scan: .


# test range key handling when asOf is empty
run ok
iter_new_read_as_of
iter_seek_ge k=a
iter_scan
----
iter_seek_ge: "f"/6.000000000,0=/BYTES/f6
iter_scan: "f"/6.000000000,0=/BYTES/f6
iter_scan: "g"/4.000000000,0=/BYTES/g4
iter_scan: "k"/5.000000000,0=/BYTES/k5
iter_scan: .

# seek to a point key shadowed by a range key
run ok
iter_new_read_as_of asOfTs=5
iter_seek_ge k=d
----
iter_seek_ge: "g"/4.000000000,0=/BYTES/g4

# seek to the start of a range key
run ok
iter_new_read_as_of asOfTs=5
iter_seek_ge k=c
----
iter_seek_ge: "g"/4.000000000,0=/BYTES/g4

# seek to the same point key, with AsOf empty
run ok
iter_new_read_as_of
iter_seek_ge k=d
----
iter_seek_ge: "f"/6.000000000,0=/BYTES/f6

# attempt seek to the same point key, but ignore the range key because its above AsOf
run ok
iter_new_read_as_of asOfTs=4
iter_seek_ge k=d
----
iter_seek_ge: "d"/4.000000000,0=/BYTES/d4

0 comments on commit 341a77f

Please sign in to comment.