diff --git a/pkg/ccl/storageccl/engineccl/bench_test.go b/pkg/ccl/storageccl/engineccl/bench_test.go index b69650106958..2c82ded93a13 100644 --- a/pkg/ccl/storageccl/engineccl/bench_test.go +++ b/pkg/ccl/storageccl/engineccl/bench_test.go @@ -187,9 +187,9 @@ func BenchmarkTimeBoundIterate(b *testing.B) { b.Run("TimeBoundIterator", func(b *testing.B) { runIterate(b, loadFactor, func(e storage.Engine, startTime, endTime hlc.Timestamp) (storage.MVCCIterator, error) { return e.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ - MinTimestampHint: startTime, - MaxTimestampHint: endTime, - UpperBound: roachpb.KeyMax, + MinTimestamp: startTime, + MaxTimestamp: endTime, + UpperBound: roachpb.KeyMax, }) }) }) diff --git a/pkg/kv/kvserver/batcheval/cmd_revert_range.go b/pkg/kv/kvserver/batcheval/cmd_revert_range.go index ad5e7fd1ea21..0dc10d9d3b30 100644 --- a/pkg/kv/kvserver/batcheval/cmd_revert_range.go +++ b/pkg/kv/kvserver/batcheval/cmd_revert_range.go @@ -83,11 +83,11 @@ func isEmptyKeyTimeRange( // that there is *a* key in the SST that is in the time range. Thus we should // proceed to iteration that actually checks timestamps on each key. iter, err := readWriter.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ - KeyTypes: storage.IterKeyTypePointsAndRanges, - LowerBound: from, - UpperBound: to, - MinTimestampHint: since.Next(), // make exclusive - MaxTimestampHint: until, + KeyTypes: storage.IterKeyTypePointsAndRanges, + LowerBound: from, + UpperBound: to, + MinTimestamp: since.Next(), // make exclusive + MaxTimestamp: until, }) if err != nil { return false, err diff --git a/pkg/kv/kvserver/batcheval/cmd_revert_range_test.go b/pkg/kv/kvserver/batcheval/cmd_revert_range_test.go index 497698f88eb5..0f420c07e74d 100644 --- a/pkg/kv/kvserver/batcheval/cmd_revert_range_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_revert_range_test.go @@ -55,7 +55,7 @@ func TestCmdRevertRange(t *testing.T) { ctx := context.Background() - // Run this test on both RocksDB and Pebble. Regression test for: + // Regression test for: // https://github.com/cockroachdb/cockroach/pull/42386 eng := storage.NewDefaultInMemForTesting() defer eng.Close() @@ -118,10 +118,11 @@ func TestCmdRevertRange(t *testing.T) { ts hlc.Timestamp expected []byte resumes int + empty bool }{ - {"revert revert to time A", tsA, sumA, 4}, - {"revert revert to time B", tsB, sumB, 4}, - {"revert revert to time C (nothing)", tsC, sumC, 0}, + {"revert revert to time A", tsA, sumA, 4, false}, + {"revert revert to time B", tsB, sumB, 4, false}, + {"revert revert to time C (nothing)", tsC, sumC, 0, true}, } { t.Run(tc.name, func(t *testing.T) { batch := &wrappedBatch{Batch: eng.NewBatch()} @@ -140,9 +141,13 @@ func TestCmdRevertRange(t *testing.T) { if err != nil { t.Fatal(err) } - require.NotNil(t, result.Replicated.MVCCHistoryMutation) - require.Equal(t, result.Replicated.MVCCHistoryMutation.Spans, - []roachpb.Span{{Key: req.RequestHeader.Key, EndKey: req.RequestHeader.EndKey}}) + // If there's nothing to revert and the fast-path is hit, + // MVCCHistoryMutation will be empty. + if !tc.empty { + require.NotNil(t, result.Replicated.MVCCHistoryMutation) + require.Equal(t, result.Replicated.MVCCHistoryMutation.Spans, + []roachpb.Span{{Key: req.RequestHeader.Key, EndKey: req.RequestHeader.EndKey}}) + } if reply.ResumeSpan == nil { break } diff --git a/pkg/storage/batch_test.go b/pkg/storage/batch_test.go index e86729c4f6d1..83fd82829a0c 100644 --- a/pkg/storage/batch_test.go +++ b/pkg/storage/batch_test.go @@ -192,9 +192,9 @@ func TestReadOnlyBasics(t *testing.T) { }, func() { iter, _ := ro.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - MinTimestampHint: hlc.MinTimestamp, - MaxTimestampHint: hlc.MaxTimestamp, - UpperBound: roachpb.KeyMax, + MinTimestamp: hlc.MinTimestamp, + MaxTimestamp: hlc.MaxTimestamp, + UpperBound: roachpb.KeyMax, }) iter.Close() }, diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 3b4769f289e2..c166bf0ecbe3 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -420,38 +420,22 @@ type IterOptions struct { // the iterator. UpperBound must be provided unless Prefix is true, in which // case the end of the prefix will be used as the upper bound. UpperBound roachpb.Key - // MinTimestampHint and MaxTimestampHint, if set, indicate that keys outside - // of the time range formed by [MinTimestampHint, MaxTimestampHint] do not - // need to be presented by the iterator. The underlying iterator may be able - // to efficiently skip over keys outside of the hinted time range, e.g., when - // an SST indicates that it contains no keys within the time range. Intents + // MinTimestamp and MaxTimestamp, if set, indicate that only keys + // within the time range formed by [MinTimestamp, MaxTimestamp] should be + // returned. The underlying iterator may be able to efficiently skip over + // keys outside of the hinted time range, e.g., when a block handle + // indicates that the block contains no keys within the time range. Intents // will not be visible to such iterators at all. This is only relevant for // MVCCIterators. // - // Note that time bound hints are strictly a performance optimization, and - // iterators with time bounds hints will frequently return keys outside of the - // [start, end] time range. If you must guarantee that you never see a key - // outside of the time bounds, perform your own filtering. - // - // NB: The iterator may surface stale data. Pebble range tombstones do not have - // timestamps and thus may be ignored entirely depending on whether their SST - // happens to satisfy the filter. Furthermore, keys outside the timestamp - // range may be stale and must be ignored -- for example, consider a key foo@5 - // written in an SST with timestamp range [3-7], and then a non-MVCC removal - // or update of this key in a different SST with timestamp range [3-5]. Using - // an iterator with range [6-9] would surface the old foo@5 key because it - // would return all keys in the old [3-7] SST but not take into account the - // separate [3-5] SST where foo@5 was removed or updated. See also: - // https://github.com/cockroachdb/pebble/issues/1786 + // Note that time-bound iterators previously were only a performance + // optimization but now guarantee that no keys outside of the [start, end] + // time range will be returned. // // NB: Range keys are not currently subject to timestamp filtering due to // complications with MVCCIncrementalIterator. See: // https://github.com/cockroachdb/cockroach/issues/86260 - // - // Currently, the only way to correctly use such an iterator is to use it in - // concert with an iterator without timestamp hints, as done by - // MVCCIncrementalIterator. - MinTimestampHint, MaxTimestampHint hlc.Timestamp + MinTimestamp, MaxTimestamp hlc.Timestamp // KeyTypes specifies the types of keys to surface: point and/or range keys. // Use HasPointAndRange() to determine which key type is present at a given // iterator position, and RangeBounds() and RangeKeys() to access range keys. diff --git a/pkg/storage/engine_test.go b/pkg/storage/engine_test.go index 247b1f71b608..7bbbe2320cb3 100644 --- a/pkg/storage/engine_test.go +++ b/pkg/storage/engine_test.go @@ -580,9 +580,9 @@ func TestEngineTimeBound(t *testing.T) { "right not touching": { iter: func() (MVCCIterator, error) { return batch.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - MinTimestampHint: maxTimestamp.WallNext(), - MaxTimestampHint: maxTimestamp.WallNext().WallNext(), - UpperBound: roachpb.KeyMax, + MinTimestamp: maxTimestamp.WallNext(), + MaxTimestamp: maxTimestamp.WallNext().WallNext(), + UpperBound: roachpb.KeyMax, }) }, keys: 0, @@ -590,9 +590,9 @@ func TestEngineTimeBound(t *testing.T) { "left not touching": { iter: func() (MVCCIterator, error) { return batch.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - MinTimestampHint: minTimestamp.WallPrev().WallPrev(), - MaxTimestampHint: minTimestamp.WallPrev(), - UpperBound: roachpb.KeyMax, + MinTimestamp: minTimestamp.WallPrev().WallPrev(), + MaxTimestamp: minTimestamp.WallPrev(), + UpperBound: roachpb.KeyMax, }) }, keys: 0, @@ -600,52 +600,57 @@ func TestEngineTimeBound(t *testing.T) { "right touching": { iter: func() (MVCCIterator, error) { return batch.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - MinTimestampHint: maxTimestamp, - MaxTimestampHint: maxTimestamp, - UpperBound: roachpb.KeyMax, + MinTimestamp: maxTimestamp, + MaxTimestamp: maxTimestamp, + UpperBound: roachpb.KeyMax, }) }, - keys: len(times), + keys: 1, // only one key exists at the max timestamp @7 }, - "right touching ignores logical": { + // Although the block-property and table filters have historically + // ignored logical timestamps (and synthetic bits), the + // MVCCIncrementalIterator does not. It performs a strict hlc.Timestamp + // comparison. Both @7,1 and @7,2 are greater than @7, so no keys are + // visible. + "right touching enfoces logical": { iter: func() (MVCCIterator, error) { return batch.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - MinTimestampHint: maxTimestamp.Next(), - MaxTimestampHint: maxTimestamp.Next().Next(), - UpperBound: roachpb.KeyMax, + MinTimestamp: maxTimestamp.Next(), // @7,1 + MaxTimestamp: maxTimestamp.Next().Next(), // @7,2 + UpperBound: roachpb.KeyMax, }) }, - keys: len(times), + keys: 0, }, "left touching": { iter: func() (MVCCIterator, error) { return batch.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - MinTimestampHint: minTimestamp, - MaxTimestampHint: minTimestamp, - UpperBound: roachpb.KeyMax, + MinTimestamp: minTimestamp, + MaxTimestamp: minTimestamp, + UpperBound: roachpb.KeyMax, }) }, - keys: len(times), + keys: 1, }, "left touching upperbound": { iter: func() (MVCCIterator, error) { return batch.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - MinTimestampHint: minTimestamp, - MaxTimestampHint: minTimestamp, - UpperBound: []byte("02"), + MinTimestamp: minTimestamp, + MaxTimestamp: minTimestamp, + UpperBound: []byte("02"), }) }, - keys: 2, + keys: 1, }, "between": { iter: func() (MVCCIterator, error) { return batch.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - MinTimestampHint: minTimestamp.Next(), - MaxTimestampHint: minTimestamp.Next(), - UpperBound: roachpb.KeyMax, + MinTimestamp: minTimestamp.Next(), + MaxTimestamp: minTimestamp.Next(), + UpperBound: roachpb.KeyMax, }) }, - keys: len(times), + keys: 0, }, } diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index 2660d2c724b4..238119b8f203 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -69,7 +69,7 @@ var intentInterleavingReaderPool = sync.Pool{ func (imr *intentInterleavingReader) NewMVCCIterator( iterKind MVCCIterKind, opts IterOptions, ) (MVCCIterator, error) { - if (!opts.MinTimestampHint.IsEmpty() || !opts.MaxTimestampHint.IsEmpty()) && + if (!opts.MinTimestamp.IsEmpty() || !opts.MaxTimestamp.IsEmpty()) && iterKind == MVCCKeyAndIntentsIterKind { panic("cannot ask for interleaved intents when specifying timestamp hints") } @@ -233,7 +233,7 @@ func isLocal(k roachpb.Key) bool { } func newIntentInterleavingIterator(reader Reader, opts IterOptions) (MVCCIterator, error) { - if !opts.MinTimestampHint.IsEmpty() || !opts.MaxTimestampHint.IsEmpty() { + if !opts.MinTimestamp.IsEmpty() || !opts.MaxTimestamp.IsEmpty() { panic("intentInterleavingIter must not be used with timestamp hints") } var lowerIsLocal, upperIsLocal bool diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index 841909194d19..7c590b3faa84 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -103,7 +103,7 @@ var ( // scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [skipLocked] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] [max=] [targetbytes=] [wholeRows[=]] [allowEmpty] // export [k=] [end=] [ts=[,]] [kTs=[,]] [startTs=[,]] [maxLockConflicts=] [allRevisions] [targetSize=] [maxSize=] [stopMidKey] [fingerprint] // -// iter_new [k=] [end=] [prefix] [kind=key|keyAndIntents] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [maskBelow=[,]] +// iter_new [k=] [end=] [prefix] [kind=key|keyAndIntents] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [maskBelow=[,]] [minTimestamp=[,]] [maxTimestamp=[,]] // iter_new_incremental [k=] [end=] [startTs=[,]] [endTs=[,]] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [maskBelow=[,]] [intents=error|aggregate|emit] // iter_seek_ge k= [ts=[,]] // iter_seek_lt k= [ts=[,]] @@ -449,6 +449,15 @@ func TestMVCCHistories(t *testing.T) { e.t = t e.td = d + defer func() { + if e.iter != nil { + if r := recover(); r != nil { + e.iter.Close() + panic(r) + } + } + }() + switch d.Cmd { case "skip": if len(d.CmdArgs) == 0 { @@ -1857,6 +1866,12 @@ func cmdIterNew(e *evalCtx) error { if e.hasArg("maskBelow") { opts.RangeKeyMaskingBelow = e.getTsWithName("maskBelow") } + if e.hasArg("minTimestamp") { + opts.MinTimestamp = e.getTsWithName("minTimestamp") + } + if e.hasArg("maxTimestamp") { + opts.MaxTimestamp = e.getTsWithName("maxTimestamp") + } if e.iter != nil { e.iter.Close() diff --git a/pkg/storage/mvcc_incremental_iterator.go b/pkg/storage/mvcc_incremental_iterator.go index 62f55d2b3473..f85b1e64e6e3 100644 --- a/pkg/storage/mvcc_incremental_iterator.go +++ b/pkg/storage/mvcc_incremental_iterator.go @@ -219,9 +219,9 @@ func NewMVCCIncrementalIterator( LowerBound: opts.StartKey, UpperBound: opts.EndKey, // The call to startTime.Next() converts our exclusive start bound into - // the inclusive start bound that MinTimestampHint expects. - MinTimestampHint: opts.StartTime.Next(), - MaxTimestampHint: opts.EndTime, + // the inclusive start bound that MinTimestampt expects. + MinTimestamp: opts.StartTime.Next(), + MaxTimestamp: opts.EndTime, RangeKeyMaskingBelow: tbiRangeKeyMasking, }) if err != nil { diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index 1aacfd253685..f86f3f98a04a 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage/pebbleiter" "github.com/cockroachdb/cockroach/pkg/util" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble" @@ -42,6 +43,16 @@ type pebbleIterator struct { rangeKeyMaskingBuf []byte // Filter to use if masking is enabled. maskFilter mvccWallTimeIntervalRangeKeyMask + // [minTimestamp,maxTimestamp] contain the encoded timestamp bounds of the + // iterator, if any. This iterator will not return keys outside these + // timestamps. These are encoded because lexicographic comparison on encoded + // timestamps is equivalent to the comparison on decoded timestamps. These + // timestamps are enforced through the IterOptions.SkipPoint function, which + // is provided with encoded keys. + // + // NB: minTimestamp and maxTimestamp are both inclusive. + minTimestamp []byte // inclusive + maxTimestamp []byte // inclusive // Buffer used to store MVCCRangeKeyVersions returned by RangeKeys(). Lazily // initialized the first time an iterator's RangeKeys() method is called. @@ -209,7 +220,7 @@ func (p *pebbleIterator) setOptions(opts IterOptions, durability DurabilityRequi if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 { panic("iterator must set prefix or upper bound or lower bound") } - if opts.MinTimestampHint.IsSet() && opts.MaxTimestampHint.IsEmpty() { + if opts.MinTimestamp.IsSet() && opts.MaxTimestamp.IsEmpty() { panic("min timestamp hint set without max timestamp hint") } if opts.Prefix && opts.RangeKeyMaskingBelow.IsSet() { @@ -249,15 +260,36 @@ func (p *pebbleIterator) setOptions(opts IterOptions, durability DurabilityRequi p.options.RangeKeyMasking.Filter = p.getBlockPropertyFilterMask } - if opts.MaxTimestampHint.IsSet() { + if opts.MaxTimestamp.IsSet() { + // Install an IterOptions.SkipPoint function to ensure that we skip over + // any keys outside the the time bounds that don't get excluded by the + // coarse, opportunistic block-property filters. To avoid decoding + // per-KV, the SkipPoint function performs lexicographic comparisons on + // encoded timestamps, which is equivalent to the decoded, logical + // comparisons when ignoring the synthetic bit. In lexicographic order, + // the encoded key with the synthetic bit set sorts after the same + // timestamp without the synthetic bit. Timestamps differing only in the + // synthetic bit should otherwise be equal, so we take care to construct + // a minimum bound without the bit and a maximum bound with the bit to + // be inclusive on both ends. + p.minTimestamp = encodeMVCCTimestamp(hlc.Timestamp{ + WallTime: opts.MinTimestamp.WallTime, + Logical: opts.MinTimestamp.Logical, + }) + p.maxTimestamp = append(encodeMVCCTimestamp(hlc.Timestamp{ + WallTime: opts.MaxTimestamp.WallTime, + Logical: opts.MaxTimestamp.Logical, + }), 0x01 /* Synthetic bit */) + p.options.SkipPoint = p.skipPointIfOutsideTimeBounds + // TODO(erikgrinaker): For compatibility with SSTables written by 21.2 nodes // or earlier, we filter on table properties too. We still wrote these // properties in 22.1, but stop doing so in 22.2. We can remove this // filtering when nodes are guaranteed to no longer have SSTables written by // 21.2 or earlier (which can still happen e.g. when clusters are upgraded // through multiple major versions in rapid succession). - encodedMinTS := string(encodeMVCCTimestamp(opts.MinTimestampHint)) - encodedMaxTS := string(encodeMVCCTimestamp(opts.MaxTimestampHint)) + encodedMinTS := string(p.minTimestamp) + encodedMaxTS := string(p.maxTimestamp) p.options.TableFilter = func(userProps map[string]string) bool { tableMinTS := userProps["crdb.ts.min"] if len(tableMinTS) == 0 { @@ -269,7 +301,7 @@ func (p *pebbleIterator) setOptions(opts IterOptions, durability DurabilityRequi } return encodedMaxTS >= tableMinTS && encodedMinTS <= tableMaxTS } - // We are given an inclusive [MinTimestampHint, MaxTimestampHint]. The + // We are given an inclusive [MinTimestamp, MaxTimestamp]. The // MVCCWAllTimeIntervalCollector has collected the WallTimes and we need // [min, max), i.e., exclusive on the upper bound. // @@ -278,8 +310,8 @@ func (p *pebbleIterator) setOptions(opts IterOptions, durability DurabilityRequi // Pebble-internal performance optimization. pkf := [2]pebble.BlockPropertyFilter{ sstable.NewBlockIntervalFilter(mvccWallTimeIntervalCollector, - uint64(opts.MinTimestampHint.WallTime), - uint64(opts.MaxTimestampHint.WallTime)+1), + uint64(opts.MinTimestamp.WallTime), + uint64(opts.MaxTimestamp.WallTime)+1), } p.options.PointKeyFilters = pkf[:1:2] // NB: We disable range key block filtering because of complications in @@ -919,6 +951,53 @@ func (p *pebbleIterator) getBlockPropertyFilterMask() pebble.BlockPropertyFilter return &p.maskFilter } +func (p *pebbleIterator) skipPointIfOutsideTimeBounds(key []byte) (skip bool) { + if len(key) == 0 { + return false + } + // Last byte is the version length + 1 when there is a version, + // else it is 0. + versionLen := int(key[len(key)-1]) + if versionLen == 0 { + // This is not an MVCC key. + return false + } + // prefixPartEnd points to the sentinel byte, unless this is a bare suffix, in + // which case the index is -1. + prefixPartEnd := len(key) - 1 - versionLen + // Sanity check: the index should be >= -1. Additionally, if the index is >= + // 0, it should point to the sentinel byte, as this is a full EngineKey. If + // the key appears invalid and we don't understand it, don't skip it so the + // iterator will observe it and hopefully propagate an error up the stack. + if prefixPartEnd < -1 || (prefixPartEnd >= 0 && key[prefixPartEnd] != sentinel) { + return false + } + + switch versionLen - 1 { + case engineKeyVersionWallTimeLen, engineKeyVersionWallAndLogicalTimeLen, engineKeyVersionWallLogicalAndSyntheticTimeLen: + // INVARIANT: -1 <= prefixPartEnd < len(b) - 1. + // Version consists of the bytes after the sentinel and before the length. + ts := key[prefixPartEnd+1 : len(key)-1] + // Lexicographic comparison on the encoded timestamps is equivalent to the + // comparison on decoded timestamps, so we avoid the need to decode the + // walltimes by performing simple byte comarisons. + if bytes.Compare(ts, p.minTimestamp) < 0 { + return true + } + if bytes.Compare(ts, p.maxTimestamp) > 0 { + return true + } + // minTimestamp ≤ ts ≤ maxTimestamp + // + // The key's timestamp is within the iterator's configured bounds. + return false + default: + // Not a MVCC key. + return false + } + +} + func (p *pebbleIterator) destroy() { if p.inuse { panic("iterator still in use") diff --git a/pkg/storage/pebble_iterator_test.go b/pkg/storage/pebble_iterator_test.go index 3c6cf8349360..3e7150e74c29 100644 --- a/pkg/storage/pebble_iterator_test.go +++ b/pkg/storage/pebble_iterator_test.go @@ -13,6 +13,8 @@ package storage import ( "bytes" "context" + "encoding/hex" + "fmt" "io/fs" "math/rand" "os" @@ -23,8 +25,11 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/testutils/datapathutils" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/datadriven" "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble" "github.com/cockroachdb/pebble/sstable" @@ -144,3 +149,55 @@ func TestPebbleIterator_ExternalCorruption(t *testing.T) { } it.Close() } + +func TestPebbleIterator_SkipPointIfOutsideTimeBounds(t *testing.T) { + defer leaktest.AfterTest(t)() + + var iter pebbleIterator + var sb strings.Builder + datadriven.RunTest(t, datapathutils.TestDataPath(t, "skip_point_if_outside_time_bounds"), func(t *testing.T, d *datadriven.TestData) string { + sb.Reset() + var minStr, maxStr string + d.ScanArgs(t, "min", &minStr) + d.ScanArgs(t, "max", &maxStr) + min, err := hlc.ParseTimestamp(minStr) + require.NoError(t, err) + max, err := hlc.ParseTimestamp(maxStr) + require.NoError(t, err) + + iter.setOptions(IterOptions{ + LowerBound: []byte{0x00}, // so setOptions doesn't complain + MinTimestamp: min, + MaxTimestamp: max, + }, StandardDurability) + fmt.Fprintf(&sb, "min: 0x%x\nmax: 0x%x\n", iter.minTimestamp, iter.maxTimestamp) + for _, line := range strings.Split(strings.TrimSpace(d.Input), "\n") { + if i := strings.IndexByte(line, '#'); i >= 0 { + line = line[:i] + } + if line == "" { + continue + } + var key []byte + switch { + case strings.HasPrefix(line, "0x:"): + line = line[len("0x:"):] + line = strings.Replace(line, " ", "", -1) + var err error + key, err = hex.DecodeString(line) + if err != nil { + return err.Error() + } + default: + return fmt.Sprintf("unrecognized key format %q", line) + } + fmt.Fprintf(&sb, "%s : ", line) + if iter.skipPointIfOutsideTimeBounds(key) { + fmt.Fprintln(&sb, "skip") + } else { + fmt.Fprintln(&sb, "don't skip") + } + } + return sb.String() + }) +} diff --git a/pkg/storage/pebble_test.go b/pkg/storage/pebble_test.go index 15425c3d120c..1edebb750039 100644 --- a/pkg/storage/pebble_test.go +++ b/pkg/storage/pebble_test.go @@ -757,10 +757,10 @@ func TestPebbleMVCCTimeIntervalCollectorAndFilter(t *testing.T) { expect = append(expect, kv) } iter, err := eng.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - KeyTypes: keyType, - UpperBound: keys.MaxKey, - MinTimestampHint: tc.minTimestamp, - MaxTimestampHint: tc.maxTimestamp, + KeyTypes: keyType, + UpperBound: keys.MaxKey, + MinTimestamp: tc.minTimestamp, + MaxTimestamp: tc.maxTimestamp, }) require.NoError(t, err) defer iter.Close() @@ -830,26 +830,18 @@ func TestPebbleMVCCTimeIntervalWithClears(t *testing.T) { rangeKV("d", "e", 9, MVCCValue{}), }}, "at cleared point": {wallTS(5), wallTS(5), []interface{}{ - pointKV("a", 7, "a7"), - pointKV("a", 3, "a3"), + // a@7 and a@3's timestamps fall outside [5,5]. rangeKV("b", "c", 5, MVCCValue{}), rangeKV("d", "e", 9, MVCCValue{}), }}, - // NB: This reveals a@5 which has been deleted, because the SST block - // containing the point clear does not satisfy the [7-7] filter. "at cleared range": {wallTS(7), wallTS(7), []interface{}{ + // a@5 and a@3's timestamps fall outside [7,7]. pointKV("a", 7, "a7"), - pointKV("a", 5, "a5"), - pointKV("a", 3, "a3"), rangeKV("b", "c", 5, MVCCValue{}), rangeKV("d", "e", 9, MVCCValue{}), }}, - // NB: This reveals a@5 which has been deleted, because the SST block - // containing the point clear does not satisfy the [1-3] filter. Range keys - // are not filtered. "touches lower": {wallTS(1), wallTS(3), []interface{}{ - pointKV("a", 7, "a7"), - pointKV("a", 5, "a5"), + // a@7 and a@5's timestamps fall outside [1,3]. pointKV("a", 3, "a3"), rangeKV("b", "c", 5, MVCCValue{}), rangeKV("d", "e", 9, MVCCValue{}), @@ -878,10 +870,10 @@ func TestPebbleMVCCTimeIntervalWithClears(t *testing.T) { expect = append(expect, kv) } iter, err := eng.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - KeyTypes: keyType, - UpperBound: keys.MaxKey, - MinTimestampHint: tc.minTimestamp, - MaxTimestampHint: tc.maxTimestamp, + KeyTypes: keyType, + UpperBound: keys.MaxKey, + MinTimestamp: tc.minTimestamp, + MaxTimestamp: tc.maxTimestamp, }) if err != nil { t.Fatal(err) @@ -960,10 +952,10 @@ func TestPebbleMVCCTimeIntervalWithRangeClears(t *testing.T) { expect = append(expect, kv) } iter, err := eng.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - KeyTypes: keyType, - UpperBound: keys.MaxKey, - MinTimestampHint: tc.minTimestamp, - MaxTimestampHint: tc.maxTimestamp, + KeyTypes: keyType, + UpperBound: keys.MaxKey, + MinTimestamp: tc.minTimestamp, + MaxTimestamp: tc.maxTimestamp, }) if err != nil { t.Fatal(err) @@ -983,14 +975,15 @@ func TestPebbleTablePropertyFilter(t *testing.T) { defer leaktest.AfterTest(t)() // Set up a static property collector which always writes the same table - // properties [5-7] regardless of the SSTable contents. We keep the default - // block property collects too, which will use the actual SSTable timestamps. + // properties [1-7] regardless of the SSTable contents. We keep the default + // block property collects too, which will use the actual SSTable + // timestamps. overrideOptions := func(cfg *engineConfig) error { cfg.Opts.TablePropertyCollectors = []func() pebble.TablePropertyCollector{ func() pebble.TablePropertyCollector { return &staticTablePropertyCollector{ props: map[string]string{ - "crdb.ts.min": "\x00\x00\x00\x00\x00\x00\x00\x05", // WallTime: 5 + "crdb.ts.min": "\x00\x00\x00\x00\x00\x00\x00\x01", // WallTime: 1 "crdb.ts.max": "\x00\x00\x00\x00\x00\x00\x00\x07", // WallTime: 7 }, } @@ -1010,29 +1003,29 @@ func TestPebbleTablePropertyFilter(t *testing.T) { // Table and block properties now think the SST covers these spans: // // Block properties: [1-7] - // Table properties: [5-7] + // Table properties: [1-7] // // Both must be satisfied in order for the (only) SST to be included. testcases := map[string]struct { minTimestamp int64 maxTimestamp int64 - expectResult bool + expectResult []interface{} }{ - "tableprop lower inclusive": {4, 5, true}, - "tableprop upper inclusive": {7, 8, true}, - "tableprop exact": {5, 7, true}, - "tableprop within": {6, 6, true}, - "tableprop covering": {4, 8, true}, - "tableprop below": {3, 4, false}, - "both above": {8, 9, false}, - "blockprop only": {1, 3, false}, // needs both block and table props + "tableprop lower inclusive": {4, 5, []interface{}(nil)}, + "tableprop upper inclusive": {7, 8, []interface{}{pointKV("b", 7, "b7")}}, + "tableprop exact": {5, 7, []interface{}{pointKV("b", 7, "b7")}}, + "tableprop within": {6, 6, []interface{}(nil)}, + "tableprop covering": {4, 8, []interface{}{pointKV("b", 7, "b7")}}, + "tableprop below": {3, 4, []interface{}(nil)}, + "both above": {8, 9, []interface{}(nil)}, + "blockprop only": {1, 3, []interface{}{pointKV("a", 1, "a1")}}, // needs both block and table props } for name, tc := range testcases { t.Run(name, func(t *testing.T) { iter, err := eng.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ - UpperBound: keys.MaxKey, - MinTimestampHint: hlc.Timestamp{WallTime: tc.minTimestamp}, - MaxTimestampHint: hlc.Timestamp{WallTime: tc.maxTimestamp}, + UpperBound: keys.MaxKey, + MinTimestamp: hlc.Timestamp{WallTime: tc.minTimestamp}, + MaxTimestamp: hlc.Timestamp{WallTime: tc.maxTimestamp}, }) if err != nil { t.Fatal(err) @@ -1040,14 +1033,7 @@ func TestPebbleTablePropertyFilter(t *testing.T) { defer iter.Close() kvs := scanIter(t, iter) - if tc.expectResult { - require.Equal(t, []interface{}{ - pointKV("a", 1, "a1"), - pointKV("b", 7, "b7"), - }, kvs) - } else { - require.Empty(t, kvs) - } + require.Equal(t, tc.expectResult, kvs) }) } } diff --git a/pkg/storage/testdata/mvcc_histories/time_bounds b/pkg/storage/testdata/mvcc_histories/time_bounds new file mode 100644 index 000000000000..0175188149e9 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/time_bounds @@ -0,0 +1,45 @@ +run ok +with k=k1 + put v=k1v1 ts=10,0 + put v=k1v2 ts=20,0 +put k=k2 v=k2v1 ts=5,0 +put k=k3 v=k3v1 ts=5,0 +---- +>> at end: +data: "k1"/20.000000000,0 -> /BYTES/k1v2 +data: "k1"/10.000000000,0 -> /BYTES/k1v1 +data: "k2"/5.000000000,0 -> /BYTES/k2v1 +data: "k3"/5.000000000,0 -> /BYTES/k3v1 + +# With wide bounds, all the keys should be visible. + +run ok +iter_new kind=keys minTimestamp=1,0 maxTimestamp=30,0 +iter_seek_ge k=k1 +iter_next +iter_next +iter_next +---- +iter_seek_ge: "k1"/20.000000000,0=/BYTES/k1v2 +iter_next: "k1"/10.000000000,0=/BYTES/k1v1 +iter_next: "k2"/5.000000000,0=/BYTES/k2v1 +iter_next: "k3"/5.000000000,0=/BYTES/k3v1 + +# With narrow bounds [@10,@10], only 1 key should be visible. Note min and max +# timestamp are both inclusive. + +run ok +iter_new kind=keys minTimestamp=10,0 maxTimestamp=10,0 +iter_seek_ge k=k1 +iter_next +---- +iter_seek_ge: "k1"/10.000000000,0=/BYTES/k1v1 +iter_next: . + +# Nothing visible within [@8,@9]. + +run ok +iter_new kind=keys minTimestamp=8,0 maxTimestamp=9,0 +iter_seek_ge k=k1 +---- +iter_seek_ge: . diff --git a/pkg/storage/testdata/skip_point_if_outside_time_bounds b/pkg/storage/testdata/skip_point_if_outside_time_bounds new file mode 100644 index 000000000000..2b9cc783402a --- /dev/null +++ b/pkg/storage/testdata/skip_point_if_outside_time_bounds @@ -0,0 +1,213 @@ +check min=0,0 max=6000.000000000,0 +# Test a series of 8-byte MVCC keys that all fall within bounds. +# +# user key +# / sentinel byte +# | / encoded 8-byte WallTime timestamp +# | | / timestamp length +# | | | | +0x: FFFF 00 0000000000000000 09 # WallTime=0 +0x: FFFF 00 0000000000000001 09 # WallTime=1 +0x: FFFF 00 00000574fbde5fff 09 # WallTime=-1 +0x: FFFF 00 00000574fbde6000 09 # WallTime= +# +# Test a series of 8-byte MVCC keys that all contain TS > max. +# +0x: FFFF 00 00000574fbde6001 09 # WallTime=+1 +0x: FFFF 00 ffffffffffffffff 09 # WallTime=MaxUint64 +# +# Test a series of 12-byte MVCC keys that all fall within bounds. +# +# user key +# / sentinel byte +# | / 8-byte WallTime timestamp +# | | / 4-byte Logical timestamp +# | | | / timestamp length +# | | | | | +0x: FFFF 00 0000000000000000 00000000 0d # 0,0 +0x: FFFF 00 0000000000000001 00000001 0d # 1,1 +0x: FFFF 00 00000574fbde5fff 00000001 0d # -1,1 +0x: FFFF 00 00000574fbde5fff ffffffff 0d # -1,MaxUint32 +# +# Test a series of 12-byte MVCC keys that all contain TS > max. +# +0x: FFFF 00 00000574fbde6000 00000001 0d # ,1 +0x: FFFF 00 00000574fbde6000 00000005 0d # ,5 +0x: FFFF 00 00000574fbde6000 ffffffff 0d # ,MaxUint32 +0x: FFFF 00 00000574fbde6001 00000001 0d # +1,1 +0x: FFFF 00 ffffffffffffffff ffffffff 0d # MaxUint64,MaxUint32 +# +# Test the same keys as above but with 13-byte MVCC keys (including the +# synthetic bit). +# +# user key +# / sentinel byte +# | / 8-byte WallTime timestamp +# | | / 4-byte Logical timestamp +# | | | / synthetic bit +# | | | | / timestamp length +# | | | | | | +0x: FFFF 00 0000000000000000 00000000 01 0e # 0,0? +0x: FFFF 00 0000000000000001 00000001 01 0e # 1,1? +0x: FFFF 00 00000574fbde5fff 00000001 01 0e # -1,1? +0x: FFFF 00 00000574fbde5fff ffffffff 01 0e # -1,MaxUint32? +# +# Test a series of 13-byte MVCC keys that all contain TS > max. +# +0x: FFFF 00 00000574fbde6000 00000001 01 0e # ,1? +0x: FFFF 00 00000574fbde6000 00000005 01 0e # ,5? +0x: FFFF 00 00000574fbde6000 ffffffff 01 0e # ,MaxUint32? +0x: FFFF 00 00000574fbde6001 00000001 01 0e # +1,1? +0x: FFFF 00 ffffffffffffffff ffffffff 01 0e # MaxUint64,MaxUint32? +---- +min: 0x +max: 0x00000574fbde600001 +FFFF00000000000000000009 : don't skip +FFFF00000000000000000109 : don't skip +FFFF0000000574fbde5fff09 : don't skip +FFFF0000000574fbde600009 : don't skip +FFFF0000000574fbde600109 : skip +FFFF00ffffffffffffffff09 : skip +FFFF000000000000000000000000000d : don't skip +FFFF000000000000000001000000010d : don't skip +FFFF0000000574fbde5fff000000010d : don't skip +FFFF0000000574fbde5fffffffffff0d : don't skip +FFFF0000000574fbde6000000000010d : don't skip +FFFF0000000574fbde6000000000050d : don't skip +FFFF0000000574fbde6000ffffffff0d : skip +FFFF0000000574fbde6001000000010d : skip +FFFF00ffffffffffffffffffffffff0d : skip +FFFF00000000000000000000000000010e : don't skip +FFFF00000000000000000100000001010e : don't skip +FFFF0000000574fbde5fff00000001010e : don't skip +FFFF0000000574fbde5fffffffffff010e : don't skip +FFFF0000000574fbde600000000001010e : don't skip +FFFF0000000574fbde600000000005010e : don't skip +FFFF0000000574fbde6000ffffffff010e : skip +FFFF0000000574fbde600100000001010e : skip +FFFF00ffffffffffffffffffffffff010e : skip + +# +# The next test case is adjusted from the above test case, but this time the max +# has a non-zero logical bit, and so the max itself should be 12 or 13 bit +# timestamp. +# + +check min=0,0 max=6000.000000000,1 +# Test a series of 8-byte MVCC keys that all fall within bounds. +# +# user key +# / sentinel byte +# | / encoded 8-byte WallTime timestamp +# | | / timestamp length +# | | | | +0x: FFFF 00 0000000000000000 09 # WallTime=0 +0x: FFFF 00 0000000000000001 09 # WallTime=1 +0x: FFFF 00 00000574fbde5fff 09 # WallTime=-1 +0x: FFFF 00 00000574fbde6000 09 # WallTime= +# +# Test a series of 8-byte MVCC keys that all contain TS > max. +# +0x: FFFF 00 00000574fbde6001 09 # WallTime=+1 +0x: FFFF 00 ffffffffffffffff 09 # WallTime=MaxUint64 +# +# Test a series of 12-byte MVCC keys that all fall within bounds. +# +# user key +# / sentinel byte +# | / 8-byte WallTime timestamp +# | | / 4-byte Logical timestamp +# | | | / timestamp length +# | | | | | +0x: FFFF 00 0000000000000000 00000000 0d # 0,0 +0x: FFFF 00 0000000000000001 00000001 0d # 1,1 +0x: FFFF 00 00000574fbde5fff 00000001 0d # -1,1 +0x: FFFF 00 00000574fbde5fff ffffffff 0d # -1,MaxUint32 +0x: FFFF 00 00000574fbde6000 00000001 0d # +# +# Test a series of 12-byte MVCC keys that all contain TS > max. +# +0x: FFFF 00 00000574fbde6000 00000002 0d # ,+1 +0x: FFFF 00 00000574fbde6000 00000005 0d # ,5 +0x: FFFF 00 00000574fbde6000 ffffffff 0d # ,MaxUint32 +0x: FFFF 00 00000574fbde6001 00000001 0d # +1,1 +0x: FFFF 00 ffffffffffffffff ffffffff 0d # MaxUint64,MaxUint32 +# +# Test the same keys as above but with 13-byte MVCC keys (including the +# synthetic bit). +# +# user key +# / sentinel byte +# | / 8-byte WallTime timestamp +# | | / 4-byte Logical timestamp +# | | | / synthetic bit +# | | | | / timestamp length +# | | | | | | +0x: FFFF 00 0000000000000000 00000000 01 0e # 0,0? +0x: FFFF 00 0000000000000001 00000001 01 0e # 1,1? +0x: FFFF 00 00000574fbde5fff 00000001 01 0e # -1,1? +0x: FFFF 00 00000574fbde5fff ffffffff 01 0e # -1,MaxUint32? +0x: FFFF 00 00000574fbde6000 00000001 01 0e # ? +# +# Test a series of 13-byte MVCC keys that all contain TS > max. +# +0x: FFFF 00 00000574fbde6000 00000002 01 0e # ,+1 +0x: FFFF 00 00000574fbde6000 00000005 01 0e # ,5? +0x: FFFF 00 00000574fbde6000 ffffffff 01 0e # ,MaxUint32? +0x: FFFF 00 00000574fbde6001 00000001 01 0e # +1,1? +0x: FFFF 00 ffffffffffffffff ffffffff 01 0e # MaxUint64,MaxUint32? +---- +min: 0x +max: 0x00000574fbde60000000000101 +FFFF00000000000000000009 : don't skip +FFFF00000000000000000109 : don't skip +FFFF0000000574fbde5fff09 : don't skip +FFFF0000000574fbde600009 : don't skip +FFFF0000000574fbde600109 : skip +FFFF00ffffffffffffffff09 : skip +FFFF000000000000000000000000000d : don't skip +FFFF000000000000000001000000010d : don't skip +FFFF0000000574fbde5fff000000010d : don't skip +FFFF0000000574fbde5fffffffffff0d : don't skip +FFFF0000000574fbde6000000000010d : don't skip +FFFF0000000574fbde6000000000020d : skip +FFFF0000000574fbde6000000000050d : skip +FFFF0000000574fbde6000ffffffff0d : skip +FFFF0000000574fbde6001000000010d : skip +FFFF00ffffffffffffffffffffffff0d : skip +FFFF00000000000000000000000000010e : don't skip +FFFF00000000000000000100000001010e : don't skip +FFFF0000000574fbde5fff00000001010e : don't skip +FFFF0000000574fbde5fffffffffff010e : don't skip +FFFF0000000574fbde600000000001010e : don't skip +FFFF0000000574fbde600000000002010e : skip +FFFF0000000574fbde600000000005010e : skip +FFFF0000000574fbde6000ffffffff010e : skip +FFFF0000000574fbde600100000001010e : skip +FFFF00ffffffffffffffffffffffff010e : skip + +# Test invalid keys, like keys with lock table suffixes + +check min=10.000000000,0 max=25.000000000,0 +# Test lock table keys. They just shouldn't be skipped, even if a lexiccographic +# comparison alone would've skipped them. +# +# key "prefix" +# / sentinel byte +# | / lock strength transaction UUID timestamp length (17+1) +# | | / / / +0x: FFFF 00 03 000000000000000000000000000000000000 12 +0x: FFFF 00 02 000000000000000000000000000000000000 12 +0x: FFFF 00 01 000000000000000000000000000000000000 12 +0x: FFFF 00 03 ffffffffffffffffffffffffffffffffffff 12 +0x: FFFF 00 02 ffffffffffffffffffffffffffffffffffff 12 +0x: FFFF 00 01 ffffffffffffffffffffffffffffffffffff 12 +---- +min: 0x00000002540be400 +max: 0x00000005d21dba0001 +FFFF000300000000000000000000000000000000000012 : don't skip +FFFF000200000000000000000000000000000000000012 : don't skip +FFFF000100000000000000000000000000000000000012 : don't skip +FFFF0003ffffffffffffffffffffffffffffffffffff12 : don't skip +FFFF0002ffffffffffffffffffffffffffffffffffff12 : don't skip +FFFF0001ffffffffffffffffffffffffffffffffffff12 : don't skip