diff --git a/build/bazelutil/check.sh b/build/bazelutil/check.sh index 07480d202414..94e90156edba 100755 --- a/build/bazelutil/check.sh +++ b/build/bazelutil/check.sh @@ -61,6 +61,8 @@ pkg/cmd/roachtest/BUILD.bazel EXISTING_CRDB_TEST_BUILD_CONSTRAINTS=" pkg/util/buildutil/crdb_test_off.go://go:build !crdb_test || crdb_test_off pkg/util/buildutil/crdb_test_on.go://go:build crdb_test && !crdb_test_off +pkg/storage/pebbleiter/crdb_test_off.go://go:build !crdb_test || crdb_test_off +pkg/storage/pebbleiter/crdb_test_on.go://go:build crdb_test && !crdb_test_off " if [ -z "${COCKROACH_BAZEL_CHECK_FAST:-}" ]; then diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 43d95a490d23..3bded14ce9cd 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -505,6 +505,7 @@ ALL_TESTS = [ "//pkg/storage/enginepb:enginepb_test", "//pkg/storage/fs:fs_test", "//pkg/storage/metamorphic:metamorphic_test", + "//pkg/storage/pebbleiter:pebbleiter_test", "//pkg/storage:storage_test", "//pkg/testutils/docker:docker_test", "//pkg/testutils/floatcmp:floatcmp_test", @@ -1805,6 +1806,8 @@ GO_TARGETS = [ "//pkg/storage/fs:fs_test", "//pkg/storage/metamorphic:metamorphic", "//pkg/storage/metamorphic:metamorphic_test", + "//pkg/storage/pebbleiter:pebbleiter", + "//pkg/storage/pebbleiter:pebbleiter_test", "//pkg/storage:storage", "//pkg/storage:storage_test", "//pkg/streaming:streaming", @@ -2861,6 +2864,7 @@ GET_X_DATA_TARGETS = [ "//pkg/storage/enginepb:get_x_data", "//pkg/storage/fs:get_x_data", "//pkg/storage/metamorphic:get_x_data", + "//pkg/storage/pebbleiter:get_x_data", "//pkg/streaming:get_x_data", "//pkg/testutils:get_x_data", "//pkg/testutils/buildutil:get_x_data", diff --git a/pkg/kv/kvserver/spanset/BUILD.bazel b/pkg/kv/kvserver/spanset/BUILD.bazel index 5e7ecff39184..3451ad4a0d1b 100644 --- a/pkg/kv/kvserver/spanset/BUILD.bazel +++ b/pkg/kv/kvserver/spanset/BUILD.bazel @@ -14,6 +14,7 @@ go_library( "//pkg/keys", "//pkg/roachpb", "//pkg/storage", + "//pkg/storage/pebbleiter", "//pkg/util/hlc", "//pkg/util/log", "//pkg/util/protoutil", diff --git a/pkg/kv/kvserver/spanset/batch.go b/pkg/kv/kvserver/spanset/batch.go index a691a8684c55..79ae0fb68b3e 100644 --- a/pkg/kv/kvserver/spanset/batch.go +++ b/pkg/kv/kvserver/spanset/batch.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/storage/pebbleiter" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -102,21 +103,37 @@ func (i *MVCCIterator) SeekLT(key storage.MVCCKey) { // Next is part of the storage.MVCCIterator interface. func (i *MVCCIterator) Next() { i.i.Next() - i.checkAllowed(roachpb.Span{Key: i.UnsafeKey().Key}, false) + i.checkAllowedCurrPosForward(false) } // Prev is part of the storage.MVCCIterator interface. func (i *MVCCIterator) Prev() { i.i.Prev() - i.checkAllowed(roachpb.Span{Key: i.UnsafeKey().Key}, false) + i.checkAllowedCurrPosForward(false) } // NextKey is part of the storage.MVCCIterator interface. func (i *MVCCIterator) NextKey() { i.i.NextKey() - i.checkAllowed(roachpb.Span{Key: i.UnsafeKey().Key}, false) + i.checkAllowedCurrPosForward(false) } +// checkAllowedCurrPosForward checks the span starting at the current iterator +// position, if the current iterator position is valid. +func (i *MVCCIterator) checkAllowedCurrPosForward(errIfDisallowed bool) { + i.invalid = false + i.err = nil + if ok, _ := i.i.Valid(); !ok { + // If the iterator is invalid after the operation, there's nothing to + // check. We allow uses of iterators to exceed the declared span bounds + // as long as the iterator itself is configured with proper boundaries. + return + } + i.checkAllowedValidPos(roachpb.Span{Key: i.UnsafeKey().Key}, errIfDisallowed) +} + +// checkAllowed checks the provided span if the current iterator position is +// valid. func (i *MVCCIterator) checkAllowed(span roachpb.Span, errIfDisallowed bool) { i.invalid = false i.err = nil @@ -126,6 +143,10 @@ func (i *MVCCIterator) checkAllowed(span roachpb.Span, errIfDisallowed bool) { // as long as the iterator itself is configured with proper boundaries. return } + i.checkAllowedValidPos(span, errIfDisallowed) +} + +func (i *MVCCIterator) checkAllowedValidPos(span roachpb.Span, errIfDisallowed bool) { var err error if i.spansOnly { err = i.spans.CheckAllowed(SpanReadOnly, span) @@ -392,7 +413,7 @@ func (i *EngineIterator) UnsafeRawEngineKey() []byte { } // GetRawIter is part of the storage.EngineIterator interface. -func (i *EngineIterator) GetRawIter() *pebble.Iterator { +func (i *EngineIterator) GetRawIter() pebbleiter.Iterator { return i.i.GetRawIter() } diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel index 7abeb8c70cfd..1d7ad11180c1 100644 --- a/pkg/storage/BUILD.bazel +++ b/pkg/storage/BUILD.bazel @@ -61,6 +61,7 @@ go_library( "//pkg/settings/cluster", "//pkg/storage/enginepb", "//pkg/storage/fs", + "//pkg/storage/pebbleiter", "//pkg/util", "//pkg/util/admission", "//pkg/util/bufalloc", diff --git a/pkg/storage/disk_map.go b/pkg/storage/disk_map.go index 7898acebc876..77a58eb8fa64 100644 --- a/pkg/storage/disk_map.go +++ b/pkg/storage/disk_map.go @@ -16,6 +16,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/diskmap" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/storage/pebbleiter" "github.com/cockroachdb/cockroach/pkg/util/encoding" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/errors" @@ -52,7 +53,7 @@ type pebbleMapBatchWriter struct { // pebbleMapIterator iterates over the keys of a pebbleMap in sorted order. type pebbleMapIterator struct { allowDuplicates bool - iter *pebble.Iterator + iter pebbleiter.Iterator // makeKey is a function that transforms a key into a byte slice with a prefix // used to SeekGE() the underlying iterator. makeKey func(k []byte) []byte @@ -114,9 +115,9 @@ func (r *pebbleMap) makeKeyWithSequence(k []byte) []byte { func (r *pebbleMap) NewIterator() diskmap.SortedDiskMapIterator { return &pebbleMapIterator{ allowDuplicates: r.allowDuplicates, - iter: r.store.NewIter(&pebble.IterOptions{ + iter: pebbleiter.MaybeWrap(r.store.NewIter(&pebble.IterOptions{ UpperBound: roachpb.Key(r.prefix).PrefixEnd(), - }), + })), makeKey: r.makeKey, prefix: r.prefix, } diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 29363d159e89..32eb00227095 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/storage/fs" + "github.com/cockroachdb/cockroach/pkg/storage/pebbleiter" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/iterutil" @@ -340,7 +341,7 @@ type EngineIterator interface { Value() []byte // GetRawIter is a low-level method only for use in the storage package, // that returns the underlying pebble Iterator. - GetRawIter() *pebble.Iterator + GetRawIter() pebbleiter.Iterator // SeekEngineKeyGEWithLimit is similar to SeekEngineKeyGE, but takes an // additional exclusive upper limit parameter. The limit is semantically // best-effort, and is an optimization to avoid O(n^2) iteration behavior in @@ -1551,17 +1552,7 @@ func assertSimpleMVCCIteratorInvariants(iter SimpleMVCCIterator) error { rangeKey, value.Value.RawBytes) } } - - } else { - // Bounds and range keys must be empty. - if bounds := iter.RangeBounds(); !bounds.Equal(roachpb.Span{}) { - return errors.AssertionFailedf("hasRange=false but RangeBounds=%s", bounds) - } - if r := iter.RangeKeys(); !r.IsEmpty() || !r.Bounds.Equal(roachpb.Span{}) { - return errors.AssertionFailedf("hasRange=false but RangeKeys=%s", r) - } } - return nil } diff --git a/pkg/storage/engine_test.go b/pkg/storage/engine_test.go index a4d13cf9aff8..f033cbb311e2 100644 --- a/pkg/storage/engine_test.go +++ b/pkg/storage/engine_test.go @@ -2171,10 +2171,6 @@ func TestEngineRangeKeysUnsupported(t *testing.T) { hasPoint, hasRange := iter.HasPointAndRange() require.True(t, hasPoint) require.False(t, hasRange) - rangeBounds, err := iter.EngineRangeBounds() - require.NoError(t, err) - require.Empty(t, rangeBounds) - require.Empty(t, iter.EngineRangeKeys()) // Exhaust the iterator. ok, err = iter.NextEngineKey() diff --git a/pkg/storage/mvcc_history_metamorphic_iterator_test.go b/pkg/storage/mvcc_history_metamorphic_iterator_test.go index f5d99eaba741..5cb730ed10d7 100644 --- a/pkg/storage/mvcc_history_metamorphic_iterator_test.go +++ b/pkg/storage/mvcc_history_metamorphic_iterator_test.go @@ -172,7 +172,7 @@ func (m *metamorphicIterator) moveAround() { } hasPoint, _ := m.it.HasPointAndRange() - rangeKeys := m.it.RangeKeys().Clone() + rangeKeys := rangeKeysIfExist(m.it).Clone() var rangeKeysIgnoringTime storage.MVCCRangeKeyStack if iit != nil { rangeKeysIgnoringTime = iit.RangeKeysIgnoringTime() @@ -224,7 +224,7 @@ func (m *metamorphicIterator) moveAround() { if m.it.UnsafeKey().Equal(cur) { break // made it } - printfln("step: %s %s [changed=%t]", m.it.UnsafeKey(), m.it.RangeKeys(), m.it.RangeKeyChanged()) + printfln("step: %s %s [changed=%t]", m.it.UnsafeKey(), rangeKeysIfExist(m.it), m.it.RangeKeyChanged()) if iit != nil { // If we're an incremental iterator with time bounds, and `cur` is not within bounds, // would miss it if we used Next. So call NextIgnoringTime unconditionally. @@ -247,7 +247,7 @@ func (m *metamorphicIterator) moveAround() { valid, err := m.it.Valid() require.Nil(m.t, err) require.True(m.t, valid, "unable to recover original position following SeekLT") - printfln("rev-step: %s %s [changed=%t]", m.it.UnsafeKey(), m.it.RangeKeys(), m.it.RangeKeyChanged()) + printfln("rev-step: %s %s [changed=%t]", m.it.UnsafeKey(), rangeKeysIfExist(m.it), m.it.RangeKeyChanged()) if m.it.UnsafeKey().Equal(cur) { printfln("done") break // made it @@ -268,13 +268,13 @@ func (m *metamorphicIterator) moveAround() { rangeKeysIgnoringTime2 = iit.RangeKeysIgnoringTime() } printfln("recovered position: %s hasPoint=%t, rangeKeys=%s, rangeKeysIgnoringTime=%s", - m.it.UnsafeKey(), hasPoint2, m.it.RangeKeys(), rangeKeysIgnoringTime2) + m.it.UnsafeKey(), hasPoint2, rangeKeysIfExist(m.it), rangeKeysIgnoringTime2) } // Back where we started and hopefully in an indistinguishable state. // When the stack is empty, sometimes it's a nil slice and sometimes zero // slice. A similar problem exists with MVCCRangeKeyVersion.Value. Sidestep // them by comparing strings. - require.Equal(m.t, fmt.Sprint(rangeKeys), fmt.Sprint(m.it.RangeKeys())) + require.Equal(m.t, fmt.Sprint(rangeKeys), fmt.Sprint(rangeKeysIfExist(m.it))) if iit != nil { require.Equal(m.t, fmt.Sprint(rangeKeysIgnoringTime), fmt.Sprint(iit.RangeKeysIgnoringTime())) } diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index f090fb37f118..4f63fc87fa38 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -1868,9 +1868,18 @@ func printIter(e *evalCtx) { } } +func rangeKeysIfExist(it storage.SimpleMVCCIterator) storage.MVCCRangeKeyStack { + if valid, err := it.Valid(); !valid || err != nil { + return storage.MVCCRangeKeyStack{} + } else if _, hasRange := it.HasPointAndRange(); !hasRange { + return storage.MVCCRangeKeyStack{} + } + return it.RangeKeys() +} + func checkAndUpdateRangeKeyChanged(e *evalCtx) bool { rangeKeyChanged := e.iter.RangeKeyChanged() - rangeKeys := e.iter.RangeKeys() + rangeKeys := rangeKeysIfExist(e.iter) if incrIter := e.tryMVCCIncrementalIter(); incrIter != nil { // For MVCCIncrementalIterator, make sure RangeKeyChangedIgnoringTime() fires diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 9ecbf23ba47a..fd3c3df52691 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -34,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/storage/fs" + "github.com/cockroachdb/cockroach/pkg/storage/pebbleiter" "github.com/cockroachdb/cockroach/pkg/util/buildutil" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" @@ -1912,7 +1913,7 @@ type pebbleReadOnly struct { prefixEngineIter pebbleIterator normalEngineIter pebbleIterator - iter *pebble.Iterator + iter pebbleiter.Iterator iterUsed bool // avoids cloning after PinEngineStateForIterators() durability DurabilityRequirement closed bool @@ -2086,7 +2087,7 @@ func (p *pebbleReadOnly) PinEngineStateForIterators() error { if p.durability == GuaranteedDurability { o = &pebble.IterOptions{OnlyReadGuaranteedDurable: true} } - p.iter = p.parent.db.NewIter(o) + p.iter = pebbleiter.MaybeWrap(p.parent.db.NewIter(o)) // NB: p.iterUsed == false avoids cloning this in NewMVCCIterator(), since // we've just created it. } diff --git a/pkg/storage/pebble_batch.go b/pkg/storage/pebble_batch.go index 8febdc3a5d7f..8f6f7c79b63d 100644 --- a/pkg/storage/pebble_batch.go +++ b/pkg/storage/pebble_batch.go @@ -16,6 +16,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/storage/pebbleiter" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble" @@ -45,7 +46,7 @@ type pebbleBatch struct { prefixEngineIter pebbleIterator normalEngineIter pebbleIterator - iter *pebble.Iterator + iter pebbleiter.Iterator iterUsed bool // avoids cloning after PinEngineStateForIterators() writeOnly bool closed bool @@ -243,9 +244,9 @@ func (p *pebbleBatch) SupportsRangeKeys() bool { func (p *pebbleBatch) PinEngineStateForIterators() error { if p.iter == nil { if p.batch.Indexed() { - p.iter = p.batch.NewIter(nil) + p.iter = pebbleiter.MaybeWrap(p.batch.NewIter(nil)) } else { - p.iter = p.db.NewIter(nil) + p.iter = pebbleiter.MaybeWrap(p.db.NewIter(nil)) } // NB: p.iterUsed == false avoids cloning this in NewMVCCIterator(). We've // just created it, so cloning it would just be overhead. diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index 4d2c3f38c91f..9645a58d0a3a 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/storage/pebbleiter" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -31,7 +32,7 @@ import ( // should only be used in one of the two modes. type pebbleIterator struct { // Underlying iterator for the DB. - iter *pebble.Iterator + iter pebbleiter.Iterator options pebble.IterOptions // Reusable buffer for MVCCKey or EngineKey encoding. keyBuf []byte @@ -91,20 +92,22 @@ func newPebbleIterator( p := pebbleIterPool.Get().(*pebbleIterator) p.reusable = false // defensive p.init(nil, opts, durability, supportsRangeKeys) - p.iter = handle.NewIter(&p.options) + p.iter = pebbleiter.MaybeWrap(handle.NewIter(&p.options)) return p } // newPebbleIteratorByCloning creates a new Pebble iterator by cloning the given // iterator and reconfiguring it. func newPebbleIteratorByCloning( - iter *pebble.Iterator, opts IterOptions, durability DurabilityRequirement, supportsRangeKeys bool, + iter pebbleiter.Iterator, + opts IterOptions, + durability DurabilityRequirement, + supportsRangeKeys bool, ) *pebbleIterator { - var err error p := pebbleIterPool.Get().(*pebbleIterator) p.reusable = false // defensive p.init(nil, opts, durability, supportsRangeKeys) - p.iter, err = iter.Clone(pebble.CloneOptions{ + iter, err := iter.Clone(pebble.CloneOptions{ IterOptions: &p.options, RefreshBatchView: true, }) @@ -112,6 +115,7 @@ func newPebbleIteratorByCloning( p.Close() panic(err) } + p.iter = iter return p } @@ -128,11 +132,12 @@ func newPebbleSSTIterator( externalIterOpts = append(externalIterOpts, pebble.ExternalIterForwardOnly{}) } - var err error - if p.iter, err = pebble.NewExternalIter(DefaultPebbleOptions(), &p.options, files, externalIterOpts...); err != nil { + iter, err := pebble.NewExternalIter(DefaultPebbleOptions(), &p.options, files, externalIterOpts...) + if err != nil { p.Close() return nil, err } + p.iter = pebbleiter.MaybeWrap(iter) p.external = true return p, nil } @@ -141,7 +146,10 @@ func newPebbleSSTIterator( // reconfiguring the given iter. It is valid to pass a nil iter and then create // p.iter using p.options, to avoid redundant reconfiguration via SetOptions(). func (p *pebbleIterator) init( - iter *pebble.Iterator, opts IterOptions, durability DurabilityRequirement, supportsRangeKeys bool, + iter pebbleiter.Iterator, + opts IterOptions, + durability DurabilityRequirement, + supportsRangeKeys bool, ) { *p = pebbleIterator{ iter: iter, @@ -164,7 +172,7 @@ func (p *pebbleIterator) init( // 3. iter == nil: create a new iterator from handle. func (p *pebbleIterator) initReuseOrCreate( handle pebble.Reader, - iter *pebble.Iterator, + iter pebbleiter.Iterator, clone bool, opts IterOptions, durability DurabilityRequirement, @@ -177,7 +185,7 @@ func (p *pebbleIterator) initReuseOrCreate( p.init(nil, opts, durability, supportsRangeKeys) if iter == nil { - p.iter = handle.NewIter(&p.options) + p.iter = pebbleiter.MaybeWrap(handle.NewIter(&p.options)) } else if clone { var err error p.iter, err = iter.Clone(pebble.CloneOptions{ @@ -917,7 +925,7 @@ func (p *pebbleIterator) IsPrefix() bool { } // GetRawIter is part of the EngineIterator interface. -func (p *pebbleIterator) GetRawIter() *pebble.Iterator { +func (p *pebbleIterator) GetRawIter() pebbleiter.Iterator { return p.iter } diff --git a/pkg/storage/pebbleiter/BUILD.bazel b/pkg/storage/pebbleiter/BUILD.bazel new file mode 100644 index 000000000000..ab46ad360509 --- /dev/null +++ b/pkg/storage/pebbleiter/BUILD.bazel @@ -0,0 +1,47 @@ +load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +# gazelle:exclude gen-crdb_test_off.go +# gazelle:exclude gen-crdb_test_on.go + +# keep +go_library( + name = "pebbleiter", + srcs = select({ + "//build/toolchains:crdb_test": [":gen-crdb-test-on"], + "//conditions:default": [":gen-crdb-test-off"], + }), + importpath = "github.com/cockroachdb/cockroach/pkg/storage/pebbleiter", + visibility = ["//visibility:public"], + deps = [ + "//pkg/util", + "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_pebble//:pebble", + ], +) + +REMOVE_GO_BUILD_CONSTRAINTS = "cat $< | grep -v '//go:build' | grep -v '// +build' > $@" + +genrule( + name = "gen-crdb-test-on", + srcs = ["crdb_test_on.go"], + outs = ["gen-crdb_test_on.go"], + cmd = REMOVE_GO_BUILD_CONSTRAINTS, +) + +genrule( + name = "gen-crdb-test-off", + srcs = ["crdb_test_off.go"], + outs = ["gen-crdb_test_off.go"], + cmd = REMOVE_GO_BUILD_CONSTRAINTS, +) + +go_test( + name = "pebbleiter_test", + srcs = ["crdb_test_test.go"], + args = ["-test.timeout=295s"], + embed = [":pebbleiter"], # keep + deps = ["@com_github_stretchr_testify//require"], +) + +get_x_data(name = "get_x_data") diff --git a/pkg/storage/pebbleiter/crdb_test_off.go b/pkg/storage/pebbleiter/crdb_test_off.go new file mode 100644 index 000000000000..295117ac18e0 --- /dev/null +++ b/pkg/storage/pebbleiter/crdb_test_off.go @@ -0,0 +1,26 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +//go:build !crdb_test || crdb_test_off +// +build !crdb_test crdb_test_off + +package pebbleiter + +import "github.com/cockroachdb/pebble" + +// Iterator redefines *pebble.Iterator. +type Iterator = *pebble.Iterator + +// MaybeWrap returns the provided Iterator, unmodified. +// +//gcassert:inline +func MaybeWrap(iter *pebble.Iterator) Iterator { + return iter +} diff --git a/pkg/storage/pebbleiter/crdb_test_on.go b/pkg/storage/pebbleiter/crdb_test_on.go new file mode 100644 index 000000000000..2c1104f30ec6 --- /dev/null +++ b/pkg/storage/pebbleiter/crdb_test_on.go @@ -0,0 +1,330 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +//go:build crdb_test && !crdb_test_off +// +build crdb_test,!crdb_test_off + +package pebbleiter + +import ( + "math/rand" + "time" + + "github.com/cockroachdb/cockroach/pkg/util" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble" +) + +// Iterator wraps the *pebble.Iterator in crdb_test builds with an assertionIter +// that detects when Close is called on the iterator twice. Double closes are +// problematic because they can result in an iterator being added to a sync pool +// twice, allowing concurrent use of the same iterator struct. +type Iterator = *assertionIter + +// MaybeWrap returns the provided Pebble iterator, wrapped with double close +// detection. +func MaybeWrap(iter *pebble.Iterator) Iterator { + return &assertionIter{Iterator: iter, closedCh: make(chan struct{})} +} + +// assertionIter wraps a *pebble.Iterator with assertion checking. +type assertionIter struct { + *pebble.Iterator + closed bool + closedCh chan struct{} + // unsafeBufs hold buffers used for returning values with short lifetimes to + // the caller. To assert that the client is respecting the lifetimes, + // assertionIter mangles the buffers as soon as the associated lifetime + // expires. This is the same technique applied by the unsafeMVCCIterator in + // pkg/storage, but this time applied at the API boundary between + // pkg/storage and Pebble. + // + // unsafeBufs holds two buffers per-key type and an index indicating which + // are currently in use. This is used to randomly switch to a different + // buffer, ensuring that the buffer(s) returned to the caller for the + // previous iterator position are garbage (as opposed to just state + // corresponding to the current iterator position). + unsafeBufs struct { + idx int + key [2][]byte + val [2][]byte + } + rangeKeyBufs struct { + idx int + bufs [2]rangeKeyBuf + } +} + +type rangeKeyBuf struct { + start []byte + end []byte + keys []pebble.RangeKeyData + + // bgCh is used in race builds to synchronize with a separate goroutine + // performing background buffer mangling. If non-nil, a separate mangling + // goroutine is active and periodically mangling this buffer. When the + // buffer is next used, maybeSaveAndMangleRangeKeyBufs performs a + // synchronous send to the channel to signal that the mangling goroutine + // should exit. + bgCh chan struct{} +} + +func (b *rangeKeyBuf) mangle() { + zero(b.start) + zero(b.end) + for k := range b.keys { + zero(b.keys[k].Suffix) + zero(b.keys[k].Value) + } +} + +func (i *assertionIter) Clone(cloneOpts pebble.CloneOptions) (Iterator, error) { + iter, err := i.Iterator.Clone(cloneOpts) + if err != nil { + return nil, err + } + return MaybeWrap(iter), nil +} + +func (i *assertionIter) Close() error { + if i.closed { + panic(errors.AssertionFailedf("pebble.Iterator already closed")) + } + i.closed = true + close(i.closedCh) + return i.Iterator.Close() +} + +func (i *assertionIter) Key() []byte { + if !i.Valid() { + panic(errors.AssertionFailedf("Key() called on !Valid() pebble.Iterator")) + } + idx := i.unsafeBufs.idx + copyWithMatchingNilness(&i.unsafeBufs.key[idx], i.Iterator.Key()) + return i.unsafeBufs.key[idx] +} + +func (i *assertionIter) Value() []byte { + if !i.Valid() { + panic(errors.AssertionFailedf("Value() called on !Valid() pebble.Iterator")) + } + idx := i.unsafeBufs.idx + copyWithMatchingNilness(&i.unsafeBufs.val[idx], i.Iterator.Value()) + return i.unsafeBufs.val[idx] +} + +func (i *assertionIter) RangeBounds() ([]byte, []byte) { + if !i.Valid() { + panic(errors.AssertionFailedf("RangeBounds() called on !Valid() pebble.Iterator")) + } + if _, hasRange := i.Iterator.HasPointAndRange(); !hasRange { + panic(errors.AssertionFailedf("RangeBounds() called on pebble.Iterator without range keys")) + } + // See maybeSaveAndMangleRangeKeyBufs for where these are saved. + j := i.rangeKeyBufs.idx + return i.rangeKeyBufs.bufs[j].start, i.rangeKeyBufs.bufs[j].end +} + +func (i *assertionIter) RangeKeys() []pebble.RangeKeyData { + if !i.Valid() { + panic(errors.AssertionFailedf("RangeKeys() called on !Valid() pebble.Iterator")) + } + if _, hasRange := i.Iterator.HasPointAndRange(); !hasRange { + panic(errors.AssertionFailedf("RangeKeys() called on pebble.Iterator without range keys")) + } + // See maybeSaveAndMangleRangeKeyBufs for where these are saved. + return i.rangeKeyBufs.bufs[i.rangeKeyBufs.idx].keys +} + +func (i *assertionIter) First() bool { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.First() +} + +func (i *assertionIter) SeekGE(key []byte) bool { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.SeekGE(key) +} + +func (i *assertionIter) SeekGEWithLimit(key []byte, limit []byte) pebble.IterValidityState { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.SeekGEWithLimit(key, limit) +} + +func (i *assertionIter) SeekPrefixGE(key []byte) bool { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.SeekPrefixGE(key) +} + +func (i *assertionIter) Next() bool { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.Next() +} + +func (i *assertionIter) NextWithLimit(limit []byte) pebble.IterValidityState { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.NextWithLimit(limit) +} + +func (i *assertionIter) Last() bool { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.Last() +} + +func (i *assertionIter) SeekLT(key []byte) bool { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.SeekLT(key) +} + +func (i *assertionIter) SeekLTWithLimit(key []byte, limit []byte) pebble.IterValidityState { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.SeekLTWithLimit(key, limit) +} + +func (i *assertionIter) Prev() bool { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.Prev() +} + +func (i *assertionIter) PrevWithLimit(limit []byte) pebble.IterValidityState { + i.maybeMangleBufs() + defer i.maybeSaveAndMangleRangeKeyBufs() + return i.Iterator.PrevWithLimit(limit) +} + +// maybeMangleBufs trashes the contents of buffers used to return unsafe values +// to the caller. This is used to ensure that the client respects the Pebble +// iterator interface and the lifetimes of buffers it returns. +func (i *assertionIter) maybeMangleBufs() { + if rand.Intn(2) == 0 { + idx := i.unsafeBufs.idx + zero(i.unsafeBufs.key[idx]) + zero(i.unsafeBufs.val[idx]) + if rand.Intn(2) == 0 { + // Switch to a new buffer for the next iterator position. + i.unsafeBufs.idx = (i.unsafeBufs.idx + 1) % 2 + } + } +} + +// maybeSaveAndMangleRangeKeyBufs is invoked at the end of every iterator +// operation. It saves the range keys to buffers owned by `assertionIter` and +// with random probability mangles any buffers previously returned to the user. +func (i *assertionIter) maybeSaveAndMangleRangeKeyBufs() { + // If RangeKeyChanged()=false, the pebble.Iterator contract guarantees that + // any buffers previously returned through RangeBounds() and RangeKeys() are + // still valid. + // + // NB: Only permitted to call RangeKeyChanged() if Valid(). + valid := i.Iterator.Valid() + if valid && !i.Iterator.RangeKeyChanged() { + return + } + // INVARIANT: !Valid() || RangeKeyChanged() + + // The previous range key buffers are no longer guaranteed to be stable. + // Randomly zero them to ensure we catch bugs where they're reused. + idx := i.rangeKeyBufs.idx + mangleBuf := &i.rangeKeyBufs.bufs[idx] + if rand.Intn(2) == 0 { + mangleBuf.mangle() + } + // If the new iterator position has range keys, copy them to our buffers. + if !valid { + return + } + if _, hasRange := i.Iterator.HasPointAndRange(); !hasRange { + return + } + switchBuffers := rand.Intn(2) == 0 + if switchBuffers { + // Switch to a new buffer for the new range key state. + i.rangeKeyBufs.idx = (idx + 1) % 2 + + // In race builds, mangle the old buffer from another goroutine. This is + // nice because the race detector can tell us where the improper read is + // originating. Otherwise, we're relying on the improper read + // manifesting a test assertion failure, which may be far from the + // problematic access of an unsafe buffer. + if util.RaceEnabled { + ch := make(chan struct{}) + mangleBuf.bgCh = ch + ticker := time.NewTicker(5 * time.Millisecond) + go func() { + defer ticker.Stop() + mangleBuf.mangle() + for { + select { + case <-i.closedCh: + return + case <-ch: + return + case <-ticker.C: + mangleBuf.mangle() + } + } + }() + + // If the buffer we're about to use is being mangled in the + // background, synchronize with the goroutine doing the mangling by + // sending to its channel. After the synchronous channel send, we're + // guaranteed the goroutine will not mangle the buffer again and + // we're free to use it. + if prevMangleBuf := &i.rangeKeyBufs.bufs[i.rangeKeyBufs.idx]; prevMangleBuf.bgCh != nil { + prevMangleBuf.bgCh <- struct{}{} + prevMangleBuf.bgCh = nil + } + } + } + start, end := i.Iterator.RangeBounds() + rangeKeys := i.Iterator.RangeKeys() + + buf := &i.rangeKeyBufs.bufs[i.rangeKeyBufs.idx] + buf.start = append(buf.start[:0], start...) + buf.end = append(buf.end[:0], end...) + if len(rangeKeys) > cap(buf.keys) { + buf.keys = make([]pebble.RangeKeyData, len(rangeKeys)) + } else { + buf.keys = buf.keys[:len(rangeKeys)] + } + for k := range rangeKeys { + // Preserve nil-ness returned by Pebble to ensure we're testing exactly + // what Pebble will return in production. + copyWithMatchingNilness(&buf.keys[k].Suffix, rangeKeys[k].Suffix) + copyWithMatchingNilness(&buf.keys[k].Value, rangeKeys[k].Value) + } +} + +func zero(b []byte) { + for i := range b { + b[i] = 0x00 + } +} + +func copyWithMatchingNilness(dst *[]byte, src []byte) { + if src == nil { + *dst = nil + return + } + if *dst == nil { + *dst = make([]byte, 0, len(src)) + } + *dst = append((*dst)[:0], src...) +} diff --git a/pkg/storage/pebbleiter/crdb_test_test.go b/pkg/storage/pebbleiter/crdb_test_test.go new file mode 100644 index 000000000000..374a17a887f1 --- /dev/null +++ b/pkg/storage/pebbleiter/crdb_test_test.go @@ -0,0 +1,23 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package pebbleiter + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPebbleIterWrapped(t *testing.T) { + // Sanity-check: make sure CrdbTestBuild is set. This should be true for + // any test. + require.NotNil(t, MaybeWrap(nil)) +} diff --git a/pkg/storage/pebbleiter/pebbleiter.go b/pkg/storage/pebbleiter/pebbleiter.go new file mode 100644 index 000000000000..7da71829ec11 --- /dev/null +++ b/pkg/storage/pebbleiter/pebbleiter.go @@ -0,0 +1,15 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +// Package pebbleiter exposes a type that selectively wraps a Pebble Iterator +// only in crdb_test builds. This wrapper type performs assertions in test-only +// builds. In non-test-only builds, pebbleiter exposes the pebble.Iterator +// directly. +package pebbleiter diff --git a/pkg/storage/point_synthesizing_iter.go b/pkg/storage/point_synthesizing_iter.go index 00dfc2d67233..75810930a1ca 100644 --- a/pkg/storage/point_synthesizing_iter.go +++ b/pkg/storage/point_synthesizing_iter.go @@ -288,17 +288,11 @@ func (i *PointSynthesizingIter) updateRangeKeys() { // last call to updateRangeKeys(). if i.rangeKeyChanged { i.rangeKeyChanged = false - if i.prefix { - // A prefix iterator will always be at the start bound of the range key, - // and never move onto a different range key, so we can omit the cloning. - i.rangeKeys = i.iter.RangeKeys().Versions - } else { - rangeKeys := i.iter.RangeKeys() - i.rangeKeysStart = append(i.rangeKeysStart[:0], rangeKeys.Bounds.Key...) - rangeKeys.Versions.CloneInto(&i.rangeKeysBuf) - i.rangeKeys = i.rangeKeysBuf - i.rangeKeysStartPassed = false // we'll compare below - } + rangeKeys := i.iter.RangeKeys() + i.rangeKeysStart = append(i.rangeKeysStart[:0], rangeKeys.Bounds.Key...) + rangeKeys.Versions.CloneInto(&i.rangeKeysBuf) + i.rangeKeys = i.rangeKeysBuf + i.rangeKeysStartPassed = false // we'll compare below } // Update the current iterator position. @@ -899,19 +893,12 @@ func (i *PointSynthesizingIter) assertInvariants() error { } // rangeKeysStart must be set, and rangeKeysPos must be at or after it. - // prefix iterators do not set rangeKeysStart. - if !i.prefix { - if len(i.rangeKeysStart) == 0 { - return errors.AssertionFailedf("no rangeKeysStart at %s", i.iter.UnsafeKey()) - } - if i.rangeKeysPos.Compare(i.rangeKeysStart) < 0 { - return errors.AssertionFailedf("rangeKeysPos %s not after rangeKeysStart %s", - i.rangeKeysPos, i.rangeKeysStart) - } - } else { - if len(i.rangeKeysStart) != 0 { - return errors.AssertionFailedf("rangeKeysStart set to %s for prefix iterator", i.rangeKeysStart) - } + if len(i.rangeKeysStart) == 0 { + return errors.AssertionFailedf("no rangeKeysStart at %s", i.iter.UnsafeKey()) + } + if i.rangeKeysPos.Compare(i.rangeKeysStart) < 0 { + return errors.AssertionFailedf("rangeKeysPos %s not after rangeKeysStart %s", + i.rangeKeysPos, i.rangeKeysStart) } // rangeKeysStartPassed must match rangeKeysPos and rangeKeysStart. Note that diff --git a/pkg/storage/sst.go b/pkg/storage/sst.go index a720ab2bb337..6a6d829d23ff 100644 --- a/pkg/storage/sst.go +++ b/pkg/storage/sst.go @@ -609,6 +609,11 @@ func CheckSSTConflicts( statsDiff.Add(updateStatsOnRangeKeyMerge(sstRangeKeys.Bounds.Key, sstRangeKeys.Versions)) } extIter.SeekGE(savedExtKey) + // After seeking, the old buffers have been invalidated. + // Re-retrieve the buffers. + if extHasRange { + extRangeKeys = extIter.RangeKeys() + } } if extRangeKeysChanged && !sstPrevRangeKeys.IsEmpty() && sstPrevRangeKeys.Bounds.Overlaps(extRangeKeys.Bounds) { // Because we always re-seek the extIter after every sstIter step, diff --git a/pkg/storage/testdata/mvcc_histories/range_tombstone_gets b/pkg/storage/testdata/mvcc_histories/range_tombstone_gets index 367386d52afa..96c533fa2ceb 100644 --- a/pkg/storage/testdata/mvcc_histories/range_tombstone_gets +++ b/pkg/storage/testdata/mvcc_histories/range_tombstone_gets @@ -3,13 +3,13 @@ # Sets up the following dataset, where x is tombstone, o-o is range tombstone, [] is intent. # # T -# 6 [e6] +# 6 [e6] o6 # 5 f5 # 4 o-----------------------o o-------o [j-l)@4 has localTs=3 # 3 x d3 f3 -# 2 o---------------o h2 +# 2 o---------------o h2 o--------o # 1 a1 x c1 f1 -# a b c d e f g h i j k l +# a b c d e f g h i j k l m n o p # run ok put k=a ts=1 v=a1 @@ -21,12 +21,14 @@ del k=a ts=3 put k=d ts=3 v=d3 put k=f ts=3 v=f3 put k=h ts=2 v=h2 +del_range_ts k=n end=p ts=2 del_range_ts k=c end=i ts=4 put k=f ts=5 v=f5 del_range_ts k=j end=l ts=4 localTs=3 with t=A txn_begin k=e ts=6 put k=e v=e6 +put k=o ts=6 v=o6 ---- del: "b": found key false del: "a": found key false @@ -36,6 +38,7 @@ rangekey: {a-c}/[2.000000000,0=/] rangekey: {c-e}/[4.000000000,0=/ 2.000000000,0=/] rangekey: {e-i}/[4.000000000,0=/] rangekey: {j-l}/[4.000000000,0={localTs=3.000000000,0}/] +rangekey: {n-p}/[2.000000000,0=/] data: "a"/3.000000000,0 -> / data: "a"/1.000000000,0 -> /BYTES/a1 data: "b"/1.000000000,0 -> / @@ -47,6 +50,7 @@ data: "f"/5.000000000,0 -> /BYTES/f5 data: "f"/3.000000000,0 -> /BYTES/f3 data: "f"/1.000000000,0 -> /BYTES/f1 data: "h"/2.000000000,0 -> /BYTES/h2 +data: "o"/6.000000000,0 -> /BYTES/o6 # Run gets for all keys and all timestamps. run ok @@ -184,6 +188,87 @@ get: "h" -> /BYTES/h2 @2.000000000,0 get: "h" -> / @4.000000000,0 get: "h" -> / @4.000000000,0 +run ok +get k=l ts=1 +get k=l ts=2 +get k=l ts=3 +get k=l ts=4 +get k=l ts=5 +get k=l ts=6 +get k=l ts=1 tombstones +get k=l ts=2 tombstones +get k=l ts=3 tombstones +get k=l ts=4 tombstones +get k=l ts=5 tombstones +get k=l ts=6 tombstones +---- +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> +get: "l" -> + +run ok +get k=n ts=1 +get k=n ts=2 +get k=n ts=3 +get k=n ts=4 +get k=n ts=5 +get k=n ts=6 +get k=n ts=1 tombstones +get k=n ts=2 tombstones +get k=n ts=3 tombstones +get k=n ts=4 tombstones +get k=n ts=5 tombstones +get k=n ts=6 tombstones +---- +get: "n" -> +get: "n" -> +get: "n" -> +get: "n" -> +get: "n" -> +get: "n" -> +get: "n" -> +get: "n" -> / @2.000000000,0 +get: "n" -> / @2.000000000,0 +get: "n" -> / @2.000000000,0 +get: "n" -> / @2.000000000,0 +get: "n" -> / @2.000000000,0 + +run ok +get k=o ts=1 +get k=o ts=2 +get k=o ts=3 +get k=o ts=4 +get k=o ts=5 +get k=o ts=6 +get k=o ts=1 tombstones +get k=o ts=2 tombstones +get k=o ts=3 tombstones +get k=o ts=4 tombstones +get k=o ts=5 tombstones +get k=o ts=6 tombstones +---- +get: "o" -> +get: "o" -> +get: "o" -> +get: "o" -> +get: "o" -> +get: "o" -> /BYTES/o6 @6.000000000,0 +get: "o" -> +get: "o" -> / @2.000000000,0 +get: "o" -> / @2.000000000,0 +get: "o" -> / @2.000000000,0 +get: "o" -> / @2.000000000,0 +get: "o" -> /BYTES/o6 @6.000000000,0 + # failOnMoreRecent: c run error get k=c ts=1 failOnMoreRecent @@ -328,3 +413,19 @@ get k=k ts=1 globalUncertaintyLimit=4 localUncertaintyLimit=3 ---- get: "k" -> error: (*roachpb.ReadWithinUncertaintyIntervalError:) ReadWithinUncertaintyIntervalError: read at time 1.000000000,0 encountered previous write with future timestamp 4.000000000,0 within uncertainty interval `t <= (local=3.000000000,0, global=0,0)`; observed timestamps: [] + +# Test a particular case where: +# - a tombstone must be synthesized due to a range tombstone +# - the uncertainty limit is such that we must check for uncertainty +# - the only point at the key is not visible at the read timstamp, but is also +# not outside the uncertainty limit +# +# In these circumstances, the scanner will seekVersion to find the first visible +# key (there is none), invalidating the underlying Pebble iterator. Although the +# underlying Pebble iterator has been invalidated, the scanner should still +# succeed in synthesizing a tombstone at the range key timestamp retrieved +# before the iterator was invalidated. +run ok +get k=o ts=5 tombstones globalUncertaintyLimit=6 localUncertaintyLimit=5 +---- +get: "o" -> / @2.000000000,0