From 786fcecbd8574f496460987dec24b1b4b7e5ae09 Mon Sep 17 00:00:00 2001 From: Andy Yang Date: Fri, 30 Jun 2023 14:53:32 -0400 Subject: [PATCH 1/2] sql: fix `SHOW SCHEMAS` database name resolution bug This patch fixes a bug in the `SHOW SCHEMAS FROM db_name` logic where a schema with the name `db_name` in the current database would result in the current database's schemas being erroneously returned instead. The logic will now simply look up the database name instead of using the schema lookup logic. Release note (bug fix): `SHOW SCHEMAS FROM db_name` will no longer incorrectly show schemas from the current database when the current database has a schema named `db_name`. --- .../testdata/logic_test/multi_region_show | 2 +- pkg/sql/delegate/show_schemas.go | 14 +--- .../logictest/testdata/logic_test/database | 75 +++++++++++++++++++ pkg/sql/opt/cat/catalog.go | 8 ++ pkg/sql/opt/testutils/testcat/BUILD.bazel | 1 + pkg/sql/opt/testutils/testcat/test_catalog.go | 10 +++ pkg/sql/opt_catalog.go | 19 +++++ pkg/sql/schema_resolver.go | 9 +++ 8 files changed, 124 insertions(+), 14 deletions(-) diff --git a/pkg/ccl/logictestccl/testdata/logic_test/multi_region_show b/pkg/ccl/logictestccl/testdata/logic_test/multi_region_show index ddf128d47e13..63e667c50afc 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/multi_region_show +++ b/pkg/ccl/logictestccl/testdata/logic_test/multi_region_show @@ -58,7 +58,7 @@ SHOW CREATE DATABASE multi_region_test_placement_restricted_db database_name create_statement multi_region_test_placement_restricted_db CREATE DATABASE multi_region_test_placement_restricted_db PRIMARY REGION "ap-southeast-2" REGIONS = "ap-southeast-2", "ca-central-1", "us-east-1" SURVIVE ZONE FAILURE PLACEMENT RESTRICTED -statement error target database or schema does not exist +statement error database "foo" does not exist SHOW CREATE DATABASE foo # Test that showing localities works for databases and schemas with weird diff --git a/pkg/sql/delegate/show_schemas.go b/pkg/sql/delegate/show_schemas.go index c241cc424338..b9420313526c 100644 --- a/pkg/sql/delegate/show_schemas.go +++ b/pkg/sql/delegate/show_schemas.go @@ -62,18 +62,6 @@ func (d *delegator) delegateShowCreateAllSchemas() (tree.Statement, error) { // Returns an error if there is no current database, or if the specified // database doesn't exist. func (d *delegator) getSpecifiedOrCurrentDatabase(specifiedDB tree.Name) (tree.Name, error) { - var name cat.SchemaName - if specifiedDB != "" { - // Note: the schema name may be interpreted as database name, - // see name_resolution.go. - name.SchemaName = specifiedDB - name.ExplicitSchema = true - } - flags := cat.Flags{AvoidDescriptorCaches: true} - _, resName, err := d.catalog.ResolveSchema(d.ctx, flags, &name) - if err != nil { - return "", err - } - return resName.CatalogName, nil + return d.catalog.LookupDatabaseName(d.ctx, flags, string(specifiedDB)) } diff --git a/pkg/sql/logictest/testdata/logic_test/database b/pkg/sql/logictest/testdata/logic_test/database index 747fcd87dc9b..5bf14deede3e 100644 --- a/pkg/sql/logictest/testdata/logic_test/database +++ b/pkg/sql/logictest/testdata/logic_test/database @@ -309,6 +309,11 @@ CREATE TABLE db69713.s.pg_constraintdef_test ( statement ok DROP DATABASE db69713; +statement ok +RESET DATABASE; + +subtest end + # Ensure user must exist to create with owner. 
statement error role/user "fake_user" does not exist CREATE DATABASE aa with owner fake_user @@ -361,3 +366,73 @@ database_name owner primary_region secondary_region regions surviva ifnotexistsownerdb testuser NULL NULL {} NULL subtest end + +subtest regression_105906 + +statement ok +CREATE SCHEMA regression_105906 + +statement ok +CREATE DATABASE regression_105906 + +query TT colnames,rowsort +SHOW SCHEMAS +---- +schema_name owner +crdb_internal NULL +information_schema NULL +pg_catalog NULL +pg_extension NULL +public admin +regression_105906 root + +# Note: regression_105906 should not appear in the list of schemas below +query TT colnames,rowsort +SHOW SCHEMAS FROM regression_105906 +---- +schema_name owner +crdb_internal NULL +information_schema NULL +pg_catalog NULL +pg_extension NULL +public admin + +statement ok +DROP DATABASE regression_105906 + +statement ok +DROP SCHEMA regression_105906 + +statement ok +CREATE SCHEMA "rEgReSsIoN 105906" + +statement ok +CREATE DATABASE "rEgReSsIoN 105906" + +query T rowsort +SELECT schema_name FROM [SHOW SCHEMAS] +---- +public +rEgReSsIoN 105906 +crdb_internal +information_schema +pg_catalog +pg_extension + +# Note: "rEgReSsIoN 105906" should not appear in the list of schemas below +query T rowsort +SELECT schema_name FROM [SHOW SCHEMAS FROM "rEgReSsIoN 105906"] +---- +public +crdb_internal +information_schema +pg_catalog +pg_extension + +statement ok +DROP SCHEMA "rEgReSsIoN 105906" + +statement ok +DROP DATABASE "rEgReSsIoN 105906" + +subtest end diff --git a/pkg/sql/opt/cat/catalog.go b/pkg/sql/opt/cat/catalog.go index fac86778d84b..4ae8abff4a59 100644 --- a/pkg/sql/opt/cat/catalog.go +++ b/pkg/sql/opt/cat/catalog.go @@ -83,6 +83,14 @@ type Flags struct { // returned by the Resolve methods (schemas and data sources) *must* be // immutable after construction, and therefore also thread-safe. type Catalog interface { + // LookupDatabaseName locates a database with the given name and returns + // the name if found. If no name is provided, it will return the name of + // the current database. An error is returned if no database with the given + // name exists or in the case of an empty name, there is no current database. + // TODO(yang): This function can be extended if needed in the future + // to return a new cat.Database type similar to ResolveSchema. + LookupDatabaseName(ctx context.Context, flags Flags, name string) (tree.Name, error) + // ResolveSchema locates a schema with the given name and returns it along // with the resolved SchemaName (which has all components filled in). 
// If the SchemaName is empty, returns the current database/schema (if one is diff --git a/pkg/sql/opt/testutils/testcat/BUILD.bazel b/pkg/sql/opt/testutils/testcat/BUILD.bazel index 3e0fa613cd19..8395f3eb142e 100644 --- a/pkg/sql/opt/testutils/testcat/BUILD.bazel +++ b/pkg/sql/opt/testutils/testcat/BUILD.bazel @@ -49,6 +49,7 @@ go_library( "//pkg/sql/sem/tree/treecmp", "//pkg/sql/sem/volatility", "//pkg/sql/sessiondatapb", + "//pkg/sql/sqlerrors", "//pkg/sql/stats", "//pkg/sql/types", "//pkg/sql/vtable", diff --git a/pkg/sql/opt/testutils/testcat/test_catalog.go b/pkg/sql/opt/testutils/testcat/test_catalog.go index 377ab7e43c87..6ea57ad367f9 100644 --- a/pkg/sql/opt/testutils/testcat/test_catalog.go +++ b/pkg/sql/opt/testutils/testcat/test_catalog.go @@ -34,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sqlerrors" "github.com/cockroachdb/cockroach/pkg/sql/stats" "github.com/cockroachdb/cockroach/pkg/sql/types" "github.com/cockroachdb/cockroach/pkg/util/treeprinter" @@ -77,6 +78,15 @@ func New() *Catalog { } } +func (tc *Catalog) LookupDatabaseName( + _ context.Context, _ cat.Flags, name string, +) (tree.Name, error) { + if name != testDB { + return "", sqlerrors.NewUndefinedDatabaseError(name) + } + return tree.Name(name), nil +} + // ResolveSchema is part of the cat.Catalog interface. func (tc *Catalog) ResolveSchema( _ context.Context, _ cat.Flags, name *cat.SchemaName, diff --git a/pkg/sql/opt_catalog.go b/pkg/sql/opt_catalog.go index 6bf885798c81..9a3bb95d61bf 100644 --- a/pkg/sql/opt_catalog.go +++ b/pkg/sql/opt_catalog.go @@ -160,6 +160,25 @@ func (os *optSchema) getDescriptorForPermissionsCheck() catalog.Descriptor { return os.database } +// LookupDatabaseName implements the cat.Catalog interface. +func (oc *optCatalog) LookupDatabaseName( + ctx context.Context, flags cat.Flags, name string, +) (tree.Name, error) { + if flags.AvoidDescriptorCaches { + defer func(prev bool) { + oc.planner.skipDescriptorCache = prev + }(oc.planner.skipDescriptorCache) + oc.planner.skipDescriptorCache = true + } + if name == "" { + name = oc.planner.CurrentDatabase() + } + if err := oc.planner.LookupDatabase(ctx, name); err != nil { + return "", err + } + return tree.Name(name), nil +} + // ResolveSchema is part of the cat.Catalog interface. func (oc *optCatalog) ResolveSchema( ctx context.Context, flags cat.Flags, name *cat.SchemaName, diff --git a/pkg/sql/schema_resolver.go b/pkg/sql/schema_resolver.go index 2d504db13296..f2392cb5fad8 100644 --- a/pkg/sql/schema_resolver.go +++ b/pkg/sql/schema_resolver.go @@ -200,6 +200,15 @@ func (sr *schemaResolver) LookupSchema( return true, catalog.ResolvedObjectPrefix{Database: db, Schema: sc}, nil } +func (sr *schemaResolver) LookupDatabase(ctx context.Context, dbName string) error { + g := sr.byNameGetterBuilder().Get() + _, err := g.Database(ctx, dbName) + if err != nil { + return err + } + return nil +} + // CurrentDatabase implements the tree.QualifiedNameResolver interface. func (sr *schemaResolver) CurrentDatabase() string { return sr.sessionDataStack.Top().Database From 3c866e4e6b9d707128ab1aa211b25a8c3510bce1 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Wed, 5 Jul 2023 10:33:02 -0400 Subject: [PATCH 2/2] Revert "Revert "storage: use size-carrying point tombstones"" This reverts commit 90c1ee697c8e92336ab0500b96c6b0b42a41f381. 
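The reinstated change threads an optional value size through the storage engine's Clear* methods (via the new ClearOptions struct) so that, when a caller knows how large the value being deleted is, Pebble can write a size-carrying (DELSIZED) point tombstone and use that size when prioritizing compactions. A minimal sketch of the calling pattern used throughout this diff follows; clearWithKnownSize is a hypothetical helper named only for illustration, and encodedValLen stands in for whatever encoded value length the caller already has on hand:

	// Hypothetical helper (illustration only): pass the encoded size of the
	// value being removed so the engine can emit a sized point tombstone.
	// Callers that do not know the size leave ClearOptions zero-valued and
	// the engine writes a regular DELETE instead.
	func clearWithKnownSize(rw storage.ReadWriter, key storage.MVCCKey, encodedValLen int) error {
		return rw.ClearMVCC(key, storage.ClearOptions{
			ValueSizeKnown: true,
			ValueSize:      uint32(encodedValLen),
		})
	}

Whether a sized delete is actually written also depends on the V23_2_UseSizedPebblePointTombstones cluster version being active; otherwise the engine falls back to a plain Delete.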
Epic: CRDB-25405 Release note (performance improvement): Improve disk space reclamation heuristics making disk space reclamation more timely. --- .../settings/settings-for-tenants.txt | 2 +- docs/generated/settings/settings.html | 2 +- pkg/clusterversion/cockroach_versions.go | 19 +++++ pkg/cmd/roachtest/tests/tombstones.go | 10 +-- pkg/kv/kvserver/batch_spanset_test.go | 6 +- pkg/kv/kvserver/kvstorage/init.go | 2 +- pkg/kv/kvserver/loqrecovery/record.go | 2 +- pkg/kv/kvserver/replica_raft.go | 5 +- pkg/kv/kvserver/replica_test.go | 2 +- pkg/kv/kvserver/spanset/batch.go | 21 +++-- pkg/storage/batch_test.go | 49 ++++++++--- pkg/storage/engine.go | 55 ++++++++++-- pkg/storage/engine_test.go | 19 +++-- pkg/storage/intent_interleaving_iter_test.go | 2 +- pkg/storage/intent_reader_writer.go | 4 +- pkg/storage/intent_reader_writer_test.go | 10 +-- pkg/storage/mvcc.go | 84 +++++++++++++++---- pkg/storage/mvcc_history_test.go | 18 +++- pkg/storage/mvcc_test.go | 29 ++++++- pkg/storage/open.go | 12 +++ pkg/storage/pebble.go | 52 ++++++++---- pkg/storage/pebble_batch.go | 36 +++++--- pkg/storage/pebble_test.go | 2 +- pkg/storage/read_as_of_iterator_test.go | 6 +- pkg/storage/sst_writer.go | 24 ++++-- 25 files changed, 360 insertions(+), 113 deletions(-) diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index a064739cfce1..35ad3d1fe227 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -303,4 +303,4 @@ trace.opentelemetry.collector string address of an OpenTelemetry trace collecto trace.snapshot.rate duration 0s if non-zero, interval at which background trace snapshots are captured tenant-rw trace.span_registry.enabled boolean true if set, ongoing traces can be seen at https:///#/debug/tracez tenant-rw trace.zipkin.collector string the address of a Zipkin instance to receive traces, as :. If no port is specified, 9411 will be used. tenant-rw -version version 1000023.1-10 set the active cluster version in the format '.' tenant-rw +version version 1000023.1-14 set the active cluster version in the format '.' tenant-rw diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 38e90d86f229..5a43fbd2da93 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -258,6 +258,6 @@
trace.snapshot.rate
duration0sif non-zero, interval at which background trace snapshots are capturedServerless/Dedicated/Self-Hosted
trace.span_registry.enabled
booleantrueif set, ongoing traces can be seen at https://<ui>/#/debug/tracezServerless/Dedicated/Self-Hosted
trace.zipkin.collector
stringthe address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.Serverless/Dedicated/Self-Hosted -
version
version1000023.1-10set the active cluster version in the format '<major>.<minor>'Serverless/Dedicated/Self-Hosted +
version
version1000023.1-14set the active cluster version in the format '<major>.<minor>'Serverless/Dedicated/Self-Hosted diff --git a/pkg/clusterversion/cockroach_versions.go b/pkg/clusterversion/cockroach_versions.go index 6d0adb612248..ed7c33cd58f3 100644 --- a/pkg/clusterversion/cockroach_versions.go +++ b/pkg/clusterversion/cockroach_versions.go @@ -542,6 +542,17 @@ const ( // that (optionally) embed below-raft admission data. V23_2_UseACRaftEntryEntryEncodings + // V23_2_PebbleFormatDeleteSizedAndObsolete upgrades Pebble's format major + // version to FormatDeleteSizedAndObsolete, allowing use of a new sstable + // format version Pebblev4. This version has two improvements: + // a) It allows the use of DELSIZED point tombstones. + // b) It encodes the obsolence of keys in a key-kind bit. + V23_2_PebbleFormatDeleteSizedAndObsolete + + // V23_2_UseSizedPebblePointTombstones enables the use of Pebble's new + // DeleteSized operations. + V23_2_UseSizedPebblePointTombstones + // ************************************************* // Step (1) Add new versions here. // Do not add new versions to a patch release. @@ -943,6 +954,14 @@ var rawVersionsSingleton = keyedVersions{ Key: V23_2_UseACRaftEntryEntryEncodings, Version: roachpb.Version{Major: 23, Minor: 1, Internal: 10}, }, + { + Key: V23_2_PebbleFormatDeleteSizedAndObsolete, + Version: roachpb.Version{Major: 23, Minor: 1, Internal: 12}, + }, + { + Key: V23_2_UseSizedPebblePointTombstones, + Version: roachpb.Version{Major: 23, Minor: 1, Internal: 14}, + }, // ************************************************* // Step (2): Add new versions here. diff --git a/pkg/cmd/roachtest/tests/tombstones.go b/pkg/cmd/roachtest/tests/tombstones.go index a2eaa257ffad..65f979657dbc 100644 --- a/pkg/cmd/roachtest/tests/tombstones.go +++ b/pkg/cmd/roachtest/tests/tombstones.go @@ -29,12 +29,6 @@ import ( // registerPointTombstone registers the point tombstone test. func registerPointTombstone(r registry.Registry) { r.Add(registry.TestSpec{ - Skip: "pebble#2340", - SkipDetails: "This roachtest is implemented ahead of implementing and using " + - "pebble#2340 within Cockroach. Currently, this roachtest fails through " + - "a timeout because the disk space corresponding to the large KVs is " + - "never reclaimed. Once pebble#2340 is integrated into Cockroach, we " + - "expect this to begin passing, and we can un-skip it.", Name: "point-tombstone/heterogeneous-value-sizes", Owner: registry.OwnerStorage, Cluster: r.MakeClusterSpec(4), @@ -136,7 +130,7 @@ func registerPointTombstone(r registry.Registry) { require.LessOrEqual(t, statsAfterDeletes.livePercentage, 0.10) // Wait for garbage collection to delete the non-live data. 
- targetSize := uint64(2 << 30) /* 2 GB */ + targetSize := uint64(3 << 30) /* 3 GiB */ t.Status("waiting for garbage collection and compaction to reduce on-disk size to ", humanize.IBytes(targetSize)) m = c.NewMonitor(ctx, c.Range(1, 3)) m.Go(func(ctx context.Context) error { @@ -172,7 +166,7 @@ type tableSizeInfo struct { } func (info tableSizeInfo) String() string { - return fmt.Sprintf("databaseID: %d, tableID: %d, rangeCount: %d, approxDiskBytes: %s, liveBytes: %s, totalBytes: %s, livePercentage: %.1f", + return fmt.Sprintf("databaseID: %d, tableID: %d, rangeCount: %d, approxDiskBytes: %s, liveBytes: %s, totalBytes: %s, livePercentage: %.2f", info.databaseID, info.tableID, info.rangeCount, diff --git a/pkg/kv/kvserver/batch_spanset_test.go b/pkg/kv/kvserver/batch_spanset_test.go index 81e2559862ba..593c3ae41331 100644 --- a/pkg/kv/kvserver/batch_spanset_test.go +++ b/pkg/kv/kvserver/batch_spanset_test.go @@ -72,7 +72,7 @@ func TestSpanSetBatchBoundaries(t *testing.T) { } t.Run("writes before range", func(t *testing.T) { - if err := batch.ClearUnversioned(outsideKey.Key); !isWriteSpanErr(err) { + if err := batch.ClearUnversioned(outsideKey.Key, storage.ClearOptions{}); !isWriteSpanErr(err) { t.Errorf("ClearUnversioned: unexpected error %v", err) } if err := batch.ClearRawRange(outsideKey.Key, outsideKey2.Key, true, true); !isWriteSpanErr(err) { @@ -93,7 +93,7 @@ func TestSpanSetBatchBoundaries(t *testing.T) { }) t.Run("writes after range", func(t *testing.T) { - if err := batch.ClearUnversioned(outsideKey3.Key); !isWriteSpanErr(err) { + if err := batch.ClearUnversioned(outsideKey3.Key, storage.ClearOptions{}); !isWriteSpanErr(err) { t.Errorf("ClearUnversioned: unexpected error %v", err) } if err := batch.ClearRawRange(insideKey2.Key, outsideKey4.Key, true, true); !isWriteSpanErr(err) { @@ -303,7 +303,7 @@ func TestSpanSetBatchTimestamps(t *testing.T) { } for _, batch := range []storage.Batch{batchBefore, batchNonMVCC} { - if err := batch.ClearUnversioned(wkey.Key); !isWriteSpanErr(err) { + if err := batch.ClearUnversioned(wkey.Key, storage.ClearOptions{}); !isWriteSpanErr(err) { t.Errorf("ClearUnversioned: unexpected error %v", err) } { diff --git a/pkg/kv/kvserver/kvstorage/init.go b/pkg/kv/kvserver/kvstorage/init.go index ab0428dc212c..fd65085c6ddc 100644 --- a/pkg/kv/kvserver/kvstorage/init.go +++ b/pkg/kv/kvserver/kvstorage/init.go @@ -504,7 +504,7 @@ func LoadAndReconcileReplicas(ctx context.Context, eng storage.Engine) ([]Replic // TODO(tbg): if clearRangeData were in this package we could destroy more // effectively even if for some reason we had in the past written state // other than the HardState here (not supposed to happen, but still). 
- if err := eng.ClearUnversioned(logstore.NewStateLoader(repl.RangeID).RaftHardStateKey()); err != nil { + if err := eng.ClearUnversioned(logstore.NewStateLoader(repl.RangeID).RaftHardStateKey(), storage.ClearOptions{}); err != nil { return nil, errors.Wrapf(err, "removing HardState for r%d", repl.RangeID) } log.Eventf(ctx, "removed legacy uninitialized replica for r%s", repl.RangeID) diff --git a/pkg/kv/kvserver/loqrecovery/record.go b/pkg/kv/kvserver/loqrecovery/record.go index 874dfe2d3e3b..0b980aa8b95a 100644 --- a/pkg/kv/kvserver/loqrecovery/record.go +++ b/pkg/kv/kvserver/loqrecovery/record.go @@ -107,7 +107,7 @@ func RegisterOfflineRecoveryEvents( continue } if removeEvent { - if err := readWriter.ClearUnversioned(iter.UnsafeKey().Key); err != nil { + if err := readWriter.ClearUnversioned(iter.UnsafeKey().Key, storage.ClearOptions{}); err != nil { processingErrors = errors.CombineErrors(processingErrors, errors.Wrapf( err, "failed to delete replica recovery record at key %s", iter.UnsafeKey())) continue diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 4f7873fd23e0..7999dd2f1e94 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -2607,7 +2607,10 @@ func handleTruncatedStateBelowRaftPreApply( // avoid allocating when constructing Raft log keys (16 bytes). prefix := prefixBuf.RaftLogPrefix() for idx := currentTruncatedState.Index + 1; idx <= suggestedTruncatedState.Index; idx++ { - if err := readWriter.ClearUnversioned(keys.RaftLogKeyFromPrefix(prefix, idx)); err != nil { + if err := readWriter.ClearUnversioned( + keys.RaftLogKeyFromPrefix(prefix, idx), + storage.ClearOptions{}, + ); err != nil { return false, errors.Wrapf(err, "unable to clear truncated Raft entries for %+v at index %d", suggestedTruncatedState, idx) } diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go index f311397cd8ad..65c5fcf06f10 100644 --- a/pkg/kv/kvserver/replica_test.go +++ b/pkg/kv/kvserver/replica_test.go @@ -1823,7 +1823,7 @@ func TestOptimizePuts(t *testing.T) { require.NoError(t, tc.engine.ClearMVCCRangeKey(storage.MVCCRangeKey{ StartKey: c.exKey, EndKey: c.exEndKey, Timestamp: hlc.MinTimestamp})) } else if c.exKey != nil { - require.NoError(t, tc.engine.ClearUnversioned(c.exKey)) + require.NoError(t, tc.engine.ClearUnversioned(c.exKey, storage.ClearOptions{})) } } } diff --git a/pkg/kv/kvserver/spanset/batch.go b/pkg/kv/kvserver/spanset/batch.go index 5d2a773379ae..2b49b71c388e 100644 --- a/pkg/kv/kvserver/spanset/batch.go +++ b/pkg/kv/kvserver/spanset/batch.go @@ -415,6 +415,11 @@ func (i *EngineIterator) Value() ([]byte, error) { return i.i.Value() } +// ValueLen is part of the storage.EngineIterator interface. +func (i *EngineIterator) ValueLen() int { + return i.i.ValueLen() +} + // UnsafeRawEngineKey is part of the storage.EngineIterator interface. 
func (i *EngineIterator) UnsafeRawEngineKey() []byte { return i.i.UnsafeRawEngineKey() @@ -522,34 +527,34 @@ func (s spanSetWriter) checkAllowed(key roachpb.Key) error { return nil } -func (s spanSetWriter) ClearMVCC(key storage.MVCCKey) error { +func (s spanSetWriter) ClearMVCC(key storage.MVCCKey, opts storage.ClearOptions) error { if err := s.checkAllowed(key.Key); err != nil { return err } - return s.w.ClearMVCC(key) + return s.w.ClearMVCC(key, opts) } -func (s spanSetWriter) ClearUnversioned(key roachpb.Key) error { +func (s spanSetWriter) ClearUnversioned(key roachpb.Key, opts storage.ClearOptions) error { if err := s.checkAllowed(key); err != nil { return err } - return s.w.ClearUnversioned(key) + return s.w.ClearUnversioned(key, opts) } func (s spanSetWriter) ClearIntent( - key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, + key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, opts storage.ClearOptions, ) error { if err := s.checkAllowed(key); err != nil { return err } - return s.w.ClearIntent(key, txnDidNotUpdateMeta, txnUUID) + return s.w.ClearIntent(key, txnDidNotUpdateMeta, txnUUID, opts) } -func (s spanSetWriter) ClearEngineKey(key storage.EngineKey) error { +func (s spanSetWriter) ClearEngineKey(key storage.EngineKey, opts storage.ClearOptions) error { if err := s.spans.CheckAllowed(SpanReadWrite, roachpb.Span{Key: key.Key}); err != nil { return err } - return s.w.ClearEngineKey(key) + return s.w.ClearEngineKey(key, opts) } func (s spanSetWriter) SingleClearEngineKey(key storage.EngineKey) error { diff --git a/pkg/storage/batch_test.go b/pkg/storage/batch_test.go index c57617ff672d..c9d8dc1d0d9b 100644 --- a/pkg/storage/batch_test.go +++ b/pkg/storage/batch_test.go @@ -13,6 +13,7 @@ package storage import ( "bytes" "context" + "encoding/binary" "fmt" "reflect" "strconv" @@ -78,7 +79,7 @@ func testBatchBasics(t *testing.T, writeOnly bool, commit func(e Engine, b Write // Write an engine value to be deleted. require.NoError(t, e.PutUnversioned(mvccKey("b").Key, []byte("value"))) - require.NoError(t, b.ClearUnversioned(mvccKey("b").Key)) + require.NoError(t, b.ClearUnversioned(mvccKey("b").Key, ClearOptions{})) // Write an engine value to be merged. require.NoError(t, e.PutUnversioned(mvccKey("c").Key, appender("foo"))) @@ -91,12 +92,25 @@ func testBatchBasics(t *testing.T, writeOnly bool, commit func(e Engine, b Write require.NoError(t, e.PutUnversioned(mvccKey("d").Key, []byte("before"))) require.NoError(t, b.SingleClearEngineKey(EngineKey{Key: mvccKey("d").Key})) + // Write a MVCC value to be deleted with a known value size. + keyF := mvccKey("f") + keyF.Timestamp.WallTime = 1 + valueF := MVCCValue{Value: roachpb.Value{RawBytes: []byte("fvalue")}} + encodedValueF, err := EncodeMVCCValue(valueF) + require.NoError(t, err) + require.NoError(t, e.PutMVCC(keyF, valueF)) + require.NoError(t, b.ClearMVCC(keyF, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(len(encodedValueF)), + })) + // Check all keys are in initial state (nothing from batch has gone // through to engine until commit). expValues := []MVCCKeyValue{ {Key: mvccKey("b"), Value: []byte("value")}, {Key: mvccKey("c"), Value: appender("foo")}, {Key: mvccKey("d"), Value: []byte("before")}, + {Key: keyF, Value: encodedValueF}, } kvs, err := Scan(e, localMax, roachpb.KeyMax, 0) require.NoError(t, err) @@ -199,7 +213,7 @@ func TestReadOnlyBasics(t *testing.T) { // For a read-only ReadWriter, all Writer methods should panic. 
failureTestCases := []func(){ func() { _ = ro.ApplyBatchRepr(nil, false) }, - func() { _ = ro.ClearUnversioned(a.Key) }, + func() { _ = ro.ClearUnversioned(a.Key, ClearOptions{}) }, func() { _ = ro.SingleClearEngineKey(EngineKey{Key: a.Key}) }, func() { _ = ro.ClearRawRange(a.Key, a.Key, true, true) }, func() { _ = ro.Merge(a, nil) }, @@ -215,7 +229,7 @@ func TestReadOnlyBasics(t *testing.T) { if err := e.PutUnversioned(mvccKey("b").Key, []byte("value")); err != nil { t.Fatal(err) } - if err := e.ClearUnversioned(mvccKey("b").Key); err != nil { + if err := e.ClearUnversioned(mvccKey("b").Key, ClearOptions{}); err != nil { t.Fatal(err) } if err := e.PutUnversioned(mvccKey("c").Key, appender("foo")); err != nil { @@ -249,13 +263,17 @@ func TestReadOnlyBasics(t *testing.T) { func TestBatchRepr(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) + // Disable metamorphism in the value-encoding; our asserts include the + // length of the encoded value to test delete-sized. + + DisableMetamorphicSimpleValueEncoding(t) testBatchBasics(t, false /* writeOnly */, func(e Engine, b WriteBatch) error { repr := b.Repr() r, err := NewBatchReader(repr) require.NoError(t, err) - const expectedCount = 5 + const expectedCount = 6 require.Equal(t, expectedCount, r.Count()) count, err := BatchCount(repr) require.NoError(t, err) @@ -267,6 +285,9 @@ func TestBatchRepr(t *testing.T) { switch r.KeyKind() { case pebble.InternalKeyKindDelete: ops = append(ops, fmt.Sprintf("delete(%s)", string(r.Key()))) + case pebble.InternalKeyKindDeleteSized: + v, _ := binary.Uvarint(r.Value()) + ops = append(ops, fmt.Sprintf("delete-sized(%s,%d)", string(r.Key()), v)) case pebble.InternalKeyKindSet: ops = append(ops, fmt.Sprintf("put(%s,%s)", string(r.Key()), string(r.Value()))) case pebble.InternalKeyKindMerge: @@ -287,6 +308,7 @@ func TestBatchRepr(t *testing.T) { "merge(c\x00)", "put(e\x00,)", "single_delete(d\x00)", + "delete-sized(f\x00\x00\x00\x00\x00\x00\x00\x00\x01\t,17)", } require.Equal(t, expOps, ops) @@ -383,7 +405,7 @@ func TestBatchGet(t *testing.T) { if err := b.PutUnversioned(mvccKey("a").Key, []byte("value")); err != nil { t.Fatal(err) } - if err := b.ClearUnversioned(mvccKey("b").Key); err != nil { + if err := b.ClearUnversioned(mvccKey("b").Key, ClearOptions{}); err != nil { t.Fatal(err) } if err := b.Merge(mvccKey("c"), appender("bar")); err != nil { @@ -435,7 +457,7 @@ func TestBatchMerge(t *testing.T) { if err := b.PutUnversioned(mvccKey("a").Key, appender("a-value")); err != nil { t.Fatal(err) } - if err := b.ClearUnversioned(mvccKey("b").Key); err != nil { + if err := b.ClearUnversioned(mvccKey("b").Key, ClearOptions{}); err != nil { t.Fatal(err) } if err := b.Merge(mvccKey("c"), appender("c-value")); err != nil { @@ -578,7 +600,10 @@ func TestBatchScanWithDelete(t *testing.T) { if err := e.PutUnversioned(mvccKey("a").Key, []byte("value")); err != nil { t.Fatal(err) } - if err := b.ClearUnversioned(mvccKey("a").Key); err != nil { + if err := b.ClearUnversioned(mvccKey("a").Key, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(len("value")), + }); err != nil { t.Fatal(err) } kvs, err := Scan(b, localMax, roachpb.KeyMax, 0) @@ -611,7 +636,10 @@ func TestBatchScanMaxWithDeleted(t *testing.T) { t.Fatal(err) } // Now, delete "a" in batch. 
- if err := b.ClearUnversioned(mvccKey("a").Key); err != nil { + if err := b.ClearUnversioned(mvccKey("a").Key, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(len("value1")), + }); err != nil { t.Fatal(err) } // A scan with max=1 should scan "b". @@ -862,7 +890,8 @@ func TestDecodeKey(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - e, err := Open(context.Background(), InMemory(), cluster.MakeClusterSettings(), CacheSize(1<<20 /* 1 MiB */)) + e, err := Open(context.Background(), InMemory(), + cluster.MakeTestingClusterSettings(), CacheSize(1<<20 /* 1 MiB */)) assert.NoError(t, err) defer e.Close() @@ -924,7 +953,7 @@ func TestBatchReader(t *testing.T) { require.NoError(t, b.PutEngineRangeKey(roachpb.Key("rangeFrom"), roachpb.Key("rangeTo"), []byte{7}, []byte("engineRangeKey"))) // Clear some already empty keys. - require.NoError(t, b.ClearMVCC(pointKey("mvccKey", 9))) + require.NoError(t, b.ClearMVCC(pointKey("mvccKey", 9), ClearOptions{})) require.NoError(t, b.ClearMVCCRangeKey(rangeKey("rangeFrom", "rangeTo", 9))) require.NoError(t, b.ClearRawRange(roachpb.Key("clearFrom"), roachpb.Key("clearTo"), true, true)) diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index faf6d244786f..2519b0b0883f 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -362,6 +362,11 @@ type EngineIterator interface { // Value returns the current value as a byte slice. // REQUIRES: latest positioning function returned valid=true. Value() ([]byte, error) + // ValueLen returns the length of the current value. ValueLen should be + // preferred when the actual value is not needed. In some circumstances, the + // storage engine may be able to avoid loading the value. + // REQUIRES: latest positioning function returned valid=true. + ValueLen() int // CloneContext is a low-level method only for use in the storage package, // that provides sufficient context that the iterator may be cloned. CloneContext() CloneContext @@ -631,14 +636,22 @@ type Writer interface { // actually removes entries from the storage engine, rather than inserting // MVCC tombstones. // + // If the caller knows the size of the value that is being cleared, they + // should set ClearOptions.{ValueSizeKnown, ValueSize} accordingly to + // improve the storage engine's ability to prioritize compactions. + // // It is safe to modify the contents of the arguments after it returns. - ClearMVCC(key MVCCKey) error + ClearMVCC(key MVCCKey, opts ClearOptions) error // ClearUnversioned removes an unversioned item from the db. It is for use // with inline metadata (not intents) and other unversioned keys (like // Range-ID local keys). It does not affect range keys. // + // If the caller knows the size of the value that is being cleared, they + // should set ClearOptions.{ValueSizeKnown, ValueSize} accordingly to + // improve the storage engine's ability to prioritize compactions. + // // It is safe to modify the contents of the arguments after it returns. - ClearUnversioned(key roachpb.Key) error + ClearUnversioned(key roachpb.Key, opts ClearOptions) error // ClearIntent removes an intent from the db. Unlike ClearMVCC and // ClearUnversioned, this is a higher-level method that may make changes in // parts of the key space that are not only a function of the input, and may @@ -653,14 +666,18 @@ type Writer interface { // that does a pair. If there isn't a performance // decrease, we can stop tracking txnDidNotUpdateMeta and still optimize // ClearIntent by always doing single-clear. 
- ClearIntent(key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID) error + ClearIntent(key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, opts ClearOptions) error // ClearEngineKey removes the given point key from the engine. It does not // affect range keys. Note that clear actually removes entries from the // storage engine. This is a general-purpose and low-level method that should // be used sparingly, only when the other Clear* methods are not applicable. // + // If the caller knows the size of the value that is being cleared, they + // should set ClearOptions.{ValueSizeKnown, ValueSize} accordingly to + // improve the storage engine's ability to prioritize compactions. + // // It is safe to modify the contents of the arguments after it returns. - ClearEngineKey(key EngineKey) error + ClearEngineKey(key EngineKey, opts ClearOptions) error // ClearRawRange removes point and/or range keys from start (inclusive) to end // (exclusive) using Pebble range tombstones. It can be applied to a range @@ -837,6 +854,31 @@ type Writer interface { BufferedSize() int } +// ClearOptions holds optional parameters to methods that clear keys from the +// storage engine. +type ClearOptions struct { + // ValueSizeKnown indicates whether the ValueSize carries a meaningful + // value. If false, ValueSize is ignored. + ValueSizeKnown bool + // ValueSize may be provided to indicate the size of the existing KV + // record's value that is being removed. ValueSize should be the encoded + // value size that the storage engine observes. If the value is a + // MVCCMetadata, ValueSize should be the length of the encoded MVCCMetadata. + // If the value is a MVCCValue, ValueSize should be the length of the + // encoded MVCCValue. + // + // Setting ValueSize and ValueSizeKnown improves the storage engine's + // ability to estimate space amplification and prioritize compactions. + // Without it, compaction heuristics rely on average value sizes which are + // susceptible to over and under estimation. + // + // If the true value size is unknown, leave ValueSizeKnown false. + // Correctness is not compromised if ValueSize is incorrect; the underlying + // key will always be cleared regardless of whether its value size matches + // the provided value. + ValueSize uint32 +} + // ReadWriter is the read/write interface to an engine's data. 
type ReadWriter interface { Reader @@ -1474,7 +1516,10 @@ func ClearRangeWithHeuristic( if err != nil { return err } - if err = w.ClearEngineKey(key); err != nil { + if err = w.ClearEngineKey(key, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(iter.ValueLen()), + }); err != nil { return err } } diff --git a/pkg/storage/engine_test.go b/pkg/storage/engine_test.go index 8a9b28166028..2d38125772d8 100644 --- a/pkg/storage/engine_test.go +++ b/pkg/storage/engine_test.go @@ -151,7 +151,7 @@ func TestEngineBatchStaleCachedIterator(t *testing.T) { iter.SeekGE(key) - if err := batch.ClearUnversioned(key.Key); err != nil { + if err := batch.ClearUnversioned(key.Key, ClearOptions{}); err != nil { t.Fatal(err) } @@ -246,7 +246,7 @@ func TestEngineBatch(t *testing.T) { apply := func(rw ReadWriter, d data) error { if d.value == nil { - return rw.ClearUnversioned(d.key.Key) + return rw.ClearUnversioned(d.key.Key, ClearOptions{}) } else if d.merge { return rw.Merge(d.key, d.value) } @@ -277,7 +277,7 @@ func TestEngineBatch(t *testing.T) { currentBatch[k] = batch[shuffledIndices[k]] } // Reset the key - if err := engine.ClearUnversioned(key.Key); err != nil { + if err := engine.ClearUnversioned(key.Key, ClearOptions{}); err != nil { t.Fatal(err) } // Run it once with individual operations and remember the result. @@ -291,7 +291,7 @@ func TestEngineBatch(t *testing.T) { // Run the whole thing as a batch and compare. b := engine.NewBatch() defer b.Close() - if err := b.ClearUnversioned(key.Key); err != nil { + if err := b.ClearUnversioned(key.Key, ClearOptions{}); err != nil { t.Fatal(err) } for _, op := range currentBatch { @@ -350,9 +350,9 @@ func TestEnginePutGetDelete(t *testing.T) { for i, err := range []error{ engine.PutUnversioned(mvccKey("").Key, []byte("")), engine.PutUnversioned(NilKey.Key, []byte("")), - engine.ClearUnversioned(NilKey.Key), - engine.ClearUnversioned(NilKey.Key), - engine.ClearUnversioned(mvccKey("").Key), + engine.ClearUnversioned(NilKey.Key, ClearOptions{}), + engine.ClearUnversioned(NilKey.Key, ClearOptions{}), + engine.ClearUnversioned(mvccKey("").Key, ClearOptions{}), } { if err == nil { t.Fatalf("%d: illegal handling of empty key", i) @@ -382,7 +382,10 @@ func TestEnginePutGetDelete(t *testing.T) { if !bytes.Equal(val, c.value) { t.Errorf("expected key value %s to be %+v: got %+v", c.key, c.value, val) } - if err := engine.ClearUnversioned(c.key.Key); err != nil { + if err := engine.ClearUnversioned(c.key.Key, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(len(val)), + }); err != nil { t.Errorf("delete: expected no error, but got %s", err) } val = mvccGetRaw(t, engine, c.key) diff --git a/pkg/storage/intent_interleaving_iter_test.go b/pkg/storage/intent_interleaving_iter_test.go index ab86814d12ea..6bbcbde24be2 100644 --- a/pkg/storage/intent_interleaving_iter_test.go +++ b/pkg/storage/intent_interleaving_iter_test.go @@ -550,7 +550,7 @@ func writeRandomData( if interleave { require.NoError(t, batch.PutUnversioned(kv.key.Key, kv.val)) if !kv.liveIntent { - require.NoError(t, batch.ClearUnversioned(kv.key.Key)) + require.NoError(t, batch.ClearUnversioned(kv.key.Key, ClearOptions{})) } } else { eKey, _ := kv.key.ToEngineKey(nil) diff --git a/pkg/storage/intent_reader_writer.go b/pkg/storage/intent_reader_writer.go index c3968433485d..8d3b74c6e555 100644 --- a/pkg/storage/intent_reader_writer.go +++ b/pkg/storage/intent_reader_writer.go @@ -37,7 +37,7 @@ func wrapIntentWriter(w Writer) intentDemuxWriter { // scratch-space to avoid allocations -- its 
contents will be overwritten and // not appended to, and a possibly different buf returned. func (idw intentDemuxWriter) ClearIntent( - key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, buf []byte, + key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, buf []byte, opts ClearOptions, ) (_ []byte, _ error) { var engineKey EngineKey engineKey, buf = LockTableKey{ @@ -48,7 +48,7 @@ func (idw intentDemuxWriter) ClearIntent( if txnDidNotUpdateMeta { return buf, idw.w.SingleClearEngineKey(engineKey) } - return buf, idw.w.ClearEngineKey(engineKey) + return buf, idw.w.ClearEngineKey(engineKey, opts) } // PutIntent has the same behavior as Writer.PutIntent. buf is used as diff --git a/pkg/storage/intent_reader_writer_test.go b/pkg/storage/intent_reader_writer_test.go index 55d20e8dcf08..4c0e74ee10b9 100644 --- a/pkg/storage/intent_reader_writer_test.go +++ b/pkg/storage/intent_reader_writer_test.go @@ -114,12 +114,12 @@ func (p *printWriter) reset() { fmt.Fprintf(&p.b, "=== Calls ===\n") } -func (p *printWriter) ClearUnversioned(key roachpb.Key) error { +func (p *printWriter) ClearUnversioned(key roachpb.Key, opts ClearOptions) error { fmt.Fprintf(&p.b, "ClearUnversioned(%s)\n", string(key)) - return p.Writer.ClearUnversioned(key) + return p.Writer.ClearUnversioned(key, opts) } -func (p *printWriter) ClearEngineKey(key EngineKey) error { +func (p *printWriter) ClearEngineKey(key EngineKey, opts ClearOptions) error { ltKey, err := key.ToLockTableKey() var str string if err != nil { @@ -129,7 +129,7 @@ func (p *printWriter) ClearEngineKey(key EngineKey) error { str = printLTKey(ltKey) } fmt.Fprintf(&p.b, "ClearEngineKey(%s)\n", str) - return p.Writer.ClearEngineKey(key) + return p.Writer.ClearEngineKey(key, opts) } func (p *printWriter) ClearRawRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error { @@ -242,7 +242,7 @@ func TestIntentDemuxWriter(t *testing.T) { d.ScanArgs(t, "txn", &txn) txnUUID := uuid.FromUint128(uint128.FromInts(0, uint64(txn))) txnDidNotUpdateMeta := readTxnDidNotUpdateMeta(t, d) - scratch, err = w.ClearIntent(key, txnDidNotUpdateMeta, txnUUID, scratch) + scratch, err = w.ClearIntent(key, txnDidNotUpdateMeta, txnUUID, scratch, ClearOptions{}) if err != nil { return err.Error() } diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 0bf5b95a2920..ea92fb503f0b 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -910,6 +910,8 @@ func MVCCBlindPutInlineWithPrev( prev.IsPresent(), ok, origMetaKeySize, metaKeySize, origMetaValSize, metaValSize) } } + // TODO(jackson): Thread origMetaValSize through so that a resulting + // ClearUnversioned sets ClearOptions.ValueSize[Known]. return MVCCBlindPut(ctx, rw, ms, key, hlc.Timestamp{}, hlc.ClockTimestamp{}, value, nil) } @@ -1856,7 +1858,12 @@ func mvccPutInternal( return false, err } if !value.IsPresent() { - metaKeySize, metaValSize, err = 0, 0, writer.ClearUnversioned(metaKey.Key) + metaKeySize, metaValSize, err = 0, 0, writer.ClearUnversioned(metaKey.Key, ClearOptions{ + // NB: origMetaValSize is only populated by mvccGetMetadata if + // iter != nil. + ValueSizeKnown: iter != nil, + ValueSize: uint32(origMetaValSize), + }) } else { buf.meta = enginepb.MVCCMetadata{RawBytes: value.RawBytes} metaKeySize, metaValSize, err = buf.putInlineMeta(writer, metaKey, &buf.meta) @@ -2072,7 +2079,12 @@ func mvccPutInternal( iter = nil // prevent accidental use below } - if err := writer.ClearMVCC(oldVersionKey); err != nil { + // TODO(jackson): Do we know the encoded value size in the other + // cases? 
+ if err := writer.ClearMVCC(oldVersionKey, ClearOptions{ + ValueSizeKnown: curProvValRaw != nil, + ValueSize: uint32(len(curProvValRaw)), + }); err != nil { return false, err } } else if writeTimestamp.Less(metaTimestamp) { @@ -2650,22 +2662,27 @@ func MVCCClearTimeRange( // This can be a big win for reverting bulk-ingestion of clustered data as the // entire span may likely match and thus could be cleared in one ClearRange // instead of hundreds of thousands of individual Clears. - buf := make([]MVCCKey, clearRangeThreshold) + type bufferedKey struct { + MVCCKey + valLen uint32 + } + buf := make([]bufferedKey, clearRangeThreshold) var bufSize int var clearRangeStart MVCCKey - clearMatchingKey := func(k MVCCKey) { + clearMatchingKey := func(k MVCCKey, valLen uint32) { if len(clearRangeStart.Key) == 0 { // Currently buffering keys to clear one-by-one. if bufSize < clearRangeThreshold { buf[bufSize].Key = append(buf[bufSize].Key[:0], k.Key...) buf[bufSize].Timestamp = k.Timestamp + buf[bufSize].valLen = valLen bufSize++ } else { // Buffer is now full -- switch to just tracking the start of the range // from which we will clear when we either see a non-matching key or if // we finish iterating. - clearRangeStart = buf[0] + clearRangeStart = buf[0].MVCCKey bufSize = 0 } } @@ -2682,13 +2699,13 @@ func MVCCClearTimeRange( } else if bufSize > 0 { var encodedBufSize int64 for i := 0; i < bufSize; i++ { - encodedBufSize += int64(buf[i].EncodedSize()) + encodedBufSize += int64(buf[i].MVCCKey.EncodedSize()) } // Even though we didn't get a large enough number of keys to switch to // clearrange, the byte size of the keys we did get is now too large to // encode them all within the byte size limit, so use clearrange anyway. if batchByteSize+encodedBufSize >= maxBatchByteSize { - if err := rw.ClearMVCCVersions(buf[0], nonMatch); err != nil { + if err := rw.ClearMVCCVersions(buf[0].MVCCKey, nonMatch); err != nil { return err } batchByteSize += int64(buf[0].EncodedSize() + nonMatch.EncodedSize()) @@ -2698,11 +2715,17 @@ func MVCCClearTimeRange( if buf[i].Timestamp.IsEmpty() { // Inline metadata. Not an intent because iteration below fails // if it sees an intent. - if err := rw.ClearUnversioned(buf[i].Key); err != nil { + if err := rw.ClearUnversioned(buf[i].Key, ClearOptions{ + ValueSizeKnown: true, + ValueSize: buf[i].valLen, + }); err != nil { return err } } else { - if err := rw.ClearMVCC(buf[i]); err != nil { + if err := rw.ClearMVCC(buf[i].MVCCKey, ClearOptions{ + ValueSizeKnown: true, + ValueSize: buf[i].valLen, + }); err != nil { return err } } @@ -2951,7 +2974,7 @@ func MVCCClearTimeRange( clearedMetaKey.Key = clearedMetaKey.Key[:0] if startTime.Less(k.Timestamp) && k.Timestamp.LessEq(endTime) { - clearMatchingKey(k) + clearMatchingKey(k, uint32(valueLen)) clearedMetaKey.Key = append(clearedMetaKey.Key[:0], k.Key...) 
clearedMeta.KeyBytes = MVCCVersionTimestampSize clearedMeta.ValBytes = int64(valueLen) @@ -4797,7 +4820,10 @@ func mvccResolveWriteIntent( if err = rw.PutMVCC(newKey, newValue); err != nil { return false, err } - if err = rw.ClearMVCC(oldKey); err != nil { + if err = rw.ClearMVCC(oldKey, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(len(v)), + }); err != nil { return false, err } @@ -4840,7 +4866,10 @@ func mvccResolveWriteIntent( ctx, rw, metaKey, newMeta, true /* alreadyExists */) } else { metaKeySize = int64(metaKey.EncodedSize()) - err = rw.ClearIntent(metaKey.Key, canSingleDelHelper.onCommitIntent(), meta.Txn.ID) + err = rw.ClearIntent(metaKey.Key, canSingleDelHelper.onCommitIntent(), meta.Txn.ID, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(origMetaValSize), + }) } if err != nil { return false, err @@ -4879,7 +4908,10 @@ func mvccResolveWriteIntent( // - ResolveIntent with epoch 0 aborts intent from epoch 1. // First clear the provisional value. - if err := rw.ClearMVCC(latestKey); err != nil { + if err := rw.ClearMVCC(latestKey, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(meta.ValBytes), + }); err != nil { return false, err } @@ -4945,7 +4977,10 @@ func mvccResolveWriteIntent( if !ok { // If there is no other version, we should just clean up the key entirely. - if err = rw.ClearIntent(metaKey.Key, canSingleDelHelper.onAbortIntent(), meta.Txn.ID); err != nil { + if err = rw.ClearIntent(metaKey.Key, canSingleDelHelper.onAbortIntent(), meta.Txn.ID, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(origMetaValSize), + }); err != nil { return false, err } // Clear stat counters attributable to the intent we're aborting. @@ -4962,7 +4997,10 @@ func mvccResolveWriteIntent( KeyBytes: MVCCVersionTimestampSize, ValBytes: int64(nextValueLen), } - if err = rw.ClearIntent(metaKey.Key, canSingleDelHelper.onAbortIntent(), meta.Txn.ID); err != nil { + if err = rw.ClearIntent(metaKey.Key, canSingleDelHelper.onAbortIntent(), meta.Txn.ID, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(origMetaValSize), + }); err != nil { return false, err } metaKeySize := int64(metaKey.EncodedSize()) @@ -5307,7 +5345,10 @@ func MVCCGarbageCollect( if !implicitMeta { // This must be an inline entry since we are not allowed to clear // intents, and we've confirmed that meta.Txn == nil earlier. - if err := rw.ClearUnversioned(iter.UnsafeKey().Key); err != nil { + if err := rw.ClearUnversioned(iter.UnsafeKey().Key, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(iter.ValueLen()), + }); err != nil { return err } count++ @@ -5433,11 +5474,15 @@ func MVCCGarbageCollect( if !unsafeIterKey.IsValue() { break } + + var clearOpts ClearOptions if ms != nil { valLen, valIsTombstone, err := iter.MVCCValueLenAndIsTombstone() if err != nil { return err } + clearOpts.ValueSizeKnown = true + clearOpts.ValueSize = uint32(valLen) keySize := MVCCVersionTimestampSize valSize := int64(valLen) @@ -5460,7 +5505,7 @@ func MVCCGarbageCollect( ms.Add(updateStatsOnGC(gcKey.Key, keySize, valSize, false /* metaKey */, fromNS)) } count++ - if err := rw.ClearMVCC(unsafeIterKey); err != nil { + if err := rw.ClearMVCC(unsafeIterKey, clearOpts); err != nil { return err } prevNanos = unsafeIterKey.Timestamp.WallTime @@ -7070,7 +7115,10 @@ func ReplacePointTombstonesWithRangeTombstones( clearedKey.Key = append(clearedKey.Key[:0], key.Key...) 
clearedKey.Timestamp = key.Timestamp clearedKeySize := int64(EncodedMVCCKeyPrefixLength(clearedKey.Key)) - if err := rw.ClearMVCC(key); err != nil { + if err := rw.ClearMVCC(key, ClearOptions{ + ValueSizeKnown: true, + ValueSize: uint32(valueLen), + }); err != nil { return err } diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index 030af942c3fc..d7c1a9f13b4d 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -355,6 +355,18 @@ func TestMVCCHistories(t *testing.T) { } e := newEvalCtx(ctx, engine) + defer func() { + require.NoError(t, engine.Compact()) + m := engine.GetMetrics().Metrics + if m.Keys.MissizedTombstonesCount > 0 { + // A missized tombstone is a Pebble DELSIZED tombstone that encodes + // the wrong size of the value it deletes. This kind of tombstone is + // written when ClearOptions.ValueSizeKnown=true. If this assertion + // failed, something might be awry in the code clearing the key. Are + // we feeding the wrong value length to ValueSize? + t.Fatalf("expected to find 0 missized tombstones; found %d", m.Keys.MissizedTombstonesCount) + } + }() defer e.close() if strings.Contains(path, "_nometamorphiciter") { e.noMetamorphicIter = true @@ -878,11 +890,11 @@ func (rw intentPrintingReadWriter) PutIntent( } func (rw intentPrintingReadWriter) ClearIntent( - key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, + key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, opts storage.ClearOptions, ) error { rw.buf.Printf("called ClearIntent(%v, TDNUM(%t), %v)\n", key, txnDidNotUpdateMeta, txnUUID) - return rw.ReadWriter.ClearIntent(key, txnDidNotUpdateMeta, txnUUID) + return rw.ReadWriter.ClearIntent(key, txnDidNotUpdateMeta, txnUUID, opts) } func (e *evalCtx) tryWrapForIntentPrinting(rw storage.ReadWriter) storage.ReadWriter { @@ -992,7 +1004,7 @@ func cmdClear(e *evalCtx) error { key := e.getKey() ts := e.getTs(nil) return e.withWriter("clear", func(rw storage.ReadWriter) error { - return rw.ClearMVCC(storage.MVCCKey{Key: key, Timestamp: ts}) + return rw.ClearMVCC(storage.MVCCKey{Key: key, Timestamp: ts}, storage.ClearOptions{}) }) } diff --git a/pkg/storage/mvcc_test.go b/pkg/storage/mvcc_test.go index 8e650c5cbbfe..7c09c4d5493e 100644 --- a/pkg/storage/mvcc_test.go +++ b/pkg/storage/mvcc_test.go @@ -3685,7 +3685,8 @@ func generateBytes(rng *rand.Rand, min int, max int) []byte { } func createEngWithSeparatedIntents(t *testing.T) Engine { - eng, err := Open(context.Background(), InMemory(), cluster.MakeClusterSettings(), MaxSize(1<<20)) + eng, err := Open(context.Background(), InMemory(), + cluster.MakeTestingClusterSettings(), MaxSize(1<<20)) require.NoError(t, err) return eng } @@ -3924,7 +3925,7 @@ func TestRandomizedSavepointRollbackAndIntentResolution(t *testing.T) { rng := rand.New(rand.NewSource(seed)) ctx := context.Background() eng, err := Open( - context.Background(), InMemory(), cluster.MakeClusterSettings(), + context.Background(), InMemory(), cluster.MakeTestingClusterSettings(), func(cfg *engineConfig) error { cfg.Opts.LBaseMaxBytes = int64(100 + rng.Intn(16384)) log.Infof(ctx, "lbase: %d", cfg.Opts.LBaseMaxBytes) @@ -4871,6 +4872,9 @@ func TestMVCCGarbageCollect(t *testing.T) { assertEq(t, engine, "verification", ms, &expMS) }) } + // Compact the engine; the ForTesting() config option will assert that all + // DELSIZED tombstones were appropriately sized. 
+ require.NoError(t, engine.Compact()) } // TestMVCCGarbageCollectNonDeleted verifies that the first value for @@ -4916,6 +4920,10 @@ func TestMVCCGarbageCollectNonDeleted(t *testing.T) { test.expError, err) } } + + // Compact the engine; the ForTesting() config option will assert that all + // DELSIZED tombstones were appropriately sized. + require.NoError(t, engine.Compact()) } // TestMVCCGarbageCollectIntent verifies that an intent cannot be GC'd. @@ -4950,6 +4958,9 @@ func TestMVCCGarbageCollectIntent(t *testing.T) { if err := MVCCGarbageCollect(ctx, engine, nil, keys, ts2); err == nil { t.Fatal("expected error garbage collecting an intent") } + // Compact the engine; the ForTesting() config option will assert that all + // DELSIZED tombstones were appropriately sized. + require.NoError(t, engine.Compact()) } // TestMVCCGarbageCollectPanicsWithMixOfLocalAndGlobalKeys verifies that @@ -5152,6 +5163,9 @@ func TestMVCCGarbageCollectUsesSeekLTAppropriately(t *testing.T) { engine := NewDefaultInMemForTesting() defer engine.Close() runTestCase(t, tc, engine) + // Compact the engine; the ForTesting() config option will assert + // that all DELSIZED tombstones were appropriately sized. + require.NoError(t, engine.Compact()) }) } } @@ -5702,6 +5716,10 @@ func TestMVCCGarbageCollectRanges(t *testing.T) { expMs, err := ComputeStats(engine, d.rangeStart, d.rangeEnd, tsMax.WallTime) require.NoError(t, err, "failed to compute stats for range") require.EqualValues(t, expMs, ms, "computed range stats vs gc'd") + + // Compact the engine; the ForTesting() config option will assert + // that all DELSIZED tombstones were appropriately sized. + require.NoError(t, engine.Compact()) }) } } @@ -5956,6 +5974,9 @@ func TestMVCCGarbageCollectClearRangeInlinedValue(t *testing.T) { require.Errorf(t, err, "expected error '%s' but found none", expectedError) require.True(t, testutils.IsError(err, expectedError), "expected error '%s' found '%s'", expectedError, err) + // Compact the engine; the ForTesting() config option will assert that all + // DELSIZED tombstones were appropriately sized. + require.NoError(t, engine.Compact()) } func TestMVCCGarbageCollectClearPointsInRange(t *testing.T) { @@ -6022,6 +6043,10 @@ func TestMVCCGarbageCollectClearPointsInRange(t *testing.T) { expMs, err := ComputeStats(engine, rangeStart, rangeEnd, tsMax.WallTime) require.NoError(t, err, "failed to compute stats for range") require.EqualValues(t, expMs, ms, "computed range stats vs gc'd") + + // Compact the engine; the ForTesting() config option will assert that all + // DELSIZED tombstones were appropriately sized. + require.NoError(t, engine.Compact()) } func TestMVCCGarbageCollectClearRangeFailure(t *testing.T) { diff --git a/pkg/storage/open.go b/pkg/storage/open.go index c59e4ea9cdfb..044301b2f4c5 100644 --- a/pkg/storage/open.go +++ b/pkg/storage/open.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cloud" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble" "github.com/cockroachdb/pebble/vfs" ) @@ -66,6 +67,17 @@ var ForceWriterParallelism ConfigOption = func(cfg *engineConfig) error { // ForTesting configures the engine for use in testing. It may randomize some // config options to improve test coverage. 
var ForTesting ConfigOption = func(cfg *engineConfig) error { + cfg.onClose = append(cfg.onClose, func(p *Pebble) { + m := p.db.Metrics() + if m.Keys.MissizedTombstonesCount > 0 { + // A missized tombstone is a Pebble DELSIZED tombstone that encodes + // the wrong size of the value it deletes. This kind of tombstone is + // written when ClearOptions.ValueSizeKnown=true. If this assertion + // failed, something might be awry in the code clearing the key. Are + // we feeding the wrong value length to ValueSize? + panic(errors.AssertionFailedf("expected to find 0 missized tombstones; found %d", m.Keys.MissizedTombstonesCount)) + } + }) return nil } diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 1a0e5be0c685..ec99e1cf4759 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -714,6 +714,9 @@ type PebbleConfig struct { // SharedStorage is a cloud.ExternalStorage that can be used by all Pebble // stores on this node and on other nodes to store sstables. SharedStorage cloud.ExternalStorage + + // onClose is a slice of functions to be invoked before the engine is closed. + onClose []func(*Pebble) } // EncryptionStatsHandler provides encryption related stats. @@ -796,6 +799,8 @@ type Pebble struct { // closer is populated when the database is opened. The closer is associated // with the filesystem. closer io.Closer + // onClose is a slice of functions to be invoked before the engine closes. + onClose []func(*Pebble) wrappedIntentWriter intentDemuxWriter @@ -1072,6 +1077,7 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) { logCtx: logCtx, storeIDPebbleLog: storeIDContainer, closer: filesystemCloser, + onClose: cfg.onClose, replayer: replay.NewWorkloadCollector(cfg.StorageConfig.Dir), } @@ -1328,6 +1334,10 @@ func (p *Pebble) Close() { p.logger.Infof("closing unopened pebble instance") return } + for _, closeFunc := range p.onClose { + closeFunc(p) + } + p.closed = true // Wait for any asynchronous goroutines to exit. @@ -1464,37 +1474,48 @@ func (p *Pebble) ApplyBatchRepr(repr []byte, sync bool) error { } // ClearMVCC implements the Engine interface. -func (p *Pebble) ClearMVCC(key MVCCKey) error { +func (p *Pebble) ClearMVCC(key MVCCKey, opts ClearOptions) error { if key.Timestamp.IsEmpty() { panic("ClearMVCC timestamp is empty") } - return p.clear(key) + return p.clear(key, opts) } // ClearUnversioned implements the Engine interface. -func (p *Pebble) ClearUnversioned(key roachpb.Key) error { - return p.clear(MVCCKey{Key: key}) +func (p *Pebble) ClearUnversioned(key roachpb.Key, opts ClearOptions) error { + return p.clear(MVCCKey{Key: key}, opts) } // ClearIntent implements the Engine interface. -func (p *Pebble) ClearIntent(key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID) error { - _, err := p.wrappedIntentWriter.ClearIntent(key, txnDidNotUpdateMeta, txnUUID, nil) +func (p *Pebble) ClearIntent( + key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, opts ClearOptions, +) error { + _, err := p.wrappedIntentWriter.ClearIntent(key, txnDidNotUpdateMeta, txnUUID, nil, opts) return err } // ClearEngineKey implements the Engine interface. -func (p *Pebble) ClearEngineKey(key EngineKey) error { +func (p *Pebble) ClearEngineKey(key EngineKey, opts ClearOptions) error { if len(key.Key) == 0 { return emptyKeyError() } - return p.db.Delete(key.Encode(), pebble.Sync) + if !opts.ValueSizeKnown || !p.settings.Version.ActiveVersionOrEmpty(context.TODO()). 
+		IsActive(clusterversion.V23_2_UseSizedPebblePointTombstones) {
+		return p.db.Delete(key.Encode(), pebble.Sync)
+	}
+	return p.db.DeleteSized(key.Encode(), opts.ValueSize, pebble.Sync)
 }
 
-func (p *Pebble) clear(key MVCCKey) error {
+func (p *Pebble) clear(key MVCCKey, opts ClearOptions) error {
 	if len(key.Key) == 0 {
 		return emptyKeyError()
 	}
-	return p.db.Delete(EncodeMVCCKey(key), pebble.Sync)
+	if !opts.ValueSizeKnown || !p.settings.Version.ActiveVersionOrEmpty(context.TODO()).
+		IsActive(clusterversion.V23_2_UseSizedPebblePointTombstones) {
+		return p.db.Delete(EncodeMVCCKey(key), pebble.Sync)
+	}
+	// Use DeleteSized to propagate the value size.
+	return p.db.DeleteSized(EncodeMVCCKey(key), opts.ValueSize, pebble.Sync)
 }
 
 // SingleClearEngineKey implements the Engine interface.
@@ -2113,6 +2134,9 @@ func (p *Pebble) SetMinVersion(version roachpb.Version) error {
 	var formatVers pebble.FormatMajorVersion
 	// Cases are ordered from newer to older versions.
 	switch {
+	case !version.Less(clusterversion.ByKey(clusterversion.V23_2_PebbleFormatDeleteSizedAndObsolete)):
+		formatVers = pebble.ExperimentalFormatDeleteSizedAndObsolete
+
 	case !version.Less(clusterversion.ByKey(clusterversion.V23_1EnableFlushableIngest)):
 		formatVers = pebble.FormatFlushableIngest
 
@@ -2358,21 +2382,21 @@ func (p *pebbleReadOnly) ApplyBatchRepr(repr []byte, sync bool) error {
 	panic("not implemented")
 }
 
-func (p *pebbleReadOnly) ClearMVCC(key MVCCKey) error {
+func (p *pebbleReadOnly) ClearMVCC(key MVCCKey, opts ClearOptions) error {
 	panic("not implemented")
 }
 
-func (p *pebbleReadOnly) ClearUnversioned(key roachpb.Key) error {
+func (p *pebbleReadOnly) ClearUnversioned(key roachpb.Key, opts ClearOptions) error {
 	panic("not implemented")
 }
 
 func (p *pebbleReadOnly) ClearIntent(
-	key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID,
+	key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, opts ClearOptions,
 ) error {
 	panic("not implemented")
 }
 
-func (p *pebbleReadOnly) ClearEngineKey(key EngineKey) error {
+func (p *pebbleReadOnly) ClearEngineKey(key EngineKey, opts ClearOptions) error {
 	panic("not implemented")
 }
diff --git a/pkg/storage/pebble_batch.go b/pkg/storage/pebble_batch.go
index fcfa7e7789f0..36d45def93c1 100644
--- a/pkg/storage/pebble_batch.go
+++ b/pkg/storage/pebble_batch.go
@@ -14,6 +14,7 @@ import (
 	"context"
 	"sync"
 
+	"github.com/cockroachdb/cockroach/pkg/clusterversion"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/storage/pebbleiter"
@@ -58,6 +59,7 @@ type pebbleBatch struct {
 	iterStatsReporter                iterStatsReporter
 	batchStatsReporter               batchStatsReporter
 	settings                         *cluster.Settings
+	mayWriteSizedDeletes             bool
 	shouldWriteLocalTimestamps       bool
 	shouldWriteLocalTimestampsCached bool
 }
@@ -112,7 +114,15 @@ func newPebbleBatch(
 		iterStatsReporter:  iterStatsReporter,
 		batchStatsReporter: batchStatsReporter,
 		settings:           settings,
+		// NB: We do not use settings.Version.IsActive because we do not
+		// generally have a guarantee that the cluster version has been
+		// initialized. As a part of initializing a store, we use a Batch to
+		// write the store identifier key; this is written before any cluster
+		// version has been initialized.
+		mayWriteSizedDeletes: settings.Version.ActiveVersionOrEmpty(context.TODO()).
+ IsActive(clusterversion.V23_2_UseSizedPebblePointTombstones), } + pb.wrappedIntentWriter = wrapIntentWriter(pb) return pb } @@ -277,43 +287,49 @@ func (p *pebbleBatch) ApplyBatchRepr(repr []byte, sync bool) error { } // ClearMVCC implements the Batch interface. -func (p *pebbleBatch) ClearMVCC(key MVCCKey) error { +func (p *pebbleBatch) ClearMVCC(key MVCCKey, opts ClearOptions) error { if key.Timestamp.IsEmpty() { panic("ClearMVCC timestamp is empty") } - return p.clear(key) + return p.clear(key, opts) } // ClearUnversioned implements the Batch interface. -func (p *pebbleBatch) ClearUnversioned(key roachpb.Key) error { - return p.clear(MVCCKey{Key: key}) +func (p *pebbleBatch) ClearUnversioned(key roachpb.Key, opts ClearOptions) error { + return p.clear(MVCCKey{Key: key}, opts) } // ClearIntent implements the Batch interface. func (p *pebbleBatch) ClearIntent( - key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, + key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, opts ClearOptions, ) error { var err error - p.scratch, err = p.wrappedIntentWriter.ClearIntent(key, txnDidNotUpdateMeta, txnUUID, p.scratch) + p.scratch, err = p.wrappedIntentWriter.ClearIntent(key, txnDidNotUpdateMeta, txnUUID, p.scratch, opts) return err } // ClearEngineKey implements the Batch interface. -func (p *pebbleBatch) ClearEngineKey(key EngineKey) error { +func (p *pebbleBatch) ClearEngineKey(key EngineKey, opts ClearOptions) error { if len(key.Key) == 0 { return emptyKeyError() } p.buf = key.EncodeToBuf(p.buf[:0]) - return p.batch.Delete(p.buf, nil) + if !opts.ValueSizeKnown || !p.mayWriteSizedDeletes { + return p.batch.Delete(p.buf, nil) + } + return p.batch.DeleteSized(p.buf, opts.ValueSize, nil) } -func (p *pebbleBatch) clear(key MVCCKey) error { +func (p *pebbleBatch) clear(key MVCCKey, opts ClearOptions) error { if len(key.Key) == 0 { return emptyKeyError() } p.buf = EncodeMVCCKeyToBuf(p.buf[:0], key) - return p.batch.Delete(p.buf, nil) + if !opts.ValueSizeKnown || !p.mayWriteSizedDeletes { + return p.batch.Delete(p.buf, nil) + } + return p.batch.DeleteSized(p.buf, opts.ValueSize, nil) } // SingleClearEngineKey implements the Batch interface. diff --git a/pkg/storage/pebble_test.go b/pkg/storage/pebble_test.go index d7b33f14b8e8..d686323ed69f 100644 --- a/pkg/storage/pebble_test.go +++ b/pkg/storage/pebble_test.go @@ -801,7 +801,7 @@ func TestPebbleMVCCTimeIntervalWithClears(t *testing.T) { require.NoError(t, eng.Flush()) // Clear a@5 and [c-d)@7 in a separate SST. 
- require.NoError(t, eng.ClearMVCC(pointKey("a", 5))) + require.NoError(t, eng.ClearMVCC(pointKey("a", 5), ClearOptions{})) require.NoError(t, eng.ClearMVCCRangeKey(rangeKey("c", "d", 7))) require.NoError(t, eng.Flush()) diff --git a/pkg/storage/read_as_of_iterator_test.go b/pkg/storage/read_as_of_iterator_test.go index 600d341d42b4..64ddfdb6a8df 100644 --- a/pkg/storage/read_as_of_iterator_test.go +++ b/pkg/storage/read_as_of_iterator_test.go @@ -35,7 +35,8 @@ func TestReadAsOfIterator(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - pebble, err := Open(context.Background(), InMemory(), cluster.MakeClusterSettings(), CacheSize(1<<20 /* 1 MiB */)) + pebble, err := Open(context.Background(), InMemory(), + cluster.MakeTestingClusterSettings(), CacheSize(1<<20 /* 1 MiB */)) require.NoError(t, err) defer pebble.Close() @@ -109,7 +110,8 @@ func TestReadAsOfIteratorSeek(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - pebble, err := Open(context.Background(), InMemory(), cluster.MakeClusterSettings(), CacheSize(1<<20 /* 1 MiB */)) + pebble, err := Open(context.Background(), InMemory(), + cluster.MakeTestingClusterSettings(), CacheSize(1<<20 /* 1 MiB */)) require.NoError(t, err) defer pebble.Close() diff --git a/pkg/storage/sst_writer.go b/pkg/storage/sst_writer.go index 0d46ecae9701..ac812b6bd5cd 100644 --- a/pkg/storage/sst_writer.go +++ b/pkg/storage/sst_writer.go @@ -331,19 +331,19 @@ func (fw *SSTWriter) ApplyBatchRepr(repr []byte, sync bool) error { // not greater than any previous point key passed to this Writer (according to // the comparator configured during writer creation). `Close` cannot have been // called. -func (fw *SSTWriter) ClearMVCC(key MVCCKey) error { +func (fw *SSTWriter) ClearMVCC(key MVCCKey, opts ClearOptions) error { if key.Timestamp.IsEmpty() { panic("ClearMVCC timestamp is empty") } - return fw.clear(key) + return fw.clear(key, opts) } // ClearUnversioned implements the Writer interface. An error is returned if // it is not greater than any previous point key passed to this Writer // (according to the comparator configured during writer creation). `Close` // cannot have been called. -func (fw *SSTWriter) ClearUnversioned(key roachpb.Key) error { - return fw.clear(MVCCKey{Key: key}) +func (fw *SSTWriter) ClearUnversioned(key roachpb.Key, opts ClearOptions) error { + return fw.clear(MVCCKey{Key: key}, opts) } // ClearIntent implements the Writer interface. An error is returned if it is @@ -351,7 +351,7 @@ func (fw *SSTWriter) ClearUnversioned(key roachpb.Key) error { // the comparator configured during writer creation). `Close` cannot have been // called. func (fw *SSTWriter) ClearIntent( - key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, + key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID, opts ClearOptions, ) error { panic("ClearIntent is unsupported") } @@ -360,24 +360,34 @@ func (fw *SSTWriter) ClearIntent( // not greater than any previous point key passed to this Writer (according to // the comparator configured during writer creation). `Close` cannot have been // called. 
-func (fw *SSTWriter) ClearEngineKey(key EngineKey) error { +func (fw *SSTWriter) ClearEngineKey(key EngineKey, opts ClearOptions) error { if fw.fw == nil { return errors.New("cannot call Clear on a closed writer") } fw.scratch = key.EncodeToBuf(fw.scratch[:0]) fw.DataSize += int64(len(key.Key)) + // TODO(jackson): We could use opts.ValueSize if known, but it would require + // additional logic around ensuring the cluster version is at least + // V23_2_UseSizedPebblePointTombstones. It's probably not worth it until we + // can unconditionally use it; I don't believe we ever write point + // tombstones to sstables constructed within Cockroach. return fw.fw.Delete(fw.scratch) } // An error is returned if it is not greater than any previous point key // passed to this Writer (according to the comparator configured during writer // creation). `Close` cannot have been called. -func (fw *SSTWriter) clear(key MVCCKey) error { +func (fw *SSTWriter) clear(key MVCCKey, opts ClearOptions) error { if fw.fw == nil { return errors.New("cannot call Clear on a closed writer") } fw.scratch = EncodeMVCCKeyToBuf(fw.scratch[:0], key) fw.DataSize += int64(len(key.Key)) + // TODO(jackson): We could use opts.ValueSize if known, but it would require + // additional logic around ensuring the cluster version is at least + // V23_2_UseSizedPebblePointTombstones. It's probably not worth it until we + // can unconditionally use it; I don't believe we ever write point + // tombstones to sstables constructed within Cockroach. return fw.fw.Delete(fw.scratch) }
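Taken together, the changes above follow one pattern: when the caller knows the size of the value being cleared (ClearOptions.ValueSizeKnown) and the cluster version permits it (V23_2_UseSizedPebblePointTombstones), the writer issues a sized point tombstone via DeleteSized; otherwise it falls back to a plain Delete. The ForTesting hook then panics at Close if Pebble's MissizedTombstonesCount metric shows any of those size hints were wrong. The sketch below is a minimal, standalone illustration against the Pebble API directly rather than CockroachDB's Engine wrapper; the keys, values, and store setup are illustrative only, and it assumes a Pebble version that exposes DeleteSized and the ExperimentalFormatDeleteSizedAndObsolete format version referenced in this diff.

package main

import (
	"fmt"
	"log"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/vfs"
)

func main() {
	// Open an in-memory store at a format version that understands DELSIZED
	// tombstones (the Pebble-side analogue of the
	// V23_2_PebbleFormatDeleteSizedAndObsolete gate in SetMinVersion).
	db, err := pebble.Open("", &pebble.Options{
		FS:                 vfs.NewMem(),
		FormatMajorVersion: pebble.ExperimentalFormatDeleteSizedAndObsolete,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	key, val := []byte("a"), []byte("some value")
	if err := db.Set(key, val, pebble.Sync); err != nil {
		log.Fatal(err)
	}
	if err := db.Flush(); err != nil {
		log.Fatal(err)
	}

	// A sized delete carries a hint for how large the deleted value is,
	// which lets compactions account for reclaimable space more accurately.
	// Feeding a wrong hint here is the bug class the ForTesting panic is
	// meant to surface.
	if err := db.DeleteSized(key, uint32(len(val)), pebble.Sync); err != nil {
		log.Fatal(err)
	}

	// Compaction is where the hint is reconciled with the value actually
	// dropped, which is why the tests call engine.Compact() before closing.
	if err := db.Compact([]byte("a"), []byte("z"), true /* parallelize */); err != nil {
		log.Fatal(err)
	}
	fmt.Println("missized tombstones:", db.Metrics().Keys.MissizedTombstonesCount)
}

In the CockroachDB code paths this decision is made per write through ClearOptions; the batch variant caches the version check in mayWriteSizedDeletes because, as the NB comment notes, a Batch can be constructed before any cluster version has been initialized.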