diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md index 90fede5f81b1..925e1a816835 100644 --- a/docs/generated/sql/functions.md +++ b/docs/generated/sql/functions.md @@ -3138,6 +3138,8 @@ table. Returns an error if validation fails.

Immutable crdb_internal.trim_tenant_prefix(keys: bytes[]) → bytes[]

This function assumes the given bytes are a CockroachDB key and trims any tenant prefix from the key.

Immutable +crdb_internal.unsafe_clear_gossip_info(key: string) → bool

This function is used only by CockroachDB’s developers for testing purposes.

+
Volatile crdb_internal.validate_session_revival_token(token: bytes) → bool

Validate a token that was created by create_session_revival_token. Intended for testing.

Volatile crdb_internal.validate_ttl_scheduled_jobs() → void

Validate all TTL tables have a valid scheduled job attached.

diff --git a/pkg/gossip/gossip.go b/pkg/gossip/gossip.go index be4370aededc..17e023783b9f 100644 --- a/pkg/gossip/gossip.go +++ b/pkg/gossip/gossip.go @@ -1071,6 +1071,43 @@ func (g *Gossip) GetInfoProto(key string, msg protoutil.Message) error { return protoutil.Unmarshal(bytes, msg) } +// TryClearInfo attempts to clear an info object from the cluster's gossip +// network. It does so by retrieving the object with the corresponding key. If +// one does not exist, there's nothing to do and the method returns false. +// Otherwise, the method re-gossips the same key-value pair with a TTL that is +// long enough to reasonably ensure full propagation to all nodes in the cluster +// but short enough to expire quickly once propagated. +// +// The method is best-effort. It is possible for the info object with the low +// TTL to fail to reach full propagation before reaching its TTL. For instance, +// this is possible during a transient network partition. The effect of this is +// that the existing gossip info object with a higher (or no) TTL would remain +// in the gossip network on some nodes and may eventually propagate back out to +// other nodes once the partition heals. +func (g *Gossip) TryClearInfo(key string) (bool, error) { + // Long enough to propagate to all nodes, short enough to expire quickly. + const ttl = 1 * time.Minute + return g.tryClearInfoWithTTL(key, ttl) +} + +func (g *Gossip) tryClearInfoWithTTL(key string, ttl time.Duration) (bool, error) { + val, err := g.GetInfo(key) + if err != nil { + if errors.HasType(err, KeyNotPresentError{}) { + // Info object not known on this node. We can't force a deletion + // preemptively, e.g. with a poison entry, because we do not have a valid + // value object to populate and consumers may make assumptions about the + // format of the value. + return false, nil + } + return false, err + } + if err := g.AddInfo(key, val, ttl); err != nil { + return false, err + } + return true, nil +} + // InfoOriginatedHere returns true iff the latest info for the provided key // originated on this node. This is useful for ensuring that the system config // is regossiped as soon as possible when its lease changes hands. diff --git a/pkg/gossip/gossip_test.go b/pkg/gossip/gossip_test.go index e22cf2fac60f..a96a731d6340 100644 --- a/pkg/gossip/gossip_test.go +++ b/pkg/gossip/gossip_test.go @@ -848,6 +848,26 @@ func TestGossipPropagation(t *testing.T) { } return nil }) + + mustClear := func(g *Gossip, key string) { + if _, err := g.tryClearInfoWithTTL(key, 3*time.Second); err != nil { + t.Fatal(err) + } + } + + // Clear both entries. Verify that both are removed from the gossip network. + mustClear(local, "local") + mustClear(remote, "remote") + testutils.SucceedsSoon(t, func() error { + for gName, g := range map[string]*Gossip{"local": local, "remote": remote} { + for _, key := range []string{"local", "remote"} { + if getInfo(g, key) != nil { + return fmt.Errorf("%s info %q not cleared", gName, key) + } + } + } + return nil + }) } // Test whether propagation of an info that was generated by a prior diff --git a/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go b/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go index 5971e04b3a91..734f2ced5240 100644 --- a/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go @@ -848,7 +848,7 @@ func TestEvalAddSSTable(t *testing.T) { require.NoError(t, err) require.True(t, v.IsTombstone(), "MVCC range keys must be tombstones") require.NoError(t, storage.MVCCDeleteRangeUsingTombstone( - ctx, b, nil, kv.RangeKey.StartKey, kv.RangeKey.EndKey, kv.RangeKey.Timestamp, v.MVCCValueHeader.LocalTimestamp, nil, nil, 0)) + ctx, b, nil, kv.RangeKey.StartKey, kv.RangeKey.EndKey, kv.RangeKey.Timestamp, v.MVCCValueHeader.LocalTimestamp, nil, nil, 0, nil)) default: t.Fatalf("unknown KV type %T", kv) } diff --git a/pkg/kv/kvserver/batcheval/cmd_clear_range_test.go b/pkg/kv/kvserver/batcheval/cmd_clear_range_test.go index 643ff96419a3..4926505b95b9 100644 --- a/pkg/kv/kvserver/batcheval/cmd_clear_range_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_clear_range_test.go @@ -145,7 +145,7 @@ func TestCmdClearRange(t *testing.T) { for _, rk := range rangeTombstones { localTS := hlc.ClockTimestamp{WallTime: rk.Timestamp.WallTime - 1e9} // give range key a value if > 0 require.NoError(t, storage.MVCCDeleteRangeUsingTombstone( - ctx, eng, nil, rk.StartKey, rk.EndKey, rk.Timestamp, localTS, nil, nil, 0)) + ctx, eng, nil, rk.StartKey, rk.EndKey, rk.Timestamp, localTS, nil, nil, 0, nil)) } // Write some random point keys within the cleared span, above the range tombstones. diff --git a/pkg/kv/kvserver/batcheval/cmd_delete_range.go b/pkg/kv/kvserver/batcheval/cmd_delete_range.go index eb4a0c0eb1f9..e6c3179fbaaa 100644 --- a/pkg/kv/kvserver/batcheval/cmd_delete_range.go +++ b/pkg/kv/kvserver/batcheval/cmd_delete_range.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/errors" ) @@ -93,10 +94,20 @@ func DeleteRange( args.Key, args.EndKey, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey()) maxIntents := storage.MaxIntentsPerWriteIntentError.Get(&cArgs.EvalCtx.ClusterSettings().SV) + // If no predicate parameters are passed, use the fast path. If we're + // deleting the entire Raft range, use an even faster path that avoids a + // point key scan to update MVCC stats. if args.Predicates == (roachpb.DeleteRangePredicates{}) { - // If no predicate parameters are passed, use the fast path. + var statsCovered *enginepb.MVCCStats + if args.Key.Equal(desc.StartKey.AsRawKey()) && args.EndKey.Equal(desc.EndKey.AsRawKey()) { + // NB: We take the fast path even if stats are estimates, because the + // slow path will likely end up with similarly poor stats anyway. + s := cArgs.EvalCtx.GetMVCCStats() + statsCovered = &s + } err := storage.MVCCDeleteRangeUsingTombstone(ctx, readWriter, cArgs.Stats, - args.Key, args.EndKey, h.Timestamp, cArgs.Now, leftPeekBound, rightPeekBound, maxIntents) + args.Key, args.EndKey, h.Timestamp, cArgs.Now, leftPeekBound, rightPeekBound, maxIntents, + statsCovered) return result.Result{}, err } diff --git a/pkg/kv/kvserver/batcheval/cmd_delete_range_test.go b/pkg/kv/kvserver/batcheval/cmd_delete_range_test.go index bae99f27b225..b8cfb058cdde 100644 --- a/pkg/kv/kvserver/batcheval/cmd_delete_range_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_delete_range_test.go @@ -62,9 +62,9 @@ func TestDeleteRangeTombstone(t *testing.T) { require.NoError(t, storage.MVCCPut(ctx, rw, nil, roachpb.Key("d"), hlc.Timestamp{WallTime: 2e9}, localTS, roachpb.MakeValueFromString("d2"), nil)) require.NoError(t, storage.MVCCDelete(ctx, rw, nil, roachpb.Key("d"), hlc.Timestamp{WallTime: 3e9}, localTS, nil)) require.NoError(t, storage.MVCCPut(ctx, rw, nil, roachpb.Key("i"), hlc.Timestamp{WallTime: 5e9}, localTS, roachpb.MakeValueFromString("i5"), &txn)) - require.NoError(t, storage.MVCCDeleteRangeUsingTombstone(ctx, rw, nil, roachpb.Key("f"), roachpb.Key("h"), hlc.Timestamp{WallTime: 3e9}, localTS, nil, nil, 0)) - require.NoError(t, storage.MVCCDeleteRangeUsingTombstone(ctx, rw, nil, roachpb.Key("Z"), roachpb.Key("a"), hlc.Timestamp{WallTime: 100e9}, localTS, nil, nil, 0)) - require.NoError(t, storage.MVCCDeleteRangeUsingTombstone(ctx, rw, nil, roachpb.Key("z"), roachpb.Key("|"), hlc.Timestamp{WallTime: 100e9}, localTS, nil, nil, 0)) + require.NoError(t, storage.MVCCDeleteRangeUsingTombstone(ctx, rw, nil, roachpb.Key("f"), roachpb.Key("h"), hlc.Timestamp{WallTime: 3e9}, localTS, nil, nil, 0, nil)) + require.NoError(t, storage.MVCCDeleteRangeUsingTombstone(ctx, rw, nil, roachpb.Key("Z"), roachpb.Key("a"), hlc.Timestamp{WallTime: 100e9}, localTS, nil, nil, 0, nil)) + require.NoError(t, storage.MVCCDeleteRangeUsingTombstone(ctx, rw, nil, roachpb.Key("z"), roachpb.Key("|"), hlc.Timestamp{WallTime: 100e9}, localTS, nil, nil, 0, nil)) } now := hlc.ClockTimestamp{Logical: 9} diff --git a/pkg/kv/kvserver/batcheval/cmd_refresh_range_test.go b/pkg/kv/kvserver/batcheval/cmd_refresh_range_test.go index 0c25563b2342..4d26aaf9ce7c 100644 --- a/pkg/kv/kvserver/batcheval/cmd_refresh_range_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_refresh_range_test.go @@ -42,7 +42,7 @@ func TestRefreshRange(t *testing.T) { require.NoError(t, storage.MVCCPut( ctx, eng, nil, roachpb.Key("c"), hlc.Timestamp{WallTime: 5}, hlc.ClockTimestamp{}, roachpb.Value{}, nil)) require.NoError(t, storage.MVCCDeleteRangeUsingTombstone( - ctx, eng, nil, roachpb.Key("d"), roachpb.Key("f"), hlc.Timestamp{WallTime: 7}, hlc.ClockTimestamp{}, nil, nil, 0)) + ctx, eng, nil, roachpb.Key("d"), roachpb.Key("f"), hlc.Timestamp{WallTime: 7}, hlc.ClockTimestamp{}, nil, nil, 0, nil)) testcases := map[string]struct { start, end string diff --git a/pkg/kv/kvserver/gc/data_distribution_test.go b/pkg/kv/kvserver/gc/data_distribution_test.go index 17183239277c..954433b80883 100644 --- a/pkg/kv/kvserver/gc/data_distribution_test.go +++ b/pkg/kv/kvserver/gc/data_distribution_test.go @@ -62,7 +62,7 @@ func (ds dataDistribution) setupTest( "invalid test data, range can't be used together with value: key=%s, rangeKey=%s", kv.Key.String(), rangeKey.String()) err := storage.MVCCDeleteRangeUsingTombstone(ctx, eng, &ms, rangeKey.StartKey, - rangeKey.EndKey, rangeKey.Timestamp, hlc.ClockTimestamp{}, nil, nil, 1) + rangeKey.EndKey, rangeKey.Timestamp, hlc.ClockTimestamp{}, nil, nil, 1, nil) require.NoError(t, err, "failed to put delete range") } else if txn == nil { if kv.Key.Timestamp.IsEmpty() { diff --git a/pkg/kv/kvserver/replica_consistency_test.go b/pkg/kv/kvserver/replica_consistency_test.go index 8fd1cdc7812b..b965485dfaaf 100644 --- a/pkg/kv/kvserver/replica_consistency_test.go +++ b/pkg/kv/kvserver/replica_consistency_test.go @@ -181,7 +181,7 @@ func TestReplicaChecksumSHA512(t *testing.T) { if len(endKey) > 0 { require.NoError(t, storage.MVCCDeleteRangeUsingTombstone( - ctx, eng, nil, key, endKey, ts, localTS, nil, nil, 0)) + ctx, eng, nil, key, endKey, ts, localTS, nil, nil, 0, nil)) } else { require.NoError(t, storage.MVCCPut(ctx, eng, nil, key, ts, localTS, value, nil)) } diff --git a/pkg/sql/BUILD.bazel b/pkg/sql/BUILD.bazel index c652503973a9..eecf7366e067 100644 --- a/pkg/sql/BUILD.bazel +++ b/pkg/sql/BUILD.bazel @@ -110,6 +110,7 @@ go_library( "explain_vec.go", "export.go", "filter.go", + "gossip.go", "grant_revoke.go", "grant_revoke_system.go", "grant_role.go", diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 1501120152d6..c1232c85a6e8 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -2709,6 +2709,7 @@ func (ex *connExecutor) initEvalCtx(ctx context.Context, evalCtx *extendedEvalCo Tenant: p, Regions: p, JoinTokenCreator: p, + Gossip: p, PreparedStatementState: &ex.extraTxnState.prepStmtsNamespace, SessionDataStack: ex.sessionDataStack, ReCache: ex.server.reCache, diff --git a/pkg/sql/gossip.go b/pkg/sql/gossip.go new file mode 100644 index 000000000000..c4a16243678c --- /dev/null +++ b/pkg/sql/gossip.go @@ -0,0 +1,25 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package sql + +import "context" + +// TryClearGossipInfo implements the tree.GossipOperator interface. +func (p *planner) TryClearGossipInfo(ctx context.Context, key string) (bool, error) { + g, err := p.ExecCfg().Gossip.OptionalErr(0 /* issue */) + if err != nil { + return false, err + } + if err := p.RequireAdminRole(ctx, "try clear gossip info"); err != nil { + return false, err + } + return g.TryClearInfo(key) +} diff --git a/pkg/sql/logictest/testdata/logic_test/crdb_internal b/pkg/sql/logictest/testdata/logic_test/crdb_internal index 0583c5af8859..0f0fabc69a1b 100644 --- a/pkg/sql/logictest/testdata/logic_test/crdb_internal +++ b/pkg/sql/logictest/testdata/logic_test/crdb_internal @@ -1146,3 +1146,9 @@ query I SELECT crdb_internal.num_inverted_index_entries(NULL::STRING, 0) ---- 0 + +# Exercise unsafe gossip builtin functions. +query B +SELECT crdb_internal.unsafe_clear_gossip_info('unknown key') +---- +false diff --git a/pkg/sql/logictest/testdata/logic_test/pg_catalog b/pkg/sql/logictest/testdata/logic_test/pg_catalog index 2224adf84dc4..52f969f482be 100644 --- a/pkg/sql/logictest/testdata/logic_test/pg_catalog +++ b/pkg/sql/logictest/testdata/logic_test/pg_catalog @@ -4634,7 +4634,7 @@ FROM pg_proc p JOIN pg_type t ON t.typinput = p.oid WHERE t.typname = '_int4' ---- -2004 array_in array_in +2005 array_in array_in ## #16285 ## int2vectors should be 0-indexed diff --git a/pkg/sql/planner.go b/pkg/sql/planner.go index be5eb1d2c5da..a2c9a043fafa 100644 --- a/pkg/sql/planner.go +++ b/pkg/sql/planner.go @@ -385,6 +385,7 @@ func newInternalPlanner( p.extendedEvalCtx.Tenant = p p.extendedEvalCtx.Regions = p p.extendedEvalCtx.JoinTokenCreator = p + p.extendedEvalCtx.Gossip = p p.extendedEvalCtx.ClusterID = execCfg.NodeInfo.LogicalClusterID() p.extendedEvalCtx.ClusterName = execCfg.RPCContext.ClusterName() p.extendedEvalCtx.NodeID = execCfg.NodeInfo.NodeID diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 5b9eb42ec70e..50a9605c01ad 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4988,6 +4988,27 @@ value if you rely on the HLC for accuracy.`, }, ), + "crdb_internal.unsafe_clear_gossip_info": makeBuiltin( + tree.FunctionProperties{Category: builtinconstants.CategorySystemInfo}, + tree.Overload{ + Types: tree.ArgTypes{{"key", types.String}}, + ReturnType: tree.FixedReturnType(types.Bool), + Fn: func(ctx *eval.Context, args tree.Datums) (tree.Datum, error) { + key, ok := tree.AsDString(args[0]) + if !ok { + return nil, errors.Newf("expected string value, got %T", args[0]) + } + ok, err := ctx.Gossip.TryClearGossipInfo(ctx.Context, string(key)) + if err != nil { + return nil, err + } + return tree.MakeDBool(tree.DBool(ok)), nil + }, + Info: "This function is used only by CockroachDB's developers for testing purposes.", + Volatility: volatility.Volatile, + }, + ), + "crdb_internal.encode_key": makeBuiltin( tree.FunctionProperties{Category: builtinconstants.CategorySystemInfo}, tree.Overload{ diff --git a/pkg/sql/sem/eval/context.go b/pkg/sql/sem/eval/context.go index afb7694ed9a4..abff2eabbff6 100644 --- a/pkg/sql/sem/eval/context.go +++ b/pkg/sql/sem/eval/context.go @@ -153,6 +153,8 @@ type Context struct { JoinTokenCreator JoinTokenCreator + Gossip GossipOperator + PreparedStatementState PreparedStatementState // The transaction in which the statement is executing. diff --git a/pkg/sql/sem/eval/deps.go b/pkg/sql/sem/eval/deps.go index 9df07272cfc1..42b2b4f3368d 100644 --- a/pkg/sql/sem/eval/deps.go +++ b/pkg/sql/sem/eval/deps.go @@ -531,6 +531,15 @@ type JoinTokenCreator interface { CreateJoinToken(ctx context.Context) (string, error) } +// GossipOperator is capable of manipulating the cluster's gossip network. The +// methods will return errors when run by any tenant other than the system +// tenant. +type GossipOperator interface { + // TryClearGossipInfo attempts to clear an info object from the cluster's + // gossip network. + TryClearGossipInfo(ctx context.Context, key string) (bool, error) +} + // SQLStatsController is an interface embedded in EvalCtx which can be used by // the builtins to reset SQL stats in the cluster. This interface is introduced // to avoid circular dependency. diff --git a/pkg/storage/bench_pebble_test.go b/pkg/storage/bench_pebble_test.go index 760a583bf452..15108cc37e64 100644 --- a/pkg/storage/bench_pebble_test.go +++ b/pkg/storage/bench_pebble_test.go @@ -361,6 +361,24 @@ func BenchmarkMVCCDeleteRange_Pebble(b *testing.B) { } } +func BenchmarkMVCCDeleteRangeUsingTombstone_Pebble(b *testing.B) { + skip.UnderShort(b) + ctx := context.Background() + for _, numKeys := range []int{1000, 10000, 100000} { + b.Run(fmt.Sprintf("numKeys=%d", numKeys), func(b *testing.B) { + for _, valueSize := range []int{64} { + b.Run(fmt.Sprintf("valueSize=%d", valueSize), func(b *testing.B) { + for _, entireRange := range []bool{false, true} { + b.Run(fmt.Sprintf("entireRange=%t", entireRange), func(b *testing.B) { + runMVCCDeleteRangeUsingTombstone(ctx, b, setupMVCCPebble, numKeys, valueSize, entireRange) + }) + } + }) + } + }) + } +} + func BenchmarkClearMVCCVersions_Pebble(b *testing.B) { skip.UnderShort(b) ctx := context.Background() diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go index 096aaee1ce6b..218bf36710a8 100644 --- a/pkg/storage/bench_test.go +++ b/pkg/storage/bench_test.go @@ -718,7 +718,7 @@ func setupMVCCData( startKey := roachpb.Key(encoding.EncodeUvarintAscending([]byte("key-"), uint64(start))) endKey := roachpb.Key(encoding.EncodeUvarintAscending([]byte("key-"), uint64(end))) require.NoError(b, MVCCDeleteRangeUsingTombstone( - ctx, batch, nil, startKey, endKey, ts, hlc.ClockTimestamp{}, nil, nil, 0)) + ctx, batch, nil, startKey, endKey, ts, hlc.ClockTimestamp{}, nil, nil, 0, nil)) } require.NoError(b, batch.Commit(false /* sync */)) batch.Close() @@ -1306,6 +1306,73 @@ func runMVCCDeleteRange(ctx context.Context, b *testing.B, emk engineMaker, valu } } +func runMVCCDeleteRangeUsingTombstone( + ctx context.Context, b *testing.B, emk engineMaker, numKeys int, valueBytes int, entireRange bool, +) { + eng, dir := setupMVCCData(ctx, b, emk, benchDataOptions{ + numVersions: 1, + numKeys: numKeys, + valueBytes: valueBytes, + }) + require.NoError(b, eng.Compact()) + + var msCovered *enginepb.MVCCStats + var leftPeekBound, rightPeekBound roachpb.Key + if entireRange { + iter := eng.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: keys.LocalMax, + UpperBound: keys.MaxKey, + }) + ms, err := ComputeStatsForRange(iter, keys.LocalMax, keys.MaxKey, 0) + iter.Close() + require.NoError(b, err) + + leftPeekBound = keys.LocalMax + rightPeekBound = keys.MaxKey + msCovered = &ms + } + + eng.Close() + + b.SetBytes(int64(numKeys) * int64(overhead+valueBytes)) + b.StopTimer() + b.ResetTimer() + + locDirty := dir + "_dirty" + + for i := 0; i < b.N; i++ { + if err := os.RemoveAll(locDirty); err != nil { + b.Fatal(err) + } + if err := fileutil.CopyDir(dir, locDirty); err != nil { + b.Fatal(err) + } + func() { + eng := emk(b, locDirty) + defer eng.Close() + + b.StartTimer() + if err := MVCCDeleteRangeUsingTombstone( + ctx, + eng, + &enginepb.MVCCStats{}, + keys.LocalMax, + roachpb.KeyMax, + hlc.MaxTimestamp, + hlc.ClockTimestamp{}, + leftPeekBound, + rightPeekBound, + 0, + msCovered, + ); err != nil { + b.Fatal(err) + } + b.StopTimer() + }() + } +} + func runClearRange( ctx context.Context, b *testing.B, diff --git a/pkg/storage/metamorphic/operations.go b/pkg/storage/metamorphic/operations.go index ea5602891b65..32ec32291fc2 100644 --- a/pkg/storage/metamorphic/operations.go +++ b/pkg/storage/metamorphic/operations.go @@ -332,8 +332,8 @@ func (m mvccDeleteRangeUsingRangeTombstoneOp) run(ctx context.Context) string { return "no-op due to no non-conflicting key range" } - err := storage.MVCCDeleteRangeUsingTombstone(ctx, writer, nil, m.key, m.endKey, - m.ts, hlc.ClockTimestamp{}, m.key, m.endKey, math.MaxInt64 /* maxIntents */) + err := storage.MVCCDeleteRangeUsingTombstone(ctx, writer, nil, m.key, m.endKey, m.ts, + hlc.ClockTimestamp{}, m.key, m.endKey, math.MaxInt64 /* maxIntents */, nil /* msCovered */) if err != nil { return fmt.Sprintf("error: %s", err) } diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 25b974485f15..8ae21870591a 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -600,6 +600,17 @@ func updateStatsOnRangeKeyCover(ts hlc.Timestamp, key MVCCKey, valueRaw []byte) return ms } +// updateStatsOnRangeKeyCoverStats updates MVCCStats for when an MVCC range +// tombstone covers existing data whose stats are already known. +func updateStatsOnRangeKeyCoverStats(ts hlc.Timestamp, cur enginepb.MVCCStats) enginepb.MVCCStats { + var ms enginepb.MVCCStats + ms.AgeTo(ts.WallTime) + ms.ContainsEstimates += cur.ContainsEstimates + ms.LiveCount -= cur.LiveCount + ms.LiveBytes -= cur.LiveBytes + return ms +} + // updateStatsOnRangeKeyMerge updates MVCCStats for a merge of two MVCC range // key stacks. Both sides of the merge must have identical versions. The merge // can happen either to the right or the left, only the merge key (i.e. the key @@ -2763,7 +2774,7 @@ func MVCCPredicateDeleteRange( batchByteSize+runByteSize >= maxBatchByteSize { if err := MVCCDeleteRangeUsingTombstone(ctx, rw, ms, runStart, runEnd.Next(), endTime, localTimestamp, leftPeekBound, rightPeekBound, - maxIntents); err != nil { + maxIntents, nil); err != nil { return err } batchByteSize += int64(MVCCRangeKey{StartKey: runStart, EndKey: runEnd, Timestamp: endTime}.EncodedSize()) @@ -2907,6 +2918,15 @@ func MVCCPredicateDeleteRange( // range tombstones that we'll merge or overlap with. These are provided to // prevent the command from reading outside of the CRDB range bounds and latch // bounds. nil means no bounds. +// +// If msCovered is given, it must contain the current stats of the data that +// will be covered by the MVCC range tombstone. This avoids scanning across all +// point keys in the span, but will still do a time-bound scan to check for +// newer point keys that we conflict with. +// +// When deleting an entire Raft range, passing the current MVCCStats as +// msCovered and setting left/rightPeekBound to start/endKey will make the +// deletion significantly faster. func MVCCDeleteRangeUsingTombstone( ctx context.Context, rw ReadWriter, @@ -2916,6 +2936,7 @@ func MVCCDeleteRangeUsingTombstone( localTimestamp hlc.ClockTimestamp, leftPeekBound, rightPeekBound roachpb.Key, maxIntents int64, + msCovered *enginepb.MVCCStats, ) error { // Validate the range key. We must do this first, to catch e.g. any bound violations. rangeKey := MVCCRangeKey{StartKey: startKey, EndKey: endKey, Timestamp: timestamp} @@ -2941,20 +2962,43 @@ func MVCCDeleteRangeUsingTombstone( return &roachpb.WriteIntentError{Intents: intents} } - // First, set up an iterator covering only the range key span itself, and scan - // it to find conflicts and update MVCC stats within it. - // - // TODO(erikgrinaker): This introduces an O(n) read penalty. We should - // optimize it, in particular by making this optional in cases where we're - // deleting an entire range and the stats can be computed without the scan. - // However, in that case we'll still have to do a time-bounded scan to check - // for conflicts. - iter := rw.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + // If we're omitting point keys in the stats/conflict scan below, we need to + // do a separate time-bound scan for point key conflicts. + if msCovered != nil { + if err := func() error { + iter := NewMVCCIncrementalIterator(rw, MVCCIncrementalIterOptions{ + KeyTypes: IterKeyTypePointsOnly, + StartKey: startKey, + EndKey: endKey, + StartTime: timestamp.Prev(), // make inclusive + }) + defer iter.Close() + iter.SeekGE(MVCCKey{Key: startKey}) + if ok, err := iter.Valid(); err != nil { + return err + } else if ok { + key := iter.UnsafeKey().Clone() + return roachpb.NewWriteTooOldError(timestamp, key.Timestamp.Next(), key.Key.Clone()) + } + return nil + }(); err != nil { + return err + } + } + + // Scan for conflicts and MVCC stats updates. We can omit point keys from the + // scan if stats are already known for the live data. + iterOpts := IterOptions{ KeyTypes: IterKeyTypePointsAndRanges, LowerBound: startKey, UpperBound: endKey, RangeKeyMaskingBelow: timestamp, // lower point keys have already been accounted for - }) + } + if msCovered != nil { + iterOpts.KeyTypes = IterKeyTypeRangesOnly + iterOpts.RangeKeyMaskingBelow = hlc.Timestamp{} + } + iter := rw.NewMVCCIterator(MVCCKeyIterKind, iterOpts) defer iter.Close() iter.SeekGE(MVCCKey{Key: startKey}) @@ -3029,7 +3073,7 @@ func MVCCDeleteRangeUsingTombstone( // Check if the range key will merge with or fragment any existing range keys // at the bounds, and adjust stats accordingly. - if ms != nil { + if ms != nil && (!leftPeekBound.Equal(startKey) || !rightPeekBound.Equal(endKey)) { if rightPeekBound == nil { rightPeekBound = keys.MaxKey } @@ -3087,6 +3131,12 @@ func MVCCDeleteRangeUsingTombstone( } } + // If we're given MVCC stats for the covered data, mark it as deleted at the + // current timestamp. + if ms != nil && msCovered != nil { + ms.Add(updateStatsOnRangeKeyCoverStats(timestamp, *msCovered)) + } + if err := rw.PutMVCCRangeKey(rangeKey, value); err != nil { return err } diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index 9b547f380885..1d482376f2e5 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -48,7 +48,11 @@ import ( "github.com/stretchr/testify/require" ) -var sstIterVerify = util.ConstantWithMetamorphicTestBool("mvcc-histories-sst-iter-verify", false) +var ( + cmdDeleteRangeTombstoneKnownStats = util.ConstantWithMetamorphicTestBool( + "mvcc-histories-deleterange-tombstome-known-stats", false) + sstIterVerify = util.ConstantWithMetamorphicTestBool("mvcc-histories-sst-iter-verify", false) +) // TestMVCCHistories verifies that sequences of MVCC reads and writes // perform properly. @@ -74,7 +78,7 @@ var sstIterVerify = util.ConstantWithMetamorphicTestBool("mvcc-histories-sst-ite // cput [t=] [ts=[,]] [localTs=[,]] [resolve [status=]] k= v= [raw] [cond=] // del [t=] [ts=[,]] [localTs=[,]] [resolve [status=]] k= // del_range [t=] [ts=[,]] [localTs=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] -// del_range_ts [ts=[,]] [localTs=[,]] k= end= +// del_range_ts [ts=[,]] [localTs=[,]] k= end= [noCoveredStats] // del_range_pred [ts=[,]] [localTs=[,]] k= end= [startTime=,max=,maxBytes=,rangeThreshold=] // increment [t=] [ts=[,]] [localTs=[,]] [resolve [status=]] k= [inc=] // initput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [failOnTombstones] @@ -1009,8 +1013,23 @@ func cmdDeleteRangeTombstone(e *evalCtx) error { ts := e.getTs(nil) localTs := hlc.ClockTimestamp(e.getTsWithName("localTs")) + var msCovered *enginepb.MVCCStats + if cmdDeleteRangeTombstoneKnownStats && !e.hasArg("noCoveredStats") { + iter := e.engine.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: key, + UpperBound: endKey, + }) + ms, err := ComputeStatsForRange(iter, key, endKey, ts.WallTime) + iter.Close() + if err != nil { + return err + } + msCovered = &ms + } + return e.withWriter("del_range_ts", func(rw ReadWriter) error { - return MVCCDeleteRangeUsingTombstone(e.ctx, rw, e.ms, key, endKey, ts, localTs, nil, nil, 0) + return MVCCDeleteRangeUsingTombstone(e.ctx, rw, e.ms, key, endKey, ts, localTs, nil, nil, 0, msCovered) }) } diff --git a/pkg/storage/mvcc_test.go b/pkg/storage/mvcc_test.go index 3e4d8d0e3d31..61d36ab01543 100644 --- a/pkg/storage/mvcc_test.go +++ b/pkg/storage/mvcc_test.go @@ -4822,15 +4822,15 @@ func TestMVCCGarbageCollect(t *testing.T) { } } if err := MVCCDeleteRangeUsingTombstone(ctx, engine, ms, roachpb.Key("r"), - roachpb.Key("r-del").Next(), ts3, hlc.ClockTimestamp{}, nil, nil, 0); err != nil { + roachpb.Key("r-del").Next(), ts3, hlc.ClockTimestamp{}, nil, nil, 0, nil); err != nil { t.Fatal(err) } if err := MVCCDeleteRangeUsingTombstone(ctx, engine, ms, roachpb.Key("t"), - roachpb.Key("u").Next(), ts2, hlc.ClockTimestamp{}, nil, nil, 0); err != nil { + roachpb.Key("u").Next(), ts2, hlc.ClockTimestamp{}, nil, nil, 0, nil); err != nil { t.Fatal(err) } if err := MVCCDeleteRangeUsingTombstone(ctx, engine, ms, roachpb.Key("t"), - roachpb.Key("u").Next(), ts3, hlc.ClockTimestamp{}, nil, nil, 0); err != nil { + roachpb.Key("u").Next(), ts3, hlc.ClockTimestamp{}, nil, nil, 0, nil); err != nil { t.Fatal(err) } if log.V(1) { @@ -5263,7 +5263,7 @@ func (d rangeTestData) populateEngine( ts = v.point.Key.Timestamp } else { require.NoError(t, MVCCDeleteRangeUsingTombstone(ctx, engine, ms, v.rangeTombstone.StartKey, - v.rangeTombstone.EndKey, v.rangeTombstone.Timestamp, hlc.ClockTimestamp{}, nil, nil, 0), + v.rangeTombstone.EndKey, v.rangeTombstone.Timestamp, hlc.ClockTimestamp{}, nil, nil, 0, nil), "failed to insert range tombstone into engine (%s)", v.rangeTombstone.String()) ts = v.rangeTombstone.Timestamp } diff --git a/pkg/storage/testdata/mvcc_histories/range_tombstone_writes b/pkg/storage/testdata/mvcc_histories/range_tombstone_writes index 1a5d13cc649b..3c8f6e1ac780 100644 --- a/pkg/storage/testdata/mvcc_histories/range_tombstone_writes +++ b/pkg/storage/testdata/mvcc_histories/range_tombstone_writes @@ -373,9 +373,10 @@ meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 data: "i"/7.000000000,0 -> /BYTES/i7 error: (*roachpb.WriteIntentError:) conflicting intents on "i" -# Writing above an inline value should error. +# Writing above an inline value should error. We disable passing covered MVCC +# stats in metamorphic tests because it changes the error message. run error -del_range_ts k=h end=i ts=3 +del_range_ts k=h end=i ts=3 noCoveredStats ---- >> at end: rangekey: {k-p}/[4.000000000,0=/]