From 65bb6cc1714d09f8787cf4403ae7d407ee378d6c Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Thu, 28 Sep 2023 14:26:40 -0400 Subject: [PATCH] kvserver: Add cluster setting to use excise in snapshots This change adds a cluster setting, `kv.snapshot_receiver.excise.enabled`, to use IngestAndExcise for the replicated/user-key portion of a replica's contents instead of rangedels. This reduces write-amp as rangedels/rangekeydels have to be compacted while an excise shrinks sstables into virtual sstables to clear out contents of a replica immediately. At the moment, this is an experimental feature and should be used with caution. Epic: none Release note: None --- docs/generated/settings/settings.html | 1 + pkg/kv/kvserver/mvcc_gc_queue.go | 2 +- pkg/kv/kvserver/replica_consistency.go | 2 +- pkg/kv/kvserver/replica_raftstorage.go | 2 +- pkg/kv/kvserver/store_snapshot.go | 6 ++++-- pkg/storage/pebble.go | 27 +++++++++++++++++++++++++- 6 files changed, 34 insertions(+), 6 deletions(-) diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index b8a21c736fa0..0febaf6cb680 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -77,6 +77,7 @@
kv.replica_stats.addsst_request_size_factor
integer50000the divisor that is applied to addsstable request sizes, then recorded in a leaseholders QPS; 0 means all requests are treated as cost 1Dedicated/Self-Hosted
kv.replication_reports.interval
duration1m0sthe frequency for generating the replication_constraint_stats, replication_stats_report and replication_critical_localities reports (set to 0 to disable)Dedicated/Self-Hosted
kv.snapshot_rebalance.max_rate
byte size32 MiBthe rate limit (bytes/sec) to use for rebalance and upreplication snapshotsDedicated/Self-Hosted +
kv.snapshot_receiver.excise.enabled
booleanfalseset to true to use excises instead of range deletions for KV snapshotsDedicated/Self-Hosted
kv.transaction.max_intents_bytes
integer4194304maximum number of bytes used to track locks in transactionsServerless/Dedicated/Self-Hosted
kv.transaction.max_refresh_spans_bytes
integer4194304maximum number of bytes used to track refresh spans in serializable transactionsServerless/Dedicated/Self-Hosted
kv.transaction.reject_over_max_intents_budget.enabled
booleanfalseif set, transactions that exceed their lock tracking budget (kv.transaction.max_intents_bytes) are rejected instead of having their lock spans imprecisely compressedServerless/Dedicated/Self-Hosted diff --git a/pkg/kv/kvserver/mvcc_gc_queue.go b/pkg/kv/kvserver/mvcc_gc_queue.go index 9baf40681984..a00d12ca30b4 100644 --- a/pkg/kv/kvserver/mvcc_gc_queue.go +++ b/pkg/kv/kvserver/mvcc_gc_queue.go @@ -720,7 +720,7 @@ func (mgcq *mvccGCQueue) process( } var snap storage.Reader - if repl.store.cfg.SharedStorageEnabled || storage.UseEFOS.Get(&repl.ClusterSettings().SV) { + if repl.store.cfg.SharedStorageEnabled || storage.ShouldUseEFOS(&repl.ClusterSettings().SV) { efos := repl.store.TODOEngine().NewEventuallyFileOnlySnapshot(rditer.MakeReplicatedKeySpans(desc)) if util.RaceEnabled { ss := rditer.MakeReplicatedKeySpanSet(desc) diff --git a/pkg/kv/kvserver/replica_consistency.go b/pkg/kv/kvserver/replica_consistency.go index f9c9e296d15e..fe7dc55d482f 100644 --- a/pkg/kv/kvserver/replica_consistency.go +++ b/pkg/kv/kvserver/replica_consistency.go @@ -655,7 +655,7 @@ func (r *Replica) computeChecksumPostApply( // Raft-consistent (i.e. not in the middle of an AddSSTable). spans := rditer.MakeReplicatedKeySpans(&desc) var snap storage.Reader - if r.store.cfg.SharedStorageEnabled || storage.UseEFOS.Get(&r.ClusterSettings().SV) { + if r.store.cfg.SharedStorageEnabled || storage.ShouldUseEFOS(&r.ClusterSettings().SV) { efos := r.store.TODOEngine().NewEventuallyFileOnlySnapshot(spans) if util.RaceEnabled { ss := rditer.MakeReplicatedKeySpanSet(&desc) diff --git a/pkg/kv/kvserver/replica_raftstorage.go b/pkg/kv/kvserver/replica_raftstorage.go index 4db097d459c4..0c0d26357e75 100644 --- a/pkg/kv/kvserver/replica_raftstorage.go +++ b/pkg/kv/kvserver/replica_raftstorage.go @@ -240,7 +240,7 @@ func (r *Replica) GetSnapshot( var snap storage.Reader var startKey roachpb.RKey r.raftMu.Lock() - if r.store.cfg.SharedStorageEnabled || storage.UseEFOS.Get(&r.ClusterSettings().SV) { + if r.store.cfg.SharedStorageEnabled || storage.ShouldUseEFOS(&r.ClusterSettings().SV) { var ss *spanset.SpanSet r.mu.RLock() spans := rditer.MakeAllKeySpans(r.mu.state.Desc) // needs unreplicated to access Raft state diff --git a/pkg/kv/kvserver/store_snapshot.go b/pkg/kv/kvserver/store_snapshot.go index 07d070aca828..0fd62b969c53 100644 --- a/pkg/kv/kvserver/store_snapshot.go +++ b/pkg/kv/kvserver/store_snapshot.go @@ -15,6 +15,7 @@ import ( "io" "time" + "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/storepool" @@ -499,8 +500,9 @@ func (kvSS *kvBatchSnapshotStrategy) Receive( // TODO(jeffreyxiao): Re-evaluate as the default range size grows. keyRanges := rditer.MakeReplicatedKeySpans(header.State.Desc) - doExcise := header.SharedReplicate - if doExcise && !s.cfg.SharedStorageEnabled { + doExcise := header.SharedReplicate || (storage.UseExciseForSnapshots.Get(&s.ClusterSettings().SV) && + s.cfg.Settings.Version.IsActive(ctx, clusterversion.V23_2_PebbleFormatVirtualSSTables)) + if header.SharedReplicate && !s.cfg.SharedStorageEnabled { return noSnap, sendSnapshotError(ctx, s, stream, errors.New("cannot accept shared sstables")) } diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index dde7e17216c9..2f1bf28006b8 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -101,7 +101,10 @@ var ValueBlocksEnabled = settings.RegisterBoolSetting( // UseEFOS controls whether uses of pebble Snapshots should use // EventuallyFileOnlySnapshots instead. This reduces write-amp with the main -// tradeoff being higher space-amp. +// tradeoff being higher space-amp. Note that UseExciseForSnapshot, if true, +// effectively causes EventuallyFileOnlySnapshots to be used as well. +// +// Note: Do NOT read this setting directly. Use ShouldUseEFOS() instead. var UseEFOS = settings.RegisterBoolSetting( settings.SystemOnly, "storage.experimental.eventually_file_only_snapshots.enabled", @@ -110,6 +113,21 @@ var UseEFOS = settings.RegisterBoolSetting( "storage.experimental.eventually_file_only_snapshots.enabled", false), /* defaultValue */ settings.WithPublic) +// UseExciseForSnapshots controls whether virtual-sstable-based excises should +// be used instead of range deletions for clearing out replica contents as part +// of a rebalance/recovery snapshot application. Applied on the receiver side. +// Note that setting this setting to true also effectively causes UseEFOS above +// to become true. This interaction is why this setting is defined in the +// storage package even though it mostly affects KV. +var UseExciseForSnapshots = settings.RegisterBoolSetting( + settings.SystemOnly, + "kv.snapshot_receiver.excise.enabled", + "set to true to use excises instead of range deletions for KV snapshots", + util.ConstantWithMetamorphicTestBool( + "kv.snapshot_receiver.excise.enabled", false), /* defaultValue */ + settings.WithPublic, +) + // IngestAsFlushable controls whether ingested sstables that overlap the // memtable may be lazily ingested: written to the WAL and enqueued in the list // of flushables (eg, memtables, large batches and now lazily-ingested @@ -129,6 +147,13 @@ var IngestAsFlushable = settings.RegisterBoolSetting( util.ConstantWithMetamorphicTestBool( "storage.ingest_as_flushable.enabled", true)) +// ShouldUseEFOS returns true if either of the UseEFOS or UseExciseForSnapshots +// cluster settings are enabled, and EventuallyFileOnlySnapshots must be used +// to guarantee snapshot-like semantics. +func ShouldUseEFOS(settings *settings.Values) bool { + return UseEFOS.Get(settings) || UseExciseForSnapshots.Get(settings) +} + // EngineKeyCompare compares cockroach keys, including the version (which // could be MVCC timestamps). func EngineKeyCompare(a, b []byte) int {