From 2eaba4a93d817a987cabd0b8fe6f1f49de7aaf1d Mon Sep 17 00:00:00 2001 From: Austen McClernon Date: Mon, 14 Aug 2023 21:37:30 +0000 Subject: [PATCH] kvserver: disable enqueue into repl q on span cfg update Replicas were enqueued into the replicate queue, upon the store receiving a span config update which could affect the replica. The replicate queue shouldQueue is relatively more expensive than other queues. Introduce the cluster setting kv.enqueue_in_replicate_queue_on_span_config_update.enabled, which when set to true, enables queuing up replicas on span config updates; when set to false, disables queuing replicas on span config updates. By default, this setting is set to false. Resolves: #108724 Release note (ops change): Introduce the kv.enqueue_in_replicate_queue_on_span_config_update.enabled cluster setting. When set to true, stores in the cluster will enqueue replicas for replication changes, upon receiving config updates which could affect the replica. This setting is off by default. Enabling this setting speeds up how quickly config-triggered replication changes begin, but adds additional CPU overhead. The overhead scales with the number of leaseholders. 
--- pkg/cmd/roachtest/tests/lease_preferences.go | 1 + pkg/kv/kvserver/client_protectedts_test.go | 3 +++ pkg/kv/kvserver/replicate_queue.go | 11 +++++++++++ pkg/kv/kvserver/store.go | 15 ++++++++++++--- 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pkg/cmd/roachtest/tests/lease_preferences.go b/pkg/cmd/roachtest/tests/lease_preferences.go index 0c4fa710ae67..53d969b24b21 100644 --- a/pkg/cmd/roachtest/tests/lease_preferences.go +++ b/pkg/cmd/roachtest/tests/lease_preferences.go @@ -176,6 +176,7 @@ func runLeasePreferences( // https://github.com/cockroachdb/cockroach/issues/105274 settings := install.MakeClusterSettings() settings.ClusterSettings["server.span_stats.span_batch_limit"] = "4096" + settings.ClusterSettings["kv.enqueue_in_replicate_queue_on_span_config_update.enabled"] = "true" startNodes := func(nodes ...int) { for _, node := range nodes { diff --git a/pkg/kv/kvserver/client_protectedts_test.go b/pkg/kv/kvserver/client_protectedts_test.go index a76965347732..62b5f7cac415 100644 --- a/pkg/kv/kvserver/client_protectedts_test.go +++ b/pkg/kv/kvserver/client_protectedts_test.go @@ -73,6 +73,9 @@ func TestProtectedTimestamps(t *testing.T) { _, err = conn.Exec("SET CLUSTER SETTING kv.closed_timestamp.target_duration = '100ms'") // speeds up the test require.NoError(t, err) + _, err = conn.Exec("SET CLUSTER SETTING kv.enqueue_in_replicate_queue_on_span_config_update.enabled = true") // speeds up the test + require.NoError(t, err) + const tableRangeMaxBytes = 1 << 18 _, err = conn.Exec("ALTER TABLE foo CONFIGURE ZONE USING "+ "gc.ttlseconds = 1, range_max_bytes = $1, range_min_bytes = 1<<10;", tableRangeMaxBytes) diff --git a/pkg/kv/kvserver/replicate_queue.go b/pkg/kv/kvserver/replicate_queue.go index fcf8c441b956..9ca60867e98f 100644 --- a/pkg/kv/kvserver/replicate_queue.go +++ b/pkg/kv/kvserver/replicate_queue.go @@ -80,6 +80,17 @@ var MinLeaseTransferInterval = settings.RegisterDurationSetting( settings.NonNegativeDuration, ) +// 
EnqueueInReplicateQueueOnSpanConfigUpdateEnabled controls whether replicas +// are enqueued into the replicate queue, following a span config update which +// affects the replica. +var EnqueueInReplicateQueueOnSpanConfigUpdateEnabled = settings.RegisterBoolSetting( + settings.SystemOnly, + "kv.enqueue_in_replicate_queue_on_span_config_update.enabled", + "controls whether replicas are enqueued into the replicate queue for "+ + "processing, when a span config update occurs, which affects the replica", + false, +) + var ( metaReplicateQueueAddReplicaCount = metric.Metadata{ Name: "queue.replicate.addreplica", diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 7c34e508b78d..31865a36d628 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -2381,6 +2381,12 @@ func (s *Store) onSpanConfigUpdate(ctx context.Context, updated roachpb.Span) { now := s.cfg.Clock.NowAsClockTimestamp() + // The replicate queue has a relatively more expensive queue check + // (shouldQueue), because it scales with the number of stores, and + // performs more checks. + enqueueToReplicateQueueEnabled := EnqueueInReplicateQueueOnSpanConfigUpdateEnabled.Get( + &s.GetStoreConfig().Settings.SV) + s.mu.RLock() defer s.mu.RUnlock() if err := s.mu.replicasByKey.VisitKeyRange(ctx, sp.Key, sp.EndKey, AscendingKeyOrder, @@ -2439,9 +2445,12 @@ func (s *Store) onSpanConfigUpdate(ctx context.Context, updated roachpb.Span) { s.mergeQueue.Async(replCtx, "span config update", true /* wait */, func(ctx context.Context, h queueHelper) { h.MaybeAdd(ctx, repl, now) }) - s.replicateQueue.Async(replCtx, "span config update", true /* wait */, func(ctx context.Context, h queueHelper) { - h.MaybeAdd(ctx, repl, now) - }) + + if enqueueToReplicateQueueEnabled { + s.replicateQueue.Async(replCtx, "span config update", true /* wait */, func(ctx context.Context, h queueHelper) { + h.MaybeAdd(ctx, repl, now) + }) + } return nil // more }, ); err != nil {