From 1fb7942297f3831830eca0112ceb48e2d723728a Mon Sep 17 00:00:00 2001 From: Kai Sun Date: Tue, 4 Oct 2022 15:48:56 -0400 Subject: [PATCH 1/5] kvserver: use response data in the load-based splitter We investigated why running YCSB Workload E results in a single hot range and we observed that range queries of the form SELECT * FROM table WHERE pkey >= A LIMIT B will result in all request spans having the same end key - similar to [A, range_end) - rather than end keys that take into account the specified LIMIT. Since the majority of request spans have the same end key, the load splitter algorithm cannot find a split key without too many contained and balance between left and right requests. A proposed solution is to use the response span rather than the request span, since the response span is more accurate in reflecting the keys that this request truly iterated over. We utilize the request span as well as the response's resume span to derive the key span that this request truly iterated over. Using response data (resume span) rather than just the request span in the load-based splitter (experimentally) allows the load-based splitter to find a split key under range query workloads (YCSB Workload E, KV workload with spans). Release note (ops change): We use response data rather than just the request span in the load-based splitter to pass more accurate data about the keys iterated over to the load splitter to find a suitable split key, enabling the load splitter to find a split key under heavy range query workloads. --- pkg/kv/kvserver/replica_send.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index ab5a705ff0a2..ed15b2b3789a 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -401,6 +401,12 @@ func (r *Replica) executeBatchWithConcurrencyRetries( var requestEvalKind concurrency.RequestEvalKind var g *concurrency.Guard defer func() { + // Handle load-based splitting, if necessary. + if pErr == nil { + spansRead, _, _ := r.collectSpansRead(ba, br) + r.recordBatchForLoadBasedSplitting(ctx, ba, spansRead) + } + // NB: wrapped to delay g evaluation to its value when returning. if g != nil { r.concMgr.FinishReq(g) @@ -412,7 +418,7 @@ func (r *Replica) executeBatchWithConcurrencyRetries( // commands and wait even if the circuit breaker is tripped. pp = poison.Policy_Wait } - for first := true; ; first = false { + for { // Exit loop if context has been canceled or timed out. if err := ctx.Err(); err != nil { return nil, nil, roachpb.NewError(errors.Wrap(err, "aborted during Replica.Send")) @@ -432,11 +438,6 @@ func (r *Replica) executeBatchWithConcurrencyRetries( } } - // Handle load-based splitting, if necessary. - if first { - r.recordBatchForLoadBasedSplitting(ctx, ba, latchSpans) - } - // Acquire latches to prevent overlapping requests from executing until // this request completes. After latching, wait on any conflicting locks // to ensure that the request has full isolation during evaluation. This From d2db70ad7448d21dacad7621863fad86041a2566 Mon Sep 17 00:00:00 2001 From: Kai Sun Date: Mon, 19 Sep 2022 13:46:49 -0400 Subject: [PATCH 2/5] split: add observability for when load based splitting cannot find a key Previously, there were no metrics or logging in the load-based splitter. This was inadequate because there is minimal observability into why the load splitter could not find a split key. 
To address this, this patch adds metrics and logging to the load splitter, including counter metrics indicating number of times could not find a split key and popular key (>25% occurrence) and logging indicating causes for no split key (insufficient counters, imbalance, too many contained). Release note (ops change): Added observability for when load based splitting cannot find a key to indicate the reasons why the load splitter could not find a split key, enabling us to have more observability and insight to debug why a range is not splitting more easily. --- pkg/kv/kvserver/asim/state/split_decider.go | 11 +- pkg/kv/kvserver/batcheval/cmd_range_stats.go | 4 +- pkg/kv/kvserver/batcheval/eval_context.go | 8 +- pkg/kv/kvserver/merge_queue.go | 2 +- pkg/kv/kvserver/metrics.go | 23 +++ pkg/kv/kvserver/replica.go | 8 +- pkg/kv/kvserver/replica_eval_context_span.go | 8 +- pkg/kv/kvserver/replica_init.go | 2 +- pkg/kv/kvserver/replica_split_load.go | 2 +- pkg/kv/kvserver/split/BUILD.bazel | 3 + pkg/kv/kvserver/split/decider.go | 72 ++++++-- pkg/kv/kvserver/split/decider_test.go | 175 ++++++++++++++----- pkg/kv/kvserver/split/finder.go | 51 ++++++ pkg/kv/kvserver/split/finder_test.go | 128 ++++++++++++++ pkg/kv/kvserver/split_queue.go | 6 +- pkg/ts/catalog/chart_catalog.go | 12 ++ 16 files changed, 430 insertions(+), 85 deletions(-) diff --git a/pkg/kv/kvserver/asim/state/split_decider.go b/pkg/kv/kvserver/asim/state/split_decider.go index 270358e9d7de..f03ec4b4c9b0 100644 --- a/pkg/kv/kvserver/asim/state/split_decider.go +++ b/pkg/kv/kvserver/asim/state/split_decider.go @@ -11,12 +11,14 @@ package state import ( + "context" "math/rand" "time" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/workload" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/metric" ) // LoadSplitter provides an abstraction for load based splitting. 
It records @@ -66,7 +68,10 @@ func (s *SplitDecider) newDecider() *split.Decider { } decider := &split.Decider{} - split.Init(decider, intN, s.qpsThreshold, s.qpsRetention) + split.Init(decider, intN, s.qpsThreshold, s.qpsRetention, &split.LoadSplitterMetrics{ + PopularKeyCount: metric.NewCounter(metric.Metadata{}), + NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), + }) return decider } @@ -81,7 +86,7 @@ func (s *SplitDecider) Record(tick time.Time, rangeID RangeID, le workload.LoadE } qps := LoadEventQPS(le) - shouldSplit := decider.Record(tick, int(qps), func() roachpb.Span { + shouldSplit := decider.Record(context.Background(), tick, int(qps), func() roachpb.Span { return roachpb.Span{ Key: Key(le.Key).ToRKey().AsRawKey(), } @@ -102,7 +107,7 @@ func (s *SplitDecider) SplitKey(tick time.Time, rangeID RangeID) (Key, bool) { return InvalidKey, false } - key := decider.MaybeSplitKey(tick) + key := decider.MaybeSplitKey(context.Background(), tick) if key == nil { return InvalidKey, false } diff --git a/pkg/kv/kvserver/batcheval/cmd_range_stats.go b/pkg/kv/kvserver/batcheval/cmd_range_stats.go index 183ae552e74d..1e1d14611232 100644 --- a/pkg/kv/kvserver/batcheval/cmd_range_stats.go +++ b/pkg/kv/kvserver/batcheval/cmd_range_stats.go @@ -44,8 +44,8 @@ func RangeStats( ) (result.Result, error) { reply := resp.(*roachpb.RangeStatsResponse) reply.MVCCStats = cArgs.EvalCtx.GetMVCCStats() - reply.DeprecatedLastQueriesPerSecond = cArgs.EvalCtx.GetLastSplitQPS() - if qps, ok := cArgs.EvalCtx.GetMaxSplitQPS(); ok { + reply.DeprecatedLastQueriesPerSecond = cArgs.EvalCtx.GetLastSplitQPS(ctx) + if qps, ok := cArgs.EvalCtx.GetMaxSplitQPS(ctx); ok { reply.MaxQueriesPerSecond = qps } else { // See comment on MaxQueriesPerSecond. -1 means !ok. diff --git a/pkg/kv/kvserver/batcheval/eval_context.go b/pkg/kv/kvserver/batcheval/eval_context.go index 7f89b273c6cf..277e01c02c43 100644 --- a/pkg/kv/kvserver/batcheval/eval_context.go +++ b/pkg/kv/kvserver/batcheval/eval_context.go @@ -88,7 +88,7 @@ type EvalContext interface { // // NOTE: This should not be used when the load based splitting cluster setting // is disabled. - GetMaxSplitQPS() (float64, bool) + GetMaxSplitQPS(context.Context) (float64, bool) // GetLastSplitQPS returns the Replica's most recent queries/s request rate. // @@ -96,7 +96,7 @@ type EvalContext interface { // is disabled. // // TODO(nvanbenschoten): remove this method in v22.1. 
- GetLastSplitQPS() float64 + GetLastSplitQPS(context.Context) float64 GetGCThreshold() hlc.Timestamp ExcludeDataFromBackup() bool @@ -240,10 +240,10 @@ func (m *mockEvalCtxImpl) ContainsKey(key roachpb.Key) bool { func (m *mockEvalCtxImpl) GetMVCCStats() enginepb.MVCCStats { return m.Stats } -func (m *mockEvalCtxImpl) GetMaxSplitQPS() (float64, bool) { +func (m *mockEvalCtxImpl) GetMaxSplitQPS(context.Context) (float64, bool) { return m.QPS, true } -func (m *mockEvalCtxImpl) GetLastSplitQPS() float64 { +func (m *mockEvalCtxImpl) GetLastSplitQPS(context.Context) float64 { return m.QPS } func (m *mockEvalCtxImpl) CanCreateTxnRecord( diff --git a/pkg/kv/kvserver/merge_queue.go b/pkg/kv/kvserver/merge_queue.go index 0cf20d24feb1..b6eb35448354 100644 --- a/pkg/kv/kvserver/merge_queue.go +++ b/pkg/kv/kvserver/merge_queue.go @@ -217,7 +217,7 @@ func (mq *mergeQueue) process( lhsDesc := lhsRepl.Desc() lhsStats := lhsRepl.GetMVCCStats() - lhsQPS, lhsQPSOK := lhsRepl.GetMaxSplitQPS() + lhsQPS, lhsQPSOK := lhsRepl.GetMaxSplitQPS(ctx) minBytes := lhsRepl.GetMinBytes() if lhsStats.Total() >= minBytes { log.VEventf(ctx, 2, "skipping merge: LHS meets minimum size threshold %d with %d bytes", diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 0ac65b7c33fc..5f09f366c09e 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/rangefeed" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split" "github.com/cockroachdb/cockroach/pkg/multitenant" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage" @@ -1644,6 +1645,20 @@ Note that the measurement does not include the duration for replicating the eval Measurement: "Nanoseconds", Unit: metric.Unit_NANOSECONDS, } + + metaPopularKeyCount = metric.Metadata{ + Name: "kv.loadsplitter.popularkey", + Help: "Load-based splitter could not find a split key and the most popular sampled split key occurs in >= 25% of the samples.", + Measurement: "Occurrences", + Unit: metric.Unit_COUNT, + } + + metaNoSplitKeyCount = metric.Metadata{ + Name: "kv.loadsplitter.nosplitkey", + Help: "Load-based splitter could not find a split key.", + Measurement: "Occurrences", + Unit: metric.Unit_COUNT, + } ) // StoreMetrics is the set of metrics for a given store. @@ -1654,6 +1669,9 @@ type StoreMetrics struct { // tenant basis. *TenantsStorageMetrics + // LoadSplitterMetrics stores metrics for load-based splitter split key. + *split.LoadSplitterMetrics + // Replica metrics. ReplicaCount *metric.Gauge // Does not include uninitialized or reserved replicas. ReservedReplicaCount *metric.Gauge @@ -2185,6 +2203,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { sm := &StoreMetrics{ registry: storeRegistry, TenantsStorageMetrics: newTenantsStorageMetrics(), + LoadSplitterMetrics: &split.LoadSplitterMetrics{ + PopularKeyCount: metric.NewCounter(metaPopularKeyCount), + NoSplitKeyCount: metric.NewCounter(metaNoSplitKeyCount), + }, // Replica metrics. 
ReplicaCount: metric.NewGauge(metaReplicaCount), @@ -2516,6 +2538,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { } storeRegistry.AddMetricStruct(sm) + storeRegistry.AddMetricStruct(sm.LoadSplitterMetrics) return sm } diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index e3f8e39c2cc2..027c60123182 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -1150,8 +1150,8 @@ func (r *Replica) SetMVCCStatsForTesting(stats *enginepb.MVCCStats) { // works when the load based splitting cluster setting is enabled. // // Use QueriesPerSecond() for current QPS stats for all other purposes. -func (r *Replica) GetMaxSplitQPS() (float64, bool) { - return r.loadBasedSplitter.MaxQPS(r.Clock().PhysicalTime()) +func (r *Replica) GetMaxSplitQPS(ctx context.Context) (float64, bool) { + return r.loadBasedSplitter.MaxQPS(ctx, r.Clock().PhysicalTime()) } // GetLastSplitQPS returns the Replica's most recent queries/s request rate. @@ -1160,8 +1160,8 @@ func (r *Replica) GetMaxSplitQPS() (float64, bool) { // works when the load based splitting cluster setting is enabled. // // Use QueriesPerSecond() for current QPS stats for all other purposes. -func (r *Replica) GetLastSplitQPS() float64 { - return r.loadBasedSplitter.LastQPS(r.Clock().PhysicalTime()) +func (r *Replica) GetLastSplitQPS(ctx context.Context) float64 { + return r.loadBasedSplitter.LastQPS(ctx, r.Clock().PhysicalTime()) } // ContainsKey returns whether this range contains the specified key. diff --git a/pkg/kv/kvserver/replica_eval_context_span.go b/pkg/kv/kvserver/replica_eval_context_span.go index 4cee31883efa..ea103a9ec97f 100644 --- a/pkg/kv/kvserver/replica_eval_context_span.go +++ b/pkg/kv/kvserver/replica_eval_context_span.go @@ -131,14 +131,14 @@ func (rec SpanSetReplicaEvalContext) GetMVCCStats() enginepb.MVCCStats { // GetMaxSplitQPS returns the Replica's maximum queries/s rate for splitting and // merging purposes. -func (rec SpanSetReplicaEvalContext) GetMaxSplitQPS() (float64, bool) { - return rec.i.GetMaxSplitQPS() +func (rec SpanSetReplicaEvalContext) GetMaxSplitQPS(ctx context.Context) (float64, bool) { + return rec.i.GetMaxSplitQPS(ctx) } // GetLastSplitQPS returns the Replica's most recent queries/s rate for // splitting and merging purposes. 
-func (rec SpanSetReplicaEvalContext) GetLastSplitQPS() float64 { - return rec.i.GetLastSplitQPS() +func (rec SpanSetReplicaEvalContext) GetLastSplitQPS(ctx context.Context) float64 { + return rec.i.GetLastSplitQPS(ctx) } // CanCreateTxnRecord determines whether a transaction record can be created diff --git a/pkg/kv/kvserver/replica_init.go b/pkg/kv/kvserver/replica_init.go index d60bf9600cfb..34203d9ab9c2 100644 --- a/pkg/kv/kvserver/replica_init.go +++ b/pkg/kv/kvserver/replica_init.go @@ -98,7 +98,7 @@ func newUnloadedReplica( return float64(SplitByLoadQPSThreshold.Get(&store.cfg.Settings.SV)) }, func() time.Duration { return kvserverbase.SplitByLoadMergeDelay.Get(&store.cfg.Settings.SV) - }) + }, store.metrics.LoadSplitterMetrics) r.mu.proposals = map[kvserverbase.CmdIDKey]*ProposalData{} r.mu.checksums = map[uuid.UUID]*replicaChecksum{} r.mu.proposalBuf.Init((*replicaProposer)(r), tracker.NewLockfreeTracker(), r.Clock(), r.ClusterSettings()) diff --git a/pkg/kv/kvserver/replica_split_load.go b/pkg/kv/kvserver/replica_split_load.go index 15adcc0dad81..ea42bbabbdde 100644 --- a/pkg/kv/kvserver/replica_split_load.go +++ b/pkg/kv/kvserver/replica_split_load.go @@ -56,7 +56,7 @@ func (r *Replica) recordBatchForLoadBasedSplitting( if !r.SplitByLoadEnabled() { return } - shouldInitSplit := r.loadBasedSplitter.Record(timeutil.Now(), len(ba.Requests), func() roachpb.Span { + shouldInitSplit := r.loadBasedSplitter.Record(ctx, timeutil.Now(), len(ba.Requests), func() roachpb.Span { return spans.BoundarySpan(spanset.SpanGlobal) }) if shouldInitSplit { diff --git a/pkg/kv/kvserver/split/BUILD.bazel b/pkg/kv/kvserver/split/BUILD.bazel index 354061351004..02f19930667f 100644 --- a/pkg/kv/kvserver/split/BUILD.bazel +++ b/pkg/kv/kvserver/split/BUILD.bazel @@ -12,6 +12,8 @@ go_library( deps = [ "//pkg/keys", "//pkg/roachpb", + "//pkg/util/log", + "//pkg/util/metric", "//pkg/util/syncutil", ], ) @@ -30,6 +32,7 @@ go_test( "//pkg/roachpb", "//pkg/util/encoding", "//pkg/util/leaktest", + "//pkg/util/metric", "//pkg/util/stop", "//pkg/util/timeutil", "@com_github_stretchr_testify//assert", diff --git a/pkg/kv/kvserver/split/decider.go b/pkg/kv/kvserver/split/decider.go index c29e316d2eee..ce7d54e6c3fd 100644 --- a/pkg/kv/kvserver/split/decider.go +++ b/pkg/kv/kvserver/split/decider.go @@ -13,14 +13,18 @@ package split import ( + "context" "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/cockroachdb/cockroach/pkg/util/syncutil" ) const minSplitSuggestionInterval = time.Minute +const minNoSplitKeyLoggingMetricsInterval = time.Minute const minQueriesPerSecondSampleDuration = time.Second // A Decider collects measurements about the activity (measured in qps) on a @@ -48,10 +52,21 @@ const minQueriesPerSecondSampleDuration = time.Second // prevent load-based splits from being merged away until the resulting ranges // have consistently remained below a certain QPS threshold for a sufficiently // long period of time. + +// LoadSplitterMetrics consists of metrics for load-based splitter split key. +type LoadSplitterMetrics struct { + PopularKeyCount *metric.Counter + NoSplitKeyCount *metric.Counter +} + +// Decider tracks the latest QPS and if certain conditions are met, records +// incoming requests to find potential split keys and checks if sampled +// candidate split keys satisfy certain requirements. 
type Decider struct { - intn func(n int) int // supplied to Init - qpsThreshold func() float64 // supplied to Init - qpsRetention func() time.Duration // supplied to Init + intn func(n int) int // supplied to Init + qpsThreshold func() float64 // supplied to Init + qpsRetention func() time.Duration // supplied to Init + loadSplitterMetrics *LoadSplitterMetrics // supplied to Init mu struct { syncutil.Mutex @@ -67,6 +82,9 @@ type Decider struct { // Fields tracking split key suggestions. splitFinder *Finder // populated when engaged or decided lastSplitSuggestion time.Time // last stipulation to client to carry out split + + // Fields tracking logging / metrics around load-based splitter split key. + lastNoSplitKeyLoggingMetrics time.Time } } @@ -79,10 +97,12 @@ func Init( intn func(n int) int, qpsThreshold func() float64, qpsRetention func() time.Duration, + loadSplitterMetrics *LoadSplitterMetrics, ) { lbs.intn = intn lbs.qpsThreshold = qpsThreshold lbs.qpsRetention = qpsRetention + lbs.loadSplitterMetrics = loadSplitterMetrics } // Record notifies the Decider that 'n' operations are being carried out which @@ -93,14 +113,16 @@ func Init( // If the returned boolean is true, a split key is available (though it may // disappear as more keys are sampled) and should be initiated by the caller, // which can call MaybeSplitKey to retrieve the suggested key. -func (d *Decider) Record(now time.Time, n int, span func() roachpb.Span) bool { +func (d *Decider) Record(ctx context.Context, now time.Time, n int, span func() roachpb.Span) bool { d.mu.Lock() defer d.mu.Unlock() - return d.recordLocked(now, n, span) + return d.recordLocked(ctx, now, n, span) } -func (d *Decider) recordLocked(now time.Time, n int, span func() roachpb.Span) bool { +func (d *Decider) recordLocked( + ctx context.Context, now time.Time, n int, span func() roachpb.Span, +) bool { d.mu.count += int64(n) // First compute requests per second since the last check. @@ -137,9 +159,28 @@ func (d *Decider) recordLocked(now time.Time, n int, span func() roachpb.Span) b if s.Key != nil { d.mu.splitFinder.Record(span(), d.intn) } - if now.Sub(d.mu.lastSplitSuggestion) > minSplitSuggestionInterval && d.mu.splitFinder.Ready(now) && d.mu.splitFinder.Key() != nil { - d.mu.lastSplitSuggestion = now - return true + if d.mu.splitFinder.Ready(now) { + if d.mu.splitFinder.Key() != nil { + if now.Sub(d.mu.lastSplitSuggestion) > minSplitSuggestionInterval { + d.mu.lastSplitSuggestion = now + return true + } + } else { + if now.Sub(d.mu.lastNoSplitKeyLoggingMetrics) > minNoSplitKeyLoggingMetricsInterval { + d.mu.lastNoSplitKeyLoggingMetrics = now + insufficientCounters, imbalance, tooManyContained, imbalanceAndTooManyContained := d.mu.splitFinder.NoSplitKeyCause() + if insufficientCounters < splitKeySampleSize { + popularKeyFrequency := d.mu.splitFinder.PopularKeyFrequency() + if popularKeyFrequency >= splitKeyThreshold { + d.loadSplitterMetrics.PopularKeyCount.Inc(1) + } + d.loadSplitterMetrics.NoSplitKeyCount.Inc(1) + log.KvDistribution.Infof(ctx, + "No split key found: insufficient counters = %d, imbalance = %d, too many contained = %d, imbalance and too many contained = %d, most popular key occurs in %d%% of samples", + insufficientCounters, imbalance, tooManyContained, imbalanceAndTooManyContained, int(popularKeyFrequency*100)) + } + } + } } } return false @@ -156,22 +197,22 @@ func (d *Decider) RecordMax(now time.Time, qps float64) { } // LastQPS returns the most recent QPS measurement. 
-func (d *Decider) LastQPS(now time.Time) float64 { +func (d *Decider) LastQPS(ctx context.Context, now time.Time) float64 { d.mu.Lock() defer d.mu.Unlock() - d.recordLocked(now, 0, nil) // force QPS computation + d.recordLocked(ctx, now, 0, nil) // force QPS computation return d.mu.lastQPS } // MaxQPS returns the maximum QPS measurement recorded over the retention // period. If the Decider has not been recording for a full retention period, // the method returns false. -func (d *Decider) MaxQPS(now time.Time) (float64, bool) { +func (d *Decider) MaxQPS(ctx context.Context, now time.Time) (float64, bool) { d.mu.Lock() defer d.mu.Unlock() - d.recordLocked(now, 0, nil) // force QPS computation + d.recordLocked(ctx, now, 0, nil) // force QPS computation return d.mu.maxQPS.maxQPS(now, d.qpsRetention()) } @@ -180,13 +221,13 @@ func (d *Decider) MaxQPS(now time.Time) (float64, bool) { // or if it wasn't able to determine a suitable split key. // // It is legal to call MaybeSplitKey at any time. -func (d *Decider) MaybeSplitKey(now time.Time) roachpb.Key { +func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key { var key roachpb.Key d.mu.Lock() defer d.mu.Unlock() - d.recordLocked(now, 0, nil) + d.recordLocked(ctx, now, 0, nil) if d.mu.splitFinder != nil && d.mu.splitFinder.Ready(now) { // We've found a key to split at. This key might be in the middle of a // SQL row. If we fail to rectify that, we'll cause SQL crashes: @@ -240,6 +281,7 @@ func (d *Decider) Reset(now time.Time) { d.mu.maxQPS.reset(now, d.qpsRetention()) d.mu.splitFinder = nil d.mu.lastSplitSuggestion = time.Time{} + d.mu.lastNoSplitKeyLoggingMetrics = time.Time{} } // maxQPSTracker collects a series of queries-per-second measurement samples and diff --git a/pkg/kv/kvserver/split/decider_test.go b/pkg/kv/kvserver/split/decider_test.go index 3ea1821c3734..6db6a2e4223a 100644 --- a/pkg/kv/kvserver/split/decider_test.go +++ b/pkg/kv/kvserver/split/decider_test.go @@ -11,6 +11,7 @@ package split import ( + "context" "math" "math/rand" "testing" @@ -20,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/encoding" "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -38,7 +40,10 @@ func TestDecider(t *testing.T) { intn := rand.New(rand.NewSource(12)).Intn var d Decider - Init(&d, intn, func() float64 { return 10.0 }, func() time.Duration { return 2 * time.Second }) + Init(&d, intn, func() float64 { return 10.0 }, func() time.Duration { return 2 * time.Second }, &LoadSplitterMetrics{ + PopularKeyCount: metric.NewCounter(metric.Metadata{}), + NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), + }) op := func(s string) func() roachpb.Span { return func() roachpb.Span { return roachpb.Span{Key: roachpb.Key(s)} } @@ -46,38 +51,38 @@ func TestDecider(t *testing.T) { assertQPS := func(i int, expQPS float64) { t.Helper() - qps := d.LastQPS(ms(i)) + qps := d.LastQPS(context.Background(), ms(i)) assert.Equal(t, expQPS, qps) } assertMaxQPS := func(i int, expMaxQPS float64, expOK bool) { t.Helper() - maxQPS, ok := d.MaxQPS(ms(i)) + maxQPS, ok := d.MaxQPS(context.Background(), ms(i)) assert.Equal(t, expMaxQPS, maxQPS) assert.Equal(t, expOK, ok) } - assert.Equal(t, false, d.Record(ms(100), 1, nil)) + assert.Equal(t, false, d.Record(context.Background(), ms(100), 1, nil)) assertQPS(100, 0) assertMaxQPS(100, 0, false) assert.Equal(t, 
ms(100), d.mu.lastQPSRollover) assert.EqualValues(t, 1, d.mu.count) - assert.Equal(t, false, d.Record(ms(400), 3, nil)) + assert.Equal(t, false, d.Record(context.Background(), ms(400), 3, nil)) assertQPS(100, 0) assertQPS(700, 0) assertMaxQPS(400, 0, false) - assert.Equal(t, false, d.Record(ms(300), 3, nil)) + assert.Equal(t, false, d.Record(context.Background(), ms(300), 3, nil)) assertQPS(100, 0) assertMaxQPS(300, 0, false) - assert.Equal(t, false, d.Record(ms(900), 1, nil)) + assert.Equal(t, false, d.Record(context.Background(), ms(900), 1, nil)) assertQPS(0, 0) assertMaxQPS(900, 0, false) - assert.Equal(t, false, d.Record(ms(1099), 1, nil)) + assert.Equal(t, false, d.Record(context.Background(), ms(1099), 1, nil)) assertQPS(0, 0) assertMaxQPS(1099, 0, false) @@ -86,7 +91,7 @@ func TestDecider(t *testing.T) { // It won't engage because the duration between the rollovers is 1.1s, and // we had 10 events over that interval. - assert.Equal(t, false, d.Record(ms(1200), 1, nil)) + assert.Equal(t, false, d.Record(context.Background(), ms(1200), 1, nil)) assertQPS(0, float64(10)/float64(1.1)) assert.Equal(t, ms(1200), d.mu.lastQPSRollover) assertMaxQPS(1099, 0, false) @@ -95,11 +100,11 @@ func TestDecider(t *testing.T) { assert.Equal(t, nilFinder, d.mu.splitFinder) - assert.Equal(t, false, d.Record(ms(2199), 12, nil)) + assert.Equal(t, false, d.Record(context.Background(), ms(2199), 12, nil)) assert.Equal(t, nilFinder, d.mu.splitFinder) // 2200 is the next rollover point, and 12+1=13 qps should be computed. - assert.Equal(t, false, d.Record(ms(2200), 1, op("a"))) + assert.Equal(t, false, d.Record(context.Background(), ms(2200), 1, op("a"))) assert.Equal(t, ms(2200), d.mu.lastQPSRollover) assertQPS(0, float64(13)) assertMaxQPS(2200, 13, true) @@ -111,7 +116,7 @@ func TestDecider(t *testing.T) { // to split. We don't test the details of exactly when that happens because // this is done in the finder tests. tick := 2200 - for o := op("a"); !d.Record(ms(tick), 11, o); tick += 1000 { + for o := op("a"); !d.Record(context.Background(), ms(tick), 11, o); tick += 1000 { if tick/1000%2 == 0 { o = op("z") } else { @@ -119,7 +124,7 @@ func TestDecider(t *testing.T) { } } - assert.Equal(t, roachpb.Key("z"), d.MaybeSplitKey(ms(tick))) + assert.Equal(t, roachpb.Key("z"), d.MaybeSplitKey(context.Background(), ms(tick))) // We were told to split, but won't be told to split again for some time // to avoid busy-looping on split attempts. @@ -128,35 +133,35 @@ func TestDecider(t *testing.T) { if i%2 != 0 { o = op("a") } - assert.False(t, d.Record(ms(tick), 11, o)) - assert.True(t, d.LastQPS(ms(tick)) > 1.0) + assert.False(t, d.Record(context.Background(), ms(tick), 11, o)) + assert.True(t, d.LastQPS(context.Background(), ms(tick)) > 1.0) // Even though the split key remains. - assert.Equal(t, roachpb.Key("z"), d.MaybeSplitKey(ms(tick+999))) + assert.Equal(t, roachpb.Key("z"), d.MaybeSplitKey(context.Background(), ms(tick+999))) tick += 1000 } // But after minSplitSuggestionInterval of ticks, we get another one. - assert.True(t, d.Record(ms(tick), 11, op("a"))) + assert.True(t, d.Record(context.Background(), ms(tick), 11, op("a"))) assertQPS(tick, float64(11)) assertMaxQPS(tick, 11, true) // Split key suggestion vanishes once qps drops. 
tick += 1000 - assert.False(t, d.Record(ms(tick), 9, op("a"))) - assert.Equal(t, roachpb.Key(nil), d.MaybeSplitKey(ms(tick))) + assert.False(t, d.Record(context.Background(), ms(tick), 9, op("a"))) + assert.Equal(t, roachpb.Key(nil), d.MaybeSplitKey(context.Background(), ms(tick))) assert.Equal(t, nilFinder, d.mu.splitFinder) // Hammer a key with writes above threshold. There shouldn't be a split // since everyone is hitting the same key and load can't be balanced. for i := 0; i < 1000; i++ { - assert.False(t, d.Record(ms(tick), 11, op("q"))) + assert.False(t, d.Record(context.Background(), ms(tick), 11, op("q"))) tick += 1000 } assert.True(t, d.mu.splitFinder.Ready(ms(tick))) - assert.Equal(t, roachpb.Key(nil), d.MaybeSplitKey(ms(tick))) + assert.Equal(t, roachpb.Key(nil), d.MaybeSplitKey(context.Background(), ms(tick))) // But the finder keeps sampling to adapt to changing workload... for i := 0; i < 1000; i++ { - assert.False(t, d.Record(ms(tick), 11, op("p"))) + assert.False(t, d.Record(context.Background(), ms(tick), 11, op("p"))) tick += 1000 } @@ -168,7 +173,7 @@ func TestDecider(t *testing.T) { // Since the new workload is also not partitionable, nothing changes in // the decision. assert.True(t, d.mu.splitFinder.Ready(ms(tick))) - assert.Equal(t, roachpb.Key(nil), d.MaybeSplitKey(ms(tick))) + assert.Equal(t, roachpb.Key(nil), d.MaybeSplitKey(context.Background(), ms(tick))) // Get the decider engaged again so that we can test Reset(). for i := 0; i < 1000; i++ { @@ -176,16 +181,16 @@ func TestDecider(t *testing.T) { if i%2 != 0 { o = op("a") } - d.Record(ms(tick), 11, o) + d.Record(context.Background(), ms(tick), 11, o) tick += 500 } // The finder wants to split, until Reset is called, at which point it starts // back up at zero. assert.True(t, d.mu.splitFinder.Ready(ms(tick))) - assert.Equal(t, roachpb.Key("z"), d.MaybeSplitKey(ms(tick))) + assert.Equal(t, roachpb.Key("z"), d.MaybeSplitKey(context.Background(), ms(tick))) d.Reset(ms(tick)) - assert.Nil(t, d.MaybeSplitKey(ms(tick))) + assert.Nil(t, d.MaybeSplitKey(context.Background(), ms(tick))) assert.Nil(t, d.mu.splitFinder) } @@ -194,11 +199,14 @@ func TestDecider_MaxQPS(t *testing.T) { intn := rand.New(rand.NewSource(11)).Intn var d Decider - Init(&d, intn, func() float64 { return 100.0 }, func() time.Duration { return 10 * time.Second }) + Init(&d, intn, func() float64 { return 100.0 }, func() time.Duration { return 10 * time.Second }, &LoadSplitterMetrics{ + PopularKeyCount: metric.NewCounter(metric.Metadata{}), + NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), + }) assertMaxQPS := func(i int, expMaxQPS float64, expOK bool) { t.Helper() - maxQPS, ok := d.MaxQPS(ms(i)) + maxQPS, ok := d.MaxQPS(context.Background(), ms(i)) assert.Equal(t, expMaxQPS, maxQPS) assert.Equal(t, expOK, ok) } @@ -206,22 +214,22 @@ func TestDecider_MaxQPS(t *testing.T) { assertMaxQPS(1000, 0, false) // Record a large number of samples. 
- d.Record(ms(1500), 5, nil) - d.Record(ms(2000), 5, nil) - d.Record(ms(4500), 1, nil) - d.Record(ms(5000), 15, nil) - d.Record(ms(5500), 2, nil) - d.Record(ms(8000), 5, nil) - d.Record(ms(10000), 9, nil) + d.Record(context.Background(), ms(1500), 5, nil) + d.Record(context.Background(), ms(2000), 5, nil) + d.Record(context.Background(), ms(4500), 1, nil) + d.Record(context.Background(), ms(5000), 15, nil) + d.Record(context.Background(), ms(5500), 2, nil) + d.Record(context.Background(), ms(8000), 5, nil) + d.Record(context.Background(), ms(10000), 9, nil) assertMaxQPS(10000, 0, false) assertMaxQPS(11000, 17, true) // Record more samples with a lower QPS. - d.Record(ms(12000), 1, nil) - d.Record(ms(13000), 4, nil) - d.Record(ms(15000), 2, nil) - d.Record(ms(19000), 3, nil) + d.Record(context.Background(), ms(12000), 1, nil) + d.Record(context.Background(), ms(13000), 4, nil) + d.Record(context.Background(), ms(15000), 2, nil) + d.Record(context.Background(), ms(19000), 3, nil) assertMaxQPS(20000, 4.5, true) assertMaxQPS(21000, 4, true) @@ -237,7 +245,10 @@ func TestDeciderCallsEnsureSafeSplitKey(t *testing.T) { intn := rand.New(rand.NewSource(11)).Intn var d Decider - Init(&d, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }) + Init(&d, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ + PopularKeyCount: metric.NewCounter(metric.Metadata{}), + NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), + }) baseKey := keys.SystemSQLCodec.TablePrefix(51) for i := 0; i < 4; i++ { @@ -253,10 +264,10 @@ func TestDeciderCallsEnsureSafeSplitKey(t *testing.T) { var now time.Time for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ { now = now.Add(500 * time.Millisecond) - d.Record(now, 1, c0) + d.Record(context.Background(), now, 1, c0) now = now.Add(500 * time.Millisecond) - d.Record(now, 1, c1) - k = d.MaybeSplitKey(now) + d.Record(context.Background(), now, 1, c1) + k = d.MaybeSplitKey(context.Background(), now) if len(k) != 0 { break } @@ -270,7 +281,10 @@ func TestDeciderIgnoresEnsureSafeSplitKeyOnError(t *testing.T) { intn := rand.New(rand.NewSource(11)).Intn var d Decider - Init(&d, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }) + Init(&d, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ + PopularKeyCount: metric.NewCounter(metric.Metadata{}), + NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), + }) baseKey := keys.SystemSQLCodec.TablePrefix(51) for i := 0; i < 4; i++ { @@ -290,10 +304,10 @@ func TestDeciderIgnoresEnsureSafeSplitKeyOnError(t *testing.T) { var now time.Time for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ { now = now.Add(500 * time.Millisecond) - d.Record(now, 1, c0) + d.Record(context.Background(), now, 1, c0) now = now.Add(500 * time.Millisecond) - d.Record(now, 1, c1) - k = d.MaybeSplitKey(now) + d.Record(context.Background(), now, 1, c1) + k = d.MaybeSplitKey(context.Background(), now) if len(k) != 0 { break } @@ -392,3 +406,70 @@ func TestMaxQPSTracker(t *testing.T) { require.Equal(t, [6]float64{20, 27, 0, 0, 0, 0}, mt.windows) require.Equal(t, 1, mt.curIdx) } + +func TestDeciderMetrics(t *testing.T) { + defer leaktest.AfterTest(t)() + intn := rand.New(rand.NewSource(11)).Intn + timeStart := 1000 + + var dPopular Decider + Init(&dPopular, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ + PopularKeyCount: 
metric.NewCounter(metric.Metadata{}), + NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), + }) + + // No split key, popular key + for i := 0; i < 20; i++ { + dPopular.Record(context.Background(), ms(timeStart), 1, func() roachpb.Span { + return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))} + }) + } + for i := 1; i <= 2000; i++ { + dPopular.Record(context.Background(), ms(timeStart+i*50), 1, func() roachpb.Span { + return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))} + }) + } + + assert.Equal(t, dPopular.loadSplitterMetrics.PopularKeyCount.Count(), int64(2)) + assert.Equal(t, dPopular.loadSplitterMetrics.NoSplitKeyCount.Count(), int64(2)) + + // No split key, not popular key + var dNotPopular Decider + Init(&dNotPopular, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ + PopularKeyCount: metric.NewCounter(metric.Metadata{}), + NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), + }) + for i := 0; i < 20; i++ { + dNotPopular.Record(context.Background(), ms(timeStart), 1, func() roachpb.Span { + return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))} + }) + } + for i := 1; i <= 2000; i++ { + dNotPopular.Record(context.Background(), ms(timeStart+i*50), 1, func() roachpb.Span { + return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(i))} + }) + } + + assert.Equal(t, dNotPopular.loadSplitterMetrics.PopularKeyCount.Count(), int64(0)) + assert.Equal(t, dNotPopular.loadSplitterMetrics.NoSplitKeyCount.Count(), int64(2)) + + // No split key, all insufficient counters + var dAllInsufficientCounters Decider + Init(&dAllInsufficientCounters, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ + PopularKeyCount: metric.NewCounter(metric.Metadata{}), + NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), + }) + for i := 0; i < 20; i++ { + dAllInsufficientCounters.Record(context.Background(), ms(timeStart), 1, func() roachpb.Span { + return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))} + }) + } + for i := 1; i <= 80; i++ { + dAllInsufficientCounters.Record(context.Background(), ms(timeStart+i*1000), 1, func() roachpb.Span { + return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))} + }) + } + + assert.Equal(t, dAllInsufficientCounters.loadSplitterMetrics.PopularKeyCount.Count(), int64(0)) + assert.Equal(t, dAllInsufficientCounters.loadSplitterMetrics.NoSplitKeyCount.Count(), int64(0)) +} diff --git a/pkg/kv/kvserver/split/finder.go b/pkg/kv/kvserver/split/finder.go index 942216a36db6..e622e6a6e044 100644 --- a/pkg/kv/kvserver/split/finder.go +++ b/pkg/kv/kvserver/split/finder.go @@ -13,6 +13,7 @@ package split import ( "bytes" "math" + "sort" "time" "github.com/cockroachdb/cockroach/pkg/roachpb" @@ -149,3 +150,53 @@ func (f *Finder) Key() roachpb.Key { } return f.samples[bestIdx].key } + +// NoSplitKeyCause iterates over all sampled candidate split keys and +// determines the number of samples that don't pass each split key requirement +// (e.g. insufficient counters, imbalance in left and right counters, too many +// contained counters, or a combination of the last two). 
+func (f *Finder) NoSplitKeyCause() ( + insufficientCounters, imbalance, tooManyContained, imbalanceAndTooManyContained int, +) { + for _, s := range f.samples { + if s.left+s.right+s.contained < splitKeyMinCounter { + insufficientCounters++ + } else { + balanceScore := math.Abs(float64(s.left-s.right)) / float64(s.left+s.right) + imbalanceBool := balanceScore >= splitKeyThreshold + containedScore := float64(s.contained) / float64(s.left+s.right+s.contained) + tooManyContainedBool := containedScore >= splitKeyContainedThreshold + if imbalanceBool && !tooManyContainedBool { + imbalance++ + } else if !imbalanceBool && tooManyContainedBool { + tooManyContained++ + } else if imbalanceBool && tooManyContainedBool { + imbalanceAndTooManyContained++ + } + } + } + return +} + +// PopularKeyFrequency returns the percentage that the most popular key appears +// in f.samples. +func (f *Finder) PopularKeyFrequency() float64 { + sort.Slice(f.samples[:], func(i, j int) bool { + return bytes.Compare(f.samples[i].key, f.samples[j].key) < 0 + }) + + currentKeyCount := 1 + popularKeyCount := 1 + for i := 1; i < len(f.samples); i++ { + if bytes.Equal(f.samples[i].key, f.samples[i-1].key) { + currentKeyCount++ + } else { + currentKeyCount = 1 + } + if popularKeyCount < currentKeyCount { + popularKeyCount = currentKeyCount + } + } + + return float64(popularKeyCount) / float64(splitKeySampleSize) +} diff --git a/pkg/kv/kvserver/split/finder_test.go b/pkg/kv/kvserver/split/finder_test.go index 6f6783868e78..0afd1844fc4c 100644 --- a/pkg/kv/kvserver/split/finder_test.go +++ b/pkg/kv/kvserver/split/finder_test.go @@ -13,6 +13,7 @@ package split import ( "bytes" "context" + "math/rand" "reflect" "testing" @@ -21,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/stop" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/stretchr/testify/assert" ) // TestSplitFinderKey verifies the Key() method correctly @@ -271,3 +273,129 @@ func TestSplitFinderRecorder(t *testing.T) { } } } + +func TestFinderNoSplitKeyCause(t *testing.T) { + samples := [splitKeySampleSize]sample{} + for i, idx := range rand.Perm(splitKeySampleSize) { + if i < 5 { + // insufficient counters + samples[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(uint32(i)), + left: 0, + right: 0, + contained: splitKeyMinCounter - 1, + } + } else if i < 7 { + // imbalance + deviationLeft := rand.Intn(5) + deviationRight := rand.Intn(5) + samples[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(uint32(i)), + left: 25 + deviationLeft, + right: 15 - deviationRight, + contained: int(max(float64(splitKeyMinCounter-40-deviationLeft+deviationRight), float64(40+deviationLeft-deviationRight))), + } + } else if i < 13 { + // imbalance + deviationLeft := rand.Intn(5) + deviationRight := rand.Intn(5) + samples[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(uint32(i)), + left: 50 + deviationLeft, + right: 30 - deviationRight, + contained: int(max(float64(splitKeyMinCounter-80-deviationLeft+deviationRight), 0)), + } + } else { + // too many contained + contained := int(splitKeyMinCounter*splitKeyContainedThreshold + 1) + left := (splitKeyMinCounter - contained) / 2 + samples[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(uint32(i)), + left: left, + right: splitKeyMinCounter - left - contained, + contained: contained, + } + } + } + + finder := NewFinder(timeutil.Now()) + finder.samples = samples + insufficientCounters, imbalance, tooManyContained, imbalanceAndTooManyContained 
:= finder.NoSplitKeyCause() + assert.Equal(t, 5, insufficientCounters, "unexpected insufficient counters") + assert.Equal(t, 6, imbalance, "unexpected imbalance counters") + assert.Equal(t, 7, tooManyContained, "unexpected too many contained counters") + assert.Equal(t, 2, imbalanceAndTooManyContained, "unexpected imbalance and too many contained counters") +} + +func TestFinderPopularKeyFrequency(t *testing.T) { + uniqueKeySample := [splitKeySampleSize]sample{} + for i, idx := range rand.Perm(splitKeySampleSize) { + uniqueKeySample[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(uint32(i)), + } + } + twentyPercentPopularKeySample := [splitKeySampleSize]sample{} + for i, idx := range rand.Perm(splitKeySampleSize) { + var tableID uint32 + if i <= 15 { + tableID = uint32(i / 3) + } else { + tableID = 6 + } + twentyPercentPopularKeySample[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(tableID), + } + } + twentyFivePercentPopularKeySample := [splitKeySampleSize]sample{} + for i, idx := range rand.Perm(splitKeySampleSize) { + var tableID uint32 + if i < 8 || i >= 13 { + tableID = uint32(i / 4) + } else { + tableID = 2 + } + twentyFivePercentPopularKeySample[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(tableID), + } + } + fiftyPercentPopularKeySample := [splitKeySampleSize]sample{} + for i, idx := range rand.Perm(splitKeySampleSize) { + fiftyPercentPopularKeySample[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(uint32(i / 10)), + } + } + fiftyFivePercentPopularKeySample := [splitKeySampleSize]sample{} + for i, idx := range rand.Perm(splitKeySampleSize) { + var tableID uint32 + if i >= 11 { + tableID = uint32(1) + } + fiftyFivePercentPopularKeySample[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(tableID), + } + } + sameKeySample := [splitKeySampleSize]sample{} + for _, idx := range rand.Perm(splitKeySampleSize) { + sameKeySample[idx] = sample{ + key: keys.SystemSQLCodec.TablePrefix(0), + } + } + + testCases := []struct { + samples [splitKeySampleSize]sample + expectedPopularKeyFrequency float64 + }{ + {uniqueKeySample, 0.05}, + {twentyPercentPopularKeySample, 0.2}, + {twentyFivePercentPopularKeySample, 0.25}, + {fiftyPercentPopularKeySample, 0.5}, + {fiftyFivePercentPopularKeySample, 0.55}, + {sameKeySample, 1}, + } + for i, test := range testCases { + finder := NewFinder(timeutil.Now()) + finder.samples = test.samples + popularKeyFrequency := finder.PopularKeyFrequency() + assert.Equal(t, test.expectedPopularKeyFrequency, popularKeyFrequency, "unexpected popular key frequency in test %d", i) + } +} diff --git a/pkg/kv/kvserver/split_queue.go b/pkg/kv/kvserver/split_queue.go index 514aaf98a48f..5eecb2d1c475 100644 --- a/pkg/kv/kvserver/split_queue.go +++ b/pkg/kv/kvserver/split_queue.go @@ -151,7 +151,7 @@ func (sq *splitQueue) shouldQueue( repl.GetMaxBytes(), repl.shouldBackpressureWrites(), confReader) if !shouldQ && repl.SplitByLoadEnabled() { - if splitKey := repl.loadBasedSplitter.MaybeSplitKey(timeutil.Now()); splitKey != nil { + if splitKey := repl.loadBasedSplitter.MaybeSplitKey(ctx, timeutil.Now()); splitKey != nil { shouldQ, priority = true, 1.0 // default priority } } @@ -233,10 +233,10 @@ func (sq *splitQueue) processAttempt( } now := timeutil.Now() - if splitByLoadKey := r.loadBasedSplitter.MaybeSplitKey(now); splitByLoadKey != nil { + if splitByLoadKey := r.loadBasedSplitter.MaybeSplitKey(ctx, now); splitByLoadKey != nil { batchHandledQPS, _ := r.QueriesPerSecond() raftAppliedQPS := r.WritesPerSecond() - splitQPS := 
r.loadBasedSplitter.LastQPS(now) + splitQPS := r.loadBasedSplitter.LastQPS(ctx, now) reason := fmt.Sprintf( "load at key %s (%.2f splitQPS, %.2f batches/sec, %.2f raft mutations/sec)", splitByLoadKey, diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go index c91f7b93110a..62d38fde0654 100644 --- a/pkg/ts/catalog/chart_catalog.go +++ b/pkg/ts/catalog/chart_catalog.go @@ -741,6 +741,18 @@ var charts = []sectionDescription{ }, }, }, + { + Organization: [][]string{{DistributionLayer, "Load", "Splitter"}}, + Charts: []chartDescription{ + { + Title: "Load Splitter", + Metrics: []string{ + "kv.loadsplitter.popularkey", + "kv.loadsplitter.nosplitkey", + }, + }, + }, + }, { Organization: [][]string{ {DistributionLayer, "Split Queue"}, From 8d4c5e1e1c727c255073fcd5f61e6b97d0406e8a Mon Sep 17 00:00:00 2001 From: Kai Sun Date: Mon, 7 Nov 2022 23:38:51 -0500 Subject: [PATCH 3/5] kvserver: Fix performance regression due to new call to collectSpansRead When we incorporated the use of response data in the load-based splitter, we called collectSpansRead, which is allocation heavy and computationally expensive, resulting in a performance regression. To address this, this patch performs 3 optimizations: 1. Remove the call to collectSpansRead; instead, add a custom function to iterate over the batch of requests / responses and calculate the true spans 2. Instead of constructing a *spanset.SpanSet and finding the union of spans (which uses O(batch_size) memory), we directly compute the union of spans while iterating over the batch resulting in only O(1) memory used 3. Lazily compute the union of true spans only when it is truly needed i.e. we are under heavy load (e.g. >2500QPS) and a load-based splitter has been initialized Release note: None --- pkg/kv/kvserver/BUILD.bazel | 1 + pkg/kv/kvserver/replica_send.go | 11 +- pkg/kv/kvserver/replica_split_load.go | 78 +++++- pkg/kv/kvserver/replica_split_load_test.go | 297 +++++++++++++++++++++ 4 files changed, 381 insertions(+), 6 deletions(-) create mode 100644 pkg/kv/kvserver/replica_split_load_test.go diff --git a/pkg/kv/kvserver/BUILD.bazel b/pkg/kv/kvserver/BUILD.bazel index 7a3d982cf125..992bff96c772 100644 --- a/pkg/kv/kvserver/BUILD.bazel +++ b/pkg/kv/kvserver/BUILD.bazel @@ -298,6 +298,7 @@ go_test( "replica_rangefeed_test.go", "replica_rankings_test.go", "replica_sideload_test.go", + "replica_split_load_test.go", "replica_sst_snapshot_storage_test.go", "replica_test.go", "replica_tscache_test.go", diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index ed15b2b3789a..aa920d3f544e 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -402,9 +402,14 @@ func (r *Replica) executeBatchWithConcurrencyRetries( var g *concurrency.Guard defer func() { // Handle load-based splitting, if necessary. - if pErr == nil { - spansRead, _, _ := r.collectSpansRead(ba, br) - r.recordBatchForLoadBasedSplitting(ctx, ba, spansRead) + if pErr == nil && br != nil { + if len(ba.Requests) != len(br.Responses) { + log.KvDistribution.Errorf(ctx, + "Requests and responses should be equal lengths: # of requests = %d, # of responses = %d", + len(ba.Requests), len(br.Responses)) + } else { + r.recordBatchForLoadBasedSplitting(ctx, ba, br) + } } // NB: wrapped to delay g evaluation to its value when returning. 
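For illustration, a minimal hypothetical sketch of optimization 2 above: rather than materializing every request span in a *spanset.SpanSet, the union of the true spans is kept in a single roachpb.Span and widened in place, using the same Span.Valid and Span.Combine calls that the addSpanToBoundary closure in the replica_split_load.go hunk below relies on (the helper name and values here are made up):

    package main

    import (
        "fmt"

        "github.com/cockroachdb/cockroach/pkg/roachpb"
    )

    // unionSpans keeps one running span and widens it per element, so memory
    // stays O(1) regardless of how many spans the batch produced.
    func unionSpans(spans []roachpb.Span) (union roachpb.Span) {
        for _, sp := range spans {
            if !union.Valid() {
                // The zero Span is not valid, so the first span is adopted as-is.
                union = sp
                continue
            }
            // Combine returns a span covering both the running union and sp.
            union = union.Combine(sp)
        }
        return union
    }

    func main() {
        union := unionSpans([]roachpb.Span{
            {Key: roachpb.Key("c"), EndKey: roachpb.Key("f")},
            {Key: roachpb.Key("a"), EndKey: roachpb.Key("b")},
            {Key: roachpb.Key("e"), EndKey: roachpb.Key("k")},
        })
        fmt.Println(union) // the three inputs union to the span [a, k)
    }

The load-based splitter only needs one boundary span per batch, so nothing is lost by never building the per-request span set.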
diff --git a/pkg/kv/kvserver/replica_split_load.go b/pkg/kv/kvserver/replica_split_load.go index ea42bbabbdde..3cc43ae28e39 100644 --- a/pkg/kv/kvserver/replica_split_load.go +++ b/pkg/kv/kvserver/replica_split_load.go @@ -13,7 +13,6 @@ package kvserver import ( "context" - "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -48,16 +47,89 @@ func (r *Replica) SplitByLoadEnabled() bool { !r.store.TestingKnobs().DisableLoadBasedSplitting } +// getResponseBoundarySpan computes the union span of the true spans that were +// iterated over using the request span and the response's resumeSpan. +// +// Assumptions: +// 1. br != nil +// 2. len(ba.Requests) == len(br.Responses) +// Assumptions are checked in executeBatchWithConcurrencyRetries. +func getResponseBoundarySpan( + ba *roachpb.BatchRequest, br *roachpb.BatchResponse, +) (responseBoundarySpan roachpb.Span) { + addSpanToBoundary := func(span roachpb.Span) { + if !responseBoundarySpan.Valid() { + responseBoundarySpan = span + } else { + responseBoundarySpan = responseBoundarySpan.Combine(span) + } + } + for i, respUnion := range br.Responses { + reqHeader := ba.Requests[i].GetInner().Header() + resp := respUnion.GetInner() + resumeSpan := resp.Header().ResumeSpan + if resumeSpan == nil { + // Fully evaluated. + addSpanToBoundary(reqHeader.Span()) + continue + } + + switch resp.(type) { + case *roachpb.GetResponse: + // The request did not evaluate. Ignore it. + continue + case *roachpb.ScanResponse: + // Not reverse (->) + // Request: [key...............endKey) + // ResumeSpan: [key......endKey) + // True span: [key......key) + // + // Assumptions (not checked to minimize overhead): + // reqHeader.EndKey == resumeSpan.EndKey + // reqHeader.Key <= resumeSpan.Key. + if reqHeader.Key.Equal(resumeSpan.Key) { + // The request did not evaluate. Ignore it. + continue + } + addSpanToBoundary(roachpb.Span{ + Key: reqHeader.Key, + EndKey: resumeSpan.Key, + }) + case *roachpb.ReverseScanResponse: + // Reverse (<-) + // Request: [key...............endKey) + // ResumeSpan: [key......endKey) + // True span: [endKey...endKey) + // + // Assumptions (not checked to minimize overhead): + // reqHeader.Key == resumeSpan.Key + // resumeSpan.EndKey <= reqHeader.EndKey. + if reqHeader.EndKey.Equal(resumeSpan.EndKey) { + // The request did not evaluate. Ignore it. + continue + } + addSpanToBoundary(roachpb.Span{ + Key: resumeSpan.EndKey, + EndKey: reqHeader.EndKey, + }) + default: + // Consider it fully evaluated, which is safe. + addSpanToBoundary(reqHeader.Span()) + } + } + return +} + // recordBatchForLoadBasedSplitting records the batch's spans to be considered // for load based splitting. 
func (r *Replica) recordBatchForLoadBasedSplitting( - ctx context.Context, ba *roachpb.BatchRequest, spans *spanset.SpanSet, + ctx context.Context, ba *roachpb.BatchRequest, br *roachpb.BatchResponse, ) { if !r.SplitByLoadEnabled() { return } shouldInitSplit := r.loadBasedSplitter.Record(ctx, timeutil.Now(), len(ba.Requests), func() roachpb.Span { - return spans.BoundarySpan(spanset.SpanGlobal) + return getResponseBoundarySpan(ba, br) }) if shouldInitSplit { r.store.splitQueue.MaybeAddAsync(ctx, r, r.store.Clock().NowAsClockTimestamp()) diff --git a/pkg/kv/kvserver/replica_split_load_test.go b/pkg/kv/kvserver/replica_split_load_test.go new file mode 100644 index 000000000000..3d04da3546b9 --- /dev/null +++ b/pkg/kv/kvserver/replica_split_load_test.go @@ -0,0 +1,297 @@ +/// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package kvserver + +import ( + "testing" + + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/stretchr/testify/assert" +) + +func roachpbKey(key uint32) roachpb.Key { + return keys.SystemSQLCodec.TablePrefix(key) +} + +func requestHeaderWithNilEndKey(key uint32) roachpb.RequestHeader { + return roachpb.RequestHeader{ + Key: roachpbKey(key), + } +} + +func requestHeader(key uint32, endKey uint32) roachpb.RequestHeader { + return roachpb.RequestHeader{ + Key: roachpbKey(key), + EndKey: roachpbKey(endKey), + } +} + +func responseHeaderWithNilResumeSpan() roachpb.ResponseHeader { + return roachpb.ResponseHeader{ + ResumeSpan: nil, + } +} + +func responseHeaderWithNilEndKey(key uint32) roachpb.ResponseHeader { + return roachpb.ResponseHeader{ + ResumeSpan: &roachpb.Span{ + Key: roachpbKey(key), + }, + } +} + +func responseHeader(key uint32, endKey uint32) roachpb.ResponseHeader { + return roachpb.ResponseHeader{ + ResumeSpan: &roachpb.Span{ + Key: roachpbKey(key), + EndKey: roachpbKey(endKey), + }, + } +} + +func requestUnionGet(requestHeader roachpb.RequestHeader) roachpb.RequestUnion { + return roachpb.RequestUnion{ + Value: &roachpb.RequestUnion_Get{ + Get: &roachpb.GetRequest{ + RequestHeader: requestHeader, + }, + }, + } +} + +func responseUnionGet(responseHeader roachpb.ResponseHeader) roachpb.ResponseUnion { + return roachpb.ResponseUnion{ + Value: &roachpb.ResponseUnion_Get{ + Get: &roachpb.GetResponse{ + ResponseHeader: responseHeader, + }, + }, + } +} + +func requestUnionScan(requestHeader roachpb.RequestHeader) roachpb.RequestUnion { + return roachpb.RequestUnion{ + Value: &roachpb.RequestUnion_Scan{ + Scan: &roachpb.ScanRequest{ + RequestHeader: requestHeader, + }, + }, + } +} + +func responseUnionScan(responseHeader roachpb.ResponseHeader) roachpb.ResponseUnion { + return roachpb.ResponseUnion{ + Value: &roachpb.ResponseUnion_Scan{ + Scan: &roachpb.ScanResponse{ + ResponseHeader: responseHeader, + }, + }, + } +} + +func requestUnionReverseScan(requestHeader roachpb.RequestHeader) roachpb.RequestUnion { + return roachpb.RequestUnion{ + Value: &roachpb.RequestUnion_ReverseScan{ + ReverseScan: &roachpb.ReverseScanRequest{ + RequestHeader: requestHeader, + }, + }, + } +} + +func responseUnionReverseScan(responseHeader 
roachpb.ResponseHeader) roachpb.ResponseUnion { + return roachpb.ResponseUnion{ + Value: &roachpb.ResponseUnion_ReverseScan{ + ReverseScan: &roachpb.ReverseScanResponse{ + ResponseHeader: responseHeader, + }, + }, + } +} + +func requestUnionDeleteRange(requestHeader roachpb.RequestHeader) roachpb.RequestUnion { + return roachpb.RequestUnion{ + Value: &roachpb.RequestUnion_DeleteRange{ + DeleteRange: &roachpb.DeleteRangeRequest{ + RequestHeader: requestHeader, + }, + }, + } +} + +func responseUnionDeleteRange(responseHeader roachpb.ResponseHeader) roachpb.ResponseUnion { + return roachpb.ResponseUnion{ + Value: &roachpb.ResponseUnion_DeleteRange{ + DeleteRange: &roachpb.DeleteRangeResponse{ + ResponseHeader: responseHeader, + }, + }, + } +} + +func TestGetResponseBoundarySpan(t *testing.T) { + defer leaktest.AfterTest(t)() + testCases := []struct { + ba *roachpb.BatchRequest + br *roachpb.BatchResponse + expectedResponseBoundarySpan roachpb.Span + }{ + { + ba: &roachpb.BatchRequest{ + Requests: []roachpb.RequestUnion{ + requestUnionGet(requestHeaderWithNilEndKey(100)), + }, + }, + br: &roachpb.BatchResponse{ + Responses: []roachpb.ResponseUnion{ + responseUnionGet(responseHeaderWithNilResumeSpan()), + }, + }, + expectedResponseBoundarySpan: roachpb.Span{ + Key: roachpbKey(100), + }, + }, + { + ba: &roachpb.BatchRequest{ + Requests: []roachpb.RequestUnion{ + requestUnionScan(requestHeader(100, 900)), + }, + }, + br: &roachpb.BatchResponse{ + Responses: []roachpb.ResponseUnion{ + responseUnionScan(responseHeaderWithNilResumeSpan()), + }, + }, + expectedResponseBoundarySpan: roachpb.Span{ + Key: roachpbKey(100), + EndKey: roachpbKey(900), + }, + }, + { + ba: &roachpb.BatchRequest{ + Requests: []roachpb.RequestUnion{ + requestUnionScan(requestHeader(100, 900)), + }, + }, + br: &roachpb.BatchResponse{ + Responses: []roachpb.ResponseUnion{ + responseUnionScan(responseHeader(113, 900)), + }, + }, + expectedResponseBoundarySpan: roachpb.Span{ + Key: roachpbKey(100), + EndKey: roachpbKey(113), + }, + }, + { + ba: &roachpb.BatchRequest{ + Requests: []roachpb.RequestUnion{ + requestUnionReverseScan(requestHeader(100, 900)), + }, + }, + br: &roachpb.BatchResponse{ + Responses: []roachpb.ResponseUnion{ + responseUnionReverseScan(responseHeader(100, 879)), + }, + }, + expectedResponseBoundarySpan: roachpb.Span{ + Key: roachpbKey(879), + EndKey: roachpbKey(900), + }, + }, + { + ba: &roachpb.BatchRequest{ + Requests: []roachpb.RequestUnion{ + requestUnionDeleteRange(requestHeader(100, 900)), + }, + }, + br: &roachpb.BatchResponse{ + Responses: []roachpb.ResponseUnion{ + responseUnionDeleteRange(responseHeader(113, 900)), + }, + }, + expectedResponseBoundarySpan: roachpb.Span{ + Key: roachpbKey(100), + EndKey: roachpbKey(900), + }, + }, + { + ba: &roachpb.BatchRequest{ + Requests: []roachpb.RequestUnion{ + requestUnionGet(requestHeaderWithNilEndKey(100)), + }, + }, + br: &roachpb.BatchResponse{ + Responses: []roachpb.ResponseUnion{ + responseUnionGet(responseHeaderWithNilEndKey(100)), + }, + }, + expectedResponseBoundarySpan: roachpb.Span{}, + }, + { + ba: &roachpb.BatchRequest{ + Requests: []roachpb.RequestUnion{ + requestUnionScan(requestHeader(100, 900)), + }, + }, + br: &roachpb.BatchResponse{ + Responses: []roachpb.ResponseUnion{ + responseUnionScan(responseHeader(100, 900)), + }, + }, + expectedResponseBoundarySpan: roachpb.Span{}, + }, + { + ba: &roachpb.BatchRequest{ + Requests: []roachpb.RequestUnion{ + requestUnionReverseScan(requestHeader(100, 900)), + }, + }, + br: &roachpb.BatchResponse{ + 
Responses: []roachpb.ResponseUnion{
+				responseUnionReverseScan(responseHeader(100, 900)),
+			},
+		},
+		expectedResponseBoundarySpan: roachpb.Span{},
+	},
+	{
+		ba: &roachpb.BatchRequest{
+			Requests: []roachpb.RequestUnion{
+				requestUnionScan(requestHeader(500, 600)),
+				requestUnionReverseScan(requestHeader(475, 625)),
+				requestUnionGet(requestHeaderWithNilEndKey(480)),
+				requestUnionReverseScan(requestHeader(500, 510)),
+				requestUnionScan(requestHeader(700, 800)),
+			},
+		},
+		br: &roachpb.BatchResponse{
+			Responses: []roachpb.ResponseUnion{
+				responseUnionScan(responseHeader(550, 600)),
+				responseUnionReverseScan(responseHeader(475, 525)),
+				responseUnionGet(responseHeaderWithNilResumeSpan()),
+				responseUnionReverseScan(responseHeaderWithNilResumeSpan()),
+				responseUnionScan(responseHeader(700, 800)),
+			},
+		},
+		expectedResponseBoundarySpan: roachpb.Span{
+			Key:    roachpbKey(480),
+			EndKey: roachpbKey(625),
+		},
+	},
+	}
+	for i, test := range testCases {
+		responseBoundarySpan := getResponseBoundarySpan(test.ba, test.br)
+		assert.Equal(t, test.expectedResponseBoundarySpan, responseBoundarySpan, "Expected response boundary span %s, got %s in test %d",
+			test.expectedResponseBoundarySpan, responseBoundarySpan, i)
+	}
+}

From 1cb5e6ce0ef61b89836805257f54c8cc58c7d413 Mon Sep 17 00:00:00 2001
From: Austen McClernon
Date: Sat, 20 May 2023 01:25:26 +0000
Subject: [PATCH 4/5] kvserver: avoid load-based splitting between rows

It was possible for a SQL row to be torn across two ranges because the
load-based splitter did not reject potentially unsafe split keys. It is
impossible to determine, from keys sampled from response spans alone,
whether a key is certainly unsafe or safe.

This commit sidesteps the problem by reusing the
`adminSplitWithDescriptor` command to find the first real key at or
after the provided `args.SplitKey`. This ensures that the split key will
always be a real key, whilst not requiring any checks in the splitter
itself.

The updated `adminSplitWithDescriptor` is local only and requires opting
into finding the first safe key by setting `findFirstSafeKey` to `true`.
As such, all safe split key checks are also removed from the `split`
pkg, with a warning added that any split key returned may be unsafe.

Resolves: https://github.com/cockroachdb/cockroach/issues/103483

Release note (bug fix): It was possible for a SQL row to be split across
two ranges. When this occurred, SQL queries could return unexpected
errors. This bug is resolved by these changes, as we now inspect the
real keys, rather than just request keys, to determine load-based split
points.
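To illustrate why a sampled key cannot be trusted on its own, the sketch
below is a standalone illustration (not part of the patch) that mirrors
the key shapes used by the decider tests removed further down. It assumes
the cockroach `keys`, `encoding`, and `roachpb` packages: a key that ends
in a column family suffix can be mapped back to its row prefix with
`keys.EnsureSafeSplitKey`, while a key with an arbitrary trailing
component makes `EnsureSafeSplitKey` error out even though it may still
be a usable split point.

package main

import (
	"fmt"
	"math"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
)

func main() {
	// Build a row prefix of the shape /Table/51/52/53/54/55, the same shape
	// constructed by the removed decider tests.
	baseKey := keys.SystemSQLCodec.TablePrefix(51)
	for i := 0; i < 4; i++ {
		baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i))
	}

	// A sampled key that carries a column family suffix can be mapped back to
	// its row prefix, so a split at the prefix would not tear the row.
	famKey := keys.MakeFamilyKey(append([]byte(nil), baseKey...), 9)
	if rowPrefix, err := keys.EnsureSafeSplitKey(famKey); err == nil {
		fmt.Printf("family key %s -> row prefix %s\n", roachpb.Key(famKey), rowPrefix)
	}

	// A sampled key whose trailing component is not a column family suffix
	// (e.g. a scan boundary) errors out of EnsureSafeSplitKey, even though it
	// might still be a perfectly good split point. The key alone cannot tell
	// us either way.
	oddKey := encoding.EncodeUvarintAscending(append([]byte(nil), baseKey...), math.MaxInt32+1)
	if _, err := keys.EnsureSafeSplitKey(oddKey); err != nil {
		fmt.Printf("cannot validate %s from the key alone: %v\n", roachpb.Key(oddKey), err)
	}
}

Because the second case cannot be classified from the key itself, the
patch below has the split queue pass the raw suggestion to
`adminSplitWithDescriptor` with `findFirstSafeKey` set, which resolves it
against the keys actually present in the range.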
--- pkg/kv/kvserver/client_split_test.go | 246 +++++++++++++++++++++ pkg/kv/kvserver/replica_command.go | 33 ++- pkg/kv/kvserver/replica_split_load.go | 82 +++++++ pkg/kv/kvserver/split/BUILD.bazel | 2 - pkg/kv/kvserver/split/decider.go | 34 ++- pkg/kv/kvserver/split/decider_test.go | 78 ------- pkg/kv/kvserver/split_queue.go | 11 +- pkg/kv/kvserver/testing_knobs.go | 4 + pkg/storage/mvcc.go | 70 +++++- pkg/storage/mvcc_test.go | 300 ++++++++++++++++++++------ 10 files changed, 684 insertions(+), 176 deletions(-) diff --git a/pkg/kv/kvserver/client_split_test.go b/pkg/kv/kvserver/client_split_test.go index 3c4a981895f3..2c5720790fc3 100644 --- a/pkg/kv/kvserver/client_split_test.go +++ b/pkg/kv/kvserver/client_split_test.go @@ -3649,3 +3649,249 @@ func TestStoreRangeSplitAndMergeWithGlobalReads(t *testing.T) { repl = store.LookupReplica(roachpb.RKey(splitKey)) require.Equal(t, descKey, repl.Desc().StartKey.AsRawKey()) } + +// TestLBSplitUnsafeKeys tests that load based splits do not split between table +// rows, even when the suggested load based split key is itself between a table row. +func TestLBSplitUnsafeKeys(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + ctx := context.Background() + const indexID = 1 + + // The test is expensive and prone to timing out under race. + skip.UnderRace(t) + skip.UnderStressRace(t) + skip.UnderDeadlock(t) + + makeTestKey := func(tableID uint32, suffix []byte) roachpb.Key { + tableKey := keys.MakeTableIDIndexID(nil, tableID, indexID) + return append(tableKey, suffix...) + } + + es := func(vals ...int64) []byte { + k := []byte{} + for _, v := range vals { + k = encoding.EncodeVarintAscending(k, v) + } + return k + } + + fk := func(k []byte, famID uint32) []byte { + return keys.MakeFamilyKey(k, famID) + } + + testCases := []struct { + splitKey roachpb.Key + existingKeys []int + expSplitKey roachpb.Key + expErrStr string + }{ + // We don't know the table ID here, we append the splitKey to the + // table/index prefix. e.g. /1 will be /Table/table_id/index_id/1. + { + // /1 -> /2 + splitKey: es(1), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0 -> /2 + splitKey: fk(es(1), 0), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3 -> /2 + splitKey: fk(es(1), 3), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 0)).Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 3)).Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 0)).Next().Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 3)).Next().Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /0 -> /3 + // We will not split at the first row in a table, so expect the split at + // /3 intead of /2. + splitKey: es(0), + existingKeys: []int{2, 3}, + expSplitKey: es(3), + }, + { + // /2 -> /3 + // Same case as above, despite the key being safe, the split would create + // an empty LHS. + splitKey: es(2), + existingKeys: []int{2, 3}, + expSplitKey: es(3), + }, + { + // /1 -> error + // There are no rows to split on. + splitKey: es(1), + existingKeys: []int{}, + expErrStr: "could not find valid split key", + }, + { + // /1 -> error + // There is only one row to split on, the range should not be split. 
+ splitKey: es(1), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/0 -> error + splitKey: fk(es(1), 0), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/3 -> error + splitKey: fk(es(1), 3), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/3/0/0 -> error + splitKey: roachpb.Key(fk(es(1), 0)).Next().Next(), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /2 -> /2 + splitKey: es(2), + existingKeys: []int{1, 2}, + expSplitKey: es(2), + }, + } + + for _, tc := range testCases { + var expectStr string + if tc.expErrStr != "" { + expectStr = tc.expErrStr + } else { + expectStr = makeTestKey(1, tc.expSplitKey).String() + } + t.Run(fmt.Sprintf("%s%v -> %s", makeTestKey(1, tc.splitKey), tc.existingKeys, expectStr), func(t *testing.T) { + var targetRange atomic.Int32 + var splitKeyOverride atomic.Value + splitKeyOverride.Store(roachpb.Key{}) + + // Mock the load based splitter key finding method. This function will be + // checked in splitQueue.shouldQueue() and splitQueue.process via + // replica.loadSplitKey. When a key is returned, the split queue calls + // replica.adminSplitWithDescriptor(...findFirstSafeSplitKey=true). + overrideLBSplitFn := func(rangeID roachpb.RangeID) (splitKey roachpb.Key, useSplitKey bool) { + if rangeID == roachpb.RangeID(targetRange.Load()) { + override := splitKeyOverride.Load() + // It is possible that the split queue is checking the range before + // we manually enqueued it. + if override == nil { + return nil, false + } + overrideKey, ok := override.(roachpb.Key) + require.Truef(t, ok, "stored value not key %+v", override) + + if len(overrideKey) == 0 { + return nil, false + } + + return override.(roachpb.Key), true + } + return nil, false + } + + serv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{ + DisableSpanConfigs: true, + Knobs: base.TestingKnobs{ + Store: &kvserver.StoreTestingKnobs{ + LoadBasedSplittingOverrideKey: overrideLBSplitFn, + DisableMergeQueue: true, + }, + }, + }) + s := serv.(*server.TestServer) + defer s.Stopper().Stop(ctx) + tdb := sqlutils.MakeSQLRunner(sqlDB) + store, err := s.Stores().GetStore(1) + require.NoError(t, err) + + // We want to exercise the case where there are column family keys. + // Create a simple table and insert the existing keys. + _ = tdb.Exec(t, + "CREATE TABLE t (k INT PRIMARY KEY, "+ + "t0 INT, t1 INT, t2 INT, t3 INT, "+ + "FAMILY (k), FAMILY (t0), FAMILY (t1), FAMILY (t2), FAMILY (t3))") + for _, k := range tc.existingKeys { + _ = tdb.Exec(t, fmt.Sprintf("INSERT INTO t VALUES (%d, %d, %d, %d, %d)", + k, k, k, k, k)) + } + + // Force a table scan to resolve descriptors. + var keyCount int + tdb.QueryRow(t, "SELECT count(k) FROM t").Scan(&keyCount) + require.Equal(t, len(tc.existingKeys), keyCount) + var tableID uint32 + tdb.QueryRow(t, "SELECT table_id FROM crdb_internal.leases where name = 't'").Scan(&tableID) + + // Split off the table range for the test, otherwise the range may + // contain multiple tables with existing values. 
+ splitArgs := adminSplitArgs(keys.SystemSQLCodec.TablePrefix(tableID)) + _, pErr := kv.SendWrapped(ctx, store.TestSender(), splitArgs) + require.Nil(t, pErr) + + var rangeID roachpb.RangeID + tdb.QueryRow(t, "SELECT range_id FROM [SHOW RANGES FROM TABLE t] LIMIT 1").Scan(&rangeID) + targetRange.Store(int32(rangeID)) + repl, err := store.GetReplica(rangeID) + require.NoError(t, err) + + // Keep the previous end key around, we will use this to assert that no + // split has occurred when expecting an error. + prevEndKey := repl.Desc().EndKey.AsRawKey() + splitKey := makeTestKey(tableID, tc.splitKey) + + // Update the split key override so that the split queue will enqueue and + // process the range. Remove it afterwards to avoid retrying the LHS. + splitKeyOverride.Store(splitKey) + _, processErr, enqueueErr := store.Enqueue(ctx, "split", repl, false /* shouldSkipQueue */, false /* async */) + splitKeyOverride.Store(roachpb.Key{}) + require.NoError(t, enqueueErr) + + endKey := repl.Desc().EndKey.AsRawKey() + if tc.expErrStr != "" { + // We expect this split not to process, assert that the expected error + // matches the returned error and the range has the same end key. + require.ErrorContainsf(t, processErr, tc.expErrStr, + "end key %s, previous end key %s", endKey, prevEndKey) + require.Equal(t, prevEndKey, endKey) + } else { + // Otherwise, assert that the new range end key matches the expected + // end key. + require.NoError(t, processErr) + require.Equal(t, makeTestKey(tableID, tc.expSplitKey), endKey) + } + }) + } +} diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 148b0493612d..acc70a38bc18 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -77,7 +77,7 @@ func (r *Replica) AdminSplit( err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error { var err error - reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason) + reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason, false /* findFirstSafeKey */) return err }) return reply, err @@ -296,7 +296,6 @@ func splitTxnStickyUpdateAttempt( // affirmative the descriptor is passed to AdminSplit, which performs a // Conditional Put on the RangeDescriptor to ensure that no other operation has // modified the range in the time the decision was being made. -// TODO(tschottdorf): should assert that split key is not a local key. // // See the comment on splitTrigger for details on the complexities. func (r *Replica) adminSplitWithDescriptor( @@ -305,6 +304,7 @@ func (r *Replica) adminSplitWithDescriptor( desc *roachpb.RangeDescriptor, delayable bool, reason string, + findFirstSafeKey bool, ) (roachpb.AdminSplitResponse, error) { var err error var reply roachpb.AdminSplitResponse @@ -345,11 +345,36 @@ func (r *Replica) adminSplitWithDescriptor( ri := r.GetRangeInfo(ctx) return reply, roachpb.NewRangeKeyMismatchErrorWithCTPolicy(ctx, args.Key, args.Key, desc, &ri.Lease, ri.ClosedTimestampPolicy) } - foundSplitKey = args.SplitKey + // When findFirstSafeKey is true, we find the first key after or at + // args.SplitKey which is a safe split to split at. The current user of + // findFirstSafeKey is load based splitting, which only has knowledge of + // sampled keys from batch requests. These sampled keys can be + // arbitrarily within SQL rows due to column family keys. 
+ // + // Not every caller requires a real key as a split point (creating empty + // table), however when we cannot verify the split key as safe, the most + // reliable method is checking existing keys. + if findFirstSafeKey { + var desiredSplitKey roachpb.RKey + if desiredSplitKey, err = keys.Addr(args.SplitKey); err != nil { + return reply, err + } + if foundSplitKey, err = storage.MVCCFirstSplitKey( + ctx, r.store.engine, desiredSplitKey, + desc.StartKey, desc.EndKey, + ); err != nil { + return reply, errors.Wrap(err, "unable to determine split key") + } else if foundSplitKey == nil { + return reply, unsplittableRangeError{} + } + } else { + foundSplitKey = args.SplitKey + } } if !kvserverbase.ContainsKey(desc, foundSplitKey) { - return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r) + return reply, errors.Errorf("requested split key %s (found=%s) out of bounds of %s", + args.SplitKey, foundSplitKey, r) } // If predicate keys are specified, make sure they are contained by this diff --git a/pkg/kv/kvserver/replica_split_load.go b/pkg/kv/kvserver/replica_split_load.go index 3cc43ae28e39..34d26e3ead83 100644 --- a/pkg/kv/kvserver/replica_split_load.go +++ b/pkg/kv/kvserver/replica_split_load.go @@ -12,10 +12,14 @@ package kvserver import ( "context" + "time" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" + "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/errors" ) // SplitByLoadEnabled wraps "kv.range_split.by_load_enabled". @@ -135,3 +139,81 @@ func (r *Replica) recordBatchForLoadBasedSplitting( r.store.splitQueue.MaybeAddAsync(ctx, r, r.store.Clock().NowAsClockTimestamp()) } } + +// loadSplitKey returns a suggested load split key for the range if it exists, +// otherwise it returns nil. If there were any errors encountered when +// validating the split key, the error is returned as well. It is guaranteed +// that the key returned, if non-nil, will be greater than the start key of the +// range and also within the range bounds. +// +// NOTE: The returned split key CAN BE BETWEEN A SQL ROW, The split key +// returned should only be used to engage a split via adminSplitWithDescriptor +// where findFirstSafeKey is set to true. +func (r *Replica) loadSplitKey(ctx context.Context, now time.Time) roachpb.Key { + var splitKey roachpb.Key + if overrideFn := r.store.cfg.TestingKnobs.LoadBasedSplittingOverrideKey; overrideFn != nil { + var useSplitKey bool + if splitKey, useSplitKey = overrideFn(r.GetRangeID()); useSplitKey { + return splitKey + } + } else { + splitKey = r.loadBasedSplitter.MaybeSplitKey(ctx, now) + } + + if splitKey == nil { + return nil + } + + // If the splitKey belongs to a Table range, try and shorten the key to just + // the row prefix. This allows us to check that splitKey doesn't map to the + // first key of the range here. If the split key contains column families, it + // is possible that the full key is strictly after every existing key for + // that row. e.g. for a table row where the table ID is 100, index ID is 1, + // primary key is a, and the column family ID is 3 (length=1): + // + // splitKey = /Table/100/1/"a"/3/1 + // existing = [..., /Table/100/1/"a"/2/1] + // + // We would not split at /Table/100/1/"a" as there's no key >= the splitKey + // in the range. 
+ // + // NB: We handle unsafe split keys in replica.adminSplitWithDescriptor, so it + // isn't an issue if we return an unsafe key here. See the case where + // findFirstSafeKey is true. + if keyRowPrefix, err := keys.EnsureSafeSplitKey(splitKey); err == nil { + splitKey = keyRowPrefix + } + + // We swallow the error here and instead log an event. It is currently + // expected that the load based splitter may return the start key of the + // range. + if err := splitKeyPreCheck(r.Desc().RSpan(), splitKey); err != nil { + log.KvDistribution.VEventf(ctx, 1, "suggested load split key not usable: %s", err) + return nil + } + + return splitKey +} + +// splitKeyPreCheck checks that a split key is addressable and not the same as +// the start key. An error is returned if these are not true. Additional checks +// are made in adminSplitWithDescriptor when a split request is processed by +// the replica. +func splitKeyPreCheck(rspan roachpb.RSpan, splitKey roachpb.Key) error { + splitRKey, err := keys.Addr(splitKey) + if err != nil { + return err + } + + // If the split key is equal to the start key of the range, it is treated as + // a no-op in adminSplitWithDescriptor, however it is treated as an error + // here because we shouldn't be suggesting split keys that are identical to + // the start key of the range. + if splitRKey.Equal(rspan.Key) { + return errors.Errorf( + "split key is equal to range start key (split_key=%s)", + splitRKey) + } + + return nil +} diff --git a/pkg/kv/kvserver/split/BUILD.bazel b/pkg/kv/kvserver/split/BUILD.bazel index 02f19930667f..5a6e668deafc 100644 --- a/pkg/kv/kvserver/split/BUILD.bazel +++ b/pkg/kv/kvserver/split/BUILD.bazel @@ -10,7 +10,6 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split", visibility = ["//visibility:public"], deps = [ - "//pkg/keys", "//pkg/roachpb", "//pkg/util/log", "//pkg/util/metric", @@ -30,7 +29,6 @@ go_test( deps = [ "//pkg/keys", "//pkg/roachpb", - "//pkg/util/encoding", "//pkg/util/leaktest", "//pkg/util/metric", "//pkg/util/stop", diff --git a/pkg/kv/kvserver/split/decider.go b/pkg/kv/kvserver/split/decider.go index ce7d54e6c3fd..c2c2f8fb796e 100644 --- a/pkg/kv/kvserver/split/decider.go +++ b/pkg/kv/kvserver/split/decider.go @@ -16,7 +16,6 @@ import ( "context" "time" - "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/metric" @@ -221,6 +220,8 @@ func (d *Decider) MaxQPS(ctx context.Context, now time.Time) (float64, bool) { // or if it wasn't able to determine a suitable split key. // // It is legal to call MaybeSplitKey at any time. +// WARNING: The key returned from MaybeSplitKey has no guarantee of being a +// safe split key. The key is derived from sampled spans. See below. func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key { var key roachpb.Key @@ -244,27 +245,20 @@ func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key // // /Table/51/52/53/54/55 // - // (see TestDeciderCallsEnsureSafeSplitKey). + // The key found here isn't guaranteed to be a valid SQL column family key. + // This is because the keys are sampled from StartKey and EndKey of + // requests hitting this replica. Ranged operations may well wish to + // exclude the start point by calling .Next() or may span multiple ranges, + // and so such a key may end up being returned. 
This is more common than + // one might think since SQL issues plenty of scans over all column + // families, meaning that we'll frequently find a key that has no column + // family suffix and thus errors out in EnsureSafeSplitKey. // - // The key found here isn't guaranteed to be a valid SQL column family - // key. This is because the keys are sampled from StartKey of requests - // hitting this replica. Ranged operations may well wish to exclude the - // start point by calling .Next() or may span multiple ranges, and so - // such a key may end up being passed to EnsureSafeSplitKey here. - // - // We take the risk that the result may sometimes not be a good split - // point (or even in this range). - // - // Note that we ignore EnsureSafeSplitKey when it returns an error since - // that error only tells us that this key couldn't possibly be a SQL - // key. This is more common than one might think since SQL issues plenty - // of scans over all column families, meaning that we'll frequently find - // a key that has no column family suffix and thus errors out in - // EnsureSafeSplitKey. + // We do not attempt to validate the key is safe here, simply return it to + // the caller as the best possible split point found so far. See + // replica.adminSplitWithDescriptor for how split keys are handled when we + // aren't certain the provided key is safe. key = d.mu.splitFinder.Key() - if safeKey, err := keys.EnsureSafeSplitKey(key); err == nil { - key = safeKey - } } return key } diff --git a/pkg/kv/kvserver/split/decider_test.go b/pkg/kv/kvserver/split/decider_test.go index 6db6a2e4223a..c41ca8edfe6f 100644 --- a/pkg/kv/kvserver/split/decider_test.go +++ b/pkg/kv/kvserver/split/decider_test.go @@ -12,14 +12,12 @@ package split import ( "context" - "math" "math/rand" "testing" "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" - "github.com/cockroachdb/cockroach/pkg/util/encoding" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/stretchr/testify/assert" @@ -240,82 +238,6 @@ func TestDecider_MaxQPS(t *testing.T) { assertMaxQPS(25000, 6, true) } -func TestDeciderCallsEnsureSafeSplitKey(t *testing.T) { - defer leaktest.AfterTest(t)() - intn := rand.New(rand.NewSource(11)).Intn - - var d Decider - Init(&d, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ - PopularKeyCount: metric.NewCounter(metric.Metadata{}), - NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), - }) - - baseKey := keys.SystemSQLCodec.TablePrefix(51) - for i := 0; i < 4; i++ { - baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i)) - } - c0 := func() roachpb.Span { return roachpb.Span{Key: append([]byte(nil), keys.MakeFamilyKey(baseKey, 1)...)} } - c1 := func() roachpb.Span { return roachpb.Span{Key: append([]byte(nil), keys.MakeFamilyKey(baseKey, 9)...)} } - - expK, err := keys.EnsureSafeSplitKey(c1().Key) - require.NoError(t, err) - - var k roachpb.Key - var now time.Time - for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ { - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, 1, c0) - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, 1, c1) - k = d.MaybeSplitKey(context.Background(), now) - if len(k) != 0 { - break - } - } - - require.Equal(t, expK, k) -} - -func TestDeciderIgnoresEnsureSafeSplitKeyOnError(t *testing.T) { - defer leaktest.AfterTest(t)() - intn := 
rand.New(rand.NewSource(11)).Intn - - var d Decider - Init(&d, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ - PopularKeyCount: metric.NewCounter(metric.Metadata{}), - NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), - }) - - baseKey := keys.SystemSQLCodec.TablePrefix(51) - for i := 0; i < 4; i++ { - baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i)) - } - c0 := func() roachpb.Span { - return roachpb.Span{Key: append([]byte(nil), encoding.EncodeUvarintAscending(baseKey, math.MaxInt32+1)...)} - } - c1 := func() roachpb.Span { - return roachpb.Span{Key: append([]byte(nil), encoding.EncodeUvarintAscending(baseKey, math.MaxInt32+2)...)} - } - - _, err := keys.EnsureSafeSplitKey(c1().Key) - require.Error(t, err) - - var k roachpb.Key - var now time.Time - for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ { - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, 1, c0) - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, 1, c1) - k = d.MaybeSplitKey(context.Background(), now) - if len(k) != 0 { - break - } - } - - require.Equal(t, c1().Key, k) -} - func TestMaxQPSTracker(t *testing.T) { defer leaktest.AfterTest(t)() diff --git a/pkg/kv/kvserver/split_queue.go b/pkg/kv/kvserver/split_queue.go index 5eecb2d1c475..72c8e0508a2d 100644 --- a/pkg/kv/kvserver/split_queue.go +++ b/pkg/kv/kvserver/split_queue.go @@ -151,7 +151,7 @@ func (sq *splitQueue) shouldQueue( repl.GetMaxBytes(), repl.shouldBackpressureWrites(), confReader) if !shouldQ && repl.SplitByLoadEnabled() { - if splitKey := repl.loadBasedSplitter.MaybeSplitKey(ctx, timeutil.Now()); splitKey != nil { + if splitKey := repl.loadSplitKey(ctx, timeutil.Now()); splitKey != nil { shouldQ, priority = true, 1.0 // default priority } } @@ -209,6 +209,7 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ "span config", + false, /* findFirstSafeSplitKey */ ); err != nil { return false, errors.Wrapf(err, "unable to split %s at key %q", r, splitKey) } @@ -227,13 +228,14 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ fmt.Sprintf("%s above threshold size %s", humanizeutil.IBytes(size), humanizeutil.IBytes(maxBytes)), + false, /* findFirstSafeSplitKey */ ) return err == nil, err } now := timeutil.Now() - if splitByLoadKey := r.loadBasedSplitter.MaybeSplitKey(ctx, now); splitByLoadKey != nil { + if splitByLoadKey := r.loadSplitKey(ctx, now); splitByLoadKey != nil { batchHandledQPS, _ := r.QueriesPerSecond() raftAppliedQPS := r.WritesPerSecond() splitQPS := r.loadBasedSplitter.LastQPS(ctx, now) @@ -256,6 +258,10 @@ func (sq *splitQueue) processAttempt( if expDelay := kvserverbase.SplitByLoadMergeDelay.Get(&sq.store.cfg.Settings.SV); expDelay > 0 { expTime = sq.store.Clock().Now().Add(expDelay.Nanoseconds(), 0) } + // The splitByLoadKey has no guarantee of being a safe key to split at (not + // between SQL rows). To sanitize the split point, pass + // findFirstSafeSplitKey set to true, so that the first key after the + // suggested split point which is safe to split at is used. 
if _, pErr := r.adminSplitWithDescriptor( ctx, roachpb.AdminSplitRequest{ @@ -268,6 +274,7 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ reason, + true, /* findFirstSafeSplitKey */ ); pErr != nil { return false, errors.Wrapf(pErr, "unable to split %s at key %q", r, splitByLoadKey) } diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index 271155e9e06c..56c58806e3fa 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -137,6 +137,10 @@ type StoreTestingKnobs struct { DisableReplicaRebalancing bool // DisableLoadBasedSplitting turns off LBS so no splits happen because of load. DisableLoadBasedSplitting bool + // LoadBasedSplittingOverrideKey returns a key which should be used for load + // based splitting, overriding any value returned from the real load based + // splitter. + LoadBasedSplittingOverrideKey func(rangeID roachpb.RangeID) (splitKey roachpb.Key, useSplitKey bool) // DisableSplitQueue disables the split queue. DisableSplitQueue bool // DisableTimeSeriesMaintenanceQueue disables the time series maintenance diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 9ae85491a83b..153fddab0174 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -5436,7 +5436,27 @@ func MVCCFindSplitKey( // was dangerous because partitioning can split off ranges that do not start // at valid row keys. The keys that are present in the range, by contrast, are // necessarily valid row keys. - it.SeekGE(MakeMVCCMetadataKey(key.AsRawKey())) + minSplitKey, err := mvccMinSplitKey(it, key.AsRawKey()) + if err != nil { + return nil, err + } else if minSplitKey == nil { + return nil, nil + } + + splitKey, err := it.FindSplitKey(key.AsRawKey(), endKey.AsRawKey(), minSplitKey, targetSize) + if err != nil { + return nil, err + } + // Ensure the key is a valid split point that does not fall in the middle of a + // SQL row by removing the column family ID, if any, from the end of the key. + return keys.EnsureSafeSplitKey(splitKey.Key) +} + +// mvccMinSplitKey returns the minimum key that a range may be split at. The +// caller is responsible for setting the iterator upper bound to the range end +// key. The caller is also responsible for closing the iterator. +func mvccMinSplitKey(it MVCCIterator, startKey roachpb.Key) (roachpb.Key, error) { + it.SeekGE(MakeMVCCMetadataKey(startKey)) if ok, err := it.Valid(); err != nil { return nil, err } else if !ok { @@ -5460,14 +5480,54 @@ func MVCCFindSplitKey( // Allow a split at any key that sorts after it. minSplitKey = it.Key().Key.Next() } + return minSplitKey, nil +} - splitKey, err := it.FindSplitKey(key.AsRawKey(), endKey.AsRawKey(), minSplitKey, targetSize) +// MVCCFirstSplitKey returns the first key which is safe to split at and no +// less than desiredSplitKey in the range which spans [startKey,endKey). If a +// non-nil key is returned, it is safe to split at. If a nil key is returned, no +// safe split key could be determined. The safe split key returned is +// guaranteed to be: +// +// 1. Within [startKey,endKey). +// 2. No less than desiredSplitKey. +// 3. Greater than the first key in [startKey,endKey]; or greater than all the +// first row's keys if a table range. . +// 4. Not in between the start and end of a row for table ranges. +// +// The returned split key is NOT guaranteed to be outside a no-split span, such +// as Meta1, Meta2Max or Node Liveness. 
+func MVCCFirstSplitKey( + _ context.Context, reader Reader, desiredSplitKey, startKey, endKey roachpb.RKey, +) (roachpb.Key, error) { + it := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: endKey.AsRawKey()}) + defer it.Close() + + // If the caller has provided a desiredSplitKey less than the minimum split + // key, we update the desired split key to be the minimum split key. This + // prevents splitting before the first row in a Table range, which would + // result in the LHS having now rows. + minSplitKey, err := mvccMinSplitKey(it, startKey.AsRawKey()) if err != nil { return nil, err + } else if minSplitKey == nil { + return nil, nil } - // Ensure the key is a valid split point that does not fall in the middle of a - // SQL row by removing the column family ID, if any, from the end of the key. - return keys.EnsureSafeSplitKey(splitKey.Key) + var seekKey roachpb.Key + if minSplitKey.Compare(desiredSplitKey.AsRawKey()) > 0 { + seekKey = minSplitKey + } else { + seekKey = desiredSplitKey.AsRawKey() + } + + it.SeekGE(MakeMVCCMetadataKey(seekKey)) + if ok, err := it.Valid(); err != nil { + return nil, err + } else if !ok { + return nil, nil + } + + return keys.EnsureSafeSplitKey(it.UnsafeKey().Key.Clone()) } // willOverflow returns true iff adding both inputs would under- or overflow diff --git a/pkg/storage/mvcc_test.go b/pkg/storage/mvcc_test.go index 72475e3383b7..102d578a6e47 100644 --- a/pkg/storage/mvcc_test.go +++ b/pkg/storage/mvcc_test.go @@ -4121,23 +4121,6 @@ func TestFindValidSplitKeys(t *testing.T) { defer log.Scope(t).Close(t) userID := TestingUserDescID(0) - // Manually creates rows corresponding to the schema: - // CREATE TABLE t (id1 STRING, id2 STRING, ... PRIMARY KEY (id1, id2, ...)) - addTablePrefix := func(prefix roachpb.Key, id uint32, rowVals ...string) roachpb.Key { - tableKey := append(prefix, keys.SystemSQLCodec.TablePrefix(id)...) - rowKey := roachpb.Key(encoding.EncodeVarintAscending(tableKey, 1)) - for _, rowVal := range rowVals { - rowKey = encoding.EncodeStringAscending(rowKey, rowVal) - } - return rowKey - } - tablePrefix := func(id uint32, rowVals ...string) roachpb.Key { - return addTablePrefix(nil, id, rowVals...) - } - addColFam := func(rowKey roachpb.Key, colFam uint32) roachpb.Key { - return keys.MakeFamilyKey(append([]byte(nil), rowKey...), colFam) - } - type testCase struct { keys []roachpb.Key rangeStart roachpb.Key // optional @@ -4276,16 +4259,16 @@ func TestFindValidSplitKeys(t *testing.T) { // or return the start key of the range. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "a"), 2), - addColFam(tablePrefix(userID, "a"), 3), - addColFam(tablePrefix(userID, "a"), 4), - addColFam(tablePrefix(userID, "a"), 5), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "a"), 2), + testAddColFam(testTablePrefix(userID, "a"), 3), + testAddColFam(testTablePrefix(userID, "a"), 4), + testAddColFam(testTablePrefix(userID, "a"), 5), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "b"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "b"), expError: false, }, // More example table data. 
Make sure ranges at the start of a table can @@ -4293,58 +4276,58 @@ func TestFindValidSplitKeys(t *testing.T) { // break for such ranges. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), - addColFam(tablePrefix(userID, "d"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "d"), 1), }, rangeStart: keys.SystemSQLCodec.TablePrefix(userID), - expSplit: tablePrefix(userID, "c"), + expSplit: testTablePrefix(userID, "c"), expError: false, }, // More example table data. Make sure ranges at the start of a table can // be split properly even in the presence of a large first row. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "a"), 2), - addColFam(tablePrefix(userID, "a"), 3), - addColFam(tablePrefix(userID, "a"), 4), - addColFam(tablePrefix(userID, "a"), 5), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "a"), 2), + testAddColFam(testTablePrefix(userID, "a"), 3), + testAddColFam(testTablePrefix(userID, "a"), 4), + testAddColFam(testTablePrefix(userID, "a"), 5), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), }, rangeStart: keys.SystemSQLCodec.TablePrefix(TestingUserDescID(0)), - expSplit: tablePrefix(userID, "b"), + expSplit: testTablePrefix(userID, "b"), expError: false, }, // One partition where partition key is the first column. Checks that // split logic is not confused by the special partition start key. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a", "a"), 1), - addColFam(tablePrefix(userID, "a", "b"), 1), - addColFam(tablePrefix(userID, "a", "c"), 1), - addColFam(tablePrefix(userID, "a", "d"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), + testAddColFam(testTablePrefix(userID, "a", "d"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "a", "c"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "a", "c"), expError: false, }, // One partition with a large first row. Checks that our logic to avoid // splitting in the middle of a row still applies. 
{ keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a", "a"), 1), - addColFam(tablePrefix(userID, "a", "a"), 2), - addColFam(tablePrefix(userID, "a", "a"), 3), - addColFam(tablePrefix(userID, "a", "a"), 4), - addColFam(tablePrefix(userID, "a", "a"), 5), - addColFam(tablePrefix(userID, "a", "b"), 1), - addColFam(tablePrefix(userID, "a", "c"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 2), + testAddColFam(testTablePrefix(userID, "a", "a"), 3), + testAddColFam(testTablePrefix(userID, "a", "a"), 4), + testAddColFam(testTablePrefix(userID, "a", "a"), 5), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "a", "b"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "a", "b"), expError: false, }, } @@ -4366,17 +4349,7 @@ func TestFindValidSplitKeys(t *testing.T) { defer engine.Close() ms := &enginepb.MVCCStats{} - val := roachpb.MakeValueFromString(strings.Repeat("X", 10)) - for _, k := range test.keys { - // Add three MVCC versions of every key. Splits are not allowed - // between MVCC versions, so this shouldn't have any effect. - for j := 1; j <= 3; j++ { - ts := hlc.Timestamp{Logical: int32(j)} - if err := MVCCPut(ctx, engine, ms, []byte(k), ts, hlc.ClockTimestamp{}, val, nil); err != nil { - t.Fatal(err) - } - } - } + testPopulateKeysWithVersions(ctx, t, engine, ms, test.keys) rangeStart := test.keys[0] if len(test.rangeStart) > 0 { rangeStart = test.rangeStart @@ -4487,6 +4460,203 @@ func TestFindBalancedSplitKeys(t *testing.T) { } } +// testAddPrefix manually creates rows corresponding to the schema e.g. +// CREATE TABLE t (id1 STRING, id2 STRING, ... PRIMARY KEY (id1, id2, ...)) +func testAddPrefix(prefix roachpb.Key, id uint32, rowVals ...string) roachpb.Key { + tableKey := append(prefix, keys.SystemSQLCodec.TablePrefix(id)...) + rowKey := roachpb.Key(encoding.EncodeVarintAscending(tableKey, 1)) + for _, rowVal := range rowVals { + rowKey = encoding.EncodeStringAscending(rowKey, rowVal) + } + return rowKey +} + +func testTablePrefix(id uint32, rowVals ...string) roachpb.Key { + return testAddPrefix(nil, id, rowVals...) +} +func testAddColFam(rowKey roachpb.Key, colFam uint32) roachpb.Key { + return keys.MakeFamilyKey(append([]byte(nil), rowKey...), colFam) +} + +// testPopulateKeysWithVersions puts the keys into the engine provided. Each +// key is added with 3 MVCC versions with a XX.. value. +func testPopulateKeysWithVersions( + ctx context.Context, t *testing.T, engine Engine, ms *enginepb.MVCCStats, keys []roachpb.Key, +) { + val := roachpb.MakeValueFromString(strings.Repeat("X", 10)) + for _, k := range keys { + // Add three MVCC versions of every key. Splits are not allowed + // between MVCC versions, so this shouldn't have any effect. + for j := 1; j <= 3; j++ { + ts := hlc.Timestamp{Logical: int32(j)} + require.NoError( + t, + MVCCPut(ctx, engine, ms, []byte(k), ts, hlc.ClockTimestamp{}, val, nil), + ) + } + } +} + +// TestMVCCFirstSplitKey checks that the split key returned from +// MVCCFirstSplitKey is: +// (1) Within a range's bounds +// (2) No less than the desired split key. +// (3) Greater than the first key, or first row's keys in table ranges. +// (4) Not inbetween the start and end of a row for table ranges. 
+func TestMVCCFirstSplitKey(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + userID := TestingUserDescID(0) + + type splitExpect struct { + desired roachpb.Key + expected roachpb.Key + } + + testCases := []struct { + keys []roachpb.Key + startKey, endKey roachpb.Key + splits []splitExpect + }{ + { + // No keys, no splits. + keys: []roachpb.Key{}, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("z"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("m"), expected: nil}, + {desired: roachpb.Key("z"), expected: nil}, + }, + }, + { + // All keys are outside the range, no keys to spit at so expect no + // splits. + keys: []roachpb.Key{ + roachpb.Key("0"), + roachpb.Key("c"), + roachpb.Key("d"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("c"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("b"), expected: nil}, + {desired: roachpb.Key("c"), expected: nil}, + }, + }, + { + // Only one key within the range, require at least two keys to split. + keys: []roachpb.Key{ + // (0) is outside the range [a,c) + roachpb.Key("0"), + roachpb.Key("a"), + // (c) is outside of the range [a,c). + roachpb.Key("c"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("c"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("b"), expected: nil}, + {desired: roachpb.Key("c"), expected: nil}, + }, + }, + { + // Enough keys to realize a split on c. Only desiredSplitKeys <= c should + // split at c. + keys: []roachpb.Key{ + // (0) is outside the range [a,e) + roachpb.Key("0"), + roachpb.Key("b"), + roachpb.Key("c"), + // (e) is outside of the range [a,e). + roachpb.Key("e"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("e"), + splits: []splitExpect{ + // Should iterate to the first split key after minSpitKey which is (c). + {desired: roachpb.Key("0"), expected: roachpb.Key("c")}, + {desired: roachpb.Key("b"), expected: roachpb.Key("c")}, + {desired: roachpb.Key("c"), expected: roachpb.Key("c")}, + // Desired split key is after the last key in the range (c), shouldn't + // split. + {desired: roachpb.Key("d"), expected: nil}, + }, + }, + { + keys: []roachpb.Key{ + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "b"), 2), + testAddColFam(testTablePrefix(userID, "b"), 3), + testAddColFam(testTablePrefix(userID, "d"), 1), + // (e,1) is outside of the range [a,e) + testAddColFam(testTablePrefix(userID, "e"), 1), + }, + startKey: testTablePrefix(userID, "a"), + endKey: testTablePrefix(userID, "e"), + splits: []splitExpect{ + {desired: testAddColFam(testTablePrefix(userID, "a"), 0), expected: testTablePrefix(userID, "b")}, + {desired: testAddColFam(testTablePrefix(userID, "b"), 3), expected: testTablePrefix(userID, "b")}, + // The first key after the desired split key is (d,1), expect a split + // at the prefix (d). + {desired: testAddColFam(testTablePrefix(userID, "b"), 4), expected: testTablePrefix(userID, "d")}, + // Desired split key is after the last key in the range (d,1), + // shouldn't split. + {desired: testAddColFam(testTablePrefix(userID, "d"), 2), expected: nil}, + }, + }, + { + // One partiton key, where the partition key is the first column (a). 
+ keys: []roachpb.Key{ + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 3), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), + // (a,d,0) is outside the range [a,(a,d)). + testAddColFam(testTablePrefix(userID, "a", "d"), 0), + }, + startKey: testTablePrefix(userID, "a"), + endKey: testTablePrefix(userID, "a", "d"), + splits: []splitExpect{ + {desired: testTablePrefix(userID, "a"), expected: testTablePrefix(userID, "a", "b")}, + {desired: testAddColFam(testTablePrefix(userID, "a", "a"), 3), expected: testTablePrefix(userID, "a", "b")}, + {desired: testAddColFam(testTablePrefix(userID, "a", "b"), 2), expected: testTablePrefix(userID, "a", "c")}, + // Desired split key is after the last key in the range (a,c,1), + // shouldn't split. + {desired: testAddColFam(testTablePrefix(userID, "a", "c"), 2), expected: nil}, + {desired: testTablePrefix(userID, "a", "e"), expected: nil}, + }, + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("%v", tc.keys), + func(t *testing.T) { + ctx := context.Background() + engine := NewDefaultInMemForTesting() + defer engine.Close() + + testPopulateKeysWithVersions(ctx, t, engine, &enginepb.MVCCStats{}, tc.keys) + rangeStartAddr := keys.MustAddr(tc.startKey) + rangeEndAddr := keys.MustAddr(tc.endKey) + for _, split := range tc.splits { + t.Run(fmt.Sprintf("%v", split.desired), func(t *testing.T) { + desiredSplitAddr := keys.MustAddr(split.desired) + splitKey, err := MVCCFirstSplitKey(ctx, engine, desiredSplitAddr, rangeStartAddr, rangeEndAddr) + // NB: We don't expect errors. If no split key can be found, we + // expect a nil splitKey to be returned. + require.NoError(t, err) + require.Equal(t, split.expected, splitKey) + }) + } + }) + } +} + // TestMVCCGarbageCollect writes a series of gc'able bytes and then // sends an MVCC GC request and verifies cleared values and updated // stats. From 7864aae7ea2ceb2929715faca720bf2b9ce49c3a Mon Sep 17 00:00:00 2001 From: Austen McClernon Date: Tue, 30 May 2023 14:04:23 +0000 Subject: [PATCH 5/5] storage: prevent iter panic on meta1 split key It was possible that a load based split was suggested for `meta1`, which would call `MVCCFirstSplitKey` and panic as the `meta1` start key `\Min` is local, whilst the `meta1` end key is global `0x02 0xff 0xff`. Add a check that the start key is greater than the `meta1` end key before processing in `MVCCFirstSplitKey` to prevent the panic. Note `meta1` would never be split regardless, as `storage.IsValidSplitKey` would fail after finding a split key. Also note that if the desired split key is a local key, the same problem doesn't exist as the minimum split key would be used to seek the first split key instead. Fixes: #104007 Release note: None --- pkg/storage/mvcc.go | 8 +++++++- pkg/storage/mvcc_test.go | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 153fddab0174..f01d9a69540b 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -5496,10 +5496,16 @@ func mvccMinSplitKey(it MVCCIterator, startKey roachpb.Key) (roachpb.Key, error) // 4. Not in between the start and end of a row for table ranges. // // The returned split key is NOT guaranteed to be outside a no-split span, such -// as Meta1, Meta2Max or Node Liveness. +// as Meta2Max or Node Liveness. 
func MVCCFirstSplitKey( _ context.Context, reader Reader, desiredSplitKey, startKey, endKey roachpb.RKey, ) (roachpb.Key, error) { + // If the start key of the range is within the meta1 key space, the range + // cannot be split. + if startKey.Less(roachpb.RKey(keys.LocalMax)) { + return nil, nil + } + it := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: endKey.AsRawKey()}) defer it.Close() diff --git a/pkg/storage/mvcc_test.go b/pkg/storage/mvcc_test.go index 102d578a6e47..427a4cd0cb7a 100644 --- a/pkg/storage/mvcc_test.go +++ b/pkg/storage/mvcc_test.go @@ -4530,6 +4530,25 @@ func TestMVCCFirstSplitKey(t *testing.T) { {desired: roachpb.Key("z"), expected: nil}, }, }, + { + // meta1 cannot be split. Previously, this test would cause a panic in + // mvccMinSplitKey, called by MVCCFirstSplitKey. The iterator is + // initialized with a global key constraint from the endKey + // ("\x02\xff\xff"), but we seekGE the start key (MinKey="") which is + // local because it is before LocalMax (0x02). + keys: []roachpb.Key{ + roachpb.Key("\x02"), + roachpb.Key("\x02\x00"), + roachpb.Key("\x02\xff"), + }, + startKey: keys.MinKey, + endKey: keys.Meta1KeyMax, + splits: []splitExpect{ + {desired: keys.MinKey, expected: nil}, + {desired: roachpb.Key("\x02"), expected: nil}, + {desired: roachpb.Key("\x02\x00"), expected: nil}, + }, + }, { // All keys are outside the range, no keys to spit at so expect no // splits. @@ -4581,6 +4600,7 @@ func TestMVCCFirstSplitKey(t *testing.T) { {desired: roachpb.Key("0"), expected: roachpb.Key("c")}, {desired: roachpb.Key("b"), expected: roachpb.Key("c")}, {desired: roachpb.Key("c"), expected: roachpb.Key("c")}, + {desired: keys.MinKey, expected: roachpb.Key("c")}, // Desired split key is after the last key in the range (c), shouldn't // split. {desired: roachpb.Key("d"), expected: nil},