diff --git a/pkg/kv/kvserver/client_split_test.go b/pkg/kv/kvserver/client_split_test.go index 3c4a981895f3..2c5720790fc3 100644 --- a/pkg/kv/kvserver/client_split_test.go +++ b/pkg/kv/kvserver/client_split_test.go @@ -3649,3 +3649,249 @@ func TestStoreRangeSplitAndMergeWithGlobalReads(t *testing.T) { repl = store.LookupReplica(roachpb.RKey(splitKey)) require.Equal(t, descKey, repl.Desc().StartKey.AsRawKey()) } + +// TestLBSplitUnsafeKeys tests that load based splits do not split between table +// rows, even when the suggested load based split key is itself between a table row. +func TestLBSplitUnsafeKeys(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + ctx := context.Background() + const indexID = 1 + + // The test is expensive and prone to timing out under race. + skip.UnderRace(t) + skip.UnderStressRace(t) + skip.UnderDeadlock(t) + + makeTestKey := func(tableID uint32, suffix []byte) roachpb.Key { + tableKey := keys.MakeTableIDIndexID(nil, tableID, indexID) + return append(tableKey, suffix...) + } + + es := func(vals ...int64) []byte { + k := []byte{} + for _, v := range vals { + k = encoding.EncodeVarintAscending(k, v) + } + return k + } + + fk := func(k []byte, famID uint32) []byte { + return keys.MakeFamilyKey(k, famID) + } + + testCases := []struct { + splitKey roachpb.Key + existingKeys []int + expSplitKey roachpb.Key + expErrStr string + }{ + // We don't know the table ID here, we append the splitKey to the + // table/index prefix. e.g. /1 will be /Table/table_id/index_id/1. + { + // /1 -> /2 + splitKey: es(1), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0 -> /2 + splitKey: fk(es(1), 0), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3 -> /2 + splitKey: fk(es(1), 3), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 0)).Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 3)).Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 0)).Next().Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 3)).Next().Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /0 -> /3 + // We will not split at the first row in a table, so expect the split at + // /3 intead of /2. + splitKey: es(0), + existingKeys: []int{2, 3}, + expSplitKey: es(3), + }, + { + // /2 -> /3 + // Same case as above, despite the key being safe, the split would create + // an empty LHS. + splitKey: es(2), + existingKeys: []int{2, 3}, + expSplitKey: es(3), + }, + { + // /1 -> error + // There are no rows to split on. + splitKey: es(1), + existingKeys: []int{}, + expErrStr: "could not find valid split key", + }, + { + // /1 -> error + // There is only one row to split on, the range should not be split. + splitKey: es(1), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/0 -> error + splitKey: fk(es(1), 0), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/3 -> error + splitKey: fk(es(1), 3), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/3/0/0 -> error + splitKey: roachpb.Key(fk(es(1), 0)).Next().Next(), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /2 -> /2 + splitKey: es(2), + existingKeys: []int{1, 2}, + expSplitKey: es(2), + }, + } + + for _, tc := range testCases { + var expectStr string + if tc.expErrStr != "" { + expectStr = tc.expErrStr + } else { + expectStr = makeTestKey(1, tc.expSplitKey).String() + } + t.Run(fmt.Sprintf("%s%v -> %s", makeTestKey(1, tc.splitKey), tc.existingKeys, expectStr), func(t *testing.T) { + var targetRange atomic.Int32 + var splitKeyOverride atomic.Value + splitKeyOverride.Store(roachpb.Key{}) + + // Mock the load based splitter key finding method. This function will be + // checked in splitQueue.shouldQueue() and splitQueue.process via + // replica.loadSplitKey. When a key is returned, the split queue calls + // replica.adminSplitWithDescriptor(...findFirstSafeSplitKey=true). + overrideLBSplitFn := func(rangeID roachpb.RangeID) (splitKey roachpb.Key, useSplitKey bool) { + if rangeID == roachpb.RangeID(targetRange.Load()) { + override := splitKeyOverride.Load() + // It is possible that the split queue is checking the range before + // we manually enqueued it. + if override == nil { + return nil, false + } + overrideKey, ok := override.(roachpb.Key) + require.Truef(t, ok, "stored value not key %+v", override) + + if len(overrideKey) == 0 { + return nil, false + } + + return override.(roachpb.Key), true + } + return nil, false + } + + serv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{ + DisableSpanConfigs: true, + Knobs: base.TestingKnobs{ + Store: &kvserver.StoreTestingKnobs{ + LoadBasedSplittingOverrideKey: overrideLBSplitFn, + DisableMergeQueue: true, + }, + }, + }) + s := serv.(*server.TestServer) + defer s.Stopper().Stop(ctx) + tdb := sqlutils.MakeSQLRunner(sqlDB) + store, err := s.Stores().GetStore(1) + require.NoError(t, err) + + // We want to exercise the case where there are column family keys. + // Create a simple table and insert the existing keys. + _ = tdb.Exec(t, + "CREATE TABLE t (k INT PRIMARY KEY, "+ + "t0 INT, t1 INT, t2 INT, t3 INT, "+ + "FAMILY (k), FAMILY (t0), FAMILY (t1), FAMILY (t2), FAMILY (t3))") + for _, k := range tc.existingKeys { + _ = tdb.Exec(t, fmt.Sprintf("INSERT INTO t VALUES (%d, %d, %d, %d, %d)", + k, k, k, k, k)) + } + + // Force a table scan to resolve descriptors. + var keyCount int + tdb.QueryRow(t, "SELECT count(k) FROM t").Scan(&keyCount) + require.Equal(t, len(tc.existingKeys), keyCount) + var tableID uint32 + tdb.QueryRow(t, "SELECT table_id FROM crdb_internal.leases where name = 't'").Scan(&tableID) + + // Split off the table range for the test, otherwise the range may + // contain multiple tables with existing values. + splitArgs := adminSplitArgs(keys.SystemSQLCodec.TablePrefix(tableID)) + _, pErr := kv.SendWrapped(ctx, store.TestSender(), splitArgs) + require.Nil(t, pErr) + + var rangeID roachpb.RangeID + tdb.QueryRow(t, "SELECT range_id FROM [SHOW RANGES FROM TABLE t] LIMIT 1").Scan(&rangeID) + targetRange.Store(int32(rangeID)) + repl, err := store.GetReplica(rangeID) + require.NoError(t, err) + + // Keep the previous end key around, we will use this to assert that no + // split has occurred when expecting an error. + prevEndKey := repl.Desc().EndKey.AsRawKey() + splitKey := makeTestKey(tableID, tc.splitKey) + + // Update the split key override so that the split queue will enqueue and + // process the range. Remove it afterwards to avoid retrying the LHS. + splitKeyOverride.Store(splitKey) + _, processErr, enqueueErr := store.Enqueue(ctx, "split", repl, false /* shouldSkipQueue */, false /* async */) + splitKeyOverride.Store(roachpb.Key{}) + require.NoError(t, enqueueErr) + + endKey := repl.Desc().EndKey.AsRawKey() + if tc.expErrStr != "" { + // We expect this split not to process, assert that the expected error + // matches the returned error and the range has the same end key. + require.ErrorContainsf(t, processErr, tc.expErrStr, + "end key %s, previous end key %s", endKey, prevEndKey) + require.Equal(t, prevEndKey, endKey) + } else { + // Otherwise, assert that the new range end key matches the expected + // end key. + require.NoError(t, processErr) + require.Equal(t, makeTestKey(tableID, tc.expSplitKey), endKey) + } + }) + } +} diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 148b0493612d..acc70a38bc18 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -77,7 +77,7 @@ func (r *Replica) AdminSplit( err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error { var err error - reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason) + reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason, false /* findFirstSafeKey */) return err }) return reply, err @@ -296,7 +296,6 @@ func splitTxnStickyUpdateAttempt( // affirmative the descriptor is passed to AdminSplit, which performs a // Conditional Put on the RangeDescriptor to ensure that no other operation has // modified the range in the time the decision was being made. -// TODO(tschottdorf): should assert that split key is not a local key. // // See the comment on splitTrigger for details on the complexities. func (r *Replica) adminSplitWithDescriptor( @@ -305,6 +304,7 @@ func (r *Replica) adminSplitWithDescriptor( desc *roachpb.RangeDescriptor, delayable bool, reason string, + findFirstSafeKey bool, ) (roachpb.AdminSplitResponse, error) { var err error var reply roachpb.AdminSplitResponse @@ -345,11 +345,36 @@ func (r *Replica) adminSplitWithDescriptor( ri := r.GetRangeInfo(ctx) return reply, roachpb.NewRangeKeyMismatchErrorWithCTPolicy(ctx, args.Key, args.Key, desc, &ri.Lease, ri.ClosedTimestampPolicy) } - foundSplitKey = args.SplitKey + // When findFirstSafeKey is true, we find the first key after or at + // args.SplitKey which is a safe split to split at. The current user of + // findFirstSafeKey is load based splitting, which only has knowledge of + // sampled keys from batch requests. These sampled keys can be + // arbitrarily within SQL rows due to column family keys. + // + // Not every caller requires a real key as a split point (creating empty + // table), however when we cannot verify the split key as safe, the most + // reliable method is checking existing keys. + if findFirstSafeKey { + var desiredSplitKey roachpb.RKey + if desiredSplitKey, err = keys.Addr(args.SplitKey); err != nil { + return reply, err + } + if foundSplitKey, err = storage.MVCCFirstSplitKey( + ctx, r.store.engine, desiredSplitKey, + desc.StartKey, desc.EndKey, + ); err != nil { + return reply, errors.Wrap(err, "unable to determine split key") + } else if foundSplitKey == nil { + return reply, unsplittableRangeError{} + } + } else { + foundSplitKey = args.SplitKey + } } if !kvserverbase.ContainsKey(desc, foundSplitKey) { - return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r) + return reply, errors.Errorf("requested split key %s (found=%s) out of bounds of %s", + args.SplitKey, foundSplitKey, r) } // If predicate keys are specified, make sure they are contained by this diff --git a/pkg/kv/kvserver/replica_split_load.go b/pkg/kv/kvserver/replica_split_load.go index 3cc43ae28e39..34d26e3ead83 100644 --- a/pkg/kv/kvserver/replica_split_load.go +++ b/pkg/kv/kvserver/replica_split_load.go @@ -12,10 +12,14 @@ package kvserver import ( "context" + "time" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" + "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/errors" ) // SplitByLoadEnabled wraps "kv.range_split.by_load_enabled". @@ -135,3 +139,81 @@ func (r *Replica) recordBatchForLoadBasedSplitting( r.store.splitQueue.MaybeAddAsync(ctx, r, r.store.Clock().NowAsClockTimestamp()) } } + +// loadSplitKey returns a suggested load split key for the range if it exists, +// otherwise it returns nil. If there were any errors encountered when +// validating the split key, the error is returned as well. It is guaranteed +// that the key returned, if non-nil, will be greater than the start key of the +// range and also within the range bounds. +// +// NOTE: The returned split key CAN BE BETWEEN A SQL ROW, The split key +// returned should only be used to engage a split via adminSplitWithDescriptor +// where findFirstSafeKey is set to true. +func (r *Replica) loadSplitKey(ctx context.Context, now time.Time) roachpb.Key { + var splitKey roachpb.Key + if overrideFn := r.store.cfg.TestingKnobs.LoadBasedSplittingOverrideKey; overrideFn != nil { + var useSplitKey bool + if splitKey, useSplitKey = overrideFn(r.GetRangeID()); useSplitKey { + return splitKey + } + } else { + splitKey = r.loadBasedSplitter.MaybeSplitKey(ctx, now) + } + + if splitKey == nil { + return nil + } + + // If the splitKey belongs to a Table range, try and shorten the key to just + // the row prefix. This allows us to check that splitKey doesn't map to the + // first key of the range here. If the split key contains column families, it + // is possible that the full key is strictly after every existing key for + // that row. e.g. for a table row where the table ID is 100, index ID is 1, + // primary key is a, and the column family ID is 3 (length=1): + // + // splitKey = /Table/100/1/"a"/3/1 + // existing = [..., /Table/100/1/"a"/2/1] + // + // We would not split at /Table/100/1/"a" as there's no key >= the splitKey + // in the range. + // + // NB: We handle unsafe split keys in replica.adminSplitWithDescriptor, so it + // isn't an issue if we return an unsafe key here. See the case where + // findFirstSafeKey is true. + if keyRowPrefix, err := keys.EnsureSafeSplitKey(splitKey); err == nil { + splitKey = keyRowPrefix + } + + // We swallow the error here and instead log an event. It is currently + // expected that the load based splitter may return the start key of the + // range. + if err := splitKeyPreCheck(r.Desc().RSpan(), splitKey); err != nil { + log.KvDistribution.VEventf(ctx, 1, "suggested load split key not usable: %s", err) + return nil + } + + return splitKey +} + +// splitKeyPreCheck checks that a split key is addressable and not the same as +// the start key. An error is returned if these are not true. Additional checks +// are made in adminSplitWithDescriptor when a split request is processed by +// the replica. +func splitKeyPreCheck(rspan roachpb.RSpan, splitKey roachpb.Key) error { + splitRKey, err := keys.Addr(splitKey) + if err != nil { + return err + } + + // If the split key is equal to the start key of the range, it is treated as + // a no-op in adminSplitWithDescriptor, however it is treated as an error + // here because we shouldn't be suggesting split keys that are identical to + // the start key of the range. + if splitRKey.Equal(rspan.Key) { + return errors.Errorf( + "split key is equal to range start key (split_key=%s)", + splitRKey) + } + + return nil +} diff --git a/pkg/kv/kvserver/split/BUILD.bazel b/pkg/kv/kvserver/split/BUILD.bazel index 02f19930667f..5a6e668deafc 100644 --- a/pkg/kv/kvserver/split/BUILD.bazel +++ b/pkg/kv/kvserver/split/BUILD.bazel @@ -10,7 +10,6 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split", visibility = ["//visibility:public"], deps = [ - "//pkg/keys", "//pkg/roachpb", "//pkg/util/log", "//pkg/util/metric", @@ -30,7 +29,6 @@ go_test( deps = [ "//pkg/keys", "//pkg/roachpb", - "//pkg/util/encoding", "//pkg/util/leaktest", "//pkg/util/metric", "//pkg/util/stop", diff --git a/pkg/kv/kvserver/split/decider.go b/pkg/kv/kvserver/split/decider.go index ce7d54e6c3fd..c2c2f8fb796e 100644 --- a/pkg/kv/kvserver/split/decider.go +++ b/pkg/kv/kvserver/split/decider.go @@ -16,7 +16,6 @@ import ( "context" "time" - "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/metric" @@ -221,6 +220,8 @@ func (d *Decider) MaxQPS(ctx context.Context, now time.Time) (float64, bool) { // or if it wasn't able to determine a suitable split key. // // It is legal to call MaybeSplitKey at any time. +// WARNING: The key returned from MaybeSplitKey has no guarantee of being a +// safe split key. The key is derived from sampled spans. See below. func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key { var key roachpb.Key @@ -244,27 +245,20 @@ func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key // // /Table/51/52/53/54/55 // - // (see TestDeciderCallsEnsureSafeSplitKey). + // The key found here isn't guaranteed to be a valid SQL column family key. + // This is because the keys are sampled from StartKey and EndKey of + // requests hitting this replica. Ranged operations may well wish to + // exclude the start point by calling .Next() or may span multiple ranges, + // and so such a key may end up being returned. This is more common than + // one might think since SQL issues plenty of scans over all column + // families, meaning that we'll frequently find a key that has no column + // family suffix and thus errors out in EnsureSafeSplitKey. // - // The key found here isn't guaranteed to be a valid SQL column family - // key. This is because the keys are sampled from StartKey of requests - // hitting this replica. Ranged operations may well wish to exclude the - // start point by calling .Next() or may span multiple ranges, and so - // such a key may end up being passed to EnsureSafeSplitKey here. - // - // We take the risk that the result may sometimes not be a good split - // point (or even in this range). - // - // Note that we ignore EnsureSafeSplitKey when it returns an error since - // that error only tells us that this key couldn't possibly be a SQL - // key. This is more common than one might think since SQL issues plenty - // of scans over all column families, meaning that we'll frequently find - // a key that has no column family suffix and thus errors out in - // EnsureSafeSplitKey. + // We do not attempt to validate the key is safe here, simply return it to + // the caller as the best possible split point found so far. See + // replica.adminSplitWithDescriptor for how split keys are handled when we + // aren't certain the provided key is safe. key = d.mu.splitFinder.Key() - if safeKey, err := keys.EnsureSafeSplitKey(key); err == nil { - key = safeKey - } } return key } diff --git a/pkg/kv/kvserver/split/decider_test.go b/pkg/kv/kvserver/split/decider_test.go index 6db6a2e4223a..c41ca8edfe6f 100644 --- a/pkg/kv/kvserver/split/decider_test.go +++ b/pkg/kv/kvserver/split/decider_test.go @@ -12,14 +12,12 @@ package split import ( "context" - "math" "math/rand" "testing" "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" - "github.com/cockroachdb/cockroach/pkg/util/encoding" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/stretchr/testify/assert" @@ -240,82 +238,6 @@ func TestDecider_MaxQPS(t *testing.T) { assertMaxQPS(25000, 6, true) } -func TestDeciderCallsEnsureSafeSplitKey(t *testing.T) { - defer leaktest.AfterTest(t)() - intn := rand.New(rand.NewSource(11)).Intn - - var d Decider - Init(&d, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ - PopularKeyCount: metric.NewCounter(metric.Metadata{}), - NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), - }) - - baseKey := keys.SystemSQLCodec.TablePrefix(51) - for i := 0; i < 4; i++ { - baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i)) - } - c0 := func() roachpb.Span { return roachpb.Span{Key: append([]byte(nil), keys.MakeFamilyKey(baseKey, 1)...)} } - c1 := func() roachpb.Span { return roachpb.Span{Key: append([]byte(nil), keys.MakeFamilyKey(baseKey, 9)...)} } - - expK, err := keys.EnsureSafeSplitKey(c1().Key) - require.NoError(t, err) - - var k roachpb.Key - var now time.Time - for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ { - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, 1, c0) - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, 1, c1) - k = d.MaybeSplitKey(context.Background(), now) - if len(k) != 0 { - break - } - } - - require.Equal(t, expK, k) -} - -func TestDeciderIgnoresEnsureSafeSplitKeyOnError(t *testing.T) { - defer leaktest.AfterTest(t)() - intn := rand.New(rand.NewSource(11)).Intn - - var d Decider - Init(&d, intn, func() float64 { return 1.0 }, func() time.Duration { return time.Second }, &LoadSplitterMetrics{ - PopularKeyCount: metric.NewCounter(metric.Metadata{}), - NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), - }) - - baseKey := keys.SystemSQLCodec.TablePrefix(51) - for i := 0; i < 4; i++ { - baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i)) - } - c0 := func() roachpb.Span { - return roachpb.Span{Key: append([]byte(nil), encoding.EncodeUvarintAscending(baseKey, math.MaxInt32+1)...)} - } - c1 := func() roachpb.Span { - return roachpb.Span{Key: append([]byte(nil), encoding.EncodeUvarintAscending(baseKey, math.MaxInt32+2)...)} - } - - _, err := keys.EnsureSafeSplitKey(c1().Key) - require.Error(t, err) - - var k roachpb.Key - var now time.Time - for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ { - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, 1, c0) - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, 1, c1) - k = d.MaybeSplitKey(context.Background(), now) - if len(k) != 0 { - break - } - } - - require.Equal(t, c1().Key, k) -} - func TestMaxQPSTracker(t *testing.T) { defer leaktest.AfterTest(t)() diff --git a/pkg/kv/kvserver/split_queue.go b/pkg/kv/kvserver/split_queue.go index 5eecb2d1c475..72c8e0508a2d 100644 --- a/pkg/kv/kvserver/split_queue.go +++ b/pkg/kv/kvserver/split_queue.go @@ -151,7 +151,7 @@ func (sq *splitQueue) shouldQueue( repl.GetMaxBytes(), repl.shouldBackpressureWrites(), confReader) if !shouldQ && repl.SplitByLoadEnabled() { - if splitKey := repl.loadBasedSplitter.MaybeSplitKey(ctx, timeutil.Now()); splitKey != nil { + if splitKey := repl.loadSplitKey(ctx, timeutil.Now()); splitKey != nil { shouldQ, priority = true, 1.0 // default priority } } @@ -209,6 +209,7 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ "span config", + false, /* findFirstSafeSplitKey */ ); err != nil { return false, errors.Wrapf(err, "unable to split %s at key %q", r, splitKey) } @@ -227,13 +228,14 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ fmt.Sprintf("%s above threshold size %s", humanizeutil.IBytes(size), humanizeutil.IBytes(maxBytes)), + false, /* findFirstSafeSplitKey */ ) return err == nil, err } now := timeutil.Now() - if splitByLoadKey := r.loadBasedSplitter.MaybeSplitKey(ctx, now); splitByLoadKey != nil { + if splitByLoadKey := r.loadSplitKey(ctx, now); splitByLoadKey != nil { batchHandledQPS, _ := r.QueriesPerSecond() raftAppliedQPS := r.WritesPerSecond() splitQPS := r.loadBasedSplitter.LastQPS(ctx, now) @@ -256,6 +258,10 @@ func (sq *splitQueue) processAttempt( if expDelay := kvserverbase.SplitByLoadMergeDelay.Get(&sq.store.cfg.Settings.SV); expDelay > 0 { expTime = sq.store.Clock().Now().Add(expDelay.Nanoseconds(), 0) } + // The splitByLoadKey has no guarantee of being a safe key to split at (not + // between SQL rows). To sanitize the split point, pass + // findFirstSafeSplitKey set to true, so that the first key after the + // suggested split point which is safe to split at is used. if _, pErr := r.adminSplitWithDescriptor( ctx, roachpb.AdminSplitRequest{ @@ -268,6 +274,7 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ reason, + true, /* findFirstSafeSplitKey */ ); pErr != nil { return false, errors.Wrapf(pErr, "unable to split %s at key %q", r, splitByLoadKey) } diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index 271155e9e06c..56c58806e3fa 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -137,6 +137,10 @@ type StoreTestingKnobs struct { DisableReplicaRebalancing bool // DisableLoadBasedSplitting turns off LBS so no splits happen because of load. DisableLoadBasedSplitting bool + // LoadBasedSplittingOverrideKey returns a key which should be used for load + // based splitting, overriding any value returned from the real load based + // splitter. + LoadBasedSplittingOverrideKey func(rangeID roachpb.RangeID) (splitKey roachpb.Key, useSplitKey bool) // DisableSplitQueue disables the split queue. DisableSplitQueue bool // DisableTimeSeriesMaintenanceQueue disables the time series maintenance diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 9ae85491a83b..153fddab0174 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -5436,7 +5436,27 @@ func MVCCFindSplitKey( // was dangerous because partitioning can split off ranges that do not start // at valid row keys. The keys that are present in the range, by contrast, are // necessarily valid row keys. - it.SeekGE(MakeMVCCMetadataKey(key.AsRawKey())) + minSplitKey, err := mvccMinSplitKey(it, key.AsRawKey()) + if err != nil { + return nil, err + } else if minSplitKey == nil { + return nil, nil + } + + splitKey, err := it.FindSplitKey(key.AsRawKey(), endKey.AsRawKey(), minSplitKey, targetSize) + if err != nil { + return nil, err + } + // Ensure the key is a valid split point that does not fall in the middle of a + // SQL row by removing the column family ID, if any, from the end of the key. + return keys.EnsureSafeSplitKey(splitKey.Key) +} + +// mvccMinSplitKey returns the minimum key that a range may be split at. The +// caller is responsible for setting the iterator upper bound to the range end +// key. The caller is also responsible for closing the iterator. +func mvccMinSplitKey(it MVCCIterator, startKey roachpb.Key) (roachpb.Key, error) { + it.SeekGE(MakeMVCCMetadataKey(startKey)) if ok, err := it.Valid(); err != nil { return nil, err } else if !ok { @@ -5460,14 +5480,54 @@ func MVCCFindSplitKey( // Allow a split at any key that sorts after it. minSplitKey = it.Key().Key.Next() } + return minSplitKey, nil +} - splitKey, err := it.FindSplitKey(key.AsRawKey(), endKey.AsRawKey(), minSplitKey, targetSize) +// MVCCFirstSplitKey returns the first key which is safe to split at and no +// less than desiredSplitKey in the range which spans [startKey,endKey). If a +// non-nil key is returned, it is safe to split at. If a nil key is returned, no +// safe split key could be determined. The safe split key returned is +// guaranteed to be: +// +// 1. Within [startKey,endKey). +// 2. No less than desiredSplitKey. +// 3. Greater than the first key in [startKey,endKey]; or greater than all the +// first row's keys if a table range. . +// 4. Not in between the start and end of a row for table ranges. +// +// The returned split key is NOT guaranteed to be outside a no-split span, such +// as Meta1, Meta2Max or Node Liveness. +func MVCCFirstSplitKey( + _ context.Context, reader Reader, desiredSplitKey, startKey, endKey roachpb.RKey, +) (roachpb.Key, error) { + it := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: endKey.AsRawKey()}) + defer it.Close() + + // If the caller has provided a desiredSplitKey less than the minimum split + // key, we update the desired split key to be the minimum split key. This + // prevents splitting before the first row in a Table range, which would + // result in the LHS having now rows. + minSplitKey, err := mvccMinSplitKey(it, startKey.AsRawKey()) if err != nil { return nil, err + } else if minSplitKey == nil { + return nil, nil } - // Ensure the key is a valid split point that does not fall in the middle of a - // SQL row by removing the column family ID, if any, from the end of the key. - return keys.EnsureSafeSplitKey(splitKey.Key) + var seekKey roachpb.Key + if minSplitKey.Compare(desiredSplitKey.AsRawKey()) > 0 { + seekKey = minSplitKey + } else { + seekKey = desiredSplitKey.AsRawKey() + } + + it.SeekGE(MakeMVCCMetadataKey(seekKey)) + if ok, err := it.Valid(); err != nil { + return nil, err + } else if !ok { + return nil, nil + } + + return keys.EnsureSafeSplitKey(it.UnsafeKey().Key.Clone()) } // willOverflow returns true iff adding both inputs would under- or overflow diff --git a/pkg/storage/mvcc_test.go b/pkg/storage/mvcc_test.go index 72475e3383b7..102d578a6e47 100644 --- a/pkg/storage/mvcc_test.go +++ b/pkg/storage/mvcc_test.go @@ -4121,23 +4121,6 @@ func TestFindValidSplitKeys(t *testing.T) { defer log.Scope(t).Close(t) userID := TestingUserDescID(0) - // Manually creates rows corresponding to the schema: - // CREATE TABLE t (id1 STRING, id2 STRING, ... PRIMARY KEY (id1, id2, ...)) - addTablePrefix := func(prefix roachpb.Key, id uint32, rowVals ...string) roachpb.Key { - tableKey := append(prefix, keys.SystemSQLCodec.TablePrefix(id)...) - rowKey := roachpb.Key(encoding.EncodeVarintAscending(tableKey, 1)) - for _, rowVal := range rowVals { - rowKey = encoding.EncodeStringAscending(rowKey, rowVal) - } - return rowKey - } - tablePrefix := func(id uint32, rowVals ...string) roachpb.Key { - return addTablePrefix(nil, id, rowVals...) - } - addColFam := func(rowKey roachpb.Key, colFam uint32) roachpb.Key { - return keys.MakeFamilyKey(append([]byte(nil), rowKey...), colFam) - } - type testCase struct { keys []roachpb.Key rangeStart roachpb.Key // optional @@ -4276,16 +4259,16 @@ func TestFindValidSplitKeys(t *testing.T) { // or return the start key of the range. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "a"), 2), - addColFam(tablePrefix(userID, "a"), 3), - addColFam(tablePrefix(userID, "a"), 4), - addColFam(tablePrefix(userID, "a"), 5), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "a"), 2), + testAddColFam(testTablePrefix(userID, "a"), 3), + testAddColFam(testTablePrefix(userID, "a"), 4), + testAddColFam(testTablePrefix(userID, "a"), 5), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "b"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "b"), expError: false, }, // More example table data. Make sure ranges at the start of a table can @@ -4293,58 +4276,58 @@ func TestFindValidSplitKeys(t *testing.T) { // break for such ranges. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), - addColFam(tablePrefix(userID, "d"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "d"), 1), }, rangeStart: keys.SystemSQLCodec.TablePrefix(userID), - expSplit: tablePrefix(userID, "c"), + expSplit: testTablePrefix(userID, "c"), expError: false, }, // More example table data. Make sure ranges at the start of a table can // be split properly even in the presence of a large first row. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "a"), 2), - addColFam(tablePrefix(userID, "a"), 3), - addColFam(tablePrefix(userID, "a"), 4), - addColFam(tablePrefix(userID, "a"), 5), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "a"), 2), + testAddColFam(testTablePrefix(userID, "a"), 3), + testAddColFam(testTablePrefix(userID, "a"), 4), + testAddColFam(testTablePrefix(userID, "a"), 5), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), }, rangeStart: keys.SystemSQLCodec.TablePrefix(TestingUserDescID(0)), - expSplit: tablePrefix(userID, "b"), + expSplit: testTablePrefix(userID, "b"), expError: false, }, // One partition where partition key is the first column. Checks that // split logic is not confused by the special partition start key. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a", "a"), 1), - addColFam(tablePrefix(userID, "a", "b"), 1), - addColFam(tablePrefix(userID, "a", "c"), 1), - addColFam(tablePrefix(userID, "a", "d"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), + testAddColFam(testTablePrefix(userID, "a", "d"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "a", "c"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "a", "c"), expError: false, }, // One partition with a large first row. Checks that our logic to avoid // splitting in the middle of a row still applies. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a", "a"), 1), - addColFam(tablePrefix(userID, "a", "a"), 2), - addColFam(tablePrefix(userID, "a", "a"), 3), - addColFam(tablePrefix(userID, "a", "a"), 4), - addColFam(tablePrefix(userID, "a", "a"), 5), - addColFam(tablePrefix(userID, "a", "b"), 1), - addColFam(tablePrefix(userID, "a", "c"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 2), + testAddColFam(testTablePrefix(userID, "a", "a"), 3), + testAddColFam(testTablePrefix(userID, "a", "a"), 4), + testAddColFam(testTablePrefix(userID, "a", "a"), 5), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "a", "b"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "a", "b"), expError: false, }, } @@ -4366,17 +4349,7 @@ func TestFindValidSplitKeys(t *testing.T) { defer engine.Close() ms := &enginepb.MVCCStats{} - val := roachpb.MakeValueFromString(strings.Repeat("X", 10)) - for _, k := range test.keys { - // Add three MVCC versions of every key. Splits are not allowed - // between MVCC versions, so this shouldn't have any effect. - for j := 1; j <= 3; j++ { - ts := hlc.Timestamp{Logical: int32(j)} - if err := MVCCPut(ctx, engine, ms, []byte(k), ts, hlc.ClockTimestamp{}, val, nil); err != nil { - t.Fatal(err) - } - } - } + testPopulateKeysWithVersions(ctx, t, engine, ms, test.keys) rangeStart := test.keys[0] if len(test.rangeStart) > 0 { rangeStart = test.rangeStart @@ -4487,6 +4460,203 @@ func TestFindBalancedSplitKeys(t *testing.T) { } } +// testAddPrefix manually creates rows corresponding to the schema e.g. +// CREATE TABLE t (id1 STRING, id2 STRING, ... PRIMARY KEY (id1, id2, ...)) +func testAddPrefix(prefix roachpb.Key, id uint32, rowVals ...string) roachpb.Key { + tableKey := append(prefix, keys.SystemSQLCodec.TablePrefix(id)...) + rowKey := roachpb.Key(encoding.EncodeVarintAscending(tableKey, 1)) + for _, rowVal := range rowVals { + rowKey = encoding.EncodeStringAscending(rowKey, rowVal) + } + return rowKey +} + +func testTablePrefix(id uint32, rowVals ...string) roachpb.Key { + return testAddPrefix(nil, id, rowVals...) +} +func testAddColFam(rowKey roachpb.Key, colFam uint32) roachpb.Key { + return keys.MakeFamilyKey(append([]byte(nil), rowKey...), colFam) +} + +// testPopulateKeysWithVersions puts the keys into the engine provided. Each +// key is added with 3 MVCC versions with a XX.. value. +func testPopulateKeysWithVersions( + ctx context.Context, t *testing.T, engine Engine, ms *enginepb.MVCCStats, keys []roachpb.Key, +) { + val := roachpb.MakeValueFromString(strings.Repeat("X", 10)) + for _, k := range keys { + // Add three MVCC versions of every key. Splits are not allowed + // between MVCC versions, so this shouldn't have any effect. + for j := 1; j <= 3; j++ { + ts := hlc.Timestamp{Logical: int32(j)} + require.NoError( + t, + MVCCPut(ctx, engine, ms, []byte(k), ts, hlc.ClockTimestamp{}, val, nil), + ) + } + } +} + +// TestMVCCFirstSplitKey checks that the split key returned from +// MVCCFirstSplitKey is: +// (1) Within a range's bounds +// (2) No less than the desired split key. +// (3) Greater than the first key, or first row's keys in table ranges. +// (4) Not inbetween the start and end of a row for table ranges. +func TestMVCCFirstSplitKey(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + userID := TestingUserDescID(0) + + type splitExpect struct { + desired roachpb.Key + expected roachpb.Key + } + + testCases := []struct { + keys []roachpb.Key + startKey, endKey roachpb.Key + splits []splitExpect + }{ + { + // No keys, no splits. + keys: []roachpb.Key{}, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("z"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("m"), expected: nil}, + {desired: roachpb.Key("z"), expected: nil}, + }, + }, + { + // All keys are outside the range, no keys to spit at so expect no + // splits. + keys: []roachpb.Key{ + roachpb.Key("0"), + roachpb.Key("c"), + roachpb.Key("d"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("c"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("b"), expected: nil}, + {desired: roachpb.Key("c"), expected: nil}, + }, + }, + { + // Only one key within the range, require at least two keys to split. + keys: []roachpb.Key{ + // (0) is outside the range [a,c) + roachpb.Key("0"), + roachpb.Key("a"), + // (c) is outside of the range [a,c). + roachpb.Key("c"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("c"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("b"), expected: nil}, + {desired: roachpb.Key("c"), expected: nil}, + }, + }, + { + // Enough keys to realize a split on c. Only desiredSplitKeys <= c should + // split at c. + keys: []roachpb.Key{ + // (0) is outside the range [a,e) + roachpb.Key("0"), + roachpb.Key("b"), + roachpb.Key("c"), + // (e) is outside of the range [a,e). + roachpb.Key("e"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("e"), + splits: []splitExpect{ + // Should iterate to the first split key after minSpitKey which is (c). + {desired: roachpb.Key("0"), expected: roachpb.Key("c")}, + {desired: roachpb.Key("b"), expected: roachpb.Key("c")}, + {desired: roachpb.Key("c"), expected: roachpb.Key("c")}, + // Desired split key is after the last key in the range (c), shouldn't + // split. + {desired: roachpb.Key("d"), expected: nil}, + }, + }, + { + keys: []roachpb.Key{ + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "b"), 2), + testAddColFam(testTablePrefix(userID, "b"), 3), + testAddColFam(testTablePrefix(userID, "d"), 1), + // (e,1) is outside of the range [a,e) + testAddColFam(testTablePrefix(userID, "e"), 1), + }, + startKey: testTablePrefix(userID, "a"), + endKey: testTablePrefix(userID, "e"), + splits: []splitExpect{ + {desired: testAddColFam(testTablePrefix(userID, "a"), 0), expected: testTablePrefix(userID, "b")}, + {desired: testAddColFam(testTablePrefix(userID, "b"), 3), expected: testTablePrefix(userID, "b")}, + // The first key after the desired split key is (d,1), expect a split + // at the prefix (d). + {desired: testAddColFam(testTablePrefix(userID, "b"), 4), expected: testTablePrefix(userID, "d")}, + // Desired split key is after the last key in the range (d,1), + // shouldn't split. + {desired: testAddColFam(testTablePrefix(userID, "d"), 2), expected: nil}, + }, + }, + { + // One partiton key, where the partition key is the first column (a). + keys: []roachpb.Key{ + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 3), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), + // (a,d,0) is outside the range [a,(a,d)). + testAddColFam(testTablePrefix(userID, "a", "d"), 0), + }, + startKey: testTablePrefix(userID, "a"), + endKey: testTablePrefix(userID, "a", "d"), + splits: []splitExpect{ + {desired: testTablePrefix(userID, "a"), expected: testTablePrefix(userID, "a", "b")}, + {desired: testAddColFam(testTablePrefix(userID, "a", "a"), 3), expected: testTablePrefix(userID, "a", "b")}, + {desired: testAddColFam(testTablePrefix(userID, "a", "b"), 2), expected: testTablePrefix(userID, "a", "c")}, + // Desired split key is after the last key in the range (a,c,1), + // shouldn't split. + {desired: testAddColFam(testTablePrefix(userID, "a", "c"), 2), expected: nil}, + {desired: testTablePrefix(userID, "a", "e"), expected: nil}, + }, + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("%v", tc.keys), + func(t *testing.T) { + ctx := context.Background() + engine := NewDefaultInMemForTesting() + defer engine.Close() + + testPopulateKeysWithVersions(ctx, t, engine, &enginepb.MVCCStats{}, tc.keys) + rangeStartAddr := keys.MustAddr(tc.startKey) + rangeEndAddr := keys.MustAddr(tc.endKey) + for _, split := range tc.splits { + t.Run(fmt.Sprintf("%v", split.desired), func(t *testing.T) { + desiredSplitAddr := keys.MustAddr(split.desired) + splitKey, err := MVCCFirstSplitKey(ctx, engine, desiredSplitAddr, rangeStartAddr, rangeEndAddr) + // NB: We don't expect errors. If no split key can be found, we + // expect a nil splitKey to be returned. + require.NoError(t, err) + require.Equal(t, split.expected, splitKey) + }) + } + }) + } +} + // TestMVCCGarbageCollect writes a series of gc'able bytes and then // sends an MVCC GC request and verifies cleared values and updated // stats.