From a1450135c799764931a0c9d99e032c6653ca43fb Mon Sep 17 00:00:00 2001 From: Austen McClernon Date: Sat, 20 May 2023 01:25:26 +0000 Subject: [PATCH] kvserver: avoid load-based splitting between rows It was possible for a SQL row to be torn across two ranges due to the load-based splitter not rejecting potentially unsafe split keys. It is impossible to determine with keys sampled from response spans, whether a key is certainly unsafe or safe. This commit side steps this problem by re-using the `adminSplitWithDescriptor` command to find the first real key, after or at the provided `args.SplitKey`. This ensures that the split key will always be a real key whilst not requiring any checks in the splitter itself. The updated `adminSplitWithDescriptor` is local only and requires opting into finding the first safe key by setting `findFirstSafeKey` to `true`. As such, all safe split key checks are also removed from the `split` pkg, with a warning added that the any split key returned is unsafe. Resolves: https://github.com/cockroachdb/cockroach/issues/103483 Release note (bug fix): It was possible for a SQL row to be split across two ranges. When this occurred, SQL queries could return unexpected errors. This bug is resolved by these changes, as we now inspect the real keys, rather than just request keys to determine load-based split points. --- pkg/kv/kvserver/client_split_test.go | 246 +++++++++++++++++++++ pkg/kv/kvserver/replica_command.go | 33 ++- pkg/kv/kvserver/replica_split_load.go | 39 +++- pkg/kv/kvserver/split/BUILD.bazel | 2 - pkg/kv/kvserver/split/decider.go | 34 ++- pkg/kv/kvserver/split/decider_test.go | 92 -------- pkg/kv/kvserver/split_queue.go | 7 + pkg/kv/kvserver/testing_knobs.go | 4 + pkg/storage/mvcc.go | 70 +++++- pkg/storage/mvcc_test.go | 300 ++++++++++++++++++++------ 10 files changed, 637 insertions(+), 190 deletions(-) diff --git a/pkg/kv/kvserver/client_split_test.go b/pkg/kv/kvserver/client_split_test.go index 8d1c80375087..066689b8c369 100644 --- a/pkg/kv/kvserver/client_split_test.go +++ b/pkg/kv/kvserver/client_split_test.go @@ -3685,3 +3685,249 @@ func TestStoreRangeSplitAndMergeWithGlobalReads(t *testing.T) { repl = store.LookupReplica(roachpb.RKey(splitKey)) require.Equal(t, descKey, repl.Desc().StartKey.AsRawKey()) } + +// TestLBSplitUnsafeKeys tests that load based splits do not split between table +// rows, even when the suggested load based split key is itself between a table row. +func TestLBSplitUnsafeKeys(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + ctx := context.Background() + const indexID = 1 + + // The test is expensive and prone to timing out under race. + skip.UnderRace(t) + skip.UnderStressRace(t) + skip.UnderDeadlock(t) + + makeTestKey := func(tableID uint32, suffix []byte) roachpb.Key { + tableKey := keys.MakeTableIDIndexID(nil, tableID, indexID) + return append(tableKey, suffix...) + } + + es := func(vals ...int64) []byte { + k := []byte{} + for _, v := range vals { + k = encoding.EncodeVarintAscending(k, v) + } + return k + } + + fk := func(k []byte, famID uint32) []byte { + return keys.MakeFamilyKey(k, famID) + } + + testCases := []struct { + splitKey roachpb.Key + existingKeys []int + expSplitKey roachpb.Key + expErrStr string + }{ + // We don't know the table ID here, we append the splitKey to the + // table/index prefix. e.g. /1 will be /Table/table_id/index_id/1. + { + // /1 -> /2 + splitKey: es(1), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0 -> /2 + splitKey: fk(es(1), 0), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3 -> /2 + splitKey: fk(es(1), 3), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 0)).Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 3)).Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/0/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 0)).Next().Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /1/3/0/0 -> /2 + splitKey: roachpb.Key(fk(es(1), 3)).Next().Next(), + existingKeys: []int{1, 2, 3}, + expSplitKey: es(2), + }, + { + // /0 -> /3 + // We will not split at the first row in a table, so expect the split at + // /3 intead of /2. + splitKey: es(0), + existingKeys: []int{2, 3}, + expSplitKey: es(3), + }, + { + // /2 -> /3 + // Same case as above, despite the key being safe, the split would create + // an empty LHS. + splitKey: es(2), + existingKeys: []int{2, 3}, + expSplitKey: es(3), + }, + { + // /1 -> error + // There are no rows to split on. + splitKey: es(1), + existingKeys: []int{}, + expErrStr: "could not find valid split key", + }, + { + // /1 -> error + // There is only one row to split on, the range should not be split. + splitKey: es(1), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/0 -> error + splitKey: fk(es(1), 0), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/3 -> error + splitKey: fk(es(1), 3), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /1/3/0/0 -> error + splitKey: roachpb.Key(fk(es(1), 0)).Next().Next(), + existingKeys: []int{2}, + expErrStr: "could not find valid split key", + }, + { + // /2 -> /2 + splitKey: es(2), + existingKeys: []int{1, 2}, + expSplitKey: es(2), + }, + } + + for _, tc := range testCases { + var expectStr string + if tc.expErrStr != "" { + expectStr = tc.expErrStr + } else { + expectStr = makeTestKey(1, tc.expSplitKey).String() + } + t.Run(fmt.Sprintf("%s%v -> %s", makeTestKey(1, tc.splitKey), tc.existingKeys, expectStr), func(t *testing.T) { + var targetRange atomic.Int32 + var splitKeyOverride atomic.Value + splitKeyOverride.Store(roachpb.Key{}) + + // Mock the load based splitter key finding method. This function will be + // checked in splitQueue.shouldQueue() and splitQueue.process via + // replica.loadSplitKey. When a key is returned, the split queue calls + // replica.adminSplitWithDescriptor(...findFirstSafeSplitKey=true). + overrideLBSplitFn := func(rangeID roachpb.RangeID) (splitKey roachpb.Key, useSplitKey bool) { + if rangeID == roachpb.RangeID(targetRange.Load()) { + override := splitKeyOverride.Load() + // It is possible that the split queue is checking the range before + // we manually enqueued it. + if override == nil { + return nil, false + } + overrideKey, ok := override.(roachpb.Key) + require.Truef(t, ok, "stored value not key %+v", override) + + if len(overrideKey) == 0 { + return nil, false + } + + return override.(roachpb.Key), true + } + return nil, false + } + + serv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{ + DisableSpanConfigs: true, + Knobs: base.TestingKnobs{ + Store: &kvserver.StoreTestingKnobs{ + LoadBasedSplittingOverrideKey: overrideLBSplitFn, + DisableMergeQueue: true, + }, + }, + }) + s := serv.(*server.TestServer) + defer s.Stopper().Stop(ctx) + tdb := sqlutils.MakeSQLRunner(sqlDB) + store, err := s.Stores().GetStore(1) + require.NoError(t, err) + + // We want to exercise the case where there are column family keys. + // Create a simple table and insert the existing keys. + _ = tdb.Exec(t, + "CREATE TABLE t (k INT PRIMARY KEY, "+ + "t0 INT, t1 INT, t2 INT, t3 INT, "+ + "FAMILY (k), FAMILY (t0), FAMILY (t1), FAMILY (t2), FAMILY (t3))") + for _, k := range tc.existingKeys { + _ = tdb.Exec(t, fmt.Sprintf("INSERT INTO t VALUES (%d, %d, %d, %d, %d)", + k, k, k, k, k)) + } + + // Force a table scan to resolve descriptors. + var keyCount int + tdb.QueryRow(t, "SELECT count(k) FROM t").Scan(&keyCount) + require.Equal(t, len(tc.existingKeys), keyCount) + var tableID uint32 + tdb.QueryRow(t, "SELECT table_id FROM crdb_internal.leases where name = 't'").Scan(&tableID) + + // Split off the table range for the test, otherwise the range may + // contain multiple tables with existing values. + splitArgs := adminSplitArgs(keys.SystemSQLCodec.TablePrefix(tableID)) + _, pErr := kv.SendWrapped(ctx, store.TestSender(), splitArgs) + require.Nil(t, pErr) + + var rangeID roachpb.RangeID + tdb.QueryRow(t, "SELECT range_id FROM [SHOW RANGES FROM TABLE t] LIMIT 1").Scan(&rangeID) + targetRange.Store(int32(rangeID)) + repl, err := store.GetReplica(rangeID) + require.NoError(t, err) + + // Keep the previous end key around, we will use this to assert that no + // split has occurred when expecting an error. + prevEndKey := repl.Desc().EndKey.AsRawKey() + splitKey := makeTestKey(tableID, tc.splitKey) + + // Update the split key override so that the split queue will enqueue and + // process the range. Remove it afterwards to avoid retrying the LHS. + splitKeyOverride.Store(splitKey) + _, processErr, enqueueErr := store.Enqueue(ctx, "split", repl, false /* shouldSkipQueue */, false /* async */) + splitKeyOverride.Store(roachpb.Key{}) + require.NoError(t, enqueueErr) + + endKey := repl.Desc().EndKey.AsRawKey() + if tc.expErrStr != "" { + // We expect this split not to process, assert that the expected error + // matches the returned error and the range has the same end key. + require.ErrorContainsf(t, processErr, tc.expErrStr, + "end key %s, previous end key %s", endKey, prevEndKey) + require.Equal(t, prevEndKey, endKey) + } else { + // Otherwise, assert that the new range end key matches the expected + // end key. + require.NoError(t, processErr) + require.Equal(t, makeTestKey(tableID, tc.expSplitKey), endKey) + } + }) + } +} diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 9f161d07052b..50a6d06a82c4 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -79,7 +79,7 @@ func (r *Replica) AdminSplit( err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error { var err error - reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason) + reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason, false /* findFirstSafeKey */) return err }) return reply, err @@ -292,7 +292,6 @@ func splitTxnStickyUpdateAttempt( // affirmative the descriptor is passed to AdminSplit, which performs a // Conditional Put on the RangeDescriptor to ensure that no other operation has // modified the range in the time the decision was being made. -// TODO(tschottdorf): should assert that split key is not a local key. // // See the comment on splitTrigger for details on the complexities. func (r *Replica) adminSplitWithDescriptor( @@ -301,6 +300,7 @@ func (r *Replica) adminSplitWithDescriptor( desc *roachpb.RangeDescriptor, delayable bool, reason string, + findFirstSafeKey bool, ) (kvpb.AdminSplitResponse, error) { var err error var reply kvpb.AdminSplitResponse @@ -341,11 +341,36 @@ func (r *Replica) adminSplitWithDescriptor( ri := r.GetRangeInfo(ctx) return reply, kvpb.NewRangeKeyMismatchErrorWithCTPolicy(ctx, args.Key, args.Key, desc, &ri.Lease, ri.ClosedTimestampPolicy) } - foundSplitKey = args.SplitKey + // When findFirstSafeKey is true, we find the first key after or at + // args.SplitKey which is a safe split to split at. The current user of + // findFirstSafeKey is load based splitting, which only has knowledge of + // sampled keys from batch requests. These sampled keys can be + // arbitrarily within SQL rows due to column family keys. + // + // Not every caller requires a real key as a split point (creating empty + // table), however when we cannot verify the split key as safe, the most + // reliable method is checking existing keys. + if findFirstSafeKey { + var desiredSplitKey roachpb.RKey + if desiredSplitKey, err = keys.Addr(args.SplitKey); err != nil { + return reply, err + } + if foundSplitKey, err = storage.MVCCFirstSplitKey( + ctx, r.store.TODOEngine(), desiredSplitKey, + desc.StartKey, desc.EndKey, + ); err != nil { + return reply, errors.Wrap(err, "unable to determine split key") + } else if foundSplitKey == nil { + return reply, unsplittableRangeError{} + } + } else { + foundSplitKey = args.SplitKey + } } if !kvserverbase.ContainsKey(desc, foundSplitKey) { - return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r) + return reply, errors.Errorf("requested split key %s (found=%s) out of bounds of %s", + args.SplitKey, foundSplitKey, r) } // If predicate keys are specified, make sure they are contained by this diff --git a/pkg/kv/kvserver/replica_split_load.go b/pkg/kv/kvserver/replica_split_load.go index 24adbc238cd2..a2e03af9c2ba 100644 --- a/pkg/kv/kvserver/replica_split_load.go +++ b/pkg/kv/kvserver/replica_split_load.go @@ -254,13 +254,48 @@ func (r *Replica) recordBatchForLoadBasedSplitting( // loadSplitKey returns a suggested load split key for the range if it exists, // otherwise it returns nil. If there were any errors encountered when -// validating the split key, the error is returned as well. +// validating the split key, the error is returned as well. It is guaranteed +// that the key returned, if non-nil, will be greater than the start key of the +// range and also within the range bounds. +// +// NOTE: The returned split key CAN BE BETWEEN A SQL ROW, The split key +// returned should only be used to engage a split via adminSplitWithDescriptor +// where findFirstSafeKey is set to true. func (r *Replica) loadSplitKey(ctx context.Context, now time.Time) roachpb.Key { - splitKey := r.loadBasedSplitter.MaybeSplitKey(ctx, now) + var splitKey roachpb.Key + if overrideFn := r.store.cfg.TestingKnobs.LoadBasedSplittingOverrideKey; overrideFn != nil { + var useSplitKey bool + if splitKey, useSplitKey = overrideFn(r.GetRangeID()); useSplitKey { + return splitKey + } + } else { + splitKey = r.loadBasedSplitter.MaybeSplitKey(ctx, now) + } + if splitKey == nil { return nil } + // If the splitKey belongs to a Table range, try and shorten the key to just + // the row prefix. This allows us to check that splitKey doesn't map to the + // first key of the range here. If the split key contains column families, it + // is possible that the full key is strictly after every existing key for + // that row. e.g. for a table row where the table ID is 100, index ID is 1, + // primary key is a, and the column family ID is 3 (length=1): + // + // splitKey = /Table/100/1/"a"/3/1 + // existing = [..., /Table/100/1/"a"/2/1] + // + // We would not split at /Table/100/1/"a" as there's no key >= the splitKey + // in the range. + // + // NB: We handle unsafe split keys in replica.adminSplitWithDescriptor, so it + // isn't an issue if we return an unsafe key here. See the case where + // findFirstSafeKey is true. + if keyRowPrefix, err := keys.EnsureSafeSplitKey(splitKey); err == nil { + splitKey = keyRowPrefix + } + // We swallow the error here and instead log an event. It is currently // expected that the load based splitter may return the start key of the // range. diff --git a/pkg/kv/kvserver/split/BUILD.bazel b/pkg/kv/kvserver/split/BUILD.bazel index 8eb21bc3c05b..f0a35996fe2a 100644 --- a/pkg/kv/kvserver/split/BUILD.bazel +++ b/pkg/kv/kvserver/split/BUILD.bazel @@ -12,7 +12,6 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split", visibility = ["//visibility:public"], deps = [ - "//pkg/keys", "//pkg/roachpb", "//pkg/util/humanizeutil", "//pkg/util/log", @@ -39,7 +38,6 @@ go_test( "//pkg/roachpb", "//pkg/testutils/datapathutils", "//pkg/testutils/skip", - "//pkg/util/encoding", "//pkg/util/leaktest", "//pkg/util/metric", "//pkg/util/stop", diff --git a/pkg/kv/kvserver/split/decider.go b/pkg/kv/kvserver/split/decider.go index c6dab3a4cedc..560a8e860625 100644 --- a/pkg/kv/kvserver/split/decider.go +++ b/pkg/kv/kvserver/split/decider.go @@ -17,7 +17,6 @@ import ( "math/rand" "time" - "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/metric" @@ -313,6 +312,8 @@ func (d *Decider) maxStatLocked(ctx context.Context, now time.Time) (float64, bo // or if it wasn't able to determine a suitable split key. // // It is legal to call MaybeSplitKey at any time. +// WARNING: The key returned from MaybeSplitKey has no guarantee of being a +// safe split key. The key is derived from sampled spans. See below. func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key { var key roachpb.Key @@ -336,27 +337,20 @@ func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key // // /Table/51/52/53/54/55 // - // (see TestDeciderCallsEnsureSafeSplitKey). + // The key found here isn't guaranteed to be a valid SQL column family key. + // This is because the keys are sampled from StartKey and EndKey of + // requests hitting this replica. Ranged operations may well wish to + // exclude the start point by calling .Next() or may span multiple ranges, + // and so such a key may end up being returned. This is more common than + // one might think since SQL issues plenty of scans over all column + // families, meaning that we'll frequently find a key that has no column + // family suffix and thus errors out in EnsureSafeSplitKey. // - // The key found here isn't guaranteed to be a valid SQL column family - // key. This is because the keys are sampled from StartKey of requests - // hitting this replica. Ranged operations may well wish to exclude the - // start point by calling .Next() or may span multiple ranges, and so - // such a key may end up being passed to EnsureSafeSplitKey here. - // - // We take the risk that the result may sometimes not be a good split - // point (or even in this range). - // - // Note that we ignore EnsureSafeSplitKey when it returns an error since - // that error only tells us that this key couldn't possibly be a SQL - // key. This is more common than one might think since SQL issues plenty - // of scans over all column families, meaning that we'll frequently find - // a key that has no column family suffix and thus errors out in - // EnsureSafeSplitKey. + // We do not attempt to validate the key is safe here, simply return it to + // the caller as the best possible split point found so far. See + // replica.adminSplitWithDescriptor for how split keys are handled when we + // aren't certain the provided key is safe. key = d.mu.splitFinder.Key() - if safeKey, err := keys.EnsureSafeSplitKey(key); err == nil { - key = safeKey - } } return key } diff --git a/pkg/kv/kvserver/split/decider_test.go b/pkg/kv/kvserver/split/decider_test.go index 43a75ac34b3b..205d679656e4 100644 --- a/pkg/kv/kvserver/split/decider_test.go +++ b/pkg/kv/kvserver/split/decider_test.go @@ -12,14 +12,12 @@ package split import ( "context" - "math" "math/rand" "testing" "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" - "github.com/cockroachdb/cockroach/pkg/util/encoding" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/stretchr/testify/assert" @@ -290,96 +288,6 @@ func TestDecider_MaxStat(t *testing.T) { assertMaxStat(25000, 6, true) } -func TestDeciderCallsEnsureSafeSplitKey(t *testing.T) { - defer leaktest.AfterTest(t)() - rand := rand.New(rand.NewSource(11)) - - var d Decider - loadSplitConfig := testLoadSplitConfig{ - randSource: rand, - useWeighted: false, - statRetention: time.Second, - statThreshold: 1, - } - - Init(&d, &loadSplitConfig, &LoadSplitterMetrics{ - PopularKeyCount: metric.NewCounter(metric.Metadata{}), - NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), - }, SplitCPU) - - baseKey := keys.SystemSQLCodec.TablePrefix(51) - for i := 0; i < 4; i++ { - baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i)) - } - c0 := func() roachpb.Span { return roachpb.Span{Key: append([]byte(nil), keys.MakeFamilyKey(baseKey, 1)...)} } - c1 := func() roachpb.Span { return roachpb.Span{Key: append([]byte(nil), keys.MakeFamilyKey(baseKey, 9)...)} } - - expK, err := keys.EnsureSafeSplitKey(c1().Key) - require.NoError(t, err) - - var k roachpb.Key - var now time.Time - for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ { - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, ld(1), c0) - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, ld(1), c1) - k = d.MaybeSplitKey(context.Background(), now) - if len(k) != 0 { - break - } - } - - require.Equal(t, expK, k) -} - -func TestDeciderIgnoresEnsureSafeSplitKeyOnError(t *testing.T) { - defer leaktest.AfterTest(t)() - rand := rand.New(rand.NewSource(11)) - var d Decider - - loadSplitConfig := testLoadSplitConfig{ - randSource: rand, - useWeighted: false, - statRetention: time.Second, - statThreshold: 1, - } - - Init(&d, &loadSplitConfig, &LoadSplitterMetrics{ - PopularKeyCount: metric.NewCounter(metric.Metadata{}), - NoSplitKeyCount: metric.NewCounter(metric.Metadata{}), - }, SplitCPU) - - baseKey := keys.SystemSQLCodec.TablePrefix(51) - for i := 0; i < 4; i++ { - baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i)) - } - c0 := func() roachpb.Span { - return roachpb.Span{Key: append([]byte(nil), encoding.EncodeUvarintAscending(baseKey, math.MaxInt32+1)...)} - } - c1 := func() roachpb.Span { - return roachpb.Span{Key: append([]byte(nil), encoding.EncodeUvarintAscending(baseKey, math.MaxInt32+2)...)} - } - - _, err := keys.EnsureSafeSplitKey(c1().Key) - require.Error(t, err) - - var k roachpb.Key - var now time.Time - for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ { - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, ld(1), c0) - now = now.Add(500 * time.Millisecond) - d.Record(context.Background(), now, ld(1), c1) - k = d.MaybeSplitKey(context.Background(), now) - if len(k) != 0 { - break - } - } - - require.Equal(t, c1().Key, k) -} - func TestMaxStatTracker(t *testing.T) { defer leaktest.AfterTest(t)() diff --git a/pkg/kv/kvserver/split_queue.go b/pkg/kv/kvserver/split_queue.go index ebadd75e2950..a0954445430f 100644 --- a/pkg/kv/kvserver/split_queue.go +++ b/pkg/kv/kvserver/split_queue.go @@ -258,6 +258,7 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ "span config", + false, /* findFirstSafeSplitKey */ ); err != nil { return false, errors.Wrapf(err, "unable to split %s at key %q", r, splitKey) } @@ -277,6 +278,7 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ fmt.Sprintf("%s above threshold size %s", humanizeutil.IBytes(size), humanizeutil.IBytes(maxBytes)), + false, /* findFirstSafeSplitKey */ ); err != nil { return false, err } @@ -312,6 +314,10 @@ func (sq *splitQueue) processAttempt( if expDelay := kvserverbase.SplitByLoadMergeDelay.Get(&sq.store.cfg.Settings.SV); expDelay > 0 { expTime = sq.store.Clock().Now().Add(expDelay.Nanoseconds(), 0) } + // The splitByLoadKey has no guarantee of being a safe key to split at (not + // between SQL rows). To sanitize the split point, pass + // findFirstSafeSplitKey set to true, so that the first key after the + // suggested split point which is safe to split at is used. if _, pErr := r.adminSplitWithDescriptor( ctx, kvpb.AdminSplitRequest{ @@ -324,6 +330,7 @@ func (sq *splitQueue) processAttempt( desc, false, /* delayable */ reason, + true, /* findFirstSafeSplitKey */ ); pErr != nil { return false, errors.Wrapf(pErr, "unable to split %s at key %q", r, splitByLoadKey) } diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index 023f87bba763..798de7e0bbb4 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -152,6 +152,10 @@ type StoreTestingKnobs struct { DisableReplicateQueue bool // DisableLoadBasedSplitting turns off LBS so no splits happen because of load. DisableLoadBasedSplitting bool + // LoadBasedSplittingOverrideKey returns a key which should be used for load + // based splitting, overriding any value returned from the real load based + // splitter. + LoadBasedSplittingOverrideKey func(rangeID roachpb.RangeID) (splitKey roachpb.Key, useSplitKey bool) // DisableSplitQueue disables the split queue. DisableSplitQueue bool // DisableTimeSeriesMaintenanceQueue disables the time series maintenance diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index e1c08555ffc2..37283beb5299 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -5881,7 +5881,27 @@ func MVCCFindSplitKey( // was dangerous because partitioning can split off ranges that do not start // at valid row keys. The keys that are present in the range, by contrast, are // necessarily valid row keys. - it.SeekGE(MakeMVCCMetadataKey(key.AsRawKey())) + minSplitKey, err := mvccMinSplitKey(it, key.AsRawKey()) + if err != nil { + return nil, err + } else if minSplitKey == nil { + return nil, nil + } + + splitKey, err := it.FindSplitKey(key.AsRawKey(), endKey.AsRawKey(), minSplitKey, targetSize) + if err != nil { + return nil, err + } + // Ensure the key is a valid split point that does not fall in the middle of a + // SQL row by removing the column family ID, if any, from the end of the key. + return keys.EnsureSafeSplitKey(splitKey.Key) +} + +// mvccMinSplitKey returns the minimum key that a range may be split at. The +// caller is responsible for setting the iterator upper bound to the range end +// key. The caller is also responsible for closing the iterator. +func mvccMinSplitKey(it MVCCIterator, startKey roachpb.Key) (roachpb.Key, error) { + it.SeekGE(MakeMVCCMetadataKey(startKey)) if ok, err := it.Valid(); err != nil { return nil, err } else if !ok { @@ -5905,14 +5925,54 @@ func MVCCFindSplitKey( // Allow a split at any key that sorts after it. minSplitKey = it.UnsafeKey().Key.Clone().Next() } + return minSplitKey, nil +} - splitKey, err := it.FindSplitKey(key.AsRawKey(), endKey.AsRawKey(), minSplitKey, targetSize) +// MVCCFirstSplitKey returns the first key which is safe to split at and no +// less than desiredSplitKey in the range which spans [startKey,endKey). If a +// non-nil key is returned, it is safe to split at. If a nil key is returned, no +// safe split key could be determined. The safe split key returned is +// guaranteed to be: +// (1) Within [startKey,endKey). +// (2) No less than desiredSplitKey. +// (3) Greater than the first key in [startKey,endKey]; or greater than all the +// +// first row's keys if a table range. . +// +// (4) Not in between the start and end of a row for table ranges. +// The returned split key is NOT guaranteed to be outside a no-split span, such +// as Meta1, Meta2Max or Node Liveness. +func MVCCFirstSplitKey( + _ context.Context, reader Reader, desiredSplitKey, startKey, endKey roachpb.RKey, +) (roachpb.Key, error) { + it := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: endKey.AsRawKey()}) + defer it.Close() + + // If the caller has provided a desiredSplitKey less than the minimum split + // key, we update the desired split key to be the minimum split key. This + // prevents splitting before the first row in a Table range, which would + // result in the LHS having now rows. + minSplitKey, err := mvccMinSplitKey(it, startKey.AsRawKey()) if err != nil { return nil, err + } else if minSplitKey == nil { + return nil, nil } - // Ensure the key is a valid split point that does not fall in the middle of a - // SQL row by removing the column family ID, if any, from the end of the key. - return keys.EnsureSafeSplitKey(splitKey.Key) + var seekKey roachpb.Key + if minSplitKey.Compare(desiredSplitKey.AsRawKey()) > 0 { + seekKey = minSplitKey + } else { + seekKey = desiredSplitKey.AsRawKey() + } + + it.SeekGE(MakeMVCCMetadataKey(seekKey)) + if ok, err := it.Valid(); err != nil { + return nil, err + } else if !ok { + return nil, nil + } + + return keys.EnsureSafeSplitKey(it.UnsafeKey().Key.Clone()) } // willOverflow returns true iff adding both inputs would under- or overflow diff --git a/pkg/storage/mvcc_test.go b/pkg/storage/mvcc_test.go index aff80f9c0116..fe99a0e2dca3 100644 --- a/pkg/storage/mvcc_test.go +++ b/pkg/storage/mvcc_test.go @@ -4153,23 +4153,6 @@ func TestFindValidSplitKeys(t *testing.T) { defer log.Scope(t).Close(t) userID := TestingUserDescID(0) - // Manually creates rows corresponding to the schema: - // CREATE TABLE t (id1 STRING, id2 STRING, ... PRIMARY KEY (id1, id2, ...)) - addTablePrefix := func(prefix roachpb.Key, id uint32, rowVals ...string) roachpb.Key { - tableKey := append(prefix, keys.SystemSQLCodec.TablePrefix(id)...) - rowKey := roachpb.Key(encoding.EncodeVarintAscending(tableKey, 1)) - for _, rowVal := range rowVals { - rowKey = encoding.EncodeStringAscending(rowKey, rowVal) - } - return rowKey - } - tablePrefix := func(id uint32, rowVals ...string) roachpb.Key { - return addTablePrefix(nil, id, rowVals...) - } - addColFam := func(rowKey roachpb.Key, colFam uint32) roachpb.Key { - return keys.MakeFamilyKey(append([]byte(nil), rowKey...), colFam) - } - type testCase struct { keys []roachpb.Key rangeStart roachpb.Key // optional @@ -4308,16 +4291,16 @@ func TestFindValidSplitKeys(t *testing.T) { // or return the start key of the range. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "a"), 2), - addColFam(tablePrefix(userID, "a"), 3), - addColFam(tablePrefix(userID, "a"), 4), - addColFam(tablePrefix(userID, "a"), 5), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "a"), 2), + testAddColFam(testTablePrefix(userID, "a"), 3), + testAddColFam(testTablePrefix(userID, "a"), 4), + testAddColFam(testTablePrefix(userID, "a"), 5), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "b"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "b"), expError: false, }, // More example table data. Make sure ranges at the start of a table can @@ -4325,58 +4308,58 @@ func TestFindValidSplitKeys(t *testing.T) { // break for such ranges. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), - addColFam(tablePrefix(userID, "d"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "d"), 1), }, rangeStart: keys.SystemSQLCodec.TablePrefix(userID), - expSplit: tablePrefix(userID, "c"), + expSplit: testTablePrefix(userID, "c"), expError: false, }, // More example table data. Make sure ranges at the start of a table can // be split properly even in the presence of a large first row. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a"), 1), - addColFam(tablePrefix(userID, "a"), 2), - addColFam(tablePrefix(userID, "a"), 3), - addColFam(tablePrefix(userID, "a"), 4), - addColFam(tablePrefix(userID, "a"), 5), - addColFam(tablePrefix(userID, "b"), 1), - addColFam(tablePrefix(userID, "c"), 1), + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "a"), 2), + testAddColFam(testTablePrefix(userID, "a"), 3), + testAddColFam(testTablePrefix(userID, "a"), 4), + testAddColFam(testTablePrefix(userID, "a"), 5), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "c"), 1), }, rangeStart: keys.SystemSQLCodec.TablePrefix(TestingUserDescID(0)), - expSplit: tablePrefix(userID, "b"), + expSplit: testTablePrefix(userID, "b"), expError: false, }, // One partition where partition key is the first column. Checks that // split logic is not confused by the special partition start key. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a", "a"), 1), - addColFam(tablePrefix(userID, "a", "b"), 1), - addColFam(tablePrefix(userID, "a", "c"), 1), - addColFam(tablePrefix(userID, "a", "d"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), + testAddColFam(testTablePrefix(userID, "a", "d"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "a", "c"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "a", "c"), expError: false, }, // One partition with a large first row. Checks that our logic to avoid // splitting in the middle of a row still applies. { keys: []roachpb.Key{ - addColFam(tablePrefix(userID, "a", "a"), 1), - addColFam(tablePrefix(userID, "a", "a"), 2), - addColFam(tablePrefix(userID, "a", "a"), 3), - addColFam(tablePrefix(userID, "a", "a"), 4), - addColFam(tablePrefix(userID, "a", "a"), 5), - addColFam(tablePrefix(userID, "a", "b"), 1), - addColFam(tablePrefix(userID, "a", "c"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 2), + testAddColFam(testTablePrefix(userID, "a", "a"), 3), + testAddColFam(testTablePrefix(userID, "a", "a"), 4), + testAddColFam(testTablePrefix(userID, "a", "a"), 5), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), }, - rangeStart: tablePrefix(userID, "a"), - expSplit: tablePrefix(userID, "a", "b"), + rangeStart: testTablePrefix(userID, "a"), + expSplit: testTablePrefix(userID, "a", "b"), expError: false, }, } @@ -4398,17 +4381,7 @@ func TestFindValidSplitKeys(t *testing.T) { defer engine.Close() ms := &enginepb.MVCCStats{} - val := roachpb.MakeValueFromString(strings.Repeat("X", 10)) - for _, k := range test.keys { - // Add three MVCC versions of every key. Splits are not allowed - // between MVCC versions, so this shouldn't have any effect. - for j := 1; j <= 3; j++ { - ts := hlc.Timestamp{Logical: int32(j)} - if err := MVCCPut(ctx, engine, ms, []byte(k), ts, hlc.ClockTimestamp{}, val, nil); err != nil { - t.Fatal(err) - } - } - } + testPopulateKeysWithVersions(ctx, t, engine, ms, test.keys) rangeStart := test.keys[0] if len(test.rangeStart) > 0 { rangeStart = test.rangeStart @@ -4519,6 +4492,203 @@ func TestFindBalancedSplitKeys(t *testing.T) { } } +// testAddPrefix manually creates rows corresponding to the schema e.g. +// CREATE TABLE t (id1 STRING, id2 STRING, ... PRIMARY KEY (id1, id2, ...)) +func testAddPrefix(prefix roachpb.Key, id uint32, rowVals ...string) roachpb.Key { + tableKey := append(prefix, keys.SystemSQLCodec.TablePrefix(id)...) + rowKey := roachpb.Key(encoding.EncodeVarintAscending(tableKey, 1)) + for _, rowVal := range rowVals { + rowKey = encoding.EncodeStringAscending(rowKey, rowVal) + } + return rowKey +} + +func testTablePrefix(id uint32, rowVals ...string) roachpb.Key { + return testAddPrefix(nil, id, rowVals...) +} +func testAddColFam(rowKey roachpb.Key, colFam uint32) roachpb.Key { + return keys.MakeFamilyKey(append([]byte(nil), rowKey...), colFam) +} + +// testPopulateKeysWithVersions puts the keys into the engine provided. Each +// key is added with 3 MVCC versions with a XX.. value. +func testPopulateKeysWithVersions( + ctx context.Context, t *testing.T, engine Engine, ms *enginepb.MVCCStats, keys []roachpb.Key, +) { + val := roachpb.MakeValueFromString(strings.Repeat("X", 10)) + for _, k := range keys { + // Add three MVCC versions of every key. Splits are not allowed + // between MVCC versions, so this shouldn't have any effect. + for j := 1; j <= 3; j++ { + ts := hlc.Timestamp{Logical: int32(j)} + require.NoError( + t, + MVCCPut(ctx, engine, ms, []byte(k), ts, hlc.ClockTimestamp{}, val, nil), + ) + } + } +} + +// TestMVCCFirstSplitKey checks that the split key returned from +// MVCCFirstSplitKey is: +// (1) Within a range's bounds +// (2) No less than the desired split key. +// (3) Greater than the first key, or first row's keys in table ranges. +// (4) Not inbetween the start and end of a row for table ranges. +func TestMVCCFirstSplitKey(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + userID := TestingUserDescID(0) + + type splitExpect struct { + desired roachpb.Key + expected roachpb.Key + } + + testCases := []struct { + keys []roachpb.Key + startKey, endKey roachpb.Key + splits []splitExpect + }{ + { + // No keys, no splits. + keys: []roachpb.Key{}, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("z"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("m"), expected: nil}, + {desired: roachpb.Key("z"), expected: nil}, + }, + }, + { + // All keys are outside the range, no keys to spit at so expect no + // splits. + keys: []roachpb.Key{ + roachpb.Key("0"), + roachpb.Key("c"), + roachpb.Key("d"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("c"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("b"), expected: nil}, + {desired: roachpb.Key("c"), expected: nil}, + }, + }, + { + // Only one key within the range, require at least two keys to split. + keys: []roachpb.Key{ + // (0) is outside the range [a,c) + roachpb.Key("0"), + roachpb.Key("a"), + // (c) is outside of the range [a,c). + roachpb.Key("c"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("c"), + splits: []splitExpect{ + {desired: roachpb.Key("a"), expected: nil}, + {desired: roachpb.Key("b"), expected: nil}, + {desired: roachpb.Key("c"), expected: nil}, + }, + }, + { + // Enough keys to realize a split on c. Only desiredSplitKeys <= c should + // split at c. + keys: []roachpb.Key{ + // (0) is outside the range [a,e) + roachpb.Key("0"), + roachpb.Key("b"), + roachpb.Key("c"), + // (e) is outside of the range [a,e). + roachpb.Key("e"), + }, + startKey: roachpb.Key("a"), + endKey: roachpb.Key("e"), + splits: []splitExpect{ + // Should iterate to the first split key after minSpitKey which is (c). + {desired: roachpb.Key("0"), expected: roachpb.Key("c")}, + {desired: roachpb.Key("b"), expected: roachpb.Key("c")}, + {desired: roachpb.Key("c"), expected: roachpb.Key("c")}, + // Desired split key is after the last key in the range (c), shouldn't + // split. + {desired: roachpb.Key("d"), expected: nil}, + }, + }, + { + keys: []roachpb.Key{ + testAddColFam(testTablePrefix(userID, "a"), 1), + testAddColFam(testTablePrefix(userID, "b"), 1), + testAddColFam(testTablePrefix(userID, "b"), 2), + testAddColFam(testTablePrefix(userID, "b"), 3), + testAddColFam(testTablePrefix(userID, "d"), 1), + // (e,1) is outside of the range [a,e) + testAddColFam(testTablePrefix(userID, "e"), 1), + }, + startKey: testTablePrefix(userID, "a"), + endKey: testTablePrefix(userID, "e"), + splits: []splitExpect{ + {desired: testAddColFam(testTablePrefix(userID, "a"), 0), expected: testTablePrefix(userID, "b")}, + {desired: testAddColFam(testTablePrefix(userID, "b"), 3), expected: testTablePrefix(userID, "b")}, + // The first key after the desired split key is (d,1), expect a split + // at the prefix (d). + {desired: testAddColFam(testTablePrefix(userID, "b"), 4), expected: testTablePrefix(userID, "d")}, + // Desired split key is after the last key in the range (d,1), + // shouldn't split. + {desired: testAddColFam(testTablePrefix(userID, "d"), 2), expected: nil}, + }, + }, + { + // One partiton key, where the partition key is the first column (a). + keys: []roachpb.Key{ + testAddColFam(testTablePrefix(userID, "a", "a"), 1), + testAddColFam(testTablePrefix(userID, "a", "a"), 3), + testAddColFam(testTablePrefix(userID, "a", "b"), 1), + testAddColFam(testTablePrefix(userID, "a", "c"), 1), + // (a,d,0) is outside the range [a,(a,d)). + testAddColFam(testTablePrefix(userID, "a", "d"), 0), + }, + startKey: testTablePrefix(userID, "a"), + endKey: testTablePrefix(userID, "a", "d"), + splits: []splitExpect{ + {desired: testTablePrefix(userID, "a"), expected: testTablePrefix(userID, "a", "b")}, + {desired: testAddColFam(testTablePrefix(userID, "a", "a"), 3), expected: testTablePrefix(userID, "a", "b")}, + {desired: testAddColFam(testTablePrefix(userID, "a", "b"), 2), expected: testTablePrefix(userID, "a", "c")}, + // Desired split key is after the last key in the range (a,c,1), + // shouldn't split. + {desired: testAddColFam(testTablePrefix(userID, "a", "c"), 2), expected: nil}, + {desired: testTablePrefix(userID, "a", "e"), expected: nil}, + }, + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("%v", tc.keys), + func(t *testing.T) { + ctx := context.Background() + engine := NewDefaultInMemForTesting() + defer engine.Close() + + testPopulateKeysWithVersions(ctx, t, engine, &enginepb.MVCCStats{}, tc.keys) + rangeStartAddr := keys.MustAddr(tc.startKey) + rangeEndAddr := keys.MustAddr(tc.endKey) + for _, split := range tc.splits { + t.Run(fmt.Sprintf("%v", split.desired), func(t *testing.T) { + desiredSplitAddr := keys.MustAddr(split.desired) + splitKey, err := MVCCFirstSplitKey(ctx, engine, desiredSplitAddr, rangeStartAddr, rangeEndAddr) + // NB: We don't expect errors. If no split key can be found, we + // expect a nil splitKey to be returned. + require.NoError(t, err) + require.Equal(t, split.expected, splitKey) + }) + } + }) + } +} + // TestMVCCGarbageCollect writes a series of gc'able bytes and then // sends an MVCC GC request and verifies cleared values and updated // stats.