kvserver: avoid load-based splitting between rows

It was possible for a SQL row to be torn across two ranges due to the load-based splitter not rejecting potentially unsafe split keys. It is impossible to determine with keys sampled from response spans, whether a key is certainly unsafe or safe. This commit side steps this problem by re-using the `adminSplitWithDescriptor` command to find the first real key, after or at the provided `args.SplitKey`. This ensures that the split key will always be a real key whilst not requiring any checks in the splitter itself. The updated `adminSplitWithDescriptor` is local only and requires opting into finding the first safe key by setting `findFirstSafeKey` to `true`. As such, all safe split key checks are also removed from the `split` pkg, with a warning added that the any split key returned is unsafe. Resolves: cockroachdb#103483 Release note (bug fix): It was possible for a SQL row to be split across two ranges. When this occurred, SQL queries could return unexpected errors. This bug is resolved by these changes, as we now inspect the real keys, rather than just request keys to determine load-based split points.
kvoli · May 24, 2023 · dbfe79a · dbfe79a
1 parent 279a219
commit dbfe79a
Show file tree

Hide file tree

Showing 10 changed files with 620 additions and 190 deletions.
diff --git a/pkg/kv/kvserver/client_split_test.go b/pkg/kv/kvserver/client_split_test.go
@@ -3685,3 +3685,249 @@ func TestStoreRangeSplitAndMergeWithGlobalReads(t *testing.T) {
 	repl = store.LookupReplica(roachpb.RKey(splitKey))
 	require.Equal(t, descKey, repl.Desc().StartKey.AsRawKey())
 }
+
+// TestLBSplitUnsafeKeys tests that load based splits do not split between table
+// rows, even when the suggested load based split key is itself between a table row.
+func TestLBSplitUnsafeKeys(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+	ctx := context.Background()
+	const indexID = 1
+
+	// The test is expensive and prone to timing out under race.
+	skip.UnderRace(t)
+	skip.UnderStressRace(t)
+	skip.UnderDeadlock(t)
+
+	makeTestKey := func(tableID uint32, suffix []byte) roachpb.Key {
+		tableKey := keys.MakeTableIDIndexID(nil, tableID, indexID)
+		return append(tableKey, suffix...)
+	}
+
+	es := func(vals ...int64) []byte {
+		k := []byte{}
+		for _, v := range vals {
+			k = encoding.EncodeVarintAscending(k, v)
+		}
+		return k
+	}
+
+	fk := func(k []byte, famID uint32) []byte {
+		return keys.MakeFamilyKey(k, famID)
+	}
+
+	testCases := []struct {
+		splitKey     roachpb.Key
+		existingKeys []int
+		expSplitKey  roachpb.Key
+		expErrStr    string
+	}{
+		// We don't know the table ID here, we append the splitKey to the
+		// table/index prefix. e.g. /1 will be /Table/table_id/index_id/1.
+		{
+			// /1 -> /2
+			splitKey:     es(1),
+			existingKeys: []int{1, 2, 3},
+			expSplitKey:  es(2),
+		},
+		{
+			// /1/0 -> /2
+			splitKey:     fk(es(1), 0),
+			existingKeys: []int{1, 2, 3},
+			expSplitKey:  es(2),
+		},
+		{
+			// /1/3 -> /2
+			splitKey:     fk(es(1), 3),
+			existingKeys: []int{1, 2, 3},
+			expSplitKey:  es(2),
+		},
+		{
+			// /1/0/0 -> /2
+			splitKey:     roachpb.Key(fk(es(1), 0)).Next(),
+			existingKeys: []int{1, 2, 3},
+			expSplitKey:  es(2),
+		},
+		{
+			// /1/3/0 -> /2
+			splitKey:     roachpb.Key(fk(es(1), 3)).Next(),
+			existingKeys: []int{1, 2, 3},
+			expSplitKey:  es(2),
+		},
+		{
+			// /1/0/0/0 -> /2
+			splitKey:     roachpb.Key(fk(es(1), 0)).Next().Next(),
+			existingKeys: []int{1, 2, 3},
+			expSplitKey:  es(2),
+		},
+		{
+			// /1/3/0/0 -> /2
+			splitKey:     roachpb.Key(fk(es(1), 3)).Next().Next(),
+			existingKeys: []int{1, 2, 3},
+			expSplitKey:  es(2),
+		},
+		{
+			// /0 -> /3
+			// We will not split at the first row in a table, so expect the split at
+			// /3 intead of /2.
+			splitKey:     es(0),
+			existingKeys: []int{2, 3},
+			expSplitKey:  es(3),
+		},
+		{
+			// /2 -> /3
+			// Same case as above, despite the key being safe, the split would create
+			// an empty LHS.
+			splitKey:     es(2),
+			existingKeys: []int{2, 3},
+			expSplitKey:  es(3),
+		},
+		{
+			// /1 -> error
+			// There are no rows to split on.
+			splitKey:     es(1),
+			existingKeys: []int{},
+			expErrStr:    "could not find valid split key",
+		},
+		{
+			// /1 -> error
+			// There is only one row to split on, the range should not be split.
+			splitKey:     es(1),
+			existingKeys: []int{2},
+			expErrStr:    "could not find valid split key",
+		},
+		{
+			// /1/0 -> error
+			splitKey:     fk(es(1), 0),
+			existingKeys: []int{2},
+			expErrStr:    "could not find valid split key",
+		},
+		{
+			// /1/3 -> error
+			splitKey:     fk(es(1), 3),
+			existingKeys: []int{2},
+			expErrStr:    "could not find valid split key",
+		},
+		{
+			// /1/3/0/0 -> error
+			splitKey:     roachpb.Key(fk(es(1), 0)).Next().Next(),
+			existingKeys: []int{2},
+			expErrStr:    "could not find valid split key",
+		},
+		{
+			// /2 -> /2
+			splitKey:     es(2),
+			existingKeys: []int{1, 2},
+			expSplitKey:  es(2),
+		},
+	}
+
+	for _, tc := range testCases {
+		var expectStr string
+		if tc.expErrStr != "" {
+			expectStr = tc.expErrStr
+		} else {
+			expectStr = makeTestKey(1, tc.expSplitKey).String()
+		}
+		t.Run(fmt.Sprintf("%s%v -> %s", makeTestKey(1, tc.splitKey), tc.existingKeys, expectStr), func(t *testing.T) {
+			var targetRange atomic.Int32
+			var splitKeyOverride atomic.Value
+			splitKeyOverride.Store(roachpb.Key{})
+
+			// Mock the load based splitter key finding method. This function will be
+			// checked in splitQueue.shouldQueue() and splitQueue.process via
+			// replica.loadSplitKey. When a key is returned, the split queue calls
+			// replica.adminSplitWithDescriptor(...findFirstSafeSplitKey=true).
+			overrideLBSplitFn := func(rangeID roachpb.RangeID) (splitKey roachpb.Key, useSplitKey bool) {
+				if rangeID == roachpb.RangeID(targetRange.Load()) {
+					override := splitKeyOverride.Load()
+					// It is possible that the split queue is checking the range before
+					// we manually enqueued it.
+					if override == nil {
+						return nil, false
+					}
+					overrideKey, ok := override.(roachpb.Key)
+					require.Truef(t, ok, "stored value not key %+v", override)
+
+					if len(overrideKey) == 0 {
+						return nil, false
+					}
+
+					return override.(roachpb.Key), true
+				}
+				return nil, false
+			}
+
+			serv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{
+				DisableSpanConfigs: true,
+				Knobs: base.TestingKnobs{
+					Store: &kvserver.StoreTestingKnobs{
+						LoadBasedSplittingOverrideKey: overrideLBSplitFn,
+						DisableMergeQueue:             true,
+					},
+				},
+			})
+			s := serv.(*server.TestServer)
+			defer s.Stopper().Stop(ctx)
+			tdb := sqlutils.MakeSQLRunner(sqlDB)
+			store, err := s.Stores().GetStore(1)
+			require.NoError(t, err)
+
+			// We want to exercise the case where there are column family keys.
+			// Create a simple table and insert the existing keys.
+			_ = tdb.Exec(t,
+				"CREATE TABLE t (k INT PRIMARY KEY, "+
+					"t0 INT, t1 INT, t2 INT, t3 INT, "+
+					"FAMILY (k), FAMILY (t0), FAMILY (t1), FAMILY (t2), FAMILY (t3))")
+			for _, k := range tc.existingKeys {
+				_ = tdb.Exec(t, fmt.Sprintf("INSERT INTO t VALUES (%d, %d, %d, %d, %d)",
+					k, k, k, k, k))
+			}
+
+			// Force a table scan to resolve descriptors.
+			var keyCount int
+			tdb.QueryRow(t, "SELECT count(k) FROM t").Scan(&keyCount)
+			require.Equal(t, len(tc.existingKeys), keyCount)
+			var tableID uint32
+			tdb.QueryRow(t, "SELECT table_id FROM crdb_internal.leases where name = 't'").Scan(&tableID)
+
+			// Split off the table range for the test, otherwise the range may
+			// contain multiple tables with existing values.
+			splitArgs := adminSplitArgs(keys.SystemSQLCodec.TablePrefix(tableID))
+			_, pErr := kv.SendWrapped(ctx, store.TestSender(), splitArgs)
+			require.Nil(t, pErr)
+
+			var rangeID roachpb.RangeID
+			tdb.QueryRow(t, "SELECT range_id FROM [SHOW RANGES FROM TABLE t] LIMIT 1").Scan(&rangeID)
+			targetRange.Store(int32(rangeID))
+			repl, err := store.GetReplica(rangeID)
+			require.NoError(t, err)
+
+			// Keep the previous end key around, we will use this to assert that no
+			// split has occurred when expecting an error.
+			prevEndKey := repl.Desc().EndKey.AsRawKey()
+			splitKey := makeTestKey(tableID, tc.splitKey)
+
+			// Update the split key override so that the split queue will enqueue and
+			// process the range. Remove it afterwards to avoid retrying the LHS.
+			splitKeyOverride.Store(splitKey)
+			_, processErr, enqueueErr := store.Enqueue(ctx, "split", repl, false /* shouldSkipQueue */, false /* async */)
+			splitKeyOverride.Store(roachpb.Key{})
+			require.NoError(t, enqueueErr)
+
+			endKey := repl.Desc().EndKey.AsRawKey()
+			if tc.expErrStr != "" {
+				// We expect this split not to process, assert that the expected error
+				// matches the returned error and the range has the same end key.
+				require.ErrorContainsf(t, processErr, tc.expErrStr,
+					"end key %s, previous end key %s", endKey, prevEndKey)
+				require.Equal(t, prevEndKey, endKey)
+			} else {
+				// Otherwise, assert that the new range end key matches the expected
+				// end key.
+				require.NoError(t, processErr)
+				require.Equal(t, makeTestKey(tableID, tc.expSplitKey), endKey)
+			}
+		})
+	}
+}
diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go
@@ -79,7 +79,7 @@ func (r *Replica) AdminSplit(
 
 	err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
 		var err error
-		reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason)
+		reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason, false /* findFirstSafeKey */)
 		return err
 	})
 	return reply, err
@@ -292,7 +292,6 @@ func splitTxnStickyUpdateAttempt(
 // affirmative the descriptor is passed to AdminSplit, which performs a
 // Conditional Put on the RangeDescriptor to ensure that no other operation has
 // modified the range in the time the decision was being made.
-// TODO(tschottdorf): should assert that split key is not a local key.
 //
 // See the comment on splitTrigger for details on the complexities.
 func (r *Replica) adminSplitWithDescriptor(
@@ -301,6 +300,7 @@ func (r *Replica) adminSplitWithDescriptor(
 	desc *roachpb.RangeDescriptor,
 	delayable bool,
 	reason string,
+	findFirstSafeKey bool,
 ) (kvpb.AdminSplitResponse, error) {
 	var err error
 	var reply kvpb.AdminSplitResponse
@@ -341,11 +341,36 @@ func (r *Replica) adminSplitWithDescriptor(
 				ri := r.GetRangeInfo(ctx)
 				return reply, kvpb.NewRangeKeyMismatchErrorWithCTPolicy(ctx, args.Key, args.Key, desc, &ri.Lease, ri.ClosedTimestampPolicy)
 			}
-			foundSplitKey = args.SplitKey
+			// When findFirstSafeKey is true, we find the first key after or at
+			// args.SplitKey which is a safe split to split at. The current user of
+			// findFirstSafeKey is load based splitting, which only has knowledge of
+			// sampled keys from batch requests. These sampled keys can be
+			// arbitrarily within SQL rows due to column family keys.
+			//
+			// Not every caller requires a real key as a split point (creating empty
+			// table), however when we cannot verify the split key as safe, the most
+			// reliable method is checking existing keys.
+			if findFirstSafeKey {
+				var desiredSplitKey roachpb.RKey
+				if desiredSplitKey, err = keys.Addr(args.SplitKey); err != nil {
+					return reply, err
+				}
+				if foundSplitKey, err = storage.MVCCFirstSplitKey(
+					ctx, r.store.TODOEngine(), desiredSplitKey,
+					desc.StartKey, desc.EndKey,
+				); err != nil {
+					return reply, errors.Wrap(err, "unable to determine split key")
+				} else if foundSplitKey == nil {
+					return reply, unsplittableRangeError{}
+				}
+			} else {
+				foundSplitKey = args.SplitKey
+			}
 		}
 
 		if !kvserverbase.ContainsKey(desc, foundSplitKey) {
-			return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r)
+			return reply, errors.Errorf("requested split key %s (found=%s) out of bounds of %s",
+				args.SplitKey, foundSplitKey, r)
 		}
 
 		// If predicate keys are specified, make sure they are contained by this

diff --git a/pkg/kv/kvserver/replica_split_load.go b/pkg/kv/kvserver/replica_split_load.go
@@ -254,13 +254,39 @@ func (r *Replica) recordBatchForLoadBasedSplitting(
 
 // loadSplitKey returns a suggested load split key for the range if it exists,
 // otherwise it returns nil. If there were any errors encountered when
-// validating the split key, the error is returned as well.
+// validating the split key, the error is returned as well. It is guaranteed
+// that the key returned, if non-nil, will be greater than the start key of the
+// range and also within the range bounds.
+//
+// NOTE: The returned split key CAN BE BETWEEN A SQL ROW, The split key
+// returned should only be used to engage a split via adminSplitWithDescriptor
+// where findFirstSafeKey is set to true.
 func (r *Replica) loadSplitKey(ctx context.Context, now time.Time) roachpb.Key {
-	splitKey := r.loadBasedSplitter.MaybeSplitKey(ctx, now)
+	var splitKey roachpb.Key
+	if overrideFn := r.store.cfg.TestingKnobs.LoadBasedSplittingOverrideKey; overrideFn != nil {
+		var useSplitKey bool
+		if splitKey, useSplitKey = overrideFn(r.GetRangeID()); useSplitKey {
+			return splitKey
+		}
+	} else {
+		splitKey = r.loadBasedSplitter.MaybeSplitKey(ctx, now)
+	}
+
 	if splitKey == nil {
 		return nil
 	}
 
+	// If the splitKey belongs to a Table range, try and shorten the key to just
+	// the row prefix. This allows us to check that splitKey doesn't map to the
+	// first key of the range here.
+	//
+	// NB: We handle unsafe split keys in replica.adminSplitWithDescriptor, so it
+	// isn't an issue if we return an unsafe key here. See the case where
+	// findFirstSafeKey is true.
+	if keyRowPrefix, err := keys.EnsureSafeSplitKey(splitKey); err == nil {
+		splitKey = keyRowPrefix
+	}
+
 	// We swallow the error here and instead log an event. It is currently
 	// expected that the load based splitter may return the start key of the
 	// range.

diff --git a/pkg/kv/kvserver/split/BUILD.bazel b/pkg/kv/kvserver/split/BUILD.bazel
@@ -12,7 +12,6 @@ go_library(
     importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split",
     visibility = ["//visibility:public"],
     deps = [
-        "//pkg/keys",
         "//pkg/roachpb",
         "//pkg/util/humanizeutil",
         "//pkg/util/log",
@@ -39,7 +38,6 @@ go_test(
         "//pkg/roachpb",
         "//pkg/testutils/datapathutils",
         "//pkg/testutils/skip",
-        "//pkg/util/encoding",
         "//pkg/util/leaktest",
         "//pkg/util/metric",
         "//pkg/util/stop",