kvserver: avoid load-based splitting between rows

It was possible for a SQL row to be torn across two ranges due to the load-based splitter not rejecting potentially unsafe split keys. It is impossible to determine with just the sampled request keys, whether a key is certainly unsafe or safe. This commit side steps this problem by re-using the `adminSplitWithDescriptor` command to the next real key, after the provided `args.SplitKey`. This ensures that the split key will always be a real key whilst not requiring any checks in the splitter itself. As such, all safe split key checks are also removed from the `split` pkg, with a warning added. Resolves: cockroachdb#103483 Release note (bug fix): It was possible for a SQL row to be split across two ranges. When this occurred, SQL queries could return unexpected errors. This bug is resolved by these changes, as we now sample the real keys, rather than just request keys to determine load-based split points.
kvoli · May 22, 2023 · b341eb0 · b341eb0
1 parent 72ae87f
commit b341eb0
Show file tree

Hide file tree

Showing 6 changed files with 101 additions and 122 deletions.
diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go
@@ -79,7 +79,7 @@ func (r *Replica) AdminSplit(
 
 	err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
 		var err error
-		reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason)
+		reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason, false /* findNextSafeKey */)
 		return err
 	})
 	return reply, err
@@ -292,7 +292,6 @@ func splitTxnStickyUpdateAttempt(
 // affirmative the descriptor is passed to AdminSplit, which performs a
 // Conditional Put on the RangeDescriptor to ensure that no other operation has
 // modified the range in the time the decision was being made.
-// TODO(tschottdorf): should assert that split key is not a local key.
 //
 // See the comment on splitTrigger for details on the complexities.
 func (r *Replica) adminSplitWithDescriptor(
@@ -301,6 +300,7 @@ func (r *Replica) adminSplitWithDescriptor(
 	desc *roachpb.RangeDescriptor,
 	delayable bool,
 	reason string,
+	findNextSafeKey bool,
 ) (kvpb.AdminSplitResponse, error) {
 	var err error
 	var reply kvpb.AdminSplitResponse
@@ -341,11 +341,57 @@ func (r *Replica) adminSplitWithDescriptor(
 				ri := r.GetRangeInfo(ctx)
 				return reply, kvpb.NewRangeKeyMismatchErrorWithCTPolicy(ctx, args.Key, args.Key, desc, &ri.Lease, ri.ClosedTimestampPolicy)
 			}
-			foundSplitKey = args.SplitKey
+			// When findNextSafeKey is true, we find the first key after
+			// args.SplitKey which is a safe split key.The primary user of
+			// findNextSafeKey is load based splitting, which only has knowledge of
+			// sampled keys from batch requests. These sampled keys can be
+			// arbitrarily within SQL rows due to column family keys.
+			//
+			// Not every caller requires a real key as a split point (creating empty
+			// table), however when we cannot verify the split key as safe, the most
+			// reliable method is checking existing keys.
+			//
+			// TODO(kvoli): We will never split a table with just two keys because of
+			// this limitation. Instead of piggybacking the MVCCFindSplitKey
+			// function, we should add a new function which allows splitting before
+			// the args.SplitKey, provided that it is a table key.
+			if findNextSafeKey {
+				// NOTE: The requested split key cannot be less than the start key for
+				// the range, if it is we risk returning a key which straddles a row.
+				// This is treat as an error below. Catch this case here as well to
+				// prevent unecessary work.
+				if !kvserverbase.ContainsKey(desc, args.SplitKey) {
+					return reply, errors.Errorf("requested split key %s out of bounds of %s",
+						args.SplitKey, r)
+				}
+				// Ensure the requested split key is addressable and not less than the
+				// start key of the range.
+				var desiredSplitKey roachpb.RKey
+				desiredSplitKey, err = keys.Addr(args.SplitKey)
+				if err != nil {
+					return reply, err
+				}
+				// We re-use the size based splitting method to ensure finding a safe
+				// key. To make this work, the target size of the lhs/rhs is set to
+				// zero and the args.SplitKey is given in place of the start key for
+				// the range.
+				targetSize := int64(0)
+				foundSplitKey, err = storage.MVCCFindSplitKey(
+					ctx, r.store.TODOEngine(), desiredSplitKey, desc.EndKey, targetSize)
+				if err != nil {
+					return reply, errors.Wrap(err, "unable to determine split key")
+				}
+				if foundSplitKey == nil {
+					return reply, unsplittableRangeError{}
+				}
+			} else {
+				foundSplitKey = args.SplitKey
+			}
 		}
 
 		if !kvserverbase.ContainsKey(desc, foundSplitKey) {
-			return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r)
+			return reply, errors.Errorf("requested split key %s (found=%s) out of bounds of %s",
+				args.SplitKey, foundSplitKey, r)
 		}
 
 		// If predicate keys are specified, make sure they are contained by this

diff --git a/pkg/kv/kvserver/replica_split_load.go b/pkg/kv/kvserver/replica_split_load.go
@@ -254,13 +254,30 @@ func (r *Replica) recordBatchForLoadBasedSplitting(
 
 // loadSplitKey returns a suggested load split key for the range if it exists,
 // otherwise it returns nil. If there were any errors encountered when
-// validating the split key, the error is returned as well.
+// validating the split key, the error is returned as well. It is guaranteed
+// that the key returned, if non-nil, will be greater than the start key of the
+// range and also within the range bounds.
+//
+// NOTE: The returned split key CAN BE BETWEEN A SQL ROW, The split key
+// returned should only be used to engage a split via adminSplitWithDescriptor
+// where findNextSafeKey is set to true.
 func (r *Replica) loadSplitKey(ctx context.Context, now time.Time) roachpb.Key {
 	splitKey := r.loadBasedSplitter.MaybeSplitKey(ctx, now)
 	if splitKey == nil {
 		return nil
 	}
 
+	// If the splitKey belongs to a Table range, try and shorten the key to just
+	// the row prefix. This allows us to check that splitKey doesn't map to the
+	// first key of the range here.
+	//
+	// NB: We handle unsafe split keys in replica.adminSplitWithDescriptor, so it
+	// isn't an issue if we return an unsafe key here. See the case where
+	// findNextSafeKey is true.
+	if keyRowPrefix, err := keys.EnsureSafeSplitKey(splitKey); err == nil {
+		splitKey = keyRowPrefix
+	}
+
 	// We swallow the error here and instead log an event. It is currently
 	// expected that the load based splitter may return the start key of the
 	// range.

diff --git a/pkg/kv/kvserver/split/BUILD.bazel b/pkg/kv/kvserver/split/BUILD.bazel
@@ -12,7 +12,6 @@ go_library(
     importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split",
     visibility = ["//visibility:public"],
     deps = [
-        "//pkg/keys",
         "//pkg/roachpb",
         "//pkg/util/humanizeutil",
         "//pkg/util/log",
@@ -38,7 +37,6 @@ go_test(
         "//pkg/roachpb",
         "//pkg/testutils/datapathutils",
         "//pkg/testutils/skip",
-        "//pkg/util/encoding",
         "//pkg/util/leaktest",
         "//pkg/util/metric",
         "//pkg/util/stop",

diff --git a/pkg/kv/kvserver/split/decider.go b/pkg/kv/kvserver/split/decider.go
@@ -19,7 +19,6 @@ import (
 	"strings"
 	"time"
 
-	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/metric"
@@ -261,7 +260,9 @@ func (d *Decider) recordLocked(
 		if d.mu.splitFinder.Ready(now) &&
 			now.Sub(d.mu.lastSplitSuggestion) > minSplitSuggestionInterval {
 			if splitKey := d.mu.splitFinder.Key(); splitKey != nil {
-				log.KvDistribution.VEventf(ctx, 3, "suggesting split key %s splitter_state=%s", splitKey, d.stringLocked())
+				log.KvDistribution.VEventf(ctx, 3,
+					"suggesting split key %s splitter_state=%s",
+					splitKey, d.stringLocked())
 				d.mu.lastSplitSuggestion = now
 				d.mu.suggestionsMade++
 				return true
@@ -271,9 +272,16 @@ func (d *Decider) recordLocked(
 					noSplitKeyCauseLogMsg := d.mu.splitFinder.NoSplitKeyCauseLogMsg()
 					if noSplitKeyCauseLogMsg != "" {
 						popularKeyFrequency := d.mu.splitFinder.PopularKeyFrequency()
-						noSplitKeyCauseLogMsg += fmt.Sprintf(", most popular key occurs in %d%% of samples", int(popularKeyFrequency*100))
+						noSplitKeyCauseLogMsg += fmt.Sprintf(
+							", most popular key occurs in %d%% of samples",
+							int(popularKeyFrequency*100),
+						)
+						if log.ExpensiveLogEnabled(ctx, 3) {
+							noSplitKeyCauseLogMsg += fmt.Sprintf(" splitter_state=%s",
+								d.stringLocked(),
+							)
+						}
 						log.KvDistribution.Infof(ctx, "%s", noSplitKeyCauseLogMsg)
-						log.KvDistribution.VInfof(ctx, 3, "splitter_state=%s")
 						if popularKeyFrequency >= splitKeyThreshold {
 							d.loadSplitterMetrics.PopularKeyCount.Inc(1)
 						}
@@ -315,6 +323,8 @@ func (d *Decider) maxStatLocked(ctx context.Context, now time.Time) (float64, bo
 // or if it wasn't able to determine a suitable split key.
 //
 // It is legal to call MaybeSplitKey at any time.
+// WARNING: The key returned from MaybeSplitKey has no guarantee of being a
+// safe split key. The key is derived from sampled spans. See below.
 func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key {
 	var key roachpb.Key
 
@@ -338,27 +348,20 @@ func (d *Decider) MaybeSplitKey(ctx context.Context, now time.Time) roachpb.Key
 		//
 		//   /Table/51/52/53/54/55
 		//
-		// (see TestDeciderCallsEnsureSafeSplitKey).
-		//
-		// The key found here isn't guaranteed to be a valid SQL column family
-		// key. This is because the keys are sampled from StartKey of requests
-		// hitting this replica. Ranged operations may well wish to exclude the
-		// start point by calling .Next() or may span multiple ranges, and so
-		// such a key may end up being passed to EnsureSafeSplitKey here.
+		// The key found here isn't guaranteed to be a valid SQL column family key.
+		// This is because the keys are sampled from StartKey and EndKey of
+		// requests hitting this replica. Ranged operations may well wish to
+		// exclude the start point by calling .Next() or may span multiple ranges,
+		// and so such a key may end up being returned. This is more common than
+		// one might think since SQL issues plenty of scans over all column
+		// families, meaning that we'll frequently find a key that has no column
+		// family suffix and thus errors out in EnsureSafeSplitKey.
 		//
-		// We take the risk that the result may sometimes not be a good split
-		// point (or even in this range).
-		//
-		// Note that we ignore EnsureSafeSplitKey when it returns an error since
-		// that error only tells us that this key couldn't possibly be a SQL
-		// key. This is more common than one might think since SQL issues plenty
-		// of scans over all column families, meaning that we'll frequently find
-		// a key that has no column family suffix and thus errors out in
-		// EnsureSafeSplitKey.
+		// We do not attempt to validate the key is safe here, simply return it to
+		// the caller as the best possible split point found so far. See
+		// replica.adminSplitWithDescriptor for how split keys are handled when we
+		// aren't certain the provided key is safe.
 		key = d.mu.splitFinder.Key()
-		if safeKey, err := keys.EnsureSafeSplitKey(key); err == nil {
-			key = safeKey
-		}
 	}
 	return key
 }

diff --git a/pkg/kv/kvserver/split/decider_test.go b/pkg/kv/kvserver/split/decider_test.go
@@ -12,14 +12,12 @@ package split
 
 import (
 	"context"
-	"math"
 	"math/rand"
 	"testing"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
-	"github.com/cockroachdb/cockroach/pkg/util/encoding"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/metric"
 	"github.com/stretchr/testify/assert"
@@ -290,96 +288,6 @@ func TestDecider_MaxStat(t *testing.T) {
 	assertMaxStat(25000, 6, true)
 }
 
-func TestDeciderCallsEnsureSafeSplitKey(t *testing.T) {
-	defer leaktest.AfterTest(t)()
-	rand := rand.New(rand.NewSource(11))
-
-	var d Decider
-	loadSplitConfig := testLoadSplitConfig{
-		randSource:    rand,
-		useWeighted:   false,
-		statRetention: time.Second,
-		statThreshold: 1,
-	}
-
-	Init(&d, &loadSplitConfig, &LoadSplitterMetrics{
-		PopularKeyCount: metric.NewCounter(metric.Metadata{}),
-		NoSplitKeyCount: metric.NewCounter(metric.Metadata{}),
-	}, SplitCPU)
-
-	baseKey := keys.SystemSQLCodec.TablePrefix(51)
-	for i := 0; i < 4; i++ {
-		baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i))
-	}
-	c0 := func() roachpb.Span { return roachpb.Span{Key: append([]byte(nil), keys.MakeFamilyKey(baseKey, 1)...)} }
-	c1 := func() roachpb.Span { return roachpb.Span{Key: append([]byte(nil), keys.MakeFamilyKey(baseKey, 9)...)} }
-
-	expK, err := keys.EnsureSafeSplitKey(c1().Key)
-	require.NoError(t, err)
-
-	var k roachpb.Key
-	var now time.Time
-	for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ {
-		now = now.Add(500 * time.Millisecond)
-		d.Record(context.Background(), now, ld(1), c0)
-		now = now.Add(500 * time.Millisecond)
-		d.Record(context.Background(), now, ld(1), c1)
-		k = d.MaybeSplitKey(context.Background(), now)
-		if len(k) != 0 {
-			break
-		}
-	}
-
-	require.Equal(t, expK, k)
-}
-
-func TestDeciderIgnoresEnsureSafeSplitKeyOnError(t *testing.T) {
-	defer leaktest.AfterTest(t)()
-	rand := rand.New(rand.NewSource(11))
-	var d Decider
-
-	loadSplitConfig := testLoadSplitConfig{
-		randSource:    rand,
-		useWeighted:   false,
-		statRetention: time.Second,
-		statThreshold: 1,
-	}
-
-	Init(&d, &loadSplitConfig, &LoadSplitterMetrics{
-		PopularKeyCount: metric.NewCounter(metric.Metadata{}),
-		NoSplitKeyCount: metric.NewCounter(metric.Metadata{}),
-	}, SplitCPU)
-
-	baseKey := keys.SystemSQLCodec.TablePrefix(51)
-	for i := 0; i < 4; i++ {
-		baseKey = encoding.EncodeUvarintAscending(baseKey, uint64(52+i))
-	}
-	c0 := func() roachpb.Span {
-		return roachpb.Span{Key: append([]byte(nil), encoding.EncodeUvarintAscending(baseKey, math.MaxInt32+1)...)}
-	}
-	c1 := func() roachpb.Span {
-		return roachpb.Span{Key: append([]byte(nil), encoding.EncodeUvarintAscending(baseKey, math.MaxInt32+2)...)}
-	}
-
-	_, err := keys.EnsureSafeSplitKey(c1().Key)
-	require.Error(t, err)
-
-	var k roachpb.Key
-	var now time.Time
-	for i := 0; i < 2*int(minSplitSuggestionInterval/time.Second); i++ {
-		now = now.Add(500 * time.Millisecond)
-		d.Record(context.Background(), now, ld(1), c0)
-		now = now.Add(500 * time.Millisecond)
-		d.Record(context.Background(), now, ld(1), c1)
-		k = d.MaybeSplitKey(context.Background(), now)
-		if len(k) != 0 {
-			break
-		}
-	}
-
-	require.Equal(t, c1().Key, k)
-}
-
 func TestMaxStatTracker(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 

diff --git a/pkg/kv/kvserver/split_queue.go b/pkg/kv/kvserver/split_queue.go
@@ -258,6 +258,7 @@ func (sq *splitQueue) processAttempt(
 			desc,
 			false, /* delayable */
 			"span config",
+			false, /* findFirstSafeSplitKey */
 		); err != nil {
 			return false, errors.Wrapf(err, "unable to split %s at key %q", r, splitKey)
 		}
@@ -277,6 +278,7 @@ func (sq *splitQueue) processAttempt(
 			desc,
 			false, /* delayable */
 			fmt.Sprintf("%s above threshold size %s", humanizeutil.IBytes(size), humanizeutil.IBytes(maxBytes)),
+			false, /* findFirstSafeSplitKey */
 		); err != nil {
 			return false, err
 		}
@@ -312,6 +314,10 @@ func (sq *splitQueue) processAttempt(
 		if expDelay := kvserverbase.SplitByLoadMergeDelay.Get(&sq.store.cfg.Settings.SV); expDelay > 0 {
 			expTime = sq.store.Clock().Now().Add(expDelay.Nanoseconds(), 0)
 		}
+		// The splitByLoadKey has no guarantee of being a safe key to split at (not
+		// between SQL rows). To sanitize the split point, pass
+		// findFirstSafeSplitKey set to true, so that the first key after the
+		// suggested split point which is safe to split at is used.
 		if _, pErr := r.adminSplitWithDescriptor(
 			ctx,
 			kvpb.AdminSplitRequest{
@@ -324,6 +330,7 @@ func (sq *splitQueue) processAttempt(
 			desc,
 			false, /* delayable */
 			reason,
+			true, /* findFirstSafeSplitKey */
 		); pErr != nil {
 			return false, errors.Wrapf(pErr, "unable to split %s at key %q", r, splitByLoadKey)
 		}