allocator: select a good enough store for decom/recovery

Until now, when decommissioning a node, or when recovering from a dead node, the allocator tries to pick one of the best possible stores as the target for the recovery. Because of that, we sometimes see multiple stores recover replicas to the same store, for example, when decommissioning a node and at the same time adding a new node. This PR changes the way we select a destination store by choosing a random store out of all the stores that are "good enough" for the replica. The risk diversity is still enforced, but we may recover a replica to a store that is considered "over full", for example. Note that during upreplication the allocator will still try to use one of the "best" stores as targets. Fixes: 86265 Release note: None Release justification:
cockroachdb · Aug 17, 2022 · 16d4269 · 16d4269
1 parent ec1f2a2
commit 16d4269
Show file tree

Hide file tree

Showing 7 changed files with 232 additions and 56 deletions.
diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
@@ -88,6 +88,19 @@ var leaseRebalancingAggressiveness = settings.RegisterFloatSetting(
 	settings.NonNegativeFloat,
 )
 
+// enableRecoverToGoodEnoughStores enables recovering replicas to any valid
+// store, instead of a store that has low range count. With this enabled,
+// recovering from a dead node or from a decommissioning node can be faster,
+// because nodes can send replicas to more target stores (instead of multiple
+// nodes sending replicas to a few stores with a low range count).
+var enableRecoverToGoodEnoughStores = settings.RegisterBoolSetting(
+	settings.SystemOnly,
+	"kv.allocator.recover_to_good_enough_stores.enabled",
+	"if true, the allocator may recover replicas to any valid store, otherwise "+
+		"it will pick one of the most ideal stores",
+	true,
+)
+
 // AllocatorAction enumerates the various replication adjustments that may be
 // recommended by the allocator.
 type AllocatorAction int
@@ -851,21 +864,72 @@ type decisionDetails struct {
 	Existing string `json:",omitempty"`
 }
 
+// CandidateSelector is an interface to select a store from a list of
+// candidates.
+type CandidateSelector interface {
+	selectOne(cl candidateList) *candidate
+}
+
+// BestCandidateSelector in used to choose the best store to allocate.
+type BestCandidateSelector struct {
+	randGen allocatorRand
+}
+
+// NewBestCandidateSelector returns a CandidateSelector for choosing the best
+// candidate store.
+func (a *Allocator) NewBestCandidateSelector() CandidateSelector {
+	return &BestCandidateSelector{a.randGen}
+}
+
+func (s *BestCandidateSelector) selectOne(cl candidateList) *candidate {
+	return cl.selectBest(s.randGen)
+}
+
+// GoodCandidateSelector is used to choose a random store out of the stores that
+// are good enough.
+type GoodCandidateSelector struct {
+	randGen allocatorRand
+}
+
+// NewGoodCandidateSelector returns a CandidateSelector for choosing a random store
+// out of the stores that are good enough.
+func (a *Allocator) NewGoodCandidateSelector() CandidateSelector {
+	return &GoodCandidateSelector{a.randGen}
+}
+
+func (s *GoodCandidateSelector) selectOne(cl candidateList) *candidate {
+	return cl.selectGood(s.randGen)
+}
+
 func (a *Allocator) allocateTarget(
 	ctx context.Context,
 	conf roachpb.SpanConfig,
 	existingVoters, existingNonVoters []roachpb.ReplicaDescriptor,
+	replicaStatus ReplicaStatus,
 	targetType TargetReplicaType,
 ) (roachpb.ReplicationTarget, string, error) {
 	candidateStoreList, aliveStoreCount, throttled := a.StorePool.GetStoreList(storepool.StoreFilterThrottled)
 
+	// If the replica is alive we are upreplicating, and in that case we want to
+	// allocate new replicas on the best possible store. Otherwise, the replica is
+	// dead or decommissioned, and we want to recover the missing replica as soon
+	// as possible, and therefore any store that is good enough will be
+	// considered.
+	var selector CandidateSelector
+	if replicaStatus == Alive || !enableRecoverToGoodEnoughStores.Get(&a.StorePool.St.SV) {
+		selector = a.NewBestCandidateSelector()
+	} else {
+		selector = a.NewGoodCandidateSelector()
+	}
+
 	target, details := a.AllocateTargetFromList(
 		ctx,
 		candidateStoreList,
 		conf,
 		existingVoters,
 		existingNonVoters,
 		a.ScorerOptions(ctx),
+		selector,
 		// When allocating a *new* replica, we explicitly disregard nodes with any
 		// existing replicas. This is important for multi-store scenarios as
 		// otherwise, stores on the nodes that have existing replicas are simply
@@ -903,8 +967,9 @@ func (a *Allocator) AllocateVoter(
 	ctx context.Context,
 	conf roachpb.SpanConfig,
 	existingVoters, existingNonVoters []roachpb.ReplicaDescriptor,
+	replicaStatus ReplicaStatus,
 ) (roachpb.ReplicationTarget, string, error) {
-	return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, VoterTarget)
+	return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, replicaStatus, VoterTarget)
 }
 
 // AllocateNonVoter returns a suitable store for a new allocation of a
@@ -914,8 +979,9 @@ func (a *Allocator) AllocateNonVoter(
 	ctx context.Context,
 	conf roachpb.SpanConfig,
 	existingVoters, existingNonVoters []roachpb.ReplicaDescriptor,
+	replicaStatus ReplicaStatus,
 ) (roachpb.ReplicationTarget, string, error) {
-	return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, NonVoterTarget)
+	return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, replicaStatus, NonVoterTarget)
 }
 
 // AllocateTargetFromList returns a suitable store for a new allocation of a
@@ -927,6 +993,7 @@ func (a *Allocator) AllocateTargetFromList(
 	conf roachpb.SpanConfig,
 	existingVoters, existingNonVoters []roachpb.ReplicaDescriptor,
 	options ScorerOptions,
+	selector CandidateSelector,
 	allowMultipleReplsPerNode bool,
 	targetType TargetReplicaType,
 ) (roachpb.ReplicationTarget, string) {
@@ -968,7 +1035,7 @@ func (a *Allocator) AllocateTargetFromList(
 	)
 
 	log.VEventf(ctx, 3, "allocate %s: %s", targetType, candidates)
-	if target := candidates.selectGood(a.randGen); target != nil {
+	if target := selector.selectOne(candidates); target != nil {
 		log.VEventf(ctx, 3, "add target: %s", target)
 		details := decisionDetails{Target: target.compactString()}
 		detailsBytes, err := json.Marshal(details)

diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go
@@ -778,6 +778,23 @@ func (cl candidateList) best() candidateList {
 	return cl
 }
 
+// good returns all the elements in a sorted (by score reversed) candidate list
+// that share the highest diversity score and are valid.
+func (cl candidateList) good() candidateList {
+	cl = cl.onlyValidAndHealthyDisk()
+	if len(cl) <= 1 {
+		return cl
+	}
+	for i := 1; i < len(cl); i++ {
+		if cl[i].necessary == cl[0].necessary &&
+			scoresAlmostEqual(cl[i].diversityScore, cl[0].diversityScore) {
+			continue
+		}
+		return cl[:i]
+	}
+	return cl
+}
+
 // worst returns all the elements in a sorted (by score reversed) candidate list
 // that share the lowest constraint score (for instance, the set of candidates
 // that result in the lowest diversity score for the range, or the set of
@@ -858,9 +875,26 @@ func (cl candidateList) selectBest(randGen allocatorRand) *candidate {
 	return best
 }
 
-// selectBad randomly chooses a bad candidate store from a sorted (by score
+// selectGood randomly chooses a good candidate store from a sorted (by score
 // reversed) candidate list using the provided random generator.
-func (cl candidateList) selectBad(randGen allocatorRand) *candidate {
+func (cl candidateList) selectGood(randGen allocatorRand) *candidate {
+	cl = cl.good()
+	if len(cl) == 0 {
+		return nil
+	}
+	if len(cl) == 1 {
+		return &cl[0]
+	}
+	randGen.Lock()
+	r := randGen.Intn(len(cl))
+	randGen.Unlock()
+	c := &cl[r]
+	return c
+}
+
+// selectWorst randomly chooses one of the worst candidate stores from a sorted
+// (by score reversed) candidate list using the provided random generator.
+func (cl candidateList) selectWorst(randGen allocatorRand) *candidate {
 	cl = cl.worst()
 	if len(cl) == 0 {
 		return nil

diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go
@@ -27,6 +27,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/kr/pretty"
+	"github.com/stretchr/testify/require"
 )
 
 type storeScore struct {
@@ -95,9 +96,8 @@ func TestOnlyValidAndHealthyDisk(t *testing.T) {
 	}
 }
 
-// TestSelectBestPanic is a basic regression test against a former panic in
-// selectBest when called with just invalid/full stores.
-func TestSelectBestPanic(t *testing.T) {
+// TestNilSelection verifies selection with just invalid/full stores.
+func TestNilSelection(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
 
@@ -107,12 +107,11 @@ func TestSelectBestPanic(t *testing.T) {
 		},
 	}
 	allocRand := makeAllocatorRand(rand.NewSource(0))
-	if good := cl.selectBest(allocRand); good != nil {
-		t.Errorf("cl.selectBest() got %v, want nil", good)
-	}
+	require.Nil(t, cl.selectBest(allocRand))
+	require.Nil(t, cl.selectGood(allocRand))
 }
 
-// TestCandidateSelection tests select{best,worst} and {best,worst}constraints.
+// TestCandidateSelection tests select{Best,Good,Worst} and {best,good,worst}constraints.
 func TestCandidateSelection(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
@@ -155,57 +154,73 @@ func TestCandidateSelection(t *testing.T) {
 	testCases := []struct {
 		candidates  []scoreTuple
 		best        []scoreTuple
+		good        []scoreTuple
 		worst       []scoreTuple
 		bestChosen  scoreTuple
+		goodChosen  scoreTuple
 		worstChosen scoreTuple
 	}{
 		{
 			candidates:  []scoreTuple{{0, 0}},
 			best:        []scoreTuple{{0, 0}},
+			good:        []scoreTuple{{0, 0}},
 			worst:       []scoreTuple{{0, 0}},
 			bestChosen:  scoreTuple{0, 0},
+			goodChosen:  scoreTuple{0, 0},
 			worstChosen: scoreTuple{0, 0},
 		},
 		{
 			candidates:  []scoreTuple{{0, 0}, {0, 1}},
 			best:        []scoreTuple{{0, 0}, {0, 1}},
+			good:        []scoreTuple{{0, 0}, {0, 1}},
 			worst:       []scoreTuple{{0, 0}, {0, 1}},
 			bestChosen:  scoreTuple{0, 0},
+			goodChosen:  scoreTuple{0, 1},
 			worstChosen: scoreTuple{0, 1},
 		},
 		{
 			candidates:  []scoreTuple{{0, 0}, {0, 1}, {0, 2}},
 			best:        []scoreTuple{{0, 0}, {0, 1}, {0, 2}},
+			good:        []scoreTuple{{0, 0}, {0, 1}, {0, 2}},
 			worst:       []scoreTuple{{0, 0}, {0, 1}, {0, 2}},
-			bestChosen:  scoreTuple{0, 1},
-			worstChosen: scoreTuple{0, 2},
+			bestChosen:  scoreTuple{0, 0},
+			goodChosen:  scoreTuple{0, 0},
+			worstChosen: scoreTuple{0, 1},
 		},
 		{
 			candidates:  []scoreTuple{{1, 0}, {0, 1}},
 			best:        []scoreTuple{{1, 0}},
+			good:        []scoreTuple{{1, 0}},
 			worst:       []scoreTuple{{0, 1}},
 			bestChosen:  scoreTuple{1, 0},
+			goodChosen:  scoreTuple{1, 0},
 			worstChosen: scoreTuple{0, 1},
 		},
 		{
 			candidates:  []scoreTuple{{1, 0}, {0, 1}, {0, 2}},
 			best:        []scoreTuple{{1, 0}},
+			good:        []scoreTuple{{1, 0}},
 			worst:       []scoreTuple{{0, 1}, {0, 2}},
 			bestChosen:  scoreTuple{1, 0},
+			goodChosen:  scoreTuple{1, 0},
 			worstChosen: scoreTuple{0, 2},
 		},
 		{
 			candidates:  []scoreTuple{{1, 0}, {1, 1}, {0, 2}},
 			best:        []scoreTuple{{1, 0}, {1, 1}},
+			good:        []scoreTuple{{1, 0}, {1, 1}},
 			worst:       []scoreTuple{{0, 2}},
 			bestChosen:  scoreTuple{1, 0},
+			goodChosen:  scoreTuple{1, 1},
 			worstChosen: scoreTuple{0, 2},
 		},
 		{
 			candidates:  []scoreTuple{{1, 0}, {1, 1}, {0, 2}, {0, 3}},
 			best:        []scoreTuple{{1, 0}, {1, 1}},
+			good:        []scoreTuple{{1, 0}, {1, 1}},
 			worst:       []scoreTuple{{0, 2}, {0, 3}},
 			bestChosen:  scoreTuple{1, 0},
+			goodChosen:  scoreTuple{1, 0},
 			worstChosen: scoreTuple{0, 3},
 		},
 	}
@@ -218,6 +233,11 @@ func TestCandidateSelection(t *testing.T) {
 				t.Errorf("expected:%s actual:%s diff:%v", formatter(e), formatter(a), pretty.Diff(e, a))
 			}
 		})
+		t.Run(fmt.Sprintf("good-%s", formatter(cl)), func(t *testing.T) {
+			if a, e := cl.good(), genCandidates(tc.good, 1); !reflect.DeepEqual(a, e) {
+				t.Errorf("expected:%s actual:%s diff:%v", formatter(e), formatter(a), pretty.Diff(e, a))
+			}
+		})
 		t.Run(fmt.Sprintf("worst-%s", formatter(cl)), func(t *testing.T) {
 			// Shifting the ids is required to match the end of the list.
 			if a, e := cl.worst(), genCandidates(
@@ -237,6 +257,16 @@ func TestCandidateSelection(t *testing.T) {
 				t.Errorf("expected:%v actual:%v", tc.bestChosen, actual)
 			}
 		})
+		t.Run(fmt.Sprintf("select-good-%s", formatter(cl)), func(t *testing.T) {
+			good := cl.selectGood(allocRand)
+			if good == nil {
+				t.Fatalf("no 'good' candidate found")
+			}
+			actual := scoreTuple{int(good.diversityScore + 0.5), good.rangeCount}
+			if actual != tc.goodChosen {
+				t.Errorf("expected:%v actual:%v", tc.goodChosen, actual)
+			}
+		})
 		t.Run(fmt.Sprintf("select-worst-%s", formatter(cl)), func(t *testing.T) {
 			worst := cl.selectWorst(allocRand)
 			if worst == nil {