Skip to content

Commit

Permalink
allocator: select a good enough store for decom/recovery
Browse files Browse the repository at this point in the history
Until now, when decommissioning a node, or when recovering from a dead
node, the allocator tries to pick one of the best possible stores as
the target for the recovery.

Because of that, we sometimes see multiple stores recover replicas
to the same store, for example, when decommissioning a node and
at the same time adding a new node.

This PR changes the way we select a destination store by choosing
a random store out of all the stores that are "good enough" for
the replica. The risk diversity is still enforced, but we may
recover a replica to a store that is considered "over full", for
example.

Note that during upreplication the allocator will still try to use
one of the "best" stores as targets.

Fixes: #86265

Release note: None

Release justification: a relatively small change, and it can be
reverted by setting kv.allocator.recovery_store_selector=best.
  • Loading branch information
lidorcarmel committed Aug 24, 2022
1 parent 091c13b commit 3ce0fd5
Show file tree
Hide file tree
Showing 7 changed files with 239 additions and 56 deletions.
74 changes: 71 additions & 3 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,20 @@ var leaseRebalancingAggressiveness = settings.RegisterFloatSetting(
settings.NonNegativeFloat,
)

// recoveryStoreSelector controls the strategy for choosing a store to recover
// replicas to: either to any valid store ("good") or to a store that has low
// range count ("best"). With this set to "good", recovering from a dead node or
// from a decommissioning node can be faster, because nodes can send replicas to
// more target stores (instead of multiple nodes sending replicas to a few
// stores with a low range count).
var recoveryStoreSelector = settings.RegisterStringSetting(
settings.SystemOnly,
"kv.allocator.recovery_store_selector",
"if set to 'good', the allocator may recover replicas to any valid store, if set "+
"to 'best' it will pick one of the most ideal stores",
"good",
)

// AllocatorAction enumerates the various replication adjustments that may be
// recommended by the allocator.
type AllocatorAction int
Expand Down Expand Up @@ -850,21 +864,72 @@ type decisionDetails struct {
Existing string `json:",omitempty"`
}

// CandidateSelector is an interface to select a store from a list of
// candidates.
type CandidateSelector interface {
selectOne(cl candidateList) *candidate
}

// BestCandidateSelector in used to choose the best store to allocate.
type BestCandidateSelector struct {
randGen allocatorRand
}

// NewBestCandidateSelector returns a CandidateSelector for choosing the best
// candidate store.
func (a *Allocator) NewBestCandidateSelector() CandidateSelector {
return &BestCandidateSelector{a.randGen}
}

func (s *BestCandidateSelector) selectOne(cl candidateList) *candidate {
return cl.selectBest(s.randGen)
}

// GoodCandidateSelector is used to choose a random store out of the stores that
// are good enough.
type GoodCandidateSelector struct {
randGen allocatorRand
}

// NewGoodCandidateSelector returns a CandidateSelector for choosing a random store
// out of the stores that are good enough.
func (a *Allocator) NewGoodCandidateSelector() CandidateSelector {
return &GoodCandidateSelector{a.randGen}
}

func (s *GoodCandidateSelector) selectOne(cl candidateList) *candidate {
return cl.selectGood(s.randGen)
}

func (a *Allocator) allocateTarget(
ctx context.Context,
conf roachpb.SpanConfig,
existingVoters, existingNonVoters []roachpb.ReplicaDescriptor,
replicaStatus ReplicaStatus,
targetType TargetReplicaType,
) (roachpb.ReplicationTarget, string, error) {
candidateStoreList, aliveStoreCount, throttled := a.StorePool.GetStoreList(storepool.StoreFilterThrottled)

// If the replica is alive we are upreplicating, and in that case we want to
// allocate new replicas on the best possible store. Otherwise, the replica is
// dead or decommissioned, and we want to recover the missing replica as soon
// as possible, and therefore any store that is good enough will be
// considered.
var selector CandidateSelector
if replicaStatus == Alive || recoveryStoreSelector.Get(&a.StorePool.St.SV) == "best" {
selector = a.NewBestCandidateSelector()
} else {
selector = a.NewGoodCandidateSelector()
}

target, details := a.AllocateTargetFromList(
ctx,
candidateStoreList,
conf,
existingVoters,
existingNonVoters,
a.ScorerOptions(ctx),
selector,
// When allocating a *new* replica, we explicitly disregard nodes with any
// existing replicas. This is important for multi-store scenarios as
// otherwise, stores on the nodes that have existing replicas are simply
Expand Down Expand Up @@ -902,8 +967,9 @@ func (a *Allocator) AllocateVoter(
ctx context.Context,
conf roachpb.SpanConfig,
existingVoters, existingNonVoters []roachpb.ReplicaDescriptor,
replicaStatus ReplicaStatus,
) (roachpb.ReplicationTarget, string, error) {
return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, VoterTarget)
return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, replicaStatus, VoterTarget)
}

// AllocateNonVoter returns a suitable store for a new allocation of a
Expand All @@ -913,8 +979,9 @@ func (a *Allocator) AllocateNonVoter(
ctx context.Context,
conf roachpb.SpanConfig,
existingVoters, existingNonVoters []roachpb.ReplicaDescriptor,
replicaStatus ReplicaStatus,
) (roachpb.ReplicationTarget, string, error) {
return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, NonVoterTarget)
return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, replicaStatus, NonVoterTarget)
}

// AllocateTargetFromList returns a suitable store for a new allocation of a
Expand All @@ -926,6 +993,7 @@ func (a *Allocator) AllocateTargetFromList(
conf roachpb.SpanConfig,
existingVoters, existingNonVoters []roachpb.ReplicaDescriptor,
options ScorerOptions,
selector CandidateSelector,
allowMultipleReplsPerNode bool,
targetType TargetReplicaType,
) (roachpb.ReplicationTarget, string) {
Expand Down Expand Up @@ -967,7 +1035,7 @@ func (a *Allocator) AllocateTargetFromList(
)

log.VEventf(ctx, 3, "allocate %s: %s", targetType, candidates)
if target := candidates.selectGood(a.randGen); target != nil {
if target := selector.selectOne(candidates); target != nil {
log.VEventf(ctx, 3, "add target: %s", target)
details := decisionDetails{Target: target.compactString()}
detailsBytes, err := json.Marshal(details)
Expand Down
38 changes: 36 additions & 2 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,23 @@ func (cl candidateList) best() candidateList {
return cl
}

// good returns all the elements in a sorted (by score reversed) candidate list
// that share the highest diversity score and are valid.
func (cl candidateList) good() candidateList {
cl = cl.onlyValidAndHealthyDisk()
if len(cl) <= 1 {
return cl
}
for i := 1; i < len(cl); i++ {
if cl[i].necessary == cl[0].necessary &&
scoresAlmostEqual(cl[i].diversityScore, cl[0].diversityScore) {
continue
}
return cl[:i]
}
return cl
}

// worst returns all the elements in a sorted (by score reversed) candidate list
// that share the lowest constraint score (for instance, the set of candidates
// that result in the lowest diversity score for the range, or the set of
Expand Down Expand Up @@ -858,9 +875,26 @@ func (cl candidateList) selectBest(randGen allocatorRand) *candidate {
return best
}

// selectBad randomly chooses a bad candidate store from a sorted (by score
// selectGood randomly chooses a good candidate store from a sorted (by score
// reversed) candidate list using the provided random generator.
func (cl candidateList) selectBad(randGen allocatorRand) *candidate {
func (cl candidateList) selectGood(randGen allocatorRand) *candidate {
cl = cl.good()
if len(cl) == 0 {
return nil
}
if len(cl) == 1 {
return &cl[0]
}
randGen.Lock()
r := randGen.Intn(len(cl))
randGen.Unlock()
c := &cl[r]
return c
}

// selectWorst randomly chooses one of the worst candidate stores from a sorted
// (by score reversed) candidate list using the provided random generator.
func (cl candidateList) selectWorst(randGen allocatorRand) *candidate {
cl = cl.worst()
if len(cl) == 0 {
return nil
Expand Down
48 changes: 39 additions & 9 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/kr/pretty"
"github.com/stretchr/testify/require"
)

type storeScore struct {
Expand Down Expand Up @@ -95,9 +96,8 @@ func TestOnlyValidAndHealthyDisk(t *testing.T) {
}
}

// TestSelectBestPanic is a basic regression test against a former panic in
// selectBest when called with just invalid/full stores.
func TestSelectBestPanic(t *testing.T) {
// TestNilSelection verifies selection with just invalid/full stores.
func TestNilSelection(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

Expand All @@ -107,12 +107,11 @@ func TestSelectBestPanic(t *testing.T) {
},
}
allocRand := makeAllocatorRand(rand.NewSource(0))
if good := cl.selectBest(allocRand); good != nil {
t.Errorf("cl.selectBest() got %v, want nil", good)
}
require.Nil(t, cl.selectBest(allocRand))
require.Nil(t, cl.selectGood(allocRand))
}

// TestCandidateSelection tests select{best,worst} and {best,worst}constraints.
// TestCandidateSelection tests select{Best,Good,Worst} and {best,good,worst}constraints.
func TestCandidateSelection(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
Expand Down Expand Up @@ -155,57 +154,73 @@ func TestCandidateSelection(t *testing.T) {
testCases := []struct {
candidates []scoreTuple
best []scoreTuple
good []scoreTuple
worst []scoreTuple
bestChosen scoreTuple
goodChosen scoreTuple
worstChosen scoreTuple
}{
{
candidates: []scoreTuple{{0, 0}},
best: []scoreTuple{{0, 0}},
good: []scoreTuple{{0, 0}},
worst: []scoreTuple{{0, 0}},
bestChosen: scoreTuple{0, 0},
goodChosen: scoreTuple{0, 0},
worstChosen: scoreTuple{0, 0},
},
{
candidates: []scoreTuple{{0, 0}, {0, 1}},
best: []scoreTuple{{0, 0}, {0, 1}},
good: []scoreTuple{{0, 0}, {0, 1}},
worst: []scoreTuple{{0, 0}, {0, 1}},
bestChosen: scoreTuple{0, 0},
goodChosen: scoreTuple{0, 1},
worstChosen: scoreTuple{0, 1},
},
{
candidates: []scoreTuple{{0, 0}, {0, 1}, {0, 2}},
best: []scoreTuple{{0, 0}, {0, 1}, {0, 2}},
good: []scoreTuple{{0, 0}, {0, 1}, {0, 2}},
worst: []scoreTuple{{0, 0}, {0, 1}, {0, 2}},
bestChosen: scoreTuple{0, 1},
worstChosen: scoreTuple{0, 2},
bestChosen: scoreTuple{0, 0},
goodChosen: scoreTuple{0, 0},
worstChosen: scoreTuple{0, 1},
},
{
candidates: []scoreTuple{{1, 0}, {0, 1}},
best: []scoreTuple{{1, 0}},
good: []scoreTuple{{1, 0}},
worst: []scoreTuple{{0, 1}},
bestChosen: scoreTuple{1, 0},
goodChosen: scoreTuple{1, 0},
worstChosen: scoreTuple{0, 1},
},
{
candidates: []scoreTuple{{1, 0}, {0, 1}, {0, 2}},
best: []scoreTuple{{1, 0}},
good: []scoreTuple{{1, 0}},
worst: []scoreTuple{{0, 1}, {0, 2}},
bestChosen: scoreTuple{1, 0},
goodChosen: scoreTuple{1, 0},
worstChosen: scoreTuple{0, 2},
},
{
candidates: []scoreTuple{{1, 0}, {1, 1}, {0, 2}},
best: []scoreTuple{{1, 0}, {1, 1}},
good: []scoreTuple{{1, 0}, {1, 1}},
worst: []scoreTuple{{0, 2}},
bestChosen: scoreTuple{1, 0},
goodChosen: scoreTuple{1, 1},
worstChosen: scoreTuple{0, 2},
},
{
candidates: []scoreTuple{{1, 0}, {1, 1}, {0, 2}, {0, 3}},
best: []scoreTuple{{1, 0}, {1, 1}},
good: []scoreTuple{{1, 0}, {1, 1}},
worst: []scoreTuple{{0, 2}, {0, 3}},
bestChosen: scoreTuple{1, 0},
goodChosen: scoreTuple{1, 0},
worstChosen: scoreTuple{0, 3},
},
}
Expand All @@ -218,6 +233,11 @@ func TestCandidateSelection(t *testing.T) {
t.Errorf("expected:%s actual:%s diff:%v", formatter(e), formatter(a), pretty.Diff(e, a))
}
})
t.Run(fmt.Sprintf("good-%s", formatter(cl)), func(t *testing.T) {
if a, e := cl.good(), genCandidates(tc.good, 1); !reflect.DeepEqual(a, e) {
t.Errorf("expected:%s actual:%s diff:%v", formatter(e), formatter(a), pretty.Diff(e, a))
}
})
t.Run(fmt.Sprintf("worst-%s", formatter(cl)), func(t *testing.T) {
// Shifting the ids is required to match the end of the list.
if a, e := cl.worst(), genCandidates(
Expand All @@ -237,6 +257,16 @@ func TestCandidateSelection(t *testing.T) {
t.Errorf("expected:%v actual:%v", tc.bestChosen, actual)
}
})
t.Run(fmt.Sprintf("select-good-%s", formatter(cl)), func(t *testing.T) {
good := cl.selectGood(allocRand)
if good == nil {
t.Fatalf("no 'good' candidate found")
}
actual := scoreTuple{int(good.diversityScore + 0.5), good.rangeCount}
if actual != tc.goodChosen {
t.Errorf("expected:%v actual:%v", tc.goodChosen, actual)
}
})
t.Run(fmt.Sprintf("select-worst-%s", formatter(cl)), func(t *testing.T) {
worst := cl.selectWorst(allocRand)
if worst == nil {
Expand Down
Loading

0 comments on commit 3ce0fd5

Please sign in to comment.