From 16d4269ef6194d75c87680ef8860df34fdd2d8db Mon Sep 17 00:00:00 2001
From: Lidor Carmel
Date: Mon, 15 Aug 2022 11:03:43 -0700
Subject: [PATCH] allocator: select a good enough store for decom/recovery

Until now, when decommissioning a node, or when recovering from a dead
node, the allocator tried to pick one of the best possible stores as
the target for the recovery.

Because of that, we sometimes see multiple stores recover replicas to
the same target store, for example when decommissioning a node and
adding a new node at the same time.

This PR changes the way we select a destination store: we now choose a
random store out of all the stores that are "good enough" for the
replica. Replica diversity is still enforced, but we may, for example,
recover a replica to a store that is considered "over full".

Note that during upreplication the allocator will still try to use one
of the "best" stores as the target.

Fixes: 86265

Release note: None

Release justification:
---
 .../allocator/allocatorimpl/allocator.go      |  73 ++++++++++-
 .../allocatorimpl/allocator_scorer.go         |  38 +++++-
 .../allocatorimpl/allocator_scorer_test.go    |  48 ++++++--
 .../allocator/allocatorimpl/allocator_test.go | 116 ++++++++++++------
 pkg/kv/kvserver/allocator_impl_test.go        |   6 +-
 pkg/kv/kvserver/replica_command.go            |   1 +
 pkg/kv/kvserver/replicate_queue.go            |   6 +-
 7 files changed, 232 insertions(+), 56 deletions(-)

diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
index 80b6ecfd3640..4bdeeb58d709 100644
--- a/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
+++ b/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
@@ -88,6 +88,19 @@ var leaseRebalancingAggressiveness = settings.RegisterFloatSetting(
 	settings.NonNegativeFloat,
 )
 
+// enableRecoverToGoodEnoughStores enables recovering replicas to any valid
+// store, instead of a store that has low range count. With this enabled,
+// recovering from a dead node or from a decommissioning node can be faster,
+// because nodes can send replicas to more target stores (instead of multiple
+// nodes sending replicas to a few stores with a low range count).
+var enableRecoverToGoodEnoughStores = settings.RegisterBoolSetting(
+	settings.SystemOnly,
+	"kv.allocator.recover_to_good_enough_stores.enabled",
+	"if true, the allocator may recover replicas to any valid store, otherwise "+
+		"it will pick one of the most ideal stores",
+	true,
+)
+
 // AllocatorAction enumerates the various replication adjustments that may be
 // recommended by the allocator.
 type AllocatorAction int
@@ -851,14 +864,64 @@ type decisionDetails struct {
 	Existing string `json:",omitempty"`
 }
 
+// CandidateSelector is an interface to select a store from a list of
+// candidates.
+type CandidateSelector interface {
+	selectOne(cl candidateList) *candidate
+}
+
+// BestCandidateSelector is used to choose the best store to allocate.
+type BestCandidateSelector struct {
+	randGen allocatorRand
+}
+
+// NewBestCandidateSelector returns a CandidateSelector for choosing the best
+// candidate store.
+func (a *Allocator) NewBestCandidateSelector() CandidateSelector {
+	return &BestCandidateSelector{a.randGen}
+}
+
+func (s *BestCandidateSelector) selectOne(cl candidateList) *candidate {
+	return cl.selectBest(s.randGen)
+}
+
+// GoodCandidateSelector is used to choose a random store out of the stores
+// that are good enough.
+type GoodCandidateSelector struct { + randGen allocatorRand +} + +// NewGoodCandidateSelector returns a CandidateSelector for choosing a random store +// out of the stores that are good enough. +func (a *Allocator) NewGoodCandidateSelector() CandidateSelector { + return &GoodCandidateSelector{a.randGen} +} + +func (s *GoodCandidateSelector) selectOne(cl candidateList) *candidate { + return cl.selectGood(s.randGen) +} + func (a *Allocator) allocateTarget( ctx context.Context, conf roachpb.SpanConfig, existingVoters, existingNonVoters []roachpb.ReplicaDescriptor, + replicaStatus ReplicaStatus, targetType TargetReplicaType, ) (roachpb.ReplicationTarget, string, error) { candidateStoreList, aliveStoreCount, throttled := a.StorePool.GetStoreList(storepool.StoreFilterThrottled) + // If the replica is alive we are upreplicating, and in that case we want to + // allocate new replicas on the best possible store. Otherwise, the replica is + // dead or decommissioned, and we want to recover the missing replica as soon + // as possible, and therefore any store that is good enough will be + // considered. + var selector CandidateSelector + if replicaStatus == Alive || !enableRecoverToGoodEnoughStores.Get(&a.StorePool.St.SV) { + selector = a.NewBestCandidateSelector() + } else { + selector = a.NewGoodCandidateSelector() + } + target, details := a.AllocateTargetFromList( ctx, candidateStoreList, @@ -866,6 +929,7 @@ func (a *Allocator) allocateTarget( existingVoters, existingNonVoters, a.ScorerOptions(ctx), + selector, // When allocating a *new* replica, we explicitly disregard nodes with any // existing replicas. This is important for multi-store scenarios as // otherwise, stores on the nodes that have existing replicas are simply @@ -903,8 +967,9 @@ func (a *Allocator) AllocateVoter( ctx context.Context, conf roachpb.SpanConfig, existingVoters, existingNonVoters []roachpb.ReplicaDescriptor, + replicaStatus ReplicaStatus, ) (roachpb.ReplicationTarget, string, error) { - return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, VoterTarget) + return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, replicaStatus, VoterTarget) } // AllocateNonVoter returns a suitable store for a new allocation of a @@ -914,8 +979,9 @@ func (a *Allocator) AllocateNonVoter( ctx context.Context, conf roachpb.SpanConfig, existingVoters, existingNonVoters []roachpb.ReplicaDescriptor, + replicaStatus ReplicaStatus, ) (roachpb.ReplicationTarget, string, error) { - return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, NonVoterTarget) + return a.allocateTarget(ctx, conf, existingVoters, existingNonVoters, replicaStatus, NonVoterTarget) } // AllocateTargetFromList returns a suitable store for a new allocation of a @@ -927,6 +993,7 @@ func (a *Allocator) AllocateTargetFromList( conf roachpb.SpanConfig, existingVoters, existingNonVoters []roachpb.ReplicaDescriptor, options ScorerOptions, + selector CandidateSelector, allowMultipleReplsPerNode bool, targetType TargetReplicaType, ) (roachpb.ReplicationTarget, string) { @@ -968,7 +1035,7 @@ func (a *Allocator) AllocateTargetFromList( ) log.VEventf(ctx, 3, "allocate %s: %s", targetType, candidates) - if target := candidates.selectGood(a.randGen); target != nil { + if target := selector.selectOne(candidates); target != nil { log.VEventf(ctx, 3, "add target: %s", target) details := decisionDetails{Target: target.compactString()} detailsBytes, err := json.Marshal(details) diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go 
b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go index 2a30899403d7..6df62e184ae3 100644 --- a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go +++ b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go @@ -778,6 +778,23 @@ func (cl candidateList) best() candidateList { return cl } +// good returns all the elements in a sorted (by score reversed) candidate list +// that share the highest diversity score and are valid. +func (cl candidateList) good() candidateList { + cl = cl.onlyValidAndHealthyDisk() + if len(cl) <= 1 { + return cl + } + for i := 1; i < len(cl); i++ { + if cl[i].necessary == cl[0].necessary && + scoresAlmostEqual(cl[i].diversityScore, cl[0].diversityScore) { + continue + } + return cl[:i] + } + return cl +} + // worst returns all the elements in a sorted (by score reversed) candidate list // that share the lowest constraint score (for instance, the set of candidates // that result in the lowest diversity score for the range, or the set of @@ -858,9 +875,26 @@ func (cl candidateList) selectBest(randGen allocatorRand) *candidate { return best } -// selectBad randomly chooses a bad candidate store from a sorted (by score +// selectGood randomly chooses a good candidate store from a sorted (by score // reversed) candidate list using the provided random generator. -func (cl candidateList) selectBad(randGen allocatorRand) *candidate { +func (cl candidateList) selectGood(randGen allocatorRand) *candidate { + cl = cl.good() + if len(cl) == 0 { + return nil + } + if len(cl) == 1 { + return &cl[0] + } + randGen.Lock() + r := randGen.Intn(len(cl)) + randGen.Unlock() + c := &cl[r] + return c +} + +// selectWorst randomly chooses one of the worst candidate stores from a sorted +// (by score reversed) candidate list using the provided random generator. +func (cl candidateList) selectWorst(randGen allocatorRand) *candidate { cl = cl.worst() if len(cl) == 0 { return nil diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go index a1f243ce0422..f0d49c02eb34 100644 --- a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go +++ b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go @@ -27,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/kr/pretty" + "github.com/stretchr/testify/require" ) type storeScore struct { @@ -95,9 +96,8 @@ func TestOnlyValidAndHealthyDisk(t *testing.T) { } } -// TestSelectBestPanic is a basic regression test against a former panic in -// selectBest when called with just invalid/full stores. -func TestSelectBestPanic(t *testing.T) { +// TestNilSelection verifies selection with just invalid/full stores. +func TestNilSelection(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) @@ -107,12 +107,11 @@ func TestSelectBestPanic(t *testing.T) { }, } allocRand := makeAllocatorRand(rand.NewSource(0)) - if good := cl.selectBest(allocRand); good != nil { - t.Errorf("cl.selectBest() got %v, want nil", good) - } + require.Nil(t, cl.selectBest(allocRand)) + require.Nil(t, cl.selectGood(allocRand)) } -// TestCandidateSelection tests select{best,worst} and {best,worst}constraints. +// TestCandidateSelection tests select{Best,Good,Worst} and {best,good,worst}constraints. 
func TestCandidateSelection(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) @@ -155,57 +154,73 @@ func TestCandidateSelection(t *testing.T) { testCases := []struct { candidates []scoreTuple best []scoreTuple + good []scoreTuple worst []scoreTuple bestChosen scoreTuple + goodChosen scoreTuple worstChosen scoreTuple }{ { candidates: []scoreTuple{{0, 0}}, best: []scoreTuple{{0, 0}}, + good: []scoreTuple{{0, 0}}, worst: []scoreTuple{{0, 0}}, bestChosen: scoreTuple{0, 0}, + goodChosen: scoreTuple{0, 0}, worstChosen: scoreTuple{0, 0}, }, { candidates: []scoreTuple{{0, 0}, {0, 1}}, best: []scoreTuple{{0, 0}, {0, 1}}, + good: []scoreTuple{{0, 0}, {0, 1}}, worst: []scoreTuple{{0, 0}, {0, 1}}, bestChosen: scoreTuple{0, 0}, + goodChosen: scoreTuple{0, 1}, worstChosen: scoreTuple{0, 1}, }, { candidates: []scoreTuple{{0, 0}, {0, 1}, {0, 2}}, best: []scoreTuple{{0, 0}, {0, 1}, {0, 2}}, + good: []scoreTuple{{0, 0}, {0, 1}, {0, 2}}, worst: []scoreTuple{{0, 0}, {0, 1}, {0, 2}}, - bestChosen: scoreTuple{0, 1}, - worstChosen: scoreTuple{0, 2}, + bestChosen: scoreTuple{0, 0}, + goodChosen: scoreTuple{0, 0}, + worstChosen: scoreTuple{0, 1}, }, { candidates: []scoreTuple{{1, 0}, {0, 1}}, best: []scoreTuple{{1, 0}}, + good: []scoreTuple{{1, 0}}, worst: []scoreTuple{{0, 1}}, bestChosen: scoreTuple{1, 0}, + goodChosen: scoreTuple{1, 0}, worstChosen: scoreTuple{0, 1}, }, { candidates: []scoreTuple{{1, 0}, {0, 1}, {0, 2}}, best: []scoreTuple{{1, 0}}, + good: []scoreTuple{{1, 0}}, worst: []scoreTuple{{0, 1}, {0, 2}}, bestChosen: scoreTuple{1, 0}, + goodChosen: scoreTuple{1, 0}, worstChosen: scoreTuple{0, 2}, }, { candidates: []scoreTuple{{1, 0}, {1, 1}, {0, 2}}, best: []scoreTuple{{1, 0}, {1, 1}}, + good: []scoreTuple{{1, 0}, {1, 1}}, worst: []scoreTuple{{0, 2}}, bestChosen: scoreTuple{1, 0}, + goodChosen: scoreTuple{1, 1}, worstChosen: scoreTuple{0, 2}, }, { candidates: []scoreTuple{{1, 0}, {1, 1}, {0, 2}, {0, 3}}, best: []scoreTuple{{1, 0}, {1, 1}}, + good: []scoreTuple{{1, 0}, {1, 1}}, worst: []scoreTuple{{0, 2}, {0, 3}}, bestChosen: scoreTuple{1, 0}, + goodChosen: scoreTuple{1, 0}, worstChosen: scoreTuple{0, 3}, }, } @@ -218,6 +233,11 @@ func TestCandidateSelection(t *testing.T) { t.Errorf("expected:%s actual:%s diff:%v", formatter(e), formatter(a), pretty.Diff(e, a)) } }) + t.Run(fmt.Sprintf("good-%s", formatter(cl)), func(t *testing.T) { + if a, e := cl.good(), genCandidates(tc.good, 1); !reflect.DeepEqual(a, e) { + t.Errorf("expected:%s actual:%s diff:%v", formatter(e), formatter(a), pretty.Diff(e, a)) + } + }) t.Run(fmt.Sprintf("worst-%s", formatter(cl)), func(t *testing.T) { // Shifting the ids is required to match the end of the list. 
if a, e := cl.worst(), genCandidates( @@ -237,6 +257,16 @@ func TestCandidateSelection(t *testing.T) { t.Errorf("expected:%v actual:%v", tc.bestChosen, actual) } }) + t.Run(fmt.Sprintf("select-good-%s", formatter(cl)), func(t *testing.T) { + good := cl.selectGood(allocRand) + if good == nil { + t.Fatalf("no 'good' candidate found") + } + actual := scoreTuple{int(good.diversityScore + 0.5), good.rangeCount} + if actual != tc.goodChosen { + t.Errorf("expected:%v actual:%v", tc.goodChosen, actual) + } + }) t.Run(fmt.Sprintf("select-worst-%s", formatter(cl)), func(t *testing.T) { worst := cl.selectWorst(allocRand) if worst == nil { diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go index 8699c700d5bb..4dda6c5e10cd 100644 --- a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go +++ b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go @@ -559,6 +559,7 @@ func TestAllocatorSimpleRetrieval(t *testing.T) { ctx, simpleSpanConfig, nil /* existingVoters */, nil, /* existingNonVoters */ + Dead, ) if err != nil { t.Fatalf("Unable to perform allocation: %+v", err) @@ -579,6 +580,7 @@ func TestAllocatorNoAvailableDisks(t *testing.T) { ctx, simpleSpanConfig, nil /* existingVoters */, nil, /* existingNonVoters */ + Dead, ) if !roachpb.Empty(result) { t.Errorf("expected nil result: %+v", result) @@ -594,64 +596,84 @@ func TestAllocatorReadAmpCheck(t *testing.T) { ctx := context.Background() type testCase struct { - name string - stores []*roachpb.StoreDescriptor - conf roachpb.SpanConfig - expectedAddTarget roachpb.StoreID - enforcement StoreHealthEnforcement + name string + stores []*roachpb.StoreDescriptor + conf roachpb.SpanConfig + // The expected store to add when replicas are alive. The allocator should + // pick one of the best stores, with low range count. + expectedTargetIfAlive roachpb.StoreID + // The expected store to add when a replica is dead or decommissioning. The + // allocator should pick a store that is good enough, ignoring the range + // count. + expectedTargetIfDead roachpb.StoreID + enforcement StoreHealthEnforcement } tests := []testCase{ { - name: "ignore read amp on allocation when StoreHealthNoAction enforcement", + name: "ignore read amp on allocation when StoreHealthNoAction enforcement", + stores: allStoresHighReadAmp, + conf: emptySpanConfig(), // NB: All stores have high read amp, this should be ignored and // allocate to the store with the lowest range count. - stores: allStoresHighReadAmp, - conf: emptySpanConfig(), - expectedAddTarget: roachpb.StoreID(3), - enforcement: StoreHealthNoAction, + expectedTargetIfAlive: roachpb.StoreID(3), + // Recovery of a dead node can pick any valid store, not necessarily the + // one with the lowest range count. + expectedTargetIfDead: roachpb.StoreID(2), + enforcement: StoreHealthNoAction, }, { name: "ignore read amp on allocation when storeHealthLogOnly enforcement", // NB: All stores have high read amp, this should be ignored and // allocate to the store with the lowest range count. - stores: allStoresHighReadAmp, - conf: emptySpanConfig(), - expectedAddTarget: roachpb.StoreID(3), - enforcement: StoreHealthLogOnly, + stores: allStoresHighReadAmp, + conf: emptySpanConfig(), + expectedTargetIfAlive: roachpb.StoreID(3), + // Recovery of a dead node can pick any valid store, not necessarily the + // one with the lowest range count. 
+ expectedTargetIfDead: roachpb.StoreID(2), + enforcement: StoreHealthLogOnly, }, { name: "ignore read amp on allocation when StoreHealthBlockRebalanceTo enforcement", // NB: All stores have high read amp, this should be ignored and // allocate to the store with the lowest range count. - stores: allStoresHighReadAmp, - conf: emptySpanConfig(), - expectedAddTarget: roachpb.StoreID(3), - enforcement: StoreHealthBlockRebalanceTo, + stores: allStoresHighReadAmp, + conf: emptySpanConfig(), + expectedTargetIfAlive: roachpb.StoreID(3), + // Recovery of a dead node can pick any valid store, not necessarily the + // one with the lowest range count. + expectedTargetIfDead: roachpb.StoreID(2), + enforcement: StoreHealthBlockRebalanceTo, }, { name: "don't allocate to stores when all have high read amp and StoreHealthBlockAll", // NB: All stores have high read amp (limit + 1), none are above the watermark, select the lowest range count. - stores: allStoresHighReadAmp, - conf: emptySpanConfig(), - expectedAddTarget: roachpb.StoreID(3), - enforcement: StoreHealthBlockAll, + stores: allStoresHighReadAmp, + conf: emptySpanConfig(), + expectedTargetIfAlive: roachpb.StoreID(3), + // Recovery of a dead node can pick any valid store, not necessarily the + // one with the lowest range count. + expectedTargetIfDead: roachpb.StoreID(2), + enforcement: StoreHealthBlockAll, }, { name: "allocate to store below the mean when all have high read amp and StoreHealthBlockAll", // NB: All stores have high read amp, however store 1 is below the watermark mean read amp. - stores: allStoresHighReadAmpSkewed, - conf: emptySpanConfig(), - expectedAddTarget: roachpb.StoreID(1), - enforcement: StoreHealthBlockAll, + stores: allStoresHighReadAmpSkewed, + conf: emptySpanConfig(), + expectedTargetIfAlive: roachpb.StoreID(1), + expectedTargetIfDead: roachpb.StoreID(1), + enforcement: StoreHealthBlockAll, }, { name: "allocate to lowest range count store without high read amp when StoreHealthBlockAll enforcement", // NB: Store 1, 2 and 3 have high read amp and are above the watermark, the lowest range count (4) // should be selected. - stores: threeStoresHighReadAmpAscRangeCount, - conf: emptySpanConfig(), - expectedAddTarget: roachpb.StoreID(4), - enforcement: StoreHealthBlockAll, + stores: threeStoresHighReadAmpAscRangeCount, + conf: emptySpanConfig(), + expectedTargetIfAlive: roachpb.StoreID(4), + expectedTargetIfDead: roachpb.StoreID(4), + enforcement: StoreHealthBlockAll, }, } @@ -661,22 +683,39 @@ func TestAllocatorReadAmpCheck(t *testing.T) { for i, test := range tests { t.Run(fmt.Sprintf("%d_%s", i+1, test.name), func(t *testing.T) { - stopper, g, _, a, _ := CreateTestAllocator(ctx, 10, false /* deterministic */) + stopper, g, _, a, _ := CreateTestAllocator(ctx, 10, true /* deterministic */) defer stopper.Stop(ctx) sg := gossiputil.NewStoreGossiper(g) sg.GossipStores(test.stores, t) // Enable read disk health checking in candidate exclusion. l0SublevelsThresholdEnforce.Override(ctx, &a.StorePool.St.SV, int64(test.enforcement)) + + // Allocate a voter where all replicas are alive (e.g. up-replicating a valid range). add, _, err := a.AllocateVoter( ctx, test.conf, nil, nil, + Alive, + ) + require.NoError(t, err) + require.Truef(t, + chk(add, test.expectedTargetIfAlive), + "the addition target %+v from AllocateVoter doesn't match expectation", + add) + + // Allocate a voter where we have a dead (or decommissioning) replica. 
+ add, _, err = a.AllocateVoter( + ctx, + test.conf, + nil, + nil, + Dead, // Dead and Decommissioning should behave the same here ) require.NoError(t, err) require.Truef(t, - chk(add, test.expectedAddTarget), + chk(add, test.expectedTargetIfDead), "the addition target %+v from AllocateVoter doesn't match expectation", add) }) @@ -695,6 +734,7 @@ func TestAllocatorTwoDatacenters(t *testing.T) { ctx, multiDCConfigSSD, nil /* existingVoters */, nil, /* existingNonVoters */ + Dead, ) if err != nil { t.Fatalf("Unable to perform allocation: %+v", err) @@ -706,6 +746,7 @@ func TestAllocatorTwoDatacenters(t *testing.T) { NodeID: result1.NodeID, StoreID: result1.StoreID, }}, nil, /* existingNonVoters */ + Dead, ) if err != nil { t.Fatalf("Unable to perform allocation: %+v", err) @@ -729,6 +770,7 @@ func TestAllocatorTwoDatacenters(t *testing.T) { StoreID: result2.StoreID, }, }, nil, /* existingNonVoters */ + Dead, ) if err == nil { t.Errorf("expected error on allocation without available stores: %+v", result3) @@ -762,6 +804,7 @@ func TestAllocatorExistingReplica(t *testing.T) { StoreID: 2, }, }, nil, /* existingNonVoters */ + Dead, ) if err != nil { t.Fatalf("Unable to perform allocation: %+v", err) @@ -865,6 +908,7 @@ func TestAllocatorMultipleStoresPerNode(t *testing.T) { { result, _, err := a.AllocateVoter( ctx, emptySpanConfig(), tc.existing, nil, + Dead, ) if e, a := tc.expectTargetAllocate, !roachpb.Empty(result); e != a { t.Errorf( @@ -2920,7 +2964,7 @@ func TestAllocatorConstraintsAndVoterConstraints(t *testing.T) { // Allocate the voting replica first, before the non-voter. This is the // order in which we'd expect the allocator to repair a given range. See // TestAllocatorComputeAction. - voterTarget, _, err := a.AllocateVoter(ctx, test.conf, test.existingVoters, test.existingNonVoters) + voterTarget, _, err := a.AllocateVoter(ctx, test.conf, test.existingVoters, test.existingNonVoters, Dead) if test.shouldVoterAllocFail { require.Errorf(t, err, "expected voter allocation to fail; got %v as a valid target instead", voterTarget) } else { @@ -2929,7 +2973,7 @@ func TestAllocatorConstraintsAndVoterConstraints(t *testing.T) { test.existingVoters = append(test.existingVoters, replicas(voterTarget.StoreID)...) 
} - nonVoterTarget, _, err := a.AllocateNonVoter(ctx, test.conf, test.existingVoters, test.existingNonVoters) + nonVoterTarget, _, err := a.AllocateNonVoter(ctx, test.conf, test.existingVoters, test.existingNonVoters, Dead) if test.shouldNonVoterAllocFail { require.Errorf(t, err, "expected non-voter allocation to fail; got %v as a valid target instead", nonVoterTarget) } else { @@ -3003,7 +3047,7 @@ func TestAllocatorAllocateTargetLocality(t *testing.T) { StoreID: storeID, } } - targetStore, details, err := a.AllocateVoter(ctx, emptySpanConfig(), existingRepls, nil) + targetStore, details, err := a.AllocateVoter(ctx, emptySpanConfig(), existingRepls, nil, Dead) if err != nil { t.Fatal(err) } @@ -3469,7 +3513,7 @@ func TestAllocatorNonVoterAllocationExcludesVoterNodes(t *testing.T) { sg := gossiputil.NewStoreGossiper(g) sg.GossipStores(test.stores, t) - result, _, err := a.AllocateNonVoter(ctx, test.conf, test.existingVoters, test.existingNonVoters) + result, _, err := a.AllocateNonVoter(ctx, test.conf, test.existingVoters, test.existingNonVoters, Dead) if test.shouldFail { require.Error(t, err) require.Regexp(t, test.expError, err) diff --git a/pkg/kv/kvserver/allocator_impl_test.go b/pkg/kv/kvserver/allocator_impl_test.go index 04d174baa883..ba454a8bcf01 100644 --- a/pkg/kv/kvserver/allocator_impl_test.go +++ b/pkg/kv/kvserver/allocator_impl_test.go @@ -252,14 +252,14 @@ func TestAllocatorThrottled(t *testing.T) { defer stopper.Stop(ctx) // First test to make sure we would send the replica to purgatory. - _, _, err := a.AllocateVoter(ctx, simpleSpanConfig, []roachpb.ReplicaDescriptor{}, nil) + _, _, err := a.AllocateVoter(ctx, simpleSpanConfig, []roachpb.ReplicaDescriptor{}, nil, allocatorimpl.Dead) if _, ok := IsPurgatoryError(err); !ok { t.Fatalf("expected a purgatory error, got: %+v", err) } // Second, test the normal case in which we can allocate to the store. gossiputil.NewStoreGossiper(g).GossipStores(singleStore, t) - result, _, err := a.AllocateVoter(ctx, simpleSpanConfig, []roachpb.ReplicaDescriptor{}, nil) + result, _, err := a.AllocateVoter(ctx, simpleSpanConfig, []roachpb.ReplicaDescriptor{}, nil, allocatorimpl.Dead) if err != nil { t.Fatalf("unable to perform allocation: %+v", err) } @@ -276,7 +276,7 @@ func TestAllocatorThrottled(t *testing.T) { } storeDetail.ThrottledUntil = timeutil.Now().Add(24 * time.Hour) a.StorePool.DetailsMu.Unlock() - _, _, err = a.AllocateVoter(ctx, simpleSpanConfig, []roachpb.ReplicaDescriptor{}, nil) + _, _, err = a.AllocateVoter(ctx, simpleSpanConfig, []roachpb.ReplicaDescriptor{}, nil, allocatorimpl.Dead) if _, ok := IsPurgatoryError(err); ok { t.Fatalf("expected a non purgatory error, got: %+v", err) } diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 9adc86ffc2c0..9c0871719771 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -3227,6 +3227,7 @@ func (r *Replica) relocateOne( existingVoters, existingNonVoters, r.store.allocator.ScorerOptions(ctx), + r.store.allocator.NewBestCandidateSelector(), // NB: Allow the allocator to return target stores that might be on the // same node as an existing replica. This is to ensure that relocations // that require "lateral" movement of replicas within a node can succeed. 
diff --git a/pkg/kv/kvserver/replicate_queue.go b/pkg/kv/kvserver/replicate_queue.go index 0d8123d9a15b..a632f1b4bf3d 100644 --- a/pkg/kv/kvserver/replicate_queue.go +++ b/pkg/kv/kvserver/replicate_queue.go @@ -1011,7 +1011,7 @@ func (rq *replicateQueue) addOrReplaceVoters( // we're removing it (i.e. dead or decommissioning). If we left the replica in // the slice, the allocator would not be guaranteed to pick a replica that // fills the gap removeRepl leaves once it's gone. - newVoter, details, err := rq.allocator.AllocateVoter(ctx, conf, remainingLiveVoters, remainingLiveNonVoters) + newVoter, details, err := rq.allocator.AllocateVoter(ctx, conf, remainingLiveVoters, remainingLiveNonVoters, replicaStatus) if err != nil { return false, err } @@ -1043,7 +1043,7 @@ func (rq *replicateQueue) addOrReplaceVoters( oldPlusNewReplicas, roachpb.ReplicaDescriptor{NodeID: newVoter.NodeID, StoreID: newVoter.StoreID}, ) - _, _, err := rq.allocator.AllocateVoter(ctx, conf, oldPlusNewReplicas, remainingLiveNonVoters) + _, _, err := rq.allocator.AllocateVoter(ctx, conf, oldPlusNewReplicas, remainingLiveNonVoters, replicaStatus) if err != nil { // It does not seem possible to go to the next odd replica state. Note // that AllocateVoter returns an allocatorError (a PurgatoryError) @@ -1126,7 +1126,7 @@ func (rq *replicateQueue) addOrReplaceNonVoters( desc, conf := repl.DescAndSpanConfig() existingNonVoters := desc.Replicas().NonVoterDescriptors() - newNonVoter, details, err := rq.allocator.AllocateNonVoter(ctx, conf, liveVoterReplicas, liveNonVoterReplicas) + newNonVoter, details, err := rq.allocator.AllocateNonVoter(ctx, conf, liveVoterReplicas, liveNonVoterReplicas, replicaStatus) if err != nil { return false, err }
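
Illustrative sketch, not part of the patch: the snippet below is a minimal,
self-contained model of the new "good enough" selection. The candidate struct
and the good/selectGood helpers are simplified stand-ins for the patch's
candidateList.good() and candidateList.selectGood(); the validity and
disk-health filtering, the full scoring, and the locking around the random
generator are omitted, and the comparison epsilon is arbitrary rather than the
real scoresAlmostEqual threshold.

// Toy model of "good enough" recovery target selection.
package main

import (
	"fmt"
	"math"
	"math/rand"
)

type candidate struct {
	store          int
	necessary      bool
	diversityScore float64
	rangeCount     int
}

// good keeps the leading run of an already sorted (by score, descending)
// candidate list that matches the first candidate's necessity and (almost
// equal) diversity score, mirroring candidateList.good() in this patch.
func good(cl []candidate) []candidate {
	if len(cl) <= 1 {
		return cl
	}
	for i := 1; i < len(cl); i++ {
		if cl[i].necessary == cl[0].necessary &&
			math.Abs(cl[i].diversityScore-cl[0].diversityScore) < 1e-10 {
			continue
		}
		return cl[:i]
	}
	return cl
}

// selectGood picks a uniformly random member of the "good enough" prefix.
// This models what recovery from a dead or decommissioning replica now does;
// upreplication still narrows the list further and picks a "best" candidate.
func selectGood(cl []candidate, rng *rand.Rand) *candidate {
	g := good(cl)
	if len(g) == 0 {
		return nil
	}
	return &g[rng.Intn(len(g))]
}

func main() {
	// Already sorted by score: stores 1-3 tie on diversity but differ widely
	// in range count; store 4 would reduce diversity.
	cl := []candidate{
		{store: 1, diversityScore: 1.0, rangeCount: 100},
		{store: 2, diversityScore: 1.0, rangeCount: 500},
		{store: 3, diversityScore: 1.0, rangeCount: 900},
		{store: 4, diversityScore: 0.5, rangeCount: 10},
	}
	rng := rand.New(rand.NewSource(1))
	// Previously the allocator would typically pick store 1 (lowest range
	// count); now any of stores 1-3 can be the recovery target, while store 4
	// stays excluded because it would lower diversity.
	for i := 0; i < 5; i++ {
		fmt.Printf("recovery target: s%d\n", selectGood(cl, rng).store)
	}
}

The practical effect: with the old selectBest-only behavior, recovering many
replicas tends to pile onto the one or two stores with the lowest range count,
whereas selectGood spreads recovery targets across every store that preserves
diversity. Upreplication and relocateOne still go through
NewBestCandidateSelector, and the new recovery behavior can be switched off
with the kv.allocator.recover_to_good_enough_stores.enabled cluster setting.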