diff --git a/pkg/storage/allocator.go b/pkg/storage/allocator.go index 31dcbf4ca9c7..0463013f3c97 100644 --- a/pkg/storage/allocator.go +++ b/pkg/storage/allocator.go @@ -220,16 +220,13 @@ func (a *Allocator) AllocateTarget( return nil, errors.Errorf("%d matching stores are currently throttled", throttledStoreCount) } - candidates, err := allocateRuleSolver.Solve( + candidates := allocateRuleSolver.Solve( sl, constraints, existing, a.storePool.getNodeLocalities(existing), ) - if err != nil { - return nil, err - } - + candidates = candidates.onlyValid() if len(candidates) == 0 { return nil, &allocatorError{ required: constraints.Constraints, @@ -289,52 +286,6 @@ func (a Allocator) RemoveTarget( return roachpb.ReplicaDescriptor{}, errors.Errorf("must supply at least one replica to allocator.RemoveTarget()") } - if a.options.UseRuleSolver { - // TODO(bram): #10275 Is this getStoreList call required? Compute candidate - // requires a store list, but we should be able to create one using only - // the stores that belong to the range. - // Use an invalid range ID as we don't care about a corrupt replicas since - // as we are removing a replica and not trying to add one. - sl, _, _ := a.storePool.getStoreList(roachpb.RangeID(0)) - - var worst roachpb.ReplicaDescriptor - worstScore := math.Inf(0) - for _, exist := range existing { - if exist.StoreID == leaseStoreID { - continue - } - desc, ok := a.storePool.getStoreDescriptor(exist.StoreID) - if !ok { - continue - } - - candidate, valid := removeRuleSolver.computeCandidate(solveState{ - constraints: constraints, - store: desc, - sl: sl, - existing: nil, - existingNodeLocalities: a.storePool.getNodeLocalities(existing), - }) - // When a candidate is not valid, it means that it can be - // considered the worst existing replica. - if !valid { - return exist, nil - } - - if candidate.score < worstScore { - worstScore = candidate.score - worst = exist - } - - } - - if !math.IsInf(worstScore, 0) { - return worst, nil - } - - return roachpb.ReplicaDescriptor{}, errors.New("could not select an appropriate replica to be removed") - } - // Retrieve store descriptors for the provided replicas from the StorePool. descriptors := make([]roachpb.StoreDescriptor, 0, len(existing)) for _, exist := range existing { @@ -345,15 +296,37 @@ func (a Allocator) RemoveTarget( descriptors = append(descriptors, desc) } } - sl := makeStoreList(descriptors) - if bad := a.selectBad(sl); bad != nil { + var badStoreID roachpb.StoreID + + if a.options.UseRuleSolver { + candidates := removeRuleSolver.Solve( + sl, + constraints, + existing, + a.storePool.getNodeLocalities(existing), + ) + + if len(candidates) != 0 { + // TODO(bram): There needs some randomness here and the logic from + // selectBad around rebalanceFromConvergesOnMean. + badStoreID = candidates[len(candidates)-1].store.StoreID + } + } else { + bad := a.selectBad(sl) + if bad != nil { + badStoreID = bad.StoreID + } + } + + if badStoreID != 0 { for _, exist := range existing { - if exist.StoreID == bad.StoreID { + if exist.StoreID == badStoreID { return exist, nil } } } + return roachpb.ReplicaDescriptor{}, errors.New("could not select an appropriate replica to be removed") } @@ -430,31 +403,13 @@ func (a Allocator) RebalanceTarget( existingStoreList := makeStoreList(existingDescs) candidateStoreList := makeStoreList(candidateDescs) - existingCandidates, err := removeRuleSolver.Solve(existingStoreList, constraints, nil, nil) - if err != nil { - return nil, err - } - candidates, err := allocateRuleSolver.Solve(candidateStoreList, constraints, nil, nil) - if err != nil { - return nil, err - } - - // Find all candidates that are better than the worst existing store. - var worstCandidateStore float64 - // If any store from existing is not included in existingCandidates, it - // is because it no longer meets the constraints. If so, its score is - // considered to be 0. - if len(existingCandidates) == len(existing) { - worstCandidateStore = existingCandidates[len(existingCandidates)-1].score - } + existingCandidates := removeRuleSolver.Solve(existingStoreList, constraints, nil, nil) + candidates := allocateRuleSolver.Solve(candidateStoreList, constraints, nil, nil) // TODO(bram): #10275 Need some randomness here! - for _, cand := range candidates { - if cand.score > worstCandidateStore { - return &candidates[0].store, nil - } + if existingCandidates[len(existingCandidates)-1].less(candidates[0]) { + return &candidates[0].store, nil } - return nil, nil } diff --git a/pkg/storage/rule_solver.go b/pkg/storage/rule_solver.go index 62f5a4ede9e7..747d1585ad61 100644 --- a/pkg/storage/rule_solver.go +++ b/pkg/storage/rule_solver.go @@ -23,21 +23,43 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" ) -const ( - ruleDiversityWeight = 0.1 - ruleCapacityWeight = 0.01 -) - // candidate store for allocation. type candidate struct { - store roachpb.StoreDescriptor - score float64 + store roachpb.StoreDescriptor + valid bool + constraint float64 // Score used to pick the top candidates. + balance float64 // Score used to choose between top candidates. +} + +// less first compares constraint scores, then balance scores. +func (c candidate) less(o candidate) bool { + if !o.valid { + return false + } + if !c.valid { + return true + } + if c.constraint != o.constraint { + return c.constraint < o.constraint + } + return c.balance < o.balance +} + +type candidateList []candidate + +// onlyValid returns all the elements in a sorted candidate list that are valid. +func (cl candidateList) onlyValid() candidateList { + for i := len(cl) - 1; i >= 0; i-- { + if cl[i].valid { + return cl[:i+1] + } + } + return candidateList{} } // solveState is used to pass solution state information into a rule. type solveState struct { constraints config.Constraints - store roachpb.StoreDescriptor sl StoreList existing []roachpb.ReplicaDescriptor existingNodeLocalities map[roachpb.NodeID]roachpb.Locality @@ -47,7 +69,7 @@ type solveState struct { // disqualify a store. The store in solveState can be disqualified by // returning false. Unless disqualified, the higher the returned score, the // more likely the store will be picked as a candidate. -type rule func(state solveState) (float64, bool) +type rule func(roachpb.StoreDescriptor, solveState) (bool, float64, float64) // ruleSolver is used to test a collection of rules against stores. type ruleSolver []rule @@ -56,26 +78,28 @@ type ruleSolver []rule var allocateRuleSolver = ruleSolver{ ruleReplicasUniqueNodes, ruleConstraints, - ruleCapacity, + ruleCapacityMax, ruleDiversity, + ruleCapacity, } // removeRuleSolver is the set of rules used for removing existing replicas. var removeRuleSolver = ruleSolver{ ruleConstraints, - ruleCapacity, + ruleCapacityMax, ruleDiversity, + ruleCapacity, } // Solve runs the rules against the stores in the store list and returns all -// passing stores and their scores ordered from best to worst score. +// candidate stores and their scores ordered from best to worst score. func (rs ruleSolver) Solve( sl StoreList, c config.Constraints, existing []roachpb.ReplicaDescriptor, existingNodeLocalities map[roachpb.NodeID]roachpb.Locality, -) ([]candidate, error) { - candidates := make([]candidate, 0, len(sl.stores)) +) candidateList { + candidates := make(candidateList, len(sl.stores), len(sl.stores)) state := solveState{ constraints: c, sl: sl, @@ -83,39 +107,47 @@ func (rs ruleSolver) Solve( existingNodeLocalities: existingNodeLocalities, } - for _, store := range sl.stores { - state.store = store - if cand, ok := rs.computeCandidate(state); ok { - candidates = append(candidates, cand) - } + for i, store := range sl.stores { + candidates[i] = rs.computeCandidate(store, state) } sort.Sort(sort.Reverse(byScore(candidates))) - return candidates, nil + return candidates } // computeCandidate runs the rules against a candidate store using the provided // state and returns each candidate's score and if the candidate is valid. -func (rs ruleSolver) computeCandidate(state solveState) (candidate, bool) { - var totalScore float64 +func (rs ruleSolver) computeCandidate(store roachpb.StoreDescriptor, state solveState) candidate { + var totalConstraintScore, totalBalanceScore float64 for _, rule := range rs { - score, valid := rule(state) + valid, constraintScore, balanceScore := rule(store, state) if !valid { - return candidate{}, false + return candidate{ + store: store, + valid: false, + } } - totalScore += score + totalConstraintScore += constraintScore + totalBalanceScore += balanceScore + } + return candidate{ + store: store, + valid: true, + constraint: totalConstraintScore, + balance: totalBalanceScore, } - return candidate{store: state.store, score: totalScore}, true } // ruleReplicasUniqueNodes returns true iff no existing replica is present on -// the candidate's node. -func ruleReplicasUniqueNodes(state solveState) (float64, bool) { +// the candidate's node. All other scores are always 0. +func ruleReplicasUniqueNodes( + store roachpb.StoreDescriptor, state solveState, +) (bool, float64, float64) { for _, r := range state.existing { - if r.NodeID == state.store.Node.NodeID { - return 0, false + if r.NodeID == store.Node.NodeID { + return false, 0, 0 } } - return 0, true + return true, 0, 0 } // storeHasConstraint returns whether a store descriptor attributes or locality @@ -142,56 +174,60 @@ func storeHasConstraint(store roachpb.StoreDescriptor, c config.Constraint) bool // ruleConstraints returns true iff all required and prohibited constraints are // satisfied. Stores with attributes or localities that match the most positive // constraints return higher scores. -func ruleConstraints(state solveState) (float64, bool) { +func ruleConstraints(store roachpb.StoreDescriptor, state solveState) (bool, float64, float64) { if len(state.constraints.Constraints) == 0 { - return 0, true + return true, 0, 0 } matched := 0 for _, c := range state.constraints.Constraints { - hasConstraint := storeHasConstraint(state.store, c) + hasConstraint := storeHasConstraint(store, c) switch { - case c.Type == config.Constraint_POSITIVE && hasConstraint: - matched++ case c.Type == config.Constraint_REQUIRED && !hasConstraint: - return 0, false + return false, 0, 0 case c.Type == config.Constraint_PROHIBITED && hasConstraint: - return 0, false + return false, 0, 0 + case (c.Type == config.Constraint_POSITIVE && hasConstraint) || + (c.Type == config.Constraint_REQUIRED && hasConstraint) || + (c.Type == config.Constraint_PROHIBITED && !hasConstraint): + matched++ } } - return float64(matched) / float64(len(state.constraints.Constraints)), true + return true, float64(matched) / float64(len(state.constraints.Constraints)), 0 } // ruleDiversity returns higher scores for stores with the fewest locality tiers // in common with already existing replicas. It always returns true. -func ruleDiversity(state solveState) (float64, bool) { +func ruleDiversity(store roachpb.StoreDescriptor, state solveState) (bool, float64, float64) { minScore := 1.0 for _, locality := range state.existingNodeLocalities { - if newScore := state.store.Node.Locality.DiversityScore(locality); newScore < minScore { + if newScore := store.Node.Locality.DiversityScore(locality); newScore < minScore { minScore = newScore } } - return minScore * ruleDiversityWeight, true + return true, minScore, 0 } -// ruleCapacity returns true iff a new replica won't overfill the store. The -// score returned is inversely proportional to the number of ranges on the -// candidate store, with the most empty nodes having the highest scores. -// TODO(bram): consider splitting this into two rules. -func ruleCapacity(state solveState) (float64, bool) { - // Don't overfill stores. - if state.store.Capacity.FractionUsed() > maxFractionUsedThreshold { - return 0, false - } +// ruleCapacity returns a balance score that is inversely proportional to the +// number of ranges on the candidate store such that the most empty store will +// have the highest scores. Scores are always between 0 and 1. +func ruleCapacity(store roachpb.StoreDescriptor, state solveState) (bool, float64, float64) { + return true, 0, 1 / float64(store.Capacity.RangeCount+1) +} - return ruleCapacityWeight / float64(state.store.Capacity.RangeCount+1), true +// ruleCapacityMax ensures that we don't try to overfill a store. +func ruleCapacityMax(store roachpb.StoreDescriptor, state solveState) (bool, float64, float64) { + if store.Capacity.FractionUsed() > maxFractionUsedThreshold { + return false, 0, 0 + } + return true, 0, 0 } // byScore implements sort.Interface to sort by scores. -type byScore []candidate +type byScore candidateList var _ sort.Interface = byScore(nil) func (c byScore) Len() int { return len(c) } -func (c byScore) Less(i, j int) bool { return c[i].score < c[j].score } +func (c byScore) Less(i, j int) bool { return c[i].less(c[j]) } func (c byScore) Swap(i, j int) { c[i], c[j] = c[j], c[i] } diff --git a/pkg/storage/rule_solver_test.go b/pkg/storage/rule_solver_test.go index 9f368c026400..32db4d8b5348 100644 --- a/pkg/storage/rule_solver_test.go +++ b/pkg/storage/rule_solver_test.go @@ -17,6 +17,8 @@ package storage import ( + "bytes" + "fmt" "sort" "testing" @@ -25,17 +27,37 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" ) -type byScoreAndID []candidate +type byScoreAndID candidateList func (c byScoreAndID) Len() int { return len(c) } func (c byScoreAndID) Less(i, j int) bool { - if c[i].score == c[j].score { + if c[i].constraint == c[j].constraint && + c[i].balance == c[j].balance && + c[i].valid == c[j].valid { return c[i].store.StoreID < c[j].store.StoreID } - return c[i].score > c[j].score + return c[i].less(c[j]) } func (c byScoreAndID) Swap(i, j int) { c[i], c[j] = c[j], c[i] } +func (c candidate) String() string { + return fmt.Sprintf("StoreID:%d, valid:%t, con:%.2f, bal:%.2f", + c.store.StoreID, c.valid, c.constraint, c.balance) +} + +func (cl candidateList) String() string { + var buffer bytes.Buffer + buffer.WriteRune('[') + for i, c := range cl { + if i != 0 { + buffer.WriteString("; ") + } + buffer.WriteString(c.String()) + } + buffer.WriteRune(']') + return buffer.String() +} + // TODO(bram): This test suite is not even close to exhaustive. The scores are // not checked and each rule should have many more test cases. Also add a // corrupt replica test and remove the 0 range ID used when calling @@ -111,29 +133,31 @@ func TestRuleSolver(t *testing.T) { storePool.mu.Unlock() testCases := []struct { - name string - rule rule - c config.Constraints - existing []roachpb.ReplicaDescriptor - expected []roachpb.StoreID + name string + rule rule + c config.Constraints + existing []roachpb.ReplicaDescriptor + expectedValid []roachpb.StoreID + expectedInvalid []roachpb.StoreID }{ { - name: "no constraints or rules", - expected: []roachpb.StoreID{storeUSa15, storeUSa1, storeUSb, storeEurope}, + name: "no constraints or rules", + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { name: "white list rule", - rule: func(state solveState) (float64, bool) { - switch state.store.StoreID { + rule: func(store roachpb.StoreDescriptor, _ solveState) (bool, float64, float64) { + switch store.StoreID { case storeUSa15: - return 0, true + return true, 0, 0 case storeUSb: - return 1, true + return true, 1, 0 default: - return 0, false + return false, 0, 0 } }, - expected: []roachpb.StoreID{storeUSb, storeUSa15}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa15}, + expectedInvalid: []roachpb.StoreID{storeEurope, storeUSa1}, }, { name: "ruleReplicasUniqueNodes - 2 available nodes", @@ -142,7 +166,8 @@ func TestRuleSolver(t *testing.T) { {NodeID: roachpb.NodeID(storeUSa15)}, {NodeID: roachpb.NodeID(storeUSb)}, }, - expected: []roachpb.StoreID{storeUSa1, storeEurope}, + expectedValid: []roachpb.StoreID{storeEurope, storeUSa1}, + expectedInvalid: []roachpb.StoreID{storeUSb, storeUSa15}, }, { name: "ruleReplicasUniqueNodes - 0 available nodes", @@ -153,7 +178,8 @@ func TestRuleSolver(t *testing.T) { {NodeID: roachpb.NodeID(storeUSb)}, {NodeID: roachpb.NodeID(storeEurope)}, }, - expected: nil, + expectedValid: nil, + expectedInvalid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { name: "ruleConstraints - required constraints", @@ -163,7 +189,8 @@ func TestRuleSolver(t *testing.T) { {Value: "b", Type: config.Constraint_REQUIRED}, }, }, - expected: []roachpb.StoreID{storeUSa1, storeUSb}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa1}, + expectedInvalid: []roachpb.StoreID{storeEurope, storeUSa15}, }, { name: "ruleConstraints - required locality constraints", @@ -173,7 +200,8 @@ func TestRuleSolver(t *testing.T) { {Key: "datacenter", Value: "us", Type: config.Constraint_REQUIRED}, }, }, - expected: []roachpb.StoreID{storeUSa15, storeUSa1, storeUSb}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15}, + expectedInvalid: []roachpb.StoreID{storeEurope}, }, { name: "ruleConstraints - prohibited constraints", @@ -183,7 +211,8 @@ func TestRuleSolver(t *testing.T) { {Value: "b", Type: config.Constraint_PROHIBITED}, }, }, - expected: []roachpb.StoreID{storeUSa15, storeEurope}, + expectedValid: []roachpb.StoreID{storeEurope, storeUSa15}, + expectedInvalid: []roachpb.StoreID{storeUSb, storeUSa1}, }, { name: "ruleConstraints - prohibited locality constraints", @@ -193,7 +222,8 @@ func TestRuleSolver(t *testing.T) { {Key: "datacenter", Value: "us", Type: config.Constraint_PROHIBITED}, }, }, - expected: []roachpb.StoreID{storeEurope}, + expectedValid: []roachpb.StoreID{storeEurope}, + expectedInvalid: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15}, }, { name: "ruleConstraints - positive constraints", @@ -205,7 +235,7 @@ func TestRuleSolver(t *testing.T) { {Value: "c"}, }, }, - expected: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15, storeEurope}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15, storeEurope}, }, { name: "ruleConstraints - positive locality constraints", @@ -215,13 +245,12 @@ func TestRuleSolver(t *testing.T) { {Key: "datacenter", Value: "eur"}, }, }, - expected: []roachpb.StoreID{storeEurope, storeUSa15, storeUSa1, storeUSb}, + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { - name: "ruleDiversity - no existing replicas", - rule: ruleDiversity, - existing: nil, - expected: []roachpb.StoreID{storeUSa15, storeUSa1, storeUSb, storeEurope}, + name: "ruleDiversity - no existing replicas", + rule: ruleDiversity, + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { name: "ruleDiversity - one existing replicas", @@ -229,7 +258,7 @@ func TestRuleSolver(t *testing.T) { existing: []roachpb.ReplicaDescriptor{ {NodeID: roachpb.NodeID(storeUSa15)}, }, - expected: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { name: "ruleDiversity - two existing replicas", @@ -238,12 +267,18 @@ func TestRuleSolver(t *testing.T) { {NodeID: roachpb.NodeID(storeUSa15)}, {NodeID: roachpb.NodeID(storeEurope)}, }, - expected: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15, storeEurope}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa1, storeEurope, storeUSa15}, + }, + { + name: "ruleCapacityMax", + rule: ruleCapacityMax, + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1}, + expectedInvalid: []roachpb.StoreID{storeUSa15}, }, { - name: "ruleCapacity", - rule: ruleCapacity, - expected: []roachpb.StoreID{storeUSa1, storeEurope, storeUSb}, + name: "ruleCapacity", + rule: ruleCapacity, + expectedValid: []roachpb.StoreID{storeUSa1, storeEurope, storeUSb, storeUSa15}, }, } @@ -254,25 +289,74 @@ func TestRuleSolver(t *testing.T) { solver = ruleSolver{tc.rule} } sl, _, _ := storePool.getStoreList(roachpb.RangeID(0)) - candidates, err := solver.Solve( + candidates := solver.Solve( sl, tc.c, tc.existing, storePool.getNodeLocalities(tc.existing), ) - if err != nil { - t.Fatal(err) + sort.Sort(sort.Reverse(byScoreAndID(candidates))) + valid := candidates.onlyValid() + invalid := candidates[len(valid):] + + if len(valid) != len(tc.expectedValid) { + t.Fatalf("length of valid %+v should match %+v", valid, tc.expectedValid) + } + for i, expected := range tc.expectedValid { + if actual := valid[i].store.StoreID; actual != expected { + t.Errorf("valid[%d].store.StoreID = %d; not %d; %+v", + i, actual, expected, valid) + } } - sort.Sort(byScoreAndID(candidates)) - if len(candidates) != len(tc.expected) { - t.Fatalf("length of %+v should match %+v", candidates, tc.expected) + if len(invalid) != len(tc.expectedInvalid) { + t.Fatalf("length of invalids %+v should match %+v", invalid, tc.expectedInvalid) } - for i, expected := range tc.expected { - if actual := candidates[i].store.StoreID; actual != expected { - t.Errorf("candidates[%d].store.StoreID = %d; not %d; %+v", - i, actual, expected, candidates) + for i, expected := range tc.expectedInvalid { + if actual := invalid[i].store.StoreID; actual != expected { + t.Errorf("invalid[%d].store.StoreID = %d; not %d; %+v", + i, actual, expected, invalid) } } }) } } + +func TestOnlyValid(t *testing.T) { + defer leaktest.AfterTest(t)() + + testCases := []struct { + valid, invalid int + }{ + {0, 0}, + {1, 0}, + {0, 1}, + {1, 1}, + {2, 0}, + {2, 1}, + {2, 2}, + {1, 2}, + {0, 2}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("%d,%d", tc.valid, tc.invalid), func(t *testing.T) { + var cl candidateList + // Order these in backward to ensure sorting works correctly. + for i := 0; i < tc.invalid; i++ { + cl = append(cl, candidate{}) + } + for i := 0; i < tc.valid; i++ { + cl = append(cl, candidate{valid: true}) + } + sort.Sort(sort.Reverse(byScore(cl))) + + valid := cl.onlyValid() + if a, e := len(valid), tc.valid; a != e { + t.Errorf("expected %d valid, actual %d", e, a) + } + if a, e := len(cl)-len(valid), tc.invalid; a != e { + t.Errorf("expected %d invalid, actual %d", e, a) + } + }) + } +}