From ec7158e3d80b882e99275c508de4f017a10b53d1 Mon Sep 17 00:00:00 2001 From: Bram Gruneir Date: Mon, 21 Nov 2016 13:42:45 -0500 Subject: [PATCH] storage: collection of fixes and updates for the rules solver - Splits the scores returned in the rule solver into a constraint and balance scores. - Add a valid field to constraints and add it to all rules. - Solve now returns all candidates instead of just the valid ones. To get only the valid candidates, the new function onlyValid and new type condidateList have also been added. - This allows us to use solve for removeTarget. It also cleans up the logic in removeTarget to more closely match the non-rule solver version. - Split the capcity rules into two rules. They were performing two different operations and didnt' make sense being combined. This will also ease the change of converting the rules to basic functions. Part of #10275 --- pkg/storage/allocator.go | 107 ++++++-------------- pkg/storage/rule_solver.go | 144 +++++++++++++++++---------- pkg/storage/rule_solver_test.go | 170 ++++++++++++++++++++++++-------- 3 files changed, 248 insertions(+), 173 deletions(-) diff --git a/pkg/storage/allocator.go b/pkg/storage/allocator.go index 31dcbf4ca9c7..0463013f3c97 100644 --- a/pkg/storage/allocator.go +++ b/pkg/storage/allocator.go @@ -220,16 +220,13 @@ func (a *Allocator) AllocateTarget( return nil, errors.Errorf("%d matching stores are currently throttled", throttledStoreCount) } - candidates, err := allocateRuleSolver.Solve( + candidates := allocateRuleSolver.Solve( sl, constraints, existing, a.storePool.getNodeLocalities(existing), ) - if err != nil { - return nil, err - } - + candidates = candidates.onlyValid() if len(candidates) == 0 { return nil, &allocatorError{ required: constraints.Constraints, @@ -289,52 +286,6 @@ func (a Allocator) RemoveTarget( return roachpb.ReplicaDescriptor{}, errors.Errorf("must supply at least one replica to allocator.RemoveTarget()") } - if a.options.UseRuleSolver { - // TODO(bram): #10275 Is this getStoreList call required? Compute candidate - // requires a store list, but we should be able to create one using only - // the stores that belong to the range. - // Use an invalid range ID as we don't care about a corrupt replicas since - // as we are removing a replica and not trying to add one. - sl, _, _ := a.storePool.getStoreList(roachpb.RangeID(0)) - - var worst roachpb.ReplicaDescriptor - worstScore := math.Inf(0) - for _, exist := range existing { - if exist.StoreID == leaseStoreID { - continue - } - desc, ok := a.storePool.getStoreDescriptor(exist.StoreID) - if !ok { - continue - } - - candidate, valid := removeRuleSolver.computeCandidate(solveState{ - constraints: constraints, - store: desc, - sl: sl, - existing: nil, - existingNodeLocalities: a.storePool.getNodeLocalities(existing), - }) - // When a candidate is not valid, it means that it can be - // considered the worst existing replica. - if !valid { - return exist, nil - } - - if candidate.score < worstScore { - worstScore = candidate.score - worst = exist - } - - } - - if !math.IsInf(worstScore, 0) { - return worst, nil - } - - return roachpb.ReplicaDescriptor{}, errors.New("could not select an appropriate replica to be removed") - } - // Retrieve store descriptors for the provided replicas from the StorePool. descriptors := make([]roachpb.StoreDescriptor, 0, len(existing)) for _, exist := range existing { @@ -345,15 +296,37 @@ func (a Allocator) RemoveTarget( descriptors = append(descriptors, desc) } } - sl := makeStoreList(descriptors) - if bad := a.selectBad(sl); bad != nil { + var badStoreID roachpb.StoreID + + if a.options.UseRuleSolver { + candidates := removeRuleSolver.Solve( + sl, + constraints, + existing, + a.storePool.getNodeLocalities(existing), + ) + + if len(candidates) != 0 { + // TODO(bram): There needs some randomness here and the logic from + // selectBad around rebalanceFromConvergesOnMean. + badStoreID = candidates[len(candidates)-1].store.StoreID + } + } else { + bad := a.selectBad(sl) + if bad != nil { + badStoreID = bad.StoreID + } + } + + if badStoreID != 0 { for _, exist := range existing { - if exist.StoreID == bad.StoreID { + if exist.StoreID == badStoreID { return exist, nil } } } + return roachpb.ReplicaDescriptor{}, errors.New("could not select an appropriate replica to be removed") } @@ -430,31 +403,13 @@ func (a Allocator) RebalanceTarget( existingStoreList := makeStoreList(existingDescs) candidateStoreList := makeStoreList(candidateDescs) - existingCandidates, err := removeRuleSolver.Solve(existingStoreList, constraints, nil, nil) - if err != nil { - return nil, err - } - candidates, err := allocateRuleSolver.Solve(candidateStoreList, constraints, nil, nil) - if err != nil { - return nil, err - } - - // Find all candidates that are better than the worst existing store. - var worstCandidateStore float64 - // If any store from existing is not included in existingCandidates, it - // is because it no longer meets the constraints. If so, its score is - // considered to be 0. - if len(existingCandidates) == len(existing) { - worstCandidateStore = existingCandidates[len(existingCandidates)-1].score - } + existingCandidates := removeRuleSolver.Solve(existingStoreList, constraints, nil, nil) + candidates := allocateRuleSolver.Solve(candidateStoreList, constraints, nil, nil) // TODO(bram): #10275 Need some randomness here! - for _, cand := range candidates { - if cand.score > worstCandidateStore { - return &candidates[0].store, nil - } + if existingCandidates[len(existingCandidates)-1].less(candidates[0]) { + return &candidates[0].store, nil } - return nil, nil } diff --git a/pkg/storage/rule_solver.go b/pkg/storage/rule_solver.go index 62f5a4ede9e7..747d1585ad61 100644 --- a/pkg/storage/rule_solver.go +++ b/pkg/storage/rule_solver.go @@ -23,21 +23,43 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" ) -const ( - ruleDiversityWeight = 0.1 - ruleCapacityWeight = 0.01 -) - // candidate store for allocation. type candidate struct { - store roachpb.StoreDescriptor - score float64 + store roachpb.StoreDescriptor + valid bool + constraint float64 // Score used to pick the top candidates. + balance float64 // Score used to choose between top candidates. +} + +// less first compares constraint scores, then balance scores. +func (c candidate) less(o candidate) bool { + if !o.valid { + return false + } + if !c.valid { + return true + } + if c.constraint != o.constraint { + return c.constraint < o.constraint + } + return c.balance < o.balance +} + +type candidateList []candidate + +// onlyValid returns all the elements in a sorted candidate list that are valid. +func (cl candidateList) onlyValid() candidateList { + for i := len(cl) - 1; i >= 0; i-- { + if cl[i].valid { + return cl[:i+1] + } + } + return candidateList{} } // solveState is used to pass solution state information into a rule. type solveState struct { constraints config.Constraints - store roachpb.StoreDescriptor sl StoreList existing []roachpb.ReplicaDescriptor existingNodeLocalities map[roachpb.NodeID]roachpb.Locality @@ -47,7 +69,7 @@ type solveState struct { // disqualify a store. The store in solveState can be disqualified by // returning false. Unless disqualified, the higher the returned score, the // more likely the store will be picked as a candidate. -type rule func(state solveState) (float64, bool) +type rule func(roachpb.StoreDescriptor, solveState) (bool, float64, float64) // ruleSolver is used to test a collection of rules against stores. type ruleSolver []rule @@ -56,26 +78,28 @@ type ruleSolver []rule var allocateRuleSolver = ruleSolver{ ruleReplicasUniqueNodes, ruleConstraints, - ruleCapacity, + ruleCapacityMax, ruleDiversity, + ruleCapacity, } // removeRuleSolver is the set of rules used for removing existing replicas. var removeRuleSolver = ruleSolver{ ruleConstraints, - ruleCapacity, + ruleCapacityMax, ruleDiversity, + ruleCapacity, } // Solve runs the rules against the stores in the store list and returns all -// passing stores and their scores ordered from best to worst score. +// candidate stores and their scores ordered from best to worst score. func (rs ruleSolver) Solve( sl StoreList, c config.Constraints, existing []roachpb.ReplicaDescriptor, existingNodeLocalities map[roachpb.NodeID]roachpb.Locality, -) ([]candidate, error) { - candidates := make([]candidate, 0, len(sl.stores)) +) candidateList { + candidates := make(candidateList, len(sl.stores), len(sl.stores)) state := solveState{ constraints: c, sl: sl, @@ -83,39 +107,47 @@ func (rs ruleSolver) Solve( existingNodeLocalities: existingNodeLocalities, } - for _, store := range sl.stores { - state.store = store - if cand, ok := rs.computeCandidate(state); ok { - candidates = append(candidates, cand) - } + for i, store := range sl.stores { + candidates[i] = rs.computeCandidate(store, state) } sort.Sort(sort.Reverse(byScore(candidates))) - return candidates, nil + return candidates } // computeCandidate runs the rules against a candidate store using the provided // state and returns each candidate's score and if the candidate is valid. -func (rs ruleSolver) computeCandidate(state solveState) (candidate, bool) { - var totalScore float64 +func (rs ruleSolver) computeCandidate(store roachpb.StoreDescriptor, state solveState) candidate { + var totalConstraintScore, totalBalanceScore float64 for _, rule := range rs { - score, valid := rule(state) + valid, constraintScore, balanceScore := rule(store, state) if !valid { - return candidate{}, false + return candidate{ + store: store, + valid: false, + } } - totalScore += score + totalConstraintScore += constraintScore + totalBalanceScore += balanceScore + } + return candidate{ + store: store, + valid: true, + constraint: totalConstraintScore, + balance: totalBalanceScore, } - return candidate{store: state.store, score: totalScore}, true } // ruleReplicasUniqueNodes returns true iff no existing replica is present on -// the candidate's node. -func ruleReplicasUniqueNodes(state solveState) (float64, bool) { +// the candidate's node. All other scores are always 0. +func ruleReplicasUniqueNodes( + store roachpb.StoreDescriptor, state solveState, +) (bool, float64, float64) { for _, r := range state.existing { - if r.NodeID == state.store.Node.NodeID { - return 0, false + if r.NodeID == store.Node.NodeID { + return false, 0, 0 } } - return 0, true + return true, 0, 0 } // storeHasConstraint returns whether a store descriptor attributes or locality @@ -142,56 +174,60 @@ func storeHasConstraint(store roachpb.StoreDescriptor, c config.Constraint) bool // ruleConstraints returns true iff all required and prohibited constraints are // satisfied. Stores with attributes or localities that match the most positive // constraints return higher scores. -func ruleConstraints(state solveState) (float64, bool) { +func ruleConstraints(store roachpb.StoreDescriptor, state solveState) (bool, float64, float64) { if len(state.constraints.Constraints) == 0 { - return 0, true + return true, 0, 0 } matched := 0 for _, c := range state.constraints.Constraints { - hasConstraint := storeHasConstraint(state.store, c) + hasConstraint := storeHasConstraint(store, c) switch { - case c.Type == config.Constraint_POSITIVE && hasConstraint: - matched++ case c.Type == config.Constraint_REQUIRED && !hasConstraint: - return 0, false + return false, 0, 0 case c.Type == config.Constraint_PROHIBITED && hasConstraint: - return 0, false + return false, 0, 0 + case (c.Type == config.Constraint_POSITIVE && hasConstraint) || + (c.Type == config.Constraint_REQUIRED && hasConstraint) || + (c.Type == config.Constraint_PROHIBITED && !hasConstraint): + matched++ } } - return float64(matched) / float64(len(state.constraints.Constraints)), true + return true, float64(matched) / float64(len(state.constraints.Constraints)), 0 } // ruleDiversity returns higher scores for stores with the fewest locality tiers // in common with already existing replicas. It always returns true. -func ruleDiversity(state solveState) (float64, bool) { +func ruleDiversity(store roachpb.StoreDescriptor, state solveState) (bool, float64, float64) { minScore := 1.0 for _, locality := range state.existingNodeLocalities { - if newScore := state.store.Node.Locality.DiversityScore(locality); newScore < minScore { + if newScore := store.Node.Locality.DiversityScore(locality); newScore < minScore { minScore = newScore } } - return minScore * ruleDiversityWeight, true + return true, minScore, 0 } -// ruleCapacity returns true iff a new replica won't overfill the store. The -// score returned is inversely proportional to the number of ranges on the -// candidate store, with the most empty nodes having the highest scores. -// TODO(bram): consider splitting this into two rules. -func ruleCapacity(state solveState) (float64, bool) { - // Don't overfill stores. - if state.store.Capacity.FractionUsed() > maxFractionUsedThreshold { - return 0, false - } +// ruleCapacity returns a balance score that is inversely proportional to the +// number of ranges on the candidate store such that the most empty store will +// have the highest scores. Scores are always between 0 and 1. +func ruleCapacity(store roachpb.StoreDescriptor, state solveState) (bool, float64, float64) { + return true, 0, 1 / float64(store.Capacity.RangeCount+1) +} - return ruleCapacityWeight / float64(state.store.Capacity.RangeCount+1), true +// ruleCapacityMax ensures that we don't try to overfill a store. +func ruleCapacityMax(store roachpb.StoreDescriptor, state solveState) (bool, float64, float64) { + if store.Capacity.FractionUsed() > maxFractionUsedThreshold { + return false, 0, 0 + } + return true, 0, 0 } // byScore implements sort.Interface to sort by scores. -type byScore []candidate +type byScore candidateList var _ sort.Interface = byScore(nil) func (c byScore) Len() int { return len(c) } -func (c byScore) Less(i, j int) bool { return c[i].score < c[j].score } +func (c byScore) Less(i, j int) bool { return c[i].less(c[j]) } func (c byScore) Swap(i, j int) { c[i], c[j] = c[j], c[i] } diff --git a/pkg/storage/rule_solver_test.go b/pkg/storage/rule_solver_test.go index 9f368c026400..32db4d8b5348 100644 --- a/pkg/storage/rule_solver_test.go +++ b/pkg/storage/rule_solver_test.go @@ -17,6 +17,8 @@ package storage import ( + "bytes" + "fmt" "sort" "testing" @@ -25,17 +27,37 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" ) -type byScoreAndID []candidate +type byScoreAndID candidateList func (c byScoreAndID) Len() int { return len(c) } func (c byScoreAndID) Less(i, j int) bool { - if c[i].score == c[j].score { + if c[i].constraint == c[j].constraint && + c[i].balance == c[j].balance && + c[i].valid == c[j].valid { return c[i].store.StoreID < c[j].store.StoreID } - return c[i].score > c[j].score + return c[i].less(c[j]) } func (c byScoreAndID) Swap(i, j int) { c[i], c[j] = c[j], c[i] } +func (c candidate) String() string { + return fmt.Sprintf("StoreID:%d, valid:%t, con:%.2f, bal:%.2f", + c.store.StoreID, c.valid, c.constraint, c.balance) +} + +func (cl candidateList) String() string { + var buffer bytes.Buffer + buffer.WriteRune('[') + for i, c := range cl { + if i != 0 { + buffer.WriteString("; ") + } + buffer.WriteString(c.String()) + } + buffer.WriteRune(']') + return buffer.String() +} + // TODO(bram): This test suite is not even close to exhaustive. The scores are // not checked and each rule should have many more test cases. Also add a // corrupt replica test and remove the 0 range ID used when calling @@ -111,29 +133,31 @@ func TestRuleSolver(t *testing.T) { storePool.mu.Unlock() testCases := []struct { - name string - rule rule - c config.Constraints - existing []roachpb.ReplicaDescriptor - expected []roachpb.StoreID + name string + rule rule + c config.Constraints + existing []roachpb.ReplicaDescriptor + expectedValid []roachpb.StoreID + expectedInvalid []roachpb.StoreID }{ { - name: "no constraints or rules", - expected: []roachpb.StoreID{storeUSa15, storeUSa1, storeUSb, storeEurope}, + name: "no constraints or rules", + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { name: "white list rule", - rule: func(state solveState) (float64, bool) { - switch state.store.StoreID { + rule: func(store roachpb.StoreDescriptor, _ solveState) (bool, float64, float64) { + switch store.StoreID { case storeUSa15: - return 0, true + return true, 0, 0 case storeUSb: - return 1, true + return true, 1, 0 default: - return 0, false + return false, 0, 0 } }, - expected: []roachpb.StoreID{storeUSb, storeUSa15}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa15}, + expectedInvalid: []roachpb.StoreID{storeEurope, storeUSa1}, }, { name: "ruleReplicasUniqueNodes - 2 available nodes", @@ -142,7 +166,8 @@ func TestRuleSolver(t *testing.T) { {NodeID: roachpb.NodeID(storeUSa15)}, {NodeID: roachpb.NodeID(storeUSb)}, }, - expected: []roachpb.StoreID{storeUSa1, storeEurope}, + expectedValid: []roachpb.StoreID{storeEurope, storeUSa1}, + expectedInvalid: []roachpb.StoreID{storeUSb, storeUSa15}, }, { name: "ruleReplicasUniqueNodes - 0 available nodes", @@ -153,7 +178,8 @@ func TestRuleSolver(t *testing.T) { {NodeID: roachpb.NodeID(storeUSb)}, {NodeID: roachpb.NodeID(storeEurope)}, }, - expected: nil, + expectedValid: nil, + expectedInvalid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { name: "ruleConstraints - required constraints", @@ -163,7 +189,8 @@ func TestRuleSolver(t *testing.T) { {Value: "b", Type: config.Constraint_REQUIRED}, }, }, - expected: []roachpb.StoreID{storeUSa1, storeUSb}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa1}, + expectedInvalid: []roachpb.StoreID{storeEurope, storeUSa15}, }, { name: "ruleConstraints - required locality constraints", @@ -173,7 +200,8 @@ func TestRuleSolver(t *testing.T) { {Key: "datacenter", Value: "us", Type: config.Constraint_REQUIRED}, }, }, - expected: []roachpb.StoreID{storeUSa15, storeUSa1, storeUSb}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15}, + expectedInvalid: []roachpb.StoreID{storeEurope}, }, { name: "ruleConstraints - prohibited constraints", @@ -183,7 +211,8 @@ func TestRuleSolver(t *testing.T) { {Value: "b", Type: config.Constraint_PROHIBITED}, }, }, - expected: []roachpb.StoreID{storeUSa15, storeEurope}, + expectedValid: []roachpb.StoreID{storeEurope, storeUSa15}, + expectedInvalid: []roachpb.StoreID{storeUSb, storeUSa1}, }, { name: "ruleConstraints - prohibited locality constraints", @@ -193,7 +222,8 @@ func TestRuleSolver(t *testing.T) { {Key: "datacenter", Value: "us", Type: config.Constraint_PROHIBITED}, }, }, - expected: []roachpb.StoreID{storeEurope}, + expectedValid: []roachpb.StoreID{storeEurope}, + expectedInvalid: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15}, }, { name: "ruleConstraints - positive constraints", @@ -205,7 +235,7 @@ func TestRuleSolver(t *testing.T) { {Value: "c"}, }, }, - expected: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15, storeEurope}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15, storeEurope}, }, { name: "ruleConstraints - positive locality constraints", @@ -215,13 +245,12 @@ func TestRuleSolver(t *testing.T) { {Key: "datacenter", Value: "eur"}, }, }, - expected: []roachpb.StoreID{storeEurope, storeUSa15, storeUSa1, storeUSb}, + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { - name: "ruleDiversity - no existing replicas", - rule: ruleDiversity, - existing: nil, - expected: []roachpb.StoreID{storeUSa15, storeUSa1, storeUSb, storeEurope}, + name: "ruleDiversity - no existing replicas", + rule: ruleDiversity, + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { name: "ruleDiversity - one existing replicas", @@ -229,7 +258,7 @@ func TestRuleSolver(t *testing.T) { existing: []roachpb.ReplicaDescriptor{ {NodeID: roachpb.NodeID(storeUSa15)}, }, - expected: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1, storeUSa15}, }, { name: "ruleDiversity - two existing replicas", @@ -238,12 +267,18 @@ func TestRuleSolver(t *testing.T) { {NodeID: roachpb.NodeID(storeUSa15)}, {NodeID: roachpb.NodeID(storeEurope)}, }, - expected: []roachpb.StoreID{storeUSb, storeUSa1, storeUSa15, storeEurope}, + expectedValid: []roachpb.StoreID{storeUSb, storeUSa1, storeEurope, storeUSa15}, + }, + { + name: "ruleCapacityMax", + rule: ruleCapacityMax, + expectedValid: []roachpb.StoreID{storeEurope, storeUSb, storeUSa1}, + expectedInvalid: []roachpb.StoreID{storeUSa15}, }, { - name: "ruleCapacity", - rule: ruleCapacity, - expected: []roachpb.StoreID{storeUSa1, storeEurope, storeUSb}, + name: "ruleCapacity", + rule: ruleCapacity, + expectedValid: []roachpb.StoreID{storeUSa1, storeEurope, storeUSb, storeUSa15}, }, } @@ -254,25 +289,74 @@ func TestRuleSolver(t *testing.T) { solver = ruleSolver{tc.rule} } sl, _, _ := storePool.getStoreList(roachpb.RangeID(0)) - candidates, err := solver.Solve( + candidates := solver.Solve( sl, tc.c, tc.existing, storePool.getNodeLocalities(tc.existing), ) - if err != nil { - t.Fatal(err) + sort.Sort(sort.Reverse(byScoreAndID(candidates))) + valid := candidates.onlyValid() + invalid := candidates[len(valid):] + + if len(valid) != len(tc.expectedValid) { + t.Fatalf("length of valid %+v should match %+v", valid, tc.expectedValid) + } + for i, expected := range tc.expectedValid { + if actual := valid[i].store.StoreID; actual != expected { + t.Errorf("valid[%d].store.StoreID = %d; not %d; %+v", + i, actual, expected, valid) + } } - sort.Sort(byScoreAndID(candidates)) - if len(candidates) != len(tc.expected) { - t.Fatalf("length of %+v should match %+v", candidates, tc.expected) + if len(invalid) != len(tc.expectedInvalid) { + t.Fatalf("length of invalids %+v should match %+v", invalid, tc.expectedInvalid) } - for i, expected := range tc.expected { - if actual := candidates[i].store.StoreID; actual != expected { - t.Errorf("candidates[%d].store.StoreID = %d; not %d; %+v", - i, actual, expected, candidates) + for i, expected := range tc.expectedInvalid { + if actual := invalid[i].store.StoreID; actual != expected { + t.Errorf("invalid[%d].store.StoreID = %d; not %d; %+v", + i, actual, expected, invalid) } } }) } } + +func TestOnlyValid(t *testing.T) { + defer leaktest.AfterTest(t)() + + testCases := []struct { + valid, invalid int + }{ + {0, 0}, + {1, 0}, + {0, 1}, + {1, 1}, + {2, 0}, + {2, 1}, + {2, 2}, + {1, 2}, + {0, 2}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("%d,%d", tc.valid, tc.invalid), func(t *testing.T) { + var cl candidateList + // Order these in backward to ensure sorting works correctly. + for i := 0; i < tc.invalid; i++ { + cl = append(cl, candidate{}) + } + for i := 0; i < tc.valid; i++ { + cl = append(cl, candidate{valid: true}) + } + sort.Sort(sort.Reverse(byScore(cl))) + + valid := cl.onlyValid() + if a, e := len(valid), tc.valid; a != e { + t.Errorf("expected %d valid, actual %d", e, a) + } + if a, e := len(cl)-len(valid), tc.invalid; a != e { + t.Errorf("expected %d invalid, actual %d", e, a) + } + }) + } +}