Skip to content

Commit

Permalink
kvserver: fix bug causing spurious rebalance attempts by the store re…
Browse files Browse the repository at this point in the history
…balancer

When rebalancing a range, the store rebalancer also tries to move the lease to
the voter with the lowest QPS. However, previously, the store rebalancer was
doing this even when it found no better replacement targets for the existing
stores. This meant that we were redundantly transferring leases over to the
coldest voter, even when we weren't rebalancing the range. This was likely
contributing to the thrashing observed in
#69817.

Release note (bug fix): A bug that could previously cause redundant lease
transfers has now been fixed.
  • Loading branch information
aayushshah15 committed Jan 12, 2022
1 parent bbd8658 commit c2f7bd6
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 8 deletions.
24 changes: 21 additions & 3 deletions pkg/kv/kvserver/store_rebalancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,15 +582,27 @@ func (sr *StoreRebalancer) chooseRangeToRebalance(
log.VEventf(ctx, 3, "considering replica rebalance for r%d with %.2f qps",
replWithStats.repl.GetRangeID(), replWithStats.qps)

targetVoterRepls, targetNonVoterRepls := sr.getRebalanceTargetsBasedOnQPS(
targetVoterRepls, targetNonVoterRepls, foundRebalance := sr.getRebalanceTargetsBasedOnQPS(
ctx,
rebalanceCtx,
options,
)

if !foundRebalance {
// Bail if there are no stores that are better for the existing replicas.
// If the range needs a lease transfer to enable better load distribution,
// it will be handled by the logic in `chooseLeaseToTransfer()`.
log.VEventf(ctx, 3, "could not find rebalance opportunities for r%d", replWithStats.repl.RangeID)
continue
}

storeDescMap := storeListToMap(allStoresList)

// Pick the voter with the least QPS to be leaseholder;
// RelocateRange transfers the lease to the first provided target.
//
// TODO(aayush): Does this logic need to exist? This logic does not take
// lease preferences into account. So it is already broken in a way.
newLeaseIdx := 0
newLeaseQPS := math.MaxFloat64
var raftStatus *raft.Status
Expand Down Expand Up @@ -625,7 +637,7 @@ func (sr *StoreRebalancer) chooseRangeToRebalance(
// the stores in this cluster.
func (sr *StoreRebalancer) getRebalanceTargetsBasedOnQPS(
ctx context.Context, rbCtx rangeRebalanceContext, options scorerOptions,
) (finalVoterTargets, finalNonVoterTargets []roachpb.ReplicaDescriptor) {
) (finalVoterTargets, finalNonVoterTargets []roachpb.ReplicaDescriptor, foundRebalance bool) {
finalVoterTargets = rbCtx.rangeDesc.Replicas().VoterDescriptors()
finalNonVoterTargets = rbCtx.rangeDesc.Replicas().NonVoterDescriptors()

Expand All @@ -652,6 +664,9 @@ func (sr *StoreRebalancer) getRebalanceTargetsBasedOnQPS(
rbCtx.rangeDesc.RangeID,
)
break
} else {
// Record the fact that we found at least one rebalance opportunity.
foundRebalance = true
}
log.VEventf(
ctx,
Expand Down Expand Up @@ -712,6 +727,9 @@ func (sr *StoreRebalancer) getRebalanceTargetsBasedOnQPS(
rbCtx.rangeDesc.RangeID,
)
break
} else {
// Record the fact that we found at least one rebalance opportunity.
foundRebalance = true
}
log.VEventf(
ctx,
Expand All @@ -737,7 +755,7 @@ func (sr *StoreRebalancer) getRebalanceTargetsBasedOnQPS(
// Pretend that we've executed upon this rebalancing decision.
finalNonVoterTargets = newNonVoters
}
return finalVoterTargets, finalNonVoterTargets
return finalVoterTargets, finalNonVoterTargets, foundRebalance
}

func storeListToMap(sl StoreList) map[roachpb.StoreID]*roachpb.StoreDescriptor {
Expand Down
62 changes: 57 additions & 5 deletions pkg/kv/kvserver/store_rebalancer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/stretchr/testify/require"
raft "go.etcd.io/etcd/raft/v3"
"go.etcd.io/etcd/raft/v3"
"go.etcd.io/etcd/raft/v3/tracker"
)

Expand Down Expand Up @@ -242,9 +243,10 @@ type testRange struct {

func loadRanges(rr *replicaRankings, s *Store, ranges []testRange) {
acc := rr.newAccumulator()
for _, r := range ranges {
repl := &Replica{store: s}
repl.mu.state.Desc = &roachpb.RangeDescriptor{}
for i, r := range ranges {
rangeID := roachpb.RangeID(i + 1)
repl := &Replica{store: s, RangeID: rangeID}
repl.mu.state.Desc = &roachpb.RangeDescriptor{RangeID: rangeID}
repl.mu.conf = s.cfg.DefaultSpanConfig
for _, storeID := range r.voters {
repl.mu.state.Desc.InternalReplicas = append(repl.mu.state.Desc.InternalReplicas, roachpb.ReplicaDescriptor{
Expand Down Expand Up @@ -706,7 +708,7 @@ func TestChooseRangeToRebalanceAcrossHeterogeneousZones(t *testing.T) {
name: "no rebalance",
voters: []roachpb.StoreID{3, 6, 9},
constraints: oneReplicaPerRegion,
expRebalancedVoters: []roachpb.StoreID{9, 6, 3},
expRebalancedVoters: []roachpb.StoreID{},
},
// A replica is in a heavily loaded region, on a relatively heavily loaded
// store. We expect it to be moved to a less busy store within the same
Expand Down Expand Up @@ -878,6 +880,56 @@ func TestChooseRangeToRebalanceAcrossHeterogeneousZones(t *testing.T) {
}
}

// TestChooseRangeToRebalanceIgnoresRangeOnBestStores tests that the store
// rebalancer does not attempt to rebalance ranges unless it finds a better set
// of target stores for it compared to its existing stores.
func TestChooseRangeToRebalanceIgnoresRangeOnBestStores(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx, finishAndGetRecording := tracing.ContextWithRecordingSpan(
context.Background(), tracing.NewTracer(), "test",
)
stopper := stop.NewStopper()
defer stopper.Stop(ctx)

stopper, g, _, a, _ := createTestAllocatorWithKnobs(ctx,
10,
false, /* deterministic */
&AllocatorTestingKnobs{AllowLeaseTransfersToReplicasNeedingSnapshots: true},
)
defer stopper.Stop(context.Background())
storeList, _, _ := a.storePool.getStoreList(storeFilterThrottled)

localDesc := *noLocalityStores[len(noLocalityStores)-1]
cfg := TestStoreConfig(nil)
cfg.Gossip = g
cfg.StorePool = a.storePool
cfg.DefaultSpanConfig.NumVoters = 1
cfg.DefaultSpanConfig.NumReplicas = 1
s := createTestStoreWithoutStart(ctx, t, stopper, testStoreOpts{createSystemRanges: true}, &cfg)
gossiputil.NewStoreGossiper(cfg.Gossip).GossipStores(noLocalityStores, t)
s.Ident = &roachpb.StoreIdent{StoreID: localDesc.StoreID}
rq := newReplicateQueue(s, a)
rr := newReplicaRankings()

sr := NewStoreRebalancer(cfg.AmbientCtx, cfg.Settings, rq, rr)

// Load a fake hot range that's already on the best stores. We want to ensure
// that the store rebalancer doesn't attempt to rebalance ranges that it
// cannot find better rebalance opportunities for.
loadRanges(rr, s, []testRange{{voters: []roachpb.StoreID{localDesc.StoreID}, qps: 100}})
hottestRanges := rr.topQPS()
sr.chooseRangeToRebalance(
ctx, &hottestRanges, &localDesc, storeList, qpsScorerOptions{qpsRebalanceThreshold: 0.05},
)
trace := finishAndGetRecording()
require.Regexpf(
t, "could not find.*opportunities for r1",
trace, "expected the store rebalancer to explicitly ignore r1; but found %s", trace,
)
}

func TestNoLeaseTransferToBehindReplicas(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
Expand Down

0 comments on commit c2f7bd6

Please sign in to comment.