Skip to content

Commit

Permalink
Merge #74726
Browse files Browse the repository at this point in the history
74726: kvserver: fix bug causing spurious rebalance attempts by the store rebalancer r=aayushshah15 a=aayushshah15

When rebalancing a range, the store rebalancer also tries to move the lease to
the voter with the lowest QPS. However, previously, the store rebalancer was
doing this even when it found no better replacement targets for the existing
stores. This meant that we were redundantly transferring leases over to the
coldest voter, even when we weren't rebalancing the range. This was likely
contributing to the thrashing observed in
#69817.

Pulled out from #72296.

Release note (bug fix): A bug that could previously cause redundant lease
transfers has now been fixed.

Co-authored-by: Aayush Shah <[email protected]>
  • Loading branch information
craig[bot] and aayushshah15 committed Jan 13, 2022
2 parents 59a6200 + c2f7bd6 commit 6b8bfc1
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 8 deletions.
24 changes: 21 additions & 3 deletions pkg/kv/kvserver/store_rebalancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,15 +582,27 @@ func (sr *StoreRebalancer) chooseRangeToRebalance(
log.VEventf(ctx, 3, "considering replica rebalance for r%d with %.2f qps",
replWithStats.repl.GetRangeID(), replWithStats.qps)

targetVoterRepls, targetNonVoterRepls := sr.getRebalanceTargetsBasedOnQPS(
targetVoterRepls, targetNonVoterRepls, foundRebalance := sr.getRebalanceTargetsBasedOnQPS(
ctx,
rebalanceCtx,
options,
)

if !foundRebalance {
// Bail if there are no stores that are better for the existing replicas.
// If the range needs a lease transfer to enable better load distribution,
// it will be handled by the logic in `chooseLeaseToTransfer()`.
log.VEventf(ctx, 3, "could not find rebalance opportunities for r%d", replWithStats.repl.RangeID)
continue
}

storeDescMap := storeListToMap(allStoresList)

// Pick the voter with the least QPS to be leaseholder;
// RelocateRange transfers the lease to the first provided target.
//
// TODO(aayush): Does this logic need to exist? This logic does not take
// lease preferences into account. So it is already broken in a way.
newLeaseIdx := 0
newLeaseQPS := math.MaxFloat64
var raftStatus *raft.Status
Expand Down Expand Up @@ -625,7 +637,7 @@ func (sr *StoreRebalancer) chooseRangeToRebalance(
// the stores in this cluster.
func (sr *StoreRebalancer) getRebalanceTargetsBasedOnQPS(
ctx context.Context, rbCtx rangeRebalanceContext, options scorerOptions,
) (finalVoterTargets, finalNonVoterTargets []roachpb.ReplicaDescriptor) {
) (finalVoterTargets, finalNonVoterTargets []roachpb.ReplicaDescriptor, foundRebalance bool) {
finalVoterTargets = rbCtx.rangeDesc.Replicas().VoterDescriptors()
finalNonVoterTargets = rbCtx.rangeDesc.Replicas().NonVoterDescriptors()

Expand All @@ -652,6 +664,9 @@ func (sr *StoreRebalancer) getRebalanceTargetsBasedOnQPS(
rbCtx.rangeDesc.RangeID,
)
break
} else {
// Record the fact that we found at least one rebalance opportunity.
foundRebalance = true
}
log.VEventf(
ctx,
Expand Down Expand Up @@ -712,6 +727,9 @@ func (sr *StoreRebalancer) getRebalanceTargetsBasedOnQPS(
rbCtx.rangeDesc.RangeID,
)
break
} else {
// Record the fact that we found at least one rebalance opportunity.
foundRebalance = true
}
log.VEventf(
ctx,
Expand All @@ -737,7 +755,7 @@ func (sr *StoreRebalancer) getRebalanceTargetsBasedOnQPS(
// Pretend that we've executed upon this rebalancing decision.
finalNonVoterTargets = newNonVoters
}
return finalVoterTargets, finalNonVoterTargets
return finalVoterTargets, finalNonVoterTargets, foundRebalance
}

func storeListToMap(sl StoreList) map[roachpb.StoreID]*roachpb.StoreDescriptor {
Expand Down
62 changes: 57 additions & 5 deletions pkg/kv/kvserver/store_rebalancer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/stretchr/testify/require"
raft "go.etcd.io/etcd/raft/v3"
"go.etcd.io/etcd/raft/v3"
"go.etcd.io/etcd/raft/v3/tracker"
)

Expand Down Expand Up @@ -242,9 +243,10 @@ type testRange struct {

func loadRanges(rr *replicaRankings, s *Store, ranges []testRange) {
acc := rr.newAccumulator()
for _, r := range ranges {
repl := &Replica{store: s}
repl.mu.state.Desc = &roachpb.RangeDescriptor{}
for i, r := range ranges {
rangeID := roachpb.RangeID(i + 1)
repl := &Replica{store: s, RangeID: rangeID}
repl.mu.state.Desc = &roachpb.RangeDescriptor{RangeID: rangeID}
repl.mu.conf = s.cfg.DefaultSpanConfig
for _, storeID := range r.voters {
repl.mu.state.Desc.InternalReplicas = append(repl.mu.state.Desc.InternalReplicas, roachpb.ReplicaDescriptor{
Expand Down Expand Up @@ -706,7 +708,7 @@ func TestChooseRangeToRebalanceAcrossHeterogeneousZones(t *testing.T) {
name: "no rebalance",
voters: []roachpb.StoreID{3, 6, 9},
constraints: oneReplicaPerRegion,
expRebalancedVoters: []roachpb.StoreID{9, 6, 3},
expRebalancedVoters: []roachpb.StoreID{},
},
// A replica is in a heavily loaded region, on a relatively heavily loaded
// store. We expect it to be moved to a less busy store within the same
Expand Down Expand Up @@ -878,6 +880,56 @@ func TestChooseRangeToRebalanceAcrossHeterogeneousZones(t *testing.T) {
}
}

// TestChooseRangeToRebalanceIgnoresRangeOnBestStores tests that the store
// rebalancer does not attempt to rebalance ranges unless it finds a better set
// of target stores for it compared to its existing stores.
func TestChooseRangeToRebalanceIgnoresRangeOnBestStores(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx, finishAndGetRecording := tracing.ContextWithRecordingSpan(
context.Background(), tracing.NewTracer(), "test",
)
stopper := stop.NewStopper()
defer stopper.Stop(ctx)

stopper, g, _, a, _ := createTestAllocatorWithKnobs(ctx,
10,
false, /* deterministic */
&AllocatorTestingKnobs{AllowLeaseTransfersToReplicasNeedingSnapshots: true},
)
defer stopper.Stop(context.Background())
storeList, _, _ := a.storePool.getStoreList(storeFilterThrottled)

localDesc := *noLocalityStores[len(noLocalityStores)-1]
cfg := TestStoreConfig(nil)
cfg.Gossip = g
cfg.StorePool = a.storePool
cfg.DefaultSpanConfig.NumVoters = 1
cfg.DefaultSpanConfig.NumReplicas = 1
s := createTestStoreWithoutStart(ctx, t, stopper, testStoreOpts{createSystemRanges: true}, &cfg)
gossiputil.NewStoreGossiper(cfg.Gossip).GossipStores(noLocalityStores, t)
s.Ident = &roachpb.StoreIdent{StoreID: localDesc.StoreID}
rq := newReplicateQueue(s, a)
rr := newReplicaRankings()

sr := NewStoreRebalancer(cfg.AmbientCtx, cfg.Settings, rq, rr)

// Load a fake hot range that's already on the best stores. We want to ensure
// that the store rebalancer doesn't attempt to rebalance ranges that it
// cannot find better rebalance opportunities for.
loadRanges(rr, s, []testRange{{voters: []roachpb.StoreID{localDesc.StoreID}, qps: 100}})
hottestRanges := rr.topQPS()
sr.chooseRangeToRebalance(
ctx, &hottestRanges, &localDesc, storeList, qpsScorerOptions{qpsRebalanceThreshold: 0.05},
)
trace := finishAndGetRecording()
require.Regexpf(
t, "could not find.*opportunities for r1",
trace, "expected the store rebalancer to explicitly ignore r1; but found %s", trace,
)
}

func TestNoLeaseTransferToBehindReplicas(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
Expand Down

0 comments on commit 6b8bfc1

Please sign in to comment.