Skip to content

Commit

Permalink
Merge #54322
Browse files Browse the repository at this point in the history
54322: kvserver: observe lease transfer interval in considerRebalance r=ajwerner a=ajwerner

This commit plumbs the lease transfer rate limit into the lease transfer which
occurs as a part of rebalancing decision making. The decision about whether a
node should remove itself for the purpose of rebalancing happens at a much
higher rate than the the actual rebalancing. This can lead to massive
oscillations of leaseholders as the calculation on which replica should be
removed in the face of node addition changes dramatically. We utilize
a rate limit in lease transfer decisions to smooth this decision making
elsewhere. This limit however is not utilized currently when deciding to
transfer a lease for the purpose of rebalancing data. This change has
shown very positive impact mitigating these lease transfer storms. See
the commentary on #51867.

Release note (bug fix): Made lease transfers during rebalancing adhere
to the rate limit utilized in other lease transfer cases which eliminates
unexpected lease oscillations when adding a new node.

Co-authored-by: Andrew Werner <[email protected]>
  • Loading branch information
craig[bot] and ajwerner committed Sep 15, 2020
2 parents bb0ddc4 + c719214 commit 19ec4ec
Showing 1 changed file with 19 additions and 6 deletions.
25 changes: 19 additions & 6 deletions pkg/kv/kvserver/replicate_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,8 @@ func (rq *replicateQueue) addOrReplace(
}
// See about transferring the lease away if we're about to remove the
// leaseholder.
done, err := rq.maybeTransferLeaseAway(ctx, repl, existingReplicas[removeIdx].StoreID, dryRun)
done, err := rq.maybeTransferLeaseAway(
ctx, repl, existingReplicas[removeIdx].StoreID, dryRun, nil /* canTransferLease */)
if err != nil {
return false, err
}
Expand Down Expand Up @@ -642,13 +643,21 @@ func (rq *replicateQueue) findRemoveTarget(
// true to indicate to the caller that it should not pursue the current
// replication change further because it is no longer the leaseholder. When the
// returned bool is false, it should continue. On error, the caller should also
// stop.
// stop. If canTransferLease is non-nil, it is consulted and an error is
// returned if it returns false.
func (rq *replicateQueue) maybeTransferLeaseAway(
ctx context.Context, repl *Replica, removeStoreID roachpb.StoreID, dryRun bool,
ctx context.Context,
repl *Replica,
removeStoreID roachpb.StoreID,
dryRun bool,
canTransferLease func() bool,
) (done bool, _ error) {
if removeStoreID != repl.store.StoreID() {
return false, nil
}
if canTransferLease != nil && !canTransferLease() {
return false, errors.Errorf("cannot transfer lease")
}
desc, zone := repl.DescAndZone()
// The local replica was selected as the removal target, but that replica
// is the leaseholder, so transfer the lease instead. We don't check that
Expand Down Expand Up @@ -678,7 +687,8 @@ func (rq *replicateQueue) remove(
if err != nil {
return false, err
}
done, err := rq.maybeTransferLeaseAway(ctx, repl, removeReplica.StoreID, dryRun)
done, err := rq.maybeTransferLeaseAway(
ctx, repl, removeReplica.StoreID, dryRun, nil /* canTransferLease */)
if err != nil {
return false, err
}
Expand Down Expand Up @@ -722,7 +732,8 @@ func (rq *replicateQueue) removeDecommissioning(
return true, nil
}
decommissioningReplica := decommissioningReplicas[0]
done, err := rq.maybeTransferLeaseAway(ctx, repl, decommissioningReplica.StoreID, dryRun)
done, err := rq.maybeTransferLeaseAway(
ctx, repl, decommissioningReplica.StoreID, dryRun, nil /* canTransferLease */)
if err != nil {
return false, err
}
Expand Down Expand Up @@ -836,7 +847,9 @@ func (rq *replicateQueue) considerRebalance(
storeFilterThrottled)
if !ok {
log.VEventf(ctx, 1, "no suitable rebalance target")
} else if done, err := rq.maybeTransferLeaseAway(ctx, repl, removeTarget.StoreID, dryRun); err != nil {
} else if done, err := rq.maybeTransferLeaseAway(
ctx, repl, removeTarget.StoreID, dryRun, canTransferLease,
); err != nil {
log.VEventf(ctx, 1, "want to remove self, but failed to transfer lease away: %s", err)
} else if done {
// Lease is now elsewhere, so we're not in charge any more.
Expand Down

0 comments on commit 19ec4ec

Please sign in to comment.