From c719214ac13f0ab59b6e77145fadf8aaf4296572 Mon Sep 17 00:00:00 2001 From: Andrew Werner Date: Sun, 13 Sep 2020 17:08:49 -0400 Subject: [PATCH] kvserver: observe lease transfer interval in considerRebalance This commit plumbs the lease transfer rate limit into the lease transfer which occurs as a part of rebalancing decision making. The decision about whether a node should remove itself for the purpose of rebalancing happens at a much higher rate than the the actual rebalancing. This can lead to massive oscillations of leaseholders as the calculation on which replica should be removed in the face of node addition changes dramatically. We utilize a rate limit in lease transfer decisions to smooth this decision making elsewhere. This limit however is not utilized currently when deciding to transfer a lease for the purpose of rebalancing data. This change has shown very positive impact mitigating these lease transfer storms. See the commentary on #51867. Release note (bug fix): Made lease transfers during rebalancing adhere to the rate limit utilized in other lease transfer cases which eliminates unexpected lease oscillations when adding a new node. --- pkg/kv/kvserver/replicate_queue.go | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/pkg/kv/kvserver/replicate_queue.go b/pkg/kv/kvserver/replicate_queue.go index e089078532d4..f4422bcec612 100644 --- a/pkg/kv/kvserver/replicate_queue.go +++ b/pkg/kv/kvserver/replicate_queue.go @@ -460,7 +460,8 @@ func (rq *replicateQueue) addOrReplace( } // See about transferring the lease away if we're about to remove the // leaseholder. - done, err := rq.maybeTransferLeaseAway(ctx, repl, existingReplicas[removeIdx].StoreID, dryRun) + done, err := rq.maybeTransferLeaseAway( + ctx, repl, existingReplicas[removeIdx].StoreID, dryRun, nil /* canTransferLease */) if err != nil { return false, err } @@ -642,13 +643,21 @@ func (rq *replicateQueue) findRemoveTarget( // true to indicate to the caller that it should not pursue the current // replication change further because it is no longer the leaseholder. When the // returned bool is false, it should continue. On error, the caller should also -// stop. +// stop. If canTransferLease is non-nil, it is consulted and an error is +// returned if it returns false. func (rq *replicateQueue) maybeTransferLeaseAway( - ctx context.Context, repl *Replica, removeStoreID roachpb.StoreID, dryRun bool, + ctx context.Context, + repl *Replica, + removeStoreID roachpb.StoreID, + dryRun bool, + canTransferLease func() bool, ) (done bool, _ error) { if removeStoreID != repl.store.StoreID() { return false, nil } + if canTransferLease != nil && !canTransferLease() { + return false, errors.Errorf("cannot transfer lease") + } desc, zone := repl.DescAndZone() // The local replica was selected as the removal target, but that replica // is the leaseholder, so transfer the lease instead. We don't check that @@ -678,7 +687,8 @@ func (rq *replicateQueue) remove( if err != nil { return false, err } - done, err := rq.maybeTransferLeaseAway(ctx, repl, removeReplica.StoreID, dryRun) + done, err := rq.maybeTransferLeaseAway( + ctx, repl, removeReplica.StoreID, dryRun, nil /* canTransferLease */) if err != nil { return false, err } @@ -722,7 +732,8 @@ func (rq *replicateQueue) removeDecommissioning( return true, nil } decommissioningReplica := decommissioningReplicas[0] - done, err := rq.maybeTransferLeaseAway(ctx, repl, decommissioningReplica.StoreID, dryRun) + done, err := rq.maybeTransferLeaseAway( + ctx, repl, decommissioningReplica.StoreID, dryRun, nil /* canTransferLease */) if err != nil { return false, err } @@ -836,7 +847,9 @@ func (rq *replicateQueue) considerRebalance( storeFilterThrottled) if !ok { log.VEventf(ctx, 1, "no suitable rebalance target") - } else if done, err := rq.maybeTransferLeaseAway(ctx, repl, removeTarget.StoreID, dryRun); err != nil { + } else if done, err := rq.maybeTransferLeaseAway( + ctx, repl, removeTarget.StoreID, dryRun, canTransferLease, + ); err != nil { log.VEventf(ctx, 1, "want to remove self, but failed to transfer lease away: %s", err) } else if done { // Lease is now elsewhere, so we're not in charge any more.