From 0d58eee5952113e7b7730685dc24641d636cb745 Mon Sep 17 00:00:00 2001 From: Aayush Shah Date: Tue, 10 Aug 2021 18:59:50 -0400 Subject: [PATCH] kvserver: stop transferring leases to draining/suspect nodes in the StoreRebalancer This commit prevents the StoreRebalancer from transferring leases to replicas on draining or suspect nodes. In some cases, we've seen such transfers push new leases onto nodes that take too long to drain or that are stuck while draining due to other bugs. Informs https://github.com/cockroachlabs/support/issues/1105 Release note: None --- pkg/kv/kvserver/store_rebalancer.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/kv/kvserver/store_rebalancer.go b/pkg/kv/kvserver/store_rebalancer.go index cb4964f2ca0d..4fa981762ac7 100644 --- a/pkg/kv/kvserver/store_rebalancer.go +++ b/pkg/kv/kvserver/store_rebalancer.go @@ -425,6 +425,14 @@ func (sr *StoreRebalancer) chooseLeaseToTransfer( var raftStatus *raft.Status preferred := sr.rq.allocator.preferredLeaseholders(zone, candidates) + + // Filter both the list of preferred stores as well as the list of all + // candidate replicas to only consider live (non-suspect, non-draining) + // nodes. + const includeSuspectAndDrainingStores = false + preferred, _ = sr.rq.allocator.storePool.liveAndDeadReplicas(preferred, includeSuspectAndDrainingStores) + candidates, _ = sr.rq.allocator.storePool.liveAndDeadReplicas(candidates, includeSuspectAndDrainingStores) + for _, candidate := range candidates { if candidate.StoreID == localDesc.StoreID { continue