From 1041afb50f9bd2f7b3ccf704efb9cc2eac619b86 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Wed, 12 May 2021 14:08:19 +0000
Subject: [PATCH] kvserver: improve suspect replica GC heuristics

The replica GC queue will normally check a replica against the canonical
range descriptor every 12 hours. Under some circumstances the replica may be
considered suspect, which causes it to be checked against the canonical
descriptor every second instead. However, these heuristics were fairly
limited and missed a couple of cases that could cause stale replicas to
linger.

This patch adds two conditions to the suspect replica heuristics: followers
that have lost contact with their leader (which in particular handles
non-voting replicas), and quiescent replicas that lose contact with any
other voters (which could cause false underreplication alerts).

Since this change is expected to increase suspect replica matches, the
`ReplicaGCQueueSuspectCheckInterval` duration between checking suspect
replica descriptors was also increased from 1 to 5 seconds, and the
`replicaGCQueueTimerDuration` interval between replica GCs was increased
from 50 to 100 ms.

The previous logic would take into account replica activity such as store
startup and lease proposals as the offset for timeouts, but this did not
appear to have any significant benefit over simply using the last check
time, so these have been removed and the timeouts given more appropriate
names. The previous logic also failed to enforce the check interval for
suspect replicas, and would always check them in a tight 50ms loop; this
has been fixed as well.

Release note (bug fix): Improved garbage collection of stale replicas by
proactively checking certain replicas that have lost contact with other
voting replicas.
---
 pkg/kv/kvserver/client_raft_test.go      |   8 +-
 pkg/kv/kvserver/helpers_test.go          |   7 -
 pkg/kv/kvserver/replica.go               |   7 +
 pkg/kv/kvserver/replica_gc_queue.go      | 165 ++++++++++++-----------
 pkg/kv/kvserver/replica_gc_queue_test.go |  67 ++++-----
 5 files changed, 129 insertions(+), 125 deletions(-)

diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go
index d25012c48b16..3815ce0fd168 100644
--- a/pkg/kv/kvserver/client_raft_test.go
+++ b/pkg/kv/kvserver/client_raft_test.go
@@ -2016,7 +2016,7 @@ func runReplicateRestartAfterTruncation(t *testing.T, removeBeforeTruncateAndReA
 	if removeBeforeTruncateAndReAdd {
 		// Verify old replica is GC'd. Wait out the replica gc queue
 		// inactivity threshold and force a gc scan.
-		manualClock.Increment(int64(kvserver.ReplicaGCQueueInactivityThreshold + 1))
+		manualClock.Increment(int64(kvserver.ReplicaGCQueueCheckInterval + 1))
 		testutils.SucceedsSoon(t, func() error {
 			tc.GetFirstStoreFromServer(t, 1).MustForceReplicaGCScanAndProcess()
 			_, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID)
@@ -2126,7 +2126,7 @@ func testReplicaAddRemove(t *testing.T, addFirst bool) {
 	// Wait out the range lease and the unleased duration to make the replica GC'able.
 	manualClock.Increment(store.GetStoreConfig().LeaseExpiration())
-	manualClock.Increment(int64(kvserver.ReplicaGCQueueInactivityThreshold + 1))
+	manualClock.Increment(int64(kvserver.ReplicaGCQueueCheckInterval + 1))
 	tc.GetFirstStoreFromServer(t, 1).SetReplicaGCQueueActive(true)
 	tc.GetFirstStoreFromServer(t, 1).MustForceReplicaGCScanAndProcess()
@@ -3275,7 +3275,7 @@ func TestReplicateRogueRemovedNode(t *testing.T) {
 	testutils.SucceedsSoon(t, func() error {
 		manualClock.Increment(store.GetStoreConfig().LeaseExpiration())
 		manualClock.Increment(int64(
-			kvserver.ReplicaGCQueueInactivityThreshold) + 1)
+			kvserver.ReplicaGCQueueCheckInterval) + 1)
 		tc.GetFirstStoreFromServer(t, 1).MustForceReplicaGCScanAndProcess()

 		actual := tc.ReadIntFromStores(key)
@@ -3365,7 +3365,7 @@ func TestReplicateRogueRemovedNode(t *testing.T) {
 	tc.GetFirstStoreFromServer(t, 2).SetReplicaGCQueueActive(true)
 	manualClock.Increment(store.GetStoreConfig().LeaseExpiration())
 	manualClock.Increment(int64(
-		kvserver.ReplicaGCQueueInactivityThreshold) + 1)
+		kvserver.ReplicaGCQueueCheckInterval) + 1)
 	tc.GetFirstStoreFromServer(t, 2).MustForceReplicaGCScanAndProcess()

 	tc.WaitForValues(t, key, []int64{16, 0, 0})
diff --git a/pkg/kv/kvserver/helpers_test.go b/pkg/kv/kvserver/helpers_test.go
index 446cc9d61db4..29e93256a6b2 100644
--- a/pkg/kv/kvserver/helpers_test.go
+++ b/pkg/kv/kvserver/helpers_test.go
@@ -469,13 +469,6 @@ func SetMockAddSSTable() (undo func()) {
 	}
 }

-// IsQuiescent returns whether the replica is quiescent or not.
-func (r *Replica) IsQuiescent() bool {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-	return r.mu.quiescent
-}
-
 // GetQueueLastProcessed returns the last processed timestamp for the
 // specified queue, or the zero timestamp if not available.
 func (r *Replica) GetQueueLastProcessed(ctx context.Context, queue string) (hlc.Timestamp, error) {
diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go
index f2275ae5d874..772177b12a44 100644
--- a/pkg/kv/kvserver/replica.go
+++ b/pkg/kv/kvserver/replica.go
@@ -731,6 +731,13 @@ func (r *Replica) isDestroyedRLocked() (DestroyReason, error) {
 	return r.mu.destroyStatus.reason, r.mu.destroyStatus.err
 }

+// IsQuiescent returns whether the replica is quiescent or not.
+func (r *Replica) IsQuiescent() bool {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	return r.mu.quiescent
+}
+
 // DescAndZone returns the authoritative range descriptor as well
 // as the zone config for the replica.
 func (r *Replica) DescAndZone() (*roachpb.RangeDescriptor, *zonepb.ZoneConfig) {
diff --git a/pkg/kv/kvserver/replica_gc_queue.go b/pkg/kv/kvserver/replica_gc_queue.go
index 9345f81f3997..29eec7b94118 100644
--- a/pkg/kv/kvserver/replica_gc_queue.go
+++ b/pkg/kv/kvserver/replica_gc_queue.go
@@ -27,27 +27,16 @@ import (

 const (
 	// replicaGCQueueTimerDuration is the duration between GCs of queued replicas.
-	replicaGCQueueTimerDuration = 50 * time.Millisecond
+	replicaGCQueueTimerDuration = 100 * time.Millisecond

-	// ReplicaGCQueueInactivityThreshold is the inactivity duration after which
+	// ReplicaGCQueueCheckInterval is the inactivity duration after which
 	// a range will be considered for garbage collection. Exported for testing.
-	ReplicaGCQueueInactivityThreshold = 12 * time.Hour
-	// ReplicaGCQueueSuspectTimeout is the duration after which a Replica which
-	// is suspected to be removed should be processed by the queue.
-	// A Replica is suspected to have been removed if either it is in the
-	// candidate Raft state (which is a typical sign of having been removed
-	// from the group) or it is not in the VOTER_FULL state. Replicas which are
-	// in the LEARNER state will never become candidates. It seems possible that
-	// a range will quiesce and never tell a VOTER_OUTGOING that is was removed.
-	// Cases where a replica gets stuck in VOTER_INCOMING seem farfetched and
-	// would require the replica to be removed from the range before it ever
-	// learned about its promotion but that state shouldn't last long so we
-	// also treat idle replicas in that state as suspect.
-	//
-	// A leader unable to make progress (e.g. because it's lost a quorum) is
-	// also considered suspect, since Node.ResetQuorum() may be used to restore
-	// the range elsewhere.
-	ReplicaGCQueueSuspectTimeout = 1 * time.Second
+	ReplicaGCQueueCheckInterval = 12 * time.Hour
+	// ReplicaGCQueueSuspectCheckInterval is the duration after which a Replica
+	// which is suspected to be removed should be considered for garbage
+	// collection. See replicaIsSuspect() for details on what makes a replica
+	// suspect.
+	ReplicaGCQueueSuspectCheckInterval = 5 * time.Second
 )

 // Priorities for the replica GC queue.
@@ -129,96 +118,110 @@ func newReplicaGCQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *replicaG
 func (rgcq *replicaGCQueue) shouldQueue(
 	ctx context.Context, now hlc.ClockTimestamp, repl *Replica, _ *config.SystemConfig,
 ) (shouldQ bool, prio float64) {
-
+	if _, currentMember := repl.Desc().GetReplicaDescriptor(repl.store.StoreID()); !currentMember {
+		return true, replicaGCPriorityRemoved
+	}
 	lastCheck, err := repl.GetLastReplicaGCTimestamp(ctx)
 	if err != nil {
 		log.Errorf(ctx, "could not read last replica GC timestamp: %+v", err)
 		return false, 0
 	}

-	replDesc, currentMember := repl.Desc().GetReplicaDescriptor(repl.store.StoreID())
-	if !currentMember {
-		return true, replicaGCPriorityRemoved
-	}
-
-	lastActivity := hlc.Timestamp{
-		WallTime: repl.store.startedAt,
-	}
+	isSuspect := replicaIsSuspect(repl)

-	if lease, _ := repl.GetLease(); lease.ProposedTS != nil {
-		lastActivity.Forward(lease.ProposedTS.ToTimestamp())
-	}
+	return replicaGCShouldQueueImpl(now.ToTimestamp(), lastCheck, isSuspect)
 }

+func replicaIsSuspect(repl *Replica) bool {
 	// It is critical to think of the replica as suspect if it is a learner as
 	// it both shouldn't be a learner for long but will never become a candidate.
 	// It is less critical to consider joint configuration members as suspect
 	// but in cases where a replica is removed but only ever hears about the
 	// command which sets it to VOTER_OUTGOING we would conservatively wait
-	// 10 days before removing the node. Finally we consider replicas which are
+	// 12 hours before removing the node. Finally we consider replicas which are
 	// VOTER_INCOMING as suspect because no replica should stay in that state for
 	// too long and being conservative here doesn't seem worthwhile.
-	var isSuspect bool
-	if raftStatus := repl.RaftStatus(); raftStatus == nil {
-		// If a replica doesn't have an active raft group, we should check
-		// whether or not it is active. If not, we should process the replica
-		// because it has probably already been removed from its raft group but
-		// doesn't know it. Without this, node decommissioning can stall on such
-		// dormant ranges. Make sure NodeLiveness isn't nil because it can be in
-		// tests/benchmarks.
-		if repl.store.cfg.NodeLiveness != nil {
-			if liveness, ok := repl.store.cfg.NodeLiveness.Self(); ok && !liveness.Membership.Active() {
-				return true, replicaGCPriorityDefault
-			}
+	replDesc, ok := repl.Desc().GetReplicaDescriptor(repl.store.StoreID())
+	if !ok {
+		return true
+	}
+	if t := replDesc.GetType(); t != roachpb.VOTER_FULL && t != roachpb.NON_VOTER {
+		return true
+	}
+
+	// NodeLiveness can be nil in tests/benchmarks.
+	if repl.store.cfg.NodeLiveness == nil {
+		return false
+	}
+
+	// If a replica doesn't have an active raft group, we should check whether
+	// or not the node is active. If not, we should consider the replica suspect
+	// because it has probably already been removed from its raft group but
+	// doesn't know it. Without this, node decommissioning can stall on such
+	// dormant ranges.
+	raftStatus := repl.RaftStatus()
+	if raftStatus == nil {
+		liveness, ok := repl.store.cfg.NodeLiveness.Self()
+		return ok && !liveness.Membership.Active()
+	}
+
+	livenessMap := repl.store.cfg.NodeLiveness.GetIsLiveMap()
+	switch raftStatus.SoftState.RaftState {
+	// If a replica is a candidate, then by definition it has lost contact with
+	// its leader and possibly the rest of the Raft group, so consider it suspect.
+	case raft.StateCandidate, raft.StatePreCandidate:
+		return true
+
+	// If the replica is a follower, check that the leader is in our range
+	// descriptor and that we're still in touch with it. This handles e.g. a
+	// non-voting replica which has lost its leader.
+	case raft.StateFollower:
+		leadDesc, ok := repl.Desc().GetReplicaDescriptorByID(roachpb.ReplicaID(raftStatus.Lead))
+		if !ok || !livenessMap[leadDesc.NodeID].IsLive {
+			return true
 		}
-	} else if t := replDesc.GetType(); t != roachpb.VOTER_FULL && t != roachpb.NON_VOTER {
-		isSuspect = true
-	} else {
-		switch raftStatus.SoftState.RaftState {
-		case raft.StateCandidate, raft.StatePreCandidate:
-			isSuspect = true
-		case raft.StateLeader:
-			// If the replica is the leader, we check whether it has a quorum.
-			// Otherwise, it's possible that e.g. Node.ResetQuorum will be used
-			// to recover the range elsewhere, and we should relinquish our
-			// lease and GC the range.
-			if repl.store.cfg.NodeLiveness != nil {
-				livenessMap := repl.store.cfg.NodeLiveness.GetIsLiveMap()
-				isSuspect = !repl.Desc().Replicas().CanMakeProgress(func(d roachpb.ReplicaDescriptor) bool {
-					return livenessMap[d.NodeID].IsLive
-				})
+
+	// If the replica is a leader, check that it has a quorum. This handles e.g.
+	// a stuck leader with a lost quorum being replaced via Node.ResetQuorum,
+	// which must cause the stale leader to relinquish its lease and GC itself.
+	case raft.StateLeader:
+		if !repl.Desc().Replicas().CanMakeProgress(func(d roachpb.ReplicaDescriptor) bool {
+			return livenessMap[d.NodeID].IsLive
+		}) {
+			return true
+		}
+	}
+
+	// If the replica is quiesced, consider it suspect if any of the other
+	// voters are unavailable. This tries to detect cases where a quiesced
+	// replica does not notice that it's been removed from the range and then
+	// triggers an underreplicated alert when the range membership changes again
+	// later. In other cases where a quiesced replica fails to notice that it's
+	// been removed it will be GCed during the next periodic check (every 12 hours).
+	if repl.IsQuiescent() {
+		for _, rd := range repl.Desc().Replicas().VoterDescriptors() {
+			if !livenessMap[rd.NodeID].IsLive {
+				return true
 			}
 		}
 	}
-	return replicaGCShouldQueueImpl(now.ToTimestamp(), lastCheck, lastActivity, isSuspect)
+
+	return false
 }

-func replicaGCShouldQueueImpl(
-	now, lastCheck, lastActivity hlc.Timestamp, isSuspect bool,
-) (bool, float64) {
-	timeout := ReplicaGCQueueInactivityThreshold
+func replicaGCShouldQueueImpl(now, lastCheck hlc.Timestamp, isSuspect bool) (bool, float64) {
+	timeout := ReplicaGCQueueCheckInterval
 	priority := replicaGCPriorityDefault

 	if isSuspect {
-		// If the range is suspect (which happens if its former replica set
-		// ignores it), let it expire much earlier.
-		timeout = ReplicaGCQueueSuspectTimeout
+		timeout = ReplicaGCQueueSuspectCheckInterval
 		priority = replicaGCPrioritySuspect
-	} else if now.Less(lastCheck.Add(ReplicaGCQueueInactivityThreshold.Nanoseconds(), 0)) {
-		// Return false immediately if the previous check was less than the
-		// check interval in the past. Note that we don't do this if the
-		// replica is in candidate state, in which case we want to be more
-		// aggressive - a failed rebalance attempt could have checked this
-		// range, and candidate state suggests that a retry succeeded. See
-		// #7489.
-		return false, 0
 	}
-	shouldQ := lastActivity.Add(timeout.Nanoseconds(), 0).Less(now)
-
-	if !shouldQ {
+
+	// Only queue for GC if the timeout interval has passed since the last check.
+	if !lastCheck.Add(timeout.Nanoseconds(), 0).Less(now) {
 		return false, 0
 	}
-
-	return shouldQ, priority
+	return true, priority
 }

 // process performs a consistent lookup on the range descriptor to see if we are
diff --git a/pkg/kv/kvserver/replica_gc_queue_test.go b/pkg/kv/kvserver/replica_gc_queue_test.go
index 71a56568cf79..cf7ee8cb8d32 100644
--- a/pkg/kv/kvserver/replica_gc_queue_test.go
+++ b/pkg/kv/kvserver/replica_gc_queue_test.go
@@ -11,12 +11,14 @@ package kvserver
 import (
+	"fmt"
 	"testing"
 	"time"

 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/stretchr/testify/require"
 )

 func TestReplicaGCShouldQueue(t *testing.T) {
@@ -27,45 +29,44 @@ func TestReplicaGCShouldQueue(t *testing.T) {
 		return hlc.Timestamp{WallTime: t.Nanoseconds()}
 	}

-	base := 2 * (ReplicaGCQueueSuspectTimeout + ReplicaGCQueueInactivityThreshold)
+	base := 2 * (ReplicaGCQueueSuspectCheckInterval + ReplicaGCQueueCheckInterval)
 	var (
 		z   = ts(0)
 		bTS = ts(base)
-		cTS     = ts(base + ReplicaGCQueueSuspectTimeout)
-		cTSnext = ts(base + ReplicaGCQueueSuspectTimeout + 1)
-		iTSprev = ts(base + ReplicaGCQueueInactivityThreshold - 1)
-		iTS     = ts(base + ReplicaGCQueueInactivityThreshold)
+		sTS     = ts(base + ReplicaGCQueueSuspectCheckInterval)
+		sTSnext = ts(base + ReplicaGCQueueSuspectCheckInterval + 1)
+		iTS     = ts(base + ReplicaGCQueueCheckInterval)
+		iTSnext = ts(base + ReplicaGCQueueCheckInterval + 1)
 	)

-	for i, test := range []struct {
-		now, lastCheck, lastActivity hlc.Timestamp
-		isCandidate                  bool
-
-		shouldQ  bool
-		priority float64
+	testcases := []struct {
+		now            hlc.Timestamp
+		lastCheck      hlc.Timestamp
+		isSuspect      bool
+		expectQueue    bool
+		expectPriority float64
 	}{
-		// Test outcomes when range is in candidate state.
-
-		// All timestamps current: candidacy plays no role.
-		{now: z, lastCheck: z, lastActivity: z, isCandidate: true, shouldQ: false, priority: 0},
+		// All timestamps current: suspect plays no role.
+		{now: z, lastCheck: z, isSuspect: true, expectQueue: false, expectPriority: 0},
 		// Threshold: no action taken.
-		{now: cTS, lastCheck: z, lastActivity: bTS, isCandidate: true, shouldQ: false, priority: 0},
-		// Queue with priority.
-		{now: cTSnext, lastCheck: z, lastActivity: bTS, isCandidate: true, shouldQ: true, priority: 1},
-		// Last processed recently: candidate still gets processed eagerly.
-		{now: cTSnext, lastCheck: bTS, lastActivity: z, isCandidate: true, shouldQ: true, priority: 1},
-		// Last processed recently: non-candidate stays put.
-		{now: cTSnext, lastCheck: bTS, lastActivity: z, isCandidate: false, shouldQ: false, priority: 0},
-		// Still no effect until iTS reached.
-		{now: iTSprev, lastCheck: bTS, lastActivity: z, isCandidate: false, shouldQ: false, priority: 0},
-		{now: iTS, lastCheck: bTS, lastActivity: z, isCandidate: true, shouldQ: true, priority: 1},
-		// Verify again that candidacy increases priority.
-		{now: iTS, lastCheck: bTS, lastActivity: z, isCandidate: false, shouldQ: true, priority: 0},
-	} {
-		if sq, pr := replicaGCShouldQueueImpl(
-			test.now, test.lastCheck, test.lastActivity, test.isCandidate,
-		); sq != test.shouldQ || pr != test.priority {
-			t.Errorf("%d: %+v: got (%t,%f)", i, test, sq, pr)
-		}
+		{now: sTS, lastCheck: bTS, isSuspect: true, expectQueue: false, expectPriority: 0},
+		// Last processed recently: suspect still gets processed eagerly.
+		{now: sTSnext, lastCheck: bTS, isSuspect: true, expectQueue: true, expectPriority: 1},
+		// Last processed recently: non-suspect stays put.
+		{now: sTSnext, lastCheck: bTS, isSuspect: false, expectQueue: false, expectPriority: 0},
+		// No effect until iTS crossed.
+		{now: iTS, lastCheck: bTS, isSuspect: false, expectQueue: false, expectPriority: 0},
+		{now: iTSnext, lastCheck: bTS, isSuspect: false, expectQueue: true, expectPriority: 0},
+		// Verify again that suspect increases priority.
+		{now: iTSnext, lastCheck: bTS, isSuspect: true, expectQueue: true, expectPriority: 1},
+	}
+	for _, tc := range testcases {
+		tc := tc
+		name := fmt.Sprintf("now=%s lastCheck=%s isSuspect=%v", tc.now, tc.lastCheck, tc.isSuspect)
+		t.Run(name, func(t *testing.T) {
+			shouldQueue, priority := replicaGCShouldQueueImpl(tc.now, tc.lastCheck, tc.isSuspect)
+			require.Equal(t, tc.expectQueue, shouldQueue)
+			require.Equal(t, tc.expectPriority, priority)
+		})
 	}
 }