From c3bbfe1ff5b83e91ed495f7069c6b0d5529bdf56 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Aug 2022 13:34:29 +0200 Subject: [PATCH 1/4] kvserver: extract method updatePausedFollowersLocked Intentionally leaving this unsimplified as a purely mechanical commit. Release note: None --- pkg/kv/kvserver/replica_raft.go | 48 +--------------------- pkg/kv/kvserver/replica_raft_overload.go | 52 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 7f10109ab59a..b204747a9c2a 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -1186,53 +1186,7 @@ func (r *Replica) tick( return false, nil } - if r.replicaID == r.mu.leaderID && len(ioOverloadMap) > 0 && quotaPoolEnabledForRange(*r.descRLocked()) { - // When multiple followers are overloaded, we may not be able to exclude all - // of them from replication traffic due to quorum constraints. We would like - // a given Range to deterministically exclude the same store (chosen - // randomly), so that across multiple Ranges we have a chance of removing - // load from all overloaded Stores in the cluster. (It would be a bad idea - // to roll a per-Range dice here on every tick, since that would rapidly - // include and exclude individual followers from replication traffic, which - // would be akin to a high rate of packet loss. Once we've decided to ignore - // a follower, this decision should be somewhat stable for at least a few - // seconds). - // - // Note that we don't enable this mechanism for the liveness range (see - // quotaPoolEnabledForRange), simply to play it safe, as we know that the - // liveness range is unlikely to be a major contributor to any follower's - // I/O and wish to reduce the likelihood of a problem in replication pausing - // contributing to an outage of that critical range. - seed := int64(r.RangeID) - now := r.store.Clock().Now().GoTime() - d := computeExpendableOverloadedFollowersInput{ - replDescs: r.descRLocked().Replicas(), - ioOverloadMap: ioOverloadMap, - getProgressMap: func(_ context.Context) map[uint64]tracker.Progress { - prs := r.mu.internalRaftGroup.Status().Progress - updateRaftProgressFromActivity(ctx, prs, r.descRLocked().Replicas().AsProto(), func(id roachpb.ReplicaID) bool { - return r.mu.lastUpdateTimes.isFollowerActiveSince(ctx, id, now, r.store.cfg.RangeLeaseActiveDuration()) - }) - return prs - }, - minLiveMatchIndex: r.mu.proposalQuotaBaseIndex, - seed: seed, - } - r.mu.pausedFollowers, _ = computeExpendableOverloadedFollowers(ctx, d) - for replicaID := range r.mu.pausedFollowers { - // We're dropping messages to those followers (see handleRaftReady) but - // it's a good idea to tell raft not to even bother sending in the first - // place. Raft will react to this by moving the follower to probing state - // where it will be contacted only sporadically until it responds to an - // MsgApp (which it can only do once we stop dropping messages). Something - // similar would result naturally if we didn't report as unreachable, but - // with more wasted work. - r.mu.internalRaftGroup.ReportUnreachable(uint64(replicaID)) - } - } else if len(r.mu.pausedFollowers) > 0 { - // No store in the cluster is overloaded, or this replica is not raft leader. 
- r.mu.pausedFollowers = nil - } + r.updatePausedFollowersLocked(ctx, ioOverloadMap) now := r.store.Clock().NowAsClockTimestamp() if r.maybeQuiesceRaftMuLockedReplicaMuLocked(ctx, now, livenessMap) { diff --git a/pkg/kv/kvserver/replica_raft_overload.go b/pkg/kv/kvserver/replica_raft_overload.go index 0495417a2613..4a46a1ba01d0 100644 --- a/pkg/kv/kvserver/replica_raft_overload.go +++ b/pkg/kv/kvserver/replica_raft_overload.go @@ -206,3 +206,55 @@ func (osm *overloadedStoresMap) Swap( v, _ := (*atomic.Value)(osm).Swap(m).(map[roachpb.StoreID]*admissionpb.IOThreshold) return v } + +func (r *Replica) updatePausedFollowersLocked( + ctx context.Context, ioOverloadMap map[roachpb.StoreID]*admissionpb.IOThreshold, +) { + if r.replicaID == r.mu.leaderID && len(ioOverloadMap) > 0 && quotaPoolEnabledForRange(*r.descRLocked()) { + // When multiple followers are overloaded, we may not be able to exclude all + // of them from replication traffic due to quorum constraints. We would like + // a given Range to deterministically exclude the same store (chosen + // randomly), so that across multiple Ranges we have a chance of removing + // load from all overloaded Stores in the cluster. (It would be a bad idea + // to roll a per-Range dice here on every tick, since that would rapidly + // include and exclude individual followers from replication traffic, which + // would be akin to a high rate of packet loss. Once we've decided to ignore + // a follower, this decision should be somewhat stable for at least a few + // seconds). + // + // Note that we don't enable this mechanism for the liveness range (see + // quotaPoolEnabledForRange), simply to play it safe, as we know that the + // liveness range is unlikely to be a major contributor to any follower's + // I/O and wish to reduce the likelihood of a problem in replication pausing + // contributing to an outage of that critical range. + seed := int64(r.RangeID) + now := r.store.Clock().Now().GoTime() + d := computeExpendableOverloadedFollowersInput{ + replDescs: r.descRLocked().Replicas(), + ioOverloadMap: ioOverloadMap, + getProgressMap: func(_ context.Context) map[uint64]tracker.Progress { + prs := r.mu.internalRaftGroup.Status().Progress + updateRaftProgressFromActivity(ctx, prs, r.descRLocked().Replicas().AsProto(), func(id roachpb.ReplicaID) bool { + return r.mu.lastUpdateTimes.isFollowerActiveSince(ctx, id, now, r.store.cfg.RangeLeaseActiveDuration()) + }) + return prs + }, + minLiveMatchIndex: r.mu.proposalQuotaBaseIndex, + seed: seed, + } + r.mu.pausedFollowers, _ = computeExpendableOverloadedFollowers(ctx, d) + for replicaID := range r.mu.pausedFollowers { + // We're dropping messages to those followers (see handleRaftReady) but + // it's a good idea to tell raft not to even bother sending in the first + // place. Raft will react to this by moving the follower to probing state + // where it will be contacted only sporadically until it responds to an + // MsgApp (which it can only do once we stop dropping messages). Something + // similar would result naturally if we didn't report as unreachable, but + // with more wasted work. + r.mu.internalRaftGroup.ReportUnreachable(uint64(replicaID)) + } + } else if len(r.mu.pausedFollowers) > 0 { + // No store in the cluster is overloaded, or this replica is not raft leader. 
+ r.mu.pausedFollowers = nil + } +} From bc8c25a2608b80ba1d64d94db973f21ee9444a41 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Aug 2022 13:46:02 +0200 Subject: [PATCH 2/4] kvserver: simplify updatePausedFollowersLocked Release note: None --- pkg/kv/kvserver/replica_raft_overload.go | 97 +++++++++++++----------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/pkg/kv/kvserver/replica_raft_overload.go b/pkg/kv/kvserver/replica_raft_overload.go index 4a46a1ba01d0..98e9a9ac33e2 100644 --- a/pkg/kv/kvserver/replica_raft_overload.go +++ b/pkg/kv/kvserver/replica_raft_overload.go @@ -210,51 +210,60 @@ func (osm *overloadedStoresMap) Swap( func (r *Replica) updatePausedFollowersLocked( ctx context.Context, ioOverloadMap map[roachpb.StoreID]*admissionpb.IOThreshold, ) { - if r.replicaID == r.mu.leaderID && len(ioOverloadMap) > 0 && quotaPoolEnabledForRange(*r.descRLocked()) { - // When multiple followers are overloaded, we may not be able to exclude all - // of them from replication traffic due to quorum constraints. We would like - // a given Range to deterministically exclude the same store (chosen - // randomly), so that across multiple Ranges we have a chance of removing - // load from all overloaded Stores in the cluster. (It would be a bad idea - // to roll a per-Range dice here on every tick, since that would rapidly - // include and exclude individual followers from replication traffic, which - // would be akin to a high rate of packet loss. Once we've decided to ignore - // a follower, this decision should be somewhat stable for at least a few - // seconds). - // - // Note that we don't enable this mechanism for the liveness range (see - // quotaPoolEnabledForRange), simply to play it safe, as we know that the - // liveness range is unlikely to be a major contributor to any follower's + r.mu.pausedFollowers = nil + + if len(ioOverloadMap) == 0 { + return + } + + if r.replicaID != r.mu.leaderID { + // Only the raft leader pauses followers. Followers never send meaningful + // amounts of data in raft messages, so pausing doesn't make sense on them. + return + } + + if !quotaPoolEnabledForRange(*r.descRLocked()) { + // If the quota pool isn't enabled (like for the liveness range), play it + // safe. The range is unlikely to be a major contributor to any follower's // I/O and wish to reduce the likelihood of a problem in replication pausing // contributing to an outage of that critical range. - seed := int64(r.RangeID) - now := r.store.Clock().Now().GoTime() - d := computeExpendableOverloadedFollowersInput{ - replDescs: r.descRLocked().Replicas(), - ioOverloadMap: ioOverloadMap, - getProgressMap: func(_ context.Context) map[uint64]tracker.Progress { - prs := r.mu.internalRaftGroup.Status().Progress - updateRaftProgressFromActivity(ctx, prs, r.descRLocked().Replicas().AsProto(), func(id roachpb.ReplicaID) bool { - return r.mu.lastUpdateTimes.isFollowerActiveSince(ctx, id, now, r.store.cfg.RangeLeaseActiveDuration()) - }) - return prs - }, - minLiveMatchIndex: r.mu.proposalQuotaBaseIndex, - seed: seed, - } - r.mu.pausedFollowers, _ = computeExpendableOverloadedFollowers(ctx, d) - for replicaID := range r.mu.pausedFollowers { - // We're dropping messages to those followers (see handleRaftReady) but - // it's a good idea to tell raft not to even bother sending in the first - // place. 
Raft will react to this by moving the follower to probing state - // where it will be contacted only sporadically until it responds to an - // MsgApp (which it can only do once we stop dropping messages). Something - // similar would result naturally if we didn't report as unreachable, but - // with more wasted work. - r.mu.internalRaftGroup.ReportUnreachable(uint64(replicaID)) - } - } else if len(r.mu.pausedFollowers) > 0 { - // No store in the cluster is overloaded, or this replica is not raft leader. - r.mu.pausedFollowers = nil + return + } + + // When multiple followers are overloaded, we may not be able to exclude all + // of them from replication traffic due to quorum constraints. We would like + // a given Range to deterministically exclude the same store (chosen + // randomly), so that across multiple Ranges we have a chance of removing + // load from all overloaded Stores in the cluster. (It would be a bad idea + // to roll a per-Range dice here on every tick, since that would rapidly + // include and exclude individual followers from replication traffic, which + // would be akin to a high rate of packet loss. Once we've decided to ignore + // a follower, this decision should be somewhat stable for at least a few + // seconds). + seed := int64(r.RangeID) + now := r.store.Clock().Now().GoTime() + d := computeExpendableOverloadedFollowersInput{ + replDescs: r.descRLocked().Replicas(), + ioOverloadMap: ioOverloadMap, + getProgressMap: func(_ context.Context) map[uint64]tracker.Progress { + prs := r.mu.internalRaftGroup.Status().Progress + updateRaftProgressFromActivity(ctx, prs, r.descRLocked().Replicas().AsProto(), func(id roachpb.ReplicaID) bool { + return r.mu.lastUpdateTimes.isFollowerActiveSince(ctx, id, now, r.store.cfg.RangeLeaseActiveDuration()) + }) + return prs + }, + minLiveMatchIndex: r.mu.proposalQuotaBaseIndex, + seed: seed, + } + r.mu.pausedFollowers, _ = computeExpendableOverloadedFollowers(ctx, d) + for replicaID := range r.mu.pausedFollowers { + // We're dropping messages to those followers (see handleRaftReady) but + // it's a good idea to tell raft not to even bother sending in the first + // place. Raft will react to this by moving the follower to probing state + // where it will be contacted only sporadically until it responds to an + // MsgApp (which it can only do once we stop dropping messages). Something + // similar would result naturally if we didn't report as unreachable, but + // with more wasted work. + r.mu.internalRaftGroup.ReportUnreachable(uint64(replicaID)) } } From dfac2ff5071914fd6c37998e7ac84a51f37c5a9b Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Aug 2022 13:47:19 +0200 Subject: [PATCH 3/4] kvserver: only pause followers when holding active lease If the raft leader is not the leaseholder (which includes the case in which we just transferred the lease away), leave all followers unpaused. Otherwise, the leaseholder won't learn that the entries it submitted were committed which effectively causes range unavailability. Fixes #84884. 
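For illustration, a minimal standalone sketch of the decision order this
commit establishes (hypothetical helper and types, not the kvserver API;
the quota-pool check added earlier in this series is elided):

    package main

    import "fmt"

    // leaseStatus is a hypothetical stand-in for the real lease status type.
    type leaseStatus struct {
        valid        bool
        ownerStoreID int
    }

    // shouldConsiderPausing mirrors the early-return guards at the top of
    // updatePausedFollowersLocked after this commit: pausing is only
    // considered when some store is overloaded, this replica is the raft
    // leader, and it also holds a valid lease.
    func shouldConsiderPausing(
        numOverloadedStores int, isRaftLeader bool, st leaseStatus, localStoreID int,
    ) bool {
        if numOverloadedStores == 0 || !isRaftLeader {
            return false
        }
        // New in this commit: if the lease is invalid or held by another
        // store, pausing could keep the remote leaseholder from learning
        // that its entries committed, effectively causing range
        // unavailability.
        if !st.valid || st.ownerStoreID != localStoreID {
            return false
        }
        return true
    }

    func main() {
        fmt.Println(shouldConsiderPausing(2, true, leaseStatus{valid: true, ownerStoreID: 1}, 1)) // true
        fmt.Println(shouldConsiderPausing(2, true, leaseStatus{valid: true, ownerStoreID: 2}, 1)) // false: lease held elsewhere
    }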
Release note: None --- pkg/kv/kvserver/replica_raft_overload.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/kv/kvserver/replica_raft_overload.go b/pkg/kv/kvserver/replica_raft_overload.go index 98e9a9ac33e2..155a2d171726 100644 --- a/pkg/kv/kvserver/replica_raft_overload.go +++ b/pkg/kv/kvserver/replica_raft_overload.go @@ -230,6 +230,15 @@ func (r *Replica) updatePausedFollowersLocked( return } + status := r.leaseStatusAtRLocked(ctx, r.Clock().NowAsClockTimestamp()) + if !status.IsValid() || !status.OwnedBy(r.StoreID()) { + // If we're not the leaseholder (which includes the case in which we just + // transferred the lease away), leave all followers unpaused. Otherwise, the + // leaseholder won't learn that the entries it submitted were committed + // which effectively causes range unavailability. + return + } + // When multiple followers are overloaded, we may not be able to exclude all // of them from replication traffic due to quorum constraints. We would like // a given Range to deterministically exclude the same store (chosen From e4ae047573f90b952299148123a4a6bd138e60a4 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Aug 2022 15:46:31 +0200 Subject: [PATCH 4/4] kvserver: never pause local replica As observed in #84884, an overloaded store that held leases could end up "pausing" replication traffic to itself. This (likely) had no practical effect since the leader never sends messages to itself, but it meant reporting bogus counts of paused replicas. This commit ensures that a raft leader will never pause itself. Release note: None --- pkg/kv/kvserver/replica_raft_overload.go | 5 +++-- pkg/kv/kvserver/replica_raft_overload_test.go | 4 ++++ pkg/kv/kvserver/testdata/replica_raft_overload/self.txt | 4 ++++ 3 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 pkg/kv/kvserver/testdata/replica_raft_overload/self.txt diff --git a/pkg/kv/kvserver/replica_raft_overload.go b/pkg/kv/kvserver/replica_raft_overload.go index 155a2d171726..003ede45322e 100644 --- a/pkg/kv/kvserver/replica_raft_overload.go +++ b/pkg/kv/kvserver/replica_raft_overload.go @@ -43,6 +43,7 @@ var pauseReplicationIOThreshold = settings.RegisterFloatSetting( ) type computeExpendableOverloadedFollowersInput struct { + self roachpb.ReplicaID replDescs roachpb.ReplicaSet // TODO(tbg): all entries are overloaded, so consdier removing the IOThreshold here // because it's confusing. 
@@ -104,11 +105,10 @@ func computeExpendableOverloadedFollowers( var nonLive map[roachpb.ReplicaID]nonLiveReason var liveOverloadedVoterCandidates map[roachpb.ReplicaID]struct{} var liveOverloadedNonVoterCandidates map[roachpb.ReplicaID]struct{} - var prs map[uint64]tracker.Progress for _, replDesc := range d.replDescs.AsProto() { - if _, overloaded := d.ioOverloadMap[replDesc.StoreID]; !overloaded { + if _, overloaded := d.ioOverloadMap[replDesc.StoreID]; !overloaded || replDesc.ReplicaID == d.self { continue } // There's at least one overloaded follower, so initialize @@ -252,6 +252,7 @@ func (r *Replica) updatePausedFollowersLocked( seed := int64(r.RangeID) now := r.store.Clock().Now().GoTime() d := computeExpendableOverloadedFollowersInput{ + self: r.replicaID, replDescs: r.descRLocked().Replicas(), ioOverloadMap: ioOverloadMap, getProgressMap: func(_ context.Context) map[uint64]tracker.Progress { diff --git a/pkg/kv/kvserver/replica_raft_overload_test.go b/pkg/kv/kvserver/replica_raft_overload_test.go index bc11a5b69a13..842d08cd268e 100644 --- a/pkg/kv/kvserver/replica_raft_overload_test.go +++ b/pkg/kv/kvserver/replica_raft_overload_test.go @@ -46,6 +46,7 @@ func TestReplicaRaftOverload_computeExpendableOverloadedFollowers(t *testing.T) require.Equal(t, "run", d.Cmd) var seed uint64 var replDescs roachpb.ReplicaSet + var self roachpb.ReplicaID ioOverloadMap := map[roachpb.StoreID]*admissionpb.IOThreshold{} snapshotMap := map[roachpb.ReplicaID]struct{}{} downMap := map[roachpb.ReplicaID]struct{}{} @@ -65,6 +66,8 @@ func TestReplicaRaftOverload_computeExpendableOverloadedFollowers(t *testing.T) switch arg.Key { case "min-live-match-index": minLiveMatchIndex = id + case "self": + self = roachpb.ReplicaID(id) case "voters", "learners": replicaID := roachpb.ReplicaID(id) if matchS != "" { @@ -134,6 +137,7 @@ func TestReplicaRaftOverload_computeExpendableOverloadedFollowers(t *testing.T) } m, _ := computeExpendableOverloadedFollowers(ctx, computeExpendableOverloadedFollowersInput{ + self: self, replDescs: replDescs, ioOverloadMap: ioOverloadMap, getProgressMap: getProgressMap, diff --git a/pkg/kv/kvserver/testdata/replica_raft_overload/self.txt b/pkg/kv/kvserver/testdata/replica_raft_overload/self.txt new file mode 100644 index 000000000000..c73da0d7eadf --- /dev/null +++ b/pkg/kv/kvserver/testdata/replica_raft_overload/self.txt @@ -0,0 +1,4 @@ +# Won't consider itself for pausing. +run voters=(1,2,3) overloaded=(1) self=1 +---- +[]
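For illustration, a minimal standalone sketch of the candidate filter that
this last commit adjusts, using hypothetical types rather than the
roachpb/kvserver structs. It reproduces the behavior exercised by the new
self.txt case above: with voters 1-3, only store 1 overloaded, and self=1,
no follower is a pausing candidate.

    package main

    import "fmt"

    // Hypothetical stand-ins for roachpb.ReplicaID and roachpb.StoreID.
    type replicaID int
    type storeID int

    type replDesc struct {
        replica replicaID
        store   storeID
    }

    // pausingCandidates mirrors the filter in
    // computeExpendableOverloadedFollowers: a replica is a candidate only if
    // its store is overloaded and (new in this commit) it is not the
    // leader's own replica.
    func pausingCandidates(descs []replDesc, overloaded map[storeID]bool, self replicaID) []replicaID {
        var out []replicaID
        for _, d := range descs {
            if !overloaded[d.store] || d.replica == self {
                continue
            }
            out = append(out, d.replica)
        }
        return out
    }

    func main() {
        descs := []replDesc{{replica: 1, store: 1}, {replica: 2, store: 2}, {replica: 3, store: 3}}
        // voters=(1,2,3) overloaded=(1) self=1 => no candidates, matching self.txt.
        fmt.Println(pausingCandidates(descs, map[storeID]bool{1: true}, 1)) // []
    }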