From fefe74eacd53d43abced0ab6d8b65b508a6efc00 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <grinaker@cockroachlabs.com>
Date: Sat, 17 Jun 2023 16:10:39 +0000
Subject: [PATCH] kvserver: don't campaign when unquiescing for a Raft message

This patch forgets the leader when unquiescing in response to a Raft
message and finding a dead leader (according to liveness). We don't
campaign, because that could result in election ties, but forgetting the
leader allows us to grant (pre)votes even though we've heard from the
leader recently, avoiding having to wait out an election timeout.

Epic: none
Release note: None
---
 pkg/kv/kvserver/client_raft_test.go     | 142 ++++++++++++++++++++++++
 pkg/kv/kvserver/helpers_test.go         |   4 +
 pkg/kv/kvserver/replica_raft.go         |  79 ++++++++++---
 pkg/kv/kvserver/replica_raft_quiesce.go |  12 +-
 pkg/kv/kvserver/store_raft.go           |   3 +-
 pkg/kv/kvserver/testing_knobs.go        |   4 +
 6 files changed, 225 insertions(+), 19 deletions(-)

diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go
index fdcf3ddbde18..e856c51f6076 100644
--- a/pkg/kv/kvserver/client_raft_test.go
+++ b/pkg/kv/kvserver/client_raft_test.go
@@ -6689,3 +6689,145 @@ func TestRaftUnquiesceLeaderNoProposal(t *testing.T) {
 	require.Equal(t, initialStatus.Progress[1].Match, status.Progress[1].Match)
 	t.Logf("n1 still leader with no new proposals at log index %d", status.Progress[1].Match)
 }
+
+// TestRaftPreVoteUnquiesceDeadLeader tests that if a quorum of replicas independently
+// consider the leader dead, they can successfully hold an election despite
+// having recently heard from a leader under the PreVote+CheckQuorum condition.
+// It also tests that it does not result in an election tie.
+//
+// We quiesce the range and partition away the leader as such:
+//
+//	               n1 (leader)
+//	              x  x
+//	             x    x
+//	(follower) n2 ---- n3 (follower)
+//
+// We also mark the leader as dead in liveness, and then unquiesce n2. This
+// should detect the dead leader via liveness, transition to pre-candidate, and
+// solicit a prevote from n3. This should cause n3 to unquiesce, detect the dead
+// leader, forget it (becoming a leaderless follower), and grant the prevote.
+func TestRaftPreVoteUnquiesceDeadLeader(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	// Timing-sensitive, so skip under deadlock detector and stressrace.
+	skip.UnderDeadlock(t)
+	skip.UnderStressRace(t)
+
+	ctx := context.Background()
+	manualClock := hlc.NewHybridManualClock()
+
+	// Disable expiration-based leases, since these prevent quiescence.
+	st := cluster.MakeTestingClusterSettings()
+	kvserver.TransferExpirationLeasesFirstEnabled.Override(ctx, &st.SV, false)
+	kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false)
+
+	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
+		ReplicationMode: base.ReplicationManual,
+		ServerArgs: base.TestServerArgs{
+			Settings: st,
+			RaftConfig: base.RaftConfig{
+				RaftEnableCheckQuorum: true,
+				RaftTickInterval:      200 * time.Millisecond, // speed up test
+				// Set an large election timeout. We don't want replicas to call
+				// elections due to timeouts, we want them to campaign and obtain
+				// prevotes because they detected a dead leader when unquiescing.
+				RaftElectionTimeoutTicks: 100,
+			},
+			Knobs: base.TestingKnobs{
+				Server: &server.TestingKnobs{
+					WallClock: manualClock,
+				},
+				Store: &kvserver.StoreTestingKnobs{
+					DisableLivenessMapConnHealth: true, // to mark n1 as not live
+				},
+			},
+		},
+	})
+	defer tc.Stopper().Stop(ctx)
+
+	logStatus := func(s *raft.Status) {
+		t.Helper()
+		require.NotNil(t, s)
+		t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit)
+	}
+
+	// Create a range, upreplicate it, and replicate a write.
+	sender := tc.GetFirstStoreFromServer(t, 0).TestSender()
+	key := tc.ScratchRange(t)
+	desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
+
+	_, pErr := kv.SendWrapped(ctx, sender, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{1, 1, 1})
+
+	repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+
+	// Set up a complete partition for n1, but don't activate it yet.
+	var partitioned atomic.Bool
+	dropRaftMessagesFrom(t, tc.Servers[0], desc.RangeID, []roachpb.ReplicaID{2, 3}, &partitioned)
+	dropRaftMessagesFrom(t, tc.Servers[1], desc.RangeID, []roachpb.ReplicaID{1}, &partitioned)
+	dropRaftMessagesFrom(t, tc.Servers[2], desc.RangeID, []roachpb.ReplicaID{1}, &partitioned)
+
+	// Make sure the lease is on n1 and that everyone has applied it.
+	tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(0))
+	_, pErr = kv.SendWrapped(ctx, sender, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{2, 2, 2})
+	t.Logf("n1 has lease")
+
+	// Wait for the range to quiesce.
+	require.Eventually(t, func() bool {
+		return repl1.IsQuiescent() && repl2.IsQuiescent() && repl3.IsQuiescent()
+	}, 10*time.Second, 100*time.Millisecond)
+	t.Logf("n1, n2, and n3 quiesced")
+
+	// Partition n1.
+	partitioned.Store(true)
+	t.Logf("n1 partitioned")
+
+	// Pause n1's heartbeats and move the clock to expire its lease and liveness.
+	l1 := tc.Server(0).NodeLiveness().(*liveness.NodeLiveness)
+	resumeHeartbeats := l1.PauseAllHeartbeatsForTest()
+	defer resumeHeartbeats()
+
+	lv, ok := l1.Self()
+	require.True(t, ok)
+	manualClock.Forward(lv.Expiration.WallTime + 1)
+
+	for i := 0; i < tc.NumServers(); i++ {
+		isLive, err := tc.Server(i).NodeLiveness().(*liveness.NodeLiveness).IsLive(1)
+		require.NoError(t, err)
+		require.False(t, isLive)
+		tc.GetFirstStoreFromServer(t, i).UpdateLivenessMap()
+	}
+	t.Logf("n1 not live")
+
+	// Fetch the leader's initial status.
+	initialStatus := repl1.RaftStatus()
+	require.Equal(t, raft.StateLeader, initialStatus.RaftState)
+	logStatus(initialStatus)
+
+	// Unquiesce n2. This should cause it to see n1 as dead and immediately
+	// transition to pre-candidate, sending prevotes to n3. When n3 receives the
+	// prevote request and unquiesces, it will also see n1 as dead, and become a
+	// leaderless follower which enables it to grant n2's prevote despite having
+	// heard from n1 recently, and they can hold an election.
+	//
+	// n2 always wins, since n3 shouldn't become a candidate, only a leaderless
+	// follower, and we've disabled the election timeout.
+	require.True(t, repl2.MaybeUnquiesce())
+	t.Logf("n2 unquiesced")
+
+	require.Eventually(t, func() bool {
+		status := repl2.RaftStatus()
+		logStatus(status)
+		return status.RaftState == raft.StateLeader
+	}, 5*time.Second, 500*time.Millisecond)
+	t.Logf("n2 is leader")
+}
diff --git a/pkg/kv/kvserver/helpers_test.go b/pkg/kv/kvserver/helpers_test.go
index 1039d7383f39..7c463ecec827 100644
--- a/pkg/kv/kvserver/helpers_test.go
+++ b/pkg/kv/kvserver/helpers_test.go
@@ -97,6 +97,10 @@ func (s *Store) ComputeMVCCStats() (enginepb.MVCCStats, error) {
 	return totalStats, err
 }
 
+func (s *Store) UpdateLivenessMap() {
+	s.updateLivenessMap()
+}
+
 // ConsistencyQueueShouldQueue invokes the shouldQueue method on the
 // store's consistency queue.
 func ConsistencyQueueShouldQueue(
diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go
index f49b0c355fda..5cd10631f401 100644
--- a/pkg/kv/kvserver/replica_raft.go
+++ b/pkg/kv/kvserver/replica_raft.go
@@ -567,11 +567,11 @@ var errRemoved = errors.New("replica removed")
 func (r *Replica) stepRaftGroup(req *kvserverpb.RaftMessageRequest) error {
 	// We're processing an incoming raft message (from a batch that may
 	// include MsgVotes), so don't campaign if we wake up our raft
-	// group.
-	return r.withRaftGroup(false, func(raftGroup *raft.RawNode) (bool, error) {
+	// group to avoid election ties.
+	const mayCampaign = false
+	return r.withRaftGroup(mayCampaign, func(raftGroup *raft.RawNode) (bool, error) {
 		// If we're a follower, and we receive a message from a non-leader replica
-		// while quiesced, we wake up the leader too. This prevents spurious
-		// elections.
+		// while quiesced, we wake up the leader too to prevent spurious elections.
 		//
 		// This typically happens in the case of a partial network partition where
 		// some other replica is partitioned away from the leader but can reach this
@@ -584,19 +584,17 @@ func (r *Replica) stepRaftGroup(req *kvserverpb.RaftMessageRequest) error {
 		// up-to-date on the log), then we'll immediately transfer leadership back
 		// to the leaseholder, i.e. the old leader, and the cycle repeats.
 		//
+		// Even though we don't campaign, if we find our leader dead according
+		// to liveness we'll forget it and become a leaderless follower, allowing
+		// us to grant (pre)votes. See forgetLeaderLocked().
+		//
 		// Note that such partial partitions will typically result in persistent
 		// mass unquiescence due to the continuous prevotes.
 		if r.mu.quiescent {
 			st := r.raftBasicStatusRLocked()
 			hasLeader := st.RaftState == raft.StateFollower && st.Lead != 0
 			fromLeader := uint64(req.FromReplica.ReplicaID) == st.Lead
-
-			var wakeLeader, mayCampaign bool
-			if hasLeader && !fromLeader {
-				// TODO(erikgrinaker): This is likely to result in election ties, find
-				// some way to avoid that.
-				wakeLeader, mayCampaign = true, true
-			}
+			wakeLeader := hasLeader && !fromLeader
 			r.maybeUnquiesceLocked(wakeLeader, mayCampaign)
 		}
 		r.mu.lastUpdateTimes.update(req.FromReplica.ReplicaID, timeutil.Now())
@@ -2116,9 +2114,19 @@ func shouldCampaignOnWake(
 	return !livenessEntry.IsLive
 }
 
-// maybeCampaignOnWakeLocked is called when the range wakes from a
-// dormant state (either the initial "raftGroup == nil" state or after
-// being quiescent) and campaigns for raft leadership if appropriate.
+// maybeCampaignOnWakeLocked is called when the replica wakes from a dormant
+// state (either the initial "raftGroup == nil" state or after being quiescent),
+// and campaigns for raft leadership if appropriate: if it has no leader, or it
+// finds a dead leader in liveness.
+//
+// This will use PreVote+CheckQuorum, so it won't disturb an established leader
+// if one currently exists. However, if other replicas wake up to find a dead
+// leader in liveness, they will forget it and grant us a prevote, allowing us
+// to win an election immediately if a quorum considers the leader dead.
+//
+// This may result in a tie if multiple replicas wake simultaneously. However,
+// if other replicas wake in response to our (pre)vote request, they won't
+// campaign to avoid the tie.
 func (r *Replica) maybeCampaignOnWakeLocked(ctx context.Context) {
 	// Raft panics if a node that is not currently a member of the
 	// group tries to campaign. This method should never be called
@@ -2139,6 +2147,47 @@ func (r *Replica) maybeCampaignOnWakeLocked(ctx context.Context) {
 	}
 }
 
+// maybeForgetLeaderOnWakeLocked is called when the replica wakes from being
+// quiescent and should not campaign for leadership (to avoid election ties).
+// If it is a follower of a now-dead leader (according to liveness) it will
+// forget the leader to allow granting (pre)votes to a campaigner, such that if
+// a quorum see the leader dead they can elect a new leader immediately. Not
+// relevant when initializing the raft group, since it doesn't have a leader.
+func (r *Replica) maybeForgetLeaderOnWakeLocked(ctx context.Context) {
+	raftStatus := r.mu.internalRaftGroup.BasicStatus()
+	livenessMap, _ := r.store.livenessMap.Load().(livenesspb.IsLiveMap)
+	if shouldForgetLeaderOnWake(raftStatus, livenessMap, r.descRLocked()) {
+		r.forgetLeaderLocked(ctx)
+	}
+}
+
+func shouldForgetLeaderOnWake(
+	raftStatus raft.BasicStatus,
+	livenessMap livenesspb.IsLiveMap,
+	desc *roachpb.RangeDescriptor,
+) bool {
+	// If we're not a follower with a leader, there's noone to forget.
+	if raftStatus.RaftState != raft.StateFollower || raftStatus.Lead == raft.None {
+		return false
+	}
+
+	// If the leader isn't in our descriptor, assume it was removed and
+	// forget about it.
+	replDesc, ok := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(raftStatus.Lead))
+	if !ok {
+		return true
+	}
+
+	// If we don't know about the leader's liveness, assume it's dead and forget it.
+	livenessEntry, ok := livenessMap[replDesc.NodeID]
+	if !ok {
+		return true
+	}
+
+	// Forget the leader if it's no longer live.
+	return !livenessEntry.IsLive
+}
+
 // shouldCampaignOnLeaseRequestRedirect returns whether a replica that is
 // redirecting a lease request to its range's Raft leader should simultaneously
 // campaign to acquire Raft leadership itself. A follower replica may want to
@@ -2273,7 +2322,7 @@ func (r *Replica) forceCampaignLocked(ctx context.Context) {
 // timeout. However, if they independently see the leader dead in liveness when
 // unquiescing, they can forget the leader and grant the prevote. If a quorum of
 // replicas independently consider the leader dead, the candidate wins the
-// election.  If the leader isn't dead after all, the replica will revert to a
+// election. If the leader isn't dead after all, the replica will revert to a
 // follower upon hearing from it.
 //
 // In particular, since a quorum must agree that the leader is dead and forget
diff --git a/pkg/kv/kvserver/replica_raft_quiesce.go b/pkg/kv/kvserver/replica_raft_quiesce.go
index 57430f9ab92d..36fa5f6d475c 100644
--- a/pkg/kv/kvserver/replica_raft_quiesce.go
+++ b/pkg/kv/kvserver/replica_raft_quiesce.go
@@ -69,9 +69,13 @@ func (r *Replica) maybeUnquiesce(wakeLeader, mayCampaign bool) bool {
 // command anyway, or it knows the leader is awake because it received a message
 // from it.
 //
-// If mayCampaign is true, the replica may campaign if appropriate. This will
-// respect PreVote and CheckQuorum, and thus won't disrupt a current leader.
-// Should typically be true, unless the caller wants to avoid election ties.
+// If mayCampaign is true, the replica may campaign if it thinks the leader has
+// died in the meanwhile. This will respect PreVote and CheckQuorum, and thus
+// won't disrupt a current leader. Otherwise, if the leader is dead it will
+// forget about it and become a leaderless follower. Thus, if a quorum of
+// replicas independently consider the leader to be dead when unquiescing, they
+// can hold an election immediately despite PreVote+CheckQuorum. Should
+// typically be true, unless the caller wants to avoid election ties.
 func (r *Replica) maybeUnquiesceLocked(wakeLeader, mayCampaign bool) bool {
 	if !r.canUnquiesceRLocked() {
 		return false
@@ -106,6 +110,8 @@ func (r *Replica) maybeUnquiesceLocked(wakeLeader, mayCampaign bool) bool {
 	// we're wrong about it being dead.
 	if mayCampaign {
 		r.maybeCampaignOnWakeLocked(ctx)
+	} else {
+		r.maybeForgetLeaderOnWakeLocked(ctx)
 	}
 
 	return true
diff --git a/pkg/kv/kvserver/store_raft.go b/pkg/kv/kvserver/store_raft.go
index 0c06b7e29cbb..61b8d432c291 100644
--- a/pkg/kv/kvserver/store_raft.go
+++ b/pkg/kv/kvserver/store_raft.go
@@ -817,7 +817,8 @@ func (s *Store) updateLivenessMap() {
 		// will continually probe the connection. The check can also have false
 		// positives if the node goes down after populating the map, but that
 		// matters even less.
-		entry.IsLive = (s.cfg.NodeDialer.ConnHealth(nodeID, rpc.SystemClass) == nil)
+		entry.IsLive = !s.TestingKnobs().DisableLivenessMapConnHealth &&
+			(s.cfg.NodeDialer.ConnHealth(nodeID, rpc.SystemClass) == nil)
 		nextMap[nodeID] = entry
 	}
 	s.livenessMap.Store(nextMap)
diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go
index 02cf4c88d016..0b26fe770412 100644
--- a/pkg/kv/kvserver/testing_knobs.go
+++ b/pkg/kv/kvserver/testing_knobs.go
@@ -261,6 +261,10 @@ type StoreTestingKnobs struct {
 	RefreshReasonTicksPeriod int
 	// DisableProcessRaft disables the process raft loop.
 	DisableProcessRaft func(roachpb.StoreID) bool
+	// DisableLivenessMapConnHealth disables the ConnHealth check in
+	// updateIsLiveMap, which is useful in tests where we manipulate the node's
+	// liveness record but still keep the connection alive.
+	DisableLivenessMapConnHealth bool
 	// DisableLastProcessedCheck disables checking on replica queue last processed times.
 	DisableLastProcessedCheck bool
 	// ReplicateQueueAcceptsUnsplit allows the replication queue to