diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index fdcf3ddbde18..e856c51f6076 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -6689,3 +6689,145 @@ func TestRaftUnquiesceLeaderNoProposal(t *testing.T) { require.Equal(t, initialStatus.Progress[1].Match, status.Progress[1].Match) t.Logf("n1 still leader with no new proposals at log index %d", status.Progress[1].Match) } + +// TestRaftPreVoteUnquiesceDeadLeader tests that if a quorum of replicas independently +// consider the leader dead, they can successfully hold an election despite +// having recently heard from a leader under the PreVote+CheckQuorum condition. +// It also tests that it does not result in an election tie. +// +// We quiesce the range and partition away the leader as such: +// +// n1 (leader) +// x x +// x x +// (follower) n2 ---- n3 (follower) +// +// We also mark the leader as dead in liveness, and then unquiesce n2. This +// should detect the dead leader via liveness, transition to pre-candidate, and +// solicit a prevote from n3. This should cause n3 to unquiesce, detect the dead +// leader, forget it (becoming a leaderless follower), and grant the prevote. +func TestRaftPreVoteUnquiesceDeadLeader(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + // Timing-sensitive, so skip under deadlock detector and stressrace. + skip.UnderDeadlock(t) + skip.UnderStressRace(t) + + ctx := context.Background() + manualClock := hlc.NewHybridManualClock() + + // Disable expiration-based leases, since these prevent quiescence. + st := cluster.MakeTestingClusterSettings() + kvserver.TransferExpirationLeasesFirstEnabled.Override(ctx, &st.SV, false) + kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false) + + tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ + ReplicationMode: base.ReplicationManual, + ServerArgs: base.TestServerArgs{ + Settings: st, + RaftConfig: base.RaftConfig{ + RaftEnableCheckQuorum: true, + RaftTickInterval: 200 * time.Millisecond, // speed up test + // Set an large election timeout. We don't want replicas to call + // elections due to timeouts, we want them to campaign and obtain + // prevotes because they detected a dead leader when unquiescing. + RaftElectionTimeoutTicks: 100, + }, + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + WallClock: manualClock, + }, + Store: &kvserver.StoreTestingKnobs{ + DisableLivenessMapConnHealth: true, // to mark n1 as not live + }, + }, + }, + }) + defer tc.Stopper().Stop(ctx) + + logStatus := func(s *raft.Status) { + t.Helper() + require.NotNil(t, s) + t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit) + } + + // Create a range, upreplicate it, and replicate a write. + sender := tc.GetFirstStoreFromServer(t, 0).TestSender() + key := tc.ScratchRange(t) + desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...) + + _, pErr := kv.SendWrapped(ctx, sender, incrementArgs(key, 1)) + require.NoError(t, pErr.GoError()) + tc.WaitForValues(t, key, []int64{1, 1, 1}) + + repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID) + require.NoError(t, err) + repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID) + require.NoError(t, err) + repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID) + require.NoError(t, err) + + // Set up a complete partition for n1, but don't activate it yet. + var partitioned atomic.Bool + dropRaftMessagesFrom(t, tc.Servers[0], desc.RangeID, []roachpb.ReplicaID{2, 3}, &partitioned) + dropRaftMessagesFrom(t, tc.Servers[1], desc.RangeID, []roachpb.ReplicaID{1}, &partitioned) + dropRaftMessagesFrom(t, tc.Servers[2], desc.RangeID, []roachpb.ReplicaID{1}, &partitioned) + + // Make sure the lease is on n1 and that everyone has applied it. + tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(0)) + _, pErr = kv.SendWrapped(ctx, sender, incrementArgs(key, 1)) + require.NoError(t, pErr.GoError()) + tc.WaitForValues(t, key, []int64{2, 2, 2}) + t.Logf("n1 has lease") + + // Wait for the range to quiesce. + require.Eventually(t, func() bool { + return repl1.IsQuiescent() && repl2.IsQuiescent() && repl3.IsQuiescent() + }, 10*time.Second, 100*time.Millisecond) + t.Logf("n1, n2, and n3 quiesced") + + // Partition n1. + partitioned.Store(true) + t.Logf("n1 partitioned") + + // Pause n1's heartbeats and move the clock to expire its lease and liveness. + l1 := tc.Server(0).NodeLiveness().(*liveness.NodeLiveness) + resumeHeartbeats := l1.PauseAllHeartbeatsForTest() + defer resumeHeartbeats() + + lv, ok := l1.Self() + require.True(t, ok) + manualClock.Forward(lv.Expiration.WallTime + 1) + + for i := 0; i < tc.NumServers(); i++ { + isLive, err := tc.Server(i).NodeLiveness().(*liveness.NodeLiveness).IsLive(1) + require.NoError(t, err) + require.False(t, isLive) + tc.GetFirstStoreFromServer(t, i).UpdateLivenessMap() + } + t.Logf("n1 not live") + + // Fetch the leader's initial status. + initialStatus := repl1.RaftStatus() + require.Equal(t, raft.StateLeader, initialStatus.RaftState) + logStatus(initialStatus) + + // Unquiesce n2. This should cause it to see n1 as dead and immediately + // transition to pre-candidate, sending prevotes to n3. When n3 receives the + // prevote request and unquiesces, it will also see n1 as dead, and become a + // leaderless follower which enables it to grant n2's prevote despite having + // heard from n1 recently, and they can hold an election. + // + // n2 always wins, since n3 shouldn't become a candidate, only a leaderless + // follower, and we've disabled the election timeout. + require.True(t, repl2.MaybeUnquiesce()) + t.Logf("n2 unquiesced") + + require.Eventually(t, func() bool { + status := repl2.RaftStatus() + logStatus(status) + return status.RaftState == raft.StateLeader + }, 5*time.Second, 500*time.Millisecond) + t.Logf("n2 is leader") +} diff --git a/pkg/kv/kvserver/helpers_test.go b/pkg/kv/kvserver/helpers_test.go index 1039d7383f39..7c463ecec827 100644 --- a/pkg/kv/kvserver/helpers_test.go +++ b/pkg/kv/kvserver/helpers_test.go @@ -97,6 +97,10 @@ func (s *Store) ComputeMVCCStats() (enginepb.MVCCStats, error) { return totalStats, err } +func (s *Store) UpdateLivenessMap() { + s.updateLivenessMap() +} + // ConsistencyQueueShouldQueue invokes the shouldQueue method on the // store's consistency queue. func ConsistencyQueueShouldQueue( diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index f49b0c355fda..5cd10631f401 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -567,11 +567,11 @@ var errRemoved = errors.New("replica removed") func (r *Replica) stepRaftGroup(req *kvserverpb.RaftMessageRequest) error { // We're processing an incoming raft message (from a batch that may // include MsgVotes), so don't campaign if we wake up our raft - // group. - return r.withRaftGroup(false, func(raftGroup *raft.RawNode) (bool, error) { + // group to avoid election ties. + const mayCampaign = false + return r.withRaftGroup(mayCampaign, func(raftGroup *raft.RawNode) (bool, error) { // If we're a follower, and we receive a message from a non-leader replica - // while quiesced, we wake up the leader too. This prevents spurious - // elections. + // while quiesced, we wake up the leader too to prevent spurious elections. // // This typically happens in the case of a partial network partition where // some other replica is partitioned away from the leader but can reach this @@ -584,19 +584,17 @@ func (r *Replica) stepRaftGroup(req *kvserverpb.RaftMessageRequest) error { // up-to-date on the log), then we'll immediately transfer leadership back // to the leaseholder, i.e. the old leader, and the cycle repeats. // + // Even though we don't campaign, if we find our leader dead according + // to liveness we'll forget it and become a leaderless follower, allowing + // us to grant (pre)votes. See forgetLeaderLocked(). + // // Note that such partial partitions will typically result in persistent // mass unquiescence due to the continuous prevotes. if r.mu.quiescent { st := r.raftBasicStatusRLocked() hasLeader := st.RaftState == raft.StateFollower && st.Lead != 0 fromLeader := uint64(req.FromReplica.ReplicaID) == st.Lead - - var wakeLeader, mayCampaign bool - if hasLeader && !fromLeader { - // TODO(erikgrinaker): This is likely to result in election ties, find - // some way to avoid that. - wakeLeader, mayCampaign = true, true - } + wakeLeader := hasLeader && !fromLeader r.maybeUnquiesceLocked(wakeLeader, mayCampaign) } r.mu.lastUpdateTimes.update(req.FromReplica.ReplicaID, timeutil.Now()) @@ -2116,9 +2114,19 @@ func shouldCampaignOnWake( return !livenessEntry.IsLive } -// maybeCampaignOnWakeLocked is called when the range wakes from a -// dormant state (either the initial "raftGroup == nil" state or after -// being quiescent) and campaigns for raft leadership if appropriate. +// maybeCampaignOnWakeLocked is called when the replica wakes from a dormant +// state (either the initial "raftGroup == nil" state or after being quiescent), +// and campaigns for raft leadership if appropriate: if it has no leader, or it +// finds a dead leader in liveness. +// +// This will use PreVote+CheckQuorum, so it won't disturb an established leader +// if one currently exists. However, if other replicas wake up to find a dead +// leader in liveness, they will forget it and grant us a prevote, allowing us +// to win an election immediately if a quorum considers the leader dead. +// +// This may result in a tie if multiple replicas wake simultaneously. However, +// if other replicas wake in response to our (pre)vote request, they won't +// campaign to avoid the tie. func (r *Replica) maybeCampaignOnWakeLocked(ctx context.Context) { // Raft panics if a node that is not currently a member of the // group tries to campaign. This method should never be called @@ -2139,6 +2147,47 @@ func (r *Replica) maybeCampaignOnWakeLocked(ctx context.Context) { } } +// maybeForgetLeaderOnWakeLocked is called when the replica wakes from being +// quiescent and should not campaign for leadership (to avoid election ties). +// If it is a follower of a now-dead leader (according to liveness) it will +// forget the leader to allow granting (pre)votes to a campaigner, such that if +// a quorum see the leader dead they can elect a new leader immediately. Not +// relevant when initializing the raft group, since it doesn't have a leader. +func (r *Replica) maybeForgetLeaderOnWakeLocked(ctx context.Context) { + raftStatus := r.mu.internalRaftGroup.BasicStatus() + livenessMap, _ := r.store.livenessMap.Load().(livenesspb.IsLiveMap) + if shouldForgetLeaderOnWake(raftStatus, livenessMap, r.descRLocked()) { + r.forgetLeaderLocked(ctx) + } +} + +func shouldForgetLeaderOnWake( + raftStatus raft.BasicStatus, + livenessMap livenesspb.IsLiveMap, + desc *roachpb.RangeDescriptor, +) bool { + // If we're not a follower with a leader, there's noone to forget. + if raftStatus.RaftState != raft.StateFollower || raftStatus.Lead == raft.None { + return false + } + + // If the leader isn't in our descriptor, assume it was removed and + // forget about it. + replDesc, ok := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(raftStatus.Lead)) + if !ok { + return true + } + + // If we don't know about the leader's liveness, assume it's dead and forget it. + livenessEntry, ok := livenessMap[replDesc.NodeID] + if !ok { + return true + } + + // Forget the leader if it's no longer live. + return !livenessEntry.IsLive +} + // shouldCampaignOnLeaseRequestRedirect returns whether a replica that is // redirecting a lease request to its range's Raft leader should simultaneously // campaign to acquire Raft leadership itself. A follower replica may want to @@ -2273,7 +2322,7 @@ func (r *Replica) forceCampaignLocked(ctx context.Context) { // timeout. However, if they independently see the leader dead in liveness when // unquiescing, they can forget the leader and grant the prevote. If a quorum of // replicas independently consider the leader dead, the candidate wins the -// election. If the leader isn't dead after all, the replica will revert to a +// election. If the leader isn't dead after all, the replica will revert to a // follower upon hearing from it. // // In particular, since a quorum must agree that the leader is dead and forget diff --git a/pkg/kv/kvserver/replica_raft_quiesce.go b/pkg/kv/kvserver/replica_raft_quiesce.go index 57430f9ab92d..36fa5f6d475c 100644 --- a/pkg/kv/kvserver/replica_raft_quiesce.go +++ b/pkg/kv/kvserver/replica_raft_quiesce.go @@ -69,9 +69,13 @@ func (r *Replica) maybeUnquiesce(wakeLeader, mayCampaign bool) bool { // command anyway, or it knows the leader is awake because it received a message // from it. // -// If mayCampaign is true, the replica may campaign if appropriate. This will -// respect PreVote and CheckQuorum, and thus won't disrupt a current leader. -// Should typically be true, unless the caller wants to avoid election ties. +// If mayCampaign is true, the replica may campaign if it thinks the leader has +// died in the meanwhile. This will respect PreVote and CheckQuorum, and thus +// won't disrupt a current leader. Otherwise, if the leader is dead it will +// forget about it and become a leaderless follower. Thus, if a quorum of +// replicas independently consider the leader to be dead when unquiescing, they +// can hold an election immediately despite PreVote+CheckQuorum. Should +// typically be true, unless the caller wants to avoid election ties. func (r *Replica) maybeUnquiesceLocked(wakeLeader, mayCampaign bool) bool { if !r.canUnquiesceRLocked() { return false @@ -106,6 +110,8 @@ func (r *Replica) maybeUnquiesceLocked(wakeLeader, mayCampaign bool) bool { // we're wrong about it being dead. if mayCampaign { r.maybeCampaignOnWakeLocked(ctx) + } else { + r.maybeForgetLeaderOnWakeLocked(ctx) } return true diff --git a/pkg/kv/kvserver/store_raft.go b/pkg/kv/kvserver/store_raft.go index 0c06b7e29cbb..61b8d432c291 100644 --- a/pkg/kv/kvserver/store_raft.go +++ b/pkg/kv/kvserver/store_raft.go @@ -817,7 +817,8 @@ func (s *Store) updateLivenessMap() { // will continually probe the connection. The check can also have false // positives if the node goes down after populating the map, but that // matters even less. - entry.IsLive = (s.cfg.NodeDialer.ConnHealth(nodeID, rpc.SystemClass) == nil) + entry.IsLive = !s.TestingKnobs().DisableLivenessMapConnHealth && + (s.cfg.NodeDialer.ConnHealth(nodeID, rpc.SystemClass) == nil) nextMap[nodeID] = entry } s.livenessMap.Store(nextMap) diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index 02cf4c88d016..0b26fe770412 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -261,6 +261,10 @@ type StoreTestingKnobs struct { RefreshReasonTicksPeriod int // DisableProcessRaft disables the process raft loop. DisableProcessRaft func(roachpb.StoreID) bool + // DisableLivenessMapConnHealth disables the ConnHealth check in + // updateIsLiveMap, which is useful in tests where we manipulate the node's + // liveness record but still keep the connection alive. + DisableLivenessMapConnHealth bool // DisableLastProcessedCheck disables checking on replica queue last processed times. DisableLastProcessedCheck bool // ReplicateQueueAcceptsUnsplit allows the replication queue to