From 4038ef8c23a9d8f712f7dac81bddbe09d64c4f4a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 15 Jun 2023 12:53:51 +0000 Subject: [PATCH 1/3] kvserver: clarify Raft campaign behavior with PreVote+CheckQuorum This patch tweaks and clarifies explicit campaign behavior when Raft PreVote and CheckQuorum are enabled. In this case, followers will only grant prevotes if they haven't heard from a leader in the past election timeout. It also adds a test covering this. Epic: none Release note: None --- pkg/base/config.go | 3 +- pkg/kv/kvserver/client_raft_test.go | 77 ++++++++++++++++++++ pkg/kv/kvserver/helpers_test.go | 7 ++ pkg/kv/kvserver/replica_proposal_buf.go | 9 ++- pkg/kv/kvserver/replica_proposal_buf_test.go | 6 ++ pkg/kv/kvserver/replica_raft.go | 18 +++++ 6 files changed, 116 insertions(+), 4 deletions(-) diff --git a/pkg/base/config.go b/pkg/base/config.go index e5899c3a72f8..e5385724d17e 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -256,7 +256,8 @@ var ( // partition from stealing away leadership from an established leader // (assuming they have an up-to-date log, which they do with a read-only // workload). With asymmetric or partial network partitions, this can allow an - // unreachable node to steal away leadership, leading to range unavailability. + // unreachable node to steal leadership away from the leaseholder, leading to + // range unavailability if the leaseholder can no longer reach the leader. // // The asymmetric partition concerns have largely been addressed by RPC // dialback (see rpc.dialback.enabled), but the partial partition concerns diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index 72fdbe5b30b1..f86f0eabd316 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -5933,6 +5933,83 @@ func TestRaftSnapshotsWithMVCCRangeKeysEverywhere(t *testing.T) { } } +// TestRaftCampaignPreVoteCheckQuorum tests that campaignLocked() respects +// PreVote+CheckQuorum, by not granting prevotes if there is an active leader. +func TestRaftCampaignPreVoteCheckQuorum(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + // Timing-sensitive, so skip under deadlock detector and stressrace. + skip.UnderDeadlock(t) + skip.UnderStressRace(t) + + ctx := context.Background() + + tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ + ReplicationMode: base.ReplicationManual, + ServerArgs: base.TestServerArgs{ + RaftConfig: base.RaftConfig{ + RaftEnableCheckQuorum: true, + RaftTickInterval: 100 * time.Millisecond, // speed up test + }, + }, + }) + defer tc.Stopper().Stop(ctx) + + logStatus := func(s *raft.Status) { + t.Helper() + require.NotNil(t, s) + t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit) + } + + sender := tc.GetFirstStoreFromServer(t, 0).TestSender() + + // Create a range, upreplicate it, and replicate a write. + key := tc.ScratchRange(t) + desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...) + _, pErr := kv.SendWrapped(ctx, sender, incrementArgs(key, 1)) + require.NoError(t, pErr.GoError()) + tc.WaitForValues(t, key, []int64{1, 1, 1}) + + repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID) + require.NoError(t, err) + repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID) + require.NoError(t, err) + repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID) + require.NoError(t, err) + repls := []*kvserver.Replica{repl1, repl2, repl3} + + // Make sure n1 is leader. 
+ initialStatus := repl1.RaftStatus() + require.Equal(t, raft.StateLeader, initialStatus.RaftState) + logStatus(initialStatus) + t.Logf("n1 is leader") + + // Campaign n3. It shouldn't win prevotes, reverting to follower + // in the current term. + repl3.Campaign(ctx) + t.Logf("n3 campaigning") + + require.Eventually(t, func() bool { + status := repl3.RaftStatus() + logStatus(status) + return status.RaftState == raft.StateFollower + }, 10*time.Second, 500*time.Millisecond) + t.Logf("n3 reverted to follower") + + // n1 should still be the leader in the same term, with n2 and n3 followers. + for _, repl := range repls { + st := repl.RaftStatus() + logStatus(st) + if st.ID == 1 { + require.Equal(t, raft.StateLeader, st.RaftState) + } else { + require.Equal(t, raft.StateFollower, st.RaftState) + } + require.Equal(t, initialStatus.Term, st.Term) + } +} + // TestRaftPreVote tests that Raft PreVote works properly, including the recent // leader check only enabled via CheckQuorum. Specifically, a replica that's // partitioned away from the leader (or restarted) should not be able to call an diff --git a/pkg/kv/kvserver/helpers_test.go b/pkg/kv/kvserver/helpers_test.go index 04a550293da0..c6fd7b88b7ff 100644 --- a/pkg/kv/kvserver/helpers_test.go +++ b/pkg/kv/kvserver/helpers_test.go @@ -282,6 +282,13 @@ func (r *Replica) LastAssignedLeaseIndex() kvpb.LeaseAppliedIndex { return r.mu.proposalBuf.LastAssignedLeaseIndexRLocked() } +// Campaign campaigns the replica. +func (r *Replica) Campaign(ctx context.Context) { + r.mu.Lock() + defer r.mu.Unlock() + r.campaignLocked(ctx) +} + // LastAssignedLeaseIndexRLocked is like LastAssignedLeaseIndex, but requires // b.mu to be held in read mode. func (b *propBuf) LastAssignedLeaseIndexRLocked() kvpb.LeaseAppliedIndex { diff --git a/pkg/kv/kvserver/replica_proposal_buf.go b/pkg/kv/kvserver/replica_proposal_buf.go index d97220444277..593cbc47cd0f 100644 --- a/pkg/kv/kvserver/replica_proposal_buf.go +++ b/pkg/kv/kvserver/replica_proposal_buf.go @@ -162,6 +162,7 @@ type proposer interface { // The following require the proposer to hold an exclusive lock. withGroupLocked(func(proposerRaft) error) error registerProposalLocked(*ProposalData) + campaignLocked(ctx context.Context) // rejectProposalWithRedirectLocked rejects a proposal and redirects the // proposer to try it on another node. 
This is used to sometimes reject lease // acquisitions when another replica is the leader; the intended consequence @@ -670,9 +671,7 @@ func (b *propBuf) maybeRejectUnsafeProposalLocked( b.p.rejectProposalWithRedirectLocked(ctx, p, li.leader) if b.p.shouldCampaignOnRedirect(raftGroup) { log.VEventf(ctx, 2, "campaigning because Raft leader not live in node liveness map") - if err := raftGroup.Campaign(); err != nil { - log.VEventf(ctx, 1, "failed to campaign: %s", err) - } + b.p.campaignLocked(ctx) } return true } @@ -1340,6 +1339,10 @@ func (rp *replicaProposer) shouldCampaignOnRedirect(raftGroup proposerRaft) bool ) } +func (rp *replicaProposer) campaignLocked(ctx context.Context) { + (*Replica)(rp).campaignLocked(ctx) +} + func (rp *replicaProposer) flowControlHandle(ctx context.Context) kvflowcontrol.Handle { handle, found := rp.mu.replicaFlowControlIntegration.handle() if !found { diff --git a/pkg/kv/kvserver/replica_proposal_buf_test.go b/pkg/kv/kvserver/replica_proposal_buf_test.go index c0ba04eeaa26..93c6cebcc22a 100644 --- a/pkg/kv/kvserver/replica_proposal_buf_test.go +++ b/pkg/kv/kvserver/replica_proposal_buf_test.go @@ -236,6 +236,12 @@ func (t *testProposer) shouldCampaignOnRedirect(raftGroup proposerRaft) bool { return t.leaderNotLive } +func (t *testProposer) campaignLocked(ctx context.Context) { + if err := t.raftGroup.Campaign(); err != nil { + panic(err) + } +} + func (t *testProposer) rejectProposalWithRedirectLocked( _ context.Context, _ *ProposalData, redirectTo roachpb.ReplicaID, ) { diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index cd43e7e778ad..3c065edd9d7c 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -2274,6 +2274,24 @@ func shouldCampaignOnLeaseRequestRedirect( return !livenessEntry.Liveness.IsLive(now) } +// campaignLocked campaigns for raft leadership, using PreVote and, if +// CheckQuorum is enabled, the recent leader condition. That is, followers will +// not grant prevotes if we're behind on the log and, with CheckQuorum, if +// they've heard from a leader in the past election timeout interval. +// +// The CheckQuorum condition can delay elections, particularly with quiesced +// ranges that don't tick. However, it is necessary to avoid spurious elections +// and stolen leaderships during partial/asymmetric network partitions, which +// can lead to permanent unavailability if the leaseholder can no longer reach +// the leader. +// +// Only followers enforce the CheckQuorum recent leader condition though, so if +// a quorum of followers consider the leader dead and choose to become +// pre-candidates and campaign then they will grant prevotes and can hold an +// election without waiting out the election timeout, but this can result in +// election ties if a quorum does so simultaneously. Followers and +// pre-candidates will also grant any number of pre-votes, both for themselves +// and anyone else that's eligible. func (r *Replica) campaignLocked(ctx context.Context) { log.VEventf(ctx, 3, "campaigning") if err := r.mu.internalRaftGroup.Campaign(); err != nil { From 99c8d4e4d0d25d5ffbddd0492eaf52488bf297bc Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 15 Jun 2023 13:15:58 +0000 Subject: [PATCH 2/3] kvserver: add `Replica.forceCampaignLocked()` This patch adds `forceCampaignLocked()`, which can be used to force an election, transitioning the replica directly to candidate and bypassing PreVote+CheckQuorum. 
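For illustration only (not part of this change), the two campaign paths roughly
correspond to the following calls against a bare etcd/raft RawNode; the import
path, package name, and function names here are assumptions, not taken from
this patch:

    package campaignsketch

    import (
        "go.etcd.io/raft/v3"
        "go.etcd.io/raft/v3/raftpb"
    )

    // campaign uses the regular path: it makes the node a pre-candidate, and
    // peers grant prevotes only if our log is up to date and, with CheckQuorum,
    // only if they haven't heard from a leader within the election timeout.
    func campaign(rn *raft.RawNode) error {
        return rn.Campaign()
    }

    // forceCampaign steps a MsgTimeoutNow to ourselves (the message a leader
    // sends during leadership transfers), which starts an immediate election
    // as candidate, skipping PreVote and the recent-leader check.
    func forceCampaign(rn *raft.RawNode, selfID uint64) error {
        return rn.Step(raftpb.Message{To: selfID, Type: raftpb.MsgTimeoutNow})
    }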
Epic: none
Release note: None
---
 pkg/kv/kvserver/client_raft_test.go | 78 +++++++++++++++++++++++++++++
 pkg/kv/kvserver/helpers_test.go     |  7 +++
 pkg/kv/kvserver/replica_raft.go     | 17 +++++++
 3 files changed, 102 insertions(+)

diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go
index f86f0eabd316..59653f7f726b 100644
--- a/pkg/kv/kvserver/client_raft_test.go
+++ b/pkg/kv/kvserver/client_raft_test.go
@@ -6010,6 +6010,84 @@ func TestRaftCampaignPreVoteCheckQuorum(t *testing.T) {
 	}
 }
 
+// TestRaftForceCampaignPreVoteCheckQuorum tests that forceCampaignLocked()
+// ignores PreVote+CheckQuorum, transitioning directly to candidate and bumping
+// the term. It may not actually win or hold onto leadership, but bumping the
+// term is proof enough that it called an election.
+func TestRaftForceCampaignPreVoteCheckQuorum(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	// Timing-sensitive, so skip under deadlock detector and stressrace.
+	skip.UnderDeadlock(t)
+	skip.UnderStressRace(t)
+
+	ctx := context.Background()
+
+	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
+		ReplicationMode: base.ReplicationManual,
+		ServerArgs: base.TestServerArgs{
+			RaftConfig: base.RaftConfig{
+				RaftEnableCheckQuorum:      true,
+				RaftTickInterval:           200 * time.Millisecond, // speed up test
+				RaftHeartbeatIntervalTicks: 10, // allow n3 to win the election
+				RaftElectionTimeoutTicks:   20,
+			},
+		},
+	})
+	defer tc.Stopper().Stop(ctx)
+
+	logStatus := func(s *raft.Status) {
+		t.Helper()
+		require.NotNil(t, s)
+		t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit)
+	}
+
+	sender := tc.GetFirstStoreFromServer(t, 0).TestSender()
+
+	// Create a range, upreplicate it, and replicate a write.
+	key := tc.ScratchRange(t)
+	desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
+	_, pErr := kv.SendWrapped(ctx, sender, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{1, 1, 1})
+
+	repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repls := []*kvserver.Replica{repl1, repl2, repl3}
+
+	// Make sure n1 is leader.
+	initialStatus := repl1.RaftStatus()
+	require.Equal(t, raft.StateLeader, initialStatus.RaftState)
+	logStatus(initialStatus)
+	t.Logf("n1 is leader in term %d", initialStatus.Term)
+
+	// Force-campaign n3. It may not win or hold onto leadership, but it's enough
+	// to know that it bumped the term.
+	repl3.ForceCampaign(ctx)
+	t.Logf("n3 campaigning")
+
+	var leaderStatus *raft.Status
+	require.Eventually(t, func() bool {
+		for _, repl := range repls {
+			st := repl.RaftStatus()
+			logStatus(st)
+			if st.Term <= initialStatus.Term {
+				return false
+			}
+			if st.RaftState == raft.StateLeader {
+				leaderStatus = st
+			}
+		}
+		return leaderStatus != nil
+	}, 10*time.Second, 500*time.Millisecond)
+	t.Logf("n%d is leader, with bumped term %d", leaderStatus.ID, leaderStatus.Term)
+}
+
 // TestRaftPreVote tests that Raft PreVote works properly, including the recent
 // leader check only enabled via CheckQuorum. Specifically, a replica that's
 // partitioned away from the leader (or restarted) should not be able to call an
diff --git a/pkg/kv/kvserver/helpers_test.go b/pkg/kv/kvserver/helpers_test.go
index c6fd7b88b7ff..3d81bb6aebb9 100644
--- a/pkg/kv/kvserver/helpers_test.go
+++ b/pkg/kv/kvserver/helpers_test.go
@@ -289,6 +289,13 @@ func (r *Replica) Campaign(ctx context.Context) {
 	r.campaignLocked(ctx)
 }
 
+// ForceCampaign force-campaigns the replica.
+func (r *Replica) ForceCampaign(ctx context.Context) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.forceCampaignLocked(ctx)
+}
+
 // LastAssignedLeaseIndexRLocked is like LastAssignedLeaseIndex, but requires
 // b.mu to be held in read mode.
 func (b *propBuf) LastAssignedLeaseIndexRLocked() kvpb.LeaseAppliedIndex {
diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go
index 3c065edd9d7c..20665a1f44b0 100644
--- a/pkg/kv/kvserver/replica_raft.go
+++ b/pkg/kv/kvserver/replica_raft.go
@@ -2300,6 +2300,23 @@ func (r *Replica) campaignLocked(ctx context.Context) {
 	r.store.enqueueRaftUpdateCheck(r.RangeID)
 }
 
+// forceCampaignLocked campaigns for raft leadership, but skips the
+// pre-candidate/pre-vote stage, calling an immediate election as candidate, and
+// bypasses the CheckQuorum recent leader condition for votes.
+//
+// This will disrupt an existing leader, and can cause prolonged unavailability
+// under partial/asymmetric network partitions. It should only be used when the
+// caller is certain that the current leader is actually dead, and we're not
+// simply partitioned away from it and/or liveness.
+func (r *Replica) forceCampaignLocked(ctx context.Context) {
+	log.VEventf(ctx, 3, "force campaigning")
+	msg := raftpb.Message{To: uint64(r.replicaID), Type: raftpb.MsgTimeoutNow}
+	if err := r.mu.internalRaftGroup.Step(msg); err != nil {
+		log.VEventf(ctx, 1, "failed to campaign: %s", err)
+	}
+	r.store.enqueueRaftUpdateCheck(r.RangeID)
+}
+
 // a lastUpdateTimesMap is maintained on the Raft leader to keep track of the
 // last communication received from followers, which in turn informs the quota
 // pool and log truncations.

From 2959ddad072f0d66e17d59846e8cdd1367188530 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Thu, 15 Jun 2023 13:18:44 +0000
Subject: [PATCH 3/3] kvserver: force-campaign leaseholder on leader removal

Previously, when the leader was removed from the range via a conf change, the
first voter in the range descriptor would campaign to avoid waiting for an
election timeout. This had a few drawbacks:

* If the first voter is unavailable or lags, no one will campaign.

* If the first voter isn't the leaseholder, it has to immediately transfer
  leadership to the leaseholder.

* It used Raft PreVote, so it wouldn't be able to win when CheckQuorum is
  enabled, since followers won't grant votes when they've recently heard from
  the leader.

This patch instead campaigns on the current leaseholder. We know there can
only be one, avoiding election ties. The conf change is typically proposed by
the leaseholder anyway so it's likely to be up-to-date. And we want it to be
colocated with the leader. If there is no leaseholder then waiting out the
election timeout is less problematic, since either we'll have to wait out the
lease interval anyway, or the range is idle.

It also forces an election by transitioning directly to candidate, bypassing
PreVote. This is ok, since we know the current leader is dead.
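For illustration only (not part of this change, and using hypothetical names),
omitting the mixed-version gate described next, the decision reduces to
something like the sketch below; see shouldCampaignAfterConfChange in the diff
for the real version:

    // leaseholderShouldForceCampaign is a condensed sketch of the decision.
    func leaseholderShouldForceCampaign(
        leaderID uint64, amLeader, leaderStillInDesc, ownValidLease bool,
    ) bool {
        if leaderID == 0 || amLeader {
            return false // leader unknown, or we're already the leader
        }
        if leaderStillInDesc {
            return false // the old leader is still a range member; nothing to do
        }
        // Only the (single) current leaseholder force-campaigns, avoiding ties.
        return ownValidLease
    }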
To avoid election ties in mixed 23.1/23.2 clusters, we retain the old voter designation until the upgrade is finalized, but always force an election instead of using pre-vote. Epic: none Release note: None --- pkg/kv/kvserver/client_raft_test.go | 103 ++++++++++++++++++++++++++++ pkg/kv/kvserver/replica_raft.go | 81 ++++++++++++++-------- 2 files changed, 155 insertions(+), 29 deletions(-) diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index 59653f7f726b..aaa4785e97ba 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -6479,3 +6479,106 @@ func TestRaftCheckQuorum(t *testing.T) { }) }) } + +// TestRaftLeaderRemovesItself tests that when a raft leader removes itself via +// a conf change the leaseholder campaigns for leadership, ignoring +// PreVote+CheckQuorum and transitioning directly to candidate. +// +// We set up three replicas: +// +// n1: Raft leader +// n2: follower +// n3: follower + leaseholder +// +// We disable leader following the leaseholder (which would otherwise transfer +// leadership if we happened to tick during the conf change), and then remove n1 +// from the range. n3 should acquire leadership. +// +// We disable election timeouts, such that the only way n3 can become leader is +// by campaigning explicitly. Furthermore, it must skip pre-votes, since with +// PreVote+CheckQuorum n2 wouldn't vote for it (it would think n1 was still the +// leader). +func TestRaftLeaderRemovesItself(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + // Timing-sensitive, so skip under deadlock detector and stressrace. + skip.UnderDeadlock(t) + skip.UnderStressRace(t) + + ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) + defer cancel() + + tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ + ReplicationMode: base.ReplicationManual, + ServerArgs: base.TestServerArgs{ + RaftConfig: base.RaftConfig{ + RaftEnableCheckQuorum: true, + RaftTickInterval: 100 * time.Millisecond, // speed up test + // Set a large election timeout. We don't want replicas to call + // elections due to timeouts, we want them to campaign and obtain + // votes despite PreVote+CheckQuorum. + RaftElectionTimeoutTicks: 200, + }, + Knobs: base.TestingKnobs{ + Store: &kvserver.StoreTestingKnobs{ + DisableLeaderFollowsLeaseholder: true, // the leader should stay put + }, + }, + }, + }) + defer tc.Stopper().Stop(ctx) + + logStatus := func(s *raft.Status) { + t.Helper() + require.NotNil(t, s) + t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit) + } + + send1 := tc.GetFirstStoreFromServer(t, 0).TestSender() + send3 := tc.GetFirstStoreFromServer(t, 2).TestSender() + + // Create a range, upreplicate it, and replicate a write. + key := tc.ScratchRange(t) + desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...) + _, pErr := kv.SendWrapped(ctx, send1, incrementArgs(key, 1)) + require.NoError(t, pErr.GoError()) + tc.WaitForValues(t, key, []int64{1, 1, 1}) + + repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID) + require.NoError(t, err) + repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID) + require.NoError(t, err) + repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID) + require.NoError(t, err) + + // Move the lease to n3, and make sure everyone has applied it. 
+ tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(2)) + require.Eventually(t, func() bool { + lease, _ := repl3.GetLease() + return lease.Replica.ReplicaID == repl3.ReplicaID() + }, 10*time.Second, 500*time.Millisecond) + _, pErr = kv.SendWrapped(ctx, send3, incrementArgs(key, 1)) + require.NoError(t, pErr.GoError()) + tc.WaitForValues(t, key, []int64{2, 2, 2}) + t.Logf("n3 has lease") + + // Make sure n1 is still leader. + st := repl1.RaftStatus() + require.Equal(t, raft.StateLeader, st.RaftState) + logStatus(st) + + // Remove n1 and wait for n3 to become leader. + tc.RemoveVotersOrFatal(t, key, tc.Target(0)) + t.Logf("n1 removed from range") + + require.Eventually(t, func() bool { + logStatus(repl2.RaftStatus()) + logStatus(repl3.RaftStatus()) + if repl3.RaftStatus().RaftState == raft.StateLeader { + t.Logf("n3 is leader") + return true + } + return false + }, 10*time.Second, 500*time.Millisecond) +} diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 20665a1f44b0..4e07aef86788 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -17,6 +17,7 @@ import ( "strings" "time" + "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply" @@ -33,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/uncertainty" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb" @@ -1166,12 +1168,15 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked(ctx, raftGroup) if stats.apply.numConfChangeEntries > 0 { - // If the raft leader got removed, campaign the first remaining voter. - // - // NB: this must be called after Advance() above since campaigning is - // a no-op in the presence of unapplied conf changes. - if shouldCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) { - r.campaignLocked(ctx) + // If the raft leader got removed, campaign on the leaseholder. Uses + // forceCampaignLocked() to bypass PreVote+CheckQuorum, since we otherwise + // wouldn't get prevotes from other followers who recently heard from the + // old leader. We know the leader isn't around anymore anyway. + leaseStatus := r.leaseStatusAtRLocked(ctx, r.store.Clock().NowAsClockTimestamp()) + raftStatus := raftGroup.BasicStatus() + if shouldCampaignAfterConfChange(ctx, r.store.ClusterSettings(), r.store.StoreID(), + r.descRLocked(), raftStatus, leaseStatus) { + r.forceCampaignLocked(ctx) } } @@ -2602,40 +2607,58 @@ func ComputeRaftLogSize( return ms.SysBytes + totalSideloaded, nil } +// shouldCampaignAfterConfChange returns true if the current replica should +// campaign after a conf change. If the leader replica is removed, the +// leaseholder should campaign. We don't want to campaign on multiple replicas, +// since that would cause ties. +// +// If there is no current leaseholder we'll have to wait out the election +// timeout before someone campaigns, but that's ok -- either we'll have to wait +// for the lease to expire anyway, or the range is presumably idle. 
+// +// The caller should campaign using forceCampaignLocked(), transitioning +// directly to candidate and bypassing PreVote+CheckQuorum. Otherwise it won't +// receive prevotes since other replicas have heard from the leader recently. func shouldCampaignAfterConfChange( ctx context.Context, + st *cluster.Settings, storeID roachpb.StoreID, desc *roachpb.RangeDescriptor, - raftGroup *raft.RawNode, + raftStatus raft.BasicStatus, + leaseStatus kvserverpb.LeaseStatus, ) bool { - // If a config change was carried out, it's possible that the Raft - // leader was removed. Verify that, and if so, campaign if we are - // the first remaining voter replica. Without this, the range will - // be leaderless (and thus unavailable) for a few seconds. - // - // We can't (or rather shouldn't) campaign on all remaining voters - // because that can lead to a stalemate. For example, three voters - // may all make it through PreVote and then reject each other. - st := raftGroup.BasicStatus() - if st.Lead == 0 { - // Leader unknown. This isn't what we expect in steady state, so we - // don't do anything. + if raftStatus.Lead == 0 { + // Leader unknown. We can't know if it was removed by the conf change, and + // because we force an election without prevote we don't want to risk + // throwing spurious elections. + return false + } + if raftStatus.RaftState == raft.StateLeader { + // We're already the leader, no point in campaigning. return false } if !desc.IsInitialized() { - // We don't have an initialized, so we can't figure out who is supposed - // to campaign. It's possible that it's us and we're waiting for the - // initial snapshot, but it's hard to tell. Don't do anything. + // No descriptor, so we don't know if the leader has been removed. We + // don't expect to hit this, but let's be defensive. return false } - // If the leader is no longer in the descriptor but we are the first voter, - // campaign. - _, leaderStillThere := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(st.Lead)) - if !leaderStillThere && storeID == desc.Replicas().VoterDescriptors()[0].StoreID { - log.VEventf(ctx, 3, "leader got removed by conf change") - return true + if _, ok := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(raftStatus.Lead)); ok { + // The leader is still in the descriptor. + return false } - return false + // Prior to 23.2, the first voter in the descriptor campaigned, so we do + // the same in mixed-version clusters to avoid ties. + if !st.Version.IsActive(ctx, clusterversion.V23_2) { + if storeID != desc.Replicas().VoterDescriptors()[0].StoreID { + // We're not the designated campaigner. + return false + } + } else if !leaseStatus.OwnedBy(storeID) || !leaseStatus.IsValid() { + // We're not the leaseholder. + return false + } + log.VEventf(ctx, 3, "leader got removed by conf change, campaigning") + return true } // printRaftTail pretty-prints the tail of the log and returns it as a string,