diff --git a/pkg/base/config.go b/pkg/base/config.go
index e5899c3a72f8..e5385724d17e 100644
--- a/pkg/base/config.go
+++ b/pkg/base/config.go
@@ -256,7 +256,8 @@ var (
 	// partition from stealing away leadership from an established leader
 	// (assuming they have an up-to-date log, which they do with a read-only
 	// workload). With asymmetric or partial network partitions, this can allow an
-	// unreachable node to steal away leadership, leading to range unavailability.
+	// unreachable node to steal leadership away from the leaseholder, leading to
+	// range unavailability if the leaseholder can no longer reach the leader.
 	//
 	// The asymmetric partition concerns have largely been addressed by RPC
 	// dialback (see rpc.dialback.enabled), but the partial partition concerns
diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go
index 72fdbe5b30b1..aaa4785e97ba 100644
--- a/pkg/kv/kvserver/client_raft_test.go
+++ b/pkg/kv/kvserver/client_raft_test.go
@@ -5933,6 +5933,161 @@ func TestRaftSnapshotsWithMVCCRangeKeysEverywhere(t *testing.T) {
 	}
 }
 
+// TestRaftCampaignPreVoteCheckQuorum tests that campaignLocked() respects
+// PreVote+CheckQuorum, by not granting prevotes if there is an active leader.
+func TestRaftCampaignPreVoteCheckQuorum(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	// Timing-sensitive, so skip under deadlock detector and stressrace.
+	skip.UnderDeadlock(t)
+	skip.UnderStressRace(t)
+
+	ctx := context.Background()
+
+	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
+		ReplicationMode: base.ReplicationManual,
+		ServerArgs: base.TestServerArgs{
+			RaftConfig: base.RaftConfig{
+				RaftEnableCheckQuorum: true,
+				RaftTickInterval:      100 * time.Millisecond, // speed up test
+			},
+		},
+	})
+	defer tc.Stopper().Stop(ctx)
+
+	logStatus := func(s *raft.Status) {
+		t.Helper()
+		require.NotNil(t, s)
+		t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit)
+	}
+
+	sender := tc.GetFirstStoreFromServer(t, 0).TestSender()
+
+	// Create a range, upreplicate it, and replicate a write.
+	key := tc.ScratchRange(t)
+	desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
+	_, pErr := kv.SendWrapped(ctx, sender, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{1, 1, 1})
+
+	repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repls := []*kvserver.Replica{repl1, repl2, repl3}
+
+	// Make sure n1 is leader.
+	initialStatus := repl1.RaftStatus()
+	require.Equal(t, raft.StateLeader, initialStatus.RaftState)
+	logStatus(initialStatus)
+	t.Logf("n1 is leader")
+
+	// Campaign n3. It shouldn't win prevotes, reverting to follower
+	// in the current term.
+	repl3.Campaign(ctx)
+	t.Logf("n3 campaigning")
+
+	require.Eventually(t, func() bool {
+		status := repl3.RaftStatus()
+		logStatus(status)
+		return status.RaftState == raft.StateFollower
+	}, 10*time.Second, 500*time.Millisecond)
+	t.Logf("n3 reverted to follower")
+
+	// n1 should still be the leader in the same term, with n2 and n3 followers.
+	for _, repl := range repls {
+		st := repl.RaftStatus()
+		logStatus(st)
+		if st.ID == 1 {
+			require.Equal(t, raft.StateLeader, st.RaftState)
+		} else {
+			require.Equal(t, raft.StateFollower, st.RaftState)
+		}
+		require.Equal(t, initialStatus.Term, st.Term)
+	}
+}
+
+// TestRaftForceCampaignPreVoteCheckQuorum tests that forceCampaignLocked()
+// ignores PreVote+CheckQuorum, transitioning directly to candidate and bumping
+// the term. It may not actually win or hold onto leadership, but bumping the
+// term is proof enough that it called an election.
+func TestRaftForceCampaignPreVoteCheckQuorum(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	// Timing-sensitive, so skip under deadlock detector and stressrace.
+	skip.UnderDeadlock(t)
+	skip.UnderStressRace(t)
+
+	ctx := context.Background()
+
+	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
+		ReplicationMode: base.ReplicationManual,
+		ServerArgs: base.TestServerArgs{
+			RaftConfig: base.RaftConfig{
+				RaftEnableCheckQuorum:      true,
+				RaftTickInterval:           200 * time.Millisecond, // speed up test
+				RaftHeartbeatIntervalTicks: 10,                     // allow n3 to win the election
+				RaftElectionTimeoutTicks:   20,
+			},
+		},
+	})
+	defer tc.Stopper().Stop(ctx)
+
+	logStatus := func(s *raft.Status) {
+		t.Helper()
+		require.NotNil(t, s)
+		t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit)
+	}
+
+	sender := tc.GetFirstStoreFromServer(t, 0).TestSender()
+
+	// Create a range, upreplicate it, and replicate a write.
+	key := tc.ScratchRange(t)
+	desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
+	_, pErr := kv.SendWrapped(ctx, sender, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{1, 1, 1})
+
+	repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repls := []*kvserver.Replica{repl1, repl2, repl3}
+
+	// Make sure n1 is leader.
+	initialStatus := repl1.RaftStatus()
+	require.Equal(t, raft.StateLeader, initialStatus.RaftState)
+	logStatus(initialStatus)
+	t.Logf("n1 is leader in term %d", initialStatus.Term)
+
+	// Force-campaign n3. It may not win or hold onto leadership, but it's enough
+	// to know that it bumped the term.
+	repl3.ForceCampaign(ctx)
+	t.Logf("n3 campaigning")
+
+	var leaderStatus *raft.Status
+	require.Eventually(t, func() bool {
+		for _, repl := range repls {
+			st := repl.RaftStatus()
+			logStatus(st)
+			if st.Term <= initialStatus.Term {
+				return false
+			}
+			if st.RaftState == raft.StateLeader {
+				leaderStatus = st
+			}
+		}
+		return leaderStatus != nil
+	}, 10*time.Second, 500*time.Millisecond)
+	t.Logf("n%d is leader, with bumped term %d", leaderStatus.ID, leaderStatus.Term)
+}
+
 // TestRaftPreVote tests that Raft PreVote works properly, including the recent
 // leader check only enabled via CheckQuorum. Specifically, a replica that's
 // partitioned away from the leader (or restarted) should not be able to call an
@@ -6324,3 +6479,106 @@ func TestRaftCheckQuorum(t *testing.T) {
 		})
 	})
 }
+
+// TestRaftLeaderRemovesItself tests that when a raft leader removes itself via
+// a conf change the leaseholder campaigns for leadership, ignoring
+// PreVote+CheckQuorum and transitioning directly to candidate.
+//
+// We set up three replicas:
+//
+//	n1: Raft leader
+//	n2: follower
+//	n3: follower + leaseholder
+//
+// We disable leader following the leaseholder (which would otherwise transfer
+// leadership if we happened to tick during the conf change), and then remove n1
+// from the range. n3 should acquire leadership.
+//
+// We disable election timeouts, such that the only way n3 can become leader is
+// by campaigning explicitly. Furthermore, it must skip pre-votes, since with
+// PreVote+CheckQuorum n2 wouldn't vote for it (it would think n1 was still the
+// leader).
+func TestRaftLeaderRemovesItself(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	// Timing-sensitive, so skip under deadlock detector and stressrace.
+	skip.UnderDeadlock(t)
+	skip.UnderStressRace(t)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
+	defer cancel()
+
+	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
+		ReplicationMode: base.ReplicationManual,
+		ServerArgs: base.TestServerArgs{
+			RaftConfig: base.RaftConfig{
+				RaftEnableCheckQuorum: true,
+				RaftTickInterval:      100 * time.Millisecond, // speed up test
+				// Set a large election timeout. We don't want replicas to call
+				// elections due to timeouts, we want them to campaign and obtain
+				// votes despite PreVote+CheckQuorum.
+				RaftElectionTimeoutTicks: 200,
+			},
+			Knobs: base.TestingKnobs{
+				Store: &kvserver.StoreTestingKnobs{
+					DisableLeaderFollowsLeaseholder: true, // the leader should stay put
+				},
+			},
+		},
+	})
+	defer tc.Stopper().Stop(ctx)
+
+	logStatus := func(s *raft.Status) {
+		t.Helper()
+		require.NotNil(t, s)
+		t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit)
+	}
+
+	send1 := tc.GetFirstStoreFromServer(t, 0).TestSender()
+	send3 := tc.GetFirstStoreFromServer(t, 2).TestSender()
+
+	// Create a range, upreplicate it, and replicate a write.
+	key := tc.ScratchRange(t)
+	desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
+	_, pErr := kv.SendWrapped(ctx, send1, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{1, 1, 1})
+
+	repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+
+	// Move the lease to n3, and make sure everyone has applied it.
+	tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(2))
+	require.Eventually(t, func() bool {
+		lease, _ := repl3.GetLease()
+		return lease.Replica.ReplicaID == repl3.ReplicaID()
+	}, 10*time.Second, 500*time.Millisecond)
+	_, pErr = kv.SendWrapped(ctx, send3, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{2, 2, 2})
+	t.Logf("n3 has lease")
+
+	// Make sure n1 is still leader.
+	st := repl1.RaftStatus()
+	require.Equal(t, raft.StateLeader, st.RaftState)
+	logStatus(st)
+
+	// Remove n1 and wait for n3 to become leader.
+	tc.RemoveVotersOrFatal(t, key, tc.Target(0))
+	t.Logf("n1 removed from range")
+
+	require.Eventually(t, func() bool {
+		logStatus(repl2.RaftStatus())
+		logStatus(repl3.RaftStatus())
+		if repl3.RaftStatus().RaftState == raft.StateLeader {
+			t.Logf("n3 is leader")
+			return true
+		}
+		return false
+	}, 10*time.Second, 500*time.Millisecond)
+}
diff --git a/pkg/kv/kvserver/helpers_test.go b/pkg/kv/kvserver/helpers_test.go
index 04a550293da0..3d81bb6aebb9 100644
--- a/pkg/kv/kvserver/helpers_test.go
+++ b/pkg/kv/kvserver/helpers_test.go
@@ -282,6 +282,20 @@ func (r *Replica) LastAssignedLeaseIndex() kvpb.LeaseAppliedIndex {
 	return r.mu.proposalBuf.LastAssignedLeaseIndexRLocked()
 }
 
+// Campaign campaigns the replica.
+func (r *Replica) Campaign(ctx context.Context) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.campaignLocked(ctx)
+}
+
+// ForceCampaign force-campaigns the replica.
+func (r *Replica) ForceCampaign(ctx context.Context) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.forceCampaignLocked(ctx)
+}
+
 // LastAssignedLeaseIndexRLocked is like LastAssignedLeaseIndex, but requires
 // b.mu to be held in read mode.
 func (b *propBuf) LastAssignedLeaseIndexRLocked() kvpb.LeaseAppliedIndex {
diff --git a/pkg/kv/kvserver/replica_proposal_buf.go b/pkg/kv/kvserver/replica_proposal_buf.go
index d97220444277..593cbc47cd0f 100644
--- a/pkg/kv/kvserver/replica_proposal_buf.go
+++ b/pkg/kv/kvserver/replica_proposal_buf.go
@@ -162,6 +162,7 @@ type proposer interface {
 	// The following require the proposer to hold an exclusive lock.
 	withGroupLocked(func(proposerRaft) error) error
 	registerProposalLocked(*ProposalData)
+	campaignLocked(ctx context.Context)
 	// rejectProposalWithRedirectLocked rejects a proposal and redirects the
 	// proposer to try it on another node. This is used to sometimes reject lease
 	// acquisitions when another replica is the leader; the intended consequence
@@ -670,9 +671,7 @@ func (b *propBuf) maybeRejectUnsafeProposalLocked(
 		b.p.rejectProposalWithRedirectLocked(ctx, p, li.leader)
 		if b.p.shouldCampaignOnRedirect(raftGroup) {
 			log.VEventf(ctx, 2, "campaigning because Raft leader not live in node liveness map")
-			if err := raftGroup.Campaign(); err != nil {
-				log.VEventf(ctx, 1, "failed to campaign: %s", err)
-			}
+			b.p.campaignLocked(ctx)
 		}
 		return true
 	}
@@ -1340,6 +1339,10 @@ func (rp *replicaProposer) shouldCampaignOnRedirect(raftGroup proposerRaft) bool
 	)
 }
 
+func (rp *replicaProposer) campaignLocked(ctx context.Context) {
+	(*Replica)(rp).campaignLocked(ctx)
+}
+
 func (rp *replicaProposer) flowControlHandle(ctx context.Context) kvflowcontrol.Handle {
 	handle, found := rp.mu.replicaFlowControlIntegration.handle()
 	if !found {
diff --git a/pkg/kv/kvserver/replica_proposal_buf_test.go b/pkg/kv/kvserver/replica_proposal_buf_test.go
index c0ba04eeaa26..93c6cebcc22a 100644
--- a/pkg/kv/kvserver/replica_proposal_buf_test.go
+++ b/pkg/kv/kvserver/replica_proposal_buf_test.go
@@ -236,6 +236,12 @@ func (t *testProposer) shouldCampaignOnRedirect(raftGroup proposerRaft) bool {
 	return t.leaderNotLive
 }
 
+func (t *testProposer) campaignLocked(ctx context.Context) {
+	if err := t.raftGroup.Campaign(); err != nil {
+		panic(err)
+	}
+}
+
 func (t *testProposer) rejectProposalWithRedirectLocked(
 	_ context.Context, _ *ProposalData, redirectTo roachpb.ReplicaID,
 ) {
diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go
index cd43e7e778ad..4e07aef86788 100644
--- a/pkg/kv/kvserver/replica_raft.go
+++ b/pkg/kv/kvserver/replica_raft.go
@@ -17,6 +17,7 @@ import (
 	"strings"
 	"time"
 
"github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply" @@ -33,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/uncertainty" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb" @@ -1166,12 +1168,15 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked(ctx, raftGroup) if stats.apply.numConfChangeEntries > 0 { - // If the raft leader got removed, campaign the first remaining voter. - // - // NB: this must be called after Advance() above since campaigning is - // a no-op in the presence of unapplied conf changes. - if shouldCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) { - r.campaignLocked(ctx) + // If the raft leader got removed, campaign on the leaseholder. Uses + // forceCampaignLocked() to bypass PreVote+CheckQuorum, since we otherwise + // wouldn't get prevotes from other followers who recently heard from the + // old leader. We know the leader isn't around anymore anyway. + leaseStatus := r.leaseStatusAtRLocked(ctx, r.store.Clock().NowAsClockTimestamp()) + raftStatus := raftGroup.BasicStatus() + if shouldCampaignAfterConfChange(ctx, r.store.ClusterSettings(), r.store.StoreID(), + r.descRLocked(), raftStatus, leaseStatus) { + r.forceCampaignLocked(ctx) } } @@ -2274,6 +2279,24 @@ func shouldCampaignOnLeaseRequestRedirect( return !livenessEntry.Liveness.IsLive(now) } +// campaignLocked campaigns for raft leadership, using PreVote and, if +// CheckQuorum is enabled, the recent leader condition. That is, followers will +// not grant prevotes if we're behind on the log and, with CheckQuorum, if +// they've heard from a leader in the past election timeout interval. +// +// The CheckQuorum condition can delay elections, particularly with quiesced +// ranges that don't tick. However, it is necessary to avoid spurious elections +// and stolen leaderships during partial/asymmetric network partitions, which +// can lead to permanent unavailability if the leaseholder can no longer reach +// the leader. +// +// Only followers enforce the CheckQuorum recent leader condition though, so if +// a quorum of followers consider the leader dead and choose to become +// pre-candidates and campaign then they will grant prevotes and can hold an +// election without waiting out the election timeout, but this can result in +// election ties if a quorum does so simultaneously. Followers and +// pre-candidates will also grant any number of pre-votes, both for themselves +// and anyone else that's eligible. func (r *Replica) campaignLocked(ctx context.Context) { log.VEventf(ctx, 3, "campaigning") if err := r.mu.internalRaftGroup.Campaign(); err != nil { @@ -2282,6 +2305,23 @@ func (r *Replica) campaignLocked(ctx context.Context) { r.store.enqueueRaftUpdateCheck(r.RangeID) } +// forceCampaignLocked campaigns for raft leadership, but skips the +// pre-candidate/pre-vote stage, calling an immediate election as candidate, and +// bypasses the CheckQuorum recent leader condition for votes. 
+//
+// This will disrupt an existing leader, and can cause prolonged unavailability
+// under partial/asymmetric network partitions. It should only be used when the
+// caller is certain that the current leader is actually dead, and we're not
+// simply partitioned away from it and/or liveness.
+func (r *Replica) forceCampaignLocked(ctx context.Context) {
+	log.VEventf(ctx, 3, "force campaigning")
+	msg := raftpb.Message{To: uint64(r.replicaID), Type: raftpb.MsgTimeoutNow}
+	if err := r.mu.internalRaftGroup.Step(msg); err != nil {
+		log.VEventf(ctx, 1, "failed to campaign: %s", err)
+	}
+	r.store.enqueueRaftUpdateCheck(r.RangeID)
+}
+
 // a lastUpdateTimesMap is maintained on the Raft leader to keep track of the
 // last communication received from followers, which in turn informs the quota
 // pool and log truncations.
@@ -2567,40 +2607,58 @@ func ComputeRaftLogSize(
 	return ms.SysBytes + totalSideloaded, nil
 }
 
+// shouldCampaignAfterConfChange returns true if the current replica should
+// campaign after a conf change. If the leader replica is removed, the
+// leaseholder should campaign. We don't want to campaign on multiple replicas,
+// since that would cause ties.
+//
+// If there is no current leaseholder we'll have to wait out the election
+// timeout before someone campaigns, but that's ok -- either we'll have to wait
+// for the lease to expire anyway, or the range is presumably idle.
+//
+// The caller should campaign using forceCampaignLocked(), transitioning
+// directly to candidate and bypassing PreVote+CheckQuorum. Otherwise it won't
+// receive prevotes since other replicas have heard from the leader recently.
 func shouldCampaignAfterConfChange(
 	ctx context.Context,
+	st *cluster.Settings,
 	storeID roachpb.StoreID,
 	desc *roachpb.RangeDescriptor,
-	raftGroup *raft.RawNode,
+	raftStatus raft.BasicStatus,
+	leaseStatus kvserverpb.LeaseStatus,
 ) bool {
-	// If a config change was carried out, it's possible that the Raft
-	// leader was removed. Verify that, and if so, campaign if we are
-	// the first remaining voter replica. Without this, the range will
-	// be leaderless (and thus unavailable) for a few seconds.
-	//
-	// We can't (or rather shouldn't) campaign on all remaining voters
-	// because that can lead to a stalemate. For example, three voters
-	// may all make it through PreVote and then reject each other.
-	st := raftGroup.BasicStatus()
-	if st.Lead == 0 {
-		// Leader unknown. This isn't what we expect in steady state, so we
-		// don't do anything.
+	if raftStatus.Lead == 0 {
+		// Leader unknown. We can't know if it was removed by the conf change, and
+		// because we force an election without prevote we don't want to risk
+		// throwing spurious elections.
+		return false
+	}
+	if raftStatus.RaftState == raft.StateLeader {
+		// We're already the leader, no point in campaigning.
 		return false
 	}
 	if !desc.IsInitialized() {
-		// We don't have an initialized, so we can't figure out who is supposed
-		// to campaign. It's possible that it's us and we're waiting for the
-		// initial snapshot, but it's hard to tell. Don't do anything.
+		// No descriptor, so we don't know if the leader has been removed. We
+		// don't expect to hit this, but let's be defensive.
		return false
 	}
-	// If the leader is no longer in the descriptor but we are the first voter,
-	// campaign.
-	_, leaderStillThere := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(st.Lead))
-	if !leaderStillThere && storeID == desc.Replicas().VoterDescriptors()[0].StoreID {
-		log.VEventf(ctx, 3, "leader got removed by conf change")
-		return true
+	if _, ok := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(raftStatus.Lead)); ok {
+		// The leader is still in the descriptor.
+		return false
 	}
-	return false
+	// Prior to 23.2, the first voter in the descriptor campaigned, so we do
+	// the same in mixed-version clusters to avoid ties.
+	if !st.Version.IsActive(ctx, clusterversion.V23_2) {
+		if storeID != desc.Replicas().VoterDescriptors()[0].StoreID {
+			// We're not the designated campaigner.
+			return false
+		}
+	} else if !leaseStatus.OwnedBy(storeID) || !leaseStatus.IsValid() {
+		// We're not the leaseholder.
+		return false
+	}
+	log.VEventf(ctx, 3, "leader got removed by conf change, campaigning")
+	return true
 }
 
 // printRaftTail pretty-prints the tail of the log and returns it as a string,
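
The core of this change is the new shouldCampaignAfterConfChange gate above: after a conf change removes the raft leader, exactly one replica (the leaseholder, or the first voter in mixed-version clusters) force-campaigns, so the range regains a leader without risking tied elections. As a rough illustration of that decision only, here is a minimal standalone Go sketch. It uses toy types (replicaDesc, rangeDesc, leaseInfo) and a hypothetical function name shouldCampaignAfterLeaderRemoval that stand in for the real roachpb/kvserverpb types and the real function; it is a simplified approximation under those assumptions, not the CockroachDB implementation.

package main

import "fmt"

// Toy stand-ins for the real descriptor and lease types (illustration only).
type replicaDesc struct {
	replicaID uint64
	storeID   int32
}

type rangeDesc struct {
	voters []replicaDesc // in descriptor order
}

func (d rangeDesc) contains(replicaID uint64) bool {
	for _, r := range d.voters {
		if r.replicaID == replicaID {
			return true
		}
	}
	return false
}

type leaseInfo struct {
	ownerStoreID int32
	valid        bool
}

// shouldCampaignAfterLeaderRemoval mirrors the gate in the patch: campaign only
// if the leader is known, we aren't already the leader, the leader is no longer
// a voter in the descriptor, and we hold a valid lease (or, before the 23.2
// version gate, we are the first voter listed in the descriptor).
func shouldCampaignAfterLeaderRemoval(
	isV232Active bool, storeID int32, desc rangeDesc, lead uint64, amLeader bool, lease leaseInfo,
) bool {
	if lead == 0 || amLeader {
		return false // leader unknown, or we already lead
	}
	if desc.contains(lead) {
		return false // the leader is still a voter; nothing to do
	}
	if !isV232Active {
		// Legacy behavior: the first voter in the descriptor campaigns.
		return len(desc.voters) > 0 && storeID == desc.voters[0].storeID
	}
	// New behavior: only the current, valid leaseholder campaigns.
	return lease.valid && lease.ownerStoreID == storeID
}

func main() {
	// Replica 1 (the old leader) was just removed; replicas 2 and 3 remain.
	desc := rangeDesc{voters: []replicaDesc{{replicaID: 2, storeID: 20}, {replicaID: 3, storeID: 30}}}
	lease := leaseInfo{ownerStoreID: 30, valid: true}

	fmt.Println(shouldCampaignAfterLeaderRemoval(true, 30, desc, 1, false, lease))  // true: leaseholder campaigns
	fmt.Println(shouldCampaignAfterLeaderRemoval(true, 20, desc, 1, false, lease))  // false: avoid tied elections
	fmt.Println(shouldCampaignAfterLeaderRemoval(false, 20, desc, 1, false, lease)) // true: pre-23.2 first voter
}

Restricting the campaign to a single designated replica, and pairing it with forceCampaignLocked (MsgTimeoutNow, which skips pre-vote and the CheckQuorum recent-leader condition), is what lets the range elect a new leader promptly while avoiding the multi-candidate stalemate the removed comment warned about.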