Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

raft: forget the leader when stepping down if it's safe #134990

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
60 changes: 44 additions & 16 deletions pkg/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,25 @@ func newRaft(c *Config) *raft {
if c.Applied > 0 {
raftlog.appliedTo(c.Applied, 0 /* size */)
}
r.becomeFollower(r.Term, r.lead)

if r.lead == r.id {
// If we were the leader, we must have waited out the leadMaxSupported. This
// is done in the kvserver layer before reaching this point. Therefore, it
// should be safe to defortify and become a follower while forgetting that
// we were the leader. If we don't forget that we were the leader, it will
// lead to situations where r.id == r.lead but r.state != StateLeader which
// might confuse the layers above raft.
r.deFortify(r.id, r.Term)
r.becomeFollower(r.Term, None)
} else {
// If we weren't the leader, we should NOT forget who the leader is to avoid
// regressing the leadMaxSupported. We can't just forget the leader because
// we might have been fortified which would lead to a case where
// r.lead == None && r.leadEpoch != 0. Also, we can't call r.defortify()
// because we might have a fortified leader, and calling defortify would
// incorrectly allow us to campaign/vote.
r.becomeFollower(r.Term, r.lead)
}

var nodesStrs []string
for _, n := range r.trk.VoterNodes() {
Expand Down Expand Up @@ -1039,6 +1057,7 @@ func (r *raft) reset(term uint64) {
r.setTerm(term)
}

r.lead = None
r.electionElapsed = 0
r.heartbeatElapsed = 0
r.resetRandomizedElectionTimeout()
Expand Down Expand Up @@ -1261,12 +1280,20 @@ func (r *raft) becomeFollower(term uint64, lead pb.PeerID) {
r.lead = None
lead = None
}
r.state = pb.StateFollower
r.step = stepFollower
r.reset(term)
r.tick = r.tickElection

// Start de-fortifying eagerly so we don't have to wait out a full heartbeat
// timeout before sending the first de-fortification message.
if r.shouldBcastDeFortify() {
r.bcastDeFortify()
}

r.reset(term)
r.setLead(lead)
r.state = pb.StateFollower
r.logger.Infof("%x became follower at term %d", r.id, r.Term)
r.logger.Debugf("%x reset election elapsed to %d", r.id, r.electionElapsed)
}

func (r *raft) becomeCandidate() {
Expand Down Expand Up @@ -2167,7 +2194,7 @@ func stepCandidate(r *raft, m pb.Message) error {
case quorum.VoteLost:
// pb.MsgPreVoteResp contains future term of pre-candidate
// m.Term > r.Term; reuse r.Term
r.becomeFollower(r.Term, r.lead)
r.becomeFollower(r.Term, None)
}
}
return nil
Expand Down Expand Up @@ -2274,14 +2301,7 @@ func (r *raft) checkQuorumActive() {
}
if !quorumActiveByHeartbeats && !quorumActiveByFortification {
r.logger.Warningf("%x stepped down to follower since quorum is not active", r.id)
// NB: Stepping down because of CheckQuorum is a special, in that we know
// the LeadSupportUntil is in the past. This means that the leader can
// safely call a new election or vote for a different peer without
// regressing LeadSupportUntil. We don't need to/want to give this any
// special treatment -- instead, we handle this like the general step down
// case by simply remembering the term/lead information from our stint as
// the leader.
r.becomeFollower(r.Term, r.id)
r.becomeFollower(r.Term, None)
}
// Mark everyone (but ourselves) as inactive in preparation for the next
// CheckQuorum.
Expand Down Expand Up @@ -2710,9 +2730,12 @@ func (r *raft) switchToConfig(cfg quorum.Config, progressMap tracker.ProgressMap
// interruption). This might still drop some proposals but it's better than
// nothing.
//
// NB: Similar to the CheckQuorum step down case, we must remember our
// prior stint as leader, lest we regress the QSE.
r.becomeFollower(r.Term, r.lead)
// A learner can't campaign or participate in elections, and in order for a
// learner to get promoted to a voter, it needs a new leader to get elected
// and propose that change. Therefore, it should be safe at this point to
// defortify and forget that we were the leader at this term and step down.
r.deFortify(r.id, r.Term)
r.becomeFollower(r.Term, None)
return cs
}

Expand Down Expand Up @@ -2778,7 +2801,12 @@ func (r *raft) resetRandomizedElectionTimeout() {
func (r *raft) transferLeader(to pb.PeerID) {
assertTrue(r.state == pb.StateLeader, "only the leader can transfer leadership")
r.send(pb.Message{To: to, Type: pb.MsgTimeoutNow})
r.becomeFollower(r.Term, r.lead)
// When a leader transfers leadership to another replica, it instructs the
// replica to campaign without performing the campaign checks. Therefore, it
// should be safe to defortify and forget the we were the leader at this term
// when stepping down.
r.deFortify(r.id, r.Term)
r.becomeFollower(r.Term, None)
}

func (r *raft) abortLeaderTransfer() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/raft/raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3548,7 +3548,7 @@ func TestLeaderTransferLeaderStepsDownImmediately(t *testing.T) {
nt.send(pb.Message{From: 3, To: 1, Type: pb.MsgTransferLeader})

require.Equal(t, uint64(1), lead.Term)
checkLeaderTransferState(t, lead, pb.StateFollower, 1)
checkLeaderTransferState(t, lead, pb.StateFollower, None)

// Eventually, the previous leader gives up on waiting and calls an election
// to reestablish leadership at the next term.
Expand Down
46 changes: 30 additions & 16 deletions pkg/raft/rawnode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -594,31 +594,45 @@ func TestRawNodeRestart(t *testing.T) {
MustSync: false,
}

storage := newTestMemoryStorage(withPeers(1))
storage := newTestMemoryStorage(withPeers(1, 2))
storage.SetHardState(st)
storage.Append(entries)
rawNode, err := NewRawNode(newTestConfig(1, 10, 1, storage))
rawNode1, err := NewRawNode(newTestConfig(1, 10, 1, storage))
require.NoError(t, err)
rd := rawNode.Ready()
rawNode2, err := NewRawNode(newTestConfig(2, 10, 1, storage))
require.NoError(t, err)
rd := rawNode1.Ready()
assert.Equal(t, want, rd)
rawNode.Advance(rd)
assert.False(t, rawNode.HasReady())
// Ensure that the HardState was correctly loaded post restart.
assert.Equal(t, uint64(1), rawNode.raft.Term)
assert.Equal(t, uint64(1), rawNode.raft.raftLog.committed)
assert.Equal(t, pb.PeerID(1), rawNode.raft.lead)
assert.True(t, rawNode.raft.state == pb.StateFollower)
assert.Equal(t, pb.Epoch(1), rawNode.raft.leadEpoch)
rawNode1.Advance(rd)
assert.False(t, rawNode1.HasReady())

// Ensure that the HardState was correctly loaded post rawNode1 restart.
assert.Equal(t, uint64(1), rawNode1.raft.Term)
assert.Equal(t, uint64(1), rawNode1.raft.raftLog.committed)
assert.True(t, rawNode1.raft.state == pb.StateFollower)
// Since rawNode1 was the leader, it should become a follower while forgetting
// that it was the leader/leadEpoch was for this term.
assert.Equal(t, None, rawNode1.raft.lead)
assert.Equal(t, pb.Epoch(0), rawNode1.raft.leadEpoch)

// Ensure that the HardState was correctly loaded post rawNode2 restart.
assert.Equal(t, uint64(1), rawNode2.raft.Term)
assert.Equal(t, uint64(1), rawNode2.raft.raftLog.committed)
assert.True(t, rawNode2.raft.state == pb.StateFollower)
// Since rawNode2 was a follower, it should remember who the leader was, and
// what the leadEpoch was.
assert.Equal(t, pb.PeerID(1), rawNode2.raft.lead)
assert.Equal(t, pb.Epoch(1), rawNode2.raft.leadEpoch)

// Ensure we campaign after the election timeout has elapsed.
for i := int64(0); i < rawNode.raft.randomizedElectionTimeout; i++ {
for i := int64(0); i < rawNode1.raft.randomizedElectionTimeout; i++ {
// TODO(arul): consider getting rid of this hack to reset the epoch so that
// we can call an election without panicking.
rawNode.raft.leadEpoch = 0
rawNode.raft.tick()
rawNode1.raft.leadEpoch = 0
rawNode1.raft.tick()
}
assert.Equal(t, pb.StateCandidate, rawNode.raft.state)
assert.Equal(t, uint64(2), rawNode.raft.Term) // this should in-turn bump the term
assert.Equal(t, pb.StateCandidate, rawNode1.raft.state)
assert.Equal(t, uint64(2), rawNode1.raft.Term) // this should in-turn bump the term
}

func TestRawNodeRestartFromSnapshot(t *testing.T) {
Expand Down
5 changes: 5 additions & 0 deletions pkg/raft/testdata/async_storage_writes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@ add-nodes 3 voters=(1,2,3) index=10 async-storage-writes=true
----
INFO 1 switched to configuration voters=(1 2 3)
INFO 1 became follower at term 0
DEBUG 1 reset election elapsed to 0
INFO newRaft 1 [peers: [1,2,3], term: 0, commit: 10, applied: 10, lastindex: 10, lastterm: 1]
INFO 2 switched to configuration voters=(1 2 3)
INFO 2 became follower at term 0
DEBUG 2 reset election elapsed to 0
INFO newRaft 2 [peers: [1,2,3], term: 0, commit: 10, applied: 10, lastindex: 10, lastterm: 1]
INFO 3 switched to configuration voters=(1 2 3)
INFO 3 became follower at term 0
DEBUG 3 reset election elapsed to 0
INFO newRaft 3 [peers: [1,2,3], term: 0, commit: 10, applied: 10, lastindex: 10, lastterm: 1]

campaign 1
Expand All @@ -36,11 +39,13 @@ stabilize
1->2 MsgVote Term:1 Log:1/10
INFO 2 [term: 0] received a MsgVote message with higher term from 1 [term: 1], advancing term
INFO 2 became follower at term 1
DEBUG 2 reset election elapsed to 0
INFO 2 [logterm: 1, index: 10, vote: 0] cast MsgVote for 1 [logterm: 1, index: 10] at term 1
> 3 receiving messages
1->3 MsgVote Term:1 Log:1/10
INFO 3 [term: 0] received a MsgVote message with higher term from 1 [term: 1], advancing term
INFO 3 became follower at term 1
DEBUG 3 reset election elapsed to 0
INFO 3 [logterm: 1, index: 10, vote: 0] cast MsgVote for 1 [logterm: 1, index: 10] at term 1
> 1 processing append thread
Processing:
Expand Down
1 change: 1 addition & 0 deletions pkg/raft/testdata/campaign_learner_must_vote.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ stabilize 3
DEBUG 3 setting election elapsed to start from 3 ticks after store liveness support expired
INFO 3 [term: 1] received a MsgVote message with higher term from 2 [term: 2], advancing term
INFO 3 became follower at term 2
DEBUG 3 reset election elapsed to 0
INFO 3 [logterm: 1, index: 3, vote: 0] cast MsgVote for 2 [logterm: 1, index: 4] at term 2
> 3 handling Ready
Ready MustSync=true:
Expand Down
4 changes: 4 additions & 0 deletions pkg/raft/testdata/confchange_disable_validation.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,19 @@ add-nodes 4 voters=(1) index=2 max-committed-size-per-ready=1 disable-conf-chang
----
INFO 1 switched to configuration voters=(1)
INFO 1 became follower at term 0
DEBUG 1 reset election elapsed to 0
INFO newRaft 1 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 2 switched to configuration voters=(1)
INFO 2 became follower at term 0
DEBUG 2 reset election elapsed to 0
INFO newRaft 2 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 3 switched to configuration voters=(1)
INFO 3 became follower at term 0
DEBUG 3 reset election elapsed to 0
INFO newRaft 3 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 4 switched to configuration voters=(1)
INFO 4 became follower at term 0
DEBUG 4 reset election elapsed to 0
INFO newRaft 4 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]

campaign 1
Expand Down
4 changes: 4 additions & 0 deletions pkg/raft/testdata/confchange_drop_if_unapplied.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@ add-nodes 4 voters=(1) index=2 disable-conf-change-validation=true
----
INFO 1 switched to configuration voters=(1)
INFO 1 became follower at term 0
DEBUG 1 reset election elapsed to 0
INFO newRaft 1 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 2 switched to configuration voters=(1)
INFO 2 became follower at term 0
DEBUG 2 reset election elapsed to 0
INFO newRaft 2 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 3 switched to configuration voters=(1)
INFO 3 became follower at term 0
DEBUG 3 reset election elapsed to 0
INFO newRaft 3 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 4 switched to configuration voters=(1)
INFO 4 became follower at term 0
DEBUG 4 reset election elapsed to 0
INFO newRaft 4 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]

campaign 1
Expand Down
5 changes: 5 additions & 0 deletions pkg/raft/testdata/confchange_fortification_safety.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@ add-nodes 4 voters=(1, 2, 3) index=2
----
INFO 1 switched to configuration voters=(1 2 3)
INFO 1 became follower at term 0
DEBUG 1 reset election elapsed to 0
INFO newRaft 1 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 2 switched to configuration voters=(1 2 3)
INFO 2 became follower at term 0
DEBUG 2 reset election elapsed to 0
INFO newRaft 2 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 3 switched to configuration voters=(1 2 3)
INFO 3 became follower at term 0
DEBUG 3 reset election elapsed to 0
INFO newRaft 3 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 4 switched to configuration voters=(1 2 3)
INFO 4 became follower at term 0
DEBUG 4 reset election elapsed to 0
INFO newRaft 4 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]

campaign 1
Expand Down Expand Up @@ -145,6 +149,7 @@ stabilize 1 4
1->4 MsgFortifyLeader Term:1 Log:0/0
INFO 4 [term: 0] received a MsgFortifyLeader message with higher term from 1 [term: 1], new leader indicated, advancing term
INFO 4 became follower at term 1
DEBUG 4 reset election elapsed to 0
1->4 MsgApp Term:1 Log:1/3 Commit:4 Entries:[1/4 EntryConfChangeV2 v4]
DEBUG 4 [logterm: 0, index: 3] rejected MsgApp [logterm: 1, index: 3] from 1
> 4 handling Ready
Expand Down
3 changes: 3 additions & 0 deletions pkg/raft/testdata/confchange_v1_add_single.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ add-nodes 1 voters=(1) index=2
----
INFO 1 switched to configuration voters=(1)
INFO 1 became follower at term 0
DEBUG 1 reset election elapsed to 0
INFO newRaft 1 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]

campaign 1
Expand Down Expand Up @@ -32,6 +33,7 @@ add-nodes 1
----
INFO 2 switched to configuration voters=()
INFO 2 became follower at term 0
DEBUG 2 reset election elapsed to 0
INFO newRaft 2 [peers: [], term: 0, commit: 0, applied: 0, lastindex: 0, lastterm: 0]

# n1 commits the conf change using itself as commit quorum, immediately transitions into
Expand Down Expand Up @@ -62,6 +64,7 @@ stabilize
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 0] received a MsgFortifyLeader message with higher term from 1 [term: 1], new leader indicated, advancing term
INFO 2 became follower at term 1
DEBUG 2 reset election elapsed to 0
1->2 MsgApp Term:1 Log:1/3 Commit:4 Entries:[1/4 EntryConfChange v2]
DEBUG 2 [logterm: 0, index: 3] rejected MsgApp [logterm: 1, index: 3] from 1
> 2 handling Ready
Expand Down
9 changes: 6 additions & 3 deletions pkg/raft/testdata/confchange_v1_remove_leader.txt
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,17 @@ stabilize 1
1->2 MsgApp Term:1 Log:1/8 Commit:7
1->3 MsgApp Term:1 Log:1/8 Commit:7
INFO 1 switched to configuration voters=(2 3) learners=(1)
DEBUG 1 setting election elapsed to start from 3 ticks after store liveness support expired
INFO 1 became follower at term 1
DEBUG 1 reset election elapsed to 0
> 1 handling Ready
Ready MustSync=false:
Ready MustSync=true:
State:StateFollower
HardState Term:1 Vote:1 Commit:7 Lead:0 LeadEpoch:0

raft-state
----
1: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:1
1: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:0
2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1
3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1

Expand Down Expand Up @@ -206,7 +209,7 @@ stabilize
# n1 can no longer propose.
propose 1 baz
----
INFO 1 not forwarding to itself at term 1; dropping proposal
INFO 1 no leader at term 1; dropping proposal
raft proposal dropped

# Nor can it campaign to become leader.
Expand Down
5 changes: 5 additions & 0 deletions pkg/raft/testdata/confchange_v2_add_double_auto.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ add-nodes 1 voters=(1) index=2
----
INFO 1 switched to configuration voters=(1)
INFO 1 became follower at term 0
DEBUG 1 reset election elapsed to 0
INFO newRaft 1 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]

campaign 1
Expand All @@ -33,9 +34,11 @@ add-nodes 2
----
INFO 2 switched to configuration voters=()
INFO 2 became follower at term 0
DEBUG 2 reset election elapsed to 0
INFO newRaft 2 [peers: [], term: 0, commit: 0, applied: 0, lastindex: 0, lastterm: 0]
INFO 3 switched to configuration voters=()
INFO 3 became follower at term 0
DEBUG 3 reset election elapsed to 0
INFO newRaft 3 [peers: [], term: 0, commit: 0, applied: 0, lastindex: 0, lastterm: 0]

# Process n1 once, so that it can append the entry.
Expand Down Expand Up @@ -86,6 +89,7 @@ stabilize 1 2
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 0] received a MsgFortifyLeader message with higher term from 1 [term: 1], new leader indicated, advancing term
INFO 2 became follower at term 1
DEBUG 2 reset election elapsed to 0
1->2 MsgApp Term:1 Log:1/3 Commit:4 Entries:[1/4 EntryConfChangeV2 v2 v3]
DEBUG 2 [logterm: 0, index: 3] rejected MsgApp [logterm: 1, index: 3] from 1
> 2 handling Ready
Expand Down Expand Up @@ -180,6 +184,7 @@ stabilize 1 3
1->3 MsgFortifyLeader Term:1 Log:0/0
INFO 3 [term: 0] received a MsgFortifyLeader message with higher term from 1 [term: 1], new leader indicated, advancing term
INFO 3 became follower at term 1
DEBUG 3 reset election elapsed to 0
1->3 MsgApp Term:1 Log:1/3 Commit:4 Entries:[1/4 EntryConfChangeV2 v2 v3]
DEBUG 3 [logterm: 0, index: 3] rejected MsgApp [logterm: 1, index: 3] from 1
1->3 MsgFortifyLeader Term:1 Log:0/0
Expand Down
Loading