diff --git a/raft/raft.go b/raft/raft.go index f408a326b2d9..bea1f99a2e3b 100644 --- a/raft/raft.go +++ b/raft/raft.go @@ -734,7 +734,7 @@ func (r *raft) Step(m pb.Message) error { } case m.Term < r.Term: - if r.checkQuorum && (m.Type == pb.MsgHeartbeat || m.Type == pb.MsgApp) { + if (r.checkQuorum || r.preVote) && (m.Type == pb.MsgHeartbeat || m.Type == pb.MsgApp) { // We have received messages from a leader at a lower term. It is possible // that these messages were simply delayed in the network, but this could // also mean that this node has advanced its term number during a network @@ -748,7 +748,15 @@ func (r *raft) Step(m pb.Message) error { // removed node will send MsgVotes (or MsgPreVotes) which will be ignored, // but it will not receive MsgApp or MsgHeartbeat, so it will not create // disruptive term increases + // The above comments also true for Pre-Vote r.send(pb.Message{To: m.From, Type: pb.MsgAppResp}) + } else if m.Type == pb.MsgPreVote { + // Before Pre-Vote enable, there may have candidate with higher term, + // but less log. After update to Pre-Vote, the cluster may deadlock if + // we drop messages with a lower term. + r.logger.Infof("%x [logterm: %d, index: %d, vote: %x] rejected %s from %x [logterm: %d, index: %d] at term %d", + r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.Vote, m.Type, m.From, m.LogTerm, m.Index, r.Term) + r.send(pb.Message{To: m.From, Term: r.Term, Type: pb.MsgPreVoteResp, Reject: true}) } else { // ignore other cases r.logger.Infof("%x [term: %d] ignored a %s message with lower term from %x [term: %d]", diff --git a/raft/raft_test.go b/raft/raft_test.go index 529b8a80706e..aff8616bfcd6 100644 --- a/raft/raft_test.go +++ b/raft/raft_test.go @@ -3290,6 +3290,150 @@ func TestPreVoteWithSplitVote(t *testing.T) { } } +// simulate rolling update a cluster for Pre-Vote. cluster has 3 nodes [n1, n2, n3]. +// n1 is leader with term 2 +// n2 is follower with term 2 +// n3 is partitioned, with term 4 and less log, state is candidate +func newPreVoteMigrationCluster(t *testing.T) *network { + n1 := newTestRaft(1, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage()) + n2 := newTestRaft(2, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage()) + n3 := newTestRaft(3, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage()) + + n1.becomeFollower(1, None) + n2.becomeFollower(1, None) + n3.becomeFollower(1, None) + + n1.preVote = true + n2.preVote = true + // We intentionally do not enable PreVote for n3, this is done so in order + // to simulate a rolling restart process where it's possible to have a mixed + // version cluster with replicas with PreVote enabled, and replicas without. + + nt := newNetwork(n1, n2, n3) + nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgHup}) + + // Cause a network partition to isolate n3. + nt.isolate(3) + nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgProp, Entries: []pb.Entry{{Data: []byte("some data")}}}) + nt.send(pb.Message{From: 3, To: 3, Type: pb.MsgHup}) + nt.send(pb.Message{From: 3, To: 3, Type: pb.MsgHup}) + + // check state + // n1.state == StateLeader + // n2.state == StateFollower + // n3.state == StateCandidate + if n1.state != StateLeader { + t.Fatalf("node 1 state: %s, want %s", n1.state, StateLeader) + } + if n2.state != StateFollower { + t.Fatalf("node 2 state: %s, want %s", n2.state, StateFollower) + } + if n3.state != StateCandidate { + t.Fatalf("node 3 state: %s, want %s", n3.state, StateCandidate) + } + + // check term + // n1.Term == 2 + // n2.Term == 2 + // n3.Term == 4 + if n1.Term != 2 { + t.Fatalf("node 1 term: %d, want %d", n1.Term, 2) + } + if n2.Term != 2 { + t.Fatalf("node 2 term: %d, want %d", n2.Term, 2) + } + if n3.Term != 4 { + t.Fatalf("node 3 term: %d, want %d", n3.Term, 4) + } + + // Enable prevote on n3, then recover the network + n3.preVote = true + nt.recover() + + return nt +} + +func TestPreVoteMigrationCanCompleteElection(t *testing.T) { + nt := newPreVoteMigrationCluster(t) + + // n1 is leader with term 2 + // n2 is follower with term 2 + // n3 is pre-candidate with term 3, and less log + n2 := nt.peers[2].(*raft) + n3 := nt.peers[3].(*raft) + + // simulate leader down + nt.isolate(1) + + // Call for elections from both n2 and n3. + nt.send(pb.Message{From: 3, To: 3, Type: pb.MsgHup}) + nt.send(pb.Message{From: 2, To: 2, Type: pb.MsgHup}) + + // check state + // n2.state == Follower + // n2.state == PreCandidate + if n2.state != StateFollower { + t.Errorf("node 2 state: %s, want %s", n2.state, StateFollower) + } + if n3.state != StatePreCandidate { + t.Errorf("node 3 state: %s, want %s", n3.state, StatePreCandidate) + } + + nt.send(pb.Message{From: 3, To: 3, Type: pb.MsgHup}) + nt.send(pb.Message{From: 2, To: 2, Type: pb.MsgHup}) + + // Do we have a leader? + if n2.state != StateLeader && n3.state != StateFollower { + t.Errorf("no leader") + } +} + +func TestPreVoteMigrationWithFreeStuckPreCandidate(t *testing.T) { + nt := newPreVoteMigrationCluster(t) + + // n1 is leader with term 2 + // n2 is follower with term 2 + // n3 is pre-candidate with term 3, and less log + n1 := nt.peers[1].(*raft) + n2 := nt.peers[2].(*raft) + n3 := nt.peers[3].(*raft) + + nt.send(pb.Message{From: 3, To: 3, Type: pb.MsgHup}) + + if n1.state != StateLeader { + t.Errorf("node 1 state: %s, want %s", n1.state, StateLeader) + } + if n2.state != StateFollower { + t.Errorf("node 2 state: %s, want %s", n2.state, StateFollower) + } + if n3.state != StatePreCandidate { + t.Errorf("node 3 state: %s, want %s", n3.state, StatePreCandidate) + } + + // Pre-Vote again for safety + nt.send(pb.Message{From: 3, To: 3, Type: pb.MsgHup}) + + if n1.state != StateLeader { + t.Errorf("node 1 state: %s, want %s", n1.state, StateLeader) + } + if n2.state != StateFollower { + t.Errorf("node 2 state: %s, want %s", n2.state, StateFollower) + } + if n3.state != StatePreCandidate { + t.Errorf("node 3 state: %s, want %s", n3.state, StatePreCandidate) + } + + nt.send(pb.Message{From: 1, To: 3, Type: pb.MsgHeartbeat, Term: n1.Term}) + + // Disrupt the leader so that the stuck peer is freed + if n1.state != StateFollower { + t.Errorf("state = %s, want %s", n1.state, StateFollower) + } + if n3.Term != n1.Term { + t.Errorf("term = %d, want %d", n3.Term, n1.Term) + } +} + func entsWithConfig(configFunc func(*Config), terms ...uint64) *raft { storage := NewMemoryStorage() for i, term := range terms {