From b05496210b50830981a77fb1575f53996ec06932 Mon Sep 17 00:00:00 2001 From: Arul Ajmani Date: Mon, 14 Oct 2024 18:09:09 -0400 Subject: [PATCH] raft: stop broadcasting MsgDeFortifyLeader once it is safe This patch identifies the scenario where a leader can safely stop de-fortifying its peers -- when a new leader has been elected and has been able to commit an entry. Informs https://github.com/cockroachdb/cockroach/issues/132581 Release note: None --- pkg/raft/raft.go | 32 +- .../testdata/de_fortification_checkquorum.txt | 312 +++++++++++++++++- pkg/raft/tracker/fortificationtracker.go | 51 +++ 3 files changed, 388 insertions(+), 7 deletions(-) diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go index 8e7e3ce05d52..edfc193965b1 100644 --- a/pkg/raft/raft.go +++ b/pkg/raft/raft.go @@ -841,9 +841,10 @@ func (r *raft) sendDeFortify(to pb.PeerID) { case r.Term == fortifiedTerm: r.deFortify(r.id, fortifiedTerm) case r.Term > fortifiedTerm: - // NB: The current term has advanced, so we don't attempt to de-fortify, as the de-fortification - // attempt corresponds to a prior term. These term checks would have happened in Step had we - // sent a self-addressed message; we decided not to, so we need to handle this especially + // NB: The current term has advanced, so we don't attempt to de-fortify, + // as the de-fortification attempt corresponds to a prior term. These term + // checks would have happened in Step had we sent a self-addressed + // message; we decided not to, so we need to handle this especially // ourselves. r.logger.Debugf("de-foritfying self at term %d is a no-op; current term %d", fortifiedTerm, r.Term, @@ -907,9 +908,28 @@ func (r *raft) bcastDeFortify() { // MsgDeFortifyLeader to all peers or not. func (r *raft) shouldBcastDeFortify() bool { assertTrue(r.state != pb.StateLeader, "leaders should not be de-fortifying without stepping down") - // TODO(arul): expand this condition to ensure a new leader has committed an - // entry. - return r.fortificationTracker.FortificationEnabledForTerm() && r.fortificationTracker.CanDefortify() + + if !r.fortificationTracker.NeedsDefortify() { + // Fast path for if we've determined we no longer need to de-fortify. + return false + } + if !r.fortificationTracker.CanDefortify() { + return false + } + + committedTerm, err := r.raftLog.term(r.raftLog.committed) + if err != nil { + // NB: Extremely rare case where we can't figure out what the currently + // committed term is. Conservatively send out a MsgDeFortifyLeader, but log + // loudly. + r.logger.Warningf("failed to get committed term: %v", err) + return true + } + + // Inform the fortification tracker of the committed term, and see whether the + // answer to whether we should de-fortify has changed. + r.fortificationTracker.InformCommittedTerm(committedTerm) + return r.fortificationTracker.NeedsDefortify() } // maybeUnpauseAndBcastAppend unpauses and attempts to send an MsgApp to all the diff --git a/pkg/raft/testdata/de_fortification_checkquorum.txt b/pkg/raft/testdata/de_fortification_checkquorum.txt index 32954a61e2cd..2a868d6d2a22 100644 --- a/pkg/raft/testdata/de_fortification_checkquorum.txt +++ b/pkg/raft/testdata/de_fortification_checkquorum.txt @@ -142,7 +142,30 @@ raft-state 2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 -# And as a result, 2 is able to win the election. +# A new leader hasn't been elected yet, and by extension, no new entry has been +# committed yet. As a result, 1 should continue to broadcast a +# MsgDeFortifyLeader on every hertbeat tick. +tick-heartbeat 1 +---- +ok + +stabilize +---- +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->2 MsgDeFortifyLeader Term:1 Log:0/0 + 1->3 MsgDeFortifyLeader Term:1 Log:0/0 +> 2 receiving messages + 1->2 MsgDeFortifyLeader Term:1 Log:0/0 + DEBUG 2 is not fortifying 1; de-fortification is a no-op +> 3 receiving messages + 1->3 MsgDeFortifyLeader Term:1 Log:0/0 + DEBUG 3 is not fortifying 1; de-fortification is a no-op + + +# But because all peers have de-fortified at this point, 2 is able to call and +# win the election. campaign 2 ---- INFO 2 is starting a new election at term 1 @@ -159,3 +182,290 @@ raft-state 1: StateFollower (Voter) Term:2 Lead:2 LeadEpoch:1 2: StateLeader (Voter) Term:2 Lead:2 LeadEpoch:1 3: StateFollower (Voter) Term:2 Lead:2 LeadEpoch:1 + +tick-heartbeat 1 +---- +ok + +# Note that 1 does not send a MsgDeFortifyLeader now that 2 has committed an +# entry. +stabilize +---- +ok + +# Lastly, we'll create a scenario where 2 steps down because of check quorum and +# a new leader is elected as a result, but 2 doesn't hear about this. It should +# continue to broadcast MsgDeFortifyLeader until it hears about this (i.e. it +# learns of an entry committed by a new leader). + +# Usual steps -- set randomized timeout for the benefit of the test, tick an +# election twice, and expire support to allow step down. +set-randomized-election-timeout 2 timeout=3 +---- +ok + +withdraw-support 1 2 +---- + 1 2 3 +1 1 x 1 +2 x 1 1 +3 x 1 1 + +withdraw-support 3 2 +---- + 1 2 3 +1 1 x 1 +2 x 1 1 +3 x x 1 + +support-expired 2 +---- +ok + +tick-election 2 +---- +INFO 2 leader at term 2 does not support itself in the liveness fabric +INFO 2 leader at term 2 does not support itself in the liveness fabric +DEBUG 2 does not have store liveness support from a quorum of peers +INFO 2 leader at term 2 does not support itself in the liveness fabric + +tick-election 2 +---- +INFO 2 leader at term 2 does not support itself in the liveness fabric +INFO 2 leader at term 2 does not support itself in the liveness fabric +DEBUG 2 has not received messages from a quorum of peers in the last election timeout +DEBUG 2 does not have store liveness support from a quorum of peers +WARN 2 stepped down to follower since quorum is not active +INFO 2 became follower at term 2 + +# Tick 1 once to ensure it grants its prevote. +set-randomized-election-timeout 1 timeout=5 +---- +ok + +tick-heartbeat 1 +---- +DEBUG 1 setting election elapsed to start from 3 ticks after store liveness support expired + +campaign 3 +---- +INFO 3 is starting a new election at term 2 +INFO 3 became pre-candidate at term 2 +INFO 3 [logterm: 2, index: 12] sent MsgPreVote request to 1 at term 2 +INFO 3 [logterm: 2, index: 12] sent MsgPreVote request to 2 at term 2 + +stabilize 3 +---- +> 3 handling Ready + Ready MustSync=true: + State:StatePreCandidate + HardState Term:2 Commit:12 Lead:0 LeadEpoch:0 + Messages: + 3->1 MsgPreVote Term:3 Log:2/12 + 3->2 MsgPreVote Term:3 Log:2/12 + INFO 3 received MsgPreVoteResp from 3 at term 2 + INFO 3 has received 1 MsgPreVoteResp votes and 0 vote rejections + +deliver-msgs drop=(2) +---- +dropped: 3->2 MsgPreVote Term:3 Log:2/12 + +stabilize 1 3 +---- +> 1 handling Ready + Ready MustSync=true: + HardState Term:2 Vote:2 Commit:12 Lead:2 LeadEpoch:0 +> 1 receiving messages + 3->1 MsgPreVote Term:3 Log:2/12 + INFO 1 [logterm: 2, index: 12, vote: 2] cast MsgPreVote for 3 [logterm: 2, index: 12] at term 2 +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->3 MsgPreVoteResp Term:3 Log:0/0 +> 3 receiving messages + 1->3 MsgPreVoteResp Term:3 Log:0/0 + INFO 3 received MsgPreVoteResp from 1 at term 2 + INFO 3 has received 2 MsgPreVoteResp votes and 0 vote rejections + INFO 3 became candidate at term 3 + INFO 3 [logterm: 2, index: 12] sent MsgVote request to 1 at term 3 + INFO 3 [logterm: 2, index: 12] sent MsgVote request to 2 at term 3 +> 3 handling Ready + Ready MustSync=true: + State:StateCandidate + HardState Term:3 Vote:3 Commit:12 Lead:0 LeadEpoch:0 + Messages: + 3->1 MsgVote Term:3 Log:2/12 + 3->2 MsgVote Term:3 Log:2/12 + INFO 3 received MsgVoteResp from 3 at term 3 + INFO 3 has received 1 MsgVoteResp votes and 0 vote rejections +> 1 receiving messages + 3->1 MsgVote Term:3 Log:2/12 + INFO 1 [term: 2] received a MsgVote message with higher term from 3 [term: 3] + INFO 1 became follower at term 3 + INFO 1 [logterm: 2, index: 12, vote: 0] cast MsgVote for 3 [logterm: 2, index: 12] at term 3 +> 1 handling Ready + Ready MustSync=true: + HardState Term:3 Vote:3 Commit:12 Lead:0 LeadEpoch:0 + Messages: + 1->3 MsgVoteResp Term:3 Log:0/0 +> 3 receiving messages + 1->3 MsgVoteResp Term:3 Log:0/0 + INFO 3 received MsgVoteResp from 1 at term 3 + INFO 3 has received 2 MsgVoteResp votes and 0 vote rejections + INFO 3 became leader at term 3 +> 3 handling Ready + Ready MustSync=true: + State:StateLeader + HardState Term:3 Vote:3 Commit:12 Lead:3 LeadEpoch:1 + Entries: + 3/13 EntryNormal "" + Messages: + 3->1 MsgFortifyLeader Term:3 Log:0/0 + 3->2 MsgFortifyLeader Term:3 Log:0/0 + 3->1 MsgApp Term:3 Log:2/12 Commit:12 Entries:[3/13 EntryNormal ""] + 3->2 MsgApp Term:3 Log:2/12 Commit:12 Entries:[3/13 EntryNormal ""] +> 1 receiving messages + 3->1 MsgFortifyLeader Term:3 Log:0/0 + 3->1 MsgApp Term:3 Log:2/12 Commit:12 Entries:[3/13 EntryNormal ""] +> 1 handling Ready + Ready MustSync=true: + HardState Term:3 Vote:3 Commit:12 Lead:3 LeadEpoch:1 + Entries: + 3/13 EntryNormal "" + Messages: + 1->3 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1 + 1->3 MsgAppResp Term:3 Log:0/13 Commit:12 +> 3 receiving messages + 1->3 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1 + 1->3 MsgAppResp Term:3 Log:0/13 Commit:12 +> 3 handling Ready + Ready MustSync=true: + HardState Term:3 Vote:3 Commit:13 Lead:3 LeadEpoch:1 + CommittedEntries: + 3/13 EntryNormal "" + Messages: + 3->1 MsgApp Term:3 Log:2/12 Commit:12 Entries:[3/13 EntryNormal ""] + 3->1 MsgApp Term:3 Log:3/13 Commit:13 +> 1 receiving messages + 3->1 MsgApp Term:3 Log:2/12 Commit:12 Entries:[3/13 EntryNormal ""] + 3->1 MsgApp Term:3 Log:3/13 Commit:13 +> 1 handling Ready + Ready MustSync=true: + HardState Term:3 Vote:3 Commit:13 Lead:3 LeadEpoch:1 + CommittedEntries: + 3/13 EntryNormal "" + Messages: + 1->3 MsgAppResp Term:3 Log:0/13 Commit:12 + 1->3 MsgAppResp Term:3 Log:0/13 Commit:13 +> 3 receiving messages + 1->3 MsgAppResp Term:3 Log:0/13 Commit:12 + 1->3 MsgAppResp Term:3 Log:0/13 Commit:13 + +deliver-msgs drop=(2) +---- +dropped: 3->2 MsgVote Term:3 Log:2/12 +dropped: 3->2 MsgFortifyLeader Term:3 Log:0/0 +dropped: 3->2 MsgApp Term:3 Log:2/12 Commit:12 Entries:[3/13 EntryNormal ""] + +# 3 has been elected leader and has committed an entry. 2 is obilivious to this. +raft-state +---- +1: StateFollower (Voter) Term:3 Lead:3 LeadEpoch:1 +2: StateFollower (Voter) Term:2 Lead:0 LeadEpoch:1 +3: StateLeader (Voter) Term:3 Lead:3 LeadEpoch:1 + +raft-log 3 +---- +1/11 EntryNormal "" +2/12 EntryNormal "" +3/13 EntryNormal "" + +raft-log 2 +---- +1/11 EntryNormal "" +2/12 EntryNormal "" + +# And as a result, it continues to send out de-fortification requests. +tick-heartbeat 2 +---- +ok + +stabilize +---- +> 2 handling Ready + Ready MustSync=true: + State:StateFollower + HardState Term:2 Vote:2 Commit:12 Lead:2 LeadEpoch:0 + Messages: + 2->1 MsgDeFortifyLeader Term:2 Log:0/0 + 2->3 MsgDeFortifyLeader Term:2 Log:0/0 +> 1 receiving messages + 2->1 MsgDeFortifyLeader Term:2 Log:0/0 + INFO 1 [term: 3] ignored a MsgDeFortifyLeader message with lower term from 2 [term: 2] +> 3 receiving messages + 2->3 MsgDeFortifyLeader Term:2 Log:0/0 + INFO 3 [term: 3] ignored a MsgDeFortifyLeader message with lower term from 2 [term: 2] + +tick-heartbeat 3 +---- +ok + +# Catch 2 up. +stabilize +---- +> 3 handling Ready + Ready MustSync=false: + Messages: + 3->2 MsgFortifyLeader Term:3 Log:0/0 + 3->2 MsgApp Term:3 Log:2/12 Commit:13 Entries:[3/13 EntryNormal ""] +> 2 receiving messages + 3->2 MsgFortifyLeader Term:3 Log:0/0 + INFO 2 [term: 2] received a MsgFortifyLeader message with higher term from 3 [term: 3] + INFO 2 became follower at term 3 + 3->2 MsgApp Term:3 Log:2/12 Commit:13 Entries:[3/13 EntryNormal ""] +> 2 handling Ready + Ready MustSync=true: + HardState Term:3 Commit:13 Lead:3 LeadEpoch:1 + Entries: + 3/13 EntryNormal "" + CommittedEntries: + 3/13 EntryNormal "" + Messages: + 2->3 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1 + 2->3 MsgAppResp Term:3 Log:0/13 Commit:13 +> 3 receiving messages + 2->3 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1 + 2->3 MsgAppResp Term:3 Log:0/13 Commit:13 +> 3 handling Ready + Ready MustSync=false: + Messages: + 3->2 MsgApp Term:3 Log:2/12 Commit:13 Entries:[3/13 EntryNormal ""] +> 2 receiving messages + 3->2 MsgApp Term:3 Log:2/12 Commit:13 Entries:[3/13 EntryNormal ""] +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->3 MsgAppResp Term:3 Log:0/13 Commit:13 +> 3 receiving messages + 2->3 MsgAppResp Term:3 Log:0/13 Commit:13 + +raft-state +---- +1: StateFollower (Voter) Term:3 Lead:3 LeadEpoch:1 +2: StateFollower (Voter) Term:3 Lead:3 LeadEpoch:1 +3: StateLeader (Voter) Term:3 Lead:3 LeadEpoch:1 + +raft-log 2 +---- +1/11 EntryNormal "" +2/12 EntryNormal "" +3/13 EntryNormal "" + +# And now, 2 shouldn't send any de-fortification requests. +tick-heartbeat 2 +---- +ok + +stabilize +---- +ok diff --git a/pkg/raft/tracker/fortificationtracker.go b/pkg/raft/tracker/fortificationtracker.go index 2061ea1a4378..b1625005a712 100644 --- a/pkg/raft/tracker/fortificationtracker.go +++ b/pkg/raft/tracker/fortificationtracker.go @@ -41,6 +41,11 @@ type FortificationTracker struct { // raft leader attempted to fortify its term, and save this state. fortificationEnabledForTerm bool + // needsDefortification tracks whether the node should broadcast + // MsgDeFortifyLeader to all followers to de-fortify the term being tracked in + // the tracker or not. + needsDefortification bool + // fortification contains a map of nodes which have fortified the leader // through fortification handshakes, and the corresponding Store Liveness // epochs that they have supported the leader in. @@ -109,6 +114,10 @@ func (ft *FortificationTracker) RecordFortification(id pb.PeerID, epoch pb.Epoch func (ft *FortificationTracker) Reset(term uint64) { ft.term = term ft.fortificationEnabledForTerm = false + // Whether we need to de-fortify or not is first contingent on whether + // fortification was enabled for the term being tracked or not. If + // fortification was attempted, though, we'll need to de-fortify. + ft.needsDefortification = true clear(ft.fortification) ft.leaderMaxSupported.Reset() } @@ -207,6 +216,48 @@ func (ft *FortificationTracker) CanDefortify() bool { return ft.storeLiveness.SupportExpired(leaderMaxSupported) } +// NeedsDefortify returns whether the node should still continue to broadcast +// MsgDeFortifyLeader to all followers to de-fortify the term being tracked in +// the tracker or not. +func (ft *FortificationTracker) NeedsDefortify() bool { + if !ft.fortificationEnabledForTerm { + // We never attempted to fortify this term, so we don't need to de-fortify. + return false + } + return ft.needsDefortification +} + +// InformCommittedTerm informs the fortification tracker that an entry proposed +// in the supplied term has been committed. +func (ft *FortificationTracker) InformCommittedTerm(committedTerm uint64) { + if committedTerm > ft.term { + // The committed term (T+1) has advanced beyond the term being tracked in + // the fortification tracker (T). This means that not only was a new leader + // elected at term T+1, but it was also able to commit a log entry. This + // means that a majority of followers are no longer supporting the old + // leader at term T. This allows us to stop de-fortifying term T. + // + // Note that even if a minority of followers are still supporting the old + // leader at term T, and they never hear from the new leader at term T+1, + // this shouldn't prevent us from electing a new leader at term T+2 in the + // future. That's because when campaigning for term T+2, candidates will + // include their most recent log entry. This must be at term T+1 for any + // viable candidate. Then, even if a candidate needs a vote from a follower + // in the minority that is still supporting the leader at term T, the + // follower will grant its vote when it notices the candidate is + // campaigning with a log entry that was committed at term T+1. + // + // To reiterate, it's safe to stop de-fortifying once a new leader has been + // elected at term T' > T, and the new leader has commited a log entry at + // term T'[1]. We're in this case -- save some state, so we can safely say no + // the next time we're asked whether we need to de-fortify or not. + // + // [1] Note that the leader doesn't need to have this log entry committed at + // T' in its log, it just needs to know such an entry exists. + ft.needsDefortification = false + } +} + // ConfigChangeSafe returns whether it is safe to propose a configuration change // or not, given the current state of lead support. //