Skip to content

Commit

Permalink
raft: respect leader fortification in MsgForgetLeader
Browse files Browse the repository at this point in the history
Fixes cockroachdb#124496.

This commit makes peers consult leader fortification state when
processing a MsgForgetLeader message. If the leader is fortified and
currently supported, the peer will ignore the message.

Epic: None
Release note: None
  • Loading branch information
nvanbenschoten committed Aug 14, 2024
1 parent e35dc31 commit 74cc8e2
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 2 deletions.
39 changes: 38 additions & 1 deletion pkg/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,12 @@ func (r *raft) hup(t CampaignType) {
r.logger.Debugf("%x ignoring MsgHup because already leader", r.id)
return
}

// TODO(arul): we will eventually want some kind of logic like this.
//
//if r.supportingFortifiedLeader() && t != campaignTransfer {
// r.logger.Debugf("%x ignoring MsgHup due to leader fortification", r.id)
// return
//}
if !r.promotable() {
r.logger.Warningf("%x is unpromotable and can not campaign", r.id)
return
Expand All @@ -1009,6 +1014,21 @@ func (r *raft) hup(t CampaignType) {
r.campaign(t)
}

// supportingFortifiedLeader returns whether the peer is providing fortification
// support to a leader. When a peer is providing support to a leader, it must
// not campaign or vote to disrupt that leader's term, unless specifically asked
// to do so by the leader.
// TODO(arul): this is a placeholder implementation. Move it around as you see
// fit.
func (r *raft) supportingFortifiedLeader() bool {
if r.leadEpoch == 0 {
return false // not supporting any leader
}
assertTrue(r.lead != None, "leader epoch is set but leader is not")
epoch, ok := r.storeLiveness.SupportFor(r.lead)
return ok && epoch == r.leadEpoch
}

// errBreak is a sentinel error used to break a callback-based loop.
var errBreak = errors.New("break")

Expand Down Expand Up @@ -1727,13 +1747,30 @@ func stepFollower(r *raft, m pb.Message) error {
m.To = r.lead
r.send(m)
case pb.MsgForgetLeader:
// TODO(nvanbenschoten): remove MsgForgetLeader once the sole caller in
// Replica.maybeForgetLeaderOnVoteRequestLocked is removed. This will
// accompany the removal of epoch-based leases.
if r.lead == None {
r.logger.Infof("%x no leader at term %d; dropping forget leader msg", r.id, r.Term)
return nil
}
if r.supportingFortifiedLeader() && r.lead != m.From {
r.logger.Infof("%x [term %d] ignored MsgForgetLeader from %x due to leader fortification", r.id, r.Term, m.From)
return nil
}
r.logger.Infof("%x forgetting leader %x at term %d", r.id, r.lead, r.Term)
r.lead = None
case pb.MsgTimeoutNow:
// TODO(nvanbenschoten): we will eventually want some kind of logic like
// this. However, even this may not be enough, because we're calling a
// campaignTransfer election, which bypasses leader fortification checks. It
// may never be safe for MsgTimeoutNow to come from anyone but the leader.
// We need to think about this more.
//
//if r.supportingFortifiedLeader() && r.lead != m.From {
// r.logger.Infof("%x [term %d] ignored MsgTimeoutNow from %x due to leader fortification", r.id, r.Term, m.From)
// return nil
//}
r.logger.Infof("%x [term %d] received MsgTimeoutNow from %x and starts an election to get leadership", r.id, r.Term, m.From)
// Leadership transfers never use pre-vote even if r.preVote is true; we
// know we are not recovering from a partition so there is no need for the
Expand Down
37 changes: 36 additions & 1 deletion pkg/raft/testdata/forget_leader.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,20 @@ raft-state
3: StateFollower (Voter) Term:1 Lead:1
4: StateFollower (Non-Voter) Term:1 Lead:1

# ForgetLeader is ignored if the follower is supporting the leader's store
# liveness epoch.
forget-leader 2
----
INFO 2 [term 1] ignored MsgForgetLeader from 0 due to leader fortification

withdraw-support 2 1
----
1 2 3 4
1 1 1 1 1
2 x 1 1 1
3 1 1 1 1
4 1 1 1 1

# ForgetLeader causes a follower to forget its leader, but remain in the current
# term. It's a noop if it's called again.
forget-leader 2
Expand All @@ -54,7 +68,20 @@ raft-state
3: StateFollower (Voter) Term:1 Lead:1
4: StateFollower (Non-Voter) Term:1 Lead:1

# ForgetLeader also works on learners.
# ForgetLeader also works on learners, but only if they are not supporting the
# leader's store liveness epoch.
forget-leader 4
----
INFO 4 [term 1] ignored MsgForgetLeader from 0 due to leader fortification

withdraw-support 4 1
----
1 2 3 4
1 1 1 1 1
2 x 1 1 1
3 1 1 1 1
4 x 1 1 1

forget-leader 4
----
INFO 4 forgetting leader 1 at term 1
Expand Down Expand Up @@ -169,6 +196,14 @@ tick-heartbeat 2
----
ok

withdraw-support 2 3
----
1 2 3 4
1 1 1 1 1
2 x 1 x 1
3 1 1 1 1
4 x 1 1 1

forget-leader 2
----
INFO 2 forgetting leader 3 at term 2
Expand Down
20 changes: 20 additions & 0 deletions pkg/raft/testdata/forget_leader_prevote_checkquorum.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,19 @@ raft-state
2: StateFollower (Voter) Term:1 Lead:1
3: StateFollower (Voter) Term:1 Lead:1

# ForgetLeader is ignored if the follower is supporting the leader's store
# liveness epoch.
forget-leader 2
----
INFO 2 [term 1] ignored MsgForgetLeader from 0 due to leader fortification

withdraw-support 2 1
----
1 2 3
1 1 1 1
2 x 1 1
3 1 1 1

# If 2 forgets the leader, then 3 can obtain prevotes and hold an election
# despite 2 having heard from the leader recently.
forget-leader 2
Expand Down Expand Up @@ -187,6 +200,13 @@ stabilize 2
Messages:
2->3 MsgAppResp Term:2 Log:0/13

withdraw-support 2 3
----
1 2 3
1 1 1 1
2 x 1 x
3 1 1 1

forget-leader 2
----
INFO 2 forgetting leader 3 at term 2
Expand Down

0 comments on commit 74cc8e2

Please sign in to comment.