diff --git a/raft.go b/raft.go index cb2c4f7d7620..5a1505628a4b 100644 --- a/raft.go +++ b/raft.go @@ -1478,10 +1478,13 @@ func stepLeader(r *raft, m pb.Message) error { switch { case pr.State == tracker.StateProbe: pr.BecomeReplicate() - case pr.State == tracker.StateSnapshot && pr.Match >= pr.PendingSnapshot: - // TODO(tbg): we should also enter this branch if a snapshot is - // received that is below pr.PendingSnapshot but which makes it - // possible to use the log again. + case pr.State == tracker.StateSnapshot && pr.Match+1 >= r.raftLog.firstIndex(): + // Note that we don't take into account PendingSnapshot to + // enter this branch. No matter at which index a snapshot + // was actually applied, as long as this allows catching up + // the follower from the log, we will accept it. This gives + // systems more flexibility in how they implement snapshots; + // see the comments on PendingSnapshot. r.logger.Debugf("%x recovered from needing snapshot, resumed sending replication messages to %x [%s]", r.id, m.From, pr) // Transition back to replicating state via probing state // (which takes the snapshot into account). If we didn't @@ -1560,10 +1563,6 @@ func stepLeader(r *raft, m pb.Message) error { if pr.State != tracker.StateSnapshot { return nil } - // TODO(tbg): this code is very similar to the snapshot handling in - // MsgAppResp above. In fact, the code there is more correct than the - // code here and should likely be updated to match (or even better, the - // logic pulled into a newly created Progress state machine handler). if !m.Reject { pr.BecomeProbe() r.logger.Debugf("%x snapshot succeeded, resumed sending replication messages to %x [%s]", r.id, m.From, pr) diff --git a/testdata/snapshot_succeed_via_app_resp_behind.txt b/testdata/snapshot_succeed_via_app_resp_behind.txt index f40336de6692..094969828b19 100644 --- a/testdata/snapshot_succeed_via_app_resp_behind.txt +++ b/testdata/snapshot_succeed_via_app_resp_behind.txt @@ -1,8 +1,5 @@ # This is a variant of snapshot_succeed_via_app_resp in which the snapshot # that is being sent is behind the PendingSnapshot index tracked by the leader. -# -# In this case, the leader keeps the follower in StateSnapshot even though it -# could move it back to StateReplicate. # Turn off output during the setup of the test. log-level none @@ -151,11 +148,16 @@ stabilize 1 ---- > 1 receiving messages 3->1 MsgAppResp Term:1 Log:0/11 + DEBUG 1 recovered from needing snapshot, resumed sending replication messages to 3 [StateSnapshot match=11 next=12 paused pendingSnap=12] +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->3 MsgApp Term:1 Log:1/11 Commit:12 Entries:[1/12 EntryNormal "\"foo\""] -# 3 is still pending a snapshot, even though its match is 11 which -# the leader could catch the follower up from. +# 3 is in StateReplicate thanks to receiving the snapshot at index 11. +# This is despite its PendingSnapshot having been 12. status 1 ---- 1: StateReplicate match=12 next=13 2: StateReplicate match=12 next=13 -3: StateSnapshot match=11 next=12 paused pendingSnap=12 +3: StateReplicate match=11 next=13 inflight=1 diff --git a/tracker/progress.go b/tracker/progress.go index 5948fadfdfc6..4002b1ec3543 100644 --- a/tracker/progress.go +++ b/tracker/progress.go @@ -42,11 +42,27 @@ type Progress struct { // before and stops sending any replication message. State StateType - // PendingSnapshot is used in StateSnapshot. - // If there is a pending snapshot, the pendingSnapshot will be set to the - // index of the snapshot. If pendingSnapshot is set, the replication process of - // this Progress will be paused. raft will not resend snapshot until the pending one - // is reported to be failed. + // PendingSnapshot is used in StateSnapshot and tracks the last index of the + // leader at the time at which it realized a snapshot was necessary. This + // matches the index in the MsgSnap message emitted from raft. + // + // While there is a pending snapshot, replication to the follower is paused. + // The follower will transition back to StateReplicate if the leader + // receives an MsgAppResp from it that reconnects the follower to the + // leader's log (such an MsgAppResp is emitted when the follower applies a + // snapshot). It may be surprising that PendingSnapshot is not taken into + // account here, but consider that complex systems may delegate the sending + // of snapshots to alternative datasources (i.e. not the leader). In such + // setups, it is difficult to manufacture a snapshot at a particular index + // requested by raft and the actual index may be ahead or behind. This + // should be okay, as long as the snapshot allows replication to resume. + // + // The follower will transition to StateProbe if ReportSnapshot is called on + // the leader; if SnapshotFinish is passed then PendingSnapshot becomes the + // basis for the next attempt to append. In practice, the first mechanism is + // the one that is relevant in most cases. However, if this MsgAppResp is + // lost (fallible network) then the second mechanism ensures that in this + // case the follower does not erroneously remain in StateSnapshot. PendingSnapshot uint64 // RecentActive is true if the progress is recently active. Receiving any messages