From ff112fef699b35e040f58904fcedcbb95bc6b2a1 Mon Sep 17 00:00:00 2001
From: fanmin shi <fanmin.shi@coreos.com>
Date: Thu, 4 May 2017 15:57:25 -0700
Subject: [PATCH] etcdserver: renaming db happens before snapshot persists to
 wal and snap files

In the case that follower recieves a snapshot from leader
and crashes before renaming xxx.snap.db to db but after
snapshot has persisted to .wal and .snap, restarting
follower results loading old db, new .wal, and new .snap.
This will causes a index mismatch between snap metadata index
and consistent index from db.

This pr forces an ordering where saving/renaming db must
happen before snapshot is persisted to wal and snap file.
this ensures that db file can never be newer than  wal and snap file.
hence, it guarantees the invariant snapshot.Metadata.Index <= db.ConsistentIndex()
in NewServer() when checking validity of db and snap file.

FIXES #7628
---
 etcdserver/raft.go   | 17 ++++++++++-------
 etcdserver/server.go |  4 +++-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/etcdserver/raft.go b/etcdserver/raft.go
index b87ceea54660..d8d8d087021c 100644
--- a/etcdserver/raft.go
+++ b/etcdserver/raft.go
@@ -83,7 +83,8 @@ type RaftTimer interface {
 type apply struct {
 	entries  []raftpb.Entry
 	snapshot raftpb.Snapshot
-	raftDone <-chan struct{} // rx {} after raft has persisted messages
+	// notifyc synchronizes etcd server applies with the raft node
+	notifyc chan struct{}
 }
 
 type raftNode struct {
@@ -190,11 +191,11 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 					}
 				}
 
-				raftDone := make(chan struct{}, 1)
+				notifyc := make(chan struct{}, 1)
 				ap := apply{
 					entries:  rd.CommittedEntries,
 					snapshot: rd.Snapshot,
-					raftDone: raftDone,
+					notifyc:  notifyc,
 				}
 
 				updateCommittedIndex(&ap, rh)
@@ -223,6 +224,8 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 				// gofail: var raftAfterSave struct{}
 
 				if !raft.IsEmptySnap(rd.Snapshot) {
+					// wait for snapshot db to be renamed to in-use db
+					<-notifyc
 					// gofail: var raftBeforeSaveSnap struct{}
 					if err := r.storage.SaveSnap(rd.Snapshot); err != nil {
 						plog.Fatalf("raft save snapshot error: %v", err)
@@ -240,7 +243,7 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 					msgs := r.processMessages(rd.Messages)
 
 					// now unblocks 'applyAll' that waits on Raft log disk writes before triggering snapshots
-					raftDone <- struct{}{}
+					notifyc <- struct{}{}
 
 					// Candidate or follower needs to wait for all pending configuration
 					// changes to be applied before sending messages.
@@ -259,9 +262,9 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 					if waitApply {
 						// blocks until 'applyAll' calls 'applyWait.Trigger'
 						// to be in sync with scheduled config-change job
-						// (assume raftDone has cap of 1)
+						// (assume notifyc has cap of 1)
 						select {
-						case raftDone <- struct{}{}:
+						case notifyc <- struct{}{}:
 						case <-r.stopped:
 							return
 						}
@@ -271,7 +274,7 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 					r.transport.Send(msgs)
 				} else {
 					// leader already processed 'MsgSnap' and signaled
-					raftDone <- struct{}{}
+					notifyc <- struct{}{}
 				}
 
 				r.Advance()
diff --git a/etcdserver/server.go b/etcdserver/server.go
index 33430276bff6..361e23e964ab 100644
--- a/etcdserver/server.go
+++ b/etcdserver/server.go
@@ -778,7 +778,7 @@ func (s *EtcdServer) applyAll(ep *etcdProgress, apply *apply) {
 	// wait for the raft routine to finish the disk writes before triggering a
 	// snapshot. or applied index might be greater than the last index in raft
 	// storage, since the raft routine might be slower than apply routine.
-	<-apply.raftDone
+	<-apply.notifyc
 
 	s.triggerSnapshot(ep)
 	select {
@@ -812,6 +812,8 @@ func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
 	if err := os.Rename(snapfn, fn); err != nil {
 		plog.Panicf("rename snapshot file error: %v", err)
 	}
+	// raft can now claim the snapshot has been applied
+	apply.notifyc <- struct{}{}
 
 	newbe := newBackend(fn, s.Cfg.QuotaBackendBytes)