From 4bec0d7d6727965411a4fdc4a3c1a623238be2ca Mon Sep 17 00:00:00 2001
From: Gyuho Lee <gyuhox@gmail.com>
Date: Thu, 19 Apr 2018 14:00:31 -0700
Subject: [PATCH 1/6] etcdserver: add "InitialElectionTickAdvance"

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
---
 etcdserver/config.go | 34 ++++++++++++++++++++++++++++++++--
 etcdserver/server.go |  4 ++++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/etcdserver/config.go b/etcdserver/config.go
index 9d9cd4047f3..e67628f9473 100644
--- a/etcdserver/config.go
+++ b/etcdserver/config.go
@@ -55,8 +55,38 @@ type ServerConfig struct {
 	// whose Host header value exists in this white list.
 	HostWhitelist map[string]struct{}
 
-	TickMs           uint
-	ElectionTicks    int
+	TickMs        uint
+	ElectionTicks int
+
+	// InitialElectionTickAdvance, if true, makes the local member fast-forward
+	// election ticks to speed up the "initial" leader election trigger. This
+	// benefits the case of larger election ticks. For instance, a cross
+	// datacenter deployment may require a longer election timeout of 10 seconds.
+	// If true, the local node does not need to wait up to 10 seconds. Instead,
+	// it forwards its election ticks to 8 seconds, and has only 2 seconds left
+	// before leader election.
+	//
+	// Major assumptions are that:
+	//  - cluster has no active leader thus advancing ticks enables faster
+	//    leader election, or
+	//  - cluster already has an established leader, and rejoining follower
+	//    is likely to receive heartbeats from the leader after tick advance
+	//    and before election timeout.
+	//
+	// However, when the network from the leader to the rejoining follower is
+	// congested, and the follower does not receive a leader heartbeat within
+	// the remaining election ticks, a disruptive election has to happen, thus
+	// affecting cluster availability.
+	//
+	// Disabling this would slow down initial bootstrap process for cross
+	// datacenter deployments. Make your own tradeoffs by configuring
+	// --initial-election-tick-advance at the cost of slow initial bootstrap.
+	//
+	// If single-node, it advances ticks regardless.
+	//
+	// See https://github.com/coreos/etcd/issues/9333 for more detail.
+	InitialElectionTickAdvance bool
+
 	BootstrapTimeout time.Duration
 
 	AutoCompactionRetention time.Duration
diff --git a/etcdserver/server.go b/etcdserver/server.go
index e61bcc92a1a..f7e32fec2ed 100644
--- a/etcdserver/server.go
+++ b/etcdserver/server.go
@@ -635,6 +635,10 @@ func (s *EtcdServer) adjustTicks() {
 		return
 	}
 
+	if !s.Cfg.InitialElectionTickAdvance {
+		return
+	}
+
 	// retry up to "rafthttp.ConnReadTimeout", which is 5-sec
 	// until peer connection reports; otherwise:
 	// 1. all connections failed, or

From 3fe9030d348e3cf17ed48da5dfbc6a4c4355f9af Mon Sep 17 00:00:00 2001
From: Gyuho Lee <gyuhox@gmail.com>
Date: Thu, 19 Apr 2018 14:01:23 -0700
Subject: [PATCH 2/6] integration: set InitialElectionTickAdvance to true by
 default

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
---
 integration/cluster.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/integration/cluster.go b/integration/cluster.go
index 38b140eb073..e872e552106 100644
--- a/integration/cluster.go
+++ b/integration/cluster.go
@@ -593,6 +593,7 @@ func mustNewMember(t *testing.T, mcfg memberConfig) *member {
 		m.ServerConfig.PeerTLSInfo = *m.PeerTLSInfo
 	}
 	m.ElectionTicks = electionTicks
+	m.InitialElectionTickAdvance = true
 	m.TickMs = uint(tickDuration / time.Millisecond)
 	m.QuotaBackendBytes = mcfg.quotaBackendBytes
 	m.MaxTxnOps = mcfg.maxTxnOps

From bffc532f9f6af500e8ad27ca476507be81a80fd5 Mon Sep 17 00:00:00 2001
From: Gyuho Lee <gyuhox@gmail.com>
Date: Thu, 19 Apr 2018 14:03:17 -0700
Subject: [PATCH 3/6] embed: add "InitialElectionTickAdvance"

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
---
 embed/config.go | 39 +++++++++++++++++++++++++---
 embed/etcd.go   | 67 +++++++++++++++++++++++++------------------------
 2 files changed, 69 insertions(+), 37 deletions(-)

diff --git a/embed/config.go b/embed/config.go
index 5b361ab8622..28c3a0d78d7 100644
--- a/embed/config.go
+++ b/embed/config.go
@@ -121,8 +121,38 @@ type Config struct {
 	// TickMs is the number of milliseconds between heartbeat ticks.
 	// TODO: decouple tickMs and heartbeat tick (current heartbeat tick = 1).
 	// make ticks a cluster wide configuration.
-	TickMs            uint  `json:"heartbeat-interval"`
-	ElectionMs        uint  `json:"election-timeout"`
+	TickMs     uint `json:"heartbeat-interval"`
+	ElectionMs uint `json:"election-timeout"`
+
+	// InitialElectionTickAdvance, if true, makes the local member fast-forward
+	// election ticks to speed up the "initial" leader election trigger. This
+	// benefits the case of larger election ticks. For instance, a cross
+	// datacenter deployment may require a longer election timeout of 10 seconds.
+	// If true, the local node does not need to wait up to 10 seconds. Instead,
+	// it forwards its election ticks to 8 seconds, and has only 2 seconds left
+	// before leader election.
+	//
+	// Major assumptions are that:
+	//  - cluster has no active leader thus advancing ticks enables faster
+	//    leader election, or
+	//  - cluster already has an established leader, and rejoining follower
+	//    is likely to receive heartbeats from the leader after tick advance
+	//    and before election timeout.
+	//
+	// However, when the network from the leader to the rejoining follower is
+	// congested, and the follower does not receive a leader heartbeat within
+	// the remaining election ticks, a disruptive election has to happen, thus
+	// affecting cluster availability.
+	//
+	// Disabling this would slow down initial bootstrap process for cross
+	// datacenter deployments. Make your own tradeoffs by configuring
+	// --initial-election-tick-advance at the cost of slow initial bootstrap.
+	//
+	// If single-node, it advances ticks regardless.
+	//
+	// See https://github.com/coreos/etcd/issues/9333 for more detail.
+	InitialElectionTickAdvance bool `json:"initial-election-tick-advance"`
+
 	QuotaBackendBytes int64 `json:"quota-backend-bytes"`
 	MaxTxnOps         uint  `json:"max-txn-ops"`
 	MaxRequestBytes   uint  `json:"max-request-bytes"`
@@ -305,8 +335,9 @@ func NewConfig() *Config {
 		GRPCKeepAliveInterval: DefaultGRPCKeepAliveInterval,
 		GRPCKeepAliveTimeout:  DefaultGRPCKeepAliveTimeout,
 
-		TickMs:     100,
-		ElectionMs: 1000,
+		TickMs:                     100,
+		ElectionMs:                 1000,
+		InitialElectionTickAdvance: true,
 
 		LPUrls: []url.URL{*lpurl},
 		LCUrls: []url.URL{*lcurl},
diff --git a/embed/etcd.go b/embed/etcd.go
index c59cdf08350..7fe59e9635c 100644
--- a/embed/etcd.go
+++ b/embed/etcd.go
@@ -158,39 +158,40 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
 	}
 
 	srvcfg := etcdserver.ServerConfig{
-		Name:                    cfg.Name,
-		ClientURLs:              cfg.ACUrls,
-		PeerURLs:                cfg.APUrls,
-		DataDir:                 cfg.Dir,
-		DedicatedWALDir:         cfg.WalDir,
-		SnapCount:               cfg.SnapCount,
-		MaxSnapFiles:            cfg.MaxSnapFiles,
-		MaxWALFiles:             cfg.MaxWalFiles,
-		InitialPeerURLsMap:      urlsmap,
-		InitialClusterToken:     token,
-		DiscoveryURL:            cfg.Durl,
-		DiscoveryProxy:          cfg.Dproxy,
-		NewCluster:              cfg.IsNewCluster(),
-		PeerTLSInfo:             cfg.PeerTLSInfo,
-		TickMs:                  cfg.TickMs,
-		ElectionTicks:           cfg.ElectionTicks(),
-		AutoCompactionRetention: autoCompactionRetention,
-		AutoCompactionMode:      cfg.AutoCompactionMode,
-		QuotaBackendBytes:       cfg.QuotaBackendBytes,
-		MaxTxnOps:               cfg.MaxTxnOps,
-		MaxRequestBytes:         cfg.MaxRequestBytes,
-		StrictReconfigCheck:     cfg.StrictReconfigCheck,
-		ClientCertAuthEnabled:   cfg.ClientTLSInfo.ClientCertAuth,
-		AuthToken:               cfg.AuthToken,
-		CORS:                    cfg.CORS,
-		HostWhitelist:           cfg.HostWhitelist,
-		InitialCorruptCheck:     cfg.ExperimentalInitialCorruptCheck,
-		CorruptCheckTime:        cfg.ExperimentalCorruptCheckTime,
-		PreVote:                 cfg.PreVote,
-		Logger:                  cfg.logger,
-		LoggerConfig:            cfg.loggerConfig,
-		Debug:                   cfg.Debug,
-		ForceNewCluster:         cfg.ForceNewCluster,
+		Name:                       cfg.Name,
+		ClientURLs:                 cfg.ACUrls,
+		PeerURLs:                   cfg.APUrls,
+		DataDir:                    cfg.Dir,
+		DedicatedWALDir:            cfg.WalDir,
+		SnapCount:                  cfg.SnapCount,
+		MaxSnapFiles:               cfg.MaxSnapFiles,
+		MaxWALFiles:                cfg.MaxWalFiles,
+		InitialPeerURLsMap:         urlsmap,
+		InitialClusterToken:        token,
+		DiscoveryURL:               cfg.Durl,
+		DiscoveryProxy:             cfg.Dproxy,
+		NewCluster:                 cfg.IsNewCluster(),
+		PeerTLSInfo:                cfg.PeerTLSInfo,
+		TickMs:                     cfg.TickMs,
+		ElectionTicks:              cfg.ElectionTicks(),
+		InitialElectionTickAdvance: cfg.InitialElectionTickAdvance,
+		AutoCompactionRetention:    autoCompactionRetention,
+		AutoCompactionMode:         cfg.AutoCompactionMode,
+		QuotaBackendBytes:          cfg.QuotaBackendBytes,
+		MaxTxnOps:                  cfg.MaxTxnOps,
+		MaxRequestBytes:            cfg.MaxRequestBytes,
+		StrictReconfigCheck:        cfg.StrictReconfigCheck,
+		ClientCertAuthEnabled:      cfg.ClientTLSInfo.ClientCertAuth,
+		AuthToken:                  cfg.AuthToken,
+		CORS:                       cfg.CORS,
+		HostWhitelist:              cfg.HostWhitelist,
+		InitialCorruptCheck:        cfg.ExperimentalInitialCorruptCheck,
+		CorruptCheckTime:           cfg.ExperimentalCorruptCheckTime,
+		PreVote:                    cfg.PreVote,
+		Logger:                     cfg.logger,
+		LoggerConfig:               cfg.loggerConfig,
+		Debug:                      cfg.Debug,
+		ForceNewCluster:            cfg.ForceNewCluster,
 	}
 	if e.Server, err = etcdserver.NewServer(srvcfg); err != nil {
 		return e, err

From 85b7a59c56d470ec99dfcd32dbcf75466ec79bf3 Mon Sep 17 00:00:00 2001
From: Gyuho Lee <gyuhox@gmail.com>
Date: Thu, 19 Apr 2018 14:07:38 -0700
Subject: [PATCH 4/6] etcdmain: add "--initial-election-tick-advance"

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
---
 etcdmain/config.go | 1 +
 etcdmain/help.go   | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/etcdmain/config.go b/etcdmain/config.go
index c4a6d0a481a..b884a111311 100644
--- a/etcdmain/config.go
+++ b/etcdmain/config.go
@@ -153,6 +153,7 @@ func newConfig() *config {
 	fs.Uint64Var(&cfg.ec.SnapCount, "snapshot-count", cfg.ec.SnapCount, "Number of committed transactions to trigger a snapshot to disk.")
 	fs.UintVar(&cfg.ec.TickMs, "heartbeat-interval", cfg.ec.TickMs, "Time (in milliseconds) of a heartbeat interval.")
 	fs.UintVar(&cfg.ec.ElectionMs, "election-timeout", cfg.ec.ElectionMs, "Time (in milliseconds) for an election to timeout.")
+	fs.BoolVar(&cfg.ec.InitialElectionTickAdvance, "initial-election-tick-advance", cfg.ec.InitialElectionTickAdvance, "Whether to fast-forward initial election ticks on boot for faster election.")
 	fs.Int64Var(&cfg.ec.QuotaBackendBytes, "quota-backend-bytes", cfg.ec.QuotaBackendBytes, "Raise alarms when backend size exceeds the given quota. 0 means use the default quota.")
 	fs.UintVar(&cfg.ec.MaxTxnOps, "max-txn-ops", cfg.ec.MaxTxnOps, "Maximum number of operations permitted in a transaction.")
 	fs.UintVar(&cfg.ec.MaxRequestBytes, "max-request-bytes", cfg.ec.MaxRequestBytes, "Maximum client request size in bytes the server will accept.")
diff --git a/etcdmain/help.go b/etcdmain/help.go
index 72e50043a09..55334f3fb4c 100644
--- a/etcdmain/help.go
+++ b/etcdmain/help.go
@@ -55,6 +55,8 @@ Member:
     Time (in milliseconds) of a heartbeat interval.
   --election-timeout '1000'
     Time (in milliseconds) for an election to timeout. See tuning documentation for details.
+  --initial-election-tick-advance 'true'
+    Whether to fast-forward initial election ticks on boot for faster election.
   --listen-peer-urls 'http://localhost:2380'
     List of URLs to listen on for peer traffic.
   --listen-client-urls 'http://localhost:2379'

From 21d2e2ab6edc5bdf38e5eef101382627a0233118 Mon Sep 17 00:00:00 2001
From: Gyuho Lee <gyuhox@gmail.com>
Date: Thu, 19 Apr 2018 14:19:12 -0700
Subject: [PATCH 5/6] etcdserver: add more tick fast-forward logs

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
---
 etcdserver/config.go | 1 +
 etcdserver/server.go | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/etcdserver/config.go b/etcdserver/config.go
index e67628f9473..f2a941e1c78 100644
--- a/etcdserver/config.go
+++ b/etcdserver/config.go
@@ -293,6 +293,7 @@ func (c *ServerConfig) print(initial bool) {
 			zap.String("heartbeat-interval", fmt.Sprintf("%v", time.Duration(c.TickMs)*time.Millisecond)),
 			zap.Int("election-tick-ms", c.ElectionTicks),
 			zap.String("election-timeout", fmt.Sprintf("%v", time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond)),
+			zap.Bool("initial-election-tick-advance", c.InitialElectionTickAdvance),
 			zap.Uint64("snapshot-count", c.SnapCount),
 			zap.Strings("advertise-client-urls", c.getACURLs()),
 			zap.Strings("initial-advertise-peer-urls", c.getAPURLs()),
diff --git a/etcdserver/server.go b/etcdserver/server.go
index f7e32fec2ed..5a3490ec917 100644
--- a/etcdserver/server.go
+++ b/etcdserver/server.go
@@ -636,8 +636,14 @@ func (s *EtcdServer) adjustTicks() {
 	}
 
 	if !s.Cfg.InitialElectionTickAdvance {
+		if lg != nil {
+			lg.Info("skipping initial election tick advance", zap.Int("election-ticks", s.Cfg.ElectionTicks))
+		}
 		return
 	}
+	if lg != nil {
+		lg.Info("starting initial election tick advance", zap.Int("election-ticks", s.Cfg.ElectionTicks))
+	}
 
 	// retry up to "rafthttp.ConnReadTimeout", which is 5-sec
 	// until peer connection reports; otherwise:

From 2d7cb9dac7eb53bfd804c80c36586d9d10a9e674 Mon Sep 17 00:00:00 2001
From: Gyuho Lee <gyuhox@gmail.com>
Date: Thu, 19 Apr 2018 14:44:22 -0700
Subject: [PATCH 6/6] CHANGELOG: add latest changes

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
---
 CHANGELOG-3.3.md | 18 ++++++++++++++++++
 CHANGELOG-3.4.md | 12 ++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/CHANGELOG-3.3.md b/CHANGELOG-3.3.md
index 9764da5478c..379636e5fae 100644
--- a/CHANGELOG-3.3.md
+++ b/CHANGELOG-3.3.md
@@ -6,6 +6,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.3...v3.3.4) and [
 
 ### Metrics, Monitoring
 
+- Add [`etcd_server_is_leader`](https://github.com/coreos/etcd/pull/9587) Prometheus metric.
 - Fix [`etcd_debugging_server_lease_expired_total`](https://github.com/coreos/etcd/pull/9557) Prometheus metric.
 - Fix [race conditions in v2 server stat collecting](https://github.com/coreos/etcd/pull/9562).
 
@@ -16,6 +17,23 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.3...v3.3.4) and [
   - However, a certificate whose SAN field does [not include any domain names but only IP addresses](https://github.com/coreos/etcd/issues/9541) would request `*tls.ClientHelloInfo` with an empty `ServerName` field, thus failing to trigger the TLS reload on initial TLS handshake; this becomes a problem when expired certificates need to be replaced online.
   - Now, `(*tls.Config).Certificates` is created empty on initial TLS client handshake, first to trigger `(*tls.Config).GetCertificate`, and then to populate rest of the certificates on every new TLS connection, even when client SNI is empty (e.g. cert only includes IPs).
 
+### Added: `etcd`
+
+- Add [`--initial-election-tick-advance`](https://github.com/coreos/etcd/pull/9591) flag to configure initial election tick fast-forward.
+  - By default, `--initial-election-tick-advance=true`, so the local member fast-forwards election ticks to speed up the "initial" leader election trigger.
+  - This benefits the case of larger election ticks. For instance, a cross datacenter deployment may require a longer election timeout of 10 seconds. If true, the local node does not need to wait up to 10 seconds. Instead, it forwards its election ticks to 8 seconds, and has only 2 seconds left before leader election.
+  - The major assumptions are that either the cluster has no active leader, so advancing ticks enables faster leader election, or the cluster already has an established leader, and the rejoining follower is likely to receive heartbeats from the leader after the tick advance and before the election timeout.
+  - However, when the network from the leader to the rejoining follower is congested, and the follower does not receive a leader heartbeat within the remaining election ticks, a disruptive election has to happen, thus affecting cluster availability.
+  - Now, this can be disabled by setting `--initial-election-tick-advance=false`.
+  - Disabling this would slow down initial bootstrap process for cross datacenter deployments. Make tradeoffs by configuring `--initial-election-tick-advance` at the cost of slow initial bootstrap.
+  - If single-node, it advances ticks regardless.
+  - Address [disruptive rejoining follower node](https://github.com/coreos/etcd/issues/9333).
+
+### Added: `embed`
+
+- Add [`embed.Config.InitialElectionTickAdvance`](https://github.com/coreos/etcd/pull/9591) to enable/disable initial election tick fast-forward.
+  - `embed.NewConfig()` would return `*embed.Config` with `InitialElectionTickAdvance` as true by default.
+
 
 ## [v3.3.3](https://github.com/coreos/etcd/releases/tag/v3.3.3) (2018-03-29)
 
diff --git a/CHANGELOG-3.4.md b/CHANGELOG-3.4.md
index 54bae2953db..0cad8a38374 100644
--- a/CHANGELOG-3.4.md
+++ b/CHANGELOG-3.4.md
@@ -92,6 +92,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.0...v3.4.0) and [
 
 ### Metrics, Monitoring
 
+- Add [`etcd_server_is_leader`](https://github.com/coreos/etcd/pull/9587) Prometheus metric.
 - Add [`etcd_debugging_mvcc_db_total_size_in_use_in_bytes`](https://github.com/coreos/etcd/pull/9256) Prometheus metric.
 - Add missing [`etcd_network_peer_sent_failures_total` count](https://github.com/coreos/etcd/pull/9437).
 - Fix [`etcd_debugging_server_lease_expired_total`](https://github.com/coreos/etcd/pull/9557) Prometheus metric.
@@ -122,6 +123,15 @@ See [security doc](https://github.com/coreos/etcd/blob/master/Documentation/op-g
 
 ### Added: `etcd`
 
+- Add [`--initial-election-tick-advance`](https://github.com/coreos/etcd/pull/9591) flag to configure initial election tick fast-forward.
+  - By default, `--initial-election-tick-advance=true`, so the local member fast-forwards election ticks to speed up the "initial" leader election trigger.
+  - This benefits the case of larger election ticks. For instance, a cross datacenter deployment may require a longer election timeout of 10 seconds. If true, the local node does not need to wait up to 10 seconds. Instead, it forwards its election ticks to 8 seconds, and has only 2 seconds left before leader election.
+  - The major assumptions are that either the cluster has no active leader, so advancing ticks enables faster leader election, or the cluster already has an established leader, and the rejoining follower is likely to receive heartbeats from the leader after the tick advance and before the election timeout.
+  - However, when the network from the leader to the rejoining follower is congested, and the follower does not receive a leader heartbeat within the remaining election ticks, a disruptive election has to happen, thus affecting cluster availability.
+  - Now, this can be disabled by setting `--initial-election-tick-advance=false`.
+  - Disabling this would slow down initial bootstrap process for cross datacenter deployments. Make tradeoffs by configuring `--initial-election-tick-advance` at the cost of slow initial bootstrap.
+  - If single-node, it advances ticks regardless.
+  - Address [disruptive rejoining follower node](https://github.com/coreos/etcd/issues/9333).
 - Add [`--pre-vote`](https://github.com/coreos/etcd/pull/9352) flag to enable to run an additional Raft election phase.
   - For instance, a flaky(or rejoining) member may drop in and out, and start campaign. This member will end up with a higher term, and ignore all incoming messages with lower term. In this case, a new leader eventually need to get elected, thus disruptive to cluster availability. Raft implements Pre-Vote phase to prevent this kind of disruptions. If enabled, Raft runs an additional phase of election to check if pre-candidate can get enough votes to win an election.
   - `--pre-vote=false` by default.
@@ -153,6 +163,8 @@ See [security doc](https://github.com/coreos/etcd/blob/master/Documentation/op-g
 
 ### Added: `embed`
 
+- Add [`embed.Config.InitialElectionTickAdvance`](https://github.com/coreos/etcd/pull/9591) to enable/disable initial election tick fast-forward.
+  - `embed.NewConfig()` would return `*embed.Config` with `InitialElectionTickAdvance` as true by default.
 - Add [`embed.Config.Logger`](https://github.com/coreos/etcd/pull/9518) to support [structured logger `zap`](https://github.com/uber-go/zap) in server-side.
 - Define [`embed.CompactorModePeriodic`](https://godoc.org/github.com/coreos/etcd/embed#pkg-variables) for `compactor.ModePeriodic`.
 - Define [`embed.CompactorModeRevision`](https://godoc.org/github.com/coreos/etcd/embed#pkg-variables) for `compactor.ModeRevision`.