Merge pull request #9591 from gyuho/election

*: add --initial-election-tick-advance to configure election fast-forward on bootstrap
etcd-io · Apr 23, 2018 · 4f45f5d · 4f45f5d
2 parents e81f9d8 + 2d7cb9d
commit 4f45f5d
Show file tree

Hide file tree

Showing 9 changed files with 146 additions and 39 deletions.
diff --git a/CHANGELOG-3.3.md b/CHANGELOG-3.3.md
@@ -6,6 +6,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.3...v3.3.4) and [
 
 ### Metrics, Monitoring
 
+- Add [`etcd_server_is_leader`](https://github.com/coreos/etcd/pull/9587) Prometheus metric.
 - Fix [`etcd_debugging_server_lease_expired_total`](https://github.com/coreos/etcd/pull/9557) Prometheus metric.
 - Fix [race conditions in v2 server stat collecting](https://github.com/coreos/etcd/pull/9562).
 
@@ -16,6 +17,23 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.3...v3.3.4) and [
   - However, a certificate whose SAN field does [not include any domain names but only IP addresses](https://github.com/coreos/etcd/issues/9541) would request `*tls.ClientHelloInfo` with an empty `ServerName` field, thus failing to trigger the TLS reload on initial TLS handshake; this becomes a problem when expired certificates need to be replaced online.
   - Now, `(*tls.Config).Certificates` is created empty on initial TLS client handshake, first to trigger `(*tls.Config).GetCertificate`, and then to populate rest of the certificates on every new TLS connection, even when client SNI is empty (e.g. cert only includes IPs).
 
+### Added: `etcd`
+
+- Add [`--initial-election-tick-advance`](https://github.com/coreos/etcd/pull/9591) flag to configure initial election tick fast-forward.
+  - By default, `--initial-election-tick-advance=true`, then local member fast-forwards election ticks to speed up "initial" leader election trigger.
+  - This benefits the case of larger election ticks. For instance, cross datacenter deployment may require longer election timeout of 10-second. If true, local node does not need wait up to 10-second. Instead, forwards its election ticks to 8-second, and have only 2-second left before leader election.
+  - Major assumptions are that: cluster has no active leader thus advancing ticks enables faster leader election. Or cluster already has an established leader, and rejoining follower is likely to receive heartbeats from the leader after tick advance and before election timeout.
+  - However, when network from leader to rejoining follower is congested, and the follower does not receive leader heartbeat within left election ticks, disruptive election has to happen thus affecting cluster availabilities.
+  - Now, this can be disabled by setting `--initial-election-tick-advance=false`.
+  - Disabling this would slow down initial bootstrap process for cross datacenter deployments. Make tradeoffs by configuring `--initial-election-tick-advance` at the cost of slow initial bootstrap.
+  - If single-node, it advances ticks regardless.
+  - Address [disruptive rejoining follower node](https://github.com/coreos/etcd/issues/9333).
+
+### Added: `embed`
+
+- Add [`embed.Config.InitialElectionTickAdvance`](https://github.com/coreos/etcd/pull/9591) to enable/disable initial election tick fast-forward.
+  - `embed.NewConfig()` would return `*embed.Config` with `InitialElectionTickAdvance` as true by default.
+
 
 ## [v3.3.3](https://github.com/coreos/etcd/releases/tag/v3.3.3) (2018-03-29)
 

diff --git a/CHANGELOG-3.4.md b/CHANGELOG-3.4.md
@@ -94,6 +94,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.0...v3.4.0) and [
 
 ### Metrics, Monitoring
 
+- Add [`etcd_server_is_leader`](https://github.com/coreos/etcd/pull/9587) Prometheus metric.
 - Add [`etcd_debugging_mvcc_db_total_size_in_use_in_bytes`](https://github.com/coreos/etcd/pull/9256) Prometheus metric.
 - Add missing [`etcd_network_peer_sent_failures_total` count](https://github.com/coreos/etcd/pull/9437).
 - Fix [`etcd_debugging_server_lease_expired_total`](https://github.com/coreos/etcd/pull/9557) Prometheus metric.
@@ -124,6 +125,15 @@ See [security doc](https://github.com/coreos/etcd/blob/master/Documentation/op-g
 
 ### Added: `etcd`
 
+- Add [`--initial-election-tick-advance`](https://github.com/coreos/etcd/pull/9591) flag to configure initial election tick fast-forward.
+  - By default, `--initial-election-tick-advance=true`, then local member fast-forwards election ticks to speed up "initial" leader election trigger.
+  - This benefits the case of larger election ticks. For instance, cross datacenter deployment may require longer election timeout of 10-second. If true, local node does not need wait up to 10-second. Instead, forwards its election ticks to 8-second, and have only 2-second left before leader election.
+  - Major assumptions are that: cluster has no active leader thus advancing ticks enables faster leader election. Or cluster already has an established leader, and rejoining follower is likely to receive heartbeats from the leader after tick advance and before election timeout.
+  - However, when network from leader to rejoining follower is congested, and the follower does not receive leader heartbeat within left election ticks, disruptive election has to happen thus affecting cluster availabilities.
+  - Now, this can be disabled by setting `--initial-election-tick-advance=false`.
+  - Disabling this would slow down initial bootstrap process for cross datacenter deployments. Make tradeoffs by configuring `--initial-election-tick-advance` at the cost of slow initial bootstrap.
+  - If single-node, it advances ticks regardless.
+  - Address [disruptive rejoining follower node](https://github.com/coreos/etcd/issues/9333).
 - Add [`--pre-vote`](https://github.com/coreos/etcd/pull/9352) flag to enable to run an additional Raft election phase.
   - For instance, a flaky(or rejoining) member may drop in and out, and start campaign. This member will end up with a higher term, and ignore all incoming messages with lower term. In this case, a new leader eventually need to get elected, thus disruptive to cluster availability. Raft implements Pre-Vote phase to prevent this kind of disruptions. If enabled, Raft runs an additional phase of election to check if pre-candidate can get enough votes to win an election.
   - `--pre-vote=false` by default.
@@ -155,6 +165,8 @@ See [security doc](https://github.com/coreos/etcd/blob/master/Documentation/op-g
 
 ### Added: `embed`
 
+- Add [`embed.Config.InitialElectionTickAdvance`](https://github.com/coreos/etcd/pull/9591) to enable/disable initial election tick fast-forward.
+  - `embed.NewConfig()` would return `*embed.Config` with `InitialElectionTickAdvance` as true by default.
 - Add [`embed.Config.Logger`](https://github.com/coreos/etcd/pull/9518) to support [structured logger `zap`](https://github.com/uber-go/zap) in server-side.
 - Define [`embed.CompactorModePeriodic`](https://godoc.org/github.com/coreos/etcd/embed#pkg-variables) for `compactor.ModePeriodic`.
 - Define [`embed.CompactorModeRevision`](https://godoc.org/github.com/coreos/etcd/embed#pkg-variables) for `compactor.ModeRevision`.

diff --git a/embed/config.go b/embed/config.go
@@ -121,8 +121,38 @@ type Config struct {
 	// TickMs is the number of milliseconds between heartbeat ticks.
 	// TODO: decouple tickMs and heartbeat tick (current heartbeat tick = 1).
 	// make ticks a cluster wide configuration.
-	TickMs            uint  `json:"heartbeat-interval"`
-	ElectionMs        uint  `json:"election-timeout"`
+	TickMs     uint `json:"heartbeat-interval"`
+	ElectionMs uint `json:"election-timeout"`
+
+	// InitialElectionTickAdvance is true, then local member fast-forwards
+	// election ticks to speed up "initial" leader election trigger. This
+	// benefits the case of larger election ticks. For instance, cross
+	// datacenter deployment may require longer election timeout of 10-second.
+	// If true, local node does not need wait up to 10-second. Instead,
+	// forwards its election ticks to 8-second, and have only 2-second left
+	// before leader election.
+	//
+	// Major assumptions are that:
+	//  - cluster has no active leader thus advancing ticks enables faster
+	//    leader election, or
+	//  - cluster already has an established leader, and rejoining follower
+	//    is likely to receive heartbeats from the leader after tick advance
+	//    and before election timeout.
+	//
+	// However, when network from leader to rejoining follower is congested,
+	// and the follower does not receive leader heartbeat within left election
+	// ticks, disruptive election has to happen thus affecting cluster
+	// availabilities.
+	//
+	// Disabling this would slow down initial bootstrap process for cross
+	// datacenter deployments. Make your own tradeoffs by configuring
+	// --initial-election-tick-advance at the cost of slow initial bootstrap.
+	//
+	// If single-node, it advances ticks regardless.
+	//
+	// See https://github.com/coreos/etcd/issues/9333 for more detail.
+	InitialElectionTickAdvance bool `json:"initial-election-tick-advance"`
+
 	QuotaBackendBytes int64 `json:"quota-backend-bytes"`
 	MaxTxnOps         uint  `json:"max-txn-ops"`
 	MaxRequestBytes   uint  `json:"max-request-bytes"`
@@ -305,8 +335,9 @@ func NewConfig() *Config {
 		GRPCKeepAliveInterval: DefaultGRPCKeepAliveInterval,
 		GRPCKeepAliveTimeout:  DefaultGRPCKeepAliveTimeout,
 
-		TickMs:     100,
-		ElectionMs: 1000,
+		TickMs:                     100,
+		ElectionMs:                 1000,
+		InitialElectionTickAdvance: true,
 
 		LPUrls: []url.URL{*lpurl},
 		LCUrls: []url.URL{*lcurl},

diff --git a/embed/etcd.go b/embed/etcd.go
@@ -158,39 +158,40 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
 	}
 
 	srvcfg := etcdserver.ServerConfig{
-		Name:                    cfg.Name,
-		ClientURLs:              cfg.ACUrls,
-		PeerURLs:                cfg.APUrls,
-		DataDir:                 cfg.Dir,
-		DedicatedWALDir:         cfg.WalDir,
-		SnapCount:               cfg.SnapCount,
-		MaxSnapFiles:            cfg.MaxSnapFiles,
-		MaxWALFiles:             cfg.MaxWalFiles,
-		InitialPeerURLsMap:      urlsmap,
-		InitialClusterToken:     token,
-		DiscoveryURL:            cfg.Durl,
-		DiscoveryProxy:          cfg.Dproxy,
-		NewCluster:              cfg.IsNewCluster(),
-		PeerTLSInfo:             cfg.PeerTLSInfo,
-		TickMs:                  cfg.TickMs,
-		ElectionTicks:           cfg.ElectionTicks(),
-		AutoCompactionRetention: autoCompactionRetention,
-		AutoCompactionMode:      cfg.AutoCompactionMode,
-		QuotaBackendBytes:       cfg.QuotaBackendBytes,
-		MaxTxnOps:               cfg.MaxTxnOps,
-		MaxRequestBytes:         cfg.MaxRequestBytes,
-		StrictReconfigCheck:     cfg.StrictReconfigCheck,
-		ClientCertAuthEnabled:   cfg.ClientTLSInfo.ClientCertAuth,
-		AuthToken:               cfg.AuthToken,
-		CORS:                    cfg.CORS,
-		HostWhitelist:           cfg.HostWhitelist,
-		InitialCorruptCheck:     cfg.ExperimentalInitialCorruptCheck,
-		CorruptCheckTime:        cfg.ExperimentalCorruptCheckTime,
-		PreVote:                 cfg.PreVote,
-		Logger:                  cfg.logger,
-		LoggerConfig:            cfg.loggerConfig,
-		Debug:                   cfg.Debug,
-		ForceNewCluster:         cfg.ForceNewCluster,
+		Name:                       cfg.Name,
+		ClientURLs:                 cfg.ACUrls,
+		PeerURLs:                   cfg.APUrls,
+		DataDir:                    cfg.Dir,
+		DedicatedWALDir:            cfg.WalDir,
+		SnapCount:                  cfg.SnapCount,
+		MaxSnapFiles:               cfg.MaxSnapFiles,
+		MaxWALFiles:                cfg.MaxWalFiles,
+		InitialPeerURLsMap:         urlsmap,
+		InitialClusterToken:        token,
+		DiscoveryURL:               cfg.Durl,
+		DiscoveryProxy:             cfg.Dproxy,
+		NewCluster:                 cfg.IsNewCluster(),
+		PeerTLSInfo:                cfg.PeerTLSInfo,
+		TickMs:                     cfg.TickMs,
+		ElectionTicks:              cfg.ElectionTicks(),
+		InitialElectionTickAdvance: cfg.InitialElectionTickAdvance,
+		AutoCompactionRetention:    autoCompactionRetention,
+		AutoCompactionMode:         cfg.AutoCompactionMode,
+		QuotaBackendBytes:          cfg.QuotaBackendBytes,
+		MaxTxnOps:                  cfg.MaxTxnOps,
+		MaxRequestBytes:            cfg.MaxRequestBytes,
+		StrictReconfigCheck:        cfg.StrictReconfigCheck,
+		ClientCertAuthEnabled:      cfg.ClientTLSInfo.ClientCertAuth,
+		AuthToken:                  cfg.AuthToken,
+		CORS:                       cfg.CORS,
+		HostWhitelist:              cfg.HostWhitelist,
+		InitialCorruptCheck:        cfg.ExperimentalInitialCorruptCheck,
+		CorruptCheckTime:           cfg.ExperimentalCorruptCheckTime,
+		PreVote:                    cfg.PreVote,
+		Logger:                     cfg.logger,
+		LoggerConfig:               cfg.loggerConfig,
+		Debug:                      cfg.Debug,
+		ForceNewCluster:            cfg.ForceNewCluster,
 	}
 	if e.Server, err = etcdserver.NewServer(srvcfg); err != nil {
 		return e, err

diff --git a/etcdmain/config.go b/etcdmain/config.go
@@ -153,6 +153,7 @@ func newConfig() *config {
 	fs.Uint64Var(&cfg.ec.SnapCount, "snapshot-count", cfg.ec.SnapCount, "Number of committed transactions to trigger a snapshot to disk.")
 	fs.UintVar(&cfg.ec.TickMs, "heartbeat-interval", cfg.ec.TickMs, "Time (in milliseconds) of a heartbeat interval.")
 	fs.UintVar(&cfg.ec.ElectionMs, "election-timeout", cfg.ec.ElectionMs, "Time (in milliseconds) for an election to timeout.")
+	fs.BoolVar(&cfg.ec.InitialElectionTickAdvance, "initial-election-tick-advance", cfg.ec.InitialElectionTickAdvance, "Whether to fast-forward initial election ticks on boot for faster election.")
 	fs.Int64Var(&cfg.ec.QuotaBackendBytes, "quota-backend-bytes", cfg.ec.QuotaBackendBytes, "Raise alarms when backend size exceeds the given quota. 0 means use the default quota.")
 	fs.UintVar(&cfg.ec.MaxTxnOps, "max-txn-ops", cfg.ec.MaxTxnOps, "Maximum number of operations permitted in a transaction.")
 	fs.UintVar(&cfg.ec.MaxRequestBytes, "max-request-bytes", cfg.ec.MaxRequestBytes, "Maximum client request size in bytes the server will accept.")

diff --git a/etcdmain/help.go b/etcdmain/help.go
@@ -55,6 +55,8 @@ Member:
     Time (in milliseconds) of a heartbeat interval.
   --election-timeout '1000'
     Time (in milliseconds) for an election to timeout. See tuning documentation for details.
+  --initial-election-tick-advance 'true'
+    Whether to fast-forward initial election ticks on boot for faster election.
   --listen-peer-urls 'http://localhost:2380'
     List of URLs to listen on for peer traffic.
   --listen-client-urls 'http://localhost:2379'

diff --git a/etcdserver/config.go b/etcdserver/config.go
@@ -55,8 +55,38 @@ type ServerConfig struct {
 	// whose Host header value exists in this white list.
 	HostWhitelist map[string]struct{}
 
-	TickMs           uint
-	ElectionTicks    int
+	TickMs        uint
+	ElectionTicks int
+
+	// InitialElectionTickAdvance is true, then local member fast-forwards
+	// election ticks to speed up "initial" leader election trigger. This
+	// benefits the case of larger election ticks. For instance, cross
+	// datacenter deployment may require longer election timeout of 10-second.
+	// If true, local node does not need wait up to 10-second. Instead,
+	// forwards its election ticks to 8-second, and have only 2-second left
+	// before leader election.
+	//
+	// Major assumptions are that:
+	//  - cluster has no active leader thus advancing ticks enables faster
+	//    leader election, or
+	//  - cluster already has an established leader, and rejoining follower
+	//    is likely to receive heartbeats from the leader after tick advance
+	//    and before election timeout.
+	//
+	// However, when network from leader to rejoining follower is congested,
+	// and the follower does not receive leader heartbeat within left election
+	// ticks, disruptive election has to happen thus affecting cluster
+	// availabilities.
+	//
+	// Disabling this would slow down initial bootstrap process for cross
+	// datacenter deployments. Make your own tradeoffs by configuring
+	// --initial-election-tick-advance at the cost of slow initial bootstrap.
+	//
+	// If single-node, it advances ticks regardless.
+	//
+	// See https://github.com/coreos/etcd/issues/9333 for more detail.
+	InitialElectionTickAdvance bool
+
 	BootstrapTimeout time.Duration
 
 	AutoCompactionRetention time.Duration
@@ -263,6 +293,7 @@ func (c *ServerConfig) print(initial bool) {
 			zap.String("heartbeat-interval", fmt.Sprintf("%v", time.Duration(c.TickMs)*time.Millisecond)),
 			zap.Int("election-tick-ms", c.ElectionTicks),
 			zap.String("election-timeout", fmt.Sprintf("%v", time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond)),
+			zap.Bool("initial-election-tick-advance", c.InitialElectionTickAdvance),
 			zap.Uint64("snapshot-count", c.SnapCount),
 			zap.Strings("advertise-client-urls", c.getACURLs()),
 			zap.Strings("initial-advertise-peer-urls", c.getAPURLs()),

diff --git a/etcdserver/server.go b/etcdserver/server.go
@@ -635,6 +635,16 @@ func (s *EtcdServer) adjustTicks() {
 		return
 	}
 
+	if !s.Cfg.InitialElectionTickAdvance {
+		if lg != nil {
+			lg.Info("skipping initial election tick advance", zap.Int("election-ticks", s.Cfg.ElectionTicks))
+		}
+		return
+	}
+	if lg != nil {
+		lg.Info("starting initial election tick advance", zap.Int("election-ticks", s.Cfg.ElectionTicks))
+	}
+
 	// retry up to "rafthttp.ConnReadTimeout", which is 5-sec
 	// until peer connection reports; otherwise:
 	// 1. all connections failed, or

diff --git a/integration/cluster.go b/integration/cluster.go
@@ -593,6 +593,7 @@ func mustNewMember(t *testing.T, mcfg memberConfig) *member {
 		m.ServerConfig.PeerTLSInfo = *m.PeerTLSInfo
 	}
 	m.ElectionTicks = electionTicks
+	m.InitialElectionTickAdvance = true
 	m.TickMs = uint(tickDuration / time.Millisecond)
 	m.QuotaBackendBytes = mcfg.quotaBackendBytes
 	m.MaxTxnOps = mcfg.maxTxnOps