Skip to content

Commit

Permalink
Merge pull request #9565 from gyuho/quorum-disaster
Browse files Browse the repository at this point in the history
functional: simulate quorum disaster
  • Loading branch information
gyuho authored Apr 12, 2018
2 parents 3f29d25 + f72449c commit 70341b1
Show file tree
Hide file tree
Showing 37 changed files with 3,132 additions and 1,591 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG-3.4.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.0...v3.4.0) and [
- Make [Lease `Lookup` non-blocking with concurrent `Grant`/`Revoke`](https://github.com/coreos/etcd/pull/9229).
- Make etcd server return `raft.ErrProposalDropped` on internal Raft proposal drop in [v3 applier](https://github.com/coreos/etcd/pull/9549) and [v2 applier](https://github.com/coreos/etcd/pull/9558).
- e.g. a node is removed from cluster, or [`raftpb.MsgProp` arrives at current leader while there is an ongoing leadership transfer](https://github.com/coreos/etcd/issues/8975).
- Improve [functional tester](https://github.com/coreos/etcd/tree/master/functional) coverage: [proxy layer to run network fault tests in CI](https://github.com/coreos/etcd/pull/9081), [TLS is enabled both for server and client](https://github.com/coreos/etcd/pull/9534), [liveness mode](https://github.com/coreos/etcd/issues/9230), [shuffle test sequence](https://github.com/coreos/etcd/issues/9381), [membership reconfiguration failure cases](https://github.com/coreos/etcd/pull/9564), [disastrous quorum loss and snapshot recovery](TODO).
- Improve [functional tester](https://github.com/coreos/etcd/tree/master/functional) coverage: [proxy layer to run network fault tests in CI](https://github.com/coreos/etcd/pull/9081), [TLS is enabled both for server and client](https://github.com/coreos/etcd/pull/9534), [liveness mode](https://github.com/coreos/etcd/issues/9230), [shuffle test sequence](https://github.com/coreos/etcd/issues/9381), [membership reconfiguration failure cases](https://github.com/coreos/etcd/pull/9564), [disastrous quorum loss and snapshot recovery from a seed member](https://github.com/coreos/etcd/pull/9565).

### Breaking Changes

Expand Down
82 changes: 63 additions & 19 deletions functional.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ agent-configs:
- etcd-exec-path: ./bin/etcd
agent-addr: 127.0.0.1:19027
failpoint-http-addr: http://127.0.0.1:7381
base-dir: /tmp/etcd-agent-data-1
etcd-log-path: /tmp/etcd-agent-data-1/current-etcd.log
base-dir: /tmp/etcd-functional-1
etcd-log-path: /tmp/etcd-functional-1/etcd.log
etcd-client-proxy: false
etcd-peer-proxy: true
etcd-client-endpoint: 127.0.0.1:1379
etcd:
name: s1
data-dir: /tmp/etcd-agent-data-1/etcd.data
wal-dir: /tmp/etcd-agent-data-1/etcd.data/member/wal
data-dir: /tmp/etcd-functional-1/etcd.data
wal-dir: /tmp/etcd-functional-1/etcd.data/member/wal
heartbeat-interval: 100
election-timeout: 1000
listen-client-urls: ["https://127.0.0.1:1379"]
Expand All @@ -34,18 +34,32 @@ agent-configs:
quota-backend-bytes: 10740000000 # 10 GiB
pre-vote: true
initial-corrupt-check: true
client-cert-data: ""
client-cert-path: ""
client-key-data: ""
client-key-path: ""
client-trusted-ca-data: ""
client-trusted-ca-path: ""
peer-cert-data: ""
peer-cert-path: ""
peer-key-data: ""
peer-key-path: ""
peer-trusted-ca-data: ""
peer-trusted-ca-path: ""
snapshot-path: /tmp/etcd-functional-1.snapshot.db

- etcd-exec-path: ./bin/etcd
agent-addr: 127.0.0.1:29027
failpoint-http-addr: http://127.0.0.1:7382
base-dir: /tmp/etcd-agent-data-2
etcd-log-path: /tmp/etcd-agent-data-2/current-etcd.log
base-dir: /tmp/etcd-functional-2
etcd-log-path: /tmp/etcd-functional-2/etcd.log
etcd-client-proxy: false
etcd-peer-proxy: true
etcd-client-endpoint: 127.0.0.1:2379
etcd:
name: s2
data-dir: /tmp/etcd-agent-data-2/etcd.data
wal-dir: /tmp/etcd-agent-data-2/etcd.data/member/wal
data-dir: /tmp/etcd-functional-2/etcd.data
wal-dir: /tmp/etcd-functional-2/etcd.data/member/wal
heartbeat-interval: 100
election-timeout: 1000
listen-client-urls: ["https://127.0.0.1:2379"]
Expand All @@ -69,18 +83,32 @@ agent-configs:
quota-backend-bytes: 10740000000 # 10 GiB
pre-vote: true
initial-corrupt-check: true
client-cert-data: ""
client-cert-path: ""
client-key-data: ""
client-key-path: ""
client-trusted-ca-data: ""
client-trusted-ca-path: ""
peer-cert-data: ""
peer-cert-path: ""
peer-key-data: ""
peer-key-path: ""
peer-trusted-ca-data: ""
peer-trusted-ca-path: ""
snapshot-path: /tmp/etcd-functional-2.snapshot.db

- etcd-exec-path: ./bin/etcd
agent-addr: 127.0.0.1:39027
failpoint-http-addr: http://127.0.0.1:7383
base-dir: /tmp/etcd-agent-data-3
etcd-log-path: /tmp/etcd-agent-data-3/current-etcd.log
base-dir: /tmp/etcd-functional-3
etcd-log-path: /tmp/etcd-functional-3/etcd.log
etcd-client-proxy: false
etcd-peer-proxy: true
etcd-client-endpoint: 127.0.0.1:3379
etcd:
name: s3
data-dir: /tmp/etcd-agent-data-3/etcd.data
wal-dir: /tmp/etcd-agent-data-3/etcd.data/member/wal
data-dir: /tmp/etcd-functional-3/etcd.data
wal-dir: /tmp/etcd-functional-3/etcd.data/member/wal
heartbeat-interval: 100
election-timeout: 1000
listen-client-urls: ["https://127.0.0.1:3379"]
Expand All @@ -104,6 +132,19 @@ agent-configs:
quota-backend-bytes: 10740000000 # 10 GiB
pre-vote: true
initial-corrupt-check: true
client-cert-data: ""
client-cert-path: ""
client-key-data: ""
client-key-path: ""
client-trusted-ca-data: ""
client-trusted-ca-path: ""
peer-cert-data: ""
peer-cert-path: ""
peer-key-data: ""
peer-key-path: ""
peer-trusted-ca-data: ""
peer-trusted-ca-path: ""
snapshot-path: /tmp/etcd-functional-3.snapshot.db

tester-config:
data-dir: /tmp/etcd-tester-data
Expand All @@ -116,15 +157,14 @@ tester-config:

round-limit: 1
exit-on-failure: true
consistency-check: true
enable-pprof: true

failure-delay-ms: 7000
failure-shuffle: true
case-delay-ms: 7000
case-shuffle: true

# For full descriptions,
# https://godoc.org/github.com/coreos/etcd/functional/rpcpb#FailureCase
failure-cases:
# https://godoc.org/github.com/coreos/etcd/functional/rpcpb#Case
cases:
- SIGTERM_ONE_FOLLOWER
- SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_LEADER
Expand Down Expand Up @@ -153,9 +193,9 @@ tester-config:
- RANDOM_DELAY_PEER_PORT_TX_RX_ALL
- NO_FAIL_WITH_STRESS
- NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS

# - SIGQUIT_AND_REMOVE_LEADER
# - SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT
# - SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH

failpoint-commands:
- panic("etcd-tester")
Expand All @@ -164,14 +204,18 @@ tester-config:
runner-exec-path: ./bin/etcd-runner
external-exec-path: ""

stress-types:
stressers:
- KV
- LEASE
# - ELECTION_RUNNER
# - WATCH_RUNNER
# - LOCK_RACER_RUNNER
# - LEASE_RUNNER

checkers:
- KV_HASH
- LEASE_EXPIRE

stress-key-size: 100
stress-key-size-large: 32769
stress-key-suffix-range: 250000
Expand Down
2 changes: 1 addition & 1 deletion functional/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[`functional`](https://godoc.org/github.com/coreos/etcd/functional) verifies the correct behavior of etcd under various system and network malfunctions. It sets up an etcd cluster under high pressure loads and continuously injects failures into the cluster. Then it expects the etcd cluster to recover within a few seconds. This has been extremely helpful to find critical bugs.

See [`rpcpb.FailureCase`](https://godoc.org/github.com/coreos/etcd/functional/rpcpb#FailureCase) for all failure cases.
See [`rpcpb.Case`](https://godoc.org/github.com/coreos/etcd/functional/rpcpb#Case) for all failure cases.

See [functional.yaml](https://github.com/coreos/etcd/blob/master/functional.yaml) for an example configuration.

Expand Down
70 changes: 67 additions & 3 deletions functional/agent/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ func (srv *Server) handleTesterRequest(req *rpcpb.Request) (resp *rpcpb.Response
case rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA:
return srv.handle_SIGQUIT_ETCD_AND_REMOVE_DATA()

case rpcpb.Operation_SAVE_SNAPSHOT:
return srv.handle_SAVE_SNAPSHOT()
case rpcpb.Operation_RESTORE_RESTART_FROM_SNAPSHOT:
return srv.handle_RESTORE_RESTART_FROM_SNAPSHOT()
case rpcpb.Operation_RESTART_FROM_SNAPSHOT:
return srv.handle_RESTART_FROM_SNAPSHOT()

case rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA:
return srv.handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA()
case rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT:
Expand Down Expand Up @@ -96,7 +103,7 @@ func (srv *Server) handle_INITIAL_START_ETCD(req *rpcpb.Request) (*rpcpb.Respons
return nil, err
}

srv.creatEtcdCmd()
srv.creatEtcdCmd(false)

if err = srv.saveTLSAssets(); err != nil {
return nil, err
Expand Down Expand Up @@ -225,8 +232,11 @@ func (srv *Server) createEtcdLogFile() error {
return nil
}

func (srv *Server) creatEtcdCmd() {
func (srv *Server) creatEtcdCmd(fromSnapshot bool) {
etcdPath, etcdFlags := srv.Member.EtcdExecPath, srv.Member.Etcd.Flags()
if fromSnapshot {
etcdFlags = srv.Member.EtcdOnSnapshotRestore.Flags()
}
u, _ := url.Parse(srv.Member.FailpointHTTPAddr)
srv.lg.Info("creating etcd command",
zap.String("etcd-exec-path", etcdPath),
Expand Down Expand Up @@ -416,7 +426,7 @@ func (srv *Server) handle_RESTART_ETCD() (*rpcpb.Response, error) {
}
}

srv.creatEtcdCmd()
srv.creatEtcdCmd(false)

if err = srv.saveTLSAssets(); err != nil {
return nil, err
Expand Down Expand Up @@ -502,6 +512,60 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA() (*rpcpb.Response, error
}, nil
}

// handle_SAVE_SNAPSHOT serves the SAVE_SNAPSHOT operation. It calls
// Member.SaveSnapshot with the agent logger and, on success, replies with
// the member's SnapshotInfo. Any error from SaveSnapshot is returned as-is
// with a nil response.
func (srv *Server) handle_SAVE_SNAPSHOT() (*rpcpb.Response, error) {
	if saveErr := srv.Member.SaveSnapshot(srv.lg); saveErr != nil {
		return nil, saveErr
	}

	reply := &rpcpb.Response{
		Success:      true,
		Status:       "saved snapshot",
		SnapshotInfo: srv.Member.SnapshotInfo,
	}
	return reply, nil
}

// handle_RESTORE_RESTART_FROM_SNAPSHOT serves the
// RESTORE_RESTART_FROM_SNAPSHOT operation: it first calls
// Member.RestoreSnapshot, then delegates to handle_RESTART_FROM_SNAPSHOT.
// When the restart succeeds with a non-nil response, the response status
// is prefixed to record that a restore happened first.
func (srv *Server) handle_RESTORE_RESTART_FROM_SNAPSHOT() (resp *rpcpb.Response, err error) {
	if err = srv.Member.RestoreSnapshot(srv.lg); err != nil {
		return nil, err
	}

	resp, err = srv.handle_RESTART_FROM_SNAPSHOT()
	if err != nil || resp == nil {
		// nothing to annotate; hand back whatever the restart produced
		return resp, err
	}

	resp.Status = "restored snapshot and " + resp.Status
	return resp, err
}

// handle_RESTART_FROM_SNAPSHOT serves the RESTART_FROM_SNAPSHOT operation.
// It rebuilds the etcd command with the snapshot flag set, saves TLS
// assets, starts the etcd process, loads auto-TLS assets, and finally
// starts the proxy. The first failing step aborts the sequence and its
// error is returned with a nil response. On success the response carries
// the member's SnapshotInfo.
func (srv *Server) handle_RESTART_FROM_SNAPSHOT() (resp *rpcpb.Response, err error) {
	// true => build the command for a restart from snapshot
	srv.creatEtcdCmd(true)

	err = srv.saveTLSAssets()
	if err != nil {
		return nil, err
	}

	err = srv.startEtcdCmd()
	if err != nil {
		return nil, err
	}
	srv.lg.Info("restarted etcd", zap.String("command-path", srv.etcdCmd.Path))

	err = srv.loadAutoTLSAssets()
	if err != nil {
		return nil, err
	}

	// Pause for a second so the etcd listener has time to come up
	// before the proxy is set up in front of it.
	// TODO: local tests should handle port conflicts
	// with clients on restart
	time.Sleep(time.Second)

	err = srv.startProxy()
	if err != nil {
		return nil, err
	}

	resp = &rpcpb.Response{
		Success:      true,
		Status:       "restarted etcd from snapshot",
		SnapshotInfo: srv.Member.SnapshotInfo,
	}
	return resp, nil
}

func (srv *Server) handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA() (*rpcpb.Response, error) {
srv.stopProxy()

Expand Down
Loading

0 comments on commit 70341b1

Please sign in to comment.