diff --git a/server/etcdserver/api/v3client/v3client.go b/server/etcdserver/api/v3client/v3client.go index c44479ffad24..25a7395a2245 100644 --- a/server/etcdserver/api/v3client/v3client.go +++ b/server/etcdserver/api/v3client/v3client.go @@ -39,7 +39,7 @@ func New(s *etcdserver.EtcdServer) *clientv3.Client { wc := adapter.WatchServerToWatchClient(v3rpc.NewWatchServer(s)) c.Watcher = &watchWrapper{clientv3.NewWatchFromWatchClient(wc, c)} - mc := adapter.MaintenanceServerToMaintenanceClient(v3rpc.NewMaintenanceServer(s)) + mc := adapter.MaintenanceServerToMaintenanceClient(v3rpc.NewMaintenanceServer(s, v3rpc.NewHealthChecker(nil, s))) c.Maintenance = clientv3.NewMaintenanceFromMaintenanceClient(mc, c) clc := adapter.ClusterServerToClusterClient(v3rpc.NewClusterServer(s)) diff --git a/server/etcdserver/api/v3rpc/grpc.go b/server/etcdserver/api/v3rpc/grpc.go index 148914e0f9a9..0ec84b58a43b 100644 --- a/server/etcdserver/api/v3rpc/grpc.go +++ b/server/etcdserver/api/v3rpc/grpc.go @@ -75,14 +75,15 @@ func Server(s *etcdserver.EtcdServer, tls *tls.Config, interceptor grpc.UnarySer pb.RegisterLeaseServer(grpcServer, NewQuotaLeaseServer(s)) pb.RegisterClusterServer(grpcServer, NewClusterServer(s)) pb.RegisterAuthServer(grpcServer, NewAuthServer(s)) - pb.RegisterMaintenanceServer(grpcServer, NewMaintenanceServer(s)) // server should register all the services manually // use empty service name for all etcd services' health status, // see https://github.com/grpc/grpc/blob/master/doc/health-checking.md for more hsrv := health.NewServer() - hsrv.SetServingStatus("", healthpb.HealthCheckResponse_SERVING) healthpb.RegisterHealthServer(grpcServer, hsrv) + hc := NewHealthChecker(hsrv, s) + + pb.RegisterMaintenanceServer(grpcServer, NewMaintenanceServer(s, hc)) // set zero values for metrics registered for this grpc server grpc_prometheus.Register(grpcServer) diff --git a/server/etcdserver/api/v3rpc/health.go b/server/etcdserver/api/v3rpc/health.go new file mode 100644 index 000000000000..3e2419da25f2 --- /dev/null +++ b/server/etcdserver/api/v3rpc/health.go @@ -0,0 +1,155 @@ +// Copyright 2023 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v3rpc + +import ( + "context" + "fmt" + "sync" + "time" + + "go.uber.org/zap" + "google.golang.org/grpc/health" + healthpb "google.golang.org/grpc/health/grpc_health_v1" + + "go.etcd.io/raft/v3" + + "go.etcd.io/etcd/api/v3/etcdserverpb" + "go.etcd.io/etcd/client/pkg/v3/types" + "go.etcd.io/etcd/server/v3/auth" + "go.etcd.io/etcd/server/v3/etcdserver" +) + +type healthNotifier interface { + StartServe() + StopServe(reason string) +} + +type healthChecker struct { + hs *health.Server + + mu sync.RWMutex + served bool + + // implements readyz prober + s *etcdserver.EtcdServer + // prober configuration TODO 2023-09-13 externalize as etcd server flag + successThreshold int + failureThreshold int + healthCheckInterval time.Duration + // prober state + successCount int + failureCount int +} + +func NewHealthChecker(hs *health.Server, s *etcdserver.EtcdServer) healthNotifier { + hc := &healthChecker{hs: hs, s: s, successThreshold: 2, failureThreshold: 3, healthCheckInterval: 100 * time.Millisecond} + // set grpc health server as serving status blindly since + // the grpc server will serve iff s.ReadyNotify() is closed. + hc.StartServe() + + go hc.startProbe() + return hc +} + +func (hc *healthChecker) StartServe() { + if hc.hs != nil { + if hc.s.Logger() != nil { + hc.s.Logger().Info("start serving gRPC requests from client") + } + hc.hs.SetServingStatus("", healthpb.HealthCheckResponse_SERVING) + hc.changeServedTo(true) + } +} + +func (hc *healthChecker) StopServe(reason string) { + if hc.hs != nil { + if hc.s.Logger() != nil { + hc.s.Logger().Warn("stop serving gRPC requests from client", zap.String("reason", reason)) + } + hc.hs.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING) + hc.changeServedTo(false) + } +} + +func (hc *healthChecker) changeServedTo(served bool) { + hc.mu.Lock() + defer hc.mu.Unlock() + hc.served = served +} + +func (hc *healthChecker) IsServed() bool { + hc.mu.RLock() + defer hc.mu.RUnlock() + return hc.served +} + +func (hc *healthChecker) startProbe() { + // stop probing if there is no consumer. + if hc.hs == nil { + return + } + + ticker := time.NewTicker(hc.healthCheckInterval) + defer ticker.Stop() + for { + select { + case <-hc.s.StoppingNotify(): + return + case <-ticker.C: + if ready, notReadyReason := checkReadyz(hc.s, 2*time.Second); ready { + hc.successCount += 1 + hc.failureCount = 0 + if hc.successCount >= hc.successThreshold && !hc.IsServed() { + hc.StartServe() + } + } else { + hc.failureCount += 1 + hc.successCount = 0 + hc.s.Logger().Warn("readyz check failed", zap.Int("failure-count", hc.failureCount)) + if hc.failureCount >= hc.failureThreshold && hc.IsServed() { + hc.StopServe(notReadyReason) + } + } + } + } +} + +// checkReadyz prober implementation should be in sync with readyz design in the future. +// https://docs.google.com/document/d/1109lUxD326yRwmMVX-tkJMpm8pTbOw7MD1XsSIo37MU +// it is now in sync with etcdhttp.HandleHealth implementation +// 1. checkLeader +// 2. checkAlarms +// 3. checkQuorumRead +func checkReadyz(s *etcdserver.EtcdServer, timeout time.Duration) (ready bool, notReadyReason string) { + if s.Leader() == types.ID(raft.None) { + return false, "local member does not have leader" + } + for _, alarm := range s.Alarms() { + if types.ID(alarm.MemberID) == s.MemberId() && alarm.Alarm == etcdserverpb.AlarmType_CORRUPT { + return false, "local member has corrupt alarm" + } + } + ctx, cancel := context.WithTimeout(context.Background(), timeout) + // range requests will only be timed-out in the LinearizableReadNotify phase, and + // blocking in the mvcc phase in case of defrag. + // However, use solely LinearizableReadNotify to indicate readyz will produce false positive serving status when defrag is active. + _, err := s.Range(ctx, &etcdserverpb.RangeRequest{KeysOnly: true, Limit: 1}) + cancel() + if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied { + return false, fmt.Sprintf("local member quorum read with %s timeout failed due to %v", timeout.String(), err) + } + return true, "" +} diff --git a/server/etcdserver/api/v3rpc/maintenance.go b/server/etcdserver/api/v3rpc/maintenance.go index 9920a9cbd6b8..29d40c26b6e0 100644 --- a/server/etcdserver/api/v3rpc/maintenance.go +++ b/server/etcdserver/api/v3rpc/maintenance.go @@ -22,6 +22,8 @@ import ( "github.com/dustin/go-humanize" + "go.etcd.io/raft/v3" + pb "go.etcd.io/etcd/api/v3/etcdserverpb" "go.etcd.io/etcd/api/v3/v3rpc/rpctypes" "go.etcd.io/etcd/api/v3/version" @@ -32,7 +34,6 @@ import ( "go.etcd.io/etcd/server/v3/storage/backend" "go.etcd.io/etcd/server/v3/storage/mvcc" "go.etcd.io/etcd/server/v3/storage/schema" - "go.etcd.io/raft/v3" "go.uber.org/zap" ) @@ -75,10 +76,11 @@ type maintenanceServer struct { cs ClusterStatusGetter d Downgrader vs serverversion.Server + hs healthNotifier } -func NewMaintenanceServer(s *etcdserver.EtcdServer) pb.MaintenanceServer { - srv := &maintenanceServer{lg: s.Cfg.Logger, rg: s, hasher: s.KV().HashStorage(), bg: s, a: s, lt: s, hdr: newHeader(s), cs: s, d: s, vs: etcdserver.NewServerVersionAdapter(s)} +func NewMaintenanceServer(s *etcdserver.EtcdServer, hs healthNotifier) pb.MaintenanceServer { + srv := &maintenanceServer{lg: s.Cfg.Logger, rg: s, hasher: s.KV().HashStorage(), bg: s, a: s, lt: s, hdr: newHeader(s), cs: s, d: s, vs: etcdserver.NewServerVersionAdapter(s), hs: hs} if srv.lg == nil { srv.lg = zap.NewNop() } @@ -87,6 +89,8 @@ func NewMaintenanceServer(s *etcdserver.EtcdServer) pb.MaintenanceServer { func (ms *maintenanceServer) Defragment(ctx context.Context, sr *pb.DefragmentRequest) (*pb.DefragmentResponse, error) { ms.lg.Info("starting defragment") + ms.hs.StopServe("defrag is active") + defer ms.hs.StartServe() err := ms.bg.Backend().Defrag() if err != nil { ms.lg.Warn("failed to defragment", zap.Error(err)) diff --git a/tests/e2e/failover_test.go b/tests/e2e/failover_test.go new file mode 100644 index 000000000000..d9baaf8cf4b2 --- /dev/null +++ b/tests/e2e/failover_test.go @@ -0,0 +1,162 @@ +// Copyright 2023 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !cluster_proxy + +package e2e + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + + clientv3 "go.etcd.io/etcd/client/v3" + "go.etcd.io/etcd/tests/v3/framework/config" + "go.etcd.io/etcd/tests/v3/framework/e2e" +) + +const ( + // in sync with how kubernetes uses etcd + // https://github.com/kubernetes/kubernetes/blob/release-1.28/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go#L59-L71 + keepaliveTime = 30 * time.Second + keepaliveTimeout = 10 * time.Second + dialTimeout = 20 * time.Second + + clientRuntime = 10 * time.Second + failureDetectionLatency = 6 * time.Second + // expect no more than 5 failed requests + failedRequests = 5 +) + +func TestFailover(t *testing.T) { + tcs := []struct { + name string + clusterOptions []e2e.EPClusterOption + failureInjector func(t *testing.T, clus *e2e.EtcdProcessCluster) + failureDetectionLatency time.Duration + }{ + { + name: "network_partition", + clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithIsPeerTLS(true), e2e.WithPeerProxy(true)}, + failureInjector: blackhole, + failureDetectionLatency: failureDetectionLatency, + }, + { + name: "stalled_disk_write", + clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)}, + failureInjector: blockDiskWrite, + failureDetectionLatency: failureDetectionLatency, + }, + { + name: "defrag", + clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)}, + failureInjector: triggerDefrag, + }, + } + + for _, tc := range tcs { + t.Run(tc.name, func(t *testing.T) { + e2e.BeforeTest(t) + clus, err := e2e.NewEtcdProcessCluster(context.TODO(), t, tc.clusterOptions...) + t.Cleanup(func() { clus.Stop() }) + endpoints := clus.EndpointsGRPC() + + cnt, success := 0, 0 + donec := make(chan struct{}) + errc := make(chan error, 1) + + go func() { + var lastErr error + var cc *clientv3.Client + defer func() { + if cc != nil { + cc.Close() + } + errc <- lastErr + close(donec) + close(errc) + }() + cc, cerr := clientv3.New(clientv3.Config{ + DialTimeout: dialTimeout, + DialKeepAliveTime: keepaliveTime, + DialKeepAliveTimeout: keepaliveTimeout, + Endpoints: endpoints, + DialOptions: []grpc.DialOption{ + grpc.WithDisableServiceConfig(), + grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`), + // the following service config will disable grpc client health check + //grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin"}`), + }, + }) + require.NoError(t, cerr) + timeout := time.After(clientRuntime) + + time.Sleep(tc.failureDetectionLatency) + for { + select { + case <-timeout: + return + default: + } + cctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + //start := time.Now() + _, err = cc.Get(cctx, "health") + //t.Logf("TS (%s): number #%d health check took %v", time.Now().UTC().Format(time.RFC3339), cnt, time.Since(start)) + cancel() + cnt++ + if err != nil { + lastErr = err + continue + } + success++ + } + }() + + tc.failureInjector(t, clus) + + <-donec + err, ok := <-errc + if ok && err != nil { + t.Logf("etcd client failed to fail over, error (%v)", err) + } + t.Logf("request failure rate is %.2f%%, traffic volume success %d requests, total %d requests", (1-float64(success)/float64(cnt))*100, success, cnt) + // expect no more than 5 failed requests + require.InDelta(t, cnt, success, failedRequests) + }) + } +} + +func blackhole(t *testing.T, clus *e2e.EtcdProcessCluster) { + member := clus.Procs[0] + proxy := member.PeerProxy() + t.Logf("Blackholing traffic from and to member %q", member.Config().Name) + proxy.BlackholeTx() + proxy.BlackholeRx() +} + +func blockDiskWrite(t *testing.T, clus *e2e.EtcdProcessCluster) { + err := clus.Procs[0].Failpoints().Setup(context.Background(), "raftBeforeSave", `sleep(10000)`) + require.NoError(t, err) + clus.Procs[0].Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{}) +} + +func triggerDefrag(t *testing.T, clus *e2e.EtcdProcessCluster) { + err := clus.Procs[0].Failpoints().Setup(context.Background(), "defragBeforeCopy", `sleep(8000)`) + require.NoError(t, err) + err = clus.Procs[0].Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}) + require.NoError(t, err) +}