From ddf7a69fbab7ddded8f673f09683024be1e9e4da Mon Sep 17 00:00:00 2001 From: Marek Siarkowicz Date: Thu, 27 Jan 2022 18:25:39 +0100 Subject: [PATCH 1/6] server: Use named struct initialization in healthcheck test Signed-off-by: Siyuan Zhang --- etcdserver/api/etcdhttp/metrics_test.go | 56 ++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/etcdserver/api/etcdhttp/metrics_test.go b/etcdserver/api/etcdhttp/metrics_test.go index 3468d9fd60d..bd6f2b1de45 100644 --- a/etcdserver/api/etcdhttp/metrics_test.go +++ b/etcdserver/api/etcdhttp/metrics_test.go @@ -68,46 +68,46 @@ func TestHealthHandler(t *testing.T) { health string }{ { - []*pb.AlarmMember{}, - "/health", - http.StatusOK, - "true", + alarms: []*pb.AlarmMember{}, + healthCheckURL: "/health", + statusCode: http.StatusOK, + health: "true", }, { - []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, - "/health", - http.StatusServiceUnavailable, - "false", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/health", + statusCode: http.StatusServiceUnavailable, + health: "false", }, { - []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, - "/health?exclude=NOSPACE", - http.StatusOK, - "true", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/health?exclude=NOSPACE", + statusCode: http.StatusOK, + health: "true", }, { - []*pb.AlarmMember{}, - "/health?exclude=NOSPACE", - http.StatusOK, - "true", + alarms: []*pb.AlarmMember{}, + healthCheckURL: "/health?exclude=NOSPACE", + statusCode: http.StatusOK, + health: "true", }, { - []*pb.AlarmMember{{MemberID: uint64(1), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(2), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(3), Alarm: pb.AlarmType_NOSPACE}}, - "/health?exclude=NOSPACE", - http.StatusOK, - "true", + alarms: []*pb.AlarmMember{{MemberID: uint64(1), Alarm: pb.AlarmType_NOSPACE}, {MemberID: 
uint64(2), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(3), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/health?exclude=NOSPACE", + statusCode: http.StatusOK, + health: "true", }, { - []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, - "/health?exclude=NOSPACE", - http.StatusServiceUnavailable, - "false", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, + healthCheckURL: "/health?exclude=NOSPACE", + statusCode: http.StatusServiceUnavailable, + health: "false", }, { - []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, - "/health?exclude=NOSPACE&exclude=CORRUPT", - http.StatusOK, - "true", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, + healthCheckURL: "/health?exclude=NOSPACE&exclude=CORRUPT", + statusCode: http.StatusOK, + health: "true", }, } From 34d2e743d2fe2823ec291821764b53ec50c25e72 Mon Sep 17 00:00:00 2001 From: Marek Siarkowicz Date: Thu, 27 Jan 2022 18:33:41 +0100 Subject: [PATCH 2/6] server: Rename test case expect fields Signed-off-by: Siyuan Zhang --- etcdserver/api/etcdhttp/metrics_test.go | 71 +++++++++++++------------ 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/etcdserver/api/etcdhttp/metrics_test.go b/etcdserver/api/etcdhttp/metrics_test.go index bd6f2b1de45..c7a67f5c441 100644 --- a/etcdserver/api/etcdhttp/metrics_test.go +++ b/etcdserver/api/etcdhttp/metrics_test.go @@ -64,50 +64,51 @@ func TestHealthHandler(t *testing.T) { tests := []struct { alarms []*pb.AlarmMember healthCheckURL string - statusCode int - health string + + expectStatusCode int + expectHealth string }{ { - alarms: []*pb.AlarmMember{}, - healthCheckURL: "/health", - statusCode: http.StatusOK, - health: "true", + alarms: []*pb.AlarmMember{}, + 
healthCheckURL: "/health", + expectStatusCode: http.StatusOK, + expectHealth: "true", }, { - alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, - healthCheckURL: "/health", - statusCode: http.StatusServiceUnavailable, - health: "false", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/health", + expectStatusCode: http.StatusServiceUnavailable, + expectHealth: "false", }, { - alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, - healthCheckURL: "/health?exclude=NOSPACE", - statusCode: http.StatusOK, - health: "true", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/health?exclude=NOSPACE", + expectStatusCode: http.StatusOK, + expectHealth: "true", }, { - alarms: []*pb.AlarmMember{}, - healthCheckURL: "/health?exclude=NOSPACE", - statusCode: http.StatusOK, - health: "true", + alarms: []*pb.AlarmMember{}, + healthCheckURL: "/health?exclude=NOSPACE", + expectStatusCode: http.StatusOK, + expectHealth: "true", }, { - alarms: []*pb.AlarmMember{{MemberID: uint64(1), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(2), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(3), Alarm: pb.AlarmType_NOSPACE}}, - healthCheckURL: "/health?exclude=NOSPACE", - statusCode: http.StatusOK, - health: "true", + alarms: []*pb.AlarmMember{{MemberID: uint64(1), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(2), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(3), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/health?exclude=NOSPACE", + expectStatusCode: http.StatusOK, + expectHealth: "true", }, { - alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, - healthCheckURL: "/health?exclude=NOSPACE", - statusCode: http.StatusServiceUnavailable, - health: "false", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), 
Alarm: pb.AlarmType_CORRUPT}}, + healthCheckURL: "/health?exclude=NOSPACE", + expectStatusCode: http.StatusServiceUnavailable, + expectHealth: "false", }, { - alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, - healthCheckURL: "/health?exclude=NOSPACE&exclude=CORRUPT", - statusCode: http.StatusOK, - health: "true", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, + healthCheckURL: "/health?exclude=NOSPACE&exclude=CORRUPT", + expectStatusCode: http.StatusOK, + expectHealth: "true", }, } @@ -117,7 +118,7 @@ func TestHealthHandler(t *testing.T) { HandleMetricsHealth(mux, &fakeServerV2{ fakeServer: fakeServer{alarms: tt.alarms}, Stats: &fakeStats{}, - health: tt.health, + health: tt.expectHealth, }) ts := httptest.NewServer(mux) defer ts.Close() @@ -130,15 +131,15 @@ func TestHealthHandler(t *testing.T) { t.Errorf("got nil http response with http request %s in test case #%d", tt.healthCheckURL, i+1) return } - if res.StatusCode != tt.statusCode { - t.Errorf("want statusCode %d but got %d in test case #%d", tt.statusCode, res.StatusCode, i+1) + if res.StatusCode != tt.expectStatusCode { + t.Errorf("want statusCode %d but got %d in test case #%d", tt.expectStatusCode, res.StatusCode, i+1) } health, err := parseHealthOutput(res.Body) if err != nil { t.Errorf("fail parse health check output %v", err) } - if health.Health != tt.health { - t.Errorf("want health %s but got %s", tt.health, health.Health) + if health.Health != tt.expectHealth { + t.Errorf("want health %s but got %s", tt.expectHealth, health.Health) } }() } From e74970d5a17265810c22f276bd9611e4ff635aed Mon Sep 17 00:00:00 2001 From: Marek Siarkowicz Date: Thu, 27 Jan 2022 18:46:44 +0100 Subject: [PATCH 3/6] server: Run health check tests in subtests Signed-off-by: Siyuan Zhang --- etcdserver/api/etcdhttp/metrics_test.go | 12 ++++++++++-- 1 file 
changed, 10 insertions(+), 2 deletions(-) diff --git a/etcdserver/api/etcdhttp/metrics_test.go b/etcdserver/api/etcdhttp/metrics_test.go index c7a67f5c441..8d825d9eedc 100644 --- a/etcdserver/api/etcdhttp/metrics_test.go +++ b/etcdserver/api/etcdhttp/metrics_test.go @@ -62,6 +62,7 @@ func TestHealthHandler(t *testing.T) { // define the input and expected output // input: alarms, and healthCheckURL tests := []struct { + name string alarms []*pb.AlarmMember healthCheckURL string @@ -69,42 +70,49 @@ func TestHealthHandler(t *testing.T) { expectHealth string }{ { + name: "Healthy if no alarm", alarms: []*pb.AlarmMember{}, healthCheckURL: "/health", expectStatusCode: http.StatusOK, expectHealth: "true", }, { + name: "Unhealthy if NOSPACE alarm is on", alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, healthCheckURL: "/health", expectStatusCode: http.StatusServiceUnavailable, expectHealth: "false", }, { + name: "Healthy if NOSPACE alarm is on and excluded", alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, healthCheckURL: "/health?exclude=NOSPACE", expectStatusCode: http.StatusOK, expectHealth: "true", }, { + name: "Healthy if NOSPACE alarm is excluded", alarms: []*pb.AlarmMember{}, healthCheckURL: "/health?exclude=NOSPACE", expectStatusCode: http.StatusOK, expectHealth: "true", }, { + name: "Healthy if multiple NOSPACE alarms are on and excluded", alarms: []*pb.AlarmMember{{MemberID: uint64(1), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(2), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(3), Alarm: pb.AlarmType_NOSPACE}}, healthCheckURL: "/health?exclude=NOSPACE", expectStatusCode: http.StatusOK, expectHealth: "true", }, { + name: "Unhealthy if NOSPACE alarms is excluded and CORRUPT is on", alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, healthCheckURL: "/health?exclude=NOSPACE", expectStatusCode: 
http.StatusServiceUnavailable, expectHealth: "false", }, { + name: "Unhealthy if both NOSPACE and CORRUPT are on and excluded", alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}}, healthCheckURL: "/health?exclude=NOSPACE&exclude=CORRUPT", expectStatusCode: http.StatusOK, @@ -113,7 +121,7 @@ func TestHealthHandler(t *testing.T) { } for i, tt := range tests { - func() { + t.Run(tt.name, func(t *testing.T) { mux := http.NewServeMux() HandleMetricsHealth(mux, &fakeServerV2{ fakeServer: fakeServer{alarms: tt.alarms}, @@ -141,7 +149,7 @@ func TestHealthHandler(t *testing.T) { if health.Health != tt.expectHealth { t.Errorf("want health %s but got %s", tt.expectHealth, health.Health) } - }() + }) } } From f009772c84e76eb480ae24a54a4329501361d49c Mon Sep 17 00:00:00 2001 From: Siyuan Zhang Date: Wed, 13 Dec 2023 17:19:22 -0800 Subject: [PATCH 4/6] server: Refactor health checks Signed-off-by: Siyuan Zhang --- etcdserver/api/etcdhttp/metrics.go | 114 +++++++++++++++++++++++------ proxy/grpcproxy/health.go | 2 +- 2 files changed, 91 insertions(+), 25 deletions(-) diff --git a/etcdserver/api/etcdhttp/metrics.go b/etcdserver/api/etcdhttp/metrics.go index e5c062e4372..4c319eb8c9b 100644 --- a/etcdserver/api/etcdhttp/metrics.go +++ b/etcdserver/api/etcdhttp/metrics.go @@ -17,9 +17,11 @@ package etcdhttp import ( "context" "encoding/json" + "fmt" "net/http" "time" + "go.etcd.io/etcd/auth" "go.etcd.io/etcd/etcdserver" "go.etcd.io/etcd/etcdserver/etcdserverpb" "go.etcd.io/etcd/raft" @@ -36,7 +38,30 @@ const ( // HandleMetricsHealth registers metrics and health handlers. 
func HandleMetricsHealth(mux *http.ServeMux, srv etcdserver.ServerV2) { mux.Handle(PathMetrics, promhttp.Handler()) - mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet) Health { return checkHealth(srv, excludedAlarms) })) + mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health { + if h := checkAlarms(srv, excludedAlarms); h.Health != "true" { + return h + } + if h := checkLeader(srv, serializable); h.Health != "true" { + return h + } + return checkV2API(srv) + })) +} + +// HandleMetricsHealthForV3 registers metrics and health handlers. it checks health by using v3 range request +// and its corresponding timeout. +func HandleMetricsHealthForV3(mux *http.ServeMux, srv *etcdserver.EtcdServer) { + mux.Handle(PathMetrics, promhttp.Handler()) + mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health { + if h := checkAlarms(srv, excludedAlarms); h.Health != "true" { + return h + } + if h := checkLeader(srv, serializable); h.Health != "true" { + return h + } + return checkV3API(srv, serializable) + })) } // HandlePrometheus registers prometheus handler on '/metrics'. @@ -45,7 +70,7 @@ func HandlePrometheus(mux *http.ServeMux) { } // NewHealthHandler handles '/health' requests. -func NewHealthHandler(hfunc func(excludedAlarms AlarmSet) Health) http.HandlerFunc { +func NewHealthHandler(hfunc func(excludedAlarms AlarmSet, serializable bool) Health) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodGet { w.Header().Set("Allow", http.MethodGet) @@ -54,7 +79,19 @@ func NewHealthHandler(hfunc func(excludedAlarms AlarmSet) Health) http.HandlerFu return } excludedAlarms := getExcludedAlarms(r) - h := hfunc(excludedAlarms) + // Passing the query parameter "serializable=true" ensures that the + // health of the local etcd is checked vs the health of the cluster. 
+ // This is useful for probes attempting to validate the liveness of + // the etcd process vs readiness of the cluster to serve requests. + serializableFlag := getSerializableFlag(r) + h := hfunc(excludedAlarms, serializableFlag) + defer func() { + if h.Health == "true" { + healthSuccess.Inc() + } else { + healthFailed.Inc() + } + }() d, _ := json.Marshal(h) if h.Health != "true" { http.Error(w, string(d), http.StatusServiceUnavailable) @@ -62,6 +99,7 @@ func NewHealthHandler(hfunc func(excludedAlarms AlarmSet) Health) http.HandlerFu } w.WriteHeader(http.StatusOK) w.Write(d) + plog.Debugf("/health OK (status code %d)", http.StatusOK) } } @@ -89,6 +127,7 @@ func init() { // TODO: remove manual parsing in etcdctl cluster-health type Health struct { Health string `json:"health"` + Reason string `json:"-"` } type AlarmSet map[string]struct{} @@ -107,11 +146,14 @@ func getExcludedAlarms(r *http.Request) (alarms AlarmSet) { return alarms } -// TODO: server NOSPACE, etcdserver.ErrNoLeader in health API +func getSerializableFlag(r *http.Request) bool { + return r.URL.Query().Get("serializable") == "true" +} -func checkHealth(srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health { - h := Health{Health: "true"} +// TODO: etcdserver.ErrNoLeader in health API +func checkAlarms(srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health { + h := Health{Health: "true"} as := srv.Alarms() if len(as) > 0 { for _, v := range as { @@ -120,34 +162,58 @@ func checkHealth(srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health { plog.Debugf("/health excluded alarm %s", v.String()) continue } + h.Health = "false" + switch v.Alarm { + case etcdserverpb.AlarmType_NOSPACE: + h.Reason = "ALARM NOSPACE" + case etcdserverpb.AlarmType_CORRUPT: + h.Reason = "ALARM CORRUPT" + default: + h.Reason = "ALARM UNKNOWN" + } plog.Warningf("/health error due to %s", v.String()) return h } } - if h.Health == "true" { - if uint64(srv.Leader()) == raft.None { - h.Health = "false" - plog.Warningf("/health 
error; no leader (status code %d)", http.StatusServiceUnavailable) - } + return h +} + +func checkLeader(srv etcdserver.ServerV2, serializable bool) Health { + h := Health{Health: "true"} + if !serializable && (uint64(srv.Leader()) == raft.None) { + h.Health = "false" + h.Reason = "RAFT NO LEADER" + plog.Warningf("/health error; no leader (status code %d)", http.StatusServiceUnavailable) } + return h +} - if h.Health == "true" { - ctx, cancel := context.WithTimeout(context.Background(), time.Second) - _, err := srv.Do(ctx, etcdserverpb.Request{Method: "QGET"}) - cancel() - if err != nil { - h.Health = "false" - plog.Warningf("/health error; QGET failed %v (status code %d)", err, http.StatusServiceUnavailable) - } +func checkV2API(srv etcdserver.ServerV2) Health { + h := Health{Health: "true"} + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + _, err := srv.Do(ctx, etcdserverpb.Request{Method: "QGET"}) + cancel() + if err != nil { + h.Health = "false" + h.Reason = fmt.Sprintf("QGET ERROR:%s", err) + plog.Warningf("/health error; QGET failed %v (status code %d)", err, http.StatusServiceUnavailable) + return h } + return h +} - if h.Health == "true" { - healthSuccess.Inc() - plog.Debugf("/health OK (status code %d)", http.StatusOK) - } else { - healthFailed.Inc() +func checkV3API(srv *etcdserver.EtcdServer, serializable bool) Health { + h := Health{Health: "true"} + ctx, cancel := context.WithTimeout(context.Background(), srv.Cfg.ReqTimeout()) + _, err := srv.Range(ctx, &etcdserverpb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable}) + cancel() + if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied { + h.Health = "false" + h.Reason = fmt.Sprintf("RANGE ERROR:%s", err) + plog.Warningf("serving /health false; Range failed %v (status code %d)", err, http.StatusServiceUnavailable) + return h } return h } diff --git a/proxy/grpcproxy/health.go b/proxy/grpcproxy/health.go index ad7358ec579..cccbd96c711 100644 
--- a/proxy/grpcproxy/health.go +++ b/proxy/grpcproxy/health.go @@ -26,7 +26,7 @@ import ( // HandleHealth registers health handler on '/health'. func HandleHealth(mux *http.ServeMux, c *clientv3.Client) { - mux.Handle(etcdhttp.PathHealth, etcdhttp.NewHealthHandler(func(excludedAlarms etcdhttp.AlarmSet) etcdhttp.Health { return checkHealth(c) })) + mux.Handle(etcdhttp.PathHealth, etcdhttp.NewHealthHandler(func(excludedAlarms etcdhttp.AlarmSet, serializable bool) etcdhttp.Health { return checkHealth(c) })) } func checkHealth(c *clientv3.Client) etcdhttp.Health { From cc44646a2ecaf0f12ff583d978c7b8b4c769b925 Mon Sep 17 00:00:00 2001 From: Siyuan Zhang Date: Tue, 12 Dec 2023 15:39:09 -0800 Subject: [PATCH 5/6] server: Cover V3 health with tests Signed-off-by: Siyuan Zhang --- embed/etcd.go | 1 + etcdserver/api/etcdhttp/base.go | 1 - etcdserver/api/etcdhttp/metrics.go | 41 +++++++++++++++--------- etcdserver/api/etcdhttp/metrics_test.go | 42 +++++++++++++++++++++---- etcdserver/api/v2http/client.go | 1 + etcdserver/server.go | 4 +++ 6 files changed, 69 insertions(+), 21 deletions(-) diff --git a/embed/etcd.go b/embed/etcd.go index 223a8aaeaa8..29f65424b06 100644 --- a/embed/etcd.go +++ b/embed/etcd.go @@ -773,6 +773,7 @@ func (e *Etcd) serveClients() (err error) { } else { mux := http.NewServeMux() etcdhttp.HandleBasic(mux, e.Server) + etcdhttp.HandleMetricsHealth(mux, e.Server) h = mux } diff --git a/etcdserver/api/etcdhttp/base.go b/etcdserver/api/etcdhttp/base.go index c9df62ea8e6..4ff4b9b0273 100644 --- a/etcdserver/api/etcdhttp/base.go +++ b/etcdserver/api/etcdhttp/base.go @@ -51,7 +51,6 @@ func HandleBasic(mux *http.ServeMux, server etcdserver.ServerPeer) { // TODO: deprecate '/config/local/log' in v3.5 mux.HandleFunc(configPath+"/local/log", logHandleFunc) - HandleMetricsHealth(mux, server) mux.HandleFunc(versionPath, versionHandler(server.Cluster(), serveVersion)) } diff --git a/etcdserver/api/etcdhttp/metrics.go b/etcdserver/api/etcdhttp/metrics.go index 
4c319eb8c9b..15a4917f272 100644 --- a/etcdserver/api/etcdhttp/metrics.go +++ b/etcdserver/api/etcdhttp/metrics.go @@ -23,7 +23,8 @@ import ( "go.etcd.io/etcd/auth" "go.etcd.io/etcd/etcdserver" - "go.etcd.io/etcd/etcdserver/etcdserverpb" + pb "go.etcd.io/etcd/etcdserver/etcdserverpb" + "go.etcd.io/etcd/pkg/types" "go.etcd.io/etcd/raft" "github.com/prometheus/client_golang/prometheus" @@ -35,8 +36,19 @@ const ( PathHealth = "/health" ) -// HandleMetricsHealth registers metrics and health handlers. -func HandleMetricsHealth(mux *http.ServeMux, srv etcdserver.ServerV2) { +type ServerHealth interface { + serverHealthV2V3 + Range(context.Context, *pb.RangeRequest) (*pb.RangeResponse, error) + Config() etcdserver.ServerConfig +} + +type serverHealthV2V3 interface { + Alarms() []*pb.AlarmMember + Leader() types.ID +} + +// HandleMetricsHealthForV2 registers metrics and health handlers for v2. +func HandleMetricsHealthForV2(mux *http.ServeMux, srv etcdserver.ServerV2) { mux.Handle(PathMetrics, promhttp.Handler()) mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health { if h := checkAlarms(srv, excludedAlarms); h.Health != "true" { @@ -49,9 +61,9 @@ func HandleMetricsHealth(mux *http.ServeMux, srv etcdserver.ServerV2) { })) } -// HandleMetricsHealthForV3 registers metrics and health handlers. it checks health by using v3 range request +// HandleMetricsHealth registers metrics and health handlers. it checks health by using v3 range request // and its corresponding timeout. 
-func HandleMetricsHealthForV3(mux *http.ServeMux, srv *etcdserver.EtcdServer) { +func HandleMetricsHealth(mux *http.ServeMux, srv ServerHealth) { mux.Handle(PathMetrics, promhttp.Handler()) mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health { if h := checkAlarms(srv, excludedAlarms); h.Health != "true" { @@ -60,7 +72,7 @@ func HandleMetricsHealthForV3(mux *http.ServeMux, srv *etcdserver.EtcdServer) { if h := checkLeader(srv, serializable); h.Health != "true" { return h } - return checkV3API(srv, serializable) + return checkAPI(srv, serializable) })) } @@ -152,7 +164,7 @@ func getSerializableFlag(r *http.Request) bool { // TODO: etcdserver.ErrNoLeader in health API -func checkAlarms(srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health { +func checkAlarms(srv serverHealthV2V3, excludedAlarms AlarmSet) Health { h := Health{Health: "true"} as := srv.Alarms() if len(as) > 0 { @@ -165,9 +177,9 @@ func checkAlarms(srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health { h.Health = "false" switch v.Alarm { - case etcdserverpb.AlarmType_NOSPACE: + case pb.AlarmType_NOSPACE: h.Reason = "ALARM NOSPACE" - case etcdserverpb.AlarmType_CORRUPT: + case pb.AlarmType_CORRUPT: h.Reason = "ALARM CORRUPT" default: h.Reason = "ALARM UNKNOWN" @@ -180,7 +192,7 @@ func checkAlarms(srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health { return h } -func checkLeader(srv etcdserver.ServerV2, serializable bool) Health { +func checkLeader(srv serverHealthV2V3, serializable bool) Health { h := Health{Health: "true"} if !serializable && (uint64(srv.Leader()) == raft.None) { h.Health = "false" @@ -193,7 +205,7 @@ func checkLeader(srv etcdserver.ServerV2, serializable bool) Health { func checkV2API(srv etcdserver.ServerV2) Health { h := Health{Health: "true"} ctx, cancel := context.WithTimeout(context.Background(), time.Second) - _, err := srv.Do(ctx, etcdserverpb.Request{Method: "QGET"}) + _, err := srv.Do(ctx, pb.Request{Method: "QGET"}) 
cancel() if err != nil { h.Health = "false" @@ -204,10 +216,11 @@ func checkV2API(srv etcdserver.ServerV2) Health { return h } -func checkV3API(srv *etcdserver.EtcdServer, serializable bool) Health { +func checkAPI(srv ServerHealth, serializable bool) Health { h := Health{Health: "true"} - ctx, cancel := context.WithTimeout(context.Background(), srv.Cfg.ReqTimeout()) - _, err := srv.Range(ctx, &etcdserverpb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable}) + cfg := srv.Config() + ctx, cancel := context.WithTimeout(context.Background(), cfg.ReqTimeout()) + _, err := srv.Range(ctx, &pb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable}) cancel() if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied { h.Health = "false" diff --git a/etcdserver/api/etcdhttp/metrics_test.go b/etcdserver/api/etcdhttp/metrics_test.go index 8d825d9eedc..0fc65853439 100644 --- a/etcdserver/api/etcdhttp/metrics_test.go +++ b/etcdserver/api/etcdhttp/metrics_test.go @@ -24,6 +24,7 @@ import ( "net/http/httptest" "testing" + "go.etcd.io/etcd/auth" "go.etcd.io/etcd/etcdserver" stats "go.etcd.io/etcd/etcdserver/api/v2stats" pb "go.etcd.io/etcd/etcdserver/etcdserverpb" @@ -38,25 +39,34 @@ func (s *fakeStats) SelfStats() []byte { return nil } func (s *fakeStats) LeaderStats() []byte { return nil } func (s *fakeStats) StoreStats() []byte { return nil } -type fakeServerV2 struct { +type fakeHealthServer struct { fakeServer stats.Stats - health string + health string + apiError error } -func (s *fakeServerV2) Leader() types.ID { +func (s *fakeHealthServer) Range(ctx context.Context, request *pb.RangeRequest) (*pb.RangeResponse, error) { + return nil, s.apiError +} + +func (s *fakeHealthServer) Config() etcdserver.ServerConfig { + return etcdserver.ServerConfig{} +} + +func (s *fakeHealthServer) Leader() types.ID { if s.health == "true" { return 1 } return types.ID(raft.None) } -func (s *fakeServerV2) Do(ctx context.Context, r pb.Request) 
(etcdserver.Response, error) { +func (s *fakeHealthServer) Do(ctx context.Context, r pb.Request) (etcdserver.Response, error) { if s.health == "true" { return etcdserver.Response{}, nil } return etcdserver.Response{}, fmt.Errorf("fail health check") } -func (s *fakeServerV2) ClientCertAuthEnabled() bool { return false } +func (s *fakeHealthServer) ClientCertAuthEnabled() bool { return false } func TestHealthHandler(t *testing.T) { // define the input and expected output @@ -65,6 +75,7 @@ func TestHealthHandler(t *testing.T) { name string alarms []*pb.AlarmMember healthCheckURL string + apiError error expectStatusCode int expectHealth string @@ -118,15 +129,34 @@ func TestHealthHandler(t *testing.T) { expectStatusCode: http.StatusOK, expectHealth: "true", }, + { + healthCheckURL: "/health", + apiError: auth.ErrUserEmpty, + expectStatusCode: http.StatusOK, + expectHealth: "true", + }, + { + healthCheckURL: "/health", + apiError: auth.ErrPermissionDenied, + expectStatusCode: http.StatusOK, + expectHealth: "true", + }, + { + healthCheckURL: "/health", + apiError: fmt.Errorf("Unexpected error"), + expectStatusCode: http.StatusServiceUnavailable, + expectHealth: "false", + }, } for i, tt := range tests { t.Run(tt.name, func(t *testing.T) { mux := http.NewServeMux() - HandleMetricsHealth(mux, &fakeServerV2{ + HandleMetricsHealth(mux, &fakeHealthServer{ fakeServer: fakeServer{alarms: tt.alarms}, Stats: &fakeStats{}, health: tt.expectHealth, + apiError: tt.apiError, }) ts := httptest.NewServer(mux) defer ts.Close() diff --git a/etcdserver/api/v2http/client.go b/etcdserver/api/v2http/client.go index 1d1e592b25d..a26e748fe52 100644 --- a/etcdserver/api/v2http/client.go +++ b/etcdserver/api/v2http/client.go @@ -55,6 +55,7 @@ const ( func NewClientHandler(lg *zap.Logger, server etcdserver.ServerPeer, timeout time.Duration) http.Handler { mux := http.NewServeMux() etcdhttp.HandleBasic(mux, server) + etcdhttp.HandleMetricsHealthForV2(mux, server) handleV2(lg, mux, server, 
timeout) return requestLogger(lg, mux) } diff --git a/etcdserver/server.go b/etcdserver/server.go index d24963f9dcc..c044e3d44de 100644 --- a/etcdserver/server.go +++ b/etcdserver/server.go @@ -667,6 +667,10 @@ func (s *EtcdServer) getLogger() *zap.Logger { return l } +func (s *EtcdServer) Config() ServerConfig { + return s.Cfg +} + func tickToDur(ticks int, tickMs uint) string { return fmt.Sprintf("%v", time.Duration(ticks)*time.Duration(tickMs)*time.Millisecond) } From 4a8381a4613edbd9b6f71e4ee830890ada13b049 Mon Sep 17 00:00:00 2001 From: Marek Siarkowicz Date: Thu, 27 Jan 2022 19:04:41 +0100 Subject: [PATCH 6/6] server: Split metrics and health code Signed-off-by: Siyuan Zhang --- embed/etcd.go | 6 +- etcdmain/etcd.go | 2 +- etcdserver/api/etcdhttp/health.go | 223 ++++++++++++++++++ .../{metrics_test.go => health_test.go} | 25 +- etcdserver/api/etcdhttp/metrics.go | 206 +--------------- etcdserver/api/v2http/client.go | 3 +- 6 files changed, 241 insertions(+), 224 deletions(-) create mode 100644 etcdserver/api/etcdhttp/health.go rename etcdserver/api/etcdhttp/{metrics_test.go => health_test.go} (88%) diff --git a/embed/etcd.go b/embed/etcd.go index 29f65424b06..3cbed0fdd09 100644 --- a/embed/etcd.go +++ b/embed/etcd.go @@ -773,7 +773,8 @@ func (e *Etcd) serveClients() (err error) { } else { mux := http.NewServeMux() etcdhttp.HandleBasic(mux, e.Server) - etcdhttp.HandleMetricsHealth(mux, e.Server) + etcdhttp.HandleMetrics(mux) + etcdhttp.HandleHealth(mux, e.Server) h = mux } @@ -862,7 +863,8 @@ func (e *Etcd) serveMetrics() (err error) { if len(e.cfg.ListenMetricsUrls) > 0 { metricsMux := http.NewServeMux() - etcdhttp.HandleMetricsHealth(metricsMux, e.Server) + etcdhttp.HandleMetrics(metricsMux) + etcdhttp.HandleHealth(metricsMux, e.Server) for _, murl := range e.cfg.ListenMetricsUrls { tlsInfo := &e.cfg.ClientTLSInfo diff --git a/etcdmain/etcd.go b/etcdmain/etcd.go index 51696290f68..9f748c9d483 100644 --- a/etcdmain/etcd.go +++ b/etcdmain/etcd.go @@ -542,7 
+542,7 @@ func startProxy(cfg *config) error { plog.Infof("v2 proxy started listening on client requests on %q", host) } mux := http.NewServeMux() - etcdhttp.HandlePrometheus(mux) // v2 proxy just uses the same port + etcdhttp.HandleMetrics(mux) // v2 proxy just uses the same port mux.Handle("/", ph) plog.Fatal(http.Serve(l, mux)) }() diff --git a/etcdserver/api/etcdhttp/health.go b/etcdserver/api/etcdhttp/health.go new file mode 100644 index 00000000000..ee0de3221e9 --- /dev/null +++ b/etcdserver/api/etcdhttp/health.go @@ -0,0 +1,223 @@ +// Copyright 2017 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package etcdhttp + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" + + "go.etcd.io/etcd/auth" + "go.etcd.io/etcd/etcdserver" + pb "go.etcd.io/etcd/etcdserver/etcdserverpb" + "go.etcd.io/etcd/pkg/types" + "go.etcd.io/etcd/raft" + + "github.com/prometheus/client_golang/prometheus" +) + +const ( + PathHealth = "/health" +) + +type ServerHealth interface { + serverHealthV2V3 + Range(context.Context, *pb.RangeRequest) (*pb.RangeResponse, error) + Config() etcdserver.ServerConfig +} + +type serverHealthV2V3 interface { + Alarms() []*pb.AlarmMember + Leader() types.ID +} + +// HandleHealthForV2 registers the '/health' handler for v2. 
+func HandleHealthForV2(mux *http.ServeMux, srv etcdserver.ServerV2) { + mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health { + if h := checkAlarms(srv, excludedAlarms); h.Health != "true" { + return h + } + if h := checkLeader(srv, serializable); h.Health != "true" { + return h + } + return checkV2API(srv) + })) +} + +// HandleHealth registers the '/health' handler. It checks health by using a v3 range request +// and its corresponding timeout. +func HandleHealth(mux *http.ServeMux, srv ServerHealth) { + mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health { + if h := checkAlarms(srv, excludedAlarms); h.Health != "true" { + return h + } + if h := checkLeader(srv, serializable); h.Health != "true" { + return h + } + return checkAPI(srv, serializable) + })) +} + +// NewHealthHandler handles '/health' requests. +func NewHealthHandler(hfunc func(excludedAlarms AlarmSet, serializable bool) Health) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.Header().Set("Allow", http.MethodGet) + http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed) + plog.Warningf("/health error (status code %d)", http.StatusMethodNotAllowed) + return + } + excludedAlarms := getExcludedAlarms(r) + // Passing the query parameter "serializable=true" ensures that the + // health of the local etcd is checked vs the health of the cluster. + // This is useful for probes attempting to validate the liveness of + // the etcd process vs readiness of the cluster to serve requests. 
+ serializableFlag := getSerializableFlag(r) + h := hfunc(excludedAlarms, serializableFlag) + defer func() { + if h.Health == "true" { + healthSuccess.Inc() + } else { + healthFailed.Inc() + } + }() + d, _ := json.Marshal(h) + if h.Health != "true" { + http.Error(w, string(d), http.StatusServiceUnavailable) + return + } + w.WriteHeader(http.StatusOK) + w.Write(d) + plog.Debugf("/health OK (status code %d)", http.StatusOK) + } +} + +var ( + healthSuccess = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "health_success", + Help: "The total number of successful health checks", + }) + healthFailed = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "health_failures", + Help: "The total number of failed health checks", + }) +) + +func init() { + prometheus.MustRegister(healthSuccess) + prometheus.MustRegister(healthFailed) +} + +// Health defines etcd server health status. +// TODO: remove manual parsing in etcdctl cluster-health +type Health struct { + Health string `json:"health"` + Reason string `json:"-"` +} + +type AlarmSet map[string]struct{} + +func getExcludedAlarms(r *http.Request) (alarms AlarmSet) { + alarms = make(map[string]struct{}, 2) + alms, found := r.URL.Query()["exclude"] + if found { + for _, alm := range alms { + if len(alms) == 0 { + continue + } + alarms[alm] = struct{}{} + } + } + return alarms +} + +func getSerializableFlag(r *http.Request) bool { + return r.URL.Query().Get("serializable") == "true" +} + +// TODO: etcdserver.ErrNoLeader in health API + +func checkAlarms(srv serverHealthV2V3, excludedAlarms AlarmSet) Health { + h := Health{Health: "true"} + as := srv.Alarms() + if len(as) > 0 { + for _, v := range as { + alarmName := v.Alarm.String() + if _, found := excludedAlarms[alarmName]; found { + plog.Debugf("/health excluded alarm %s", v.String()) + continue + } + + h.Health = "false" + switch v.Alarm { + case pb.AlarmType_NOSPACE: + 
h.Reason = "ALARM NOSPACE" + case pb.AlarmType_CORRUPT: + h.Reason = "ALARM CORRUPT" + default: + h.Reason = "ALARM UNKNOWN" + } + plog.Warningf("/health error due to %s", v.String()) + return h + } + } + + return h +} + +func checkLeader(srv serverHealthV2V3, serializable bool) Health { + h := Health{Health: "true"} + if !serializable && (uint64(srv.Leader()) == raft.None) { + h.Health = "false" + h.Reason = "RAFT NO LEADER" + plog.Warningf("/health error; no leader (status code %d)", http.StatusServiceUnavailable) + } + return h +} + +func checkV2API(srv etcdserver.ServerV2) Health { + h := Health{Health: "true"} + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + _, err := srv.Do(ctx, pb.Request{Method: "QGET"}) + cancel() + if err != nil { + h.Health = "false" + h.Reason = fmt.Sprintf("QGET ERROR:%s", err) + plog.Warningf("/health error; QGET failed %v (status code %d)", err, http.StatusServiceUnavailable) + return h + } + return h +} + +func checkAPI(srv ServerHealth, serializable bool) Health { + h := Health{Health: "true"} + cfg := srv.Config() + ctx, cancel := context.WithTimeout(context.Background(), cfg.ReqTimeout()) + _, err := srv.Range(ctx, &pb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable}) + cancel() + if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied { + h.Health = "false" + h.Reason = fmt.Sprintf("RANGE ERROR:%s", err) + plog.Warningf("serving /health false; Range failed %v (status code %d)", err, http.StatusServiceUnavailable) + return h + } + return h +} diff --git a/etcdserver/api/etcdhttp/metrics_test.go b/etcdserver/api/etcdhttp/health_test.go similarity index 88% rename from etcdserver/api/etcdhttp/metrics_test.go rename to etcdserver/api/etcdhttp/health_test.go index 0fc65853439..23eafd7efd7 100644 --- a/etcdserver/api/etcdhttp/metrics_test.go +++ b/etcdserver/api/etcdhttp/health_test.go @@ -19,29 +19,20 @@ import ( "encoding/json" "fmt" "io" - "io/ioutil" "net/http" 
"net/http/httptest" "testing" "go.etcd.io/etcd/auth" "go.etcd.io/etcd/etcdserver" - stats "go.etcd.io/etcd/etcdserver/api/v2stats" pb "go.etcd.io/etcd/etcdserver/etcdserverpb" "go.etcd.io/etcd/pkg/testutil" "go.etcd.io/etcd/pkg/types" "go.etcd.io/etcd/raft" ) -type fakeStats struct{} - -func (s *fakeStats) SelfStats() []byte { return nil } -func (s *fakeStats) LeaderStats() []byte { return nil } -func (s *fakeStats) StoreStats() []byte { return nil } - type fakeHealthServer struct { fakeServer - stats.Stats health string apiError error } @@ -130,18 +121,21 @@ func TestHealthHandler(t *testing.T) { expectHealth: "true", }, { + name: "Healthy even if authentication failed", healthCheckURL: "/health", apiError: auth.ErrUserEmpty, expectStatusCode: http.StatusOK, expectHealth: "true", }, { + name: "Healthy even if authorization failed", healthCheckURL: "/health", apiError: auth.ErrPermissionDenied, expectStatusCode: http.StatusOK, expectHealth: "true", }, { + name: "Unhealthy if api is not available", healthCheckURL: "/health", apiError: fmt.Errorf("Unexpected error"), expectStatusCode: http.StatusServiceUnavailable, @@ -149,12 +143,11 @@ func TestHealthHandler(t *testing.T) { }, } - for i, tt := range tests { + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { mux := http.NewServeMux() - HandleMetricsHealth(mux, &fakeHealthServer{ + HandleHealth(mux, &fakeHealthServer{ fakeServer: fakeServer{alarms: tt.alarms}, - Stats: &fakeStats{}, health: tt.expectHealth, apiError: tt.apiError, }) @@ -163,14 +156,14 @@ func TestHealthHandler(t *testing.T) { res, err := ts.Client().Do(&http.Request{Method: http.MethodGet, URL: testutil.MustNewURL(t, ts.URL+tt.healthCheckURL)}) if err != nil { - t.Errorf("fail serve http request %s %v in test case #%d", tt.healthCheckURL, err, i+1) + t.Errorf("fail serve http request %s %v", tt.healthCheckURL, err) } if res == nil { - t.Errorf("got nil http response with http request %s in test case #%d", tt.healthCheckURL, i+1) + 
t.Errorf("got nil http response with http request %s", tt.healthCheckURL) return } if res.StatusCode != tt.expectStatusCode { - t.Errorf("want statusCode %d but got %d in test case #%d", tt.expectStatusCode, res.StatusCode, i+1) + t.Errorf("want statusCode %d but got %d", tt.expectStatusCode, res.StatusCode) } health, err := parseHealthOutput(res.Body) if err != nil { @@ -185,7 +178,7 @@ func TestHealthHandler(t *testing.T) { func parseHealthOutput(body io.Reader) (Health, error) { obj := Health{} - d, derr := ioutil.ReadAll(body) + d, derr := io.ReadAll(body) if derr != nil { return obj, derr } diff --git a/etcdserver/api/etcdhttp/metrics.go b/etcdserver/api/etcdhttp/metrics.go index 15a4917f272..13057798134 100644 --- a/etcdserver/api/etcdhttp/metrics.go +++ b/etcdserver/api/etcdhttp/metrics.go @@ -15,218 +15,16 @@ package etcdhttp import ( - "context" - "encoding/json" - "fmt" "net/http" - "time" - "go.etcd.io/etcd/auth" - "go.etcd.io/etcd/etcdserver" - pb "go.etcd.io/etcd/etcdserver/etcdserverpb" - "go.etcd.io/etcd/pkg/types" - "go.etcd.io/etcd/raft" - - "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) const ( PathMetrics = "/metrics" - PathHealth = "/health" ) -type ServerHealth interface { - serverHealthV2V3 - Range(context.Context, *pb.RangeRequest) (*pb.RangeResponse, error) - Config() etcdserver.ServerConfig -} - -type serverHealthV2V3 interface { - Alarms() []*pb.AlarmMember - Leader() types.ID -} - -// HandleMetricsHealthForV2 registers metrics and health handlers for v2. 
-func HandleMetricsHealthForV2(mux *http.ServeMux, srv etcdserver.ServerV2) { - mux.Handle(PathMetrics, promhttp.Handler()) - mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health { - if h := checkAlarms(srv, excludedAlarms); h.Health != "true" { - return h - } - if h := checkLeader(srv, serializable); h.Health != "true" { - return h - } - return checkV2API(srv) - })) -} - -// HandleMetricsHealth registers metrics and health handlers. it checks health by using v3 range request -// and its corresponding timeout. -func HandleMetricsHealth(mux *http.ServeMux, srv ServerHealth) { - mux.Handle(PathMetrics, promhttp.Handler()) - mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health { - if h := checkAlarms(srv, excludedAlarms); h.Health != "true" { - return h - } - if h := checkLeader(srv, serializable); h.Health != "true" { - return h - } - return checkAPI(srv, serializable) - })) -} - -// HandlePrometheus registers prometheus handler on '/metrics'. -func HandlePrometheus(mux *http.ServeMux) { +// HandleMetrics registers prometheus handler on '/metrics'. +func HandleMetrics(mux *http.ServeMux) { mux.Handle(PathMetrics, promhttp.Handler()) } - -// NewHealthHandler handles '/health' requests. -func NewHealthHandler(hfunc func(excludedAlarms AlarmSet, serializable bool) Health) http.HandlerFunc { - return func(w http.ResponseWriter, r *http.Request) { - if r.Method != http.MethodGet { - w.Header().Set("Allow", http.MethodGet) - http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed) - plog.Warningf("/health error (status code %d)", http.StatusMethodNotAllowed) - return - } - excludedAlarms := getExcludedAlarms(r) - // Passing the query parameter "serializable=true" ensures that the - // health of the local etcd is checked vs the health of the cluster. 
- // This is useful for probes attempting to validate the liveness of - // the etcd process vs readiness of the cluster to serve requests. - serializableFlag := getSerializableFlag(r) - h := hfunc(excludedAlarms, serializableFlag) - defer func() { - if h.Health == "true" { - healthSuccess.Inc() - } else { - healthFailed.Inc() - } - }() - d, _ := json.Marshal(h) - if h.Health != "true" { - http.Error(w, string(d), http.StatusServiceUnavailable) - return - } - w.WriteHeader(http.StatusOK) - w.Write(d) - plog.Debugf("/health OK (status code %d)", http.StatusOK) - } -} - -var ( - healthSuccess = prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "etcd", - Subsystem: "server", - Name: "health_success", - Help: "The total number of successful health checks", - }) - healthFailed = prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "etcd", - Subsystem: "server", - Name: "health_failures", - Help: "The total number of failed health checks", - }) -) - -func init() { - prometheus.MustRegister(healthSuccess) - prometheus.MustRegister(healthFailed) -} - -// Health defines etcd server health status. 
-// TODO: remove manual parsing in etcdctl cluster-health -type Health struct { - Health string `json:"health"` - Reason string `json:"-"` -} - -type AlarmSet map[string]struct{} - -func getExcludedAlarms(r *http.Request) (alarms AlarmSet) { - alarms = make(map[string]struct{}, 2) - alms, found := r.URL.Query()["exclude"] - if found { - for _, alm := range alms { - if len(alms) == 0 { - continue - } - alarms[alm] = struct{}{} - } - } - return alarms -} - -func getSerializableFlag(r *http.Request) bool { - return r.URL.Query().Get("serializable") == "true" -} - -// TODO: etcdserver.ErrNoLeader in health API - -func checkAlarms(srv serverHealthV2V3, excludedAlarms AlarmSet) Health { - h := Health{Health: "true"} - as := srv.Alarms() - if len(as) > 0 { - for _, v := range as { - alarmName := v.Alarm.String() - if _, found := excludedAlarms[alarmName]; found { - plog.Debugf("/health excluded alarm %s", v.String()) - continue - } - - h.Health = "false" - switch v.Alarm { - case pb.AlarmType_NOSPACE: - h.Reason = "ALARM NOSPACE" - case pb.AlarmType_CORRUPT: - h.Reason = "ALARM CORRUPT" - default: - h.Reason = "ALARM UNKNOWN" - } - plog.Warningf("/health error due to %s", v.String()) - return h - } - } - - return h -} - -func checkLeader(srv serverHealthV2V3, serializable bool) Health { - h := Health{Health: "true"} - if !serializable && (uint64(srv.Leader()) == raft.None) { - h.Health = "false" - h.Reason = "RAFT NO LEADER" - plog.Warningf("/health error; no leader (status code %d)", http.StatusServiceUnavailable) - } - return h -} - -func checkV2API(srv etcdserver.ServerV2) Health { - h := Health{Health: "true"} - ctx, cancel := context.WithTimeout(context.Background(), time.Second) - _, err := srv.Do(ctx, pb.Request{Method: "QGET"}) - cancel() - if err != nil { - h.Health = "false" - h.Reason = fmt.Sprintf("QGET ERROR:%s", err) - plog.Warningf("/health error; QGET failed %v (status code %d)", err, http.StatusServiceUnavailable) - return h - } - return h -} - -func 
checkAPI(srv ServerHealth, serializable bool) Health { - h := Health{Health: "true"} - cfg := srv.Config() - ctx, cancel := context.WithTimeout(context.Background(), cfg.ReqTimeout()) - _, err := srv.Range(ctx, &pb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable}) - cancel() - if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied { - h.Health = "false" - h.Reason = fmt.Sprintf("RANGE ERROR:%s", err) - plog.Warningf("serving /health false; Range failed %v (status code %d)", err, http.StatusServiceUnavailable) - return h - } - return h -} diff --git a/etcdserver/api/v2http/client.go b/etcdserver/api/v2http/client.go index a26e748fe52..0d291572732 100644 --- a/etcdserver/api/v2http/client.go +++ b/etcdserver/api/v2http/client.go @@ -55,7 +55,8 @@ const ( func NewClientHandler(lg *zap.Logger, server etcdserver.ServerPeer, timeout time.Duration) http.Handler { mux := http.NewServeMux() etcdhttp.HandleBasic(mux, server) - etcdhttp.HandleMetricsHealthForV2(mux, server) + etcdhttp.HandleMetrics(mux) + etcdhttp.HandleHealthForV2(mux, server) handleV2(lg, mux, server, timeout) return requestLogger(lg, mux) }