
server: fix broken /health #45119

Merged: 1 commit, Mar 14, 2020
4 changes: 2 additions & 2 deletions pkg/cli/init.go
@@ -109,8 +109,8 @@ func waitForClientReadinessAndGetClientGRPCConn(
 		// Access the /health endpoint. Until/unless this succeeds, the
 		// node is not yet fully initialized and ready to accept
 		// Bootstrap requests.
-		ac := serverpb.NewStatusClient(conn)
-		_, err := ac.Details(ctx, &serverpb.DetailsRequest{})
+		ac := serverpb.NewAdminClient(conn)
+		_, err := ac.Health(ctx, &serverpb.HealthRequest{})
 		return err
 	}); err != nil {
 		err = errors.Wrapf(err, "node not ready to perform cluster initialization")
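For reference, the readiness wait above boils down to polling the Admin Health RPC until it stops returning an error. A minimal standalone sketch of that pattern, assuming an already-dialed *grpc.ClientConn and a fixed retry interval (the CLI itself wraps this call in its own retry helpers):

package healthwait

import (
    "context"
    "time"

    "github.com/cockroachdb/cockroach/pkg/server/serverpb"
    "github.com/cockroachdb/errors"
    "google.golang.org/grpc"
)

// waitUntilReady polls the Admin Health RPC until the node accepts requests.
// Sketch only: conn is assumed to be an already-dialed connection to the
// target node, and the 1s retry interval is illustrative.
func waitUntilReady(ctx context.Context, conn *grpc.ClientConn) error {
    ac := serverpb.NewAdminClient(conn)
    for {
        _, err := ac.Health(ctx, &serverpb.HealthRequest{})
        if err == nil {
            // The node is serving; cluster initialization can proceed.
            return nil
        }
        select {
        case <-ctx.Done():
            return errors.Wrap(ctx.Err(), "node not ready")
        case <-time.After(time.Second):
            // Retry after a short pause, then check again.
        }
    }
}
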
2 changes: 1 addition & 1 deletion pkg/cli/zip.go
@@ -383,7 +383,7 @@ func runDebugZip(cmd *cobra.Command, args []string) error {
 	for _, r := range []zipRequest{
 		{
 			fn: func(ctx context.Context) (interface{}, error) {
-				return status.Details(ctx, &serverpb.DetailsRequest{NodeId: id, Ready: false})
+				return status.Details(ctx, &serverpb.DetailsRequest{NodeId: id})
 			},
 			pathName: prefix + "/details",
 		},
59 changes: 52 additions & 7 deletions pkg/server/admin.go
@@ -36,6 +36,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/security"
 	"github.com/cockroachdb/cockroach/pkg/server/debug"
 	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
+	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
 	"github.com/cockroachdb/cockroach/pkg/settings"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/sql"

@@ -1294,19 +1295,63 @@ func (s *adminServer) Cluster(

 // Health returns liveness for the node target of the request.
 //
-// NB: this is deprecated. If you're looking for the code serving /health,
-// you want (*statusServer).Details.
+// See the docstring for HealthRequest for more details about
+// what this function precisely reports.
+//
+// Note: Health is non-privileged and non-authenticated and thus
+// must not report privileged information.
 func (s *adminServer) Health(
 	ctx context.Context, req *serverpb.HealthRequest,
 ) (*serverpb.HealthResponse, error) {
-	isLive, err := s.server.nodeLiveness.IsLive(s.server.NodeID())
+	telemetry.Inc(telemetryHealthCheck)
+
+	resp := &serverpb.HealthResponse{}
+	// If Ready is not set, the client doesn't want to know whether this node is
+	// ready to receive client traffic.
+	if !req.Ready {
+		return resp, nil
+	}
+
+	if err := s.checkReadinessForHealthCheck(); err != nil {
+		return nil, err
+	}
+	return resp, nil
+}
+
+func (s *adminServer) checkReadinessForHealthCheck() error {
+	serveMode := s.server.grpc.mode.get()
+	switch serveMode {
+	case modeInitializing:
+		return status.Error(codes.Unavailable, "node is waiting for cluster initialization")
+	case modeDraining:
+		// grpc.mode is set to modeDraining when the Drain(DrainMode_CLIENT) has
+		// been called (client connections are to be drained).
+		return status.Errorf(codes.Unavailable, "node is shutting down")
+	case modeOperational:
+		break
+	default:
+		return s.serverError(errors.Newf("unknown mode: %v", serveMode))
+	}
+
+	// TODO(knz): update this code when progress is made on
+	// https://github.com/cockroachdb/cockroach/issues/45123
+	l, err := s.server.nodeLiveness.GetLiveness(s.server.NodeID())
[Review comment from a member, on the GetLiveness call]
See #45123, I think you want to use storage.LivenessStatus(.) (and no this should not just sit around in storage like that at all, but there it is).

 	if err != nil {
-		return nil, status.Errorf(codes.Internal, err.Error())
+		return s.serverError(err)
 	}
-	if !isLive {
-		return nil, status.Errorf(codes.Unavailable, "node is not live")
+	if !l.IsLive(s.server.clock.Now().GoTime()) {
+		return status.Errorf(codes.Unavailable, "node is not healthy")
 	}
-	return &serverpb.HealthResponse{}, nil
+	if l.Draining {
[Review comment from a member, on the l.Draining check]
What is the problem with just using server.grpc.mode.get()? This is set synchronously with SetDraining() so I doubt it can really trail liveness (we might have to flip the order, but still). And even if it does trail, it will be by essentially nothing, not on any timescale that matters.

+		// l.Draining indicates that the node is draining leases.
+		// This is set when Drain(DrainMode_LEASES) is called.
+		// It's possible that l.Draining is set without
+		// grpc.mode being modeDraining, if a RPC client
+		// has requested DrainMode_LEASES but not DrainMode_CLIENT.
+		return status.Errorf(codes.Unavailable, "node is shutting down")
+	}
+
+	return nil
 }
 
 // getLivenessStatusMap generates a map from NodeID to LivenessStatus for all
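The handler above gives orchestration two distinct probes over HTTP: a plain liveness check at /health and a readiness check at /health?ready=1 (see also the /health routing in server.go further down). A sketch of how an external prober might consume them; the localhost address, port 8080, and plain-HTTP scheme are assumptions for illustration:

package main

import (
    "fmt"
    "net/http"
    "time"
)

// Sketch only: probe the two health modes the way an orchestrator might.
// The address and plain-HTTP scheme are illustrative; a real deployment
// would normally sit behind TLS.
func main() {
    client := &http.Client{Timeout: 3 * time.Second}

    // Liveness: with ?ready unset the handler returns immediately, so this
    // reports 200 OK as long as the process can serve the Admin RPC.
    if resp, err := client.Get("http://localhost:8080/health"); err == nil {
        fmt.Println("liveness: ", resp.Status)
        resp.Body.Close()
    }

    // Readiness: with ?ready=1 the handler runs checkReadinessForHealthCheck,
    // so this returns 503 Service Unavailable while the node is initializing,
    // draining, or not live according to node liveness.
    if resp, err := client.Get("http://localhost:8080/health?ready=1"); err == nil {
        fmt.Println("readiness:", resp.Status)
        resp.Body.Close()
    }
}
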
37 changes: 20 additions & 17 deletions pkg/server/admin_test.go
@@ -1230,8 +1230,11 @@ func TestClusterAPI(t *testing.T) {
 
 func TestHealthAPI(t *testing.T) {
 	defer leaktest.AfterTest(t)()
+
+	ctx := context.Background()
+
 	s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
-	defer s.Stopper().Stop(context.TODO())
+	defer s.Stopper().Stop(ctx)
 
 	// We need to retry because the node ID isn't set until after
 	// bootstrapping.

@@ -1250,24 +1253,24 @@ func TestHealthAPI(t *testing.T) {
 	}
 	s.Clock().Update(hlc.Timestamp(self.Expiration).Add(1, 0))
 
-	expected := "503 Service Unavailable"
 	var resp serverpb.HealthResponse
-	for {
-		if err := getAdminJSONProto(s, "health", &resp); !testutils.IsError(err, expected) {
-			type timeouter interface {
-				Timeout() bool
-			}
-			if _, ok := err.(timeouter); ok {
-				// Special case for `*http.httpError` which can happen since we
-				// have timeouts on our requests and things may not be going so smoothly
-				// on the server side. See:
-				// https://github.com/cockroachdb/cockroach/issues/18469
-				log.Warningf(context.Background(), "ignoring timeout error: %s (%T)", err, err)
-				continue
-			}
-			t.Errorf("expected %q error, got %v (%T)", expected, err, err)
+	testutils.SucceedsSoon(t, func() error {
+		err := getAdminJSONProto(s, "health?ready=1", &resp)
+		if err == nil {
+			return errors.New("health OK, still waiting for unhealth")
 		}
-		break
+
+		t.Logf("observed error: %v", err)
+		if !testutils.IsError(err, `(?s)503 Service Unavailable.*"error": "node is not healthy"`) {
+			return err
+		}
+		return nil
+	})
+
+	// After the node reports an error with `?ready=1`, the health
+	// endpoint must still succeed without error when `?ready=1` is not specified.
+	if err := getAdminJSONProto(s, "health", &resp); err != nil {
+		t.Fatal(err)
 	}
 }

2 changes: 1 addition & 1 deletion pkg/server/grpc_server.go
@@ -66,7 +66,7 @@ var rpcsAllowedWhileBootstrapping = map[string]struct{}{
 	"/cockroach.rpc.Heartbeat/Ping": {},
 	"/cockroach.gossip.Gossip/Gossip": {},
 	"/cockroach.server.serverpb.Init/Bootstrap": {},
-	"/cockroach.server.serverpb.Status/Details": {},
+	"/cockroach.server.serverpb.Admin/Health": {},
 }
 
 // intercept implements filtering rules for each server state.
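The whitelist above is what the intercept filter consults while the node is still waiting for cluster initialization, which is why the switch from Status/Details to Admin/Health matters here: the new /health RPC must keep working before bootstrap. A self-contained sketch of that gating logic under assumed names (an initialized flag and a plain function stand in for the real server state and method):

package main

import (
    "fmt"

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

// Sketch only: the general shape of the bootstrap-time RPC gate. The map
// mirrors the one above; the error text and function signature are illustrative.
var rpcsAllowedWhileBootstrapping = map[string]struct{}{
    "/cockroach.rpc.Heartbeat/Ping":             {},
    "/cockroach.gossip.Gossip/Gossip":           {},
    "/cockroach.server.serverpb.Init/Bootstrap": {},
    "/cockroach.server.serverpb.Admin/Health":   {},
}

// intercept rejects any RPC not in the whitelist while the node is still
// waiting for cluster initialization.
func intercept(initialized bool, fullMethod string) error {
    if initialized {
        return nil
    }
    if _, ok := rpcsAllowedWhileBootstrapping[fullMethod]; ok {
        return nil
    }
    return status.Errorf(codes.Unavailable,
        "node is waiting for cluster initialization; %s is not available yet", fullMethod)
}

func main() {
    fmt.Println(intercept(false, "/cockroach.server.serverpb.Admin/Health"))   // <nil>
    fmt.Println(intercept(false, "/cockroach.server.serverpb.Status/Details")) // Unavailable error
}
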
8 changes: 8 additions & 0 deletions pkg/server/server.go
@@ -1392,6 +1392,11 @@ func (s *Server) Start(ctx context.Context) error {
 			return err
 		}
 	}
+	// Handle /health early. This is necessary for orchestration. Note
+	// that /health is not authenticated, on purpose. This is both
+	// because it needs to be available before the cluster is up and can
+	// serve authentication requests, and also because it must work for
+	// monitoring tools which operate without authentication.
+	s.mux.Handle("/health", gwMux)
 
 	s.engines, err = s.cfg.CreateEngines(ctx)

@@ -1729,11 +1734,14 @@ func (s *Server) Start(ctx context.Context) error {

 	s.mux.Handle(adminPrefix, authHandler)
 	// Exempt the health check endpoint from authentication.
+	// This mirrors the handling of /health above.
 	s.mux.Handle("/_admin/v1/health", gwMux)
 	s.mux.Handle(ts.URLPrefix, authHandler)
 	s.mux.Handle(statusPrefix, authHandler)
+	// The /login endpoint is, by definition, available pre-authentication.
 	s.mux.Handle(loginPath, gwMux)
 	s.mux.Handle(logoutPath, authHandler)
+	// The /_status/vars endpoint is not authenticated either. Useful for monitoring.
 	s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
 	log.Event(ctx, "added http endpoints")

Expand Down
Loading