cockroachdb · craig · Mar 14, 2020 · Feb 14, 2020 · tbg · Feb 19, 2020
@@ -109,8 +109,8 @@ func waitForClientReadinessAndGetClientGRPCConn(
 				// Access the /health endpoint. Until/unless this succeeds, the
 				// node is not yet fully initialized and ready to accept
 				// Bootstrap requests.
-				ac := serverpb.NewStatusClient(conn)
-				_, err := ac.Details(ctx, &serverpb.DetailsRequest{})
+				ac := serverpb.NewAdminClient(conn)
+				_, err := ac.Health(ctx, &serverpb.HealthRequest{})
 				return err
 			}); err != nil {
 			err = errors.Wrapf(err, "node not ready to perform cluster initialization")

@@ -383,7 +383,7 @@ func runDebugZip(cmd *cobra.Command, args []string) error {
 			for _, r := range []zipRequest{
 				{
 					fn: func(ctx context.Context) (interface{}, error) {
-						return status.Details(ctx, &serverpb.DetailsRequest{NodeId: id, Ready: false})
+						return status.Details(ctx, &serverpb.DetailsRequest{NodeId: id})
 					},
 					pathName: prefix + "/details",
 				},

@@ -36,6 +36,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/security"
 	"github.com/cockroachdb/cockroach/pkg/server/debug"
 	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
+	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
 	"github.com/cockroachdb/cockroach/pkg/settings"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/sql"
@@ -1294,19 +1295,63 @@ func (s *adminServer) Cluster(
 
 // Health returns liveness for the node target of the request.
 //
-// NB: this is deprecated. If you're looking for the code serving /health,
-// you want (*statusServer).Details.
+// See the docstring for HealthRequest for more details about
+// what this function precisely reports.
+//
+// Note: Health is non-privileged and non-authenticated and thus
+// must not report privileged information.
 func (s *adminServer) Health(
 	ctx context.Context, req *serverpb.HealthRequest,
 ) (*serverpb.HealthResponse, error) {
-	isLive, err := s.server.nodeLiveness.IsLive(s.server.NodeID())
+	telemetry.Inc(telemetryHealthCheck)
+
+	resp := &serverpb.HealthResponse{}
+	// If Ready is not set, the client doesn't want to know whether this node is
+	// ready to receive client traffic.
+	if !req.Ready {
+		return resp, nil
+	}
+
+	if err := s.checkReadinessForHealthCheck(); err != nil {
+		return nil, err
+	}
+	return resp, nil
+}
+
+func (s *adminServer) checkReadinessForHealthCheck() error {
+	serveMode := s.server.grpc.mode.get()
+	switch serveMode {
+	case modeInitializing:
+		return status.Error(codes.Unavailable, "node is waiting for cluster initialization")
+	case modeDraining:
+		// grpc.mode is set to modeDraining when the Drain(DrainMode_CLIENT) has
+		// been called (client connections are to be drained).
+		return status.Errorf(codes.Unavailable, "node is shutting down")
+	case modeOperational:
+		break
+	default:
+		return s.serverError(errors.Newf("unknown mode: %v", serveMode))
+	}
+
+	// TODO(knz): update this code when progress is made on
+	// https://github.com/cockroachdb/cockroach/issues/45123
+	l, err := s.server.nodeLiveness.GetLiveness(s.server.NodeID())
 	if err != nil {
-		return nil, status.Errorf(codes.Internal, err.Error())
+		return s.serverError(err)
 	}
-	if !isLive {
-		return nil, status.Errorf(codes.Unavailable, "node is not live")
+	if !l.IsLive(s.server.clock.Now().GoTime()) {
+		return status.Errorf(codes.Unavailable, "node is not healthy")
 	}
-	return &serverpb.HealthResponse{}, nil
+	if l.Draining {
+		// l.Draining indicates that the node is draining leases.
+		// This is set when Drain(DrainMode_LEASES) is called.
+		// It's possible that l.Draining is set without
+		// grpc.mode being modeDraining, if a RPC client
+		// has requested DrainMode_LEASES but not DrainMode_CLIENT.
+		return status.Errorf(codes.Unavailable, "node is shutting down")
+	}
+
+	return nil
 }
 
 // getLivenessStatusMap generates a map from NodeID to LivenessStatus for all

@@ -1230,8 +1230,11 @@ func TestClusterAPI(t *testing.T) {
 
 func TestHealthAPI(t *testing.T) {
 	defer leaktest.AfterTest(t)()
+
+	ctx := context.Background()
+
 	s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
-	defer s.Stopper().Stop(context.TODO())
+	defer s.Stopper().Stop(ctx)
 
 	// We need to retry because the node ID isn't set until after
 	// bootstrapping.
@@ -1250,24 +1253,24 @@ func TestHealthAPI(t *testing.T) {
 	}
 	s.Clock().Update(hlc.Timestamp(self.Expiration).Add(1, 0))
 
-	expected := "503 Service Unavailable"
 	var resp serverpb.HealthResponse
-	for {
-		if err := getAdminJSONProto(s, "health", &resp); !testutils.IsError(err, expected) {
-			type timeouter interface {
-				Timeout() bool
-			}
-			if _, ok := err.(timeouter); ok {
-				// Special case for `*http.httpError` which can happen since we
-				// have timeouts on our requests and things may not be going so smoothly
-				// on the server side. See:
-				// https://github.com/cockroachdb/cockroach/issues/18469
-				log.Warningf(context.Background(), "ignoring timeout error: %s (%T)", err, err)
-				continue
-			}
-			t.Errorf("expected %q error, got %v (%T)", expected, err, err)
+	testutils.SucceedsSoon(t, func() error {
+		err := getAdminJSONProto(s, "health?ready=1", &resp)
+		if err == nil {
+			return errors.New("health OK, still waiting for unhealth")
+		}
+
+		t.Logf("observed error: %v", err)
+		if !testutils.IsError(err, `(?s)503 Service Unavailable.*"error": "node is not healthy"`) {
+			return err
 		}
-		break
+		return nil
+	})
+
+	// After the node reports an error with `?ready=1`, the health
+	// endpoint must still succeed without error when `?ready=1` is not specified.
+	if err := getAdminJSONProto(s, "health", &resp); err != nil {
+		t.Fatal(err)
 	}
 }
 

diff --git a/pkg/server/grpc_server.go b/pkg/server/grpc_server.go
@@ -66,7 +66,7 @@ var rpcsAllowedWhileBootstrapping = map[string]struct{}{
 	"/cockroach.rpc.Heartbeat/Ping":             {},
 	"/cockroach.gossip.Gossip/Gossip":           {},
 	"/cockroach.server.serverpb.Init/Bootstrap": {},
-	"/cockroach.server.serverpb.Status/Details": {},
+	"/cockroach.server.serverpb.Admin/Health":   {},
 }
 
 // intercept implements filtering rules for each server state.

diff --git a/pkg/server/server.go b/pkg/server/server.go
@@ -1392,6 +1392,11 @@ func (s *Server) Start(ctx context.Context) error {
 			return err
 		}
 	}
+	// Handle /health early. This is necessary for orchestration.  Note
+	// that /health is not authenticated, on purpose. This is both
+	// because it needs to be available before the cluster is up and can
+	// serve authentication requests, and also because it must work for
+	// monitoring tools which operate without authentication.
 	s.mux.Handle("/health", gwMux)
 
 	s.engines, err = s.cfg.CreateEngines(ctx)
@@ -1729,11 +1734,14 @@ func (s *Server) Start(ctx context.Context) error {
 
 	s.mux.Handle(adminPrefix, authHandler)
 	// Exempt the health check endpoint from authentication.
+	// This mirrors the handling of /health above.
 	s.mux.Handle("/_admin/v1/health", gwMux)
 	s.mux.Handle(ts.URLPrefix, authHandler)
 	s.mux.Handle(statusPrefix, authHandler)
+	// The /login endpoint is, by definition, available pre-authentication.
 	s.mux.Handle(loginPath, gwMux)
 	s.mux.Handle(logoutPath, authHandler)
+	// The /_status/vars endpoint is not authenticated either. Useful for monitoring.
 	s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
 	log.Event(ctx, "added http endpoints")