Skip to content

Commit

Permalink
Merge #45119
Browse files Browse the repository at this point in the history
45119: server: fix broken /health r=tbg,andreimatei a=knz

Informs #44832.
Fixes #45020.

This patch fixes `/health` by ensuring it does not require
authentication, does not perform KV operations and does not expose
sensitive details.

Reminder, for posterity:

- `/_status/details` exposes health, readiness and node details,
  which is potentially privileged information. This requires an
  authenticated admin user. It also performs a KV operation for
  authentication and thus subject to cluster availability.

- `/_admin/v1/health` only exposes health and readiness. It
  is non-privileged and does not require authentication. It does
  not perform KV operations and is thus not subject to cluster
  availability.

- `/health` is now an alias for `/_admin/v1/health`.  (It used to be a
  non-authenticated alias for `/_status/details` which was both a UX
  and security bug. See release notes below for details.)

- both `/health` and `/_admin/v1/health` accept a boolean flag, e.g.
  via `?ready=1`. When `ready` is specified, an HTTP error is returned
  if the node is up but unable to accept client connections, not
  live, or shutting down. When `ready` is *not* specified, an HTTP
  success is always returned if the node is up, even when it cannot
  accept client connections or is not live.

Release justification: Category 3: Fixes for high-priority or high-severity bugs in existing functionality

Release note (security update): The non-authenticated `/health` HTTP
endpoint was previously exposing the private IP address of the node,
which can be privileged information in some deployments. This has been
corrected. Deployments using automation to retrieve a node's build
details, address details, etc. should use `/_status/details` instead
and supply a valid admin authentication cookie.

Release note (bug fix): Accesses to `/health` using a non-root
authentication token no longer hang when a node is under load
or when a system range is unavailable.

Co-authored-by: Raphael 'kena' Poss <[email protected]>
  • Loading branch information
craig[bot] and knz committed Mar 14, 2020
2 parents 3d8b9d5 + 125a058 commit 33d7147
Show file tree
Hide file tree
Showing 14 changed files with 1,007 additions and 938 deletions.
4 changes: 2 additions & 2 deletions pkg/cli/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@ func waitForClientReadinessAndGetClientGRPCConn(
// Access the /health endpoint. Until/unless this succeeds, the
// node is not yet fully initialized and ready to accept
// Bootstrap requests.
ac := serverpb.NewStatusClient(conn)
_, err := ac.Details(ctx, &serverpb.DetailsRequest{})
ac := serverpb.NewAdminClient(conn)
_, err := ac.Health(ctx, &serverpb.HealthRequest{})
return err
}); err != nil {
err = errors.Wrapf(err, "node not ready to perform cluster initialization")
Expand Down
2 changes: 1 addition & 1 deletion pkg/cli/zip.go
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ func runDebugZip(cmd *cobra.Command, args []string) error {
for _, r := range []zipRequest{
{
fn: func(ctx context.Context) (interface{}, error) {
return status.Details(ctx, &serverpb.DetailsRequest{NodeId: id, Ready: false})
return status.Details(ctx, &serverpb.DetailsRequest{NodeId: id})
},
pathName: prefix + "/details",
},
Expand Down
59 changes: 52 additions & 7 deletions pkg/server/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/security"
"github.com/cockroachdb/cockroach/pkg/server/debug"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/server/telemetry"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql"
Expand Down Expand Up @@ -1294,19 +1295,63 @@ func (s *adminServer) Cluster(

// Health returns liveness for the node target of the request.
//
// See the docstring for HealthRequest for more details about
// what this function precisely reports.
//
// Note: Health is non-privileged and non-authenticated and thus
// must not report privileged information.
func (s *adminServer) Health(
	ctx context.Context, req *serverpb.HealthRequest,
) (*serverpb.HealthResponse, error) {
	telemetry.Inc(telemetryHealthCheck)

	resp := &serverpb.HealthResponse{}
	// If Ready is not set, the client doesn't want to know whether this node is
	// ready to receive client traffic.
	if !req.Ready {
		return resp, nil
	}

	// The client requested a readiness check; translate any readiness
	// failure into an error so the HTTP gateway reports 503.
	if err := s.checkReadinessForHealthCheck(); err != nil {
		return nil, err
	}
	return resp, nil
}

func (s *adminServer) checkReadinessForHealthCheck() error {
serveMode := s.server.grpc.mode.get()
switch serveMode {
case modeInitializing:
return status.Error(codes.Unavailable, "node is waiting for cluster initialization")
case modeDraining:
// grpc.mode is set to modeDraining when the Drain(DrainMode_CLIENT) has
// been called (client connections are to be drained).
return status.Errorf(codes.Unavailable, "node is shutting down")
case modeOperational:
break
default:
return s.serverError(errors.Newf("unknown mode: %v", serveMode))
}

// TODO(knz): update this code when progress is made on
// https://github.com/cockroachdb/cockroach/issues/45123
l, err := s.server.nodeLiveness.GetLiveness(s.server.NodeID())
if err != nil {
return nil, status.Errorf(codes.Internal, err.Error())
return s.serverError(err)
}
if !isLive {
return nil, status.Errorf(codes.Unavailable, "node is not live")
if !l.IsLive(s.server.clock.Now().GoTime()) {
return status.Errorf(codes.Unavailable, "node is not healthy")
}
return &serverpb.HealthResponse{}, nil
if l.Draining {
// l.Draining indicates that the node is draining leases.
// This is set when Drain(DrainMode_LEASES) is called.
// It's possible that l.Draining is set without
// grpc.mode being modeDraining, if a RPC client
// has requested DrainMode_LEASES but not DrainMode_CLIENT.
return status.Errorf(codes.Unavailable, "node is shutting down")
}

return nil
}

// getLivenessStatusMap generates a map from NodeID to LivenessStatus for all
Expand Down
37 changes: 20 additions & 17 deletions pkg/server/admin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1230,8 +1230,11 @@ func TestClusterAPI(t *testing.T) {

func TestHealthAPI(t *testing.T) {
defer leaktest.AfterTest(t)()

ctx := context.Background()

s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
defer s.Stopper().Stop(context.TODO())
defer s.Stopper().Stop(ctx)

// We need to retry because the node ID isn't set until after
// bootstrapping.
Expand All @@ -1250,24 +1253,24 @@ func TestHealthAPI(t *testing.T) {
}
s.Clock().Update(hlc.Timestamp(self.Expiration).Add(1, 0))

expected := "503 Service Unavailable"
var resp serverpb.HealthResponse
for {
if err := getAdminJSONProto(s, "health", &resp); !testutils.IsError(err, expected) {
type timeouter interface {
Timeout() bool
}
if _, ok := err.(timeouter); ok {
// Special case for `*http.httpError` which can happen since we
// have timeouts on our requests and things may not be going so smoothly
// on the server side. See:
// https://github.com/cockroachdb/cockroach/issues/18469
log.Warningf(context.Background(), "ignoring timeout error: %s (%T)", err, err)
continue
}
t.Errorf("expected %q error, got %v (%T)", expected, err, err)
testutils.SucceedsSoon(t, func() error {
err := getAdminJSONProto(s, "health?ready=1", &resp)
if err == nil {
return errors.New("health OK, still waiting for unhealth")
}

t.Logf("observed error: %v", err)
if !testutils.IsError(err, `(?s)503 Service Unavailable.*"error": "node is not healthy"`) {
return err
}
break
return nil
})

// After the node reports an error with `?ready=1`, the health
// endpoint must still succeed without error when `?ready=1` is not specified.
if err := getAdminJSONProto(s, "health", &resp); err != nil {
t.Fatal(err)
}
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/server/grpc_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ var rpcsAllowedWhileBootstrapping = map[string]struct{}{
"/cockroach.rpc.Heartbeat/Ping": {},
"/cockroach.gossip.Gossip/Gossip": {},
"/cockroach.server.serverpb.Init/Bootstrap": {},
"/cockroach.server.serverpb.Status/Details": {},
"/cockroach.server.serverpb.Admin/Health": {},
}

// intercept implements filtering rules for each server state.
Expand Down
8 changes: 8 additions & 0 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -1392,6 +1392,11 @@ func (s *Server) Start(ctx context.Context) error {
return err
}
}
// Handle /health early. This is necessary for orchestration. Note
// that /health is not authenticated, on purpose. This is both
// because it needs to be available before the cluster is up and can
// serve authentication requests, and also because it must work for
// monitoring tools which operate without authentication.
s.mux.Handle("/health", gwMux)

s.engines, err = s.cfg.CreateEngines(ctx)
Expand Down Expand Up @@ -1729,11 +1734,14 @@ func (s *Server) Start(ctx context.Context) error {

s.mux.Handle(adminPrefix, authHandler)
// Exempt the health check endpoint from authentication.
// This mirrors the handling of /health above.
s.mux.Handle("/_admin/v1/health", gwMux)
s.mux.Handle(ts.URLPrefix, authHandler)
s.mux.Handle(statusPrefix, authHandler)
// The /login endpoint is, by definition, available pre-authentication.
s.mux.Handle(loginPath, gwMux)
s.mux.Handle(logoutPath, authHandler)
// The /_status/vars endpoint is not authenticated either. Useful for monitoring.
s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
log.Event(ctx, "added http endpoints")

Expand Down
Loading

0 comments on commit 33d7147

Please sign in to comment.