Skip to content

Commit

Permalink
server: wait for SQL readiness in the /health?ready=1 probe
Browse files Browse the repository at this point in the history
Release note (api change): the health API now checks that the SQL
server is ready to accept clients when a readiness check is requested.
  • Loading branch information
knz committed Jan 23, 2021
1 parent 2b2d350 commit 3f0c542
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 4 deletions.
8 changes: 6 additions & 2 deletions pkg/server/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -1358,13 +1358,13 @@ func (s *adminServer) Health(
return resp, nil
}

if err := s.checkReadinessForHealthCheck(); err != nil {
if err := s.checkReadinessForHealthCheck(ctx); err != nil {
return nil, err
}
return resp, nil
}

func (s *adminServer) checkReadinessForHealthCheck() error {
func (s *adminServer) checkReadinessForHealthCheck(ctx context.Context) error {
serveMode := s.server.grpc.mode.get()
switch serveMode {
case modeInitializing:
Expand Down Expand Up @@ -1397,6 +1397,10 @@ func (s *adminServer) checkReadinessForHealthCheck() error {
return status.Errorf(codes.Unavailable, "node is shutting down")
}

if !s.server.sqlServer.acceptingClients.Get() {
return status.Errorf(codes.Unavailable, "node is not accepting SQL clients")
}

return nil
}

Expand Down
16 changes: 14 additions & 2 deletions pkg/server/admin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1355,6 +1355,7 @@ func TestHealthAPI(t *testing.T) {

s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
defer s.Stopper().Stop(ctx)
ts := s.(*TestServer)

// We need to retry because the node ID isn't set until after
// bootstrapping.
Expand All @@ -1363,15 +1364,26 @@ func TestHealthAPI(t *testing.T) {
return getAdminJSONProto(s, "health", &resp)
})

// Make the SQL listener appear unavailable. Verify that health fails after that.
ts.sqlServer.acceptingClients.Set(false)
var resp serverpb.HealthResponse
err := getAdminJSONProto(s, "health?ready=1", &resp)
if err == nil {
t.Error("server appears ready even though SQL listener is not")
}
ts.sqlServer.acceptingClients.Set(true)
err = getAdminJSONProto(s, "health?ready=1", &resp)
if err != nil {
t.Errorf("server not ready after SQL listener is ready again: %v", err)
}

// Expire this node's liveness record by pausing heartbeats and advancing the
// server's clock.
ts := s.(*TestServer)
defer ts.nodeLiveness.PauseAllHeartbeatsForTest()()
self, ok := ts.nodeLiveness.Self()
assert.True(t, ok)
s.Clock().Update(self.Expiration.ToTimestamp().Add(1, 0).UnsafeToClockTimestamp())

var resp serverpb.HealthResponse
testutils.SucceedsSoon(t, func() error {
err := getAdminJSONProto(s, "health?ready=1", &resp)
if err == nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/server/drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ func (s *Server) drainClients(ctx context.Context, reporter func(int, redact.Saf
// Mark the server as draining in a way that probes to
// /health?ready=1 will notice.
s.grpc.setMode(modeDraining)
s.sqlServer.acceptingClients.Set(false)
// Wait for drainUnreadyWait. This will fail load balancer checks and
// delay draining so that client traffic can move off this node.
time.Sleep(drainWait.Get(&s.st.SV))
Expand Down
3 changes: 3 additions & 0 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -2073,6 +2073,9 @@ func (s *SQLServer) startServeSQL(
return err
}
}

s.acceptingClients.Set(true)

return nil
}

Expand Down
5 changes: 5 additions & 0 deletions pkg/server/server_sql.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/netutil"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/marusama/semaphore"
Expand Down Expand Up @@ -111,6 +112,10 @@ type SQLServer struct {
// connManager is the connection manager to use to set up additional
// SQL listeners in AcceptClients().
connManager netutil.Server

// set to true when the server has started accepting client conns.
// Used by health checks.
acceptingClients syncutil.AtomicBool
}

// sqlServerOptionalKVArgs are the arguments supplied to newSQLServer which are
Expand Down

0 comments on commit 3f0c542

Please sign in to comment.