From 3f0c54213315dedaf926d3701f985b4227809453 Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Wed, 20 Jan 2021 14:29:41 +0100 Subject: [PATCH] server: wait for SQL readiness in the `/health?ready=1` probe Release note (api change): the health API now checks that the SQL server is ready to accept clients when a readiness check is requested. --- pkg/server/admin.go | 8 ++++++-- pkg/server/admin_test.go | 16 ++++++++++++++-- pkg/server/drain.go | 1 + pkg/server/server.go | 3 +++ pkg/server/server_sql.go | 5 +++++ 5 files changed, 29 insertions(+), 4 deletions(-) diff --git a/pkg/server/admin.go b/pkg/server/admin.go index 887f9f0035cc..b6e28d60285f 100644 --- a/pkg/server/admin.go +++ b/pkg/server/admin.go @@ -1358,13 +1358,13 @@ func (s *adminServer) Health( return resp, nil } - if err := s.checkReadinessForHealthCheck(); err != nil { + if err := s.checkReadinessForHealthCheck(ctx); err != nil { return nil, err } return resp, nil } -func (s *adminServer) checkReadinessForHealthCheck() error { +func (s *adminServer) checkReadinessForHealthCheck(ctx context.Context) error { serveMode := s.server.grpc.mode.get() switch serveMode { case modeInitializing: @@ -1397,6 +1397,10 @@ func (s *adminServer) checkReadinessForHealthCheck() error { return status.Errorf(codes.Unavailable, "node is shutting down") } + if !s.server.sqlServer.acceptingClients.Get() { + return status.Errorf(codes.Unavailable, "node is not accepting SQL clients") + } + return nil } diff --git a/pkg/server/admin_test.go b/pkg/server/admin_test.go index d9acddbcf8d4..6b0844faf74d 100644 --- a/pkg/server/admin_test.go +++ b/pkg/server/admin_test.go @@ -1355,6 +1355,7 @@ func TestHealthAPI(t *testing.T) { s, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) defer s.Stopper().Stop(ctx) + ts := s.(*TestServer) // We need to retry because the node ID isn't set until after // bootstrapping. @@ -1363,15 +1364,26 @@ func TestHealthAPI(t *testing.T) { return getAdminJSONProto(s, "health", &resp) }) + // Make the SQL listener appear unavailable. Verify that health fails after that. + ts.sqlServer.acceptingClients.Set(false) + var resp serverpb.HealthResponse + err := getAdminJSONProto(s, "health?ready=1", &resp) + if err == nil { + t.Error("server appears ready even though SQL listener is not") + } + ts.sqlServer.acceptingClients.Set(true) + err = getAdminJSONProto(s, "health?ready=1", &resp) + if err != nil { + t.Errorf("server not ready after SQL listener is ready again: %v", err) + } + // Expire this node's liveness record by pausing heartbeats and advancing the // server's clock. - ts := s.(*TestServer) defer ts.nodeLiveness.PauseAllHeartbeatsForTest()() self, ok := ts.nodeLiveness.Self() assert.True(t, ok) s.Clock().Update(self.Expiration.ToTimestamp().Add(1, 0).UnsafeToClockTimestamp()) - var resp serverpb.HealthResponse testutils.SucceedsSoon(t, func() error { err := getAdminJSONProto(s, "health?ready=1", &resp) if err == nil { diff --git a/pkg/server/drain.go b/pkg/server/drain.go index a21e9c3202a6..4b4db5f3d26d 100644 --- a/pkg/server/drain.go +++ b/pkg/server/drain.go @@ -188,6 +188,7 @@ func (s *Server) drainClients(ctx context.Context, reporter func(int, redact.Saf // Mark the server as draining in a way that probes to // /health?ready=1 will notice. s.grpc.setMode(modeDraining) + s.sqlServer.acceptingClients.Set(false) // Wait for drainUnreadyWait. This will fail load balancer checks and // delay draining so that client traffic can move off this node. time.Sleep(drainWait.Get(&s.st.SV)) diff --git a/pkg/server/server.go b/pkg/server/server.go index db1cb44c21ff..316d3d0943ee 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -2073,6 +2073,9 @@ func (s *SQLServer) startServeSQL( return err } } + + s.acceptingClients.Set(true) + return nil } diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 439d0e86e000..9c522ad0b0c7 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -70,6 +70,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/netutil" "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" "github.com/marusama/semaphore" @@ -111,6 +112,10 @@ type SQLServer struct { // connManager is the connection manager to use to set up additional // SQL listeners in AcceptClients(). connManager netutil.Server + + // set to true when the server has started accepting client conns. + // Used by health checks. + acceptingClients syncutil.AtomicBool } // sqlServerOptionalKVArgs are the arguments supplied to newSQLServer which are