From 147552f7a42c8c90ba2d9aa7186119949f4ceb4e Mon Sep 17 00:00:00 2001 From: Cameron Nunez Date: Mon, 7 Feb 2022 11:47:18 -0500 Subject: [PATCH] server: enhance comments surrounding draining This patch improves the comments related to the draining process, fixing incorrect statements and providing more details to make the process more easily understood. Release note: None --- .../settings/settings-for-tenants.txt | 6 +-- docs/generated/settings/settings.html | 6 +-- pkg/kv/kvserver/store.go | 9 ++-- pkg/server/admin.go | 2 +- pkg/server/admin_test.go | 4 +- pkg/server/drain.go | 45 ++++++++++++------- pkg/server/server_sql.go | 11 +++-- pkg/server/tenant_admin.go | 2 +- pkg/sql/pgwire/server.go | 10 ++--- 9 files changed, 57 insertions(+), 38 deletions(-) diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index c665a1c7ccb4..cf8543ce2634 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -60,9 +60,9 @@ server.oidc_authentication.provider_url string sets OIDC provider URL ({provide server.oidc_authentication.redirect_url string https://localhost:8080/oidc/v1/callback sets OIDC redirect URL via a URL string or a JSON string containing a required `redirect_urls` key with an object that maps from region keys to URL strings (URLs should point to your load balancer and must route to the path /oidc/v1/callback) server.oidc_authentication.scopes string openid sets OIDC scopes to include with authentication request (space delimited list of strings, required to start with `openid`) server.rangelog.ttl duration 720h0m0s if nonzero, range log entries older than this duration are deleted every 10m0s. Should not be lowered below 24 hours. -server.shutdown.drain_wait duration 0s the amount of time a server waits in an unready state before proceeding with the rest of the shutdown process (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) -server.shutdown.lease_transfer_wait duration 5s the amount of time a server waits to transfer range leases before proceeding with the rest of the shutdown process (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) -server.shutdown.query_wait duration 10s the server will wait for at least this amount of time for active queries to finish (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) +server.shutdown.drain_wait duration 0s the amount of time a server waits in an unready state before proceeding with a drain (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) +server.shutdown.lease_transfer_wait duration 5s the timeout for a single iteration of the range lease transfer phase of draining (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) +server.shutdown.query_wait duration 10s the timeout for waiting for active queries to finish during a drain (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) server.time_until_store_dead duration 5m0s the time after which if there is no new gossiped information about a store, it is considered dead server.user_login.cert_password_method.auto_scram_promotion.enabled boolean true whether to automatically promote cert-password authentication to use SCRAM server.user_login.min_password_length integer 1 the minimum length accepted for passwords set in cleartext via SQL. Note that a value lower than 1 is ignored: passwords cannot be empty in any case. diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 4f1e1b6f9ad7..5595bcf2481c 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -71,9 +71,9 @@ server.oidc_authentication.redirect_urlstringhttps://localhost:8080/oidc/v1/callbacksets OIDC redirect URL via a URL string or a JSON string containing a required `redirect_urls` key with an object that maps from region keys to URL strings (URLs should point to your load balancer and must route to the path /oidc/v1/callback) server.oidc_authentication.scopesstringopenidsets OIDC scopes to include with authentication request (space delimited list of strings, required to start with `openid`) server.rangelog.ttlduration720h0m0sif nonzero, range log entries older than this duration are deleted every 10m0s. Should not be lowered below 24 hours. -server.shutdown.drain_waitduration0sthe amount of time a server waits in an unready state before proceeding with the rest of the shutdown process (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) -server.shutdown.lease_transfer_waitduration5sthe amount of time a server waits to transfer range leases before proceeding with the rest of the shutdown process (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) -server.shutdown.query_waitduration10sthe server will wait for at least this amount of time for active queries to finish (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) +server.shutdown.drain_waitduration0sthe amount of time a server waits in an unready state before proceeding with a drain (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) +server.shutdown.lease_transfer_waitduration5sthe timeout for a single iteration of the range lease transfer phase of draining (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) +server.shutdown.query_waitduration10sthe timeout for waiting for active queries to finish during a drain (note that the --drain-wait parameter for cockroach node drain may need adjustment after changing this setting) server.time_until_store_deadduration5m0sthe time after which if there is no new gossiped information about a store, it is considered dead server.user_login.cert_password_method.auto_scram_promotion.enabledbooleantruewhether to automatically promote cert-password authentication to use SCRAM server.user_login.min_password_lengthinteger1the minimum length accepted for passwords set in cleartext via SQL. Note that a value lower than 1 is ignored: passwords cannot be empty in any case. diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 9ad8d59a8ed5..63de91819a50 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -188,13 +188,12 @@ var queueAdditionOnSystemConfigUpdateBurst = settings.RegisterIntSetting( settings.NonNegativeInt, ) -// leaseTransferWait limits the amount of time a drain command waits for lease -// and Raft leadership transfers. +// leaseTransferWait is the timeout for a single iteration of draining range leases. var leaseTransferWait = func() *settings.DurationSetting { s := settings.RegisterDurationSetting( settings.TenantWritable, leaseTransferWaitSettingName, - "the amount of time a server waits to transfer range leases before proceeding with the rest of the shutdown process "+ + "the timeout for a single iteration of the range lease transfer phase of draining "+ "(note that the --drain-wait parameter for cockroach node drain may need adjustment "+ "after changing this setting)", 5*time.Second, @@ -1355,6 +1354,10 @@ func (s *Store) AnnotateCtx(ctx context.Context) context.Context { // range leases, and attempts to transfer away any leases owned. // When called with 'false', returns to the normal mode of operation. // +// Note: this code represents ONE round of draining. This code is iterated on +// indefinitely until all leases are transferred away. +// This iteration can be found here: pkg/cli/start.go, pkg/cli/quit.go. +// // The reporter callback, if non-nil, is called on a best effort basis // to report work that needed to be done and which may or may not have // been done by the time this call returns. See the explanation in diff --git a/pkg/server/admin.go b/pkg/server/admin.go index a384e0f66d13..7e5eab421e49 100644 --- a/pkg/server/admin.go +++ b/pkg/server/admin.go @@ -1811,7 +1811,7 @@ func (s *adminServer) checkReadinessForHealthCheck(ctx context.Context) error { return status.Errorf(codes.Unavailable, "node is shutting down") } - if !s.server.sqlServer.acceptingClients.Get() { + if !s.server.sqlServer.isReady.Get() { return status.Errorf(codes.Unavailable, "node is not accepting SQL clients") } diff --git a/pkg/server/admin_test.go b/pkg/server/admin_test.go index f3819e377c9e..3123b8cc2169 100644 --- a/pkg/server/admin_test.go +++ b/pkg/server/admin_test.go @@ -1379,13 +1379,13 @@ func TestHealthAPI(t *testing.T) { }) // Make the SQL listener appear unavailable. Verify that health fails after that. - ts.sqlServer.acceptingClients.Set(false) + ts.sqlServer.isReady.Set(false) var resp serverpb.HealthResponse err := getAdminJSONProto(s, "health?ready=1", &resp) if err == nil { t.Error("server appears ready even though SQL listener is not") } - ts.sqlServer.acceptingClients.Set(true) + ts.sqlServer.isReady.Set(true) err = getAdminJSONProto(s, "health?ready=1", &resp) if err != nil { t.Errorf("server not ready after SQL listener is ready again: %v", err) diff --git a/pkg/server/drain.go b/pkg/server/drain.go index ab258f6c1106..aee0cf46b08c 100644 --- a/pkg/server/drain.go +++ b/pkg/server/drain.go @@ -33,7 +33,7 @@ var ( queryWait = settings.RegisterDurationSetting( settings.TenantWritable, "server.shutdown.query_wait", - "the server will wait for at least this amount of time for active queries to finish "+ + "the timeout for waiting for active queries to finish during a drain "+ "(note that the --drain-wait parameter for cockroach node drain may need adjustment "+ "after changing this setting)", 10*time.Second, @@ -42,8 +42,7 @@ var ( drainWait = settings.RegisterDurationSetting( settings.TenantWritable, "server.shutdown.drain_wait", - "the amount of time a server waits in an unready state before proceeding with the rest "+ - "of the shutdown process "+ + "the amount of time a server waits in an unready state before proceeding with a drain "+ "(note that the --drain-wait parameter for cockroach node drain may need adjustment "+ "after changing this setting)", 0*time.Second, @@ -231,6 +230,10 @@ func delegateDrain( } // runDrain idempotently activates the draining mode. +// Note: this represents ONE round of draining. This code is iterated on +// indefinitely until all range leases have been drained. +// This iteration can be found here: pkg/cli/start.go, pkg/cli/quit.go. +// // Note: new code should not be taught to use this method // directly. Use the Drain() RPC instead with a suitably crafted // DrainRequest. @@ -279,17 +282,17 @@ func (s *drainServer) runDrain( func (s *drainServer) drainInner( ctx context.Context, reporter func(int, redact.SafeString), verbose bool, ) (err error) { - // First drain all clients and SQL leases. + // Drain the SQL layer. + // Drains all SQL connections, distributed SQL execution flows, and SQL table leases. if err = s.drainClients(ctx, reporter); err != nil { return err } - // Finally, mark the node as draining in liveness and drain the - // range leases. + // Mark the node as draining in liveness and drain all range leases. return s.drainNode(ctx, reporter, verbose) } -// isDraining returns true if either clients are being drained -// or one of the stores on the node is not accepting replicas. +// isDraining returns true if either SQL client connections are being drained +// or if one of the stores on the node is not accepting replicas. func (s *drainServer) isDraining() bool { return s.sqlServer.pgServer.IsDraining() || (s.kvServer.node != nil && s.kvServer.node.IsDraining()) } @@ -299,26 +302,34 @@ func (s *drainServer) drainClients( ctx context.Context, reporter func(int, redact.SafeString), ) error { shouldDelayDraining := !s.isDraining() - // Mark the server as draining in a way that probes to - // /health?ready=1 will notice. + + // Set the gRPC mode of the node to "draining" and mark the node as "not ready". + // Probes to /health?ready=1 will now notice the change in the node's readiness. s.grpc.setMode(modeDraining) - s.sqlServer.acceptingClients.Set(false) - // Wait for drainUnreadyWait. This will fail load balancer checks and - // delay draining so that client traffic can move off this node. + s.sqlServer.isReady.Set(false) + + // Wait the duration of drainWait. + // This will fail load balancer checks and delay draining so that client + // traffic can move off this node. // Note delay only happens on first call to drain. if shouldDelayDraining { s.drainSleepFn(drainWait.Get(&s.sqlServer.execCfg.Settings.SV)) } - // Disable incoming SQL clients up to the queryWait timeout. + // Drain all SQL connections. + // The queryWait duration is a timeout for waiting on clients + // to self-disconnect. If the timeout is reached, any remaining connections + // will be closed. queryMaxWait := queryWait.Get(&s.sqlServer.execCfg.Settings.SV) if err := s.sqlServer.pgServer.Drain(ctx, queryMaxWait, reporter); err != nil { return err } - // Stop ongoing SQL execution up to the queryWait timeout. + + // Drain all distributed SQL execution flows. + // The queryWait duration is used to wait on currently running flows to finish. s.sqlServer.distSQLServer.Drain(ctx, queryMaxWait, reporter) - // Drain the SQL leases. This must be done after the pgServer has + // Drain all SQL table leases. This must be done after the pgServer has // given sessions a chance to finish ongoing work. s.sqlServer.leaseMgr.SetDraining(ctx, true /* drain */, reporter) @@ -335,8 +346,10 @@ func (s *drainServer) drainNode( // No KV subsystem. Nothing to do. return nil } + // Set the node's liveness status to "draining". if err = s.kvServer.nodeLiveness.SetDraining(ctx, true /* drain */, reporter); err != nil { return err } + // Mark the stores of the node as "draining" and drain all range leases. return s.kvServer.node.SetDraining(true /* drain */, reporter, verbose) } diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 78756794a7d6..613955b378ed 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -157,9 +157,12 @@ type SQLServer struct { // SQL listeners in AcceptClients(). connManager netutil.Server - // set to true when the server has started accepting client conns. - // Used by health checks. - acceptingClients syncutil.AtomicBool + // isReady is the health status of the node. When true, the node is healthy; + // load balancers and connection management tools treat the node as "ready". + // When false, the node is unhealthy or "not ready", with load balancers and + // connection management tools learning this status from health checks. + // This is set to true when the server has started accepting client conns. + isReady syncutil.AtomicBool } // sqlServerOptionalKVArgs are the arguments supplied to newSQLServer which are @@ -1332,7 +1335,7 @@ func (s *SQLServer) startServeSQL( } } - s.acceptingClients.Set(true) + s.isReady.Set(true) return nil } diff --git a/pkg/server/tenant_admin.go b/pkg/server/tenant_admin.go index c7d6f68637bf..9511fc7ccba0 100644 --- a/pkg/server/tenant_admin.go +++ b/pkg/server/tenant_admin.go @@ -92,7 +92,7 @@ func (t *tenantAdminServer) Health( return resp, nil } - if !t.sqlServer.acceptingClients.Get() { + if !t.sqlServer.isReady.Get() { return nil, status.Errorf(codes.Unavailable, "node is not accepting SQL clients") } diff --git a/pkg/sql/pgwire/server.go b/pkg/sql/pgwire/server.go index cda8fa1f1aff..32e0baba792a 100644 --- a/pkg/sql/pgwire/server.go +++ b/pkg/sql/pgwire/server.go @@ -385,8 +385,8 @@ func (s *Server) Metrics() (res []interface{}) { } } -// Drain prevents new connections from being served and waits for drainWait for -// open connections to terminate before canceling them. +// Drain prevents new connections from being served and waits the duration of +// queryWait for open connections to terminate before canceling them. // An error will be returned when connections that have been canceled have not // responded to this cancellation and closed themselves in time. The server // will remain in draining state, though open connections may continue to @@ -400,9 +400,9 @@ func (s *Server) Metrics() (res []interface{}) { // been done by the time this call returns. See the explanation in // pkg/server/drain.go for details. func (s *Server) Drain( - ctx context.Context, drainWait time.Duration, reporter func(int, redact.SafeString), + ctx context.Context, queryWait time.Duration, reporter func(int, redact.SafeString), ) error { - return s.drainImpl(ctx, drainWait, cancelMaxWait, reporter) + return s.drainImpl(ctx, queryWait, cancelMaxWait, reporter) } // Undrain switches the server back to the normal mode of operation in which @@ -489,7 +489,7 @@ func (s *Server) drainImpl( } }() - // Wait for all connections to finish up to drainWait. + // Wait for connections to finish up their queries for the duration of queryWait. select { case <-time.After(queryWait): log.Ops.Warningf(ctx, "canceling all sessions after waiting %s", queryWait)