Skip to content

Commit

Permalink
Merge pull request #47692 from knz/20200420-backport20.1-45149-46396
Browse files Browse the repository at this point in the history
  • Loading branch information
knz authored May 2, 2020
2 parents 48092ca + 3c5a671 commit e27059e
Show file tree
Hide file tree
Showing 37 changed files with 1,797 additions and 975 deletions.
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
<tr><td><code>server.rangelog.ttl</code></td><td>duration</td><td><code>720h0m0s</code></td><td>if nonzero, range log entries older than this duration are deleted every 10m0s. Should not be lowered below 24 hours.</td></tr>
<tr><td><code>server.remote_debugging.mode</code></td><td>string</td><td><code>local</code></td><td>set to enable remote debugging, localhost-only or disable (any, local, off)</td></tr>
<tr><td><code>server.shutdown.drain_wait</code></td><td>duration</td><td><code>0s</code></td><td>the amount of time a server waits in an unready state before proceeding with the rest of the shutdown process</td></tr>
<tr><td><code>server.shutdown.lease_transfer_wait</code></td><td>duration</td><td><code>5s</code></td><td>the amount of time a server waits to transfer range leases before proceeding with the rest of the shutdown process</td></tr>
<tr><td><code>server.shutdown.query_wait</code></td><td>duration</td><td><code>10s</code></td><td>the server will wait for at least this amount of time for active queries to finish</td></tr>
<tr><td><code>server.time_until_store_dead</code></td><td>duration</td><td><code>5m0s</code></td><td>the time after which if there is no new gossiped information about a store, it is considered dead</td></tr>
<tr><td><code>server.user_login.timeout</code></td><td>duration</td><td><code>10s</code></td><td>timeout after which client authentication times out if some system range is unavailable (0 = no timeout)</td></tr>
Expand Down
4 changes: 2 additions & 2 deletions pkg/cli/cli_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1387,11 +1387,11 @@ Available Commands:
start-single-node start a single-node cluster
init initialize a cluster
cert create ca, node, and client certs
quit drain and shutdown node
quit drain and shut down a node
sql open a sql shell
auth-session log in and out of HTTP sessions
node list, inspect or remove nodes
node list, inspect, drain or remove nodes
dump dump sql tables
nodelocal upload and delete nodelocal files
Expand Down
7 changes: 7 additions & 0 deletions pkg/cli/cliflags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,13 @@ long and not particularly human-readable.`,
Description: `Deprecated: use 'node decommission' instead.`,
}

// DrainWait describes the "drain-wait" flag, which sets how long to
// wait for a node to drain its client connections and move its range
// leases elsewhere.
DrainWait = FlagInfo{
Name: "drain-wait",
Description: `
When non-zero, wait for the specified amount of time for the node to
drain all active client connections and migrate away range leases.`,
}

Wait = FlagInfo{
Name: "wait",
Description: `
Expand Down
9 changes: 8 additions & 1 deletion pkg/cli/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ func initCLIDefaults() {
startCtx.inBackground = false

quitCtx.serverDecommission = false
quitCtx.drainWait = 10 * time.Minute

nodeCtx.nodeDecommissionWait = nodeDecommissionWaitAll
nodeCtx.statusShowRanges = false
Expand Down Expand Up @@ -334,10 +335,16 @@ var startCtx struct {
logDir log.DirName
}

// quitCtx captures the command-line parameters of the `quit` command.
// quitCtx captures the command-line parameters of the `quit` and
// `node drain` commands.
// Defaults are set by initCLIDefaults() above.
var quitCtx struct {
// serverDecommission indicates the server should be decommissioned
// before it is drained.
serverDecommission bool
// drainWait is the amount of time to wait for the server
// to drain. Set to 0 to disable a timeout (let the server decide).
// The value is populated from the drain-wait command-line flag.
drainWait time.Duration
}

// nodeCtx captures the command-line parameters of the `node` command.
Expand Down
7 changes: 1 addition & 6 deletions pkg/cli/demo_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,12 +353,7 @@ func (c *transientCluster) DrainNode(nodeID roachpb.NodeID) error {
}
defer finish()

onModes := make([]int32, len(server.GracefulDrainModes))
for i, m := range server.GracefulDrainModes {
onModes[i] = int32(m)
}

if err := doShutdown(ctx, adminClient, onModes); err != nil {
if err := drainAndShutdown(ctx, adminClient); err != nil {
return err
}
c.servers[nodeIndex] = nil
Expand Down
7 changes: 7 additions & 0 deletions pkg/cli/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,13 @@ func init() {
_ = f.MarkDeprecated(cliflags.Decommission.Name, `use 'cockroach node decommission' then 'cockroach quit' instead`)
}

// Quit and node drain.
for _, cmd := range []*cobra.Command{quitCmd, drainNodeCmd} {
f := cmd.Flags()
DurationFlag(f, &quitCtx.drainWait, cliflags.DrainWait, quitCtx.drainWait)
}

// SQL and demo commands.
for _, cmd := range append([]*cobra.Command{sqlShellCmd, demoCmd}, demoCmd.Commands()...) {
f := cmd.Flags()
VarFlag(f, &sqlCtx.setStmts, cliflags.Set)
Expand Down
2 changes: 1 addition & 1 deletion pkg/cli/interactive_tests/test_missing_log_output.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ start_test "Test that quit does not show INFO by default with --logtostderr"
# that the default logging level is WARNING, so that no INFO messages
# are printed between the marker and the (first line) error message
# from quit. Quit will error out because the server is already stopped.
send "echo marker; $argv quit --logtostderr 2>&1 | grep -vE '^\[WEF\]\[0-9\]+'\r"
send "echo marker; $argv quit --logtostderr 2>&1 | grep -vE '^\[WEF\]\[0-9\]+|^node is draining'\r"
eexpect "marker\r\nok"
eexpect ":/# "
end_test
Expand Down
41 changes: 39 additions & 2 deletions pkg/cli/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -493,18 +493,55 @@ func runRecommissionNode(cmd *cobra.Command, args []string) error {
return printDecommissionStatus(*resp)
}

// drainNodeCmd is the `drain` sub-command: it drains a node without
// shutting it down. It accepts no positional arguments and runs
// runDrain, wrapped by MaybeDecorateGRPCError.
var drainNodeCmd = &cobra.Command{
Use: "drain",
Short: "drain a node without shutting it down",
Long: `
Prepare a server for shutting down. This stops accepting client
connections, stops extant connections, and finally pushes range
leases onto other nodes, subject to various timeout parameters
configurable via cluster settings.`,
Args: cobra.NoArgs,
RunE: MaybeDecorateGRPCError(runDrain),
}

// runDrain is the entry point of the `drain` sub-command. It calls the
// Drain RPC (through doDrain) without the flag to stop the server
// process, and prints "ok" on stdout when no error occurred.
func runDrain(cmd *cobra.Command, args []string) (err error) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Registered before the connection is opened, so it runs after the
	// connection cleanup below. It inspects the named result err to
	// decide whether to report success.
	defer func() {
		if err != nil {
			return
		}
		fmt.Println("ok")
	}()

	// Open the RPC connection to the admin endpoint.
	adminClient, closeConn, err := getAdminClient(ctx, serverCfg)
	if err != nil {
		return err
	}
	defer closeConn()

	// Only the error result of doDrain is of interest here.
	_, _, err = doDrain(ctx, adminClient)
	return err
}

// nodeCmds lists the sub-commands of the node command.
var nodeCmds = []*cobra.Command{
lsNodesCmd,
statusNodeCmd,
decommissionNodeCmd,
recommissionNodeCmd,
drainNodeCmd,
}

// nodeCmd is the `node` command. Its RunE is usageAndErr.
// NOTE(review): Short and Long each appear twice below; this looks like
// the before/after lines of a diff shown together. Confirm that only the
// "list, inspect, drain or remove nodes" pair is meant to remain.
var nodeCmd = &cobra.Command{
Use: "node [command]",
Short: "list, inspect or remove nodes",
Long: "List, inspect or remove nodes.",
Short: "list, inspect, drain or remove nodes",
Long: "List, inspect, drain or remove nodes.",
RunE: usageAndErr,
}

Expand Down
Loading

0 comments on commit e27059e

Please sign in to comment.