Skip to content

Commit

Permalink
roachprod: add stop-sql subcommand
Browse files Browse the repository at this point in the history
This adds a `roachprod stop-sql` command. It is similar to
`roachprod stop` in the sense that it takes a similar set of flags:
the signal to be sent to the processes, whether to wait for the
process to finish, and for how long. However, one crucial difference
is that `roachprod stop` stops *all* processes started by
roachprod (cockroach or otherwise), whereas `stop-sql` was designed
specifically for stopping tenant processes (SQL instances).

To achieve that, we set a `ROACHPROD_VIRTUAL_CLUSTER` environment
variable on the corresponding cockroach process when starting it. The
value of this variable acts as a label, which is then used to find the
correct process to stop when requested.

We also make use of the label to name the systemctl unit when starting
cockroach processes on remote clusters. This allows multiple tenants
to be co-located on the same VM.

Epic: none

Release note: None
  • Loading branch information
renatolabs committed Oct 5, 2023
1 parent 6787167 commit 4032b58
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 63 deletions.
13 changes: 10 additions & 3 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ var (
secure = false
virtualClusterName string
sqlInstance int
virtualClusterID int
extraSSHOptions = ""
nodeEnv []string
tag string
Expand Down Expand Up @@ -216,9 +217,15 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS
startInstanceAsSeparateProcessCmd.Flags().IntVar(&startOpts.SQLInstance,
"sql-instance", 0, "specific SQL/HTTP instance to connect to (this is a roachprod abstraction distinct from the internal instance ID)")

stopCmd.Flags().IntVar(&sig, "sig", sig, "signal to pass to kill")
stopCmd.Flags().BoolVar(&waitFlag, "wait", waitFlag, "wait for processes to exit")
stopCmd.Flags().IntVar(&maxWait, "max-wait", maxWait, "approx number of seconds to wait for processes to exit")
// Flags shared by the commands that stop (kill) processes.
for _, stopProcessesCmd := range []*cobra.Command{stopCmd, stopInstanceAsSeparateProcessCmd} {
stopProcessesCmd.Flags().IntVar(&sig, "sig", sig, "signal to pass to kill")
stopProcessesCmd.Flags().BoolVar(&waitFlag, "wait", waitFlag, "wait for processes to exit")
stopProcessesCmd.Flags().IntVar(&maxWait, "max-wait", maxWait, "approx number of seconds to wait for processes to exit")
}

stopInstanceAsSeparateProcessCmd.Flags().IntVarP(&virtualClusterID, "cluster-id", "t", virtualClusterID, "internal ID for the virtual cluster")
stopInstanceAsSeparateProcessCmd.Flags().IntVar(&sqlInstance, "sql-instance", 0, "specific SQL/HTTP instance to stop")

syncCmd.Flags().BoolVar(&listOpts.IncludeVolumes, "include-volumes", false, "Include volumes when syncing")

Expand Down
41 changes: 40 additions & 1 deletion pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,7 @@ shutdown cockroach. The --wait flag causes stop to loop waiting for all
processes with the right ROACHPROD environment variable to exit. Note that stop
will wait forever if you specify --wait with a non-terminating signal (e.g.
SIGHUP), unless you also configure --max-wait.
--wait defaults to true for signal 9 (SIGKILL) and false for all other signals.
` + tagHelp + `
`,
Expand Down Expand Up @@ -571,6 +572,43 @@ environment variables to the cockroach process.
}),
}

// stopInstanceAsSeparateProcessCmd implements `roachprod stop-sql`. It takes
// the virtual cluster as its single positional argument and forwards the
// signal, wait and max-wait settings, together with the values of the
// --cluster-id and --sql-instance flags, to
// roachprod.StopServiceForVirtualCluster.
var stopInstanceAsSeparateProcessCmd = &cobra.Command{
	// The usage line must name the flags as they are registered: the virtual
	// cluster ID flag is --cluster-id (shorthand -t), not --tenant-id.
	Use:   "stop-sql <virtual-cluster> --cluster-id <id> --sql-instance <instance> [--sig] [--wait]",
	Short: "stop sql instances on a cluster",
	Long: `Stop sql instances on a cluster.
Stop roachprod created sql instances running on the nodes in a cluster. Every
process started by roachprod is tagged with a ROACHPROD environment variable
which is used by "stop-sql" to locate the processes and terminate them. By default,
processes are killed with signal 9 (SIGKILL) giving them no chance for a graceful
exit.
The --sig flag will pass a signal to kill to allow us finer control over how we
shutdown processes. The --wait flag causes stop to loop waiting for all
processes with the right ROACHPROD environment variable to exit. Note that stop
will wait forever if you specify --wait with a non-terminating signal (e.g.
SIGHUP), unless you also configure --max-wait.
--wait defaults to true for signal 9 (SIGKILL) and false for all other signals.
`,
	Args: cobra.ExactArgs(1),
	Run: wrap(func(cmd *cobra.Command, args []string) error {
		// An explicitly passed --wait always wins. When the flag was not
		// given, SIGKILL implies waiting for the processes to exit.
		wait := waitFlag
		if sig == 9 /* SIGKILL */ && !cmd.Flags().Changed("wait") {
			wait = true
		}
		stopOpts := roachprod.StopOpts{
			Wait:             wait,
			MaxWait:          maxWait,
			Sig:              sig,
			VirtualClusterID: virtualClusterID,
			SQLInstance:      sqlInstance,
		}
		virtualCluster := args[0]
		return roachprod.StopServiceForVirtualCluster(context.Background(), config.Logger, virtualCluster, stopOpts)
	}),
}

var initCmd = &cobra.Command{
Use: "init <cluster>",
Short: "initialize the cluster",
Expand Down Expand Up @@ -679,7 +717,7 @@ of nodes, outputting a line whenever a change is detected:
var signalCmd = &cobra.Command{
Use: "signal <cluster> <signal>",
Short: "send signal to cluster",
Long: "Send a POSIX signal to the nodes in a cluster, specified by its integer code.",
Long: "Send a POSIX signal, specified by its integer code, to every process started via roachprod in a cluster.",
Args: cobra.ExactArgs(2),
Run: wrap(func(cmd *cobra.Command, args []string) error {
sig, err := strconv.ParseInt(args[1], 10, 8)
Expand Down Expand Up @@ -1361,6 +1399,7 @@ func main() {
startCmd,
stopCmd,
startInstanceAsSeparateProcessCmd,
stopInstanceAsSeparateProcessCmd,
initCmd,
runCmd,
signalCmd,
Expand Down
61 changes: 43 additions & 18 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,14 +292,18 @@ func (c *SyncedCluster) roachprodEnvValue(node Node) string {
return strings.Join(parts, "/")
}

// envVarRegex returns a regular expression (as a string) that matches the
// environment variable assignment `name=value`. Forward slashes in value are
// backslash-escaped. The assignment matches when it is followed by the end of
// the input (the variable may be the last one declared), by a space, or by a
// slash (in which case any remaining tag suffix is tolerated).
func envVarRegex(name, value string) string {
	// Splitting on "/" and re-joining with `\/` escapes every slash.
	assignment := fmt.Sprintf("%s=%s", name, strings.Join(strings.Split(value, "/"), `\/`))
	return "(" + assignment + "$|" + assignment + `[ \/])`
}

// roachprodEnvRegex returns a regexp that matches the ROACHPROD value for the
// given node.
func (c *SyncedCluster) roachprodEnvRegex(node Node) string {
escaped := strings.Replace(c.roachprodEnvValue(node), "/", "\\/", -1)
// We look for either a trailing space or a slash (in which case, we tolerate
// any remaining tag suffix). ROACHPROD may also be the last environment
// variable declared, so we also account for that.
return fmt.Sprintf(`(ROACHPROD=%[1]s$|ROACHPROD=%[1]s[ \/])`, escaped)
return envVarRegex("ROACHPROD", c.roachprodEnvValue(node))
}

// validateHostnameCmd wraps the command given with a check that the
Expand Down Expand Up @@ -394,36 +398,47 @@ func (c *SyncedCluster) newSession(
// Stop is used to stop cockroach on all nodes in the cluster.
//
// It sends a signal to all processes that have been started with ROACHPROD env
// var and optionally waits until the processes stop.
// var and optionally waits until the processes stop. If the virtualClusterLabel
// is not empty, then only sql processes with a matching label are stopped.
//
// When running roachprod stop without other flags, the signal is 9 (SIGKILL)
// and wait is true.
//
// If maxWait is non-zero, Stop stops waiting after that approximate
// number of seconds.
func (c *SyncedCluster) Stop(
ctx context.Context, l *logger.Logger, sig int, wait bool, maxWait int,
ctx context.Context,
l *logger.Logger,
sig int,
wait bool,
maxWait int,
virtualClusterLabel string,
) error {
display := fmt.Sprintf("%s: stopping", c.Name)
if wait {
display += " and waiting"
}
return c.kill(ctx, l, "stop", display, sig, wait, maxWait)
return c.kill(ctx, l, "stop", display, sig, wait, maxWait, virtualClusterLabel)
}

// Signal sends a signal to the CockroachDB process.
func (c *SyncedCluster) Signal(ctx context.Context, l *logger.Logger, sig int) error {
display := fmt.Sprintf("%s: sending signal %d", c.Name, sig)
return c.kill(ctx, l, "signal", display, sig, false /* wait */, 0 /* maxWait */)
return c.kill(ctx, l, "signal", display, sig, false /* wait */, 0 /* maxWait */, "")
}

// kill sends the signal sig to all nodes in the cluster using the kill command.
// cmdName and display specify the roachprod subcommand and a status message,
// for output/logging. If wait is true, the command will wait for the processes
// to exit, up to maxWait seconds.
// TODO(herko): This command does not support virtual clusters yet.
func (c *SyncedCluster) kill(
ctx context.Context, l *logger.Logger, cmdName, display string, sig int, wait bool, maxWait int,
ctx context.Context,
l *logger.Logger,
cmdName, display string,
sig int,
wait bool,
maxWait int,
virtualClusterLabel string,
) error {
if sig == 9 {
// `kill -9` without wait is never what a caller wants. See #77334.
Expand Down Expand Up @@ -454,24 +469,34 @@ func (c *SyncedCluster) kill(
)
}

var virtualClusterFilter string
if virtualClusterLabel != "" {
virtualClusterFilter = fmt.Sprintf(
"grep -E '%s' |",
envVarRegex("ROACHPROD_VIRTUAL_CLUSTER", virtualClusterLabel),
)
}

// NB: the awkward-looking `awk` invocation serves to avoid having the
// awk process match its own output from `ps`.
cmd := fmt.Sprintf(`
mkdir -p %[1]s
echo ">>> roachprod %[1]s: $(date)" >> %[2]s/roachprod.log
ps axeww -o pid -o command >> %[2]s/roachprod.log
pids=$(ps axeww -o pid -o command | \
%[3]s \
sed 's/export ROACHPROD=//g' | \
awk '/%[3]s/ { print $1 }')
awk '/%[4]s/ { print $1 }')
if [ -n "${pids}" ]; then
kill -%[4]d ${pids}
%[5]s
kill -%[5]d ${pids}
%[6]s
fi`,
cmdName, // [1]
c.LogDir(node, "", 0), // [2]
c.roachprodEnvRegex(node), // [3]
sig, // [4]
waitCmd, // [5]
virtualClusterFilter, // [3]
c.roachprodEnvRegex(node), // [4]
sig, // [5]
waitCmd, // [6]
)

return c.runCmdOnSingleNode(ctx, l, node, cmd, defaultCmdOpts("kill"))
Expand All @@ -481,7 +506,7 @@ fi`,
// Wipe TODO(peter): document
func (c *SyncedCluster) Wipe(ctx context.Context, l *logger.Logger, preserveCerts bool) error {
display := fmt.Sprintf("%s: wiping", c.Name)
if err := c.Stop(ctx, l, 9, true /* wait */, 0 /* maxWait */); err != nil {
if err := c.Stop(ctx, l, 9, true /* wait */, 0 /* maxWait */, ""); err != nil {
return err
}
return c.Parallel(ctx, l, c.Nodes, func(ctx context.Context, node Node) (*RunResultDetails, error) {
Expand Down
39 changes: 26 additions & 13 deletions pkg/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,23 +566,36 @@ func (c *SyncedCluster) generateStartCmd(
"GOTRACEBACK=crash",
"COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1",
}, c.Env...), getEnvVars()...),
Binary: cockroachNodeBinary(c, node),
Args: args,
MemoryMax: config.MemoryMax,
NumFilesLimit: startOpts.NumFilesLimit,
Local: c.IsLocal(),
Binary: cockroachNodeBinary(c, node),
Args: args,
MemoryMax: config.MemoryMax,
NumFilesLimit: startOpts.NumFilesLimit,
VirtualClusterLabel: VirtualClusterLabel(startOpts.VirtualClusterName, startOpts.SQLInstance),
Local: c.IsLocal(),
})
}

type startTemplateData struct {
Local bool
LogDir string
Binary string
KeyCmd string
MemoryMax string
NumFilesLimit int64
Args []string
EnvVars []string
Local bool
LogDir string
Binary string
KeyCmd string
MemoryMax string
NumFilesLimit int64
VirtualClusterLabel string
Args []string
EnvVars []string
}

// VirtualClusterLabel returns the value used to "label" a virtual cluster
// (cockroach) process, whether it runs locally or in a VM, so that roachprod
// can identify such processes. An empty name or SystemInterfaceName yields
// "cockroach-system"; any other name yields
// "cockroach-<virtualClusterName>_<sqlInstance>".
func VirtualClusterLabel(virtualClusterName string, sqlInstance int) string {
	isSystem := virtualClusterName == "" || virtualClusterName == SystemInterfaceName
	if !isSystem {
		return fmt.Sprintf("cockroach-%s_%d", virtualClusterName, sqlInstance)
	}

	return "cockroach-system"
}

func execStartTemplate(data startTemplateData) (string, error) {
Expand Down
19 changes: 10 additions & 9 deletions pkg/roachprod/install/scripts/start.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ BINARY=#{shesc .Binary#}
KEY_CMD=#{.KeyCmd#}
MEMORY_MAX=#{.MemoryMax#}
NUM_FILES_LIMIT=#{.NumFilesLimit#}
VIRTUAL_CLUSTER_LABEL=#{.VirtualClusterLabel#}
ARGS=(
#{range .Args -#}
#{shesc .#}
Expand Down Expand Up @@ -50,7 +51,7 @@ if [[ -n "${LOCAL}" || "${1-}" == "run" ]]; then
mkdir -p "${BAZEL_COVER_DIR}"
fi
CODE=0
"${BINARY}" "${ARGS[@]}" >> "${LOG_DIR}/cockroach.stdout.log" 2>> "${LOG_DIR}/cockroach.stderr.log" || CODE="$?"
ROACHPROD_VIRTUAL_CLUSTER="${VIRTUAL_CLUSTER_LABEL}" "${BINARY}" "${ARGS[@]}" >> "${LOG_DIR}/cockroach.stdout.log" 2>> "${LOG_DIR}/cockroach.stderr.log" || CODE="$?"
if [[ -z "${LOCAL}" || "${CODE}" -ne 0 ]]; then
echo "cockroach exited with code ${CODE}: $(date)" | tee -a "${LOG_DIR}"/{roachprod,cockroach.{exit,std{out,err}}}.log
fi
Expand All @@ -73,18 +74,18 @@ sudo systemctl reset-failed cockroach 2>/dev/null || true
# The first time we run, install a small script that shows some helpful
# information when we ssh in.
if [ ! -e "${HOME}/.profile-cockroach" ]; then
cat > "${HOME}/.profile-cockroach" <<'EOQ'
cat > "${HOME}/.profile-${VIRTUAL_CLUSTER_LABEL}" <<'EOQ'
echo ""
if systemctl is-active -q cockroach; then
echo "cockroach is running; see: systemctl status cockroach"
elif systemctl is-failed -q cockroach; then
echo "cockroach stopped; see: systemctl status cockroach"
if systemctl is-active -q ${VIRTUAL_CLUSTER_LABEL}; then
echo "${VIRTUAL_CLUSTER_LABEL} is running; see: systemctl status ${VIRTUAL_CLUSTER_LABEL}"
elif systemctl is-failed -q ${VIRTUAL_CLUSTER_LABEL}; then
echo "${VIRTUAL_CLUSTER_LABEL} stopped; see: systemctl status ${VIRTUAL_CLUSTER_LABEL}"
else
echo "cockroach not started"
echo "${VIRTUAL_CLUSTER_LABEL} not started"
fi
echo ""
EOQ
echo ". ${HOME}/.profile-cockroach" >> "${HOME}/.profile"
echo ". ${HOME}/.profile-${VIRTUAL_CLUSTER_LABEL}" >> "${HOME}/.profile"
fi

# We run this script (with arg "run") as a service unit. We do not use --user
Expand All @@ -93,7 +94,7 @@ fi
# The "notify" service type means that systemd-run waits until cockroach
# notifies systemd that it is ready; NotifyAccess=all is needed because this
# notification doesn't come from the main PID (which is bash).
sudo systemd-run --unit cockroach \
sudo systemd-run --unit "${VIRTUAL_CLUSTER_LABEL}" \
--same-dir --uid "$(id -u)" --gid "$(id -g)" \
--service-type=notify -p NotifyAccess=all \
-p "MemoryMax=${MEMORY_MAX}" \
Expand Down
11 changes: 6 additions & 5 deletions pkg/roachprod/install/start_template_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ func TestExecStartTemplate(t *testing.T) {
LogDir: "./path with spaces/logs/$THIS_DOES_NOT_EVER_GET_EXPANDED",
KeyCmd: `echo foo && \
echo bar $HOME`,
EnvVars: []string{"ROACHPROD=1/tigtag", "COCKROACH=foo", "ROCKCOACH=17%"},
Binary: "./cockroach",
Args: []string{`start`, `--log`, `file-defaults: {dir: '/path with spaces/logs', exit-on-error: false}`},
MemoryMax: "81%",
Local: true,
EnvVars: []string{"ROACHPROD=1/tigtag", "COCKROACH=foo", "ROCKCOACH=17%"},
Binary: "./cockroach",
Args: []string{`start`, `--log`, `file-defaults: {dir: '/path with spaces/logs', exit-on-error: false}`},
MemoryMax: "81%",
VirtualClusterLabel: "cockroach-system",
Local: true,
}
datadriven.Walk(t, datapathutils.TestDataPath(t, "start"), func(t *testing.T, path string) {
datadriven.RunTest(t, path, func(t *testing.T, td *datadriven.TestData) string {
Expand Down
19 changes: 10 additions & 9 deletions pkg/roachprod/install/testdata/start/start.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ KEY_CMD=echo foo && \
echo bar $HOME
MEMORY_MAX=81%
NUM_FILES_LIMIT=0
VIRTUAL_CLUSTER_LABEL=cockroach-system
ARGS=(
start
--log
Expand Down Expand Up @@ -54,7 +55,7 @@ if [[ -n "${LOCAL}" || "${1-}" == "run" ]]; then
mkdir -p "${BAZEL_COVER_DIR}"
fi
CODE=0
"${BINARY}" "${ARGS[@]}" >> "${LOG_DIR}/cockroach.stdout.log" 2>> "${LOG_DIR}/cockroach.stderr.log" || CODE="$?"
ROACHPROD_VIRTUAL_CLUSTER="${VIRTUAL_CLUSTER_LABEL}" "${BINARY}" "${ARGS[@]}" >> "${LOG_DIR}/cockroach.stdout.log" 2>> "${LOG_DIR}/cockroach.stderr.log" || CODE="$?"
if [[ -z "${LOCAL}" || "${CODE}" -ne 0 ]]; then
echo "cockroach exited with code ${CODE}: $(date)" | tee -a "${LOG_DIR}"/{roachprod,cockroach.{exit,std{out,err}}}.log
fi
Expand All @@ -77,18 +78,18 @@ sudo systemctl reset-failed cockroach 2>/dev/null || true
# The first time we run, install a small script that shows some helpful
# information when we ssh in.
if [ ! -e "${HOME}/.profile-cockroach" ]; then
cat > "${HOME}/.profile-cockroach" <<'EOQ'
cat > "${HOME}/.profile-${VIRTUAL_CLUSTER_LABEL}" <<'EOQ'
echo ""
if systemctl is-active -q cockroach; then
echo "cockroach is running; see: systemctl status cockroach"
elif systemctl is-failed -q cockroach; then
echo "cockroach stopped; see: systemctl status cockroach"
if systemctl is-active -q ${VIRTUAL_CLUSTER_LABEL}; then
echo "${VIRTUAL_CLUSTER_LABEL} is running; see: systemctl status ${VIRTUAL_CLUSTER_LABEL}"
elif systemctl is-failed -q ${VIRTUAL_CLUSTER_LABEL}; then
echo "${VIRTUAL_CLUSTER_LABEL} stopped; see: systemctl status ${VIRTUAL_CLUSTER_LABEL}"
else
echo "cockroach not started"
echo "${VIRTUAL_CLUSTER_LABEL} not started"
fi
echo ""
EOQ
echo ". ${HOME}/.profile-cockroach" >> "${HOME}/.profile"
echo ". ${HOME}/.profile-${VIRTUAL_CLUSTER_LABEL}" >> "${HOME}/.profile"
fi

# We run this script (with arg "run") as a service unit. We do not use --user
Expand All @@ -97,7 +98,7 @@ fi
# The "notify" service type means that systemd-run waits until cockroach
# notifies systemd that it is ready; NotifyAccess=all is needed because this
# notification doesn't come from the main PID (which is bash).
sudo systemd-run --unit cockroach \
sudo systemd-run --unit "${VIRTUAL_CLUSTER_LABEL}" \
--same-dir --uid "$(id -u)" --gid "$(id -g)" \
--service-type=notify -p NotifyAccess=all \
-p "MemoryMax=${MEMORY_MAX}" \
Expand Down
Loading

0 comments on commit 4032b58

Please sign in to comment.