Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
111533: roachprod: support managing shared process virtual clusters r=herkolategan a=renatolabs

This PR adds support for shared process virtual clusters using `start-sql` and `stop-sql`; previously, these subcommands would always create separate process virtual clusters. After this PR, `start-sql` will create shared process tenants by default. Alternatively, the `--external-cluster` command line flag can be used to indicate where the separate processes should be deployed. Similarly, `stop-sql` may be used to stop separate process virtual clusters, as before, by killing the corresponding OS process; or shared process virtual clusters by using SQL (`STOP SERVICE`).

A few other changes introduced in this PR, done to support this work:

* Made `ExecSQL` more flexible;
* Fixed a bug in `pgurl`, where we would include the virtual cluster name in the connection URL returned by `roachprod pgurl`. This is not allowed since the URL points to the SQL server process, which does not handle that option.
* Removed the ability to create tenants by specifying a tenant ID. We now create tenants with `CREATE TENANT` instead of `crdb_internal.create_tenant`. The tenant ID is computed automatically and used when starting the cockroach process for separate process deployments.

Epic: none

Release note: None

Co-authored-by: Renato Costa <[email protected]>
  • Loading branch information
craig[bot] and renatolabs committed Oct 19, 2023
2 parents 8015fa3 + 42d27f7 commit 11cb0ae
Show file tree
Hide file tree
Showing 7 changed files with 414 additions and 213 deletions.
29 changes: 13 additions & 16 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ var (
secure = false
virtualClusterName string
sqlInstance int
virtualClusterID int
extraSSHOptions = ""
nodeEnv []string
tag string
Expand Down Expand Up @@ -81,6 +80,9 @@ var (

// storageCluster is used for cluster virtualization and multi-tenant functionality.
storageCluster string
// externalProcessNodes indicates the cluster/nodes where external
// process SQL instances should be deployed.
externalProcessNodes string

revertUpdate bool
)
Expand Down Expand Up @@ -209,24 +211,19 @@ func initFlags() {
`Recurrence and scheduled backup options specification.
Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS first_run = 'now'"`)

startInstanceAsSeparateProcessCmd.Flags().StringVarP(&storageCluster,
"storage-cluster", "S", "", "storage cluster")
_ = startInstanceAsSeparateProcessCmd.MarkFlagRequired("storage-cluster")
startInstanceAsSeparateProcessCmd.Flags().IntVarP(&startOpts.VirtualClusterID,
"cluster-id", "i", startOpts.VirtualClusterID, "internal ID for the virtual cluster")
startInstanceAsSeparateProcessCmd.Flags().IntVar(&startOpts.SQLInstance,
"sql-instance", 0, "specific SQL/HTTP instance to connect to (this is a roachprod abstraction distinct from the internal instance ID)")
startInstanceCmd.Flags().StringVarP(&storageCluster, "storage-cluster", "S", "", "storage cluster")
_ = startInstanceCmd.MarkFlagRequired("storage-cluster")
startInstanceCmd.Flags().IntVar(&startOpts.SQLInstance,
"sql-instance", 0, "specific SQL/HTTP instance to connect to (this is a roachprod abstraction for separate-process deployments distinct from the internal instance ID)")
startInstanceCmd.Flags().StringVar(&externalProcessNodes, "external-cluster", externalProcessNodes, "start service in external mode, as a separate process in the given nodes")

// Flags for processes that stop (kill) processes.
for _, stopProcessesCmd := range []*cobra.Command{stopCmd, stopInstanceAsSeparateProcessCmd} {
for _, stopProcessesCmd := range []*cobra.Command{stopCmd, stopInstanceCmd} {
stopProcessesCmd.Flags().IntVar(&sig, "sig", sig, "signal to pass to kill")
stopProcessesCmd.Flags().BoolVar(&waitFlag, "wait", waitFlag, "wait for processes to exit")
stopProcessesCmd.Flags().IntVar(&maxWait, "max-wait", maxWait, "approx number of seconds to wait for processes to exit")
}

stopInstanceAsSeparateProcessCmd.Flags().IntVarP(&virtualClusterID, "cluster-id", "t", virtualClusterID, "internal ID for the virtual cluster")
stopInstanceAsSeparateProcessCmd.Flags().IntVar(&sqlInstance, "sql-instance", 0, "specific SQL/HTTP instance to stop")

syncCmd.Flags().BoolVar(&listOpts.IncludeVolumes, "include-volumes", false, "Include volumes when syncing")

wipeCmd.Flags().BoolVar(&wipePreserveCerts, "preserve-certs", false, "do not wipe certificates")
Expand Down Expand Up @@ -336,15 +333,15 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS
&ssh.InsecureIgnoreHostKey, "insecure-ignore-host-key", true, "don't check ssh host keys")
}

for _, cmd := range []*cobra.Command{startCmd, startInstanceAsSeparateProcessCmd} {
for _, cmd := range []*cobra.Command{startCmd, startInstanceCmd} {
cmd.Flags().BoolVar(&startOpts.Sequential,
"sequential", startOpts.Sequential, "start nodes sequentially so node IDs match hostnames")
cmd.Flags().Int64Var(&startOpts.NumFilesLimit, "num-files-limit", startOpts.NumFilesLimit,
"limit the number of files that can be created by the cockroach process")
}

for _, cmd := range []*cobra.Command{
startCmd, startInstanceAsSeparateProcessCmd, statusCmd, stopCmd, runCmd,
startCmd, startInstanceCmd, statusCmd, stopCmd, runCmd,
} {
cmd.Flags().StringVar(&tag, "tag", "", "the process tag")
}
Expand All @@ -360,11 +357,11 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS
cmd.Flags().StringVarP(&config.Binary,
"binary", "b", config.Binary, "the remote cockroach binary to use")
}
for _, cmd := range []*cobra.Command{startCmd, startInstanceAsSeparateProcessCmd, sqlCmd, pgurlCmd, adminurlCmd, runCmd} {
for _, cmd := range []*cobra.Command{startCmd, startInstanceCmd, sqlCmd, pgurlCmd, adminurlCmd, runCmd} {
cmd.Flags().BoolVar(&secure,
"secure", false, "use a secure cluster")
}
for _, cmd := range []*cobra.Command{pgurlCmd, sqlCmd, adminurlCmd} {
for _, cmd := range []*cobra.Command{pgurlCmd, sqlCmd, adminurlCmd, stopInstanceCmd} {
cmd.Flags().StringVar(&virtualClusterName,
"cluster", "", "specific virtual cluster to connect to")
cmd.Flags().IntVar(&sqlInstance,
Expand Down
70 changes: 38 additions & 32 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -525,10 +525,8 @@ SIGHUP), unless you also configure --max-wait.
}),
}

// TODO(herko/renato): maybe also support adding SQL instances to a
// shared-process node.
var startInstanceAsSeparateProcessCmd = &cobra.Command{
Use: "start-sql <virtual-cluster> --storage-cluster <storage-cluster>",
var startInstanceCmd = &cobra.Command{
Use: "start-sql <name> --storage-cluster <storage-cluster> [--external-cluster <virtual-cluster-nodes]",
Short: "start the SQL/HTTP service for a virtual cluster as a separate process",
Long: `Start SQL/HTTP instances for a virtual cluster as separate processes.
Expand All @@ -538,27 +536,30 @@ will create the virtual cluster on the storage cluster if it does not
exist already. If creating multiple virtual clusters on the same
node, the --sql-instance flag must be passed to differentiate them.
The --tenant-id flag can be used to specify the tenant ID; it defaults to 2.
The instance is started in shared process (in memory) mode by
default. To start an external process instance, pass the
--external-cluster flag indicating where the SQL server processes
should be started.
The --secure flag can be used to start nodes in secure mode (i.e. using
certs). When specified, there is a one time initialization for the cluster to
create and distribute the certs. Note that running some modes in secure mode
and others in insecure mode is not a supported Cockroach configuration.
As a debugging aid, the --sequential flag starts the nodes sequentially so node
IDs match hostnames. Otherwise nodes are started in parallel.
As a debugging aid, the --sequential flag starts the services
sequentially; otherwise services are started in parallel.
The --binary flag specifies the remote binary to run. It is up to the roachprod
user to ensure this binary exists, usually via "roachprod put". Note that no
cockroach software is installed by default on a newly created cluster.
The --binary flag specifies the remote binary to run, if starting
external services. It is up to the roachprod user to ensure this
binary exists, usually via "roachprod put". Note that no cockroach
software is installed by default on a newly created cluster.
The --args and --env flags can be used to pass arbitrary command line flags and
environment variables to the cockroach process.
` + tagHelp + `
`,
Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) error {
targetRoachprodCluster := args[0]
clusterSettingsOpts := []install.ClusterSettingOption{
install.TagOption(tag),
install.PGUrlCertsDirOption(pgurlCertsDir),
Expand All @@ -570,33 +571,38 @@ environment variables to the cockroach process.

// Always pick a random available port when starting virtual
// clusters. We do not expose the functionality of choosing a
// specific port, so this is fine.
// specific port for separate-process deployments; for
// shared-process, the port will always be based on the system
// tenant service.
//
// TODO(renato): remove this once #111052 is addressed.
startOpts.SQLPort = 0
startOpts.AdminUIPort = 0

startOpts.Target = install.StartSharedProcessForVirtualCluster
if externalProcessNodes != "" {
startOpts.Target = install.StartServiceForVirtualCluster
}

startOpts.VirtualClusterName = args[0]
return roachprod.StartServiceForVirtualCluster(context.Background(),
config.Logger, targetRoachprodCluster, storageCluster, startOpts, clusterSettingsOpts...)
config.Logger, externalProcessNodes, storageCluster, startOpts, clusterSettingsOpts...)
}),
}

var stopInstanceAsSeparateProcessCmd = &cobra.Command{
Use: "stop-sql <virtual-cluster> --tenant-id <id> --sql-instance <instance> [--sig] [--wait]",
var stopInstanceCmd = &cobra.Command{
Use: "stop-sql <cluster> --cluster <name> --sql-instance <instance> [--sig] [--wait]",
Short: "stop sql instances on a cluster",
Long: `Stop sql instances on a cluster.
Stop roachprod created sql instances running on the nodes in a cluster. Every
process started by roachprod is tagged with a ROACHPROD environment variable
which is used by "stop-sql" to locate the processes and terminate them. By default,
processes are killed with signal 9 (SIGKILL) giving them no chance for a graceful
exit.
Stop roachprod created virtual clusters (shared or separate process). By default,
separate processes are killed with signal 9 (SIGKILL) giving them no chance for a
graceful exit.
The --sig flag will pass a signal to kill to allow us finer control over how we
shutdown processes. The --wait flag causes stop to loop waiting for all
processes with the right ROACHPROD environment variable to exit. Note that stop
will wait forever if you specify --wait with a non-terminating signal (e.g.
SIGHUP), unless you also configure --max-wait.
processes to exit. Note that stop will wait forever if you specify --wait with a
non-terminating signal (e.g. SIGHUP), unless you also configure --max-wait.
--wait defaults to true for signal 9 (SIGKILL) and false for all other signals.
`,
Expand All @@ -607,14 +613,14 @@ SIGHUP), unless you also configure --max-wait.
wait = true
}
stopOpts := roachprod.StopOpts{
Wait: wait,
MaxWait: maxWait,
Sig: sig,
VirtualClusterID: virtualClusterID,
SQLInstance: sqlInstance,
Wait: wait,
MaxWait: maxWait,
Sig: sig,
VirtualClusterName: virtualClusterName,
SQLInstance: sqlInstance,
}
virtualCluster := args[0]
return roachprod.StopServiceForVirtualCluster(context.Background(), config.Logger, virtualCluster, stopOpts)
clusterName := args[0]
return roachprod.StopServiceForVirtualCluster(context.Background(), config.Logger, clusterName, stopOpts)
}),
}

Expand Down Expand Up @@ -1407,8 +1413,8 @@ func main() {
monitorCmd,
startCmd,
stopCmd,
startInstanceAsSeparateProcessCmd,
stopInstanceAsSeparateProcessCmd,
startInstanceCmd,
stopInstanceCmd,
initCmd,
runCmd,
signalCmd,
Expand Down
89 changes: 64 additions & 25 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,17 +396,18 @@ func (c *SyncedCluster) newSession(
return newRemoteSession(l, command)
}

// Stop is used to stop cockroach on all nodes in the cluster.
// Stop is used to stop processes or virtual clusters.
//
// It sends a signal to all processes that have been started with ROACHPROD env
// var and optionally waits until the processes stop. If the virtualClusterLabel
// is not empty, then only sql processes with a matching label are stopped.
// It sends a signal to all processes that have been started with
// ROACHPROD env var and optionally waits until the processes stop. If
// the virtualClusterLabel is not empty, then only the corresponding
// virtual cluster is stopped (stopping the corresponding sql server
// process for separate process deployments, or stopping the service
// for shared-process configurations).
//
// When running roachprod stop without other flags, the signal is 9 (SIGKILL)
// and wait is true.
//
// If maxWait is non-zero, Stop stops waiting after that approximate
// number of seconds.
// When Stop needs to kill a process without other flags, the signal
// is 9 (SIGKILL) and wait is true. If maxWait is non-zero, Stop stops
// waiting after that approximate number of seconds.
func (c *SyncedCluster) Stop(
ctx context.Context,
l *logger.Logger,
Expand All @@ -415,20 +416,58 @@ func (c *SyncedCluster) Stop(
maxWait int,
virtualClusterLabel string,
) error {
// virtualClusterDisplay includes information about the virtual
// cluster associated with OS processes being stopped in this
// function.
var virtualClusterDisplay string
// virtualClusterName is the name of the virtual cluster associated
// with the label passed, if any.
var virtualClusterName string
// killProcesses indicates whether processes need to be stopped.
killProcesses := true

if virtualClusterLabel != "" {
virtualClusterName, sqlInstance, err := VirtualClusterInfoFromLabel(virtualClusterLabel)
name, sqlInstance, err := VirtualClusterInfoFromLabel(virtualClusterLabel)
if err != nil {
return err
}

services, err := c.DiscoverServices(ctx, name, ServiceTypeSQL)
if err != nil {
return err
}

virtualClusterDisplay = fmt.Sprintf(" virtual cluster %q, instance %d", virtualClusterName, sqlInstance)
if len(services) == 0 {
return fmt.Errorf("no service for virtual cluster %q", virtualClusterName)
}

virtualClusterName = name
if services[0].ServiceMode == ServiceModeShared {
// For shared process virtual clusters, we just stop the service
// via SQL.
killProcesses = false
} else {
virtualClusterDisplay = fmt.Sprintf(" virtual cluster %q, instance %d", virtualClusterName, sqlInstance)
}

}
display := fmt.Sprintf("%s: stopping%s", c.Name, virtualClusterDisplay)
if wait {
display += " and waiting"

if killProcesses {
display := fmt.Sprintf("%s: stopping%s", c.Name, virtualClusterDisplay)
if wait {
display += " and waiting"
}
return c.kill(ctx, l, "stop", display, sig, wait, maxWait, virtualClusterLabel)
} else {
res, err := c.ExecSQL(ctx, l, c.Nodes[:1], "", 0, []string{
"-e", fmt.Sprintf("ALTER TENANT '%s' STOP SERVICE", virtualClusterName),
})
if err != nil || res[0].Err != nil {
return err
}
}
return c.kill(ctx, l, "stop", display, sig, wait, maxWait, virtualClusterLabel)

return nil
}

// Signal sends a signal to the CockroachDB process.
Expand Down Expand Up @@ -1654,7 +1693,7 @@ tar cvf %[3]s certs
// DistributeTenantCerts will generate and distribute certificates to all of the
// nodes, using the host cluster to generate tenant certificates.
func (c *SyncedCluster) DistributeTenantCerts(
ctx context.Context, l *logger.Logger, hostCluster *SyncedCluster, tenantID int,
ctx context.Context, l *logger.Logger, hostCluster *SyncedCluster, virtualClusterID int,
) error {
if hostCluster.checkForTenantCertificates(ctx, l) {
return nil
Expand All @@ -1669,7 +1708,7 @@ func (c *SyncedCluster) DistributeTenantCerts(
return err
}

if err := hostCluster.createTenantCertBundle(ctx, l, tenantCertsTarName, tenantID, nodeNames); err != nil {
if err := hostCluster.createTenantCertBundle(ctx, l, tenantCertsTarName, virtualClusterID, nodeNames); err != nil {
return err
}

Expand All @@ -1688,7 +1727,11 @@ func (c *SyncedCluster) DistributeTenantCerts(
// This function assumes it is running on a host cluster node that already has
// had the main cert bundle created.
func (c *SyncedCluster) createTenantCertBundle(
ctx context.Context, l *logger.Logger, bundleName string, tenantID int, nodeNames []string,
ctx context.Context,
l *logger.Logger,
bundleName string,
virtualClusterID int,
nodeNames []string,
) error {
display := fmt.Sprintf("%s: initializing tenant certs", c.Name)
return c.Parallel(ctx, l, c.Nodes[0:1], func(ctx context.Context, node Node) (*RunResultDetails, error) {
Expand Down Expand Up @@ -1718,7 +1761,7 @@ tar cvf %[4]s $CERT_DIR
`,
cockroachNodeBinary(c, node),
strings.Join(nodeNames, " "),
tenantID,
virtualClusterID,
bundleName,
)

Expand Down Expand Up @@ -2543,11 +2586,7 @@ func (c *SyncedCluster) pgurls(
if err != nil {
return nil, err
}
sharedClusterName := ""
if desc.ServiceMode == ServiceModeShared {
sharedClusterName = virtualClusterName
}
m[node] = c.NodeURL(host, desc.Port, sharedClusterName)
m[node] = c.NodeURL(host, desc.Port, virtualClusterName, desc.ServiceMode)
}
return m, nil
}
Expand Down Expand Up @@ -2929,7 +2968,7 @@ func (c *SyncedCluster) Init(ctx context.Context, l *logger.Logger, node Node) e
return errors.WithDetail(errors.CombineErrors(err, res.Err), "install.Init() failed: unable to initialize cluster.")
}

if res, err := c.setClusterSettings(ctx, l, node); err != nil || (res != nil && res.Err != nil) {
if res, err := c.setClusterSettings(ctx, l, node, ""); err != nil || (res != nil && res.Err != nil) {
return errors.WithDetail(errors.CombineErrors(err, res.Err), "install.Init() failed: unable to set cluster settings.")
}

Expand Down
Loading

0 comments on commit 11cb0ae

Please sign in to comment.