Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-23.2: roachprod: support managing shared process virtual clusters #112709

Merged
merged 6 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 13 additions & 16 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ var (
secure = false
virtualClusterName string
sqlInstance int
virtualClusterID int
extraSSHOptions = ""
nodeEnv []string
tag string
Expand Down Expand Up @@ -81,6 +80,9 @@ var (

// storageCluster is used for cluster virtualization and multi-tenant functionality.
storageCluster string
// externalProcessNodes indicates the cluster/nodes where external
// process SQL instances should be deployed.
externalProcessNodes string

revertUpdate bool
)
Expand Down Expand Up @@ -209,24 +211,19 @@ func initFlags() {
`Recurrence and scheduled backup options specification.
Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS first_run = 'now'"`)

startInstanceAsSeparateProcessCmd.Flags().StringVarP(&storageCluster,
"storage-cluster", "S", "", "storage cluster")
_ = startInstanceAsSeparateProcessCmd.MarkFlagRequired("storage-cluster")
startInstanceAsSeparateProcessCmd.Flags().IntVarP(&startOpts.VirtualClusterID,
"cluster-id", "i", startOpts.VirtualClusterID, "internal ID for the virtual cluster")
startInstanceAsSeparateProcessCmd.Flags().IntVar(&startOpts.SQLInstance,
"sql-instance", 0, "specific SQL/HTTP instance to connect to (this is a roachprod abstraction distinct from the internal instance ID)")
startInstanceCmd.Flags().StringVarP(&storageCluster, "storage-cluster", "S", "", "storage cluster")
_ = startInstanceCmd.MarkFlagRequired("storage-cluster")
startInstanceCmd.Flags().IntVar(&startOpts.SQLInstance,
"sql-instance", 0, "specific SQL/HTTP instance to connect to (this is a roachprod abstraction for separate-process deployments distinct from the internal instance ID)")
startInstanceCmd.Flags().StringVar(&externalProcessNodes, "external-cluster", externalProcessNodes, "start service in external mode, as a separate process in the given nodes")

// Flags for processes that stop (kill) processes.
for _, stopProcessesCmd := range []*cobra.Command{stopCmd, stopInstanceAsSeparateProcessCmd} {
for _, stopProcessesCmd := range []*cobra.Command{stopCmd, stopInstanceCmd} {
stopProcessesCmd.Flags().IntVar(&sig, "sig", sig, "signal to pass to kill")
stopProcessesCmd.Flags().BoolVar(&waitFlag, "wait", waitFlag, "wait for processes to exit")
stopProcessesCmd.Flags().IntVar(&maxWait, "max-wait", maxWait, "approx number of seconds to wait for processes to exit")
}

stopInstanceAsSeparateProcessCmd.Flags().IntVarP(&virtualClusterID, "cluster-id", "t", virtualClusterID, "internal ID for the virtual cluster")
stopInstanceAsSeparateProcessCmd.Flags().IntVar(&sqlInstance, "sql-instance", 0, "specific SQL/HTTP instance to stop")

syncCmd.Flags().BoolVar(&listOpts.IncludeVolumes, "include-volumes", false, "Include volumes when syncing")

wipeCmd.Flags().BoolVar(&wipePreserveCerts, "preserve-certs", false, "do not wipe certificates")
Expand Down Expand Up @@ -336,15 +333,15 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS
&ssh.InsecureIgnoreHostKey, "insecure-ignore-host-key", true, "don't check ssh host keys")
}

for _, cmd := range []*cobra.Command{startCmd, startInstanceAsSeparateProcessCmd} {
for _, cmd := range []*cobra.Command{startCmd, startInstanceCmd} {
cmd.Flags().BoolVar(&startOpts.Sequential,
"sequential", startOpts.Sequential, "start nodes sequentially so node IDs match hostnames")
cmd.Flags().Int64Var(&startOpts.NumFilesLimit, "num-files-limit", startOpts.NumFilesLimit,
"limit the number of files that can be created by the cockroach process")
}

for _, cmd := range []*cobra.Command{
startCmd, startInstanceAsSeparateProcessCmd, statusCmd, stopCmd, runCmd,
startCmd, startInstanceCmd, statusCmd, stopCmd, runCmd,
} {
cmd.Flags().StringVar(&tag, "tag", "", "the process tag")
}
Expand All @@ -360,11 +357,11 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS
cmd.Flags().StringVarP(&config.Binary,
"binary", "b", config.Binary, "the remote cockroach binary to use")
}
for _, cmd := range []*cobra.Command{startCmd, startInstanceAsSeparateProcessCmd, sqlCmd, pgurlCmd, adminurlCmd, runCmd} {
for _, cmd := range []*cobra.Command{startCmd, startInstanceCmd, sqlCmd, pgurlCmd, adminurlCmd, runCmd} {
cmd.Flags().BoolVar(&secure,
"secure", false, "use a secure cluster")
}
for _, cmd := range []*cobra.Command{pgurlCmd, sqlCmd, adminurlCmd} {
for _, cmd := range []*cobra.Command{pgurlCmd, sqlCmd, adminurlCmd, stopInstanceCmd} {
cmd.Flags().StringVar(&virtualClusterName,
"cluster", "", "specific virtual cluster to connect to")
cmd.Flags().IntVar(&sqlInstance,
Expand Down
70 changes: 38 additions & 32 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -525,10 +525,8 @@ SIGHUP), unless you also configure --max-wait.
}),
}

// TODO(herko/renato): maybe also support adding SQL instances to a
// shared-process node.
var startInstanceAsSeparateProcessCmd = &cobra.Command{
Use: "start-sql <virtual-cluster> --storage-cluster <storage-cluster>",
var startInstanceCmd = &cobra.Command{
Use: "start-sql <name> --storage-cluster <storage-cluster> [--external-cluster <virtual-cluster-nodes]",
Short: "start the SQL/HTTP service for a virtual cluster as a separate process",
Long: `Start SQL/HTTP instances for a virtual cluster as separate processes.

Expand All @@ -538,27 +536,30 @@ will create the virtual cluster on the storage cluster if it does not
exist already. If creating multiple virtual clusters on the same
node, the --sql-instance flag must be passed to differentiate them.

The --tenant-id flag can be used to specify the tenant ID; it defaults to 2.
The instance is started in shared process (in memory) mode by
default. To start an external process instance, pass the
--external-cluster flag indicating where the SQL server processes
should be started.

The --secure flag can be used to start nodes in secure mode (i.e. using
certs). When specified, there is a one time initialization for the cluster to
create and distribute the certs. Note that running some modes in secure mode
and others in insecure mode is not a supported Cockroach configuration.

As a debugging aid, the --sequential flag starts the nodes sequentially so node
IDs match hostnames. Otherwise nodes are started in parallel.
As a debugging aid, the --sequential flag starts the services
sequentially; otherwise services are started in parallel.

The --binary flag specifies the remote binary to run. It is up to the roachprod
user to ensure this binary exists, usually via "roachprod put". Note that no
cockroach software is installed by default on a newly created cluster.
The --binary flag specifies the remote binary to run, if starting
external services. It is up to the roachprod user to ensure this
binary exists, usually via "roachprod put". Note that no cockroach
software is installed by default on a newly created cluster.

The --args and --env flags can be used to pass arbitrary command line flags and
environment variables to the cockroach process.
` + tagHelp + `
`,
Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) error {
targetRoachprodCluster := args[0]
clusterSettingsOpts := []install.ClusterSettingOption{
install.TagOption(tag),
install.PGUrlCertsDirOption(pgurlCertsDir),
Expand All @@ -570,33 +571,38 @@ environment variables to the cockroach process.

// Always pick a random available port when starting virtual
// clusters. We do not expose the functionality of choosing a
// specific port, so this is fine.
// specific port for separate-process deployments; for
// shared-process, the port will always be based on the system
// tenant service.
//
// TODO(renato): remove this once #111052 is addressed.
startOpts.SQLPort = 0
startOpts.AdminUIPort = 0

startOpts.Target = install.StartSharedProcessForVirtualCluster
if externalProcessNodes != "" {
startOpts.Target = install.StartServiceForVirtualCluster
}

startOpts.VirtualClusterName = args[0]
return roachprod.StartServiceForVirtualCluster(context.Background(),
config.Logger, targetRoachprodCluster, storageCluster, startOpts, clusterSettingsOpts...)
config.Logger, externalProcessNodes, storageCluster, startOpts, clusterSettingsOpts...)
}),
}

var stopInstanceAsSeparateProcessCmd = &cobra.Command{
Use: "stop-sql <virtual-cluster> --tenant-id <id> --sql-instance <instance> [--sig] [--wait]",
var stopInstanceCmd = &cobra.Command{
Use: "stop-sql <cluster> --cluster <name> --sql-instance <instance> [--sig] [--wait]",
Short: "stop sql instances on a cluster",
Long: `Stop sql instances on a cluster.

Stop roachprod created sql instances running on the nodes in a cluster. Every
process started by roachprod is tagged with a ROACHPROD environment variable
which is used by "stop-sql" to locate the processes and terminate them. By default,
processes are killed with signal 9 (SIGKILL) giving them no chance for a graceful
exit.
Stop roachprod created virtual clusters (shared or separate process). By default,
separate processes are killed with signal 9 (SIGKILL) giving them no chance for a
graceful exit.

The --sig flag will pass a signal to kill to allow us finer control over how we
shutdown processes. The --wait flag causes stop to loop waiting for all
processes with the right ROACHPROD environment variable to exit. Note that stop
will wait forever if you specify --wait with a non-terminating signal (e.g.
SIGHUP), unless you also configure --max-wait.
processes to exit. Note that stop will wait forever if you specify --wait with a
non-terminating signal (e.g. SIGHUP), unless you also configure --max-wait.

--wait defaults to true for signal 9 (SIGKILL) and false for all other signals.
`,
Expand All @@ -607,14 +613,14 @@ SIGHUP), unless you also configure --max-wait.
wait = true
}
stopOpts := roachprod.StopOpts{
Wait: wait,
MaxWait: maxWait,
Sig: sig,
VirtualClusterID: virtualClusterID,
SQLInstance: sqlInstance,
Wait: wait,
MaxWait: maxWait,
Sig: sig,
VirtualClusterName: virtualClusterName,
SQLInstance: sqlInstance,
}
virtualCluster := args[0]
return roachprod.StopServiceForVirtualCluster(context.Background(), config.Logger, virtualCluster, stopOpts)
clusterName := args[0]
return roachprod.StopServiceForVirtualCluster(context.Background(), config.Logger, clusterName, stopOpts)
}),
}

Expand Down Expand Up @@ -1407,8 +1413,8 @@ func main() {
monitorCmd,
startCmd,
stopCmd,
startInstanceAsSeparateProcessCmd,
stopInstanceAsSeparateProcessCmd,
startInstanceCmd,
stopInstanceCmd,
initCmd,
runCmd,
signalCmd,
Expand Down
89 changes: 64 additions & 25 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,17 +396,18 @@ func (c *SyncedCluster) newSession(
return newRemoteSession(l, command)
}

// Stop is used to stop cockroach on all nodes in the cluster.
// Stop is used to stop processes or virtual clusters.
//
// It sends a signal to all processes that have been started with ROACHPROD env
// var and optionally waits until the processes stop. If the virtualClusterLabel
// is not empty, then only sql processes with a matching label are stopped.
// It sends a signal to all processes that have been started with
// ROACHPROD env var and optionally waits until the processes stop. If
// the virtualClusterLabel is not empty, then only the corresponding
// virtual cluster is stopped (stopping the corresponding sql server
// process for separate process deployments, or stopping the service
// for shared-process configurations).
//
// When running roachprod stop without other flags, the signal is 9 (SIGKILL)
// and wait is true.
//
// If maxWait is non-zero, Stop stops waiting after that approximate
// number of seconds.
// When Stop needs to kill a process without other flags, the signal
// is 9 (SIGKILL) and wait is true. If maxWait is non-zero, Stop stops
// waiting after that approximate number of seconds.
func (c *SyncedCluster) Stop(
ctx context.Context,
l *logger.Logger,
Expand All @@ -415,20 +416,58 @@ func (c *SyncedCluster) Stop(
maxWait int,
virtualClusterLabel string,
) error {
// virtualClusterDisplay includes information about the virtual
// cluster associated with OS processes being stopped in this
// function.
var virtualClusterDisplay string
// virtualClusterName is the name of the virtual cluster associated
// with the label passed, if any.
var virtualClusterName string
// killProcesses indicates whether processes need to be stopped.
killProcesses := true

if virtualClusterLabel != "" {
virtualClusterName, sqlInstance, err := VirtualClusterInfoFromLabel(virtualClusterLabel)
name, sqlInstance, err := VirtualClusterInfoFromLabel(virtualClusterLabel)
if err != nil {
return err
}

services, err := c.DiscoverServices(ctx, name, ServiceTypeSQL)
if err != nil {
return err
}

virtualClusterDisplay = fmt.Sprintf(" virtual cluster %q, instance %d", virtualClusterName, sqlInstance)
if len(services) == 0 {
return fmt.Errorf("no service for virtual cluster %q", virtualClusterName)
}

virtualClusterName = name
if services[0].ServiceMode == ServiceModeShared {
// For shared process virtual clusters, we just stop the service
// via SQL.
killProcesses = false
} else {
virtualClusterDisplay = fmt.Sprintf(" virtual cluster %q, instance %d", virtualClusterName, sqlInstance)
}

}
display := fmt.Sprintf("%s: stopping%s", c.Name, virtualClusterDisplay)
if wait {
display += " and waiting"

if killProcesses {
display := fmt.Sprintf("%s: stopping%s", c.Name, virtualClusterDisplay)
if wait {
display += " and waiting"
}
return c.kill(ctx, l, "stop", display, sig, wait, maxWait, virtualClusterLabel)
} else {
res, err := c.ExecSQL(ctx, l, c.Nodes[:1], "", 0, []string{
"-e", fmt.Sprintf("ALTER TENANT '%s' STOP SERVICE", virtualClusterName),
})
if err != nil || res[0].Err != nil {
return err
}
}
return c.kill(ctx, l, "stop", display, sig, wait, maxWait, virtualClusterLabel)

return nil
}

// Signal sends a signal to the CockroachDB process.
Expand Down Expand Up @@ -1654,7 +1693,7 @@ tar cvf %[3]s certs
// DistributeTenantCerts will generate and distribute certificates to all of the
// nodes, using the host cluster to generate tenant certificates.
func (c *SyncedCluster) DistributeTenantCerts(
ctx context.Context, l *logger.Logger, hostCluster *SyncedCluster, tenantID int,
ctx context.Context, l *logger.Logger, hostCluster *SyncedCluster, virtualClusterID int,
) error {
if hostCluster.checkForTenantCertificates(ctx, l) {
return nil
Expand All @@ -1669,7 +1708,7 @@ func (c *SyncedCluster) DistributeTenantCerts(
return err
}

if err := hostCluster.createTenantCertBundle(ctx, l, tenantCertsTarName, tenantID, nodeNames); err != nil {
if err := hostCluster.createTenantCertBundle(ctx, l, tenantCertsTarName, virtualClusterID, nodeNames); err != nil {
return err
}

Expand All @@ -1688,7 +1727,11 @@ func (c *SyncedCluster) DistributeTenantCerts(
// This function assumes it is running on a host cluster node that already has
// had the main cert bundle created.
func (c *SyncedCluster) createTenantCertBundle(
ctx context.Context, l *logger.Logger, bundleName string, tenantID int, nodeNames []string,
ctx context.Context,
l *logger.Logger,
bundleName string,
virtualClusterID int,
nodeNames []string,
) error {
display := fmt.Sprintf("%s: initializing tenant certs", c.Name)
return c.Parallel(ctx, l, c.Nodes[0:1], func(ctx context.Context, node Node) (*RunResultDetails, error) {
Expand Down Expand Up @@ -1718,7 +1761,7 @@ tar cvf %[4]s $CERT_DIR
`,
cockroachNodeBinary(c, node),
strings.Join(nodeNames, " "),
tenantID,
virtualClusterID,
bundleName,
)

Expand Down Expand Up @@ -2543,11 +2586,7 @@ func (c *SyncedCluster) pgurls(
if err != nil {
return nil, err
}
sharedClusterName := ""
if desc.ServiceMode == ServiceModeShared {
sharedClusterName = virtualClusterName
}
m[node] = c.NodeURL(host, desc.Port, sharedClusterName)
m[node] = c.NodeURL(host, desc.Port, virtualClusterName, desc.ServiceMode)
}
return m, nil
}
Expand Down Expand Up @@ -2929,7 +2968,7 @@ func (c *SyncedCluster) Init(ctx context.Context, l *logger.Logger, node Node) e
return errors.WithDetail(errors.CombineErrors(err, res.Err), "install.Init() failed: unable to initialize cluster.")
}

if res, err := c.setClusterSettings(ctx, l, node); err != nil || (res != nil && res.Err != nil) {
if res, err := c.setClusterSettings(ctx, l, node, ""); err != nil || (res != nil && res.Err != nil) {
return errors.WithDetail(errors.CombineErrors(err, res.Err), "install.Init() failed: unable to set cluster settings.")
}

Expand Down
Loading