From 56a7916bc98e383b910ca8fe2953be0a341aba1a Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Mon, 30 Oct 2023 11:50:30 -0400 Subject: [PATCH] roachprod: support config profiles This commit adds a few changes with the goal of supporting starting cockroach processes with "config-profiles". Most importantly, roachprod will now wait for the default target cluster if a config profile is detected: this causes subsequent initialization tasks to make use of the application tenant instead of the system one. We also stop assuming that the system tenant is the default tenant; now, when a virtual cluster is not specified, we do not attemtp to connect to it automatically, falling back to the configured target cluster on crdb. Epic: none Release note: None --- pkg/roachprod/install/cockroach.go | 79 ++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/pkg/roachprod/install/cockroach.go b/pkg/roachprod/install/cockroach.go index 5e6c98ec5225..5d5cfa8bd629 100644 --- a/pkg/roachprod/install/cockroach.go +++ b/pkg/roachprod/install/cockroach.go @@ -415,6 +415,9 @@ func (c *SyncedCluster) Start(ctx context.Context, l *logger.Logger, startOpts S } if startOpts.GetInitTarget() == node { + if err := c.waitForDefaultTargetCluster(ctx, l, startOpts); err != nil { + return res, errors.Wrap(err, "failed to wait for default target cluster") + } c.createAdminUserForSecureCluster(ctx, l, startOpts) return c.setClusterSettings(ctx, l, node, startOpts.VirtualClusterName) } @@ -501,7 +504,13 @@ func (c *SyncedCluster) NodeURL( } else { v.Add("sslmode", "disable") } - if serviceMode == ServiceModeShared && virtualClusterName != "" && virtualClusterName != "system" { + + // Add the virtual cluster name option explicitly for shared-process + // tenants or for the system tenant. This is to make sure we connect + // to the system tenant in case we have previously changed the + // default virtual cluster. + if (serviceMode == ServiceModeShared && virtualClusterName != "") || + virtualClusterName == SystemInterfaceName { v.Add("options", fmt.Sprintf("-ccluster=%s", virtualClusterName)) } u.RawQuery = v.Encode() @@ -542,9 +551,6 @@ func (c *SyncedCluster) ExecOrInteractiveSQL( if err != nil { return err } - if virtualClusterName == "" { - virtualClusterName = SystemInterfaceName - } url := c.NodeURL("localhost", desc.Port, virtualClusterName, desc.ServiceMode) binary := cockroachNodeBinary(c, c.Nodes[0]) allArgs := []string{binary, "sql", "--url", url} @@ -948,6 +954,71 @@ func (c *SyncedCluster) initializeCluster( return res, err } +// waitForDefaultTargetCluster checks for the existence of a +// config-profile flag that leads to the use of an application tenant +// as 'default target cluster'; if that is the case, we wait for all +// nodes to be aware of the cluster setting before proceding. Without +// this logic, follow-up tasks in the process of creating the cluster +// could run before the cluster setting is propagated, and they would +// apply to the system tenant instead. +func (c *SyncedCluster) waitForDefaultTargetCluster( + ctx context.Context, l *logger.Logger, startOpts StartOpts, +) error { + var hasCustomTargetCluster bool + for _, arg := range startOpts.ExtraArgs { + // If there is a config profile and that is set to either a '+app' + // profile or 'replication-source', we know that the default + // target cluster setting will be set to the application tenant. + if strings.Contains(arg, "config-profile") && + (strings.Contains(arg, "+app") || strings.Contains(arg, "replication-source")) { + hasCustomTargetCluster = true + break + } + } + + if !hasCustomTargetCluster { + return nil + } + + l.Printf("waiting for default target cluster") + retryOpts := retry.Options{MaxRetries: 20} + return retryOpts.Do(ctx, func(ctx context.Context) error { + // TODO(renato): use server.controller.default_target_cluster once + // 23.1 is no longer supported. + const stmt = "SHOW CLUSTER SETTING server.controller.default_tenant" + res, err := c.ExecSQL(ctx, l, Nodes{startOpts.GetInitTarget()}, SystemInterfaceName, 0, []string{"-e", stmt}) + if err != nil { + return errors.Wrap(err, "error reading cluster setting") + } + + if len(res) > 0 { + if res[0].Err != nil { + return errors.Wrapf(res[0].Err, "node %d", res[0].Node) + } + + if strings.Contains(res[0].CombinedOut, "system") { + return errors.Newf("target cluster on n%d is still system", res[0].Node) + } + } + + // Once we know the cluster setting points to the default target + // cluster, we attempt to run a dummy SQL statement until that + // succeeds (i.e., until the target cluster is able to handle + // requests.) + const pingStmt = "SELECT 1;" + res, err = c.ExecSQL(ctx, l, Nodes{startOpts.GetInitTarget()}, "", 0, []string{"-e", pingStmt}) + if err != nil { + return errors.Wrap(err, "error connecting to default target cluster") + } + + if res[0] != nil && res[0].Err != nil { + err = errors.CombineErrors(err, res[0].Err) + } + + return err + }) +} + // createAdminUserForSecureCluster creates a `roach` user with admin // privileges. The password used matches the virtual cluster name // ('system' for the storage cluster). If it cannot be created, this