From e6cf25d31f2d078ada0881f98b6e17e77c2a621f Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Wed, 15 Mar 2023 10:32:12 -0400 Subject: [PATCH 1/2] roachprod: provide workaround for long-running AWS clusters In #98076, we started validating hostnames before running any commands to avoid situations where a stale cache could lead to unintended interference with other clusters due to public IP reuse. The check relies on the VM's `hostname` matching the expected cluster name in the cache. GCP and Azure clusters set the hostname to the instance name by default, but that is not the case for AWS; the aforementioned PR explicitly sets the hostname when the instance is created. However, in the case of long running AWS clusters (created before host validation was introduced) or clusters that are created with an outdated version of `roachprod`, the hostname will still be the default AWS hostname, and any interaction with that cluster will fail if using a recent `roachprod` version. To remedy this situation, this commit includes: * better error reporting. When we attempt to run a command on an AWS cluster and host validation fails, we display a message to the user explaining that their hostnames may need fixing. * if the user confirms that the cluster still exists (by running `roachprod list`), they are able to automatically fix the hostnames to the expected value by running a new `fix-long-running-aws-hostnames` command. This is a temporary workaround that should be removed once we no longer have clusters that would be affected by this issue. This commit will be reverted once we no longer have clusters created with the default hostnames; this will be easier to achieve once we have an easy way for everyone to upgrade their `roachprod` (see #97311). Epic: none Release note: None --- pkg/cmd/roachprod/main.go | 13 +++++++ pkg/roachprod/install/cluster_synced.go | 48 +++++++++++++++++++++++-- pkg/roachprod/install/session.go | 10 ++++++ pkg/roachprod/roachprod.go | 16 +++++++++ 4 files changed, 84 insertions(+), 3 deletions(-) diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index e917112e0b9e..03c991762d93 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -1050,6 +1050,18 @@ var storageSnapshotCmd = &cobra.Command{ }), } +var fixLongRunningAWSHostnamesCmd = &cobra.Command{ + Use: "fix-long-running-aws-hostnames ", + Short: "changes the hostnames of VMs in long-running AWS clusters", + Long: `This is a temporary workaround, and will be removed once we no longer ` + + `have AWS clusters that were created with the default hostname.`, + + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) { + return roachprod.FixLongRunningAWSHostnames(context.Background(), roachprodLibraryLogger, args[0]) + }), +} + func main() { loggerCfg := logger.Config{Stdout: os.Stdout, Stderr: os.Stderr} var loggerError error @@ -1105,6 +1117,7 @@ func main() { grafanaDumpCmd, grafanaURLCmd, rootStorageCmd, + fixLongRunningAWSHostnamesCmd, ) setBashCompletionFunction() diff --git a/pkg/roachprod/install/cluster_synced.go b/pkg/roachprod/install/cluster_synced.go index eeaa3ccffcc8..ef3de628e094 100644 --- a/pkg/roachprod/install/cluster_synced.go +++ b/pkg/roachprod/install/cluster_synced.go @@ -330,6 +330,19 @@ func (c *SyncedCluster) roachprodEnvRegex(node Node) string { // By wrapping every command with a hostname check as is done here, we // ensure that the cached cluster information is still correct. func (c *SyncedCluster) validateHostnameCmd(cmd string, node Node) string { + // TODO(renato): remove this logic once we no longer have AWS clusters + // created with the default hostnames. + var awsNote string + nodeVM := c.VMs[node-1] + if nodeVM.Provider == aws.ProviderName { + awsNote = fmt.Sprintf( + "\nNOTE: host validation failed in AWS cluster. If you are sure this cluster still "+ + "exists (i.e., you can see it when you run 'roachprod list'), then please run:\n\t"+ + "roachprod fix-long-running-aws-hostnames %s", + c.Name, + ) + } + isValidHost := fmt.Sprintf("[[ `hostname` == '%s' ]]", vm.Name(c.Name, int(node))) errMsg := fmt.Sprintf("expected host to be part of %s, but is `hostname`", c.Name) elseBranch := "fi" @@ -343,10 +356,10 @@ fi return fmt.Sprintf(` if ! %s; then - echo "%s" + echo "%s%s" exit 1 %s -`, isValidHost, errMsg, elseBranch) +`, isValidHost, errMsg, awsNote, elseBranch) } // validateHost will run `validateHostnameCmd` on the node passed to @@ -373,12 +386,15 @@ func (c *SyncedCluster) newSession( node: node, user: c.user(node), host: c.Host(node), - cmd: c.validateHostnameCmd(cmd, node), + cmd: cmd, } for _, opt := range options { opt(command) } + if !command.hostValidationDisabled { + command.cmd = c.validateHostnameCmd(cmd, node) + } return newRemoteSession(l, command) } @@ -2542,6 +2558,32 @@ func (c *SyncedCluster) Init(ctx context.Context, l *logger.Logger, node Node) e return nil } +// FixLongRunningAWSHostnames updates the hostname on an AWS cluster +// that was created with the default hostname. +// +// TODO(renato): remove this function (and corresponding roachprod +// command) once we no longer have clusters created with the default +// hostnames. +func (c *SyncedCluster) FixLongRunningAWSHostnames(ctx context.Context, l *logger.Logger) error { + for i, nodeVM := range c.VMs { + node := Node(i + 1) + newHostname := vm.Name(c.Name, int(node)) + if nodeVM.Provider == aws.ProviderName { + l.Printf("changing hostname of VM %d", node) + cmd := fmt.Sprintf("sudo hostnamectl set-hostname %s", newHostname) + s := c.newSession(l, node, cmd, withHostValidationDisabled()) + defer s.Close() + out, err := s.CombinedOutput(ctx) + if err != nil { + return fmt.Errorf("could not fix hostname for node %d: %w\n%s", node, err, string(out)) + } + l.Printf("hostname successfully changed to %s", newHostname) + } + } + + return nil +} + // GenFilenameFromArgs given a list of cmd args, returns an alphahumeric string up to // `maxLen` in length with hyphen delimiters, suitable for use in a filename. // e.g. ["/bin/bash", "-c", "'sudo dmesg > dmesg.txt'"] -> binbash-c-sudo-dmesg diff --git a/pkg/roachprod/install/session.go b/pkg/roachprod/install/session.go index 437ea5cb191e..cf73c7da010d 100644 --- a/pkg/roachprod/install/session.go +++ b/pkg/roachprod/install/session.go @@ -54,6 +54,10 @@ type remoteCommand struct { cmd string debugDisabled bool debugName string + + // TODO(renato): remove this option once we no longer have AWS + // clusters with default hostnames. + hostValidationDisabled bool } type remoteSessionOption = func(c *remoteCommand) @@ -70,6 +74,12 @@ func withDebugName(name string) remoteSessionOption { } } +func withHostValidationDisabled() remoteSessionOption { + return func(c *remoteCommand) { + c.hostValidationDisabled = true + } +} + func newRemoteSession(l *logger.Logger, command *remoteCommand) *remoteSession { var loggingArgs []string diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index f15cdad5bc31..0b416b74d52a 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -1811,3 +1811,19 @@ func createAttachMountVolumes( } return nil } + +func FixLongRunningAWSHostnames(ctx context.Context, l *logger.Logger, clusterName string) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + if err := c.FixLongRunningAWSHostnames(ctx, l); err != nil { + return err + } + l.Printf("Done! You are now able to use your AWS cluster normally.") + return nil +} From 13825718aec1f97949700633a23c439ea275a4db Mon Sep 17 00:00:00 2001 From: Marcus Gartner Date: Wed, 15 Mar 2023 17:39:12 -0400 Subject: [PATCH 2/2] tree: fix tuple encoding performance regression This commit fixes a performance regression in pgwire encoding of tuples introduced in #95009. Informs #98306 Epic: None Release note: None --- pkg/sql/sem/tree/pgwire_encode.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/sql/sem/tree/pgwire_encode.go b/pkg/sql/sem/tree/pgwire_encode.go index 269609233e56..6b157f853fa6 100644 --- a/pkg/sql/sem/tree/pgwire_encode.go +++ b/pkg/sql/sem/tree/pgwire_encode.go @@ -50,11 +50,14 @@ func (d *DTuple) pgwireFormat(ctx *FmtCtx) { // string printer called pgwireFormatStringInTuple(). ctx.WriteByte('(') comma := "" + tc := d.ResolvedType().TupleContents() for i, v := range d.D { ctx.WriteString(comma) - t := v.ResolvedType() - if tc := d.ResolvedType().TupleContents(); i < len(tc) { + var t *types.T + if i < len(tc) { t = tc[i] + } else { + t = v.ResolvedType() } switch dv := UnwrapDOidWrapper(v).(type) { case dNull: