Skip to content

Commit

Permalink
Merge #98682 #98717
Browse files Browse the repository at this point in the history
98682: roachprod: provide workaround for long-running AWS clusters r=srosenberg a=renatolabs

In #98076, we started validating hostnames before running any commands to avoid situations where a stale cache could lead to unintended interference with other clusters due to public IP reuse. The check relies on the VM's `hostname` matching the expected cluster name in the cache. GCP and Azure clusters set the hostname to the instance name by default, but that is not the case for AWS; the aforementioned PR explicitly sets the hostname when the instance is created.

However, in the case of long running AWS clusters (created before host validation was introduced) or clusters that are created with an outdated version of `roachprod`, the hostname will still be the default AWS hostname, and any interaction with that cluster will fail if using a recent `roachprod` version. To remedy this situation, this commit includes:

* better error reporting. When we attempt to run a command on an AWS cluster and host validation fails, we display a message to the user explaining that their hostnames may need fixing.

* if the user confirms that the cluster still exists (by running `roachprod list`), they are able to automatically fix the hostnames to the expected value by running a new `fix-long-running-aws-hostnames` command. This is a temporary workaround that should be removed once we no longer have clusters that would be affected by this issue.

This commit will be reverted once we no longer have clusters created with the default hostnames; this will be easier to achieve once we have an easy way for everyone to upgrade their `roachprod` (see #97311).

Epic: none

Release note: None

98717: tree: fix tuple encoding performance regression r=mgartner a=mgartner

This commit fixes a performance regression in pgwire encoding of tuples
introduced in #95009.

Informs #98306

Epic: None

Release note: None

Co-authored-by: Renato Costa <[email protected]>
Co-authored-by: Marcus Gartner <[email protected]>
  • Loading branch information
3 people committed Mar 16, 2023
3 parents dbbf20d + e6cf25d + 1382571 commit 683545c
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 5 deletions.
13 changes: 13 additions & 0 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,18 @@ var storageSnapshotCmd = &cobra.Command{
}),
}

var fixLongRunningAWSHostnamesCmd = &cobra.Command{
Use: "fix-long-running-aws-hostnames <cluster>",
Short: "changes the hostnames of VMs in long-running AWS clusters",
Long: `This is a temporary workaround, and will be removed once we no longer ` +
`have AWS clusters that were created with the default hostname.`,

Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) {
return roachprod.FixLongRunningAWSHostnames(context.Background(), roachprodLibraryLogger, args[0])
}),
}

func main() {
loggerCfg := logger.Config{Stdout: os.Stdout, Stderr: os.Stderr}
var loggerError error
Expand Down Expand Up @@ -1105,6 +1117,7 @@ func main() {
grafanaDumpCmd,
grafanaURLCmd,
rootStorageCmd,
fixLongRunningAWSHostnamesCmd,
)
setBashCompletionFunction()

Expand Down
48 changes: 45 additions & 3 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,19 @@ func (c *SyncedCluster) roachprodEnvRegex(node Node) string {
// By wrapping every command with a hostname check as is done here, we
// ensure that the cached cluster information is still correct.
func (c *SyncedCluster) validateHostnameCmd(cmd string, node Node) string {
// TODO(renato): remove this logic once we no longer have AWS clusters
// created with the default hostnames.
var awsNote string
nodeVM := c.VMs[node-1]
if nodeVM.Provider == aws.ProviderName {
awsNote = fmt.Sprintf(
"\nNOTE: host validation failed in AWS cluster. If you are sure this cluster still "+
"exists (i.e., you can see it when you run 'roachprod list'), then please run:\n\t"+
"roachprod fix-long-running-aws-hostnames %s",
c.Name,
)
}

isValidHost := fmt.Sprintf("[[ `hostname` == '%s' ]]", vm.Name(c.Name, int(node)))
errMsg := fmt.Sprintf("expected host to be part of %s, but is `hostname`", c.Name)
elseBranch := "fi"
Expand All @@ -343,10 +356,10 @@ fi

return fmt.Sprintf(`
if ! %s; then
echo "%s"
echo "%s%s"
exit 1
%s
`, isValidHost, errMsg, elseBranch)
`, isValidHost, errMsg, awsNote, elseBranch)
}

// validateHost will run `validateHostnameCmd` on the node passed to
Expand All @@ -373,12 +386,15 @@ func (c *SyncedCluster) newSession(
node: node,
user: c.user(node),
host: c.Host(node),
cmd: c.validateHostnameCmd(cmd, node),
cmd: cmd,
}

for _, opt := range options {
opt(command)
}
if !command.hostValidationDisabled {
command.cmd = c.validateHostnameCmd(cmd, node)
}
return newRemoteSession(l, command)
}

Expand Down Expand Up @@ -2542,6 +2558,32 @@ func (c *SyncedCluster) Init(ctx context.Context, l *logger.Logger, node Node) e
return nil
}

// FixLongRunningAWSHostnames updates the hostname on an AWS cluster
// that was created with the default hostname.
//
// TODO(renato): remove this function (and corresponding roachprod
// command) once we no longer have clusters created with the default
// hostnames.
func (c *SyncedCluster) FixLongRunningAWSHostnames(ctx context.Context, l *logger.Logger) error {
for i, nodeVM := range c.VMs {
node := Node(i + 1)
newHostname := vm.Name(c.Name, int(node))
if nodeVM.Provider == aws.ProviderName {
l.Printf("changing hostname of VM %d", node)
cmd := fmt.Sprintf("sudo hostnamectl set-hostname %s", newHostname)
s := c.newSession(l, node, cmd, withHostValidationDisabled())
defer s.Close()
out, err := s.CombinedOutput(ctx)
if err != nil {
return fmt.Errorf("could not fix hostname for node %d: %w\n%s", node, err, string(out))
}
l.Printf("hostname successfully changed to %s", newHostname)
}
}

return nil
}

// GenFilenameFromArgs given a list of cmd args, returns an alphahumeric string up to
// `maxLen` in length with hyphen delimiters, suitable for use in a filename.
// e.g. ["/bin/bash", "-c", "'sudo dmesg > dmesg.txt'"] -> binbash-c-sudo-dmesg
Expand Down
10 changes: 10 additions & 0 deletions pkg/roachprod/install/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ type remoteCommand struct {
cmd string
debugDisabled bool
debugName string

// TODO(renato): remove this option once we no longer have AWS
// clusters with default hostnames.
hostValidationDisabled bool
}

type remoteSessionOption = func(c *remoteCommand)
Expand All @@ -70,6 +74,12 @@ func withDebugName(name string) remoteSessionOption {
}
}

func withHostValidationDisabled() remoteSessionOption {
return func(c *remoteCommand) {
c.hostValidationDisabled = true
}
}

func newRemoteSession(l *logger.Logger, command *remoteCommand) *remoteSession {
var loggingArgs []string

Expand Down
16 changes: 16 additions & 0 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -1811,3 +1811,19 @@ func createAttachMountVolumes(
}
return nil
}

func FixLongRunningAWSHostnames(ctx context.Context, l *logger.Logger, clusterName string) error {
if err := LoadClusters(); err != nil {
return err
}
c, err := newCluster(l, clusterName)
if err != nil {
return err
}

if err := c.FixLongRunningAWSHostnames(ctx, l); err != nil {
return err
}
l.Printf("Done! You are now able to use your AWS cluster normally.")
return nil
}
7 changes: 5 additions & 2 deletions pkg/sql/sem/tree/pgwire_encode.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,14 @@ func (d *DTuple) pgwireFormat(ctx *FmtCtx) {
// string printer called pgwireFormatStringInTuple().
ctx.WriteByte('(')
comma := ""
tc := d.ResolvedType().TupleContents()
for i, v := range d.D {
ctx.WriteString(comma)
t := v.ResolvedType()
if tc := d.ResolvedType().TupleContents(); i < len(tc) {
var t *types.T
if i < len(tc) {
t = tc[i]
} else {
t = v.ResolvedType()
}
switch dv := UnwrapDOidWrapper(v).(type) {
case dNull:
Expand Down

0 comments on commit 683545c

Please sign in to comment.