From 6c15cc0deb61b6e1bac758a938f7078bb5ca219a Mon Sep 17 00:00:00 2001
From: Yahor Yuzefovich
Date: Mon, 6 Mar 2023 19:24:48 -0800
Subject: [PATCH 1/2] logictest: unskip 3node-tenant config in statement_statistics

This test now passes (since we support DistSQL in multi-tenancy).

Epic: None

Release note: None
---
 pkg/sql/logictest/testdata/logic_test/statement_statistics | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pkg/sql/logictest/testdata/logic_test/statement_statistics b/pkg/sql/logictest/testdata/logic_test/statement_statistics
index 8d568fb12d2f..773d2aec8f8c 100644
--- a/pkg/sql/logictest/testdata/logic_test/statement_statistics
+++ b/pkg/sql/logictest/testdata/logic_test/statement_statistics
@@ -151,7 +151,6 @@ SET application_name = '';
 statement ok
 RESET distsql
 
-skipif config 3node-tenant-default-configs #52763
 query TT colnames
 SELECT key,flags
 FROM test.crdb_internal.node_statement_statistics

From 465a0f4fab182f6c4d2d641d0e7548e47ebe6388 Mon Sep 17 00:00:00 2001
From: Renato Costa
Date: Fri, 3 Mar 2023 11:44:00 -0500
Subject: [PATCH 2/2] roachprod: validate host before running command

This commit wraps every command run on a remote roachprod cluster with a
check that the node still belongs to the cluster roachprod thinks it
does. Specifically, it stops roachprod commands from interfering with
unrelated clusters when the roachprod cache is not up to date.

Consider the following set of steps:

```console
$ roachprod create cluster-a -n 1 --lifetime 1h
$ roachprod put cluster-a ./cockroach ./cockroach
$ roachprod start cluster-a
```

One hour later, `cluster-a`'s lifetime expires and the VM is destroyed.
Another user (or a Team City agent) creates another cluster:

```console
$ roachprod create cluster-b -n 1
$ roachprod put cluster-b ./cockroach ./cockroach
$ roachprod start cluster-b
```

In the process of creating `cluster-b`, it is possible that the public
IP for cluster A's VM is reused and assigned to cluster B's VM. To
simulate that situation on a single client, we can manually edit
roachprod's cache.

Now suppose the creator of `cluster-a`, not knowing that the cluster had
expired and working from a stale cache, runs:

```console
$ roachprod stop cluster-a
```

This will unintentionally stop cockroach on `cluster-b`! A client with
an updated cache will see the following output:

```console
$ roachprod status cluster-b
cluster-b: status 1/1
17:14:31 main.go:518: 1: not running
```

In addition, from cluster B's perspective, it is not clear why the
cockroach process died -- all we know is that it was killed and the
process exited with code 137:

```console
$ roachprod run cluster-b 'sudo journalctl' | grep cockroach
Mar 06 17:18:33 renato-cluster-b-0001 systemd[1]: Starting /usr/bin/bash ./cockroach.sh run...
Mar 06 17:18:33 renato-cluster-b-0001 bash[13384]: cockroach start: Mon Mar 6 17:18:33 UTC 2023, logging to logs
Mar 06 17:18:34 renato-cluster-b-0001 systemd[1]: Started /usr/bin/bash ./cockroach.sh run.
Mar 06 17:19:04 renato-cluster-b-0001 bash[13381]: ./cockroach.sh: line 67: 13386 Killed "${BINARY}" "${ARGS[@]}" >> "${LOG_DIR}/cockroach.stdout.log" 2>> "${LOG_DIR}/cockroach.stderr.log"
Mar 06 17:19:04 renato-cluster-b-0001 bash[13617]: cockroach exited with code 137: Mon Mar 6 17:19:04 UTC 2023
Mar 06 17:19:04 renato-cluster-b-0001 systemd[1]: cockroach.service: Main process exited, code=exited, status=137/n/a
Mar 06 17:19:04 renato-cluster-b-0001 systemd[1]: cockroach.service: Failed with result 'exit-code'.
```

With the changes in this commit, the `roachprod stop` call will now fail
since the hostnames don't match:

```console
$ roachprod stop cluster-a
cluster-a: stopping and waiting 1/1
   0: COMMAND_PROBLEM: exit status 1
      (1) COMMAND_PROBLEM
      Wraps: (2) exit status 1
      Error types: (1) errors.Cmd (2) *exec.ExitError: expected host to be part of cluster-a, but is cluster-b-0001
```

Finally, `roachprod ssh` calls will now also fail if the hostnames do
not match, instead of silently connecting to the wrong cluster.

Resolves #89437.
Resolves #63637.

Release note: None
---
 pkg/roachprod/install/BUILD.bazel            |   1 +
 pkg/roachprod/install/cluster_synced.go      | 110 +++++++++++++++++--
 pkg/roachprod/install/cluster_synced_test.go |  10 +-
 pkg/roachprod/roachprod.go                   |   2 +-
 pkg/roachprod/vm/aws/aws.go                  |   4 +-
 pkg/roachprod/vm/aws/support.go              |  11 +-
 6 files changed, 119 insertions(+), 19 deletions(-)

diff --git a/pkg/roachprod/install/BUILD.bazel b/pkg/roachprod/install/BUILD.bazel
index 642c3d3c4636..94b01bf661e8 100644
--- a/pkg/roachprod/install/BUILD.bazel
+++ b/pkg/roachprod/install/BUILD.bazel
@@ -29,6 +29,7 @@ go_library(
         "//pkg/roachprod/logger",
         "//pkg/roachprod/ssh",
         "//pkg/roachprod/ui",
+        "//pkg/roachprod/vm",
         "//pkg/roachprod/vm/aws",
         "//pkg/roachprod/vm/gce",
         "//pkg/roachprod/vm/local",
diff --git a/pkg/roachprod/install/cluster_synced.go b/pkg/roachprod/install/cluster_synced.go
index b9b3ddf3e371..eeaa3ccffcc8 100644
--- a/pkg/roachprod/install/cluster_synced.go
+++ b/pkg/roachprod/install/cluster_synced.go
@@ -36,6 +36,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/ssh"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/ui"
+	"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/vm/aws"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/vm/local"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -287,8 +288,77 @@ func (c *SyncedCluster) roachprodEnvValue(node Node) string {
 func (c *SyncedCluster) roachprodEnvRegex(node Node) string {
 	escaped := strings.Replace(c.roachprodEnvValue(node), "/", "\\/", -1)
 	// We look for either a trailing space or a slash (in which case, we tolerate
-	// any remaining tag suffix).
-	return fmt.Sprintf(`ROACHPROD=%s[ \/]`, escaped)
+	// any remaining tag suffix). ROACHPROD may also be the last environment
+	// variable declared, so we also account for that.
+	return fmt.Sprintf(`(ROACHPROD=%[1]s$|ROACHPROD=%[1]s[ \/])`, escaped)
+}
+
+// validateHostnameCmd wraps the command given with a check that the
+// remote node is still part of the `SyncedCluster`. When `cmd` is
+// empty, the command returned can be used to validate whether the
+// host matches the name expected by roachprod, and can be used to
+// validate the contents of the cache for that cluster.
+//
+// Since `SyncedCluster` is created from a potentially stale cache, it
+// is possible for the following events to happen:
+//
+// Client 1:
+//   - cluster A is created and information is persisted in roachprod's cache.
+//   - cluster A's lifetime expires, VMs are destroyed.
+//
+// Client 2:
+//   - cluster B is created; public IP of one of the VMs of cluster
+//     A is reused and assigned to one of cluster B's VMs.
+//
+// Client 1:
+//   - command with side-effects is run on cluster A (e.g.,
+//     `roachprod stop`). Public IP now belongs to a VM in cluster
+//     B. Client unknowingly disturbs cluster B, thinking it's cluster A.
+// +// Client 2: +// - notices that cluster B is behaving strangely (e.g., cockroach +// process died). A lot of time is spent trying to debug the +// issue. +// +// This scenario is possible and has happened a number of times in the +// past (for one such occurrence, see #97100). A particularly +// problematic instance of this bug happens when "cluster B" is +// running a roachtest. This interaction leads to issues that are hard +// to debug or make sense of, which ultimately wastes a lot of +// engineering time. +// +// By wrapping every command with a hostname check as is done here, we +// ensure that the cached cluster information is still correct. +func (c *SyncedCluster) validateHostnameCmd(cmd string, node Node) string { + isValidHost := fmt.Sprintf("[[ `hostname` == '%s' ]]", vm.Name(c.Name, int(node))) + errMsg := fmt.Sprintf("expected host to be part of %s, but is `hostname`", c.Name) + elseBranch := "fi" + if cmd != "" { + elseBranch = fmt.Sprintf(` +else + %s +fi +`, cmd) + } + + return fmt.Sprintf(` +if ! %s; then + echo "%s" + exit 1 +%s +`, isValidHost, errMsg, elseBranch) +} + +// validateHost will run `validateHostnameCmd` on the node passed to +// make sure it still belongs to the SyncedCluster. Returns an error +// when the hostnames don't match, indicating that the roachprod cache +// is stale. +func (c *SyncedCluster) validateHost(ctx context.Context, l *logger.Logger, node Node) error { + if c.IsLocal() { + return nil + } + cmd := c.validateHostnameCmd("", node) + return c.Run(ctx, l, l.Stdout, l.Stderr, Nodes{node}, "validate-ssh-host", cmd) } // cmdDebugName is the suffix of the generated ssh debug file @@ -303,7 +373,7 @@ func (c *SyncedCluster) newSession( node: node, user: c.user(node), host: c.Host(node), - cmd: cmd, + cmd: c.validateHostnameCmd(cmd, node), } for _, opt := range options { @@ -1559,6 +1629,9 @@ func (c *SyncedCluster) PutString( func (c *SyncedCluster) Put( ctx context.Context, l *logger.Logger, nodes Nodes, src string, dest string, ) error { + if err := c.validateHost(ctx, l, nodes[0]); err != nil { + return err + } // Check if source file exists and if it's a symlink. var potentialSymlinkPath string var err error @@ -1821,11 +1894,16 @@ func (c *SyncedCluster) Put( // user and assumes that the current user used to create c has the ability to // sudo into . func (c *SyncedCluster) Logs( + l *logger.Logger, src, dest, user, filter, programFilter string, interval time.Duration, from, to time.Time, out io.Writer, ) error { + if err := c.validateHost(context.TODO(), l, c.Nodes[0]); err != nil { + return err + } + rsyncNodeLogs := func(ctx context.Context, node Node) error { base := fmt.Sprintf("%d.logs", node) local := filepath.Join(dest, base) + "/" @@ -1950,6 +2028,9 @@ func (c *SyncedCluster) Logs( // Get TODO(peter): document func (c *SyncedCluster) Get(l *logger.Logger, nodes Nodes, src, dest string) error { + if err := c.validateHost(context.TODO(), l, nodes[0]); err != nil { + return err + } // TODO(peter): Only get 10 nodes at a time. When a node completes, output a // line indicating that. var detail string @@ -2197,8 +2278,19 @@ func (c *SyncedCluster) pghosts( return m, nil } -// SSH TODO(peter): document +// SSH creates an interactive shell connecting the caller to the first +// node on the cluster (or to all nodes in an iterm2 split screen if +// supported). +// +// CAUTION: this script will `exec` the ssh utility, so it must not be +// used in any roachtest code. This is for use within `./roachprod` +// exclusively. 
func (c *SyncedCluster) SSH(ctx context.Context, l *logger.Logger, sshArgs, args []string) error { + targetNode := c.Nodes[0] + if err := c.validateHost(ctx, l, targetNode); err != nil { + return err + } + if len(c.Nodes) != 1 && len(args) == 0 { // If trying to ssh to more than 1 node and the ssh session is interactive, // try sshing with an iTerm2 split screen configuration. @@ -2210,7 +2302,7 @@ func (c *SyncedCluster) SSH(ctx context.Context, l *logger.Logger, sshArgs, args // Perform template expansion on the arguments. e := expander{ - node: c.Nodes[0], + node: targetNode, } var expandedArgs []string for _, arg := range args { @@ -2226,19 +2318,19 @@ func (c *SyncedCluster) SSH(ctx context.Context, l *logger.Logger, sshArgs, args allArgs = []string{ "/bin/bash", "-c", } - cmd := fmt.Sprintf("cd %s ; ", c.localVMDir(c.Nodes[0])) + cmd := fmt.Sprintf("cd %s ; ", c.localVMDir(targetNode)) if len(args) == 0 /* interactive */ { cmd += "/bin/bash " } if len(args) > 0 { - cmd += fmt.Sprintf("export ROACHPROD=%s ; ", c.roachprodEnvValue(c.Nodes[0])) + cmd += fmt.Sprintf("export ROACHPROD=%s ; ", c.roachprodEnvValue(targetNode)) cmd += strings.Join(expandedArgs, " ") } allArgs = append(allArgs, cmd) } else { allArgs = []string{ "ssh", - fmt.Sprintf("%s@%s", c.user(c.Nodes[0]), c.Host(c.Nodes[0])), + fmt.Sprintf("%s@%s", c.user(targetNode), c.Host(targetNode)), "-o", "UserKnownHostsFile=/dev/null", "-o", "StrictHostKeyChecking=no", } @@ -2246,7 +2338,7 @@ func (c *SyncedCluster) SSH(ctx context.Context, l *logger.Logger, sshArgs, args allArgs = append(allArgs, sshArgs...) if len(args) > 0 { allArgs = append(allArgs, fmt.Sprintf( - "export ROACHPROD=%s ;", c.roachprodEnvValue(c.Nodes[0]), + "export ROACHPROD=%s ;", c.roachprodEnvValue(targetNode), )) } allArgs = append(allArgs, expandedArgs...) 
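For illustration only (this aside is not part of the patch): given the template in
`validateHostnameCmd` above, the wrapper generated for node 1 of a hypothetical cluster
named `cluster-a` would look roughly like the sketch below. The `cluster-a-0001` name
assumes `vm.Name` produces `<cluster>-NNNN`-style names, which matches the hostnames
shown in the commit message; exact whitespace may differ. The backticks in the error
string are expanded by the remote shell when the check fails, which is how the offending
hostname ends up in the `roachprod stop` error output shown earlier.

```bash
# Sketch of the wrapper emitted by validateHostnameCmd for node 1 of cluster-a;
# <original command> stands for whatever command roachprod was asked to run.
if ! [[ `hostname` == 'cluster-a-0001' ]]; then
  echo "expected host to be part of cluster-a, but is `hostname`"
  exit 1
else
  <original command>
fi
```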
diff --git a/pkg/roachprod/install/cluster_synced_test.go b/pkg/roachprod/install/cluster_synced_test.go index d02ff540008e..89f941679b64 100644 --- a/pkg/roachprod/install/cluster_synced_test.go +++ b/pkg/roachprod/install/cluster_synced_test.go @@ -36,35 +36,35 @@ func TestRoachprodEnv(t *testing.T) { node: 1, tag: "", value: "1", - regex: `ROACHPROD=1[ \/]`, + regex: `(ROACHPROD=1$|ROACHPROD=1[ \/])`, }, { clusterName: "local-foo", node: 2, tag: "", value: "local-foo/2", - regex: `ROACHPROD=local-foo\/2[ \/]`, + regex: `(ROACHPROD=local-foo\/2$|ROACHPROD=local-foo\/2[ \/])`, }, { clusterName: "a", node: 3, tag: "foo", value: "3/foo", - regex: `ROACHPROD=3\/foo[ \/]`, + regex: `(ROACHPROD=3\/foo$|ROACHPROD=3\/foo[ \/])`, }, { clusterName: "a", node: 4, tag: "foo/bar", value: "4/foo/bar", - regex: `ROACHPROD=4\/foo\/bar[ \/]`, + regex: `(ROACHPROD=4\/foo\/bar$|ROACHPROD=4\/foo\/bar[ \/])`, }, { clusterName: "local-foo", node: 5, tag: "tag", value: "local-foo/5/tag", - regex: `ROACHPROD=local-foo\/5\/tag[ \/]`, + regex: `(ROACHPROD=local-foo\/5\/tag$|ROACHPROD=local-foo\/5\/tag[ \/])`, }, } diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 063dc756f675..f15cdad5bc31 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -1356,7 +1356,7 @@ func Logs(l *logger.Logger, clusterName, dest, username string, logsOpts LogsOpt return err } return c.Logs( - logsOpts.Dir, dest, username, logsOpts.Filter, logsOpts.ProgramFilter, + l, logsOpts.Dir, dest, username, logsOpts.Filter, logsOpts.ProgramFilter, logsOpts.Interval, logsOpts.From, logsOpts.To, logsOpts.Out, ) } diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index 2ead4fc53fb5..2d36c4e04b2e 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -343,7 +343,7 @@ func (o *ProviderOpts) ConfigureCreateFlags(flags *pflag.FlagSet) { " rate limit (per second) for instance creation. This is used to avoid hitting the request"+ " limits from aws, which can vary based on the region, and the size of the cluster being"+ " created. Try lowering this limit when hitting 'Request limit exceeded' errors.") - flags.StringVar(&providerInstance.IAMProfile, ProviderName+"- iam-profile", providerInstance.IAMProfile, + flags.StringVar(&providerInstance.IAMProfile, ProviderName+"-iam-profile", providerInstance.IAMProfile, "the IAM instance profile to associate with created VMs if non-empty") } @@ -1002,7 +1002,7 @@ func (p *Provider) runInstance( extraMountOpts = "nobarrier" } } - filename, err := writeStartupScript(extraMountOpts, providerOpts.UseMultipleDisks) + filename, err := writeStartupScript(name, extraMountOpts, providerOpts.UseMultipleDisks) if err != nil { return errors.Wrapf(err, "could not write AWS startup script to temp file") } diff --git a/pkg/roachprod/vm/aws/support.go b/pkg/roachprod/vm/aws/support.go index 05904b9f455d..69b0b023371b 100644 --- a/pkg/roachprod/vm/aws/support.go +++ b/pkg/roachprod/vm/aws/support.go @@ -149,6 +149,10 @@ echo "kernel.core_pattern=$CORE_PATTERN" >> /etc/sysctl.conf sysctl --system # reload sysctl settings +# set hostname according to the name used by roachprod. There's host +# validation logic that relies on this -- see comment on cluster_synced.go +sudo hostnamectl set-hostname {{.VMName}} + sudo touch /mnt/data1/.roachprod-initialized ` @@ -158,13 +162,16 @@ sudo touch /mnt/data1/.roachprod-initialized // // extraMountOpts, if not empty, is appended to the default mount options. 
It is // a comma-separated list of options for the "mount -o" flag. -func writeStartupScript(extraMountOpts string, useMultiple bool) (string, error) { +func writeStartupScript(name string, extraMountOpts string, useMultiple bool) (string, error) { type tmplParams struct { + VMName string ExtraMountOpts string UseMultipleDisks bool } - args := tmplParams{ExtraMountOpts: extraMountOpts, UseMultipleDisks: useMultiple} + args := tmplParams{ + VMName: name, ExtraMountOpts: extraMountOpts, UseMultipleDisks: useMultiple, + } tmpfile, err := os.CreateTemp("", "aws-startup-script") if err != nil {
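The `hostnamectl set-hostname {{.VMName}}` line added to the startup script above is what
makes the `hostname` check meaningful on AWS: the VM renames itself to its roachprod node
name at boot, and every subsequent roachprod command verifies that name before running.
As a purely illustrative check (hypothetical cluster name, assumed output), a node
provisioned with this startup script would report:

```console
$ roachprod run cluster-a:1 'hostname'
cluster-a-0001
```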