diff --git a/pkg/cmd/roachprod/install/session.go b/pkg/cmd/roachprod/install/session.go index 21252fc44dbd..a35a99c90f47 100644 --- a/pkg/cmd/roachprod/install/session.go +++ b/pkg/cmd/roachprod/install/session.go @@ -17,13 +17,18 @@ package install import ( "context" + "fmt" "io" + "io/ioutil" "os" "os/exec" "path/filepath" "sync" + "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/pkg/errors" ) type session interface { @@ -43,13 +48,21 @@ type session interface { type remoteSession struct { *exec.Cmd - cancel func() + cancel func() + logfile string // captures ssh -vvv } func newRemoteSession(user, host string) (*remoteSession, error) { + logfile := filepath.Join( + os.TempDir(), + fmt.Sprintf("ssh_%s_%s", host, timeutil.Now().Format(time.RFC3339)), + ) args := []string{ user + "@" + host, - "-q", + "-vvv", "-E", logfile, + // NB: -q suppresses -E, at least on OSX. Difficult decisions will have + // to be made if omitting -q leads to annoyance on stdout/stderr. + // "-q", "-o", "UserKnownHostsFile=/dev/null", "-o", "StrictHostKeyChecking=no", // Send keep alives every minute to prevent connections without activity @@ -62,17 +75,26 @@ func newRemoteSession(user, host string) (*remoteSession, error) { args = append(args, sshAuthArgs()...) ctx, cancel := context.WithCancel(context.Background()) cmd := exec.CommandContext(ctx, "ssh", args...) - return &remoteSession{cmd, cancel}, nil + return &remoteSession{cmd, cancel, logfile}, nil +} + +func (s *remoteSession) errWithDebug(err error) error { + if err != nil { + debug, _ := ioutil.ReadFile(s.logfile) + err = errors.Wrapf(err, "ssh verbose log:\n%s\n%s", s.Cmd.Args, debug) + } + return err } func (s *remoteSession) CombinedOutput(cmd string) ([]byte, error) { s.Cmd.Args = append(s.Cmd.Args, cmd) - return s.Cmd.CombinedOutput() + b, err := s.Cmd.CombinedOutput() + return b, s.errWithDebug(err) } func (s *remoteSession) Run(cmd string) error { s.Cmd.Args = append(s.Cmd.Args, cmd) - return s.Cmd.Run() + return s.errWithDebug(s.Cmd.Run()) } func (s *remoteSession) Start(cmd string) error { @@ -116,6 +138,7 @@ func (s *remoteSession) Wait() error { } func (s *remoteSession) Close() { + _ = os.Remove(s.logfile) s.cancel() } diff --git a/pkg/cmd/roachprod/vm/aws/support.go b/pkg/cmd/roachprod/vm/aws/support.go index 223fb53061f7..40858b6caffa 100644 --- a/pkg/cmd/roachprod/vm/aws/support.go +++ b/pkg/cmd/roachprod/vm/aws/support.go @@ -96,7 +96,7 @@ sudo service sshguard stop sudo sh -c 'echo "MaxStartups 64:30:128" >> /etc/ssh/sshd_config' # Crank up the logging for issues such as: # https://github.com/cockroachdb/cockroach/issues/36929 -sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG/' /etc/ssh/sshd_config +sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG3/' /etc/ssh/sshd_config sudo service sshd restart # increase the default maximum number of open file descriptors for # root and non-root users. Load generators running a lot of concurrent diff --git a/pkg/cmd/roachprod/vm/gce/utils.go b/pkg/cmd/roachprod/vm/gce/utils.go index d3b6fa0c5ac6..ccc3538f74dc 100644 --- a/pkg/cmd/roachprod/vm/gce/utils.go +++ b/pkg/cmd/roachprod/vm/gce/utils.go @@ -84,7 +84,7 @@ sudo service sshguard stop sudo sh -c 'echo "MaxStartups 64:30:128" >> /etc/ssh/sshd_config' # Crank up the logging for issues such as: # https://github.com/cockroachdb/cockroach/issues/36929 -sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG/' /etc/ssh/sshd_config +sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG3/' /etc/ssh/sshd_config sudo service sshd restart # increase the default maximum number of open file descriptors for # root and non-root users. Load generators running a lot of concurrent diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 2dd100483af4..73d14e3e8295 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1940,11 +1940,15 @@ func (m *monitor) wait(args ...string) error { } func waitForFullReplication(t *test, db *gosql.DB) { + tStart := timeutil.Now() for ok := false; !ok; time.Sleep(time.Second) { if err := db.QueryRow( "SELECT min(array_length(replicas, 1)) >= 3 FROM crdb_internal.ranges", ).Scan(&ok); err != nil { t.Fatal(err) } + if timeutil.Since(tStart) > 30*time.Second { + t.l.Printf("still waiting for full replication") + } } } diff --git a/pkg/cmd/roachtest/cluster_init.go b/pkg/cmd/roachtest/cluster_init.go index 307758f292c8..3518b2cb1564 100644 --- a/pkg/cmd/roachtest/cluster_init.go +++ b/pkg/cmd/roachtest/cluster_init.go @@ -34,6 +34,13 @@ func runClusterInit(ctx context.Context, t *test, c *cluster) { addrs := c.InternalAddr(ctx, c.All()) + // TODO(tbg): this should never happen, but I saw it locally. The result + // is the test hanging forever, because all nodes will create their own + // single node cluster and waitForFullReplication never returns. + if addrs[0] == "" { + t.Fatal("no address for first node") + } + // Legacy-style init where we start node 1 without a join flag and then point // the other nodes at it. func() {