Skip to content

Commit

Permalink
Merge #37483
Browse files Browse the repository at this point in the history
37483: roachprod: more ssh{,d} logging r=ajwerner a=tbg

Stash the verbose debug logs in a file for each remote ssh session, and
surface them with the error should one occur. Also crank up the sshd log verbosity on the server side.

For the latest in many past incidents where this would've been useful, see:

#36720 (comment)

Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
  • Loading branch information
craig[bot] and tbg committed May 13, 2019
2 parents 3c14d73 + 444c28e commit a7f2e6a
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 7 deletions.
33 changes: 28 additions & 5 deletions pkg/cmd/roachprod/install/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,18 @@ package install

import (
"context"
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"sync"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/pkg/errors"
)

type session interface {
Expand All @@ -43,13 +48,21 @@ type session interface {

// remoteSession is a session backed by an ssh process. It embeds the
// *exec.Cmd built by newRemoteSession and keeps the cancel function of the
// context that command was created with.
// NOTE(review): this span is rendered from a diff, so the `cancel` field
// appears twice (before and after re-alignment); confirm against the real file.
type remoteSession struct {
*exec.Cmd
cancel func()
cancel func()
logfile string // captures ssh -vvv
}

func newRemoteSession(user, host string) (*remoteSession, error) {
logfile := filepath.Join(
os.TempDir(),
fmt.Sprintf("ssh_%s_%s", host, timeutil.Now().Format(time.RFC3339)),
)
args := []string{
user + "@" + host,
"-q",
"-vvv", "-E", logfile,
// NB: -q suppresses -E, at least on OSX. Difficult decisions will have
// to be made if omitting -q leads to annoyance on stdout/stderr.
// "-q",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "StrictHostKeyChecking=no",
// Send keep alives every minute to prevent connections without activity
Expand All @@ -62,17 +75,26 @@ func newRemoteSession(user, host string) (*remoteSession, error) {
args = append(args, sshAuthArgs()...)
ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext(ctx, "ssh", args...)
return &remoteSession{cmd, cancel}, nil
return &remoteSession{cmd, cancel, logfile}, nil
}

// errWithDebug decorates a non-nil err with the ssh argument list
// (s.Cmd.Args) and the contents of the session's verbose ssh log file.
// A nil err is returned as nil without reading the log file.
func (s *remoteSession) errWithDebug(err error) error {
	if err == nil {
		return nil
	}
	debug, readErr := ioutil.ReadFile(s.logfile)
	if readErr != nil {
		// The log is only auxiliary information, so never let a failure to
		// read it mask the original error. Say why it is missing instead of
		// silently attaching an empty log.
		debug = []byte(fmt.Sprintf("(unable to read %s: %v)", s.logfile, readErr))
	}
	return errors.Wrapf(err, "ssh verbose log:\n%s\n%s", s.Cmd.Args, debug)
}

// CombinedOutput appends cmd to the ssh argument list, runs the command and
// returns its combined output. In the newer of the two return paths shown
// here, the error is passed through errWithDebug before being returned.
// NOTE(review): this span is rendered from a diff; the bare
// `return s.Cmd.CombinedOutput()` line looks like the pre-change version.
func (s *remoteSession) CombinedOutput(cmd string) ([]byte, error) {
s.Cmd.Args = append(s.Cmd.Args, cmd)
return s.Cmd.CombinedOutput()
b, err := s.Cmd.CombinedOutput()
return b, s.errWithDebug(err)
}

// Run appends cmd to the ssh argument list and runs the command. In the
// newer of the two return lines shown here, the resulting error is passed
// through errWithDebug before being returned.
// NOTE(review): this span is rendered from a diff; the bare
// `return s.Cmd.Run()` line looks like the pre-change version.
func (s *remoteSession) Run(cmd string) error {
s.Cmd.Args = append(s.Cmd.Args, cmd)
return s.Cmd.Run()
return s.errWithDebug(s.Cmd.Run())
}

func (s *remoteSession) Start(cmd string) error {
Expand Down Expand Up @@ -116,6 +138,7 @@ func (s *remoteSession) Wait() error {
}

// Close releases the session: it deletes the verbose ssh log file on a
// best-effort basis and then cancels the context the ssh command runs under.
func (s *remoteSession) Close() {
	// Deferred so that cancellation still happens last, after the log file
	// has been removed.
	defer s.cancel()

	// Best effort: a log file that is already gone is not worth reporting.
	_ = os.Remove(s.logfile)
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachprod/vm/aws/support.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ sudo service sshguard stop
sudo sh -c 'echo "MaxStartups 64:30:128" >> /etc/ssh/sshd_config'
# Crank up the logging for issues such as:
# https://github.com/cockroachdb/cockroach/issues/36929
sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG/' /etc/ssh/sshd_config
sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG3/' /etc/ssh/sshd_config
sudo service sshd restart
# increase the default maximum number of open file descriptors for
# root and non-root users. Load generators running a lot of concurrent
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachprod/vm/gce/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ sudo service sshguard stop
sudo sh -c 'echo "MaxStartups 64:30:128" >> /etc/ssh/sshd_config'
# Crank up the logging for issues such as:
# https://github.com/cockroachdb/cockroach/issues/36929
sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG/' /etc/ssh/sshd_config
sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG3/' /etc/ssh/sshd_config
sudo service sshd restart
# increase the default maximum number of open file descriptors for
# root and non-root users. Load generators running a lot of concurrent
Expand Down
4 changes: 4 additions & 0 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1940,11 +1940,15 @@ func (m *monitor) wait(args ...string) error {
}

// waitForFullReplication blocks until the query against crdb_internal.ranges
// reports that the minimum replica count across ranges is at least three,
// polling once per second. Query and scan failures are reported through
// t.Fatal. While the condition is still unmet, a reminder is logged at most
// once every 30 seconds.
func waitForFullReplication(t *test, db *gosql.DB) {
	const logInterval = 30 * time.Second
	lastLog := timeutil.Now()
	for {
		var ok bool
		if err := db.QueryRow(
			"SELECT min(array_length(replicas, 1)) >= 3 FROM crdb_internal.ranges",
		).Scan(&ok); err != nil {
			t.Fatal(err)
		}
		if ok {
			// Return immediately: no trailing one-second sleep and no
			// misleading "still waiting" line once replication is complete.
			return
		}
		if timeutil.Since(lastLog) > logInterval {
			t.l.Printf("still waiting for full replication")
			// Restart the interval so the reminder is throttled rather than
			// repeated on every poll.
			lastLog = timeutil.Now()
		}
		time.Sleep(time.Second)
	}
}
7 changes: 7 additions & 0 deletions pkg/cmd/roachtest/cluster_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ func runClusterInit(ctx context.Context, t *test, c *cluster) {

addrs := c.InternalAddr(ctx, c.All())

// TODO(tbg): this should never happen, but I saw it locally. The result
// is the test hanging forever, because all nodes will create their own
// single node cluster and waitForFullReplication never returns.
if addrs[0] == "" {
t.Fatal("no address for first node")
}

// Legacy-style init where we start node 1 without a join flag and then point
// the other nodes at it.
func() {
Expand Down

0 comments on commit a7f2e6a

Please sign in to comment.