Merge pull request #89016 from smg260/backport22.2-87304
smg260 authored Sep 30, 2022
2 parents c69571d + 1e50f25 commit 1836d21
Showing 9 changed files with 425 additions and 150 deletions.
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/BUILD.bazel
@@ -5,6 +5,7 @@ go_library(
name = "roachtest_lib",
srcs = [
"cluster.go",
"github.go",
"main.go",
"monitor.go",
"slack.go",
@@ -64,6 +65,7 @@ go_test(
size = "small",
srcs = [
"cluster_test.go",
"github_test.go",
"main_test.go",
"test_registry_test.go",
"test_test.go",
@@ -78,6 +80,7 @@
"//pkg/cmd/roachtest/test",
"//pkg/internal/team",
"//pkg/roachprod/logger",
"//pkg/roachprod/vm",
"//pkg/testutils",
"//pkg/util/quotapool",
"//pkg/util/stop",
62 changes: 31 additions & 31 deletions pkg/cmd/roachtest/cluster.go
@@ -837,9 +837,9 @@ func (f *clusterFactory) clusterMock(cfg clusterConfig) *clusterImpl {
// NOTE: setTest() needs to be called before a test can use this cluster.
func (f *clusterFactory) newCluster(
ctx context.Context, cfg clusterConfig, setStatus func(string), teeOpt logger.TeeOptType,
-) (*clusterImpl, error) {
+) (*clusterImpl, *vm.CreateOpts, error) {
if ctx.Err() != nil {
-return nil, errors.Wrap(ctx.Err(), "newCluster")
+return nil, nil, errors.Wrap(ctx.Err(), "newCluster")
}

if overrideFlagset != nil && overrideFlagset.Changed("nodes") {
@@ -850,9 +850,9 @@ func (f *clusterFactory) newCluster(
// For tests, use a mock cluster.
c := f.clusterMock(cfg)
if err := f.r.registerCluster(c); err != nil {
-return nil, err
+return nil, nil, err
}
-return c, nil
+return c, nil, nil
}

if cfg.localCluster {
@@ -875,7 +875,7 @@ func (f *clusterFactory) newCluster(
// We must release the allocation because cluster creation is not possible at this point.
cfg.alloc.Release()

-return nil, err
+return nil, nil, err
}
if cfg.spec.Cloud != spec.Local {
providerOptsContainer.SetProviderOpts(cfg.spec.Cloud, providerOpts)
@@ -929,18 +929,18 @@ func (f *clusterFactory) newCluster(
err = roachprod.Create(ctx, l, cfg.username, cfg.spec.NodeCount, createVMOpts, providerOptsContainer)
if err == nil {
if err := f.r.registerCluster(c); err != nil {
-return nil, err
+return nil, nil, err
}
c.status("idle")
l.Close()
-return c, nil
+return c, &createVMOpts, nil
}

if errors.HasType(err, (*roachprod.ClusterAlreadyExistsError)(nil)) {
// If the cluster couldn't be created because it existed already, bail.
// In reality this is hit when running with the `local` flag or when
// a destroy from the previous iteration failed.
-return nil, err
+return nil, nil, err
}

l.PrintfCtx(ctx, "cluster creation failed, cleaning up in case it was partially created: %s", err)
Expand All @@ -952,7 +952,7 @@ func (f *clusterFactory) newCluster(
if i >= maxAttempts {
// Here we have to release the alloc, as we are giving up.
cfg.alloc.Release()
-return nil, err
+return nil, nil, err
}
// Try again to create the cluster.
}
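The hunks above widen newCluster's return type so callers also receive the vm.CreateOpts the cluster was provisioned with (the same createVMOpts passed to roachprod.Create). A minimal sketch of how a caller might adapt to the new triple return; handleClusterCreation is illustrative and not part of this diff, while newCluster's signature matches the hunks above:

// Sketch only: handleClusterCreation is an assumed name for illustration.
func handleClusterCreation(
	ctx context.Context, f *clusterFactory, cfg clusterConfig,
	setStatus func(string), teeOpt logger.TeeOptType,
) (*clusterImpl, error) {
	c, createOpts, err := f.newCluster(ctx, cfg, setStatus, teeOpt)
	if err != nil {
		return nil, err
	}
	// createOpts is nil for mock clusters and on the error paths above,
	// so downstream consumers (e.g. failure reporting) must tolerate nil.
	if createOpts != nil {
		_ = createOpts // e.g. record provisioning details alongside artifacts
	}
	return c, nil
}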
@@ -1128,13 +1128,13 @@ func (c *clusterImpl) Node(i int) option.NodeListOption {

// FetchLogs downloads the logs from the cluster using `roachprod get`.
// The logs will be placed in the test's artifacts dir.
-func (c *clusterImpl) FetchLogs(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchLogs(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
}

-t.L().Printf("fetching logs\n")
+l.Printf("fetching logs\n")
c.status("fetching logs")

// Don't hang forever if we can't fetch the logs.
@@ -1145,14 +1145,14 @@ func (c *clusterImpl) FetchLogs(ctx context.Context, t test.Test) error {
}

if err := c.Get(ctx, c.l, "logs" /* src */, path /* dest */); err != nil {
-t.L().Printf("failed to fetch logs: %v", err)
+l.Printf("failed to fetch logs: %v", err)
if ctx.Err() != nil {
return errors.Wrap(err, "cluster.FetchLogs")
}
}

if err := c.RunE(ctx, c.All(), "mkdir -p logs/redacted && ./cockroach debug merge-logs --redact logs/*.log > logs/redacted/combined.log"); err != nil {
-t.L().Printf("failed to redact logs: %v", err)
+l.Printf("failed to redact logs: %v", err)
if ctx.Err() != nil {
return err
}
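Every Fetch* hunk in this file follows the same refactor: the method now takes a *logger.Logger directly instead of reaching through test.Test for one via t.L(), decoupling artifact collection from the test harness so the same helpers can run from code that holds a logger but no live test. A self-contained sketch of the shape of this change, using stand-in types rather than the real roachtest API:

package main

import (
	"context"
	"fmt"
)

// Logger stands in for roachprod's *logger.Logger in this sketch.
type Logger struct{ prefix string }

func (l *Logger) Printf(format string, args ...interface{}) {
	fmt.Printf(l.prefix+format+"\n", args...)
}

// fetchLogs receives the logger directly, so both a running test and
// post-run teardown code (which has no test handle) can call it.
func fetchLogs(ctx context.Context, l *Logger) error {
	if err := ctx.Err(); err != nil {
		return err
	}
	l.Printf("fetching logs")
	return nil
}

func main() {
	_ = fetchLogs(context.Background(), &Logger{prefix: "[roachtest] "})
}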
@@ -1204,17 +1204,17 @@ func (c *clusterImpl) CopyRoachprodState(ctx context.Context) error {
// the first available node. They can be visualized via:
//
// `COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob ./cockroach start-single-node --insecure --store=$(mktemp -d)`
-func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, l *logger.Logger) error {
return contextutil.RunWithTimeout(ctx, "fetch tsdata", 5*time.Minute, func(ctx context.Context) error {
node := 1
for ; node <= c.spec.NodeCount; node++ {
-db, err := c.ConnE(ctx, t.L(), node)
+db, err := c.ConnE(ctx, l, node)
if err == nil {
err = db.Ping()
db.Close()
}
if err != nil {
-t.L().Printf("node %d not responding to SQL, trying next one", node)
+l.Printf("node %d not responding to SQL, trying next one", node)
continue
}
break
@@ -1241,7 +1241,7 @@ func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, t test.Test) error {
); err != nil {
return errors.Wrap(err, "cluster.FetchTimeseriesData")
}
-db, err := c.ConnE(ctx, t.L(), node)
+db, err := c.ConnE(ctx, l, node)
if err != nil {
return err
}
@@ -1277,13 +1277,13 @@ COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob cockroach start-single-node --insecure

// FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`.
// The logs will be placed in the test's artifacts dir.
-func (c *clusterImpl) FetchDebugZip(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
}

-t.L().Printf("fetching debug zip\n")
+l.Printf("fetching debug zip\n")
c.status("fetching debug zip")

// Don't hang forever if we can't fetch the debug zip.
@@ -1304,7 +1304,7 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, t test.Test) error {
si := strconv.Itoa(i)
cmd := []string{"./cockroach", "debug", "zip", "--exclude-files='*.log,*.txt,*.pprof'", "--url", "{pgurl:" + si + "}", zipName}
if err := c.RunE(ctx, c.All(), cmd...); err != nil {
-t.L().Printf("./cockroach debug zip failed: %v", err)
+l.Printf("./cockroach debug zip failed: %v", err)
if i < c.spec.NodeCount {
continue
}
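The loop above tries `cockroach debug zip` against each node in turn, continuing past failures and surfacing an error only once every node has been exhausted. A compact stand-alone sketch of that first-success-wins pattern (simplified stand-in code, not the real roachtest loop):

package main

import "fmt"

// firstSuccess tries nodes 1..nodeCount in order and stops at the first
// one that succeeds; if all fail, it returns the last error seen.
func firstSuccess(nodeCount int, run func(node int) error) error {
	var lastErr error
	for i := 1; i <= nodeCount; i++ {
		if lastErr = run(i); lastErr == nil {
			return nil
		}
	}
	return lastErr
}

func main() {
	err := firstSuccess(3, func(node int) error {
		if node < 3 {
			return fmt.Errorf("node %d unreachable", node)
		}
		fmt.Printf("debug zip collected from node %d\n", node)
		return nil
	})
	fmt.Println("err:", err)
}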
@@ -1424,14 +1424,14 @@ func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, t *testImpl)

// FetchDmesg grabs the dmesg logs if possible. This requires being able to run
// `sudo dmesg` on the remote nodes.
-func (c *clusterImpl) FetchDmesg(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchDmesg(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 || c.IsLocal() {
// No nodes can happen during unit tests and implies nothing to do.
// Also, don't grab dmesg on local runs.
return nil
}

-t.L().Printf("fetching dmesg\n")
+l.Printf("fetching dmesg\n")
c.status("fetching dmesg")

// Don't hang forever.
@@ -1455,7 +1455,7 @@ func (c *clusterImpl) FetchDmesg(ctx context.Context, t test.Test) error {
if result.Err != nil {
// Store `Run` errors to return later (after copying files from successful nodes).
combinedDmesgError = errors.CombineErrors(combinedDmesgError, result.Err)
-t.L().Printf("running dmesg failed on node %d: %v", result.Node, result.Err)
+l.Printf("running dmesg failed on node %d: %v", result.Node, result.Err)
} else {
// Only run `Get` on successful nodes to avoid pseudo-failure on `Get` caused by an earlier failure on `Run`.
successfulNodes = append(successfulNodes, int(result.Node))
@@ -1464,7 +1464,7 @@ func (c *clusterImpl) FetchDmesg(ctx context.Context, t test.Test) error {

// Get dmesg files from successful nodes only.
if err := c.Get(ctx, c.l, name /* src */, path /* dest */, successfulNodes); err != nil {
-t.L().Printf("getting dmesg files failed: %v", err)
+l.Printf("getting dmesg files failed: %v", err)
return errors.Wrap(err, "cluster.FetchDmesg")
}
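The dmesg and journalctl hunks share a second pattern worth noting: run a command on all nodes, accumulate per-node failures without aborting, then Get files only from the nodes where the command succeeded, so an earlier Run failure cannot cascade into a spurious Get failure. A self-contained sketch of that pattern, using the standard library's errors.Join in place of cockroachdb/errors.CombineErrors:

package main

import (
	"errors"
	"fmt"
)

type result struct {
	node int
	err  error
}

// collect combines per-node Run errors but still fetches artifacts from
// the nodes that succeeded, returning the combined Run error afterwards.
func collect(results []result, get func(nodes []int) error) error {
	var combined error
	var successful []int
	for _, r := range results {
		if r.err != nil {
			combined = errors.Join(combined, fmt.Errorf("node %d: %w", r.node, r.err))
			continue
		}
		successful = append(successful, r.node)
	}
	if err := get(successful); err != nil {
		return err
	}
	return combined
}

func main() {
	results := []result{{1, nil}, {2, errors.New("permission denied")}, {3, nil}}
	err := collect(results, func(nodes []int) error {
		fmt.Println("fetching from nodes", nodes)
		return nil
	})
	fmt.Println("combined:", err)
}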

@@ -1475,14 +1475,14 @@ func (c *clusterImpl) FetchDmesg(ctx context.Context, t test.Test) error {

// FetchJournalctl grabs the journalctl logs if possible. This requires being
// able to run `sudo journalctl` on the remote nodes.
-func (c *clusterImpl) FetchJournalctl(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchJournalctl(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 || c.IsLocal() {
// No nodes can happen during unit tests and implies nothing to do.
// Also, don't grab journalctl on local runs.
return nil
}

-t.L().Printf("fetching journalctl\n")
+l.Printf("fetching journalctl\n")
c.status("fetching journalctl")

// Don't hang forever.
@@ -1506,7 +1506,7 @@ func (c *clusterImpl) FetchJournalctl(ctx context.Context, t test.Test) error {
if result.Err != nil {
// Store `Run` errors to return later (after copying files from successful nodes).
combinedJournalctlError = errors.CombineErrors(combinedJournalctlError, result.Err)
-t.L().Printf("running journalctl failed on node %d: %v", result.Node, result.Err)
+l.Printf("running journalctl failed on node %d: %v", result.Node, result.Err)
} else {
// Only run `Get` on successful nodes to avoid pseudo-failure on `Get` caused by an earlier failure on `Run`.
successfulNodes = append(successfulNodes, int(result.Node))
@@ -1515,7 +1515,7 @@ func (c *clusterImpl) FetchJournalctl(ctx context.Context, t test.Test) error {

// Get files from successful nodes only.
if err := c.Get(ctx, c.l, name /* src */, path /* dest */, successfulNodes); err != nil {
-t.L().Printf("getting files failed: %v", err)
+l.Printf("getting files failed: %v", err)
return errors.Wrap(err, "cluster.FetchJournalctl")
}

@@ -1525,7 +1525,7 @@ func (c *clusterImpl) FetchJournalctl(ctx context.Context, t test.Test) error {
}

// FetchCores fetches any core files on the cluster.
-func (c *clusterImpl) FetchCores(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchCores(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 || c.IsLocal() {
// No nodes can happen during unit tests and implies nothing to do.
// Also, don't grab dmesg on local runs.
@@ -1537,11 +1537,11 @@ func (c *clusterImpl) FetchCores(ctx context.Context, t test.Test) error {
// from having the cores, but we should push them straight into a temp
// bucket on S3 instead. OTOH, the ROI of this may be low; I don't know
// of a recent example where we've wanted the Core dumps.
-t.L().Printf("skipped fetching cores\n")
+l.Printf("skipped fetching cores\n")
return nil
}

-t.L().Printf("fetching cores\n")
+l.Printf("fetching cores\n")
c.status("fetching cores")

// Don't hang forever. The core files can be large, so we give a generous
1 change: 0 additions & 1 deletion pkg/cmd/roachtest/cluster/BUILD.bazel
@@ -13,7 +13,6 @@ go_library(
deps = [
"//pkg/cmd/roachtest/option",
"//pkg/cmd/roachtest/spec",
"//pkg/cmd/roachtest/test",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
3 changes: 1 addition & 2 deletions pkg/cmd/roachtest/cluster/cluster_interface.go
@@ -17,7 +17,6 @@ import (

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
@@ -131,7 +130,7 @@ type Cluster interface {
ctx context.Context, l *logger.Logger, src, dest, branch string, node option.NodeListOption,
) error

-FetchTimeseriesData(ctx context.Context, t test.Test) error
+FetchTimeseriesData(ctx context.Context, l *logger.Logger) error
RefetchCertsFromNode(ctx context.Context, node int) error

StartGrafana(ctx context.Context, l *logger.Logger, promCfg *prometheus.Config) error
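Because FetchTimeseriesData is part of the Cluster interface, the signature change ripples to every implementation and call site, but it also makes the method easier to fake in unit tests, since callers no longer need a test.Test handle. A minimal sketch with simplified stand-in types (stdlib *log.Logger substitutes for roachprod's logger; the real interface lives in pkg/cmd/roachtest/cluster):

package main

import (
	"context"
	"log"
)

// Cluster is a stand-in for the roachtest interface; only the changed
// method is shown.
type Cluster interface {
	FetchTimeseriesData(ctx context.Context, l *log.Logger) error
}

type fakeCluster struct{}

func (fakeCluster) FetchTimeseriesData(_ context.Context, l *log.Logger) error {
	l.Printf("would fetch tsdump here")
	return nil
}

func main() {
	var c Cluster = fakeCluster{}
	_ = c.FetchTimeseriesData(context.Background(), log.Default())
}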