roachtest: fix zipping of artifacts to include other zips #95374

Closed
62 changes: 31 additions & 31 deletions pkg/cmd/roachtest/cluster.go
@@ -792,9 +792,9 @@ func (f *clusterFactory) clusterMock(cfg clusterConfig) *clusterImpl {
// NOTE: setTest() needs to be called before a test can use this cluster.
func (f *clusterFactory) newCluster(
ctx context.Context, cfg clusterConfig, setStatus func(string), teeOpt logger.TeeOptType,
-) (*clusterImpl, error) {
+) (*clusterImpl, *vm.CreateOpts, error) {
if ctx.Err() != nil {
-return nil, errors.Wrap(ctx.Err(), "newCluster")
+return nil, nil, errors.Wrap(ctx.Err(), "newCluster")
}

if overrideFlagset != nil && overrideFlagset.Changed("nodes") {
@@ -805,9 +805,9 @@ func (f *clusterFactory) newCluster(
// For tests, use a mock cluster.
c := f.clusterMock(cfg)
if err := f.r.registerCluster(c); err != nil {
-return nil, err
+return nil, nil, err
}
-return c, nil
+return c, nil, nil
}

if cfg.localCluster {
@@ -827,7 +827,7 @@ func (f *clusterFactory) newCluster(
// that each create attempt gets a unique cluster name.
createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier)
if err != nil {
-return nil, err
+return nil, nil, err
}
if cfg.spec.Cloud != spec.Local {
providerOptsContainer.SetProviderOpts(cfg.spec.Cloud, providerOpts)
@@ -875,18 +875,18 @@ func (f *clusterFactory) newCluster(
err = roachprod.Create(ctx, l, cfg.username, cfg.spec.NodeCount, createVMOpts, providerOptsContainer)
if err == nil {
if err := f.r.registerCluster(c); err != nil {
-return nil, err
+return nil, nil, err
}
c.status("idle")
l.Close()
-return c, nil
+return c, &createVMOpts, nil
}

if errors.HasType(err, (*roachprod.ClusterAlreadyExistsError)(nil)) {
// If the cluster couldn't be created because it existed already, bail.
// In reality when this is hit is when running with the `local` flag
// or a destroy from the previous iteration failed.
-return nil, err
+return nil, nil, err
}

l.PrintfCtx(ctx, "cluster creation failed, cleaning up in case it was partially created: %s", err)
@@ -898,7 +898,7 @@ func (f *clusterFactory) newCluster(
if i >= maxAttempts {
// Here we have to release the alloc, as we are giving up.
cfg.alloc.Release()
-return nil, err
+return nil, nil, err
}
// Try again to create the cluster.
}
@@ -1076,13 +1076,13 @@ func (c *clusterImpl) Node(i int) option.NodeListOption {

// FetchLogs downloads the logs from the cluster using `roachprod get`.
// The logs will be placed in the test's artifacts dir.
-func (c *clusterImpl) FetchLogs(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchLogs(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
}

t.L().Printf("fetching logs\n")
l.Printf("fetching logs\n")
c.status("fetching logs")

// Don't hang forever if we can't fetch the logs.
@@ -1093,14 +1093,14 @@ func (c *clusterImpl) FetchLogs(ctx context.Context, t test.Test) error {
}

if err := c.Get(ctx, c.l, "logs" /* src */, path /* dest */); err != nil {
t.L().Printf("failed to fetch logs: %v", err)
l.Printf("failed to fetch logs: %v", err)
if ctx.Err() != nil {
return errors.Wrap(err, "cluster.FetchLogs")
}
}

if err := c.RunE(ctx, c.All(), "mkdir -p logs/redacted && ./cockroach debug merge-logs --redact logs/*.log > logs/redacted/combined.log"); err != nil {
t.L().Printf("failed to redact logs: %v", err)
l.Printf("failed to redact logs: %v", err)
if ctx.Err() != nil {
return err
}
@@ -1152,17 +1152,17 @@ func (c *clusterImpl) CopyRoachprodState(ctx context.Context) error {
// the first available node. They can be visualized via:
//
// `COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob ./cockroach start-single-node --insecure --store=$(mktemp -d)`
-func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, l *logger.Logger) error {
return contextutil.RunWithTimeout(ctx, "fetch tsdata", 5*time.Minute, func(ctx context.Context) error {
node := 1
for ; node <= c.spec.NodeCount; node++ {
-db, err := c.ConnE(ctx, t.L(), node)
+db, err := c.ConnE(ctx, l, node)
if err == nil {
err = db.Ping()
db.Close()
}
if err != nil {
t.L().Printf("node %d not responding to SQL, trying next one", node)
l.Printf("node %d not responding to SQL, trying next one", node)
continue
}
break
@@ -1181,7 +1181,7 @@ ); err != nil {
); err != nil {
return errors.Wrap(err, "cluster.FetchTimeseriesData")
}
-db, err := c.ConnE(ctx, t.L(), node)
+db, err := c.ConnE(ctx, l, node)
if err != nil {
return err
}
@@ -1217,13 +1217,13 @@ COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob cockroach start-single-node --insecure

// FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`.
// The logs will be placed in the test's artifacts dir.
-func (c *clusterImpl) FetchDebugZip(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
}

t.L().Printf("fetching debug zip\n")
l.Printf("fetching debug zip\n")
c.status("fetching debug zip")

// Don't hang forever if we can't fetch the debug zip.
@@ -1244,7 +1244,7 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, t test.Test) error {
si := strconv.Itoa(i)
cmd := []string{"./cockroach", "debug", "zip", "--exclude-files='*.log,*.txt,*.pprof'", "--url", "{pgurl:" + si + "}", zipName}
if err := c.RunE(ctx, c.All(), cmd...); err != nil {
t.L().Printf("./cockroach debug zip failed: %v", err)
l.Printf("./cockroach debug zip failed: %v", err)
if i < c.spec.NodeCount {
continue
}
@@ -1364,14 +1364,14 @@ func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, t *testImpl)

// FetchDmesg grabs the dmesg logs if possible. This requires being able to run
// `sudo dmesg` on the remote nodes.
-func (c *clusterImpl) FetchDmesg(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchDmesg(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 || c.IsLocal() {
// No nodes can happen during unit tests and implies nothing to do.
// Also, don't grab dmesg on local runs.
return nil
}

t.L().Printf("fetching dmesg\n")
l.Printf("fetching dmesg\n")
c.status("fetching dmesg")

// Don't hang forever.
@@ -1395,7 +1395,7 @@ func (c *clusterImpl) FetchDmesg(ctx context.Context, t test.Test) error {
if result.Err != nil {
// Store `Run` errors to return later (after copying files from successful nodes).
combinedDmesgError = errors.CombineErrors(combinedDmesgError, result.Err)
t.L().Printf("running dmesg failed on node %d: %v", result.Node, result.Err)
l.Printf("running dmesg failed on node %d: %v", result.Node, result.Err)
} else {
// Only run `Get` on successful nodes to avoid pseudo-failure on `Get` caused by an earlier failure on `Run`.
successfulNodes = append(successfulNodes, int(result.Node))
@@ -1404,7 +1404,7 @@ func (c *clusterImpl) FetchDmesg(ctx context.Context, t test.Test) error {

// Get dmesg files from successful nodes only.
if err := c.Get(ctx, c.l, name /* src */, path /* dest */, successfulNodes); err != nil {
t.L().Printf("getting dmesg files failed: %v", err)
l.Printf("getting dmesg files failed: %v", err)
return errors.Wrap(err, "cluster.FetchDmesg")
}

@@ -1415,14 +1415,14 @@ func (c *clusterImpl) FetchDmesg(ctx context.Context, t test.Test) error {

// FetchJournalctl grabs the journalctl logs if possible. This requires being
// able to run `sudo journalctl` on the remote nodes.
-func (c *clusterImpl) FetchJournalctl(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchJournalctl(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 || c.IsLocal() {
// No nodes can happen during unit tests and implies nothing to do.
// Also, don't grab journalctl on local runs.
return nil
}

t.L().Printf("fetching journalctl\n")
l.Printf("fetching journalctl\n")
c.status("fetching journalctl")

// Don't hang forever.
@@ -1446,7 +1446,7 @@ func (c *clusterImpl) FetchJournalctl(ctx context.Context, t test.Test) error {
if result.Err != nil {
// Store `Run` errors to return later (after copying files from successful nodes).
combinedJournalctlError = errors.CombineErrors(combinedJournalctlError, result.Err)
t.L().Printf("running journalctl failed on node %d: %v", result.Node, result.Err)
l.Printf("running journalctl failed on node %d: %v", result.Node, result.Err)
} else {
// Only run `Get` on successful nodes to avoid pseudo-failure on `Get` caused by an earlier failure on `Run`.
successfulNodes = append(successfulNodes, int(result.Node))
Expand All @@ -1455,7 +1455,7 @@ func (c *clusterImpl) FetchJournalctl(ctx context.Context, t test.Test) error {

// Get files from successful nodes only.
if err := c.Get(ctx, c.l, name /* src */, path /* dest */, successfulNodes); err != nil {
t.L().Printf("getting files failed: %v", err)
l.Printf("getting files failed: %v", err)
return errors.Wrap(err, "cluster.FetchJournalctl")
}

@@ -1465,7 +1465,7 @@ func (c *clusterImpl) FetchJournalctl(ctx context.Context, t test.Test) error {
}

// FetchCores fetches any core files on the cluster.
-func (c *clusterImpl) FetchCores(ctx context.Context, t test.Test) error {
+func (c *clusterImpl) FetchCores(ctx context.Context, l *logger.Logger) error {
if c.spec.NodeCount == 0 || c.IsLocal() {
// No nodes can happen during unit tests and implies nothing to do.
// Also, don't grab dmesg on local runs.
@@ -1477,11 +1477,11 @@ func (c *clusterImpl) FetchCores(ctx context.Context, t test.Test) error {
// from having the cores, but we should push them straight into a temp
// bucket on S3 instead. OTOH, the ROI of this may be low; I don't know
// of a recent example where we've wanted the Core dumps.
t.L().Printf("skipped fetching cores\n")
l.Printf("skipped fetching cores\n")
return nil
}

t.L().Printf("fetching cores\n")
l.Printf("fetching cores\n")
c.status("fetching cores")

// Don't hang forever. The core files can be large, so we give a generous
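For orientation, the sketch below shows how a call site might consume the widened newCluster signature. It is a hypothetical fragment, not code from this PR; the names f, cfg, setStatus, teeOpt, and l are assumptions borrowed from the diff context.

// Hypothetical call site: newCluster now returns the vm.CreateOpts it
// provisioned with, alongside the cluster handle. The mock-cluster and
// error paths return a nil *vm.CreateOpts, so check before dereferencing.
c, createOpts, err := f.newCluster(ctx, cfg, setStatus, teeOpt)
if err != nil {
	return err
}
if createOpts != nil {
	l.Printf("created cluster %s with create opts: %+v", c.Name(), *createOpts)
}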
1 change: 0 additions & 1 deletion pkg/cmd/roachtest/cluster/BUILD.bazel
@@ -12,7 +12,6 @@ go_library(
deps = [
"//pkg/cmd/roachtest/option",
"//pkg/cmd/roachtest/spec",
"//pkg/cmd/roachtest/test",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"@com_github_cockroachdb_errors//:errors",
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/cluster/cluster_interface.go
@@ -17,7 +17,6 @@ import (

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
)
@@ -129,6 +128,7 @@ type Cluster interface {
GitClone(
ctx context.Context, l *logger.Logger, src, dest, branch string, node option.NodeListOption,
) error
-FetchTimeseriesData(ctx context.Context, t test.Test) error

+FetchTimeseriesData(ctx context.Context, l *logger.Logger) error
RefetchCertsFromNode(ctx context.Context, node int) error
}
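The interface change above is the pattern applied throughout this PR: artifact-fetching helpers take a *logger.Logger directly instead of a test.Test. A minimal sketch of the assumed usage follows; the helper name and call site are hypothetical.

// Sketch, not from this PR: with the logger-based signature, timeseries
// data can be fetched from any code path that has a logger, even after
// the test object is gone (e.g. during teardown or artifact zipping).
func fetchTimeseries(ctx context.Context, c cluster.Cluster, l *logger.Logger) {
	if err := c.FetchTimeseriesData(ctx, l); err != nil {
		l.Printf("failed to fetch timeseries data: %v", err)
	}
}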