Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachtest: collect failure artifacts when restore fails #104868

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1473,8 +1473,8 @@ COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob cockroach start-single-node --insecure
}

// FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`.
// The logs will be placed in the test's artifacts dir.
func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error {
// The logs will be placed at `dest`, relative to the test's artifacts dir.
func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
Expand All @@ -1486,7 +1486,7 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error
// Don't hang forever if we can't fetch the debug zip.
return timeutil.RunWithTimeout(ctx, "debug zip", 5*time.Minute, func(ctx context.Context) error {
const zipName = "debug.zip"
path := filepath.Join(c.t.ArtifactsDir(), zipName)
path := filepath.Join(c.t.ArtifactsDir(), dest)
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/cluster/cluster_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ type Cluster interface {
) error

FetchTimeseriesData(ctx context.Context, l *logger.Logger) error
FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error
RefetchCertsFromNode(ctx context.Context, node int) error

StartGrafana(ctx context.Context, l *logger.Logger, promCfg *prometheus.Config) error
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1317,7 +1317,7 @@ func (r *testRunner) collectArtifacts(
if err := c.FetchTimeseriesData(ctx, t.L()); err != nil {
t.L().Printf("failed to fetch timeseries data: %s", err)
}
if err := c.FetchDebugZip(ctx, t.L()); err != nil {
if err := c.FetchDebugZip(ctx, t.L(), "debug.zip"); err != nil {
t.L().Printf("failed to collect zip: %s", err)
}
})
Expand Down
64 changes: 57 additions & 7 deletions pkg/cmd/roachtest/tests/mixed_version_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"encoding/json"
"fmt"
"math/rand"
"os"
"path/filepath"
"reflect"
"regexp"
Expand Down Expand Up @@ -986,6 +987,7 @@ func backupCollectionDesc(fullSpec, incSpec backupSpec) string {
// state involved in the mixed-version backup test.
type mixedVersionBackup struct {
cluster cluster.Cluster
t test.Test
roachNodes option.NodeListOption
// backup collections that are created along the test
collections []*backupCollection
Expand All @@ -1010,13 +1012,13 @@ type mixedVersionBackup struct {
}

func newMixedVersionBackup(
c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string,
t test.Test, c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string,
) *mixedVersionBackup {
var tablesLoaded atomic.Bool
tablesLoaded.Store(false)

return &mixedVersionBackup{
cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded,
t: t, cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded,
}
}

Expand Down Expand Up @@ -1692,7 +1694,10 @@ func (mvb *mixedVersionBackup) disableJobAdoption(
if err := retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error {
db := h.Connect(node)
var count int
err := db.QueryRow(`SELECT count(*) FROM [SHOW JOBS] WHERE status = 'running'`).Scan(&count)
err := db.QueryRow(fmt.Sprintf(
`SELECT count(*) FROM [SHOW JOBS] WHERE status = 'running' AND coordinator_id = %d`,
node,
)).Scan(&count)
if err != nil {
l.Printf("node %d: error querying running jobs (%s)", node, err)
return err
Expand Down Expand Up @@ -1797,7 +1802,7 @@ func (mvb *mixedVersionBackup) planAndRunBackups(
}

if len(tc.FromVersionNodes) > 0 {
const numCollections = 3
const numCollections = 2
rng.Shuffle(len(collectionSpecs), func(i, j int) {
collectionSpecs[i], collectionSpecs[j] = collectionSpecs[j], collectionSpecs[i]
})
Expand Down Expand Up @@ -1835,6 +1840,33 @@ func (mvb *mixedVersionBackup) checkFiles(
return h.Exec(rng, checkFilesStmt)
}

// collectFailureArtifacts fetches cockroach logs and a debug.zip and
// saves them to a directory in the test's artifacts dir. This is done
// so that we can report multiple restore failures in the same test,
// and make each failure actionable. If artifacts cannot be collected,
// the original restore error is returned, along with the error
// encountered while fetching the artifacts.
func (mvb *mixedVersionBackup) collectFailureArtifacts(
ctx context.Context, l *logger.Logger, restoreErr error, errID int,
) (error, error) {
dirName := fmt.Sprintf("restore_failure_%d", errID)
rootDir := filepath.Join(mvb.t.ArtifactsDir(), dirName)
logsDir := filepath.Join(rootDir, "logs")
if err := os.MkdirAll(filepath.Dir(logsDir), 0755); err != nil {
return restoreErr, fmt.Errorf("could not create directory %s: %w", rootDir, err)
}

if err := mvb.cluster.Get(ctx, l, "logs" /* src */, logsDir, mvb.roachNodes); err != nil {
return restoreErr, fmt.Errorf("could not fetch logs: %w", err)
}
zipLocation := filepath.Join(dirName, "debug.zip")
if err := mvb.cluster.FetchDebugZip(ctx, l, zipLocation); err != nil {
return restoreErr, err
}

return fmt.Errorf("%w (artifacts collected in %s)", restoreErr, dirName), nil
}

// verifyBackupCollection restores the backup collection passed and
// verifies that the contents after the restore match the contents
// when the backup was taken.
Expand Down Expand Up @@ -1995,14 +2027,32 @@ func (mvb *mixedVersionBackup) verifyAllBackups(
}
if err := mvb.verifyBackupCollection(ctx, l, rng, h, collection, version); err != nil {
l.Printf("restore error: %v", err)
restoreErrors = append(restoreErrors, err)
// Attempt to collect logs and debug.zip at the time of this
// restore failure; if we can't, log the error encountered and
// move on.
restoreErr, collectionErr := mvb.collectFailureArtifacts(ctx, l, err, len(restoreErrors)+1)
if collectionErr != nil {
l.Printf("could not collect failure artifacts: %v", collectionErr)
}
restoreErrors = append(restoreErrors, restoreErr)
}
}
}

verify(h.Context().FromVersion)
verify(clusterupgrade.MainVersion)

// If the context was canceled (most likely due to a test timeout),
// return early. In these cases, it's likely that `restoreErrors`
// will have a number of "restore failures" that all happened
// because the underlying context was canceled, so proceeding with
// the error reporting logic below is confusing, as it makes it look
// like multiple failures occurred. It also makes the actually
// important "timed out" message less prominent.
if err := ctx.Err(); err != nil {
return err
}

if len(restoreErrors) > 0 {
if len(restoreErrors) == 1 {
// Simplify error reporting if only one error was found.
Expand All @@ -2011,7 +2061,7 @@ func (mvb *mixedVersionBackup) verifyAllBackups(

msgs := make([]string, 0, len(restoreErrors))
for j, err := range restoreErrors {
msgs = append(msgs, fmt.Sprintf("%d: %s", j, err.Error()))
msgs = append(msgs, fmt.Sprintf("%d: %s", j+1, err.Error()))
}
return fmt.Errorf("%d errors during restore:\n%s", len(restoreErrors), strings.Join(msgs, "\n"))
}
Expand Down Expand Up @@ -2066,7 +2116,7 @@ func registerBackupMixedVersion(r registry.Registry) {
Arg("{pgurl%s}", roachNodes).
Option("tolerate-errors")

backupTest := newMixedVersionBackup(c, roachNodes, "bank", "tpcc")
backupTest := newMixedVersionBackup(t, c, roachNodes, "bank", "tpcc")

mvt.OnStartup("set short job interval", backupTest.setShortJobIntervals)
mvt.OnStartup("take backup in previous version", backupTest.takePreviousVersionBackup)
Expand Down