From 86c4583dc2a1e6995fed92d4b4d6400f117bedc7 Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Tue, 13 Jun 2023 13:33:46 -0400 Subject: [PATCH] roachtest: collect failure artifacts when restore fails This commit updates the `backup-restore/mixed-version` roachtest to collect artifacts (cockroach logs and a debug.zip) when a restore fails in the last step of the test (when all backups taken are restored). In that step, we do not immediately fail the test when a restore fails but instead attempt to restore every backup and return a list of errors found when the process is done. However, restoring cluster backups involves wiping the cluster which also deletes existing cockroach logs up to that point. This makes debugging a restore failure that happened prior to a cluster restore impossible. After this commit, a restore failure in that test will cause a `restore_failure_N` directory to be created in the artifacts directory, including the cockroach logs collected right after the failure, as well as a debug.zip created at the same time. This will make issues such as #104604 more actionable. Epic: none Release note: None --- pkg/cmd/roachtest/cluster.go | 6 +-- .../roachtest/cluster/cluster_interface.go | 1 + pkg/cmd/roachtest/test_runner.go | 2 +- .../roachtest/tests/mixed_version_backup.go | 46 +++++++++++++++++-- 4 files changed, 46 insertions(+), 9 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 07ff5c2d8bc4..563687c2595b 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1473,8 +1473,8 @@ COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob cockroach start-single-node --insecure } // FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`. -// The logs will be placed in the test's artifacts dir. -func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error { +// The logs will be placed at `dest`, relative to the test's artifacts dir. +func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error { if c.spec.NodeCount == 0 { // No nodes can happen during unit tests and implies nothing to do. return nil @@ -1486,7 +1486,7 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error // Don't hang forever if we can't fetch the debug zip. return timeutil.RunWithTimeout(ctx, "debug zip", 5*time.Minute, func(ctx context.Context) error { const zipName = "debug.zip" - path := filepath.Join(c.t.ArtifactsDir(), zipName) + path := filepath.Join(c.t.ArtifactsDir(), dest) if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { return err } diff --git a/pkg/cmd/roachtest/cluster/cluster_interface.go b/pkg/cmd/roachtest/cluster/cluster_interface.go index fd2547f70bf0..62b80862d1cb 100644 --- a/pkg/cmd/roachtest/cluster/cluster_interface.go +++ b/pkg/cmd/roachtest/cluster/cluster_interface.go @@ -134,6 +134,7 @@ type Cluster interface { ) error FetchTimeseriesData(ctx context.Context, l *logger.Logger) error + FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error RefetchCertsFromNode(ctx context.Context, node int) error StartGrafana(ctx context.Context, l *logger.Logger, promCfg *prometheus.Config) error diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 800d05b045b5..b362d56a015d 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1317,7 +1317,7 @@ func (r *testRunner) collectArtifacts( if err := c.FetchTimeseriesData(ctx, t.L()); err != nil { t.L().Printf("failed to fetch timeseries data: %s", err) } - if err := c.FetchDebugZip(ctx, t.L()); err != nil { + if err := c.FetchDebugZip(ctx, t.L(), "debug.zip"); err != nil { t.L().Printf("failed to collect zip: %s", err) } }) diff --git a/pkg/cmd/roachtest/tests/mixed_version_backup.go b/pkg/cmd/roachtest/tests/mixed_version_backup.go index 54ba4fec1bc8..888412fcdc4f 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_backup.go +++ b/pkg/cmd/roachtest/tests/mixed_version_backup.go @@ -16,6 +16,7 @@ import ( "encoding/json" "fmt" "math/rand" + "os" "path/filepath" "reflect" "regexp" @@ -986,6 +987,7 @@ func backupCollectionDesc(fullSpec, incSpec backupSpec) string { // state involved in the mixed-version backup test. type mixedVersionBackup struct { cluster cluster.Cluster + t test.Test roachNodes option.NodeListOption // backup collections that are created along the test collections []*backupCollection @@ -1010,13 +1012,13 @@ type mixedVersionBackup struct { } func newMixedVersionBackup( - c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string, + t test.Test, c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string, ) *mixedVersionBackup { var tablesLoaded atomic.Bool tablesLoaded.Store(false) return &mixedVersionBackup{ - cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded, + t: t, cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded, } } @@ -1835,6 +1837,33 @@ func (mvb *mixedVersionBackup) checkFiles( return h.Exec(rng, checkFilesStmt) } +// collectFailureArtifacts fetches cockroach logs and a debug.zip and +// saves them to a directory in the test's artifacts dir. This is done +// so that we can report multiple restore failures in the same test, +// and make each failure actionable. If artifacts cannot be collected, +// the original restore error is returned, along with the error +// encountered while fetching the artifacts. +func (mvb *mixedVersionBackup) collectFailureArtifacts( + ctx context.Context, l *logger.Logger, restoreErr error, errID int, +) (error, error) { + dirName := fmt.Sprintf("restore_failure_%d", errID) + rootDir := filepath.Join(mvb.t.ArtifactsDir(), dirName) + logsDir := filepath.Join(rootDir, "logs") + if err := os.MkdirAll(filepath.Dir(logsDir), 0755); err != nil { + return restoreErr, fmt.Errorf("could not create directory %s: %w", rootDir, err) + } + + if err := mvb.cluster.Get(ctx, l, "logs" /* src */, logsDir, mvb.roachNodes); err != nil { + return restoreErr, fmt.Errorf("could not fetch logs: %w", err) + } + zipLocation := filepath.Join(dirName, "debug.zip") + if err := mvb.cluster.FetchDebugZip(ctx, l, zipLocation); err != nil { + return restoreErr, err + } + + return fmt.Errorf("%w (artifacts collected in %s)", restoreErr, dirName), nil +} + // verifyBackupCollection restores the backup collection passed and // verifies that the contents after the restore match the contents // when the backup was taken. @@ -1995,7 +2024,14 @@ func (mvb *mixedVersionBackup) verifyAllBackups( } if err := mvb.verifyBackupCollection(ctx, l, rng, h, collection, version); err != nil { l.Printf("restore error: %v", err) - restoreErrors = append(restoreErrors, err) + // Attempt to collect logs and debug.zip at the time of this + // restore failure; if we can't, log the error encountered and + // move on. + restoreErr, collectionErr := mvb.collectFailureArtifacts(ctx, l, err, len(restoreErrors)+1) + if collectionErr != nil { + l.Printf("could not collect failure artifacts: %v", collectionErr) + } + restoreErrors = append(restoreErrors, restoreErr) } } } @@ -2011,7 +2047,7 @@ func (mvb *mixedVersionBackup) verifyAllBackups( msgs := make([]string, 0, len(restoreErrors)) for j, err := range restoreErrors { - msgs = append(msgs, fmt.Sprintf("%d: %s", j, err.Error())) + msgs = append(msgs, fmt.Sprintf("%d: %s", j+1, err.Error())) } return fmt.Errorf("%d errors during restore:\n%s", len(restoreErrors), strings.Join(msgs, "\n")) } @@ -2066,7 +2102,7 @@ func registerBackupMixedVersion(r registry.Registry) { Arg("{pgurl%s}", roachNodes). Option("tolerate-errors") - backupTest := newMixedVersionBackup(c, roachNodes, "bank", "tpcc") + backupTest := newMixedVersionBackup(t, c, roachNodes, "bank", "tpcc") mvt.OnStartup("set short job interval", backupTest.setShortJobIntervals) mvt.OnStartup("take backup in previous version", backupTest.takePreviousVersionBackup)