From 3ce32e34ca62f512a7f1b02db9384e8e6f5fcc58 Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Tue, 13 Jun 2023 13:33:46 -0400 Subject: [PATCH 1/3] roachtest: collect failure artifacts when restore fails This commit updates the `backup-restore/mixed-version` roachtest to collect artifacts (cockroach logs and a debug.zip) when a restore fails in the last step of the test (when all backups taken are restored). In that step, we do not immediately fail the test when a restore fails but instead attempt to restore every backup and return a list of errors found when the process is done. However, restoring cluster backups involves wiping the cluster which also deletes existing cockroach logs up to that point. This makes debugging a restore failure that happened prior to a cluster restore impossible. After this commit, a restore failure in that test will cause a `restore_failure_N` directory to be created in the artifacts directory, including the cockroach logs collected right after the failure, as well as a debug.zip created at the same time. This will make issues such as #104604 more actionable. Epic: none Release note: None --- pkg/cmd/roachtest/cluster.go | 6 +-- .../roachtest/cluster/cluster_interface.go | 1 + pkg/cmd/roachtest/test_runner.go | 2 +- .../roachtest/tests/mixed_version_backup.go | 46 +++++++++++++++++-- 4 files changed, 46 insertions(+), 9 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 4c497acb1074..0552758991f1 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1466,8 +1466,8 @@ COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob cockroach start-single-node --insecure } // FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`. -// The logs will be placed in the test's artifacts dir. -func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error { +// The logs will be placed at `dest`, relative to the test's artifacts dir. +func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error { if c.spec.NodeCount == 0 { // No nodes can happen during unit tests and implies nothing to do. return nil @@ -1479,7 +1479,7 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error // Don't hang forever if we can't fetch the debug zip. return contextutil.RunWithTimeout(ctx, "debug zip", 5*time.Minute, func(ctx context.Context) error { const zipName = "debug.zip" - path := filepath.Join(c.t.ArtifactsDir(), zipName) + path := filepath.Join(c.t.ArtifactsDir(), dest) if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { return err } diff --git a/pkg/cmd/roachtest/cluster/cluster_interface.go b/pkg/cmd/roachtest/cluster/cluster_interface.go index 547275d32e23..181c7360656c 100644 --- a/pkg/cmd/roachtest/cluster/cluster_interface.go +++ b/pkg/cmd/roachtest/cluster/cluster_interface.go @@ -134,6 +134,7 @@ type Cluster interface { ) error FetchTimeseriesData(ctx context.Context, l *logger.Logger) error + FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error RefetchCertsFromNode(ctx context.Context, node int) error StartGrafana(ctx context.Context, l *logger.Logger, promCfg *prometheus.Config) error diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index f9ed1eb1dc63..0f39e2df2847 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1266,7 +1266,7 @@ func (r *testRunner) collectClusterArtifacts( if err := c.FetchTimeseriesData(ctx, l); err != nil { l.Printf("failed to fetch timeseries data: %s", err) } - if err := c.FetchDebugZip(ctx, l); err != nil { + if err := c.FetchDebugZip(ctx, l, "debug.zip"); err != nil { l.Printf("failed to collect zip: %s", err) } } diff --git a/pkg/cmd/roachtest/tests/mixed_version_backup.go b/pkg/cmd/roachtest/tests/mixed_version_backup.go index 54ba4fec1bc8..888412fcdc4f 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_backup.go +++ b/pkg/cmd/roachtest/tests/mixed_version_backup.go @@ -16,6 +16,7 @@ import ( "encoding/json" "fmt" "math/rand" + "os" "path/filepath" "reflect" "regexp" @@ -986,6 +987,7 @@ func backupCollectionDesc(fullSpec, incSpec backupSpec) string { // state involved in the mixed-version backup test. type mixedVersionBackup struct { cluster cluster.Cluster + t test.Test roachNodes option.NodeListOption // backup collections that are created along the test collections []*backupCollection @@ -1010,13 +1012,13 @@ type mixedVersionBackup struct { } func newMixedVersionBackup( - c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string, + t test.Test, c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string, ) *mixedVersionBackup { var tablesLoaded atomic.Bool tablesLoaded.Store(false) return &mixedVersionBackup{ - cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded, + t: t, cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded, } } @@ -1835,6 +1837,33 @@ func (mvb *mixedVersionBackup) checkFiles( return h.Exec(rng, checkFilesStmt) } +// collectFailureArtifacts fetches cockroach logs and a debug.zip and +// saves them to a directory in the test's artifacts dir. This is done +// so that we can report multiple restore failures in the same test, +// and make each failure actionable. If artifacts cannot be collected, +// the original restore error is returned, along with the error +// encountered while fetching the artifacts. +func (mvb *mixedVersionBackup) collectFailureArtifacts( + ctx context.Context, l *logger.Logger, restoreErr error, errID int, +) (error, error) { + dirName := fmt.Sprintf("restore_failure_%d", errID) + rootDir := filepath.Join(mvb.t.ArtifactsDir(), dirName) + logsDir := filepath.Join(rootDir, "logs") + if err := os.MkdirAll(filepath.Dir(logsDir), 0755); err != nil { + return restoreErr, fmt.Errorf("could not create directory %s: %w", rootDir, err) + } + + if err := mvb.cluster.Get(ctx, l, "logs" /* src */, logsDir, mvb.roachNodes); err != nil { + return restoreErr, fmt.Errorf("could not fetch logs: %w", err) + } + zipLocation := filepath.Join(dirName, "debug.zip") + if err := mvb.cluster.FetchDebugZip(ctx, l, zipLocation); err != nil { + return restoreErr, err + } + + return fmt.Errorf("%w (artifacts collected in %s)", restoreErr, dirName), nil +} + // verifyBackupCollection restores the backup collection passed and // verifies that the contents after the restore match the contents // when the backup was taken. @@ -1995,7 +2024,14 @@ func (mvb *mixedVersionBackup) verifyAllBackups( } if err := mvb.verifyBackupCollection(ctx, l, rng, h, collection, version); err != nil { l.Printf("restore error: %v", err) - restoreErrors = append(restoreErrors, err) + // Attempt to collect logs and debug.zip at the time of this + // restore failure; if we can't, log the error encountered and + // move on. + restoreErr, collectionErr := mvb.collectFailureArtifacts(ctx, l, err, len(restoreErrors)+1) + if collectionErr != nil { + l.Printf("could not collect failure artifacts: %v", collectionErr) + } + restoreErrors = append(restoreErrors, restoreErr) } } } @@ -2011,7 +2047,7 @@ func (mvb *mixedVersionBackup) verifyAllBackups( msgs := make([]string, 0, len(restoreErrors)) for j, err := range restoreErrors { - msgs = append(msgs, fmt.Sprintf("%d: %s", j, err.Error())) + msgs = append(msgs, fmt.Sprintf("%d: %s", j+1, err.Error())) } return fmt.Errorf("%d errors during restore:\n%s", len(restoreErrors), strings.Join(msgs, "\n")) } @@ -2066,7 +2102,7 @@ func registerBackupMixedVersion(r registry.Registry) { Arg("{pgurl%s}", roachNodes). Option("tolerate-errors") - backupTest := newMixedVersionBackup(c, roachNodes, "bank", "tpcc") + backupTest := newMixedVersionBackup(t, c, roachNodes, "bank", "tpcc") mvt.OnStartup("set short job interval", backupTest.setShortJobIntervals) mvt.OnStartup("take backup in previous version", backupTest.takePreviousVersionBackup) From d4d8cc1b7911df40a629d52d487d6ccf6680769b Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Tue, 13 Jun 2023 16:22:26 -0400 Subject: [PATCH 2/3] roachtest: fix query to check for currently running jobs It was missing the `coordinator_id` for the node being waited on. Epic: none Release note: None --- pkg/cmd/roachtest/tests/mixed_version_backup.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/cmd/roachtest/tests/mixed_version_backup.go b/pkg/cmd/roachtest/tests/mixed_version_backup.go index 888412fcdc4f..97e7ff2bb97b 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_backup.go +++ b/pkg/cmd/roachtest/tests/mixed_version_backup.go @@ -1694,7 +1694,10 @@ func (mvb *mixedVersionBackup) disableJobAdoption( if err := retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error { db := h.Connect(node) var count int - err := db.QueryRow(`SELECT count(*) FROM [SHOW JOBS] WHERE status = 'running'`).Scan(&count) + err := db.QueryRow(fmt.Sprintf( + `SELECT count(*) FROM [SHOW JOBS] WHERE status = 'running' AND coordinator_id = %d`, + node, + )).Scan(&count) if err != nil { l.Printf("node %d: error querying running jobs (%s)", node, err) return err From a1b1588b5cdf9734025dfd08ad8246e30764a24a Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Wed, 14 Jun 2023 12:48:36 -0400 Subject: [PATCH 3/3] roachtest: only take create two backups in mixed-version This reduces the chance of a test timeout. Over time, we still get good coverage of the different backup scenarios as they are picked randomly. Epic: none Release note: None --- pkg/cmd/roachtest/tests/mixed_version_backup.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pkg/cmd/roachtest/tests/mixed_version_backup.go b/pkg/cmd/roachtest/tests/mixed_version_backup.go index 97e7ff2bb97b..7ca5f4a9ca84 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_backup.go +++ b/pkg/cmd/roachtest/tests/mixed_version_backup.go @@ -1802,7 +1802,7 @@ func (mvb *mixedVersionBackup) planAndRunBackups( } if len(tc.FromVersionNodes) > 0 { - const numCollections = 3 + const numCollections = 2 rng.Shuffle(len(collectionSpecs), func(i, j int) { collectionSpecs[i], collectionSpecs[j] = collectionSpecs[j], collectionSpecs[i] }) @@ -2042,6 +2042,17 @@ func (mvb *mixedVersionBackup) verifyAllBackups( verify(h.Context().FromVersion) verify(clusterupgrade.MainVersion) + // If the context was canceled (most likely due to a test timeout), + // return early. In these cases, it's likely that `restoreErrors` + // will have a number of "restore failures" that all happened + // because the underlying context was canceled, so proceeding with + // the error reporting logic below is confusing, as it makes it look + // like multiple failures occurred. It also makes the actually + // important "timed out" message less prominent. + if err := ctx.Err(); err != nil { + return err + } + if len(restoreErrors) > 0 { if len(restoreErrors) == 1 { // Simplify error reporting if only one error was found.