Skip to content

Commit

Permalink
roachtest: collect failure artifacts when restore fails
Browse files Browse the repository at this point in the history
This commit updates the `backup-restore/mixed-version` roachtest to
collect artifacts (cockroach logs and a debug.zip) when a restore
fails in the last step of the test (when all backups taken are
restored). In that step, we do not immediately fail the test when a
restore fails but instead attempt to restore every backup and return a
list of errors found when the process is done. However, restoring
cluster backups involves wiping the cluster which also deletes
existing cockroach logs up to that point. This makes debugging a
restore failure that happened prior to a cluster restore impossible.

After this commit, a restore failure in that test will cause a
`restore_failure_N` directory to be created in the artifacts
directory, including the cockroach logs collected right after the
failure, as well as a debug.zip created at the same time.

This will make issues such as cockroachdb#104604 more actionable.

Epic: none

Release note: None
  • Loading branch information
renatolabs committed Jun 14, 2023
1 parent f88077d commit 86c4583
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 9 deletions.
6 changes: 3 additions & 3 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1473,8 +1473,8 @@ COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob cockroach start-single-node --insecure
}

// FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`.
// The logs will be placed in the test's artifacts dir.
func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error {
// The logs will be placed at `dest`, relative to the test's artifacts dir.
func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
Expand All @@ -1486,7 +1486,7 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error
// Don't hang forever if we can't fetch the debug zip.
return timeutil.RunWithTimeout(ctx, "debug zip", 5*time.Minute, func(ctx context.Context) error {
const zipName = "debug.zip"
path := filepath.Join(c.t.ArtifactsDir(), zipName)
path := filepath.Join(c.t.ArtifactsDir(), dest)
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/cluster/cluster_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ type Cluster interface {
) error

FetchTimeseriesData(ctx context.Context, l *logger.Logger) error
FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error
RefetchCertsFromNode(ctx context.Context, node int) error

StartGrafana(ctx context.Context, l *logger.Logger, promCfg *prometheus.Config) error
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1317,7 +1317,7 @@ func (r *testRunner) collectArtifacts(
if err := c.FetchTimeseriesData(ctx, t.L()); err != nil {
t.L().Printf("failed to fetch timeseries data: %s", err)
}
if err := c.FetchDebugZip(ctx, t.L()); err != nil {
if err := c.FetchDebugZip(ctx, t.L(), "debug.zip"); err != nil {
t.L().Printf("failed to collect zip: %s", err)
}
})
Expand Down
46 changes: 41 additions & 5 deletions pkg/cmd/roachtest/tests/mixed_version_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"encoding/json"
"fmt"
"math/rand"
"os"
"path/filepath"
"reflect"
"regexp"
Expand Down Expand Up @@ -986,6 +987,7 @@ func backupCollectionDesc(fullSpec, incSpec backupSpec) string {
// state involved in the mixed-version backup test.
type mixedVersionBackup struct {
cluster cluster.Cluster
t test.Test
roachNodes option.NodeListOption
// backup collections that are created along the test
collections []*backupCollection
Expand All @@ -1010,13 +1012,13 @@ type mixedVersionBackup struct {
}

func newMixedVersionBackup(
c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string,
t test.Test, c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string,
) *mixedVersionBackup {
var tablesLoaded atomic.Bool
tablesLoaded.Store(false)

return &mixedVersionBackup{
cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded,
t: t, cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded,
}
}

Expand Down Expand Up @@ -1835,6 +1837,33 @@ func (mvb *mixedVersionBackup) checkFiles(
return h.Exec(rng, checkFilesStmt)
}

// collectFailureArtifacts fetches cockroach logs and a debug.zip and
// saves them to a directory in the test's artifacts dir. This is done
// so that we can report multiple restore failures in the same test,
// and make each failure actionable. If artifacts cannot be collected,
// the original restore error is returned, along with the error
// encountered while fetching the artifacts.
func (mvb *mixedVersionBackup) collectFailureArtifacts(
ctx context.Context, l *logger.Logger, restoreErr error, errID int,
) (error, error) {
dirName := fmt.Sprintf("restore_failure_%d", errID)
rootDir := filepath.Join(mvb.t.ArtifactsDir(), dirName)
logsDir := filepath.Join(rootDir, "logs")
if err := os.MkdirAll(filepath.Dir(logsDir), 0755); err != nil {
return restoreErr, fmt.Errorf("could not create directory %s: %w", rootDir, err)
}

if err := mvb.cluster.Get(ctx, l, "logs" /* src */, logsDir, mvb.roachNodes); err != nil {
return restoreErr, fmt.Errorf("could not fetch logs: %w", err)
}
zipLocation := filepath.Join(dirName, "debug.zip")
if err := mvb.cluster.FetchDebugZip(ctx, l, zipLocation); err != nil {
return restoreErr, err
}

return fmt.Errorf("%w (artifacts collected in %s)", restoreErr, dirName), nil
}

// verifyBackupCollection restores the backup collection passed and
// verifies that the contents after the restore match the contents
// when the backup was taken.
Expand Down Expand Up @@ -1995,7 +2024,14 @@ func (mvb *mixedVersionBackup) verifyAllBackups(
}
if err := mvb.verifyBackupCollection(ctx, l, rng, h, collection, version); err != nil {
l.Printf("restore error: %v", err)
restoreErrors = append(restoreErrors, err)
// Attempt to collect logs and debug.zip at the time of this
// restore failure; if we can't, log the error encountered and
// move on.
restoreErr, collectionErr := mvb.collectFailureArtifacts(ctx, l, err, len(restoreErrors)+1)
if collectionErr != nil {
l.Printf("could not collect failure artifacts: %v", collectionErr)
}
restoreErrors = append(restoreErrors, restoreErr)
}
}
}
Expand All @@ -2011,7 +2047,7 @@ func (mvb *mixedVersionBackup) verifyAllBackups(

msgs := make([]string, 0, len(restoreErrors))
for j, err := range restoreErrors {
msgs = append(msgs, fmt.Sprintf("%d: %s", j, err.Error()))
msgs = append(msgs, fmt.Sprintf("%d: %s", j+1, err.Error()))
}
return fmt.Errorf("%d errors during restore:\n%s", len(restoreErrors), strings.Join(msgs, "\n"))
}
Expand Down Expand Up @@ -2066,7 +2102,7 @@ func registerBackupMixedVersion(r registry.Registry) {
Arg("{pgurl%s}", roachNodes).
Option("tolerate-errors")

backupTest := newMixedVersionBackup(c, roachNodes, "bank", "tpcc")
backupTest := newMixedVersionBackup(t, c, roachNodes, "bank", "tpcc")

mvt.OnStartup("set short job interval", backupTest.setShortJobIntervals)
mvt.OnStartup("take backup in previous version", backupTest.takePreviousVersionBackup)
Expand Down

0 comments on commit 86c4583

Please sign in to comment.