Skip to content

Commit

Permalink
Merge pull request #104910 from renatolabs/backport23.1-104868
Browse files Browse the repository at this point in the history
  • Loading branch information
renatolabs authored Jun 14, 2023
2 parents 6c40223 + a1b1588 commit d7157c2
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 11 deletions.
6 changes: 3 additions & 3 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1466,8 +1466,8 @@ COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob cockroach start-single-node --insecure
}

// FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`.
// The logs will be placed in the test's artifacts dir.
func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error {
// The logs will be placed at `dest`, relative to the test's artifacts dir.
func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
Expand All @@ -1479,7 +1479,7 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error
// Don't hang forever if we can't fetch the debug zip.
return contextutil.RunWithTimeout(ctx, "debug zip", 5*time.Minute, func(ctx context.Context) error {
const zipName = "debug.zip"
path := filepath.Join(c.t.ArtifactsDir(), zipName)
path := filepath.Join(c.t.ArtifactsDir(), dest)
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/cluster/cluster_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ type Cluster interface {
) error

FetchTimeseriesData(ctx context.Context, l *logger.Logger) error
FetchDebugZip(ctx context.Context, l *logger.Logger, dest string) error
RefetchCertsFromNode(ctx context.Context, node int) error

StartGrafana(ctx context.Context, l *logger.Logger, promCfg *prometheus.Config) error
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1266,7 +1266,7 @@ func (r *testRunner) collectClusterArtifacts(
if err := c.FetchTimeseriesData(ctx, l); err != nil {
l.Printf("failed to fetch timeseries data: %s", err)
}
if err := c.FetchDebugZip(ctx, l); err != nil {
if err := c.FetchDebugZip(ctx, l, "debug.zip"); err != nil {
l.Printf("failed to collect zip: %s", err)
}
}
Expand Down
64 changes: 57 additions & 7 deletions pkg/cmd/roachtest/tests/mixed_version_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"encoding/json"
"fmt"
"math/rand"
"os"
"path/filepath"
"reflect"
"regexp"
Expand Down Expand Up @@ -986,6 +987,7 @@ func backupCollectionDesc(fullSpec, incSpec backupSpec) string {
// state involved in the mixed-version backup test.
type mixedVersionBackup struct {
cluster cluster.Cluster
t test.Test
roachNodes option.NodeListOption
// backup collections that are created along the test
collections []*backupCollection
Expand All @@ -1010,13 +1012,13 @@ type mixedVersionBackup struct {
}

func newMixedVersionBackup(
c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string,
t test.Test, c cluster.Cluster, roachNodes option.NodeListOption, dbs ...string,
) *mixedVersionBackup {
var tablesLoaded atomic.Bool
tablesLoaded.Store(false)

return &mixedVersionBackup{
cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded,
t: t, cluster: c, dbs: dbs, roachNodes: roachNodes, tablesLoaded: &tablesLoaded,
}
}

Expand Down Expand Up @@ -1692,7 +1694,10 @@ func (mvb *mixedVersionBackup) disableJobAdoption(
if err := retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error {
db := h.Connect(node)
var count int
err := db.QueryRow(`SELECT count(*) FROM [SHOW JOBS] WHERE status = 'running'`).Scan(&count)
err := db.QueryRow(fmt.Sprintf(
`SELECT count(*) FROM [SHOW JOBS] WHERE status = 'running' AND coordinator_id = %d`,
node,
)).Scan(&count)
if err != nil {
l.Printf("node %d: error querying running jobs (%s)", node, err)
return err
Expand Down Expand Up @@ -1797,7 +1802,7 @@ func (mvb *mixedVersionBackup) planAndRunBackups(
}

if len(tc.FromVersionNodes) > 0 {
const numCollections = 3
const numCollections = 2
rng.Shuffle(len(collectionSpecs), func(i, j int) {
collectionSpecs[i], collectionSpecs[j] = collectionSpecs[j], collectionSpecs[i]
})
Expand Down Expand Up @@ -1835,6 +1840,33 @@ func (mvb *mixedVersionBackup) checkFiles(
return h.Exec(rng, checkFilesStmt)
}

// collectFailureArtifacts fetches cockroach logs and a debug.zip and
// saves them to a directory in the test's artifacts dir. This is done
// so that we can report multiple restore failures in the same test,
// and make each failure actionable. If artifacts cannot be collected,
// the original restore error is returned, along with the error
// encountered while fetching the artifacts.
func (mvb *mixedVersionBackup) collectFailureArtifacts(
ctx context.Context, l *logger.Logger, restoreErr error, errID int,
) (error, error) {
dirName := fmt.Sprintf("restore_failure_%d", errID)
rootDir := filepath.Join(mvb.t.ArtifactsDir(), dirName)
logsDir := filepath.Join(rootDir, "logs")
if err := os.MkdirAll(filepath.Dir(logsDir), 0755); err != nil {
return restoreErr, fmt.Errorf("could not create directory %s: %w", rootDir, err)
}

if err := mvb.cluster.Get(ctx, l, "logs" /* src */, logsDir, mvb.roachNodes); err != nil {
return restoreErr, fmt.Errorf("could not fetch logs: %w", err)
}
zipLocation := filepath.Join(dirName, "debug.zip")
if err := mvb.cluster.FetchDebugZip(ctx, l, zipLocation); err != nil {
return restoreErr, err
}

return fmt.Errorf("%w (artifacts collected in %s)", restoreErr, dirName), nil
}

// verifyBackupCollection restores the backup collection passed and
// verifies that the contents after the restore match the contents
// when the backup was taken.
Expand Down Expand Up @@ -1995,14 +2027,32 @@ func (mvb *mixedVersionBackup) verifyAllBackups(
}
if err := mvb.verifyBackupCollection(ctx, l, rng, h, collection, version); err != nil {
l.Printf("restore error: %v", err)
restoreErrors = append(restoreErrors, err)
// Attempt to collect logs and debug.zip at the time of this
// restore failure; if we can't, log the error encountered and
// move on.
restoreErr, collectionErr := mvb.collectFailureArtifacts(ctx, l, err, len(restoreErrors)+1)
if collectionErr != nil {
l.Printf("could not collect failure artifacts: %v", collectionErr)
}
restoreErrors = append(restoreErrors, restoreErr)
}
}
}

verify(h.Context().FromVersion)
verify(clusterupgrade.MainVersion)

// If the context was canceled (most likely due to a test timeout),
// return early. In these cases, it's likely that `restoreErrors`
// will have a number of "restore failures" that all happened
// because the underlying context was canceled, so proceeding with
// the error reporting logic below is confusing, as it makes it look
// like multiple failures occurred. It also makes the actually
// important "timed out" message less prominent.
if err := ctx.Err(); err != nil {
return err
}

if len(restoreErrors) > 0 {
if len(restoreErrors) == 1 {
// Simplify error reporting if only one error was found.
Expand All @@ -2011,7 +2061,7 @@ func (mvb *mixedVersionBackup) verifyAllBackups(

msgs := make([]string, 0, len(restoreErrors))
for j, err := range restoreErrors {
msgs = append(msgs, fmt.Sprintf("%d: %s", j, err.Error()))
msgs = append(msgs, fmt.Sprintf("%d: %s", j+1, err.Error()))
}
return fmt.Errorf("%d errors during restore:\n%s", len(restoreErrors), strings.Join(msgs, "\n"))
}
Expand Down Expand Up @@ -2066,7 +2116,7 @@ func registerBackupMixedVersion(r registry.Registry) {
Arg("{pgurl%s}", roachNodes).
Option("tolerate-errors")

backupTest := newMixedVersionBackup(c, roachNodes, "bank", "tpcc")
backupTest := newMixedVersionBackup(t, c, roachNodes, "bank", "tpcc")

mvt.OnStartup("set short job interval", backupTest.setShortJobIntervals)
mvt.OnStartup("take backup in previous version", backupTest.takePreviousVersionBackup)
Expand Down

0 comments on commit d7157c2

Please sign in to comment.