From 956aeafa45d22a3601649524820e1d7c76c13263 Mon Sep 17 00:00:00 2001 From: Miral Gadani Date: Thu, 25 May 2023 15:26:34 +0000 Subject: [PATCH] roachtest: categorise errors that occur during test teardown Epic: none Fixes: #98366 Release note: None --- pkg/cmd/roachtest/cluster.go | 22 ++++++++++------------ pkg/cmd/roachtest/github.go | 5 +++++ pkg/cmd/roachtest/github_test.go | 4 ++++ pkg/cmd/roachtest/test_impl.go | 5 ++--- pkg/cmd/roachtest/test_runner.go | 22 ++++++++++++++++------ pkg/cmd/roachtest/tests/BUILD.bazel | 1 + 6 files changed, 38 insertions(+), 21 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 51b1473710a3..3a8e6ae740ac 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1609,33 +1609,31 @@ func (c *clusterImpl) HealthStatus( return results, nil } -// FailOnInvalidDescriptors fails the test if there exists any descriptors in +// assertValidDescriptors fails the test if there exists any descriptors in // the crdb_internal.invalid_objects virtual table. -func (c *clusterImpl) FailOnInvalidDescriptors(ctx context.Context, db *gosql.DB, t *testImpl) { +func (c *clusterImpl) assertValidDescriptors(ctx context.Context, db *gosql.DB, t *testImpl) error { t.L().Printf("checking for invalid descriptors") - if err := contextutil.RunWithTimeout( + return contextutil.RunWithTimeout( ctx, "invalid descriptors check", 1*time.Minute, func(ctx context.Context) error { return roachtestutil.CheckInvalidDescriptors(ctx, db) }, - ); err != nil { - t.Errorf("invalid descriptors check failed: %v", err) - } + ) } -// FailOnReplicaDivergence fails the test if +// assertConsistentReplicas fails the test if // crdb_internal.check_consistency(true, '', '') indicates that any ranges' // replicas are inconsistent with each other. 
-func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, db *gosql.DB, t *testImpl) { +func (c *clusterImpl) assertConsistentReplicas( + ctx context.Context, db *gosql.DB, t *testImpl, +) error { t.L().Printf("checking for replica divergence") - if err := contextutil.RunWithTimeout( + return contextutil.RunWithTimeout( ctx, "consistency check", 5*time.Minute, func(ctx context.Context) error { return roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db) }, - ); err != nil { - t.Errorf("consistency check failed: %v", err) - } + ) } // FetchDmesg grabs the dmesg logs if possible. This requires being able to run diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index c54cc79956fd..7fb83a3b7ea0 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -38,6 +38,7 @@ const ( otherErr issueCategory = iota clusterCreationErr sshErr + teardownErr ) func newGithubIssues(disable bool, c *clusterImpl, vmCreateOpts *vm.CreateOpts) *githubIssues { @@ -124,6 +125,8 @@ func (g *githubIssues) createPostRequest( issueOwner = registry.OwnerTestEng issueName = "ssh_problem" messagePrefix = fmt.Sprintf("test %s failed due to ", t.Name()) + } else if cat == teardownErr { + messagePrefix = fmt.Sprintf("test %s failed during teardown (see teardown.log) due to ", t.Name()) } // Issues posted from roachtest are identifiable as such and @@ -219,6 +222,8 @@ func (g *githubIssues) MaybePost(t *testImpl, l *logger.Logger, message string) cat = clusterCreationErr } else if failureContainsError(firstFailure, rperrors.ErrSSH255) { cat = sshErr + } else if failureContainsError(firstFailure, errDuringTeardown) { + cat = teardownErr } return g.issuePoster( diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 58d0d898cf3c..cb7cdb41a244 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -147,6 +147,8 @@ func TestCreatePostRequest(t *testing.T) { }, //Simulate failure loading 
TEAMS.yaml {true, false, true, false, "", otherErr, false, false, nil}, + //Error during teardown + {true, false, false, false, "", teardownErr, false, false, nil}, } reg := makeTestRegistry(spec.GCE, "", "", false, false) @@ -219,6 +221,8 @@ func TestCreatePostRequest(t *testing.T) { expectedLabel = "T-testeng" expectedName = "ssh_problem" expectedMessagePrefix = "test github_test failed due to " + } else if c.category == teardownErr { + expectedMessagePrefix = "test github_test failed during teardown (see teardown.log) due to " } require.Contains(t, req.MentionOnCreate, expectedTeam) diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index 3e3222d82eef..5feb6d23e255 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -366,8 +366,7 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) { t.mu.output = append(t.mu.output, '\n') } -// We take the first error from each failure which is the -// "squashed" error that contains all information of a failure +// We take the "squashed" error that contains information of all the errors for each failure. 
func formatFailure(b *strings.Builder, reportFailures ...failure) { for i, failure := range reportFailures { if i > 0 { @@ -398,7 +397,7 @@ func (t *testImpl) failedRLocked() bool { func (t *testImpl) firstFailure() failure { t.mu.RLock() defer t.mu.RUnlock() - if len(t.mu.failures) <= 0 { + if len(t.mu.failures) == 0 { return failure{} } return t.mu.failures[0] diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 303921a5dbee..392290d8a5ae 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -61,6 +61,9 @@ var ( // reference error used when cluster creation fails for a test errClusterProvisioningFailed = fmt.Errorf("cluster could not be created") + // reference error for any failures during test teardown + errDuringTeardown = fmt.Errorf("error during test tear down") + prometheusNameSpace = "roachtest" // prometheusScrapeInterval should be consistent with the scrape interval defined in // https://grafana.testeng.crdb.io/prometheus/config @@ -943,7 +946,7 @@ func (r *testRunner) runTest( durationStr := fmt.Sprintf("%.2fs", t.duration().Seconds()) if t.Failed() { - output := fmt.Sprintf("test artifacts and logs in: %s\n%s", t.ArtifactsDir(), t.failureMsg()) + output := fmt.Sprintf("%s\ntest artifacts and logs in: %s", t.failureMsg(), t.ArtifactsDir()) if teamCity { // If `##teamcity[testFailed ...]` is not present before `##teamCity[testFinished ...]`, @@ -1065,7 +1068,7 @@ func (r *testRunner) runTest( if t.Failed() { s = "failure" } - t.L().Printf("tearing down after %s; see teardown.log", s) + t.L().Printf("tearing down after %s; see teardown.log (subsequent failures may still occur there)", s) case <-time.After(timeout): // NB: We're adding the timeout failure intentionally without cancelling the context // to capture as much state as possible during artifact collection. 
@@ -1090,6 +1093,9 @@ func (r *testRunner) teardownTest( ctx context.Context, t *testImpl, c *clusterImpl, timedOut bool, ) error { + teardownError := func(err error) { + t.Error(errors.Mark(err, errDuringTeardown)) + } // We still have to collect artifacts and run post-flight checks, and any of // these might hang. So they go into a goroutine and the main goroutine // abandons them after a timeout. We intentionally don't wait for the @@ -1151,7 +1157,7 @@ if err := c.assertNoDeadNode(ctx, t); err != nil { // Some tests expect dead nodes, so they may opt out of this check. if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 { - t.Error(err) + teardownError(err) } else { t.L().Printf("dead node(s) detected but expected") } @@ -1161,7 +1167,7 @@ // and select the first one that succeeds to run the validation queries statuses, err := c.HealthStatus(ctx, t.L(), c.All()) if err != nil { - t.Error(errors.WithDetail(err, "Unable to check health status")) + teardownError(errors.WithDetail(err, "Unable to check health status")) } var db *gosql.DB @@ -1198,12 +1204,16 @@ // If this validation fails due to a timeout, it is very likely that // the replica divergence check below will also fail. if t.spec.SkipPostValidations&registry.PostValidationInvalidDescriptors == 0 { - c.FailOnInvalidDescriptors(ctx, db, t) + if err := c.assertValidDescriptors(ctx, db, t); err != nil { + teardownError(errors.WithDetail(err, "invalid descriptors check failed")) + } } // Detect replica divergence (i.e. ranges in which replicas have arrived // at the same log position with different states). 
if t.spec.SkipPostValidations&registry.PostValidationReplicaDivergence == 0 { - c.FailOnReplicaDivergence(ctx, db, t) + if err := c.assertConsistentReplicas(ctx, db, t); err != nil { + teardownError(errors.WithDetail(err, "consistency check failed")) + } } } else { t.L().Printf("no live node found, skipping validation checks") diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index b075d0fe9b3a..49649b81349d 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -52,6 +52,7 @@ go_library( "drain.go", "drop.go", "drt.go", + "dummy.go", "encryption.go", "event_log.go", "failover.go",