From 25045db5319afeeca93ac8d0a2056543f23bdf67 Mon Sep 17 00:00:00 2001 From: Miral Gadani Date: Tue, 7 Feb 2023 16:38:03 +0000 Subject: [PATCH] roachtest: report timeout failures accordingly Timeout failures are recorded at actual timeout, with subsequent failures secondary. `addFailure` accepts a depth parameter and no longer includes context cancellation, which is done separately. Epic: none Fixes: #91237 Release note: None --- pkg/cmd/roachtest/test_impl.go | 49 +++++++++++-------- pkg/cmd/roachtest/test_runner.go | 16 +++--- .../lint/passes/fmtsafe/functions.go | 3 ++ 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index 63a31c139f65..503667ee1d86 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -288,52 +288,47 @@ func collectErrors(args []interface{}) []error { // ATTENTION: Since this calls panic(errTestFatal), it should only be called // from a test's closure. The test runner itself should never call this. func (t *testImpl) Fatal(args ...interface{}) { - t.addFailure("", args...) + t.addFailureAndCancel(1, "", args...) panic(errTestFatal) } // Fatalf is like Fatal, but takes a format string. func (t *testImpl) Fatalf(format string, args ...interface{}) { - t.addFailure(format, args...) + t.addFailureAndCancel(1, format, args...) panic(errTestFatal) } // FailNow implements the TestingT interface. func (t *testImpl) FailNow() { - t.addFailure("FailNow called") + t.addFailureAndCancel(1, "FailNow called") panic(errTestFatal) } // Error implements the TestingT interface func (t *testImpl) Error(args ...interface{}) { - t.addFailure("", args...) + t.addFailureAndCancel(1, "", args...) } // Errorf implements the TestingT interface. func (t *testImpl) Errorf(format string, args ...interface{}) { - t.addFailure(format, args...) + t.addFailureAndCancel(1, format, args...) } -// We take the first error from each failure which is the -// "squashed" error that contains all information of a failure -func formatFailure(b *strings.Builder, reportFailures ...failure) { - for i, failure := range reportFailures { - if i > 0 { - fmt.Fprintln(b) - } - file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr) - if !ok { - file, line, fn = "", 0, "unknown" - } - fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr) +func (t *testImpl) addFailureAndCancel(depth int, format string, args ...interface{}) { + t.addFailure(depth+1, format, args...) + if t.mu.cancel != nil { + t.mu.cancel() } } -func (t *testImpl) addFailure(format string, args ...interface{}) { +// addFailure depth indicates how many stack frames to skip when reporting the +// site of the failure in logs. `0` will report the caller of addFailure, `1` the +// caller of the caller of addFailure, etc. +func (t *testImpl) addFailure(depth int, format string, args ...interface{}) { if format == "" { format = strings.Repeat(" %v", len(args))[1:] } - reportFailure := newFailure(errors.NewWithDepthf(2, format, args...), collectErrors(args)) + reportFailure := newFailure(errors.NewWithDepthf(depth+1, format, args...), collectErrors(args)) t.mu.Lock() defer t.mu.Unlock() @@ -368,8 +363,20 @@ func (t *testImpl) addFailure(format string, args ...interface{}) { t.mu.output = append(t.mu.output, msg...) t.mu.output = append(t.mu.output, '\n') - if t.mu.cancel != nil { - t.mu.cancel() +} + +// We take the first error from each failure which is the +// "squashed" error that contains all information of a failure +func formatFailure(b *strings.Builder, reportFailures ...failure) { + for i, failure := range reportFailures { + if i > 0 { + fmt.Fprintln(b) + } + file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr) + if !ok { + file, line, fn = "", 0, "unknown" + } + fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr) } } diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 53e1cf6ac1f5..ed3050bfbb34 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -984,10 +984,9 @@ func (r *testRunner) runTest( } t.L().Printf("tearing down after %s; see teardown.log", s) case <-time.After(timeout): - // NB: we're intentionally not failing the test if it hasn't - // already. This will be done at the very end of this method, - // after we've collected artifacts. - t.L().Printf("test timed out after %s; check __stacks.log and CRDB logs for goroutine dumps", timeout) + // NB: We're adding the timeout failure intentionally without cancelling the context + // to capture as much state as possible during artifact collection. + t.addFailure(0, "test timed out (%s)", timeout) timedOut = true } @@ -1104,10 +1103,11 @@ func (r *testRunner) teardownTest( // around so someone can poke at it. _ = c.StopE(ctx, t.L(), option.DefaultStopOpts(), c.All()) - // The hung test may, against all odds, still not have reported an error. - // We delayed it to improve artifacts collection, and now we ensure the test - // is marked as failing. - t.Errorf("test timed out (%s)", t.Spec().(*registry.TestSpec).Timeout) + // We previously added a timeout failure without cancellation, so we cancel here. + if t.mu.cancel != nil { + t.mu.cancel() + } + t.L().Printf("test timed out; check __stacks.log and CRDB logs for goroutine dumps") } return nil } diff --git a/pkg/testutils/lint/passes/fmtsafe/functions.go b/pkg/testutils/lint/passes/fmtsafe/functions.go index 512e5625eae5..9c878a24013a 100644 --- a/pkg/testutils/lint/passes/fmtsafe/functions.go +++ b/pkg/testutils/lint/passes/fmtsafe/functions.go @@ -96,6 +96,9 @@ var requireConstFmt = map[string]bool{ "(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).addFailure": true, "(*main.testImpl).addFailure": true, + "(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).addFailureAndCancel": true, + "(*main.testImpl).addFailureAndCancel": true, + "(*main.testImpl).Fatalf": true, "(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).Fatalf": true,