Skip to content

Commit

Permalink
roachtest: report timeout failures accordingly
Browse files Browse the repository at this point in the history
Timeout failures are recorded at actual timeout, with
subsequent failures secondary.

`addFailure` accepts a depth parameter and no longer
includes context cancellation, which is done separately.

Epic: none
Fixes: #91237
Release note: None
  • Loading branch information
Miral Gadani committed Feb 8, 2023
1 parent 9444613 commit 25045db
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 29 deletions.
49 changes: 28 additions & 21 deletions pkg/cmd/roachtest/test_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,52 +288,47 @@ func collectErrors(args []interface{}) []error {
// ATTENTION: Since this calls panic(errTestFatal), it should only be called
// from a test's closure. The test runner itself should never call this.
func (t *testImpl) Fatal(args ...interface{}) {
t.addFailure("", args...)
t.addFailureAndCancel(1, "", args...)
panic(errTestFatal)
}

// Fatalf is like Fatal, but takes a format string.
func (t *testImpl) Fatalf(format string, args ...interface{}) {
t.addFailure(format, args...)
t.addFailureAndCancel(1, format, args...)
panic(errTestFatal)
}

// FailNow implements the TestingT interface.
func (t *testImpl) FailNow() {
t.addFailure("FailNow called")
t.addFailureAndCancel(1, "FailNow called")
panic(errTestFatal)
}

// Error implements the TestingT interface
func (t *testImpl) Error(args ...interface{}) {
t.addFailure("", args...)
t.addFailureAndCancel(1, "", args...)
}

// Errorf implements the TestingT interface.
func (t *testImpl) Errorf(format string, args ...interface{}) {
t.addFailure(format, args...)
t.addFailureAndCancel(1, format, args...)
}

// We take the first error from each failure which is the
// "squashed" error that contains all information of a failure
func formatFailure(b *strings.Builder, reportFailures ...failure) {
for i, failure := range reportFailures {
if i > 0 {
fmt.Fprintln(b)
}
file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr)
if !ok {
file, line, fn = "<unknown>", 0, "unknown"
}
fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr)
func (t *testImpl) addFailureAndCancel(depth int, format string, args ...interface{}) {
t.addFailure(depth+1, format, args...)
if t.mu.cancel != nil {
t.mu.cancel()
}
}

func (t *testImpl) addFailure(format string, args ...interface{}) {
// addFailure depth indicates how many stack frames to skip when reporting the
// site of the failure in logs. `0` will report the caller of addFailure, `1` the
// caller of the caller of addFailure, etc.
func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
if format == "" {
format = strings.Repeat(" %v", len(args))[1:]
}
reportFailure := newFailure(errors.NewWithDepthf(2, format, args...), collectErrors(args))
reportFailure := newFailure(errors.NewWithDepthf(depth+1, format, args...), collectErrors(args))

t.mu.Lock()
defer t.mu.Unlock()
Expand Down Expand Up @@ -368,8 +363,20 @@ func (t *testImpl) addFailure(format string, args ...interface{}) {

t.mu.output = append(t.mu.output, msg...)
t.mu.output = append(t.mu.output, '\n')
if t.mu.cancel != nil {
t.mu.cancel()
}

// We take the first error from each failure which is the
// "squashed" error that contains all information of a failure
func formatFailure(b *strings.Builder, reportFailures ...failure) {
for i, failure := range reportFailures {
if i > 0 {
fmt.Fprintln(b)
}
file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr)
if !ok {
file, line, fn = "<unknown>", 0, "unknown"
}
fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr)
}
}

Expand Down
16 changes: 8 additions & 8 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -984,10 +984,9 @@ func (r *testRunner) runTest(
}
t.L().Printf("tearing down after %s; see teardown.log", s)
case <-time.After(timeout):
// NB: we're intentionally not failing the test if it hasn't
// already. This will be done at the very end of this method,
// after we've collected artifacts.
t.L().Printf("test timed out after %s; check __stacks.log and CRDB logs for goroutine dumps", timeout)
// NB: We're adding the timeout failure intentionally without cancelling the context
// to capture as much state as possible during artifact collection.
t.addFailure(0, "test timed out (%s)", timeout)
timedOut = true
}

Expand Down Expand Up @@ -1104,10 +1103,11 @@ func (r *testRunner) teardownTest(
// around so someone can poke at it.
_ = c.StopE(ctx, t.L(), option.DefaultStopOpts(), c.All())

// The hung test may, against all odds, still not have reported an error.
// We delayed it to improve artifacts collection, and now we ensure the test
// is marked as failing.
t.Errorf("test timed out (%s)", t.Spec().(*registry.TestSpec).Timeout)
// We previously added a timeout failure without cancellation, so we cancel here.
if t.mu.cancel != nil {
t.mu.cancel()
}
t.L().Printf("test timed out; check __stacks.log and CRDB logs for goroutine dumps")
}
return nil
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/testutils/lint/passes/fmtsafe/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ var requireConstFmt = map[string]bool{
"(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).addFailure": true,
"(*main.testImpl).addFailure": true,

"(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).addFailureAndCancel": true,
"(*main.testImpl).addFailureAndCancel": true,

"(*main.testImpl).Fatalf": true,
"(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).Fatalf": true,

Expand Down

0 comments on commit 25045db

Please sign in to comment.