Skip to content

Commit

Permalink
roachtest: string match for transient errors as a fallback
Browse files Browse the repository at this point in the history
The `require` package is commonly used through roachtest to
assert that no error occured. i.e. `require.NoError(t, err)`
However, this function does not preserve the error object. This
causes our transient error flake detection to not work. Since
`require` is an upstream dependency, we cannot easily change this.

This change adds a fallback to our flake detection that string
matches for the `TRANSIENT_ERROR` message we add. If found it
will mark the error as a flake to reduce noise.

However, we have seen other cases where we do not preserve the
error object but the code lives somewhere that is easily changeable for us.
In those cases, we ideally should fix the code instead of resorting
to this fallback.

To make sure we still do that, the fallback also explicity checks for
a message that `require.NoError` prepends to all errors. If we find
additional cases that require this fallback, we can review and add
them on a case by case basis.
  • Loading branch information
DarrylWong committed Oct 24, 2024
1 parent ac44457 commit c547767
Show file tree
Hide file tree
Showing 8 changed files with 268 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ go_test(
"//pkg/util/version",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_redact//:redact",
"@com_github_data_dog_go_sqlmock//:go-sqlmock",
"@com_github_kr_pretty//:pretty",
"@com_github_prometheus_client_golang//prometheus",
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,9 @@ func (g *githubIssues) createPostRequest(
// error, redirect that to Test Eng with the corresponding label as
// title override.
errWithOwner := failuresAsErrorWithOwnership(failures)
if errWithOwner == nil {
errWithOwner = transientErrorOwnershipFallback(failures)
}
if errWithOwner != nil {
handleErrorWithOwnership(*errWithOwner)
}
Expand Down
9 changes: 9 additions & 0 deletions pkg/cmd/roachtest/github_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/testutils/echotest"
"github.com/cockroachdb/datadriven"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -214,6 +215,14 @@ func TestCreatePostRequest(t *testing.T) {
refError = registry.ErrorWithOwner(registry.OwnerSQLFoundations, refError)
case "error-with-owner-test-eng":
refError = registry.ErrorWithOwner(registry.OwnerTestEng, refError)
case "require-no-error-failed":
// Attempts to mimic how the require package creates failures by losing
// the error object and prepending a message. Similar to above we don't use
// %+v to avoid stack traces.
refError = errors.Newf("Received unexpected error:\n%s", redact.SafeString(refError.Error()))
case "lose-error-object":
// Lose the error object which should make our flake detection fail.
refError = errors.Newf("%s", redact.SafeString(refError.Error()))
}
}
}
Expand Down
65 changes: 65 additions & 0 deletions pkg/cmd/roachtest/test_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"io"
"math/rand"
"os"
"regexp"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -548,6 +549,70 @@ func failuresMatchingError(failures []failure, refError any) bool {
return false
}

var transientErrorRegex = regexp.MustCompile(`TRANSIENT_ERROR\((.+)\)`)

// transientErrorOwnershipFallback string matches for `TRANSIENT_ERROR` in the provided
// failures and returns a new ErrorWithOwnership if it does. It iterates through each failure,
// checking both the squashedErr and list of errors for `TRANSIENT_ERROR`.
//
// This is needed as the `require` package does not preserve the error object needed
// for us to properly check if it is a transient error using `failuresMatchingError`.
// Note the match is additionally guarded by the unique substring which denotes that
// the error originated from the require package. If we see somewhere else that does not
// preserve the error object, we want to investigate whether it can be fixed before resorting
// to this fallback. See: #131094
func transientErrorOwnershipFallback(failures []failure) *registry.ErrorWithOwnership {
const unexpectedErrPrefix = "Received unexpected error:"
var errWithOwner registry.ErrorWithOwnership
isTransient := func(err error) bool {
// Both squashedErr and errors should be non nil in actual roachtest failures,
// but for testing we sometimes only set one for simplicity.
if err == nil {
return false
}
// The require package prepends this message to `require.NoError` failures.
// We may see `TRANSIENT_ERROR` without this prefix, but don't mark it as
// a flake as we may be able to fix the code that doesn't preserve the error.
if !strings.Contains(err.Error(), unexpectedErrPrefix) {
return false
}

if match := transientErrorRegex.FindString(err.Error()); match != "" {
problemCause := strings.TrimPrefix(match, "TRANSIENT_ERROR(")
problemCause = strings.TrimSuffix(problemCause, ")")
// The cause will be used to create the github issue creation, so we don't want
// it to be blank. Instead, return false and let us investigate what is creating
// a transient error with no cause.
if problemCause == "" {
return false
}

errWithOwner = registry.ErrorWithOwner(
registry.OwnerTestEng, err,
registry.WithTitleOverride(problemCause),
registry.InfraFlake,
)
return true
}

return false
}

for _, f := range failures {
for _, err := range f.errors {
if isTransient(err) {
return &errWithOwner
}
}

if isTransient(f.squashedErr) {
return &errWithOwner
}
}

return nil
}

func (t *testImpl) ArtifactsDir() string {
return t.artifactsDir
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1227,6 +1227,9 @@ func (r *testRunner) runTest(
if s.Run != nil {
if t.Failed() {
errWithOwner := failuresAsErrorWithOwnership(t.failures())
if errWithOwner == nil {
errWithOwner = transientErrorOwnershipFallback(t.failures())
}
if errWithOwner == nil || !errWithOwner.InfraFlake {
r.status.fail[t] = struct{}{}
}
Expand Down
67 changes: 67 additions & 0 deletions pkg/cmd/roachtest/test_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod"
"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
Expand Down Expand Up @@ -527,3 +528,69 @@ func TestGCESameDefaultZone(t *testing.T) {
})
}
}

func TestTransientErrorFallback(t *testing.T) {
ctx := context.Background()
stopper := stop.NewStopper()
defer stopper.Stop(ctx)
cr := newClusterRegistry()
runner := newUnitTestRunner(cr, stopper)

var buf syncedBuffer
lopt := loggingOpt{
l: nilLogger(),
tee: logger.NoTee,
stdout: &buf,
stderr: &buf,
artifactsDir: "",
}
copt := clustersOpt{
typ: roachprodCluster,
user: "test_user",
cpuQuota: 1000,
debugMode: NoDebug,
}

// Test that if a test fails with a transient error handled by the `require` package,
// the test runner will correctly still identify it as a flake and the run will have
// no failed tests.
t.Run("Require API", func(t *testing.T) {
mockTest := registry.TestSpec{
Name: `ssh flake`,
Owner: OwnerUnitTest,
Cluster: spec.MakeClusterSpec(0),
CompatibleClouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
CockroachBinary: registry.StandardCockroach,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
require.NoError(t, rperrors.NewSSHError(errors.New("oops")))
},
}
err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
defaultParallelism, copt, testOpts{}, lopt)
require.NoError(t, err)
})

// Now test that if the transient error is not handled by the `require` package,
// but similarly lost due to casting to a string, the test runner *won't* mark
// it as a flake and we will have a failed test.
t.Run("Require API Not Used", func(t *testing.T) {
mockTest := registry.TestSpec{
Name: `ssh flake`,
Owner: OwnerUnitTest,
Cluster: spec.MakeClusterSpec(0),
CompatibleClouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
CockroachBinary: registry.StandardCockroach,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
err := errors.Newf("%s", rperrors.NewSSHError(errors.New("oops")))
t.Fatal(err)
},
}
err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
defaultParallelism, copt, testOpts{}, lopt)
if !testutils.IsError(err, "some tests failed") {
t.Fatalf("expected error \"some tests failed\", got: %v", err)
}
})
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# When a transient error is lost as a result of an unknown case
# casting it to a string, check that our fallback transient error handling
# *doesn't* catch it. This should be investigated to determine if it should
# be fixed to preserve the error object or added to the list of exceptions

add-failure name=(oops) type=(ssh-flake, lose-error-object)
----
ok

post
----
----
roachtest.github_test [failed]() on test_branch @ [test_SHA](). A Side-Eye cluster snapshot was captured on timeout: [https://app.side-eye.io/snapshots/1](https://app.side-eye.io/snapshots/1).


```
TRANSIENT_ERROR(ssh_problem): oops
```

Parameters:
- <code>ROACHTEST_arch=amd64</code>
- <code>ROACHTEST_cloud=gce</code>
- <code>ROACHTEST_coverageBuild=false</code>
- <code>ROACHTEST_cpu=4</code>
- <code>ROACHTEST_encrypted=false</code>
- <code>ROACHTEST_fs=ext4</code>
- <code>ROACHTEST_localSSD=true</code>
- <code>ROACHTEST_runtimeAssertionsBuild=false</code>
- <code>ROACHTEST_ssd=0</code>
<details><summary>Help</summary>
<p>


See: [roachtest README](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md)



See: [How To Investigate \(internal\)](https://cockroachlabs.atlassian.net/l/c/SSSBr8c7)



See: [Grafana](https://go.crdb.dev/roachtest-grafana//github-test/1689957243000/1689957853000)

</p>
</details>
/cc @cockroachdb/unowned
<sub>

[This test on roachdash](https://roachdash.crdb.dev/?filter=status:open%20t:.*github_test.*&sort=title+created&display=lastcommented+project) | [Improve this report!](https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/bazci/githubpost/issues)

</sub>

------
Labels:
- <code>O-roachtest</code>
- <code>C-test-failure</code>
- <code>release-blocker</code>
Rendered:https://github.com/cockroachdb/cockroach/issues/new?body=roachtest.github_test+%5Bfailed%5D%28%29+on+test_branch+%40+%5Btest_SHA%5D%28%29.+A+Side-Eye+cluster+snapshot+was+captured+on+timeout%3A+%5Bhttps%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%5D%28https%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%29.%0A%0A%0A%60%60%60%0ATRANSIENT_ERROR%28ssh_problem%29%3A+oops%0A%60%60%60%0A%0AParameters%3A%0A+-+%3Ccode%3EROACHTEST_arch%3Damd64%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cloud%3Dgce%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_coverageBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cpu%3D4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_encrypted%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_fs%3Dext4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_localSSD%3Dtrue%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_runtimeAssertionsBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_ssd%3D0%3C%2Fcode%3E%0A%3Cdetails%3E%3Csummary%3EHelp%3C%2Fsummary%3E%0A%3Cp%3E%0A%0A%0ASee%3A+%5Broachtest+README%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Fblob%2Fmaster%2Fpkg%2Fcmd%2Froachtest%2FREADME.md%29%0A%0A%0A%0ASee%3A+%5BHow+To+Investigate+%5C%28internal%5C%29%5D%28https%3A%2F%2Fcockroachlabs.atlassian.net%2Fl%2Fc%2FSSSBr8c7%29%0A%0A%0A%0ASee%3A+%5BGrafana%5D%28https%3A%2F%2Fgo.crdb.dev%2Froachtest-grafana%2F%2Fgithub-test%2F1689957243000%2F1689957853000%29%0A%0A%3C%2Fp%3E%0A%3C%2Fdetails%3E%0A%2Fcc+%40cockroachdb%2Funowned%0A%3Csub%3E%0A%0A%5BThis+test+on+roachdash%5D%28https%3A%2F%2Froachdash.crdb.dev%2F%3Ffilter%3Dstatus%3Aopen%2520t%3A.%2Agithub_test.%2A%26sort%3Dtitle%2Bcreated%26display%3Dlastcommented%2Bproject%29+%7C+%5BImprove+this+report%21%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Ftree%2Fmaster%2Fpkg%2Fcmd%2Fbazci%2Fgithubpost%2Fissues%29%0A%0A%3C%2Fsub%3E%0A%0A------%0ALabels%3A%0A-+%3Ccode%3EO-roachtest%3C%2Fcode%3E%0A-+%3Ccode%3EC-test-failure%3C%2Fcode%3E%0A-+%3Ccode%3Erelease-blocker%3C%2Fcode%3E%0A&title=roachtest%3A+github_test+failed
----
----
60 changes: 60 additions & 0 deletions pkg/cmd/roachtest/testdata/github/require_no_error_transient_error
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# When a transient error is lost as a result of the require package
# casting it to a string, check that our fallback transient error handling
# still catches it.

add-failure name=(oops) type=(ssh-flake, require-no-error-failed)
----
ok

post
----
----
roachtest.ssh_problem [failed]() on test_branch @ [test_SHA](). A Side-Eye cluster snapshot was captured on timeout: [https://app.side-eye.io/snapshots/1](https://app.side-eye.io/snapshots/1).


```
test github_test failed: Received unexpected error:
TRANSIENT_ERROR(ssh_problem): oops
```

Parameters:
- <code>ROACHTEST_arch=amd64</code>
- <code>ROACHTEST_cloud=gce</code>
- <code>ROACHTEST_coverageBuild=false</code>
- <code>ROACHTEST_cpu=4</code>
- <code>ROACHTEST_encrypted=false</code>
- <code>ROACHTEST_fs=ext4</code>
- <code>ROACHTEST_localSSD=true</code>
- <code>ROACHTEST_runtimeAssertionsBuild=false</code>
- <code>ROACHTEST_ssd=0</code>
<details><summary>Help</summary>
<p>


See: [roachtest README](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md)



See: [How To Investigate \(internal\)](https://cockroachlabs.atlassian.net/l/c/SSSBr8c7)



See: [Grafana](https://go.crdb.dev/roachtest-grafana//github-test/1689957243000/1689957853000)

</p>
</details>
/cc @cockroachdb/test-eng
<sub>

[This test on roachdash](https://roachdash.crdb.dev/?filter=status:open%20t:.*ssh_problem.*&sort=title+created&display=lastcommented+project) | [Improve this report!](https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/bazci/githubpost/issues)

</sub>

------
Labels:
- <code>O-roachtest</code>
- <code>X-infra-flake</code>
- <code>T-testeng</code>
Rendered:https://github.com/cockroachdb/cockroach/issues/new?body=roachtest.ssh_problem+%5Bfailed%5D%28%29+on+test_branch+%40+%5Btest_SHA%5D%28%29.+A+Side-Eye+cluster+snapshot+was+captured+on+timeout%3A+%5Bhttps%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%5D%28https%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%29.%0A%0A%0A%60%60%60%0Atest+github_test+failed%3A+Received+unexpected+error%3A%0ATRANSIENT_ERROR%28ssh_problem%29%3A+oops%0A%60%60%60%0A%0AParameters%3A%0A+-+%3Ccode%3EROACHTEST_arch%3Damd64%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cloud%3Dgce%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_coverageBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cpu%3D4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_encrypted%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_fs%3Dext4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_localSSD%3Dtrue%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_runtimeAssertionsBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_ssd%3D0%3C%2Fcode%3E%0A%3Cdetails%3E%3Csummary%3EHelp%3C%2Fsummary%3E%0A%3Cp%3E%0A%0A%0ASee%3A+%5Broachtest+README%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Fblob%2Fmaster%2Fpkg%2Fcmd%2Froachtest%2FREADME.md%29%0A%0A%0A%0ASee%3A+%5BHow+To+Investigate+%5C%28internal%5C%29%5D%28https%3A%2F%2Fcockroachlabs.atlassian.net%2Fl%2Fc%2FSSSBr8c7%29%0A%0A%0A%0ASee%3A+%5BGrafana%5D%28https%3A%2F%2Fgo.crdb.dev%2Froachtest-grafana%2F%2Fgithub-test%2F1689957243000%2F1689957853000%29%0A%0A%3C%2Fp%3E%0A%3C%2Fdetails%3E%0A%2Fcc+%40cockroachdb%2Ftest-eng%0A%3Csub%3E%0A%0A%5BThis+test+on+roachdash%5D%28https%3A%2F%2Froachdash.crdb.dev%2F%3Ffilter%3Dstatus%3Aopen%2520t%3A.%2Assh_problem.%2A%26sort%3Dtitle%2Bcreated%26display%3Dlastcommented%2Bproject%29+%7C+%5BImprove+this+report%21%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Ftree%2Fmaster%2Fpkg%2Fcmd%2Fbazci%2Fgithubpost%2Fissues%29%0A%0A%3C%2Fsub%3E%0A%0A------%0ALabels%3A%0A-+%3Ccode%3EO-roachtest%3C%2Fcode%3E%0A-+%3Ccode%3EX-infra-flake%3C%2Fcode%3E%0A-+%3Ccode%3ET-testeng%3C%2Fcode%3E%0A&title=roachtest%3A+ssh_problem+failed
----
----

0 comments on commit c547767

Please sign in to comment.