From c54776761badfc23b189e10225439d7f8ad06113 Mon Sep 17 00:00:00 2001 From: DarrylWong Date: Tue, 15 Oct 2024 15:49:49 -0400 Subject: [PATCH] roachtest: string match for transient errors as a fallback The `require` package is commonly used through roachtest to assert that no error occured. i.e. `require.NoError(t, err)` However, this function does not preserve the error object. This causes our transient error flake detection to not work. Since `require` is an upstream dependency, we cannot easily change this. This change adds a fallback to our flake detection that string matches for the `TRANSIENT_ERROR` message we add. If found it will mark the error as a flake to reduce noise. However, we have seen other cases where we do not preserve the error object but the code lives somewhere that is easily changeable for us. In those cases, we ideally should fix the code instead of resorting to this fallback. To make sure we still do that, the fallback also explicity checks for a message that `require.NoError` prepends to all errors. If we find additional cases that require this fallback, we can review and add them on a case by case basis. --- pkg/cmd/roachtest/BUILD.bazel | 1 + pkg/cmd/roachtest/github.go | 3 + pkg/cmd/roachtest/github_test.go | 9 +++ pkg/cmd/roachtest/test_impl.go | 65 ++++++++++++++++++ pkg/cmd/roachtest/test_runner.go | 3 + pkg/cmd/roachtest/test_test.go | 67 +++++++++++++++++++ .../lost_error_object_and_transient_error | 60 +++++++++++++++++ .../github/require_no_error_transient_error | 60 +++++++++++++++++ 8 files changed, 268 insertions(+) create mode 100644 pkg/cmd/roachtest/testdata/github/lost_error_object_and_transient_error create mode 100644 pkg/cmd/roachtest/testdata/github/require_no_error_transient_error diff --git a/pkg/cmd/roachtest/BUILD.bazel b/pkg/cmd/roachtest/BUILD.bazel index efae87705d1b..f532bbc04084 100644 --- a/pkg/cmd/roachtest/BUILD.bazel +++ b/pkg/cmd/roachtest/BUILD.bazel @@ -127,6 +127,7 @@ go_test( "//pkg/util/version", "@com_github_cockroachdb_datadriven//:datadriven", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_redact//:redact", "@com_github_data_dog_go_sqlmock//:go-sqlmock", "@com_github_kr_pretty//:pretty", "@com_github_prometheus_client_golang//prometheus", diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index 0c71712f20ab..5691127c51a3 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -210,6 +210,9 @@ func (g *githubIssues) createPostRequest( // error, redirect that to Test Eng with the corresponding label as // title override. errWithOwner := failuresAsErrorWithOwnership(failures) + if errWithOwner == nil { + errWithOwner = transientErrorOwnershipFallback(failures) + } if errWithOwner != nil { handleErrorWithOwnership(*errWithOwner) } diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 7da19923c5bf..61c7466d0e66 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/testutils/echotest" "github.com/cockroachdb/datadriven" "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" "github.com/stretchr/testify/require" ) @@ -214,6 +215,14 @@ func TestCreatePostRequest(t *testing.T) { refError = registry.ErrorWithOwner(registry.OwnerSQLFoundations, refError) case "error-with-owner-test-eng": refError = registry.ErrorWithOwner(registry.OwnerTestEng, refError) + case "require-no-error-failed": + // Attempts to mimic how the require package creates failures by losing + // the error object and prepending a message. Similar to above we don't use + // %+v to avoid stack traces. + refError = errors.Newf("Received unexpected error:\n%s", redact.SafeString(refError.Error())) + case "lose-error-object": + // Lose the error object which should make our flake detection fail. + refError = errors.Newf("%s", redact.SafeString(refError.Error())) } } } diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index 05da8257f8d5..fa677cd1e15c 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -11,6 +11,7 @@ import ( "io" "math/rand" "os" + "regexp" "strings" "sync" "time" @@ -548,6 +549,70 @@ func failuresMatchingError(failures []failure, refError any) bool { return false } +var transientErrorRegex = regexp.MustCompile(`TRANSIENT_ERROR\((.+)\)`) + +// transientErrorOwnershipFallback string matches for `TRANSIENT_ERROR` in the provided +// failures and returns a new ErrorWithOwnership if it does. It iterates through each failure, +// checking both the squashedErr and list of errors for `TRANSIENT_ERROR`. +// +// This is needed as the `require` package does not preserve the error object needed +// for us to properly check if it is a transient error using `failuresMatchingError`. +// Note the match is additionally guarded by the unique substring which denotes that +// the error originated from the require package. If we see somewhere else that does not +// preserve the error object, we want to investigate whether it can be fixed before resorting +// to this fallback. See: #131094 +func transientErrorOwnershipFallback(failures []failure) *registry.ErrorWithOwnership { + const unexpectedErrPrefix = "Received unexpected error:" + var errWithOwner registry.ErrorWithOwnership + isTransient := func(err error) bool { + // Both squashedErr and errors should be non nil in actual roachtest failures, + // but for testing we sometimes only set one for simplicity. + if err == nil { + return false + } + // The require package prepends this message to `require.NoError` failures. + // We may see `TRANSIENT_ERROR` without this prefix, but don't mark it as + // a flake as we may be able to fix the code that doesn't preserve the error. + if !strings.Contains(err.Error(), unexpectedErrPrefix) { + return false + } + + if match := transientErrorRegex.FindString(err.Error()); match != "" { + problemCause := strings.TrimPrefix(match, "TRANSIENT_ERROR(") + problemCause = strings.TrimSuffix(problemCause, ")") + // The cause will be used to create the github issue creation, so we don't want + // it to be blank. Instead, return false and let us investigate what is creating + // a transient error with no cause. + if problemCause == "" { + return false + } + + errWithOwner = registry.ErrorWithOwner( + registry.OwnerTestEng, err, + registry.WithTitleOverride(problemCause), + registry.InfraFlake, + ) + return true + } + + return false + } + + for _, f := range failures { + for _, err := range f.errors { + if isTransient(err) { + return &errWithOwner + } + } + + if isTransient(f.squashedErr) { + return &errWithOwner + } + } + + return nil +} + func (t *testImpl) ArtifactsDir() string { return t.artifactsDir } diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index f084f83c013a..00d3f76b294f 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1227,6 +1227,9 @@ func (r *testRunner) runTest( if s.Run != nil { if t.Failed() { errWithOwner := failuresAsErrorWithOwnership(t.failures()) + if errWithOwner == nil { + errWithOwner = transientErrorOwnershipFallback(t.failures()) + } if errWithOwner == nil || !errWithOwner.InfraFlake { r.status.fail[t] = struct{}{} } diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go index 7d10161dcf24..c384550d08ff 100644 --- a/pkg/cmd/roachtest/test_test.go +++ b/pkg/cmd/roachtest/test_test.go @@ -25,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/cloud" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce" @@ -527,3 +528,69 @@ func TestGCESameDefaultZone(t *testing.T) { }) } } + +func TestTransientErrorFallback(t *testing.T) { + ctx := context.Background() + stopper := stop.NewStopper() + defer stopper.Stop(ctx) + cr := newClusterRegistry() + runner := newUnitTestRunner(cr, stopper) + + var buf syncedBuffer + lopt := loggingOpt{ + l: nilLogger(), + tee: logger.NoTee, + stdout: &buf, + stderr: &buf, + artifactsDir: "", + } + copt := clustersOpt{ + typ: roachprodCluster, + user: "test_user", + cpuQuota: 1000, + debugMode: NoDebug, + } + + // Test that if a test fails with a transient error handled by the `require` package, + // the test runner will correctly still identify it as a flake and the run will have + // no failed tests. + t.Run("Require API", func(t *testing.T) { + mockTest := registry.TestSpec{ + Name: `ssh flake`, + Owner: OwnerUnitTest, + Cluster: spec.MakeClusterSpec(0), + CompatibleClouds: registry.AllExceptAWS, + Suites: registry.Suites(registry.Nightly), + CockroachBinary: registry.StandardCockroach, + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + require.NoError(t, rperrors.NewSSHError(errors.New("oops"))) + }, + } + err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */ + defaultParallelism, copt, testOpts{}, lopt) + require.NoError(t, err) + }) + + // Now test that if the transient error is not handled by the `require` package, + // but similarly lost due to casting to a string, the test runner *won't* mark + // it as a flake and we will have a failed test. + t.Run("Require API Not Used", func(t *testing.T) { + mockTest := registry.TestSpec{ + Name: `ssh flake`, + Owner: OwnerUnitTest, + Cluster: spec.MakeClusterSpec(0), + CompatibleClouds: registry.AllExceptAWS, + Suites: registry.Suites(registry.Nightly), + CockroachBinary: registry.StandardCockroach, + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + err := errors.Newf("%s", rperrors.NewSSHError(errors.New("oops"))) + t.Fatal(err) + }, + } + err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */ + defaultParallelism, copt, testOpts{}, lopt) + if !testutils.IsError(err, "some tests failed") { + t.Fatalf("expected error \"some tests failed\", got: %v", err) + } + }) +} diff --git a/pkg/cmd/roachtest/testdata/github/lost_error_object_and_transient_error b/pkg/cmd/roachtest/testdata/github/lost_error_object_and_transient_error new file mode 100644 index 000000000000..a7fef522f801 --- /dev/null +++ b/pkg/cmd/roachtest/testdata/github/lost_error_object_and_transient_error @@ -0,0 +1,60 @@ +# When a transient error is lost as a result of an unknown case +# casting it to a string, check that our fallback transient error handling +# *doesn't* catch it. This should be investigated to determine if it should +# be fixed to preserve the error object or added to the list of exceptions + +add-failure name=(oops) type=(ssh-flake, lose-error-object) +---- +ok + +post +---- +---- +roachtest.github_test [failed]() on test_branch @ [test_SHA](). A Side-Eye cluster snapshot was captured on timeout: [https://app.side-eye.io/snapshots/1](https://app.side-eye.io/snapshots/1). + + +``` +TRANSIENT_ERROR(ssh_problem): oops +``` + +Parameters: + - ROACHTEST_arch=amd64 + - ROACHTEST_cloud=gce + - ROACHTEST_coverageBuild=false + - ROACHTEST_cpu=4 + - ROACHTEST_encrypted=false + - ROACHTEST_fs=ext4 + - ROACHTEST_localSSD=true + - ROACHTEST_runtimeAssertionsBuild=false + - ROACHTEST_ssd=0 +
Help +

+ + +See: [roachtest README](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md) + + + +See: [How To Investigate \(internal\)](https://cockroachlabs.atlassian.net/l/c/SSSBr8c7) + + + +See: [Grafana](https://go.crdb.dev/roachtest-grafana//github-test/1689957243000/1689957853000) + +

+
+/cc @cockroachdb/unowned + + +[This test on roachdash](https://roachdash.crdb.dev/?filter=status:open%20t:.*github_test.*&sort=title+created&display=lastcommented+project) | [Improve this report!](https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/bazci/githubpost/issues) + + + +------ +Labels: +- O-roachtest +- C-test-failure +- release-blocker +Rendered:https://github.com/cockroachdb/cockroach/issues/new?body=roachtest.github_test+%5Bfailed%5D%28%29+on+test_branch+%40+%5Btest_SHA%5D%28%29.+A+Side-Eye+cluster+snapshot+was+captured+on+timeout%3A+%5Bhttps%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%5D%28https%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%29.%0A%0A%0A%60%60%60%0ATRANSIENT_ERROR%28ssh_problem%29%3A+oops%0A%60%60%60%0A%0AParameters%3A%0A+-+%3Ccode%3EROACHTEST_arch%3Damd64%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cloud%3Dgce%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_coverageBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cpu%3D4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_encrypted%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_fs%3Dext4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_localSSD%3Dtrue%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_runtimeAssertionsBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_ssd%3D0%3C%2Fcode%3E%0A%3Cdetails%3E%3Csummary%3EHelp%3C%2Fsummary%3E%0A%3Cp%3E%0A%0A%0ASee%3A+%5Broachtest+README%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Fblob%2Fmaster%2Fpkg%2Fcmd%2Froachtest%2FREADME.md%29%0A%0A%0A%0ASee%3A+%5BHow+To+Investigate+%5C%28internal%5C%29%5D%28https%3A%2F%2Fcockroachlabs.atlassian.net%2Fl%2Fc%2FSSSBr8c7%29%0A%0A%0A%0ASee%3A+%5BGrafana%5D%28https%3A%2F%2Fgo.crdb.dev%2Froachtest-grafana%2F%2Fgithub-test%2F1689957243000%2F1689957853000%29%0A%0A%3C%2Fp%3E%0A%3C%2Fdetails%3E%0A%2Fcc+%40cockroachdb%2Funowned%0A%3Csub%3E%0A%0A%5BThis+test+on+roachdash%5D%28https%3A%2F%2Froachdash.crdb.dev%2F%3Ffilter%3Dstatus%3Aopen%2520t%3A.%2Agithub_test.%2A%26sort%3Dtitle%2Bcreated%26display%3Dlastcommented%2Bproject%29+%7C+%5BImprove+this+report%21%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Ftree%2Fmaster%2Fpkg%2Fcmd%2Fbazci%2Fgithubpost%2Fissues%29%0A%0A%3C%2Fsub%3E%0A%0A------%0ALabels%3A%0A-+%3Ccode%3EO-roachtest%3C%2Fcode%3E%0A-+%3Ccode%3EC-test-failure%3C%2Fcode%3E%0A-+%3Ccode%3Erelease-blocker%3C%2Fcode%3E%0A&title=roachtest%3A+github_test+failed +---- +---- diff --git a/pkg/cmd/roachtest/testdata/github/require_no_error_transient_error b/pkg/cmd/roachtest/testdata/github/require_no_error_transient_error new file mode 100644 index 000000000000..a1ccb0b3d423 --- /dev/null +++ b/pkg/cmd/roachtest/testdata/github/require_no_error_transient_error @@ -0,0 +1,60 @@ +# When a transient error is lost as a result of the require package +# casting it to a string, check that our fallback transient error handling +# still catches it. + +add-failure name=(oops) type=(ssh-flake, require-no-error-failed) +---- +ok + +post +---- +---- +roachtest.ssh_problem [failed]() on test_branch @ [test_SHA](). A Side-Eye cluster snapshot was captured on timeout: [https://app.side-eye.io/snapshots/1](https://app.side-eye.io/snapshots/1). + + +``` +test github_test failed: Received unexpected error: +TRANSIENT_ERROR(ssh_problem): oops +``` + +Parameters: + - ROACHTEST_arch=amd64 + - ROACHTEST_cloud=gce + - ROACHTEST_coverageBuild=false + - ROACHTEST_cpu=4 + - ROACHTEST_encrypted=false + - ROACHTEST_fs=ext4 + - ROACHTEST_localSSD=true + - ROACHTEST_runtimeAssertionsBuild=false + - ROACHTEST_ssd=0 +
Help +

+ + +See: [roachtest README](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md) + + + +See: [How To Investigate \(internal\)](https://cockroachlabs.atlassian.net/l/c/SSSBr8c7) + + + +See: [Grafana](https://go.crdb.dev/roachtest-grafana//github-test/1689957243000/1689957853000) + +

+
+/cc @cockroachdb/test-eng + + +[This test on roachdash](https://roachdash.crdb.dev/?filter=status:open%20t:.*ssh_problem.*&sort=title+created&display=lastcommented+project) | [Improve this report!](https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/bazci/githubpost/issues) + + + +------ +Labels: +- O-roachtest +- X-infra-flake +- T-testeng +Rendered:https://github.com/cockroachdb/cockroach/issues/new?body=roachtest.ssh_problem+%5Bfailed%5D%28%29+on+test_branch+%40+%5Btest_SHA%5D%28%29.+A+Side-Eye+cluster+snapshot+was+captured+on+timeout%3A+%5Bhttps%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%5D%28https%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%29.%0A%0A%0A%60%60%60%0Atest+github_test+failed%3A+Received+unexpected+error%3A%0ATRANSIENT_ERROR%28ssh_problem%29%3A+oops%0A%60%60%60%0A%0AParameters%3A%0A+-+%3Ccode%3EROACHTEST_arch%3Damd64%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cloud%3Dgce%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_coverageBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cpu%3D4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_encrypted%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_fs%3Dext4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_localSSD%3Dtrue%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_runtimeAssertionsBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_ssd%3D0%3C%2Fcode%3E%0A%3Cdetails%3E%3Csummary%3EHelp%3C%2Fsummary%3E%0A%3Cp%3E%0A%0A%0ASee%3A+%5Broachtest+README%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Fblob%2Fmaster%2Fpkg%2Fcmd%2Froachtest%2FREADME.md%29%0A%0A%0A%0ASee%3A+%5BHow+To+Investigate+%5C%28internal%5C%29%5D%28https%3A%2F%2Fcockroachlabs.atlassian.net%2Fl%2Fc%2FSSSBr8c7%29%0A%0A%0A%0ASee%3A+%5BGrafana%5D%28https%3A%2F%2Fgo.crdb.dev%2Froachtest-grafana%2F%2Fgithub-test%2F1689957243000%2F1689957853000%29%0A%0A%3C%2Fp%3E%0A%3C%2Fdetails%3E%0A%2Fcc+%40cockroachdb%2Ftest-eng%0A%3Csub%3E%0A%0A%5BThis+test+on+roachdash%5D%28https%3A%2F%2Froachdash.crdb.dev%2F%3Ffilter%3Dstatus%3Aopen%2520t%3A.%2Assh_problem.%2A%26sort%3Dtitle%2Bcreated%26display%3Dlastcommented%2Bproject%29+%7C+%5BImprove+this+report%21%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Ftree%2Fmaster%2Fpkg%2Fcmd%2Fbazci%2Fgithubpost%2Fissues%29%0A%0A%3C%2Fsub%3E%0A%0A------%0ALabels%3A%0A-+%3Ccode%3EO-roachtest%3C%2Fcode%3E%0A-+%3Ccode%3EX-infra-flake%3C%2Fcode%3E%0A-+%3Ccode%3ET-testeng%3C%2Fcode%3E%0A&title=roachtest%3A+ssh_problem+failed +---- +----