From de4ee4316368f6db1c3d9e84e16acff3ae5e2652 Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Thu, 31 Aug 2023 10:30:22 -0400 Subject: [PATCH] roachtest: treat cluster setup errros the same as creation errors After a cluster is created (or reused) and is ready to run a test, the test runner will upload initial files (such as a cockroach binary and required libraries) to all nodes. When this process fails (typically because of SSH flakes), the worker returns an error, causing the entire test suite to terminate early before all tests had a chance to run. This is not ideal since these SSH flakes are intermittent and should not cause the test runner to stop. In this commit, we unify the handling of these errors with handling creation failures: the error is marked as a "cluster creation error" flake, causing it to be reported as such. Crucially, the roachtest worker will not return an error and other tests will continue to run. Fixes: #109279 Release note: None --- pkg/cmd/roachtest/test_runner.go | 38 +++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index faa032a42447..d700f86ad560 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -688,7 +688,6 @@ func (r *testRunner) runWorker( wStatus.SetTest(nil /* test */, testToRun) c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, arch, testToRun.alloc, artifactsRootDir, wStatus) if clusterCreateErr != nil { - clusterCreateErr = errors.Mark(clusterCreateErr, errClusterProvisioningFailed) atomic.AddInt32(&r.numClusterErrs, 1) shout(ctx, l, stdout, "Unable to create (or reuse) cluster for test %s due to: %s.", testToRun.spec.Name, clusterCreateErr) @@ -736,30 +735,47 @@ func (r *testRunner) runWorker( } github := newGithubIssues(r.config.disableIssue, c, vmCreateOpts) - if clusterCreateErr != nil { - // N.B. cluster creation must have failed... - // We don't want to prematurely abort the test suite since it's likely a transient issue. - // Instead, let's report an infrastructure issue, mark the test as failed and continue with the next test. + // handleClusterCreationFailure can be called when the `err` given + // occurred for reasons related to creating or setting up a + // cluster for a test. + handleClusterCreationFailure := func(err error) { + // Marking the error with this sentinel error allows the GitHub + // issue poster to detect this is an infrastructure flake and + // post the issue accordingly. + clusterError := errors.Mark(err, errClusterProvisioningFailed) + t.Error(clusterError) - // Generate failure reason and mark the test failed to preclude fetching (cluster) artifacts. - t.Error(clusterCreateErr) // N.B. issue title is of the form "roachtest: ${t.spec.Name} failed" (see UnitTestFormatter). if err := github.MaybePost(t, l, t.failureMsg()); err != nil { shout(ctx, l, stdout, "failed to post issue: %s", err) } + } + + if clusterCreateErr != nil { + // N.B. cluster creation failed. We mark the test as failed and + // continue with the next test. + handleClusterCreationFailure(clusterCreateErr) } else { // Now run the test. l.PrintfCtx(ctx, "Starting test: %s:%d on cluster=%s (arch=%q)", testToRun.spec.Name, testToRun.runNum, c.Name(), arch) c.setTest(t) + + var setupErr error if c.spec.NodeCount > 0 { // skip during tests - err = c.PutDefaultCockroach(ctx, l, t.Cockroach()) + setupErr = c.PutDefaultCockroach(ctx, l, t.Cockroach()) } - if err == nil { - err = c.PutLibraries(ctx, "./lib", t.spec.NativeLibs) + if setupErr == nil { + setupErr = c.PutLibraries(ctx, "./lib", t.spec.NativeLibs) } - if err == nil { + if setupErr != nil { + // If there was an error setting up the cluster (uploading + // initial files), we treat the error just like a cluster + // creation failure: the error is reported as an + // infrastructure flake, and we continue with the next test. + handleClusterCreationFailure(setupErr) + } else { // Tell the cluster that, from now on, it will be run "on behalf of this // test". c.status("running test")