Skip to content

Commit

Permalink
roachtest: treat cluster setup errros the same as creation errors
Browse files Browse the repository at this point in the history
After a cluster is created (or reused) and is ready to run a test, the
test runner will upload initial files (such as a cockroach binary and
required libraries) to all nodes. When this process fails (typically
because of SSH flakes), the worker returns an error, causing the
entire test suite to terminate early before all tests had a chance to
run.

This is not ideal since these SSH flakes are intermittent and should
not cause the test runner to stop. In this commit, we unify the
handling of these errors with handling creation failures: the error is
marked as a "cluster creation error" flake, causing it to be reported
as such. Crucially, the roachtest worker will not return an error and
other tests will continue to run.

Fixes: cockroachdb#109279

Release note: None
  • Loading branch information
renatolabs committed Aug 31, 2023
1 parent d435960 commit de4ee43
Showing 1 changed file with 27 additions and 11 deletions.
38 changes: 27 additions & 11 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,6 @@ func (r *testRunner) runWorker(
wStatus.SetTest(nil /* test */, testToRun)
c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, arch, testToRun.alloc, artifactsRootDir, wStatus)
if clusterCreateErr != nil {
clusterCreateErr = errors.Mark(clusterCreateErr, errClusterProvisioningFailed)
atomic.AddInt32(&r.numClusterErrs, 1)
shout(ctx, l, stdout, "Unable to create (or reuse) cluster for test %s due to: %s.",
testToRun.spec.Name, clusterCreateErr)
Expand Down Expand Up @@ -736,30 +735,47 @@ func (r *testRunner) runWorker(
}
github := newGithubIssues(r.config.disableIssue, c, vmCreateOpts)

if clusterCreateErr != nil {
// N.B. cluster creation must have failed...
// We don't want to prematurely abort the test suite since it's likely a transient issue.
// Instead, let's report an infrastructure issue, mark the test as failed and continue with the next test.
// handleClusterCreationFailure can be called when the `err` given
// occurred for reasons related to creating or setting up a
// cluster for a test.
handleClusterCreationFailure := func(err error) {
// Marking the error with this sentinel error allows the GitHub
// issue poster to detect this is an infrastructure flake and
// post the issue accordingly.
clusterError := errors.Mark(err, errClusterProvisioningFailed)
t.Error(clusterError)

// Generate failure reason and mark the test failed to preclude fetching (cluster) artifacts.
t.Error(clusterCreateErr)
// N.B. issue title is of the form "roachtest: ${t.spec.Name} failed" (see UnitTestFormatter).
if err := github.MaybePost(t, l, t.failureMsg()); err != nil {
shout(ctx, l, stdout, "failed to post issue: %s", err)
}
}

if clusterCreateErr != nil {
// N.B. cluster creation failed. We mark the test as failed and
// continue with the next test.
handleClusterCreationFailure(clusterCreateErr)
} else {
// Now run the test.
l.PrintfCtx(ctx, "Starting test: %s:%d on cluster=%s (arch=%q)", testToRun.spec.Name, testToRun.runNum, c.Name(), arch)

c.setTest(t)

var setupErr error
if c.spec.NodeCount > 0 { // skip during tests
err = c.PutDefaultCockroach(ctx, l, t.Cockroach())
setupErr = c.PutDefaultCockroach(ctx, l, t.Cockroach())
}
if err == nil {
err = c.PutLibraries(ctx, "./lib", t.spec.NativeLibs)
if setupErr == nil {
setupErr = c.PutLibraries(ctx, "./lib", t.spec.NativeLibs)
}

if err == nil {
if setupErr != nil {
// If there was an error setting up the cluster (uploading
// initial files), we treat the error just like a cluster
// creation failure: the error is reported as an
// infrastructure flake, and we continue with the next test.
handleClusterCreationFailure(setupErr)
} else {
// Tell the cluster that, from now on, it will be run "on behalf of this
// test".
c.status("running test")
Expand Down

0 comments on commit de4ee43

Please sign in to comment.