Merge #122783

122783: ci: autokill roachtest nightlies when failure rate exceeds threshold r=vidit-bhat a=vidit-bhat Nightly roachtests are fairly stable, exhibiting failure rates < 5%, on average. Occasionally, a regression, typically merged the day of the nightly run, or an infrastructure change or transient issue, can result in a cascade of failures. Since a high failure rate is likely indicative of an issue which may impact a large subset of the roachtests, the preference is to kill the CI job on the grounds of having reached a point of diminishing returns. This PR introduces a roachtest CLI argument, `auto-kill-threshold`, which when exceeded would auto-kill the nightlies. Epic: none Fixes: #120160 Release note: None Co-authored-by: Vidit Bhat <[email protected]>
cockroachdb · May 6, 2024 · 4dc9d94 · 4dc9d94
2 parents 6299317 + 6a1c6b6
commit 4dc9d94
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 11 deletions.
diff --git a/build/teamcity/cockroach/nightlies/roachtest_nightly_impl.sh b/build/teamcity/cockroach/nightlies/roachtest_nightly_impl.sh
@@ -63,6 +63,7 @@ build/teamcity-roachtest-invoke.sh \
   --cloud="${CLOUD}" \
   --count="${COUNT-1}" \
   --clear-cluster-cache="${CLEAR_CLUSTER_CACHE:-true}" \
+  --auto-kill-threshold="${AUTO_KILL_THRESHOLD:-0.05}" \
   --parallelism="${PARALLELISM}" \
   --cpu-quota="${CPUQUOTA}" \
   --cluster-id="${TC_BUILD_ID}" \

diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go
@@ -82,6 +82,26 @@ func generateHelpCommand(
 	}
 }
 
+func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnership {
+	var transientError rperrors.TransientError
+	var err registry.ErrorWithOwnership
+	if failuresMatchingError(failures, &transientError) {
+		err = registry.ErrorWithOwner(
+			registry.OwnerTestEng, transientError,
+			registry.WithTitleOverride(transientError.Cause),
+			registry.InfraFlake,
+		)
+
+		return &err
+	}
+
+	if errWithOwner := failuresSpecifyOwner(failures); errWithOwner != nil {
+		return errWithOwner
+	}
+
+	return nil
+}
+
 // postIssueCondition encapsulates a condition that causes issue
 // posting to be skipped. The `reason` field contains a textual
 // description as to why issue posting was skipped.
@@ -167,19 +187,12 @@ func (g *githubIssues) createPostRequest(
 	}
 
 	issueClusterName := ""
-	errWithOwnership := failuresSpecifyOwner(failures)
-	var transientError rperrors.TransientError
 	// If we find a failure that was labeled as a roachprod transient
 	// error, redirect that to Test Eng with the corresponding label as
 	// title override.
-	if failuresMatchingError(failures, &transientError) {
-		handleErrorWithOwnership(registry.ErrorWithOwner(
-			registry.OwnerTestEng, transientError,
-			registry.WithTitleOverride(transientError.Cause),
-			registry.InfraFlake,
-		))
-	} else if errWithOwnership != nil {
-		handleErrorWithOwnership(*errWithOwnership)
+	errWithOwner := failuresAsErrorWithOwnership(failures)
+	if errWithOwner != nil {
+		handleErrorWithOwnership(*errWithOwner)
 	}
 
 	// Issues posted from roachtest are identifiable as such, and they are also release blockers

diff --git a/pkg/cmd/roachtest/roachtestflags/flags.go b/pkg/cmd/roachtest/roachtestflags/flags.go
@@ -357,6 +357,12 @@ var (
 		Usage: `Use SpotVM to run tests, If the provider does not support spotVM, it will be ignored`,
 	})
 
+	AutoKillThreshold float64 = 1.0
+	_                         = registerRunFlag(&AutoKillThreshold, FlagInfo{
+		Name:  "auto-kill-threshold",
+		Usage: `Percentage of failed tests before all remaining tests are automatically terminated.`,
+	})
+
 	GlobalSeed int64 = randutil.NewPseudoSeed()
 	_                = registerRunFlag(&GlobalSeed, FlagInfo{
 		Name:  "global-seed",

diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
@@ -371,6 +371,7 @@ func (r *testRunner) Run(
 				lopt,
 				topt,
 				l,
+				n*count,
 			)
 
 			if err != nil {
@@ -552,6 +553,7 @@ func (r *testRunner) runWorker(
 	lopt loggingOpt,
 	topt testOpts,
 	l *logger.Logger,
+	maxTotalFailures int,
 ) error {
 	stdout := lopt.stdout
 
@@ -610,6 +612,14 @@ func (r *testRunner) runWorker(
 			}
 		}
 
+		// Stop running tests once the failure rate exceeds the auto-kill threshold.
+		r.status.Lock()
+		failureRate := float64(len(r.status.fail)) / float64(maxTotalFailures)
+		r.status.Unlock()
+		if failureRate > roachtestflags.AutoKillThreshold {
+			return errors.Errorf("failure rate %.2f exceeds limit %.2f", failureRate, roachtestflags.AutoKillThreshold)
+		}
+
 		wStatus.SetTest(nil /* test */, testToRunRes{})
 
 		testToRun := testToRunRes{noWork: true}
@@ -1140,7 +1150,10 @@ func (r *testRunner) runTest(
 		// Only include tests with a Run function in the summary output.
 		if s.Run != nil {
 			if t.Failed() {
-				r.status.fail[t] = struct{}{}
+				errWithOwner := failuresAsErrorWithOwnership(t.failures())
+				if errWithOwner == nil || !errWithOwner.InfraFlake {
+					r.status.fail[t] = struct{}{}
+				}
 			} else if s.Skip != "" {
 				r.status.skip[t] = struct{}{}
 			} else {