Skip to content

Commit

Permalink
Merge #122783
Browse files Browse the repository at this point in the history
122783: ci: autokill roachtest nightlies when failure rate exceeds threshold r=vidit-bhat a=vidit-bhat

Nightly roachtests are fairly stable, exhibiting failure rates < 5%, on average. Occasionally, a regression (typically merged the day of the nightly run) or an infrastructure change/transient issue can result in a cascade of failures. Since a high failure rate is likely indicative of an issue which may impact a large subset of the roachtests, the preference is to kill the CI job on the grounds of having reached a point of diminishing returns.

This PR introduces a roachtest CLI argument, `auto-kill-threshold`: a failure-rate threshold which, when exceeded, auto-kills the nightlies.

Epic: none
Fixes: #120160 
Release note: None

Co-authored-by: Vidit Bhat <[email protected]>
  • Loading branch information
craig[bot] and vidit-bhat committed May 6, 2024
2 parents 6299317 + 6a1c6b6 commit 4dc9d94
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ build/teamcity-roachtest-invoke.sh \
--cloud="${CLOUD}" \
--count="${COUNT-1}" \
--clear-cluster-cache="${CLEAR_CLUSTER_CACHE:-true}" \
--auto-kill-threshold="${AUTO_KILL_THRESHOLD:-0.05}" \
--parallelism="${PARALLELISM}" \
--cpu-quota="${CPUQUOTA}" \
--cluster-id="${TC_BUILD_ID}" \
Expand Down
33 changes: 23 additions & 10 deletions pkg/cmd/roachtest/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,26 @@ func generateHelpCommand(
}
}

// failuresAsErrorWithOwnership maps a test's failures to the ownership
// information that should be used when reporting them.
//
// If any failure matches a roachprod transient error, the result assigns the
// error to Test Eng, uses the transient error's cause as the title override,
// and marks it as an infra flake. Otherwise the result is whatever
// failuresSpecifyOwner reports for the failures, which is nil when none of
// them specifies an owner.
func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnership {
	var transient rperrors.TransientError
	if !failuresMatchingError(failures, &transient) {
		// No transient error: defer to any ownership carried by the failures
		// themselves (nil when there is none).
		return failuresSpecifyOwner(failures)
	}

	flake := registry.ErrorWithOwner(
		registry.OwnerTestEng, transient,
		registry.WithTitleOverride(transient.Cause),
		registry.InfraFlake,
	)
	return &flake
}

// postIssueCondition encapsulates a condition that causes issue
// posting to be skipped. The `reason` field contains a textual
// description as to why issue posting was skipped.
Expand Down Expand Up @@ -167,19 +187,12 @@ func (g *githubIssues) createPostRequest(
}

issueClusterName := ""
errWithOwnership := failuresSpecifyOwner(failures)
var transientError rperrors.TransientError
// If we find a failure that was labeled as a roachprod transient
// error, redirect that to Test Eng with the corresponding label as
// title override.
if failuresMatchingError(failures, &transientError) {
handleErrorWithOwnership(registry.ErrorWithOwner(
registry.OwnerTestEng, transientError,
registry.WithTitleOverride(transientError.Cause),
registry.InfraFlake,
))
} else if errWithOwnership != nil {
handleErrorWithOwnership(*errWithOwnership)
errWithOwner := failuresAsErrorWithOwnership(failures)
if errWithOwner != nil {
handleErrorWithOwnership(*errWithOwner)
}

// Issues posted from roachtest are identifiable as such, and they are also release blockers
Expand Down
6 changes: 6 additions & 0 deletions pkg/cmd/roachtest/roachtestflags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,12 @@ var (
Usage: `Use SpotVM to run tests, If the provider does not support spotVM, it will be ignored`,
})

// AutoKillThreshold is the failure rate above which all remaining tests are
// terminated. It is a fraction, not a 0-100 percentage: the test runner
// compares it against (failed tests / total tests), so the default of 1.0
// can never be exceeded and leaves auto-kill disabled.
AutoKillThreshold float64 = 1.0
_ = registerRunFlag(&AutoKillThreshold, FlagInfo{
	Name:  "auto-kill-threshold",
	Usage: `Fraction of failed tests (0.0-1.0) above which all remaining tests are automatically terminated.`,
})

GlobalSeed int64 = randutil.NewPseudoSeed()
_ = registerRunFlag(&GlobalSeed, FlagInfo{
Name: "global-seed",
Expand Down
15 changes: 14 additions & 1 deletion pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ func (r *testRunner) Run(
lopt,
topt,
l,
n*count,
)

if err != nil {
Expand Down Expand Up @@ -552,6 +553,7 @@ func (r *testRunner) runWorker(
lopt loggingOpt,
topt testOpts,
l *logger.Logger,
maxTotalFailures int,
) error {
stdout := lopt.stdout

Expand Down Expand Up @@ -610,6 +612,14 @@ func (r *testRunner) runWorker(
}
}

// Stop the tests if the failure rate exceeds the auto-kill threshold.
r.status.Lock()
failureRate := float64(len(r.status.fail)) / float64(maxTotalFailures)
r.status.Unlock()
if failureRate > roachtestflags.AutoKillThreshold {
return errors.Errorf("failure rate %.2f exceeds limit %.2f", failureRate, roachtestflags.AutoKillThreshold)
}

wStatus.SetTest(nil /* test */, testToRunRes{})

testToRun := testToRunRes{noWork: true}
Expand Down Expand Up @@ -1140,7 +1150,10 @@ func (r *testRunner) runTest(
// Only include tests with a Run function in the summary output.
if s.Run != nil {
if t.Failed() {
r.status.fail[t] = struct{}{}
errWithOwner := failuresAsErrorWithOwnership(t.failures())
if errWithOwner == nil || !errWithOwner.InfraFlake {
r.status.fail[t] = struct{}{}
}
} else if s.Skip != "" {
r.status.skip[t] = struct{}{}
} else {
Expand Down

0 comments on commit 4dc9d94

Please sign in to comment.