Skip to content

Commit

Permalink
Merge #88492 #89913
Browse files Browse the repository at this point in the history
88492: Roachtest redirect SSH flakes to test-eng r=tbg a=smg260

*See second commit note at the bottom* 

This PR inspects the failure output of a roachtest, and if it sees an SSH_PROBLEM, overrides the owning team to test-eng when reporting the github issue. 

Currently errors are classified as an `SSH` error by roachprod if the exit code is `255` with an accompanying message prefixed with `SSH_PROBLEM` [[1]](https://github.com/cockroachdb/cockroach/blob/ad3bd1355463cefdc07e995765fa82adfe391d05/pkg/roachprod/errors/errors.go#L112). The errors are stringified and saved into `t.mu.output|failureMsg`. Thus in the test_runner at the call site of issue posting, we can check `t.mu.output` for `SSH_PROBLEM` and override the team and issue name accordingly.


Resolves: #82398

Release justification: test-only change
Release note: none

89913: changefeedccl: job-level retry when error message is about draining r=[miretskiy] a=HonoreDB

See #https://github.com/cockroachlabs/support/issues/1839. The flow retryable error marker doesn't survive every path by which it can bubble up, so just look for the single word "draining" as false positives are much better than false negatives.

Fixes #89663

Release note (enterprise change): Fixed a bug that could cause changefeeds to fail during a rolling restart.

Co-authored-by: Miral Gadani <[email protected]>
Co-authored-by: Aaron Zinger <[email protected]>
  • Loading branch information
3 people committed Oct 25, 2022
3 parents 847c146 + 741a235 + 531232c commit 774682b
Show file tree
Hide file tree
Showing 11 changed files with 201 additions and 98 deletions.
4 changes: 3 additions & 1 deletion pkg/ccl/changefeedccl/changefeedbase/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,9 @@ func IsRetryableError(err error) bool {
// that we can't recover the structure and we have to rely on this
// unfortunate string comparison.
errStr := err.Error()
if strings.Contains(errStr, retryableErrorString) || strings.Contains(errStr, kvcoord.SendErrorString) {
if strings.Contains(errStr, retryableErrorString) ||
strings.Contains(errStr, kvcoord.SendErrorString) ||
strings.Contains(errStr, "draining") {
return true
}

Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ go_library(
"//pkg/internal/team",
"//pkg/roachprod",
"//pkg/roachprod/config",
"//pkg/roachprod/errors",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
Expand Down
48 changes: 42 additions & 6 deletions pkg/cmd/roachtest/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/internal/team"
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
)
Expand All @@ -32,6 +33,14 @@ type githubIssues struct {
teamLoader func() (team.Map, error)
}

type issueCategory int

const (
otherErr issueCategory = iota
clusterCreationErr
sshErr
)

func newGithubIssues(
disable bool, c *clusterImpl, vmCreateOpts *vm.CreateOpts, l *logger.Logger,
) *githubIssues {
Expand Down Expand Up @@ -59,16 +68,33 @@ func (g *githubIssues) shouldPost(t test.Test) bool {
t.Spec().(*registry.TestSpec).Cluster.NodeCount > 0
}

func (g *githubIssues) createPostRequest(t test.Test, message string) issues.PostRequest {
func (g *githubIssues) createPostRequest(
t test.Test, cat issueCategory, message string,
) issues.PostRequest {
var mention []string
var projColID int

issueOwner := t.Spec().(*registry.TestSpec).Owner
issueName := t.Name()

messagePrefix := ""
// Overrides to shield eng teams from potential flakes
if cat == clusterCreationErr {
issueOwner = registry.OwnerDevInf
issueName = "cluster_creation"
messagePrefix = fmt.Sprintf("test %s was skipped due to ", t.Name())
} else if cat == sshErr {
issueOwner = registry.OwnerTestEng
issueName = "ssh_problem"
messagePrefix = fmt.Sprintf("test %s failed due to ", t.Name())
}

teams, err := g.teamLoader()
if err != nil {
t.Fatalf("could not load teams: %v", err)
}

if sl, ok := teams.GetAliasesForPurpose(ownerToAlias(t.Spec().(*registry.TestSpec).Owner), team.PurposeRoachtest); ok {
if sl, ok := teams.GetAliasesForPurpose(ownerToAlias(issueOwner), team.PurposeRoachtest); ok {
for _, alias := range sl {
mention = append(mention, "@"+string(alias))
}
Expand Down Expand Up @@ -112,8 +138,8 @@ func (g *githubIssues) createPostRequest(t test.Test, message string) issues.Pos
MentionOnCreate: mention,
ProjectColumnID: projColID,
PackageName: "roachtest",
TestName: t.Name(),
Message: message,
TestName: issueName,
Message: messagePrefix + message,
Artifacts: artifacts,
ExtraLabels: labels,
ExtraParams: clusterParams,
Expand All @@ -130,14 +156,24 @@ func (g *githubIssues) createPostRequest(t test.Test, message string) issues.Pos
}
}

func (g *githubIssues) MaybePost(t test.Test, message string) error {
func (g *githubIssues) MaybePost(t *testImpl, message string) error {
if !g.shouldPost(t) {
return nil
}

cat := otherErr

// Overrides to shield eng teams from potential flakes
firstFailure := t.firstFailure()
if failureContainsError(firstFailure, errClusterProvisioningFailed) {
cat = clusterCreationErr
} else if failureContainsError(firstFailure, rperrors.ErrSSH255) {
cat = sshErr
}

return g.issuePoster(
context.Background(),
issues.UnitTestFormatter,
g.createPostRequest(t, message),
g.createPostRequest(t, cat, message),
)
}
41 changes: 32 additions & 9 deletions pkg/cmd/roachtest/github_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ var (
teamsYaml = `cockroachdb/unowned:
aliases:
cockroachdb/rfc-prs: other
triage_column_id: 0`
triage_column_id: 0
cockroachdb/test-eng:
triage_column_id: 14041337
cockroachdb/dev-inf:
triage_column_id: 10210759`

validTeamsFn = func() (team.Map, error) { return loadYamlTeams(teamsYaml) }
invalidTeamsFn = func() (team.Map, error) { return loadYamlTeams("invalid yaml") }
Expand Down Expand Up @@ -57,7 +61,7 @@ func TestShouldPost(t *testing.T) {
envTcBuildBranch string
expected bool
}{
/* Cases 1 - 4 verify that issues are not posted if any of on the relevant criteria checks fail */
/* Cases 1 - 4 verify that issues are not posted if any of the relevant criteria checks fail */
// disable
{true, 1, "token", "master", false},
// nodeCount
Expand Down Expand Up @@ -102,10 +106,11 @@ func TestCreatePostRequest(t *testing.T) {
clusterCreationFailed bool
loadTeamsFailed bool
localSSD bool
category issueCategory
expectedPost bool
expectedParams map[string]string
}{
{true, false, false, false, true,
{true, false, false, false, otherErr, true,
prefixAll(map[string]string{
"cloud": "gce",
"encrypted": "false",
Expand All @@ -115,7 +120,7 @@ func TestCreatePostRequest(t *testing.T) {
"localSSD": "false",
}),
},
{true, false, false, true, true,
{true, false, false, true, clusterCreationErr, true,
prefixAll(map[string]string{
"cloud": "gce",
"encrypted": "false",
Expand All @@ -128,15 +133,15 @@ func TestCreatePostRequest(t *testing.T) {
// Assert that release-blocker label exists when !nonReleaseBlocker
// Also ensure that in the event of a failed cluster creation,
// nil `vmOptions` and `clusterImpl` are not dereferenced
{false, true, false, false, true,
{false, true, false, false, sshErr, true,
prefixAll(map[string]string{
"cloud": "gce",
"ssd": "0",
"cpu": "4",
}),
},
//Simulate failure loading TEAMS.yaml
{true, false, true, false, false, nil},
{true, false, true, false, otherErr, false, nil},
}

reg, _ := makeTestRegistry(spec.GCE, "", "", false)
Expand All @@ -145,7 +150,7 @@ func TestCreatePostRequest(t *testing.T) {
clusterSpec := reg.MakeClusterSpec(1)

testSpec := &registry.TestSpec{
Name: "githubPost",
Name: "github_test",
Owner: OwnerUnitTest,
Cluster: clusterSpec,
NonReleaseBlocker: c.nonReleaseBlocker,
Expand Down Expand Up @@ -183,9 +188,9 @@ func TestCreatePostRequest(t *testing.T) {

if c.loadTeamsFailed {
// Assert that if TEAMS.yaml cannot be loaded then function panics.
assert.Panics(t, func() { github.createPostRequest(ti, "message") })
assert.Panics(t, func() { github.createPostRequest(ti, c.category, "message") })
} else {
req := github.createPostRequest(ti, "message")
req := github.createPostRequest(ti, c.category, "message")

if c.expectedParams != nil {
require.Equal(t, c.expectedParams, req.ExtraParams)
Expand All @@ -196,6 +201,24 @@ func TestCreatePostRequest(t *testing.T) {
if !c.nonReleaseBlocker {
require.True(t, contains(req.ExtraLabels, nil, "release-blocker"))
}

expectedTeam := "@cockroachdb/unowned"
expectedName := "github_test"
expectedMessagePrefix := ""

if c.category == clusterCreationErr {
expectedTeam = "@cockroachdb/dev-inf"
expectedName = "cluster_creation"
expectedMessagePrefix = "test github_test was skipped due to "
} else if c.category == sshErr {
expectedTeam = "@cockroachdb/test-eng"
expectedName = "ssh_problem"
expectedMessagePrefix = "test github_test failed due to "
}

require.Contains(t, req.MentionOnCreate, expectedTeam)
require.Equal(t, expectedName, req.TestName)
require.True(t, strings.HasPrefix(req.Message, expectedMessagePrefix), req.Message)
}
}
}
11 changes: 9 additions & 2 deletions pkg/cmd/roachtest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ runner itself.
if errors.Is(err, errTestsFailed) {
code = ExitCodeTestsFailed
}
if errors.Is(err, errClusterProvisioningFailed) {
if errors.Is(err, errSomeClusterProvisioningFailed) {
code = ExitCodeClusterProvisioningFailed
}
// Cobra has already printed the error message.
Expand Down Expand Up @@ -382,8 +382,15 @@ func runTests(register func(registry.Registry), cfg cliCfg) error {

filter := registry.NewTestFilter(cfg.args)
clusterType := roachprodCluster
bindTo := ""
if local {
clusterType = localCluster

// This will suppress the annoying "Allow incoming network connections" popup from
// OSX when running a roachtest
bindTo = "localhost"

fmt.Printf("--local specified. Binding http listener to localhost only")
if cfg.parallelism != 1 {
fmt.Printf("--local specified. Overriding --parallelism to 1.\n")
cfg.parallelism = 1
Expand All @@ -398,7 +405,7 @@ func runTests(register func(registry.Registry), cfg cliCfg) error {
keepClustersOnTestFailure: cfg.debugEnabled,
clusterID: cfg.clusterID,
}
if err := runner.runHTTPServer(cfg.httpPort, os.Stdout); err != nil {
if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil {
return err
}

Expand Down
Loading

0 comments on commit 774682b

Please sign in to comment.