Skip to content

Commit

Permalink
Merge #110831
Browse files Browse the repository at this point in the history
110831: roachprod: classify dns errors r=smg260 a=herkolategan

Previously DNS errors were not marked making it hard to classify these errors
later on. This change marks all DNS related errors in order to better report on
the errors.

roachtest then classifies these errors as infra flake to help with triaging.

Epic: None
Release Note: None

Co-authored-by: Herko Lategan <[email protected]>
  • Loading branch information
craig[bot] and herkolategan committed Sep 21, 2023
2 parents 02435d6 + cf8a7dd commit a3dccc6
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 11 deletions.
2 changes: 2 additions & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ go_library(
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/gce",
"//pkg/testutils/skip",
"//pkg/util/allstacks",
"//pkg/util/ctxgroup",
Expand Down Expand Up @@ -94,6 +95,7 @@ go_test(
"//pkg/roachprod/errors",
"//pkg/roachprod/logger",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/gce",
"//pkg/testutils",
"//pkg/testutils/echotest",
"//pkg/util/quotapool",
Expand Down
6 changes: 6 additions & 0 deletions pkg/cmd/roachtest/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
)

type githubIssues struct {
Expand Down Expand Up @@ -156,6 +157,11 @@ func (g *githubIssues) createPostRequest(
issueName = "ssh_problem"
messagePrefix = fmt.Sprintf("test %s failed due to ", testName)
infraFlake = true
case failureContainsError(firstFailure, gce.ErrDNSOperation):
issueOwner = registry.OwnerTestEng
issueName = "dns_problem"
messagePrefix = fmt.Sprintf("test %s failed due to ", testName)
infraFlake = true
case failureContainsError(firstFailure, errDuringPostAssertions):
messagePrefix = fmt.Sprintf("test %s failed during post test assertions (see test-post-assertions.log) due to ", testName)
}
Expand Down
10 changes: 9 additions & 1 deletion pkg/cmd/roachtest/github_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/internal/team"
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
"github.com/cockroachdb/cockroach/pkg/testutils/echotest"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -176,6 +177,8 @@ func TestCreatePostRequest(t *testing.T) {
{true, false, true, false, nil, "", createFailure(errors.New("other")), false, false, false, nil},
//Error during post test assertions
{true, false, false, false, nil, "", createFailure(errDuringPostAssertions), false, false, false, nil},
//Error during dns operation
{true, false, false, false, nil, "", createFailure(gce.ErrDNSOperation), true, false, true, nil},
// Assert that extra labels in the test spec are added to the issue.
{true, false, false, false, []string{"foo-label"}, "", createFailure(errors.New("other")), true, false, false,
prefixAll(map[string]string{
Expand Down Expand Up @@ -263,7 +266,12 @@ func TestCreatePostRequest(t *testing.T) {
expectedLabels := []string{}
expectedMessagePrefix := ""

if errors.Is(c.failure.squashedErr, errClusterProvisioningFailed) {
if errors.Is(c.failure.squashedErr, gce.ErrDNSOperation) {
expectedTeam = "@cockroachdb/test-eng"
expectedLabels = []string{"T-testeng", "X-infra-flake"}
expectedName = "dns_problem"
expectedMessagePrefix = "test github_test failed due to "
} else if errors.Is(c.failure.squashedErr, errClusterProvisioningFailed) {
expectedTeam = "@cockroachdb/test-eng"
expectedLabels = []string{"T-testeng", "X-infra-flake"}
expectedName = "cluster_creation"
Expand Down
17 changes: 17 additions & 0 deletions pkg/cmd/roachtest/testdata/help_command_createpost_7.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
echo
----
----


See: [roachtest README](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md)



See: [How To Investigate \(internal\)](https://cockroachlabs.atlassian.net/l/c/SSSBr8c7)



See: [Grafana](https://go.crdb.dev/roachtest-grafana//github-test/1689957243000/1689957853000)

----
----
31 changes: 21 additions & 10 deletions pkg/roachprod/vm/gce/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ const (
dnsMaxResults = 1000
)

var ErrDNSOperation = fmt.Errorf("error during Google Cloud DNS operation")

var _ vm.DNSProvider = &dnsProvider{}

// dnsProvider implements the vm.DNSProvider interface.
Expand Down Expand Up @@ -97,7 +99,7 @@ func (n dnsProvider) CreateRecords(ctx context.Context, records ...vm.DNSRecord)
cmd := exec.Command("gcloud", args...)
out, err := cmd.CombinedOutput()
if err != nil {
return errors.Wrapf(err, "output: %s", out)
return markDNSOperationError(errors.Wrapf(err, "output: %s", out))
}
}
// The DNS records are not immediately available after creation. We wait until
Expand All @@ -119,13 +121,14 @@ func (n dnsProvider) LookupSRVRecords(
func (n dnsProvider) DeleteRecordsBySubdomain(subdomain string) error {
suffix := fmt.Sprintf("%s.%s.", subdomain, n.Domain())
records, err := n.listSRVRecords(suffix, dnsMaxResults)
if err != nil {
return err
}

names := make(map[string]struct{})
for _, record := range records {
names[record.Name] = struct{}{}
}
if err != nil {
return err
}
for name := range names {
// Only delete records that match the subdomain. The initial filter by
// gcloud does not specifically match suffixes, hence we check here to
Expand All @@ -140,7 +143,7 @@ func (n dnsProvider) DeleteRecordsBySubdomain(subdomain string) error {
cmd := exec.Command("gcloud", args...)
out, err := cmd.CombinedOutput()
if err != nil {
return errors.Wrapf(err, "output: %s", out)
return markDNSOperationError(errors.Wrapf(err, "output: %s", out))
}
}
return nil
Expand Down Expand Up @@ -171,7 +174,7 @@ func (n dnsProvider) lookupSRVRecords(
// the errors ambiguous and not useful. The errors are not exported, so we
// have to check the error message.
if dnsError.Err != "server misbehaving" && dnsError.Err != "no such host" && !dnsError.IsNotFound {
return dnsError
return markDNSOperationError(dnsError)
}
}
return nil
Expand Down Expand Up @@ -200,7 +203,7 @@ func (n dnsProvider) listSRVRecords(filter string, limit int) ([]vm.DNSRecord, e
cmd := exec.Command("gcloud", args...)
res, err := cmd.CombinedOutput()
if err != nil {
return nil, errors.Wrapf(err, "output: %s", res)
return nil, markDNSOperationError(errors.Wrapf(err, "output: %s", res))
}
var jsonList []struct {
Name string `json:"name"`
Expand All @@ -212,7 +215,7 @@ func (n dnsProvider) listSRVRecords(filter string, limit int) ([]vm.DNSRecord, e

err = json.Unmarshal(res, &jsonList)
if err != nil {
return nil, errors.Wrapf(err, "error unmarshalling output: %s", res)
return nil, markDNSOperationError(errors.Wrapf(err, "error unmarshalling output: %s", res))
}

records := make([]vm.DNSRecord, 0)
Expand Down Expand Up @@ -266,6 +269,14 @@ func (n dnsProvider) waitForRecordsAvailable(ctx context.Context, records ...vm.
}
time.Sleep(2 * time.Second)
}
return errors.Newf("waiting for DNS records to become available: %d out of %d records not available",
len(notAvailable), len(records))
return markDNSOperationError(
errors.Newf("waiting for DNS records to become available: %d out of %d records not available",
len(notAvailable), len(records)),
)
}

// markDNSOperationError should be used to mark any external DNS API or Google
// Cloud DNS CLI errors as DNS operation errors.
func markDNSOperationError(err error) error {
return errors.Mark(err, ErrDNSOperation)
}

0 comments on commit a3dccc6

Please sign in to comment.