Skip to content

Commit

Permalink
roachprod: retry dns lookup on network failures
Browse files Browse the repository at this point in the history
Previously we had failures in net.LookupSRV that appear to be network flakes. It
would be preferable to not fail a whole `roachtest` on a single DNS network flake.
This change wraps a retry around the dns lookup in order to prevent flakiness
and have a chance of recovery.

Fixes: cockroachdb#110884

Epic: None
Release Note: None
  • Loading branch information
herkolategan committed Oct 4, 2023
1 parent 5874e3e commit b872307
Showing 1 changed file with 28 additions and 19 deletions.
47 changes: 28 additions & 19 deletions pkg/roachprod/vm/gce/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,22 +151,33 @@ func (n dnsProvider) Domain() string {
return dnsDomain
}

// lookupSRVRecords uses standard net tools to perform a DNS lookup. For
// lookups, we prefer this to using the gcloud command as it is faster, and
// preferable when service information is being queried regularly.
// lookupSRVRecords uses standard net tools to perform a DNS lookup. This
// function will retry the lookup several times if there are any intermittent
// network problems. For lookups, we prefer this to using the gcloud command as
// it is faster, and preferable when service information is being queried
// regularly.
func (n dnsProvider) lookupSRVRecords(
ctx context.Context, service, proto, name string,
) ([]vm.DNSRecord, error) {
cName, srvRecords, err := n.resolver.LookupSRV(ctx, service, proto, name)
if dnsError := (*net.DNSError)(nil); errors.As(err, &dnsError) {
// We ignore some errors here as they are likely due to the record name not
// existing. The net.LookupSRV function tends to return "server misbehaving"
// and "no such host" errors when no record entries are found. Hence, making
// the errors ambiguous and not useful. The errors are not exported, so we
// have to check the error message.
if dnsError.Err != "server misbehaving" && dnsError.Err != "no such host" && !dnsError.IsNotFound {
return nil, err
var err error
var cName string
var srvRecords []*net.SRV
err = retry.WithMaxAttempts(ctx, retry.Options{}, 10, func() error {
cName, srvRecords, err = n.resolver.LookupSRV(ctx, service, proto, name)
if dnsError := (*net.DNSError)(nil); errors.As(err, &dnsError) {
// We ignore some errors here as they are likely due to the record name not
// existing. The net.LookupSRV function tends to return "server misbehaving"
// and "no such host" errors when no record entries are found. Hence, making
// the errors ambiguous and not useful. The errors are not exported, so we
// have to check the error message.
if dnsError.Err != "server misbehaving" && dnsError.Err != "no such host" && !dnsError.IsNotFound {
return dnsError
}
}
return nil
})
if err != nil {
return nil, err
}
records := make([]vm.DNSRecord, len(srvRecords))
for i, srvRecord := range srvRecords {
Expand Down Expand Up @@ -237,10 +248,7 @@ func (n dnsProvider) waitForRecordsAvailable(ctx context.Context, records ...vm.
}] = struct{}{}
}

return retry.WithMaxAttempts(ctx, retry.Options{
InitialBackoff: 5 * time.Second,
Multiplier: 1,
}, 20, func() error {
for attempts := 0; attempts < 30; attempts++ {
for key := range notAvailable {
foundRecords, err := n.lookupSRVRecords(ctx, "", "", key.name)
if err != nil {
Expand All @@ -256,7 +264,8 @@ func (n dnsProvider) waitForRecordsAvailable(ctx context.Context, records ...vm.
if len(notAvailable) == 0 {
return nil
}
return errors.Newf("waiting for DNS records to become available: %d out of %d records not available",
len(notAvailable), len(records))
})
time.Sleep(2 * time.Second)
}
return errors.Newf("waiting for DNS records to become available: %d out of %d records not available",
len(notAvailable), len(records))
}

0 comments on commit b872307

Please sign in to comment.