Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-23.1: roachprod: retry dns lookup on network failures #111763

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 28 additions & 19 deletions pkg/roachprod/vm/gce/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,22 +151,33 @@ func (n dnsProvider) Domain() string {
return dnsDomain
}

// lookupSRVRecords uses standard net tools to perform a DNS lookup. For
// lookups, we prefer this to using the gcloud command as it is faster, and
// preferable when service information is being queried regularly.
// lookupSRVRecords uses standard net tools to perform a DNS lookup. This
// function will retry the lookup several times if there are any intermittent
// network problems. For lookups, we prefer this to using the gcloud command as
// it is faster, and preferable when service information is being queried
// regularly.
func (n dnsProvider) lookupSRVRecords(
ctx context.Context, service, proto, name string,
) ([]vm.DNSRecord, error) {
cName, srvRecords, err := n.resolver.LookupSRV(ctx, service, proto, name)
if dnsError := (*net.DNSError)(nil); errors.As(err, &dnsError) {
// We ignore some errors here as they are likely due to the record name not
// existing. The net.LookupSRV function tends to return "server misbehaving"
// and "no such host" errors when no record entries are found. Hence, making
// the errors ambiguous and not useful. The errors are not exported, so we
// have to check the error message.
if dnsError.Err != "server misbehaving" && dnsError.Err != "no such host" && !dnsError.IsNotFound {
return nil, err
var err error
var cName string
var srvRecords []*net.SRV
err = retry.WithMaxAttempts(ctx, retry.Options{}, 10, func() error {
cName, srvRecords, err = n.resolver.LookupSRV(ctx, service, proto, name)
if dnsError := (*net.DNSError)(nil); errors.As(err, &dnsError) {
// We ignore some errors here as they are likely due to the record name not
// existing. The net.LookupSRV function tends to return "server misbehaving"
// and "no such host" errors when no record entries are found. Hence, making
// the errors ambiguous and not useful. The errors are not exported, so we
// have to check the error message.
if dnsError.Err != "server misbehaving" && dnsError.Err != "no such host" && !dnsError.IsNotFound {
return dnsError
}
}
return nil
})
if err != nil {
return nil, err
}
records := make([]vm.DNSRecord, len(srvRecords))
for i, srvRecord := range srvRecords {
Expand Down Expand Up @@ -237,10 +248,7 @@ func (n dnsProvider) waitForRecordsAvailable(ctx context.Context, records ...vm.
}] = struct{}{}
}

return retry.WithMaxAttempts(ctx, retry.Options{
InitialBackoff: 5 * time.Second,
Multiplier: 1,
}, 20, func() error {
for attempts := 0; attempts < 30; attempts++ {
for key := range notAvailable {
foundRecords, err := n.lookupSRVRecords(ctx, "", "", key.name)
if err != nil {
Expand All @@ -256,7 +264,8 @@ func (n dnsProvider) waitForRecordsAvailable(ctx context.Context, records ...vm.
if len(notAvailable) == 0 {
return nil
}
return errors.Newf("waiting for DNS records to become available: %d out of %d records not available",
len(notAvailable), len(records))
})
time.Sleep(2 * time.Second)
}
return errors.Newf("waiting for DNS records to become available: %d out of %d records not available",
len(notAvailable), len(records))
}