Skip to content

Commit

Permalink
detect Retry-After during async “does resource exist?” flow
Browse files Browse the repository at this point in the history
  • Loading branch information
jackfrancis committed Oct 5, 2022
1 parent 071dd13 commit bcee198
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 6 deletions.
44 changes: 38 additions & 6 deletions azure/services/async/async.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ package async

import (
"context"
"net/http"
"strconv"
"time"

"github.com/Azure/go-autorest/autorest"
azureautorest "github.com/Azure/go-autorest/autorest/azure"
"github.com/pkg/errors"
infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
Expand Down Expand Up @@ -76,7 +79,7 @@ func processOngoingOperation(ctx context.Context, scope FutureScope, client Futu

// Operation is still in progress, update conditions and requeue.
log.V(2).Info("long running operation is still ongoing", "service", serviceName, "resource", resourceName)
return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), retryAfter(sdkFuture))
return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), getRequeueAfterFromFuture(sdkFuture))
}
if err != nil {
log.V(2).Error(err, "error checking long running operation status after it finished")
Expand Down Expand Up @@ -111,7 +114,8 @@ func (s *Service) CreateResource(ctx context.Context, spec azure.ResourceSpecGet
// Get the resource if it already exists, and use it to construct the desired resource parameters.
var existingResource interface{}
if existing, err := s.Creator.Get(ctx, spec); err != nil && !azure.ResourceNotFound(err) {
return nil, errors.Wrapf(err, "failed to get existing resource %s/%s (service: %s)", rgName, resourceName, serviceName)
errWrapped := errors.Wrapf(err, "failed to get existing resource %s/%s (service: %s)", rgName, resourceName, serviceName)
return nil, azure.WithTransientError(errWrapped, getRetryAfterFromError(err))
} else if err == nil {
existingResource = existing
log.V(2).Info("successfully got existing resource", "service", serviceName, "resource", resourceName, "resourceGroup", rgName)
Expand All @@ -136,7 +140,7 @@ func (s *Service) CreateResource(ctx context.Context, spec azure.ResourceSpecGet
return nil, errors.Wrapf(err, "failed to create resource %s/%s (service: %s)", rgName, resourceName, serviceName)
}
s.Scope.SetLongRunningOperationState(future)
return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), retryAfter(sdkFuture))
return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), getRequeueAfterFromFuture(sdkFuture))
} else if err != nil {
return nil, errors.Wrapf(err, "failed to create resource %s/%s (service: %s)", rgName, resourceName, serviceName)
}
Expand Down Expand Up @@ -170,7 +174,7 @@ func (s *Service) DeleteResource(ctx context.Context, spec azure.ResourceSpecGet
return errors.Wrapf(err, "failed to delete resource %s/%s (service: %s)", rgName, resourceName, serviceName)
}
s.Scope.SetLongRunningOperationState(future)
return azure.WithTransientError(azure.NewOperationNotDoneError(future), retryAfter(sdkFuture))
return azure.WithTransientError(azure.NewOperationNotDoneError(future), getRequeueAfterFromFuture(sdkFuture))
} else if err != nil {
if azure.ResourceNotFound(err) {
// already deleted
Expand All @@ -183,12 +187,40 @@ func (s *Service) DeleteResource(ctx context.Context, spec azure.ResourceSpecGet
return nil
}

// retryAfter returns the max between the `RETRY-AFTER` header and the default requeue time.
// getRequeueAfterFromFuture returns the max between the `RETRY-AFTER` header and the default requeue time.
// This ensures we respect the retry-after header if it is set and avoid retrying too often during an API throttling event.
func retryAfter(sdkFuture azureautorest.FutureAPI) time.Duration {
func getRequeueAfterFromFuture(sdkFuture azureautorest.FutureAPI) time.Duration {
retryAfter, _ := sdkFuture.GetPollingDelay()
if retryAfter < reconciler.DefaultReconcilerRequeue {
retryAfter = reconciler.DefaultReconcilerRequeue
}
return retryAfter
}

// getRetryAfterFromError returns the time.Duration from the http.Response in the autorest.DetailedError.
// If there is no Response object, or if there is no meaningful Retry-After header data, we return a default.
func getRetryAfterFromError(err error) time.Duration {
// In case we aren't able to introspect Retry-After from the error type, we'll return this default
ret := reconciler.DefaultReconcilerRequeue
var detailedError autorest.DetailedError
// if we have a strongly typed autorest.DetailedError then we can introspect the HTTP response data
if errors.As(err, &detailedError) {
if detailedError.Response != nil {
// If we have Retry-After HTTP header data for any reason, prefer it
if retryAfter := detailedError.Response.Header.Get("Retry-After"); retryAfter != "" {
// This handles the case where Retry-After data is in the form of units of seconds
if rai, err := strconv.Atoi(retryAfter); err == nil {
ret = time.Duration(rai) * time.Second
// This handles the case where Retry-After data is in the form of absolute time
} else if t, err := time.Parse(time.RFC1123, retryAfter); err == nil {
ret = time.Until(t)
}
// If we didn't find Retry-After HTTP header data but the response type is 429,
// we'll have to come up with our sane default.
} else if detailedError.Response.StatusCode == http.StatusTooManyRequests {
ret = reconciler.DefaultHTTP429RetryAfter
}
}
}
return ret
}
85 changes: 85 additions & 0 deletions azure/services/async/async_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"errors"
"net/http"
"testing"
"time"

"github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-05-01/resources"
"github.com/Azure/go-autorest/autorest"
Expand All @@ -31,6 +32,7 @@ import (
"sigs.k8s.io/cluster-api-provider-azure/azure/mock_azure"
"sigs.k8s.io/cluster-api-provider-azure/azure/services/async/mock_async"
gomockinternal "sigs.k8s.io/cluster-api-provider-azure/internal/test/matchers/gomock"
"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
)

var (
Expand Down Expand Up @@ -434,3 +436,86 @@ func TestDeleteResource(t *testing.T) {
})
}
}

func TestGetRetryAfterFromError(t *testing.T) {
cases := []struct {
name string
input error
expected time.Duration
expectedRangeTolerance time.Duration
}{
{
name: "Retry-After header data present in the form of units of seconds",
input: autorest.DetailedError{
Response: &http.Response{
Header: http.Header{
"Retry-After": []string{"2"},
},
},
},
expected: 2 * time.Second,
},
{
name: "Retry-After header data present in the form of units of absolute time",
input: autorest.DetailedError{
Response: &http.Response{
Header: http.Header{
"Retry-After": []string{time.Now().Add(1 * time.Hour).Format(time.RFC1123)},
},
},
},
expected: 1 * time.Hour,
expectedRangeTolerance: 5 * time.Second,
},
{
name: "Retry-After header data not present",
input: autorest.DetailedError{
Response: &http.Response{
Header: http.Header{
"foo": []string{"bar"},
},
},
},
expected: reconciler.DefaultReconcilerRequeue,
},
{
name: "Retry-After header data not present in HTTP 429",
input: autorest.DetailedError{
Response: &http.Response{
StatusCode: http.StatusTooManyRequests,
Header: http.Header{
"foo": []string{"bar"},
},
},
},
expected: reconciler.DefaultHTTP429RetryAfter,
},
{
name: "nil http.Response",
input: autorest.DetailedError{
Response: nil,
},
expected: reconciler.DefaultReconcilerRequeue,
},
{
name: "error type is not autorest.DetailedError",
input: errors.New("error"),
expected: reconciler.DefaultReconcilerRequeue,
},
}

for _, c := range cases {
c := c
t.Run(c.name, func(t *testing.T) {
t.Parallel()
g := NewWithT(t)
ret := getRetryAfterFromError(c.input)
if c.expectedRangeTolerance > 0 {
g.Expect(ret).To(BeNumerically("<", c.expected))
g.Expect(ret + c.expectedRangeTolerance).To(BeNumerically(">", c.expected))
} else {
g.Expect(ret).To(Equal(c.expected))
}
})
}
}
2 changes: 2 additions & 0 deletions util/reconciler/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ const (
DefaultAzureCallTimeout = 2 * time.Second
// DefaultReconcilerRequeue is the default value for the reconcile retry.
DefaultReconcilerRequeue = 15 * time.Second
// DefaultHTTP429RetryAfter is a default backoff wait time when we get a HTTP 429 response with no Retry-After data.
DefaultHTTP429RetryAfter = 1 * time.Minute
)

// DefaultedLoopTimeout will default the timeout if it is zero-valued.
Expand Down

0 comments on commit bcee198

Please sign in to comment.