diff --git a/azure/services/async/async.go b/azure/services/async/async.go index a4cfad9909a..cba96258858 100644 --- a/azure/services/async/async.go +++ b/azure/services/async/async.go @@ -18,8 +18,11 @@ package async import ( "context" + "net/http" + "strconv" "time" + "github.com/Azure/go-autorest/autorest" azureautorest "github.com/Azure/go-autorest/autorest/azure" "github.com/pkg/errors" infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" @@ -76,7 +79,7 @@ func processOngoingOperation(ctx context.Context, scope FutureScope, client Futu // Operation is still in progress, update conditions and requeue. log.V(2).Info("long running operation is still ongoing", "service", serviceName, "resource", resourceName) - return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), retryAfter(sdkFuture)) + return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), getRequeueAfterFromFuture(sdkFuture)) } if err != nil { log.V(2).Error(err, "error checking long running operation status after it finished") @@ -111,7 +114,8 @@ func (s *Service) CreateResource(ctx context.Context, spec azure.ResourceSpecGet // Get the resource if it already exists, and use it to construct the desired resource parameters. var existingResource interface{} if existing, err := s.Creator.Get(ctx, spec); err != nil && !azure.ResourceNotFound(err) { - return nil, errors.Wrapf(err, "failed to get existing resource %s/%s (service: %s)", rgName, resourceName, serviceName) + errWrapped := errors.Wrapf(err, "failed to get existing resource %s/%s (service: %s)", rgName, resourceName, serviceName) + return nil, azure.WithTransientError(errWrapped, getRetryAfterFromError(err)) } else if err == nil { existingResource = existing log.V(2).Info("successfully got existing resource", "service", serviceName, "resource", resourceName, "resourceGroup", rgName) @@ -136,7 +140,7 @@ func (s *Service) CreateResource(ctx context.Context, spec azure.ResourceSpecGet return nil, errors.Wrapf(err, "failed to create resource %s/%s (service: %s)", rgName, resourceName, serviceName) } s.Scope.SetLongRunningOperationState(future) - return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), retryAfter(sdkFuture)) + return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), getRequeueAfterFromFuture(sdkFuture)) } else if err != nil { return nil, errors.Wrapf(err, "failed to create resource %s/%s (service: %s)", rgName, resourceName, serviceName) } @@ -170,7 +174,7 @@ func (s *Service) DeleteResource(ctx context.Context, spec azure.ResourceSpecGet return errors.Wrapf(err, "failed to delete resource %s/%s (service: %s)", rgName, resourceName, serviceName) } s.Scope.SetLongRunningOperationState(future) - return azure.WithTransientError(azure.NewOperationNotDoneError(future), retryAfter(sdkFuture)) + return azure.WithTransientError(azure.NewOperationNotDoneError(future), getRequeueAfterFromFuture(sdkFuture)) } else if err != nil { if azure.ResourceNotFound(err) { // already deleted @@ -183,12 +187,40 @@ func (s *Service) DeleteResource(ctx context.Context, spec azure.ResourceSpecGet return nil } -// retryAfter returns the max between the `RETRY-AFTER` header and the default requeue time. +// getRequeueAfterFromFuture returns the max between the `RETRY-AFTER` header and the default requeue time. // This ensures we respect the retry-after header if it is set and avoid retrying too often during an API throttling event. -func retryAfter(sdkFuture azureautorest.FutureAPI) time.Duration { +func getRequeueAfterFromFuture(sdkFuture azureautorest.FutureAPI) time.Duration { retryAfter, _ := sdkFuture.GetPollingDelay() if retryAfter < reconciler.DefaultReconcilerRequeue { retryAfter = reconciler.DefaultReconcilerRequeue } return retryAfter } + +// getRetryAfterFromError returns the time.Duration from the http.Response in the autorest.DetailedError. +// If there is no Response object, or if there is no meaningful Retry-After header data, we return a default. +func getRetryAfterFromError(err error) time.Duration { + // In case we aren't able to introspect Retry-After from the error type, we'll return this default + ret := reconciler.DefaultReconcilerRequeue + var detailedError autorest.DetailedError + // if we have a strongly typed autorest.DetailedError then we can introspect the HTTP response data + if errors.As(err, &detailedError) { + if detailedError.Response != nil { + // If we have Retry-After HTTP header data for any reason, prefer it + if retryAfter := detailedError.Response.Header.Get("Retry-After"); retryAfter != "" { + // This handles the case where Retry-After data is in the form of units of seconds + if rai, err := strconv.Atoi(retryAfter); err == nil { + ret = time.Duration(rai) * time.Second + // This handles the case where Retry-After data is in the form of absolute time + } else if t, err := time.Parse(time.RFC1123, retryAfter); err == nil { + ret = time.Until(t) + } + // If we didn't find Retry-After HTTP header data but the response type is 429, + // we'll have to come up with our sane default. + } else if detailedError.Response.StatusCode == http.StatusTooManyRequests { + ret = reconciler.DefaultHTTP429RetryAfter + } + } + } + return ret +} diff --git a/azure/services/async/async_test.go b/azure/services/async/async_test.go index ef0bd3d011c..ad1df74a21a 100644 --- a/azure/services/async/async_test.go +++ b/azure/services/async/async_test.go @@ -21,6 +21,7 @@ import ( "errors" "net/http" "testing" + "time" "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-05-01/resources" "github.com/Azure/go-autorest/autorest" @@ -31,6 +32,7 @@ import ( "sigs.k8s.io/cluster-api-provider-azure/azure/mock_azure" "sigs.k8s.io/cluster-api-provider-azure/azure/services/async/mock_async" gomockinternal "sigs.k8s.io/cluster-api-provider-azure/internal/test/matchers/gomock" + "sigs.k8s.io/cluster-api-provider-azure/util/reconciler" ) var ( @@ -434,3 +436,86 @@ func TestDeleteResource(t *testing.T) { }) } } + +func TestGetRetryAfterFromError(t *testing.T) { + cases := []struct { + name string + input error + expected time.Duration + expectedRangeTolerance time.Duration + }{ + { + name: "Retry-After header data present in the form of units of seconds", + input: autorest.DetailedError{ + Response: &http.Response{ + Header: http.Header{ + "Retry-After": []string{"2"}, + }, + }, + }, + expected: 2 * time.Second, + }, + { + name: "Retry-After header data present in the form of units of absolute time", + input: autorest.DetailedError{ + Response: &http.Response{ + Header: http.Header{ + "Retry-After": []string{time.Now().Add(1 * time.Hour).Format(time.RFC1123)}, + }, + }, + }, + expected: 1 * time.Hour, + expectedRangeTolerance: 5 * time.Second, + }, + { + name: "Retry-After header data not present", + input: autorest.DetailedError{ + Response: &http.Response{ + Header: http.Header{ + "foo": []string{"bar"}, + }, + }, + }, + expected: reconciler.DefaultReconcilerRequeue, + }, + { + name: "Retry-After header data not present in HTTP 429", + input: autorest.DetailedError{ + Response: &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: http.Header{ + "foo": []string{"bar"}, + }, + }, + }, + expected: reconciler.DefaultHTTP429RetryAfter, + }, + { + name: "nil http.Response", + input: autorest.DetailedError{ + Response: nil, + }, + expected: reconciler.DefaultReconcilerRequeue, + }, + { + name: "error type is not autorest.DetailedError", + input: errors.New("error"), + expected: reconciler.DefaultReconcilerRequeue, + }, + } + + for _, c := range cases { + c := c + t.Run(c.name, func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + ret := getRetryAfterFromError(c.input) + if c.expectedRangeTolerance > 0 { + g.Expect(ret).To(BeNumerically("<", c.expected)) + g.Expect(ret + c.expectedRangeTolerance).To(BeNumerically(">", c.expected)) + } else { + g.Expect(ret).To(Equal(c.expected)) + } + }) + } +} diff --git a/util/reconciler/defaults.go b/util/reconciler/defaults.go index 41f92d40d3c..00850b54a31 100644 --- a/util/reconciler/defaults.go +++ b/util/reconciler/defaults.go @@ -31,6 +31,8 @@ const ( DefaultAzureCallTimeout = 2 * time.Second // DefaultReconcilerRequeue is the default value for the reconcile retry. DefaultReconcilerRequeue = 15 * time.Second + // DefaultHTTP429RetryAfter is a default backoff wait time when we get a HTTP 429 response with no Retry-After data. + DefaultHTTP429RetryAfter = 1 * time.Minute ) // DefaultedLoopTimeout will default the timeout if it is zero-valued.