diff --git a/cmd/clusterctl/client/cluster/cert_manager.go b/cmd/clusterctl/client/cluster/cert_manager.go index c457f39b1b95..c6810fcecb3f 100644 --- a/cmd/clusterctl/client/cluster/cert_manager.go +++ b/cmd/clusterctl/client/cluster/cert_manager.go @@ -36,9 +36,6 @@ const ( waitCertManagerInterval = 1 * time.Second waitCertManagerTimeout = 10 * time.Minute - retryCreateCertManagerObject = 3 - retryIntervalCreateCertManagerObject = 1 * time.Second - certManagerImageComponent = "cert-manager" ) @@ -120,6 +117,7 @@ func (cm *certManagerClient) EnsureWebhook() error { } // installs the web-hook + createCertManagerBackoff := newBackoff() objs = sortResourcesForCreate(objs) for i := range objs { o := objs[i] @@ -127,7 +125,7 @@ func (cm *certManagerClient) EnsureWebhook() error { // Create the Kubernetes object. // Nb. The operation is wrapped in a retry loop to make ensureCerts more resilient to unexpected conditions. - if err := retry(retryCreateCertManagerObject, retryIntervalCreateCertManagerObject, func() error { + if err := retryWithExponentialBackoff(createCertManagerBackoff, func() error { return cm.createObj(o) }); err != nil { return err diff --git a/cmd/clusterctl/client/cluster/client.go b/cmd/clusterctl/client/cluster/client.go index 96dd4bb717bd..9811bc6594ee 100644 --- a/cmd/clusterctl/client/cluster/client.go +++ b/cmd/clusterctl/client/cluster/client.go @@ -20,11 +20,13 @@ import ( "context" "time" + "github.com/pkg/errors" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/cluster-api/cmd/clusterctl/client/config" "sigs.k8s.io/cluster-api/cmd/clusterctl/client/repository" "sigs.k8s.io/cluster-api/cmd/clusterctl/internal/test" + logf "sigs.k8s.io/cluster-api/cmd/clusterctl/log" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -193,3 +195,38 @@ type Proxy interface { } var _ Proxy = &test.FakeProxy{} + +// retryWithExponentialBackoff repeats an operation until it passes or the exponential backoff times out. +func retryWithExponentialBackoff(opts wait.Backoff, operation func() error) error { //nolint:unparam + log := logf.Log + + i := 0 + err := wait.ExponentialBackoff(opts, func() (bool, error) { + i++ + if err := operation(); err != nil { + if i < opts.Steps { + log.V(5).Info("Operation failed, retry", "Error", err) + return false, nil + } + return false, err + } + return true, nil + }) + if err != nil { + return errors.Wrapf(err, "action failed after %d attempts", i) + } + return nil +} + +// newBackoff creates a new API Machinery backoff parameter set suitable for use with clusterctl operations. +func newBackoff() wait.Backoff { + // Return a exponential backoff configuration which returns durations for a total time of ~40s. + // Example: 0, .5s, 1.2s, 2.3s, 4s, 6s, 10s, 16s, 24s, 37s + // Jitter is added as a random fraction of the duration multiplied by the jitter factor. + return wait.Backoff{ + Duration: 500 * time.Millisecond, + Factor: 1.5, + Steps: 10, + Jitter: 0.4, + } +} diff --git a/cmd/clusterctl/client/cluster/components.go b/cmd/clusterctl/client/cluster/components.go index 7d37ee60e831..c705e863443e 100644 --- a/cmd/clusterctl/client/cluster/components.go +++ b/cmd/clusterctl/client/cluster/components.go @@ -19,7 +19,6 @@ package cluster import ( "fmt" "strings" - "time" "github.com/pkg/errors" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -34,11 +33,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -const ( - retryCreateComponentObject = 3 - retryIntervalComponentObject = 1 * time.Second -) - type DeleteOptions struct { Provider clusterctlv1.Provider IncludeNamespace bool @@ -63,12 +57,13 @@ type providerComponents struct { } func (p *providerComponents) Create(objs []unstructured.Unstructured) error { + createComponentObjectBackoff := newBackoff() for i := range objs { obj := objs[i] // Create the Kubernetes object. // Nb. The operation is wrapped in a retry loop to make Create more resilient to unexpected conditions. - if err := retry(retryCreateComponentObject, retryIntervalComponentObject, func() error { + if err := retryWithExponentialBackoff(createComponentObjectBackoff, func() error { return p.createObj(obj) }); err != nil { return err diff --git a/cmd/clusterctl/client/cluster/inventory.go b/cmd/clusterctl/client/cluster/inventory.go index c837837677b6..3f9631d4818e 100644 --- a/cmd/clusterctl/client/cluster/inventory.go +++ b/cmd/clusterctl/client/cluster/inventory.go @@ -35,9 +35,6 @@ import ( const ( embeddedCustomResourceDefinitionPath = "cmd/clusterctl/config/manifest/clusterctl-api.yaml" - retryCreateInventoryObject = 3 - retryIntervalCreateInventoryObject = 1 * time.Second - waitInventoryCRDInterval = 250 * time.Millisecond waitInventoryCRDTimeout = 1 * time.Minute ) @@ -123,13 +120,14 @@ func (p *inventoryClient) EnsureCustomResourceDefinitions() error { } // Install the CRDs. + createInventoryObjectBackoff := newBackoff() for i := range objs { o := objs[i] log.V(5).Info("Creating", logf.UnstructuredToValues(o)...) // Create the Kubernetes object. // Nb. The operation is wrapped in a retry loop to make EnsureCustomResourceDefinitions more resilient to unexpected conditions. - if err := retry(retryCreateInventoryObject, retryIntervalCreateInventoryObject, func() error { + if err := retryWithExponentialBackoff(createInventoryObjectBackoff, func() error { return p.createObj(o) }); err != nil { return err diff --git a/cmd/clusterctl/client/cluster/mover.go b/cmd/clusterctl/client/cluster/mover.go index 9cce78f8c6b6..d5bff891d30e 100644 --- a/cmd/clusterctl/client/cluster/mover.go +++ b/cmd/clusterctl/client/cluster/mover.go @@ -18,7 +18,6 @@ package cluster import ( "fmt" - "time" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" @@ -29,7 +28,6 @@ import ( kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/version" - "k8s.io/apimachinery/pkg/util/wait" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" logf "sigs.k8s.io/cluster-api/cmd/clusterctl/log" "sigs.k8s.io/controller-runtime/pkg/client" @@ -389,20 +387,16 @@ func (o *objectMover) ensureNamespaces(graph *objectGraph, toProxy Proxy) error return nil } -const ( - retryCreateTargetObject = 3 - retryIntervalCreateTargetObject = 1 * time.Second -) - // createGroup creates all the Kubernetes objects into the target management cluster corresponding to the object graph nodes in a moveGroup. func (o *objectMover) createGroup(group moveGroup, toProxy Proxy) error { + createTargetObjectBackoff := newBackoff() errList := []error{} for i := range group { nodeToCreate := group[i] // Creates the Kubernetes object corresponding to the nodeToCreate. // Nb. The operation is wrapped in a retry loop to make move more resilient to unexpected conditions. - err := retry(retryCreateTargetObject, retryIntervalCreateTargetObject, func() error { + err := retryWithExponentialBackoff(createTargetObjectBackoff, func() error { return o.createTargetObject(nodeToCreate, toProxy) }) if err != nil { @@ -509,20 +503,16 @@ func (o *objectMover) createTargetObject(nodeToCreate *node, toProxy Proxy) erro return nil } -const ( - retryDeleteSourceObject = 3 - retryIntervalDeleteSourceObject = 1 * time.Second -) - // deleteGroup deletes all the Kubernetes objects from the source management cluster corresponding to the object graph nodes in a moveGroup. func (o *objectMover) deleteGroup(group moveGroup) error { + deleteSourceObjectBackoff := newBackoff() errList := []error{} for i := range group { nodeToDelete := group[i] // Delete the Kubernetes object corresponding to the current node. // Nb. The operation is wrapped in a retry loop to make move more resilient to unexpected conditions. - err := retry(retryDeleteSourceObject, retryIntervalDeleteSourceObject, func() error { + err := retryWithExponentialBackoff(deleteSourceObjectBackoff, func() error { return o.deleteSourceObject(nodeToDelete) }) @@ -583,24 +573,6 @@ func (o *objectMover) deleteSourceObject(nodeToDelete *node) error { return nil } -func retry(attempts int, interval time.Duration, action func() error) error { //nolint:unparam - log := logf.Log - - var errorToReturn error - for i := 0; i < attempts; i++ { - if err := action(); err != nil { - errorToReturn = err - - log.V(5).Info("Operation failed, retry", "Error", err) - pause := wait.Jitter(interval, 1) - time.Sleep(pause) - continue - } - return nil - } - return errors.Wrapf(errorToReturn, "action failed after %d attempts", attempts) -} - // checkTargetProviders checks that all the providers installed in the source cluster exists in the target cluster as well (with a version >= of the current version). func (o *objectMover) checkTargetProviders(namespace string, toInventory InventoryClient) error { // Gets the list of providers in the source/target cluster.