From b8026b35a857fb15407290507cb3ae202fa581b6 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Wed, 25 Mar 2020 20:16:48 +0100 Subject: [PATCH] clusterctl add retries --- cmd/clusterctl/client/cluster/cert_manager.go | 2 +- cmd/clusterctl/client/cluster/client.go | 32 ++- cmd/clusterctl/client/cluster/components.go | 2 +- cmd/clusterctl/client/cluster/inventory.go | 68 ++++-- cmd/clusterctl/client/cluster/mover.go | 230 +++++++++++------- cmd/clusterctl/client/cluster/mover_test.go | 1 - cmd/clusterctl/client/cluster/objectgraph.go | 66 +++-- cmd/clusterctl/client/cluster/proxy.go | 26 +- 8 files changed, 295 insertions(+), 132 deletions(-) diff --git a/cmd/clusterctl/client/cluster/cert_manager.go b/cmd/clusterctl/client/cluster/cert_manager.go index c6810fcecb3f..4f9b9b6ff9b5 100644 --- a/cmd/clusterctl/client/cluster/cert_manager.go +++ b/cmd/clusterctl/client/cluster/cert_manager.go @@ -117,7 +117,7 @@ func (cm *certManagerClient) EnsureWebhook() error { } // installs the web-hook - createCertManagerBackoff := newBackoff() + createCertManagerBackoff := newWriteBackoff() objs = sortResourcesForCreate(objs) for i := range objs { o := objs[i] diff --git a/cmd/clusterctl/client/cluster/client.go b/cmd/clusterctl/client/cluster/client.go index 9811bc6594ee..fa4911068b48 100644 --- a/cmd/clusterctl/client/cluster/client.go +++ b/cmd/clusterctl/client/cluster/client.go @@ -205,7 +205,7 @@ func retryWithExponentialBackoff(opts wait.Backoff, operation func() error) erro i++ if err := operation(); err != nil { if i < opts.Steps { - log.V(5).Info("Operation failed, retry", "Error", err) + log.V(5).Info("Operation failed, retrying with backoff", "Cause", err.Error()) return false, nil } return false, err @@ -218,8 +218,8 @@ func retryWithExponentialBackoff(opts wait.Backoff, operation func() error) erro return nil } -// newBackoff creates a new API Machinery backoff parameter set suitable for use with clusterctl operations. -func newBackoff() wait.Backoff { +// newWriteBackoff creates a new API Machinery backoff parameter set suitable for use with clusterctl write operations. +func newWriteBackoff() wait.Backoff { // Return a exponential backoff configuration which returns durations for a total time of ~40s. // Example: 0, .5s, 1.2s, 2.3s, 4s, 6s, 10s, 16s, 24s, 37s // Jitter is added as a random fraction of the duration multiplied by the jitter factor. @@ -230,3 +230,29 @@ func newBackoff() wait.Backoff { Jitter: 0.4, } } + +// newConnectBackoff creates a new API Machinery backoff parameter set suitable for use when clusterctl connect to a cluster. +func newConnectBackoff() wait.Backoff { + // Return a exponential backoff configuration which returns durations for a total time of ~15s. + // Example: 0, .25s, .6s, 1.2, 2.1s, 3.4s, 5.5s, 8s, 12s + // Jitter is added as a random fraction of the duration multiplied by the jitter factor. + return wait.Backoff{ + Duration: 250 * time.Millisecond, + Factor: 1.5, + Steps: 9, + Jitter: 0.1, + } +} + +// newReadBackoff creates a new API Machinery backoff parameter set suitable for use with clusterctl read operations. +func newReadBackoff() wait.Backoff { + // Return a exponential backoff configuration which returns durations for a total time of ~15s. + // Example: 0, .25s, .6s, 1.2, 2.1s, 3.4s, 5.5s, 8s, 12s + // Jitter is added as a random fraction of the duration multiplied by the jitter factor. + return wait.Backoff{ + Duration: 250 * time.Millisecond, + Factor: 1.5, + Steps: 9, + Jitter: 0.1, + } +} diff --git a/cmd/clusterctl/client/cluster/components.go b/cmd/clusterctl/client/cluster/components.go index c705e863443e..6f1beaa59441 100644 --- a/cmd/clusterctl/client/cluster/components.go +++ b/cmd/clusterctl/client/cluster/components.go @@ -57,7 +57,7 @@ type providerComponents struct { } func (p *providerComponents) Create(objs []unstructured.Unstructured) error { - createComponentObjectBackoff := newBackoff() + createComponentObjectBackoff := newWriteBackoff() for i := range objs { obj := objs[i] diff --git a/cmd/clusterctl/client/cluster/inventory.go b/cmd/clusterctl/client/cluster/inventory.go index 1d73907b6ce3..66203305608d 100644 --- a/cmd/clusterctl/client/cluster/inventory.go +++ b/cmd/clusterctl/client/cluster/inventory.go @@ -91,18 +91,29 @@ func newInventoryClient(proxy Proxy, pollImmediateWaiter PollImmediateWaiter) *i func (p *inventoryClient) EnsureCustomResourceDefinitions() error { log := logf.Log - c, err := p.proxy.NewClient() + // Being this the first connection of many clusterctl operations, we want to fail fast if there is no + // connectivity to the cluster, so we try to get a client as a first thing. + // NB. NewClient has an internal retry loop that should mitigate temporary connection glitch; here we are + // trying to detect persistent connection problems (>10s) before entering in longer retry loops while executing + // clusterctl operations. + _, err := p.proxy.NewClient() if err != nil { return err } // Check the CRDs already exists, if yes, exit immediately. - l := &clusterctlv1.ProviderList{} - if err = c.List(ctx, l); err == nil { - return nil + // Nb. The operation is wrapped in a retry loop to make EnsureCustomResourceDefinitions more resilient to unexpected conditions. + var crdIsIstalled bool + listInventoryBackoff := newReadBackoff() + if err := retryWithExponentialBackoff(listInventoryBackoff, func() error { + var err error + crdIsIstalled, err = checkInventoryCRDs(p.proxy) + return err + }); err != nil { + return err } - if !apimeta.IsNoMatchError(err) { - return errors.Wrap(err, "failed to check if the clusterctl inventory CRD exists") + if crdIsIstalled { + return nil } log.V(1).Info("Installing the clusterctl inventory CRD") @@ -120,7 +131,7 @@ func (p *inventoryClient) EnsureCustomResourceDefinitions() error { } // Install the CRDs. - createInventoryObjectBackoff := newBackoff() + createInventoryObjectBackoff := newWriteBackoff() for i := range objs { o := objs[i] log.V(5).Info("Creating", logf.UnstructuredToValues(o)...) @@ -166,6 +177,23 @@ func (p *inventoryClient) EnsureCustomResourceDefinitions() error { return nil } +// checkInventoryCRDs checks if the inventory CRDs are installed in the cluster. +func checkInventoryCRDs(proxy Proxy) (bool, error) { + c, err := proxy.NewClient() + if err != nil { + return false, err + } + + l := &clusterctlv1.ProviderList{} + if err = c.List(ctx, l); err == nil { + return true, nil + } + if !apimeta.IsNoMatchError(err) { + return false, errors.Wrap(err, "failed to check if the clusterctl inventory CRD exists") + } + return false, nil +} + func (p *inventoryClient) createObj(o unstructured.Unstructured) error { c, err := p.proxy.NewClient() if err != nil { @@ -190,8 +218,7 @@ func (p *inventoryClient) createObj(o unstructured.Unstructured) error { func (p *inventoryClient) Create(m clusterctlv1.Provider) error { // Create the Kubernetes object. - // Nb. The operation is wrapped in a retry loop to make Create more resilient to unexpected conditions. - createInventoryObjectBackoff := newBackoff() + createInventoryObjectBackoff := newWriteBackoff() return retryWithExponentialBackoff(createInventoryObjectBackoff, func() error { cl, err := p.proxy.NewClient() if err != nil { @@ -227,16 +254,29 @@ func (p *inventoryClient) Create(m clusterctlv1.Provider) error { } func (p *inventoryClient) List() (*clusterctlv1.ProviderList, error) { - cl, err := p.proxy.NewClient() - if err != nil { + providerList := &clusterctlv1.ProviderList{} + + listProvidersBackoff := newReadBackoff() + if err := retryWithExponentialBackoff(listProvidersBackoff, func() error { + return listProviders(p.proxy, providerList) + }); err != nil { return nil, err } - providerList := &clusterctlv1.ProviderList{} + return providerList, nil +} + +// listProviders retrieves the list of provider inventory objects. +func listProviders(proxy Proxy, providerList *clusterctlv1.ProviderList) error { + cl, err := proxy.NewClient() + if err != nil { + return err + } + if err := cl.List(ctx, providerList); err != nil { - return nil, errors.Wrap(err, "failed get providers") + return errors.Wrap(err, "failed get providers") } - return providerList, nil + return nil } func (p *inventoryClient) GetDefaultProviderName(providerType clusterctlv1.ProviderType) (string, error) { diff --git a/cmd/clusterctl/client/cluster/mover.go b/cmd/clusterctl/client/cluster/mover.go index 2fd6082b11d1..5150a824aa40 100644 --- a/cmd/clusterctl/client/cluster/mover.go +++ b/cmd/clusterctl/client/cluster/mover.go @@ -99,22 +99,17 @@ func newObjectMover(fromProxy Proxy, fromProviderInventory InventoryClient) *obj // checkProvisioningCompleted checks if Cluster API has already completed the provisioning of the infrastructure for the objects involved in the move operation. func (o *objectMover) checkProvisioningCompleted(graph *objectGraph) error { errList := []error{} - cFrom, err := o.fromProxy.NewClient() - if err != nil { - return err - } // Checking all the clusters have infrastructure is ready - for _, cluster := range graph.getClusters() { + readClusterBackoff := newReadBackoff() + clusters := graph.getClusters() + for i := range clusters { + cluster := clusters[i] clusterObj := &clusterv1.Cluster{} - clusterObjKey := client.ObjectKey{ - Namespace: cluster.identity.Namespace, - Name: cluster.identity.Name, - } - - if err := cFrom.Get(ctx, clusterObjKey, clusterObj); err != nil { - return errors.Wrapf(err, "error reading %q %s/%s", - clusterObj.GroupVersionKind(), clusterObj.GetNamespace(), clusterObj.GetName()) + if err := retryWithExponentialBackoff(readClusterBackoff, func() error { + return getClusterObj(o.fromProxy, cluster, clusterObj) + }); err != nil { + return err } if !clusterObj.Status.InfrastructureReady { @@ -135,16 +130,15 @@ func (o *objectMover) checkProvisioningCompleted(graph *objectGraph) error { // Checking all the machine have a NodeRef // Nb. NodeRef is considered a better signal than InfrastructureReady, because it ensures the node in the workload cluster is up and running. - for _, machine := range graph.getMachines() { + readMachinesBackoff := newReadBackoff() + machines := graph.getMachines() + for i := range machines { + machine := machines[i] machineObj := &clusterv1.Machine{} - machineObjKey := client.ObjectKey{ - Namespace: machine.identity.Namespace, - Name: machine.identity.Name, - } - - if err := cFrom.Get(ctx, machineObjKey, machineObj); err != nil { - return errors.Wrapf(err, "error reading %q %s/%s", - machineObj.GroupVersionKind(), machineObj.GetNamespace(), machineObj.GetName()) + if err := retryWithExponentialBackoff(readMachinesBackoff, func() error { + return getMachineObj(o.fromProxy, machine, machineObj) + }); err != nil { + return err } if machineObj.Status.NodeRef == nil { @@ -155,6 +149,42 @@ func (o *objectMover) checkProvisioningCompleted(graph *objectGraph) error { return kerrors.NewAggregate(errList) } +// getClusterObj retrieves the the clusterObj corresponding to a node with type Cluster. +func getClusterObj(proxy Proxy, cluster *node, clusterObj *clusterv1.Cluster) error { + c, err := proxy.NewClient() + if err != nil { + return err + } + clusterObjKey := client.ObjectKey{ + Namespace: cluster.identity.Namespace, + Name: cluster.identity.Name, + } + + if err := c.Get(ctx, clusterObjKey, clusterObj); err != nil { + return errors.Wrapf(err, "error reading %q %s/%s", + clusterObj.GroupVersionKind(), clusterObj.GetNamespace(), clusterObj.GetName()) + } + return nil +} + +// getMachineObj retrieves the the machineObj corresponding to a node with type Machine. +func getMachineObj(proxy Proxy, machine *node, machineObj *clusterv1.Machine) error { + c, err := proxy.NewClient() + if err != nil { + return err + } + machineObjKey := client.ObjectKey{ + Namespace: machine.identity.Namespace, + Name: machine.identity.Name, + } + + if err := c.Get(ctx, machineObjKey, machineObj); err != nil { + return errors.Wrapf(err, "error reading %q %s/%s", + machineObj.GroupVersionKind(), machineObj.GetNamespace(), machineObj.GetName()) + } + return nil +} + // Move moves all the Cluster API objects existing in a namespace (or from all the namespaces if empty) to a target management cluster func (o *objectMover) move(graph *objectGraph, toProxy Proxy) error { log := logf.Log @@ -280,48 +310,55 @@ func getMoveSequence(graph *objectGraph) *moveSequence { return moveSequence } -// setClusterPause sets the paused field on a Cluster object. +// setClusterPause sets the paused field on nodes referring to Cluster objects. func setClusterPause(proxy Proxy, clusters []*node, value bool) error { log := logf.Log patch := client.RawPatch(types.MergePatchType, []byte(fmt.Sprintf("{\"spec\":{\"paused\":%t}}", value))) - for _, cluster := range clusters { + setClusterPauseBackoff := newWriteBackoff() + for i := range clusters { + cluster := clusters[i] log.V(5).Info("Set Cluster.Spec.Paused", "Paused", value, "Cluster", cluster.identity.Name, "Namespace", cluster.identity.Namespace) - cFrom, err := proxy.NewClient() - if err != nil { + // Nb. The operation is wrapped in a retry loop to make setClusterPause more resilient to unexpected conditions. + if err := retryWithExponentialBackoff(setClusterPauseBackoff, func() error { + return patchCluster(proxy, cluster, patch) + }); err != nil { return err } + } + return nil +} - clusterObj := &clusterv1.Cluster{} - clusterObjKey := client.ObjectKey{ - Namespace: cluster.identity.Namespace, - Name: cluster.identity.Name, - } +// patchCluster applies a patch to a node referring to a Cluster object. +func patchCluster(proxy Proxy, cluster *node, patch client.Patch) error { + cFrom, err := proxy.NewClient() + if err != nil { + return err + } - if err := cFrom.Get(ctx, clusterObjKey, clusterObj); err != nil { - return errors.Wrapf(err, "error reading %q %s/%s", - clusterObj.GroupVersionKind(), clusterObj.GetNamespace(), clusterObj.GetName()) - } + clusterObj := &clusterv1.Cluster{} + clusterObjKey := client.ObjectKey{ + Namespace: cluster.identity.Namespace, + Name: cluster.identity.Name, + } - if err := cFrom.Patch(ctx, clusterObj, patch); err != nil { - return errors.Wrapf(err, "error pausing reconciliation for %q %s/%s", - clusterObj.GroupVersionKind(), clusterObj.GetNamespace(), clusterObj.GetName()) - } + if err := cFrom.Get(ctx, clusterObjKey, clusterObj); err != nil { + return errors.Wrapf(err, "error reading %q %s/%s", + clusterObj.GroupVersionKind(), clusterObj.GetNamespace(), clusterObj.GetName()) + } + if err := cFrom.Patch(ctx, clusterObj, patch); err != nil { + return errors.Wrapf(err, "error pausing reconciliation for %q %s/%s", + clusterObj.GroupVersionKind(), clusterObj.GetNamespace(), clusterObj.GetName()) } + return nil } // ensureNamespaces ensures all the expected target namespaces are in place before creating objects. func (o *objectMover) ensureNamespaces(graph *objectGraph, toProxy Proxy) error { - log := logf.Log - - cs, err := toProxy.NewClient() - if err != nil { - return err - } - + ensureNamespaceBackoff := newWriteBackoff() namespaces := sets.NewString() for _, node := range graph.getNodesWithClusterTenants() { namespace := node.identity.Namespace @@ -332,64 +369,81 @@ func (o *objectMover) ensureNamespaces(graph *objectGraph, toProxy Proxy) error } namespaces.Insert(namespace) - // Otherwise check if namespace exists (also dealing with RBAC restrictions). - ns := &corev1.Namespace{} - key := client.ObjectKey{ - Name: namespace, + if err := retryWithExponentialBackoff(ensureNamespaceBackoff, func() error { + return o.ensureNamespace(toProxy, namespace) + }); err != nil { + return err } + } - if err := cs.Get(ctx, key, ns); err == nil { - return nil - } - if apierrors.IsForbidden(err) { - namespaces := &corev1.NamespaceList{} - namespaceExists := false - for { - if err := cs.List(ctx, namespaces, client.Continue(namespaces.Continue)); err != nil { - return err - } + return nil +} - for _, ns := range namespaces.Items { - if ns.Name == namespace { - namespaceExists = true - break - } - } +// ensureNamespace ensures a target namespaces is in place before creating objects. +func (o *objectMover) ensureNamespace(toProxy Proxy, namespace string) error { + log := logf.Log - if namespaces.Continue == "" { + cs, err := toProxy.NewClient() + if err != nil { + return err + } + + // Otherwise check if namespace exists (also dealing with RBAC restrictions). + ns := &corev1.Namespace{} + key := client.ObjectKey{ + Name: namespace, + } + + if err := cs.Get(ctx, key, ns); err == nil { + return nil + } + if apierrors.IsForbidden(err) { + namespaces := &corev1.NamespaceList{} + namespaceExists := false + for { + if err := cs.List(ctx, namespaces, client.Continue(namespaces.Continue)); err != nil { + return err + } + + for _, ns := range namespaces.Items { + if ns.Name == namespace { + namespaceExists = true break } } - if namespaceExists { - continue - } - } - if !apierrors.IsNotFound(err) { - return err - } - // If the namespace does not exists, create it. - ns = &corev1.Namespace{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "v1", - Kind: "Namespace", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: namespace, - }, + if namespaces.Continue == "" { + break + } } - log.V(1).Info("Creating", ns.Kind, ns.Name) - if err := cs.Create(ctx, ns); err != nil && !apierrors.IsAlreadyExists(err) { - return err + if namespaceExists { + return nil } } + if !apierrors.IsNotFound(err) { + return err + } + // If the namespace does not exists, create it. + ns = &corev1.Namespace{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Namespace", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + }, + } + log.V(1).Info("Creating", ns.Kind, ns.Name) + if err := cs.Create(ctx, ns); err != nil && !apierrors.IsAlreadyExists(err) { + return err + } return nil } // createGroup creates all the Kubernetes objects into the target management cluster corresponding to the object graph nodes in a moveGroup. func (o *objectMover) createGroup(group moveGroup, toProxy Proxy) error { - createTargetObjectBackoff := newBackoff() + createTargetObjectBackoff := newWriteBackoff() errList := []error{} for i := range group { nodeToCreate := group[i] @@ -505,7 +559,7 @@ func (o *objectMover) createTargetObject(nodeToCreate *node, toProxy Proxy) erro // deleteGroup deletes all the Kubernetes objects from the source management cluster corresponding to the object graph nodes in a moveGroup. func (o *objectMover) deleteGroup(group moveGroup) error { - deleteSourceObjectBackoff := newBackoff() + deleteSourceObjectBackoff := newWriteBackoff() errList := []error{} for i := range group { nodeToDelete := group[i] diff --git a/cmd/clusterctl/client/cluster/mover_test.go b/cmd/clusterctl/client/cluster/mover_test.go index 1dbd5b20a754..2db6d6ab73c7 100644 --- a/cmd/clusterctl/client/cluster/mover_test.go +++ b/cmd/clusterctl/client/cluster/mover_test.go @@ -20,7 +20,6 @@ import ( "testing" . "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" diff --git a/cmd/clusterctl/client/cluster/objectgraph.go b/cmd/clusterctl/client/cluster/objectgraph.go index 867a19a0e1a5..21e5dda11af5 100644 --- a/cmd/clusterctl/client/cluster/objectgraph.go +++ b/cmd/clusterctl/client/cluster/objectgraph.go @@ -171,14 +171,12 @@ func (o *objectGraph) objToNode(obj *unstructured.Unstructured) *node { func (o *objectGraph) getDiscoveryTypes() ([]metav1.TypeMeta, error) { discoveredTypes := []metav1.TypeMeta{} - c, err := o.proxy.NewClient() - if err != nil { - return nil, err - } - crdList := &apiextensionsv1.CustomResourceDefinitionList{} - if err := c.List(ctx, crdList, client.MatchingLabels{clusterctlv1.ClusterctlLabelName: ""}); err != nil { - return nil, errors.Wrap(err, "failed to get the list of CRDs required for the move discovery phase") + getDiscoveryTypesBackoff := newReadBackoff() + if err := retryWithExponentialBackoff(getDiscoveryTypesBackoff, func() error { + return getCRDList(o.proxy, crdList) + }); err != nil { + return nil, err } for _, crd := range crdList.Items { @@ -203,32 +201,42 @@ func (o *objectGraph) getDiscoveryTypes() ([]metav1.TypeMeta, error) { return discoveredTypes, nil } +func getCRDList(proxy Proxy, crdList *apiextensionsv1.CustomResourceDefinitionList) error { + c, err := proxy.NewClient() + if err != nil { + return err + } + + if err := c.List(ctx, crdList, client.HasLabels{clusterctlv1.ClusterctlLabelName}); err != nil { + return errors.Wrap(err, "failed to get the list of CRDs required for the move discovery phase") + } + return nil +} + // Discovery reads all the Kubernetes objects existing in a namespace (or in all namespaces if empty) for the types received in input, and then adds // everything to the objects graph. func (o *objectGraph) Discovery(namespace string, types []metav1.TypeMeta) error { log := logf.Log log.Info("Discovering Cluster API objects") - c, err := o.proxy.NewClient() - if err != nil { - return err - } - selectors := []client.ListOption{} if namespace != "" { selectors = append(selectors, client.InNamespace(namespace)) } - for _, typeMeta := range types { + discoveryBackoff := newReadBackoff() + for i := range types { + typeMeta := types[i] objList := new(unstructured.UnstructuredList) - objList.SetAPIVersion(typeMeta.APIVersion) - objList.SetKind(typeMeta.Kind) - if err := c.List(ctx, objList, selectors...); err != nil { - if apierrors.IsNotFound(err) { - continue - } - return errors.Wrapf(err, "failed to list %q resources", objList.GroupVersionKind()) + if err := retryWithExponentialBackoff(discoveryBackoff, func() error { + return getObjList(o.proxy, typeMeta, selectors, objList) + }); err != nil { + return err + } + + if len(objList.Items) == 0 { + continue } log.V(5).Info(typeMeta.Kind, "Count", len(objList.Items)) @@ -250,6 +258,24 @@ func (o *objectGraph) Discovery(namespace string, types []metav1.TypeMeta) error return nil } +func getObjList(proxy Proxy, typeMeta metav1.TypeMeta, selectors []client.ListOption, objList *unstructured.UnstructuredList) error { + c, err := proxy.NewClient() + if err != nil { + return err + } + + objList.SetAPIVersion(typeMeta.APIVersion) + objList.SetKind(typeMeta.Kind) + + if err := c.List(ctx, objList, selectors...); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return errors.Wrapf(err, "failed to list %q resources", objList.GroupVersionKind()) + } + return nil +} + // getClusters returns the list of Clusters existing in the object graph. func (o *objectGraph) getClusters() []*node { clusters := []*node{} diff --git a/cmd/clusterctl/client/cluster/proxy.go b/cmd/clusterctl/client/cluster/proxy.go index b4a5b61b239d..d28597760177 100644 --- a/cmd/clusterctl/client/cluster/proxy.go +++ b/cmd/clusterctl/client/cluster/proxy.go @@ -70,8 +70,17 @@ func (k *proxy) NewClient() (client.Client, error) { return nil, err } - c, err := client.New(config, client.Options{Scheme: Scheme}) - if err != nil { + var c client.Client + // Nb. The operation is wrapped in a retry loop to make newClientSet more resilient to temporary connection problems. + connectBackoff := newConnectBackoff() + if err := retryWithExponentialBackoff(connectBackoff, func() error { + var err error + c, err = client.New(config, client.Options{Scheme: Scheme}) + if err != nil { + return err + } + return nil + }); err != nil { return nil, errors.Wrap(err, "failed to connect to the management cluster") } @@ -179,8 +188,17 @@ func (k *proxy) newClientSet() (*kubernetes.Clientset, error) { return nil, err } - cs, err := kubernetes.NewForConfig(config) - if err != nil { + var cs *kubernetes.Clientset + // Nb. The operation is wrapped in a retry loop to make newClientSet more resilient to temporary connection problems. + connectBackoff := newConnectBackoff() + if err := retryWithExponentialBackoff(connectBackoff, func() error { + var err error + cs, err = kubernetes.NewForConfig(config) + if err != nil { + return err + } + return nil + }); err != nil { return nil, errors.Wrap(err, "failed to create the client-go client") }