Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🏃clusterctl: add retries to clustectl move #2776

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/clusterctl/client/cluster/cert_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ func (cm *certManagerClient) EnsureWebhook() error {
}

// installs the web-hook
createCertManagerBackoff := newBackoff()
createCertManagerBackoff := newWriteBackoff()
objs = sortResourcesForCreate(objs)
for i := range objs {
o := objs[i]
Expand Down
32 changes: 29 additions & 3 deletions cmd/clusterctl/client/cluster/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ func retryWithExponentialBackoff(opts wait.Backoff, operation func() error) erro
i++
if err := operation(); err != nil {
if i < opts.Steps {
log.V(5).Info("Operation failed, retry", "Error", err)
log.V(5).Info("Operation failed, retrying with backoff", "Cause", err.Error())
return false, nil
}
return false, err
Expand All @@ -218,8 +218,8 @@ func retryWithExponentialBackoff(opts wait.Backoff, operation func() error) erro
return nil
}

// newBackoff creates a new API Machinery backoff parameter set suitable for use with clusterctl operations.
func newBackoff() wait.Backoff {
// newWriteBackoff creates a new API Machinery backoff parameter set suitable for use with clusterctl write operations.
func newWriteBackoff() wait.Backoff {
// Return a exponential backoff configuration which returns durations for a total time of ~40s.
// Example: 0, .5s, 1.2s, 2.3s, 4s, 6s, 10s, 16s, 24s, 37s
// Jitter is added as a random fraction of the duration multiplied by the jitter factor.
Expand All @@ -230,3 +230,29 @@ func newBackoff() wait.Backoff {
Jitter: 0.4,
}
}

// newConnectBackoff creates a new API Machinery backoff parameter set suitable for use when clusterctl connect to a cluster.
func newConnectBackoff() wait.Backoff {
// Return a exponential backoff configuration which returns durations for a total time of ~15s.
// Example: 0, .25s, .6s, 1.2, 2.1s, 3.4s, 5.5s, 8s, 12s
// Jitter is added as a random fraction of the duration multiplied by the jitter factor.
return wait.Backoff{
Duration: 250 * time.Millisecond,
Factor: 1.5,
Steps: 9,
Jitter: 0.1,
}
}

// newReadBackoff creates a new API Machinery backoff parameter set suitable for use with clusterctl read operations.
func newReadBackoff() wait.Backoff {
// Return a exponential backoff configuration which returns durations for a total time of ~15s.
// Example: 0, .25s, .6s, 1.2, 2.1s, 3.4s, 5.5s, 8s, 12s
// Jitter is added as a random fraction of the duration multiplied by the jitter factor.
return wait.Backoff{
Duration: 250 * time.Millisecond,
Factor: 1.5,
Steps: 9,
Jitter: 0.1,
}
Comment on lines +235 to +257
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these two seem to have effectively the same values, is that intended?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, I do expect some tuning here.

}
2 changes: 1 addition & 1 deletion cmd/clusterctl/client/cluster/components.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ type providerComponents struct {
}

func (p *providerComponents) Create(objs []unstructured.Unstructured) error {
createComponentObjectBackoff := newBackoff()
createComponentObjectBackoff := newWriteBackoff()
for i := range objs {
obj := objs[i]

Expand Down
68 changes: 54 additions & 14 deletions cmd/clusterctl/client/cluster/inventory.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,18 +91,29 @@ func newInventoryClient(proxy Proxy, pollImmediateWaiter PollImmediateWaiter) *i
func (p *inventoryClient) EnsureCustomResourceDefinitions() error {
log := logf.Log

c, err := p.proxy.NewClient()
// Being this the first connection of many clusterctl operations, we want to fail fast if there is no
// connectivity to the cluster, so we try to get a client as a first thing.
// NB. NewClient has an internal retry loop that should mitigate temporary connection glitch; here we are
// trying to detect persistent connection problems (>10s) before entering in longer retry loops while executing
// clusterctl operations.
_, err := p.proxy.NewClient()
if err != nil {
return err
}

// Check the CRDs already exists, if yes, exit immediately.
l := &clusterctlv1.ProviderList{}
if err = c.List(ctx, l); err == nil {
return nil
// Nb. The operation is wrapped in a retry loop to make EnsureCustomResourceDefinitions more resilient to unexpected conditions.
var crdIsIstalled bool
listInventoryBackoff := newReadBackoff()
if err := retryWithExponentialBackoff(listInventoryBackoff, func() error {
var err error
crdIsIstalled, err = checkInventoryCRDs(p.proxy)
return err
}); err != nil {
return err
}
if !apimeta.IsNoMatchError(err) {
return errors.Wrap(err, "failed to check if the clusterctl inventory CRD exists")
if crdIsIstalled {
return nil
}

log.V(1).Info("Installing the clusterctl inventory CRD")
Expand All @@ -120,7 +131,7 @@ func (p *inventoryClient) EnsureCustomResourceDefinitions() error {
}

// Install the CRDs.
createInventoryObjectBackoff := newBackoff()
createInventoryObjectBackoff := newWriteBackoff()
for i := range objs {
o := objs[i]
log.V(5).Info("Creating", logf.UnstructuredToValues(o)...)
Expand Down Expand Up @@ -166,6 +177,23 @@ func (p *inventoryClient) EnsureCustomResourceDefinitions() error {
return nil
}

// checkInventoryCRDs checks if the inventory CRDs are installed in the cluster.
func checkInventoryCRDs(proxy Proxy) (bool, error) {
c, err := proxy.NewClient()
if err != nil {
return false, err
}

l := &clusterctlv1.ProviderList{}
if err = c.List(ctx, l); err == nil {
return true, nil
}
if !apimeta.IsNoMatchError(err) {
return false, errors.Wrap(err, "failed to check if the clusterctl inventory CRD exists")
}
return false, nil
}

func (p *inventoryClient) createObj(o unstructured.Unstructured) error {
c, err := p.proxy.NewClient()
if err != nil {
Expand All @@ -190,8 +218,7 @@ func (p *inventoryClient) createObj(o unstructured.Unstructured) error {

func (p *inventoryClient) Create(m clusterctlv1.Provider) error {
// Create the Kubernetes object.
// Nb. The operation is wrapped in a retry loop to make Create more resilient to unexpected conditions.
createInventoryObjectBackoff := newBackoff()
createInventoryObjectBackoff := newWriteBackoff()
return retryWithExponentialBackoff(createInventoryObjectBackoff, func() error {
cl, err := p.proxy.NewClient()
if err != nil {
Expand Down Expand Up @@ -227,16 +254,29 @@ func (p *inventoryClient) Create(m clusterctlv1.Provider) error {
}

func (p *inventoryClient) List() (*clusterctlv1.ProviderList, error) {
cl, err := p.proxy.NewClient()
if err != nil {
providerList := &clusterctlv1.ProviderList{}

listProvidersBackoff := newReadBackoff()
if err := retryWithExponentialBackoff(listProvidersBackoff, func() error {
return listProviders(p.proxy, providerList)
}); err != nil {
return nil, err
}

providerList := &clusterctlv1.ProviderList{}
return providerList, nil
}

// listProviders retrieves the list of provider inventory objects.
func listProviders(proxy Proxy, providerList *clusterctlv1.ProviderList) error {
cl, err := proxy.NewClient()
if err != nil {
return err
}

if err := cl.List(ctx, providerList); err != nil {
return nil, errors.Wrap(err, "failed get providers")
return errors.Wrap(err, "failed get providers")
}
return providerList, nil
return nil
}

func (p *inventoryClient) GetDefaultProviderName(providerType clusterctlv1.ProviderType) (string, error) {
Expand Down
Loading