Skip to content
This repository has been archived by the owner on May 6, 2022. It is now read-only.

Commit

Permalink
4xx, 5xx and Connection timeout should be retriable (not terminal err…
Browse files Browse the repository at this point in the history
…ors) (#1765)

* Make all HTTP errors except 400 temporary (retriable immediately)
* Make 400 error retriable after spec update
* Fix various issues with retry timeout and rate limiting for orphan mitigation
  • Loading branch information
nilebox authored Mar 24, 2018
1 parent 355f01f commit e15b737
Show file tree
Hide file tree
Showing 8 changed files with 516 additions and 165 deletions.
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_account = params.TEST_ACCOUNT
def test_zone = params.TEST_ZONE ?: 'us-west1-b'
def namespace = 'catalog'
def root_path = 'src/github.com/kubernetes-incubator/service-catalog'
def timeoutMin = 30
def timeoutMin = 50
def certFolder = '/tmp/sc-certs'

node {
Expand Down
16 changes: 10 additions & 6 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -660,15 +660,19 @@ func (c *controller) reconciliationRetryDurationExceeded(operationStartTime *met
// shouldStartOrphanMitigation returns whether an error with the given status
// code indicates that orphan migitation should start.
func shouldStartOrphanMitigation(statusCode int) bool {
is2XX := (statusCode >= 200 && statusCode < 300)
is5XX := (statusCode >= 500 && statusCode < 600)
is2XX := statusCode >= 200 && statusCode < 300
is5XX := statusCode >= 500 && statusCode < 600

return (is2XX && statusCode != http.StatusOK) ||
statusCode == http.StatusRequestTimeout ||
is5XX
return (is2XX && statusCode != http.StatusOK) || is5XX
}

// ReconciliationAction reprents a type of action the reconciler should take
// isRetriableHTTPStatus returns whether an error with the given HTTP status
// code is retriable.
func isRetriableHTTPStatus(statusCode int) bool {
return statusCode != http.StatusBadRequest
}

// ReconciliationAction represents a type of action the reconciler should take
// for a resource.
type ReconciliationAction string

Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/controller_binding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2375,7 +2375,7 @@ func TestReconcileBindingWithSetOrphanMitigation(t *testing.T) {
bindReactionError: osb.HTTPStatusCodeError{
StatusCode: 408,
},
setOrphanMitigation: true,
setOrphanMitigation: false,
shouldReturnError: false,
},
{
Expand Down
147 changes: 102 additions & 45 deletions pkg/controller/controller_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -385,28 +385,31 @@ func (c *controller) reconcileServiceInstanceAdd(instance *v1beta1.ServiceInstan

response, err := brokerClient.ProvisionInstance(request)
if err != nil {
// A failure HTTP response code is treated as a terminal
// failure. Depending on the specific response, we may also
// need to initiate orphan mitigation.
if httpErr, ok := osb.IsHTTPError(err); ok {
msg := fmt.Sprintf(
"Error provisioning ServiceInstance of %s at ClusterServiceBroker %q: %s",
pretty.ClusterServiceClassName(serviceClass), brokerName, httpErr,
)
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorProvisionCallFailedReason, msg)
// Depending on the specific response, we may need to initiate orphan mitigation.
shouldMitigateOrphan := shouldStartOrphanMitigation(httpErr.StatusCode)
if isRetriableHTTPStatus(httpErr.StatusCode) {
return c.processTemporaryProvisionFailure(instance, readyCond, shouldMitigateOrphan)
}
// A failure with a given HTTP response code is treated as a terminal
// failure.
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, "ClusterServiceBrokerReturnedFailure", msg)
return c.processProvisionFailure(instance, readyCond, failedCond, shouldStartOrphanMitigation(httpErr.StatusCode))
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, shouldMitigateOrphan)
}

reason := errorErrorCallingProvisionReason

// A timeout error is considered a terminal failure and we
// A timeout error is considered a retriable error, but we
// should initiate orphan mitigation.
if urlErr, ok := err.(*url.Error); ok && urlErr.Timeout() {
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; operation will not be retried: %v", urlErr)
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; operation will be retried: %v", urlErr)
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, msg)
return c.processProvisionFailure(instance, readyCond, failedCond, true)
return c.processTemporaryProvisionFailure(instance, readyCond, true)
}

// All other errors should be retried, unless the
Expand All @@ -417,7 +420,7 @@ func (c *controller) reconcileServiceInstanceAdd(instance *v1beta1.ServiceInstan
if c.reconciliationRetryDurationExceeded(instance.Status.OperationStartTime) {
msg := "Stopping reconciliation retries because too much time has elapsed"
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorReconciliationRetryTimeoutReason, msg)
return c.processProvisionFailure(instance, readyCond, failedCond, false)
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, false)
}

return c.processServiceInstanceOperationError(instance, readyCond)
Expand Down Expand Up @@ -495,17 +498,22 @@ func (c *controller) reconcileServiceInstanceUpdate(instance *v1beta1.ServiceIns
if httpErr, ok := osb.IsHTTPError(err); ok {
msg := fmt.Sprintf("ClusterServiceBroker returned a failure for update call; update will not be retried: %v", httpErr)
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorUpdateInstanceCallFailedReason, msg)

if isRetriableHTTPStatus(httpErr.StatusCode) {
return c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond)
}
// A failure with a given HTTP response code is treated as a terminal
// failure.
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorUpdateInstanceCallFailedReason, msg)
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
return c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond)
}

reason := errorErrorCallingUpdateInstanceReason

if urlErr, ok := err.(*url.Error); ok && urlErr.Timeout() {
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; update will not be retried: %v", urlErr)
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; update will be retried: %v", urlErr)
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, msg)
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
return c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond)
}

msg := fmt.Sprintf("The update call failed and will be retried: Error communicating with broker for updating: %s", err)
Expand All @@ -519,7 +527,7 @@ func (c *controller) reconcileServiceInstanceUpdate(instance *v1beta1.ServiceIns
msg = "Stopping reconciliation retries because too much time has elapsed"
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorReconciliationRetryTimeoutReason, msg)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorReconciliationRetryTimeoutReason, msg)
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
return c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond)
}

readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg)
Expand Down Expand Up @@ -775,14 +783,12 @@ func (c *controller) pollServiceInstance(instance *v1beta1.ServiceInstance) erro
reason := errorProvisionCallFailedReason
message := "Provision call failed: " + description
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, message)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, message)
err = c.processProvisionFailure(instance, readyCond, failedCond, false)
err = c.processTemporaryProvisionFailure(instance, readyCond, false)
default:
reason := errorUpdateInstanceCallFailedReason
message := "Update call failed: " + description
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, message)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, message)
err = c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
err = c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond)
}
if err != nil {
return c.handleServiceInstancePollingError(instance, err)
Expand Down Expand Up @@ -836,10 +842,10 @@ func (c *controller) processServiceInstancePollingFailureRetryTimeout(instance *
case provisioning:
// always finish polling instance, as triggering OM will return an error
c.finishPollingServiceInstance(instance)
return c.processProvisionFailure(instance, readyCond, failedCond, true)
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, true)
default:
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorReconciliationRetryTimeoutReason, msg)
err = c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
err = c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond)
}
if err != nil {
return c.handleServiceInstancePollingError(instance, err)
Expand Down Expand Up @@ -1597,27 +1603,40 @@ func (c *controller) processProvisionSuccess(instance *v1beta1.ServiceInstance,
return nil
}

// processProvisionFailure handles the logging and updating of a
// processTerminalProvisionFailure handles the logging and updating of a
// ServiceInstance that hit a terminal failure during provision reconciliation.
func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error {
func (c *controller) processTerminalProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error {
if failedCond == nil {
return fmt.Errorf("failedCond must not be nil")
}
return c.processProvisionFailure(instance, readyCond, failedCond, shouldMitigateOrphan)
}

if readyCond != nil {
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message)
}
// processTemporaryProvisionFailure handles the logging and updating of a
// ServiceInstance that hit a temporary error during provision reconciliation.
func (c *controller) processTemporaryProvisionFailure(instance *v1beta1.ServiceInstance, readyCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error {
return c.processProvisionFailure(instance, readyCond, nil, shouldMitigateOrphan)
}

// processProvisionFailure handles the logging and updating of a
// ServiceInstance that hit a temporary or a terminal failure during provision
// reconciliation.
func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error {
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message)

c.recorder.Event(instance, corev1.EventTypeWarning, failedCond.Reason, failedCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message)
var errorMessage error
if failedCond != nil {
c.recorder.Event(instance, corev1.EventTypeWarning, failedCond.Reason, failedCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message)
errorMessage = fmt.Errorf(failedCond.Message)
} else {
errorMessage = fmt.Errorf(readyCond.Message)
}

// Need to vary return error depending on whether the worker should
// requeue this resource.
var err error
if shouldMitigateOrphan {
// Copy original failure reason/message to a new OrphanMitigation condition
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message)
c.recorder.Event(instance, corev1.EventTypeWarning, startingInstanceOrphanMitigationReason, startingInstanceOrphanMitigationMessage)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionOrphanMitigation,
v1beta1.ConditionTrue, readyCond.Reason, readyCond.Message)
// Overwrite Ready condition reason/message with reporting on orphan mitigation
Expand All @@ -1626,24 +1645,35 @@ func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance,
startingInstanceOrphanMitigationReason,
startingInstanceOrphanMitigationMessage)

instance.Status.OperationStartTime = nil
instance.Status.AsyncOpInProgress = false
instance.Status.OrphanMitigationInProgress = true

err = fmt.Errorf(failedCond.Message)
} else {
clearServiceInstanceCurrentOperation(instance)
// Deprovisioning is not required for provisioning that has failed with an
// error that doesn't require orphan mitigation
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusNotRequired
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration
}

if failedCond == nil || shouldMitigateOrphan {
// Don't reset the current operation if the error is retriable
// or requires an orphan mitigation.
// Only reset the OSB operation status
clearServiceInstanceAsyncOsbOperation(instance)
} else {
// Reset the current operation if there was a terminal error
clearServiceInstanceCurrentOperation(instance)
}

if _, err := c.updateServiceInstanceStatus(instance); err != nil {
return err
}

return err
// The instance will be requeued in any case, since we updated the status
// a few lines above.
// But we still need to return a non-nil error for retriable errors and
// orphan mitigation to avoid resetting the rate limiter.
if failedCond == nil || shouldMitigateOrphan {
return errorMessage
}
return nil
}

// processProvisionAsyncResponse handles the logging and updating
Expand Down Expand Up @@ -1679,20 +1709,49 @@ func (c *controller) processUpdateServiceInstanceSuccess(instance *v1beta1.Servi
return nil
}

// processTerminalUpdateServiceInstanceFailure handles the logging and updating of a
// ServiceInstance that hit a terminal failure during update reconciliation.
func (c *controller) processTerminalUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition) error {
if failedCond == nil {
return fmt.Errorf("failedCond must not be nil")
}
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
}

// processTemporaryUpdateServiceInstanceFailure handles the logging and updating of a
// ServiceInstance that hit a temporary error during update reconciliation.
func (c *controller) processTemporaryUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond *v1beta1.ServiceInstanceCondition) error {
return c.processUpdateServiceInstanceFailure(instance, readyCond, nil)
}

// processUpdateServiceInstanceFailure handles the logging and updating of a
// ServiceInstance that hit a terminal failure during update reconciliation.
func (c *controller) processUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition) error {
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message)

setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message)
clearServiceInstanceCurrentOperation(instance)
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration

if failedCond != nil {
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message)
// Reset the current operation if there was a terminal error
clearServiceInstanceCurrentOperation(instance)
} else {
// Don't reset the current operation if the error is retriable
// or requires an orphan mitigation.
// Only reset the OSB operation status
clearServiceInstanceAsyncOsbOperation(instance)
}

if _, err := c.updateServiceInstanceStatus(instance); err != nil {
return err
}

// The instance will be requeued in any case, since we updated the status
// a few lines above.
// But we still need to return a non-nil error for retriable errors
// to avoid resetting the rate limiter.
if failedCond == nil {
return fmt.Errorf(readyCond.Message)
}
return nil
}

Expand Down Expand Up @@ -1731,7 +1790,6 @@ func (c *controller) processDeprovisionSuccess(instance *v1beta1.ServiceInstance
instance.Status.ExternalProperties = nil
instance.Status.ProvisionStatus = v1beta1.ServiceInstanceProvisionStatusNotProvisioned
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusSucceeded
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration

if mitigatingOrphan {
if _, err := c.updateServiceInstanceStatus(instance); err != nil {
Expand Down Expand Up @@ -1775,7 +1833,6 @@ func (c *controller) processDeprovisionFailure(instance *v1beta1.ServiceInstance
}

clearServiceInstanceCurrentOperation(instance)
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusFailed

if _, err := c.updateServiceInstanceStatus(instance); err != nil {
Expand Down
Loading

0 comments on commit e15b737

Please sign in to comment.