This repository has been archived by the owner on May 6, 2022. It is now read-only.

4xx, 5xx and Connection timeout should be retriable (not terminal errors) #1765

Merged · 20 commits · Mar 24, 2018
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -65,7 +65,7 @@ def test_account = params.TEST_ACCOUNT
def test_zone = params.TEST_ZONE ?: 'us-west1-b'
def namespace = 'catalog'
def root_path = 'src/github.com/kubernetes-incubator/service-catalog'
def timeoutMin = 30
def timeoutMin = 50
def certFolder = '/tmp/sc-certs'

node {
16 changes: 10 additions & 6 deletions pkg/controller/controller.go
@@ -660,15 +660,19 @@ func (c *controller) reconciliationRetryDurationExceeded(operationStartTime *met
// shouldStartOrphanMitigation returns whether an error with the given status
// code indicates that orphan migitation should start.
func shouldStartOrphanMitigation(statusCode int) bool {
is2XX := (statusCode >= 200 && statusCode < 300)
is5XX := (statusCode >= 500 && statusCode < 600)
is2XX := statusCode >= 200 && statusCode < 300
is5XX := statusCode >= 500 && statusCode < 600

return (is2XX && statusCode != http.StatusOK) ||
statusCode == http.StatusRequestTimeout ||
is5XX
return (is2XX && statusCode != http.StatusOK) || is5XX
}

// ReconciliationAction reprents a type of action the reconciler should take
// isRetriableHTTPStatus returns whether an error with the given HTTP status
// code is retriable.
func isRetriableHTTPStatus(statusCode int) bool {
return statusCode != http.StatusBadRequest
}

// ReconciliationAction represents a type of action the reconciler should take
// for a resource.
type ReconciliationAction string

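For reference, a minimal self-contained sketch of the new classification: it reuses the two helpers exactly as they appear in the controller.go hunk above, while the `main` wrapper and the chosen status codes are only illustrative. It shows that 408 is now retriable and no longer triggers orphan mitigation, and that only 400 remains terminal:

```go
package main

import (
	"fmt"
	"net/http"
)

// Copied from the new logic above: orphan mitigation starts for a malformed
// 2xx response (anything other than 200 OK) or for any 5xx response;
// 408 Request Timeout no longer triggers it.
func shouldStartOrphanMitigation(statusCode int) bool {
	is2XX := statusCode >= 200 && statusCode < 300
	is5XX := statusCode >= 500 && statusCode < 600
	return (is2XX && statusCode != http.StatusOK) || is5XX
}

// Copied from the new helper above: only 400 Bad Request is treated as terminal.
func isRetriableHTTPStatus(statusCode int) bool {
	return statusCode != http.StatusBadRequest
}

func main() {
	codes := []int{http.StatusCreated, http.StatusBadRequest, http.StatusRequestTimeout, http.StatusConflict, http.StatusInternalServerError}
	for _, code := range codes {
		fmt.Printf("%d: retriable=%t, orphanMitigation=%t\n",
			code, isRetriableHTTPStatus(code), shouldStartOrphanMitigation(code))
	}
	// Output:
	// 201: retriable=true, orphanMitigation=true
	// 400: retriable=false, orphanMitigation=false
	// 408: retriable=true, orphanMitigation=false
	// 409: retriable=true, orphanMitigation=false
	// 500: retriable=true, orphanMitigation=true
}
```
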
2 changes: 1 addition & 1 deletion pkg/controller/controller_binding_test.go
@@ -2375,7 +2375,7 @@ func TestReconcileBindingWithSetOrphanMitigation(t *testing.T) {
bindReactionError: osb.HTTPStatusCodeError{
StatusCode: 408,
},
setOrphanMitigation: true,
setOrphanMitigation: false,
shouldReturnError: false,
},
{
147 changes: 102 additions & 45 deletions pkg/controller/controller_instance.go
@@ -371,28 +371,31 @@ func (c *controller) reconcileServiceInstanceAdd(instance *v1beta1.ServiceInstan

response, err := brokerClient.ProvisionInstance(request)
if err != nil {
// A failure HTTP response code is treated as a terminal
// failure. Depending on the specific response, we may also
// need to initiate orphan mitigation.
if httpErr, ok := osb.IsHTTPError(err); ok {
msg := fmt.Sprintf(
"Error provisioning ServiceInstance of %s at ClusterServiceBroker %q: %s",
pretty.ClusterServiceClassName(serviceClass), brokerName, httpErr,
)
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorProvisionCallFailedReason, msg)
// Depending on the specific response, we may need to initiate orphan mitigation.
shouldMitigateOrphan := shouldStartOrphanMitigation(httpErr.StatusCode)
if isRetriableHTTPStatus(httpErr.StatusCode) {
return c.processTemporaryProvisionFailure(instance, readyCond, shouldMitigateOrphan)
}
// A failure with a given HTTP response code is treated as a terminal
// failure.
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, "ClusterServiceBrokerReturnedFailure", msg)
return c.processProvisionFailure(instance, readyCond, failedCond, shouldStartOrphanMitigation(httpErr.StatusCode))
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, shouldMitigateOrphan)
}

reason := errorErrorCallingProvisionReason

// A timeout error is considered a terminal failure and we
// A timeout error is considered a retriable error, but we
// should initiate orphan mitigation.
if urlErr, ok := err.(*url.Error); ok && urlErr.Timeout() {
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; operation will not be retried: %v", urlErr)
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; operation will be retried: %v", urlErr)
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, msg)
return c.processProvisionFailure(instance, readyCond, failedCond, true)
return c.processTemporaryProvisionFailure(instance, readyCond, true)
}

// All other errors should be retried, unless the
@@ -403,7 +406,7 @@ func (c *controller) reconcileServiceInstanceAdd(instance *v1beta1.ServiceInstan
if c.reconciliationRetryDurationExceeded(instance.Status.OperationStartTime) {
msg := "Stopping reconciliation retries because too much time has elapsed"
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorReconciliationRetryTimeoutReason, msg)
return c.processProvisionFailure(instance, readyCond, failedCond, false)
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, false)
}

return c.processServiceInstanceOperationError(instance, readyCond)
@@ -481,17 +484,22 @@ func (c *controller) reconcileServiceInstanceUpdate(instance *v1beta1.ServiceIns
if httpErr, ok := osb.IsHTTPError(err); ok {
msg := fmt.Sprintf("ClusterServiceBroker returned a failure for update call; update will not be retried: %v", httpErr)
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorUpdateInstanceCallFailedReason, msg)

if isRetriableHTTPStatus(httpErr.StatusCode) {
return c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond)
}
// A failure with a given HTTP response code is treated as a terminal
// failure.
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorUpdateInstanceCallFailedReason, msg)
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
return c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond)
}

reason := errorErrorCallingUpdateInstanceReason

if urlErr, ok := err.(*url.Error); ok && urlErr.Timeout() {
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; update will not be retried: %v", urlErr)
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; update will be retried: %v", urlErr)
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, msg)
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
return c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond)
}

msg := fmt.Sprintf("The update call failed and will be retried: Error communicating with broker for updating: %s", err)
Expand All @@ -505,7 +513,7 @@ func (c *controller) reconcileServiceInstanceUpdate(instance *v1beta1.ServiceIns
msg = "Stopping reconciliation retries because too much time has elapsed"
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorReconciliationRetryTimeoutReason, msg)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorReconciliationRetryTimeoutReason, msg)
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
return c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond)
}

readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg)
@@ -761,14 +769,12 @@ func (c *controller) pollServiceInstance(instance *v1beta1.ServiceInstance) erro
reason := errorProvisionCallFailedReason
message := "Provision call failed: " + description
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, message)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, message)
err = c.processProvisionFailure(instance, readyCond, failedCond, false)
err = c.processTemporaryProvisionFailure(instance, readyCond, false)

Contributor:

This introduced a bug: when the polling operation returns state=failed, in combination with the change on this line: https://github.com/kubernetes-incubator/service-catalog/blob/e15b73719911853d3755b71f3d8b26b21296d0a3/pkg/controller/controller_instance.go#L1652, a failed provision after polling never attempts deprovision.

Contributor Author:

@n3wscott apologies, this is because I thought that for async operations we never need to perform orphan mitigation.
If what @kibbles-n-bytes wrote about bindings in openservicebrokerapi/servicebroker#334 (comment) is true about instances too, then we could just change the flag to true there:

err = c.processTemporaryProvisionFailure(instance, readyCond, true)

i.e. perform orphan mitigation immediately instead of waiting for deletion of the instance.
Thoughts?

@n3wscott also, can you create an issue to track this?

Contributor:

I had: #1879

default:
reason := errorUpdateInstanceCallFailedReason
message := "Update call failed: " + description
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, message)
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, message)
err = c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
err = c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond)
}
if err != nil {
return c.handleServiceInstancePollingError(instance, err)
Expand Down Expand Up @@ -822,10 +828,10 @@ func (c *controller) processServiceInstancePollingFailureRetryTimeout(instance *
case provisioning:
// always finish polling instance, as triggering OM will return an error
c.finishPollingServiceInstance(instance)
return c.processProvisionFailure(instance, readyCond, failedCond, true)
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, true)
default:
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorReconciliationRetryTimeoutReason, msg)
err = c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
err = c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond)
}
if err != nil {
return c.handleServiceInstancePollingError(instance, err)
@@ -1577,27 +1583,40 @@ func (c *controller) processProvisionSuccess(instance *v1beta1.ServiceInstance,
return nil
}

// processProvisionFailure handles the logging and updating of a
// processTerminalProvisionFailure handles the logging and updating of a
// ServiceInstance that hit a terminal failure during provision reconciliation.
func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error {
func (c *controller) processTerminalProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error {
if failedCond == nil {
return fmt.Errorf("failedCond must not be nil")
}
return c.processProvisionFailure(instance, readyCond, failedCond, shouldMitigateOrphan)
}

if readyCond != nil {
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message)
}
// processTemporaryProvisionFailure handles the logging and updating of a
// ServiceInstance that hit a temporary error during provision reconciliation.
func (c *controller) processTemporaryProvisionFailure(instance *v1beta1.ServiceInstance, readyCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error {
return c.processProvisionFailure(instance, readyCond, nil, shouldMitigateOrphan)
}

// processProvisionFailure handles the logging and updating of a
// ServiceInstance that hit a temporary or a terminal failure during provision
// reconciliation.
func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error {
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message)

c.recorder.Event(instance, corev1.EventTypeWarning, failedCond.Reason, failedCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message)
var errorMessage error
if failedCond != nil {
c.recorder.Event(instance, corev1.EventTypeWarning, failedCond.Reason, failedCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message)
errorMessage = fmt.Errorf(failedCond.Message)
} else {
errorMessage = fmt.Errorf(readyCond.Message)
}

// Need to vary return error depending on whether the worker should
// requeue this resource.
var err error
if shouldMitigateOrphan {
// Copy original failure reason/message to a new OrphanMitigation condition
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message)
c.recorder.Event(instance, corev1.EventTypeWarning, startingInstanceOrphanMitigationReason, startingInstanceOrphanMitigationMessage)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionOrphanMitigation,
v1beta1.ConditionTrue, readyCond.Reason, readyCond.Message)
// Overwrite Ready condition reason/message with reporting on orphan mitigation
@@ -1606,24 +1625,35 @@ func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance,
startingInstanceOrphanMitigationReason,
startingInstanceOrphanMitigationMessage)

instance.Status.OperationStartTime = nil
instance.Status.AsyncOpInProgress = false
instance.Status.OrphanMitigationInProgress = true

err = fmt.Errorf(failedCond.Message)
} else {
clearServiceInstanceCurrentOperation(instance)
// Deprovisioning is not required for provisioning that has failed with an
// error that doesn't require orphan mitigation
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusNotRequired
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration
}

if failedCond == nil || shouldMitigateOrphan {
// Don't reset the current operation if the error is retriable
// or requires an orphan mitigation.
// Only reset the OSB operation status
clearServiceInstanceAsyncOsbOperation(instance)

Contributor Author:

@kibbles-n-bytes note that I am reusing your newly introduced method there and in the update failure handling too.
Please double check whether this is the correct behavior.
The goal here is to keep the OperationStartTime untouched to avoid retrying indefinitely.

Contributor:

the idea behind this seems ok to me

@kibbles-n-bytes (Contributor), Mar 23, 2018:

@nilebox As a consequence of this, it seems like we'll only be trying the deprovision once in the case of a final orphan mitigation after a reconciliation retry duration timeout during a provision. For now, I think it's fine, though we should re-look at it in the future.

} else {
// Reset the current operation if there was a terminal error
clearServiceInstanceCurrentOperation(instance)
}

if _, err := c.updateServiceInstanceStatus(instance); err != nil {
return err
}

return err
// The instance will be requeued in any case, since we updated the status
// a few lines above.
// But we still need to return a non-nil error for retriable errors and
// orphan mitigation to avoid resetting the rate limiter.
if failedCond == nil || shouldMitigateOrphan {
return errorMessage

Contributor Author:

See the comment above, which clarifies why we need to return a non-nil error.

}
return nil
}
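
The remark about not resetting the rate limiter refers to the usual client-go workqueue contract: a worker calls Forget on a key only when reconciliation returns nil, and re-adds it with backoff otherwise. Below is a hedged sketch of that generic pattern (this is not the actual service-catalog worker; the package name, function name, and reconcile callback are illustrative):

```go
package controllersketch

import (
	"k8s.io/client-go/util/workqueue"
)

// processNextWorkItem illustrates why returning a non-nil error from the
// failure handlers matters: the caller requeues the key with the rate
// limiter's growing backoff instead of forgetting it.
func processNextWorkItem(queue workqueue.RateLimitingInterface, reconcile func(key string) error) bool {
	item, quit := queue.Get()
	if quit {
		return false
	}
	defer queue.Done(item)

	key := item.(string)
	if err := reconcile(key); err != nil {
		// Non-nil error: requeue with the per-key backoff intact, so a
		// retriable provision/update failure is retried with increasing delay.
		queue.AddRateLimited(item)
		return true
	}

	// Nil error: reset the per-key backoff; the next failure for this key
	// starts again from the shortest delay.
	queue.Forget(item)
	return true
}
```

This is why processProvisionFailure returns the condition message as an error for retriable and orphan-mitigation cases, even though the status update alone already requeues the instance.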

// processProvisionAsyncResponse handles the logging and updating
@@ -1659,20 +1689,49 @@ func (c *controller) processUpdateServiceInstanceSuccess(instance *v1beta1.Servi
return nil
}

// processTerminalUpdateServiceInstanceFailure handles the logging and updating of a
// ServiceInstance that hit a terminal failure during update reconciliation.
func (c *controller) processTerminalUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition) error {
if failedCond == nil {
return fmt.Errorf("failedCond must not be nil")
}
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond)
}

// processTemporaryUpdateServiceInstanceFailure handles the logging and updating of a
// ServiceInstance that hit a temporary error during update reconciliation.
func (c *controller) processTemporaryUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond *v1beta1.ServiceInstanceCondition) error {
return c.processUpdateServiceInstanceFailure(instance, readyCond, nil)
}

// processUpdateServiceInstanceFailure handles the logging and updating of a
// ServiceInstance that hit a terminal failure during update reconciliation.
func (c *controller) processUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition) error {
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message)

setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message)
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message)
clearServiceInstanceCurrentOperation(instance)
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration

if failedCond != nil {
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message)
// Reset the current operation if there was a terminal error
clearServiceInstanceCurrentOperation(instance)
} else {
// Don't reset the current operation if the error is retriable
// or requires an orphan mitigation.
// Only reset the OSB operation status
clearServiceInstanceAsyncOsbOperation(instance)
}

if _, err := c.updateServiceInstanceStatus(instance); err != nil {
return err
}

// The instance will be requeued in any case, since we updated the status
// a few lines above.
// But we still need to return a non-nil error for retriable errors
// to avoid resetting the rate limiter.
if failedCond == nil {
return fmt.Errorf(readyCond.Message)
}
return nil
}

@@ -1711,7 +1770,6 @@ func (c *controller) processDeprovisionSuccess(instance *v1beta1.ServiceInstance
instance.Status.ExternalProperties = nil
instance.Status.ProvisionStatus = v1beta1.ServiceInstanceProvisionStatusNotProvisioned
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusSucceeded
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration

if mitigatingOrphan {
if _, err := c.updateServiceInstanceStatus(instance); err != nil {
@@ -1755,7 +1813,6 @@ func (c *controller) processDeprovisionFailure(instance *v1beta1.ServiceInstance
}

clearServiceInstanceCurrentOperation(instance)
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusFailed

if _, err := c.updateServiceInstanceStatus(instance); err != nil {