-
Notifications
You must be signed in to change notification settings - Fork 382
4xx, 5xx and Connection timeout should be retriable (not terminal errors) #1765
Changes from all commits
7996d89
4847388
cb98892
a40fc51
0aa8f29
6a95318
126638a
4c4c327
f421a1e
bf2cc12
7bb4e73
3e40460
0e02459
7c33d06
e4e5afc
77bbfbd
fbd39b1
89cdb33
67f82e6
5aa0139
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -371,28 +371,31 @@ func (c *controller) reconcileServiceInstanceAdd(instance *v1beta1.ServiceInstan | |
|
||
response, err := brokerClient.ProvisionInstance(request) | ||
if err != nil { | ||
// A failure HTTP response code is treated as a terminal | ||
// failure. Depending on the specific response, we may also | ||
// need to initiate orphan mitigation. | ||
if httpErr, ok := osb.IsHTTPError(err); ok { | ||
msg := fmt.Sprintf( | ||
"Error provisioning ServiceInstance of %s at ClusterServiceBroker %q: %s", | ||
pretty.ClusterServiceClassName(serviceClass), brokerName, httpErr, | ||
) | ||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorProvisionCallFailedReason, msg) | ||
// Depending on the specific response, we may need to initiate orphan mitigation. | ||
shouldMitigateOrphan := shouldStartOrphanMitigation(httpErr.StatusCode) | ||
if isRetriableHTTPStatus(httpErr.StatusCode) { | ||
return c.processTemporaryProvisionFailure(instance, readyCond, shouldMitigateOrphan) | ||
} | ||
// A failure with a given HTTP response code is treated as a terminal | ||
// failure. | ||
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, "ClusterServiceBrokerReturnedFailure", msg) | ||
return c.processProvisionFailure(instance, readyCond, failedCond, shouldStartOrphanMitigation(httpErr.StatusCode)) | ||
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, shouldMitigateOrphan) | ||
} | ||
|
||
reason := errorErrorCallingProvisionReason | ||
|
||
// A timeout error is considered a terminal failure and we | ||
// A timeout error is considered a retriable error, but we | ||
// should initiate orphan mitigation. | ||
if urlErr, ok := err.(*url.Error); ok && urlErr.Timeout() { | ||
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; operation will not be retried: %v", urlErr) | ||
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; operation will be retried: %v", urlErr) | ||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg) | ||
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, msg) | ||
return c.processProvisionFailure(instance, readyCond, failedCond, true) | ||
return c.processTemporaryProvisionFailure(instance, readyCond, true) | ||
} | ||
|
||
// All other errors should be retried, unless the | ||
|
@@ -403,7 +406,7 @@ func (c *controller) reconcileServiceInstanceAdd(instance *v1beta1.ServiceInstan | |
if c.reconciliationRetryDurationExceeded(instance.Status.OperationStartTime) { | ||
msg := "Stopping reconciliation retries because too much time has elapsed" | ||
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorReconciliationRetryTimeoutReason, msg) | ||
return c.processProvisionFailure(instance, readyCond, failedCond, false) | ||
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, false) | ||
} | ||
|
||
return c.processServiceInstanceOperationError(instance, readyCond) | ||
|
@@ -481,17 +484,22 @@ func (c *controller) reconcileServiceInstanceUpdate(instance *v1beta1.ServiceIns | |
if httpErr, ok := osb.IsHTTPError(err); ok { | ||
msg := fmt.Sprintf("ClusterServiceBroker returned a failure for update call; update will not be retried: %v", httpErr) | ||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorUpdateInstanceCallFailedReason, msg) | ||
|
||
if isRetriableHTTPStatus(httpErr.StatusCode) { | ||
return c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond) | ||
} | ||
// A failure with a given HTTP response code is treated as a terminal | ||
// failure. | ||
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorUpdateInstanceCallFailedReason, msg) | ||
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
return c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
} | ||
|
||
reason := errorErrorCallingUpdateInstanceReason | ||
|
||
if urlErr, ok := err.(*url.Error); ok && urlErr.Timeout() { | ||
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; update will not be retried: %v", urlErr) | ||
msg := fmt.Sprintf("Communication with the ClusterServiceBroker timed out; update will be retried: %v", urlErr) | ||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg) | ||
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, msg) | ||
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
return c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond) | ||
} | ||
|
||
msg := fmt.Sprintf("The update call failed and will be retried: Error communicating with broker for updating: %s", err) | ||
|
@@ -505,7 +513,7 @@ func (c *controller) reconcileServiceInstanceUpdate(instance *v1beta1.ServiceIns | |
msg = "Stopping reconciliation retries because too much time has elapsed" | ||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorReconciliationRetryTimeoutReason, msg) | ||
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, errorReconciliationRetryTimeoutReason, msg) | ||
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
return c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
} | ||
|
||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, msg) | ||
|
@@ -761,14 +769,12 @@ func (c *controller) pollServiceInstance(instance *v1beta1.ServiceInstance) erro | |
reason := errorProvisionCallFailedReason | ||
message := "Provision call failed: " + description | ||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, message) | ||
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, message) | ||
err = c.processProvisionFailure(instance, readyCond, failedCond, false) | ||
err = c.processTemporaryProvisionFailure(instance, readyCond, false) | ||
default: | ||
reason := errorUpdateInstanceCallFailedReason | ||
message := "Update call failed: " + description | ||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, reason, message) | ||
failedCond := newServiceInstanceFailedCondition(v1beta1.ConditionTrue, reason, message) | ||
err = c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
err = c.processTemporaryUpdateServiceInstanceFailure(instance, readyCond) | ||
} | ||
if err != nil { | ||
return c.handleServiceInstancePollingError(instance, err) | ||
|
@@ -822,10 +828,10 @@ func (c *controller) processServiceInstancePollingFailureRetryTimeout(instance * | |
case provisioning: | ||
// always finish polling instance, as triggering OM will return an error | ||
c.finishPollingServiceInstance(instance) | ||
return c.processProvisionFailure(instance, readyCond, failedCond, true) | ||
return c.processTerminalProvisionFailure(instance, readyCond, failedCond, true) | ||
default: | ||
readyCond := newServiceInstanceReadyCondition(v1beta1.ConditionFalse, errorReconciliationRetryTimeoutReason, msg) | ||
err = c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
err = c.processTerminalUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
} | ||
if err != nil { | ||
return c.handleServiceInstancePollingError(instance, err) | ||
|
@@ -1577,27 +1583,40 @@ func (c *controller) processProvisionSuccess(instance *v1beta1.ServiceInstance, | |
return nil | ||
} | ||
|
||
// processProvisionFailure handles the logging and updating of a | ||
// processTerminalProvisionFailure handles the logging and updating of a | ||
// ServiceInstance that hit a terminal failure during provision reconciliation. | ||
func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error { | ||
func (c *controller) processTerminalProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error { | ||
if failedCond == nil { | ||
return fmt.Errorf("failedCond must not be nil") | ||
} | ||
return c.processProvisionFailure(instance, readyCond, failedCond, shouldMitigateOrphan) | ||
} | ||
|
||
if readyCond != nil { | ||
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message) | ||
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message) | ||
} | ||
// processTemporaryProvisionFailure handles the logging and updating of a | ||
// ServiceInstance that hit a temporary error during provision reconciliation. | ||
func (c *controller) processTemporaryProvisionFailure(instance *v1beta1.ServiceInstance, readyCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error { | ||
return c.processProvisionFailure(instance, readyCond, nil, shouldMitigateOrphan) | ||
} | ||
|
||
// processProvisionFailure handles the logging and updating of a | ||
// ServiceInstance that hit a temporary or a terminal failure during provision | ||
// reconciliation. | ||
func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition, shouldMitigateOrphan bool) error { | ||
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message) | ||
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message) | ||
|
||
c.recorder.Event(instance, corev1.EventTypeWarning, failedCond.Reason, failedCond.Message) | ||
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message) | ||
var errorMessage error | ||
if failedCond != nil { | ||
c.recorder.Event(instance, corev1.EventTypeWarning, failedCond.Reason, failedCond.Message) | ||
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message) | ||
errorMessage = fmt.Errorf(failedCond.Message) | ||
} else { | ||
errorMessage = fmt.Errorf(readyCond.Message) | ||
} | ||
|
||
// Need to vary return error depending on whether the worker should | ||
// requeue this resource. | ||
var err error | ||
if shouldMitigateOrphan { | ||
// Copy original failure reason/message to a new OrphanMitigation condition | ||
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message) | ||
c.recorder.Event(instance, corev1.EventTypeWarning, startingInstanceOrphanMitigationReason, startingInstanceOrphanMitigationMessage) | ||
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionOrphanMitigation, | ||
v1beta1.ConditionTrue, readyCond.Reason, readyCond.Message) | ||
// Overwrite Ready condition reason/message with reporting on orphan mitigation | ||
|
@@ -1606,24 +1625,35 @@ func (c *controller) processProvisionFailure(instance *v1beta1.ServiceInstance, | |
startingInstanceOrphanMitigationReason, | ||
startingInstanceOrphanMitigationMessage) | ||
|
||
instance.Status.OperationStartTime = nil | ||
instance.Status.AsyncOpInProgress = false | ||
instance.Status.OrphanMitigationInProgress = true | ||
|
||
err = fmt.Errorf(failedCond.Message) | ||
} else { | ||
clearServiceInstanceCurrentOperation(instance) | ||
// Deprovisioning is not required for provisioning that has failed with an | ||
// error that doesn't require orphan mitigation | ||
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusNotRequired | ||
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration | ||
} | ||
|
||
if failedCond == nil || shouldMitigateOrphan { | ||
// Don't reset the current operation if the error is retriable | ||
// or requires an orphan mitigation. | ||
// Only reset the OSB operation status | ||
clearServiceInstanceAsyncOsbOperation(instance) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kibbles-n-bytes note that I am reusing your newly introduced method there and in the update failure handling too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the idea behind this seems ok to me There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @nilebox As a consequence of this, it seems like we'll only be trying the deprovision once in the case of a final orphan mitigation after a reconciliation retry duration timeout during a provision. For now, I think it's fine, though we should re-look at it in the future. |
||
} else { | ||
// Reset the current operation if there was a terminal error | ||
clearServiceInstanceCurrentOperation(instance) | ||
} | ||
|
||
if _, err := c.updateServiceInstanceStatus(instance); err != nil { | ||
return err | ||
} | ||
|
||
return err | ||
// The instance will be requeued in any case, since we updated the status | ||
// a few lines above. | ||
// But we still need to return a non-nil error for retriable errors and | ||
// orphan mitigation to avoid resetting the rate limiter. | ||
if failedCond == nil || shouldMitigateOrphan { | ||
return errorMessage | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See the comment above that clarifies why do we need to return a non-nil error. |
||
} | ||
return nil | ||
} | ||
|
||
// processProvisionAsyncResponse handles the logging and updating | ||
|
@@ -1659,20 +1689,49 @@ func (c *controller) processUpdateServiceInstanceSuccess(instance *v1beta1.Servi | |
return nil | ||
} | ||
|
||
// processTerminalUpdateServiceInstanceFailure handles the logging and updating of a | ||
// ServiceInstance that hit a terminal failure during update reconciliation. | ||
func (c *controller) processTerminalUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition) error { | ||
if failedCond == nil { | ||
return fmt.Errorf("failedCond must not be nil") | ||
} | ||
return c.processUpdateServiceInstanceFailure(instance, readyCond, failedCond) | ||
} | ||
|
||
// processTemporaryUpdateServiceInstanceFailure handles the logging and updating of a | ||
// ServiceInstance that hit a temporary error during update reconciliation. | ||
func (c *controller) processTemporaryUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond *v1beta1.ServiceInstanceCondition) error { | ||
return c.processUpdateServiceInstanceFailure(instance, readyCond, nil) | ||
} | ||
|
||
// processUpdateServiceInstanceFailure handles the logging and updating of a | ||
// ServiceInstance that hit a terminal failure during update reconciliation. | ||
func (c *controller) processUpdateServiceInstanceFailure(instance *v1beta1.ServiceInstance, readyCond, failedCond *v1beta1.ServiceInstanceCondition) error { | ||
c.recorder.Event(instance, corev1.EventTypeWarning, readyCond.Reason, readyCond.Message) | ||
|
||
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionReady, readyCond.Status, readyCond.Reason, readyCond.Message) | ||
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message) | ||
clearServiceInstanceCurrentOperation(instance) | ||
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration | ||
|
||
if failedCond != nil { | ||
setServiceInstanceCondition(instance, v1beta1.ServiceInstanceConditionFailed, failedCond.Status, failedCond.Reason, failedCond.Message) | ||
// Reset the current operation if there was a terminal error | ||
clearServiceInstanceCurrentOperation(instance) | ||
} else { | ||
// Don't reset the current operation if the error is retriable | ||
// or requires an orphan mitigation. | ||
// Only reset the OSB operation status | ||
clearServiceInstanceAsyncOsbOperation(instance) | ||
} | ||
|
||
if _, err := c.updateServiceInstanceStatus(instance); err != nil { | ||
return err | ||
} | ||
|
||
// The instance will be requeued in any case, since we updated the status | ||
// a few lines above. | ||
// But we still need to return a non-nil error for retriable errors | ||
// to avoid resetting the rate limiter. | ||
if failedCond == nil { | ||
return fmt.Errorf(readyCond.Message) | ||
} | ||
return nil | ||
} | ||
|
||
|
@@ -1711,7 +1770,6 @@ func (c *controller) processDeprovisionSuccess(instance *v1beta1.ServiceInstance | |
instance.Status.ExternalProperties = nil | ||
instance.Status.ProvisionStatus = v1beta1.ServiceInstanceProvisionStatusNotProvisioned | ||
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusSucceeded | ||
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration | ||
|
||
if mitigatingOrphan { | ||
if _, err := c.updateServiceInstanceStatus(instance); err != nil { | ||
|
@@ -1755,7 +1813,6 @@ func (c *controller) processDeprovisionFailure(instance *v1beta1.ServiceInstance | |
} | ||
|
||
clearServiceInstanceCurrentOperation(instance) | ||
instance.Status.ReconciledGeneration = instance.Status.ObservedGeneration | ||
instance.Status.DeprovisionStatus = v1beta1.ServiceInstanceDeprovisionStatusFailed | ||
|
||
if _, err := c.updateServiceInstanceStatus(instance); err != nil { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This introduced a bug where if the polling operation returns
state=failed
, in combo with the change on this line: https://github.com/kubernetes-incubator/service-catalog/blob/e15b73719911853d3755b71f3d8b26b21296d0a3/pkg/controller/controller_instance.go#L1652causes a failed provision after polling to never attempt deprovision.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@n3wscott apologies, this is because I thought that for async operations we never need to perform an orphan mitigation.
If what @kibbles-n-bytes wrote about bindings in openservicebrokerapi/servicebroker#334 (comment) is true about instances too, then we could just change a flag to
true
there:i.e. perform orphan mitigation immediately instead of waiting for deletion of the instance.
Thoughts?
@n3wscott also, can you create an issue to track this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had: #1879