Skip to content

Commit

Permalink
Add retries when SA not found in IAM (#4441) (#8476)
Browse files Browse the repository at this point in the history
Signed-off-by: Modular Magician <[email protected]>
  • Loading branch information
modular-magician authored Feb 16, 2021
1 parent 71c15de commit 7a9d14c
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .changelog/4441.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:enhancement
iam: added a retry condition that retries editing `iam_binding` and `iam_member` resources on policies that have frequently deleted service accounts
```
18 changes: 18 additions & 0 deletions google/error_retry_predicates.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,3 +341,21 @@ func isCloudRunCreationConflict(err error) (bool, string) {

return false, ""
}

// If a service account is deleted in the middle of updating an IAM policy
// it can cause the API to return an error. In fine-grained IAM resources we
// read the policy, modify it, then send it back to the API. Retrying is
// useful particularly in high-traffic projects.
// We don't want to retry _every_ time we see this error because the
// user-provided SA could trigger this too. At the callsite, we should check
// if the current etag matches the old etag and short-circuit if they do as
// that indicates the new config is the likely problem.
func iamServiceAccountNotFound(err error) (bool, string) {
if gerr, ok := err.(*googleapi.Error); ok {
if gerr.Code == 400 && strings.Contains(gerr.Body, "Service account") && strings.Contains(gerr.Body, "does not exist") {
return true, "service account not found in IAM"
}
}

return false, ""
}
25 changes: 25 additions & 0 deletions google/iam.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,31 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify
}
continue
}

// retry in the case that a service account is not found. This can happen when a service account is deleted
// out of band.
if isServiceAccountNotFoundError, _ := iamServiceAccountNotFound(err); isServiceAccountNotFoundError {
// calling a retryable function within a retry loop is not
// strictly the _best_ idea, but this error only happens in
// high-traffic projects anyways
currentPolicy, rerr := iamPolicyReadWithRetry(updater)
if rerr != nil {
if p.Etag != currentPolicy.Etag {
// not matching indicates that there is a new state to attempt to apply
log.Printf("current and old etag did not match for %s, retrying", updater.DescribeResource())
time.Sleep(backoff)
backoff = backoff * 2
continue
}

log.Printf("current and old etag matched for %s, not retrying", updater.DescribeResource())
} else {
// if the error is non-nil, just fall through and return the base error
log.Printf("[DEBUG]: error checking etag for policy %s. error: %v", updater.DescribeResource(), rerr)
}
}

log.Printf("[DEBUG]: not retrying IAM policy for %s. error: %v", updater.DescribeResource(), err)
return errwrap.Wrapf(fmt.Sprintf("Error applying IAM policy for %s: {{err}}", updater.DescribeResource()), err)
}
log.Printf("[DEBUG]: Set policy for %s", updater.DescribeResource())
Expand Down

0 comments on commit 7a9d14c

Please sign in to comment.