From 7a9d14cf66aa9876e7bbd190cef7ab1f613ae4a4 Mon Sep 17 00:00:00 2001 From: The Magician Date: Tue, 16 Feb 2021 12:49:20 -0800 Subject: [PATCH] Add retries when SA not found in IAM (#4441) (#8476) Signed-off-by: Modular Magician --- .changelog/4441.txt | 3 +++ google/error_retry_predicates.go | 18 ++++++++++++++++++ google/iam.go | 25 +++++++++++++++++++++++++ 3 files changed, 46 insertions(+) create mode 100644 .changelog/4441.txt diff --git a/.changelog/4441.txt b/.changelog/4441.txt new file mode 100644 index 00000000000..e445d00dca6 --- /dev/null +++ b/.changelog/4441.txt @@ -0,0 +1,3 @@ +```release-note:enhancement +iam: added a retry condition that retries editing `iam_binding` and `iam_member` resources on policies that have frequently deleted service accounts +``` diff --git a/google/error_retry_predicates.go b/google/error_retry_predicates.go index c6f126db397..3d2b188c93b 100644 --- a/google/error_retry_predicates.go +++ b/google/error_retry_predicates.go @@ -341,3 +341,21 @@ func isCloudRunCreationConflict(err error) (bool, string) { return false, "" } + +// If a service account is deleted in the middle of updating an IAM policy +// it can cause the API to return an error. In fine-grained IAM resources we +// read the policy, modify it, then send it back to the API. Retrying is +// useful particularly in high-traffic projects. +// We don't want to retry _every_ time we see this error because the +// user-provided SA could trigger this too. At the callsite, we should check +// if the current etag matches the old etag and short-circuit if they do as +// that indicates the new config is the likely problem. +func iamServiceAccountNotFound(err error) (bool, string) { + if gerr, ok := err.(*googleapi.Error); ok { + if gerr.Code == 400 && strings.Contains(gerr.Body, "Service account") && strings.Contains(gerr.Body, "does not exist") { + return true, "service account not found in IAM" + } + } + + return false, "" +} diff --git a/google/iam.go b/google/iam.go index ca9873d9ff0..58ea9078799 100644 --- a/google/iam.go +++ b/google/iam.go @@ -148,6 +148,31 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify } continue } + + // retry in the case that a service account is not found. This can happen when a service account is deleted + // out of band. + if isServiceAccountNotFoundError, _ := iamServiceAccountNotFound(err); isServiceAccountNotFoundError { + // calling a retryable function within a retry loop is not + // strictly the _best_ idea, but this error only happens in + // high-traffic projects anyways + currentPolicy, rerr := iamPolicyReadWithRetry(updater) + if rerr != nil { + if p.Etag != currentPolicy.Etag { + // not matching indicates that there is a new state to attempt to apply + log.Printf("current and old etag did not match for %s, retrying", updater.DescribeResource()) + time.Sleep(backoff) + backoff = backoff * 2 + continue + } + + log.Printf("current and old etag matched for %s, not retrying", updater.DescribeResource()) + } else { + // if the error is non-nil, just fall through and return the base error + log.Printf("[DEBUG]: error checking etag for policy %s. error: %v", updater.DescribeResource(), rerr) + } + } + + log.Printf("[DEBUG]: not retrying IAM policy for %s. error: %v", updater.DescribeResource(), err) return errwrap.Wrapf(fmt.Sprintf("Error applying IAM policy for %s: {{err}}", updater.DescribeResource()), err) } log.Printf("[DEBUG]: Set policy for %s", updater.DescribeResource())