From 0510b7ec35517ba241f50ac737d5ac9bbaadd63c Mon Sep 17 00:00:00 2001 From: Joel Thompson Date: Thu, 27 Sep 2018 10:54:59 -0400 Subject: [PATCH] logical/aws: Harden WAL entry creation (#5202) * logical/aws: Harden WAL entry creation If AWS IAM user creation failed in any way, the WAL corresponding to the IAM user would get left around and Vault would try to roll it back. However, because the user never existed, the rollback failed. Thus, the WAL would essentially get "stuck" and Vault would continually attempt to roll it back, failing every time. A similar situation could arise if the IAM user that Vault created got deleted out of band, or if Vault deleted it but was unable to write the lease revocation back to storage (e.g., a storage failure). This attempts to harden it in two ways. One is by deleting the WAL log entry if the IAM user creation fails. However, the WAL deletion could still fail, and this wouldn't help where the user is deleted out of band, so second, consider the user rolled back if the user just doesn't exist, under certain circumstances. Fixes #5190 * Fix segfault in expiration unit tests TestExpiration_Tidy was passing in a leaseEntry that had a nil Secret, which then caused a segfault as the changes to revokeEntry didn't check whether Secret was nil; this is probably unlikely to occur in real life, but good to be extra cautious. * Fix potential segfault Missed the else... * Respond to PR feedback --- builtin/logical/aws/backend.go | 4 ++- builtin/logical/aws/path_user.go | 31 +++++++++++++++++++++++ builtin/logical/aws/secret_access_keys.go | 4 +++ logical/lease.go | 2 +- vault/expiration.go | 7 +++++ 5 files changed, 46 insertions(+), 2 deletions(-) diff --git a/builtin/logical/aws/backend.go b/builtin/logical/aws/backend.go index 1c621f119924..f19b46aeaaff 100644 --- a/builtin/logical/aws/backend.go +++ b/builtin/logical/aws/backend.go @@ -48,7 +48,7 @@ func Backend() *backend { }, WALRollback: b.walRollback, - WALRollbackMinAge: 5 * time.Minute, + WALRollbackMinAge: minAwsUserRollbackAge, BackendType: logical.TypeLogical, } @@ -135,3 +135,5 @@ func (b *backend) clientSTS(ctx context.Context, s logical.Storage) (stsiface.ST return b.stsClient, nil } + +const minAwsUserRollbackAge = 5 * time.Minute diff --git a/builtin/logical/aws/path_user.go b/builtin/logical/aws/path_user.go index 7030f502ecc0..7ea0def82c6e 100644 --- a/builtin/logical/aws/path_user.go +++ b/builtin/logical/aws/path_user.go @@ -4,8 +4,10 @@ import ( "context" "fmt" "strings" + "time" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/iam" "github.com/hashicorp/errwrap" "github.com/hashicorp/vault/helper/strutil" @@ -130,6 +132,35 @@ func (b *backend) pathUserRollback(ctx context.Context, req *logical.Request, _k MaxItems: aws.Int64(1000), }) if err != nil { + // This isn't guaranteed to be perfect; for example, an IAM user + // might have gotten put into the WAL but then the IAM user creation + // failed (e.g., Vault didn't have permissions) and then the WAL + // deletion failed as well. Then, if Vault doesn't have access to + // call iam:ListGroupsForUser, AWS will return an access denied error + // and the WAL will never get cleaned up. But this is better than + // just having Vault "forget" about a user it actually created. + // + // BEWARE a potential race condition -- where this is called + // immediately after a user is created. AWS eventual consistency + // might say the user doesn't exist when the user does in fact + // exist, and this could cause Vault to forget about the user. + // This won't happen if the user creation fails (because the WAL + // minimum age is 5 minutes, and AWS eventual consistency is, in + // practice, never that long), but it could happen if a lease holder + // asks immediately after getting a user to revoke the lease, causing + // Vault to leak the secret, which would be a Very Bad Thing to allow. + // So we make sure that, if there's an associated lease, it must be at + // least 5 minutes old as well. + if aerr, ok := err.(awserr.Error); ok { + acceptMissingIamUsers := false + if req.Secret == nil || time.Since(req.Secret.IssueTime) > time.Duration(minAwsUserRollbackAge) { + // WAL rollback + acceptMissingIamUsers = true + } + if aerr.Code() == iam.ErrCodeNoSuchEntityException && acceptMissingIamUsers { + return nil + } + } return err } groups := groupsResp.Groups diff --git a/builtin/logical/aws/secret_access_keys.go b/builtin/logical/aws/secret_access_keys.go index 54c0c088ace3..1081a63193f6 100644 --- a/builtin/logical/aws/secret_access_keys.go +++ b/builtin/logical/aws/secret_access_keys.go @@ -185,6 +185,10 @@ func (b *backend) secretAccessKeysCreate( UserName: aws.String(username), }) if err != nil { + if walErr := framework.DeleteWAL(ctx, s, walId); walErr != nil { + iamErr := errwrap.Wrapf("error creating IAM user: {{err}}", err) + return nil, errwrap.Wrap(errwrap.Wrapf("failed to delete WAL entry: {{err}}", walErr), iamErr) + } return logical.ErrorResponse(fmt.Sprintf( "Error creating IAM user: %s", err)), awsutil.CheckAWSError(err) } diff --git a/logical/lease.go b/logical/lease.go index 90c089476618..97bbe4f6582b 100644 --- a/logical/lease.go +++ b/logical/lease.go @@ -23,7 +23,7 @@ type LeaseOptions struct { Increment time.Duration `json:"-"` // IssueTime is the time of issue for the original lease. This is - // only available on a Renew operation and has no effect when returning + // only available on Renew and Revoke operations and has no effect when returning // a response. It can be used to enforce maximum lease periods by // a logical backend. IssueTime time.Time `json:"-"` diff --git a/vault/expiration.go b/vault/expiration.go index b50cc2914a48..f98331d1295d 100644 --- a/vault/expiration.go +++ b/vault/expiration.go @@ -1277,6 +1277,13 @@ func (m *ExpirationManager) revokeEntry(ctx context.Context, le *leaseEntry) err return nil } + if le.Secret != nil { + // not sure if this is really valid to have a leaseEntry with a nil Secret + // (if there's a nil Secret, what are you really leasing?), but the tests + // create one, and good to be defensive + le.Secret.IssueTime = le.IssueTime + } + // Make sure we're operating in the right namespace nsCtx := namespace.ContextWithNamespace(ctx, le.namespace)