Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
105456: cloud: surface more AWS errors r=shermanCRL a=shermanCRL

Surface errors that would otherwise be redacted, making diagnosis in (e.g.) Splunk difficult.

Where before it might say:

`stepping through state failed with error: failed to get s3 object: ‹×›`

It now will say:

`stepping through state failed with error: failed to get s3 object: AssumeRole: AccessDenied: ‹×›`

#### User story
The user story is "make known failures grep-able / searchable in redacted logs". In the current state, the failure examples above are not knowable in a redacted log. The troubleshooter needs to get privileged access to see the real cause. This means an hour to initial diagnosis. With the above, we hope the initial diagnosis will be minutes.

#### Details
awserr.Code() is just a string (an enum really), let's return it. Doesn't contain sensitive information.

Look for "AccessDenied" and "AssumeRole" keywords in the unredacted error, and surface them without sensitive information.

Retains the previous stringy error messages, in case something is depending on them.

Also reduced the Bazel timeout for test run, as a local convenience -- maybe too little for the real tests, let's see

Jira: none
Epic: CRDB-26887


Co-authored-by: Matt Sherman <[email protected]>
  • Loading branch information
craig[bot] and shermanCRL committed Jul 14, 2023
2 parents 302310d + 2132ebd commit fdff086
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 12 deletions.
5 changes: 4 additions & 1 deletion pkg/cloud/amazon/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,12 @@ go_library(

go_test(
name = "amazon_test",
size = "small",
srcs = [
"aws_kms_test.go",
"s3_storage_test.go",
],
args = ["-test.timeout=295s"],
args = ["-test.timeout=55s"],
embed = [":amazon"],
deps = [
"//pkg/base",
Expand All @@ -63,8 +64,10 @@ go_test(
"//pkg/testutils",
"//pkg/testutils/skip",
"//pkg/util/leaktest",
"@com_github_aws_aws_sdk_go//aws/awserr",
"@com_github_aws_aws_sdk_go//aws/credentials",
"@com_github_aws_aws_sdk_go//aws/session",
"@com_github_aws_aws_sdk_go//service/s3",
"@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//require",
],
Expand Down
56 changes: 45 additions & 11 deletions pkg/cloud/amazon/s3_storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,7 @@ func (s *s3Storage) Writer(ctx context.Context, basename string) (io.WriteCloser
SSEKMSKeyId: nilIfEmpty(s.conf.ServerKMSID),
StorageClass: nilIfEmpty(s.conf.StorageClass),
})
err = interpretAWSError(err)
return errors.Wrap(err, "upload failed")
}), nil
}
Expand All @@ -738,17 +739,10 @@ func (s *s3Storage) openStreamAt(

out, err := client.GetObjectWithContext(ctx, req)
if err != nil {
if aerr := (awserr.Error)(nil); errors.As(err, &aerr) {
switch aerr.Code() {
// Relevant 404 errors reported by AWS.
case s3.ErrCodeNoSuchBucket, s3.ErrCodeNoSuchKey:
// nolint:errwrap
return nil, errors.Wrapf(
errors.Wrap(cloud.ErrFileDoesNotExist, "s3 object does not exist"),
"%v",
err.Error(),
)
}
err = interpretAWSError(err)
if errors.Is(err, cloud.ErrFileDoesNotExist) {
// keep this string in case anyone is depending on it
err = errors.Wrap(err, "s3 object does not exist")
}
return nil, errors.Wrap(err, "failed to get s3 object")
}
Expand Down Expand Up @@ -790,6 +784,7 @@ func (s *s3Storage) ReadFile(
// so try a Size() request.
x, err := s.Size(ctx, basename)
if err != nil {
err = interpretAWSError(err)
return nil, 0, errors.Wrap(err, "content-length missing from GetObject and Size() failed")
}
fileSize = x
Expand Down Expand Up @@ -851,12 +846,50 @@ func (s *s3Storage) List(ctx context.Context, prefix, delim string, fn cloud.Lis
if err := client.ListObjectsPagesWithContext(
ctx, s3Input, pageFn,
); err != nil {
err = interpretAWSError(err)
return errors.Wrap(err, `failed to list s3 bucket`)
}

return fnErr
}

// interpretAWSError attempts to surface safe information that otherwise would be redacted
func interpretAWSError(err error) error {
if err == nil {
return nil
}

if strings.Contains(err.Error(), "AssumeRole") {
err = errors.Wrap(err, "AssumeRole")
}

if strings.Contains(err.Error(), "AccessDenied") {
err = errors.Wrap(err, "AccessDenied")
}

if aerr := (awserr.Error)(nil); errors.As(err, &aerr) {
code := aerr.Code()

if code != "" {
// nolint:errwrap
err = errors.Wrapf(err, "%v", code)

switch code {
// Relevant 404 errors reported by AWS.
case s3.ErrCodeNoSuchBucket, s3.ErrCodeNoSuchKey:
// nolint:errwrap
err = errors.Wrapf(
errors.Wrap(cloud.ErrFileDoesNotExist, "s3 object does not exist"),
"%v",
err.Error(),
)
}
}
}

return err
}

func (s *s3Storage) Delete(ctx context.Context, basename string) error {
client, err := s.getClient(ctx)
if err != nil {
Expand Down Expand Up @@ -890,6 +923,7 @@ func (s *s3Storage) Size(ctx context.Context, basename string) (int64, error) {
return err
})
if err != nil {
err = interpretAWSError(err)
return 0, errors.Wrap(err, "failed to get s3 object headers")
}
return *out.ContentLength, nil
Expand Down
96 changes: 96 additions & 0 deletions pkg/cloud/amazon/s3_storage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ import (
"strings"
"testing"

"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/credentials"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/blobs"
"github.com/cockroachdb/cockroach/pkg/cloud"
Expand Down Expand Up @@ -396,6 +398,100 @@ func TestS3DisallowImplicitCredentials(t *testing.T) {
require.True(t, strings.Contains(err.Error(), "implicit"))
}

type awserror struct {
error
orig error
code, message string
}

var _ awserr.Error = awserror{}

func (a awserror) Code() string {
return a.code
}

func (a awserror) Message() string {
return a.message
}

func (a awserror) OrigErr() error {
return a.orig
}

func TestInterpretAWSCode(t *testing.T) {
{
// with code
input := awserror{
error: errors.New("hello"),
code: s3.ErrCodeBucketAlreadyOwnedByYou,
}
got := interpretAWSError(input)
require.NotNil(t, got, "expected tryAWSCode to recognize an awserr.Error type")
require.False(t, errors.Is(got, cloud.ErrFileDoesNotExist), "should not include cloud.ErrFileDoesNotExist in the error chain")
require.True(t, strings.Contains(got.Error(), s3.ErrCodeBucketAlreadyOwnedByYou), "aws error code should be in the error chain")
}

{
// with keywords
input := awserror{
error: errors.New("‹AccessDenied: User: arn:aws:sts::12345:assumed-role/12345 is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam::12345›"),
}
got := interpretAWSError(input)
require.NotNil(t, got, "expected interpretAWSError to recognize keywords")
require.True(t, strings.Contains(got.Error(), "AccessDenied"), "expected to see AccessDenied in error chain")
require.True(t, strings.Contains(got.Error(), "AssumeRole"), "expected to see AssumeRole in error chain")
}

{
// with particular code
input := awserror{
error: errors.New("hello"),
code: s3.ErrCodeNoSuchBucket,
}
got := interpretAWSError(input)
require.NotNil(t, got, "expected tryAWSCode to regognize awserr.Error")
require.True(t, errors.Is(got, cloud.ErrFileDoesNotExist), "expected cloud.ErrFileDoesNotExist in the error chain")
require.True(t, strings.Contains(got.Error(), s3.ErrCodeNoSuchBucket), "aws error code should be in the error chain")
}

{
// with keywords and code
input := awserror{
error: errors.New("‹AccessDenied: User: arn:aws:sts::12345:assumed-role/12345 is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam::12345›"),
code: s3.ErrCodeObjectAlreadyInActiveTierError,
}
got := interpretAWSError(input)
require.NotNil(t, got, "expected interpretAWSError to recognize keywords")
require.True(t, strings.Contains(got.Error(), "AccessDenied"), "expected to see AccessDenied in error chain")
require.True(t, strings.Contains(got.Error(), "AssumeRole"), "expected to see AssumeRole in error chain")
require.True(t, strings.Contains(got.Error(), s3.ErrCodeObjectAlreadyInActiveTierError), "aws error code should be in the error chain")
require.True(t, strings.Contains(got.Error(), "12345"), "SDK error should appear in the error chain")

// the keywords and code should come through while the original got redacted
redacted := errors.Redact(got)
require.True(t, strings.Contains(got.Error(), "AccessDenied"), "expected to see AccessDenied in error chain after redaction")
require.True(t, strings.Contains(got.Error(), "AssumeRole"), "expected to see AssumeRole in error chain after redaction")
require.True(t, strings.Contains(got.Error(), s3.ErrCodeObjectAlreadyInActiveTierError), "aws error code should be in the error chain after redaction")
require.False(t, strings.Contains(redacted, "12345"), "SDK error should have been redacted")
}

{
// no keywords or code
input := awserror{
error: errors.New("hello"),
}
got := interpretAWSError(input)
require.Equal(t, input, got, "expected interpretAWSError to pass through the same error")
}

{
// not an AWS error type
input := errors.New("some other generic error")
got := interpretAWSError(input)
require.Equal(t, input, got, "expected interpretAWSError to pass through the same error")
}
}

// S3 has two "does not exist" errors - ErrCodeNoSuchBucket and ErrCodeNoSuchKey.
// ErrCodeNoSuchKey is tested via the general test in external_storage_test.go.
// This test attempts to ReadFile from a bucket which does not exist.
Expand Down

0 comments on commit fdff086

Please sign in to comment.