From c478d16b616d095789315b7c108d38108d122f1c Mon Sep 17 00:00:00 2001 From: Chris Cotter Date: Mon, 5 Aug 2024 15:39:27 +0000 Subject: [PATCH 1/2] fix(storage): retry gRPC DEADLINE_EXCEEDED errors GCS is in some cases returning these for internal timeouts, so they need to be retried. I added an extra context error check to our retry logic to ensure that they can always be distinguished from user-set deadlines as well as some tests. See internal bug b/ --- storage/client_test.go | 38 ++++++++++++++++++++++++++++++++++++++ storage/invoke.go | 10 +++++++--- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/storage/client_test.go b/storage/client_test.go index 0a98f9a624c8..27e8eeba0f43 100644 --- a/storage/client_test.go +++ b/storage/client_test.go @@ -33,6 +33,7 @@ import ( "github.com/googleapis/gax-go/v2/callctx" "google.golang.org/api/iterator" "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) var emulatorClients map[string]storageClient @@ -1415,6 +1416,43 @@ func TestRetryMaxAttemptsEmulated(t *testing.T) { }) } +// Test that a timeout returns a DeadlineExceeded error, in spite of DeadlineExceeded being a retryable +// status when it is returned by the server. +func TestTimeoutErrorEmulated(t *testing.T) { + transportClientTest(t, func(t *testing.T, project, bucket string, client storageClient) { + ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond) + defer cancel() + time.Sleep(5 * time.Nanosecond) + config := &retryConfig{backoff: &gax.Backoff{Initial: 10 * time.Millisecond}} + _, err := client.GetBucket(ctx, bucket, nil, idempotent(true), withRetryConfig(config)) + + // Error may come through as a context.DeadlineExceeded (HTTP) or status.DeadlineExceeded (gRPC) + if !(errors.Is(err, context.DeadlineExceeded) || status.Code(err) == codes.DeadlineExceeded) { + t.Errorf("GetBucket: got unexpected error %v; want DeadlineExceeded", err) + } + + // Validate that error was not retried. If it was retried, that will be mentioned + // in the error string because of wrapping. + if strings.Contains(err.Error(), "retry") { + t.Errorf("GetBucket: got error %v, expected non-retried error", err) + } + }) +} + +// Test that server-side DEADLINE_EXCEEDED errors are retried as expected with gRPC. +func TestRetryDeadlineExceedeEmulated(t *testing.T) { + transportClientTest(t, func(t *testing.T, project, bucket string, client storageClient) { + ctx := context.Background() + instructions := map[string][]string{"storage.buckets.get": {"return-504", "return-504"}} + testID := createRetryTest(t, project, bucket, client, instructions) + ctx = callctx.SetHeaders(ctx, "x-retry-test-id", testID) + config := &retryConfig{maxAttempts: expectedAttempts(4), backoff: &gax.Backoff{Initial: 10 * time.Millisecond}} + if _, err := client.GetBucket(ctx, bucket, nil, idempotent(true), withRetryConfig(config)); err != nil { + t.Fatalf("GetBucket: got unexpected error %v, want nil", err) + } + }) +} + // createRetryTest creates a bucket in the emulator and sets up a test using the // Retry Test API for the given instructions. This is intended for emulator tests // of retry behavior that are not covered by conformance tests. diff --git a/storage/invoke.go b/storage/invoke.go index 5d71533d09ea..9fb24e8fd080 100644 --- a/storage/invoke.go +++ b/storage/invoke.go @@ -74,7 +74,11 @@ func run(ctx context.Context, call func(ctx context.Context) error, retry *retry return true, fmt.Errorf("storage: retry failed after %v attempts; last error: %w", *retry.maxAttempts, err) } attempts++ - return !errorFunc(err), err + retryable := errorFunc(err) + if ctxErr := ctx.Err(); errors.Is(ctxErr, context.Canceled) || errors.Is(ctxErr, context.DeadlineExceeded) { + retryable = false + } + return !retryable, err }) } @@ -129,9 +133,9 @@ func ShouldRetry(err error) bool { return true } } - // UNAVAILABLE, RESOURCE_EXHAUSTED, and INTERNAL codes are all retryable for gRPC. + // UNAVAILABLE, RESOURCE_EXHAUSTED, INTERNAL, and DEADLINE_EXCEEDED codes are all retryable for gRPC. if st, ok := status.FromError(err); ok { - if code := st.Code(); code == codes.Unavailable || code == codes.ResourceExhausted || code == codes.Internal { + if code := st.Code(); code == codes.Unavailable || code == codes.ResourceExhausted || code == codes.Internal || code == codes.DeadlineExceeded { return true } } From 225bfac286319c695dd8b47c3e538fa4ffda20d0 Mon Sep 17 00:00:00 2001 From: Chris Cotter Date: Tue, 6 Aug 2024 21:10:49 +0000 Subject: [PATCH 2/2] add comment explaining error --- storage/invoke.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/storage/invoke.go b/storage/invoke.go index 9fb24e8fd080..60eaa7885e71 100644 --- a/storage/invoke.go +++ b/storage/invoke.go @@ -75,6 +75,10 @@ func run(ctx context.Context, call func(ctx context.Context) error, retry *retry } attempts++ retryable := errorFunc(err) + // Explicitly check context cancellation so that we can distinguish between a + // DEADLINE_EXCEEDED error from the server and a user-set context deadline. + // Unfortunately gRPC will codes.DeadlineExceeded (which may be retryable if it's + // sent by the server) in both cases. if ctxErr := ctx.Err(); errors.Is(ctxErr, context.Canceled) || errors.Is(ctxErr, context.DeadlineExceeded) { retryable = false }