From 00f9f17846b43a2594a95f86e714266c3a85bdff Mon Sep 17 00:00:00 2001 From: pducolin <45568537+pducolin@users.noreply.github.com> Date: Wed, 17 Apr 2024 16:07:29 +0200 Subject: [PATCH] [e2e] retry stack up by default (#24746) * [e2e] retry stack up by default * [e2e] add stack name tag to dd event * Update test/new-e2e/pkg/utils/infra/stack_manager.go Co-authored-by: Florent Clarret * address review * move error handling out of switch * return reUp strategy by default * [e2e] cleanup known errors and rename to known errors --------- Co-authored-by: Florent Clarret --- .../pkg/utils/infra/retriable_errors.go | 23 ++------- test/new-e2e/pkg/utils/infra/stack_manager.go | 49 ++++++++++--------- 2 files changed, 29 insertions(+), 43 deletions(-) diff --git a/test/new-e2e/pkg/utils/infra/retriable_errors.go b/test/new-e2e/pkg/utils/infra/retriable_errors.go index 55da7316d6f56..0e7215c6ef789 100644 --- a/test/new-e2e/pkg/utils/infra/retriable_errors.go +++ b/test/new-e2e/pkg/utils/infra/retriable_errors.go @@ -14,39 +14,22 @@ const ( noRetry retryType = "NoRetry" ) -type retriableError struct { +type knownError struct { errorMessage string retryType retryType } -func getKnownRetriableErrors() []retriableError { +func getKnownErrors() []knownError { // Add here errors that are known to be flakes and that should be retried - return []retriableError{ + return []knownError{ { errorMessage: "i/o timeout", retryType: reCreate, }, - { - errorMessage: "creating EC2 Instance: IdempotentParameterMismatch:", - retryType: reUp, - }, - { - errorMessage: "InvalidInstanceID.NotFound", - retryType: reUp, - }, - { - errorMessage: "create: timeout while waiting for state to become 'tfSTABLE'", - retryType: reUp, - }, { // https://datadoghq.atlassian.net/browse/ADXT-1 errorMessage: "failed attempts: dial tcp :22: connect: connection refused", retryType: reCreate, }, - { - // https://datadoghq.atlassian.net/browse/ADXT-163 - errorMessage: "couldn't find resource", - retryType: reUp, - }, } } diff --git a/test/new-e2e/pkg/utils/infra/stack_manager.go b/test/new-e2e/pkg/utils/infra/stack_manager.go index 695cf85d17981..54d6fefd32453 100644 --- a/test/new-e2e/pkg/utils/infra/stack_manager.go +++ b/test/new-e2e/pkg/utils/infra/stack_manager.go @@ -62,7 +62,7 @@ func (i internalError) Error() string { type StackManager struct { stacks *safeStackMap - retriableErrors []retriableError + knownErrors []knownError } type safeStackMap struct { @@ -114,8 +114,8 @@ func GetStackManager() *StackManager { func newStackManager() (*StackManager, error) { return &StackManager{ - stacks: newSafeStackMap(), - retriableErrors: getKnownRetriableErrors(), + stacks: newSafeStackMap(), + knownErrors: getKnownErrors(), }, nil } @@ -318,26 +318,29 @@ func (sm *StackManager) getStack(ctx context.Context, name string, config runner if err == nil { break } - if retryStrategy := sm.getRetryStrategyFrom(err); retryStrategy != noRetry { - fmt.Fprintf(logger, "Got error that should be retried during stack up, retrying with %s strategy", retryStrategy) - err := sendEventToDatadog(fmt.Sprintf("[E2E] Stack %s : retrying Pulumi stack up", name), err.Error(), []string{"operation:up", fmt.Sprintf("retry:%s", retryStrategy)}, logger) - if err != nil { - fmt.Fprintf(logger, "Got error when sending event to Datadog: %v", err) - } - if retryStrategy == reCreate { - // If we are recreating the stack, we should destroy the stack first - destroyCtx, cancel := context.WithTimeout(ctx, stackDestroyTimeout) - _, err := stack.Destroy(destroyCtx, optdestroy.ProgressStreams(logger), optdestroy.DebugLogging(loggingOptions)) - cancel() - if err != nil { - return stack, auto.UpResult{}, err - } + retryStrategy := sm.getRetryStrategyFrom(err) + err := sendEventToDatadog(fmt.Sprintf("[E2E] Stack %s : error on Pulumi stack up", name), err.Error(), []string{"operation:up", fmt.Sprintf("retry:%s", retryStrategy), fmt.Sprintf("stack:%s", stack.Name())}, logger) + if err != nil { + fmt.Fprintf(logger, "Got error when sending event to Datadog: %v", err) + } + switch retryStrategy { + case reUp: + fmt.Fprint(logger, "Got error during stack up, retrying") + case reCreate: + fmt.Fprint(logger, "Got error during stack up, recreating stack") + destroyCtx, cancel := context.WithTimeout(ctx, stackDestroyTimeout) + _, err := stack.Destroy(destroyCtx, optdestroy.ProgressStreams(logger), optdestroy.DebugLogging(loggingOptions)) + cancel() + if err != nil { + return stack, auto.UpResult{}, err } - } else { - break + case noRetry: + fmt.Fprint(logger, "Got error during stack up, giving up") + return stack, upResult, err } } + return stack, upResult, err } @@ -390,12 +393,12 @@ func runFuncWithRecover(f pulumi.RunFunc) pulumi.RunFunc { } func (sm *StackManager) getRetryStrategyFrom(err error) retryType { - for _, retriableError := range sm.retriableErrors { - if strings.Contains(err.Error(), retriableError.errorMessage) { - return retriableError.retryType + for _, knownError := range sm.knownErrors { + if strings.Contains(err.Error(), knownError.errorMessage) { + return knownError.retryType } } - return noRetry + return reUp } // sendEventToDatadog sends an event to Datadog, it will use the API Key from environment variable DD_API_KEY if present, otherwise it will use the one from SSM Parameter Store