Skip to content

Commit

Permalink
[e2e] retry stack up by default (#24746)
Browse files Browse the repository at this point in the history
* [e2e] retry stack up by default

* [e2e] add stack name tag to dd event

* Update test/new-e2e/pkg/utils/infra/stack_manager.go

Co-authored-by: Florent Clarret <[email protected]>

* address review

* move error handling out of switch
* return reUp strategy by default

* [e2e] clean up retriable errors and rename them to known errors

---------

Co-authored-by: Florent Clarret <[email protected]>
  • Loading branch information
2 people authored and alexgallotta committed May 9, 2024
1 parent cbe5b17 commit ccb507c
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 43 deletions.
23 changes: 3 additions & 20 deletions test/new-e2e/pkg/utils/infra/retriable_errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,39 +14,22 @@ const (
noRetry retryType = "NoRetry"
)

type retriableError struct {
type knownError struct {
errorMessage string
retryType retryType
}

func getKnownRetriableErrors() []retriableError {
func getKnownErrors() []knownError {
// Add here errors that are known to be flakes and that should be retried
return []retriableError{
return []knownError{
{
errorMessage: "i/o timeout",
retryType: reCreate,
},
{
errorMessage: "creating EC2 Instance: IdempotentParameterMismatch:",
retryType: reUp,
},
{
errorMessage: "InvalidInstanceID.NotFound",
retryType: reUp,
},
{
errorMessage: "create: timeout while waiting for state to become 'tfSTABLE'",
retryType: reUp,
},
{
// https://datadoghq.atlassian.net/browse/ADXT-1
errorMessage: "failed attempts: dial tcp :22: connect: connection refused",
retryType: reCreate,
},
{
// https://datadoghq.atlassian.net/browse/ADXT-163
errorMessage: "couldn't find resource",
retryType: reUp,
},
}
}
49 changes: 26 additions & 23 deletions test/new-e2e/pkg/utils/infra/stack_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func (i internalError) Error() string {
type StackManager struct {
stacks *safeStackMap

retriableErrors []retriableError
knownErrors []knownError
}

type safeStackMap struct {
Expand Down Expand Up @@ -114,8 +114,8 @@ func GetStackManager() *StackManager {

func newStackManager() (*StackManager, error) {
return &StackManager{
stacks: newSafeStackMap(),
retriableErrors: getKnownRetriableErrors(),
stacks: newSafeStackMap(),
knownErrors: getKnownErrors(),
}, nil
}

Expand Down Expand Up @@ -318,26 +318,29 @@ func (sm *StackManager) getStack(ctx context.Context, name string, config runner
if err == nil {
break
}
if retryStrategy := sm.getRetryStrategyFrom(err); retryStrategy != noRetry {
fmt.Fprintf(logger, "Got error that should be retried during stack up, retrying with %s strategy", retryStrategy)
err := sendEventToDatadog(fmt.Sprintf("[E2E] Stack %s : retrying Pulumi stack up", name), err.Error(), []string{"operation:up", fmt.Sprintf("retry:%s", retryStrategy)}, logger)
if err != nil {
fmt.Fprintf(logger, "Got error when sending event to Datadog: %v", err)
}

if retryStrategy == reCreate {
// If we are recreating the stack, we should destroy the stack first
destroyCtx, cancel := context.WithTimeout(ctx, stackDestroyTimeout)
_, err := stack.Destroy(destroyCtx, optdestroy.ProgressStreams(logger), optdestroy.DebugLogging(loggingOptions))
cancel()
if err != nil {
return stack, auto.UpResult{}, err
}
retryStrategy := sm.getRetryStrategyFrom(err)
err := sendEventToDatadog(fmt.Sprintf("[E2E] Stack %s : error on Pulumi stack up", name), err.Error(), []string{"operation:up", fmt.Sprintf("retry:%s", retryStrategy), fmt.Sprintf("stack:%s", stack.Name())}, logger)
if err != nil {
fmt.Fprintf(logger, "Got error when sending event to Datadog: %v", err)
}
switch retryStrategy {
case reUp:
fmt.Fprint(logger, "Got error during stack up, retrying")
case reCreate:
fmt.Fprint(logger, "Got error during stack up, recreating stack")
destroyCtx, cancel := context.WithTimeout(ctx, stackDestroyTimeout)
_, err := stack.Destroy(destroyCtx, optdestroy.ProgressStreams(logger), optdestroy.DebugLogging(loggingOptions))
cancel()
if err != nil {
return stack, auto.UpResult{}, err
}
} else {
break
case noRetry:
fmt.Fprint(logger, "Got error during stack up, giving up")
return stack, upResult, err
}
}

return stack, upResult, err
}

Expand Down Expand Up @@ -390,12 +393,12 @@ func runFuncWithRecover(f pulumi.RunFunc) pulumi.RunFunc {
}

func (sm *StackManager) getRetryStrategyFrom(err error) retryType {
for _, retriableError := range sm.retriableErrors {
if strings.Contains(err.Error(), retriableError.errorMessage) {
return retriableError.retryType
for _, knownError := range sm.knownErrors {
if strings.Contains(err.Error(), knownError.errorMessage) {
return knownError.retryType
}
}
return noRetry
return reUp
}

// sendEventToDatadog sends an event to Datadog, it will use the API Key from environment variable DD_API_KEY if present, otherwise it will use the one from SSM Parameter Store
Expand Down

0 comments on commit ccb507c

Please sign in to comment.