Skip to content

Commit

Permalink
Check if container healthchecks failed and rollback (#38)
Browse files Browse the repository at this point in the history
* Check if container healthchecks failed and rollback

* Update tests
  • Loading branch information
cszatmary authored Mar 31, 2021
1 parent 1195b11 commit fce0d62
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 21 deletions.
51 changes: 45 additions & 6 deletions awsecs/awsecs.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package awsecs

import (
stderrors "errors"
"fmt"
"log"
"regexp"
Expand All @@ -15,6 +16,9 @@ import (
"github.com/pkg/errors"
)

// ErrHealthcheckFailed indicates that an ECS task failed a container healthcheck.
var ErrHealthcheckFailed = stderrors.New("health check failed")

// Deploy registers a new task for the given service in ECS in order to create a new deployment.
func Deploy(service *config.Service, ecsClient ecsiface.ECSAPI) error {
// Ensure we've been passed a valid cluster ARN and exit if not
Expand Down Expand Up @@ -80,20 +84,25 @@ func UpdateService(service *config.Service, ecsClient ecsiface.ECSAPI) error {
return nil
}

// CheckDrain checks if all old tasks have drained.
// CheckDrain checks if all old tasks have drained. If the tasks are failing healthchecks,
// the returned error will wrap ErrHealthcheckFailed.
func CheckDrain(service *config.Service, ecsClient ecsiface.ECSAPI) (bool, error) {
serviceInput := &ecs.DescribeServicesInput{
respDescribeServices, err := ecsClient.DescribeServices(&ecs.DescribeServicesInput{
Services: []*string{
&service.Name,
},
Cluster: &service.Cluster,
}

respDescribeServices, err := ecsClient.DescribeServices(serviceInput)
})
if err != nil {
return false, errors.Wrapf(err, "failed to get current service: %s", service.Name)
}

if len(respDescribeServices.Failures) > 0 {
var sb strings.Builder
for _, f := range respDescribeServices.Failures {
sb.WriteString(f.String())
}
return false, errors.Wrapf(err, "failed to get service: %v", sb)
}
if len(respDescribeServices.Services) != 1 {
return false, errors.Wrapf(err, "expected 1 service named %s, got %d", service.Name, len(respDescribeServices.Services))
}
Expand All @@ -113,6 +122,36 @@ func CheckDrain(service *config.Service, ecsClient ecsiface.ECSAPI) (bool, error
}
}

// Check and see if container healthchecks failed so we can provide more details

// TODO(@cszatmary): The response could be paginated, may need to handle this in the future
respListTasks, err := ecsClient.ListTasks(&ecs.ListTasksInput{
Cluster: &service.Cluster,
ServiceName: &service.Name,
})
if err != nil {
return false, errors.Wrapf(err, "failed to list tasks for service: %s", service.Name)
}
respDescribeTasks, err := ecsClient.DescribeTasks(&ecs.DescribeTasksInput{
Tasks: respListTasks.TaskArns,
})
if err != nil {
return false, errors.Wrapf(err, "failed to get tasks for service: %s", service.Name)
}
if len(respDescribeTasks.Failures) > 0 {
var sb strings.Builder
for _, f := range respDescribeServices.Failures {
sb.WriteString(f.String())
}
return false, errors.Wrapf(err, "failed to get tasks: %v", sb)
}

for _, task := range respDescribeTasks.Tasks {
if *task.HealthStatus != "HEALTHY" && *task.TaskDefinitionArn == service.TaskDefinitionARN {
return false, errors.Wrapf(ErrHealthcheckFailed, "task %s is unhealthy", *task.TaskArn)
}
}

return false, nil
}

Expand Down
84 changes: 78 additions & 6 deletions awsecs/mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package awsecs
import (
"errors"
"fmt"
"math/rand"
"strconv"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ecs"
Expand All @@ -23,9 +25,29 @@ func (ms *mockService) TaskDefinitionArn() string {
return fmt.Sprintf("arn:aws:ecs:us-east-1:123456:task-definition/%s:%d", ms.name, ms.taskDefVersion)
}

// mockTask is an in-memory record of a task held by the mock ECS client.
type mockTask struct {
	// id is the final path segment of the task ARN.
	id string
	// taskDefArn is the ARN of the task definition associated with the task.
	taskDefArn string
	// healthy selects which health status string HealthStatus reports.
	healthy bool
	// serviceName is the name of the service the task is recorded under.
	serviceName string
	// clusterName identifies the task's cluster and is embedded in the task ARN.
	clusterName string
}

// Arn builds the task's ARN from its cluster name and id.
func (mt *mockTask) Arn() string {
	const arnFormat = "arn:aws:ecs:us-east-1:123456:task/%s/%s"
	return fmt.Sprintf(arnFormat, mt.clusterName, mt.id)
}

// HealthStatus reports "HEALTHY" when the task is flagged healthy and
// "UNHEALTHY" otherwise.
func (mt *mockTask) HealthStatus() string {
	status := "UNHEALTHY"
	if mt.healthy {
		status = "HEALTHY"
	}
	return status
}

// MockECSClient is a mock of the ECS API for use in tests. It embeds
// ecsiface.ECSAPI and keeps its services and tasks in memory.
type MockECSClient struct {
ecsiface.ECSAPI
// services holds the mock services, looked up by name.
services map[string]*mockService
// tasks holds the mock tasks added through CreateMockTasks.
tasks []mockTask
}

func NewMockECSClient(serviceNames []string, imageName, gitsha string) *MockECSClient {
Expand Down Expand Up @@ -60,7 +82,7 @@ func (mc *MockECSClient) DescribeServices(input *ecs.DescribeServicesInput) (*ec
for _, serviceName := range input.Services {
s, ok := mc.services[*serviceName]
if !ok {
return nil, errors.New("Service not found")
return nil, errors.New("service not found")
}

outServices = append(outServices, &ecs.Service{
Expand Down Expand Up @@ -92,7 +114,7 @@ func (mc *MockECSClient) DescribeTaskDefinition(input *ecs.DescribeTaskDefinitio
}

if service == nil {
return nil, errors.New("Task Definition not found")
return nil, errors.New("task Definition not found")
}

image := fmt.Sprintf("123456.dkr.ecr.us-east-1.amazonaws.com/%s:%s", service.imageName, service.gitsha)
Expand All @@ -113,7 +135,7 @@ func (mc *MockECSClient) DescribeTaskDefinition(input *ecs.DescribeTaskDefinitio
func (mc *MockECSClient) RegisterTaskDefinition(input *ecs.RegisterTaskDefinitionInput) (*ecs.RegisterTaskDefinitionOutput, error) {
service, ok := mc.services[*input.Family]
if !ok {
return nil, errors.New("Task definition family not found")
return nil, errors.New("task definition family not found")
}

// "Create" new task def version
Expand All @@ -129,13 +151,63 @@ func (mc *MockECSClient) RegisterTaskDefinition(input *ecs.RegisterTaskDefinitio
func (mc *MockECSClient) UpdateService(input *ecs.UpdateServiceInput) (*ecs.UpdateServiceOutput, error) {
_, ok := mc.services[*input.Service]
if !ok {
return nil, errors.New("Service not found")
return nil, errors.New("service not found")
}

// We don't actually use the return value
return &ecs.UpdateServiceOutput{}, nil
}

// CreateMockTasks appends count new mock tasks to the client. Every task
// shares the given cluster name, service name, task definition ARN and
// health flag; each one gets its own id generated from rand.Int.
// A count of zero or less adds nothing.
func (mc *MockECSClient) CreateMockTasks(clusterName, serviceName, taskDefArn string, healthy bool, count int) {
	for remaining := count; remaining > 0; remaining-- {
		task := mockTask{
			id:          strconv.Itoa(rand.Int()),
			clusterName: clusterName,
			serviceName: serviceName,
			taskDefArn:  taskDefArn,
			healthy:     healthy,
		}
		mc.tasks = append(mc.tasks, task)
	}
}

// ListTasks returns the ARNs of the mock tasks that match the input.
// A nil Cluster or ServiceName places no restriction on that field.
// The returned error is always nil.
func (mc *MockECSClient) ListTasks(input *ecs.ListTasksInput) (*ecs.ListTasksOutput, error) {
	var arns []*string
	for i := range mc.tasks {
		task := &mc.tasks[i]
		clusterMatches := input.Cluster == nil || task.clusterName == *input.Cluster
		serviceMatches := input.ServiceName == nil || task.serviceName == *input.ServiceName
		if clusterMatches && serviceMatches {
			arns = append(arns, aws.String(task.Arn()))
		}
	}
	return &ecs.ListTasksOutput{TaskArns: arns}, nil
}

// DescribeTasks returns the ARN, task definition ARN and health status of
// the mock tasks selected by the input. A nil Cluster places no restriction
// on the cluster. When input.Tasks is nil every remaining task is returned;
// when it is non-nil only tasks whose ARN is listed are returned.
// The returned error is always nil.
func (mc *MockECSClient) DescribeTasks(input *ecs.DescribeTasksInput) (*ecs.DescribeTasksOutput, error) {
	// Collect the requested ARNs into a set for constant-time membership checks.
	requested := make(map[string]struct{}, len(input.Tasks))
	for _, arn := range input.Tasks {
		requested[*arn] = struct{}{}
	}
	// Only a nil Tasks slice disables ARN filtering; an empty non-nil slice
	// still filters and therefore matches nothing.
	filterByArn := input.Tasks != nil

	var described []*ecs.Task
	for i := range mc.tasks {
		task := &mc.tasks[i]
		if input.Cluster != nil && task.clusterName != *input.Cluster {
			continue
		}
		arn := task.Arn()
		if _, wanted := requested[arn]; filterByArn && !wanted {
			continue
		}
		described = append(described, &ecs.Task{
			TaskArn:           aws.String(arn),
			TaskDefinitionArn: aws.String(task.taskDefArn),
			HealthStatus:      aws.String(task.HealthStatus()),
		})
	}
	return &ecs.DescribeTasksOutput{Tasks: described}, nil
}

// Event Bridge mocks

type mockScheduledTask struct {
Expand Down Expand Up @@ -166,7 +238,7 @@ func NewMockEventBridgeClient(taskNames []string) *MockEventBridgeClient {
func (mc *MockEventBridgeClient) ListTargetsByRule(input *eventbridge.ListTargetsByRuleInput) (*eventbridge.ListTargetsByRuleOutput, error) {
t, ok := mc.tasks[*input.Rule]
if !ok {
return nil, errors.New("Rule not found")
return nil, errors.New("rule not found")
}

return &eventbridge.ListTargetsByRuleOutput{
Expand All @@ -183,7 +255,7 @@ func (mc *MockEventBridgeClient) ListTargetsByRule(input *eventbridge.ListTarget
func (mc *MockEventBridgeClient) PutTargets(input *eventbridge.PutTargetsInput) (*eventbridge.PutTargetsOutput, error) {
t, ok := mc.tasks[*input.Rule]
if !ok {
return nil, errors.New("Rule not found")
return nil, errors.New("rule not found")
}

t.taskDefARN = *input.Targets[0].EcsParameters.TaskDefinitionArn
Expand Down
14 changes: 7 additions & 7 deletions deploy/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,6 @@ func CheckDrained(services []*config.Service, ecsClient ecsiface.ECSAPI) []Resul

drained, err := awsecs.CheckDrain(service, ecsClient)
if err != nil {
// This should never error otherwise Deploy would have failed
// If this happens abort because it will never succeed
resultChan <- Result{service, err}
return
Expand All @@ -197,16 +196,18 @@ func CheckDrained(services []*config.Service, ecsClient ecsiface.ECSAPI) []Resul
}(s)
}

// Set of service names that succeeded
succeededServices := make(map[string]bool)
// Set of service names that finished the check
finishedServices := make(map[string]bool)
results := make([]Result, 0, len(services))

loop:
for i := 0; i < len(services); i++ {
select {
case result := <-resultChan:
log.Printf("Version %s successfully deployed to %s\n", color.Green(result.Service.Gitsha), color.Cyan(result.Service.Name))
succeededServices[result.Service.Name] = true
if result.Err != nil {
log.Printf("Version %s successfully deployed to %s\n", color.Green(result.Service.Gitsha), color.Cyan(result.Service.Name))
}
finishedServices[result.Service.Name] = true
results = append(results, result)
case <-time.After(timeoutDuration):
// Stop looping, anything that didn't succeed has now failed
Expand All @@ -216,8 +217,7 @@ loop:

// Figure out which, if any, services timed out
for _, s := range services {
succeeded := succeededServices[s.Name]
if !succeeded {
if finished := finishedServices[s.Name]; !finished {
result := Result{s, ErrTimedOut}
results = append(results, result)
}
Expand Down
79 changes: 79 additions & 0 deletions deploy/deploy_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package deploy_test

import (
"errors"
"fmt"
"net/http"
"net/http/httptest"
Expand Down Expand Up @@ -387,6 +388,20 @@ func TestCheckDrain(t *testing.T) {
"example-service",
gitsha,
)
mockClient.CreateMockTasks(
"arn:aws:ecs:us-east-1:123456:cluster/prod-cluster",
"example-production",
"arn:aws:ecs:us-east-1:123456:task-definition/example-production:1",
true,
2,
)
mockClient.CreateMockTasks(
"arn:aws:ecs:us-east-1:123456:cluster/prod-cluster",
"example-staging",
"arn:aws:ecs:us-east-1:123456:task-definition/example-staging:1",
true,
2,
)

expectedResults := []deploy.Result{
{
Expand Down Expand Up @@ -438,6 +453,70 @@ func TestCheckDrainFailed(t *testing.T) {
},
}

mockClient := awsecs.NewMockECSClient(
[]string{
"example-production",
"example-staging",
},
"example-service",
gitsha,
)
mockClient.CreateMockTasks(
"arn:aws:ecs:us-east-1:123456:cluster/prod-cluster",
"example-production",
"arn:aws:ecs:us-east-1:123456:task-definition/example-production:1",
false,
2,
)
mockClient.CreateMockTasks(
"arn:aws:ecs:us-east-1:123456:cluster/prod-cluster",
"example-staging",
"arn:aws:ecs:us-east-1:123456:task-definition/example-staging:1",
false,
2,
)
mockClient.SetServiceStatus("example-production", "ACTIVE")
mockClient.SetServiceStatus("example-staging", "ACTIVE")

results := deploy.CheckDrained(services, mockClient)
var gotServices []*config.Service
var errs []error
for _, r := range results {
gotServices = append(gotServices, r.Service)
errs = append(errs, r.Err)
}

assert.ElementsMatch(t, services, gotServices)

for _, err := range errs {
if !errors.Is(err, awsecs.ErrHealthcheckFailed) {
assert.Fail(t, "expected ErrHealthcheckFailed, got", err)
}
}
}

func TestCheckDrainTimedOut(t *testing.T) {
deploy.TimeoutDuration(1 * time.Second)
deploy.CheckIntervalDuration(250 * time.Millisecond)

gitsha := "da39a3ee5e6b4b0d3255bfef95601890afd80709"
services := []*config.Service{
{
Name: "example-production",
Gitsha: gitsha,
Cluster: "arn:aws:ecs:us-east-1:123456:cluster/prod-cluster",
URL: "https://example.touchbistro.io/ping",
TaskDefinitionARN: "arn:aws:ecs:us-east-1:123456:task-definition/example-production:1",
},
{
Name: "example-staging",
Gitsha: gitsha,
Cluster: "arn:aws:ecs:us-east-1:123456:cluster/non-prod-cluster",
URL: "https://staging.example.touchbistro.io/ping",
TaskDefinitionARN: "arn:aws:ecs:us-east-1:123456:task-definition/example-staging:1",
},
}

mockClient := awsecs.NewMockECSClient(
[]string{
"example-production",
Expand Down
Loading

0 comments on commit fce0d62

Please sign in to comment.