From 58c076c8cd203edd165855d8d360703a9f1df168 Mon Sep 17 00:00:00 2001
From: William Shelley
Date: Tue, 8 Oct 2024 11:06:18 +0100
Subject: [PATCH] feat: create monitoring in cloudwatch for aws batch failures (#367)

* feat: add eventbridge rule and sns topic

* fix: tflint and add data source

* fix: event pattern

* docs: update Terraform docs

---------

Co-authored-by: github-actions[bot]
---
Review note: the SNS topic policy in this patch was amended so that
events.amazonaws.com is allowed sns:Publish (scoped to the batch-fail rule)
instead of sns:Subscribe/sns:Receive, and batch.tf keeps its trailing newline.
The blob ids on the "index" lines are those of the original commit.

 infra/terraform/modules/service/README.md |  3 +
 infra/terraform/modules/service/batch.tf  | 89 ++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/infra/terraform/modules/service/README.md b/infra/terraform/modules/service/README.md
index 3d6f53398e..55aee575be 100644
--- a/infra/terraform/modules/service/README.md
+++ b/infra/terraform/modules/service/README.md
@@ -22,9 +22,11 @@
 | [ecs\_cluster](#module\_ecs\_cluster) | terraform-aws-modules/ecs/aws//modules/cluster | ~> 5.10 |
 | [ecs\_service](#module\_ecs\_service) | terraform-aws-modules/ecs/aws//modules/service | ~> 5.10 |
 | [eventbridge](#module\_eventbridge) | terraform-aws-modules/eventbridge/aws | ~> 3.7 |
+| [eventbridge\_sns](#module\_eventbridge\_sns) | terraform-aws-modules/eventbridge/aws | ~> 3.7 |
 | [log\_bucket](#module\_log\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 4.0 |
 | [records](#module\_records) | terraform-aws-modules/route53/aws//modules/records | ~> 4.0 |
 | [route53\_records](#module\_route53\_records) | terraform-aws-modules/acm/aws | ~> 5.0 |
+| [sns\_batch\_fail](#module\_sns\_batch\_fail) | terraform-aws-modules/sns/aws | ~> 6.1 |
 
 ## Resources
 
@@ -34,6 +36,7 @@
 | [aws_cloudwatch_log_group.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
 | [aws_lb_listener_rule.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lb_listener_rule) | resource |
 | [aws_lb_target_group.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lb_target_group) | resource |
+| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
 | [aws_canonical_user_id.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/canonical_user_id) | data source |
 | [aws_cloudfront_log_delivery_canonical_user_id.cloudfront](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/cloudfront_log_delivery_canonical_user_id) | data source |
 | [aws_route53_zone.public](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/route53_zone) | data source |
diff --git a/infra/terraform/modules/service/batch.tf b/infra/terraform/modules/service/batch.tf
index 7beb1d3b95..c48ee12243 100644
--- a/infra/terraform/modules/service/batch.tf
+++ b/infra/terraform/modules/service/batch.tf
@@ -1,3 +1,6 @@
+# Account ID of the caller; used to build the SNS topic policy principal.
+data "aws_caller_identity" "current" {}
+
 locals {
   default_retry_policy = {
     attempts = 1
@@ -155,9 +158,95 @@ module "eventbridge" {
   }
 
   schedules = local.schedules
+}
+
+# Forwards failed AWS Batch job events for this environment to an SNS topic.
+module "eventbridge_sns" {
+  source  = "terraform-aws-modules/eventbridge/aws"
+  version = "~> 3.7"
+
+  # Do not create a new event bus; the rule is attached to an existing one.
+  create_bus = false
+
+  role_name = "vol-app-${var.environment}-batch-fail-role"
+
+  rules = {
+    batch-fail-sns = {
+      name        = "${var.environment}-batch-fail-event"
+      description = "Capture failed Batch Events sent to SNS"
+      # Match only FAILED state changes of jobs named vol-app-<environment>-*.
+      event_pattern = jsonencode({
+        "source" : ["aws.batch"],
+        "detail-type" : ["Batch Job State Change"],
+        "detail" : {
+          "status" : [
+            "FAILED"
+          ],
+          "jobName" : [{
+            "wildcard" : "vol-app-${var.environment}-*"
+          }]
+        }
+      })
+      enabled = true
+    }
+  }
+
+  targets = {
+    batch-fail-sns = [
+      {
+        name = "batch-fail-event"
+        arn  = module.sns_batch_fail.topic_arn
+      }
+    ]
+  }
+}
+
+# SNS topic that receives the failed Batch job events from the rule above.
+module "sns_batch_fail" {
+  source  = "terraform-aws-modules/sns/aws"
+  version = "~> 6.1"
+
+  name            = "vol-app-${var.environment}-batch-fail-topic"
+  use_name_prefix = true
+  display_name    = "batch-event-failed"
+
+  create_topic_policy         = true
+  enable_default_topic_policy = true
+  topic_policy_statements = {
+    pub = {
+      actions = ["sns:Publish"]
+      principals = [{
+        type = "AWS"
+        identifiers = [
+          "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"
+        ]
+      }]
+    },
+
+    # EventBridge delivers to SNS through the topic's resource policy, so the
+    # events service principal needs sns:Publish, limited to the batch-fail rule.
+    events = {
+      actions = ["sns:Publish"]
+
+      principals = [{
+        type        = "Service"
+        identifiers = ["events.amazonaws.com"]
+      }]
+
+      conditions = [{
+        test     = "ArnLike"
+        variable = "aws:SourceArn"
+        values   = [module.eventbridge_sns.eventbridge_rule_arns["batch-fail-sns"]]
+      }]
+    }
+  }
+
+  tags = {
+    "Name" = "vol-app-${var.environment}-aws-sns-batch-fail"
+  }
 }
 
 resource "aws_cloudwatch_log_group" "this" {
   name              = "/aws/batch/vol-app-${var.environment}"
   retention_in_days = 1
 }