Skip to content

Commit

Permalink
[terraform] create backup cloudwatch alarms
Browse files Browse the repository at this point in the history
Summary:
This creates the backup CloudWatch alarms that were added to backup in https://phab.comm.dev/D13942

Depends on D13943

Test Plan:
Triggered alarms by including tracing error logs with each newly implemented `errorType` in the `main.rs` file in backup. Confirmed that alarms were triggered and emails were sent.

Linear issues were also created for error log alarms.
{F3248170}

Reviewers: bartek, varun

Reviewed By: bartek, varun

Subscribers: ashoat, tomek

Differential Revision: https://phab.comm.dev/D13944
  • Loading branch information
wyilio committed Nov 18, 2024
1 parent b93d103 commit 9fe6ac4
Showing 1 changed file with 82 additions and 0 deletions.
82 changes: 82 additions & 0 deletions services/terraform/remote/alarms_backup.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Error categories reported by the backup service. The metric filter and the
# error alarm in this file both iterate over this map with `for_each`:
#   name    - spliced into the generated filter, metric and alarm names
#   pattern - exact `errorType` value matched in the service's log events
locals {
  backup_error_patterns = {
    Auth = {
      name    = "Auth"
      pattern = "Auth Error"
    }
    Blob = {
      name    = "Blob"
      pattern = "Blob Error"
    }
    DDB = {
      name    = "DDB"
      pattern = "DDB Error"
    }
    WS = {
      name    = "WS"
      pattern = "WS Error"
    }
  }
}

# SNS topic used as the notification target (`alarm_actions`) of every backup
# alarm declared in this file.
resource "aws_sns_topic" "backup_error_topic" {
  name = "backup-error-topic"
}

# Delivers messages published to the backup error topic by email to the
# address held in `local.error_reports_subscribed_email` (defined outside this
# file).
resource "aws_sns_topic_subscription" "backup_email_subscription" {
  protocol  = "email"
  endpoint  = local.error_reports_subscribed_email
  topic_arn = aws_sns_topic.backup_error_topic.arn
}

# One log metric filter per entry in `local.backup_error_patterns`.
# Each filter scans the backup service log group for JSON log events whose
# `level` is "ERROR" and whose `fields.errorType` equals the entry's pattern,
# and records a value of 1 for every matching event.
resource "aws_cloudwatch_log_metric_filter" "backup_error_filters" {
  for_each = local.backup_error_patterns

  log_group_name = "/ecs/backup-service-task-def"
  name           = "Backup${each.value.name}ErrorCount"
  pattern        = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"

  # The emitted metric shares its name with the filter itself.
  metric_transformation {
    namespace = "BackupServiceMetricFilters"
    name      = "Backup${each.value.name}ErrorCount"
    value     = "1"
  }
}

# One alarm per entry in `local.backup_error_patterns`. An alarm fires when at
# least one matching error was logged within a single 5-minute window and
# notifies the backup error SNS topic.
resource "aws_cloudwatch_metric_alarm" "backup_error_alarms" {
  for_each = local.backup_error_patterns

  # Staging/Production is encoded in the name so the two environments'
  # alarms can be told apart in notifications.
  alarm_name        = "Backup${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
  alarm_description = "Alarm when Backup ${each.value.name} errors exceed threshold"

  # Take the metric identity from the matching filter instead of repeating the
  # string literals: the alarm can then never drift from the filter's metric
  # name/namespace, and Terraform creates the filter before the alarm.
  metric_name = aws_cloudwatch_log_metric_filter.backup_error_filters[each.key].metric_transformation[0].name
  namespace   = aws_cloudwatch_log_metric_filter.backup_error_filters[each.key].metric_transformation[0].namespace

  comparison_operator = "GreaterThanOrEqualToThreshold"
  statistic           = "Sum"
  threshold           = 1
  period              = 300
  evaluation_periods  = 1

  # The filter publishes no datapoints while nothing matches. Treat that as
  # healthy so the alarm rests in OK rather than INSUFFICIENT_DATA.
  treat_missing_data = "notBreaching"

  actions_enabled = true
  alarm_actions   = [aws_sns_topic.backup_error_topic.arn]
}

# Fires when the backup ECS service's average memory utilization over a single
# 60-second period is at or above 90%, and notifies the backup error SNS topic.
resource "aws_cloudwatch_metric_alarm" "backup_memory_utilization" {
  alarm_name        = "BackupMemoryUtilizationAlarm"
  alarm_description = "Alarm when Backup service memory utilization exceeds 90%"

  # Metric to watch, scoped to the backup service within the services cluster.
  namespace   = "AWS/ECS"
  metric_name = "MemoryUtilization"
  dimensions = {
    ClusterName = aws_ecs_cluster.comm_services.name
    ServiceName = aws_ecs_service.backup_service.name
  }

  # Evaluation rule: one 60-second average datapoint >= 90.
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 90

  alarm_actions = [aws_sns_topic.backup_error_topic.arn]
}

# Fires when the backup ECS service's average CPU utilization over a single
# 60-second period is at or above 90%, and notifies the backup error SNS topic.
resource "aws_cloudwatch_metric_alarm" "backup_cpu_utilization" {
  alarm_name        = "BackupCPUUtilizationAlarm"
  alarm_description = "Alarm when Backup service CPU utilization exceeds 90%"

  # Metric to watch, scoped to the backup service within the services cluster.
  namespace   = "AWS/ECS"
  metric_name = "CPUUtilization"
  dimensions = {
    ClusterName = aws_ecs_cluster.comm_services.name
    ServiceName = aws_ecs_service.backup_service.name
  }

  # Evaluation rule: one 60-second average datapoint >= 90.
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 90

  alarm_actions = [aws_sns_topic.backup_error_topic.arn]
}

0 comments on commit 9fe6ac4

Please sign in to comment.