Skip to content

Commit

Permalink
🔧 Update delius-core alarms (#7549)
Browse files Browse the repository at this point in the history
  • Loading branch information
georgepstaylor authored Aug 19, 2024
1 parent 78a531e commit 554f673
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,13 @@ resource "aws_cloudwatch_metric_alarm" "memory_over_threshold" {
return_data = true
expression = "ANOMALY_DETECTION_BAND(m1, 20)"
}

}

resource "aws_cloudwatch_log_metric_filter" "log_error_filter" {
name = "ldap-${var.env_name}-error"
pattern = "%err=[1-9][0-9]+%"
name = "ldap-${var.env_name}-error"
pattern = "%${join("|", local.formatted_error_codes)}%"

log_group_name = aws_cloudwatch_log_group.ldap_ecs.name

metric_transformation {
Expand All @@ -93,7 +95,7 @@ resource "aws_cloudwatch_metric_alarm" "high_error_volume" {
namespace = "ldapMetrics"
metric_name = "ErrorCount"
statistic = "Sum"
period = "300"
period = "600"
evaluation_periods = "1"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
Expand All @@ -102,21 +104,6 @@ resource "aws_cloudwatch_metric_alarm" "high_error_volume" {
comparison_operator = "GreaterThanThreshold"
}

# Early-warning LDAP error alarm: sums ldapMetrics/ErrorCount over a single
# 120-second period and goes into ALARM when that sum is greater than 5.
resource "aws_cloudwatch_metric_alarm" "warning_error_volume" {
  alarm_name        = "ldap-${var.env_name}-warning-error-count"
  alarm_description = "Triggers alarm if there are more than 5 errors in the last 2 minutes"

  # Metric being watched.
  namespace   = "ldapMetrics"
  metric_name = "ErrorCount"
  statistic   = "Sum"

  # Evaluation: one 120-second period, strictly greater than 5.
  period              = "120"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "5"
  treat_missing_data  = "missing"

  # Entering ALARM and returning to OK both notify the same SNS topic.
  ok_actions    = [var.sns_topic_arn]
  alarm_actions = [var.sns_topic_arn]
}

resource "aws_cloudwatch_metric_alarm" "ecs_running_tasks_less_than_one" {
alarm_name = "ldap-${var.env_name}-no-running-tasks"
actions_enabled = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,15 @@ locals {
domain_type_sub = [for k, v in local.domain_types : v.type if k != "modernisation-platform.service.justice.gov.uk"]

certificate_arn = aws_acm_certificate.external.arn
}

# Numeric error codes that should be counted as errors in the LDAP logs
# (presumably LDAP result codes — confirm against the directory's documentation).
# The order is kept as-is because the list is turned into pattern alternatives
# in this same order.
error_codes = [
  1, 2, 3, 4, 5, 6, 7, 8,
  10, 11, 12, 13, 14,
  16, 17, 18, 19, 20, 21,
  33, 34, 35, 36,
  48, 49, 50, 51, 52, 53, 54,
  60, 61,
  64, 65, 66, 67, 68, 69, 70, 71,
  76,
  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
  100, 101,
  112, 113, 114,
  118, 119, 120, 121, 122, 123,
  4096,
  16654,
]

# One regex alternative per code, e.g. "err=49\s". The trailing \s (written as
# \\s inside an HCL string) requires a whitespace character after the number,
# so "err=1" cannot match the beginning of "err=10".
formatted_error_codes = [for code in local.error_codes : "err=${code}\\s"]
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,21 @@ locals {
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
}
# Alarm on the AWS/EC2 StatusCheckFailed_AttachedEBS metric (Maximum over
# 60-second periods, alarming at >= 1).
#
# This entry previously defined evaluation_periods and datapoints_to_alarm
# twice ("60"/"1" first, then "10"/"10"). In an HCL object constructor the
# later definition of a repeated key is the one that takes effect, so the
# earlier, shadowed pair has been removed and the effective pair is kept:
# the alarm fires when 10 out of the last 10 one-minute datapoints breach.
#
# NOTE(review): alarm_description says "within the last hour", which matches
# the removed 60 x 60s / 1-datapoint pair rather than the effective
# 10 x 60s / 10-datapoint window — confirm which window is intended and align
# the description or the values accordingly.
status-check-failed-attached-ebs = {
  comparison_operator = "GreaterThanOrEqualToThreshold"
  metric_name         = "StatusCheckFailed_AttachedEBS"
  namespace           = "AWS/EC2"
  period              = "60"
  statistic           = "Maximum"
  threshold           = "1"
  datapoints_to_alarm = "10"
  evaluation_periods  = "10"
  alarm_description   = "Triggers if there has been a status check failure for attached EBS volumes within the last hour."
  alarm_actions       = [var.sns_topic_arn]
  ok_actions          = [var.sns_topic_arn]
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ resource "aws_iam_role_policy_attachment" "rds_enhanced_monitoring" {

data "aws_iam_policy_document" "rds_enhanced_monitoring" {
# NOTE(review): the fallback condition joins its two checks with `||`. No value
# is both null and 0, so `!= null || != 0` is true for every input and count
# evaluates to 1 whether or not var.create_rds is set. `&&` was presumably
# intended — confirm before changing, since resources outside this view may
# reference this data source with [0].
count = var.create_rds ? 1 : (var.rds_monitoring_interval != null || var.rds_monitoring_interval != 0 ? 1 : 0)

statement {
actions = [
"sts:AssumeRole",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,17 @@ resource "aws_cloudwatch_metric_alarm" "rds_cpu_over_threshold" {
)
}

resource "aws_cloudwatch_metric_alarm" "ram_over_threshold" {
resource "aws_cloudwatch_metric_alarm" "rds_memory_over_threshold" {
count = var.create_rds ? 1 : 0
alarm_name = "${var.name}-rds-ram-threshold"
alarm_description = "Triggers alarm if RDS RAM crosses a threshold"
alarm_name = "${var.name}-rds-memory-threshold"
alarm_description = "Triggers alarm if RDS Memory crosses a threshold"
namespace = "AWS/RDS"
metric_name = "FreeableMemory"
statistic = "Average"
period = "60"
evaluation_periods = "10"
# add sns topic later
# alarm_actions = [aws_sns_topic.alerting.arn]
# ok_actions = [aws_sns_topic.alerting.arn]
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
threshold = "800000000"
treat_missing_data = "missing"
comparison_operator = "LessThanThreshold"
Expand All @@ -54,7 +53,7 @@ resource "aws_cloudwatch_metric_alarm" "ram_over_threshold" {
)
}

resource "aws_cloudwatch_metric_alarm" "read_latency_over_threshold" {
resource "aws_cloudwatch_metric_alarm" "rds_read_latency_over_threshold" {
count = var.create_rds ? 1 : 0
alarm_name = "${var.name}-rds-read-latency-threshold"
alarm_description = "Triggers alarm if RDS read latency crosses a threshold"
Expand All @@ -63,9 +62,8 @@ resource "aws_cloudwatch_metric_alarm" "read_latency_over_threshold" {
statistic = "Average"
period = "60"
evaluation_periods = "5"
# add sns topic later
# alarm_actions = [aws_sns_topic.alerting.arn]
# ok_actions = [aws_sns_topic.alerting.arn]
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
threshold = "5"
treat_missing_data = "missing"
comparison_operator = "GreaterThanThreshold"
Expand All @@ -82,7 +80,7 @@ resource "aws_cloudwatch_metric_alarm" "read_latency_over_threshold" {
)
}

resource "aws_cloudwatch_metric_alarm" "write_latency_over_threshold" {
resource "aws_cloudwatch_metric_alarm" "rds_write_latency_over_threshold" {
count = var.create_rds ? 1 : 0
alarm_name = "${var.name}-rds-write-latency-threshold"
alarm_description = "Triggers alarm if RDS write latency crosses a threshold"
Expand All @@ -91,9 +89,8 @@ resource "aws_cloudwatch_metric_alarm" "write_latency_over_threshold" {
statistic = "Average"
period = "60"
evaluation_periods = "5"
# add sns topic later
# alarm_actions = [aws_sns_topic.alerting.arn]
# ok_actions = [aws_sns_topic.alerting.arn]
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
threshold = "5"
treat_missing_data = "missing"
comparison_operator = "GreaterThanThreshold"
Expand All @@ -110,7 +107,7 @@ resource "aws_cloudwatch_metric_alarm" "write_latency_over_threshold" {
)
}

resource "aws_cloudwatch_metric_alarm" "db_connections_over_threshold" {
resource "aws_cloudwatch_metric_alarm" "rds_connections_over_threshold" {
count = var.create_rds ? 1 : 0
alarm_name = "${var.name}-rds-db-connections-threshold"
alarm_description = "Triggers alarm if RDS database connections crosses a threshold"
Expand All @@ -119,9 +116,8 @@ resource "aws_cloudwatch_metric_alarm" "db_connections_over_threshold" {
statistic = "Average"
period = "60"
evaluation_periods = "5"
# add sns topic later
# alarm_actions = [aws_sns_topic.alerting.arn]
# ok_actions = [aws_sns_topic.alerting.arn]
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
threshold = "100"
treat_missing_data = "missing"
comparison_operator = "GreaterThanThreshold"
Expand All @@ -138,18 +134,17 @@ resource "aws_cloudwatch_metric_alarm" "db_connections_over_threshold" {
)
}

resource "aws_cloudwatch_metric_alarm" "db_queue_depth_over_threshold" {
resource "aws_cloudwatch_metric_alarm" "rds_allocated_storage_queue_depth_over_threshold" {
count = var.create_rds ? 1 : 0
alarm_name = "${var.name}-rds-db-queue-depth-threshold"
alarm_name = "${var.name}-rds-queue-depth-threshold"
alarm_description = "Triggers alarm if RDS database queue depth crosses a threshold"
namespace = "AWS/RDS"
metric_name = "DiskQueueDepth"
statistic = "Average"
period = "300"
evaluation_periods = "5"
# add sns topic later
# alarm_actions = [aws_sns_topic.alerting.arn]
# ok_actions = [aws_sns_topic.alerting.arn]
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
threshold = "60"
treat_missing_data = "missing"
comparison_operator = "GreaterThanThreshold"
Expand All @@ -165,3 +160,31 @@ resource "aws_cloudwatch_metric_alarm" "db_queue_depth_over_threshold" {
}
)
}

# Low-memory alarm for the RDS instance: goes into ALARM when the one-minute
# average of AWS/RDS FreeableMemory is below the threshold for 15 of the last
# 15 periods. Only created when var.create_rds is true.
# NOTE(review): rds_memory_over_threshold in this file also watches
# FreeableMemory with the same 800000000 threshold — confirm both alarms are
# wanted.
resource "aws_cloudwatch_metric_alarm" "rds_freeable_memory_less_than_threshold" {
  count = var.create_rds ? 1 : 0

  alarm_name        = "${var.name}-rds-freeable-memory-threshold"
  alarm_description = "Triggers alarm if RDS freeable memory crosses a threshold"

  # Metric and the instance it is scoped to.
  namespace   = "AWS/RDS"
  metric_name = "FreeableMemory"
  dimensions = {
    DBInstanceIdentifier = aws_db_instance.this[0].identifier
  }

  # Evaluation: 15 x 60-second periods, all 15 datapoints must breach.
  statistic           = "Average"
  period              = "60"
  evaluation_periods  = "15"
  datapoints_to_alarm = 15
  comparison_operator = "LessThanThreshold"
  threshold           = "800000000"
  treat_missing_data  = "missing"

  # Entering ALARM and returning to OK both notify the same SNS topic.
  ok_actions    = [var.sns_topic_arn]
  alarm_actions = [var.sns_topic_arn]

  tags = merge(var.tags, { Name = var.name })
}

0 comments on commit 554f673

Please sign in to comment.