From c09442ab1f2a40b05a2f7c7c6bc2dacd4e6ff171 Mon Sep 17 00:00:00 2001
From: James Green <james.green6@digital.justice.gov.uk>
Date: Mon, 9 Oct 2023 12:05:46 +0100
Subject: [PATCH] Added new max scaling metric to ECS

During load testing we found that the average CPU metric takes too long
to reach the threshold, so we added a new "max" metric. This should reach
the threshold sooner when there is a spike in traffic, allowing the
service to scale up earlier.
---
 modules/dns/ecs_auto_scaling.tf | 57 +++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 6 deletions(-)

diff --git a/modules/dns/ecs_auto_scaling.tf b/modules/dns/ecs_auto_scaling.tf
index f6b88143..f40db42d 100644
--- a/modules/dns/ecs_auto_scaling.tf
+++ b/modules/dns/ecs_auto_scaling.tf
@@ -6,8 +6,8 @@ resource "aws_appautoscaling_target" "auth_ecs_target" {
   scalable_dimension = "ecs:service:DesiredCount"
 }
 
-resource "aws_appautoscaling_policy" "ecs_policy_up" {
-  name               = "ECS Scale Up"
+resource "aws_appautoscaling_policy" "ecs_policy_up_average" {
+  name               = "ECS Scale Up Average"
   service_namespace  = "ecs"
   policy_type        = "StepScaling"
   resource_id        = "service/${aws_ecs_cluster.server_cluster.name}/${aws_ecs_service.service.name}"
@@ -26,6 +26,27 @@ resource "aws_appautoscaling_policy" "ecs_policy_up" {
   depends_on = [aws_appautoscaling_target.auth_ecs_target]
 }
 
+resource "aws_appautoscaling_policy" "ecs_policy_up_max" { # step-scaling policy invoked by the maximum-CPU alarm (ecs_cpu_maximum_alarm_high)
+  name               = "ECS Scale Up Max"
+  service_namespace  = "ecs"
+  policy_type        = "StepScaling"
+  resource_id        = "service/${aws_ecs_cluster.server_cluster.name}/${aws_ecs_service.service.name}"
+  scalable_dimension = "ecs:service:DesiredCount"
+
+  step_scaling_policy_configuration {
+    adjustment_type         = "ChangeInCapacity" # scaling_adjustment is a delta applied to the current desired count
+    metric_aggregation_type = "Maximum" # aggregate on the peak datapoint rather than the average
+    cooldown                = 300 # seconds
+
+    step_adjustment {
+      metric_interval_lower_bound = 0 # no upper bound set: this step covers any breach at or above the alarm threshold
+      scaling_adjustment          = 1 # add one task each time the policy fires
+    }
+  }
+
+  depends_on = [aws_appautoscaling_target.auth_ecs_target] # create only after the scalable target is registered
+}
+
 resource "aws_appautoscaling_policy" "ecs_policy_down" {
   name               = "ECS Scale Down"
   service_namespace  = "ecs"
@@ -46,8 +67,8 @@ resource "aws_appautoscaling_policy" "ecs_policy_down" {
   depends_on = [aws_appautoscaling_target.auth_ecs_target]
 }
 
-resource "aws_cloudwatch_metric_alarm" "ecs_cpu_alarm_high" {
-  alarm_name          = "${var.prefix}-ecs-cpu-alarm-high"
+resource "aws_cloudwatch_metric_alarm" "ecs_cpu_average_alarm_high" {
+  alarm_name          = "${var.prefix}-ecs-cpu-average-alarm-high"
   comparison_operator = "GreaterThanOrEqualToThreshold"
   evaluation_periods  = "1"
   metric_name         = "CPUUtilization"
@@ -61,10 +82,34 @@ resource "aws_cloudwatch_metric_alarm" "ecs_cpu_alarm_high" {
     ServiceName = aws_ecs_service.service.name
   }
 
-  alarm_description = "This alarm tells ECS to scale up based on high CPU"
+  alarm_description = "This alarm tells ECS to scale up based on average high CPU"
+
+  alarm_actions = [
+      aws_appautoscaling_policy.ecs_policy_up_average.arn
+  ]
+
+  treat_missing_data = "breaching"
+}
+
+resource "aws_cloudwatch_metric_alarm" "ecs_cpu_maximum_alarm_high" {
+  alarm_name          = "${var.prefix}-ecs-cpu-maximum-alarm-high"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ECS"
+  period              = "60"
+  statistic           = "Maximum"
+  threshold           = "80"
+
+  dimensions = {
+    ClusterName = aws_ecs_cluster.server_cluster.name
+    ServiceName = aws_ecs_service.service.name
+  }
+
+  alarm_description = "This alarm tells ECS to scale up based on maximum high CPU"
 
   alarm_actions = [
-    aws_appautoscaling_policy.ecs_policy_up.arn
+      aws_appautoscaling_policy.ecs_policy_up_max.arn
   ]
 
   treat_missing_data = "breaching"