From 9b4eb802464d4f4127482b69f2b245250ab77982 Mon Sep 17 00:00:00 2001 From: velotioaastha Date: Mon, 8 Jul 2024 18:54:55 -0400 Subject: [PATCH 1/2] Implement autoscaling in Kubernetes for Terraform EKS --- deployment-size.tf | 50 ++++++++++++++++++------------- main.tf | 34 +++++++++++---------- modules/app_eks/add-ons.tf | 38 +++++++++++------------ modules/app_eks/iam-policies.tf | 4 +-- modules/app_eks/iam-roles.tf | 8 ++--- modules/app_eks/main.tf | 4 +-- modules/app_eks/variables.tf | 12 ++++++++ modules/app_lb/outputs.tf | 4 +-- modules/database/main.tf | 2 +- modules/endpoint/main.tf | 10 +++---- modules/endpoint/variables.tf | 10 +++---- modules/iam_role/main.tf | 14 ++++----- modules/iam_role/variables.tf | 2 +- modules/private_link/main.tf | 10 +++---- modules/private_link/variables.tf | 4 +-- variables.tf | 24 +++++++++++---- 16 files changed, 133 insertions(+), 97 deletions(-) diff --git a/deployment-size.tf b/deployment-size.tf index f6aedbe6..68f80dd0 100644 --- a/deployment-size.tf +++ b/deployment-size.tf @@ -6,34 +6,44 @@ locals { deployment_size = { small = { - db = "db.r6g.large", - node_count = 2, - node_instance = "r6i.xlarge" - cache = "cache.m6g.large" + db = "db.r6g.large", + node_count = 2, + node_instance = "r6i.xlarge", + cache = "cache.m6g.large", + min_node_count = 1, + max_node_count = 3 }, medium = { - db = "db.r6g.xlarge", - node_count = 2, - node_instance = "r6i.xlarge" - cache = "cache.m6g.large" + db = "db.r6g.xlarge", + node_count = 2, + node_instance = "r6i.xlarge", + cache = "cache.m6g.large", + min_node_count = 1, + max_node_count = 4 }, large = { - db = "db.r6g.2xlarge", - node_count = 2, - node_instance = "r6i.2xlarge" - cache = "cache.m6g.xlarge" + db = "db.r6g.2xlarge", + node_count = 2, + node_instance = "r6i.2xlarge", + cache = "cache.m6g.xlarge", + min_node_count = 2, + max_node_count = 6 }, xlarge = { - db = "db.r6g.4xlarge", - node_count = 3, - node_instance = "r6i.2xlarge" - cache = "cache.m6g.xlarge" + db = "db.r6g.4xlarge", + node_count = 3, + node_instance = "r6i.2xlarge", + cache = "cache.m6g.xlarge", + min_node_count = 2, + max_node_count = 8 }, xxlarge = { - db = "db.r6g.8xlarge", - node_count = 3, - node_instance = "r6i.4xlarge" - cache = "cache.m6g.2xlarge" + db = "db.r6g.8xlarge", + node_count = 3, + node_instance = "r6i.4xlarge", + cache = "cache.m6g.2xlarge", + min_node_count = 3, + max_node_count = 10 } } } \ No newline at end of file diff --git a/main.tf b/main.tf index dfd091ec..3a4ec1e2 100644 --- a/main.tf +++ b/main.tf @@ -128,6 +128,8 @@ module "app_eks" { instance_types = try([local.deployment_size[var.size].node_instance], var.kubernetes_instance_types) desired_capacity = try(local.deployment_size[var.size].node_count, var.kubernetes_node_count) + min_capacity = try(local.deployment_size[var.size].min_node_count, var.min_node_count) + max_capacity = try(local.deployment_size[var.size].max_node_count, var.max_node_count) map_accounts = var.kubernetes_map_accounts map_roles = var.kubernetes_map_roles map_users = var.kubernetes_map_users @@ -370,12 +372,12 @@ module "wandb" { # To support otel rds and redis metrics need operator-wandb chart minimum version 0.13.8 ( yace subchart) yace = var.enable_yace ? { - install = true - regions = [data.aws_region.current.name] - serviceAccount = { annotations = { "eks.amazonaws.com/role-arn" = module.iam_role[0].role_arn} } - } : { - install = false - regions = [] + install = true + regions = [data.aws_region.current.name] + serviceAccount = { annotations = { "eks.amazonaws.com/role-arn" = module.iam_role[0].role_arn } } + } : { + install = false + regions = [] serviceAccount = {} } @@ -386,13 +388,13 @@ module "wandb" { prometheus = { config = { scrape_configs = [ - { job_name = "yace" - scheme = "http" - metrics_path = "/metrics" + { job_name = "yace" + scheme = "http" + metrics_path = "/metrics" dns_sd_configs = [ { names = ["yace"] - type = "A" - port = 5000 + type = "A" + port = 5000 } ] } @@ -408,11 +410,11 @@ module "wandb" { } } } - } : { config = { - receivers = {} - service = {} - } - } + } : { config = { + receivers = {} + service = {} + } + } } mysql = { install = false } diff --git a/modules/app_eks/add-ons.tf b/modules/app_eks/add-ons.tf index 56503d6c..19f2ce56 100644 --- a/modules/app_eks/add-ons.tf +++ b/modules/app_eks/add-ons.tf @@ -32,43 +32,43 @@ resource "aws_iam_role" "oidc" { ### add-ons for eks version 1.28 resource "aws_eks_addon" "aws_efs_csi_driver" { - depends_on = [ - aws_eks_addon.vpc_cni - ] - cluster_name = var.namespace - addon_name = "aws-efs-csi-driver" - addon_version = "v2.0.4-eksbuild.1" - resolve_conflicts = "OVERWRITE" + depends_on = [ + aws_eks_addon.vpc_cni + ] + cluster_name = var.namespace + addon_name = "aws-efs-csi-driver" + addon_version = "v2.0.4-eksbuild.1" + resolve_conflicts = "OVERWRITE" } resource "aws_eks_addon" "aws_ebs_csi_driver" { depends_on = [ aws_eks_addon.vpc_cni ] - cluster_name = var.namespace - addon_name = "aws-ebs-csi-driver" - addon_version = "v1.31.0-eksbuild.1" - resolve_conflicts = "OVERWRITE" + cluster_name = var.namespace + addon_name = "aws-ebs-csi-driver" + addon_version = "v1.31.0-eksbuild.1" + resolve_conflicts = "OVERWRITE" } resource "aws_eks_addon" "coredns" { depends_on = [ aws_eks_addon.vpc_cni ] - cluster_name = var.namespace - addon_name = "coredns" - addon_version = "v1.10.1-eksbuild.11" - resolve_conflicts = "OVERWRITE" + cluster_name = var.namespace + addon_name = "coredns" + addon_version = "v1.10.1-eksbuild.11" + resolve_conflicts = "OVERWRITE" } resource "aws_eks_addon" "kube_proxy" { depends_on = [ aws_eks_addon.vpc_cni ] - cluster_name = var.namespace - addon_name = "kube-proxy" - addon_version = "v1.28.8-eksbuild.5" - resolve_conflicts = "OVERWRITE" + cluster_name = var.namespace + addon_name = "kube-proxy" + addon_version = "v1.28.8-eksbuild.5" + resolve_conflicts = "OVERWRITE" } resource "aws_eks_addon" "vpc_cni" { diff --git a/modules/app_eks/iam-policies.tf b/modules/app_eks/iam-policies.tf index 6ce0528a..f5e3d134 100644 --- a/modules/app_eks/iam-policies.tf +++ b/modules/app_eks/iam-policies.tf @@ -53,8 +53,8 @@ resource "aws_iam_policy" "irsa" { Version = "2012-10-17" Statement = [ { - Effect = "Allow" - Action = [ + Effect = "Allow" + Action = [ "s3:*", "kms:*", ] diff --git a/modules/app_eks/iam-roles.tf b/modules/app_eks/iam-roles.tf index 9654b4ce..fd2dfc4d 100644 --- a/modules/app_eks/iam-roles.tf +++ b/modules/app_eks/iam-roles.tf @@ -8,15 +8,15 @@ resource "aws_iam_role" "node" { resource "aws_iam_role" "irsa" { name = "${var.namespace}-irsa-role" assume_role_policy = jsonencode({ - Version = "2012-10-17" + Version = "2012-10-17" Statement = [ { - Sid = "" - Effect = "Allow" + Sid = "" + Effect = "Allow" Principal = { Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${aws_iam_openid_connect_provider.eks.url}" } - Action = "sts:AssumeRoleWithWebIdentity" + Action = "sts:AssumeRoleWithWebIdentity" Condition = { StringLike = { "${aws_iam_openid_connect_provider.eks.url}:sub" = "system:serviceaccount:${var.namespace}:*" diff --git a/modules/app_eks/main.tf b/modules/app_eks/main.tf index 9210ad9d..1e1ec60e 100644 --- a/modules/app_eks/main.tf +++ b/modules/app_eks/main.tf @@ -55,10 +55,10 @@ module "eks" { iam_role_arn = aws_iam_role.node.arn, instance_types = var.instance_types, kubelet_extra_args = local.system_reserved != "" ? "--system-reserved=${local.system_reserved}" : "", - max_capacity = 5, + max_capacity = var.max_capacity, metadata_http_put_response_hop_limit = 2 metadata_http_tokens = "required", - min_capacity = var.desired_capacity, + min_capacity = var.min_capacity, version = var.cluster_version, } } diff --git a/modules/app_eks/variables.tf b/modules/app_eks/variables.tf index 64e6df6e..392dccb4 100644 --- a/modules/app_eks/variables.tf +++ b/modules/app_eks/variables.tf @@ -122,6 +122,18 @@ variable "desired_capacity" { default = 2 } +variable "min_capacity" { + description = "Minimum number of worker nodes." + type = number + default = 1 +} + +variable "max_capacity" { + description = "Maximum number of worker nodes." + type = number + default = 6 +} + variable "system_reserved_cpu_millicores" { description = "(Optional) The amount of 'system-reserved' CPU millicores to pass to the kubelet. For example: 100. A value of -1 disables the flag." type = number diff --git a/modules/app_lb/outputs.tf b/modules/app_lb/outputs.tf index 20724c32..6f8fa61f 100644 --- a/modules/app_lb/outputs.tf +++ b/modules/app_lb/outputs.tf @@ -15,9 +15,9 @@ output "tg_app_arn" { } output "alb_name" { -value = aws_lb.alb.arn + value = aws_lb.alb.arn } output "nlb_security_group" { - value = var.enable_private_only_traffic? aws_security_group.inbound_private[0].id : null + value = var.enable_private_only_traffic ? aws_security_group.inbound_private[0].id : null } \ No newline at end of file diff --git a/modules/database/main.tf b/modules/database/main.tf index c5d7b8bb..f60e984d 100644 --- a/modules/database/main.tf +++ b/modules/database/main.tf @@ -9,7 +9,7 @@ resource "random_string" "master_password" { } locals { - engine_version_tag = "80" + engine_version_tag = "80" parameter_family = "aurora-mysql8.0" parameter_group_name = "${var.namespace}-aurora-db-${local.engine_version_tag}-parameter-group" parameter_cluster_name = "${var.namespace}-aurora-${local.engine_version_tag}-cluster-parameter-group" diff --git a/modules/endpoint/main.tf b/modules/endpoint/main.tf index dc64c71c..2c8ebe6b 100644 --- a/modules/endpoint/main.tf +++ b/modules/endpoint/main.tf @@ -1,9 +1,9 @@ resource "aws_vpc_endpoint" "default" { - vpc_id = var.network_id - service_name = var.service_name - vpc_endpoint_type = "Gateway" - auto_accept = true - route_table_ids = var.private_route_table_id + vpc_id = var.network_id + service_name = var.service_name + vpc_endpoint_type = "Gateway" + auto_accept = true + route_table_ids = var.private_route_table_id policy = < Date: Tue, 9 Jul 2024 16:55:49 -0400 Subject: [PATCH 2/2] Implement autoscaling in Kubernetes for Terraform EKS --- deployment-size.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployment-size.tf b/deployment-size.tf index 68f80dd0..8dce5edd 100644 --- a/deployment-size.tf +++ b/deployment-size.tf @@ -18,8 +18,8 @@ locals { node_count = 2, node_instance = "r6i.xlarge", cache = "cache.m6g.large", - min_node_count = 1, - max_node_count = 4 + min_node_count = 2, + max_node_count = 5 }, large = { db = "db.r6g.2xlarge",