From a9181ae2ddfafc03af70b1eab667eac7dd17bf1a Mon Sep 17 00:00:00 2001 From: Vara Bonthu Date: Sat, 25 Feb 2023 20:50:45 +0000 Subject: [PATCH] feat: Added SQS queue creation with events for Karpenter (#1458) --- examples/karpenter/main.tf | 1 - modules/kubernetes-addons/README.md | 4 +- modules/kubernetes-addons/karpenter/README.md | 17 +++++-- modules/kubernetes-addons/karpenter/data.tf | 27 ++++++++--- modules/kubernetes-addons/karpenter/locals.tf | 48 ++++++++++++++++++- modules/kubernetes-addons/karpenter/main.tf | 40 ++++++++++++++++ .../kubernetes-addons/karpenter/outputs.tf | 20 ++++++++ .../kubernetes-addons/karpenter/variables.tf | 27 +++++++---- modules/kubernetes-addons/main.tf | 17 ++++--- modules/kubernetes-addons/variables.tf | 18 +++++-- 10 files changed, 187 insertions(+), 32 deletions(-) diff --git a/examples/karpenter/main.tf b/examples/karpenter/main.tf index 92b5c2a900..2811a9868e 100644 --- a/examples/karpenter/main.tf +++ b/examples/karpenter/main.tf @@ -171,7 +171,6 @@ module "eks_blueprints_kubernetes_addons" { } karpenter_node_iam_instance_profile = module.karpenter.instance_profile_name karpenter_enable_spot_termination_handling = true - karpenter_sqs_queue_arn = module.karpenter.queue_arn tags = local.tags } diff --git a/modules/kubernetes-addons/README.md b/modules/kubernetes-addons/README.md index 855935c25c..2d682a152c 100644 --- a/modules/kubernetes-addons/README.md +++ b/modules/kubernetes-addons/README.md @@ -277,7 +277,6 @@ | [karpenter\_helm\_config](#input\_karpenter\_helm\_config) | Karpenter autoscaler add-on config | `any` | `{}` | no | | [karpenter\_irsa\_policies](#input\_karpenter\_irsa\_policies) | Additional IAM policies for a IAM role for service accounts | `list(string)` | `[]` | no | | [karpenter\_node\_iam\_instance\_profile](#input\_karpenter\_node\_iam\_instance\_profile) | Karpenter Node IAM Instance profile id | `string` | `""` | no | -| [karpenter\_sqs\_queue\_arn](#input\_karpenter\_sqs\_queue\_arn) | (Optional) ARN of SQS used by Karpenter when native node termination handling is enabled | `string` | `""` | no | | [keda\_helm\_config](#input\_keda\_helm\_config) | KEDA Event-based autoscaler add-on config | `any` | `{}` | no | | [keda\_irsa\_policies](#input\_keda\_irsa\_policies) | Additional IAM policies for a IAM role for service accounts | `list(string)` | `[]` | no | | [kube\_prometheus\_stack\_helm\_config](#input\_kube\_prometheus\_stack\_helm\_config) | Community kube-prometheus-stack Helm Chart config | `any` | `{}` | no | @@ -314,6 +313,9 @@ | [spark\_history\_server\_irsa\_policies](#input\_spark\_history\_server\_irsa\_policies) | Additional IAM policies for a IAM role for service accounts | `list(string)` | `[]` | no | | [spark\_history\_server\_s3a\_path](#input\_spark\_history\_server\_s3a\_path) | s3a path with prefix for Spark history server e.g., s3a:/// | `string` | `""` | no | | [spark\_k8s\_operator\_helm\_config](#input\_spark\_k8s\_operator\_helm\_config) | Spark on K8s Operator Helm Chart config | `any` | `{}` | no | +| [sqs\_queue\_kms\_data\_key\_reuse\_period\_seconds](#input\_sqs\_queue\_kms\_data\_key\_reuse\_period\_seconds) | The length of time, in seconds, for which Amazon SQS can reuse a data key to encrypt or decrypt messages before calling AWS KMS again | `number` | `null` | no | +| [sqs\_queue\_kms\_master\_key\_id](#input\_sqs\_queue\_kms\_master\_key\_id) | The ID of an AWS-managed customer master key (CMK) for Amazon SQS or a custom CMK | `string` | `null` | no | +| [sqs\_queue\_managed\_sse\_enabled](#input\_sqs\_queue\_managed\_sse\_enabled) | Enable server-side encryption (SSE) for a SQS queue | `bool` | `true` | no | | [strimzi\_kafka\_operator\_helm\_config](#input\_strimzi\_kafka\_operator\_helm\_config) | Kafka Strimzi Helm Chart config | `any` | `{}` | no | | [sysdig\_agent\_helm\_config](#input\_sysdig\_agent\_helm\_config) | Sysdig Helm Chart config | `any` | `{}` | no | | [tags](#input\_tags) | Additional tags (e.g. `map('BusinessUnit`,`XYZ`) | `map(string)` | `{}` | no | diff --git a/modules/kubernetes-addons/karpenter/README.md b/modules/kubernetes-addons/karpenter/README.md index 914ccbf7bc..5342000c07 100644 --- a/modules/kubernetes-addons/karpenter/README.md +++ b/modules/kubernetes-addons/karpenter/README.md @@ -28,30 +28,41 @@ For more details checkout [Karpenter](https://karpenter.sh/docs/getting-started/ | Name | Type | |------|------| +| [aws_cloudwatch_event_rule.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_iam_policy.karpenter](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | -| [aws_arn.queue](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/arn) | data source | +| [aws_sqs_queue.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | +| [aws_sqs_queue_policy.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue_policy) | resource | | [aws_iam_policy_document.karpenter](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.sqs_queue](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [addon\_context](#input\_addon\_context) | Input configuration for the addon |
object({
aws_caller_identity_account_id = string
aws_caller_identity_arn = string
aws_eks_cluster_endpoint = string
aws_partition_id = string
aws_region_name = string
eks_cluster_id = string
eks_oidc_issuer_url = string
eks_oidc_provider_arn = string
tags = map(string)
irsa_iam_role_path = string
irsa_iam_permissions_boundary = string
})
| n/a | yes | -| [enable\_spot\_termination\_handling](#input\_enable\_spot\_termination\_handling) | Determines whether to enable native spot termination handling | `bool` | `false` | no | +| [enable\_spot\_termination](#input\_enable\_spot\_termination) | Determines whether to enable native spot termination handling | `bool` | `false` | no | | [helm\_config](#input\_helm\_config) | Helm provider config for the Karpenter | `any` | `{}` | no | | [irsa\_policies](#input\_irsa\_policies) | Additional IAM policies for a IAM role for service accounts | `list(string)` | `[]` | no | | [manage\_via\_gitops](#input\_manage\_via\_gitops) | Determines if the add-on should be managed via GitOps. | `bool` | `false` | no | | [node\_iam\_instance\_profile](#input\_node\_iam\_instance\_profile) | Karpenter Node IAM Instance profile id | `string` | `""` | no | | [path](#input\_path) | Path in which to create the Karpenter policy | `string` | `"/"` | no | -| [sqs\_queue\_arn](#input\_sqs\_queue\_arn) | (Optional) ARN of SQS used by Karpenter when native node termination handling is enabled | `string` | `""` | no | +| [sqs\_queue\_kms\_data\_key\_reuse\_period\_seconds](#input\_sqs\_queue\_kms\_data\_key\_reuse\_period\_seconds) | The length of time, in seconds, for which Amazon SQS can reuse a data key to encrypt or decrypt messages before calling AWS KMS again | `number` | `null` | no | +| [sqs\_queue\_kms\_master\_key\_id](#input\_sqs\_queue\_kms\_master\_key\_id) | The ID of an AWS-managed customer master key (CMK) for Amazon SQS or a custom CMK | `string` | `null` | no | +| [sqs\_queue\_managed\_sse\_enabled](#input\_sqs\_queue\_managed\_sse\_enabled) | Enable server-side encryption (SSE) for a SQS queue | `bool` | `true` | no | ## Outputs | Name | Description | |------|-------------| | [argocd\_gitops\_config](#output\_argocd\_gitops\_config) | Configuration used for managing the add-on with ArgoCD | +| [event\_rules](#output\_event\_rules) | Map of the event rules created and their attributes | | [irsa\_arn](#output\_irsa\_arn) | IAM role ARN for the service account | | [irsa\_name](#output\_irsa\_name) | IAM role name for the service account | | [release\_metadata](#output\_release\_metadata) | Map of attributes of the Helm release metadata | | [service\_account](#output\_service\_account) | Name of Kubernetes service account | +| [sqs\_queue\_arn](#output\_sqs\_queue\_arn) | The ARN of the SQS queue | +| [sqs\_queue\_name](#output\_sqs\_queue\_name) | The name of the created Amazon SQS queue | +| [sqs\_queue\_url](#output\_sqs\_queue\_url) | The URL for the created Amazon SQS queue | diff --git a/modules/kubernetes-addons/karpenter/data.tf b/modules/kubernetes-addons/karpenter/data.tf index 9d5615cb58..e13812016a 100644 --- a/modules/kubernetes-addons/karpenter/data.tf +++ b/modules/kubernetes-addons/karpenter/data.tf @@ -1,8 +1,4 @@ -data "aws_arn" "queue" { - count = var.enable_spot_termination_handling ? 1 : 0 - - arn = var.sqs_queue_arn -} +data "aws_partition" "current" {} data "aws_iam_policy_document" "karpenter" { statement { @@ -89,7 +85,7 @@ data "aws_iam_policy_document" "karpenter" { } dynamic "statement" { - for_each = var.sqs_queue_arn != "" ? [1] : [] + for_each = var.enable_spot_termination ? [1] : [] content { actions = [ @@ -98,7 +94,24 @@ data "aws_iam_policy_document" "karpenter" { "sqs:GetQueueUrl", "sqs:ReceiveMessage", ] - resources = [var.sqs_queue_arn] + resources = [aws_sqs_queue.this[0].arn] + } + } +} + +data "aws_iam_policy_document" "sqs_queue" { + count = var.enable_spot_termination ? 1 : 0 + + statement { + sid = "SqsWrite" + actions = ["sqs:SendMessage"] + principals { + type = "Service" + identifiers = [ + "events.${local.dns_suffix}", + "sqs.${local.dns_suffix}" + ] } + resources = [aws_sqs_queue.this[0].arn] } } diff --git a/modules/kubernetes-addons/karpenter/locals.tf b/modules/kubernetes-addons/karpenter/locals.tf index 2f1ef1ddac..09dcfabb71 100644 --- a/modules/kubernetes-addons/karpenter/locals.tf +++ b/modules/kubernetes-addons/karpenter/locals.tf @@ -26,7 +26,7 @@ locals { clusterName: ${var.addon_context.eks_cluster_id} clusterEndpoint: ${var.addon_context.aws_eks_cluster_endpoint} defaultInstanceProfile: ${var.node_iam_instance_profile} - interruptionQueueName: ${try(data.aws_arn.queue[0].resource, "")} + interruptionQueueName: ${try(aws_sqs_queue.this[0].name, "")} EOT ] description = "karpenter Helm Chart for Node Autoscaling" @@ -48,6 +48,50 @@ locals { serviceAccountName = local.service_account controllerClusterEndpoint = var.addon_context.aws_eks_cluster_endpoint awsDefaultInstanceProfile = var.node_iam_instance_profile - awsInterruptionQueueName = try(data.aws_arn.queue[0].resource, "") + awsInterruptionQueueName = try(aws_sqs_queue.this[0].name, "") + } + + dns_suffix = data.aws_partition.current.dns_suffix + + # Karpenter Spot Interruption Event rules + event_rules = { + health_event = { + name = "HealthEvent" + description = "Karpenter Interrupt - AWS health event for EC2" + event_pattern = { + source = ["aws.health"] + detail-type = ["AWS Health Event"] + detail = { + service = ["EC2"] + } + } + } + spot_interupt = { + name = "SpotInterrupt" + description = "Karpenter Interrupt - A spot interruption warning was triggered for the node" + event_pattern = { + source = ["aws.ec2"] + detail-type = ["EC2 Spot Instance Interruption Warning"] + } + } + instance_rebalance = { + name = "InstanceRebalance" + description = "Karpenter Interrupt - A spot rebalance recommendation was triggered for the node" + event_pattern = { + source = ["aws.ec2"] + detail-type = ["EC2 Instance Rebalance Recommendation"] + } + } + instance_state_change = { + name = "InstanceStateChange" + description = "Karpenter interrupt - EC2 instance state-change notification" + event_pattern = { + source = ["aws.ec2"] + detail-type = ["EC2 Instance State-change Notification"] + detail = { + state = ["stopping", "terminated", "shutting-down", "stopped"] #ignored pending and running + } + } + } } } diff --git a/modules/kubernetes-addons/karpenter/main.tf b/modules/kubernetes-addons/karpenter/main.tf index 9428220f62..70382bf450 100644 --- a/modules/kubernetes-addons/karpenter/main.tf +++ b/modules/kubernetes-addons/karpenter/main.tf @@ -13,3 +13,43 @@ resource "aws_iam_policy" "karpenter" { policy = data.aws_iam_policy_document.karpenter.json path = var.path } + +#tfsec:ignore:aws-sqs-enable-queue-encryption +resource "aws_sqs_queue" "this" { + count = var.enable_spot_termination ? 1 : 0 + + name = "karpenter-${var.addon_context.eks_cluster_id}" + message_retention_seconds = 300 + sqs_managed_sse_enabled = var.sqs_queue_managed_sse_enabled + kms_master_key_id = var.sqs_queue_kms_master_key_id + kms_data_key_reuse_period_seconds = var.sqs_queue_kms_data_key_reuse_period_seconds + + tags = var.addon_context.tags +} + +resource "aws_sqs_queue_policy" "this" { + count = var.enable_spot_termination ? 1 : 0 + + queue_url = aws_sqs_queue.this[0].id + policy = data.aws_iam_policy_document.sqs_queue[0].json +} + +resource "aws_cloudwatch_event_rule" "this" { + for_each = { for k, v in local.event_rules : k => v if var.enable_spot_termination } + + name = each.value.name + description = each.value.description + event_pattern = jsonencode(each.value.event_pattern) + tags = merge( + { "ClusterName" : var.addon_context.eks_cluster_id }, + var.addon_context.tags, + ) +} + +resource "aws_cloudwatch_event_target" "this" { + for_each = { for k, v in local.event_rules : k => v if var.enable_spot_termination } + + rule = aws_cloudwatch_event_rule.this[each.key].name + arn = aws_sqs_queue.this[0].arn + target_id = "KarpenterInterruptionQueueTarget" +} diff --git a/modules/kubernetes-addons/karpenter/outputs.tf b/modules/kubernetes-addons/karpenter/outputs.tf index 0776dcd7ef..f93e8a3350 100644 --- a/modules/kubernetes-addons/karpenter/outputs.tf +++ b/modules/kubernetes-addons/karpenter/outputs.tf @@ -22,3 +22,23 @@ output "service_account" { description = "Name of Kubernetes service account" value = module.helm_addon.service_account } + +output "sqs_queue_arn" { + description = "The ARN of the SQS queue" + value = try(aws_sqs_queue.this[0].arn, null) +} + +output "sqs_queue_name" { + description = "The name of the created Amazon SQS queue" + value = try(aws_sqs_queue.this[0].name, null) +} + +output "sqs_queue_url" { + description = "The URL for the created Amazon SQS queue" + value = try(aws_sqs_queue.this[0].url, null) +} + +output "event_rules" { + description = "Map of the event rules created and their attributes" + value = aws_cloudwatch_event_rule.this +} diff --git a/modules/kubernetes-addons/karpenter/variables.tf b/modules/kubernetes-addons/karpenter/variables.tf index 1887baf33c..21daffef11 100644 --- a/modules/kubernetes-addons/karpenter/variables.tf +++ b/modules/kubernetes-addons/karpenter/variables.tf @@ -22,19 +22,12 @@ variable "node_iam_instance_profile" { default = "" } -# tflint-ignore: terraform_unused_declarations -variable "enable_spot_termination_handling" { +variable "enable_spot_termination" { description = "Determines whether to enable native spot termination handling" type = bool default = false } -variable "sqs_queue_arn" { - description = "(Optional) ARN of SQS used by Karpenter when native node termination handling is enabled" - type = string - default = "" -} - variable "addon_context" { description = "Input configuration for the addon" type = object({ @@ -57,3 +50,21 @@ variable "path" { type = string default = "/" } + +variable "sqs_queue_managed_sse_enabled" { + description = "Enable server-side encryption (SSE) for a SQS queue" + type = bool + default = true +} + +variable "sqs_queue_kms_master_key_id" { + description = "The ID of an AWS-managed customer master key (CMK) for Amazon SQS or a custom CMK" + type = string + default = null +} + +variable "sqs_queue_kms_data_key_reuse_period_seconds" { + description = "The length of time, in seconds, for which Amazon SQS can reuse a data key to encrypt or decrypt messages before calling AWS KMS again" + type = number + default = null +} diff --git a/modules/kubernetes-addons/main.tf b/modules/kubernetes-addons/main.tf index 8d08af00de..15e842b308 100644 --- a/modules/kubernetes-addons/main.tf +++ b/modules/kubernetes-addons/main.tf @@ -317,13 +317,15 @@ module "karpenter" { count = var.enable_karpenter ? 1 : 0 - helm_config = var.karpenter_helm_config - irsa_policies = var.karpenter_irsa_policies - node_iam_instance_profile = var.karpenter_node_iam_instance_profile - enable_spot_termination_handling = var.karpenter_enable_spot_termination_handling - sqs_queue_arn = var.karpenter_sqs_queue_arn - manage_via_gitops = var.argocd_manage_add_ons - addon_context = local.addon_context + helm_config = var.karpenter_helm_config + irsa_policies = var.karpenter_irsa_policies + node_iam_instance_profile = var.karpenter_node_iam_instance_profile + enable_spot_termination = var.karpenter_enable_spot_termination_handling + manage_via_gitops = var.argocd_manage_add_ons + addon_context = local.addon_context + sqs_queue_managed_sse_enabled = var.sqs_queue_managed_sse_enabled + sqs_queue_kms_master_key_id = var.sqs_queue_kms_master_key_id + sqs_queue_kms_data_key_reuse_period_seconds = var.sqs_queue_kms_data_key_reuse_period_seconds } module "keda" { @@ -530,6 +532,7 @@ module "secrets_store_csi_driver" { manage_via_gitops = var.argocd_manage_add_ons addon_context = local.addon_context } + module "aws_privateca_issuer" { count = var.enable_aws_privateca_issuer ? 1 : 0 source = "./aws-privateca-issuer" diff --git a/modules/kubernetes-addons/variables.tf b/modules/kubernetes-addons/variables.tf index 4ff3be58ae..7ca52ca2e0 100644 --- a/modules/kubernetes-addons/variables.tf +++ b/modules/kubernetes-addons/variables.tf @@ -911,10 +911,22 @@ variable "karpenter_enable_spot_termination_handling" { default = false } -variable "karpenter_sqs_queue_arn" { - description = "(Optional) ARN of SQS used by Karpenter when native node termination handling is enabled" +variable "sqs_queue_managed_sse_enabled" { + description = "Enable server-side encryption (SSE) for a SQS queue" + type = bool + default = true +} + +variable "sqs_queue_kms_master_key_id" { + description = "The ID of an AWS-managed customer master key (CMK) for Amazon SQS or a custom CMK" type = string - default = "" + default = null +} + +variable "sqs_queue_kms_data_key_reuse_period_seconds" { + description = "The length of time, in seconds, for which Amazon SQS can reuse a data key to encrypt or decrypt messages before calling AWS KMS again" + type = number + default = null } #-----------KEDA ADDON-------------