diff --git a/examples/node_draining/README.md b/examples/node_draining/README.md
new file mode 100644
index 0000000000..a8bc7868a3
--- /dev/null
+++ b/examples/node_draining/README.md
@@ -0,0 +1,63 @@
+## EKS node drainer with terraform
+
+Changing the AMI version of a worker group or changing the Kubernetes version causes the nodes to be recreated.
+By default the nodes are not drained before they are removed, so we get some downtime.
+Node groups are a good alternative but not sufficient for us, as we need to [set custom security groups](https://github.com/aws/containers-roadmap/issues/609).
+
+We add a termination lifecycle hook that drains the node (the kubectl drain equivalent) before it is shut down.
+
+The serverless Python drain function comes from https://github.com/aws-samples/amazon-k8s-node-drainer
+and has been translated to Terraform.
+
+**STEPS**
+* build the python zip, see https://docs.aws.amazon.com/lambda/latest/dg/python-package.html
+```
+# the script needs to be adapted to your system (Python 3 version)
+# there are better ways to deploy serverless functions, but this serves the purpose of the example
+./build.sh
+```
+* apply
+```
+terraform init
+terraform apply
+# fails once because the subnets are not yet in the data filter
+# solving the subnet dependency belongs in another layer and is not important for this example
+terraform apply
+```
+
+### Testing seamless worker upgrade
+
+* update the kubeconfig and deploy an example grafana with a pod disruption budget
+```
+aws eks update-kubeconfig --name $CLUSTER_NAME --alias drainer
+
+# optional: install the latest CNI plugin to ensure the cluster can be destroyed cleanly
+# https://github.com/terraform-aws-modules/terraform-aws-eks/issues/285
+# https://docs.aws.amazon.com/eks/latest/userguide/cni-upgrades.html
+kubectl apply -f https://raw.githubusercontent.com/aws/amazon-vpc-cni-k8s/release-1.6/config/v1.6/aws-k8s-cni.yaml
+
+# install an example grafana with a pod disruption budget and persistent volumes
+helm upgrade --install grafana stable/grafana --set podDisruptionBudget.minAvailable=1 --set replicas=2 --set persistence.enabled=true --set persistence.type=statefulset
+
+# check in which availability zones the volumes are allocated
+kubectl get pv -o custom-columns=PVC-NAME:.spec.claimRef.name,ZONE:.metadata.labels
+
+kubectl get pods
+```
+* change the AMI version of the nodes to see the node drainer in action (see the optional watcher sketch at the end of this README)
+* you can also check the output of the lambda function in CloudWatch
+```
+terraform apply -var ami_version=v20200609
+```
+* the nodes are now only terminated after they have been drained completely
+* the drainer respects the pod disruption budget, which in our example means at least one running grafana replica
+
+### Drawbacks
+* instance termination simply continues once the lifecycle timeout is reached, regardless of failures during draining
+
+### Info
+* this example uses single-AZ worker groups, which is necessary if you are using EBS volumes with StatefulSets
+  * not strictly necessary for node draining, remove it if you want
+* the drainer also works in combination with cluster-autoscaler
+  * how to use cluster-autoscaler is already well documented in [examples/irsa](../irsa)
+  * a full example, with no guarantees, can be found at [eks-node-drainer](https://github.com/karlderkaefer/eks-node-drainer)
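+
+### Optional: watching the drain
+
+kubectl get nodes and kubectl get pods -o wide show the same information, but if you prefer a small script, the
+sketch below polls the cluster while terraform replaces the nodes and prints which nodes are cordoned and how many
+pods are still scheduled on them. It is only an illustration, not part of the Terraform example, and assumes your
+current kubeconfig context points at this cluster.
+```
+import time
+
+from kubernetes import client, config
+
+
+def watch_drain(poll_seconds=10):
+    # assumes the current kubeconfig context points at the example cluster
+    config.load_kube_config()
+    v1 = client.CoreV1Api()
+    while True:
+        for node in v1.list_node().items:
+            name = node.metadata.name
+            cordoned = bool(node.spec.unschedulable)
+            pods = v1.list_pod_for_all_namespaces(
+                field_selector="spec.nodeName={}".format(name)
+            ).items
+            print("{} cordoned={} pods={}".format(name, cordoned, len(pods)))
+        print("---")
+        time.sleep(poll_seconds)
+
+
+if __name__ == "__main__":
+    watch_drain()
+```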
diff --git a/examples/node_draining/build.sh b/examples/node_draining/build.sh
new file mode 100755
index 0000000000..7de51ab33e
--- /dev/null
+++ b/examples/node_draining/build.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+# packages the drainer lambda: copies the sources and their pip dependencies into drainer/dist
+cd drainer || exit 1
+mkdir -p dist
+cp -R src/*.py dist/
+cd src || exit 1
+python3 -m venv v-env
+source v-env/bin/activate
+pip install -r requirements.txt
+deactivate
+# adjust the site-packages path if the glob does not match your Python 3 version
+cp -R v-env/lib/python3.*/site-packages/* ../dist/
+cd ../.. || exit 1
diff --git a/examples/node_draining/drainer.tf b/examples/node_draining/drainer.tf
new file mode 100644
index 0000000000..bbcbbfd31d
--- /dev/null
+++ b/examples/node_draining/drainer.tf
@@ -0,0 +1,116 @@
+data "archive_file" "node_drainer" {
+  count       = var.drainer_enabled ? 1 : 0
+  type        = "zip"
+  source_dir  = "${path.module}/drainer/dist"
+  output_path = "${path.module}/lambda_function.zip"
+}
+
+resource "aws_iam_role" "node_drainer" {
+  count              = var.drainer_enabled ? 1 : 0
+  name               = "NodeDrainerRole"
+  assume_role_policy = data.aws_iam_policy_document.node_drainer_assume_role[0].json
+}
+
+data "aws_iam_policy_document" "node_drainer_assume_role" {
+  count = var.drainer_enabled ? 1 : 0
+  statement {
+    sid    = "AssumeRolePolicy"
+    effect = "Allow"
+    actions = [
+      "sts:AssumeRole"
+    ]
+    principals {
+      type        = "Service"
+      identifiers = ["lambda.amazonaws.com"]
+    }
+  }
+}
+
+data "aws_iam_policy_document" "node_drainer" {
+  count = var.drainer_enabled ? 1 : 0
+  statement {
+    sid    = "LoggingPolicy"
+    effect = "Allow"
+    actions = [
+      "logs:CreateLogGroup",
+      "logs:CreateLogStream",
+      "logs:PutLogEvents",
+    ]
+    resources = [
+      "arn:aws:logs:*:*:*"
+    ]
+  }
+  statement {
+    sid    = "AutoscalePolicy"
+    effect = "Allow"
+    actions = [
+      "autoscaling:CompleteLifecycleAction",
+      "ec2:DescribeInstances",
+      "eks:DescribeCluster",
+      "sts:GetCallerIdentity",
+    ]
+    resources = [
+      "*"
+    ]
+  }
+}
+
+resource "aws_iam_role_policy" "node_drainer" {
+  count  = var.drainer_enabled ? 1 : 0
+  role   = aws_iam_role.node_drainer[0].id
+  policy = data.aws_iam_policy_document.node_drainer[0].json
+}
+
+resource "aws_lambda_function" "node_drainer" {
+  count         = var.drainer_enabled ? 1 : 0
+  filename      = data.archive_file.node_drainer[0].output_path
+  function_name = var.drainer_lambda_function_name
+  role          = aws_iam_role.node_drainer[0].arn
+  handler       = "handler.lambda_handler"
+  memory_size   = 300
+  timeout       = var.drainer_lambda_timeout
+
+  source_code_hash = filebase64sha256(data.archive_file.node_drainer[0].output_path)
+
+  runtime = "python3.7"
+
+  environment {
+    variables = {
+      CLUSTER_NAME = var.cluster_name
+    }
+  }
+  depends_on = [
+    aws_iam_role.node_drainer,
+    aws_cloudwatch_log_group.node_drainer,
+    data.archive_file.node_drainer,
+  ]
+}
+
+# Optionally manage the CloudWatch log group for the Lambda function ourselves.
+# The IAM policy above also grants "logs:CreateLogGroup", so the function can create
+# the group on demand if this resource is removed.
+resource "aws_cloudwatch_log_group" "node_drainer" {
+  count             = var.drainer_enabled ? 1 : 0
+  name              = "/aws/lambda/${var.drainer_lambda_function_name}"
+  retention_in_days = 14
+}
+
+resource "aws_lambda_permission" "node_drainer" {
+  count         = var.drainer_enabled ? 
1 : 0 + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.node_drainer[0].function_name + principal = "events.amazonaws.com" +} + +resource "aws_cloudwatch_event_rule" "terminating_events" { + count = var.drainer_enabled ? 1 : 0 + name = "asg-terminate-events-${var.cluster_name}" + description = "Capture all terminating autoscaling events for cluster ${var.cluster_name}" + + event_pattern = templatefile("${path.module}/event-rule.tpl", { cluster_name = var.cluster_name }) +} + +resource "aws_cloudwatch_event_target" "terminating_events" { + count = var.drainer_enabled ? 1 : 0 + rule = aws_cloudwatch_event_rule.terminating_events[0].name + arn = aws_lambda_function.node_drainer[0].arn +} diff --git a/examples/node_draining/drainer/__init__.py b/examples/node_draining/drainer/__init__.py new file mode 100644 index 0000000000..cc2c489b27 --- /dev/null +++ b/examples/node_draining/drainer/__init__.py @@ -0,0 +1,4 @@ +import os +import sys + +sys.path.append(os.path.dirname(os.path.realpath(__file__))) diff --git a/examples/node_draining/drainer/src/handler.py b/examples/node_draining/drainer/src/handler.py new file mode 100644 index 0000000000..d2d9990434 --- /dev/null +++ b/examples/node_draining/drainer/src/handler.py @@ -0,0 +1,210 @@ +import boto3 +import base64 +import json +import logging +import os.path +import re +import yaml + +from botocore.signers import RequestSigner +import kubernetes as k8s +from kubernetes.client.rest import ApiException + +from k8s_utils import (abandon_lifecycle_action, cordon_node, node_exists, remove_all_pods) + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +KUBE_FILEPATH = '/tmp/kubeconfig' +REGION = os.environ['AWS_REGION'] + +eks = boto3.client('eks', region_name=REGION) +ec2 = boto3.client('ec2', region_name=REGION) +asg = boto3.client('autoscaling', region_name=REGION) +s3 = boto3.client('s3', region_name=REGION) + + +def create_kube_config(eks, cluster_name): + """Creates the Kubernetes config file required when instantiating the API client.""" + cluster_info = eks.describe_cluster(name=cluster_name)['cluster'] + certificate = cluster_info['certificateAuthority']['data'] + endpoint = cluster_info['endpoint'] + + kube_config = { + 'apiVersion': 'v1', + 'clusters': [ + { + 'cluster': + { + 'server': endpoint, + 'certificate-authority-data': certificate + }, + 'name': 'k8s' + + }], + 'contexts': [ + { + 'context': + { + 'cluster': 'k8s', + 'user': 'aws' + }, + 'name': 'aws' + }], + 'current-context': 'aws', + 'Kind': 'config', + 'users': [ + { + 'name': 'aws', + 'user': 'lambda' + }] + } + + with open(KUBE_FILEPATH, 'w') as f: + yaml.dump(kube_config, f, default_flow_style=False) + + +def get_bearer_token(cluster, region): + """Creates the authentication to token required by AWS IAM Authenticator. This is + done by creating a base64 encoded string which represents a HTTP call to the STS + GetCallerIdentity Query Request (https://docs.aws.amazon.com/STS/latest/APIReference/API_GetCallerIdentity.html). + The AWS IAM Authenticator decodes the base64 string and makes the request on behalf of the user. 
+ """ + STS_TOKEN_EXPIRES_IN = 60 + session = boto3.session.Session() + + client = session.client('sts', region_name=region) + service_id = client.meta.service_model.service_id + + signer = RequestSigner( + service_id, + region, + 'sts', + 'v4', + session.get_credentials(), + session.events + ) + + params = { + 'method': 'GET', + 'url': 'https://sts.{}.amazonaws.com/?Action=GetCallerIdentity&Version=2011-06-15'.format(region), + 'body': {}, + 'headers': { + 'x-k8s-aws-id': cluster + }, + 'context': {} + } + + signed_url = signer.generate_presigned_url( + params, + region_name=region, + expires_in=STS_TOKEN_EXPIRES_IN, + operation_name='' + ) + + base64_url = base64.urlsafe_b64encode(signed_url.encode('utf-8')).decode('utf-8') + + # need to remove base64 encoding padding: + # https://github.com/kubernetes-sigs/aws-iam-authenticator/issues/202 + return 'k8s-aws-v1.' + re.sub(r'=*', '', base64_url) + +def search(d, key, default=None): + """Return a value corresponding to the specified key in the (possibly + nested) dictionary d. If there is no item with that key, return + default. + """ + stack = [iter(d.items())] + while stack: + for k, v in stack[-1]: + if isinstance(v, dict): + stack.append(iter(v.items())) + break + elif k == key: + return v + else: + stack.pop() + return default + +def _lambda_handler(env, k8s_config, k8s_client, event): + kube_config_bucket = env['kube_config_bucket'] + cluster_name = env['cluster_name'] + + if not os.path.exists(KUBE_FILEPATH): + if kube_config_bucket: + logger.info('No kubeconfig file found. Downloading...') + s3.download_file(kube_config_bucket, env['kube_config_object'], KUBE_FILEPATH) + else: + logger.info('No kubeconfig file found. Generating...') + create_kube_config(eks, cluster_name) + logger.info("received event: {}".format(event)) + + lifecycle_hook_name = search(event, 'LifecycleHookName') + if lifecycle_hook_name is None: + logger.warning("could not find LifecycleHookName in event data. Skip draining node.") + return + + expected_lifecycle_hook = "node-drainer-{}".format(cluster_name) + + if lifecycle_hook_name != expected_lifecycle_hook: + logger.warning("lifecycle hook name: {} did not match with expected: {}".format( + lifecycle_hook_name, + expected_lifecycle_hook) + ) + logger.warning("skip draining node.") + return + + auto_scaling_group_name = event['detail']['AutoScalingGroupName'] + + instance_id = event['detail']['EC2InstanceId'] + logger.info('Instance ID: ' + instance_id) + instance = ec2.describe_instances(InstanceIds=[instance_id])['Reservations'][0]['Instances'][0] + + node_name = instance['PrivateDnsName'] + logger.info('Node name: ' + node_name) + + # skip draining on purpose e.g. to destroy cluster + skip_draining = search(event, "ignore_draining") + if skip_draining == "true": + logger.info("ignore_draining on purpose. 
completing lifecycle") + asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name, + AutoScalingGroupName=auto_scaling_group_name, + LifecycleActionResult='CONTINUE', + InstanceId=instance_id) + return + + # Configure + k8s_config.load_kube_config(KUBE_FILEPATH) + configuration = k8s_client.Configuration() + if not kube_config_bucket: + configuration.api_key['authorization'] = get_bearer_token(cluster_name, REGION) + configuration.api_key_prefix['authorization'] = 'Bearer' + # API + api = k8s_client.ApiClient(configuration) + v1 = k8s_client.CoreV1Api(api) + + try: + if not node_exists(v1, node_name): + logger.error('Node not found.') + abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id) + return + + cordon_node(v1, node_name) + + remove_all_pods(v1, node_name) + + asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name, + AutoScalingGroupName=auto_scaling_group_name, + LifecycleActionResult='CONTINUE', + InstanceId=instance_id) + except ApiException: + logger.exception('There was an error removing the pods from the node {}'.format(node_name)) + abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id) + + +def lambda_handler(event, _): + env = { + 'cluster_name': os.environ.get('CLUSTER_NAME'), + 'kube_config_bucket': os.environ.get('KUBE_CONFIG_BUCKET'), + 'kube_config_object': os.environ.get('KUBE_CONFIG_OBJECT') + } + return _lambda_handler(env, k8s.config, k8s.client, event) diff --git a/examples/node_draining/drainer/src/k8s_utils.py b/examples/node_draining/drainer/src/k8s_utils.py new file mode 100644 index 0000000000..574aa9ecf4 --- /dev/null +++ b/examples/node_draining/drainer/src/k8s_utils.py @@ -0,0 +1,121 @@ +import logging +import time + +from kubernetes.client.rest import ApiException + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +MIRROR_POD_ANNOTATION_KEY = "kubernetes.io/config.mirror" +CONTROLLER_KIND_DAEMON_SET = "DaemonSet" + + +def cordon_node(api, node_name): + """Marks the specified node as unschedulable, which means that no new pods can be launched on the + node by the Kubernetes scheduler. 
+ """ + patch_body = { + 'apiVersion': 'v1', + 'kind': 'Node', + 'metadata': { + 'name': node_name + }, + 'spec': { + 'unschedulable': True + } + } + + api.patch_node(node_name, patch_body) + + +def remove_all_pods(api, node_name, poll=5): + """Removes all Kubernetes pods from the specified node.""" + pods = get_evictable_pods(api, node_name) + + logger.debug('Number of pods to delete: ' + str(len(pods))) + + evict_until_completed(api, pods, poll) + wait_until_empty(api, node_name, poll) + + +def pod_is_evictable(pod): + if pod.metadata.annotations is not None and pod.metadata.annotations.get(MIRROR_POD_ANNOTATION_KEY): + logger.info("Skipping mirror pod {}/{}".format(pod.metadata.namespace, pod.metadata.name)) + return False + if pod.metadata.owner_references is None: + return True + for ref in pod.metadata.owner_references: + if ref.controller is not None and ref.controller: + if ref.kind == CONTROLLER_KIND_DAEMON_SET: + logger.info("Skipping DaemonSet {}/{}".format(pod.metadata.namespace, pod.metadata.name)) + return False + return True + + +def get_evictable_pods(api, node_name): + field_selector = 'spec.nodeName=' + node_name + pods = api.list_pod_for_all_namespaces(watch=False, field_selector=field_selector, include_uninitialized=True) + return [pod for pod in pods.items if pod_is_evictable(pod)] + + +def evict_until_completed(api, pods, poll): + pending = pods + while True: + pending = evict_pods(api, pending) + if (len(pending)) <= 0: + return + time.sleep(poll) + + +def evict_pods(api, pods): + remaining = [] + for pod in pods: + logger.info('Evicting pod {} in namespace {}'.format(pod.metadata.name, pod.metadata.namespace)) + body = { + 'apiVersion': 'policy/v1beta1', + 'kind': 'Eviction', + 'deleteOptions': {}, + 'metadata': { + 'name': pod.metadata.name, + 'namespace': pod.metadata.namespace + } + } + try: + api.create_namespaced_pod_eviction(pod.metadata.name, pod.metadata.namespace, body) + except ApiException as err: + if err.status == 429: + remaining.append(pod) + logger.warning("Pod {}/{} could not be evicted due to disruption budget. Will retry.".format(pod.metadata.namespace, pod.metadata.name)) + else: + logger.exception("Unexpected error adding eviction for pod {}/{}".format(pod.metadata.namespace, pod.metadata.name)) + except: + logger.exception("Unexpected error adding eviction for pod {}/{}".format(pod.metadata.namespace, pod.metadata.name)) + return remaining + + +def wait_until_empty(api, node_name, poll): + logger.info("Waiting for evictions to complete") + while True: + pods = get_evictable_pods(api, node_name) + if len(pods) <= 0: + logger.info("All pods evicted successfully") + return + logger.debug("Still waiting for deletion of the following pods: {}".format(", ".join(map(lambda pod: pod.metadata.namespace + "/" + pod.metadata.name, pods)))) + time.sleep(poll) + + +def node_exists(api, node_name): + """Determines whether the specified node is still part of the cluster.""" + nodes = api.list_node(include_uninitialized=True, pretty=True).items + node = next((n for n in nodes if n.metadata.name == node_name), None) + return False if not node else True + + +def abandon_lifecycle_action(asg_client, auto_scaling_group_name, lifecycle_hook_name, instance_id): + """Completes the lifecycle action with the ABANDON result, which stops any remaining actions, + such as other lifecycle hooks. 
+ """ + asg_client.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name, + AutoScalingGroupName=auto_scaling_group_name, + LifecycleActionResult='ABANDON', + InstanceId=instance_id) diff --git a/examples/node_draining/drainer/src/requirements.txt b/examples/node_draining/drainer/src/requirements.txt new file mode 100644 index 0000000000..f9d7b050d1 --- /dev/null +++ b/examples/node_draining/drainer/src/requirements.txt @@ -0,0 +1 @@ +kubernetes==9.0.0 diff --git a/examples/node_draining/event-rule.tpl b/examples/node_draining/event-rule.tpl new file mode 100644 index 0000000000..bdd4c9f658 --- /dev/null +++ b/examples/node_draining/event-rule.tpl @@ -0,0 +1,15 @@ +{ + "source": [ + "aws.autoscaling" + ], + "detail-type": [ + "EC2 Instance-terminate Lifecycle Action" + ], + "detail": { + "NotificationMetadata": { + "cluster_name": [ + "${cluster_name}" + ] + } + } +} diff --git a/examples/node_draining/main.tf b/examples/node_draining/main.tf new file mode 100644 index 0000000000..06bbe67c4c --- /dev/null +++ b/examples/node_draining/main.tf @@ -0,0 +1,145 @@ +data "aws_eks_cluster" "cluster" { + name = module.eks.cluster_id +} + +data "aws_eks_cluster_auth" "cluster" { + name = module.eks.cluster_id +} + +data "aws_caller_identity" "current" {} + +data "aws_availability_zones" "available" { + state = "available" +} + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "2.33.0" + name = "${var.cluster_name}-vpc" + cidr = "10.0.0.0/16" + azs = data.aws_availability_zones.available.names + private_subnets = [ + "10.0.1.0/24", + "10.0.2.0/24", + "10.0.3.0/24"] + public_subnets = [ + "10.0.4.0/24", + "10.0.5.0/24", + "10.0.6.0/24"] + enable_nat_gateway = true + single_nat_gateway = true + enable_dns_hostnames = true + + public_subnet_tags = { + "subnet-type" = "public" + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + "kubernetes.io/role/elb" = "1" + } + + private_subnet_tags = { + "subnet-type" = "private" + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + "kubernetes.io/role/internal-elb" = "1" + } + tags = merge( + var.tags, + { + Cluster = var.cluster_name + }, + ) + enable_flow_log = true + create_flow_log_cloudwatch_log_group = true + create_flow_log_cloudwatch_iam_role = true +} + +// this only works because we have more than 2 AZ and 3 private subnets +data "aws_subnet" "private_subnets_per_zone" { + for_each = toset(data.aws_availability_zones.available.names) + vpc_id = module.vpc.vpc_id + availability_zone = each.value + tags = { + subnet-type = "private" + } +} + +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 11.1" + + cluster_name = var.cluster_name + subnets = module.vpc.private_subnets + + cluster_version = var.cluster_version + + // worker_ami_name_filter = "amazon-eks-node-${var.cluster_version}-*v20200423" + // worker_ami_name_filter = "amazon-eks-node-${var.cluster_version}-*v20200406" + worker_ami_name_filter = "amazon-eks-node-${var.cluster_version}-*${var.ami_version}" + + vpc_id = module.vpc.vpc_id + + worker_create_initial_lifecycle_hooks = true + + tags = merge( + var.tags, + { + Cluster = var.cluster_name + }, + ) + + workers_group_defaults = { + instance_type = "t2.medium" + additional_userdata = "echo foo bar" + asg_min_size = 1 + asg_max_size = 5 + asg_desired_capacity = 1 + asg_recreate_on_change = true + // ensure that system pods have enough resources adapt to your need + kubelet_extra_args = "--system-reserved=cpu=100m,memory=100Mi,ephemeral-storage=1Gi 
--kube-reserved=cpu=100m,memory=200Mi,ephemeral-storage=1Gi --eviction-hard=memory.available<100Mi,nodefs.available<5% --enforce-node-allocatable=pods" + asg_initial_lifecycle_hooks = [ + { + name = "node-drainer-${var.cluster_name}" + lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING" + default_result = "ABANDON" + // timeout after 6min + heartbeat_timeout = var.asg_hook_timeout + // we adding some metadata to filter shutdown events only for our cluster + notification_metadata = "{ \"cluster_name\": \"${var.cluster_name}\" }" + } + ] + tags = [ + { + key = "k8s.io/cluster-autoscaler/enabled" + propagate_at_launch = "false" + value = "true" + }, + { + key = "k8s.io/cluster-autoscaler/${var.cluster_name}" + propagate_at_launch = "false" + value = "true" + } + ] + } + + worker_groups = [ + { + name = "worker-group-1" + subnets = [ + data.aws_subnet.private_subnets_per_zone[data.aws_availability_zones.available.names[0]].id, + ] + }, + { + name = "worker-group-2" + subnets = [ + data.aws_subnet.private_subnets_per_zone[data.aws_availability_zones.available.names[1]].id, + ] + } + ] + + map_roles = [ + { + rolearn = aws_iam_role.node_drainer[0].arn + username = "lambda" + groups = [] + }, + ] +} diff --git a/examples/node_draining/outputs.tf b/examples/node_draining/outputs.tf new file mode 100644 index 0000000000..27e372c980 --- /dev/null +++ b/examples/node_draining/outputs.tf @@ -0,0 +1,35 @@ +output "cluster_endpoint" { + description = "Endpoint for EKS control plane." + value = module.eks.cluster_endpoint +} + +output "cluster_security_group_id" { + description = "Security group ids attached to the cluster control plane." + value = module.eks.cluster_security_group_id +} + +output "kubectl_config" { + description = "kubectl config as generated by the module." + value = module.eks.kubeconfig +} + +output "config_map_aws_auth" { + description = "A kubernetes configuration to authenticate to this EKS cluster." + value = module.eks.config_map_aws_auth +} + +output "region" { + description = "AWS region." 
+  value       = var.region
+}
+
+output "private_subnets" {
+  value = module.vpc.private_subnets
+}
+
+output "private_subnet_zones" {
+  value = {
+    for o in data.aws_subnet.private_subnets_per_zone : o.id => o.availability_zone
+  }
+}
+
diff --git a/examples/node_draining/providers.tf b/examples/node_draining/providers.tf
new file mode 100644
index 0000000000..690f58da49
--- /dev/null
+++ b/examples/node_draining/providers.tf
@@ -0,0 +1,22 @@
+terraform {
+  required_version = ">= 0.12.0"
+  required_providers {
+    aws        = ">= 2.28.1"
+    kubernetes = "~> 1.11"
+    random     = "~> 2.1"
+    local      = "~> 1.2"
+    template   = "~> 2.1"
+  }
+}
+
+provider "aws" {
+  region = var.region
+}
+
+provider "kubernetes" {
+  host                   = data.aws_eks_cluster.cluster.endpoint
+  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
+  token                  = data.aws_eks_cluster_auth.cluster.token
+  load_config_file       = false
+  version                = "~> 1.11"
+}
diff --git a/examples/node_draining/rbac.tf b/examples/node_draining/rbac.tf
new file mode 100644
index 0000000000..b9580d3598
--- /dev/null
+++ b/examples/node_draining/rbac.tf
@@ -0,0 +1,32 @@
+resource "kubernetes_cluster_role" "node_drainer" {
+  metadata {
+    name = "node-drainer"
+  }
+  rule {
+    api_groups = [""]
+    resources  = ["pods", "pods/eviction", "nodes"]
+    verbs      = ["create", "list", "patch"]
+  }
+  depends_on = [
+    module.eks.kubeconfig
+  ]
+}
+
+resource "kubernetes_cluster_role_binding" "node_drainer" {
+  metadata {
+    name = "node-drainer"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role.node_drainer.metadata[0].name
+  }
+  subject {
+    kind      = "User"
+    name      = "lambda"
+    api_group = "rbac.authorization.k8s.io"
+  }
+  depends_on = [
+    module.eks.kubeconfig
+  ]
+}
diff --git a/examples/node_draining/variables.tf b/examples/node_draining/variables.tf
new file mode 100644
index 0000000000..bf61c7fb4c
--- /dev/null
+++ b/examples/node_draining/variables.tf
@@ -0,0 +1,46 @@
+variable "region" {
+  default = "eu-central-1"
+}
+
+variable "cluster_version" {
+  default = "1.16"
+}
+
+variable "cluster_name" {
+  type    = string
+  default = "eks-test"
+}
+
+// https://docs.aws.amazon.com/de_de/eks/latest/userguide/eks-linux-ami-versions.html
+variable "ami_version" {
+  default = "v20200423"
+  type    = string
+}
+
+variable "tags" {
+  default = {
+    Environment = "test-draining"
+  }
+  type = map(string)
+}
+
+variable "asg_hook_timeout" {
+  default     = 360
+  description = "Timeout in seconds to wait for the lifecycle action to complete. Once it is reached, the hook is completed and the instance termination continues regardless of the drain result."
+}
+
+// drainer variables
+variable "drainer_enabled" {
+  default = true
+  type    = bool
+}
+
+variable "drainer_lambda_function_name" {
+  default = "node-drainer"
+  type    = string
+}
+
+variable "drainer_lambda_timeout" {
+  type    = number
+  default = 120
+}
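
Note on asg_hook_timeout: the lifecycle hook only buys the drainer a fixed window, and as stated in the README
drawbacks the instance terminates once the heartbeat timeout is reached even if pods could not be evicted. A possible
extension, not implemented in this example, is to send lifecycle heartbeats from the handler while evictions are
still pending. The hypothetical keep_hook_alive helper below sketches this with boto3; the Lambda timeout
(drainer_lambda_timeout) would have to be raised accordingly.

```
import boto3

asg = boto3.client("autoscaling")


def keep_hook_alive(auto_scaling_group_name, lifecycle_hook_name, instance_id):
    """Reset the hook's heartbeat timeout; call periodically from the eviction loop until draining finishes."""
    asg.record_lifecycle_action_heartbeat(
        LifecycleHookName=lifecycle_hook_name,
        AutoScalingGroupName=auto_scaling_group_name,
        InstanceId=instance_id,
    )
```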