diff --git a/cluster-autoscaler/cloudprovider/aws/README.md b/cluster-autoscaler/cloudprovider/aws/README.md index 4175c4febb8f..c7d84ddc4547 100644 --- a/cluster-autoscaler/cloudprovider/aws/README.md +++ b/cluster-autoscaler/cloudprovider/aws/README.md @@ -138,8 +138,9 @@ If you'd like to scale node groups from 0, an `autoscaling:DescribeLaunchConfigu ``` ## Common Notes and Gotchas: -- The `/etc/ssl/certs/ca-certificates.crt` should exist by default on your ec2 instance. If you use Amazon Linux 2, use `/etc/ssl/certs/ca-bundle.crt` instead. -- Cluster autoscaler is not zone aware (for now), so if you wish to span multiple availability zones in your autoscaling groups beware that cluster autoscaler will not evenly distribute them. For more information, see https://github.com/kubernetes/contrib/pull/1552#discussion_r75532949. +- The `/etc/ssl/certs/ca-bundle.crt` file should exist by default on the EC2 instances in your EKS cluster. If you use another cluster provisioning tool such as [kops](https://github.com/kubernetes/kops) with an operating system other than Amazon Linux 2, use `/etc/ssl/certs/ca-certificates.crt` (or the correct path on your host) instead for the volume hostPath in your cluster autoscaler manifest. +- Cluster autoscaler does not support Auto Scaling Groups which span multiple Availability Zones; instead you should use an Auto Scaling Group for each Availability Zone and enable the [--balance-similar-node-groups](../../FAQ.md#im-running-cluster-with-nodes-in-multiple-zones-for-ha-purposes-is-that-supported-by-cluster-autoscaler) feature. If you do use a single Auto Scaling Group that spans multiple Availability Zones, you will find that AWS unexpectedly terminates nodes without them being drained because of the [rebalancing feature](https://docs.aws.amazon.com/autoscaling/ec2/userguide/auto-scaling-benefits.html#arch-AutoScalingMultiAZ). +- EBS volumes cannot span multiple AWS Availability Zones. 
If you have a Pod with a Persistent Volume in an AZ, it must run on a k8s/EKS node that is in the same Availability Zone as the Persistent Volume. If the AWS Auto Scaling Group launches a new k8s/EKS node in a different AZ and this Pod is moved to the new node, the Persistent Volume in the previous AZ will not be available from the new AZ, and the Pod will stay in Pending status. The workaround is to use a single AZ for the k8s/EKS nodes. - By default, cluster autoscaler will not terminate nodes running pods in the kube-system namespace. You can override this default behaviour by passing in the `--skip-nodes-with-system-pods=false` flag. - By default, cluster autoscaler will wait 10 minutes between scale down operations, you can adjust this using the `--scale-down-delay-after-add`, `--scale-down-delay-after-delete`, and `--scale-down-delay-after-failure` flag. E.g. `--scale-down-delay-after-add=5m` to decrease the scale down delay to 5 minutes after a node has been added. - If you're running multiple ASGs, the `--expander` flag supports three options: `random`, `most-pods` and `least-waste`. `random` will expand a random ASG on scale up. `most-pods` will scale up the ASG that will scheduable the most amount of pods. `least-waste` will expand the ASG that will waste the least amount of CPU/MEM resources. In the event of a tie, cluster autoscaler will fall back to `random`. 
diff --git a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go index 7751f4dd0408..7f6381cfbb4e 100644 --- a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go +++ b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go @@ -27,6 +27,42 @@ type instanceType struct { // InstanceTypes is a map of ec2 resources var InstanceTypes = map[string]*instanceType{ + "a1": { + InstanceType: "a1", + VCPU: 16, + MemoryMb: 0, + GPU: 0, + }, + "a1.2xlarge": { + InstanceType: "a1.2xlarge", + VCPU: 8, + MemoryMb: 16384, + GPU: 0, + }, + "a1.4xlarge": { + InstanceType: "a1.4xlarge", + VCPU: 16, + MemoryMb: 32768, + GPU: 0, + }, + "a1.large": { + InstanceType: "a1.large", + VCPU: 2, + MemoryMb: 4096, + GPU: 0, + }, + "a1.medium": { + InstanceType: "a1.medium", + VCPU: 1, + MemoryMb: 2048, + GPU: 0, + }, + "a1.xlarge": { + InstanceType: "a1.xlarge", + VCPU: 4, + MemoryMb: 8192, + GPU: 0, + }, "c1.medium": { InstanceType: "c1.medium", VCPU: 2, @@ -195,6 +231,48 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 8192, GPU: 0, }, + "c5n": { + InstanceType: "c5n", + VCPU: 72, + MemoryMb: 0, + GPU: 0, + }, + "c5n.18xlarge": { + InstanceType: "c5n.18xlarge", + VCPU: 72, + MemoryMb: 196608, + GPU: 0, + }, + "c5n.2xlarge": { + InstanceType: "c5n.2xlarge", + VCPU: 8, + MemoryMb: 21504, + GPU: 0, + }, + "c5n.4xlarge": { + InstanceType: "c5n.4xlarge", + VCPU: 16, + MemoryMb: 43008, + GPU: 0, + }, + "c5n.9xlarge": { + InstanceType: "c5n.9xlarge", + VCPU: 36, + MemoryMb: 98304, + GPU: 0, + }, + "c5n.large": { + InstanceType: "c5n.large", + VCPU: 2, + MemoryMb: 5376, + GPU: 0, + }, + "c5n.xlarge": { + InstanceType: "c5n.xlarge", + VCPU: 4, + MemoryMb: 10752, + GPU: 0, + }, "cc2.8xlarge": { InstanceType: "cc2.8xlarge", VCPU: 32, @@ -307,7 +385,7 @@ var InstanceTypes = map[string]*instanceType{ InstanceType: "g3s.xlarge", VCPU: 4, MemoryMb: 31232, - GPU: 0, + GPU: 1, }, "h1": { InstanceType: "h1", @@ -573,6 
+651,12 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 8192, GPU: 0, }, + "m5.metal": { + InstanceType: "m5.metal", + VCPU: 96, + MemoryMb: 393216, + GPU: 0, + }, "m5.xlarge": { InstanceType: "m5.xlarge", VCPU: 4, @@ -615,6 +699,42 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 16384, GPU: 0, }, + "m5ad.12xlarge": { + InstanceType: "m5ad.12xlarge", + VCPU: 48, + MemoryMb: 196608, + GPU: 0, + }, + "m5ad.24xlarge": { + InstanceType: "m5ad.24xlarge", + VCPU: 96, + MemoryMb: 393216, + GPU: 0, + }, + "m5ad.2xlarge": { + InstanceType: "m5ad.2xlarge", + VCPU: 8, + MemoryMb: 32768, + GPU: 0, + }, + "m5ad.4xlarge": { + InstanceType: "m5ad.4xlarge", + VCPU: 16, + MemoryMb: 65536, + GPU: 0, + }, + "m5ad.large": { + InstanceType: "m5ad.large", + VCPU: 2, + MemoryMb: 8192, + GPU: 0, + }, + "m5ad.xlarge": { + InstanceType: "m5ad.xlarge", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + }, "m5d": { InstanceType: "m5d", VCPU: 96, @@ -651,6 +771,12 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 8192, GPU: 0, }, + "m5d.metal": { + InstanceType: "m5d.metal", + VCPU: 96, + MemoryMb: 393216, + GPU: 0, + }, "m5d.xlarge": { InstanceType: "m5d.xlarge", VCPU: 4, @@ -705,6 +831,18 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 249856, GPU: 4, }, + "p3dn": { + InstanceType: "p3dn", + VCPU: 96, + MemoryMb: 786432, + GPU: 8, + }, + "p3dn.24xlarge": { + InstanceType: "p3dn.24xlarge", + VCPU: 96, + MemoryMb: 786432, + GPU: 8, + }, "r3": { InstanceType: "r3", VCPU: 32, @@ -819,6 +957,12 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 16384, GPU: 0, }, + "r5.metal": { + InstanceType: "r5.metal", + VCPU: 96, + MemoryMb: 786432, + GPU: 0, + }, "r5.xlarge": { InstanceType: "r5.xlarge", VCPU: 4, @@ -861,6 +1005,42 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 32768, GPU: 0, }, + "r5ad.12xlarge": { + InstanceType: "r5ad.12xlarge", + VCPU: 48, + MemoryMb: 393216, + GPU: 0, + }, + "r5ad.24xlarge": { + InstanceType: "r5ad.24xlarge", + VCPU: 
96, + MemoryMb: 786432, + GPU: 0, + }, + "r5ad.2xlarge": { + InstanceType: "r5ad.2xlarge", + VCPU: 8, + MemoryMb: 65536, + GPU: 0, + }, + "r5ad.4xlarge": { + InstanceType: "r5ad.4xlarge", + VCPU: 16, + MemoryMb: 131072, + GPU: 0, + }, + "r5ad.large": { + InstanceType: "r5ad.large", + VCPU: 2, + MemoryMb: 16384, + GPU: 0, + }, + "r5ad.xlarge": { + InstanceType: "r5ad.xlarge", + VCPU: 4, + MemoryMb: 32768, + GPU: 0, + }, "r5d": { InstanceType: "r5d", VCPU: 96, @@ -897,6 +1077,12 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 16384, GPU: 0, }, + "r5d.metal": { + InstanceType: "r5d.metal", + VCPU: 96, + MemoryMb: 786432, + GPU: 0, + }, "r5d.xlarge": { InstanceType: "r5d.xlarge", VCPU: 4, @@ -993,6 +1179,48 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 16384, GPU: 0, }, + "t3a.2xlarge": { + InstanceType: "t3a.2xlarge", + VCPU: 8, + MemoryMb: 32768, + GPU: 0, + }, + "t3a.large": { + InstanceType: "t3a.large", + VCPU: 2, + MemoryMb: 8192, + GPU: 0, + }, + "t3a.medium": { + InstanceType: "t3a.medium", + VCPU: 2, + MemoryMb: 4096, + GPU: 0, + }, + "t3a.micro": { + InstanceType: "t3a.micro", + VCPU: 2, + MemoryMb: 1024, + GPU: 0, + }, + "t3a.nano": { + InstanceType: "t3a.nano", + VCPU: 2, + MemoryMb: 512, + GPU: 0, + }, + "t3a.small": { + InstanceType: "t3a.small", + VCPU: 2, + MemoryMb: 2048, + GPU: 0, + }, + "t3a.xlarge": { + InstanceType: "t3a.xlarge", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + }, "u-12tb1": { InstanceType: "u-12tb1", VCPU: 448, @@ -1107,6 +1335,12 @@ var InstanceTypes = map[string]*instanceType{ MemoryMb: 16384, GPU: 0, }, + "z1d.metal": { + InstanceType: "z1d.metal", + VCPU: 48, + MemoryMb: 393216, + GPU: 0, + }, "z1d.xlarge": { InstanceType: "z1d.xlarge", VCPU: 4, diff --git a/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml b/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml index f7a8290aeb22..221531ec7cd4 100644 --- 
a/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml +++ b/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml @@ -42,7 +42,7 @@ rules: resources: ["poddisruptionbudgets"] verbs: ["watch","list"] - apiGroups: ["apps"] - resources: ["statefulsets"] + resources: ["statefulsets", "replicasets"] verbs: ["watch","list","get"] - apiGroups: ["storage.k8s.io"] resources: ["storageclasses"] @@ -121,7 +121,7 @@ spec: spec: serviceAccountName: cluster-autoscaler containers: - - image: k8s.gcr.io/cluster-autoscaler:v1.2.2 + - image: k8s.gcr.io/cluster-autoscaler:v1.12.7 name: cluster-autoscaler resources: limits: @@ -146,4 +146,4 @@ spec: volumes: - name: ssl-certs hostPath: - path: "/etc/ssl/certs/ca-certificates.crt" + path: "/etc/ssl/certs/ca-bundle.crt" diff --git a/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-multi-asg.yaml b/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-multi-asg.yaml index e047f32b43e3..832acd58b503 100644 --- a/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-multi-asg.yaml +++ b/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-multi-asg.yaml @@ -42,7 +42,7 @@ rules: resources: ["poddisruptionbudgets"] verbs: ["watch","list"] - apiGroups: ["apps"] - resources: ["statefulsets"] + resources: ["statefulsets", "replicasets"] verbs: ["watch","list","get"] - apiGroups: ["storage.k8s.io"] resources: ["storageclasses"] @@ -121,7 +121,7 @@ spec: spec: serviceAccountName: cluster-autoscaler containers: - - image: k8s.gcr.io/cluster-autoscaler:v1.2.2 + - image: k8s.gcr.io/cluster-autoscaler:v1.12.7 name: cluster-autoscaler resources: limits: @@ -147,4 +147,4 @@ spec: volumes: - name: ssl-certs hostPath: - path: "/etc/ssl/certs/ca-certificates.crt" + path: "/etc/ssl/certs/ca-bundle.crt" diff --git a/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-one-asg.yaml 
b/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-one-asg.yaml index 7c9f387eecd8..2803a0192b17 100644 --- a/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-one-asg.yaml +++ b/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-one-asg.yaml @@ -42,7 +42,7 @@ rules: resources: ["poddisruptionbudgets"] verbs: ["watch","list"] - apiGroups: ["apps"] - resources: ["statefulsets"] + resources: ["statefulsets", "replicasets"] verbs: ["watch","list","get"] - apiGroups: ["storage.k8s.io"] resources: ["storageclasses"] @@ -121,7 +121,7 @@ spec: spec: serviceAccountName: cluster-autoscaler containers: - - image: k8s.gcr.io/cluster-autoscaler:v1.2.2 + - image: k8s.gcr.io/cluster-autoscaler:v1.12.7 name: cluster-autoscaler resources: limits: @@ -145,4 +145,4 @@ spec: volumes: - name: ssl-certs hostPath: - path: "/etc/ssl/certs/ca-certificates.crt" + path: "/etc/ssl/certs/ca-bundle.crt" diff --git a/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-run-on-master.yaml b/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-run-on-master.yaml index 9b94f97e4e8f..69a9108637a3 100644 --- a/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-run-on-master.yaml +++ b/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-run-on-master.yaml @@ -42,7 +42,7 @@ rules: resources: ["poddisruptionbudgets"] verbs: ["watch","list"] - apiGroups: ["apps"] - resources: ["statefulsets"] + resources: ["statefulsets", "replicasets"] verbs: ["watch","list","get"] - apiGroups: ["storage.k8s.io"] resources: ["storageclasses"] @@ -126,7 +126,7 @@ spec: nodeSelector: kubernetes.io/role: master containers: - - image: k8s.gcr.io/cluster-autoscaler:v1.2.2 + - image: k8s.gcr.io/cluster-autoscaler:v1.12.7 name: cluster-autoscaler resources: limits: @@ -150,4 +150,4 @@ spec: volumes: - name: ssl-certs hostPath: - path: "/etc/ssl/certs/ca-certificates.crt" + path: "/etc/ssl/certs/ca-bundle.crt"