From 5c16aa3e280cca26009424144dd03e64e0caf35e Mon Sep 17 00:00:00 2001 From: Benjamin Pineau Date: Thu, 8 Apr 2021 13:34:34 +0200 Subject: [PATCH 1/6] Set maxAsgNamesPerDescribe to the new maximum value While this was previously effectively limited to 50, `DescribeAutoScalingGroups` now supports fetching 100 ASG per calls on all regions, matching what's documented: https://docs.aws.amazon.com/autoscaling/ec2/APIReference/API_DescribeAutoScalingGroups.html ``` AutoScalingGroupNames.member.N The names of the Auto Scaling groups. By default, you can only specify up to 50 names. You can optionally increase this limit using the MaxRecords parameter. MaxRecords The maximum number of items to return with this call. The default value is 50 and the maximum value is 100. ``` Doubling this halves API calls on large clusters, which should help to prevent throttling. --- .../cloudprovider/aws/auto_scaling_test.go | 14 +++++++------- .../cloudprovider/aws/aws_manager.go | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/aws/auto_scaling_test.go b/cluster-autoscaler/cloudprovider/aws/auto_scaling_test.go index 90e5dba7e261..8b85887574e8 100644 --- a/cluster-autoscaler/cloudprovider/aws/auto_scaling_test.go +++ b/cluster-autoscaler/cloudprovider/aws/auto_scaling_test.go @@ -27,22 +27,22 @@ import ( "github.com/stretchr/testify/require" ) -func TestMoreThen50Groups(t *testing.T) { +func TestMoreThen100Groups(t *testing.T) { service := &AutoScalingMock{} autoScalingWrapper := &autoScalingWrapper{ autoScaling: service, } - // Generate 51 ASG names - names := make([]string, 51) + // Generate 101 ASG names + names := make([]string, 101) for i := 0; i < len(names); i++ { names[i] = fmt.Sprintf("asg-%d", i) } - // First batch, first 50 elements + // First batch, first 100 elements service.On("DescribeAutoScalingGroupsPages", &autoscaling.DescribeAutoScalingGroupsInput{ - AutoScalingGroupNames: aws.StringSlice(names[:50]), + AutoScalingGroupNames: aws.StringSlice(names[:100]), MaxRecords: aws.Int64(maxRecordsReturnedByAPI), }, mock.AnythingOfType("func(*autoscaling.DescribeAutoScalingGroupsOutput, bool) bool"), @@ -51,10 +51,10 @@ func TestMoreThen50Groups(t *testing.T) { fn(testNamedDescribeAutoScalingGroupsOutput("asg-1", 1, "test-instance-id"), false) }).Return(nil) - // Second batch, element 51 + // Second batch, element 101 service.On("DescribeAutoScalingGroupsPages", &autoscaling.DescribeAutoScalingGroupsInput{ - AutoScalingGroupNames: aws.StringSlice([]string{"asg-50"}), + AutoScalingGroupNames: aws.StringSlice([]string{"asg-100"}), MaxRecords: aws.Int64(maxRecordsReturnedByAPI), }, mock.AnythingOfType("func(*autoscaling.DescribeAutoScalingGroupsOutput, bool) bool"), diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go index d9f71bec8281..33de44085f37 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go @@ -49,7 +49,7 @@ const ( operationWaitTimeout = 5 * time.Second operationPollInterval = 100 * time.Millisecond maxRecordsReturnedByAPI = 100 - maxAsgNamesPerDescribe = 50 + maxAsgNamesPerDescribe = 100 refreshInterval = 1 * time.Minute autoDiscovererTypeASG = "asg" asgAutoDiscovererKeyTag = "tag" From 5a90b8d1fafb9b9cc92158b84939697d7255a909 Mon Sep 17 00:00:00 2001 From: Adrian Lai Date: Wed, 7 Jul 2021 16:00:57 +0100 Subject: [PATCH 2/6] Break out unmarshal from GenerateEC2InstanceTypes Refactor to allow for optimisation --- .../cloudprovider/aws/aws_util.go | 36 ++++-- .../cloudprovider/aws/aws_util_test.go | 119 +++++++++++++++++- 2 files changed, 143 insertions(+), 12 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_util.go b/cluster-autoscaler/cloudprovider/aws/aws_util.go index 3ace0a6fd9b3..6bf4dd5b94e8 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_util.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_util.go @@ -20,14 +20,20 @@ import ( "encoding/json" "errors" "fmt" - "github.com/aws/aws-sdk-go/aws/endpoints" + "io" "io/ioutil" - klog "k8s.io/klog/v2" "net/http" "os" "regexp" "strconv" "strings" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/ec2metadata" + "github.com/aws/aws-sdk-go/aws/endpoints" + "github.com/aws/aws-sdk-go/aws/session" + + klog "k8s.io/klog/v2" ) var ( @@ -82,16 +88,9 @@ func GenerateEC2InstanceTypes(region string) (map[string]*InstanceType, error) { defer res.Body.Close() - body, err := ioutil.ReadAll(res.Body) - if err != nil { - klog.Warningf("Error parsing %s skipping...\n", url) - continue - } - - var unmarshalled = response{} - err = json.Unmarshal(body, &unmarshalled) + unmarshalled, err := unmarshalProductsResponse(res.Body) if err != nil { - klog.Warningf("Error unmarshalling %s, skip...\n", url) + klog.Warningf("Error parsing %s skipping...\n%s\n", url, err) continue } @@ -127,6 +126,21 @@ func GetStaticEC2InstanceTypes() (map[string]*InstanceType, string) { return InstanceTypes, staticListLastUpdateTime } +func unmarshalProductsResponse(r io.Reader) (*response, error) { + body, err := ioutil.ReadAll(r) + if err != nil { + return nil, err + } + + var unmarshalled = response{} + err = json.Unmarshal(body, &unmarshalled) + if err != nil { + return nil, err + } + + return &unmarshalled, nil +} + func parseMemory(memory string) int64 { reg, err := regexp.Compile("[^0-9\\.]+") if err != nil { diff --git a/cluster-autoscaler/cloudprovider/aws/aws_util_test.go b/cluster-autoscaler/cloudprovider/aws/aws_util_test.go index 6027babd8900..e29860b41f6c 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_util_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_util_test.go @@ -17,12 +17,14 @@ limitations under the License. package aws import ( - "github.com/stretchr/testify/assert" "net/http" "net/http/httptest" "os" "strconv" + "strings" "testing" + + "github.com/stretchr/testify/assert" ) func TestGetStaticEC2InstanceTypes(t *testing.T) { @@ -111,3 +113,118 @@ func TestGetCurrentAwsRegionWithRegionEnv(t *testing.T) { assert.Nil(t, err) assert.Equal(t, region, result) } + +func TestUnmarshalProductsResponse(t *testing.T) { + body := ` +{ + "products": { + "VVD8BG8WWFD3DAZN" : { + "sku" : "VVD8BG8WWFD3DAZN", + "productFamily" : "Compute Instance", + "attributes" : { + "servicecode" : "AmazonEC2", + "location" : "US East (N. Virginia)", + "locationType" : "AWS Region", + "instanceType" : "r5b.4xlarge", + "currentGeneration" : "Yes", + "instanceFamily" : "Memory optimized", + "vcpu" : "16", + "physicalProcessor" : "Intel Xeon Platinum 8259 (Cascade Lake)", + "clockSpeed" : "3.1 GHz", + "memory" : "128 GiB", + "storage" : "EBS only", + "networkPerformance" : "Up to 10 Gigabit", + "processorArchitecture" : "64-bit", + "tenancy" : "Shared", + "operatingSystem" : "Linux", + "licenseModel" : "No License required", + "usagetype" : "UnusedBox:r5b.4xlarge", + "operation" : "RunInstances:0004", + "availabilityzone" : "NA", + "capacitystatus" : "UnusedCapacityReservation", + "classicnetworkingsupport" : "false", + "dedicatedEbsThroughput" : "10 Gbps", + "ecu" : "NA", + "enhancedNetworkingSupported" : "Yes", + "instancesku" : "G4NFAXD9TGJM3RY8", + "intelAvxAvailable" : "Yes", + "intelAvx2Available" : "No", + "intelTurboAvailable" : "No", + "marketoption" : "OnDemand", + "normalizationSizeFactor" : "32", + "preInstalledSw" : "SQL Std", + "servicename" : "Amazon Elastic Compute Cloud", + "vpcnetworkingsupport" : "true" + } + }, + "C36QEQQQJ8ZR7N32" : { + "sku" : "C36QEQQQJ8ZR7N32", + "productFamily" : "Compute Instance", + "attributes" : { + "servicecode" : "AmazonEC2", + "location" : "US East (N. Virginia)", + "locationType" : "AWS Region", + "instanceType" : "d3en.8xlarge", + "currentGeneration" : "Yes", + "instanceFamily" : "Storage optimized", + "vcpu" : "32", + "physicalProcessor" : "Intel Xeon Platinum 8259 (Cascade Lake)", + "clockSpeed" : "3.1 GHz", + "memory" : "128 GiB", + "storage" : "16 x 14000 HDD", + "networkPerformance" : "50 Gigabit", + "processorArchitecture" : "64-bit", + "tenancy" : "Dedicated", + "operatingSystem" : "SUSE", + "licenseModel" : "No License required", + "usagetype" : "DedicatedRes:d3en.8xlarge", + "operation" : "RunInstances:000g", + "availabilityzone" : "NA", + "capacitystatus" : "AllocatedCapacityReservation", + "classicnetworkingsupport" : "false", + "dedicatedEbsThroughput" : "5000 Mbps", + "ecu" : "NA", + "enhancedNetworkingSupported" : "Yes", + "instancesku" : "2XW3BCEZ83WMGFJY", + "intelAvxAvailable" : "Yes", + "intelAvx2Available" : "Yes", + "intelTurboAvailable" : "Yes", + "marketoption" : "OnDemand", + "normalizationSizeFactor" : "64", + "preInstalledSw" : "NA", + "processorFeatures" : "AVX; AVX2; Intel AVX; Intel AVX2; Intel AVX512; Intel Turbo", + "servicename" : "Amazon Elastic Compute Cloud", + "vpcnetworkingsupport" : "true" + } + } + } +} +` + r := strings.NewReader(body) + resp, err := unmarshalProductsResponse(r) + assert.Nil(t, err) + assert.Len(t, resp.Products, 2) + assert.NotNil(t, resp.Products["VVD8BG8WWFD3DAZN"]) + assert.NotNil(t, resp.Products["C36QEQQQJ8ZR7N32"]) + assert.Equal(t, resp.Products["VVD8BG8WWFD3DAZN"].Attributes.InstanceType, "r5b.4xlarge") + assert.Equal(t, resp.Products["C36QEQQQJ8ZR7N32"].Attributes.InstanceType, "d3en.8xlarge") + + invalidJsonTests := map[string]string{ + "[": "[", + "]": "]", + "}": "}", + "{": "{", + "Plain text": "invalid", + "List": "[]", + "Invalid products ([])": `{"products":[]}`, + "Invalid product ([])": `{"products":{"zz":[]}}`, + } + for name, body := range invalidJsonTests { + t.Run(name, func(t *testing.T) { + r := strings.NewReader(body) + resp, err := unmarshalProductsResponse(r) + assert.NotNil(t, err) + assert.Nil(t, resp) + }) + } +} From 819c6d1f4b86caacf637883b6b15000d08e2fe26 Mon Sep 17 00:00:00 2001 From: Adrian Lai Date: Wed, 7 Jul 2021 16:31:28 +0100 Subject: [PATCH 3/6] Optimise GenerateEC2InstanceTypes unmarshal memory usage The pricing json for us-east-1 is currently 129MB. Currently fetching this into memory and parsing results in a large memory footprint on startup, and can lead to the autoscaler being OOMKilled. Change the ReadAll/Unmarshal logic to a stream decoder to significantly reduce the memory use. --- .../cloudprovider/aws/aws_util.go | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_util.go b/cluster-autoscaler/cloudprovider/aws/aws_util.go index 6bf4dd5b94e8..9dc474460b04 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_util.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_util.go @@ -21,7 +21,6 @@ import ( "errors" "fmt" "io" - "io/ioutil" "net/http" "os" "regexp" @@ -127,16 +126,53 @@ func GetStaticEC2InstanceTypes() (map[string]*InstanceType, string) { } func unmarshalProductsResponse(r io.Reader) (*response, error) { - body, err := ioutil.ReadAll(r) + dec := json.NewDecoder(r) + t, err := dec.Token() if err != nil { return nil, err } + if delim, ok := t.(json.Delim); !ok || delim.String() != "{" { + return nil, errors.New("Invalid products json") + } + + unmarshalled := response{map[string]product{}} + + for dec.More() { + t, err = dec.Token() + if err != nil { + return nil, err + } + + if t == "products" { + tt, err := dec.Token() + if err != nil { + return nil, err + } + if delim, ok := tt.(json.Delim); !ok || delim.String() != "{" { + return nil, errors.New("Invalid products json") + } + for dec.More() { + productCode, err := dec.Token() + if err != nil { + return nil, err + } + + prod := product{} + if err = dec.Decode(&prod); err != nil { + return nil, err + } + unmarshalled.Products[productCode.(string)] = prod + } + } + } - var unmarshalled = response{} - err = json.Unmarshal(body, &unmarshalled) + t, err = dec.Token() if err != nil { return nil, err } + if delim, ok := t.(json.Delim); !ok || delim.String() != "}" { + return nil, errors.New("Invalid products json") + } return &unmarshalled, nil } From cfcbe00eb93be3253f3c123993093f6a76a4c151 Mon Sep 17 00:00:00 2001 From: darkpssngr Date: Wed, 9 Jun 2021 11:23:44 +0530 Subject: [PATCH 4/6] use aws sdk to find region --- .../cloudprovider/aws/aws_util.go | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_util.go b/cluster-autoscaler/cloudprovider/aws/aws_util.go index 9dc474460b04..62e0d31eff0e 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_util.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_util.go @@ -36,7 +36,7 @@ import ( ) var ( - ec2MetaDataServiceUrl = "http://169.254.169.254/latest/dynamic/instance-identity/document" + ec2MetaDataServiceUrl = "http://169.254.169.254" ec2PricingServiceUrlTemplate = "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/%s/index.json" ec2PricingServiceUrlTemplateCN = "https://pricing.cn-north-1.amazonaws.com.cn/offers/v1.0/cn/AmazonEC2/current/%s/index.json" staticListLastUpdateTime = "2019-10-14" @@ -205,26 +205,13 @@ func GetCurrentAwsRegion() (string, error) { region, present := os.LookupEnv("AWS_REGION") if !present { - klog.V(1).Infof("fetching %s\n", ec2MetaDataServiceUrl) - res, err := http.Get(ec2MetaDataServiceUrl) + c := aws.NewConfig(). + WithEndpoint(ec2MetaDataServiceUrl) + sess, err := session.NewSession() if err != nil { - return "", fmt.Errorf("Error fetching %s", ec2MetaDataServiceUrl) + return "", fmt.Errorf("failed to create session") } - - defer res.Body.Close() - - body, err := ioutil.ReadAll(res.Body) - if err != nil { - return "", fmt.Errorf("Error parsing %s", ec2MetaDataServiceUrl) - } - - var unmarshalled = map[string]string{} - err = json.Unmarshal(body, &unmarshalled) - if err != nil { - klog.Warningf("Error unmarshalling %s, skip...\n", ec2MetaDataServiceUrl) - } - - region = unmarshalled["region"] + return ec2metadata.New(sess, c).Region() } return region, nil From 9cad9d89c485e011f9c9246f0d3857bd3b07b3f8 Mon Sep 17 00:00:00 2001 From: darkpssngr Date: Wed, 9 Jun 2021 11:24:27 +0530 Subject: [PATCH 5/6] update readme --- cluster-autoscaler/cloudprovider/aws/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cluster-autoscaler/cloudprovider/aws/README.md b/cluster-autoscaler/cloudprovider/aws/README.md index a07dcc1d22f1..0cd32506ebc4 100644 --- a/cluster-autoscaler/cloudprovider/aws/README.md +++ b/cluster-autoscaler/cloudprovider/aws/README.md @@ -354,3 +354,6 @@ To refresh static list, please run `go run ec2_instance_types/gen.go` under `aws:///us-east-1a/i-01234abcdef`. * If you want to use regional STS endpoints (e.g. when using VPC endpoint for STS) the env `AWS_STS_REGIONAL_ENDPOINTS=regional` should be set. +* If you want to run it on instances with IMDSv1 disabled make sure your + EC2 launch configuration has the setting `Metadata response hop limit` set to `2`. + Otherwise, the `/latest/api/token` call will timeout and result in an error. From eeedfb92c71e29f02638ca12a130f06ca5627457 Mon Sep 17 00:00:00 2001 From: darkpssngr Date: Tue, 20 Jul 2021 20:53:06 +0530 Subject: [PATCH 6/6] Update cluster-autoscaler/cloudprovider/aws/README.md Co-authored-by: Guy Templeton --- cluster-autoscaler/cloudprovider/aws/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster-autoscaler/cloudprovider/aws/README.md b/cluster-autoscaler/cloudprovider/aws/README.md index 0cd32506ebc4..e4e6e2684ece 100644 --- a/cluster-autoscaler/cloudprovider/aws/README.md +++ b/cluster-autoscaler/cloudprovider/aws/README.md @@ -356,4 +356,4 @@ To refresh static list, please run `go run ec2_instance_types/gen.go` under STS) the env `AWS_STS_REGIONAL_ENDPOINTS=regional` should be set. * If you want to run it on instances with IMDSv1 disabled make sure your EC2 launch configuration has the setting `Metadata response hop limit` set to `2`. - Otherwise, the `/latest/api/token` call will timeout and result in an error. + Otherwise, the `/latest/api/token` call will timeout and result in an error. See [AWS docs here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html#configuring-instance-metadata-options) for further information.