diff --git a/cluster-autoscaler/cloudprovider/aws/README.md b/cluster-autoscaler/cloudprovider/aws/README.md index 651d90911616..8d630801ad1a 100644 --- a/cluster-autoscaler/cloudprovider/aws/README.md +++ b/cluster-autoscaler/cloudprovider/aws/README.md @@ -145,6 +145,20 @@ be labeled or tainted when they join the cluster, such as: * `k8s.io/cluster-autoscaler/node-template/taint/dedicated`: `NoSchedule` * `k8s.io/cluster-autoscaler/node-template/taint/tier:` `batch:NoSchedule` +ASG labels can specify autoscaling options, overriding the global cluster-autoscaler +settings for the labeled ASGs. Those labels takes the same values format as the +cluter-autoscaler command line flags they override (a float or a duration, encoded +as string). Currently supported autoscaling options (and example values) are: + +* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/scaledownutilizationthreshold`: `0.5` + (overrides `--scale-down-utilization-threshold` value for that specific ASG) +* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/scaledowngpuutilizationthreshold`: `0.5` + (overrides `--scale-down-gpu-utilization-threshold` value for that specific ASG) +* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/scaledownunneededtime`: `10m0s` + (overrides `--scale-down-unneeded-time` value for that specific ASG) +* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/scaledownunreadytime`: `20m0s` + (overrides `--scale-down-unready-time` value for that specific ASG) + **NOTE:** It is your responsibility to ensure such labels and/or taints are applied via the node's kubelet configuration at startup. Cluster Autoscaler will not set the node taints for you. diff --git a/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go b/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go index 59719f40a370..8c87fecbf1d0 100644 --- a/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go +++ b/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go @@ -45,6 +45,7 @@ type asgCache struct { asgAutoDiscoverySpecs []asgAutoDiscoveryConfig explicitlyConfigured map[AwsRef]bool + autoscalingOptions map[AwsRef]map[string]string } type launchTemplate struct { @@ -80,6 +81,7 @@ func newASGCache(service autoScalingWrapper, explicitSpecs []string, autoDiscove interrupt: make(chan struct{}), asgAutoDiscoverySpecs: autoDiscoverySpecs, explicitlyConfigured: make(map[AwsRef]bool), + autoscalingOptions: make(map[AwsRef]map[string]string), } if err := registry.parseExplicitAsgs(explicitSpecs); err != nil { @@ -176,6 +178,13 @@ func (m *asgCache) Get() []*asg { return m.registeredAsgs } +// GetAutoscalingOptions return autoscaling options strings obtained from ASG tags. +func (m *asgCache) GetAutoscalingOptions(ref AwsRef) map[string]string { + m.mutex.Lock() + defer m.mutex.Unlock() + return m.autoscalingOptions[ref] +} + // FindForInstance returns AsgConfig of the given Instance func (m *asgCache) FindForInstance(instance AwsInstanceRef) *asg { m.mutex.Lock() @@ -396,8 +405,19 @@ func (m *asgCache) regenerate() error { } } + // Rebuild autoscaling options cache + newAutoscalingOptions := make(map[AwsRef]map[string]string) + for _, asg := range m.registeredAsgs { + options := extractAutoscalingOptionsFromTags(asg.Tags) + if !reflect.DeepEqual(m.autoscalingOptions[asg.AwsRef], options) { + klog.V(4).Infof("Extracted autoscaling options from %q ASG tags: %v", asg.Name, options) + } + newAutoscalingOptions[asg.AwsRef] = options + } + m.asgToInstances = newAsgToInstancesCache m.instanceToAsg = newInstanceToAsgCache + m.autoscalingOptions = newAutoscalingOptions return nil } diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go index 6445a44e7834..f6795a013889 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go @@ -219,7 +219,10 @@ func (ng *AwsNodeGroup) Delete() error { // GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular // NodeGroup. Returning a nil will result in using default options. func (ng *AwsNodeGroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { - return nil, cloudprovider.ErrNotImplemented + if ng.asg == nil || ng.asg.Tags == nil || len(ng.asg.Tags) == 0 { + return &defaults, nil + } + return ng.awsManager.GetAsgOptions(*ng.asg, defaults), nil } // IncreaseSize increases Asg size diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go index 4ae024c437d4..77f4a927e6d4 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go @@ -91,6 +91,7 @@ func newTestAwsManagerWithService(service autoScaling, autoDiscoverySpecs []asgA interrupt: make(chan struct{}), asgAutoDiscoverySpecs: autoDiscoverySpecs, service: wrapper, + autoscalingOptions: make(map[AwsRef]map[string]string), }, } } diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go index db0bfdf4e9dc..9d49ae5fde34 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go @@ -25,6 +25,7 @@ import ( "math/rand" "os" "regexp" + "strconv" "strings" "time" @@ -39,6 +40,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" + "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" klog "k8s.io/klog/v2" provider_aws "k8s.io/legacy-cloud-providers/aws" @@ -52,6 +54,7 @@ const ( refreshInterval = 1 * time.Minute autoDiscovererTypeASG = "asg" asgAutoDiscovererKeyTag = "tag" + optionsTagsPrefix = "k8s.io/cluster-autoscaler/node-template/autoscaling-options/" ) // AwsManager is handles aws communication and data caching. @@ -283,6 +286,10 @@ func (m *AwsManager) getAsgs() []*asg { return m.asgCache.Get() } +func (m *AwsManager) getAutoscalingOptions(ref AwsRef) map[string]string { + return m.asgCache.GetAutoscalingOptions(ref) +} + // SetAsgSize sets ASG size. func (m *AwsManager) SetAsgSize(asg *asg, size int) error { return m.asgCache.SetAsgSize(asg, size) @@ -348,6 +355,52 @@ func (m *AwsManager) buildInstanceType(asg *asg) (string, error) { return "", errors.New("Unable to get instance type from launch config or launch template") } +// GetAsgOptions parse options extracted from ASG tags and merges them with provided defaults +func (m *AwsManager) GetAsgOptions(asg asg, defaults config.NodeGroupAutoscalingOptions) *config.NodeGroupAutoscalingOptions { + options := m.getAutoscalingOptions(asg.AwsRef) + if options == nil || len(options) == 0 { + return &defaults + } + + if stringOpt, found := options[config.DefaultScaleDownUtilizationThresholdKey]; found { + if opt, err := strconv.ParseFloat(stringOpt, 64); err != nil { + klog.Warning("failed to convert asg %s %s tag to float: %v", + asg.Name, config.DefaultScaleDownUtilizationThresholdKey, err) + } else { + defaults.ScaleDownUtilizationThreshold = opt + } + } + + if stringOpt, found := options[config.DefaultScaleDownGpuUtilizationThresholdKey]; found { + if opt, err := strconv.ParseFloat(stringOpt, 64); err != nil { + klog.Warning("failed to convert asg %s %s tag to float: %v", + asg.Name, config.DefaultScaleDownGpuUtilizationThresholdKey, err) + } else { + defaults.ScaleDownGpuUtilizationThreshold = opt + } + } + + if stringOpt, found := options[config.DefaultScaleDownUnneededTimeKey]; found { + if opt, err := time.ParseDuration(stringOpt); err != nil { + klog.Warning("failed to convert asg %s %s tag to duration: %v", + asg.Name, config.DefaultScaleDownUnneededTimeKey, err) + } else { + defaults.ScaleDownUnneededTime = opt + } + } + + if stringOpt, found := options[config.DefaultScaleDownUnreadyTimeKey]; found { + if opt, err := time.ParseDuration(stringOpt); err != nil { + klog.Warning("failed to convert asg %s %s tag to duration: %v", + asg.Name, config.DefaultScaleDownUnreadyTimeKey, err) + } else { + defaults.ScaleDownUnreadyTime = opt + } + } + + return &defaults +} + func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*apiv1.Node, error) { node := apiv1.Node{} nodeName := fmt.Sprintf("%s-asg-%d", asg.Name, rand.Int63()) @@ -419,6 +472,21 @@ func extractLabelsFromAsg(tags []*autoscaling.TagDescription) map[string]string return result } +func extractAutoscalingOptionsFromTags(tags []*autoscaling.TagDescription) map[string]string { + options := make(map[string]string) + for _, tag := range tags { + if !strings.HasPrefix(aws.StringValue(tag.Key), optionsTagsPrefix) { + continue + } + splits := strings.Split(aws.StringValue(tag.Key), optionsTagsPrefix) + if len(splits) != 2 || splits[1] == "" { + continue + } + options[splits[1]] = aws.StringValue(tag.Value) + } + return options +} + func extractAllocatableResourcesFromAsg(tags []*autoscaling.TagDescription) map[string]*resource.Quantity { result := make(map[string]*resource.Quantity) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go b/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go index 722b70a60132..2f8bc9b09235 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go @@ -28,6 +28,7 @@ import ( "strconv" "strings" "testing" + "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/ec2metadata" @@ -38,6 +39,7 @@ import ( apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" + "k8s.io/autoscaler/cluster-autoscaler/config" provider_aws "k8s.io/legacy-cloud-providers/aws" ) @@ -112,6 +114,77 @@ func TestExtractAllocatableResourcesFromAsg(t *testing.T) { assert.Equal(t, (&expectedEphemeralStorage).String(), labels["ephemeral-storage"].String()) } +func TestGetAsgOptions(t *testing.T) { + defaultOptions := config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: 0.1, + ScaleDownGpuUtilizationThreshold: 0.2, + ScaleDownUnneededTime: time.Second, + ScaleDownUnreadyTime: time.Minute, + } + + tests := []struct { + description string + tags map[string]string + expected *config.NodeGroupAutoscalingOptions + }{ + { + description: "use defaults on unspecified tags", + tags: make(map[string]string), + expected: &defaultOptions, + }, + { + description: "keep defaults on invalid tags values", + tags: map[string]string{ + "scaledownutilizationthreshold": "not-a-float", + "scaledownunneededtime": "not-a-duration", + "ScaleDownUnreadyTime": "", + }, + expected: &defaultOptions, + }, + { + description: "use provided tags and fill missing with defaults", + tags: map[string]string{ + "scaledownutilizationthreshold": "0.42", + "scaledownunneededtime": "1h", + }, + expected: &config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: 0.42, + ScaleDownGpuUtilizationThreshold: defaultOptions.ScaleDownGpuUtilizationThreshold, + ScaleDownUnneededTime: time.Hour, + ScaleDownUnreadyTime: defaultOptions.ScaleDownUnreadyTime, + }, + }, + { + description: "ignore unknown tags", + tags: map[string]string{ + "scaledownutilizationthreshold": "0.6", + "scaledowngpuutilizationthreshold": "0.7", + "scaledownunneededtime": "1m", + "scaledownunreadytime": "1h", + "notyetspecified": "42", + }, + expected: &config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: 0.6, + ScaleDownGpuUtilizationThreshold: 0.7, + ScaleDownUnneededTime: time.Minute, + ScaleDownUnreadyTime: time.Hour, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.description, func(t *testing.T) { + testAsg := asg{AwsRef: AwsRef{Name: "testAsg"}} + cache, _ := newASGCache(autoScalingWrapper{}, []string{}, []asgAutoDiscoveryConfig{}) + cache.autoscalingOptions[testAsg.AwsRef] = tt.tags + awsManager := &AwsManager{asgCache: cache} + + actual := awsManager.GetAsgOptions(testAsg, defaultOptions) + assert.Equal(t, tt.expected, actual) + }) + } +} + func TestBuildNodeFromTemplate(t *testing.T) { awsManager := &AwsManager{} asg := &asg{AwsRef: AwsRef{Name: "test-auto-scaling-group"}}