Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automated cherry pick of #4588: Support attribute-based instance selection for AWS #5145

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cluster-autoscaler/cloudprovider/aws/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ should be updated to restrict the resources/add conditionals:
"Action": [
"autoscaling:SetDesiredCapacity",
"autoscaling:TerminateInstanceInAutoScalingGroup",
"ec2:DescribeImages",
"ec2:DescribeInstanceTypes",
"ec2:GetInstanceTypesFromInstanceRequirements",
"eks:DescribeNodegroup"
],
"Resource": ["*"]
Expand Down
31 changes: 23 additions & 8 deletions cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ type launchTemplate struct {
}

// mixedInstancesPolicy describes the launch template of an ASG's mixed
// instances policy together with the overrides declared on that template.
// NOTE(review): launchTemplate and instanceTypesOverrides each appear twice
// below; this looks like the removed and added sides of a diff rendered
// together rather than real duplicate fields — confirm against the actual file.
type mixedInstancesPolicy struct {
launchTemplate *launchTemplate
instanceTypesOverrides []string
launchTemplate *launchTemplate
instanceTypesOverrides []string
// instanceRequirementsOverrides carries attribute-based instance
// requirements taken from the launch template overrides; it may be nil.
instanceRequirementsOverrides *autoscaling.InstanceRequirements
}

type asg struct {
Expand Down Expand Up @@ -533,17 +534,31 @@ func (m *asgCache) buildAsgFromAWS(g *autoscaling.Group) (*asg, error) {
}

if g.MixedInstancesPolicy != nil {
getInstanceTypes := func(data []*autoscaling.LaunchTemplateOverrides) []string {
res := make([]string, len(data))
for i := 0; i < len(data); i++ {
res[i] = aws.StringValue(data[i].InstanceType)
getInstanceTypes := func(overrides []*autoscaling.LaunchTemplateOverrides) []string {
res := []string{}
for _, override := range overrides {
if override.InstanceType != nil {
res = append(res, *override.InstanceType)
}
}
return res
}

getInstanceTypeRequirements := func(overrides []*autoscaling.LaunchTemplateOverrides) *autoscaling.InstanceRequirements {
if len(overrides) == 1 && overrides[0].InstanceRequirements != nil {
return overrides[0].InstanceRequirements
}
return nil
}

asg.MixedInstancesPolicy = &mixedInstancesPolicy{
launchTemplate: buildLaunchTemplateFromSpec(g.MixedInstancesPolicy.LaunchTemplate.LaunchTemplateSpecification),
instanceTypesOverrides: getInstanceTypes(g.MixedInstancesPolicy.LaunchTemplate.Overrides),
launchTemplate: buildLaunchTemplateFromSpec(g.MixedInstancesPolicy.LaunchTemplate.LaunchTemplateSpecification),
instanceTypesOverrides: getInstanceTypes(g.MixedInstancesPolicy.LaunchTemplate.Overrides),
instanceRequirementsOverrides: getInstanceTypeRequirements(g.MixedInstancesPolicy.LaunchTemplate.Overrides),
}

if len(asg.MixedInstancesPolicy.instanceTypesOverrides) != 0 && asg.MixedInstancesPolicy.instanceRequirementsOverrides != nil {
return nil, fmt.Errorf("invalid setup of both instance type and instance requirements overrides configured")
}
}

Expand Down
57 changes: 57 additions & 0 deletions cluster-autoscaler/cloudprovider/aws/aws_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ func (m *AwsManager) getAsgTemplate(asg *asg) (*asgTemplate, error) {
Tags: asg.Tags,
}, nil
}

return nil, fmt.Errorf("ASG %q uses the unknown EC2 instance type %q", asg.Name, instanceTypeName)
}

Expand Down Expand Up @@ -404,6 +405,10 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap
node.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(template.InstanceType.GPU, resource.DecimalSI)
node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(template.InstanceType.MemoryMb*1024*1024, resource.DecimalSI)

if err := m.updateCapacityWithRequirementsOverrides(&node.Status.Capacity, asg.MixedInstancesPolicy); err != nil {
return nil, err
}

resourcesFromTags := extractAllocatableResourcesFromAsg(template.Tags)
for resourceName, val := range resourcesFromTags {
node.Status.Capacity[apiv1.ResourceName(resourceName)] = *val
Expand Down Expand Up @@ -464,6 +469,58 @@ func joinNodeLabelsChoosingUserValuesOverAPIValues(extractedLabels map[string]st
return result
}

// updateCapacityWithRequirementsOverrides overwrites entries of capacity with
// the minimum values declared by the instance requirements attached to policy.
//
// A nil policy is a no-op. Otherwise the requirements are resolved through
// getInstanceRequirementsFromMixedInstancesPolicy and, where set:
//   - VCpuCount.Min replaces the CPU capacity,
//   - MemoryMiB.Min (multiplied by 1024*1024) replaces the memory capacity,
//   - AcceleratorCount.Min replaces the NVIDIA GPU capacity, but only when the
//     requirements list both the NVIDIA manufacturer and the GPU accelerator
//     type.
//
// It returns an error only when the requirements cannot be resolved.
func (m *AwsManager) updateCapacityWithRequirementsOverrides(capacity *apiv1.ResourceList, policy *mixedInstancesPolicy) error {
	if policy == nil {
		return nil
	}

	instanceRequirements, err := m.getInstanceRequirementsFromMixedInstancesPolicy(policy)
	if err != nil {
		// %w keeps the rendered message identical while letting callers unwrap.
		return fmt.Errorf("error while building node template using instance requirements: (%w)", err)
	}
	if instanceRequirements == nil {
		// Nothing to apply; guards against a resolver that yields no requirements.
		return nil
	}

	if instanceRequirements.VCpuCount != nil && instanceRequirements.VCpuCount.Min != nil {
		(*capacity)[apiv1.ResourceCPU] = *resource.NewQuantity(*instanceRequirements.VCpuCount.Min, resource.DecimalSI)
	}

	if instanceRequirements.MemoryMiB != nil && instanceRequirements.MemoryMiB.Min != nil {
		(*capacity)[apiv1.ResourceMemory] = *resource.NewQuantity(*instanceRequirements.MemoryMiB.Min*1024*1024, resource.DecimalSI)
	}

	// hasValue reports whether values contains want, skipping nil entries so a
	// sparse slice cannot cause a nil dereference.
	hasValue := func(values []*string, want string) bool {
		for _, v := range values {
			if v != nil && *v == want {
				return true
			}
		}
		return false
	}

	// The accelerator count is optional: requirements may name a manufacturer
	// and type without one, so it must be checked before dereferencing.
	acceleratorCount := instanceRequirements.AcceleratorCount
	if acceleratorCount != nil && acceleratorCount.Min != nil &&
		hasValue(instanceRequirements.AcceleratorManufacturers, autoscaling.AcceleratorManufacturerNvidia) &&
		hasValue(instanceRequirements.AcceleratorTypes, autoscaling.AcceleratorTypeGpu) {
		(*capacity)[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(*acceleratorCount.Min, resource.DecimalSI)
	}

	return nil
}

// getInstanceRequirementsFromMixedInstancesPolicy resolves the EC2 instance
// requirements that apply to policy.
//
// Requirements set directly on the policy's overrides take precedence and are
// converted via the AWS service wrapper. Failing that, the policy's launch
// template data is fetched and its requirements are used when present. If
// neither source yields requirements, an empty (non-nil) value is returned.
// Any error from the AWS service wrapper is returned with a nil result.
func (m *AwsManager) getInstanceRequirementsFromMixedInstancesPolicy(policy *mixedInstancesPolicy) (*ec2.InstanceRequirements, error) {
	switch {
	case policy.instanceRequirementsOverrides != nil:
		converted, err := m.awsService.getEC2RequirementsFromAutoscaling(policy.instanceRequirementsOverrides)
		if err != nil {
			return nil, err
		}
		return converted, nil

	case policy.launchTemplate != nil:
		lt := policy.launchTemplate
		data, err := m.awsService.getLaunchTemplateData(lt.name, lt.version)
		if err != nil {
			return nil, err
		}
		if fromTemplate := data.InstanceRequirements; fromTemplate != nil {
			return fromTemplate, nil
		}
	}

	// No requirements configured anywhere: hand back an empty set.
	return &ec2.InstanceRequirements{}, nil
}

func buildGenericLabels(template *asgTemplate, nodeName string) map[string]string {
result := make(map[string]string)

Expand Down
32 changes: 32 additions & 0 deletions cluster-autoscaler/cloudprovider/aws/aws_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
provider_aws "k8s.io/legacy-cloud-providers/aws"
)

Expand Down Expand Up @@ -468,6 +469,37 @@ func TestBuildNodeFromTemplate(t *testing.T) {
observedTaints := observedNode.Spec.Taints
assert.Equal(t, 1, len(observedTaints))
assert.Equal(t, gpuTaint, observedTaints[0])

// Node with instance requirements
asg.MixedInstancesPolicy = &mixedInstancesPolicy{
instanceRequirementsOverrides: &autoscaling.InstanceRequirements{
VCpuCount: &autoscaling.VCpuCountRequest{
Min: aws.Int64(4),
Max: aws.Int64(8),
},
MemoryMiB: &autoscaling.MemoryMiBRequest{
Min: aws.Int64(4),
Max: aws.Int64(8),
},
AcceleratorTypes: []*string{aws.String(autoscaling.AcceleratorTypeGpu)},
AcceleratorManufacturers: []*string{aws.String(autoscaling.AcceleratorManufacturerNvidia)},
AcceleratorCount: &autoscaling.AcceleratorCountRequest{
Min: aws.Int64(4),
Max: aws.Int64(8),
},
},
}
observedNode, observedErr = awsManager.buildNodeFromTemplate(asg, &asgTemplate{
InstanceType: c5Instance,
})

assert.NoError(t, observedErr)
observedMemoryRequirement := observedNode.Status.Capacity[apiv1.ResourceMemory]
assert.Equal(t, int64(4*1024*1024), observedMemoryRequirement.Value())
observedVCpuRequirement := observedNode.Status.Capacity[apiv1.ResourceCPU]
assert.Equal(t, int64(4), observedVCpuRequirement.Value())
observedGpuRequirement := observedNode.Status.Capacity[gpu.ResourceNvidiaGPU]
assert.Equal(t, int64(4), observedGpuRequirement.Value())
}

func TestExtractLabelsFromAsg(t *testing.T) {
Expand Down
Loading