From a76493fe6d852b42ea438e9d643a4c79d031429e Mon Sep 17 00:00:00 2001 From: Ratnadeep Debnath Date: Thu, 21 Jan 2021 20:54:55 +0530 Subject: [PATCH] Initial work on explicity specify labels to use for node balancing. --- .../config/autoscaling_options.go | 3 +++ cluster-autoscaler/core/scale_test_common.go | 2 +- cluster-autoscaler/main.go | 4 +++- .../processors/nodegroupset/aws_nodegroups.go | 9 +++++++-- .../nodegroupset/azure_nodegroups.go | 10 ++++++++-- .../nodegroupset/azure_nodegroups_test.go | 6 +++--- .../nodegroupset/balancing_processor_test.go | 10 +++++----- .../nodegroupset/compare_nodegroups.go | 18 +++++++++++++++--- .../nodegroupset/compare_nodegroups_test.go | 14 +++++++------- .../processors/nodegroupset/gce_nodegroups.go | 9 +++++++-- .../nodegroupset/nodegroup_set_processor.go | 4 ++-- cluster-autoscaler/processors/processors.go | 2 +- 12 files changed, 62 insertions(+), 29 deletions(-) diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index de038ad6c7d5..21d61c685956 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -138,6 +138,9 @@ type AutoscalingOptions struct { // BalancingExtraIgnoredLabels is a list of labels to additionally ignore when comparing if two node groups are similar. // Labels in BasicIgnoredLabels and the cloud provider-specific ignored labels are always ignored. BalancingExtraIgnoredLabels []string + // BalancingNodeLabels is a list of labels to consider when comparing if two node groups are similar. + // Labels in BasicIgnoredLabels and the cloud provider-specific ignored labels are always ignored. + BalancingNodeLabels []string // AWSUseStaticInstanceList tells if AWS cloud provider use static instance type list or dynamically fetch from remote APIs. AWSUseStaticInstanceList bool // ConcurrentGceRefreshes is the maximum number of concurrently refreshed instance groups or instance templates. diff --git a/cluster-autoscaler/core/scale_test_common.go b/cluster-autoscaler/core/scale_test_common.go index 8779ae670a9d..e8925c1a7628 100644 --- a/cluster-autoscaler/core/scale_test_common.go +++ b/cluster-autoscaler/core/scale_test_common.go @@ -133,7 +133,7 @@ func NewTestProcessors() *processors.AutoscalingProcessors { return &processors.AutoscalingProcessors{ PodListProcessor: NewFilterOutSchedulablePodListProcessor(), NodeGroupListProcessor: &nodegroups.NoOpNodeGroupListProcessor{}, - NodeGroupSetProcessor: nodegroupset.NewDefaultNodeGroupSetProcessor([]string{}), + NodeGroupSetProcessor: nodegroupset.NewDefaultNodeGroupSetProcessor([]string{}, []string{}), // TODO(bskiba): change scale up test so that this can be a NoOpProcessor ScaleUpStatusProcessor: &status.EventingScaleUpStatusProcessor{}, ScaleDownStatusProcessor: &status.NoOpScaleDownStatusProcessor{}, diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 3e0e37bd5b1d..354d2c720234 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -171,6 +171,7 @@ var ( ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group") balancingIgnoreLabelsFlag = multiStringFlag("balancing-ignore-label", "Specifies a label to ignore in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar") + balancingNodeLabelsFlag = multiStringFlag("balancing-node-label", "Specifies a label to consider in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar") awsUseStaticInstanceList = flag.Bool("aws-use-static-instance-list", false, "Should CA fetch instance types in runtime or use a static list. AWS only") concurrentGceRefreshes = flag.Int("gce-concurrent-refreshes", 1, "Maximum number of concurrent refreshes per cloud object type.") enableProfiling = flag.Bool("profiling", false, "Is debug/pprof endpoint enabled") @@ -241,6 +242,7 @@ func createAutoscalingOptions() config.AutoscalingOptions { NewPodScaleUpDelay: *newPodScaleUpDelay, IgnoredTaints: *ignoreTaintsFlag, BalancingExtraIgnoredLabels: *balancingIgnoreLabelsFlag, + BalancingNodeLabels: *balancingNodeLabelsFlag, KubeConfigPath: *kubeConfigFile, NodeDeletionDelayTimeout: *nodeDeletionDelayTimeout, AWSUseStaticInstanceList: *awsUseStaticInstanceList, @@ -318,7 +320,7 @@ func buildAutoscaler() (core.Autoscaler, error) { } opts.Processors.NodeGroupSetProcessor = &nodegroupset.BalancingNodeGroupSetProcessor{ - Comparator: nodeInfoComparatorBuilder(autoscalingOptions.BalancingExtraIgnoredLabels), + Comparator: nodeInfoComparatorBuilder(autoscalingOptions.BalancingExtraIgnoredLabels, autoscalingOptions.BalancingNodeLabels), } // These metrics should be published only once. diff --git a/cluster-autoscaler/processors/nodegroupset/aws_nodegroups.go b/cluster-autoscaler/processors/nodegroupset/aws_nodegroups.go index d9fe4e0c23f7..194e9d48345f 100644 --- a/cluster-autoscaler/processors/nodegroupset/aws_nodegroups.go +++ b/cluster-autoscaler/processors/nodegroupset/aws_nodegroups.go @@ -23,7 +23,7 @@ import ( // CreateAwsNodeInfoComparator returns a comparator that checks if two nodes should be considered // part of the same NodeGroupSet. This is true if they match usual conditions checked by IsCloudProviderNodeInfoSimilar, // even if they have different AWS-specific labels. -func CreateAwsNodeInfoComparator(extraIgnoredLabels []string) NodeInfoComparator { +func CreateAwsNodeInfoComparator(extraIgnoredLabels, extraNodeLabels []string) NodeInfoComparator { awsIgnoredLabels := map[string]bool{ "alpha.eksctl.io/instance-id": true, // this is a label used by eksctl to identify instances. "alpha.eksctl.io/nodegroup-name": true, // this is a label used by eksctl to identify "node group" names. @@ -31,6 +31,7 @@ func CreateAwsNodeInfoComparator(extraIgnoredLabels []string) NodeInfoComparator "k8s.amazonaws.com/eniConfig": true, // this is a label used by the AWS CNI for custom networking. "lifecycle": true, // this is a label used by the AWS for spot. } + awsIncludedLabels := make(map[string]bool) for k, v := range BasicIgnoredLabels { awsIgnoredLabels[k] = v @@ -40,7 +41,11 @@ func CreateAwsNodeInfoComparator(extraIgnoredLabels []string) NodeInfoComparator awsIgnoredLabels[k] = true } + for _, k := range extraNodeLabels { + awsIncludedLabels[k] = true + } + return func(n1, n2 *schedulerframework.NodeInfo) bool { - return IsCloudProviderNodeInfoSimilar(n1, n2, awsIgnoredLabels) + return IsCloudProviderNodeInfoSimilar(n1, n2, awsIgnoredLabels, awsIncludedLabels) } } diff --git a/cluster-autoscaler/processors/nodegroupset/azure_nodegroups.go b/cluster-autoscaler/processors/nodegroupset/azure_nodegroups.go index 3eff7323c33a..cad6bd4bc314 100644 --- a/cluster-autoscaler/processors/nodegroupset/azure_nodegroups.go +++ b/cluster-autoscaler/processors/nodegroupset/azure_nodegroups.go @@ -32,8 +32,10 @@ func nodesFromSameAzureNodePool(n1, n2 *schedulerframework.NodeInfo) bool { // CreateAzureNodeInfoComparator returns a comparator that checks if two nodes should be considered // part of the same NodeGroupSet. This is true if they either belong to the same Azure agentpool // or match usual conditions checked by IsCloudProviderNodeInfoSimilar, even if they have different agentpool labels. -func CreateAzureNodeInfoComparator(extraIgnoredLabels []string) NodeInfoComparator { +func CreateAzureNodeInfoComparator(extraIgnoredLabels, extraNodeLabels []string) NodeInfoComparator { azureIgnoredLabels := make(map[string]bool) + azureIncludedLabels := make(map[string]bool) + for k, v := range BasicIgnoredLabels { azureIgnoredLabels[k] = v } @@ -42,10 +44,14 @@ func CreateAzureNodeInfoComparator(extraIgnoredLabels []string) NodeInfoComparat azureIgnoredLabels[k] = true } + for _, k := range extraNodeLabels { + azureIncludedLabels[k] = true + } + return func(n1, n2 *schedulerframework.NodeInfo) bool { if nodesFromSameAzureNodePool(n1, n2) { return true } - return IsCloudProviderNodeInfoSimilar(n1, n2, azureIgnoredLabels) + return IsCloudProviderNodeInfoSimilar(n1, n2, azureIgnoredLabels, azureIncludedLabels) } } diff --git a/cluster-autoscaler/processors/nodegroupset/azure_nodegroups_test.go b/cluster-autoscaler/processors/nodegroupset/azure_nodegroups_test.go index 3d64b373cc0f..69b70cb16a3c 100644 --- a/cluster-autoscaler/processors/nodegroupset/azure_nodegroups_test.go +++ b/cluster-autoscaler/processors/nodegroupset/azure_nodegroups_test.go @@ -29,7 +29,7 @@ import ( ) func TestIsAzureNodeInfoSimilar(t *testing.T) { - comparator := CreateAzureNodeInfoComparator([]string{"example.com/ready"}) + comparator := CreateAzureNodeInfoComparator([]string{"example.com/ready"}, []string{}) n1 := BuildTestNode("node1", 1000, 2000) n1.ObjectMeta.Labels["test-label"] = "test-value" n1.ObjectMeta.Labels["character"] = "thing" @@ -71,12 +71,12 @@ func TestIsAzureNodeInfoSimilar(t *testing.T) { func TestFindSimilarNodeGroupsAzureBasic(t *testing.T) { context := &context.AutoscalingContext{} ni1, ni2, ni3 := buildBasicNodeGroups(context) - processor := &BalancingNodeGroupSetProcessor{Comparator: CreateAzureNodeInfoComparator([]string{})} + processor := &BalancingNodeGroupSetProcessor{Comparator: CreateAzureNodeInfoComparator([]string{}, []string{})} basicSimilarNodeGroupsTest(t, context, processor, ni1, ni2, ni3) } func TestFindSimilarNodeGroupsAzureByLabel(t *testing.T) { - processor := &BalancingNodeGroupSetProcessor{Comparator: CreateAzureNodeInfoComparator([]string{})} + processor := &BalancingNodeGroupSetProcessor{Comparator: CreateAzureNodeInfoComparator([]string{}, []string{})} context := &context.AutoscalingContext{} n1 := BuildTestNode("n1", 1000, 1000) diff --git a/cluster-autoscaler/processors/nodegroupset/balancing_processor_test.go b/cluster-autoscaler/processors/nodegroupset/balancing_processor_test.go index 18ee99f147e8..ce3b66727b0e 100644 --- a/cluster-autoscaler/processors/nodegroupset/balancing_processor_test.go +++ b/cluster-autoscaler/processors/nodegroupset/balancing_processor_test.go @@ -84,7 +84,7 @@ func basicSimilarNodeGroupsTest( func TestFindSimilarNodeGroups(t *testing.T) { context := &context.AutoscalingContext{} ni1, ni2, ni3 := buildBasicNodeGroups(context) - processor := NewDefaultNodeGroupSetProcessor([]string{}) + processor := NewDefaultNodeGroupSetProcessor([]string{}, []string{}) basicSimilarNodeGroupsTest(t, context, processor, ni1, ni2, ni3) } @@ -94,7 +94,7 @@ func TestFindSimilarNodeGroupsCustomLabels(t *testing.T) { ni1.Node().Labels["example.com/ready"] = "true" ni2.Node().Labels["example.com/ready"] = "false" - processor := NewDefaultNodeGroupSetProcessor([]string{"example.com/ready"}) + processor := NewDefaultNodeGroupSetProcessor([]string{"example.com/ready"}, []string{}) basicSimilarNodeGroupsTest(t, context, processor, ni1, ni2, ni3) } @@ -112,7 +112,7 @@ func TestFindSimilarNodeGroupsCustomComparator(t *testing.T) { } func TestBalanceSingleGroup(t *testing.T) { - processor := NewDefaultNodeGroupSetProcessor([]string{}) + processor := NewDefaultNodeGroupSetProcessor([]string{}, []string{}) context := &context.AutoscalingContext{} provider := testprovider.NewTestCloudProvider(nil, nil) @@ -132,7 +132,7 @@ func TestBalanceSingleGroup(t *testing.T) { } func TestBalanceUnderMaxSize(t *testing.T) { - processor := NewDefaultNodeGroupSetProcessor([]string{}) + processor := NewDefaultNodeGroupSetProcessor([]string{}, []string{}) context := &context.AutoscalingContext{} provider := testprovider.NewTestCloudProvider(nil, nil) @@ -182,7 +182,7 @@ func TestBalanceUnderMaxSize(t *testing.T) { } func TestBalanceHittingMaxSize(t *testing.T) { - processor := NewDefaultNodeGroupSetProcessor([]string{}) + processor := NewDefaultNodeGroupSetProcessor([]string{}, []string{}) context := &context.AutoscalingContext{} provider := testprovider.NewTestCloudProvider(nil, nil) diff --git a/cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go b/cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go index 709064055871..83a9a976c321 100644 --- a/cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go +++ b/cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go @@ -18,6 +18,7 @@ package nodegroupset import ( "math" + "fmt" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -91,8 +92,10 @@ func compareLabels(nodes []*schedulerframework.NodeInfo, ignoredLabels map[strin } // CreateGenericNodeInfoComparator returns a generic comparator that checks for node group similarity -func CreateGenericNodeInfoComparator(extraIgnoredLabels []string) NodeInfoComparator { +func CreateGenericNodeInfoComparator(extraIgnoredLabels, extraNodeLabels []string) NodeInfoComparator { genericIgnoredLabels := make(map[string]bool) + genericIncludedLabels := make(map[string]bool) + for k, v := range BasicIgnoredLabels { genericIgnoredLabels[k] = v } @@ -100,8 +103,12 @@ func CreateGenericNodeInfoComparator(extraIgnoredLabels []string) NodeInfoCompar genericIgnoredLabels[k] = true } + for _, k := range extraNodeLabels { + genericIncludedLabels[k] = true + } + return func(n1, n2 *schedulerframework.NodeInfo) bool { - return IsCloudProviderNodeInfoSimilar(n1, n2, genericIgnoredLabels) + return IsCloudProviderNodeInfoSimilar(n1, n2, genericIgnoredLabels, genericIncludedLabels) } } @@ -110,7 +117,7 @@ func CreateGenericNodeInfoComparator(extraIgnoredLabels []string) NodeInfoCompar // somewhat arbitrary, but generally we check if resources provided by both nodes // are similar enough to likely be the same type of machine and if the set of labels // is the same (except for a set of labels passed in to be ignored like hostname or zone). -func IsCloudProviderNodeInfoSimilar(n1, n2 *schedulerframework.NodeInfo, ignoredLabels map[string]bool) bool { +func IsCloudProviderNodeInfoSimilar(n1, n2 *schedulerframework.NodeInfo, ignoredLabels, includedLabels map[string]bool) bool { capacity := make(map[apiv1.ResourceName][]resource.Quantity) allocatable := make(map[apiv1.ResourceName][]resource.Quantity) free := make(map[apiv1.ResourceName][]resource.Quantity) @@ -160,5 +167,10 @@ func IsCloudProviderNodeInfoSimilar(n1, n2 *schedulerframework.NodeInfo, ignored return false } + fmt.Println(includedLabels, len(includedLabels)) + if len(includedLabels) > 0 && !compareLabels(nodes, includedLabels) { + return false + } + return true } diff --git a/cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go b/cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go index 43236ad90771..ae12e173cb67 100644 --- a/cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go +++ b/cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go @@ -41,14 +41,14 @@ func checkNodesSimilarWithPods(t *testing.T, n1, n2 *apiv1.Node, pods1, pods2 [] } func TestIdenticalNodesSimilar(t *testing.T) { - comparator := CreateGenericNodeInfoComparator([]string{}) + comparator := CreateGenericNodeInfoComparator([]string{}, []string{}) n1 := BuildTestNode("node1", 1000, 2000) n2 := BuildTestNode("node2", 1000, 2000) checkNodesSimilar(t, n1, n2, comparator, true) } func TestNodesSimilarVariousRequirements(t *testing.T) { - comparator := CreateGenericNodeInfoComparator([]string{}) + comparator := CreateGenericNodeInfoComparator([]string{}, []string{}) n1 := BuildTestNode("node1", 1000, 2000) // Different CPU capacity @@ -74,7 +74,7 @@ func TestNodesSimilarVariousRequirements(t *testing.T) { } func TestNodesSimilarVariousRequirementsAndPods(t *testing.T) { - comparator := CreateGenericNodeInfoComparator([]string{}) + comparator := CreateGenericNodeInfoComparator([]string{}, []string{}) n1 := BuildTestNode("node1", 1000, 2000) p1 := BuildTestPod("pod1", 500, 1000) p1.Spec.NodeName = "node1" @@ -100,7 +100,7 @@ func TestNodesSimilarVariousRequirementsAndPods(t *testing.T) { } func TestNodesSimilarVariousMemoryRequirements(t *testing.T) { - comparator := CreateGenericNodeInfoComparator([]string{}) + comparator := CreateGenericNodeInfoComparator([]string{}, []string{}) n1 := BuildTestNode("node1", 1000, 1000) // Different memory capacity within tolerance @@ -115,7 +115,7 @@ func TestNodesSimilarVariousMemoryRequirements(t *testing.T) { } func TestNodesSimilarVariousLargeMemoryRequirementsM5XLarge(t *testing.T) { - comparator := CreateGenericNodeInfoComparator([]string{}) + comparator := CreateGenericNodeInfoComparator([]string{}, []string{}) // Use realistic memory capacity (taken from real nodes) // 15944120 KB ~= 16GiB (m5.xLarge) @@ -137,7 +137,7 @@ func TestNodesSimilarVariousLargeMemoryRequirementsM5XLarge(t *testing.T) { } func TestNodesSimilarVariousLargeMemoryRequirementsM516XLarge(t *testing.T) { - comparator := CreateGenericNodeInfoComparator([]string{}) + comparator := CreateGenericNodeInfoComparator([]string{}, []string{}) // Use realistic memory capacity (taken from real nodes) // 257217528 KB ~= 256GiB (m5.16xLarge) @@ -159,7 +159,7 @@ func TestNodesSimilarVariousLargeMemoryRequirementsM516XLarge(t *testing.T) { } func TestNodesSimilarVariousLabels(t *testing.T) { - comparator := CreateGenericNodeInfoComparator([]string{"example.com/ready"}) + comparator := CreateGenericNodeInfoComparator([]string{"example.com/ready"}, []string{}) n1 := BuildTestNode("node1", 1000, 2000) n1.ObjectMeta.Labels["test-label"] = "test-value" n1.ObjectMeta.Labels["character"] = "winnie the pooh" diff --git a/cluster-autoscaler/processors/nodegroupset/gce_nodegroups.go b/cluster-autoscaler/processors/nodegroupset/gce_nodegroups.go index b25615925a41..6ed42fa8bfd0 100644 --- a/cluster-autoscaler/processors/nodegroupset/gce_nodegroups.go +++ b/cluster-autoscaler/processors/nodegroupset/gce_nodegroups.go @@ -23,10 +23,11 @@ import ( // CreateGceNodeInfoComparator returns a comparator that checks if two nodes should be considered // part of the same NodeGroupSet. This is true if they match usual conditions checked by IsCloudProviderNodeInfoSimilar, // even if they have different GCE-specific labels. -func CreateGceNodeInfoComparator(extraIgnoredLabels []string) NodeInfoComparator { +func CreateGceNodeInfoComparator(extraIgnoredLabels, extraNodeLabels []string) NodeInfoComparator { gceIgnoredLabels := map[string]bool{ "topology.gke.io/zone": true, } + gceIncludedLabels := make(map[string]bool) for k, v := range BasicIgnoredLabels { gceIgnoredLabels[k] = v @@ -36,7 +37,11 @@ func CreateGceNodeInfoComparator(extraIgnoredLabels []string) NodeInfoComparator gceIgnoredLabels[k] = true } + for _, k := range extraNodeLabels { + gceIncludedLabels[k] = true + } + return func(n1, n2 *schedulerframework.NodeInfo) bool { - return IsCloudProviderNodeInfoSimilar(n1, n2, gceIgnoredLabels) + return IsCloudProviderNodeInfoSimilar(n1, n2, gceIgnoredLabels, gceIncludedLabels) } } diff --git a/cluster-autoscaler/processors/nodegroupset/nodegroup_set_processor.go b/cluster-autoscaler/processors/nodegroupset/nodegroup_set_processor.go index 4a63d086a9ae..c644662158a5 100644 --- a/cluster-autoscaler/processors/nodegroupset/nodegroup_set_processor.go +++ b/cluster-autoscaler/processors/nodegroupset/nodegroup_set_processor.go @@ -70,8 +70,8 @@ func (n *NoOpNodeGroupSetProcessor) BalanceScaleUpBetweenGroups(context *context func (n *NoOpNodeGroupSetProcessor) CleanUp() {} // NewDefaultNodeGroupSetProcessor creates an instance of NodeGroupSetProcessor. -func NewDefaultNodeGroupSetProcessor(ignoredLabels []string) NodeGroupSetProcessor { +func NewDefaultNodeGroupSetProcessor(ignoredLabels, includedLabels []string) NodeGroupSetProcessor { return &BalancingNodeGroupSetProcessor{ - Comparator: CreateGenericNodeInfoComparator(ignoredLabels), + Comparator: CreateGenericNodeInfoComparator(ignoredLabels, includedLabels), } } diff --git a/cluster-autoscaler/processors/processors.go b/cluster-autoscaler/processors/processors.go index a47786a61960..03744e515bb1 100644 --- a/cluster-autoscaler/processors/processors.go +++ b/cluster-autoscaler/processors/processors.go @@ -53,7 +53,7 @@ func DefaultProcessors() *AutoscalingProcessors { return &AutoscalingProcessors{ PodListProcessor: pods.NewDefaultPodListProcessor(), NodeGroupListProcessor: nodegroups.NewDefaultNodeGroupListProcessor(), - NodeGroupSetProcessor: nodegroupset.NewDefaultNodeGroupSetProcessor([]string{}), + NodeGroupSetProcessor: nodegroupset.NewDefaultNodeGroupSetProcessor([]string{}, []string{}), ScaleUpStatusProcessor: status.NewDefaultScaleUpStatusProcessor(), ScaleDownNodeProcessor: nodes.NewPreFilteringScaleDownNodeProcessor(), ScaleDownStatusProcessor: status.NewDefaultScaleDownStatusProcessor(),