From 1af8021b9de7acb1995c5d5e6da6ace2ddad2f36 Mon Sep 17 00:00:00 2001
From: Kushagra
Date: Mon, 29 Jan 2024 22:51:05 +0000
Subject: [PATCH 1/5] Consider atomic nodes

---
 .../core/scaledown/planner/planner.go | 25 ++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go
index 032d93c4f09c..9b2f32d113a4 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner.go
@@ -23,6 +23,7 @@ import (
 
 	apiv1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	"k8s.io/autoscaler/cluster-autoscaler/context"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"
@@ -261,6 +262,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 	unremovableTimeout := p.latestUpdate.Add(p.context.AutoscalingOptions.UnremovableNodeRecheckTimeout)
 	unremovableCount := 0
 	var removableList []simulator.NodeToBeRemoved
+	atmomicScaleDownNodesCount := 0
 	p.unremovableNodes.Update(p.context.ClusterSnapshot.NodeInfos(), p.latestUpdate)
 	currentlyUnneededNodeNames, utilizationMap, ineligible := p.eligibilityChecker.FilterOutUnremovable(p.context, scaleDownCandidates, p.latestUpdate, p.unremovableNodes)
 	for _, n := range ineligible {
@@ -274,7 +276,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 			klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
 			break
 		}
-		if len(removableList) >= p.unneededNodesLimit() {
+		if len(removableList)-atmomicScaleDownNodesCount >= p.unneededNodesLimit() {
 			klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList))
 			break
 		}
@@ -287,6 +289,9 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 			delete(podDestinations, removable.Node.Name)
 			p.context.RemainingPdbTracker.RemovePods(removable.PodsToReschedule)
 			removableList = append(removableList, *removable)
+			if p.atomicScaleDownNode(removable) {
+				atmomicScaleDownNodesCount++
+			}
 		}
 		if unremovable != nil {
 			unremovableCount += 1
@@ -299,6 +304,24 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 	}
 }
 
+// atomicScaleDownNode checks if the removable node would be considered for atmonic scale down.
+func (p *Planner) atomicScaleDownNode(node *simulator.NodeToBeRemoved) bool {
+	nodeGroup, err := p.context.CloudProvider.NodeGroupForNode(node.Node)
+	if err != nil {
+		klog.Errorf("failed to get node info for %v: %s", node.Node.Name, err)
+		return false
+	}
+	autoscalingOptions, err := nodeGroup.GetOptions(p.context.NodeGroupDefaults)
+	if err != nil && err != cloudprovider.ErrNotImplemented {
+		klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
+		return false
+	}
+	if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
+		return true
+	}
+	return false
+}
+
 // unneededNodesLimit returns the number of nodes after which calculating more
 // unneeded nodes is a waste of time. The reasoning behind it is essentially as
 // follows.
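
Patch 1 changes the planner in two ways: nodes whose node group is configured with ZeroOrMaxNodeScaling (atomic scale down) are counted separately, and that count is subtracted before comparing against unneededNodesLimit(), so a large atomic group cannot be cut off halfway through the simulation loop. The standalone sketch below illustrates only that counting rule; candidate and selectUnneeded are illustrative stand-ins, not types or functions from the autoscaler code base.

package main

import "fmt"

// candidate is a stand-in for a scale-down candidate node; atomic marks nodes
// whose node group uses ZeroOrMaxNodeScaling. Both names are illustrative only.
type candidate struct {
	name   string
	atomic bool
}

// selectUnneeded mirrors the capping rule from the patch: the limit is applied
// to len(removable) minus the number of atomic nodes already selected, so
// atomic candidates never cause the loop to stop early.
func selectUnneeded(candidates []candidate, limit int) []string {
	var removable []string
	atomicCount := 0
	for _, c := range candidates {
		if len(removable)-atomicCount >= limit {
			break
		}
		removable = append(removable, c.name)
		if c.atomic {
			atomicCount++
		}
	}
	return removable
}

func main() {
	cands := []candidate{
		{"atomic-1", true}, {"atomic-2", true}, {"atomic-3", true},
		{"regular-1", false}, {"regular-2", false}, {"regular-3", false},
	}
	// With a limit of 2, all three atomic nodes are still evaluated,
	// plus two regular nodes before the cap kicks in.
	fmt.Println(selectUnneeded(cands, 2)) // [atomic-1 atomic-2 atomic-3 regular-1 regular-2]
}

In categorizeNodes the same arithmetic runs against unneededNodesLimit(), with atomicScaleDownNode supplying the per-node atomic flag.
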
From f70c09f00e6f9c900cf31dcdfa1c331d87efaebf Mon Sep 17 00:00:00 2001
From: Kushagra
Date: Mon, 29 Jan 2024 22:51:05 +0000
Subject: [PATCH 2/5] Consider atomic nodes

---
 cluster-autoscaler/core/scaledown/planner/planner.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go
index 9b2f32d113a4..2b7b40606547 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner.go
@@ -262,7 +262,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 	unremovableTimeout := p.latestUpdate.Add(p.context.AutoscalingOptions.UnremovableNodeRecheckTimeout)
 	unremovableCount := 0
 	var removableList []simulator.NodeToBeRemoved
-	atmomicScaleDownNodesCount := 0
+	atomicScaleDownNodesCount := 0
 	p.unremovableNodes.Update(p.context.ClusterSnapshot.NodeInfos(), p.latestUpdate)
 	currentlyUnneededNodeNames, utilizationMap, ineligible := p.eligibilityChecker.FilterOutUnremovable(p.context, scaleDownCandidates, p.latestUpdate, p.unremovableNodes)
 	for _, n := range ineligible {
@@ -276,7 +276,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 			klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
 			break
 		}
-		if len(removableList)-atmomicScaleDownNodesCount >= p.unneededNodesLimit() {
+		if len(removableList)-atomicScaleDownNodesCount >= p.unneededNodesLimit() {
 			klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList))
 			break
 		}
@@ -290,7 +290,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 			p.context.RemainingPdbTracker.RemovePods(removable.PodsToReschedule)
 			removableList = append(removableList, *removable)
 			if p.atomicScaleDownNode(removable) {
-				atmomicScaleDownNodesCount++
+				atomicScaleDownNodesCount++
 			}
 		}
 		if unremovable != nil {
@@ -304,7 +304,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 	}
 }
 
-// atomicScaleDownNode checks if the removable node would be considered for atmonic scale down.
+// atomicScaleDownNode checks if the removable node would be considered for atomic scale down.
 func (p *Planner) atomicScaleDownNode(node *simulator.NodeToBeRemoved) bool {
 	nodeGroup, err := p.context.CloudProvider.NodeGroupForNode(node.Node)
 	if err != nil {
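
With the naming cleaned up by patch 2, the decision itself rests on nodeGroup.GetOptions: a hard error or a nil result means the node is treated as non-atomic, and cloudprovider.ErrNotImplemented is tolerated so providers without per-group options keep the old behaviour. Below is a minimal sketch of that contract, using local stand-in types (errNotImplemented, options, nodeGroup, staticGroup) rather than the real cloudprovider interfaces.

package main

import (
	"errors"
	"fmt"
)

// errNotImplemented stands in for cloudprovider.ErrNotImplemented.
var errNotImplemented = errors.New("not implemented")

// options keeps only the field this patch series cares about; it stands in
// for config.NodeGroupAutoscalingOptions.
type options struct {
	ZeroOrMaxNodeScaling bool
}

// nodeGroup is a minimal stand-in for the part of cloudprovider.NodeGroup the
// helper touches.
type nodeGroup interface {
	GetOptions(defaults options) (*options, error)
}

// isAtomic mirrors atomicScaleDownNode's decision: unexpected errors and
// missing options both mean "not atomic"; only an explicit
// ZeroOrMaxNodeScaling=true marks the node group as atomic.
func isAtomic(ng nodeGroup, defaults options) bool {
	opts, err := ng.GetOptions(defaults)
	if err != nil && err != errNotImplemented {
		return false
	}
	return opts != nil && opts.ZeroOrMaxNodeScaling
}

// staticGroup is a trivial nodeGroup used only for the demonstration below.
type staticGroup struct {
	opts *options
	err  error
}

func (g staticGroup) GetOptions(options) (*options, error) { return g.opts, g.err }

func main() {
	fmt.Println(isAtomic(staticGroup{opts: &options{ZeroOrMaxNodeScaling: true}}, options{})) // true
	fmt.Println(isAtomic(staticGroup{err: errNotImplemented}, options{}))                     // false
	fmt.Println(isAtomic(staticGroup{err: errors.New("provider outage")}, options{}))         // false
}

The real helper additionally logs and returns false when NodeGroupForNode fails; the sketch folds that case into the same "treat as non-atomic" outcome.
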
From bd5a51abb0203936fa4591e1a0e85275cddbaba1 Mon Sep 17 00:00:00 2001
From: Kushagra
Date: Tue, 30 Jan 2024 14:04:32 +0000
Subject: [PATCH 3/5] Add unit tests

---
 .../core/scaledown/planner/planner_test.go | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/cluster-autoscaler/core/scaledown/planner/planner_test.go b/cluster-autoscaler/core/scaledown/planner/planner_test.go
index e92eb1ab4223..19f160b1b0af 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner_test.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner_test.go
@@ -484,6 +484,10 @@ func TestUpdateClusterState(t *testing.T) {
 			assert.NoError(t, err)
 			registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, rsLister, nil)
 			provider := testprovider.NewTestCloudProvider(nil, nil)
+			provider.AddNodeGroup("ng1", 0, 0, 0)
+			for _, node := range tc.nodes {
+				provider.AddNode("ng1", node)
+			}
 			context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{
 				NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
 					ScaleDownUnneededTime: 10 * time.Minute,
@@ -521,6 +525,7 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
 		name               string
 		previouslyUnneeded int
 		nodes              int
+		opts               *config.NodeGroupAutoscalingOptions
 		maxParallelism     int
 		maxUnneededTime    time.Duration
 		updateInterval     time.Duration
@@ -589,6 +594,78 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
 			updateInterval:     30 * time.Second,
 			wantUnneeded:       30,
 		},
+		{
+			name:               "atomic sclale down - default settings",
+			previouslyUnneeded: 5,
+			nodes:              100,
+			maxParallelism:     10,
+			maxUnneededTime:    1 * time.Minute,
+			updateInterval:     10 * time.Second,
+			wantUnneeded:       100,
+			opts: &config.NodeGroupAutoscalingOptions{
+				ZeroOrMaxNodeScaling: true,
+			},
+		},
+		{
+			name:               "atomic sclale down - quick loops",
+			previouslyUnneeded: 5,
+			nodes:              100,
+			maxParallelism:     10,
+			maxUnneededTime:    1 * time.Minute,
+			updateInterval:     1 * time.Second,
+			wantUnneeded:       100,
+			opts: &config.NodeGroupAutoscalingOptions{
+				ZeroOrMaxNodeScaling: true,
+			},
+		},
+		{
+			name:               "atomic sclale down - slow loops",
+			previouslyUnneeded: 5,
+			nodes:              100,
+			maxParallelism:     10,
+			maxUnneededTime:    1 * time.Minute,
+			updateInterval:     30 * time.Second,
+			wantUnneeded:       100,
+			opts: &config.NodeGroupAutoscalingOptions{
+				ZeroOrMaxNodeScaling: true,
+			},
+		},
+		{
+			name:               "atomic sclale down - too many unneeded",
+			previouslyUnneeded: 77,
+			nodes:              100,
+			maxParallelism:     10,
+			maxUnneededTime:    1 * time.Minute,
+			updateInterval:     30 * time.Second,
+			wantUnneeded:       100,
+			opts: &config.NodeGroupAutoscalingOptions{
+				ZeroOrMaxNodeScaling: true,
+			},
+		},
+		{
+			name:               "atomic sclale down - no uneeded",
+			previouslyUnneeded: 0,
+			nodes:              100,
+			maxParallelism:     10,
+			maxUnneededTime:    1 * time.Minute,
+			updateInterval:     30 * time.Second,
+			wantUnneeded:       100,
+			opts: &config.NodeGroupAutoscalingOptions{
+				ZeroOrMaxNodeScaling: true,
+			},
+		},
+		{
+			name:               "atomic sclale down - short uneeded time and short update interval",
+			previouslyUnneeded: 0,
+			nodes:              500,
+			maxParallelism:     1,
+			maxUnneededTime:    1 * time.Second,
+			updateInterval:     1 * time.Second,
+			wantUnneeded:       500,
+			opts: &config.NodeGroupAutoscalingOptions{
+				ZeroOrMaxNodeScaling: true,
+			},
+		},
 	}
 	for _, tc := range testCases {
 		tc := tc
@@ -603,6 +680,10 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
 				previouslyUnneeded[i] = simulator.NodeToBeRemoved{Node: nodes[i]}
 			}
 			provider := testprovider.NewTestCloudProvider(nil, nil)
+			provider.AddNodeGroupWithCustomOptions("ng1", 0, 0, 0, tc.opts)
+			for _, node := range nodes {
+				provider.AddNode("ng1", node)
+			}
 			context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{
 				NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
 					ScaleDownUnneededTime: tc.maxUnneededTime,
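
The new test cases in patch 3 all pin wantUnneeded to the full node count (100 or 500) regardless of maxParallelism, which is what the counting change is meant to guarantee: a ZeroOrMaxNodeScaling group is either kept whole or marked unneeded in its entirety. The short sketch below captures the expectation the table encodes; expectedUnneeded is an illustrative helper and limit stands for whatever unneededNodesLimit() returns, whose exact formula is outside this series.

package main

import "fmt"

// expectedUnneeded captures the expectation behind the new test cases: an
// atomic (ZeroOrMaxNodeScaling) group has every node reported as unneeded,
// while a regular group is capped by the planner's limit. The limit argument
// is a placeholder for unneededNodesLimit(); its formula is not part of this
// patch series.
func expectedUnneeded(nodes int, atomic bool, limit int) int {
	if atomic || nodes <= limit {
		return nodes
	}
	return limit
}

func main() {
	fmt.Println(expectedUnneeded(100, false, 30)) // 30: a regular group is capped
	fmt.Println(expectedUnneeded(100, true, 30))  // 100: an atomic group never is
}
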
From e02b467208838b6e9f67a499ebe789831c70561c Mon Sep 17 00:00:00 2001
From: Kushagra
Date: Tue, 30 Jan 2024 14:04:32 +0000
Subject: [PATCH 4/5] Add unit tests

---
 cluster-autoscaler/core/scaledown/planner/planner.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go
index 2b7b40606547..f715df85db66 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner.go
@@ -291,6 +291,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 			removableList = append(removableList, *removable)
 			if p.atomicScaleDownNode(removable) {
 				atomicScaleDownNodesCount++
+				klog.V(2).Infof("Considering node %s for atomic scale down. Total atomic scale down nodes count: %d", removable.Node.Name, atomicScaleDownNodesCount)
 			}
 		}
 		if unremovable != nil {

From a5502e3ca931ef38308cb5404c2ffd268fb57088 Mon Sep 17 00:00:00 2001
From: Kushagra
Date: Tue, 30 Jan 2024 14:04:32 +0000
Subject: [PATCH 5/5] Add unit tests

---
 cluster-autoscaler/core/scaledown/planner/planner.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go
index f715df85db66..70740604e88b 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner.go
@@ -277,7 +277,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 			break
 		}
 		if len(removableList)-atomicScaleDownNodesCount >= p.unneededNodesLimit() {
-			klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList))
+			klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more. Total atomic scale down nodes: %d", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList), atomicScaleDownNodesCount)
 			break
 		}
 		removable, unremovable := p.rs.SimulateNodeRemoval(node, podDestinations, p.latestUpdate, p.context.RemainingPdbTracker)
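
Patches 4 and 5 only add logging around the new counter. Both messages are gated by klog verbosity, so they appear only when the autoscaler runs with a high enough -v value; a tiny sketch of that gating follows (the message strings here are placeholders, not the autoscaler's own).

package main

import (
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	// klog verbosity is configured through the -v flag; V(n) messages are
	// emitted only when the configured level is >= n.
	klog.InitFlags(nil)
	flag.Set("v", "2")
	flag.Parse()
	defer klog.Flush()

	klog.V(2).Infof("visible at -v=2 and above") // printed
	klog.V(4).Infof("visible at -v=4 and above") // suppressed at -v=2
}
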