Consider atomic nodes #6477

Merged · 5 commits · Jan 31, 2024
Changes from 3 commits
25 changes: 24 additions & 1 deletion cluster-autoscaler/core/scaledown/planner/planner.go
@@ -23,6 +23,7 @@ import (

apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"
@@ -261,6 +262,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
unremovableTimeout := p.latestUpdate.Add(p.context.AutoscalingOptions.UnremovableNodeRecheckTimeout)
unremovableCount := 0
var removableList []simulator.NodeToBeRemoved
atomicScaleDownNodesCount := 0
p.unremovableNodes.Update(p.context.ClusterSnapshot.NodeInfos(), p.latestUpdate)
currentlyUnneededNodeNames, utilizationMap, ineligible := p.eligibilityChecker.FilterOutUnremovable(p.context, scaleDownCandidates, p.latestUpdate, p.unremovableNodes)
for _, n := range ineligible {
@@ -274,7 +276,7 @@
klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
break
}
- if len(removableList) >= p.unneededNodesLimit() {
+ if len(removableList)-atomicScaleDownNodesCount >= p.unneededNodesLimit() {
klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList))
Member: Let's include atomicScaleDownNodesCount in the log.

Contributor Author: Done.

Member: I meant in this line: just report atomicScaleDownNodesCount value. Doing it for every considered node is going to result in a lot of log spam.

Contributor Author: Makes sense. Done.
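
For context, here is one plausible shape for the adjusted log line, reporting the counter once when the limit is hit as agreed above. This is a sketch only; the merged commit may word it differently.

    // Sketch: possible post-review form of the log line above, reporting
    // atomicScaleDownNodesCount once rather than per considered node.
    klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes (%d atomic) so no point in looking for more.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList), atomicScaleDownNodesCount)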

break
}
@@ -287,6 +289,9 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
delete(podDestinations, removable.Node.Name)
p.context.RemainingPdbTracker.RemovePods(removable.PodsToReschedule)
removableList = append(removableList, *removable)
if p.atomicScaleDownNode(removable) {
atomicScaleDownNodesCount++
}
}
if unremovable != nil {
unremovableCount += 1
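
In effect, nodes that belong to all-or-nothing groups are tallied in atomicScaleDownNodesCount and subtracted from len(removableList) before the limit check above, so an atomic group is never cut off partway through by the unneeded-nodes limit.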
@@ -299,6 +304,24 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
}
}

// atomicScaleDownNode checks if the removable node would be considered for atomic scale down.
func (p *Planner) atomicScaleDownNode(node *simulator.NodeToBeRemoved) bool {
nodeGroup, err := p.context.CloudProvider.NodeGroupForNode(node.Node)
if err != nil {
klog.Errorf("failed to get node info for %v: %s", node.Node.Name, err)
return false
}
autoscalingOptions, err := nodeGroup.GetOptions(p.context.NodeGroupDefaults)
if err != nil && err != cloudprovider.ErrNotImplemented {
klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
return false
}
if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
return true
}
return false
}
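
As a quick illustration of when this helper returns true, here is a minimal sketch using the test-provider API that appears in the tests below. The group name and sizes are arbitrary assumptions; the argument order follows the test code.

    // Sketch: a node group configured for all-or-nothing scaling.
    opts := &config.NodeGroupAutoscalingOptions{ZeroOrMaxNodeScaling: true}
    provider.AddNodeGroupWithCustomOptions("ng-atomic", 0, 10, 10, opts)
    provider.AddNode("ng-atomic", node)
    // For a node in "ng-atomic", atomicScaleDownNode returns true, and
    // categorizeNodes excludes it from the unneeded-nodes budget.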

// unneededNodesLimit returns the number of nodes after which calculating more
// unneeded nodes is a waste of time. The reasoning behind it is essentially as
// follows.
81 changes: 81 additions & 0 deletions cluster-autoscaler/core/scaledown/planner/planner_test.go
@@ -484,6 +484,10 @@ func TestUpdateClusterState(t *testing.T) {
assert.NoError(t, err)
registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, rsLister, nil)
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 0, 0, 0)
for _, node := range tc.nodes {
provider.AddNode("ng1", node)
}
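
This provider setup appears to be needed because the planner now resolves each removable node's group via NodeGroupForNode, so the test provider must know which group every node belongs to.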
context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{
NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
ScaleDownUnneededTime: 10 * time.Minute,
@@ -521,6 +525,7 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
name string
previouslyUnneeded int
nodes int
opts *config.NodeGroupAutoscalingOptions
maxParallelism int
maxUnneededTime time.Duration
updateInterval time.Duration
@@ -589,6 +594,78 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
updateInterval: 30 * time.Second,
wantUnneeded: 30,
},
{
name: "atomic scale down - default settings",
previouslyUnneeded: 5,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 10 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - quick loops",
previouslyUnneeded: 5,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 1 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - slow loops",
previouslyUnneeded: 5,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 30 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - too many unneeded",
previouslyUnneeded: 77,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 30 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - no unneeded",
previouslyUnneeded: 0,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 30 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - short unneeded time and short update interval",
previouslyUnneeded: 0,
nodes: 500,
maxParallelism: 1,
maxUnneededTime: 1 * time.Second,
updateInterval: 1 * time.Second,
wantUnneeded: 500,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
}
for _, tc := range testCases {
tc := tc
@@ -603,6 +680,10 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
previouslyUnneeded[i] = simulator.NodeToBeRemoved{Node: nodes[i]}
}
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroupWithCustomOptions("ng1", 0, 0, 0, tc.opts)
for _, node := range nodes {
provider.AddNode("ng1", node)
}
context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{
NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
ScaleDownUnneededTime: tc.maxUnneededTime,
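
Taken together, the atomic cases all expect wantUnneeded to equal the full node count regardless of parallelism or loop timing. A rough walk-through of the accounting, assuming the limit check matches the planner change above:

    // For a cluster where every candidate node is in an atomic group:
    //   len(removableList) grows toward the node count while
    //   atomicScaleDownNodesCount grows in lockstep, so the difference
    //   len(removableList)-atomicScaleDownNodesCount stays at 0 and never
    //   reaches unneededNodesLimit(); the loop never breaks early, and all
    //   nodes end up marked unneeded (wantUnneeded == nodes).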