Skip to content

Commit

Permalink
Per NodeGroup config for scale-down options
Browse files Browse the repository at this point in the history
This is the implementation of
kubernetes#3583 (comment).
  • Loading branch information
MaciekPytel authored and piotrnosek committed Nov 30, 2021
1 parent 7136181 commit 0035b16
Show file tree
Hide file tree
Showing 9 changed files with 409 additions and 191 deletions.
21 changes: 10 additions & 11 deletions cluster-autoscaler/config/autoscaling_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,6 @@ type GpuLimits struct {
// NodeGroupAutoscalingOptions contain various options to customize how autoscaling of
// a given NodeGroup works. Different options can be used for each NodeGroup.
type NodeGroupAutoscalingOptions struct {
ScaleDownUnneededTime time.Duration
}

// AutoscalingOptions contain various options to customize how autoscaling works
type AutoscalingOptions struct {
// NodeGroupAutoscalingOptions are default values for per NodeGroup options.
// They will be used any time a specific value is not provided for a given NodeGroup.
NodeGroupAutoscalingOptions
// MaxEmptyBulkDelete is a number of empty nodes that can be removed at the same time.
MaxEmptyBulkDelete int
// ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down if cpu or memory utilization is over threshold.
// Well-utilized nodes are not touched.
ScaleDownUtilizationThreshold float64
Expand All @@ -51,9 +41,18 @@ type AutoscalingOptions struct {
ScaleDownGpuUtilizationThreshold float64
// ScaleDownUnneededTime sets the duration CA expects a node to be unneeded/eligible for removal
// before scaling down the node.
// ScaleDownUnneededTime time.Duration
ScaleDownUnneededTime time.Duration
// ScaleDownUnreadyTime represents how long an unready node should be unneeded before it is eligible for scale down
ScaleDownUnreadyTime time.Duration
}

// AutoscalingOptions contain various options to customize how autoscaling works
type AutoscalingOptions struct {
// NodeGroupAutoscalingOptions are default values for per NodeGroup options.
// They will be used any time a specific value is not provided for a given NodeGroup.
NodeGroupAutoscalingOptions
// MaxEmptyBulkDelete is a number of empty nodes that can be removed at the same time.
MaxEmptyBulkDelete int
// MaxNodesTotal sets the maximum number of nodes in the whole cluster
MaxNodesTotal int
// MaxCoresTotal sets the maximum number of cores in the whole cluster
Expand Down
80 changes: 60 additions & 20 deletions cluster-autoscaler/core/scale_down.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/context"
core_utils "k8s.io/autoscaler/cluster-autoscaler/core/utils"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/processors"
"k8s.io/autoscaler/cluster-autoscaler/simulator"
"k8s.io/autoscaler/cluster-autoscaler/utils"
"k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint"
Expand Down Expand Up @@ -359,6 +360,7 @@ func (limits *scaleDownResourcesLimits) tryDecrementLimitsByDelta(delta scaleDow
// ScaleDown is responsible for maintaining the state needed to perform unneeded node removals.
type ScaleDown struct {
context *context.AutoscalingContext
processors *processors.AutoscalingProcessors
clusterStateRegistry *clusterstate.ClusterStateRegistry
unneededNodes map[string]time.Time
unneededNodesList []*apiv1.Node
Expand All @@ -371,9 +373,10 @@ type ScaleDown struct {
}

// NewScaleDown builds new ScaleDown object.
func NewScaleDown(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry) *ScaleDown {
func NewScaleDown(context *context.AutoscalingContext, processors *processors.AutoscalingProcessors, clusterStateRegistry *clusterstate.ClusterStateRegistry) *ScaleDown {
return &ScaleDown{
context: context,
processors: processors,
clusterStateRegistry: clusterStateRegistry,
unneededNodes: make(map[string]time.Time),
unremovableNodes: make(map[string]time.Time),
Expand All @@ -388,6 +391,8 @@ func NewScaleDown(context *context.AutoscalingContext, clusterStateRegistry *clu

// CleanUp cleans up the internal ScaleDown state.
func (sd *ScaleDown) CleanUp(timestamp time.Time) {
// Use default ScaleDownUnneededTime as in this context the value
// doesn't apply to any specific NodeGroup.
sd.usageTracker.CleanUp(timestamp.Add(-sd.context.ScaleDownUnneededTime))
sd.clearUnremovableNodeReasons()
}
Expand Down Expand Up @@ -423,7 +428,23 @@ func (sd *ScaleDown) checkNodeUtilization(timestamp time.Time, node *apiv1.Node,
klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
}

if !sd.isNodeBelowUtilizationThreshold(node, utilInfo) {
nodeGroup, err := sd.context.CloudProvider.NodeGroupForNode(node)
if err != nil {
return simulator.UnexpectedError, nil
}
if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
// We should never get here as non-autoscaled nodes should not be included in scaleDownCandidates list
// (and the default PreFilteringScaleDownNodeProcessor would indeed filter them out).
klog.V(4).Infof("Skipped %s from delete considered - the node is not autoscaled", node.Name)
return simulator.NotAutoscaled, nil
}

underutilized, err := sd.isNodeBelowUtilizationThreshold(node, nodeGroup, utilInfo)
if err != nil {
klog.Warningf("Failed to check utilization thresholds for %s: %v", node.Name, err)
return simulator.UnexpectedError, nil
}
if !underutilized {
klog.V(4).Infof("Node %s is not suitable for removal - %s utilization too big (%f)", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
return simulator.NotUnderutilized, &utilInfo
}
Expand Down Expand Up @@ -616,17 +637,24 @@ func (sd *ScaleDown) UpdateUnneededNodes(
}

// isNodeBelowUtilizationThreshold determines if a given node utilization is below threshold.
func (sd *ScaleDown) isNodeBelowUtilizationThreshold(node *apiv1.Node, utilInfo simulator.UtilizationInfo) bool {
func (sd *ScaleDown) isNodeBelowUtilizationThreshold(node *apiv1.Node, nodeGroup cloudprovider.NodeGroup, utilInfo simulator.UtilizationInfo) (bool, error) {
var threshold float64
var err error
if gpu.NodeHasGpu(sd.context.CloudProvider.GPULabel(), node) {
if utilInfo.Utilization >= sd.context.ScaleDownGpuUtilizationThreshold {
return false
threshold, err = sd.processors.NodeGroupConfigProcessor.GetScaleDownGpuUtilizationThreshold(sd.context, nodeGroup)
if err != nil {
return false, err
}
} else {
if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
return false
threshold, err = sd.processors.NodeGroupConfigProcessor.GetScaleDownUtilizationThreshold(sd.context, nodeGroup)
if err != nil {
return false, err
}
}
return true
if utilInfo.Utilization >= threshold {
return false, nil
}
return true, nil
}

// updateUnremovableNodes updates unremovableNodes map according to current
Expand Down Expand Up @@ -812,18 +840,6 @@ func (sd *ScaleDown) TryToScaleDown(
ready, _, _ := kube_util.GetReadinessState(node)
readinessMap[node.Name] = ready

// Check how long a ready node was underutilized.
if ready && !unneededSince.Add(sd.context.ScaleDownUnneededTime).Before(currentTime) {
sd.addUnremovableNodeReason(node, simulator.NotUnneededLongEnough)
continue
}

// Unready nodes may be deleted after a different time than underutilized nodes.
if !ready && !unneededSince.Add(sd.context.ScaleDownUnreadyTime).Before(currentTime) {
sd.addUnremovableNodeReason(node, simulator.NotUnreadyLongEnough)
continue
}

nodeGroup, err := sd.context.CloudProvider.NodeGroupForNode(node)
if err != nil {
klog.Errorf("Error while checking node group for %s: %v", node.Name, err)
Expand All @@ -836,6 +852,30 @@ func (sd *ScaleDown) TryToScaleDown(
continue
}

if ready {
// Check how long a ready node was underutilized.
unneededTime, err := sd.processors.NodeGroupConfigProcessor.GetScaleDownUnneededTime(sd.context, nodeGroup)
if err != nil {
klog.Errorf("Error trying to get ScaleDownUnneededTime for node %s (in group: %s)", node.Name, nodeGroup.Id())
continue
}
if !unneededSince.Add(unneededTime).Before(currentTime) {
sd.addUnremovableNodeReason(node, simulator.NotUnneededLongEnough)
continue
}
} else {
// Unready nodes may be deleted after a different time than underutilized nodes.
unreadyTime, err := sd.processors.NodeGroupConfigProcessor.GetScaleDownUnreadyTime(sd.context, nodeGroup)
if err != nil {
klog.Errorf("Error trying to get ScaleDownUnnreadyTime for node %s (in group: %s)", node.Name, nodeGroup.Id())
continue
}
if !unneededSince.Add(unreadyTime).Before(currentTime) {
sd.addUnremovableNodeReason(node, simulator.NotUnreadyLongEnough)
continue
}
}

size, found := nodeGroupSize[nodeGroup.Id()]
if !found {
klog.Errorf("Error while checking node group size %s: group size not found in cache", nodeGroup.Id())
Expand Down
Loading

0 comments on commit 0035b16

Please sign in to comment.