
Commit

Merge pull request #6931 from k8s-infra-cherrypick-robot/cherry-pick-6769-to-cluster-autoscaler-release-1.30

[cluster-autoscaler-release-1.30] Refactor scale-up to apply resource limits before creating a node group
k8s-ci-robot authored Jul 8, 2024
2 parents 7d1f87f + a6ce449 commit f83dde0
Showing 3 changed files with 76 additions and 20 deletions.
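
In outline, the refactored ScaleUp path now caps the requested node count and applies the cluster-wide resource limits before it decides whether to create a new node group. A condensed sketch of the reordered flow, based only on the hunks below (error handling and the surrounding function are elided):

	// Sketch of the reordered flow; all identifiers come from the diff below.
	newNodes, aErr := o.GetCappedNewNodeCount(bestOption.NodeCount, len(nodes)+len(upcomingNodes))
	// Limits are applied *before* node group creation, so the cloud provider is
	// never asked to create a group that the resource limits would reject anyway.
	nodeInfo := nodeInfos[bestOption.NodeGroup.Id()] // the "found" check is elided in this sketch
	newNodes, aErr = o.resourceManager.ApplyLimits(o.autoscalingContext, newNodes, resourcesLeft, nodeInfo, bestOption.NodeGroup)
	if !bestOption.NodeGroup.Exist() {
		// create the node group via the cloud provider (no longer a simulation)
	}
	// ... then balance across similar node groups and execute the scale-up.
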
41 changes: 22 additions & 19 deletions cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
@@ -187,11 +187,31 @@ func (o *ScaleUpOrchestrator) ScaleUp(
}
klog.V(1).Infof("Estimated %d nodes needed in %s", bestOption.NodeCount, bestOption.NodeGroup.Id())

// Cap new nodes to supported number of nodes in the cluster.
newNodes, aErr := o.GetCappedNewNodeCount(bestOption.NodeCount, len(nodes)+len(upcomingNodes))
if aErr != nil {
return status.UpdateScaleUpError(&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods}, aErr)
}

// Apply upper limits for resources in the cluster.
nodeInfo, found := nodeInfos[bestOption.NodeGroup.Id()]
if !found {
// This should never happen, as we already should have retrieved nodeInfo for any considered nodegroup.
klog.Errorf("No node info for: %s", bestOption.NodeGroup.Id())
return status.UpdateScaleUpError(
&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods},
errors.NewAutoscalerError(
errors.CloudProviderError,
"No node info for best expansion option!"))
}
newNodes, aErr = o.resourceManager.ApplyLimits(o.autoscalingContext, newNodes, resourcesLeft, nodeInfo, bestOption.NodeGroup)
if aErr != nil {
return status.UpdateScaleUpError(
&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods},
aErr)
}

// If necessary, create the node group. This is no longer simulation, an empty node group will be created by cloud provider if supported.
createNodeGroupResults := make([]nodegroups.CreateNodeGroupResult, 0)
if !bestOption.NodeGroup.Exist() {
var scaleUpStatus *status.ScaleUpStatus
@@ -215,25 +235,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
klog.V(2).Info("No similar node groups found")
}

nodeInfo, found := nodeInfos[bestOption.NodeGroup.Id()]
if !found {
// This should never happen, as we already should have retrieved nodeInfo for any considered nodegroup.
klog.Errorf("No node info for: %s", bestOption.NodeGroup.Id())
return status.UpdateScaleUpError(
&status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods},
errors.NewAutoscalerError(
errors.CloudProviderError,
"No node info for best expansion option!"))
}

// Apply upper limits for CPU and memory.
newNodes, aErr = o.resourceManager.ApplyLimits(o.autoscalingContext, newNodes, resourcesLeft, nodeInfo, bestOption.NodeGroup)
if aErr != nil {
return status.UpdateScaleUpError(
&status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods},
aErr)
}

// Balance between similar node groups.
targetNodeGroups := []cloudprovider.NodeGroup{bestOption.NodeGroup}
for _, ng := range bestOption.SimilarNodeGroups {
targetNodeGroups = append(targetNodeGroups, ng)
@@ -254,6 +256,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
aErr)
}

// Execute scale up.
klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
aErr, failedNodeGroups := o.scaleUpExecutor.ExecuteScaleUps(scaleUpInfos, nodeInfos, now)
if aErr != nil {
53 changes: 52 additions & 1 deletion cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go
@@ -822,8 +822,48 @@ func TestNoScaleUpMaxCoresLimitHit(t *testing.T) {
simpleNoScaleUpTest(t, config, results)
}

func TestNoCreateNodeGroupMaxCoresLimitHit(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 7
options.MaxMemoryTotal = 100000
options.NodeAutoprovisioningEnabled = true

largeNode := BuildTestNode("n", 8000, 8000)
SetNodeReadyState(largeNode, true, time.Time{})
largeNodeInfo := schedulerframework.NewNodeInfo()
largeNodeInfo.SetNode(largeNode)

config := &ScaleUpTestConfig{
EnableAutoprovisioning: true,
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng-small"},
},
Pods: []PodConfig{},
ExtraPods: []PodConfig{
{Name: "large-pod", Cpu: 8000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
Options: &options,
NodeTemplateConfigs: map[string]*NodeTemplateConfig{
"n1-standard-8": {
NodeGroupName: "autoprovisioned-n1-standard-8",
MachineType: "n1-standard-8",
NodeInfo: largeNodeInfo,
},
},
}
results := &ScaleTestResults{
NoScaleUpReason: "Insufficient cpu",
ScaleUpStatus: ScaleUpStatusInfo{
PodsRemainUnschedulable: []string{"large-pod"},
},
}

simpleNoScaleUpTest(t, config, results)
}

func simpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) {
results := runSimpleScaleUpTest(t, config)
assert.NotNil(t, results.GroupSizeChanges, "Expected scale up event")
assert.NotNil(t, results.GroupSizeChanges[0], "Expected scale up event")
assert.Equal(t, expectedResults.FinalOption, results.GroupSizeChanges[0])
assert.True(t, results.ScaleUpStatus.WasSuccessful())
@@ -920,14 +960,21 @@ func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestR
}
return nil
}
onCreateGroupFunc := func(nodeGroup string) error {
if config.OnCreateGroup != nil {
return config.OnCreateGroup(nodeGroup)
}
return fmt.Errorf("unexpected node group create: OnCreateGroup not defined")
}
if len(config.NodeTemplateConfigs) > 0 {
machineTypes := []string{}
machineTemplates := map[string]*schedulerframework.NodeInfo{}
for _, ntc := range config.NodeTemplateConfigs {
machineTypes = append(machineTypes, ntc.MachineType)
machineTemplates[ntc.NodeGroupName] = ntc.NodeInfo
machineTemplates[ntc.MachineType] = ntc.NodeInfo
}
provider = testprovider.NewTestAutoprovisioningCloudProvider(onScaleUpFunc, nil, nil, nil, machineTypes, machineTemplates)
provider = testprovider.NewTestAutoprovisioningCloudProvider(onScaleUpFunc, nil, onCreateGroupFunc, nil, machineTypes, machineTemplates)
} else {
provider = testprovider.NewTestCloudProvider(onScaleUpFunc, nil)
}
@@ -975,6 +1022,10 @@ func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestR
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
processors := NewTestProcessors(&context)
processors.ScaleStateNotifier.Register(clusterState)
if config.EnableAutoprovisioning {
processors.NodeGroupListProcessor = &MockAutoprovisioningNodeGroupListProcessor{T: t}
processors.NodeGroupManager = &MockAutoprovisioningNodeGroupManager{T: t, ExtraGroups: 0}
}
orchestrator := New()
orchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
expander := NewMockRepotingStrategy(t, config.ExpansionOptionToChoose)
2 changes: 2 additions & 0 deletions cluster-autoscaler/core/test/common.go
@@ -122,9 +122,11 @@ type ScaleUpTestConfig struct {
Pods []PodConfig
ExtraPods []PodConfig
OnScaleUp testcloudprovider.OnScaleUpFunc
OnCreateGroup testcloudprovider.OnNodeGroupCreateFunc
ExpansionOptionToChoose *GroupSizeChange
Options *config.AutoscalingOptions
NodeTemplateConfigs map[string]*NodeTemplateConfig
EnableAutoprovisioning bool
}

// ScaleUpTestResult represents a node groups scale up result
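
The two new ScaleUpTestConfig fields make autoprovisioning opt-in per test. A hedged usage sketch follows; the test name and the rejected-group check are illustrative, and only the field names and the func(nodeGroup string) error shape come from this diff:

	func TestAutoprovisioningScaleUpSketch(t *testing.T) {
		config := &ScaleUpTestConfig{
			EnableAutoprovisioning: true,
			// Fail the scale-up if an unexpected node group gets created.
			OnCreateGroup: func(nodeGroup string) error {
				if nodeGroup != "autoprovisioned-n1-standard-8" {
					return fmt.Errorf("unexpected node group create: %s", nodeGroup)
				}
				return nil
			},
		}
		_ = config // would be passed to simpleScaleUpTest / simpleNoScaleUpTest
	}
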
