From 4901303bef7fe5659db2193ac14ed100d3a45ac5 Mon Sep 17 00:00:00 2001 From: Maxime Fischer Date: Sun, 12 Nov 2023 22:29:40 +0000 Subject: [PATCH 01/10] Stop (un)tainting nodes from unselected node groups. --- cluster-autoscaler/core/static_autoscaler.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index ac3103973931..9a1675fcd1df 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -228,6 +228,8 @@ func (a *StaticAutoscaler) cleanUpIfRequired() { // CA can die at any time. Removing taints that might have been left from the previous run. if allNodes, err := a.AllNodeLister().List(); err != nil { klog.Errorf("Failed to list ready nodes, not cleaning up taints: %v", err) + } else if allNodes, err = filterNodeGroups(a.CloudProvider, allNodes...); err != nil { + klog.Errorf("Failed to filter nodes based on selected node groups: %v", err) } else { taints.CleanAllToBeDeleted(allNodes, a.AutoscalingContext.ClientSet, a.Recorder, a.CordonNodeBeforeTerminate) @@ -665,6 +667,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr scaleDownStatus.Result == scaledownstatus.ScaleDownNoUnneeded) && a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount != 0 { taintableNodes := a.scaleDownPlanner.UnneededNodes() + allNodes, err := filterNodeGroups(a.CloudProvider, allNodes...) + if err != nil { + klog.Warningf("Failed filtering nodes based on selected node groups: %v", err) + } + // REVIEW: Shall we do the below calls if there's an error? This is essentially gonna mean that untaintableNodes is empty since allNodes is empty. untaintableNodes := subtractNodes(allNodes, taintableNodes) actuation.UpdateSoftDeletionTaints(a.AutoscalingContext, taintableNodes, untaintableNodes) } @@ -967,6 +974,19 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a return allNodes, readyNodes, nil } +func filterNodeGroups(cp cloudprovider.CloudProvider, nodes ...*apiv1.Node) ([]*apiv1.Node, caerrors.AutoscalerError) { + filtered := make([]*apiv1.Node, 0, len(nodes)) + for _, n := range nodes { + if ng, err := cp.NodeGroupForNode(n); err != nil { + klog.Errorf("Failed to get node groupe from node: %v", err) + return nil, caerrors.ToAutoscalerError(caerrors.CloudProviderError, err) + } else if ng != nil { + filtered = append(filtered, n) + } + } + return filtered, nil +} + func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosForGroups map[string]*schedulerframework.NodeInfo, currentTime time.Time) caerrors.AutoscalerError { err := a.clusterStateRegistry.UpdateNodes(allNodes, nodeInfosForGroups, currentTime) if err != nil { From 91477aca4a7f97bdb3ef309130300a081490e47e Mon Sep 17 00:00:00 2001 From: Maxime Fischer Date: Mon, 13 Nov 2023 22:37:25 +0000 Subject: [PATCH 02/10] Add test for node from unselected node group. --- .../core/static_autoscaler_test.go | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 75b0db264d50..9ac8267eee19 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -18,6 +18,7 @@ package core import ( "bytes" + stdcontext "context" "flag" "fmt" "os" @@ -172,6 +173,13 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { SetNodeReadyState(n2, true, time.Now()) n3 := BuildTestNode("n3", 1000, 1000) n4 := BuildTestNode("n4", 1000, 1000) + n5 := BuildTestNode("n5", 1000, 1000) + n5.Spec.Taints = append(n5.Spec.Taints, apiv1.Taint{ + Key: taints.DeletionCandidateTaint, + Value: fmt.Sprint(time.Now().Unix()), + Effect: apiv1.TaintEffectPreferNoSchedule, + }) + SetNodeReadyState(n5, true, time.Now()) p1 := BuildTestPod("p1", 600, 100) p1.Spec.NodeName = "n1" @@ -214,7 +222,11 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { } processorCallbacks := newStaticAutoscalerProcessorCallbacks() - context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider, processorCallbacks, nil) + clientset := fake.NewSimpleClientset(n1, n2, n3, n4, n5) + newN5, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n5.Name, metav1.GetOptions{}) + fmt.Println("#################################", newN5, err) + + context, err := NewScaleTestAutoscalingContext(options, clientset, nil, provider, processorCallbacks, nil) assert.NoError(t, err) setUpScaleDownActuator(&context, options) @@ -328,7 +340,7 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { mock.AssertExpectationsForObjects(t, allPodListerMock, podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock) - // Scale up to node gorup min size. + // Scale up to node group min size. readyNodeLister.SetNodes([]*apiv1.Node{n4}) allNodeLister.SetNodes([]*apiv1.Node{n4}) allPodListerMock.On("List").Return([]*apiv1.Pod{}, nil).Twice() @@ -342,6 +354,22 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) assert.NoError(t, err) mock.AssertExpectationsForObjects(t, onScaleUpMock) + + // Node from non-selected node groups should keep their taints. + autoscaler.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount = 10 + autoscaler.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintTime = 3 * time.Second + + readyNodeLister.SetNodes([]*apiv1.Node{n5}) + allNodeLister.SetNodes([]*apiv1.Node{n5}) + allPodListerMock.On("List").Return([]*apiv1.Pod{}, nil).Twice() + daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil) + podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil) + + err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) + assert.NoError(t, err) + newN5, err = clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n5.Name, metav1.GetOptions{}) + assert.NoError(t, err) + assert.Equal(t, n5.Spec.Taints, newN5.Spec.Taints) } func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) { From cc6ecec2d3e0412d4c3b1199ea9bbd4e7742f99c Mon Sep 17 00:00:00 2001 From: maxime Date: Tue, 12 Dec 2023 11:26:23 +0000 Subject: [PATCH 03/10] Remove debug line. --- cluster-autoscaler/core/static_autoscaler_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 9ac8267eee19..d4ba1818fdae 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -223,9 +223,6 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { processorCallbacks := newStaticAutoscalerProcessorCallbacks() clientset := fake.NewSimpleClientset(n1, n2, n3, n4, n5) - newN5, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n5.Name, metav1.GetOptions{}) - fmt.Println("#################################", newN5, err) - context, err := NewScaleTestAutoscalingContext(options, clientset, nil, provider, processorCallbacks, nil) assert.NoError(t, err) From 7383034c082215abc10d4ff4182e3452aa4f5512 Mon Sep 17 00:00:00 2001 From: maxime Date: Tue, 12 Dec 2023 11:28:02 +0000 Subject: [PATCH 04/10] Fix typo. --- cluster-autoscaler/core/static_autoscaler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 9a1675fcd1df..ad34bbc2c0eb 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -978,7 +978,7 @@ func filterNodeGroups(cp cloudprovider.CloudProvider, nodes ...*apiv1.Node) ([]* filtered := make([]*apiv1.Node, 0, len(nodes)) for _, n := range nodes { if ng, err := cp.NodeGroupForNode(n); err != nil { - klog.Errorf("Failed to get node groupe from node: %v", err) + klog.Errorf("Failed to get a node group node node: %v", err) return nil, caerrors.ToAutoscalerError(caerrors.CloudProviderError, err) } else if ng != nil { filtered = append(filtered, n) From c3810ec199d5d16314ef433023ecd560906300b9 Mon Sep 17 00:00:00 2001 From: maxime Date: Tue, 12 Dec 2023 11:33:33 +0000 Subject: [PATCH 05/10] Rename filterNodeGroups to filterNodeFromSelectedGroups. --- cluster-autoscaler/core/static_autoscaler.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index ad34bbc2c0eb..9f76a2a8dc30 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -228,7 +228,7 @@ func (a *StaticAutoscaler) cleanUpIfRequired() { // CA can die at any time. Removing taints that might have been left from the previous run. if allNodes, err := a.AllNodeLister().List(); err != nil { klog.Errorf("Failed to list ready nodes, not cleaning up taints: %v", err) - } else if allNodes, err = filterNodeGroups(a.CloudProvider, allNodes...); err != nil { + } else if allNodes, err = filterNodesFromSelectedGroups(a.CloudProvider, allNodes...); err != nil { klog.Errorf("Failed to filter nodes based on selected node groups: %v", err) } else { taints.CleanAllToBeDeleted(allNodes, @@ -667,7 +667,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr scaleDownStatus.Result == scaledownstatus.ScaleDownNoUnneeded) && a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount != 0 { taintableNodes := a.scaleDownPlanner.UnneededNodes() - allNodes, err := filterNodeGroups(a.CloudProvider, allNodes...) + allNodes, err := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) if err != nil { klog.Warningf("Failed filtering nodes based on selected node groups: %v", err) } @@ -974,7 +974,7 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a return allNodes, readyNodes, nil } -func filterNodeGroups(cp cloudprovider.CloudProvider, nodes ...*apiv1.Node) ([]*apiv1.Node, caerrors.AutoscalerError) { +func filterNodesFromSelectedGroups(cp cloudprovider.CloudProvider, nodes ...*apiv1.Node) ([]*apiv1.Node, caerrors.AutoscalerError) { filtered := make([]*apiv1.Node, 0, len(nodes)) for _, n := range nodes { if ng, err := cp.NodeGroupForNode(n); err != nil { From 95ad6c99a290ac1f0ef5f7858d3c4e8b3bf6db68 Mon Sep 17 00:00:00 2001 From: maxime Date: Tue, 12 Dec 2023 15:49:38 +0000 Subject: [PATCH 06/10] Fallback to initial node list if filtering fails. --- cluster-autoscaler/core/static_autoscaler.go | 43 +++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 9f76a2a8dc30..52ee36a5e45d 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -228,10 +228,15 @@ func (a *StaticAutoscaler) cleanUpIfRequired() { // CA can die at any time. Removing taints that might have been left from the previous run. if allNodes, err := a.AllNodeLister().List(); err != nil { klog.Errorf("Failed to list ready nodes, not cleaning up taints: %v", err) - } else if allNodes, err = filterNodesFromSelectedGroups(a.CloudProvider, allNodes...); err != nil { - klog.Errorf("Failed to filter nodes based on selected node groups: %v", err) } else { - taints.CleanAllToBeDeleted(allNodes, + // Make sure we are only cleaning taints from selected node groups. + // If filtering fails, we use the initial list of all nodes as a fallback. + selectedNodes, err = filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) + if err != nil { + klog.Warningf("Failed to filter nodes based on selected node groups: %v", err) + selectedNodes = allNodes + } + taints.CleanAllToBeDeleted(selectedNodes, a.AutoscalingContext.ClientSet, a.Recorder, a.CordonNodeBeforeTerminate) if a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount == 0 { // Clean old taints if soft taints handling is disabled @@ -667,12 +672,19 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr scaleDownStatus.Result == scaledownstatus.ScaleDownNoUnneeded) && a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount != 0 { taintableNodes := a.scaleDownPlanner.UnneededNodes() - allNodes, err := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) + + // Make sure we are only cleaning taints from selected node groups. + // If filtering fails, we use the initial list of all nodes as a fallback. + selectedNodes, err := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) if err != nil { klog.Warningf("Failed filtering nodes based on selected node groups: %v", err) + selectedNodes = allNodes } - // REVIEW: Shall we do the below calls if there's an error? This is essentially gonna mean that untaintableNodes is empty since allNodes is empty. - untaintableNodes := subtractNodes(allNodes, taintableNodes) + + // This is a sanity check to make sure `taintableNodes` only includes + // nodes from selected nodes. + taintableNodes = intersectNodes(selectedNodes, taintableNodes) + untaintableNodes := subtractNodes(selectedNodes, taintableNodes) actuation.UpdateSoftDeletionTaints(a.AutoscalingContext, taintableNodes, untaintableNodes) } @@ -1085,6 +1097,25 @@ func subtractNodes(a []*apiv1.Node, b []*apiv1.Node) []*apiv1.Node { return subtractNodesByName(a, nodeNames(b)) } +func filterNodesByName(nodes []*apiv1.Node, names []string) []*apiv1.Node { + c := make([]*apiv1.Node, 0, len(names)) + filterSet := make(map[string]bool, len(names)) + for _, name := range names { + filterSet[name] = true + } + for _, n := range nodes { + if filterSet[n.Name] { + c = append(c, n) + } + } + return c +} + +// intersectNodes gives intersection of 2 node lists +func intersectNodes(a []*apiv1.Node, b []*apiv1.Node) []*apiv1.Node { + return filterNodesByName(a, nodeNames(b)) +} + func nodeNames(ns []*apiv1.Node) []string { names := make([]string, len(ns)) for i, node := range ns { From 368441dcd0e4dc93fcc7e5774537fb53940dbee3 Mon Sep 17 00:00:00 2001 From: maxime Date: Tue, 12 Dec 2023 17:16:01 +0000 Subject: [PATCH 07/10] Fix compilation errors. --- cluster-autoscaler/core/static_autoscaler.go | 2 +- cluster-autoscaler/core/static_autoscaler_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 52ee36a5e45d..6e953946e0fc 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -231,7 +231,7 @@ func (a *StaticAutoscaler) cleanUpIfRequired() { } else { // Make sure we are only cleaning taints from selected node groups. // If filtering fails, we use the initial list of all nodes as a fallback. - selectedNodes, err = filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) + selectedNodes, err := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) if err != nil { klog.Warningf("Failed to filter nodes based on selected node groups: %v", err) selectedNodes = allNodes diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index d4ba1818fdae..c81dc1321045 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -364,7 +364,7 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) assert.NoError(t, err) - newN5, err = clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n5.Name, metav1.GetOptions{}) + newN5, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n5.Name, metav1.GetOptions{}) assert.NoError(t, err) assert.Equal(t, n5.Spec.Taints, newN5.Spec.Taints) } From 5f6eedc8f7b9d7621cc760392c87d4af4cc26eb0 Mon Sep 17 00:00:00 2001 From: maxime Date: Tue, 12 Dec 2023 17:52:29 +0000 Subject: [PATCH 08/10] Split new test into a separate function. --- .../core/static_autoscaler_test.go | 149 +++++++++++++++--- 1 file changed, 124 insertions(+), 25 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index c81dc1321045..da4d9ad19369 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -173,13 +173,6 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { SetNodeReadyState(n2, true, time.Now()) n3 := BuildTestNode("n3", 1000, 1000) n4 := BuildTestNode("n4", 1000, 1000) - n5 := BuildTestNode("n5", 1000, 1000) - n5.Spec.Taints = append(n5.Spec.Taints, apiv1.Taint{ - Key: taints.DeletionCandidateTaint, - Value: fmt.Sprint(time.Now().Unix()), - Effect: apiv1.TaintEffectPreferNoSchedule, - }) - SetNodeReadyState(n5, true, time.Now()) p1 := BuildTestPod("p1", 600, 100) p1.Spec.NodeName = "n1" @@ -222,8 +215,7 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { } processorCallbacks := newStaticAutoscalerProcessorCallbacks() - clientset := fake.NewSimpleClientset(n1, n2, n3, n4, n5) - context, err := NewScaleTestAutoscalingContext(options, clientset, nil, provider, processorCallbacks, nil) + context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider, processorCallbacks, nil) assert.NoError(t, err) setUpScaleDownActuator(&context, options) @@ -351,22 +343,6 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) assert.NoError(t, err) mock.AssertExpectationsForObjects(t, onScaleUpMock) - - // Node from non-selected node groups should keep their taints. - autoscaler.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount = 10 - autoscaler.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintTime = 3 * time.Second - - readyNodeLister.SetNodes([]*apiv1.Node{n5}) - allNodeLister.SetNodes([]*apiv1.Node{n5}) - allPodListerMock.On("List").Return([]*apiv1.Pod{}, nil).Twice() - daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil) - podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil) - - err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) - assert.NoError(t, err) - newN5, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n5.Name, metav1.GetOptions{}) - assert.NoError(t, err) - assert.Equal(t, n5.Spec.Taints, newN5.Spec.Taints) } func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) { @@ -1016,6 +992,129 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t * podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock) } +// We should not touch taints from unselected node groups. +func TestStaticAutoscalerRunOnceWithUnselectedNodeGroups(t *testing.T) { + readyNodeLister := kubernetes.NewTestNodeLister(nil) + allNodeLister := kubernetes.NewTestNodeLister(nil) + allPodListerMock := &podListerMock{} + podDisruptionBudgetListerMock := &podDisruptionBudgetListerMock{} + daemonSetListerMock := &daemonSetListerMock{} + onScaleUpMock := &onScaleUpMock{} + onScaleDownMock := &onScaleDownMock{} + deleteFinished := make(chan bool, 1) + + n1 := BuildTestNode("n1", 1000, 1000) + n1.Spec.Taints = append(n1.Spec.Taints, apiv1.Taint{ + Key: taints.DeletionCandidateTaint, + Value: fmt.Sprint(time.Now().Unix()), + Effect: apiv1.TaintEffectPreferNoSchedule, + }) + SetNodeReadyState(n1, true, time.Now()) + n2 := BuildTestNode("n2", 1000, 1000) + n2.Spec.Taints = append(n2.Spec.Taints, apiv1.Taint{ + Key: taints.DeletionCandidateTaint, + Value: fmt.Sprint(time.Now().Unix()), + Effect: apiv1.TaintEffectPreferNoSchedule, + }) + SetNodeReadyState(n2, true, time.Now()) + + p1 := BuildTestPod("p1", 600, 100) + p1.Spec.NodeName = n1.Name + + provider := testprovider.NewTestAutoprovisioningCloudProvider( + func(id string, delta int) error { + return onScaleUpMock.ScaleUp(id, delta) + }, func(id string, name string) error { + ret := onScaleDownMock.ScaleDown(id, name) + deleteFinished <- true + return ret + }, + nil, nil, nil, nil) + provider.AddNodeGroup("ng1", 1, 10, 1) + provider.AddNode("ng1", n1) + ng1 := reflect.ValueOf(provider.GetNodeGroup("ng1")).Interface().(*testprovider.TestNodeGroup) + assert.NotNil(t, ng1) + assert.NotNil(t, provider) + + // Create context with mocked lister registry. + options := config.AutoscalingOptions{ + NodeGroupDefaults: config.NodeGroupAutoscalingOptions{ + ScaleDownUnneededTime: time.Minute, + ScaleDownUnreadyTime: time.Minute, + ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, + }, + EstimatorName: estimator.BinpackingEstimatorName, + EnforceNodeGroupMinSize: true, + ScaleDownEnabled: true, + MaxNodesTotal: 1, + MaxCoresTotal: 10, + MaxMemoryTotal: 100000, + } + processorCallbacks := newStaticAutoscalerProcessorCallbacks() + + clientset := fake.NewSimpleClientset(n1, n2) + context, err := NewScaleTestAutoscalingContext(options, clientset, nil, provider, processorCallbacks, nil) + assert.NoError(t, err) + + setUpScaleDownActuator(&context, options) + + listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, allPodListerMock, podDisruptionBudgetListerMock, daemonSetListerMock, + nil, nil, nil, nil) + context.ListerRegistry = listerRegistry + + clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ + OkTotalUnreadyCount: 1, + } + processors := NewTestProcessors(&context) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults)) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(t, &context, processors, clusterState) + suOrchestrator := orchestrator.New() + suOrchestrator.Initialize(&context, processors, clusterState, taints.TaintConfig{}) + + context.AutoscalingOptions.MaxBulkSoftTaintCount = 10 + context.AutoscalingOptions.MaxBulkSoftTaintTime = 3 * time.Second + + autoscaler := &StaticAutoscaler{ + AutoscalingContext: &context, + clusterStateRegistry: clusterState, + lastScaleUpTime: time.Now(), + lastScaleDownFailTime: time.Now(), + scaleDownPlanner: sdPlanner, + scaleDownActuator: sdActuator, + scaleUpOrchestrator: suOrchestrator, + processors: processors, + processorCallbacks: processorCallbacks, + initialized: true, + } + + // Node from selected node groups can get their deletion candidate taints removed. + readyNodeLister.SetNodes([]*apiv1.Node{n1}) + allNodeLister.SetNodes([]*apiv1.Node{n1}) + allPodListerMock.On("List").Return([]*apiv1.Pod{p1}, nil).Twice() + daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil) + podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil) + + err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) + assert.NoError(t, err) + newN1, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n1.Name, metav1.GetOptions{}) + assert.NoError(t, err) + assert.Empty(t, newN1.Spec.Taints) + + // Node from non-selected node groups should keep their deletion candidate taints. + readyNodeLister.SetNodes([]*apiv1.Node{n2}) + allNodeLister.SetNodes([]*apiv1.Node{n2}) + allPodListerMock.On("List").Return([]*apiv1.Pod{}, nil).Twice() + daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil) + podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil) + + err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) + assert.NoError(t, err) + newN2, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n2.Name, metav1.GetOptions{}) + assert.NoError(t, err) + assert.NotEmpty(t, newN2.Spec.Taints) +} + func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { // setup provider := &mockprovider.CloudProvider{} From a47ef89f5a6eefa991858524ca017bfec1900503 Mon Sep 17 00:00:00 2001 From: maxime Date: Fri, 19 Jan 2024 14:10:33 +0000 Subject: [PATCH 09/10] Only log when we fail to get NodeGroup for a node. --- cluster-autoscaler/core/static_autoscaler.go | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 3c44851169cc..c3bc91cdc2c5 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -229,12 +229,7 @@ func (a *StaticAutoscaler) cleanUpIfRequired() { klog.Errorf("Failed to list ready nodes, not cleaning up taints: %v", err) } else { // Make sure we are only cleaning taints from selected node groups. - // If filtering fails, we use the initial list of all nodes as a fallback. - selectedNodes, err := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) - if err != nil { - klog.Warningf("Failed to filter nodes based on selected node groups: %v", err) - selectedNodes = allNodes - } + selectedNodes := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) taints.CleanAllToBeDeleted(selectedNodes, a.AutoscalingContext.ClientSet, a.Recorder, a.CordonNodeBeforeTerminate) if a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount == 0 { @@ -669,12 +664,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr taintableNodes := a.scaleDownPlanner.UnneededNodes() // Make sure we are only cleaning taints from selected node groups. - // If filtering fails, we use the initial list of all nodes as a fallback. - selectedNodes, err := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) - if err != nil { - klog.Warningf("Failed filtering nodes based on selected node groups: %v", err) - selectedNodes = allNodes - } + selectedNodes := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...) // This is a sanity check to make sure `taintableNodes` only includes // nodes from selected nodes. @@ -983,17 +973,16 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a return allNodes, readyNodes, nil } -func filterNodesFromSelectedGroups(cp cloudprovider.CloudProvider, nodes ...*apiv1.Node) ([]*apiv1.Node, caerrors.AutoscalerError) { +func filterNodesFromSelectedGroups(cp cloudprovider.CloudProvider, nodes ...*apiv1.Node) []*apiv1.Node { filtered := make([]*apiv1.Node, 0, len(nodes)) for _, n := range nodes { if ng, err := cp.NodeGroupForNode(n); err != nil { klog.Errorf("Failed to get a node group node node: %v", err) - return nil, caerrors.ToAutoscalerError(caerrors.CloudProviderError, err) } else if ng != nil { filtered = append(filtered, n) } } - return filtered, nil + return filtered } func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosForGroups map[string]*schedulerframework.NodeInfo, currentTime time.Time) caerrors.AutoscalerError { From e8e3ad0b1f16ae770f3e52c084d8430bf2104e8d Mon Sep 17 00:00:00 2001 From: maxime Date: Mon, 22 Jan 2024 11:24:40 +0000 Subject: [PATCH 10/10] Move to table-based tests. --- .../core/static_autoscaler_test.go | 153 ++++++++---------- 1 file changed, 65 insertions(+), 88 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 57d71900f387..c5e73b202fe2 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1123,15 +1123,6 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t * // We should not touch taints from unselected node groups. func TestStaticAutoscalerRunOnceWithUnselectedNodeGroups(t *testing.T) { - readyNodeLister := kubernetes.NewTestNodeLister(nil) - allNodeLister := kubernetes.NewTestNodeLister(nil) - allPodListerMock := &podListerMock{} - podDisruptionBudgetListerMock := &podDisruptionBudgetListerMock{} - daemonSetListerMock := &daemonSetListerMock{} - onScaleUpMock := &onScaleUpMock{} - onScaleDownMock := &onScaleDownMock{} - deleteFinished := make(chan bool, 1) - n1 := BuildTestNode("n1", 1000, 1000) n1.Spec.Taints = append(n1.Spec.Taints, apiv1.Taint{ Key: taints.DeletionCandidateTaint, @@ -1150,98 +1141,84 @@ func TestStaticAutoscalerRunOnceWithUnselectedNodeGroups(t *testing.T) { p1 := BuildTestPod("p1", 600, 100) p1.Spec.NodeName = n1.Name - provider := testprovider.NewTestAutoprovisioningCloudProvider( - func(id string, delta int) error { - return onScaleUpMock.ScaleUp(id, delta) - }, func(id string, name string) error { - ret := onScaleDownMock.ScaleDown(id, name) - deleteFinished <- true - return ret - }, - nil, nil, nil, nil) + // set minimal cloud provider where only ng1 is defined as selected node group + provider := testprovider.NewTestCloudProvider(nil, nil) provider.AddNodeGroup("ng1", 1, 10, 1) provider.AddNode("ng1", n1) - ng1 := reflect.ValueOf(provider.GetNodeGroup("ng1")).Interface().(*testprovider.TestNodeGroup) - assert.NotNil(t, ng1) assert.NotNil(t, provider) - // Create context with mocked lister registry. - options := config.AutoscalingOptions{ - NodeGroupDefaults: config.NodeGroupAutoscalingOptions{ - ScaleDownUnneededTime: time.Minute, - ScaleDownUnreadyTime: time.Minute, - ScaleDownUtilizationThreshold: 0.5, - MaxNodeProvisionTime: 10 * time.Second, + tests := map[string]struct { + node *apiv1.Node + pods []*apiv1.Pod + expectedTaints []apiv1.Taint + }{ + "Node from selected node groups can get their deletion candidate taints removed": { + node: n1, + pods: []*apiv1.Pod{p1}, + expectedTaints: []apiv1.Taint{}, + }, + "Node from non-selected node groups should keep their deletion candidate taints": { + node: n2, + pods: nil, + expectedTaints: n2.Spec.Taints, }, - EstimatorName: estimator.BinpackingEstimatorName, - EnforceNodeGroupMinSize: true, - ScaleDownEnabled: true, - MaxNodesTotal: 1, - MaxCoresTotal: 10, - MaxMemoryTotal: 100000, } - processorCallbacks := newStaticAutoscalerProcessorCallbacks() - clientset := fake.NewSimpleClientset(n1, n2) - context, err := NewScaleTestAutoscalingContext(options, clientset, nil, provider, processorCallbacks, nil) - assert.NoError(t, err) + for name, test := range tests { + // prevent issues with scoping, we should be able to get rid of that with Go 1.22 + test := test + t.Run(name, func(t *testing.T) { + t.Parallel() + // Create fake listers for the generated nodes, nothing returned by the rest (but the ones used in the tested path have to be defined). + readyNodeLister := kubernetes.NewTestNodeLister([]*apiv1.Node{test.node}) + allNodeLister := kubernetes.NewTestNodeLister([]*apiv1.Node{test.node}) + allPodListerMock := kubernetes.NewTestPodLister(test.pods) + daemonSetLister, err := kubernetes.NewTestDaemonSetLister(nil) + assert.NoError(t, err) + listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, allPodListerMock, + kubernetes.NewTestPodDisruptionBudgetLister(nil), daemonSetLister, + nil, nil, nil, nil) + + // Create context with minimal autoscalingOptions that guarantee we reach the tested logic. + autoscalingOptions := config.AutoscalingOptions{ + ScaleDownEnabled: true, + MaxBulkSoftTaintCount: 10, + MaxBulkSoftTaintTime: 3 * time.Second, + } + processorCallbacks := newStaticAutoscalerProcessorCallbacks() + clientset := fake.NewSimpleClientset(test.node) + context, err := NewScaleTestAutoscalingContext(autoscalingOptions, clientset, listerRegistry, provider, processorCallbacks, nil) + assert.NoError(t, err) - setUpScaleDownActuator(&context, options) + // Create CSR with unhealthy cluster protection effectively disabled, to guarantee we reach the tested logic. + clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ + OkTotalUnreadyCount: 1, + } + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(autoscalingOptions.NodeGroupDefaults)) - listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, allPodListerMock, podDisruptionBudgetListerMock, daemonSetListerMock, - nil, nil, nil, nil) - context.ListerRegistry = listerRegistry + // Setting the Actuator is necessary for testing any scale-down logic, it shouldn't have anything to do in this test. + sdActuator := actuation.NewActuator(&context, clusterState, deletiontracker.NewNodeDeletionTracker(0*time.Second), options.NodeDeleteOptions{}, nil, NewTestProcessors(&context).NodeGroupConfigProcessor) + context.ScaleDownActuator = sdActuator - clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - OkTotalUnreadyCount: 1, - } - processors := NewTestProcessors(&context) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults)) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState) - suOrchestrator := orchestrator.New() - suOrchestrator.Initialize(&context, processors, clusterState, taints.TaintConfig{}) + // Fake planner that keeps track of the scale-down candidates passed to UpdateClusterState. + sdPlanner := &candidateTrackingFakePlanner{} - context.AutoscalingOptions.MaxBulkSoftTaintCount = 10 - context.AutoscalingOptions.MaxBulkSoftTaintTime = 3 * time.Second + autoscaler := &StaticAutoscaler{ + AutoscalingContext: &context, + clusterStateRegistry: clusterState, + scaleDownPlanner: sdPlanner, + scaleDownActuator: sdActuator, + processors: NewTestProcessors(&context), + processorCallbacks: processorCallbacks, + } - autoscaler := &StaticAutoscaler{ - AutoscalingContext: &context, - clusterStateRegistry: clusterState, - lastScaleUpTime: time.Now(), - lastScaleDownFailTime: time.Now(), - scaleDownPlanner: sdPlanner, - scaleDownActuator: sdActuator, - scaleUpOrchestrator: suOrchestrator, - processors: processors, - processorCallbacks: processorCallbacks, - initialized: true, + err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) + assert.NoError(t, err) + newNode, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), test.node.Name, metav1.GetOptions{}) + assert.NoError(t, err) + assert.Equal(t, test.expectedTaints, newNode.Spec.Taints) + }) } - - // Node from selected node groups can get their deletion candidate taints removed. - readyNodeLister.SetNodes([]*apiv1.Node{n1}) - allNodeLister.SetNodes([]*apiv1.Node{n1}) - allPodListerMock.On("List").Return([]*apiv1.Pod{p1}, nil).Twice() - daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil) - podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil) - - err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) - assert.NoError(t, err) - newN1, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n1.Name, metav1.GetOptions{}) - assert.NoError(t, err) - assert.Empty(t, newN1.Spec.Taints) - - // Node from non-selected node groups should keep their deletion candidate taints. - readyNodeLister.SetNodes([]*apiv1.Node{n2}) - allNodeLister.SetNodes([]*apiv1.Node{n2}) - allPodListerMock.On("List").Return([]*apiv1.Pod{}, nil).Twice() - daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil) - podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil) - - err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour)) - assert.NoError(t, err) - newN2, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), n2.Name, metav1.GetOptions{}) - assert.NoError(t, err) - assert.NotEmpty(t, newN2.Spec.Taints) } func TestStaticAutoscalerRunOnceWithBypassedSchedulers(t *testing.T) {