diff --git a/cluster-autoscaler/cloudprovider/alicloud/alicloud_auto_scaling_group.go b/cluster-autoscaler/cloudprovider/alicloud/alicloud_auto_scaling_group.go
index 4ecb64c49b56..7ae1f9295de8 100644
--- a/cluster-autoscaler/cloudprovider/alicloud/alicloud_auto_scaling_group.go
+++ b/cluster-autoscaler/cloudprovider/alicloud/alicloud_auto_scaling_group.go
@@ -150,6 +150,11 @@ func (asg *Asg) DeleteNodes(nodes []*apiv1.Node) error {
 	return asg.manager.DeleteInstances(nodeIds)
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (asg *Asg) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // Id returns asg id.
 func (asg *Asg) Id() string {
 	return asg.id
diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go
index 118f5e91e88d..1670e98e574c 100644
--- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go
@@ -349,6 +349,11 @@ func (ng *AwsNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return ng.awsManager.DeleteInstances(refs)
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *AwsNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // Id returns asg id.
 func (ng *AwsNodeGroup) Id() string {
 	return ng.asg.Name
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go
index c69ff8f07083..9d3731f5d1b9 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go
@@ -471,6 +471,11 @@ func (as *AgentPool) DeleteNodes(nodes []*apiv1.Node) error {
 	return as.DeleteInstances(refs)
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (as *AgentPool) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // Debug returns a debug string for the agent pool.
 func (as *AgentPool) Debug() string {
 	return fmt.Sprintf("%s (%d:%d)", as.Name, as.MinSize(), as.MaxSize())
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
index c4399ea45b37..807e000f426b 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
@@ -626,6 +626,11 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error {
 	return scaleSet.DeleteInstances(refs, hasUnregisteredNodes)
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (scaleSet *ScaleSet) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // Id returns ScaleSet id.
 func (scaleSet *ScaleSet) Id() string {
 	return scaleSet.Name
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_vms_pool.go b/cluster-autoscaler/cloudprovider/azure/azure_vms_pool.go
index b9387d4aebda..c3db897ea919 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_vms_pool.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_vms_pool.go
@@ -120,6 +120,11 @@ func (agentPool *VMsPool) DeleteNodes(nodes []*apiv1.Node) error {
 	return cloudprovider.ErrNotImplemented
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
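+// Force deletion is not yet supported for VMs pools, so this always
+// returns cloudprovider.ErrNotImplemented.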
+func (agentPool *VMsPool) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group.
 func (agentPool *VMsPool) DecreaseTargetSize(delta int) error {
 	// TODO(wenxuan): Implement this method
diff --git a/cluster-autoscaler/cloudprovider/baiducloud/baiducloud_cloud_provider.go b/cluster-autoscaler/cloudprovider/baiducloud/baiducloud_cloud_provider.go
index dfbd9c2095ec..1698eb23cbde 100644
--- a/cluster-autoscaler/cloudprovider/baiducloud/baiducloud_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/baiducloud/baiducloud_cloud_provider.go
@@ -315,6 +315,11 @@ func (asg *Asg) DeleteNodes(nodes []*apiv1.Node) error {
 	return asg.baiducloudManager.ScaleDownCluster(nodeID)
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (asg *Asg) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // Belongs returns true if the given node belongs to the NodeGroup.
 func (asg *Asg) Belongs(instanceID string) (bool, error) {
 	targetAsg, err := asg.baiducloudManager.GetAsgForInstance(instanceID)
diff --git a/cluster-autoscaler/cloudprovider/bizflycloud/bizflycloud_node_group.go b/cluster-autoscaler/cloudprovider/bizflycloud/bizflycloud_node_group.go
index 5b4cea3b869a..8ed2c7b647bb 100644
--- a/cluster-autoscaler/cloudprovider/bizflycloud/bizflycloud_node_group.go
+++ b/cluster-autoscaler/cloudprovider/bizflycloud/bizflycloud_node_group.go
@@ -132,6 +132,11 @@ func (n *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/brightbox/brightbox_node_group.go b/cluster-autoscaler/cloudprovider/brightbox/brightbox_node_group.go
index db184f7fd04b..63406917db12 100644
--- a/cluster-autoscaler/cloudprovider/brightbox/brightbox_node_group.go
+++ b/cluster-autoscaler/cloudprovider/brightbox/brightbox_node_group.go
@@ -153,6 +153,11 @@ func (ng *brightboxNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *brightboxNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This
 // function doesn't permit to delete any existing node and can be used
 // only to reduce the request for new nodes that have not been yet
diff --git a/cluster-autoscaler/cloudprovider/cherryservers/cherry_node_group.go b/cluster-autoscaler/cloudprovider/cherryservers/cherry_node_group.go
index 33dfd8bd181a..b69b5ec5db96 100644
--- a/cluster-autoscaler/cloudprovider/cherryservers/cherry_node_group.go
+++ b/cluster-autoscaler/cloudprovider/cherryservers/cherry_node_group.go
@@ -189,6 +189,11 @@ func (ng *cherryNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *cherryNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // getNodesToDelete safely gets all of the nodes added to the delete queue.
 // "safely", as in it locks, gets and then releases the queue.
 func (ng *cherryNodeGroup) getNodesToDelete() []*apiv1.Node {
diff --git a/cluster-autoscaler/cloudprovider/civo/civo_node_group.go b/cluster-autoscaler/cloudprovider/civo/civo_node_group.go
index ae3ddf34f430..83d2e754f203 100644
--- a/cluster-autoscaler/cloudprovider/civo/civo_node_group.go
+++ b/cluster-autoscaler/cloudprovider/civo/civo_node_group.go
@@ -144,6 +144,11 @@ func (n *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/cloud_provider.go b/cluster-autoscaler/cloudprovider/cloud_provider.go
index 171a89cfab35..623e9bd345d2 100644
--- a/cluster-autoscaler/cloudprovider/cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/cloud_provider.go
@@ -195,6 +195,12 @@ type NodeGroup interface {
 	// should wait until node group size is updated. Implementation required.
 	DeleteNodes([]*apiv1.Node) error
 
+	// ForceDeleteNodes deletes nodes from this node group, without checking for
+	// constraints like minimum size validation. An error is returned either on
+	// failure or if the given node doesn't belong to this node group. This function
+	// should wait until node group size is updated.
+	ForceDeleteNodes([]*apiv1.Node) error
+
 	// DecreaseTargetSize decreases the target size of the node group. This function
 	// doesn't permit to delete any existing node and can be used only to reduce the
 	// request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/cloudstack/cloudstack_node_group.go b/cluster-autoscaler/cloudprovider/cloudstack/cloudstack_node_group.go
index 383587f30eb7..6e8b767c0d52 100644
--- a/cluster-autoscaler/cloudprovider/cloudstack/cloudstack_node_group.go
+++ b/cluster-autoscaler/cloudprovider/cloudstack/cloudstack_node_group.go
@@ -123,6 +123,11 @@ func (asg *asg) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (asg *asg) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // Id returns cluster id.
 func (asg *asg) Id() string {
 	return asg.cluster.ID
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go
index 481039024726..c8204a9c8c65 100644
--- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go
+++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go
@@ -177,6 +177,11 @@ func (ng *nodegroup) DeleteNodes(nodes []*corev1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *nodegroup) ForceDeleteNodes(nodes []*corev1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group.
 // This function doesn't permit to delete any existing node and can be
 // used only to reduce the request for new nodes that have not been
diff --git a/cluster-autoscaler/cloudprovider/digitalocean/digitalocean_node_group.go b/cluster-autoscaler/cloudprovider/digitalocean/digitalocean_node_group.go
index 3f27d0e67180..3f880553f2ce 100644
--- a/cluster-autoscaler/cloudprovider/digitalocean/digitalocean_node_group.go
+++ b/cluster-autoscaler/cloudprovider/digitalocean/digitalocean_node_group.go
@@ -139,6 +139,11 @@ func (n *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/equinixmetal/node_group.go b/cluster-autoscaler/cloudprovider/equinixmetal/node_group.go
index 4be962eecbb7..d7367a753d0e 100644
--- a/cluster-autoscaler/cloudprovider/equinixmetal/node_group.go
+++ b/cluster-autoscaler/cloudprovider/equinixmetal/node_group.go
@@ -226,6 +226,11 @@ func (ng *equinixMetalNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *equinixMetalNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the cluster node_count in Equinix Metal.
 func (ng *equinixMetalNodeGroup) DecreaseTargetSize(delta int) error {
 	if delta >= 0 {
diff --git a/cluster-autoscaler/cloudprovider/exoscale/exoscale_node_group_instance_pool.go b/cluster-autoscaler/cloudprovider/exoscale/exoscale_node_group_instance_pool.go
index bc82deb903f8..359c447b66cd 100644
--- a/cluster-autoscaler/cloudprovider/exoscale/exoscale_node_group_instance_pool.go
+++ b/cluster-autoscaler/cloudprovider/exoscale/exoscale_node_group_instance_pool.go
@@ -131,6 +131,11 @@ func (n *instancePoolNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *instancePoolNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/exoscale/exoscale_node_group_sks_nodepool.go b/cluster-autoscaler/cloudprovider/exoscale/exoscale_node_group_sks_nodepool.go
index 9c0d3131438d..82fc0823050b 100644
--- a/cluster-autoscaler/cloudprovider/exoscale/exoscale_node_group_sks_nodepool.go
+++ b/cluster-autoscaler/cloudprovider/exoscale/exoscale_node_group_sks_nodepool.go
@@ -138,6 +138,11 @@ func (n *sksNodepoolNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *sksNodepoolNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/externalgrpc/externalgrpc_node_group.go b/cluster-autoscaler/cloudprovider/externalgrpc/externalgrpc_node_group.go
index 52a467fd24d4..2f8000aded1d 100644
--- a/cluster-autoscaler/cloudprovider/externalgrpc/externalgrpc_node_group.go
+++ b/cluster-autoscaler/cloudprovider/externalgrpc/externalgrpc_node_group.go
@@ -122,6 +122,11 @@ func (n *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go
index d9cb61695405..73d5773e7472 100644
--- a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go
@@ -292,6 +292,11 @@ func (mig *gceMig) DeleteNodes(nodes []*apiv1.Node) error {
 	if int(size) <= mig.MinSize() {
 		return fmt.Errorf("min size reached, nodes will not be deleted")
 	}
+	return mig.ForceDeleteNodes(nodes)
+}
+
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (mig *gceMig) ForceDeleteNodes(nodes []*apiv1.Node) error {
 	refs := make([]GceRef, 0, len(nodes))
 	for _, node := range nodes {
diff --git a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider_test.go
index 964e65b1744b..00f91ea5cfdf 100644
--- a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider_test.go
+++ b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider_test.go
@@ -405,6 +405,14 @@ func TestMig(t *testing.T) {
 	assert.Equal(t, "min size reached, nodes will not be deleted", err.Error())
 	mock.AssertExpectationsForObjects(t, gceManagerMock)
 
+	// Test ForceDeleteNodes - ignore the min size constraint.
+	gceManagerMock.On("GetMigForInstance", n1ref).Return(mig1, nil).Once()
+	gceManagerMock.On("GetMigForInstance", n2ref).Return(mig1, nil).Once()
+	gceManagerMock.On("DeleteInstances", []GceRef{n1ref, n2ref}).Return(nil).Once()
+	err = mig1.ForceDeleteNodes([]*apiv1.Node{n1, n2})
+	assert.NoError(t, err)
+	mock.AssertExpectationsForObjects(t, gceManagerMock)
+
 	// Test Nodes.
 	gceManagerMock.On("GetMigNodes", mock.AnythingOfType("*gce.gceMig")).Return(
 		[]GceInstance{
diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go
index 6dffa1d972d9..c0f7510a8b3d 100644
--- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go
+++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go
@@ -214,6 +214,11 @@ func (n *hetznerNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *hetznerNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_auto_scaling_group.go b/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_auto_scaling_group.go
index 9440345ceab7..f4f86d9f2d63 100644
--- a/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_auto_scaling_group.go
+++ b/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_auto_scaling_group.go
@@ -146,6 +146,11 @@ func (asg *AutoScalingGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (asg *AutoScalingGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/ionoscloud/ionoscloud_cloud_provider.go b/cluster-autoscaler/cloudprovider/ionoscloud/ionoscloud_cloud_provider.go
index 23b2dc5e1114..7669cba28b09 100644
--- a/cluster-autoscaler/cloudprovider/ionoscloud/ionoscloud_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/ionoscloud/ionoscloud_cloud_provider.go
@@ -103,6 +103,11 @@ func (n *nodePool) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *nodePool) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/kamatera/kamatera_node_group.go b/cluster-autoscaler/cloudprovider/kamatera/kamatera_node_group.go
index 130a3c6d7764..eb55ebcdfbb0 100644
--- a/cluster-autoscaler/cloudprovider/kamatera/kamatera_node_group.go
+++ b/cluster-autoscaler/cloudprovider/kamatera/kamatera_node_group.go
@@ -112,6 +112,11 @@ func (n *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/kubemark/kubemark_linux.go b/cluster-autoscaler/cloudprovider/kubemark/kubemark_linux.go
index f12b7f589da4..a38148a1e616 100644
--- a/cluster-autoscaler/cloudprovider/kubemark/kubemark_linux.go
+++ b/cluster-autoscaler/cloudprovider/kubemark/kubemark_linux.go
@@ -238,6 +238,11 @@ func (nodeGroup *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (nodeGroup *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // IncreaseSize increases NodeGroup size.
 func (nodeGroup *NodeGroup) IncreaseSize(delta int) error {
 	if delta <= 0 {
diff --git a/cluster-autoscaler/cloudprovider/kwok/kwok_nodegroups.go b/cluster-autoscaler/cloudprovider/kwok/kwok_nodegroups.go
index 91933140720c..4fdd9ca402af 100644
--- a/cluster-autoscaler/cloudprovider/kwok/kwok_nodegroups.go
+++ b/cluster-autoscaler/cloudprovider/kwok/kwok_nodegroups.go
@@ -120,6 +120,11 @@ func (nodeGroup *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (nodeGroup *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/linode/linode_node_group.go b/cluster-autoscaler/cloudprovider/linode/linode_node_group.go
index d2f660fd3b06..c5a521646f2e 100644
--- a/cluster-autoscaler/cloudprovider/linode/linode_node_group.go
+++ b/cluster-autoscaler/cloudprovider/linode/linode_node_group.go
@@ -126,6 +126,11 @@ func (n *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/magnum/magnum_nodegroup.go b/cluster-autoscaler/cloudprovider/magnum/magnum_nodegroup.go
index 1d7863f00945..90d47c0ecb48 100644
--- a/cluster-autoscaler/cloudprovider/magnum/magnum_nodegroup.go
+++ b/cluster-autoscaler/cloudprovider/magnum/magnum_nodegroup.go
@@ -139,6 +139,11 @@ func (ng *magnumNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *magnumNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the cluster node_count in magnum.
 func (ng *magnumNodeGroup) DecreaseTargetSize(delta int) error {
 	ng.clusterUpdateLock.Lock()
diff --git a/cluster-autoscaler/cloudprovider/mocks/NodeGroup.go b/cluster-autoscaler/cloudprovider/mocks/NodeGroup.go
index 04a66a2e006e..d8a0e340e348 100644
--- a/cluster-autoscaler/cloudprovider/mocks/NodeGroup.go
+++ b/cluster-autoscaler/cloudprovider/mocks/NodeGroup.go
@@ -125,6 +125,20 @@ func (_m *NodeGroup) DeleteNodes(_a0 []*v1.Node) error {
 	return r0
 }
 
+// ForceDeleteNodes provides a mock function with given fields: _a0
+func (_m *NodeGroup) ForceDeleteNodes(_a0 []*v1.Node) error {
+	ret := _m.Called(_a0)
+
+	var r0 error
+	if rf, ok := ret.Get(0).(func([]*v1.Node) error); ok {
+		r0 = rf(_a0)
+	} else {
+		r0 = ret.Error(0)
+	}
+
+	return r0
+}
+
 // Exist provides a mock function with given fields:
 func (_m *NodeGroup) Exist() bool {
 	ret := _m.Called()
diff --git a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool.go b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool.go
index 8d402fbe94ea..0158c757c350 100644
--- a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool.go
+++ b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool.go
@@ -108,6 +108,11 @@ func (ip *InstancePoolNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return ip.manager.DeleteInstances(*ip, refs)
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ip *InstancePoolNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the instance-pool based node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go
index 914c16ef666f..d31618f71811 100644
--- a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go
+++ b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go
@@ -182,6 +182,11 @@ func (np *nodePool) DeleteNodes(nodes []*apiv1.Node) (err error) {
 	return deleteInstancesErr
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (np *nodePool) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_node_group.go b/cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_node_group.go
index 185adde536ab..daaa9ec44d1a 100644
--- a/cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_node_group.go
+++ b/cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_node_group.go
@@ -166,6 +166,11 @@ func (ng *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/rancher/rancher_nodegroup.go b/cluster-autoscaler/cloudprovider/rancher/rancher_nodegroup.go
index 045f78802920..459e22c888cf 100644
--- a/cluster-autoscaler/cloudprovider/rancher/rancher_nodegroup.go
+++ b/cluster-autoscaler/cloudprovider/rancher/rancher_nodegroup.go
@@ -134,6 +134,11 @@ func (ng *nodeGroup) DeleteNodes(toDelete []*corev1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *nodeGroup) ForceDeleteNodes(nodes []*corev1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 func (ng *nodeGroup) findNodeByProviderID(providerID string) (*node, error) {
 	nodes, err := ng.nodes()
 	if err != nil {
diff --git a/cluster-autoscaler/cloudprovider/scaleway/scaleway_node_group.go b/cluster-autoscaler/cloudprovider/scaleway/scaleway_node_group.go
index bd264e5c7b92..0869541d6341 100644
--- a/cluster-autoscaler/cloudprovider/scaleway/scaleway_node_group.go
+++ b/cluster-autoscaler/cloudprovider/scaleway/scaleway_node_group.go
@@ -135,6 +135,11 @@ func (ng *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (ng *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_auto_scaling_group.go b/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_auto_scaling_group.go
index 88972c01d73b..d8be0deb6406 100644
--- a/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_auto_scaling_group.go
+++ b/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_auto_scaling_group.go
@@ -231,6 +231,11 @@ func (asg *tcAsg) DeleteNodes(nodes []*apiv1.Node) error {
 	return asg.tencentcloudManager.DeleteInstances(refs)
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (asg *tcAsg) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // Id returns asg id.
 func (asg *tcAsg) Id() string {
 	return asg.tencentcloudRef.ID
diff --git a/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go b/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go
index 414e8cae5d1c..4ab9d114a0b5 100644
--- a/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go
@@ -453,6 +453,11 @@ func (tng *TestNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (tng *TestNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return tng.DeleteNodes(nodes)
+}
+
 // Id returns an unique identifier of the node group.
 func (tng *TestNodeGroup) Id() string {
 	tng.Lock()
diff --git a/cluster-autoscaler/cloudprovider/volcengine/volcengine_auto_scaling_group.go b/cluster-autoscaler/cloudprovider/volcengine/volcengine_auto_scaling_group.go
index 413d4670f015..5b17c5b6f237 100644
--- a/cluster-autoscaler/cloudprovider/volcengine/volcengine_auto_scaling_group.go
+++ b/cluster-autoscaler/cloudprovider/volcengine/volcengine_auto_scaling_group.go
@@ -106,6 +106,11 @@ func (asg *AutoScalingGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return asg.manager.DeleteScalingInstances(asg.asgId, instanceIds)
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (asg *AutoScalingGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 func (asg *AutoScalingGroup) belongs(node *apiv1.Node) (bool, error) {
 	instanceId, err := ecsInstanceFromProviderId(node.Spec.ProviderID)
 	if err != nil {
diff --git a/cluster-autoscaler/cloudprovider/vultr/vultr_node_group.go b/cluster-autoscaler/cloudprovider/vultr/vultr_node_group.go
index afa711fc34f3..51674e05fc43 100644
--- a/cluster-autoscaler/cloudprovider/vultr/vultr_node_group.go
+++ b/cluster-autoscaler/cloudprovider/vultr/vultr_node_group.go
@@ -129,6 +129,11 @@ func (n *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	return nil
 }
 
+// ForceDeleteNodes deletes nodes from the group regardless of constraints.
+func (n *NodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
+	return cloudprovider.ErrNotImplemented
+}
+
 // DecreaseTargetSize decreases the target size of the node group. This function
 // doesn't permit to delete any existing node and can be used only to reduce the
 // request for new nodes that have not been yet fulfilled. Delta should be negative.
diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index d3d6fdbfa3f0..aa902058c184 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -307,6 +307,8 @@ type AutoscalingOptions struct {
 	CheckCapacityProvisioningRequestMaxBatchSize int
 	// CheckCapacityProvisioningRequestBatchTimebox is the maximum time to spend processing a batch of provisioning requests
 	CheckCapacityProvisioningRequestBatchTimebox time.Duration
+	// ForceDeleteLongUnregisteredNodes is used to enable/disable ignoring min size constraints during removal of long unregistered nodes
+	ForceDeleteLongUnregisteredNodes bool
 }
 
 // KubeClientOptions specify options for kube client
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 216a684ee304..f14e8db7f681 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -424,7 +424,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 	unregisteredNodes := a.clusterStateRegistry.GetUnregisteredNodes()
 	if len(unregisteredNodes) > 0 {
 		klog.V(1).Infof("%d unregistered nodes present", len(unregisteredNodes))
-		removedAny, err := a.removeOldUnregisteredNodes(unregisteredNodes, autoscalingContext,
+		removedAny, err := a.removeOldUnregisteredNodes(unregisteredNodes, a.clusterStateRegistry,
 			currentTime, autoscalingContext.LogRecorder)
 		// There was a problem with removing unregistered nodes. Retry in the next loop.
 		if err != nil {
@@ -752,62 +752,54 @@ func fixNodeGroupSize(context *context.AutoscalingContext, clusterStateRegistry
 	return fixed, nil
 }
 
-// Removes unregistered nodes if needed. Returns true if anything was removed and error if such occurred.
-func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clusterstate.UnregisteredNode, context *context.AutoscalingContext,
+// removeOldUnregisteredNodes removes unregistered nodes if needed. Returns true
+// if anything was removed and error if such occurred.
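+// With ForceDeleteLongUnregisteredNodes enabled, the min size constraint is
+// skipped and ForceDeleteNodes is attempted first, falling back to DeleteNodes
+// for providers that return cloudprovider.ErrNotImplemented.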
+func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clusterstate.UnregisteredNode, csr *clusterstate.ClusterStateRegistry,
 	currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) {
-	nodeGroups := a.nodeGroupsById()
-	nodesToDeleteByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode)
-	for _, unregisteredNode := range allUnregisteredNodes {
-		nodeGroup, err := a.CloudProvider.NodeGroupForNode(unregisteredNode.Node)
-		if err != nil {
-			klog.Warningf("Failed to get node group for %s: %v", unregisteredNode.Node.Name, err)
-			continue
-		}
-		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
-			klog.Warningf("No node group for node %s, skipping", unregisteredNode.Node.Name)
-			continue
-		}
-
-		maxNodeProvisionTime, err := csr.MaxNodeProvisionTime(nodeGroup)
-		if err != nil {
-			return false, fmt.Errorf("failed to retrieve maxNodeProvisionTime for node %s in nodeGroup %s", unregisteredNode.Node.Name, nodeGroup.Id())
-		}
-
-		if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) {
-			klog.V(0).Infof("Marking unregistered node %v for removal", unregisteredNode.Node.Name)
-			nodesToDeleteByNodeGroupId[nodeGroup.Id()] = append(nodesToDeleteByNodeGroupId[nodeGroup.Id()], unregisteredNode)
-		}
+	unregisteredNodesToRemove, err := a.oldUnregisteredNodes(allUnregisteredNodes, csr, currentTime)
+	if err != nil {
+		return false, err
 	}
 
+	nodeGroups := a.nodeGroupsById()
 	removedAny := false
-	for nodeGroupId, unregisteredNodesToDelete := range nodesToDeleteByNodeGroupId {
+	for nodeGroupId, unregisteredNodesToDelete := range unregisteredNodesToRemove {
 		nodeGroup := nodeGroups[nodeGroupId]
 
 		klog.V(0).Infof("Removing %v unregistered nodes for node group %v", len(unregisteredNodesToDelete), nodeGroupId)
-		size, err := nodeGroup.TargetSize()
-		if err != nil {
-			klog.Warningf("Failed to get node group size; nodeGroup=%v; err=%v", nodeGroup.Id(), err)
-			continue
-		}
-		possibleToDelete := size - nodeGroup.MinSize()
-		if possibleToDelete <= 0 {
-			klog.Warningf("Node group %s min size reached, skipping removal of %v unregistered nodes", nodeGroupId, len(unregisteredNodesToDelete))
-			continue
-		}
-		if len(unregisteredNodesToDelete) > possibleToDelete {
-			klog.Warningf("Capping node group %s unregistered node removal to %d nodes, removing all %d would exceed min size constaint", nodeGroupId, possibleToDelete, len(unregisteredNodesToDelete))
-			unregisteredNodesToDelete = unregisteredNodesToDelete[:possibleToDelete]
+		if !a.ForceDeleteLongUnregisteredNodes {
+			size, err := nodeGroup.TargetSize()
+			if err != nil {
+				klog.Warningf("Failed to get node group size; nodeGroup=%v; err=%v", nodeGroup.Id(), err)
+				continue
+			}
+			possibleToDelete := size - nodeGroup.MinSize()
+			if possibleToDelete <= 0 {
+				klog.Warningf("Node group %s min size reached, skipping removal of %v unregistered nodes", nodeGroupId, len(unregisteredNodesToDelete))
+				continue
+			}
+			if len(unregisteredNodesToDelete) > possibleToDelete {
+				klog.Warningf("Capping node group %s unregistered node removal to %d nodes, removing all %d would exceed min size constraint", nodeGroupId, possibleToDelete, len(unregisteredNodesToDelete))
+				unregisteredNodesToDelete = unregisteredNodesToDelete[:possibleToDelete]
+			}
 		}
-		nodesToDelete := toNodes(unregisteredNodesToDelete)
-		nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete)
+
+		nodesToDelete := toNodes(unregisteredNodesToDelete)
+		nodesToDelete, err := overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete)
 		if err != nil {
 			klog.Warningf("Failed to remove unregistered nodes from node group %s: %v", nodeGroupId, err)
 			continue
 		}
-		err = nodeGroup.DeleteNodes(nodesToDelete)
+		if a.ForceDeleteLongUnregisteredNodes {
+			err = nodeGroup.ForceDeleteNodes(nodesToDelete)
+			if err == cloudprovider.ErrNotImplemented {
+				err = nodeGroup.DeleteNodes(nodesToDelete)
+			}
+		} else {
+			err = nodeGroup.DeleteNodes(nodesToDelete)
+		}
 		csr.InvalidateNodeInstancesCacheEntry(nodeGroup)
 		if err != nil {
 			klog.Warningf("Failed to remove %v unregistered nodes from node group %s: %v", len(nodesToDelete), nodeGroupId, err)
@@ -827,6 +819,34 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
 	return removedAny, nil
 }
 
+// oldUnregisteredNodes returns old unregistered nodes grouped by their node group id.
+func (a *StaticAutoscaler) oldUnregisteredNodes(allUnregisteredNodes []clusterstate.UnregisteredNode, csr *clusterstate.ClusterStateRegistry, currentTime time.Time) (map[string][]clusterstate.UnregisteredNode, error) {
+	nodesByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode)
+	for _, unregisteredNode := range allUnregisteredNodes {
+		nodeGroup, err := a.CloudProvider.NodeGroupForNode(unregisteredNode.Node)
+		if err != nil {
+			klog.Warningf("Failed to get node group for %s: %v", unregisteredNode.Node.Name, err)
+			continue
+		}
+		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
+			klog.Warningf("No node group for node %s, skipping", unregisteredNode.Node.Name)
+			continue
+		}
+
+		maxNodeProvisionTime, err := csr.MaxNodeProvisionTime(nodeGroup)
+		if err != nil {
+			return nil, fmt.Errorf("failed to retrieve maxNodeProvisionTime for node %s in nodeGroup %s", unregisteredNode.Node.Name, nodeGroup.Id())
+		}
+
+		if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) {
+			klog.V(0).Infof("Marking unregistered node %v for removal", unregisteredNode.Node.Name)
+			nodesByNodeGroupId[nodeGroup.Id()] = append(nodesByNodeGroupId[nodeGroup.Id()], unregisteredNode)
+		}
+	}
+
+	return nodesByNodeGroupId, nil
+}
+
 func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node {
 	nodes := []*apiv1.Node{}
 	for _, n := range unregisteredNodes {
diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index fcff16331eef..2df10d4b7355 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -876,134 +876,144 @@ func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) {
 }
 
 func TestStaticAutoscalerRunOnceWithALongUnregisteredNode(t *testing.T) {
-	readyNodeLister := kubernetes.NewTestNodeLister(nil)
-	allNodeLister := kubernetes.NewTestNodeLister(nil)
-	allPodListerMock := &podListerMock{}
-	podDisruptionBudgetListerMock := &podDisruptionBudgetListerMock{}
-	daemonSetListerMock := &daemonSetListerMock{}
-	onScaleUpMock := &onScaleUpMock{}
-	onScaleDownMock := &onScaleDownMock{}
-	deleteFinished := make(chan bool, 1)
-
-	now := time.Now()
-	later := now.Add(1 * time.Minute)
+	for _, forceDeleteLongUnregisteredNodes := range []bool{false, true} {
+		t.Run(fmt.Sprintf("forceDeleteLongUnregisteredNodes=%v", forceDeleteLongUnregisteredNodes), func(t *testing.T) {
+			readyNodeLister := kubernetes.NewTestNodeLister(nil)
+			allNodeLister := kubernetes.NewTestNodeLister(nil)
+			allPodListerMock := &podListerMock{}
+			podDisruptionBudgetListerMock := &podDisruptionBudgetListerMock{}
+			daemonSetListerMock := &daemonSetListerMock{}
+			onScaleUpMock := &onScaleUpMock{}
+			onScaleDownMock := &onScaleDownMock{}
+			deleteFinished := make(chan bool, 1)
+
+			now := time.Now()
+			later := now.Add(1 * time.Minute)
+
+			n1 := BuildTestNode("n1", 1000, 1000)
+			SetNodeReadyState(n1, true, time.Now())
+			n2 := BuildTestNode("n2", 1000, 1000)
+			SetNodeReadyState(n2, true, time.Now())
+
+			p1 := BuildTestPod("p1", 600, 100)
+			p1.Spec.NodeName = "n1"
+			p2 := BuildTestPod("p2", 600, 100, MarkUnschedulable())
+
+			provider := testprovider.NewTestCloudProvider(
+				func(id string, delta int) error {
+					return onScaleUpMock.ScaleUp(id, delta)
+				}, func(id string, name string) error {
+					ret := onScaleDownMock.ScaleDown(id, name)
+					deleteFinished <- true
+					return ret
+				})
+			provider.AddNodeGroup("ng1", 2, 10, 2)
+			provider.AddNode("ng1", n1)
 
-	n1 := BuildTestNode("n1", 1000, 1000)
-	SetNodeReadyState(n1, true, time.Now())
-	n2 := BuildTestNode("n2", 1000, 1000)
-	SetNodeReadyState(n2, true, time.Now())
+			// broken node, that will be just hanging out there during
+			// the test (it can't be removed since that would validate group min size)
+			brokenNode := BuildTestNode("broken", 1000, 1000)
+			provider.AddNode("ng1", brokenNode)
 
-	p1 := BuildTestPod("p1", 600, 100)
-	p1.Spec.NodeName = "n1"
-	p2 := BuildTestPod("p2", 600, 100, MarkUnschedulable())
+			ng1 := reflect.ValueOf(provider.GetNodeGroup("ng1")).Interface().(*testprovider.TestNodeGroup)
+			assert.NotNil(t, ng1)
+			assert.NotNil(t, provider)
 
-	provider := testprovider.NewTestCloudProvider(
-		func(id string, delta int) error {
-			return onScaleUpMock.ScaleUp(id, delta)
-		}, func(id string, name string) error {
-			ret := onScaleDownMock.ScaleDown(id, name)
-			deleteFinished <- true
-			return ret
-		})
-	provider.AddNodeGroup("ng1", 2, 10, 2)
-	provider.AddNode("ng1", n1)
+			// Create context with mocked lister registry.
+			options := config.AutoscalingOptions{
+				NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
+					ScaleDownUnneededTime:         time.Minute,
+					ScaleDownUnreadyTime:          time.Minute,
+					ScaleDownUtilizationThreshold: 0.5,
+					MaxNodeProvisionTime:          10 * time.Second,
+				},
+				EstimatorName:                    estimator.BinpackingEstimatorName,
+				ScaleDownEnabled:                 true,
+				MaxNodesTotal:                    10,
+				MaxCoresTotal:                    10,
+				MaxMemoryTotal:                   100000,
+				ForceDeleteLongUnregisteredNodes: forceDeleteLongUnregisteredNodes,
+			}
+			processorCallbacks := newStaticAutoscalerProcessorCallbacks()
 
-	// broken node, that will be just hanging out there during
-	// the test (it can't be removed since that would validate group min size)
-	brokenNode := BuildTestNode("broken", 1000, 1000)
-	provider.AddNode("ng1", brokenNode)
+			context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider, processorCallbacks, nil)
+			assert.NoError(t, err)
 
-	ng1 := reflect.ValueOf(provider.GetNodeGroup("ng1")).Interface().(*testprovider.TestNodeGroup)
-	assert.NotNil(t, ng1)
-	assert.NotNil(t, provider)
+			setUpScaleDownActuator(&context, options)
 
-	// Create context with mocked lister registry.
-	options := config.AutoscalingOptions{
-		NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
-			ScaleDownUnneededTime:         time.Minute,
-			ScaleDownUnreadyTime:          time.Minute,
-			ScaleDownUtilizationThreshold: 0.5,
-			MaxNodeProvisionTime:          10 * time.Second,
-		},
-		EstimatorName:    estimator.BinpackingEstimatorName,
-		ScaleDownEnabled: true,
-		MaxNodesTotal:    10,
-		MaxCoresTotal:    10,
-		MaxMemoryTotal:   100000,
-	}
-	processorCallbacks := newStaticAutoscalerProcessorCallbacks()
+			listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, allPodListerMock,
+				podDisruptionBudgetListerMock, daemonSetListerMock,
+				nil, nil, nil, nil)
+			context.ListerRegistry = listerRegistry
 
-	context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider, processorCallbacks, nil)
-	assert.NoError(t, err)
+			clusterStateConfig := clusterstate.ClusterStateRegistryConfig{
+				OkTotalUnreadyCount: 1,
+			}
+			processors := processorstest.NewTestProcessors(&context)
 
-	setUpScaleDownActuator(&context, options)
+			clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker)
+			// broken node detected as unregistered
 
-	listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, allPodListerMock,
-		podDisruptionBudgetListerMock, daemonSetListerMock,
-		nil, nil, nil, nil)
-	context.ListerRegistry = listerRegistry
+			nodes := []*apiv1.Node{n1}
+			// nodeInfos, _ := getNodeInfosForGroups(nodes, provider, listerRegistry, []*appsv1.DaemonSet{}, context.PredicateChecker)
+			clusterState.UpdateNodes(nodes, nil, now)
 
-	clusterStateConfig := clusterstate.ClusterStateRegistryConfig{
-		OkTotalUnreadyCount: 1,
-	}
-	processors := processorstest.NewTestProcessors(&context)
+			// broken node failed to register in time
+			clusterState.UpdateNodes(nodes, nil, later)
 
-	clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker)
-	// broken node detected as unregistered
+			sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil)
+			suOrchestrator := orchestrator.New()
+			suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
 
-	nodes := []*apiv1.Node{n1}
-	// nodeInfos, _ := getNodeInfosForGroups(nodes, provider, listerRegistry, []*appsv1.DaemonSet{}, context.PredicateChecker)
-	clusterState.UpdateNodes(nodes, nil, now)
+			autoscaler := &StaticAutoscaler{
+				AutoscalingContext:    &context,
+				clusterStateRegistry:  clusterState,
+				lastScaleUpTime:       time.Now(),
+				lastScaleDownFailTime: time.Now(),
+				scaleDownPlanner:      sdPlanner,
+				scaleDownActuator:     sdActuator,
+				scaleUpOrchestrator:   suOrchestrator,
+				processors:            processors,
+				loopStartNotifier:     loopstart.NewObserversList(nil),
+				processorCallbacks:    processorCallbacks,
+			}
 
-	// broken node failed to register in time
-	clusterState.UpdateNodes(nodes, nil, later)
+			// If deletion of unregistered nodes is not forced, we need to simulate
+			// additional scale-up to respect min size constraints.
+			if !forceDeleteLongUnregisteredNodes {
+				// Scale up.
+				readyNodeLister.SetNodes(nodes)
+				allNodeLister.SetNodes(nodes)
+				allPodListerMock.On("List").Return([]*apiv1.Pod{p1, p2}, nil).Twice()
+				daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil).Once()
+				podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil).Once()
+				onScaleUpMock.On("ScaleUp", "ng1", 1).Return(nil).Once()
+
+				err = autoscaler.RunOnce(later.Add(time.Hour))
+				assert.NoError(t, err)
+				mock.AssertExpectationsForObjects(t, allPodListerMock,
+					podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
+
+				nodes = append(nodes, n2)
+				provider.AddNode("ng1", n2)
+				ng1.SetTargetSize(3)
+			}
 
-	sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil)
-	suOrchestrator := orchestrator.New()
-	suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
+			// Remove broken node
+			readyNodeLister.SetNodes(nodes)
+			allNodeLister.SetNodes(nodes)
+			allPodListerMock.On("List").Return([]*apiv1.Pod{}, nil).Twice()
+			onScaleDownMock.On("ScaleDown", "ng1", "broken").Return(nil).Once()
+			daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil).Once()
+			podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil).Once()
 
-	autoscaler := &StaticAutoscaler{
-		AutoscalingContext:    &context,
-		clusterStateRegistry:  clusterState,
-		lastScaleUpTime:       time.Now(),
-		lastScaleDownFailTime: time.Now(),
-		scaleDownPlanner:      sdPlanner,
-		scaleDownActuator:     sdActuator,
-		scaleUpOrchestrator:   suOrchestrator,
-		processors:            processors,
-		loopStartNotifier:     loopstart.NewObserversList(nil),
-		processorCallbacks:    processorCallbacks,
+			err = autoscaler.RunOnce(later.Add(2 * time.Hour))
+			waitForDeleteToFinish(t, deleteFinished)
+			assert.NoError(t, err)
+			mock.AssertExpectationsForObjects(t, allPodListerMock,
+				podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
+		})
 	}
-
-	// Scale up.
-	readyNodeLister.SetNodes([]*apiv1.Node{n1})
-	allNodeLister.SetNodes([]*apiv1.Node{n1})
-	allPodListerMock.On("List").Return([]*apiv1.Pod{p1, p2}, nil).Twice()
-	daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil).Once()
-	podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil).Once()
-	onScaleUpMock.On("ScaleUp", "ng1", 1).Return(nil).Once()
-
-	err = autoscaler.RunOnce(later.Add(time.Hour))
-	assert.NoError(t, err)
-	mock.AssertExpectationsForObjects(t, allPodListerMock,
-		podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
-
-	// Remove broken node after going over min size
-	provider.AddNode("ng1", n2)
-	ng1.SetTargetSize(3)
-
-	readyNodeLister.SetNodes([]*apiv1.Node{n1, n2})
-	allNodeLister.SetNodes([]*apiv1.Node{n1, n2})
-	allPodListerMock.On("List").Return([]*apiv1.Pod{p1, p2}, nil).Twice()
-	onScaleDownMock.On("ScaleDown", "ng1", "broken").Return(nil).Once()
-	daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil).Once()
-	podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil).Once()
-
-	err = autoscaler.RunOnce(later.Add(2 * time.Hour))
-	waitForDeleteToFinish(t, deleteFinished)
-	assert.NoError(t, err)
-	mock.AssertExpectationsForObjects(t, allPodListerMock,
-		podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
 }
 
 func TestStaticAutoscalerRunOncePodsWithPriorities(t *testing.T) {
@@ -2247,12 +2257,12 @@ func TestRemoveOldUnregisteredNodes(t *testing.T) {
 	}
 
 	// Nothing should be removed. The unregistered node is not old enough.
-	removed, err := autoscaler.removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now.Add(-50*time.Minute), fakeLogRecorder)
+	removed, err := autoscaler.removeOldUnregisteredNodes(unregisteredNodes, clusterState, now.Add(-50*time.Minute), fakeLogRecorder)
 	assert.NoError(t, err)
 	assert.False(t, removed)
 
 	// ng1_2 should be removed.
-	removed, err = autoscaler.removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now, fakeLogRecorder)
+	removed, err = autoscaler.removeOldUnregisteredNodes(unregisteredNodes, clusterState, now, fakeLogRecorder)
 	assert.NoError(t, err)
 	assert.True(t, removed)
 	deletedNode := core_utils.GetStringFromChan(deletedNodes)
@@ -2307,12 +2317,12 @@ func TestRemoveOldUnregisteredNodesAtomic(t *testing.T) {
 	}
 
 	// Nothing should be removed. The unregistered node is not old enough.
-	removed, err := autoscaler.removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now.Add(-50*time.Minute), fakeLogRecorder)
+	removed, err := autoscaler.removeOldUnregisteredNodes(unregisteredNodes, clusterState, now.Add(-50*time.Minute), fakeLogRecorder)
 	assert.NoError(t, err)
 	assert.False(t, removed)
 
 	// unregNode is long unregistered, so all of the nodes should be removed due to ZeroOrMaxNodeScaling option
-	removed, err = autoscaler.removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now, fakeLogRecorder)
+	removed, err = autoscaler.removeOldUnregisteredNodes(unregisteredNodes, clusterState, now, fakeLogRecorder)
 	assert.NoError(t, err)
 	assert.True(t, removed)
 	wantNames, deletedNames := []string{}, []string{}
diff --git a/cluster-autoscaler/expander/waste/waste_test.go b/cluster-autoscaler/expander/waste/waste_test.go
index c552fb3645df..105b251afbf5 100644
--- a/cluster-autoscaler/expander/waste/waste_test.go
+++ b/cluster-autoscaler/expander/waste/waste_test.go
@@ -42,8 +42,10 @@ func (f *FakeNodeGroup) IncreaseSize(delta int) error { return nil }
 func (f *FakeNodeGroup) AtomicIncreaseSize(delta int) error { return cloudprovider.ErrNotImplemented }
 func (f *FakeNodeGroup) DecreaseTargetSize(delta int) error { return nil }
 func (f *FakeNodeGroup) DeleteNodes([]*apiv1.Node) error    { return nil }
-func (f *FakeNodeGroup) Id() string                         { return f.id }
-func (f *FakeNodeGroup) Debug() string                      { return f.id }
+
+func (f *FakeNodeGroup) ForceDeleteNodes([]*apiv1.Node) error { return nil }
+func (f *FakeNodeGroup) Id() string                           { return f.id }
+func (f *FakeNodeGroup) Debug() string                        { return f.id }
 func (f *FakeNodeGroup) Nodes() ([]cloudprovider.Instance, error) {
 	return []cloudprovider.Instance{}, nil
 }
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index d50cc0cebde4..cf3ca31ccbe5 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -279,6 +279,7 @@ var (
 	checkCapacityBatchProcessing                 = flag.Bool("check-capacity-batch-processing", false, "Whether to enable batch processing for check capacity requests.")
 	checkCapacityProvisioningRequestMaxBatchSize = flag.Int("check-capacity-provisioning-request-max-batch-size", 10, "Maximum number of provisioning requests to process in a single batch.")
 	checkCapacityProvisioningRequestBatchTimebox = flag.Duration("check-capacity-provisioning-request-batch-timebox", 10*time.Second, "Maximum time to process a batch of provisioning requests.")
+	forceDeleteLongUnregisteredNodes             = flag.Bool("force-delete-unregistered-nodes", false, "Whether to enable force deletion of long unregistered nodes, regardless of the min size of the node group they belong to.")
 )
 
 func isFlagPassed(name string) bool {
@@ -457,6 +458,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		CheckCapacityBatchProcessing:                 *checkCapacityBatchProcessing,
 		CheckCapacityProvisioningRequestMaxBatchSize: *checkCapacityProvisioningRequestMaxBatchSize,
 		CheckCapacityProvisioningRequestBatchTimebox: *checkCapacityProvisioningRequestBatchTimebox,
		ForceDeleteLongUnregisteredNodes:             *forceDeleteLongUnregisteredNodes,
 	}
 }
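
A note for reviewers on the intended call pattern: the core change in static_autoscaler.go prefers the new unconstrained path and degrades gracefully for providers that still return cloudprovider.ErrNotImplemented. A minimal sketch of that contract, pulled out of removeOldUnregisteredNodes for illustration (the helper name forceDeleteOrFallback is hypothetical; the interface and error are the real API from this patch):

```go
package example // hypothetical package, for illustration only

import (
	apiv1 "k8s.io/api/core/v1"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
)

// forceDeleteOrFallback prefers ForceDeleteNodes and falls back to the
// min-size-constrained DeleteNodes when a provider has no force-delete
// support yet. This mirrors the branch added to removeOldUnregisteredNodes.
func forceDeleteOrFallback(ng cloudprovider.NodeGroup, nodes []*apiv1.Node) error {
	if err := ng.ForceDeleteNodes(nodes); err != cloudprovider.ErrNotImplemented {
		return err
	}
	// Stub implementations added in this PR return ErrNotImplemented,
	// so the long-standing DeleteNodes path still applies to them.
	return ng.DeleteNodes(nodes)
}
```

Operators opt in via `--force-delete-unregistered-nodes=true`; the flag defaults to false, so the existing min-size behavior is unchanged unless explicitly enabled.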
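
GCE is the only provider wired up end-to-end here: its DeleteNodes keeps the min-size guard and delegates the actual deletion to ForceDeleteNodes. Providers that want real support can follow the same split. A compact sketch under that assumption (exampleNodeGroup and its fields are hypothetical stand-ins, not part of this patch):

```go
package example // hypothetical package, for illustration only

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
)

type exampleNodeGroup struct {
	minSize    int
	targetSize int
}

// DeleteNodes enforces the min size constraint, then reuses the
// unconstrained path - the same layering gceMig adopts in this patch.
func (ng *exampleNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
	if ng.targetSize <= ng.minSize {
		return fmt.Errorf("min size reached, nodes will not be deleted")
	}
	return ng.ForceDeleteNodes(nodes)
}

// ForceDeleteNodes deletes the nodes without any size validation.
func (ng *exampleNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
	// A real provider would resolve instance refs and call its cloud API here.
	ng.targetSize -= len(nodes)
	return nil
}
```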
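
The regenerated mocks/NodeGroup.go means downstream tests can stub the new method with testify as usual. A hypothetical test sketch exercising the fallback contract against the mock:

```go
package example // hypothetical package, for illustration only

import (
	"testing"

	"github.com/stretchr/testify/assert"
	apiv1 "k8s.io/api/core/v1"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mocks"
)

func TestForceDeleteFallsBackToDeleteNodes(t *testing.T) {
	ng := &mocks.NodeGroup{}
	nodes := []*apiv1.Node{}

	// The provider reports no force-delete support...
	ng.On("ForceDeleteNodes", nodes).Return(cloudprovider.ErrNotImplemented).Once()
	// ...so callers are expected to retry via the constrained path.
	ng.On("DeleteNodes", nodes).Return(nil).Once()

	err := ng.ForceDeleteNodes(nodes)
	if err == cloudprovider.ErrNotImplemented {
		err = ng.DeleteNodes(nodes)
	}
	assert.NoError(t, err)
	ng.AssertExpectations(t)
}
```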