From 037dc7367aba21bb2f593be09d85a7c99d0f894d Mon Sep 17 00:00:00 2001 From: Benjamin Pineau Date: Wed, 6 Jan 2021 12:02:19 +0100 Subject: [PATCH] Don't pile up successive full refreshes during AWS scaledowns Force refreshing everything at every DeleteNodes calls causes slow down and throttling on large clusters with many ASGs (and lot of activity). That function might be called several times in a row during scale-down (once for each ASG having a node to be removed). Each time the forced refresh will re-discover all ASGs, all LaunchConfigurations, then re-list all instances from discovered ASGs. That immediate refresh isn't required anyway, as the cache's DeleteInstances concrete implementation will decrement the nodegroup size, and we can schedule a grouped refresh for the next loop iteration. --- .../cloudprovider/aws/aws_cloud_provider_test.go | 4 ++-- cluster-autoscaler/cloudprovider/aws/aws_manager.go | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go index 587833779335..4ae024c437d4 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go @@ -469,7 +469,7 @@ func TestDeleteNodes(t *testing.T) { err = asgs[0].DeleteNodes([]*apiv1.Node{node}) assert.NoError(t, err) service.AssertNumberOfCalls(t, "TerminateInstanceInAutoScalingGroup", 1) - service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2) + service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1) newSize, err := asgs[0].TargetSize() assert.NoError(t, err) @@ -516,7 +516,7 @@ func TestDeleteNodesWithPlaceholder(t *testing.T) { err = asgs[0].DeleteNodes([]*apiv1.Node{node}) assert.NoError(t, err) service.AssertNumberOfCalls(t, "SetDesiredCapacity", 1) - service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2) + service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1) newSize, err := asgs[0].TargetSize() assert.NoError(t, err) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go index d9f71bec8281..361eed052c91 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go @@ -294,8 +294,9 @@ func (m *AwsManager) DeleteInstances(instances []*AwsInstanceRef) error { if err := m.asgCache.DeleteInstances(instances); err != nil { return err } - klog.V(2).Infof("Some ASG instances might have been deleted, forcing ASG list refresh") - return m.forceRefresh() + klog.V(2).Infof("DeleteInstances was called: scheduling an ASG list refresh for next main loop evaluation") + m.lastRefresh = time.Now().Add(-refreshInterval) + return nil } // GetAsgNodes returns Asg nodes.