From 423d0e68bc0dbfa286c360bc5535578d403f3ae5 Mon Sep 17 00:00:00 2001 From: Krishna Sarabu Date: Mon, 8 Jul 2024 07:58:42 -0700 Subject: [PATCH] 1.28 Backport fix - Fix/aws asg unsafe decommission #5829 #6911 --- .../cloudprovider/aws/auto_scaling_groups.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go b/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go index b159a71e956..07288f9e071 100644 --- a/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go +++ b/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go @@ -342,12 +342,12 @@ func (m *asgCache) DeleteInstances(instances []*AwsInstanceRef) error { } for _, instance := range instances { + if m.isPlaceholderInstance(instance) { // skipping placeholder as placeholder instances don't exist // and we have already reduced ASG size during placeholder check. continue } - // check if the instance is already terminating - if it is, don't bother terminating again // as doing so causes unnecessary API calls and can cause the curSize cached value to decrement // unnecessarily. @@ -355,11 +355,13 @@ func (m *asgCache) DeleteInstances(instances []*AwsInstanceRef) error { if err != nil { return err } + if lifecycle != nil && + *lifecycle == autoscaling.LifecycleStateTerminated || *lifecycle == autoscaling.LifecycleStateTerminating || *lifecycle == autoscaling.LifecycleStateTerminatingWait || *lifecycle == autoscaling.LifecycleStateTerminatingProceed { - klog.V(2).Infof("instance %s is already terminating, will skip instead", instance.Name) + klog.V(2).Infof("instance %s is already terminating in state %s, will skip instead", instance.Name, *lifecycle) continue } @@ -377,6 +379,7 @@ func (m *asgCache) DeleteInstances(instances []*AwsInstanceRef) error { // Proactively decrement the size so autoscaler makes better decisions commonAsg.curSize-- + } return nil }