diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 39e77e407e37..4500136377dd 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -399,7 +399,7 @@ func (scaleSet *ScaleSet) Belongs(node *apiv1.Node) (bool, error) { } // DeleteInstances deletes the given instances. All instances must be controlled by the same ASG. -func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef) error { +func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregisteredNodes bool) error { if len(instances) == 0 { return nil } @@ -461,9 +461,12 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef) error { // Proactively decrement scale set size so that we don't // go below minimum node count if cache data is stale - scaleSet.sizeMutex.Lock() - scaleSet.curSize -= int64(len(instanceIDs)) - scaleSet.sizeMutex.Unlock() + // only do it for non-unregistered nodes + if !hasUnregisteredNodes { + scaleSet.sizeMutex.Lock() + scaleSet.curSize -= int64(len(instanceIDs)) + scaleSet.sizeMutex.Unlock() + } go scaleSet.waitForDeleteInstances(future, requiredIds) return nil @@ -482,6 +485,7 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error { } refs := make([]*azureRef, 0, len(nodes)) + hasUnregisteredNodes := false for _, node := range nodes { belongs, err := scaleSet.Belongs(node) if err != nil { @@ -492,13 +496,16 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error { return fmt.Errorf("%s belongs to a different asg than %s", node.Name, scaleSet.Id()) } + if node.Annotations[cloudprovider.FakeNodeReasonAnnotation] == cloudprovider.FakeNodeUnregistered { + hasUnregisteredNodes = true + } ref := &azureRef{ Name: node.Spec.ProviderID, } refs = append(refs, ref) } - return scaleSet.DeleteInstances(refs) + return scaleSet.DeleteInstances(refs, hasUnregisteredNodes) } // Id 
returns ScaleSet id. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 51efd81ca0d0..b76581ec90a1 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -18,6 +18,7 @@ package azure import ( "fmt" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "net/http" "testing" "time" @@ -321,6 +322,82 @@ func TestDeleteNodes(t *testing.T) { assert.Equal(t, 1, targetSize) } +func TestDeleteNodeUnregistered(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + manager := newTestAzureManager(t) + vmssName := "test-asg" + var vmssCapacity int64 = 2 + + expectedScaleSets := []compute.VirtualMachineScaleSet{ + { + Name: &vmssName, + Sku: &compute.Sku{ + Capacity: &vmssCapacity, + }, + }, + } + expectedVMSSVMs := newTestVMSSVMList(2) + + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2) + mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any()).Return(nil, nil) + mockVMSSClient.EXPECT().WaitForAsyncOperationResult(gomock.Any(), gomock.Any()).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + err := manager.forceRefresh() + assert.NoError(t, err) + + resourceLimiter := cloudprovider.NewResourceLimiter( + map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000}, + 
map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}) + provider, err := BuildAzureCloudProvider(manager, resourceLimiter) + assert.NoError(t, err) + + registered := manager.RegisterNodeGroup( + newTestScaleSet(manager, "test-asg")) + manager.explicitlyConfigured["test-asg"] = true + assert.True(t, registered) + err = manager.forceRefresh() + assert.NoError(t, err) + + scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) + assert.True(t, ok) + + targetSize, err := scaleSet.TargetSize() + assert.NoError(t, err) + assert.Equal(t, 2, targetSize) + + // annotate node with unregistered annotation + annotations := make(map[string]string) + annotations[cloudprovider.FakeNodeReasonAnnotation] = cloudprovider.FakeNodeUnregistered + nodesToDelete := []*apiv1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Annotations: annotations, + }, + Spec: apiv1.NodeSpec{ + ProviderID: "azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0), + }, + }, + } + err = scaleSet.DeleteNodes(nodesToDelete) + assert.NoError(t, err) + + // Ensure the cached size has NOT been proactively decremented + targetSize, err = scaleSet.TargetSize() + assert.NoError(t, err) + assert.Equal(t, 2, targetSize) + + // Ensure that the status for the instances is Deleting + instance0, found := scaleSet.getInstanceByProviderID("azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0)) + assert.True(t, found, true) + assert.Equal(t, instance0.Status.State, cloudprovider.InstanceDeleting) +} + func TestDeleteNoConflictRequest(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish()