From 756a3e155ddc749959a52146604f8199e1062dd7 Mon Sep 17 00:00:00 2001 From: Marwan Ahmed Date: Wed, 9 Jun 2021 18:59:35 -0700 Subject: [PATCH] dont proactively decrement azure cache for unregistered nodes --- .../cloudprovider/azure/azure_scale_set.go | 17 ++-- .../azure/azure_scale_set_test.go | 77 +++++++++++++++++++ 2 files changed, 89 insertions(+), 5 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index e5d49bad31da..e64f230e23ed 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -340,7 +340,7 @@ func (scaleSet *ScaleSet) Belongs(node *apiv1.Node) (bool, error) { } // DeleteInstances deletes the given instances. All instances must be controlled by the same ASG. -func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef) error { +func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregisteredNodes bool) error { if len(instances) == 0 { return nil } @@ -405,9 +405,12 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef) error { // Proactively decrement scale set size so that we don't // go below minimum node count if cache data is stale - scaleSet.sizeMutex.Lock() - scaleSet.curSize -= int64(len(instanceIDs)) - scaleSet.sizeMutex.Unlock() + // only do it for non-unregistered nodes + if !hasUnregisteredNodes { + scaleSet.sizeMutex.Lock() + scaleSet.curSize -= int64(len(instanceIDs)) + scaleSet.sizeMutex.Unlock() + } // Proactively set the status of the instances to be deleted in cache for _, instance := range instancesToDelete { @@ -432,6 +435,7 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error { } refs := make([]*azureRef, 0, len(nodes)) + hasUnregisteredNodes := false for _, node := range nodes { belongs, err := scaleSet.Belongs(node) if err != nil { @@ -442,13 +446,16 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error { return fmt.Errorf("%s belongs to a different asg than %s", node.Name, scaleSet.Id()) } + if node.Annotations[cloudprovider.FakeNodeReasonAnnotation] == cloudprovider.FakeNodeUnregistered { + hasUnregisteredNodes = true + } ref := &azureRef{ Name: node.Spec.ProviderID, } refs = append(refs, ref) } - return scaleSet.DeleteInstances(refs) + return scaleSet.DeleteInstances(refs, hasUnregisteredNodes) } // Id returns ScaleSet id. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 53291a8d48a0..dcb6185a1974 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -18,6 +18,7 @@ package azure import ( "fmt" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "net/http" "testing" "time" @@ -346,6 +347,82 @@ func TestDeleteNodes(t *testing.T) { assert.Equal(t, instance2.Status.State, cloudprovider.InstanceDeleting) } +func TestDeleteNodeUnregistered(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + manager := newTestAzureManager(t) + vmssName := "test-asg" + var vmssCapacity int64 = 2 + + expectedScaleSets := []compute.VirtualMachineScaleSet{ + { + Name: &vmssName, + Sku: &compute.Sku{ + Capacity: &vmssCapacity, + }, + }, + } + expectedVMSSVMs := newTestVMSSVMList(2) + + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2) + mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any()).Return(nil, nil) + mockVMSSClient.EXPECT().WaitForAsyncOperationResult(gomock.Any(), gomock.Any()).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + err := manager.forceRefresh() + assert.NoError(t, err) + + resourceLimiter := cloudprovider.NewResourceLimiter( + map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000}, + map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}) + provider, err := BuildAzureCloudProvider(manager, resourceLimiter) + assert.NoError(t, err) + + registered := manager.RegisterNodeGroup( + newTestScaleSet(manager, "test-asg")) + manager.explicitlyConfigured["test-asg"] = true + assert.True(t, registered) + err = manager.forceRefresh() + assert.NoError(t, err) + + scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) + assert.True(t, ok) + + targetSize, err := scaleSet.TargetSize() + assert.NoError(t, err) + assert.Equal(t, 2, targetSize) + + // annotate node with unregistered annotation + annotations := make(map[string]string) + annotations[cloudprovider.FakeNodeReasonAnnotation] = cloudprovider.FakeNodeUnregistered + nodesToDelete := []*apiv1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Annotations: annotations, + }, + Spec: apiv1.NodeSpec{ + ProviderID: "azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0), + }, + }, + } + err = scaleSet.DeleteNodes(nodesToDelete) + assert.NoError(t, err) + + // Ensure the the cached size has NOT been proactively decremented + targetSize, err = scaleSet.TargetSize() + assert.NoError(t, err) + assert.Equal(t, 2, targetSize) + + // Ensure that the status for the instances is Deleting + instance0, found := scaleSet.getInstanceByProviderID("azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0)) + assert.True(t, found, true) + assert.Equal(t, instance0.Status.State, cloudprovider.InstanceDeleting) +} + func TestDeleteNoConflictRequest(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish()