From d17c9deac9076400c4f3a4ead7e8590c78b68d53 Mon Sep 17 00:00:00 2001
From: Marwan Ahmed <marwanad@microsoft.com>
Date: Sun, 6 Jun 2021 13:16:14 -0700
Subject: [PATCH 1/4] annotate fakeNodes so that cloudprovider implementations
 can identify them if needed

---
 cluster-autoscaler/clusterstate/clusterstate.go | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index 53aae7e1320d..1a5460a5a095 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -55,6 +55,14 @@ const (
 
 	// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
 	NodeGroupBackoffResetTimeout = 3 * time.Hour
+
+	// FakeNodeReasonAnnotation is an annotation added to the fake placeholder nodes CA has created
+	// Note that this don't map to real nodes in k8s and are merely used for error handling
+	FakeNodeReasonAnnotation = "k8s.io/cluster-autoscaler/fake-node-reason"
+	// FakeNodeUnregistered represents a node that is identified by CA as unregistered
+	FakeNodeUnregistered = "unregistered"
+	// FakeNodeCreateError represents a node that is identified by CA as a created node with errors
+	FakeNodeCreateError = "create-error"
 )
 
 // ScaleUpRequest contains information about the requested node group scale up.
@@ -975,7 +983,7 @@ func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances ma
 		for _, instance := range instances {
 			if !registered.Has(instance.Id) {
 				notRegistered = append(notRegistered, UnregisteredNode{
-					Node:              fakeNode(instance),
+					Node:              fakeNode(instance, FakeNodeUnregistered),
 					UnregisteredSince: time,
 				})
 			}
@@ -1122,7 +1130,7 @@ func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node {
 		_, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances)
 		for _, instances := range instancesByErrorCode {
 			for _, instance := range instances {
-				nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance))
+				nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance, FakeNodeCreateError))
 			}
 		}
 	}
@@ -1139,10 +1147,13 @@ func (csr *ClusterStateRegistry) InvalidateNodeInstancesCacheEntry(nodeGroup clo
 	csr.cloudProviderNodeInstancesCache.InvalidateCacheEntry(nodeGroup)
 }
 
-func fakeNode(instance cloudprovider.Instance) *apiv1.Node {
+func fakeNode(instance cloudprovider.Instance, reason string) *apiv1.Node {
 	return &apiv1.Node{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: instance.Id,
+			Annotations: map[string]string{
+				FakeNodeReasonAnnotation: reason,
+			},
 		},
 		Spec: apiv1.NodeSpec{
 			ProviderID: instance.Id,

From 65e69fa07d9011fd3f34819f567b16c6dafff920 Mon Sep 17 00:00:00 2001
From: Marwan Ahmed <marwanad@microsoft.com>
Date: Tue, 8 Jun 2021 10:56:35 -0700
Subject: [PATCH 2/4] move annotations to cloudprovider package

---
 cluster-autoscaler/cloudprovider/cloud_provider.go | 10 ++++++++++
 cluster-autoscaler/clusterstate/clusterstate.go    | 14 +++-----------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/cloud_provider.go b/cluster-autoscaler/cloudprovider/cloud_provider.go
index 56bfefdf3270..98f17a5a3ca6 100644
--- a/cluster-autoscaler/cloudprovider/cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/cloud_provider.go
@@ -234,6 +234,16 @@ func (c InstanceErrorClass) String() string {
 	}
 }
 
+const (
+	// FakeNodeReasonAnnotation is an annotation added to the fake placeholder nodes CA has created
+	// Note that this don't map to real nodes in k8s and are merely used for error handling
+	FakeNodeReasonAnnotation = "k8s.io/cluster-autoscaler/fake-node-reason"
+	// FakeNodeUnregistered represents a node that is identified by CA as unregistered
+	FakeNodeUnregistered = "unregistered"
+	// FakeNodeCreateError represents a node that is identified by CA as a created node with errors
+	FakeNodeCreateError = "create-error"
+)
+
 // PricingModel contains information about the node price and how it changes in time.
 type PricingModel interface {
 	// NodePrice returns a price of running the given node for a given period of time.
diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index 1a5460a5a095..ea30217af311 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -55,14 +55,6 @@ const (
 
 	// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
 	NodeGroupBackoffResetTimeout = 3 * time.Hour
-
-	// FakeNodeReasonAnnotation is an annotation added to the fake placeholder nodes CA has created
-	// Note that this don't map to real nodes in k8s and are merely used for error handling
-	FakeNodeReasonAnnotation = "k8s.io/cluster-autoscaler/fake-node-reason"
-	// FakeNodeUnregistered represents a node that is identified by CA as unregistered
-	FakeNodeUnregistered = "unregistered"
-	// FakeNodeCreateError represents a node that is identified by CA as a created node with errors
-	FakeNodeCreateError = "create-error"
 )
 
 // ScaleUpRequest contains information about the requested node group scale up.
@@ -983,7 +975,7 @@ func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances ma
 		for _, instance := range instances {
 			if !registered.Has(instance.Id) {
 				notRegistered = append(notRegistered, UnregisteredNode{
-					Node:              fakeNode(instance, FakeNodeUnregistered),
+					Node:              fakeNode(instance, cloudprovider.FakeNodeUnregistered),
 					UnregisteredSince: time,
 				})
 			}
@@ -1130,7 +1122,7 @@ func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node {
 		_, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances)
 		for _, instances := range instancesByErrorCode {
 			for _, instance := range instances {
-				nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance, FakeNodeCreateError))
+				nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance, cloudprovider.FakeNodeCreateError))
 			}
 		}
 	}
@@ -1152,7 +1144,7 @@ func fakeNode(instance cloudprovider.Instance, reason string) *apiv1.Node {
 		ObjectMeta: metav1.ObjectMeta{
 			Name: instance.Id,
 			Annotations: map[string]string{
-				FakeNodeReasonAnnotation: reason,
+				cloudprovider.FakeNodeReasonAnnotation: reason,
 			},
 		},
 		Spec: apiv1.NodeSpec{

From 5375fb0c9ce4ef8fd911d2006dadd6d64415b495 Mon Sep 17 00:00:00 2001
From: Marwan Ahmed <marwanad@microsoft.com>
Date: Thu, 30 Sep 2021 16:29:46 -0700
Subject: [PATCH 3/4] fix 1.19 test

---
 .../azure/azure_scale_set_test.go             | 26 +++++++------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
index b76581ec90a1..9f2ba54d55e5 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
@@ -328,7 +328,7 @@ func TestDeleteNodeUnregistered(t *testing.T) {
 
 	manager := newTestAzureManager(t)
 	vmssName := "test-asg"
-	var vmssCapacity int64 = 2
+	var vmssCapacity int64 = 3
 
 	expectedScaleSets := []compute.VirtualMachineScaleSet{
 		{
@@ -338,18 +338,17 @@ func TestDeleteNodeUnregistered(t *testing.T) {
 			},
 		},
 	}
-	expectedVMSSVMs := newTestVMSSVMList(2)
+	expectedVMSSVMs := newTestVMSSVMList(3)
 
 	mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
-	mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2)
+	mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes()
 	mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any()).Return(nil, nil)
 	mockVMSSClient.EXPECT().WaitForAsyncOperationResult(gomock.Any(), gomock.Any()).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes()
 	manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
 	mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
 	mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
 	manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
-	err := manager.forceRefresh()
-	assert.NoError(t, err)
+	manager.regenerateCache()
 
 	resourceLimiter := cloudprovider.NewResourceLimiter(
 		map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000},
@@ -357,23 +356,21 @@ func TestDeleteNodeUnregistered(t *testing.T) {
 	provider, err := BuildAzureCloudProvider(manager, resourceLimiter)
 	assert.NoError(t, err)
 
-	registered := manager.RegisterNodeGroup(
-		newTestScaleSet(manager, "test-asg"))
-	manager.explicitlyConfigured["test-asg"] = true
+	registered := manager.RegisterAsg(newTestScaleSet(manager, "test-asg"))
 	assert.True(t, registered)
-	err = manager.forceRefresh()
-	assert.NoError(t, err)
+	manager.regenerateCache()
 
 	scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet)
 	assert.True(t, ok)
 
 	targetSize, err := scaleSet.TargetSize()
 	assert.NoError(t, err)
-	assert.Equal(t, 2, targetSize)
+	assert.Equal(t, 3, targetSize)
 
 	// annotate node with unregistered annotation
 	annotations := make(map[string]string)
 	annotations[cloudprovider.FakeNodeReasonAnnotation] = cloudprovider.FakeNodeUnregistered
+	// Perform the delete operation
 	nodesToDelete := []*apiv1.Node{
 		{
 			ObjectMeta: metav1.ObjectMeta{
@@ -390,12 +387,7 @@ func TestDeleteNodeUnregistered(t *testing.T) {
 	// Ensure the the cached size has NOT been proactively decremented
 	targetSize, err = scaleSet.TargetSize()
 	assert.NoError(t, err)
-	assert.Equal(t, 2, targetSize)
-
-	// Ensure that the status for the instances is Deleting
-	instance0, found := scaleSet.getInstanceByProviderID("azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0))
-	assert.True(t, found, true)
-	assert.Equal(t, instance0.Status.State, cloudprovider.InstanceDeleting)
+	assert.Equal(t, 3, targetSize)
 }
 
 func TestDeleteNoConflictRequest(t *testing.T) {

From 45520203ba03e51ac26e3390a3d1590d9485c512 Mon Sep 17 00:00:00 2001
From: Marwan Ahmed <marwanad@microsoft.com>
Date: Thu, 30 Sep 2021 16:49:37 -0700
Subject: [PATCH 4/4] remove flaky test that is no longer necessary

---
 .../cloudprovider/azure/azure_manager_test.go             | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
index 89cd03b4a3ee..9166d12a3fac 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
@@ -1148,14 +1148,6 @@ func TestOverrideDefaultRateLimitConfig(t *testing.T) {
 	assert.Equal(t, &falseCloudProviderRateLimit.RateLimitConfig, newconfig)
 }
 
-func TestGetSubscriptionIdFromInstanceMetadata(t *testing.T) {
-	// metadataURL in azure_manager.go is not available for our tests, expect fail.
-	result, err := getSubscriptionIdFromInstanceMetadata()
-	expected := ""
-	assert.NotNil(t, err.Error())
-	assert.Equal(t, expected, result, "Verify return result failed, expected: %v, actual: %v", expected, result)
-}
-
 func TestManagerRefreshAndCleanup(t *testing.T) {
 	ctrl := gomock.NewController(t)
 	defer ctrl.Finish()