From d17c9deac9076400c4f3a4ead7e8590c78b68d53 Mon Sep 17 00:00:00 2001 From: Marwan Ahmed Date: Sun, 6 Jun 2021 13:16:14 -0700 Subject: [PATCH 1/4] annotate fakeNodes so that cloudprovider implementations can identify them if needed --- cluster-autoscaler/clusterstate/clusterstate.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 53aae7e1320d..1a5460a5a095 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -55,6 +55,14 @@ const ( // NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset. NodeGroupBackoffResetTimeout = 3 * time.Hour + + // FakeNodeReasonAnnotation is an annotation added to the fake placeholder nodes CA has created + // Note that this don't map to real nodes in k8s and are merely used for error handling + FakeNodeReasonAnnotation = "k8s.io/cluster-autoscaler/fake-node-reason" + // FakeNodeUnregistered represents a node that is identified by CA as unregistered + FakeNodeUnregistered = "unregistered" + // FakeNodeCreateError represents a node that is identified by CA as a created node with errors + FakeNodeCreateError = "create-error" ) // ScaleUpRequest contains information about the requested node group scale up. @@ -975,7 +983,7 @@ func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances ma for _, instance := range instances { if !registered.Has(instance.Id) { notRegistered = append(notRegistered, UnregisteredNode{ - Node: fakeNode(instance), + Node: fakeNode(instance, FakeNodeUnregistered), UnregisteredSince: time, }) } @@ -1122,7 +1130,7 @@ func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node { _, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances) for _, instances := range instancesByErrorCode { for _, instance := range instances { - nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance)) + nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance, FakeNodeCreateError)) } } } @@ -1139,10 +1147,13 @@ func (csr *ClusterStateRegistry) InvalidateNodeInstancesCacheEntry(nodeGroup clo csr.cloudProviderNodeInstancesCache.InvalidateCacheEntry(nodeGroup) } -func fakeNode(instance cloudprovider.Instance) *apiv1.Node { +func fakeNode(instance cloudprovider.Instance, reason string) *apiv1.Node { return &apiv1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: instance.Id, + Annotations: map[string]string{ + FakeNodeReasonAnnotation: reason, + }, }, Spec: apiv1.NodeSpec{ ProviderID: instance.Id, From 65e69fa07d9011fd3f34819f567b16c6dafff920 Mon Sep 17 00:00:00 2001 From: Marwan Ahmed Date: Tue, 8 Jun 2021 10:56:35 -0700 Subject: [PATCH 2/4] move annotations to cloudprovider package --- cluster-autoscaler/cloudprovider/cloud_provider.go | 10 ++++++++++ cluster-autoscaler/clusterstate/clusterstate.go | 14 +++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/cloud_provider.go b/cluster-autoscaler/cloudprovider/cloud_provider.go index 56bfefdf3270..98f17a5a3ca6 100644 --- a/cluster-autoscaler/cloudprovider/cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/cloud_provider.go @@ -234,6 +234,16 @@ func (c InstanceErrorClass) String() string { } } +const ( + // FakeNodeReasonAnnotation is an annotation added to the fake placeholder nodes CA has created + // Note that this don't map to real nodes in k8s and are merely used for error handling + FakeNodeReasonAnnotation = "k8s.io/cluster-autoscaler/fake-node-reason" + // FakeNodeUnregistered represents a node that is identified by CA as unregistered + FakeNodeUnregistered = "unregistered" + // FakeNodeCreateError represents a node that is identified by CA as a created node with errors + FakeNodeCreateError = "create-error" +) + // PricingModel contains information about the node price and how it changes in time. type PricingModel interface { // NodePrice returns a price of running the given node for a given period of time. diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 1a5460a5a095..ea30217af311 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -55,14 +55,6 @@ const ( // NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset. NodeGroupBackoffResetTimeout = 3 * time.Hour - - // FakeNodeReasonAnnotation is an annotation added to the fake placeholder nodes CA has created - // Note that this don't map to real nodes in k8s and are merely used for error handling - FakeNodeReasonAnnotation = "k8s.io/cluster-autoscaler/fake-node-reason" - // FakeNodeUnregistered represents a node that is identified by CA as unregistered - FakeNodeUnregistered = "unregistered" - // FakeNodeCreateError represents a node that is identified by CA as a created node with errors - FakeNodeCreateError = "create-error" ) // ScaleUpRequest contains information about the requested node group scale up. @@ -983,7 +975,7 @@ func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances ma for _, instance := range instances { if !registered.Has(instance.Id) { notRegistered = append(notRegistered, UnregisteredNode{ - Node: fakeNode(instance, FakeNodeUnregistered), + Node: fakeNode(instance, cloudprovider.FakeNodeUnregistered), UnregisteredSince: time, }) } @@ -1130,7 +1122,7 @@ func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node { _, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances) for _, instances := range instancesByErrorCode { for _, instance := range instances { - nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance, FakeNodeCreateError)) + nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance, cloudprovider.FakeNodeCreateError)) } } } @@ -1152,7 +1144,7 @@ func fakeNode(instance cloudprovider.Instance, reason string) *apiv1.Node { ObjectMeta: metav1.ObjectMeta{ Name: instance.Id, Annotations: map[string]string{ - FakeNodeReasonAnnotation: reason, + cloudprovider.FakeNodeReasonAnnotation: reason, }, }, Spec: apiv1.NodeSpec{ From 5375fb0c9ce4ef8fd911d2006dadd6d64415b495 Mon Sep 17 00:00:00 2001 From: Marwan Ahmed Date: Thu, 30 Sep 2021 16:29:46 -0700 Subject: [PATCH 3/4] fix 1.19 test --- .../azure/azure_scale_set_test.go | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index b76581ec90a1..9f2ba54d55e5 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -328,7 +328,7 @@ func TestDeleteNodeUnregistered(t *testing.T) { manager := newTestAzureManager(t) vmssName := "test-asg" - var vmssCapacity int64 = 2 + var vmssCapacity int64 = 3 expectedScaleSets := []compute.VirtualMachineScaleSet{ { @@ -338,18 +338,17 @@ func TestDeleteNodeUnregistered(t *testing.T) { }, }, } - expectedVMSSVMs := newTestVMSSVMList(2) + expectedVMSSVMs := newTestVMSSVMList(3) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) - mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2) + mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any()).Return(nil, nil) mockVMSSClient.EXPECT().WaitForAsyncOperationResult(gomock.Any(), gomock.Any()).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient - err := manager.forceRefresh() - assert.NoError(t, err) + manager.regenerateCache() resourceLimiter := cloudprovider.NewResourceLimiter( map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000}, @@ -357,23 +356,21 @@ func TestDeleteNodeUnregistered(t *testing.T) { provider, err := BuildAzureCloudProvider(manager, resourceLimiter) assert.NoError(t, err) - registered := manager.RegisterNodeGroup( - newTestScaleSet(manager, "test-asg")) - manager.explicitlyConfigured["test-asg"] = true + registered := manager.RegisterAsg(newTestScaleSet(manager, "test-asg")) assert.True(t, registered) - err = manager.forceRefresh() - assert.NoError(t, err) + manager.regenerateCache() scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) assert.True(t, ok) targetSize, err := scaleSet.TargetSize() assert.NoError(t, err) - assert.Equal(t, 2, targetSize) + assert.Equal(t, 3, targetSize) // annotate node with unregistered annotation annotations := make(map[string]string) annotations[cloudprovider.FakeNodeReasonAnnotation] = cloudprovider.FakeNodeUnregistered + // Perform the delete operation nodesToDelete := []*apiv1.Node{ { ObjectMeta: metav1.ObjectMeta{ @@ -390,12 +387,7 @@ func TestDeleteNodeUnregistered(t *testing.T) { // Ensure the the cached size has NOT been proactively decremented targetSize, err = scaleSet.TargetSize() assert.NoError(t, err) - assert.Equal(t, 2, targetSize) - - // Ensure that the status for the instances is Deleting - instance0, found := scaleSet.getInstanceByProviderID("azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0)) - assert.True(t, found, true) - assert.Equal(t, instance0.Status.State, cloudprovider.InstanceDeleting) + assert.Equal(t, 3, targetSize) } func TestDeleteNoConflictRequest(t *testing.T) { From 45520203ba03e51ac26e3390a3d1590d9485c512 Mon Sep 17 00:00:00 2001 From: Marwan Ahmed Date: Thu, 30 Sep 2021 16:49:37 -0700 Subject: [PATCH 4/4] remove flaky test that is no longer necessary --- .../cloudprovider/azure/azure_manager_test.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index 89cd03b4a3ee..9166d12a3fac 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -1148,14 +1148,6 @@ func TestOverrideDefaultRateLimitConfig(t *testing.T) { assert.Equal(t, &falseCloudProviderRateLimit.RateLimitConfig, newconfig) } -func TestGetSubscriptionIdFromInstanceMetadata(t *testing.T) { - // metadataURL in azure_manager.go is not available for our tests, expect fail. - result, err := getSubscriptionIdFromInstanceMetadata() - expected := "" - assert.NotNil(t, err.Error()) - assert.Equal(t, expected, result, "Verify return result failed, expected: %v, actual: %v", expected, result) -} - func TestManagerRefreshAndCleanup(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish()