From d69e17ee65be04de57efe3d9fcf9d0e41dda3b5b Mon Sep 17 00:00:00 2001 From: t-qini Date: Tue, 24 Dec 2019 17:12:04 +0800 Subject: [PATCH] Delete outdated deployments. --- .../cloudprovider/azure/azure_agent_pool.go | 73 +++++++++++- .../azure/azure_agent_pool_test.go | 107 ++++++++++++++++++ .../cloudprovider/azure/azure_client.go | 40 +++++++ .../azure/azure_cloud_provider_test.go | 5 +- .../cloudprovider/azure/azure_fakes.go | 27 +++++ .../cloudprovider/azure/azure_manager.go | 14 +++ .../cloudprovider/azure/azure_manager_test.go | 24 ++-- 7 files changed, 275 insertions(+), 15 deletions(-) create mode 100644 cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go diff --git a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go index 642248ae261a..4888a778d9e5 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go @@ -30,14 +30,17 @@ import ( "github.com/Azure/go-autorest/autorest/to" apiv1 "k8s.io/api/core/v1" + utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/config/dynamic" "k8s.io/klog" schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo" ) -var ( - vmInstancesRefreshPeriod = 5 * time.Minute +const ( + vmInstancesRefreshPeriod = 5 * time.Minute + clusterAutoscalerDeploymentPrefix = `cluster-autoscaler-` + defaultMaxDeploymentsCount = 10 ) var virtualMachinesStatusCache struct { @@ -209,6 +212,62 @@ func (as *AgentPool) TargetSize() (int, error) { return int(size), nil } +func (as *AgentPool) getAllSucceededAndFailedDeployments() (succeededAndFailedDeployments []resources.DeploymentExtended, err error) { + ctx, cancel := getContextWithCancel() + defer cancel() + + deploymentsFilter := "provisioningState eq 'Succeeded' or provisioningState eq 'Failed'" + succeededAndFailedDeployments, err = as.manager.azClient.deploymentsClient.List(ctx, as.manager.config.ResourceGroup, deploymentsFilter, nil) + if err != nil { + klog.Errorf("getAllSucceededAndFailedDeployments: failed to list succeeded or failed deployments with error: %v", err) + return nil, err + } + + return succeededAndFailedDeployments, err +} + +// deleteOutdatedDeployments keeps the newest deployments in the resource group and delete others, +// since Azure resource group deployments have a hard cap of 800, outdated deployments must be deleted +// to prevent the `DeploymentQuotaExceeded` error. see: issue #2154. +func (as *AgentPool) deleteOutdatedDeployments() (err error) { + deployments, err := as.getAllSucceededAndFailedDeployments() + if err != nil { + return err + } + + for i := len(deployments) - 1; i >= 0; i-- { + klog.V(4).Infof("deleteOutdatedDeployments: found deployments[i].Name: %s", *deployments[i].Name) + if deployments[i].Name != nil && !strings.HasPrefix(*deployments[i].Name, clusterAutoscalerDeploymentPrefix) { + deployments = append(deployments[:i], deployments[i+1:]...) + } + } + + if int64(len(deployments)) <= as.manager.config.MaxDeploymentsCount { + klog.V(4).Infof("deleteOutdatedDeployments: the number of deployments (%d) is under threshold, skip deleting", len(deployments)) + return err + } + + sort.Slice(deployments, func(i, j int) bool { + return deployments[i].Properties.Timestamp.Time.After(deployments[j].Properties.Timestamp.Time) + }) + + toBeDeleted := deployments[as.manager.config.MaxDeploymentsCount:] + + ctx, cancel := getContextWithCancel() + defer cancel() + + errList := make([]error, 0) + for _, deployment := range toBeDeleted { + klog.V(4).Infof("deleteOutdatedDeployments: starts deleting outdated deployment (%s)", *deployment.Name) + _, err := as.manager.azClient.deploymentsClient.Delete(ctx, as.manager.config.ResourceGroup, *deployment.Name) + if err != nil { + errList = append(errList, err) + } + } + + return utilerrors.NewAggregate(errList) +} + // IncreaseSize increases agent pool size func (as *AgentPool) IncreaseSize(delta int) error { as.mutex.Lock() @@ -218,6 +277,11 @@ func (as *AgentPool) IncreaseSize(delta int) error { return fmt.Errorf("size increase must be positive") } + err := as.deleteOutdatedDeployments() + if err != nil { + klog.Warningf("IncreaseSize: failed to cleanup outdated deployments with err: %v.", err) + } + indexes, _, err := as.GetVMIndexes() if err != nil { return err @@ -404,6 +468,11 @@ func (as *AgentPool) DeleteNodes(nodes []*apiv1.Node) error { refs = append(refs, ref) } + err = as.deleteOutdatedDeployments() + if err != nil { + klog.Warningf("DeleteNodes: failed to cleanup outdated deployments with err: %v.", err) + } + return as.DeleteInstances(refs) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go new file mode 100644 index 000000000000..058e43ac5460 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go @@ -0,0 +1,107 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package azure + +import ( + "context" + "testing" + "time" + + "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2017-05-10/resources" + "github.com/Azure/go-autorest/autorest/date" + "github.com/Azure/go-autorest/autorest/to" + "github.com/stretchr/testify/assert" +) + +func newTestAgentPool(manager *AzureManager, name string) *AgentPool { + return &AgentPool{ + azureRef: azureRef{ + Name: name, + }, + manager: manager, + minSize: 1, + maxSize: 5, + } +} + +func TestDeleteOutdatedDeployments(t *testing.T) { + timeLayout := "2006-01-02 15:04:05" + timeBenchMark, _ := time.Parse(timeLayout, "2000-01-01 00:00:00") + + testCases := []struct { + deployments map[string]resources.DeploymentExtended + expectedDeploymentsNames map[string]bool + expectedErr error + desc string + }{ + { + deployments: map[string]resources.DeploymentExtended{ + "non-cluster-autoscaler-0000": { + Name: to.StringPtr("non-cluster-autoscaler-0000"), + Properties: &resources.DeploymentPropertiesExtended{ + ProvisioningState: to.StringPtr("Succeeded"), + Timestamp: &date.Time{Time: timeBenchMark.Add(2 * time.Minute)}, + }, + }, + "cluster-autoscaler-0000": { + Name: to.StringPtr("cluster-autoscaler-0000"), + Properties: &resources.DeploymentPropertiesExtended{ + ProvisioningState: to.StringPtr("Succeeded"), + Timestamp: &date.Time{Time: timeBenchMark}, + }, + }, + "cluster-autoscaler-0001": { + Name: to.StringPtr("cluster-autoscaler-0001"), + Properties: &resources.DeploymentPropertiesExtended{ + ProvisioningState: to.StringPtr("Succeeded"), + Timestamp: &date.Time{Time: timeBenchMark.Add(time.Minute)}, + }, + }, + "cluster-autoscaler-0002": { + Name: to.StringPtr("cluster-autoscaler-0002"), + Properties: &resources.DeploymentPropertiesExtended{ + ProvisioningState: to.StringPtr("Succeeded"), + Timestamp: &date.Time{Time: timeBenchMark.Add(2 * time.Minute)}, + }, + }, + }, + expectedDeploymentsNames: map[string]bool{ + "non-cluster-autoscaler-0000": true, + "cluster-autoscaler-0001": true, + "cluster-autoscaler-0002": true, + }, + expectedErr: nil, + desc: "cluster autoscaler provider azure should delete outdated deployments created by cluster autoscaler", + }, + } + + for _, test := range testCases { + testAS := newTestAgentPool(newTestAzureManager(t), "testAS") + testAS.manager.azClient.deploymentsClient = &DeploymentsClientMock{ + FakeStore: test.deployments, + } + + err := testAS.deleteOutdatedDeployments() + assert.Equal(t, test.expectedErr, err, test.desc) + existedDeployments, err := testAS.manager.azClient.deploymentsClient.List(context.Background(), "", "", to.Int32Ptr(0)) + existedDeploymentsNames := make(map[string]bool) + for _, deployment := range existedDeployments { + existedDeploymentsNames[*deployment.Name] = true + } + assert.Equal(t, test.expectedDeploymentsNames, existedDeploymentsNames, test.desc) + } +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_client.go b/cluster-autoscaler/cloudprovider/azure/azure_client.go index 840e073ca0ec..e4f27587d0d4 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_client.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_client.go @@ -63,8 +63,10 @@ type InterfacesClient interface { // DeploymentsClient defines needed functions for azure network.DeploymentsClient. type DeploymentsClient interface { Get(ctx context.Context, resourceGroupName string, deploymentName string) (result resources.DeploymentExtended, err error) + List(ctx context.Context, resourceGroupName string, filter string, top *int32) (result []resources.DeploymentExtended, err error) ExportTemplate(ctx context.Context, resourceGroupName string, deploymentName string) (result resources.DeploymentExportResult, err error) CreateOrUpdate(ctx context.Context, resourceGroupName string, deploymentName string, parameters resources.Deployment) (resp *http.Response, err error) + Delete(ctx context.Context, resourceGroupName string, deploymentName string) (resp *http.Response, err error) } // DisksClient defines needed functions for azure disk.DisksClient. @@ -349,6 +351,44 @@ func (az *azDeploymentsClient) CreateOrUpdate(ctx context.Context, resourceGroup return future.Response(), err } +func (az *azDeploymentsClient) List(ctx context.Context, resourceGroupName, filter string, top *int32) (result []resources.DeploymentExtended, err error) { + klog.V(10).Infof("azDeploymentsClient.List(%q): start", resourceGroupName) + defer func() { + klog.V(10).Infof("azDeploymentsClient.List(%q): end", resourceGroupName) + }() + + iterator, err := az.client.ListByResourceGroupComplete(ctx, resourceGroupName, filter, top) + if err != nil { + return nil, err + } + + result = make([]resources.DeploymentExtended, 0) + for ; iterator.NotDone(); err = iterator.Next() { + if err != nil { + return nil, err + } + + result = append(result, iterator.Value()) + } + + return result, err +} + +func (az *azDeploymentsClient) Delete(ctx context.Context, resourceGroupName, deploymentName string) (resp *http.Response, err error) { + klog.V(10).Infof("azDeploymentsClient.Delete(%q,%q): start", resourceGroupName, deploymentName) + defer func() { + klog.V(10).Infof("azDeploymentsClient.Delete(%q,%q): end", resourceGroupName, deploymentName) + }() + + future, err := az.client.Delete(ctx, resourceGroupName, deploymentName) + if err != nil { + return future.Response(), err + } + + err = future.WaitForCompletionRef(ctx, az.client.Client) + return future.Response(), err +} + type azDisksClient struct { client compute.DisksClient } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go index 6a322e5685f5..4efc26ec488d 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go @@ -37,8 +37,9 @@ func newTestAzureManager(t *testing.T) *AzureManager { env: azure.PublicCloud, explicitlyConfigured: make(map[string]bool), config: &Config{ - ResourceGroup: "test", - VMType: vmTypeVMSS, + ResourceGroup: "test", + VMType: vmTypeVMSS, + MaxDeploymentsCount: 2, }, azClient: &azClient{ diff --git a/cluster-autoscaler/cloudprovider/azure/azure_fakes.go b/cluster-autoscaler/cloudprovider/azure/azure_fakes.go index daa94fe630ac..277704fc826f 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_fakes.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_fakes.go @@ -258,3 +258,30 @@ func (m *DeploymentsClientMock) CreateOrUpdate(ctx context.Context, resourceGrou deploy.Properties.Template = parameters.Properties.Template return nil, nil } + +// List gets all the deployments for a resource group. +func (m *DeploymentsClientMock) List(ctx context.Context, resourceGroupName, filter string, top *int32) (result []resources.DeploymentExtended, err error) { + m.mutex.Lock() + defer m.mutex.Unlock() + + result = make([]resources.DeploymentExtended, 0) + for i := range m.FakeStore { + result = append(result, m.FakeStore[i]) + } + + return result, nil +} + +// Delete deletes the given deployment +func (m *DeploymentsClientMock) Delete(ctx context.Context, resourceGroupName, deploymentName string) (resp *http.Response, err error) { + m.mutex.Lock() + defer m.mutex.Unlock() + + if _, ok := m.FakeStore[deploymentName]; !ok { + return nil, fmt.Errorf("there is no such a deployment with name %s", deploymentName) + } + + delete(m.FakeStore, deploymentName) + + return +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager.go b/cluster-autoscaler/cloudprovider/azure/azure_manager.go index 3aa8df2bf106..b726978ed16b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager.go @@ -106,6 +106,9 @@ type Config struct { // ASG cache TTL in seconds AsgCacheTTL int64 `json:"asgCacheTTL" yaml:"asgCacheTTL"` + + // number of latest deployments that will not be deleted + MaxDeploymentsCount int64 `json:"clusterAutoscalerDeploymentOutdateCountThreshold" yaml:"ClusterAutoscalerDeploymentOutdateCountThreshold"` } // TrimSpace removes all leading and trailing white spaces. @@ -171,6 +174,13 @@ func CreateAzureManager(configReader io.Reader, discoveryOpts cloudprovider.Node return nil, fmt.Errorf("failed to parse AZURE_ASG_CACHE_TTL %q: %v", asgCacheTTL, err) } } + + if threshold := os.Getenv("AZURE_MAX_DEPLOYMENT_COUNT"); threshold != "" { + cfg.MaxDeploymentsCount, err = strconv.ParseInt(threshold, 10, 0) + if err != nil { + return nil, fmt.Errorf("failed to parse AZURE_MAX_DEPLOYMENT_COUNT %q: %v", threshold, err) + } + } } cfg.TrimSpace() @@ -194,6 +204,10 @@ func CreateAzureManager(configReader io.Reader, discoveryOpts cloudprovider.Node cfg.AsgCacheTTL = int64(defaultAsgCacheTTL) } + if cfg.MaxDeploymentsCount == 0 { + cfg.MaxDeploymentsCount = int64(defaultMaxDeploymentsCount) + } + // Defaulting env to Azure Public Cloud. env := azure.PublicCloud if cfg.Cloud != "" { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index 03aa8958daea..cb032b2519f8 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -37,26 +37,28 @@ const validAzureCfg = `{ "vnetName": "fakeName", "routeTableName": "fakeName", "primaryAvailabilitySetName": "fakeName", - "asgCacheTTL": 900}` + "asgCacheTTL": 900, + "maxDeploymentsCount": 8}` const invalidAzureCfg = `{{}"cloud": "AzurePublicCloud",}` func TestCreateAzureManagerValidConfig(t *testing.T) { manager, err := CreateAzureManager(strings.NewReader(validAzureCfg), cloudprovider.NodeGroupDiscoveryOptions{}) - expectdConfig := &Config{ - Cloud: "AzurePublicCloud", - TenantID: "fakeId", - SubscriptionID: "fakeId", - ResourceGroup: "fakeId", - VMType: "vmss", - AADClientID: "fakeId", - AADClientSecret: "fakeId", - AsgCacheTTL: 900, + expectedConfig := &Config{ + Cloud: "AzurePublicCloud", + TenantID: "fakeId", + SubscriptionID: "fakeId", + ResourceGroup: "fakeId", + VMType: "vmss", + AADClientID: "fakeId", + AADClientSecret: "fakeId", + AsgCacheTTL: 900, + MaxDeploymentsCount: 8, } assert.NoError(t, err) - assert.Equal(t, expectdConfig, manager.config, "unexpected azure manager configuration") + assert.Equal(t, expectedConfig, manager.config, "unexpected azure manager configuration") } func TestCreateAzureManagerInvalidConfig(t *testing.T) {