Skip to content

Commit

Permalink
Delete outdated deployments.
Browse files Browse the repository at this point in the history
  • Loading branch information
t-qini committed Dec 27, 2019
1 parent a1cd503 commit b49153d
Show file tree
Hide file tree
Showing 7 changed files with 289 additions and 5 deletions.
78 changes: 76 additions & 2 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,13 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
"k8s.io/klog"
schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
)

var (
vmInstancesRefreshPeriod = 5 * time.Minute
const (
vmInstancesRefreshPeriod = 5 * time.Minute
clusterAutoscalerDeploymentPrefix = `cluster-autoscaler-`
defaultMaxDeploymentsCount = 10
)

var virtualMachinesStatusCache struct {
Expand Down Expand Up @@ -209,6 +212,57 @@ func (as *AgentPool) TargetSize() (int, error) {
return int(size), nil
}

func (as *AgentPool) getAllSucceededAndFailedDeployments() (succeededAndFailedDeployments []resources.DeploymentExtended, err error) {
ctx, cancel := getContextWithCancel()
defer cancel()

deploymentsFilter := "provisioningState eq 'Succeeded' or provisioningState eq 'Failed'"
succeededAndFailedDeployments, err = as.manager.azClient.deploymentsClient.List(ctx, as.manager.config.ResourceGroup, deploymentsFilter, nil)
if err != nil {
klog.Errorf("getAllSucceededAndFailedDeployments: failed to list succeeded or failed deployments with error: %v", err)
return nil, err
}

return succeededAndFailedDeployments, err
}

// deleteOutdatedDeployments keeps the newest deployments in the resource group and delete others,
// since Azure resource group deployments have a hard cap of 800, outdated deployments must be deleted
// to prevent the `DeploymentQuotaExceeded` error. see: issue #2154.
func (as *AgentPool) deleteOutdatedDeployments(deployments []resources.DeploymentExtended) (err error) {
for i := len(deployments) - 1; i >= 0; i-- {
klog.V(4).Infof("deleteOutdatedDeployments: found deployments[i].Name: %s", *deployments[i].Name)
if deployments[i].Name != nil && !strings.HasPrefix(*deployments[i].Name, clusterAutoscalerDeploymentPrefix) {
deployments = append(deployments[:i], deployments[i+1:]...)
}
}

if int64(len(deployments)) <= as.manager.config.MaxDeploymentsCount {
klog.V(4).Infof("deleteOutdatedDeployments: the number of deployments (%d) is under threshold, skip deleting", len(deployments))
return err
}

sort.Slice(deployments, func(i, j int) bool {
return deployments[i].Properties.Timestamp.Time.After(deployments[j].Properties.Timestamp.Time)
})

toBeDeleted := deployments[as.manager.config.MaxDeploymentsCount:]

ctx, cancel := getContextWithCancel()
defer cancel()

errList := make([]error, 0)
for _, deployment := range toBeDeleted {
klog.V(4).Infof("deleteOutdatedDeployments: starts deleting outdated deployment (%s)", *deployment.Name)
_, err := as.manager.azClient.deploymentsClient.Delete(ctx, as.manager.config.ResourceGroup, *deployment.Name)
if err != nil {
errList = append(errList, err)
}
}

return utilerrors.NewAggregate(errList)
}

// IncreaseSize increases agent pool size
func (as *AgentPool) IncreaseSize(delta int) error {
as.mutex.Lock()
Expand All @@ -218,6 +272,16 @@ func (as *AgentPool) IncreaseSize(delta int) error {
return fmt.Errorf("size increase must be positive")
}

succeededDeployments, err := as.getAllSucceededAndFailedDeployments()
if err != nil {
klog.Warningf("IncreaseSize: failed to cleanup outdated deployments with err: %v.", err)
}

err = as.deleteOutdatedDeployments(succeededDeployments)
if err != nil {
return err
}

indexes, _, err := as.GetVMIndexes()
if err != nil {
return err
Expand Down Expand Up @@ -404,6 +468,16 @@ func (as *AgentPool) DeleteNodes(nodes []*apiv1.Node) error {
refs = append(refs, ref)
}

succeededAndFailedDeployments, err := as.getAllSucceededAndFailedDeployments()
if err != nil {
return err
}

err = as.deleteOutdatedDeployments(succeededAndFailedDeployments)
if err != nil {
klog.Warningf("DeleteNodes: failed to cleanup outdated deployments with err: %v.", err)
}

return as.DeleteInstances(refs)
}

Expand Down
126 changes: 126 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package azure

import (
"context"
"testing"
"time"

"github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2017-05-10/resources"
"github.com/Azure/go-autorest/autorest/date"
"github.com/Azure/go-autorest/autorest/to"
"github.com/stretchr/testify/assert"
)

func newTestAgentPool(manager *AzureManager, name string) *AgentPool {
return &AgentPool{
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 1,
maxSize: 5,
}
}

func TestDeleteOutdatedDeployments(t *testing.T) {
timeLayout := "2006-01-02 15:04:05"
timeBenchMark, _ := time.Parse(timeLayout, "2000-01-01 00:00:00")

testCases := []struct {
deployments map[string]resources.DeploymentExtended
expectedDeployments []resources.DeploymentExtended
expectedErr error
desc string
}{
{
deployments: map[string]resources.DeploymentExtended{
"non-cluster-autoscaler-0000": {
Name: to.StringPtr("non-cluster-autoscaler-0000"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(2 * time.Minute)},
},
},
"cluster-autoscaler-0000": {
Name: to.StringPtr("cluster-autoscaler-0000"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark},
},
},
"cluster-autoscaler-0001": {
Name: to.StringPtr("cluster-autoscaler-0001"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(time.Minute)},
},
},
"cluster-autoscaler-0002": {
Name: to.StringPtr("cluster-autoscaler-0002"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(2 * time.Minute)},
},
},
},
expectedDeployments: []resources.DeploymentExtended{
{
Name: to.StringPtr("non-cluster-autoscaler-0000"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(2 * time.Minute)},
},
},
{
Name: to.StringPtr("cluster-autoscaler-0001"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(time.Minute)},
},
},
{
Name: to.StringPtr("cluster-autoscaler-0002"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(2 * time.Minute)},
},
},
},
expectedErr: nil,
desc: "cluster autoscaler provider azure should delete outdated deployments created by cluster autoscaler",
},
}

for _, test := range testCases {
testAS := newTestAgentPool(newTestAzureManager(t), "testAS")
testAS.manager.azClient.deploymentsClient = &DeploymentsClientMock{
FakeStore: test.deployments,
}

allDeployments := make([]resources.DeploymentExtended, 0)
for _, v := range test.deployments {
allDeployments = append(allDeployments, v)
}

err := testAS.deleteOutdatedDeployments(allDeployments)
assert.Equal(t, test.expectedErr, err, test.desc)
existedDeployments, err := testAS.manager.azClient.deploymentsClient.List(context.Background(), "", "", to.Int32Ptr(0))
assert.Equal(t, test.expectedDeployments, existedDeployments, test.desc)
}
}
40 changes: 40 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ type InterfacesClient interface {
// DeploymentsClient defines needed functions for azure network.DeploymentsClient.
type DeploymentsClient interface {
Get(ctx context.Context, resourceGroupName string, deploymentName string) (result resources.DeploymentExtended, err error)
List(ctx context.Context, resourceGroupName string, filter string, top *int32) (result []resources.DeploymentExtended, err error)
ExportTemplate(ctx context.Context, resourceGroupName string, deploymentName string) (result resources.DeploymentExportResult, err error)
CreateOrUpdate(ctx context.Context, resourceGroupName string, deploymentName string, parameters resources.Deployment) (resp *http.Response, err error)
Delete(ctx context.Context, resourceGroupName string, deploymentName string) (resp *http.Response, err error)
}

// DisksClient defines needed functions for azure disk.DisksClient.
Expand Down Expand Up @@ -349,6 +351,44 @@ func (az *azDeploymentsClient) CreateOrUpdate(ctx context.Context, resourceGroup
return future.Response(), err
}

func (az *azDeploymentsClient) List(ctx context.Context, resourceGroupName, filter string, top *int32) (result []resources.DeploymentExtended, err error) {
klog.V(10).Infof("azDeploymentsClient.List(%q): start", resourceGroupName)
defer func() {
klog.V(10).Infof("azDeploymentsClient.List(%q): end", resourceGroupName)
}()

iterator, err := az.client.ListByResourceGroupComplete(ctx, resourceGroupName, filter, top)
if err != nil {
return nil, err
}

result = make([]resources.DeploymentExtended, 0)
for ; iterator.NotDone(); err = iterator.Next() {
if err != nil {
return nil, err
}

result = append(result, iterator.Value())
}

return result, err
}

func (az *azDeploymentsClient) Delete(ctx context.Context, resourceGroupName, deploymentName string) (resp *http.Response, err error) {
klog.V(10).Infof("azDeploymentsClient.Delete(%q,%q): start", resourceGroupName, deploymentName)
defer func() {
klog.V(10).Infof("azDeploymentsClient.Delete(%q,%q): end", resourceGroupName, deploymentName)
}()

future, err := az.client.Delete(ctx, resourceGroupName, deploymentName)
if err != nil {
return future.Response(), err
}

err = future.WaitForCompletionRef(ctx, az.client.Client)
return future.Response(), err
}

type azDisksClient struct {
client compute.DisksClient
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ func newTestAzureManager(t *testing.T) *AzureManager {
config: &Config{
ResourceGroup: "test",
VMType: vmTypeVMSS,
MaxDeploymentsCount: 2,
},

azClient: &azClient{
Expand Down
27 changes: 27 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_fakes.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,30 @@ func (m *DeploymentsClientMock) CreateOrUpdate(ctx context.Context, resourceGrou
deploy.Properties.Template = parameters.Properties.Template
return nil, nil
}

// List gets all the deployments for a resource group.
func (m *DeploymentsClientMock) List(ctx context.Context, resourceGroupName, filter string, top *int32) (result []resources.DeploymentExtended, err error) {
m.mutex.Lock()
defer m.mutex.Unlock()

result = make([]resources.DeploymentExtended, 0)
for i := range m.FakeStore {
result = append(result, m.FakeStore[i])
}

return result, nil
}

// Delete deletes the given deployment
func (m *DeploymentsClientMock) Delete(ctx context.Context, resourceGroupName, deploymentName string) (resp *http.Response, err error) {
m.mutex.Lock()
defer m.mutex.Unlock()

if _, ok := m.FakeStore[deploymentName]; !ok {
return nil, fmt.Errorf("there is no such a deployment with name %s", deploymentName)
}

delete(m.FakeStore, deploymentName)

return
}
14 changes: 14 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ type Config struct {

// ASG cache TTL in seconds
AsgCacheTTL int64 `json:"asgCacheTTL" yaml:"asgCacheTTL"`

// number of latest deployments that will not be deleted
MaxDeploymentsCount int64 `json:"clusterAutoscalerDeploymentOutdateCountThreshold" yaml:"ClusterAutoscalerDeploymentOutdateCountThreshold"`
}

// TrimSpace removes all leading and trailing white spaces.
Expand Down Expand Up @@ -171,6 +174,13 @@ func CreateAzureManager(configReader io.Reader, discoveryOpts cloudprovider.Node
return nil, fmt.Errorf("failed to parse AZURE_ASG_CACHE_TTL %q: %v", asgCacheTTL, err)
}
}

if threshold := os.Getenv("AZURE_MAX_DEPLOYMENT_COUNT"); threshold != "" {
cfg.MaxDeploymentsCount, err = strconv.ParseInt(threshold, 10, 0)
if err != nil {
return nil, fmt.Errorf("failed to parse AZURE_MAX_DEPLOYMENT_COUNT %q: %v", threshold, err)
}
}
}
cfg.TrimSpace()

Expand All @@ -194,6 +204,10 @@ func CreateAzureManager(configReader io.Reader, discoveryOpts cloudprovider.Node
cfg.AsgCacheTTL = int64(defaultAsgCacheTTL)
}

if cfg.MaxDeploymentsCount == 0 {
cfg.MaxDeploymentsCount = int64(defaultMaxDeploymentsCount)
}

// Defaulting env to Azure Public Cloud.
env := azure.PublicCloud
if cfg.Cloud != "" {
Expand Down
8 changes: 5 additions & 3 deletions cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ const validAzureCfg = `{
"vnetName": "fakeName",
"routeTableName": "fakeName",
"primaryAvailabilitySetName": "fakeName",
"asgCacheTTL": 900}`
"asgCacheTTL": 900,
"MaxDeploymentsCount": 8}`

const invalidAzureCfg = `{{}"cloud": "AzurePublicCloud",}`

func TestCreateAzureManagerValidConfig(t *testing.T) {
manager, err := CreateAzureManager(strings.NewReader(validAzureCfg), cloudprovider.NodeGroupDiscoveryOptions{})

expectdConfig := &Config{
expectedConfig := &Config{
Cloud: "AzurePublicCloud",
TenantID: "fakeId",
SubscriptionID: "fakeId",
Expand All @@ -53,10 +54,11 @@ func TestCreateAzureManagerValidConfig(t *testing.T) {
AADClientID: "fakeId",
AADClientSecret: "fakeId",
AsgCacheTTL: 900,
MaxDeploymentsCount: 8,
}

assert.NoError(t, err)
assert.Equal(t, expectdConfig, manager.config, "unexpected azure manager configuration")
assert.Equal(t, expectedConfig, manager.config, "unexpected azure manager configuration")
}

func TestCreateAzureManagerInvalidConfig(t *testing.T) {
Expand Down

0 comments on commit b49153d

Please sign in to comment.