Skip to content

Commit

Permalink
Merge pull request #2690 from nilo19/qi-delete-outdated-deployments
Browse files Browse the repository at this point in the history
Delete outdated deployments.
  • Loading branch information
k8s-ci-robot authored Dec 28, 2019
2 parents dded45b + 48cea0f commit d840ac8
Show file tree
Hide file tree
Showing 7 changed files with 275 additions and 15 deletions.
73 changes: 71 additions & 2 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,17 @@ import (
"github.com/Azure/go-autorest/autorest/to"

apiv1 "k8s.io/api/core/v1"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
"k8s.io/klog"
schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
)

var (
vmInstancesRefreshPeriod = 5 * time.Minute
const (
vmInstancesRefreshPeriod = 5 * time.Minute
clusterAutoscalerDeploymentPrefix = `cluster-autoscaler-`
defaultMaxDeploymentsCount = 10
)

var virtualMachinesStatusCache struct {
Expand Down Expand Up @@ -209,6 +212,62 @@ func (as *AgentPool) TargetSize() (int, error) {
return int(size), nil
}

func (as *AgentPool) getAllSucceededAndFailedDeployments() (succeededAndFailedDeployments []resources.DeploymentExtended, err error) {
ctx, cancel := getContextWithCancel()
defer cancel()

deploymentsFilter := "provisioningState eq 'Succeeded' or provisioningState eq 'Failed'"
succeededAndFailedDeployments, err = as.manager.azClient.deploymentsClient.List(ctx, as.manager.config.ResourceGroup, deploymentsFilter, nil)
if err != nil {
klog.Errorf("getAllSucceededAndFailedDeployments: failed to list succeeded or failed deployments with error: %v", err)
return nil, err
}

return succeededAndFailedDeployments, err
}

// deleteOutdatedDeployments keeps the newest deployments in the resource group and delete others,
// since Azure resource group deployments have a hard cap of 800, outdated deployments must be deleted
// to prevent the `DeploymentQuotaExceeded` error. see: issue #2154.
func (as *AgentPool) deleteOutdatedDeployments() (err error) {
deployments, err := as.getAllSucceededAndFailedDeployments()
if err != nil {
return err
}

for i := len(deployments) - 1; i >= 0; i-- {
klog.V(4).Infof("deleteOutdatedDeployments: found deployments[i].Name: %s", *deployments[i].Name)
if deployments[i].Name != nil && !strings.HasPrefix(*deployments[i].Name, clusterAutoscalerDeploymentPrefix) {
deployments = append(deployments[:i], deployments[i+1:]...)
}
}

if int64(len(deployments)) <= as.manager.config.MaxDeploymentsCount {
klog.V(4).Infof("deleteOutdatedDeployments: the number of deployments (%d) is under threshold, skip deleting", len(deployments))
return err
}

sort.Slice(deployments, func(i, j int) bool {
return deployments[i].Properties.Timestamp.Time.After(deployments[j].Properties.Timestamp.Time)
})

toBeDeleted := deployments[as.manager.config.MaxDeploymentsCount:]

ctx, cancel := getContextWithCancel()
defer cancel()

errList := make([]error, 0)
for _, deployment := range toBeDeleted {
klog.V(4).Infof("deleteOutdatedDeployments: starts deleting outdated deployment (%s)", *deployment.Name)
_, err := as.manager.azClient.deploymentsClient.Delete(ctx, as.manager.config.ResourceGroup, *deployment.Name)
if err != nil {
errList = append(errList, err)
}
}

return utilerrors.NewAggregate(errList)
}

// IncreaseSize increases agent pool size
func (as *AgentPool) IncreaseSize(delta int) error {
as.mutex.Lock()
Expand All @@ -218,6 +277,11 @@ func (as *AgentPool) IncreaseSize(delta int) error {
return fmt.Errorf("size increase must be positive")
}

err := as.deleteOutdatedDeployments()
if err != nil {
klog.Warningf("IncreaseSize: failed to cleanup outdated deployments with err: %v.", err)
}

indexes, _, err := as.GetVMIndexes()
if err != nil {
return err
Expand Down Expand Up @@ -404,6 +468,11 @@ func (as *AgentPool) DeleteNodes(nodes []*apiv1.Node) error {
refs = append(refs, ref)
}

err = as.deleteOutdatedDeployments()
if err != nil {
klog.Warningf("DeleteNodes: failed to cleanup outdated deployments with err: %v.", err)
}

return as.DeleteInstances(refs)
}

Expand Down
107 changes: 107 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package azure

import (
"context"
"testing"
"time"

"github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2017-05-10/resources"
"github.com/Azure/go-autorest/autorest/date"
"github.com/Azure/go-autorest/autorest/to"
"github.com/stretchr/testify/assert"
)

func newTestAgentPool(manager *AzureManager, name string) *AgentPool {
return &AgentPool{
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 1,
maxSize: 5,
}
}

func TestDeleteOutdatedDeployments(t *testing.T) {
timeLayout := "2006-01-02 15:04:05"
timeBenchMark, _ := time.Parse(timeLayout, "2000-01-01 00:00:00")

testCases := []struct {
deployments map[string]resources.DeploymentExtended
expectedDeploymentsNames map[string]bool
expectedErr error
desc string
}{
{
deployments: map[string]resources.DeploymentExtended{
"non-cluster-autoscaler-0000": {
Name: to.StringPtr("non-cluster-autoscaler-0000"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(2 * time.Minute)},
},
},
"cluster-autoscaler-0000": {
Name: to.StringPtr("cluster-autoscaler-0000"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark},
},
},
"cluster-autoscaler-0001": {
Name: to.StringPtr("cluster-autoscaler-0001"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(time.Minute)},
},
},
"cluster-autoscaler-0002": {
Name: to.StringPtr("cluster-autoscaler-0002"),
Properties: &resources.DeploymentPropertiesExtended{
ProvisioningState: to.StringPtr("Succeeded"),
Timestamp: &date.Time{Time: timeBenchMark.Add(2 * time.Minute)},
},
},
},
expectedDeploymentsNames: map[string]bool{
"non-cluster-autoscaler-0000": true,
"cluster-autoscaler-0001": true,
"cluster-autoscaler-0002": true,
},
expectedErr: nil,
desc: "cluster autoscaler provider azure should delete outdated deployments created by cluster autoscaler",
},
}

for _, test := range testCases {
testAS := newTestAgentPool(newTestAzureManager(t), "testAS")
testAS.manager.azClient.deploymentsClient = &DeploymentsClientMock{
FakeStore: test.deployments,
}

err := testAS.deleteOutdatedDeployments()
assert.Equal(t, test.expectedErr, err, test.desc)
existedDeployments, err := testAS.manager.azClient.deploymentsClient.List(context.Background(), "", "", to.Int32Ptr(0))
existedDeploymentsNames := make(map[string]bool)
for _, deployment := range existedDeployments {
existedDeploymentsNames[*deployment.Name] = true
}
assert.Equal(t, test.expectedDeploymentsNames, existedDeploymentsNames, test.desc)
}
}
40 changes: 40 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ type InterfacesClient interface {
// DeploymentsClient defines needed functions for azure network.DeploymentsClient.
type DeploymentsClient interface {
Get(ctx context.Context, resourceGroupName string, deploymentName string) (result resources.DeploymentExtended, err error)
List(ctx context.Context, resourceGroupName string, filter string, top *int32) (result []resources.DeploymentExtended, err error)
ExportTemplate(ctx context.Context, resourceGroupName string, deploymentName string) (result resources.DeploymentExportResult, err error)
CreateOrUpdate(ctx context.Context, resourceGroupName string, deploymentName string, parameters resources.Deployment) (resp *http.Response, err error)
Delete(ctx context.Context, resourceGroupName string, deploymentName string) (resp *http.Response, err error)
}

// DisksClient defines needed functions for azure disk.DisksClient.
Expand Down Expand Up @@ -349,6 +351,44 @@ func (az *azDeploymentsClient) CreateOrUpdate(ctx context.Context, resourceGroup
return future.Response(), err
}

func (az *azDeploymentsClient) List(ctx context.Context, resourceGroupName, filter string, top *int32) (result []resources.DeploymentExtended, err error) {
klog.V(10).Infof("azDeploymentsClient.List(%q): start", resourceGroupName)
defer func() {
klog.V(10).Infof("azDeploymentsClient.List(%q): end", resourceGroupName)
}()

iterator, err := az.client.ListByResourceGroupComplete(ctx, resourceGroupName, filter, top)
if err != nil {
return nil, err
}

result = make([]resources.DeploymentExtended, 0)
for ; iterator.NotDone(); err = iterator.Next() {
if err != nil {
return nil, err
}

result = append(result, iterator.Value())
}

return result, err
}

func (az *azDeploymentsClient) Delete(ctx context.Context, resourceGroupName, deploymentName string) (resp *http.Response, err error) {
klog.V(10).Infof("azDeploymentsClient.Delete(%q,%q): start", resourceGroupName, deploymentName)
defer func() {
klog.V(10).Infof("azDeploymentsClient.Delete(%q,%q): end", resourceGroupName, deploymentName)
}()

future, err := az.client.Delete(ctx, resourceGroupName, deploymentName)
if err != nil {
return future.Response(), err
}

err = future.WaitForCompletionRef(ctx, az.client.Client)
return future.Response(), err
}

type azDisksClient struct {
client compute.DisksClient
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@ func newTestAzureManager(t *testing.T) *AzureManager {
env: azure.PublicCloud,
explicitlyConfigured: make(map[string]bool),
config: &Config{
ResourceGroup: "test",
VMType: vmTypeVMSS,
ResourceGroup: "test",
VMType: vmTypeVMSS,
MaxDeploymentsCount: 2,
},

azClient: &azClient{
Expand Down
27 changes: 27 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_fakes.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,30 @@ func (m *DeploymentsClientMock) CreateOrUpdate(ctx context.Context, resourceGrou
deploy.Properties.Template = parameters.Properties.Template
return nil, nil
}

// List gets all the deployments for a resource group.
func (m *DeploymentsClientMock) List(ctx context.Context, resourceGroupName, filter string, top *int32) (result []resources.DeploymentExtended, err error) {
m.mutex.Lock()
defer m.mutex.Unlock()

result = make([]resources.DeploymentExtended, 0)
for i := range m.FakeStore {
result = append(result, m.FakeStore[i])
}

return result, nil
}

// Delete deletes the given deployment
func (m *DeploymentsClientMock) Delete(ctx context.Context, resourceGroupName, deploymentName string) (resp *http.Response, err error) {
m.mutex.Lock()
defer m.mutex.Unlock()

if _, ok := m.FakeStore[deploymentName]; !ok {
return nil, fmt.Errorf("there is no such a deployment with name %s", deploymentName)
}

delete(m.FakeStore, deploymentName)

return
}
14 changes: 14 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ type Config struct {

// ASG cache TTL in seconds
AsgCacheTTL int64 `json:"asgCacheTTL" yaml:"asgCacheTTL"`

// number of latest deployments that will not be deleted
MaxDeploymentsCount int64 `json:"maxDeploymentsCount" yaml:"maxDeploymentsCount"`
}

// TrimSpace removes all leading and trailing white spaces.
Expand Down Expand Up @@ -171,6 +174,13 @@ func CreateAzureManager(configReader io.Reader, discoveryOpts cloudprovider.Node
return nil, fmt.Errorf("failed to parse AZURE_ASG_CACHE_TTL %q: %v", asgCacheTTL, err)
}
}

if threshold := os.Getenv("AZURE_MAX_DEPLOYMENT_COUNT"); threshold != "" {
cfg.MaxDeploymentsCount, err = strconv.ParseInt(threshold, 10, 0)
if err != nil {
return nil, fmt.Errorf("failed to parse AZURE_MAX_DEPLOYMENT_COUNT %q: %v", threshold, err)
}
}
}
cfg.TrimSpace()

Expand All @@ -194,6 +204,10 @@ func CreateAzureManager(configReader io.Reader, discoveryOpts cloudprovider.Node
cfg.AsgCacheTTL = int64(defaultAsgCacheTTL)
}

if cfg.MaxDeploymentsCount == 0 {
cfg.MaxDeploymentsCount = int64(defaultMaxDeploymentsCount)
}

// Defaulting env to Azure Public Cloud.
env := azure.PublicCloud
if cfg.Cloud != "" {
Expand Down
24 changes: 13 additions & 11 deletions cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,26 +37,28 @@ const validAzureCfg = `{
"vnetName": "fakeName",
"routeTableName": "fakeName",
"primaryAvailabilitySetName": "fakeName",
"asgCacheTTL": 900}`
"asgCacheTTL": 900,
"maxDeploymentsCount": 8}`

const invalidAzureCfg = `{{}"cloud": "AzurePublicCloud",}`

func TestCreateAzureManagerValidConfig(t *testing.T) {
manager, err := CreateAzureManager(strings.NewReader(validAzureCfg), cloudprovider.NodeGroupDiscoveryOptions{})

expectdConfig := &Config{
Cloud: "AzurePublicCloud",
TenantID: "fakeId",
SubscriptionID: "fakeId",
ResourceGroup: "fakeId",
VMType: "vmss",
AADClientID: "fakeId",
AADClientSecret: "fakeId",
AsgCacheTTL: 900,
expectedConfig := &Config{
Cloud: "AzurePublicCloud",
TenantID: "fakeId",
SubscriptionID: "fakeId",
ResourceGroup: "fakeId",
VMType: "vmss",
AADClientID: "fakeId",
AADClientSecret: "fakeId",
AsgCacheTTL: 900,
MaxDeploymentsCount: 8,
}

assert.NoError(t, err)
assert.Equal(t, expectdConfig, manager.config, "unexpected azure manager configuration")
assert.Equal(t, expectedConfig, manager.config, "unexpected azure manager configuration")
}

func TestCreateAzureManagerInvalidConfig(t *testing.T) {
Expand Down

0 comments on commit d840ac8

Please sign in to comment.