[CENG-663] VPA PCI labels #78

Open
wants to merge 23 commits into master

23 commits
1dfe22b
[local] Define Datadog custom variables and accessors
bpineau May 31, 2021
8ce700f
[local] Datadog's own podlistprocessor
bpineau May 6, 2021
8411596
[local] Hook in Datadog's podlistprocessor
bpineau Sep 18, 2021
578c711
[local] Long pending pods filter
bpineau May 10, 2021
694d7e2
[local] Set local-data custom resource on fresh nodes
bpineau May 31, 2021
8d653a0
[local] Datadog's template only nodeinfos provider
bpineau Jul 7, 2021
4874d6d
[local] Hook in template only nodeinfos provider
bpineau Sep 20, 2021
4562c5b
[local] Add podTemplateProcessor option
clamoriniere Sep 20, 2021
2fedbb3
[local] Add NodeInfos PodTemplate processor implementation
clamoriniere Sep 20, 2021
48b24ee
[local] Use PodTemplate Processor in NodeInfosProcessor
clamoriniere Sep 20, 2021
e1ef7a2
Don't deref nil nodegroup in deleteCreatedNodesWithErrors
bpineau May 30, 2022
3c061e6
Update Azure instance-types
bpineau Jul 21, 2022
a71a35d
[local] Backward compatible spread topology - hack
bpineau Aug 19, 2022
9162a2c
Azure: effectively cache instance-types SKUs
bpineau Jul 25, 2022
c248329
[local] Support topolvm/openebs storage for scaling decisions
dhenkel92 Aug 10, 2022
8448683
Merge pull request #52 from DataDog/dhenkel/support-lvm-storage
dhenkel92 Aug 24, 2022
a685d28
Add support for extended resource definition in GCE MIG template
zaymat Oct 11, 2022
433cce6
Code Review: Do not return an error on malformed extended_resource + …
zaymat Oct 19, 2022
b447a03
Merge pull request #64 from DataDog/mayeul/cherry-pick/add-extended-r…
zaymat Oct 25, 2022
3b34477
[local] AWS: don't cache empty instance-types
bpineau Jan 11, 2023
9faa5e4
gRPC expander: allow realistic server responses, and log errors
bpineau Jan 16, 2023
55e6ce7
Create labeler.yml
tedcm Feb 16, 2023
2e02a68
Create labeler.yml
tedcm Feb 16, 2023
4 changes: 4 additions & 0 deletions .github/labeler.yml
@@ -0,0 +1,4 @@
# This configures the GitHub labeler to automatically apply the "pci" label to any PCI-related change, which then requires an independent PR review.

pci:
- vertical-pod-autoscaler/**/* #Directory that must comply with PCI change management
11 changes: 11 additions & 0 deletions .github/workflows/labeler.yml
@@ -0,0 +1,11 @@
name: Pull Request Labeler

on: [pull_request]

jobs:
label:
runs-on: ubuntu-latest
steps:
- uses: actions/labeler@v2
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"
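Taken together, these two files wire automatic path-based labeling into every pull request: the workflow runs actions/labeler on each pull_request event, and the action reads .github/labeler.yml to decide which labels to apply. As a minimal sketch of how the configuration format extends (the second glob below is hypothetical and not part of this PR), each label name maps to a list of glob patterns:

pci:
  - vertical-pod-autoscaler/**/*     # existing PCI-scoped tree from this PR
  - some-other-pci-directory/**/*    # hypothetical additional path, shown only to illustrate the format

Any file touched by a pull request that matches one of these patterns causes the labeler to add the pci label to that PR.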
11 changes: 9 additions & 2 deletions cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go
@@ -101,8 +101,15 @@ func newASGCache(awsService *awsWrapper, explicitSpecs []string, autoDiscoverySp
var getInstanceTypeForAsg = func(m *asgCache, group *asg) (string, error) {
if obj, found, _ := m.asgInstanceTypeCache.GetByKey(group.AwsRef.Name); found {
return obj.(instanceTypeCachedObject).instanceType, nil
} else if result, err := m.awsService.getInstanceTypesForAsgs([]*asg{group}); err == nil {
return result[group.AwsRef.Name], nil
}

result, err := m.awsService.getInstanceTypesForAsgs([]*asg{group})
if err != nil {
return "", fmt.Errorf("could not get instance type for %s: %w", group.AwsRef.Name, err)
}

if instanceType, ok := result[group.AwsRef.Name]; ok {
return instanceType, nil
}

return "", fmt.Errorf("could not find instance type for %s", group.AwsRef.Name)
6 changes: 5 additions & 1 deletion cluster-autoscaler/cloudprovider/aws/aws_wrapper.go
@@ -293,9 +293,13 @@ func (m *awsWrapper) getInstanceTypesForAsgs(asgs []*asg) (map[string]string, er
}

for asgName, cfgName := range launchConfigsToQuery {
if instanceType, ok := launchConfigs[cfgName]; !ok || instanceType == "" {
klog.Warningf("Could not fetch %q launch configuration for ASG %q", cfgName, asgName)
continue
}
results[asgName] = launchConfigs[cfgName]
}
klog.V(4).Infof("Successfully queried %d launch configurations", len(launchConfigsToQuery))
klog.V(4).Infof("Successfully queried %d launch configurations", len(launchConfigs))

// Have to query LaunchTemplates one-at-a-time, since there's no way to query <lt, version> pairs in bulk
for asgName, lt := range launchTemplatesToQuery {
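This guard changes what happens when the launch-configuration lookup comes back without a usable entry for an ASG: instead of writing an empty instance type into results, the ASG is skipped with a warning, and the success log now counts the launch configurations actually returned (len(launchConfigs)) rather than the names requested. A condensed sketch of the resulting loop, with launchConfigs mapping launch configuration name to instance type and launchConfigsToQuery mapping ASG name to launch configuration name (both built earlier in this function):

for asgName, cfgName := range launchConfigsToQuery {
	instanceType, ok := launchConfigs[cfgName]
	if !ok || instanceType == "" {
		// Missing or empty entry: skip this ASG rather than recording "" as its type.
		klog.Warningf("Could not fetch %q launch configuration for ASG %q", cfgName, asgName)
		continue
	}
	results[asgName] = instanceType
}
klog.V(4).Infof("Successfully queried %d launch configurations", len(launchConfigs))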
16 changes: 8 additions & 8 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go
@@ -185,7 +185,7 @@ func TestGetVMsFromCache(t *testing.T) {
mockVMClient := mockvmclient.NewMockInterface(ctrl)
testAS.manager.azClient.virtualMachinesClient = mockVMClient
mockVMClient.EXPECT().List(gomock.Any(), testAS.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, testAS.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, testAS.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
testAS.manager.azureCache = ac

@@ -203,7 +203,7 @@ func TestGetVMIndexes(t *testing.T) {
mockVMClient := mockvmclient.NewMockInterface(ctrl)
as.manager.azClient.virtualMachinesClient = mockVMClient
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -242,7 +242,7 @@ func TestGetCurSize(t *testing.T) {
mockVMClient := mockvmclient.NewMockInterface(ctrl)
as.manager.azClient.virtualMachinesClient = mockVMClient
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -266,7 +266,7 @@ func TestAgentPoolTargetSize(t *testing.T) {
as.manager.azClient.virtualMachinesClient = mockVMClient
expectedVMs := getExpectedVMs()
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -285,7 +285,7 @@ func TestAgentPoolIncreaseSize(t *testing.T) {
as.manager.azClient.virtualMachinesClient = mockVMClient
expectedVMs := getExpectedVMs()
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(2)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -313,7 +313,7 @@ func TestDecreaseTargetSize(t *testing.T) {
as.manager.azClient.virtualMachinesClient = mockVMClient
expectedVMs := getExpectedVMs()
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(3)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -431,7 +431,7 @@ func TestAgentPoolDeleteNodes(t *testing.T) {
mockSAClient := mockstorageaccountclient.NewMockInterface(ctrl)
as.manager.azClient.storageAccountsClient = mockSAClient
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -497,7 +497,7 @@ func TestAgentPoolNodes(t *testing.T) {
mockVMClient := mockvmclient.NewMockInterface(ctrl)
as.manager.azClient.virtualMachinesClient = mockVMClient
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac
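All of the test changes above follow from one signature change: newAzureCache now takes two extra arguments, enableDynamicInstanceList and defaultLocation, and the tests pass false and "" to preserve the old behavior. A hypothetical call with the new code path enabled might look like this (the location value is purely illustrative):

// Hypothetical: enable dynamic instance listing, seeding the SKU cache for "eastus".
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, true, "eastus")
assert.NoError(t, err)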

45 changes: 44 additions & 1 deletion cluster-autoscaler/cloudprovider/azure/azure_cache.go
@@ -17,6 +17,7 @@ limitations under the License.
package azure

import (
"context"
"reflect"
"regexp"
"strings"
@@ -25,6 +26,7 @@ import (

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute"
"github.com/Azure/go-autorest/autorest/to"
"github.com/Azure/skewer"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"

"k8s.io/klog/v2"
@@ -55,9 +57,10 @@ type azureCache struct {
instanceToNodeGroup map[azureRef]cloudprovider.NodeGroup
unownedInstances map[azureRef]bool
autoscalingOptions map[azureRef]map[string]string
skus map[string]*skewer.Cache
}

func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string) (*azureCache, error) {
func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string, enableDynamicInstanceList bool, defaultLocation string) (*azureCache, error) {
cache := &azureCache{
interrupt: make(chan struct{}),
azClient: client,
@@ -70,6 +73,11 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmTy
instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup),
unownedInstances: make(map[azureRef]bool),
autoscalingOptions: make(map[azureRef]map[string]string),
skus: make(map[string]*skewer.Cache),
}

if enableDynamicInstanceList {
cache.skus[defaultLocation] = &skewer.Cache{}
}

if err := cache.regenerate(); err != nil {
@@ -131,11 +139,21 @@ func (m *azureCache) regenerate() error {
newAutoscalingOptions[ref] = options
}

newSkuCache := make(map[string]*skewer.Cache)
for location := range m.skus {
cache, err := m.fetchSKUs(context.Background(), location)
if err != nil {
return err
}
newSkuCache[location] = cache
}

m.mutex.Lock()
defer m.mutex.Unlock()

m.instanceToNodeGroup = newInstanceToNodeGroupCache
m.autoscalingOptions = newAutoscalingOptions
m.skus = newSkuCache

// Reset unowned instances cache.
m.unownedInstances = make(map[azureRef]bool)
@@ -264,6 +282,31 @@ func (m *azureCache) Unregister(nodeGroup cloudprovider.NodeGroup) bool {
return changed
}

func (m *azureCache) fetchSKUs(ctx context.Context, location string) (*skewer.Cache, error) {
return skewer.NewCache(ctx,
skewer.WithLocation(location),
skewer.WithResourceClient(m.azClient.skuClient),
)
}

func (m *azureCache) GetSKU(ctx context.Context, skuName, location string) (skewer.SKU, error) {
m.mutex.Lock()
defer m.mutex.Unlock()

cache, ok := m.skus[location]
if !ok {
var err error
cache, err = m.fetchSKUs(ctx, location)
if err != nil {
klog.V(1).Infof("Failed to instantiate cache, err: %v", err)
return skewer.SKU{}, err
}
m.skus[location] = cache
}

return cache.Get(ctx, skuName, skewer.VirtualMachines, location)
}

func (m *azureCache) getRegisteredNodeGroups() []cloudprovider.NodeGroup {
m.mutex.Lock()
defer m.mutex.Unlock()
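The net effect of this file is a per-location skewer SKU cache owned by azureCache: when dynamic instance listing is enabled the default location is pre-registered in the constructor, regenerate() refreshes every known location on the normal cache cycle, and GetSKU lazily fetches and memoizes any location it has not seen yet. A minimal usage sketch (the SKU name and location are illustrative, and azCache stands for an *azureCache built as above):

// First lookup for "eastus" populates that location's skewer cache; later
// lookups reuse it instead of re-listing resource SKUs against the API.
sku, err := azCache.GetSKU(context.Background(), "Standard_D4s_v3", "eastus")
if err != nil {
	klog.Warningf("SKU lookup for %q in eastus failed: %v", "Standard_D4s_v3", err)
} else {
	// Callers (for example the VMSS template path in the next file) read
	// capacity information off the returned skewer.SKU value.
	_ = sku
}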
@@ -75,7 +75,7 @@ func newTestAzureManager(t *testing.T) *AzureManager {
},
}

cache, error := newAzureCache(manager.azClient, refreshInterval, manager.config.ResourceGroup, vmTypeVMSS)
cache, error := newAzureCache(manager.azClient, refreshInterval, manager.config.ResourceGroup, vmTypeVMSS, false, "")
assert.NoError(t, error)

manager.azureCache = cache
19 changes: 6 additions & 13 deletions cluster-autoscaler/cloudprovider/azure/azure_instance.go
@@ -19,9 +19,7 @@ package azure
import (
"context"
"fmt"
compute20190701 "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute"
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute"
"github.com/Azure/skewer"
"k8s.io/klog/v2"
"regexp"
"strings"
@@ -61,24 +59,19 @@ var GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*Inst

// GetVMSSTypeDynamically fetches VMSS instance information using SKU API calls.
// It is declared as a variable for testing purpose.
var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, skuClient compute20190701.ResourceSkusClient) (InstanceType, error) {
var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) {
ctx := context.Background()
var sku skewer.SKU
var vmssType InstanceType

cache, err := skewer.NewCache(ctx, skewer.WithLocation(*template.Location), skewer.WithResourceClient(skuClient))
if err != nil {
klog.V(1).Infof("Failed to instantiate cache, err: %v", err)
return vmssType, err
}

sku, err = cache.Get(ctx, *template.Sku.Name, skewer.VirtualMachines, *template.Location)
sku, err := azCache.GetSKU(ctx, *template.Sku.Name, *template.Location)
if err != nil {
// We didn't find an exact match but this is a promo type, check for matching standard
klog.V(1).Infof("No exact match found for %s, checking standard types. Error %v", *template.Sku.Name, err)
promoRe := regexp.MustCompile(`(?i)_promo`)
skuName := promoRe.ReplaceAllString(*template.Sku.Name, "")
sku, err = cache.Get(context.Background(), skuName, skewer.VirtualMachines, *template.Location)
if skuName != *template.Sku.Name {
klog.V(1).Infof("No exact match found for %q, checking standard type %q. Error %v", *template.Sku.Name, skuName, err)
sku, err = azCache.GetSKU(ctx, skuName, *template.Location)
}
if err != nil {
return vmssType, fmt.Errorf("instance type %q not supported. Error %v", *template.Sku.Name, err)
}
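The reworked fallback only issues a second SKU lookup when stripping the _promo suffix actually changes the name, so non-promo SKUs now fail after a single lookup instead of two. A small, self-contained illustration of that string handling (the SKU names are made up for the example):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	promoRe := regexp.MustCompile(`(?i)_promo`)

	// Promo SKU: the suffix is stripped, so the standard name would be retried.
	fmt.Println(promoRe.ReplaceAllString("Standard_DS2_v2_Promo", "")) // Standard_DS2_v2

	// Non-promo SKU: the name is unchanged, so no second lookup is made.
	fmt.Println(promoRe.ReplaceAllString("Standard_D4s_v3", "")) // Standard_D4s_v3
}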