diff --git a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go index 605f514a97a4..e2ae266b7b3f 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go @@ -185,7 +185,7 @@ func TestGetVMsFromCache(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) testAS.manager.azClient.virtualMachinesClient = mockVMClient mockVMClient.EXPECT().List(gomock.Any(), testAS.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, testAS.manager.config.ResourceGroup, vmTypeStandard) + ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, testAS.manager.config.ResourceGroup, vmTypeStandard, false, "") assert.NoError(t, err) testAS.manager.azureCache = ac @@ -203,7 +203,7 @@ func TestGetVMIndexes(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) as.manager.azClient.virtualMachinesClient = mockVMClient mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard) + ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") assert.NoError(t, err) as.manager.azureCache = ac @@ -242,7 +242,7 @@ func TestGetCurSize(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) as.manager.azClient.virtualMachinesClient = mockVMClient mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard) + ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") assert.NoError(t, err) as.manager.azureCache = ac @@ -266,7 +266,7 @@ func TestAgentPoolTargetSize(t *testing.T) { as.manager.azClient.virtualMachinesClient = mockVMClient expectedVMs := getExpectedVMs() mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard) + ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") assert.NoError(t, err) as.manager.azureCache = ac @@ -285,7 +285,7 @@ func TestAgentPoolIncreaseSize(t *testing.T) { as.manager.azClient.virtualMachinesClient = mockVMClient expectedVMs := getExpectedVMs() mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(2) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard) + ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") assert.NoError(t, err) as.manager.azureCache = ac @@ -313,7 +313,7 @@ func TestDecreaseTargetSize(t *testing.T) { as.manager.azClient.virtualMachinesClient = mockVMClient expectedVMs := getExpectedVMs() mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(3) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard) + ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") assert.NoError(t, err) as.manager.azureCache = ac @@ -431,7 +431,7 @@ func TestAgentPoolDeleteNodes(t *testing.T) { mockSAClient := mockstorageaccountclient.NewMockInterface(ctrl) as.manager.azClient.storageAccountsClient = mockSAClient mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard) + ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") assert.NoError(t, err) as.manager.azureCache = ac @@ -497,7 +497,7 @@ func TestAgentPoolNodes(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) as.manager.azClient.virtualMachinesClient = mockVMClient mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard) + ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") assert.NoError(t, err) as.manager.azureCache = ac diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go index ddb2227cc4cc..4872c7cd555d 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cache.go @@ -17,6 +17,7 @@ limitations under the License. package azure import ( + "context" "reflect" "regexp" "strings" @@ -25,6 +26,7 @@ import ( "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute" "github.com/Azure/go-autorest/autorest/to" + "github.com/Azure/skewer" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/klog/v2" @@ -55,9 +57,10 @@ type azureCache struct { instanceToNodeGroup map[azureRef]cloudprovider.NodeGroup unownedInstances map[azureRef]bool autoscalingOptions map[azureRef]map[string]string + skus map[string]*skewer.Cache } -func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string) (*azureCache, error) { +func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string, enableDynamicInstanceList bool, defaultLocation string) (*azureCache, error) { cache := &azureCache{ interrupt: make(chan struct{}), azClient: client, @@ -70,6 +73,11 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmTy instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup), unownedInstances: make(map[azureRef]bool), autoscalingOptions: make(map[azureRef]map[string]string), + skus: make(map[string]*skewer.Cache), + } + + if enableDynamicInstanceList { + cache.skus[defaultLocation] = &skewer.Cache{} } if err := cache.regenerate(); err != nil { @@ -131,11 +139,21 @@ func (m *azureCache) regenerate() error { newAutoscalingOptions[ref] = options } + newSkuCache := make(map[string]*skewer.Cache) + for location := range m.skus { + cache, err := m.fetchSKUs(context.Background(), location) + if err != nil { + return err + } + newSkuCache[location] = cache + } + m.mutex.Lock() defer m.mutex.Unlock() m.instanceToNodeGroup = newInstanceToNodeGroupCache m.autoscalingOptions = newAutoscalingOptions + m.skus = newSkuCache // Reset unowned instances cache. m.unownedInstances = make(map[azureRef]bool) @@ -264,6 +282,31 @@ func (m *azureCache) Unregister(nodeGroup cloudprovider.NodeGroup) bool { return changed } +func (m *azureCache) fetchSKUs(ctx context.Context, location string) (*skewer.Cache, error) { + return skewer.NewCache(ctx, + skewer.WithLocation(location), + skewer.WithResourceClient(m.azClient.skuClient), + ) +} + +func (m *azureCache) GetSKU(ctx context.Context, skuName, location string) (skewer.SKU, error) { + m.mutex.Lock() + defer m.mutex.Unlock() + + cache, ok := m.skus[location] + if !ok { + var err error + cache, err = m.fetchSKUs(ctx, location) + if err != nil { + klog.V(1).Infof("Failed to instantiate cache, err: %v", err) + return skewer.SKU{}, err + } + m.skus[location] = cache + } + + return cache.Get(ctx, skuName, skewer.VirtualMachines, location) +} + func (m *azureCache) getRegisteredNodeGroups() []cloudprovider.NodeGroup { m.mutex.Lock() defer m.mutex.Unlock() diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go index 9e53e4bd391f..de3b7d08c2ec 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go @@ -75,7 +75,7 @@ func newTestAzureManager(t *testing.T) *AzureManager { }, } - cache, error := newAzureCache(manager.azClient, refreshInterval, manager.config.ResourceGroup, vmTypeVMSS) + cache, error := newAzureCache(manager.azClient, refreshInterval, manager.config.ResourceGroup, vmTypeVMSS, false, "") assert.NoError(t, error) manager.azureCache = cache diff --git a/cluster-autoscaler/cloudprovider/azure/azure_instance.go b/cluster-autoscaler/cloudprovider/azure/azure_instance.go index 5a6bf3ce1253..7479631561ac 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_instance.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_instance.go @@ -19,9 +19,7 @@ package azure import ( "context" "fmt" - compute20190701 "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute" - "github.com/Azure/skewer" "k8s.io/klog/v2" "regexp" "strings" @@ -61,24 +59,19 @@ var GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*Inst // GetVMSSTypeDynamically fetched vmss instance information using sku api calls. // It is declared as a variable for testing purpose. -var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, skuClient compute20190701.ResourceSkusClient) (InstanceType, error) { +var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { ctx := context.Background() - var sku skewer.SKU var vmssType InstanceType - cache, err := skewer.NewCache(ctx, skewer.WithLocation(*template.Location), skewer.WithResourceClient(skuClient)) - if err != nil { - klog.V(1).Infof("Failed to instantiate cache, err: %v", err) - return vmssType, err - } - - sku, err = cache.Get(ctx, *template.Sku.Name, skewer.VirtualMachines, *template.Location) + sku, err := azCache.GetSKU(ctx, *template.Sku.Name, *template.Location) if err != nil { // We didn't find an exact match but this is a promo type, check for matching standard - klog.V(1).Infof("No exact match found for %s, checking standard types. Error %v", *template.Sku.Name, err) promoRe := regexp.MustCompile(`(?i)_promo`) skuName := promoRe.ReplaceAllString(*template.Sku.Name, "") - sku, err = cache.Get(context.Background(), skuName, skewer.VirtualMachines, *template.Location) + if skuName != *template.Sku.Name { + klog.V(1).Infof("No exact match found for %q, checking standard type %q. Error %v", *template.Sku.Name, skuName, err) + sku, err = azCache.GetSKU(ctx, skuName, *template.Location) + } if err != nil { return vmssType, fmt.Errorf("instance type %q not supported. Error %v", *template.Sku.Name, err) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager.go b/cluster-autoscaler/cloudprovider/azure/azure_manager.go index ddf5288c7c04..dc04e1558281 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager.go @@ -91,7 +91,7 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi if cfg.VmssCacheTTL != 0 { cacheTTL = time.Duration(cfg.VmssCacheTTL) * time.Second } - cache, err := newAzureCache(azClient, cacheTTL, cfg.ResourceGroup, cfg.VMType) + cache, err := newAzureCache(azClient, cacheTTL, cfg.ResourceGroup, cfg.VMType, cfg.EnableDynamicInstanceList, cfg.Location) if err != nil { return nil, err } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 260c14eedd81..40a10b0b666b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -479,8 +479,7 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro return nil, err } - node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager.azClient.skuClient, - scaleSet.enableDynamicInstanceList) + node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager) if err != nil { return nil, err } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index ae32ad7c3d4e..041871bdb248 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -26,7 +26,6 @@ import ( "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient" "testing" - compute20190701 "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute" "github.com/Azure/go-autorest/autorest/to" "github.com/golang/mock/gomock" @@ -573,7 +572,7 @@ func TestTemplateNodeInfo(t *testing.T) { assert.NotEmpty(t, nodeInfo.Pods) t.Run("Checking dynamic workflow", func(t *testing.T) { - GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, skuClient compute20190701.ResourceSkusClient) (InstanceType, error) { + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { vmssType := InstanceType{} vmssType.VCPU = 1 vmssType.GPU = 2 @@ -587,7 +586,7 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Checking static workflow if dynamic fails", func(t *testing.T) { - GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, skuClient compute20190701.ResourceSkusClient) (InstanceType, error) { + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { return InstanceType{}, fmt.Errorf("dynamic error exists") } GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*InstanceType, error) { @@ -604,7 +603,7 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Fails to find vmss instance information using static and dynamic workflow, instance not supported", func(t *testing.T) { - GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, skuClient compute20190701.ResourceSkusClient) (InstanceType, error) { + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { return InstanceType{}, fmt.Errorf("dynamic error exists") } GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*InstanceType, error) { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template.go b/cluster-autoscaler/cloudprovider/azure/azure_template.go index 1124e847cea4..e52d090ccb75 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template.go @@ -18,7 +18,6 @@ package azure import ( "fmt" - compute20190701 "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -73,8 +72,7 @@ func buildGenericLabels(template compute.VirtualMachineScaleSet, nodeName string return result } -func buildNodeFromTemplate(scaleSetName string, - template compute.VirtualMachineScaleSet, skuClient compute20190701.ResourceSkusClient, enableDynamicInstanceList bool) (*apiv1.Node, error) { +func buildNodeFromTemplate(scaleSetName string, template compute.VirtualMachineScaleSet, manager *AzureManager) (*apiv1.Node, error) { node := apiv1.Node{} nodeName := fmt.Sprintf("%s-asg-%d", scaleSetName, rand.Int63()) @@ -92,10 +90,10 @@ func buildNodeFromTemplate(scaleSetName string, // Fetching SKU information from SKU API if enableDynamicInstanceList is true. var dynamicErr error - if enableDynamicInstanceList { + if manager.config.EnableDynamicInstanceList { var vmssTypeDynamic InstanceType klog.V(1).Infof("Fetching instance information for SKU: %s from SKU API", *template.Sku.Name) - vmssTypeDynamic, dynamicErr = GetVMSSTypeDynamically(template, skuClient) + vmssTypeDynamic, dynamicErr = GetVMSSTypeDynamically(template, manager.azureCache) if dynamicErr == nil { vcpu = vmssTypeDynamic.VCPU gpuCount = vmssTypeDynamic.GPU @@ -104,7 +102,7 @@ func buildNodeFromTemplate(scaleSetName string, klog.Errorf("Dynamically fetching of instance information from SKU api failed with error: %v", dynamicErr) } } - if !enableDynamicInstanceList || dynamicErr != nil { + if !manager.config.EnableDynamicInstanceList || dynamicErr != nil { klog.V(1).Infof("Falling back to static SKU list for SKU: %s", *template.Sku.Name) // fall-back on static list of vmss if dynamic workflow fails. vmssTypeStatic, staticErr := GetVMSSTypeStatically(template)