[CENG-663] VPA PCI labels #78

Open
wants to merge 23 commits into master

23 commits
1dfe22b
[local] Define Datadog custom variables and accessors
bpineau May 31, 2021
8ce700f
[local] Datadog's own podlistprocessor
bpineau May 6, 2021
8411596
[local] Hook in Datadog's podlistprocessor
bpineau Sep 18, 2021
578c711
[local] Long pending pods filter
bpineau May 10, 2021
694d7e2
[local] Set local-data custom resource on fresh nodes
bpineau May 31, 2021
8d653a0
[local] Datadog's template only nodeinfos provider
bpineau Jul 7, 2021
4874d6d
[local] Hook in template only nodeinfos provider
bpineau Sep 20, 2021
4562c5b
[local] Add podTemplateProcessor option
clamoriniere Sep 20, 2021
2fedbb3
[local] Add NodeInfos PodTemplate processor implementation
clamoriniere Sep 20, 2021
48b24ee
[local] Use PodTemplate Processor in NodeInfosProcessor
clamoriniere Sep 20, 2021
e1ef7a2
Don't deref nil nodegroup in deleteCreatedNodesWithErrors
bpineau May 30, 2022
3c061e6
Update Azure instance-types
bpineau Jul 21, 2022
a71a35d
[local] Backward compatible spread topology - hack
bpineau Aug 19, 2022
9162a2c
Azure: effectively cache instance-types SKUs
bpineau Jul 25, 2022
c248329
[local] Support topolvm/openebs storage for scaling decisions
dhenkel92 Aug 10, 2022
8448683
Merge pull request #52 from DataDog/dhenkel/support-lvm-storage
dhenkel92 Aug 24, 2022
a685d28
Add support for extended resource definition in GCE MIG template
zaymat Oct 11, 2022
433cce6
Code Review: Do not return an error on malformed extended_resource + …
zaymat Oct 19, 2022
b447a03
Merge pull request #64 from DataDog/mayeul/cherry-pick/add-extended-r…
zaymat Oct 25, 2022
3b34477
[local] AWS: don't cache empty instance-types
bpineau Jan 11, 2023
9faa5e4
gRPC expander: allow realistic server responses, and log errors
bpineau Jan 16, 2023
55e6ce7
Create labeler.yml
tedcm Feb 16, 2023
2e02a68
Create labeler.yml
tedcm Feb 16, 2023
4 changes: 4 additions & 0 deletions .github/labeler.yml
@@ -0,0 +1,4 @@
# This configures the GitHub labeler to automatically apply the "pci" label to any PCI-related change, which then requires an independent PR review.

pci:
- vertical-pod-autoscaler/**/* #Directory that must comply with PCI change management
11 changes: 11 additions & 0 deletions .github/workflows/labeler.yml
@@ -0,0 +1,11 @@
name: Pull Request Labeler

on: [pull_request]

jobs:
label:
runs-on: ubuntu-latest
steps:
- uses: actions/labeler@v2
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"
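Taken together, these two files wire automatic path-based labeling into every pull request: the workflow runs actions/labeler on each pull_request event, and the action reads .github/labeler.yml to decide which labels to apply. As a minimal sketch of how the configuration format extends (the second glob below is hypothetical and not part of this PR), each label name maps to a list of glob patterns:

pci:
  - vertical-pod-autoscaler/**/*     # existing PCI-scoped tree from this PR
  - some-other-pci-directory/**/*    # hypothetical additional path, shown only to illustrate the format

Any file touched by a pull request that matches one of these patterns causes the labeler to add the pci label to that PR.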
11 changes: 9 additions & 2 deletions cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go
@@ -101,8 +101,15 @@ func newASGCache(awsService *awsWrapper, explicitSpecs []string, autoDiscoverySp
var getInstanceTypeForAsg = func(m *asgCache, group *asg) (string, error) {
if obj, found, _ := m.asgInstanceTypeCache.GetByKey(group.AwsRef.Name); found {
return obj.(instanceTypeCachedObject).instanceType, nil
} else if result, err := m.awsService.getInstanceTypesForAsgs([]*asg{group}); err == nil {
return result[group.AwsRef.Name], nil
}

result, err := m.awsService.getInstanceTypesForAsgs([]*asg{group})
if err != nil {
return "", fmt.Errorf("could not get instance type for %s: %w", group.AwsRef.Name, err)
}

if instanceType, ok := result[group.AwsRef.Name]; ok {
return instanceType, nil
}

return "", fmt.Errorf("could not find instance type for %s", group.AwsRef.Name)
6 changes: 5 additions & 1 deletion cluster-autoscaler/cloudprovider/aws/aws_wrapper.go
@@ -293,9 +293,13 @@ func (m *awsWrapper) getInstanceTypesForAsgs(asgs []*asg) (map[string]string, er
}

for asgName, cfgName := range launchConfigsToQuery {
if instanceType, ok := launchConfigs[cfgName]; !ok || instanceType == "" {
klog.Warningf("Could not fetch %q launch configuration for ASG %q", cfgName, asgName)
continue
}
results[asgName] = launchConfigs[cfgName]
}
klog.V(4).Infof("Successfully queried %d launch configurations", len(launchConfigsToQuery))
klog.V(4).Infof("Successfully queried %d launch configurations", len(launchConfigs))

// Have to query LaunchTemplates one-at-a-time, since there's no way to query <lt, version> pairs in bulk
for asgName, lt := range launchTemplatesToQuery {
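This guard changes what happens when the launch-configuration lookup comes back without a usable entry for an ASG: instead of writing an empty instance type into results, the ASG is skipped with a warning, and the success log now counts the launch configurations actually returned (len(launchConfigs)) rather than the names requested. A condensed sketch of the resulting loop, with launchConfigs mapping launch configuration name to instance type and launchConfigsToQuery mapping ASG name to launch configuration name (both built earlier in this function):

for asgName, cfgName := range launchConfigsToQuery {
	instanceType, ok := launchConfigs[cfgName]
	if !ok || instanceType == "" {
		// Missing or empty entry: skip this ASG rather than recording "" as its type.
		klog.Warningf("Could not fetch %q launch configuration for ASG %q", cfgName, asgName)
		continue
	}
	results[asgName] = instanceType
}
klog.V(4).Infof("Successfully queried %d launch configurations", len(launchConfigs))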
16 changes: 8 additions & 8 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go
@@ -185,7 +185,7 @@ func TestGetVMsFromCache(t *testing.T) {
mockVMClient := mockvmclient.NewMockInterface(ctrl)
testAS.manager.azClient.virtualMachinesClient = mockVMClient
mockVMClient.EXPECT().List(gomock.Any(), testAS.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, testAS.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, testAS.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
testAS.manager.azureCache = ac

@@ -203,7 +203,7 @@ func TestGetVMIndexes(t *testing.T) {
mockVMClient := mockvmclient.NewMockInterface(ctrl)
as.manager.azClient.virtualMachinesClient = mockVMClient
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -242,7 +242,7 @@ func TestGetCurSize(t *testing.T) {
mockVMClient := mockvmclient.NewMockInterface(ctrl)
as.manager.azClient.virtualMachinesClient = mockVMClient
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -266,7 +266,7 @@ func TestAgentPoolTargetSize(t *testing.T) {
as.manager.azClient.virtualMachinesClient = mockVMClient
expectedVMs := getExpectedVMs()
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -285,7 +285,7 @@ func TestAgentPoolIncreaseSize(t *testing.T) {
as.manager.azClient.virtualMachinesClient = mockVMClient
expectedVMs := getExpectedVMs()
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(2)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -313,7 +313,7 @@ func TestDecreaseTargetSize(t *testing.T) {
as.manager.azClient.virtualMachinesClient = mockVMClient
expectedVMs := getExpectedVMs()
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(3)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -431,7 +431,7 @@ func TestAgentPoolDeleteNodes(t *testing.T) {
mockSAClient := mockstorageaccountclient.NewMockInterface(ctrl)
as.manager.azClient.storageAccountsClient = mockSAClient
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac

@@ -497,7 +497,7 @@ func TestAgentPoolNodes(t *testing.T) {
mockVMClient := mockvmclient.NewMockInterface(ctrl)
as.manager.azClient.virtualMachinesClient = mockVMClient
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard)
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
assert.NoError(t, err)
as.manager.azureCache = ac
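All of the test changes above follow from one signature change: newAzureCache now takes two extra arguments, enableDynamicInstanceList and defaultLocation, and the tests pass false and "" to preserve the old behavior. A hypothetical call with the new code path enabled might look like this (the location value is purely illustrative):

// Hypothetical: enable dynamic instance listing, seeding the SKU cache for "eastus".
ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, true, "eastus")
assert.NoError(t, err)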

45 changes: 44 additions & 1 deletion cluster-autoscaler/cloudprovider/azure/azure_cache.go
@@ -17,6 +17,7 @@ limitations under the License.
package azure

import (
"context"
"reflect"
"regexp"
"strings"
@@ -25,6 +26,7 @@ import (

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute"
"github.com/Azure/go-autorest/autorest/to"
"github.com/Azure/skewer"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"

"k8s.io/klog/v2"
@@ -55,9 +57,10 @@ type azureCache struct {
instanceToNodeGroup map[azureRef]cloudprovider.NodeGroup
unownedInstances map[azureRef]bool
autoscalingOptions map[azureRef]map[string]string
skus map[string]*skewer.Cache
}

func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string) (*azureCache, error) {
func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string, enableDynamicInstanceList bool, defaultLocation string) (*azureCache, error) {
cache := &azureCache{
interrupt: make(chan struct{}),
azClient: client,
@@ -70,6 +73,11 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmTy
instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup),
unownedInstances: make(map[azureRef]bool),
autoscalingOptions: make(map[azureRef]map[string]string),
skus: make(map[string]*skewer.Cache),
}

if enableDynamicInstanceList {
cache.skus[defaultLocation] = &skewer.Cache{}
}

if err := cache.regenerate(); err != nil {
@@ -131,11 +139,21 @@ func (m *azureCache) regenerate() error {
newAutoscalingOptions[ref] = options
}

newSkuCache := make(map[string]*skewer.Cache)
for location := range m.skus {
cache, err := m.fetchSKUs(context.Background(), location)
if err != nil {
return err
}
newSkuCache[location] = cache
}

m.mutex.Lock()
defer m.mutex.Unlock()

m.instanceToNodeGroup = newInstanceToNodeGroupCache
m.autoscalingOptions = newAutoscalingOptions
m.skus = newSkuCache

// Reset unowned instances cache.
m.unownedInstances = make(map[azureRef]bool)
@@ -264,6 +282,31 @@ func (m *azureCache) Unregister(nodeGroup cloudprovider.NodeGroup) bool {
return changed
}

func (m *azureCache) fetchSKUs(ctx context.Context, location string) (*skewer.Cache, error) {
return skewer.NewCache(ctx,
skewer.WithLocation(location),
skewer.WithResourceClient(m.azClient.skuClient),
)
}

func (m *azureCache) GetSKU(ctx context.Context, skuName, location string) (skewer.SKU, error) {
m.mutex.Lock()
defer m.mutex.Unlock()

cache, ok := m.skus[location]
if !ok {
var err error
cache, err = m.fetchSKUs(ctx, location)
if err != nil {
klog.V(1).Infof("Failed to instantiate cache, err: %v", err)
return skewer.SKU{}, err
}
m.skus[location] = cache
}

return cache.Get(ctx, skuName, skewer.VirtualMachines, location)
}

func (m *azureCache) getRegisteredNodeGroups() []cloudprovider.NodeGroup {
m.mutex.Lock()
defer m.mutex.Unlock()
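The net effect of this file is a per-location skewer SKU cache owned by azureCache: when dynamic instance listing is enabled the default location is pre-registered in the constructor, regenerate() refreshes every known location on the normal cache cycle, and GetSKU lazily fetches and memoizes any location it has not seen yet. A minimal usage sketch (the SKU name and location are illustrative, and azCache stands for an *azureCache built as above):

// First lookup for "eastus" populates that location's skewer cache; later
// lookups reuse it instead of re-listing resource SKUs against the API.
sku, err := azCache.GetSKU(context.Background(), "Standard_D4s_v3", "eastus")
if err != nil {
	klog.Warningf("SKU lookup for %q in eastus failed: %v", "Standard_D4s_v3", err)
} else {
	// Callers (for example the VMSS template path in the next file) read
	// capacity information off the returned skewer.SKU value.
	_ = sku
}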
@@ -75,7 +75,7 @@ func newTestAzureManager(t *testing.T) *AzureManager {
},
}

cache, error := newAzureCache(manager.azClient, refreshInterval, manager.config.ResourceGroup, vmTypeVMSS)
cache, error := newAzureCache(manager.azClient, refreshInterval, manager.config.ResourceGroup, vmTypeVMSS, false, "")
assert.NoError(t, error)

manager.azureCache = cache
19 changes: 6 additions & 13 deletions cluster-autoscaler/cloudprovider/azure/azure_instance.go
@@ -19,9 +19,7 @@ package azure
import (
"context"
"fmt"
compute20190701 "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute"
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute"
"github.com/Azure/skewer"
"k8s.io/klog/v2"
"regexp"
"strings"
@@ -61,24 +59,19 @@ var GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*Inst

// GetVMSSTypeDynamically fetches VMSS instance information using SKU API calls.
// It is declared as a variable for testing purpose.
var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, skuClient compute20190701.ResourceSkusClient) (InstanceType, error) {
var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) {
ctx := context.Background()
var sku skewer.SKU
var vmssType InstanceType

cache, err := skewer.NewCache(ctx, skewer.WithLocation(*template.Location), skewer.WithResourceClient(skuClient))
if err != nil {
klog.V(1).Infof("Failed to instantiate cache, err: %v", err)
return vmssType, err
}

sku, err = cache.Get(ctx, *template.Sku.Name, skewer.VirtualMachines, *template.Location)
sku, err := azCache.GetSKU(ctx, *template.Sku.Name, *template.Location)
if err != nil {
// We didn't find an exact match but this is a promo type, check for matching standard
klog.V(1).Infof("No exact match found for %s, checking standard types. Error %v", *template.Sku.Name, err)
promoRe := regexp.MustCompile(`(?i)_promo`)
skuName := promoRe.ReplaceAllString(*template.Sku.Name, "")
sku, err = cache.Get(context.Background(), skuName, skewer.VirtualMachines, *template.Location)
if skuName != *template.Sku.Name {
klog.V(1).Infof("No exact match found for %q, checking standard type %q. Error %v", *template.Sku.Name, skuName, err)
sku, err = azCache.GetSKU(ctx, skuName, *template.Location)
}
if err != nil {
return vmssType, fmt.Errorf("instance type %q not supported. Error %v", *template.Sku.Name, err)
}
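The reworked fallback only issues a second SKU lookup when stripping the _promo suffix actually changes the name, so non-promo SKUs now fail after a single lookup instead of two. A small, self-contained illustration of that string handling (the SKU names are made up for the example):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	promoRe := regexp.MustCompile(`(?i)_promo`)

	// Promo SKU: the suffix is stripped, so the standard name would be retried.
	fmt.Println(promoRe.ReplaceAllString("Standard_DS2_v2_Promo", "")) // Standard_DS2_v2

	// Non-promo SKU: the name is unchanged, so no second lookup is made.
	fmt.Println(promoRe.ReplaceAllString("Standard_D4s_v3", "")) // Standard_D4s_v3
}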