Skip to content

Commit

Permalink
Implement GetInstanceID for Azure and make instanceID to lower cases
Browse files Browse the repository at this point in the history
  • Loading branch information
feiskyer committed Feb 27, 2019
1 parent a9758b2 commit 2758133
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 76 deletions.
11 changes: 6 additions & 5 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"math/rand"
"sort"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -132,7 +133,7 @@ func (as *AgentPool) GetVMIndexes() ([]int, map[int]string, error) {
}

indexes = append(indexes, index)
indexToVM[index] = "azure://" + *instance.ID
indexToVM[index] = "azure://" + strings.ToLower(*instance.ID)
}

sortedIndexes := sort.IntSlice(indexes)
Expand Down Expand Up @@ -242,7 +243,7 @@ func (as *AgentPool) GetVirtualMachines() (instances []compute.VirtualMachine, e

tags := instance.Tags
vmPoolName := tags["poolName"]
if vmPoolName == nil || *vmPoolName != as.Id() {
if vmPoolName == nil || !strings.EqualFold(*vmPoolName, as.Id()) {
continue
}

Expand Down Expand Up @@ -292,7 +293,7 @@ func (as *AgentPool) Belongs(node *apiv1.Node) (bool, error) {
if targetAsg == nil {
return false, fmt.Errorf("%s doesn't belong to a known agent pool", node.Name)
}
if targetAsg.Id() != as.Id() {
if !strings.EqualFold(targetAsg.Id(), as.Id()) {
return false, nil
}
return true, nil
Expand All @@ -315,7 +316,7 @@ func (as *AgentPool) DeleteInstances(instances []*azureRef) error {
return err
}

if asg != commonAsg {
if !strings.EqualFold(asg.Id(), commonAsg.Id()) {
return fmt.Errorf("cannot delete instance (%s) which don't belong to the same node pool (%q)", instance.GetKey(), commonAsg)
}
}
Expand Down Expand Up @@ -398,7 +399,7 @@ func (as *AgentPool) Nodes() ([]cloudprovider.Instance, error) {
}

// To keep consistent with providerID from kubernetes cloud provider, do not convert ID to lower case.
name := "azure://" + *instance.ID
name := "azure://" + strings.ToLower(*instance.ID)
nodes = append(nodes, cloudprovider.Instance{Id: name})
}

Expand Down
41 changes: 29 additions & 12 deletions cluster-autoscaler/cloudprovider/azure/azure_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"reflect"
"regexp"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -63,7 +64,7 @@ func (m *asgCache) Register(asg cloudprovider.NodeGroup) bool {
defer m.mutex.Unlock()

for i := range m.registeredAsgs {
if existing := m.registeredAsgs[i]; existing.Id() == asg.Id() {
if existing := m.registeredAsgs[i]; strings.EqualFold(existing.Id(), asg.Id()) {
if reflect.DeepEqual(existing, asg) {
return false
}
Expand Down Expand Up @@ -94,7 +95,7 @@ func (m *asgCache) Unregister(asg cloudprovider.NodeGroup) bool {
updated := make([]cloudprovider.NodeGroup, 0, len(m.registeredAsgs))
changed := false
for _, existing := range m.registeredAsgs {
if existing.Id() == asg.Id() {
if strings.EqualFold(existing.Id(), asg.Id()) {
klog.V(1).Infof("Unregistered ASG %s", asg.Id())
changed = true
continue
Expand All @@ -117,42 +118,46 @@ func (m *asgCache) FindForInstance(instance *azureRef, vmType string) (cloudprov
m.mutex.Lock()
defer m.mutex.Unlock()

if m.notInRegisteredAsg[*instance] {
inst := azureRef{Name: strings.ToLower(instance.Name)}
if m.notInRegisteredAsg[inst] {
// We already know we don't own this instance. Return early and avoid
// additional calls.
return nil, nil
}

if vmType == vmTypeVMSS {
// Omit virtual machines not managed by vmss.
if ok := virtualMachineRE.Match([]byte(instance.Name)); ok {
if ok := virtualMachineRE.Match([]byte(inst.Name)); ok {
klog.V(3).Infof("Instance %q is not managed by vmss, omit it in autoscaler", instance.Name)
m.notInRegisteredAsg[*instance] = true
m.notInRegisteredAsg[inst] = true
return nil, nil
}
}

if vmType == vmTypeStandard {
// Omit virtual machines with providerID not in Azure resource ID format.
if ok := virtualMachineRE.Match([]byte(instance.Name)); !ok {
if ok := virtualMachineRE.Match([]byte(inst.Name)); !ok {
klog.V(3).Infof("Instance %q is not in Azure resource ID format, omit it in autoscaler", instance.Name)
m.notInRegisteredAsg[*instance] = true
m.notInRegisteredAsg[inst] = true
return nil, nil
}
}

if asg, found := m.instanceToAsg[*instance]; found {
// Look up caches for the instance.
if asg := m.getInstanceFromCache(inst.Name); asg != nil {
return asg, nil
}

// Not found, regenerate the cache and try again.
if err := m.regenerate(); err != nil {
return nil, fmt.Errorf("error while looking for ASG for instance %+v, error: %v", *instance, err)
return nil, fmt.Errorf("error while looking for ASG for instance %q, error: %v", instance.Name, err)
}
if config, found := m.instanceToAsg[*instance]; found {
return config, nil
if asg := m.getInstanceFromCache(inst.Name); asg != nil {
return asg, nil
}

m.notInRegisteredAsg[*instance] = true
// Add the instance to notInRegisteredAsg since it's unknown from Azure.
m.notInRegisteredAsg[inst] = true
return nil, nil
}

Expand All @@ -179,3 +184,15 @@ func (m *asgCache) regenerate() error {
m.instanceToAsg = newCache
return nil
}

// Get node group from cache. nil would be return if not found.
// Should be call with lock protected.
func (m *asgCache) getInstanceFromCache(providerID string) cloudprovider.NodeGroup {
for instanceID, asg := range m.instanceToAsg {
if strings.EqualFold(instanceID.GetKey(), providerID) {
return asg
}
}

return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package azure
import (
"io"
"os"
"strings"

"k8s.io/klog"

Expand Down Expand Up @@ -110,6 +111,11 @@ func (azure *AzureCloudProvider) Refresh() error {
return azure.azureManager.Refresh()
}

// GetInstanceID gets the instance ID for the specified node.
func (azure *AzureCloudProvider) GetInstanceID(node *apiv1.Node) string {
return strings.ToLower(node.Spec.ProviderID)
}

// azureRef contains a reference to some entity in Azure world.
type azureRef struct {
Name string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ func (agentPool *ContainerServiceAgentPool) GetAKSAgentPool(agentProfiles *[]con
for _, value := range *agentProfiles {
profileName := *value.Name
klog.V(5).Infof("AKS AgentPool profile name: %s", profileName)
if profileName == (agentPool.azureRef.Name) {
if strings.EqualFold(profileName, agentPool.azureRef.Name) {
return &value
}
}
Expand All @@ -92,7 +92,7 @@ func (agentPool *ContainerServiceAgentPool) GetACSAgentPool(agentProfiles *[]con
for _, value := range *agentProfiles {
profileName := *value.Name
klog.V(5).Infof("ACS AgentPool profile name: %s", profileName)
if profileName == (agentPool.azureRef.Name) {
if strings.EqualFold(profileName, agentPool.azureRef.Name) {
return &value
}
}
Expand All @@ -105,7 +105,7 @@ func (agentPool *ContainerServiceAgentPool) GetACSAgentPool(agentProfiles *[]con
profileName := *value.Name
poolName := agentPool.azureRef.Name + "pool0"
klog.V(5).Infof("Workaround match check - ACS AgentPool Profile: %s <=> Poolname: %s", profileName, poolName)
if profileName == poolName {
if strings.EqualFold(profileName, poolName) {
return &value
}
}
Expand Down Expand Up @@ -270,7 +270,7 @@ func (agentPool *ContainerServiceAgentPool) SetNodeCount(count int) (err error)
func (agentPool *ContainerServiceAgentPool) GetProviderID(name string) string {
//TODO: come with a generic way to make it work with provider id formats
// in different version of k8s.
return "azure://" + name
return "azure://" + strings.ToLower(name)
}

//GetName extracts the name of the node (a format which underlying cloud service understands)
Expand All @@ -285,7 +285,7 @@ func (agentPool *ContainerServiceAgentPool) GetName(providerID string) (string,
return "", err
}
for _, vm := range vms {
if strings.Compare(*vm.ID, providerID) == 0 {
if strings.EqualFold(*vm.ID, providerID) {
return *vm.Name, nil
}
}
Expand Down Expand Up @@ -398,7 +398,7 @@ func (agentPool *ContainerServiceAgentPool) IsContainerServiceNode(tags map[stri
poolName := tags["poolName"]
if poolName != nil {
klog.V(5).Infof("Matching agentPool name: %s with tag name: %s", agentPool.azureRef.Name, *poolName)
if *poolName == agentPool.azureRef.Name {
if strings.EqualFold(*poolName, agentPool.azureRef.Name) {
return true
}
}
Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/cloudprovider/azure/azure_fakes.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import (
)

const (
fakeVirtualMachineScaleSetVMID = "/subscriptions/test-subscription-id/resourceGroups/test-asg/providers/Microsoft.Compute/virtualMachineScaleSets/agents/virtualMachines/0"
fakeVirtualMachineScaleSetVMID = "/subscriptions/test-subscription-id/resourcegroups/test-asg/providers/microsoft.compute/virtualmachinescalesets/agents/virtualmachines/0"
)

// VirtualMachineScaleSetsClientMock mocks for VirtualMachineScaleSetsClient.
Expand Down
52 changes: 9 additions & 43 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ type ScaleSet struct {
mutex sync.Mutex
lastRefresh time.Time
curSize int64
// virtualMachines holds a list of vmss instances (instanceID -> resourceID).
virtualMachines map[string]string
}

// NewScaleSet creates a new NewScaleSet.
Expand All @@ -58,11 +56,10 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager) (*ScaleSet, erro
azureRef: azureRef{
Name: spec.Name,
},
minSize: spec.MinSize,
maxSize: spec.MaxSize,
manager: az,
curSize: -1,
virtualMachines: make(map[string]string),
minSize: spec.MinSize,
maxSize: spec.MaxSize,
manager: az,
curSize: -1,
}

return scaleSet, nil
Expand Down Expand Up @@ -196,55 +193,24 @@ func (scaleSet *ScaleSet) IncreaseSize(delta int) error {
// GetScaleSetVms returns list of nodes for the given scale set.
// Note that the list results is not used directly because their resource ID format
// is not consistent with Get results.
// TODO(feiskyer): use list results directly after the issue fixed in Azure VMSS API.
func (scaleSet *ScaleSet) GetScaleSetVms() ([]string, error) {
ctx, cancel := getContextWithCancel()
defer cancel()

resourceGroup := scaleSet.manager.config.ResourceGroup
result, err := scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSet.Name, "", "", "")
vmList, err := scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSet.Name, "", "", "")
if err != nil {
klog.Errorf("VirtualMachineScaleSetVMsClient.List failed for %s: %v", scaleSet.Name, err)
return nil, err
}

instanceIDs := make([]string, 0)
for _, vm := range result {
instanceIDs = append(instanceIDs, *vm.InstanceID)
}

allVMs := make([]string, 0)
for _, instanceID := range instanceIDs {
// Get from cache first.
if v, ok := scaleSet.virtualMachines[instanceID]; ok {
allVMs = append(allVMs, v)
continue
}

// Not in cache, get from Azure API.
getCtx, getCancel := getContextWithCancel()
defer getCancel()
vm, err := scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.Get(getCtx, resourceGroup, scaleSet.Name, instanceID)
if err != nil {
exists, realErr := checkResourceExistsFromError(err)
if realErr != nil {
klog.Errorf("Failed to get VirtualMachineScaleSetVM by (%s,%s), error: %v", scaleSet.Name, instanceID, err)
return nil, realErr
}

if !exists {
klog.Warningf("Couldn't find VirtualMachineScaleSetVM by (%s,%s), assuming it has been removed", scaleSet.Name, instanceID)
continue
}
}

for _, vm := range vmList {
// The resource ID is empty string, which indicates the instance may be in deleting state.
if len(*vm.ID) == 0 {
continue
}

// Save into cache.
scaleSet.virtualMachines[instanceID] = *vm.ID
allVMs = append(allVMs, *vm.ID)
}

Expand Down Expand Up @@ -294,7 +260,7 @@ func (scaleSet *ScaleSet) Belongs(node *apiv1.Node) (bool, error) {
if targetAsg == nil {
return false, fmt.Errorf("%s doesn't belong to a known scale set", node.Name)
}
if targetAsg.Id() != scaleSet.Id() {
if !strings.EqualFold(targetAsg.Id(), scaleSet.Id()) {
return false, nil
}
return true, nil
Expand All @@ -320,7 +286,7 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef) error {
return err
}

if asg != commonAsg {
if !strings.EqualFold(asg.Id(), commonAsg.Id()) {
return fmt.Errorf("cannot delete instance (%s) which don't belong to the same Scale Set (%q)", instance.Name, commonAsg)
}

Expand Down Expand Up @@ -490,7 +456,7 @@ func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) {

instances := make([]cloudprovider.Instance, 0, len(vms))
for i := range vms {
name := "azure://" + vms[i]
name := "azure://" + strings.ToLower(vms[i])
instances = append(instances, cloudprovider.Instance{Id: name})
}

Expand Down
13 changes: 4 additions & 9 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet {
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 1,
maxSize: 5,
virtualMachines: make(map[string]string),
manager: manager,
minSize: 1,
maxSize: 5,
}
}

Expand Down Expand Up @@ -92,7 +91,7 @@ func TestBelongs(t *testing.T) {

invalidNode := &apiv1.Node{
Spec: apiv1.NodeSpec{
ProviderID: "azure:///subscriptions/test-subscrition-id/resourceGroups/invalid-asg/providers/Microsoft.Compute/virtualMachineScaleSets/agents/virtualMachines/0",
ProviderID: "azure:///subscriptions/test-subscrition-id/resourcegroups/invalid-asg/providers/microsoft.compute/virtualmachinescalesets/agents/virtualmachines/0",
},
}
_, err := scaleSet.Belongs(invalidNode)
Expand Down Expand Up @@ -183,10 +182,6 @@ func TestScaleSetNodes(t *testing.T) {
ss, ok := group.(*ScaleSet)
assert.True(t, ok)
assert.NotNil(t, ss)
assert.Equal(t, ss.virtualMachines, map[string]string{
"0": fakeVirtualMachineScaleSetVMID,
})

instances, err := group.Nodes()
assert.NoError(t, err)
assert.Equal(t, len(instances), 1)
Expand Down

3 comments on commit 2758133

@jackfrancis
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@feiskyer would it be possible to cherry-pick the changes that reduce Azure API calls to older versions of cluster-autoscaler? Specifically, it'd be nice for folks running AKS Engine clusters @ Kubernetes versions v1.12 and v1.13 to be able to pick up a new version of cluster-autoscaler that would reduce API throttling events.

@feiskyer
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR has already been cherry picked to old releases:

Cluster Autoscaler 1.13 : cherry pick of #1738 #1755 Merged
Cluster Autoscaler 1.12 : cherry pick of #1738 #1756 Merged
Cluster Autoscaler 1.2 : cherry pick of #1738 #1757 Merged
Cluster Autoscaler 1.3 : cherry pick of #1738 #1758 Merged

@feiskyer
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jackfrancis so in aks-engine, we may need to upgrade CA versions for each release. Version matrix is here.

Please sign in to comment.