diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md index 9fd9a51a134e..540b0588ddff 100644 --- a/cluster-autoscaler/cloudprovider/azure/README.md +++ b/cluster-autoscaler/cloudprovider/azure/README.md @@ -151,7 +151,8 @@ Make a copy of [cluster-autoscaler-standard-master.yaml](examples/cluster-autosc In the `cluster-autoscaler` spec, find the `image:` field and replace `{{ ca_version }}` with a specific cluster autoscaler release. -Below that, in the `command:` section, update the `--nodes=` arguments to reference your node limits and node pool name. For example, if node pool "k8s-nodepool-1" should scale from 1 to 10 nodes: +Below that, in the `command:` section, update the `--nodes=` arguments to reference your node limits and node pool name (tip: the node pool name is NOT the availability set name; e.g., the node pool name corresponding to the availability set +`agentpool1-availabilitySet-xxxxxxxx` is `agentpool1`). For example, if node pool "k8s-nodepool-1" should scale from 1 to 10 nodes: ```yaml - --nodes=1:10:k8s-nodepool-1 diff --git a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go index 463f90663db3..642248ae261a 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go @@ -36,6 +36,16 @@ import ( schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo" ) +var ( + vmInstancesRefreshPeriod = 5 * time.Minute +) + +var virtualMachinesStatusCache struct { + lastRefresh time.Time + mutex sync.Mutex + virtualMachines []compute.VirtualMachine +} + // AgentPool implements NodeGroup interface for agent pools deployed by aks-engine. 
type AgentPool struct { azureRef @@ -117,9 +127,32 @@ func (as *AgentPool) MaxSize() int { return as.maxSize } +func (as *AgentPool) getVirtualMachinesFromCache() ([]compute.VirtualMachine, error) { + virtualMachinesStatusCache.mutex.Lock() + defer virtualMachinesStatusCache.mutex.Unlock() + + if virtualMachinesStatusCache.lastRefresh.Add(vmInstancesRefreshPeriod).After(time.Now()) { + return virtualMachinesStatusCache.virtualMachines, nil + } + + vms, err := as.GetVirtualMachines() + if err != nil { + if isAzureRequestsThrottled(err) { + klog.Warningf("getAllVirtualMachines: throttling with message %v, would return the cached vms", err) + return virtualMachinesStatusCache.virtualMachines, nil + } + + return []compute.VirtualMachine{}, err + } + virtualMachinesStatusCache.virtualMachines = vms + virtualMachinesStatusCache.lastRefresh = time.Now() + + return vms, err +} + // GetVMIndexes gets indexes of all virtual machines belonging to the agent pool. func (as *AgentPool) GetVMIndexes() ([]int, map[int]string, error) { - instances, err := as.GetVirtualMachines() + instances, err := as.getVirtualMachinesFromCache() if err != nil { return nil, nil, err } @@ -266,7 +299,7 @@ func (as *AgentPool) DecreaseTargetSize(delta int) error { as.mutex.Lock() defer as.mutex.Unlock() - nodes, err := as.GetVirtualMachines() + nodes, err := as.getVirtualMachinesFromCache() if err != nil { return err } @@ -391,7 +424,7 @@ func (as *AgentPool) TemplateNodeInfo() (*schedulernodeinfo.NodeInfo, error) { // Nodes returns a list of all nodes that belong to this node group. func (as *AgentPool) Nodes() ([]cloudprovider.Instance, error) { - instances, err := as.GetVirtualMachines() + instances, err := as.getVirtualMachinesFromCache() if err != nil { return nil, err }