Skip to content

Commit

Permalink
Merge pull request #5767 from DataDog/bugfix-azure-prevent-unneeded-s…
Browse files Browse the repository at this point in the history
…caledown

Azure: Bugfix: Check PowerState before setting OutOfResources on instance
  • Loading branch information
k8s-ci-robot authored Jul 5, 2023
2 parents 4606cdf + dbff9be commit b569db4
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 52 deletions.
44 changes: 31 additions & 13 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ func (scaleSet *ScaleSet) GetScaleSetVms() ([]compute.VirtualMachineScaleSetVM,
defer cancel()

resourceGroup := scaleSet.manager.config.ResourceGroup
vmList, rerr := scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSet.Name, "")
vmList, rerr := scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSet.Name, "instanceView")
klog.V(4).Infof("GetScaleSetVms: scaleSet.Name: %s, vmList: %v", scaleSet.Name, vmList)
if rerr != nil {
klog.Errorf("VirtualMachineScaleSetVMsClient.List failed for %s: %v", scaleSet.Name, rerr)
Expand Down Expand Up @@ -612,18 +612,26 @@ func buildInstanceCache(vmList interface{}) []cloudprovider.Instance {
switch vms := vmList.(type) {
case []compute.VirtualMachineScaleSetVM:
for _, vm := range vms {
addInstanceToCache(&instances, vm.ID, vm.ProvisioningState)
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}
addInstanceToCache(&instances, vm.ID, vm.ProvisioningState, powerState)
}
case []compute.VirtualMachine:
for _, vm := range vms {
addInstanceToCache(&instances, vm.ID, vm.ProvisioningState)
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}
addInstanceToCache(&instances, vm.ID, vm.ProvisioningState, powerState)
}
}

return instances
}

func addInstanceToCache(instances *[]cloudprovider.Instance, id *string, provisioningState *string) {
func addInstanceToCache(instances *[]cloudprovider.Instance, id *string, provisioningState *string, powerState string) {
// The resource ID is empty string, which indicates the instance may be in deleting state.
if len(*id) == 0 {
return
Expand All @@ -638,7 +646,7 @@ func addInstanceToCache(instances *[]cloudprovider.Instance, id *string, provisi

*instances = append(*instances, cloudprovider.Instance{
Id: "azure://" + resourceID,
Status: instanceStatusFromProvisioningState(provisioningState),
Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState),
})
}

Expand All @@ -665,13 +673,13 @@ func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, statu
scaleSet.lastInstanceRefresh = time.Now()
}

// instanceStatusFromProvisioningState converts the VM provisioning state to cloudprovider.InstanceStatus
func instanceStatusFromProvisioningState(provisioningState *string) *cloudprovider.InstanceStatus {
// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state and power state to cloudprovider.InstanceStatus
func instanceStatusFromProvisioningStateAndPowerState(resourceId string, provisioningState *string, powerState string) *cloudprovider.InstanceStatus {
if provisioningState == nil {
return nil
}

klog.V(5).Infof("Getting vm instance provisioning state %s", *provisioningState)
klog.V(5).Infof("Getting vm instance provisioning state %s for %s", *provisioningState, resourceId)

status := &cloudprovider.InstanceStatus{}
switch *provisioningState {
Expand All @@ -680,11 +688,21 @@ func instanceStatusFromProvisioningState(provisioningState *string) *cloudprovid
case string(compute.ProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.ProvisioningStateFailed):
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceId, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceId, powerState)
status.State = cloudprovider.InstanceRunning
}
default:
status.State = cloudprovider.InstanceRunning
Expand Down
110 changes: 71 additions & 39 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,45 +226,77 @@ func TestIncreaseSize(t *testing.T) {
}

func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

manager := newTestAzureManager(t)
vmssName := "vmss-failed-upscale"

expectedScaleSets := newTestVMSSList(3, "vmss-failed-upscale", "eastus", compute.Uniform)
expectedVMSSVMs := newTestVMSSVMList(3)
expectedVMSSVMs[2].ProvisioningState = to.StringPtr(string(compute.ProvisioningStateFailed))

mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil)
mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(nil, nil)
mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes()
manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "vmss-failed-upscale", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
manager.explicitlyConfigured["vmss-failed-upscale"] = true
registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName))
assert.True(t, registered)
manager.Refresh()

provider, err := BuildAzureCloudProvider(manager, nil)
assert.NoError(t, err)

scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet)
assert.True(t, ok)

// Increase size by one, but the new node fails provisioning
err = scaleSet.IncreaseSize(1)
assert.NoError(t, err)

nodes, err := scaleSet.Nodes()
assert.NoError(t, err)

assert.Equal(t, 3, len(nodes))
assert.Equal(t, cloudprovider.InstanceCreating, nodes[2].Status.State)
assert.Equal(t, cloudprovider.OutOfResourcesErrorClass, nodes[2].Status.ErrorInfo.ErrorClass)
testCases := map[string]struct {
expectInstanceRunning bool
isMissingInstanceView bool
statuses []compute.InstanceViewStatus
}{
"out of resources when no power state exists": {},
"out of resources when VM is stopped": {
statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateStopped)}},
},
"out of resources when VM reports invalid power state": {
statuses: []compute.InstanceViewStatus{{Code: to.StringPtr("PowerState/invalid")}},
},
"instance running when power state is running": {
expectInstanceRunning: true,
statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateRunning)}},
},
"instance running if instance view cannot be retrieved": {
expectInstanceRunning: true,
isMissingInstanceView: true,
},
}
for testName, testCase := range testCases {
t.Run(testName, func(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

manager := newTestAzureManager(t)
vmssName := "vmss-failed-upscale"

expectedScaleSets := newTestVMSSList(3, "vmss-failed-upscale", "eastus", compute.Uniform)
expectedVMSSVMs := newTestVMSSVMList(3)
expectedVMSSVMs[2].ProvisioningState = to.StringPtr(string(compute.ProvisioningStateFailed))
if !testCase.isMissingInstanceView {
expectedVMSSVMs[2].InstanceView = &compute.VirtualMachineScaleSetVMInstanceView{Statuses: &testCase.statuses}
}

mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil)
mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(nil, nil)
mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes()
manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "vmss-failed-upscale", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
manager.explicitlyConfigured["vmss-failed-upscale"] = true
registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName))
assert.True(t, registered)
manager.Refresh()

provider, err := BuildAzureCloudProvider(manager, nil)
assert.NoError(t, err)

scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet)
assert.True(t, ok)

// Increase size by one, but the new node fails provisioning
err = scaleSet.IncreaseSize(1)
assert.NoError(t, err)

nodes, err := scaleSet.Nodes()
assert.NoError(t, err)

assert.Equal(t, 3, len(nodes))
if testCase.expectInstanceRunning {
assert.Equal(t, cloudprovider.InstanceRunning, nodes[2].Status.State)
} else {
assert.Equal(t, cloudprovider.InstanceCreating, nodes[2].Status.State)
assert.Equal(t, cloudprovider.OutOfResourcesErrorClass, nodes[2].Status.ErrorInfo.ErrorClass)
}
})
}
}

func TestIncreaseSizeOnVMSSUpdating(t *testing.T) {
Expand Down
39 changes: 39 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,16 @@ const (
nodeTaintTagName = "k8s.io_cluster-autoscaler_node-template_taint_"
nodeResourcesTagName = "k8s.io_cluster-autoscaler_node-template_resources_"
nodeOptionsTagName = "k8s.io_cluster-autoscaler_node-template_autoscaling-options_"

// PowerStates reflect the operational state of a VM
// From https://learn.microsoft.com/en-us/java/api/com.microsoft.azure.management.compute.powerstate?view=azure-java-stable
vmPowerStateStarting = "PowerState/starting"
vmPowerStateRunning = "PowerState/running"
vmPowerStateStopping = "PowerState/stopping"
vmPowerStateStopped = "PowerState/stopped"
vmPowerStateDeallocating = "PowerState/deallocating"
vmPowerStateDeallocated = "PowerState/deallocated"
vmPowerStateUnknown = "PowerState/unknown"
)

var (
Expand Down Expand Up @@ -608,3 +618,32 @@ func isAzureRequestsThrottled(rerr *retry.Error) bool {

return rerr.HTTPStatusCode == http.StatusTooManyRequests
}

func isRunningVmPowerState(powerState string) bool {
return powerState == vmPowerStateRunning || powerState == vmPowerStateStarting
}

func isKnownVmPowerState(powerState string) bool {
knownPowerStates := map[string]bool{
vmPowerStateStarting: true,
vmPowerStateRunning: true,
vmPowerStateStopping: true,
vmPowerStateStopped: true,
vmPowerStateDeallocating: true,
vmPowerStateDeallocated: true,
vmPowerStateUnknown: true,
}
return knownPowerStates[powerState]
}

func vmPowerStateFromStatuses(statuses []compute.InstanceViewStatus) string {
for _, status := range statuses {
if status.Code == nil || !isKnownVmPowerState(*status.Code) {
continue
}
return *status.Code
}

// PowerState is not set if the VM is still creating (or has failed creation)
return vmPowerStateUnknown
}

0 comments on commit b569db4

Please sign in to comment.