Skip to content

Commit

Permalink
Improve delete node mechanisms in cluster-api autoscaler
Browse files Browse the repository at this point in the history
This change adds two different checks for improving the behavior of node
deletion. The first is to check against the node group minimum size
instead of zero for determining early scale down failure events. The
second is to help prevent race conditions in the annotation labeling for
machines marked for deletion.
  • Loading branch information
elmiko committed Apr 8, 2020
1 parent e816740 commit 0ad94a3
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,28 @@ func (r machineDeploymentScalableResource) SetSize(nreplicas int32) error {
return updateErr
}

func (r machineDeploymentScalableResource) UnmarkMachineForDeletion(machine *Machine) error {
u, err := r.controller.dynamicclient.Resource(*r.controller.machineResource).Namespace(machine.Namespace).Get(context.TODO(), machine.Name, metav1.GetOptions{})

if err != nil {
return err
}
if u == nil {
return fmt.Errorf("unknown machine %s", machine.Name)
}

u = u.DeepCopy()

annotations := u.GetAnnotations()
if _, ok := annotations[machineDeleteAnnotationKey]; ok {
delete(annotations, machineDeleteAnnotationKey)
u.SetAnnotations(annotations)
_, updateErr := r.controller.dynamicclient.Resource(*r.controller.machineResource).Namespace(u.GetNamespace()).Update(context.TODO(), u, metav1.UpdateOptions{})
return updateErr
}
return nil
}

func (r machineDeploymentScalableResource) MarkMachineForDeletion(machine *Machine) error {
u, err := r.controller.dynamicclient.Resource(*r.controller.machineResource).Namespace(machine.Namespace).Get(context.TODO(), machine.Name, metav1.GetOptions{})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,28 @@ func (r machineSetScalableResource) MarkMachineForDeletion(machine *Machine) err
return updateErr
}

func (r machineSetScalableResource) UnmarkMachineForDeletion(machine *Machine) error {
u, err := r.controller.dynamicclient.Resource(*r.controller.machineResource).Namespace(machine.Namespace).Get(context.TODO(), machine.Name, metav1.GetOptions{})

if err != nil {
return err
}
if u == nil {
return fmt.Errorf("unknown machine %s", machine.Name)
}

u = u.DeepCopy()

annotations := u.GetAnnotations()
if _, ok := annotations[machineDeleteAnnotationKey]; ok {
delete(annotations, machineDeleteAnnotationKey)
u.SetAnnotations(annotations)
_, updateErr := r.controller.dynamicclient.Resource(*r.controller.machineResource).Namespace(u.GetNamespace()).Update(context.TODO(), u, metav1.UpdateOptions{})
return updateErr
}
return nil
}

func newMachineSetScalableResource(controller *machineController, machineSet *MachineSet) (*machineSetScalableResource, error) {
minSize, maxSize, err := parseScalingBounds(machineSet.Annotations)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,12 @@ func (ng *nodegroup) DeleteNodes(nodes []*corev1.Node) error {
}

// Step 2: if deleting len(nodes) would make the replica count
// <= 0, then the request to delete that many nodes is bogus
// and we fail fast.
// < node group minimum size, then the request to delete that many
// nodes is bogus and we fail fast.
replicas := ng.scalableResource.Replicas()

if replicas-int32(len(nodes)) <= 0 {
return fmt.Errorf("unable to delete %d machines in %q, machine replicas are <= 0 ", len(nodes), ng.Id())
if replicas-int32(len(nodes)) < int32(ng.MinSize()) {
return fmt.Errorf("unable to delete %d machines in %q, machine replicas are less than requested minimum", len(nodes), ng.Id())
}

// Step 3: annotate the corresponding machine that it is a
Expand Down Expand Up @@ -139,6 +139,7 @@ func (ng *nodegroup) DeleteNodes(nodes []*corev1.Node) error {
}

if err := ng.scalableResource.SetSize(replicas - 1); err != nil {
nodeGroup.scalableResource.UnmarkMachineForDeletion(machine)
return err
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,7 @@ type scalableResource interface {

// MarkMachineForDeletion marks machine for deletion
MarkMachineForDeletion(machine *Machine) error

// UnmarkMachineForDeletion unmarks machine for deletion
UnmarkMachineForDeletion(machine *Machine) error
}

0 comments on commit 0ad94a3

Please sign in to comment.