Skip to content

Commit

Permalink
Add unit test coverage for machineDeployments. reconcileOldMachineSets
Browse files Browse the repository at this point in the history
  • Loading branch information
enxebre committed Apr 29, 2021
1 parent a0c20fe commit 50565c3
Show file tree
Hide file tree
Showing 3 changed files with 227 additions and 36 deletions.
2 changes: 1 addition & 1 deletion controllers/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package controllers
import (
"context"
"fmt"
"sigs.k8s.io/cluster-api/util/collections"
"time"

"github.com/pkg/errors"
Expand All @@ -40,6 +39,7 @@ import (
kubedrain "sigs.k8s.io/cluster-api/third_party/kubernetes-drain"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/cluster-api/util/predicates"
Expand Down
70 changes: 37 additions & 33 deletions controllers/machinedeployment_rolling.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

// rolloutRolling implements the logic for rolling a new machine set.
// rolloutRolling implements the logic for rolling a new MachineSet.
func (r *MachineDeploymentReconciler) rolloutRolling(ctx context.Context, d *clusterv1.MachineDeployment, msList []*clusterv1.MachineSet) error {
newMS, oldMSs, err := r.getAllMachineSetsAndSyncRevision(ctx, d, msList, true)
if err != nil {
Expand Down Expand Up @@ -73,11 +73,11 @@ func (r *MachineDeploymentReconciler) rolloutRolling(ctx context.Context, d *clu

func (r *MachineDeploymentReconciler) reconcileNewMachineSet(ctx context.Context, allMSs []*clusterv1.MachineSet, newMS *clusterv1.MachineSet, deployment *clusterv1.MachineDeployment) error {
if deployment.Spec.Replicas == nil {
return errors.Errorf("spec replicas for MachineDeployment %s is nil, this is unexpected", client.ObjectKeyFromObject(deployment).String())
return errors.Errorf("spec.replicas for MachineDeployment %v is nil, this is unexpected", client.ObjectKeyFromObject(deployment))
}

if newMS.Spec.Replicas == nil {
return errors.Errorf("spec replicas for MachineSet %s is nil, this is unexpected", client.ObjectKeyFromObject(newMS).String())
return errors.Errorf("spec.replicas for MachineSet %v is nil, this is unexpected", client.ObjectKeyFromObject(newMS))
}

if *(newMS.Spec.Replicas) == *(deployment.Spec.Replicas) {
Expand All @@ -101,13 +101,13 @@ func (r *MachineDeploymentReconciler) reconcileOldMachineSets(ctx context.Contex
log := ctrl.LoggerFrom(ctx)

if deployment.Spec.Replicas == nil {
return errors.Errorf("spec replicas for MachineDeployment %q/%q is nil, this is unexpected",
deployment.Namespace, deployment.Name)
return errors.Errorf("spec.replicas for MachineDeployment %v is nil, this is unexpected",
client.ObjectKeyFromObject(deployment))
}

if newMS.Spec.Replicas == nil {
return errors.Errorf("spec replicas for MachineSet %q/%q is nil, this is unexpected",
newMS.Namespace, newMS.Name)
return errors.Errorf("spec.replicas for MachineSet %v is nil, this is unexpected",
client.ObjectKeyFromObject(newMS))
}

oldMachinesCount := mdutil.GetReplicaCountForMachineSets(oldMSs)
Expand All @@ -117,18 +117,18 @@ func (r *MachineDeploymentReconciler) reconcileOldMachineSets(ctx context.Contex
}

allMachinesCount := mdutil.GetReplicaCountForMachineSets(allMSs)
log.V(4).Info("New machine set has available machines",
"machineset", newMS.Name, "count", newMS.Status.AvailableReplicas)
log.V(4).Info("New MachineSet has available machines",
"machineset", client.ObjectKeyFromObject(newMS).String(), "available-replicas", newMS.Status.AvailableReplicas)
maxUnavailable := mdutil.MaxUnavailable(*deployment)

// Check if we can scale down. We can scale down in the following 2 cases:
// * Some old machine sets have unhealthy replicas, we could safely scale down those unhealthy replicas since that won't further
// * Some old MachineSets have unhealthy replicas, we could safely scale down those unhealthy replicas since that won't further
// increase unavailability.
// * New machine set has scaled up and it's replicas becomes ready, then we can scale down old machine sets in a further step.
// * New MachineSet has scaled up and it's replicas becomes ready, then we can scale down old MachineSets in a further step.
//
// maxScaledDown := allMachinesCount - minAvailable - newMachineSetMachinesUnavailable
// take into account not only maxUnavailable and any surge machines that have been created, but also unavailable machines from
// the newMS, so that the unavailable machines from the newMS would not make us scale down old machine sets in a further
// the newMS, so that the unavailable machines from the newMS would not make us scale down old MachineSets in a further
// step(that will increase unavailability).
//
// Concrete example:
Expand All @@ -139,18 +139,18 @@ func (r *MachineDeploymentReconciler) reconcileOldMachineSets(ctx context.Contex
//
// case 1:
// * Deployment is updated, newMS is created with 3 replicas, oldMS is scaled down to 8, and newMS is scaled up to 5.
// * The new machine set machines crashloop and never become available.
// * The new MachineSet machines crashloop and never become available.
// * allMachinesCount is 13. minAvailable is 8. newMSMachinesUnavailable is 5.
// * A node fails and causes one of the oldMS machines to become unavailable. However, 13 - 8 - 5 = 0, so the oldMS won't be scaled down.
// * The user notices the crashloop and does kubectl rollout undo to rollback.
// * newMSMachinesUnavailable is 1, since we rolled back to the good machine set, so maxScaledDown = 13 - 8 - 1 = 4. 4 of the crashlooping machines will be scaled down.
// * newMSMachinesUnavailable is 1, since we rolled back to the good MachineSet, so maxScaledDown = 13 - 8 - 1 = 4. 4 of the crashlooping machines will be scaled down.
// * The total number of machines will then be 9 and the newMS can be scaled up to 10.
//
// case 2:
// Same example, but pushing a new machine template instead of rolling back (aka "roll over"):
// * The new machine set created must start with 0 replicas because allMachinesCount is already at 13.
// * However, newMSMachinesUnavailable would also be 0, so the 2 old machine sets could be scaled down by 5 (13 - 8 - 0), which would then
// allow the new machine set to be scaled up by 5.
// * The new MachineSet created must start with 0 replicas because allMachinesCount is already at 13.
// * However, newMSMachinesUnavailable would also be 0, so the 2 old MachineSets could be scaled down by 5 (13 - 8 - 0), which would then
// allow the new MachineSet to be scaled up by 5.
minAvailable := *(deployment.Spec.Replicas) - maxUnavailable
newMSUnavailableMachineCount := *(newMS.Spec.Replicas) - newMS.Status.AvailableReplicas
maxScaledDown := allMachinesCount - minAvailable - newMSUnavailableMachineCount
Expand All @@ -167,32 +167,33 @@ func (r *MachineDeploymentReconciler) reconcileOldMachineSets(ctx context.Contex

log.V(4).Info("Cleaned up unhealthy replicas from old MachineSets", "count", cleanupCount)

// Scale down old machine sets, need check maxUnavailable to ensure we can scale down
// Scale down old MachineSets, need check maxUnavailable to ensure we can scale down
allMSs = oldMSs
allMSs = append(allMSs, newMS)
scaledDownCount, err := r.scaleDownOldMachineSetsForRollingUpdate(ctx, allMSs, oldMSs, deployment)
if err != nil {
return err
}

log.V(4).Info("Scaled down old MachineSets of deployment", "count", scaledDownCount)
log.V(4).Info("Scaled down old MachineSets of MachineDeployment", "count", scaledDownCount)
return nil
}

// cleanupUnhealthyReplicas will scale down old machine sets with unhealthy replicas, so that all unhealthy replicas will be deleted.
// cleanupUnhealthyReplicas will scale down old MachineSets with unhealthy replicas, so that all unhealthy replicas will be deleted.
func (r *MachineDeploymentReconciler) cleanupUnhealthyReplicas(ctx context.Context, oldMSs []*clusterv1.MachineSet, deployment *clusterv1.MachineDeployment, maxCleanupCount int32) ([]*clusterv1.MachineSet, int32, error) {
log := ctrl.LoggerFrom(ctx)

sort.Sort(mdutil.MachineSetsByCreationTimestamp(oldMSs))

// Safely scale down all old machine sets with unhealthy replicas. Replica set will sort the machines in the order
// such that not-ready < ready, unscheduled < scheduled, and pending < running. This ensures that unhealthy replicas will
// been deleted first and won't increase unavailability.
// Scale down all old MachineSets with any unhealthy replicas. MachineSet will honour Spec.DeletePolicy
// for deleting Machines. Machines with a deletion timestamp, with a failure message or without a nodeRef
// are preferred for all strategies.
// This results in a best effort to remove machines backing unhealthy nodes.
totalScaledDown := int32(0)

for _, targetMS := range oldMSs {
if targetMS.Spec.Replicas == nil {
return nil, 0, errors.Errorf("spec replicas for machine set %v is nil, this is unexpected", targetMS.Name)
return nil, 0, errors.Errorf("spec.replicas for MachineSet %v is nil, this is unexpected", client.ObjectKeyFromObject(targetMS))
}

if totalScaledDown >= maxCleanupCount {
Expand All @@ -201,12 +202,13 @@ func (r *MachineDeploymentReconciler) cleanupUnhealthyReplicas(ctx context.Conte

oldMSReplicas := *(targetMS.Spec.Replicas)
if oldMSReplicas == 0 {
// cannot scale down this machine set.
// cannot scale down this MachineSet.
continue
}

oldMSAvailableReplicas := targetMS.Status.AvailableReplicas
log.V(4).Info("Found available machines in old MS", "count", oldMSAvailableReplicas, "target-machineset", targetMS.Name)
log.V(4).Info("Found available Machines in old MachineSet",
"count", oldMSAvailableReplicas, "target-machineset", client.ObjectKeyFromObject(targetMS).String())
if oldMSReplicas == oldMSAvailableReplicas {
// no unhealthy replicas found, no scaling required.
continue
Expand All @@ -218,7 +220,8 @@ func (r *MachineDeploymentReconciler) cleanupUnhealthyReplicas(ctx context.Conte
newReplicasCount := oldMSReplicas - scaledDownCount

if newReplicasCount > oldMSReplicas {
return nil, 0, errors.Errorf("when cleaning up unhealthy replicas, got invalid request to scale down %s/%s %d -> %d", targetMS.Namespace, targetMS.Name, oldMSReplicas, newReplicasCount)
return nil, 0, errors.Errorf("when cleaning up unhealthy replicas, got invalid request to scale down %v: %d -> %d",
client.ObjectKeyFromObject(targetMS), oldMSReplicas, newReplicasCount)
}

if err := r.scaleMachineSet(ctx, targetMS, newReplicasCount, deployment); err != nil {
Expand All @@ -231,22 +234,22 @@ func (r *MachineDeploymentReconciler) cleanupUnhealthyReplicas(ctx context.Conte
return oldMSs, totalScaledDown, nil
}

// scaleDownOldMachineSetsForRollingUpdate scales down old machine sets when deployment strategy is "RollingUpdate".
// scaleDownOldMachineSetsForRollingUpdate scales down old MachineSets when deployment strategy is "RollingUpdate".
// Need check maxUnavailable to ensure availability.
func (r *MachineDeploymentReconciler) scaleDownOldMachineSetsForRollingUpdate(ctx context.Context, allMSs []*clusterv1.MachineSet, oldMSs []*clusterv1.MachineSet, deployment *clusterv1.MachineDeployment) (int32, error) {
log := ctrl.LoggerFrom(ctx)

if deployment.Spec.Replicas == nil {
return 0, errors.Errorf("spec replicas for deployment %v is nil, this is unexpected", deployment.Name)
return 0, errors.Errorf("spec.replicas for MachineDeployment %v is nil, this is unexpected", client.ObjectKeyFromObject(deployment))
}

maxUnavailable := mdutil.MaxUnavailable(*deployment)

// Check if we can scale down.
minAvailable := *(deployment.Spec.Replicas) - maxUnavailable

// Find the number of available machines.
availableMachineCount := mdutil.GetAvailableReplicaCountForMachineSets(allMSs)

// Check if we can scale down.
if availableMachineCount <= minAvailable {
// Cannot scale down.
return 0, nil
Expand All @@ -260,7 +263,7 @@ func (r *MachineDeploymentReconciler) scaleDownOldMachineSetsForRollingUpdate(ct
totalScaleDownCount := availableMachineCount - minAvailable
for _, targetMS := range oldMSs {
if targetMS.Spec.Replicas == nil {
return 0, errors.Errorf("spec replicas for machine set %v is nil, this is unexpected", targetMS.Name)
return 0, errors.Errorf("spec.replicas for MachineSet %v is nil, this is unexpected", client.ObjectKeyFromObject(targetMS))
}

if totalScaledDown >= totalScaleDownCount {
Expand All @@ -277,7 +280,8 @@ func (r *MachineDeploymentReconciler) scaleDownOldMachineSetsForRollingUpdate(ct
scaleDownCount := integer.Int32Min(*(targetMS.Spec.Replicas), totalScaleDownCount-totalScaledDown)
newReplicasCount := *(targetMS.Spec.Replicas) - scaleDownCount
if newReplicasCount > *(targetMS.Spec.Replicas) {
return totalScaledDown, errors.Errorf("when scaling down old MS, got invalid request to scale down %s/%s %d -> %d", targetMS.Namespace, targetMS.Name, *(targetMS.Spec.Replicas), newReplicasCount)
return totalScaledDown, errors.Errorf("when scaling down old MachineSet, got invalid request to scale down %v: %d -> %d",
client.ObjectKeyFromObject(targetMS), *(targetMS.Spec.Replicas), newReplicasCount)
}

if err := r.scaleMachineSet(ctx, targetMS, newReplicasCount, deployment); err != nil {
Expand Down
Loading

0 comments on commit 50565c3

Please sign in to comment.