Merge pull request #3977 from vincepri/exclude-scale-down
🐛 Scale down checks excludes machine about to be deleted
k8s-ci-robot authored Dec 3, 2020
2 parents 90c5465 + a284134 commit 0baeb9e
Showing 2 changed files with 65 additions and 59 deletions.
51 changes: 21 additions & 30 deletions controlplane/kubeadm/controllers/scale.go
@@ -92,8 +92,15 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
) (ctrl.Result, error) {
logger := controlPlane.Logger()

// run preflight checks ensuring the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
if result, err := r.preflightChecks(ctx, controlPlane); err != nil || !result.IsZero() {
// Pick the Machine that we should scale down.
machineToDelete, err := selectMachineForScaleDown(controlPlane, outdatedMachines)
if err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")
}

// Run preflight checks ensuring the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
// Given that we're scaling down, we can exclude the machineToDelete from the preflight checks.
if result, err := r.preflightChecks(ctx, controlPlane, machineToDelete); err != nil || !result.IsZero() {
return result, err
}

@@ -103,11 +110,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
}

machineToDelete, err := selectMachineForScaleDown(controlPlane, outdatedMachines)
if err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")
}

if machineToDelete == nil {
logger.Info("Failed to pick control plane Machine to delete")
return ctrl.Result{}, errors.New("failed to pick control plane Machine to delete")
@@ -151,7 +153,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
// If the control plane is not passing preflight checks, it requeues.
//
// NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneConditions before this.
func (r *KubeadmControlPlaneReconciler) preflightChecks(_ context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) { //nolint:unparam
func (r *KubeadmControlPlaneReconciler) preflightChecks(_ context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) (ctrl.Result, error) { //nolint:unparam
logger := controlPlane.Logger()

// If there is no KCP-owned control-plane machines, then control-plane has not been initialized yet,
Expand Down Expand Up @@ -179,7 +181,18 @@ func (r *KubeadmControlPlaneReconciler) preflightChecks(_ context.Context, contr
)
}
machineErrors := []error{}

loopmachines:
for _, machine := range controlPlane.Machines {

for _, excluded := range excludeFor {
// If this machine should be excluded from the individual
// health check, continue the outer loop.
if machine.Name == excluded.Name {
continue loopmachines
}
}

for _, condition := range allMachineHealthConditions {
if err := preflightCheckCondition("machine", machine, condition); err != nil {
machineErrors = append(machineErrors, err)
@@ -195,28 +208,6 @@ func (r *KubeadmControlPlaneReconciler) preflightChecks(_ context.Context, contr
return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil
}

// Check KCP conditions ; if there are health problems wait.
// NOTE: WE are checking KCP conditions for problems that can't be assigned to a specific machine, e.g.
// a control plane node without a corresponding machine
allKcpHealthConditions := []clusterv1.ConditionType{
controlplanev1.ControlPlaneComponentsHealthyCondition,
controlplanev1.EtcdClusterHealthyCondition,
}
kcpErrors := []error{}
for _, condition := range allKcpHealthConditions {
if err := preflightCheckCondition("control plane", controlPlane.KCP, condition); err != nil {
kcpErrors = append(kcpErrors, err)
}
}
if len(kcpErrors) > 0 {
aggregatedError := kerrors.NewAggregate(kcpErrors)
r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass preflight checks to continue reconciliation: %v", aggregatedError)
logger.Info("Waiting for control plane to pass preflight checks", "failures", aggregatedError.Error())

return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil
}

return ctrl.Result{}, nil
}

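For readers skimming the diff: the substance of the change is that preflightChecks now takes a variadic list of machines to exclude, and scaleDownControlPlane selects the machine to delete before running the checks so it can pass that machine in. Below is a minimal, self-contained sketch of the exclusion pattern, using hypothetical types rather than the actual cluster-api ones:

package main

import "fmt"

type Machine struct {
	Name    string
	Healthy bool
}

// checkMachines mirrors the shape of the new preflightChecks loop: machines
// listed in exclude are skipped via a labeled continue before their health
// conditions are evaluated.
func checkMachines(machines []Machine, exclude ...Machine) []error {
	var errs []error

loopMachines:
	for _, m := range machines {
		for _, ex := range exclude {
			if m.Name == ex.Name {
				// Skip the machine that is about to be deleted.
				continue loopMachines
			}
		}
		if !m.Healthy {
			errs = append(errs, fmt.Errorf("machine %s is unhealthy", m.Name))
		}
	}
	return errs
}

func main() {
	machines := []Machine{{Name: "one", Healthy: false}, {Name: "two", Healthy: true}}

	fmt.Println(len(checkMachines(machines)))              // 1: "one" fails the check
	fmt.Println(len(checkMachines(machines, machines[0]))) // 0: "one" is excluded, so checks pass
}

Excluding only the machine already chosen for deletion lets scale-down proceed when that machine is the unhealthy one, while still blocking if any other control plane machine is unhealthy, which is exactly what the updated test cases below assert.
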
73 changes: 44 additions & 29 deletions controlplane/kubeadm/controllers/scale_test.go
@@ -202,13 +202,53 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed())
g.Expect(controlPlaneMachines.Items).To(HaveLen(0))
})
t.Run("does not deletes control plane Machine if preflight checks fails", func(t *testing.T) {
t.Run("deletes the oldest control plane Machine even if preflight checks fails", func(t *testing.T) {
g := NewWithT(t)

machines := map[string]*clusterv1.Machine{
"one": machine("one"),
"one": machine("one", withTimestamp(time.Now().Add(-1*time.Minute))),
"two": machine("two", withTimestamp(time.Now())),
"three": machine("three", withTimestamp(time.Now())),
}
fakeClient := newFakeClient(g, machines["one"])
setMachineHealthy(machines["two"])
setMachineHealthy(machines["three"])
fakeClient := newFakeClient(g, machines["one"], machines["two"], machines["three"])

r := &KubeadmControlPlaneReconciler{
recorder: record.NewFakeRecorder(32),
Client: fakeClient,
managementCluster: &fakeManagementCluster{
Workload: fakeWorkloadCluster{},
},
}

cluster := &clusterv1.Cluster{}
kcp := &controlplanev1.KubeadmControlPlane{}
controlPlane := &internal.ControlPlane{
KCP: kcp,
Cluster: cluster,
Machines: machines,
}

result, err := r.scaleDownControlPlane(context.Background(), cluster, kcp, controlPlane, controlPlane.Machines)
g.Expect(err).ToNot(HaveOccurred())
g.Expect(result).To(Equal(ctrl.Result{Requeue: true}))

controlPlaneMachines := clusterv1.MachineList{}
g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed())
g.Expect(controlPlaneMachines.Items).To(HaveLen(2))
})

t.Run("does not scale down if preflight checks fail on any machine other than the one being deleted", func(t *testing.T) {
g := NewWithT(t)

machines := map[string]*clusterv1.Machine{
"one": machine("one", withTimestamp(time.Now().Add(-1*time.Minute))),
"two": machine("two", withTimestamp(time.Now())),
"three": machine("three", withTimestamp(time.Now())),
}
setMachineHealthy(machines["three"])
fakeClient := newFakeClient(g, machines["one"], machines["two"], machines["three"])

r := &KubeadmControlPlaneReconciler{
recorder: record.NewFakeRecorder(32),
@@ -232,7 +272,7 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.

controlPlaneMachines := clusterv1.MachineList{}
g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed())
g.Expect(controlPlaneMachines.Items).To(HaveLen(1))
g.Expect(controlPlaneMachines.Items).To(HaveLen(3))
})
}

@@ -350,31 +390,6 @@ func TestPreflightChecks(t *testing.T) {
},
expectResult: ctrl.Result{RequeueAfter: preflightFailedRequeueAfter},
},
{
name: "control plane with healthy machine conditions but with unhealthy kcp conditions should requeue",
kcp: &controlplanev1.KubeadmControlPlane{
Status: controlplanev1.KubeadmControlPlaneStatus{
Conditions: clusterv1.Conditions{
*conditions.FalseCondition(controlplanev1.ControlPlaneComponentsHealthyCondition, "fooReason", clusterv1.ConditionSeverityError, ""),
*conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
},
},
},
machines: []*clusterv1.Machine{
{
Status: clusterv1.MachineStatus{
Conditions: clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
},
},
},
},
expectResult: ctrl.Result{RequeueAfter: preflightFailedRequeueAfter},
},
{
name: "control plane with an healthy machine and an healthy kcp condition should pass",
kcp: &controlplanev1.KubeadmControlPlane{
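The new test cases rely on small helpers (machine(name, opts...), withTimestamp, and setMachineHealthy) that are defined elsewhere in the test package and not visible in this diff. The following is a rough sketch of how such functional-option helpers could look; the names come from the diff above, but the signatures and bodies are assumptions, not the actual cluster-api test code:

package controllers

import (
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" // API group version assumed for this point in history
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// machineOpt mutates a Machine under construction (hypothetical helper shape).
type machineOpt func(*clusterv1.Machine)

// withTimestamp sets the creation timestamp, which the scale-down selection
// can use to prefer the oldest machine.
func withTimestamp(t time.Time) machineOpt {
	return func(m *clusterv1.Machine) {
		m.CreationTimestamp = metav1.NewTime(t)
	}
}

// machine builds a minimal control plane Machine and applies the given options.
func machine(name string, opts ...machineOpt) *clusterv1.Machine {
	m := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"},
	}
	for _, opt := range opts {
		opt(m)
	}
	return m
}

// setMachineHealthy marks per-machine conditions that preflightChecks
// inspects as true (only two of them are shown here).
func setMachineHealthy(m *clusterv1.Machine) {
	conditions.MarkTrue(m, controlplanev1.MachineAPIServerPodHealthyCondition)
	conditions.MarkTrue(m, controlplanev1.MachineEtcdMemberHealthyCondition)
}

With helpers along these lines, the first new case makes the oldest machine the only unhealthy one and expects the scale-down to go ahead, while the second leaves another machine unhealthy and expects the reconciler to requeue without deleting anything.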
