diff --git a/controlplane/kubeadm/controllers/scale.go b/controlplane/kubeadm/controllers/scale.go index 973f0a25e347..50d05189b840 100644 --- a/controlplane/kubeadm/controllers/scale.go +++ b/controlplane/kubeadm/controllers/scale.go @@ -92,8 +92,15 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( ) (ctrl.Result, error) { logger := controlPlane.Logger() - // run preflight checks ensuring the control plane is stable before proceeding with a scale up/scale down operation; if not, wait. - if result, err := r.preflightChecks(ctx, controlPlane); err != nil || !result.IsZero() { + // Pick the Machine that we should scale down. + machineToDelete, err := selectMachineForScaleDown(controlPlane, outdatedMachines) + if err != nil { + return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down") + } + + // Run preflight checks ensuring the control plane is stable before proceeding with a scale up/scale down operation; if not, wait. + // Given that we're scaling down, we can exclude the machineToDelete from the preflight checks. + if result, err := r.preflightChecks(ctx, controlPlane, machineToDelete); err != nil || !result.IsZero() { return result, err } @@ -103,11 +110,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") } - machineToDelete, err := selectMachineForScaleDown(controlPlane, outdatedMachines) - if err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down") - } - if machineToDelete == nil { logger.Info("Failed to pick control plane Machine to delete") return ctrl.Result{}, errors.New("failed to pick control plane Machine to delete") @@ -151,7 +153,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( // If the control plane is not passing preflight checks, it requeue. // // NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneConditions before this. 
-func (r *KubeadmControlPlaneReconciler) preflightChecks(_ context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) { //nolint:unparam +func (r *KubeadmControlPlaneReconciler) preflightChecks(_ context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) (ctrl.Result, error) { //nolint:unparam logger := controlPlane.Logger() // If there is no KCP-owned control-plane machines, then control-plane has not been initialized yet, @@ -179,7 +181,18 @@ func (r *KubeadmControlPlaneReconciler) preflightChecks(_ context.Context, contr ) } machineErrors := []error{} + +loopmachines: for _, machine := range controlPlane.Machines { + + for _, excluded := range excludeFor { + // If this machine should be excluded from the individual + // health check, continue the outer loop. + if machine.Name == excluded.Name { + continue loopmachines + } + } + for _, condition := range allMachineHealthConditions { if err := preflightCheckCondition("machine", machine, condition); err != nil { machineErrors = append(machineErrors, err) @@ -195,28 +208,6 @@ func (r *KubeadmControlPlaneReconciler) preflightChecks(_ context.Context, contr return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil } - // Check KCP conditions ; if there are health problems wait. - // NOTE: WE are checking KCP conditions for problems that can't be assigned to a specific machine, e.g. 
- // a control plane node without a corresponding machine - allKcpHealthConditions := []clusterv1.ConditionType{ - controlplanev1.ControlPlaneComponentsHealthyCondition, - controlplanev1.EtcdClusterHealthyCondition, - } - kcpErrors := []error{} - for _, condition := range allKcpHealthConditions { - if err := preflightCheckCondition("control plane", controlPlane.KCP, condition); err != nil { - kcpErrors = append(kcpErrors, err) - } - } - if len(kcpErrors) > 0 { - aggregatedError := kerrors.NewAggregate(kcpErrors) - r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "ControlPlaneUnhealthy", - "Waiting for control plane to pass preflight checks to continue reconciliation: %v", aggregatedError) - logger.Info("Waiting for control plane to pass preflight checks", "failures", aggregatedError.Error()) - - return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil - } - return ctrl.Result{}, nil } diff --git a/controlplane/kubeadm/controllers/scale_test.go b/controlplane/kubeadm/controllers/scale_test.go index 752d3453ddb0..969c16c4eacf 100644 --- a/controlplane/kubeadm/controllers/scale_test.go +++ b/controlplane/kubeadm/controllers/scale_test.go @@ -202,13 +202,53 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing. 
g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed()) g.Expect(controlPlaneMachines.Items).To(HaveLen(0)) }) - t.Run("does not deletes control plane Machine if preflight checks fails", func(t *testing.T) { + t.Run("deletes the oldest control plane Machine even if preflight checks fails", func(t *testing.T) { g := NewWithT(t) machines := map[string]*clusterv1.Machine{ - "one": machine("one"), + "one": machine("one", withTimestamp(time.Now().Add(-1*time.Minute))), + "two": machine("two", withTimestamp(time.Now())), + "three": machine("three", withTimestamp(time.Now())), } - fakeClient := newFakeClient(g, machines["one"]) + setMachineHealthy(machines["two"]) + setMachineHealthy(machines["three"]) + fakeClient := newFakeClient(g, machines["one"], machines["two"], machines["three"]) + + r := &KubeadmControlPlaneReconciler{ + recorder: record.NewFakeRecorder(32), + Client: fakeClient, + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{}, + }, + } + + cluster := &clusterv1.Cluster{} + kcp := &controlplanev1.KubeadmControlPlane{} + controlPlane := &internal.ControlPlane{ + KCP: kcp, + Cluster: cluster, + Machines: machines, + } + + result, err := r.scaleDownControlPlane(context.Background(), cluster, kcp, controlPlane, controlPlane.Machines) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result).To(Equal(ctrl.Result{Requeue: true})) + + controlPlaneMachines := clusterv1.MachineList{} + g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed()) + g.Expect(controlPlaneMachines.Items).To(HaveLen(2)) + }) + + t.Run("does not scale down if preflight checks fail on any machine other than the one being deleted", func(t *testing.T) { + g := NewWithT(t) + + machines := map[string]*clusterv1.Machine{ + "one": machine("one", withTimestamp(time.Now().Add(-1*time.Minute))), + "two": machine("two", withTimestamp(time.Now())), + "three": machine("three", withTimestamp(time.Now())), + } + 
setMachineHealthy(machines["three"]) + fakeClient := newFakeClient(g, machines["one"], machines["two"], machines["three"]) r := &KubeadmControlPlaneReconciler{ recorder: record.NewFakeRecorder(32), @@ -232,7 +272,7 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing. controlPlaneMachines := clusterv1.MachineList{} g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed()) - g.Expect(controlPlaneMachines.Items).To(HaveLen(1)) + g.Expect(controlPlaneMachines.Items).To(HaveLen(3)) }) } @@ -350,31 +390,6 @@ func TestPreflightChecks(t *testing.T) { }, expectResult: ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, }, - { - name: "control plane with healthy machine conditions but with unhealthy kcp conditions should requeue", - kcp: &controlplanev1.KubeadmControlPlane{ - Status: controlplanev1.KubeadmControlPlaneStatus{ - Conditions: clusterv1.Conditions{ - *conditions.FalseCondition(controlplanev1.ControlPlaneComponentsHealthyCondition, "fooReason", clusterv1.ConditionSeverityError, ""), - *conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition), - }, - }, - }, - machines: []*clusterv1.Machine{ - { - Status: clusterv1.MachineStatus{ - Conditions: clusterv1.Conditions{ - *conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition), - *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), - *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), - *conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition), - *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition), - }, - }, - }, - }, - expectResult: ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, - }, { name: "control plane with an healthy machine and an healthy kcp condition should pass", kcp: &controlplanev1.KubeadmControlPlane{