From 031cb49787d5b2696ef4b7736e88a569ce020e35 Mon Sep 17 00:00:00 2001 From: Yuvaraj Kakaraparthi Date: Sat, 20 May 2023 00:59:15 -0700 Subject: [PATCH] MachinesCreated condition on MachineSet should show failed preflight checks --- api/v1beta1/condition_consts.go | 4 + .../machineset/machineset_controller.go | 123 ++++++++---- .../machineset/machineset_controller_test.go | 186 ++++++++++++++++++ .../machineset/machineset_preflight.go | 27 +-- .../machineset/machineset_preflight_test.go | 4 +- 5 files changed, 287 insertions(+), 57 deletions(-) diff --git a/api/v1beta1/condition_consts.go b/api/v1beta1/condition_consts.go index c0c5de162efa..1596e2e5709f 100644 --- a/api/v1beta1/condition_consts.go +++ b/api/v1beta1/condition_consts.go @@ -243,6 +243,10 @@ const ( // MachinesReadyCondition reports an aggregate of current status of the machines controlled by the MachineSet. MachinesReadyCondition ConditionType = "MachinesReady" + // PreflightCheckFailedReason (Severity=Error) documents a MachineSet failing preflight checks + // to create machine(s). + PreflightCheckFailedReason = "PreflightCheckFailed" + // BootstrapTemplateCloningFailedReason (Severity=Error) documents a MachineSet failing to // clone the bootstrap template. BootstrapTemplateCloningFailedReason = "BootstrapTemplateCloningFailed" diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index 4a3e76ae7693..a7ad91fd5664 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -296,51 +296,13 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, filteredMachines = append(filteredMachines, machine) } - // Remediate failed Machines by deleting them. - var errs []error - machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines)) - for _, machine := range filteredMachines { - // filteredMachines contains machines in deleting status to calculate correct status. - // skip remediation for those in deleting status. - if !machine.DeletionTimestamp.IsZero() { - continue - } - if conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) { - machinesToRemediate = append(machinesToRemediate, machine) - } - } - result := ctrl.Result{} - if len(machinesToRemediate) > 0 { - preflightChecksResult, err := r.runPreflightChecks(ctx, cluster, machineSet, "Machine Remediation") - if err != nil { - return ctrl.Result{}, err - } - // Delete the machines only if the preflight checks have passed. Do not delete machines if we cannot - // guarantee creating new machines. - if preflightChecksResult.IsZero() { - for _, machine := range machinesToRemediate { - log.Info("Deleting machine because marked as unhealthy by the MachineHealthCheck controller") - patch := client.MergeFrom(machine.DeepCopy()) - if err := r.Client.Delete(ctx, machine); err != nil { - errs = append(errs, errors.Wrap(err, "failed to delete")) - continue - } - conditions.MarkTrue(machine, clusterv1.MachineOwnerRemediatedCondition) - if err := r.Client.Status().Patch(ctx, machine, patch); err != nil && !apierrors.IsNotFound(err) { - errs = append(errs, errors.Wrap(err, "failed to update status")) - } - } - } else { - result = util.LowestNonZeroResult(result, preflightChecksResult) - } - } - err = kerrors.NewAggregate(errs) + reconcileUnhealthyMachinesResult, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, filteredMachines) if err != nil { - log.Info("Failed while deleting unhealthy machines", "err", err) - return ctrl.Result{}, errors.Wrap(err, "failed to remediate machines") + return ctrl.Result{}, errors.Wrap(err, "failed to reconcile unhealthy machines") } + result = util.LowestNonZeroResult(result, reconcileUnhealthyMachinesResult) if err := r.syncMachines(ctx, machineSet, filteredMachines); err != nil { return ctrl.Result{}, errors.Wrap(err, "failed to update Machines") @@ -482,8 +444,13 @@ func (r *Reconciler) syncReplicas(ctx context.Context, cluster *clusterv1.Cluste } } - result, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up") + result, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up") if err != nil || !result.IsZero() { + if err != nil { + // If the error is not nil use that as the message for the condition. + preflightCheckErrMessage = err.Error() + } + conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage) return result, err } @@ -991,6 +958,78 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus return node, nil } +func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + // List all unhealthy machines. + machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines)) + for _, m := range filteredMachines { + // filteredMachines contains machines in deleting status to calculate correct status. + // skip remediation for those in deleting status. + if !m.DeletionTimestamp.IsZero() { + continue + } + if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) { + machinesToRemediate = append(machinesToRemediate, m) + } + } + + // If there are no machines to remediate return early. + if len(machinesToRemediate) == 0 { + return ctrl.Result{}, nil + } + + preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation") + if err != nil { + // If err is not nil use that as the preflightCheckErrMessage + preflightCheckErrMessage = err.Error() + } + + preflightChecksFailed := err != nil || !preflightChecksResult.IsZero() + if preflightChecksFailed { + // PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with + // WaitingForRemediationReason reason. + var errs []error + for _, m := range machinesToRemediate { + patchHelper, err := patch.NewHelper(m, r.Client) + if err != nil { + errs = append(errs, errors.Wrapf(err, "failed to create patch helper for Machine %s", klog.KObj(m))) + continue + } + conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, preflightCheckErrMessage) + if err := patchHelper.Patch(ctx, m); err != nil { + errs = append(errs, errors.Wrapf(err, "failed to patch Machine %s", klog.KObj(m))) + } + } + + if len(errs) > 0 { + return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch unhealthy Machines") + } + return preflightChecksResult, nil + } + + // PreflightChecks passed, so it is safe to remediate unhealthy machines. + // Remediate unhealthy machines by deleting them. + var errs []error + for _, m := range machinesToRemediate { + log.Info(fmt.Sprintf("Deleting Machine %s because it was marked as unhealthy by the MachineHealthCheck controller", klog.KObj(m))) + patch := client.MergeFrom(m.DeepCopy()) + if err := r.Client.Delete(ctx, m); err != nil { + errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m))) + continue + } + conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition) + if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) { + errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m))) + } + } + + if len(errs) > 0 { + return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines") + } + + return ctrl.Result{}, nil +} + func reconcileExternalTemplateReference(ctx context.Context, c client.Client, cluster *clusterv1.Cluster, ref *corev1.ObjectReference) error { if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) { return nil diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go index e4519dd89f45..9888f8fc6946 100644 --- a/internal/controllers/machineset/machineset_controller_test.go +++ b/internal/controllers/machineset/machineset_controller_test.go @@ -26,6 +26,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/client-go/tools/record" + utilfeature "k8s.io/component-base/featuregate/testing" "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -33,6 +34,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/controllers/external" + "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/contract" "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/internal/util/ssa" @@ -1326,6 +1328,190 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) { }, 5*time.Second).Should(Succeed()) } +func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { + t.Run("should delete unhealthy machines if preflight checks pass", func(t *testing.T) { + defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)() + + g := NewWithT(t) + + controlPlaneStable := builder.ControlPlane("default", "cp1"). + WithVersion("v1.26.2"). + WithStatusFields(map[string]interface{}{ + "status.version": "v1.26.2", + }). + Build() + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + } + machineSet := &clusterv1.MachineSet{} + + unhealthyMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "unhealthy-machine", + Namespace: "default", + }, + Status: clusterv1.MachineStatus{ + Conditions: []clusterv1.Condition{ + { + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + }, + }, + }, + } + healthyMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "healthy-machine", + Namespace: "default", + }, + } + + machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine} + + fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneStable, unhealthyMachine, healthyMachine).Build() + r := &Reconciler{Client: fakeClient} + _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines) + g.Expect(err).To(BeNil()) + // Verify the unhealthy machine is deleted. + m := &clusterv1.Machine{} + err = r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m) + g.Expect(apierrors.IsNotFound(err)).To(BeTrue()) + // Verify the healthy machine is not deleted. + m = &clusterv1.Machine{} + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).Should(Succeed()) + }) + + t.Run("should update the unhealthy machine MachineOwnerRemediated condition if preflight checks did not pass", func(t *testing.T) { + defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)() + + g := NewWithT(t) + + // An upgrading control plane should cause the preflight checks to not pass. + controlPlaneUpgrading := builder.ControlPlane("default", "cp1"). + WithVersion("v1.26.2"). + WithStatusFields(map[string]interface{}{ + "status.version": "v1.25.2", + }). + Build() + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading), + }, + } + machineSet := &clusterv1.MachineSet{} + + unhealthyMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "unhealthy-machine", + Namespace: "default", + }, + Status: clusterv1.MachineStatus{ + Conditions: []clusterv1.Condition{ + { + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + }, + }, + }, + } + healthyMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "healthy-machine", + Namespace: "default", + }, + } + + machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine} + fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneUpgrading, unhealthyMachine, healthyMachine).WithStatusSubresource(&clusterv1.Machine{}).Build() + r := &Reconciler{Client: fakeClient} + _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines) + g.Expect(err).To(BeNil()) + + // Verify the unhealthy machine has the updated condition. + condition := clusterv1.MachineOwnerRemediatedCondition + m := &clusterv1.Machine{} + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeTrue(), "Machine should have the %s condition set", condition) + machineOwnerRemediatedCondition := conditions.Get(m, condition) + g.Expect(machineOwnerRemediatedCondition.Status). + To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition) + g.Expect(machineOwnerRemediatedCondition.Reason). + To(Equal(clusterv1.WaitingForRemediationReason), "%s condition should have reason %s", condition, clusterv1.WaitingForRemediationReason) + + // Verify the healthy machine continues to not have the MachineOwnerRemediated condition. + m = &clusterv1.Machine{} + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeFalse(), "Machine should not have the %s condition set", condition) + }) +} + +func TestMachineSetReconciler_syncReplicas(t *testing.T) { + t.Run("should hold off on creating new machines when preflight checks do not pass", func(t *testing.T) { + defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)() + + g := NewWithT(t) + + // An upgrading control plane should cause the preflight checks to not pass. + controlPlaneUpgrading := builder.ControlPlane("default", "test-cp"). + WithVersion("v1.26.2"). + WithStatusFields(map[string]interface{}{ + "status.version": "v1.25.2", + }). + Build() + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading), + }, + } + machineSet := &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machineset", + Namespace: "default", + }, + Spec: clusterv1.MachineSetSpec{ + Replicas: pointer.Int32(1), + }, + } + + fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneUpgrading, machineSet).WithStatusSubresource(&clusterv1.MachineSet{}).Build() + r := &Reconciler{Client: fakeClient} + result, err := r.syncReplicas(ctx, cluster, machineSet, nil) + g.Expect(err).To(BeNil()) + g.Expect(result.IsZero()).To(BeFalse(), "syncReplicas should not return a 'zero' result") + + // Verify the proper condition is set on the MachineSet. + condition := clusterv1.MachinesCreatedCondition + g.Expect(conditions.Has(machineSet, condition)). + To(BeTrue(), "MachineSet should have the %s condition set", condition) + machinesCreatedCondition := conditions.Get(machineSet, condition) + g.Expect(machinesCreatedCondition.Status). + To(Equal(corev1.ConditionFalse), "%s condition status should be %s", condition, corev1.ConditionFalse) + g.Expect(machinesCreatedCondition.Reason). + To(Equal(clusterv1.PreflightCheckFailedReason), "%s condition reason should be %s", condition, clusterv1.PreflightCheckFailedReason) + + // Verify no new Machines are created. + machineList := &clusterv1.MachineList{} + g.Expect(r.Client.List(ctx, machineList)).To(Succeed()) + g.Expect(machineList.Items).To(BeEmpty(), "There should not be any machines") + }) +} + func TestComputeDesiredMachine(t *testing.T) { duration5s := &metav1.Duration{Duration: 5 * time.Second} duration10s := &metav1.Duration{Duration: 10 * time.Second} diff --git a/internal/controllers/machineset/machineset_preflight.go b/internal/controllers/machineset/machineset_preflight.go index 582413c7c306..127a534099d0 100644 --- a/internal/controllers/machineset/machineset_preflight.go +++ b/internal/controllers/machineset/machineset_preflight.go @@ -45,28 +45,28 @@ type preflightCheckErrorMessage *string // the preflight checks fail. const preflightFailedRequeueAfter = 15 * time.Second -func (r *Reconciler) runPreflightChecks(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, action string) (ctrl.Result, error) { +func (r *Reconciler) runPreflightChecks(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, action string) (_ ctrl.Result, message string, retErr error) { log := ctrl.LoggerFrom(ctx) // If the MachineSetPreflightChecks feature gate is disabled return early. if !feature.Gates.Enabled(feature.MachineSetPreflightChecks) { - return ctrl.Result{}, nil + return ctrl.Result{}, "", nil } skipped := skippedPreflightChecks(ms) // If all the preflight checks are skipped then return early. if skipped.Has(clusterv1.MachineSetPreflightCheckAll) { - return ctrl.Result{}, nil + return ctrl.Result{}, "", nil } // If the cluster does not have a control plane reference then there is nothing to do. Return early. if cluster.Spec.ControlPlaneRef == nil { - return ctrl.Result{}, nil + return ctrl.Result{}, "", nil } // Get the control plane object. controlPlane, err := external.Get(ctx, r.Client, cluster.Spec.ControlPlaneRef, cluster.Namespace) if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to get ControlPlane %s", action, klog.KRef(cluster.Spec.ControlPlaneRef.Namespace, cluster.Spec.ControlPlaneRef.Name)) + return ctrl.Result{}, "", errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to get ControlPlane %s", action, klog.KRef(cluster.Spec.ControlPlaneRef.Namespace, cluster.Spec.ControlPlaneRef.Name)) } cpKlogRef := klog.KRef(controlPlane.GetNamespace(), controlPlane.GetName()) @@ -76,13 +76,13 @@ func (r *Reconciler) runPreflightChecks(ctx context.Context, cluster *clusterv1. cpVersion, err := contract.ControlPlane().Version().Get(controlPlane) if err != nil { if errors.Is(err, contract.ErrFieldNotFound) { - return ctrl.Result{}, nil + return ctrl.Result{}, "", nil } - return ctrl.Result{}, errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to get the version of ControlPlane %s", action, cpKlogRef) + return ctrl.Result{}, "", errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to get the version of ControlPlane %s", action, cpKlogRef) } cpSemver, err := semver.ParseTolerant(*cpVersion) if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to parse version %q of ControlPlane %s", action, *cpVersion, cpKlogRef) + return ctrl.Result{}, "", errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to parse version %q of ControlPlane %s", action, *cpVersion, cpKlogRef) } errList := []error{} @@ -103,7 +103,7 @@ func (r *Reconciler) runPreflightChecks(ctx context.Context, cluster *clusterv1. msVersion := *ms.Spec.Template.Spec.Version msSemver, err := semver.ParseTolerant(msVersion) if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to parse version %q of MachineSet %s", action, msVersion, klog.KObj(ms)) + return ctrl.Result{}, "", errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to parse version %q of MachineSet %s", action, msVersion, klog.KObj(ms)) } // Run the kubernetes-version skew preflight check. @@ -127,17 +127,18 @@ func (r *Reconciler) runPreflightChecks(ctx context.Context, cluster *clusterv1. } if len(errList) > 0 { - return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errList), "failed to perform %q: failed to perform preflight checks", action) + return ctrl.Result{}, "", errors.Wrapf(kerrors.NewAggregate(errList), "failed to perform %q: failed to perform preflight checks", action) } if len(preflightCheckErrs) > 0 { preflightCheckErrStrings := []string{} for _, v := range preflightCheckErrs { preflightCheckErrStrings = append(preflightCheckErrStrings, *v) } - log.Info(fmt.Sprintf("Performing %q on hold because %s. The operation will continue after the preflight check(s) pass", action, strings.Join(preflightCheckErrStrings, "; "))) - return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil + msg := fmt.Sprintf("Performing %q on hold because %s. The operation will continue after the preflight check(s) pass", action, strings.Join(preflightCheckErrStrings, "; ")) + log.Info(msg) + return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, msg, nil } - return ctrl.Result{}, nil + return ctrl.Result{}, "", nil } func (r *Reconciler) controlPlaneStablePreflightCheck(controlPlane *unstructured.Unstructured) (preflightCheckErrorMessage, error) { diff --git a/internal/controllers/machineset/machineset_preflight_test.go b/internal/controllers/machineset/machineset_preflight_test.go index 45adfb9d5f78..de6062e556d8 100644 --- a/internal/controllers/machineset/machineset_preflight_test.go +++ b/internal/controllers/machineset/machineset_preflight_test.go @@ -499,7 +499,7 @@ func TestMachineSetReconciler_runPreflightChecks(t *testing.T) { } fakeClient := fake.NewClientBuilder().WithObjects(objs...).Build() r := &Reconciler{Client: fakeClient} - result, err := r.runPreflightChecks(ctx, tt.cluster, tt.machineSet, "") + result, _, err := r.runPreflightChecks(ctx, tt.cluster, tt.machineSet, "") if tt.wantErr { g.Expect(err).To(HaveOccurred()) } else { @@ -541,7 +541,7 @@ func TestMachineSetReconciler_runPreflightChecks(t *testing.T) { } fakeClient := fake.NewClientBuilder().WithObjects(controlPlane).Build() r := &Reconciler{Client: fakeClient} - result, err := r.runPreflightChecks(ctx, cluster, machineSet, "") + result, _, err := r.runPreflightChecks(ctx, cluster, machineSet, "") g.Expect(err).NotTo(HaveOccurred()) g.Expect(result.IsZero()).To(BeTrue()) })