Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 surface failed preflight checks on MachineSet in MachinesCreated condition #8669

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions api/v1beta1/condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ const (
// MachinesReadyCondition reports an aggregate of current status of the machines controlled by the MachineSet.
MachinesReadyCondition ConditionType = "MachinesReady"

// PreflightCheckFailedReason (Severity=Error) documents a MachineSet failing preflight checks
ykakarap marked this conversation as resolved.
Show resolved Hide resolved
// to create machine(s).
PreflightCheckFailedReason = "PreflightCheckFailed"

// BootstrapTemplateCloningFailedReason (Severity=Error) documents a MachineSet failing to
// clone the bootstrap template.
BootstrapTemplateCloningFailedReason = "BootstrapTemplateCloningFailed"
Expand Down
123 changes: 81 additions & 42 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,51 +296,13 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster,
filteredMachines = append(filteredMachines, machine)
}

// Remediate failed Machines by deleting them.
var errs []error
machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines))
for _, machine := range filteredMachines {
// filteredMachines contains machines in deleting status to calculate correct status.
// skip remediation for those in deleting status.
if !machine.DeletionTimestamp.IsZero() {
continue
}
if conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) {
machinesToRemediate = append(machinesToRemediate, machine)
}
}

result := ctrl.Result{}
if len(machinesToRemediate) > 0 {
preflightChecksResult, err := r.runPreflightChecks(ctx, cluster, machineSet, "Machine Remediation")
if err != nil {
return ctrl.Result{}, err
}
// Delete the machines only if the preflight checks have passed. Do not delete machines if we cannot
// guarantee creating new machines.
if preflightChecksResult.IsZero() {
for _, machine := range machinesToRemediate {
log.Info("Deleting machine because marked as unhealthy by the MachineHealthCheck controller")
patch := client.MergeFrom(machine.DeepCopy())
if err := r.Client.Delete(ctx, machine); err != nil {
errs = append(errs, errors.Wrap(err, "failed to delete"))
continue
}
conditions.MarkTrue(machine, clusterv1.MachineOwnerRemediatedCondition)
if err := r.Client.Status().Patch(ctx, machine, patch); err != nil && !apierrors.IsNotFound(err) {
errs = append(errs, errors.Wrap(err, "failed to update status"))
}
}
} else {
result = util.LowestNonZeroResult(result, preflightChecksResult)
}
}

err = kerrors.NewAggregate(errs)
reconcileUnhealthyMachinesResult, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, filteredMachines)
if err != nil {
log.Info("Failed while deleting unhealthy machines", "err", err)
return ctrl.Result{}, errors.Wrap(err, "failed to remediate machines")
return ctrl.Result{}, errors.Wrap(err, "failed to reconcile unhealthy machines")
}
result = util.LowestNonZeroResult(result, reconcileUnhealthyMachinesResult)

if err := r.syncMachines(ctx, machineSet, filteredMachines); err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to update Machines")
Expand Down Expand Up @@ -482,8 +444,13 @@ func (r *Reconciler) syncReplicas(ctx context.Context, cluster *clusterv1.Cluste
}
}

result, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up")
result, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up")
if err != nil || !result.IsZero() {
if err != nil {
// If the error is not nil use that as the message for the condition.
preflightCheckErrMessage = err.Error()
}
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage)
return result, err
}

Expand Down Expand Up @@ -991,6 +958,78 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus
return node, nil
}

func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) (ctrl.Result, error) {
log := ctrl.LoggerFrom(ctx)
// List all unhealthy machines.
machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines))
for _, m := range filteredMachines {
// filteredMachines contains machines in deleting status to calculate correct status.
// skip remediation for those in deleting status.
if !m.DeletionTimestamp.IsZero() {
continue
}
if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) {
machinesToRemediate = append(machinesToRemediate, m)
}
}

// If there are no machines to remediate return early.
if len(machinesToRemediate) == 0 {
return ctrl.Result{}, nil
}

preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation")
if err != nil {
// If err is not nil use that as the preflightCheckErrMessage
preflightCheckErrMessage = err.Error()
}

preflightChecksFailed := err != nil || !preflightChecksResult.IsZero()
if preflightChecksFailed {
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
// WaitingForRemediationReason reason.
var errs []error
for _, m := range machinesToRemediate {
patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
errs = append(errs, errors.Wrapf(err, "failed to create patch helper for Machine %s", klog.KObj(m)))
continue
}
conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, preflightCheckErrMessage)
if err := patchHelper.Patch(ctx, m); err != nil {
errs = append(errs, errors.Wrapf(err, "failed to patch Machine %s", klog.KObj(m)))
}
}

if len(errs) > 0 {
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch unhealthy Machines")
}
return preflightChecksResult, nil
}

// PreflightChecks passed, so it is safe to remediate unhealthy machines.
// Remediate unhealthy machines by deleting them.
var errs []error
for _, m := range machinesToRemediate {
log.Info(fmt.Sprintf("Deleting Machine %s because it was marked as unhealthy by the MachineHealthCheck controller", klog.KObj(m)))
patch := client.MergeFrom(m.DeepCopy())
if err := r.Client.Delete(ctx, m); err != nil {
errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m)))
continue
}
conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition)
if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) {
errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m)))
}
}

if len(errs) > 0 {
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines")
}

return ctrl.Result{}, nil
}

func reconcileExternalTemplateReference(ctx context.Context, c client.Client, cluster *clusterv1.Cluster, ref *corev1.ObjectReference) error {
if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) {
return nil
Expand Down
186 changes: 186 additions & 0 deletions internal/controllers/machineset/machineset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/client-go/tools/record"
utilfeature "k8s.io/component-base/featuregate/testing"
"k8s.io/utils/pointer"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/controllers/external"
"sigs.k8s.io/cluster-api/feature"
"sigs.k8s.io/cluster-api/internal/contract"
"sigs.k8s.io/cluster-api/internal/test/builder"
"sigs.k8s.io/cluster-api/internal/util/ssa"
Expand Down Expand Up @@ -1326,6 +1328,190 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) {
}, 5*time.Second).Should(Succeed())
}

func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
t.Run("should delete unhealthy machines if preflight checks pass", func(t *testing.T) {
defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)()

g := NewWithT(t)

controlPlaneStable := builder.ControlPlane("default", "cp1").
WithVersion("v1.26.2").
WithStatusFields(map[string]interface{}{
"status.version": "v1.26.2",
}).
Build()
cluster := &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "test-cluster",
Namespace: "default",
},
Spec: clusterv1.ClusterSpec{
ControlPlaneRef: contract.ObjToRef(controlPlaneStable),
},
}
machineSet := &clusterv1.MachineSet{}

unhealthyMachine := &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "unhealthy-machine",
Namespace: "default",
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
},
},
}
healthyMachine := &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "healthy-machine",
Namespace: "default",
},
}

machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine}

fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneStable, unhealthyMachine, healthyMachine).Build()
r := &Reconciler{Client: fakeClient}
_, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines)
g.Expect(err).To(BeNil())
// Verify the unhealthy machine is deleted.
m := &clusterv1.Machine{}
err = r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)
g.Expect(apierrors.IsNotFound(err)).To(BeTrue())
// Verify the healthy machine is not deleted.
m = &clusterv1.Machine{}
g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).Should(Succeed())
})

t.Run("should update the unhealthy machine MachineOwnerRemediated condition if preflight checks did not pass", func(t *testing.T) {
defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)()

g := NewWithT(t)

// An upgrading control plane should cause the preflight checks to not pass.
controlPlaneUpgrading := builder.ControlPlane("default", "cp1").
WithVersion("v1.26.2").
WithStatusFields(map[string]interface{}{
"status.version": "v1.25.2",
}).
Build()
cluster := &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "test-cluster",
Namespace: "default",
},
Spec: clusterv1.ClusterSpec{
ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading),
},
}
machineSet := &clusterv1.MachineSet{}

unhealthyMachine := &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "unhealthy-machine",
Namespace: "default",
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
},
},
}
healthyMachine := &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "healthy-machine",
Namespace: "default",
},
}

machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine}
fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneUpgrading, unhealthyMachine, healthyMachine).WithStatusSubresource(&clusterv1.Machine{}).Build()
r := &Reconciler{Client: fakeClient}
_, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines)
g.Expect(err).To(BeNil())

// Verify the unhealthy machine has the updated condition.
condition := clusterv1.MachineOwnerRemediatedCondition
m := &clusterv1.Machine{}
g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed())
g.Expect(conditions.Has(m, condition)).
To(BeTrue(), "Machine should have the %s condition set", condition)
machineOwnerRemediatedCondition := conditions.Get(m, condition)
g.Expect(machineOwnerRemediatedCondition.Status).
To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition)
g.Expect(machineOwnerRemediatedCondition.Reason).
To(Equal(clusterv1.WaitingForRemediationReason), "%s condition should have reason %s", condition, clusterv1.WaitingForRemediationReason)

// Verify the healthy machine continues to not have the MachineOwnerRemediated condition.
m = &clusterv1.Machine{}
g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
g.Expect(conditions.Has(m, condition)).
To(BeFalse(), "Machine should not have the %s condition set", condition)
})
}

func TestMachineSetReconciler_syncReplicas(t *testing.T) {
t.Run("should hold off on creating new machines when preflight checks do not pass", func(t *testing.T) {
defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)()

g := NewWithT(t)

// An upgrading control plane should cause the preflight checks to not pass.
controlPlaneUpgrading := builder.ControlPlane("default", "test-cp").
WithVersion("v1.26.2").
WithStatusFields(map[string]interface{}{
"status.version": "v1.25.2",
}).
Build()
cluster := &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "test-cluster",
Namespace: "default",
},
Spec: clusterv1.ClusterSpec{
ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading),
},
}
machineSet := &clusterv1.MachineSet{
ObjectMeta: metav1.ObjectMeta{
Name: "test-machineset",
Namespace: "default",
},
Spec: clusterv1.MachineSetSpec{
Replicas: pointer.Int32(1),
},
}

fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneUpgrading, machineSet).WithStatusSubresource(&clusterv1.MachineSet{}).Build()
r := &Reconciler{Client: fakeClient}
result, err := r.syncReplicas(ctx, cluster, machineSet, nil)
g.Expect(err).To(BeNil())
g.Expect(result.IsZero()).To(BeFalse(), "syncReplicas should not return a 'zero' result")

// Verify the proper condition is set on the MachineSet.
condition := clusterv1.MachinesCreatedCondition
g.Expect(conditions.Has(machineSet, condition)).
To(BeTrue(), "MachineSet should have the %s condition set", condition)
machinesCreatedCondition := conditions.Get(machineSet, condition)
g.Expect(machinesCreatedCondition.Status).
To(Equal(corev1.ConditionFalse), "%s condition status should be %s", condition, corev1.ConditionFalse)
g.Expect(machinesCreatedCondition.Reason).
To(Equal(clusterv1.PreflightCheckFailedReason), "%s condition reason should be %s", condition, clusterv1.PreflightCheckFailedReason)

// Verify no new Machines are created.
machineList := &clusterv1.MachineList{}
g.Expect(r.Client.List(ctx, machineList)).To(Succeed())
g.Expect(machineList.Items).To(BeEmpty(), "There should not be any machines")
})
}

func TestComputeDesiredMachine(t *testing.T) {
duration5s := &metav1.Duration{Duration: 5 * time.Second}
duration10s := &metav1.Duration{Duration: 10 * time.Second}
Expand Down
Loading