Skip to content

Commit

Permalink
🌱 Extend MS ScalingUp and Remediationg conditions to include prefligh…
Browse files Browse the repository at this point in the history
…t check errors (kubernetes-sigs#11390)

* Extend MS ScalingUp and Remediationg conditions to include preflight
check errors

Signed-off-by: Stefan Büringer [email protected]

* Always set message on OwnerRemediated condition

Signed-off-by: Stefan Büringer [email protected]

---------

Signed-off-by: Stefan Büringer [email protected]
  • Loading branch information
sbueringer authored Nov 8, 2024
1 parent 0f319c6 commit e13abea
Show file tree
Hide file tree
Showing 12 changed files with 196 additions and 117 deletions.
7 changes: 4 additions & 3 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,9 +315,10 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

v1beta2conditions.Set(machineToBeRemediated, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneMachineRemediationMachineDeletedV1Beta2Reason,
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneMachineRemediationMachineDeletedV1Beta2Reason,
Message: "Machine deletionTimestamp set",
})

// Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
Expand Down
52 changes: 27 additions & 25 deletions controlplane/kubeadm/internal/controllers/remediation_test.go

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions controlplane/kubeadm/internal/controllers/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,7 @@ func Test_setRemediatingCondition(t *testing.T) {
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneMachineRemediationMachineDeletedV1Beta2Reason, Message: "Machine deletionTimestamp set"}

tests := []struct {
name string
Expand Down Expand Up @@ -550,7 +550,7 @@ func Test_setRemediatingCondition(t *testing.T) {
Type: controlplanev1.KubeadmControlPlaneRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneRemediatingV1Beta2Reason,
Message: "Remediation in progress from Machine m3",
Message: "Machine deletionTimestamp set from Machine m3",
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1502,7 +1502,7 @@ func TestSetRemediatingCondition(t *testing.T) {
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, Message: "Machine deletionTimestamp set"}

tests := []struct {
name string
Expand Down Expand Up @@ -1550,7 +1550,7 @@ func TestSetRemediatingCondition(t *testing.T) {
Type: clusterv1.ClusterRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.ClusterRemediatingV1Beta2Reason,
Message: "Remediation in progress from Machine m3",
Message: "Machine deletionTimestamp set from Machine m3",
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ func Test_setRemediatingCondition(t *testing.T) {
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, Message: "Machine deletionTimestamp set"}

tests := []struct {
name string
Expand Down Expand Up @@ -932,7 +932,7 @@ func Test_setRemediatingCondition(t *testing.T) {
Type: clusterv1.MachineDeploymentRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeploymentRemediatingV1Beta2Reason,
Message: "Remediation in progress from Machine m3",
Message: "Machine deletionTimestamp set from Machine m3",
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -477,9 +477,10 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg

if ownerRemediatedCondition := v1beta2conditions.Get(t.Machine, clusterv1.MachineOwnerRemediatedV1Beta2Condition); ownerRemediatedCondition == nil || ownerRemediatedCondition.Status == metav1.ConditionTrue {
v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
})
}
}
Expand Down
11 changes: 8 additions & 3 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ type scope struct {
infrastructureObjectNotFound bool
getAndAdoptMachinesForMachineSetSucceeded bool
owningMachineDeployment *clusterv1.MachineDeployment
remediationPreflightCheckErrMessage string
scaleUpPreflightCheckErrMessage string
}

type machineSetReconcileFunc func(ctx context.Context, s *scope) (ctrl.Result, error)
Expand Down Expand Up @@ -602,6 +604,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e
// If the error is not nil use that as the message for the condition.
preflightCheckErrMessage = err.Error()
}
s.scaleUpPreflightCheckErrMessage = preflightCheckErrMessage
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage)
return result, err
}
Expand Down Expand Up @@ -1350,6 +1353,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
if preflightChecksFailed {
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
// WaitingForRemediationReason reason.
s.remediationPreflightCheckErrMessage = preflightCheckErrMessage
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Expand All @@ -1375,9 +1379,10 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
// Instead if we set the condition but the deletion does not go through on next reconcile either the
// condition will be fixed/updated or the Machine deletion will be retried.
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason,
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason,
Message: "Machine deletionTimestamp set",
}, &clusterv1.Condition{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionTrue,
Expand Down
29 changes: 19 additions & 10 deletions internal/controllers/machineset/machineset_controller_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package machineset
import (
"context"
"fmt"
"slices"
"sort"
"strings"
"time"
Expand Down Expand Up @@ -47,7 +48,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) {
// Conditions

// Update the ScalingUp and ScalingDown condition.
setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded)
setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded, s.scaleUpPreflightCheckErrMessage)
setScalingDownCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded)

// MachinesReady condition: aggregate the Machine's Ready condition.
Expand All @@ -59,7 +60,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) {
machines := collections.FromMachines(s.machines...)
machinesToBeRemediated := machines.Filter(collections.IsUnhealthyAndOwnerRemediated)
unhealthyMachines := machines.Filter(collections.IsUnhealthy)
setRemediatingCondition(ctx, s.machineSet, machinesToBeRemediated, unhealthyMachines, s.getAndAdoptMachinesForMachineSetSucceeded)
setRemediatingCondition(ctx, s.machineSet, machinesToBeRemediated, unhealthyMachines, s.getAndAdoptMachinesForMachineSetSucceeded, s.remediationPreflightCheckErrMessage)

setDeletingCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded)
}
Expand Down Expand Up @@ -92,7 +93,7 @@ func setReplicas(_ context.Context, ms *clusterv1.MachineSet, machines []*cluste
ms.Status.V1Beta2.UpToDateReplicas = ptr.To(upToDateReplicas)
}

func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool) {
func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool, scaleUpPreflightCheckErrMessage string) {
// If we got unexpected errors in listing the machines (this should never happen), surface them.
if !getAndAdoptMachinesForMachineSetSucceeded {
v1beta2conditions.Set(ms, metav1.Condition{
Expand Down Expand Up @@ -125,7 +126,7 @@ func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines
if currentReplicas >= desiredReplicas {
var message string
if missingReferencesMessage != "" {
message = fmt.Sprintf("Scaling up would be blocked %s", missingReferencesMessage)
message = fmt.Sprintf("Scaling up would be blocked because %s", missingReferencesMessage)
}
v1beta2conditions.Set(ms, metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Expand All @@ -138,8 +139,11 @@ func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines

// Scaling up.
message := fmt.Sprintf("Scaling up from %d to %d replicas", currentReplicas, desiredReplicas)
if missingReferencesMessage != "" {
message += fmt.Sprintf(" is blocked %s", missingReferencesMessage)
if missingReferencesMessage != "" || scaleUpPreflightCheckErrMessage != "" {
blockMessages := slices.DeleteFunc([]string{missingReferencesMessage, scaleUpPreflightCheckErrMessage}, func(s string) bool {
return s == ""
})
message += fmt.Sprintf(" is blocked because %s", strings.Join(blockMessages, " and "))
}
v1beta2conditions.Set(ms, metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Expand Down Expand Up @@ -281,7 +285,7 @@ func setMachinesUpToDateCondition(ctx context.Context, machineSet *clusterv1.Mac
v1beta2conditions.Set(machineSet, *upToDateCondition)
}

func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machinesToBeRemediated, unhealthyMachines collections.Machines, getAndAdoptMachinesForMachineSetSucceeded bool) {
func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machinesToBeRemediated, unhealthyMachines collections.Machines, getAndAdoptMachinesForMachineSetSucceeded bool, remediationPreflightCheckErrMessage string) {
if !getAndAdoptMachinesForMachineSetSucceeded {
v1beta2conditions.Set(machineSet, metav1.Condition{
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
Expand Down Expand Up @@ -320,11 +324,16 @@ func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineS
return
}

msg := remediatingCondition.Message
if remediationPreflightCheckErrMessage != "" {
msg = fmt.Sprintf("Triggering further remediations is blocked because %s; %s", remediationPreflightCheckErrMessage, remediatingCondition.Message)
}

v1beta2conditions.Set(machineSet, metav1.Condition{
Type: remediatingCondition.Type,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
Message: remediatingCondition.Message,
Message: msg,
})
}

Expand Down Expand Up @@ -383,10 +392,10 @@ func calculateMissingReferencesMessage(ms *clusterv1.MachineSet, bootstrapTempla
}

if len(missingObjects) == 1 {
return fmt.Sprintf("because %s does not exist", missingObjects[0])
return fmt.Sprintf("%s does not exist", missingObjects[0])
}

return fmt.Sprintf("because %s do not exist", strings.Join(missingObjects, " and "))
return fmt.Sprintf("%s do not exist", strings.Join(missingObjects, " and "))
}

func aggregateStaleMachines(machines []*clusterv1.Machine) string {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func Test_setScalingUpCondition(t *testing.T) {
bootstrapObjectNotFound bool
infrastructureObjectNotFound bool
getAndAdoptMachinesForMachineSetSucceeded bool
scaleUpPreflightCheckErrMessage string
expectCondition metav1.Condition
}{
{
Expand Down Expand Up @@ -299,6 +300,27 @@ func Test_setScalingUpCondition(t *testing.T) {
Message: "Scaling up from 0 to 3 replicas is blocked because DockerMachineTemplate does not exist",
},
},
{
name: "scaling up and blocked by bootstrap and infrastructure object and preflight checks",
ms: scalingUpMachineSetWith3Replicas,
bootstrapObjectNotFound: true,
infrastructureObjectNotFound: true,
getAndAdoptMachinesForMachineSetSucceeded: true,
// This preflight check error can happen when a MachineSet is scaling up while the control plane
// already has a newer Kubernetes version.
scaleUpPreflightCheckErrMessage: "MachineSet version (1.25.5) and ControlPlane version (1.26.2) " +
"do not conform to kubeadm version skew policy as kubeadm only supports joining with the same " +
"major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)",
expectCondition: metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetScalingUpV1Beta2Reason,
Message: "Scaling up from 0 to 3 replicas is blocked because KubeadmBootstrapTemplate and DockerMachineTemplate " +
"do not exist and MachineSet version (1.25.5) and ControlPlane version (1.26.2) " +
"do not conform to kubeadm version skew policy as kubeadm only supports joining with the same " +
"major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)",
},
},
{
name: "deleting",
ms: deletingMachineSetWith3Replicas,
Expand All @@ -317,7 +339,7 @@ func Test_setScalingUpCondition(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded)
setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded, tt.scaleUpPreflightCheckErrMessage)

condition := v1beta2conditions.Get(tt.ms, clusterv1.MachineSetScalingUpV1Beta2Condition)
g.Expect(condition).ToNot(BeNil())
Expand Down Expand Up @@ -758,13 +780,15 @@ func Test_setRemediatingCondition(t *testing.T) {
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, Message: "Machine deletionTimestamp set"}
ownerRemediatedWaitingForRemediationV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, Message: "Waiting for remediation"}

tests := []struct {
name string
machineSet *clusterv1.MachineSet
machines []*clusterv1.Machine
getAndAdoptMachinesForMachineSetSucceeded bool
remediationPreflightCheckErrMessage string
expectCondition metav1.Condition
}{
{
Expand Down Expand Up @@ -806,7 +830,27 @@ func Test_setRemediatingCondition(t *testing.T) {
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
Message: "Remediation in progress from Machine m3",
Message: "Machine deletionTimestamp set from Machine m3",
},
},
{
name: "With machines to be remediated by MS and preflight check error",
machineSet: &clusterv1.MachineSet{},
machines: []*clusterv1.Machine{
fakeMachine("m1", withConditions(healthCheckSucceeded)), // Healthy machine
fakeMachine("m2", withConditions(healthCheckNotSucceeded)), // Unhealthy machine, not yet marked for remediation
fakeMachine("m3", withConditions(healthCheckNotSucceeded, ownerRemediated), withV1Beta2Condition(ownerRemediatedV1Beta2)),
fakeMachine("m4", withConditions(healthCheckNotSucceeded, ownerRemediated), withV1Beta2Condition(ownerRemediatedWaitingForRemediationV1Beta2)),
},
getAndAdoptMachinesForMachineSetSucceeded: true,
// This preflight check error can happen when a Machine becomes unhealthy while the control plane is upgrading.
remediationPreflightCheckErrMessage: "KubeadmControlPlane ns1/cp1 is upgrading (\"ControlPlaneIsStable\" preflight check failed)",
expectCondition: metav1.Condition{
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
Message: "Triggering further remediations is blocked because KubeadmControlPlane ns1/cp1 is upgrading (\"ControlPlaneIsStable\" preflight check failed); " +
"Machine deletionTimestamp set from Machine m3; Waiting for remediation from Machine m4",
},
},
{
Expand Down Expand Up @@ -852,7 +896,7 @@ func Test_setRemediatingCondition(t *testing.T) {
machinesToBeRemediated = machines.Filter(collections.IsUnhealthyAndOwnerRemediated)
unHealthyMachines = machines.Filter(collections.IsUnhealthy)
}
setRemediatingCondition(ctx, tt.machineSet, machinesToBeRemediated, unHealthyMachines, tt.getAndAdoptMachinesForMachineSetSucceeded)
setRemediatingCondition(ctx, tt.machineSet, machinesToBeRemediated, unHealthyMachines, tt.getAndAdoptMachinesForMachineSetSucceeded, tt.remediationPreflightCheckErrMessage)

condition := v1beta2conditions.Get(tt.machineSet, clusterv1.MachineSetRemediatingV1Beta2Condition)
g.Expect(condition).ToNot(BeNil())
Expand Down
Loading

0 comments on commit e13abea

Please sign in to comment.