diff --git a/api/v1alpha3/conversion.go b/api/v1alpha3/conversion.go index f8129ac1f299..87b9251b61ea 100644 --- a/api/v1alpha3/conversion.go +++ b/api/v1alpha3/conversion.go @@ -152,13 +152,36 @@ func (dst *MachineDeploymentList) ConvertFrom(srcRaw conversion.Hub) error { func (src *MachineHealthCheck) ConvertTo(dstRaw conversion.Hub) error { dst := dstRaw.(*v1alpha4.MachineHealthCheck) - return Convert_v1alpha3_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(src, dst, nil) + if err := Convert_v1alpha3_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(src, dst, nil); err != nil { + return err + } + + // Manually restore data. + restored := &v1alpha4.MachineHealthCheck{} + if ok, err := utilconversion.UnmarshalData(src, restored); err != nil || !ok { + return err + } + + if restored.Spec.UnhealthyRange != nil { + dst.Spec.UnhealthyRange = restored.Spec.UnhealthyRange + } + + return nil } func (dst *MachineHealthCheck) ConvertFrom(srcRaw conversion.Hub) error { src := srcRaw.(*v1alpha4.MachineHealthCheck) - return Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(src, dst, nil) + if err := Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(src, dst, nil); err != nil { + return err + } + + // Preserve Hub data on down-conversion except for metadata + if err := utilconversion.MarshalData(src, dst); err != nil { + return err + } + + return nil } func (src *MachineHealthCheckList) ConvertTo(dstRaw conversion.Hub) error { @@ -181,3 +204,7 @@ func Convert_v1alpha3_Bootstrap_To_v1alpha4_Bootstrap(in *Bootstrap, out *v1alph func Convert_v1alpha4_MachineRollingUpdateDeployment_To_v1alpha3_MachineRollingUpdateDeployment(in *v1alpha4.MachineRollingUpdateDeployment, out *MachineRollingUpdateDeployment, s apiconversion.Scope) error { return autoConvert_v1alpha4_MachineRollingUpdateDeployment_To_v1alpha3_MachineRollingUpdateDeployment(in, out, s) } + +func Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in *v1alpha4.MachineHealthCheckSpec, out *MachineHealthCheckSpec, s apiconversion.Scope) error { + return autoConvert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in, out, s) +} diff --git a/api/v1alpha3/conversion_test.go b/api/v1alpha3/conversion_test.go index 1e0c2de3941e..d3827160fc04 100644 --- a/api/v1alpha3/conversion_test.go +++ b/api/v1alpha3/conversion_test.go @@ -36,4 +36,5 @@ func TestFuzzyConversion(t *testing.T) { t.Run("for Machine", utilconversion.FuzzTestFunc(scheme, &v1alpha4.Machine{}, &Machine{})) t.Run("for MachineSet", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineSet{}, &MachineSet{})) t.Run("for MachineDeployment", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineDeployment{}, &MachineDeployment{})) + t.Run("for MachineHealthCheckSpec", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineHealthCheck{}, &MachineHealthCheck{})) } diff --git a/api/v1alpha3/zz_generated.conversion.go b/api/v1alpha3/zz_generated.conversion.go index 0f00332f99b7..bf58367fe46c 100644 --- a/api/v1alpha3/zz_generated.conversion.go +++ b/api/v1alpha3/zz_generated.conversion.go @@ -892,7 +892,17 @@ func Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(in *v1al func autoConvert_v1alpha3_MachineHealthCheckList_To_v1alpha4_MachineHealthCheckList(in *MachineHealthCheckList, out *v1alpha4.MachineHealthCheckList, s conversion.Scope) error { out.ListMeta = in.ListMeta - out.Items = *(*[]v1alpha4.MachineHealthCheck)(unsafe.Pointer(&in.Items)) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]v1alpha4.MachineHealthCheck, len(*in)) + for i := range *in { + if err := Convert_v1alpha3_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(&(*in)[i], &(*out)[i], s); err != nil { + return err + } + } + } else { + out.Items = nil + } return nil } @@ -903,7 +913,17 @@ func Convert_v1alpha3_MachineHealthCheckList_To_v1alpha4_MachineHealthCheckList( func autoConvert_v1alpha4_MachineHealthCheckList_To_v1alpha3_MachineHealthCheckList(in *v1alpha4.MachineHealthCheckList, out *MachineHealthCheckList, s conversion.Scope) error { out.ListMeta = in.ListMeta - out.Items = *(*[]MachineHealthCheck)(unsafe.Pointer(&in.Items)) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]MachineHealthCheck, len(*in)) + for i := range *in { + if err := Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(&(*in)[i], &(*out)[i], s); err != nil { + return err + } + } + } else { + out.Items = nil + } return nil } @@ -932,16 +952,12 @@ func autoConvert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckS out.Selector = in.Selector out.UnhealthyConditions = *(*[]UnhealthyCondition)(unsafe.Pointer(&in.UnhealthyConditions)) out.MaxUnhealthy = (*intstr.IntOrString)(unsafe.Pointer(in.MaxUnhealthy)) + // WARNING: in.UnhealthyRange requires manual conversion: does not exist in peer-type out.NodeStartupTimeout = (*metav1.Duration)(unsafe.Pointer(in.NodeStartupTimeout)) out.RemediationTemplate = (*v1.ObjectReference)(unsafe.Pointer(in.RemediationTemplate)) return nil } -// Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec is an autogenerated conversion function. -func Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in *v1alpha4.MachineHealthCheckSpec, out *MachineHealthCheckSpec, s conversion.Scope) error { - return autoConvert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in, out, s) -} - func autoConvert_v1alpha3_MachineHealthCheckStatus_To_v1alpha4_MachineHealthCheckStatus(in *MachineHealthCheckStatus, out *v1alpha4.MachineHealthCheckStatus, s conversion.Scope) error { out.ExpectedMachines = in.ExpectedMachines out.CurrentHealthy = in.CurrentHealthy diff --git a/api/v1alpha4/machinehealthcheck_types.go b/api/v1alpha4/machinehealthcheck_types.go index d34bce9c1be3..946db1c8bd73 100644 --- a/api/v1alpha4/machinehealthcheck_types.go +++ b/api/v1alpha4/machinehealthcheck_types.go @@ -45,6 +45,12 @@ type MachineHealthCheckSpec struct { // +optional MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"` + // Any further remediation is only allowed if the number of machines selected by "selector" as not healthy + // is within the range of "UnhealthyRange" + // +optional + // +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$ + UnhealthyRange *string `json:"unhealthyRange,omitempty"` + // Machines older than this duration without a node will be considered to have // failed and will be remediated. // +optional diff --git a/api/v1alpha4/zz_generated.deepcopy.go b/api/v1alpha4/zz_generated.deepcopy.go index 1dd174f68a4d..93bf447a53aa 100644 --- a/api/v1alpha4/zz_generated.deepcopy.go +++ b/api/v1alpha4/zz_generated.deepcopy.go @@ -577,6 +577,11 @@ func (in *MachineHealthCheckSpec) DeepCopyInto(out *MachineHealthCheckSpec) { *out = new(intstr.IntOrString) **out = **in } + if in.UnhealthyRange != nil { + in, out := &in.UnhealthyRange, &out.UnhealthyRange + *out = new(string) + **out = **in + } if in.NodeStartupTimeout != nil { in, out := &in.NodeStartupTimeout, &out.NodeStartupTimeout *out = new(metav1.Duration) diff --git a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml index fba233ec6432..ece74c449ef3 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml @@ -322,6 +322,10 @@ spec: type: object minItems: 1 type: array + unhealthyRange: + description: Any further remediation is only allowed if the number of machines selected by "selector" as not healthy is within the range of "UnhealthyRange" + pattern: ^\[[0-9]+-[0-9]+\]$ + type: string required: - clusterName - selector diff --git a/controllers/machinehealthcheck_controller.go b/controllers/machinehealthcheck_controller.go index e07f32e4d008..71d168774a91 100644 --- a/controllers/machinehealthcheck_controller.go +++ b/controllers/machinehealthcheck_controller.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "sort" + "strconv" "strings" "time" @@ -211,19 +212,39 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, m.Spec.NodeStartupTimeout.Duration) m.Status.CurrentHealthy = int32(len(healthy)) + var unhealthyLimitKey, unhealthyLimitValue interface{} + // check MHC current health against MaxUnhealthy - if !isAllowedRemediation(m) { + remediationAllowed, remediationCount, err := isAllowedRemediation(m) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "error checking if remediation is allowed") + } + + if !remediationAllowed { + var message string + + if m.Spec.UnhealthyRange == nil { + unhealthyLimitKey = "max unhealthy" + unhealthyLimitValue = m.Spec.MaxUnhealthy + message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)", + totalTargets, + len(unhealthy), + m.Spec.MaxUnhealthy) + } else { + unhealthyLimitKey = "unhealthy range" + unhealthyLimitValue = *m.Spec.UnhealthyRange + message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: %v, unhealthy: %v, unhealthyRange: %v)", + totalTargets, + len(unhealthy), + *m.Spec.UnhealthyRange) + } + logger.V(3).Info( "Short-circuiting remediation", "total target", totalTargets, - "max unhealthy", m.Spec.MaxUnhealthy, + unhealthyLimitKey, unhealthyLimitValue, "unhealthy targets", len(unhealthy), ) - message := fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)", - totalTargets, - len(unhealthy), - m.Spec.MaxUnhealthy, - ) // Remediation not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy m.Status.RemediationsAllowed = 0 @@ -257,17 +278,12 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log logger.V(3).Info( "Remediations are allowed", "total target", totalTargets, - "max unhealthy", m.Spec.MaxUnhealthy, + unhealthyLimitKey, unhealthyLimitValue, "unhealthy targets", len(unhealthy), ) - maxUnhealthy, err := getMaxUnhealthy(m) - if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "Failed to get value for maxUnhealthy") - } - // Remediation is allowed so maxUnhealthy - unhealthyMachineCount >= 0 - m.Status.RemediationsAllowed = int32(maxUnhealthy - unhealthyMachineCount(m)) + m.Status.RemediationsAllowed = remediationCount conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition) errList := r.PatchUnhealthyTargets(ctx, logger, unhealthy, cluster, m) @@ -517,21 +533,52 @@ func (r *MachineHealthCheckReconciler) watchClusterNodes(ctx context.Context, cl } // isAllowedRemediation checks the value of the MaxUnhealthy field to determine -// whether remediation should be allowed or not -func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) bool { - // TODO(JoelSpeed): return an error from isAllowedRemediation when maxUnhealthy - // is nil, we expect it to be defaulted always. - if mhc.Spec.MaxUnhealthy == nil { - return true +// returns whether remediation should be allowed or not, the remediation count, and error if any +func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) (bool, int32, error) { + var remediationAllowed bool + var remediationCount int32 + if mhc.Spec.UnhealthyRange != nil { + min, max, err := getUnhealthyRange(mhc) + if err != nil { + return false, 0, err + } + unhealthyMachineCount := unhealthyMachineCount(mhc) + remediationAllowed = unhealthyMachineCount >= min && unhealthyMachineCount <= max + remediationCount = int32(max - unhealthyMachineCount) + return remediationAllowed, remediationCount, nil } maxUnhealthy, err := getMaxUnhealthy(mhc) if err != nil { - return false + return false, 0, err } // Remediation is not allowed if unhealthy is above maxUnhealthy - return unhealthyMachineCount(mhc) <= maxUnhealthy + unhealthyMachineCount := unhealthyMachineCount(mhc) + remediationAllowed = unhealthyMachineCount <= maxUnhealthy + remediationCount = int32(maxUnhealthy - unhealthyMachineCount) + return remediationAllowed, remediationCount, nil +} + +// getUnhealthyRange parses an integer range and returns the min and max values +// Eg. [2-5] will return (2,5,nil) +func getUnhealthyRange(mhc *clusterv1.MachineHealthCheck) (int, int, error) { + // remove '[' and ']' + unhealthyRange := (*(mhc.Spec.UnhealthyRange))[1 : len(*mhc.Spec.UnhealthyRange)-1] + + parts := strings.Split(unhealthyRange, "-") + + min, err := strconv.ParseUint(parts[0], 10, 32) + if err != nil { + return 0, 0, err + } + + max, err := strconv.ParseUint(parts[1], 10, 32) + if err != nil { + return 0, 0, err + } + + return int(min), int(max), nil } func getMaxUnhealthy(mhc *clusterv1.MachineHealthCheck) (int, error) { diff --git a/controllers/machinehealthcheck_controller_test.go b/controllers/machinehealthcheck_controller_test.go index e8f863217d16..a353438ac578 100644 --- a/controllers/machinehealthcheck_controller_test.go +++ b/controllers/machinehealthcheck_controller_test.go @@ -373,6 +373,161 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { }).Should(Equal(0)) }) + t.Run("it marks unhealthy machines for remediation when number of unhealthy machines is within unhealthyRange", func(t *testing.T) { + g := NewWithT(t) + cluster := createNamespaceAndCluster(g) + + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + unhealthyRange := "[1-3]" + mhc.Spec.UnhealthyRange = &unhealthyRange + + g.Expect(testEnv.Create(ctx, mhc)).To(Succeed()) + defer func(do ...client.Object) { + g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed()) + }(cluster, mhc) + + // Healthy nodes and machines. + _, machines, cleanup1 := createMachinesWithNodes(g, cluster, + count(2), + createNodeRefForMachine(true), + markNodeAsHealthy(true), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup1() + // Unhealthy nodes and machines. + _, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster, + count(1), + createNodeRefForMachine(true), + markNodeAsHealthy(false), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup2() + machines = append(machines, unhealthyMachines...) + targetMachines := make([]string, len(machines)) + for i, m := range machines { + targetMachines[i] = m.Name + } + sort.Strings(targetMachines) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := testEnv.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: 3, + CurrentHealthy: 2, + RemediationsAllowed: 2, + ObservedGeneration: 1, + Targets: targetMachines, + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedCondition, + Status: corev1.ConditionTrue, + }, + }, + })) + }) + + t.Run("it marks unhealthy machines for remediation when the unhealthy Machines is not within UnhealthyRange", func(t *testing.T) { + g := NewWithT(t) + cluster := createNamespaceAndCluster(g) + + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + unhealthyRange := "[3-5]" + mhc.Spec.UnhealthyRange = &unhealthyRange + + g.Expect(testEnv.Create(ctx, mhc)).To(Succeed()) + defer func(do ...client.Object) { + g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed()) + }(cluster, mhc) + + // Healthy nodes and machines. + _, machines, cleanup1 := createMachinesWithNodes(g, cluster, + count(1), + createNodeRefForMachine(true), + markNodeAsHealthy(true), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup1() + // Unhealthy nodes and machines. + _, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster, + count(2), + createNodeRefForMachine(true), + markNodeAsHealthy(false), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup2() + machines = append(machines, unhealthyMachines...) + targetMachines := make([]string, len(machines)) + for i, m := range machines { + targetMachines[i] = m.Name + } + sort.Strings(targetMachines) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := testEnv.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: 3, + CurrentHealthy: 1, + RemediationsAllowed: 0, + ObservedGeneration: 1, + Targets: targetMachines, + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedCondition, + Status: corev1.ConditionFalse, + Severity: clusterv1.ConditionSeverityWarning, + Reason: clusterv1.TooManyUnhealthyReason, + Message: "Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: 3, unhealthy: 2, unhealthyRange: [3-5])", + }, + }, + })) + + // Calculate how many Machines have health check succeeded = false. + g.Eventually(func() (unhealthy int) { + machines := &clusterv1.MachineList{} + err := testEnv.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + }) + if err != nil { + return -1 + } + + for i := range machines.Items { + if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSuccededCondition) { + unhealthy++ + } + } + return + }).Should(Equal(2)) + + // Calculate how many Machines have been remediated. + g.Eventually(func() (remediated int) { + machines := &clusterv1.MachineList{} + err := testEnv.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + }) + if err != nil { + return -1 + } + + for i := range machines.Items { + if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { + remediated++ + } + } + return + }).Should(Equal(0)) + }) + t.Run("when a Machine has no Node ref for less than the NodeStartupTimeout", func(t *testing.T) { g := NewWithT(t) cluster := createNamespaceAndCluster(g) @@ -1677,7 +1832,7 @@ func TestIsAllowedRemediation(t *testing.T) { maxUnhealthy: nil, expectedMachines: int32(3), currentHealthy: int32(0), - allowed: true, + allowed: false, }, { name: "when maxUnhealthy is not an int or percentage", @@ -1746,7 +1901,8 @@ func TestIsAllowedRemediation(t *testing.T) { }, } - g.Expect(isAllowedRemediation(mhc)).To(Equal(tc.allowed)) + remediationAllowed, _, _ := isAllowedRemediation(mhc) + g.Expect(remediationAllowed).To(Equal(tc.allowed)) }) } }