diff --git a/api/v1alpha3/conversion.go b/api/v1alpha3/conversion.go
index f8129ac1f299..87b9251b61ea 100644
--- a/api/v1alpha3/conversion.go
+++ b/api/v1alpha3/conversion.go
@@ -152,13 +152,36 @@ func (dst *MachineDeploymentList) ConvertFrom(srcRaw conversion.Hub) error {
 
 func (src *MachineHealthCheck) ConvertTo(dstRaw conversion.Hub) error {
 	dst := dstRaw.(*v1alpha4.MachineHealthCheck)
-	return Convert_v1alpha3_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(src, dst, nil)
+	if err := Convert_v1alpha3_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(src, dst, nil); err != nil {
+		return err
+	}
+
+	// Manually restore data.
+	restored := &v1alpha4.MachineHealthCheck{}
+	if ok, err := utilconversion.UnmarshalData(src, restored); err != nil || !ok {
+		return err
+	}
+
+	if restored.Spec.UnhealthyRange != nil {
+		dst.Spec.UnhealthyRange = restored.Spec.UnhealthyRange
+	}
+
+	return nil
 }
 
 func (dst *MachineHealthCheck) ConvertFrom(srcRaw conversion.Hub) error {
 	src := srcRaw.(*v1alpha4.MachineHealthCheck)
-	return Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(src, dst, nil)
+	if err := Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(src, dst, nil); err != nil {
+		return err
+	}
+
+	// Preserve Hub data on down-conversion except for metadata.
+	if err := utilconversion.MarshalData(src, dst); err != nil {
+		return err
+	}
+
+	return nil
 }
 
 func (src *MachineHealthCheckList) ConvertTo(dstRaw conversion.Hub) error {
@@ -181,3 +204,7 @@ func Convert_v1alpha3_Bootstrap_To_v1alpha4_Bootstrap(in *Bootstrap, out *v1alph
 func Convert_v1alpha4_MachineRollingUpdateDeployment_To_v1alpha3_MachineRollingUpdateDeployment(in *v1alpha4.MachineRollingUpdateDeployment, out *MachineRollingUpdateDeployment, s apiconversion.Scope) error {
 	return autoConvert_v1alpha4_MachineRollingUpdateDeployment_To_v1alpha3_MachineRollingUpdateDeployment(in, out, s)
 }
+
+func Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in *v1alpha4.MachineHealthCheckSpec, out *MachineHealthCheckSpec, s apiconversion.Scope) error {
+	return autoConvert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in, out, s)
+}
diff --git a/api/v1alpha3/conversion_test.go b/api/v1alpha3/conversion_test.go
index 1e0c2de3941e..d3827160fc04 100644
--- a/api/v1alpha3/conversion_test.go
+++ b/api/v1alpha3/conversion_test.go
@@ -36,4 +36,5 @@ func TestFuzzyConversion(t *testing.T) {
 	t.Run("for Machine", utilconversion.FuzzTestFunc(scheme, &v1alpha4.Machine{}, &Machine{}))
 	t.Run("for MachineSet", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineSet{}, &MachineSet{}))
 	t.Run("for MachineDeployment", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineDeployment{}, &MachineDeployment{}))
+	t.Run("for MachineHealthCheck", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineHealthCheck{}, &MachineHealthCheck{}))
 }
diff --git a/api/v1alpha3/zz_generated.conversion.go b/api/v1alpha3/zz_generated.conversion.go
index 0f00332f99b7..3000514fced1 100644
--- a/api/v1alpha3/zz_generated.conversion.go
+++ b/api/v1alpha3/zz_generated.conversion.go
@@ -219,11 +219,6 @@ func RegisterConversions(s *runtime.Scheme) error {
 	}); err != nil {
 		return err
 	}
-	if err := s.AddGeneratedConversionFunc((*v1alpha4.MachineHealthCheckSpec)(nil), (*MachineHealthCheckSpec)(nil), func(a, b interface{}, scope conversion.Scope) error {
-		return Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(a.(*v1alpha4.MachineHealthCheckSpec), b.(*MachineHealthCheckSpec), scope)
-	}); err != nil {
-		return err
-	}
 	if err := s.AddGeneratedConversionFunc((*MachineHealthCheckStatus)(nil), (*v1alpha4.MachineHealthCheckStatus)(nil), func(a, b interface{}, scope conversion.Scope) error {
 		return Convert_v1alpha3_MachineHealthCheckStatus_To_v1alpha4_MachineHealthCheckStatus(a.(*MachineHealthCheckStatus), b.(*v1alpha4.MachineHealthCheckStatus), scope)
 	}); err != nil {
@@ -354,6 +349,11 @@ func RegisterConversions(s *runtime.Scheme) error {
 	}); err != nil {
 		return err
 	}
+	if err := s.AddConversionFunc((*v1alpha4.MachineHealthCheckSpec)(nil), (*MachineHealthCheckSpec)(nil), func(a, b interface{}, scope conversion.Scope) error {
+		return Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(a.(*v1alpha4.MachineHealthCheckSpec), b.(*MachineHealthCheckSpec), scope)
+	}); err != nil {
+		return err
+	}
 	if err := s.AddConversionFunc((*v1alpha4.MachineRollingUpdateDeployment)(nil), (*MachineRollingUpdateDeployment)(nil), func(a, b interface{}, scope conversion.Scope) error {
 		return Convert_v1alpha4_MachineRollingUpdateDeployment_To_v1alpha3_MachineRollingUpdateDeployment(a.(*v1alpha4.MachineRollingUpdateDeployment), b.(*MachineRollingUpdateDeployment), scope)
 	}); err != nil {
@@ -892,7 +892,17 @@ func Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(in *v1al
 
 func autoConvert_v1alpha3_MachineHealthCheckList_To_v1alpha4_MachineHealthCheckList(in *MachineHealthCheckList, out *v1alpha4.MachineHealthCheckList, s conversion.Scope) error {
 	out.ListMeta = in.ListMeta
-	out.Items = *(*[]v1alpha4.MachineHealthCheck)(unsafe.Pointer(&in.Items))
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]v1alpha4.MachineHealthCheck, len(*in))
+		for i := range *in {
+			if err := Convert_v1alpha3_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(&(*in)[i], &(*out)[i], s); err != nil {
+				return err
+			}
+		}
+	} else {
+		out.Items = nil
+	}
 	return nil
 }
 
@@ -903,7 +913,17 @@ func Convert_v1alpha3_MachineHealthCheckList_To_v1alpha4_MachineHealthCheckList(
 
 func autoConvert_v1alpha4_MachineHealthCheckList_To_v1alpha3_MachineHealthCheckList(in *v1alpha4.MachineHealthCheckList, out *MachineHealthCheckList, s conversion.Scope) error {
 	out.ListMeta = in.ListMeta
-	out.Items = *(*[]MachineHealthCheck)(unsafe.Pointer(&in.Items))
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]MachineHealthCheck, len(*in))
+		for i := range *in {
+			if err := Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(&(*in)[i], &(*out)[i], s); err != nil {
+				return err
+			}
+		}
+	} else {
+		out.Items = nil
+	}
 	return nil
 }
 
@@ -932,16 +952,12 @@ func autoConvert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckS
 	out.Selector = in.Selector
 	out.UnhealthyConditions = *(*[]UnhealthyCondition)(unsafe.Pointer(&in.UnhealthyConditions))
 	out.MaxUnhealthy = (*intstr.IntOrString)(unsafe.Pointer(in.MaxUnhealthy))
+	// WARNING: in.UnhealthyRange requires manual conversion: does not exist in peer-type
 	out.NodeStartupTimeout = (*metav1.Duration)(unsafe.Pointer(in.NodeStartupTimeout))
 	out.RemediationTemplate = (*v1.ObjectReference)(unsafe.Pointer(in.RemediationTemplate))
 	return nil
 }
 
-// Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec is an autogenerated conversion function.
-func Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in *v1alpha4.MachineHealthCheckSpec, out *MachineHealthCheckSpec, s conversion.Scope) error {
-	return autoConvert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in, out, s)
-}
-
 func autoConvert_v1alpha3_MachineHealthCheckStatus_To_v1alpha4_MachineHealthCheckStatus(in *MachineHealthCheckStatus, out *v1alpha4.MachineHealthCheckStatus, s conversion.Scope) error {
 	out.ExpectedMachines = in.ExpectedMachines
 	out.CurrentHealthy = in.CurrentHealthy
diff --git a/api/v1alpha4/machinehealthcheck_types.go b/api/v1alpha4/machinehealthcheck_types.go
index d34bce9c1be3..5359917e0adf 100644
--- a/api/v1alpha4/machinehealthcheck_types.go
+++ b/api/v1alpha4/machinehealthcheck_types.go
@@ -45,6 +45,12 @@ type MachineHealthCheckSpec struct {
 	// +optional
 	MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`
 
+	// Any further remediation is only allowed if the number of machines selected by "selector" as not healthy
+	// is within the range of "UnhealthyRange". Takes precedence over MaxUnhealthy.
+	// +optional
+	// +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$
+	UnhealthyRange *string `json:"unhealthyRange,omitempty"`
+
 	// Machines older than this duration without a node will be considered to have
 	// failed and will be remediated.
 	// +optional
diff --git a/api/v1alpha4/zz_generated.deepcopy.go b/api/v1alpha4/zz_generated.deepcopy.go
index 1dd174f68a4d..93bf447a53aa 100644
--- a/api/v1alpha4/zz_generated.deepcopy.go
+++ b/api/v1alpha4/zz_generated.deepcopy.go
@@ -577,6 +577,11 @@ func (in *MachineHealthCheckSpec) DeepCopyInto(out *MachineHealthCheckSpec) {
 		*out = new(intstr.IntOrString)
 		**out = **in
 	}
+	if in.UnhealthyRange != nil {
+		in, out := &in.UnhealthyRange, &out.UnhealthyRange
+		*out = new(string)
+		**out = **in
+	}
 	if in.NodeStartupTimeout != nil {
 		in, out := &in.NodeStartupTimeout, &out.NodeStartupTimeout
 		*out = new(metav1.Duration)
diff --git a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
index fba233ec6432..ece74c449ef3 100644
--- a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
+++ b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
@@ -322,6 +322,10 @@ spec:
                   type: object
                 minItems: 1
                 type: array
+              unhealthyRange:
+                description: Any further remediation is only allowed if the number of machines selected by "selector" as not healthy is within the range of "UnhealthyRange". Takes precedence over MaxUnhealthy.
+                pattern: ^\[[0-9]+-[0-9]+\]$
+                type: string
             required:
             - clusterName
            - selector
diff --git a/controllers/machinehealthcheck_controller.go b/controllers/machinehealthcheck_controller.go
index e07f32e4d008..e9bfdeb6d9d6 100644
--- a/controllers/machinehealthcheck_controller.go
+++ b/controllers/machinehealthcheck_controller.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"sort"
+	"strconv"
 	"strings"
 	"time"
 
@@ -211,19 +212,39 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
 	healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, m.Spec.NodeStartupTimeout.Duration)
 	m.Status.CurrentHealthy = int32(len(healthy))
 
+	var unhealthyLimitKey, unhealthyLimitValue interface{}
+
 	// check MHC current health against MaxUnhealthy
-	if !isAllowedRemediation(m) {
+	remediationAllowed, remediationCount, err := isAllowedRemediation(m)
+	if err != nil {
+		return ctrl.Result{}, errors.Wrapf(err, "error checking if remediation is allowed")
+	}
+
+	if !remediationAllowed {
+		var message string
+
+		if m.Spec.UnhealthyRange == nil {
+			unhealthyLimitKey = "max unhealthy"
+			unhealthyLimitValue = m.Spec.MaxUnhealthy
+			message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)",
+				totalTargets,
+				len(unhealthy),
+				m.Spec.MaxUnhealthy)
+		} else {
+			unhealthyLimitKey = "unhealthy range"
+			unhealthyLimitValue = *m.Spec.UnhealthyRange
+			message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: %v, unhealthy: %v, unhealthyRange: %v)",
+				totalTargets,
+				len(unhealthy),
+				*m.Spec.UnhealthyRange)
+		}
+
 		logger.V(3).Info(
 			"Short-circuiting remediation",
 			"total target", totalTargets,
-			"max unhealthy", m.Spec.MaxUnhealthy,
+			unhealthyLimitKey, unhealthyLimitValue,
 			"unhealthy targets", len(unhealthy),
 		)
-		message := fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)",
-			totalTargets,
-			len(unhealthy),
-			m.Spec.MaxUnhealthy,
-		)
 
 		// Remediation not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy
 		m.Status.RemediationsAllowed = 0
@@ -257,17 +278,12 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
 	logger.V(3).Info(
 		"Remediations are allowed",
 		"total target", totalTargets,
-		"max unhealthy", m.Spec.MaxUnhealthy,
+		unhealthyLimitKey, unhealthyLimitValue,
 		"unhealthy targets", len(unhealthy),
 	)
 
-	maxUnhealthy, err := getMaxUnhealthy(m)
-	if err != nil {
-		return ctrl.Result{}, errors.Wrapf(err, "Failed to get value for maxUnhealthy")
-	}
-
 	// Remediation is allowed so maxUnhealthy - unhealthyMachineCount >= 0
-	m.Status.RemediationsAllowed = int32(maxUnhealthy - unhealthyMachineCount(m))
+	m.Status.RemediationsAllowed = remediationCount
 	conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)
 
 	errList := r.PatchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
@@ -517,21 +533,56 @@ func (r *MachineHealthCheckReconciler) watchClusterNodes(ctx context.Context, cl
 }
 
 // isAllowedRemediation checks the value of the MaxUnhealthy field to determine
-// whether remediation should be allowed or not
-func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) bool {
-	// TODO(JoelSpeed): return an error from isAllowedRemediation when maxUnhealthy
-	// is nil, we expect it to be defaulted always.
-	if mhc.Spec.MaxUnhealthy == nil {
-		return true
+// returns whether remediation should be allowed or not, the remediation count, and error if any
+func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) (bool, int32, error) {
+	var remediationAllowed bool
+	var remediationCount int32
+	if mhc.Spec.UnhealthyRange != nil {
+		min, max, err := getUnhealthyRange(mhc)
+		if err != nil {
+			return false, 0, err
+		}
+		unhealthyMachineCount := unhealthyMachineCount(mhc)
+		remediationAllowed = unhealthyMachineCount >= min && unhealthyMachineCount <= max
+		remediationCount = int32(max - unhealthyMachineCount)
+		return remediationAllowed, remediationCount, nil
 	}
 
 	maxUnhealthy, err := getMaxUnhealthy(mhc)
 	if err != nil {
-		return false
+		return false, 0, err
 	}
 
 	// Remediation is not allowed if unhealthy is above maxUnhealthy
-	return unhealthyMachineCount(mhc) <= maxUnhealthy
+	unhealthyMachineCount := unhealthyMachineCount(mhc)
+	remediationAllowed = unhealthyMachineCount <= maxUnhealthy
+	remediationCount = int32(maxUnhealthy - unhealthyMachineCount)
+	return remediationAllowed, remediationCount, nil
+}
+
+// getUnhealthyRange parses an integer range and returns the min and max values
+// Eg. [2-5] will return (2,5,nil)
+func getUnhealthyRange(mhc *clusterv1.MachineHealthCheck) (int, int, error) {
+	// remove '[' and ']'
+	unhealthyRange := (*(mhc.Spec.UnhealthyRange))[1 : len(*mhc.Spec.UnhealthyRange)-1]
+
+	parts := strings.Split(unhealthyRange, "-")
+
+	min, err := strconv.ParseUint(parts[0], 10, 32)
+	if err != nil {
+		return 0, 0, err
+	}
+
+	max, err := strconv.ParseUint(parts[1], 10, 32)
+	if err != nil {
+		return 0, 0, err
+	}
+
+	if max < min {
+		return 0, 0, errors.Errorf("max value %d cannot be less than min value %d for unhealthyRange", max, min)
+	}
+
+	return int(min), int(max), nil
 }
 
 func getMaxUnhealthy(mhc *clusterv1.MachineHealthCheck) (int, error) {
diff --git a/controllers/machinehealthcheck_controller_test.go b/controllers/machinehealthcheck_controller_test.go
index e8f863217d16..a353438ac578 100644
--- a/controllers/machinehealthcheck_controller_test.go
+++ b/controllers/machinehealthcheck_controller_test.go
@@ -373,6 +373,161 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) {
 		}).Should(Equal(0))
 	})
 
+	t.Run("it marks unhealthy machines for remediation when number of unhealthy machines is within unhealthyRange", func(t *testing.T) {
+		g := NewWithT(t)
+		cluster := createNamespaceAndCluster(g)
+
+		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
+		unhealthyRange := "[1-3]"
+		mhc.Spec.UnhealthyRange = &unhealthyRange
+
+		g.Expect(testEnv.Create(ctx, mhc)).To(Succeed())
+		defer func(do ...client.Object) {
+			g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed())
+		}(cluster, mhc)
+
+		// Healthy nodes and machines.
+		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
+			count(2),
+			createNodeRefForMachine(true),
+			markNodeAsHealthy(true),
+			machineLabels(mhc.Spec.Selector.MatchLabels),
+		)
+		defer cleanup1()
+		// Unhealthy nodes and machines.
+		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
+			count(1),
+			createNodeRefForMachine(true),
+			markNodeAsHealthy(false),
+			machineLabels(mhc.Spec.Selector.MatchLabels),
+		)
+		defer cleanup2()
+		machines = append(machines, unhealthyMachines...)
+		targetMachines := make([]string, len(machines))
+		for i, m := range machines {
+			targetMachines[i] = m.Name
+		}
+		sort.Strings(targetMachines)
+
+		// Make sure the status matches.
+		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
+			err := testEnv.Get(ctx, util.ObjectKey(mhc), mhc)
+			if err != nil {
+				return nil
+			}
+			return &mhc.Status
+		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
+			ExpectedMachines:    3,
+			CurrentHealthy:      2,
+			RemediationsAllowed: 2,
+			ObservedGeneration:  1,
+			Targets:             targetMachines,
+			Conditions: clusterv1.Conditions{
+				{
+					Type:   clusterv1.RemediationAllowedCondition,
+					Status: corev1.ConditionTrue,
+				},
+			},
+		}))
+	})
+
+	t.Run("it marks unhealthy machines for remediation when the number of unhealthy machines is not within unhealthyRange", func(t *testing.T) {
+		g := NewWithT(t)
+		cluster := createNamespaceAndCluster(g)
+
+		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
+		unhealthyRange := "[3-5]"
+		mhc.Spec.UnhealthyRange = &unhealthyRange
+
+		g.Expect(testEnv.Create(ctx, mhc)).To(Succeed())
+		defer func(do ...client.Object) {
+			g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed())
+		}(cluster, mhc)
+
+		// Healthy nodes and machines.
+		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
+			count(1),
+			createNodeRefForMachine(true),
+			markNodeAsHealthy(true),
+			machineLabels(mhc.Spec.Selector.MatchLabels),
+		)
+		defer cleanup1()
+		// Unhealthy nodes and machines.
+		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
+			count(2),
+			createNodeRefForMachine(true),
+			markNodeAsHealthy(false),
+			machineLabels(mhc.Spec.Selector.MatchLabels),
+		)
+		defer cleanup2()
+		machines = append(machines, unhealthyMachines...)
+		targetMachines := make([]string, len(machines))
+		for i, m := range machines {
+			targetMachines[i] = m.Name
+		}
+		sort.Strings(targetMachines)
+
+		// Make sure the status matches.
+		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
+			err := testEnv.Get(ctx, util.ObjectKey(mhc), mhc)
+			if err != nil {
+				return nil
+			}
+			return &mhc.Status
+		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
+			ExpectedMachines:    3,
+			CurrentHealthy:      1,
+			RemediationsAllowed: 0,
+			ObservedGeneration:  1,
+			Targets:             targetMachines,
+			Conditions: clusterv1.Conditions{
+				{
+					Type:     clusterv1.RemediationAllowedCondition,
+					Status:   corev1.ConditionFalse,
+					Severity: clusterv1.ConditionSeverityWarning,
+					Reason:   clusterv1.TooManyUnhealthyReason,
+					Message:  "Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: 3, unhealthy: 2, unhealthyRange: [3-5])",
+				},
+			},
+		}))
+
+		// Calculate how many Machines have health check succeeded = false.
+		g.Eventually(func() (unhealthy int) {
+			machines := &clusterv1.MachineList{}
+			err := testEnv.List(ctx, machines, client.MatchingLabels{
+				"selector": mhc.Spec.Selector.MatchLabels["selector"],
+			})
+			if err != nil {
+				return -1
+			}
+
+			for i := range machines.Items {
+				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSuccededCondition) {
+					unhealthy++
+				}
+			}
+			return
+		}).Should(Equal(2))
+
+		// Calculate how many Machines have been remediated.
+		g.Eventually(func() (remediated int) {
+			machines := &clusterv1.MachineList{}
+			err := testEnv.List(ctx, machines, client.MatchingLabels{
+				"selector": mhc.Spec.Selector.MatchLabels["selector"],
+			})
+			if err != nil {
+				return -1
+			}
+
+			for i := range machines.Items {
+				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
+					remediated++
+				}
+			}
+			return
+		}).Should(Equal(0))
+	})
+
 	t.Run("when a Machine has no Node ref for less than the NodeStartupTimeout", func(t *testing.T) {
 		g := NewWithT(t)
 		cluster := createNamespaceAndCluster(g)
@@ -1677,7 +1832,7 @@ func TestIsAllowedRemediation(t *testing.T) {
 			maxUnhealthy:     nil,
 			expectedMachines: int32(3),
 			currentHealthy:   int32(0),
-			allowed:          true,
+			allowed:          false,
 		},
 		{
 			name:             "when maxUnhealthy is not an int or percentage",
@@ -1746,7 +1901,8 @@ func TestIsAllowedRemediation(t *testing.T) {
 				},
 			}
 
-			g.Expect(isAllowedRemediation(mhc)).To(Equal(tc.allowed))
+			remediationAllowed, _, _ := isAllowedRemediation(mhc)
+			g.Expect(remediationAllowed).To(Equal(tc.allowed))
 		})
 	}
 }
diff --git a/docs/book/src/tasks/healthcheck.md b/docs/book/src/tasks/healthcheck.md
index af86c175fbd8..bf5ead0037db 100644
--- a/docs/book/src/tasks/healthcheck.md
+++ b/docs/book/src/tasks/healthcheck.md
@@ -65,7 +65,9 @@ In order to prevent conflicts or unexpected behaviors when trying to remediate th
 ## Remediation short-circuiting
 
 To ensure that MachineHealthChecks only remediate Machines when the cluster is healthy,
-short-circuiting is implemented to prevent further remediation via the `maxUnhealthy` field within the MachineHealthCheck spec.
+short-circuiting is implemented to prevent further remediation via the `maxUnhealthy` and `unhealthyRange` fields within the MachineHealthCheck spec.
+
+### Max Unhealthy
 
 If the user defines a value for the `maxUnhealthy` field (either an absolute number or a percentage of the total Machines checked by this
 MachineHealthCheck), before remediating any Machines, the MachineHealthCheck will compare the value of `maxUnhealthy` with the number of Machines it has determined to be unhealthy.
@@ -100,6 +102,30 @@ If `maxUnhealthy` is set to `40%` and there are 6 Machines being checked:
 
 Note, when the percentage is not a whole number, the allowed number is rounded down.
 
+### Unhealthy Range
+
+If the user defines a value for the `unhealthyRange` field (bracketed values that specify a start and an end value), before remediating any Machines,
+the MachineHealthCheck will check if the number of Machines it has determined to be unhealthy is within the range specified by `unhealthyRange`.
+If it is not within the range set by `unhealthyRange`, remediation will **not** be performed. The `unhealthyRange` takes precedence over `maxUnhealthy` when both are set.
+
+
+
+#### With a range of values
+
+If `unhealthyRange` is set to `[3-5]` and there are 10 Machines being checked:
+- If 2 or fewer nodes are unhealthy, remediation will not be performed.
+- If 6 or more nodes are unhealthy, remediation will not be performed.
+- In all other cases, remediation will be performed.
+
+Note: the above example uses 10 machines as the sample set, but this works the same way for any other number.
+This is useful for dynamically scaling clusters where the number of machines keeps changing frequently.
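+
+As a minimal, illustrative sketch (the names, labels, and range below are placeholders — adjust them to your own cluster), a MachineHealthCheck using `unhealthyRange` might look like the following:
+
+```yaml
+apiVersion: cluster.x-k8s.io/v1alpha4
+kind: MachineHealthCheck
+metadata:
+  name: example-unhealthy-range
+spec:
+  clusterName: example-cluster
+  # Remediation is only allowed while the unhealthy count is between 3 and 5 (inclusive).
+  unhealthyRange: "[3-5]"
+  selector:
+    matchLabels:
+      nodepool: nodepool-0
+  unhealthyConditions:
+    - type: Ready
+      status: Unknown
+      timeout: 300s
+    - type: Ready
+      status: "False"
+      timeout: 300s
+```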
+
 ## Limitations and Caveats of a MachineHealthCheck
 
 Before deploying a MachineHealthCheck, please familiarise yourself with the following limitations and caveats:
diff --git a/docs/proposals/20191030-machine-health-checking.md b/docs/proposals/20191030-machine-health-checking.md
index 531e671580bd..80a56fd89ee9 100644
--- a/docs/proposals/20191030-machine-health-checking.md
+++ b/docs/proposals/20191030-machine-health-checking.md
@@ -10,7 +10,7 @@ reviewers:
 - "@ncdc"
 - "@timothysc"
 creation-date: 2019-10-30
-last-updated: 2020-08-04
+last-updated: 2021-01-28
 status: implementable
 see-also:
 replaces:
@@ -89,7 +89,7 @@ MHC requests a remediation in one of the following ways:
 - Applying a Condition which the owning controller consumes to remediate the machine (default)
 - Creating a CR based on a template which signals external component to remediate the machine
 
-It provides a short-circuit mechanism and limits remediation when the `maxUnhealthy` threshold is reached for a targeted group of machines.
+It provides a short-circuit mechanism and limits remediation when the number of unhealthy machines is not within `unhealthyRange`, or has reached the `maxUnhealthy` threshold, for a targeted group of machines, with `unhealthyRange` taking precedence.
 This is similar to what the node life cycle controller does for reducing the eviction rate as nodes become unhealthy in a given zone. E.g a large number of nodes in a single zone are down due to a networking issue.
 
 The machine health checker is an integration point between node problem detection tooling expressed as node conditions and remediation to achieve a node auto repairing feature.
@@ -100,7 +100,7 @@ A machine is unhealthy when:
 - The Machine has no nodeRef.
 - The Machine has a nodeRef but the referenced node is not found.
 
-If any of those criteria are met for longer than the given timeouts and the `maxUnhealthy` threshold has not been reached yet, the machine will be marked as failing the healthcheck.
+If any of those criteria are met for longer than the given timeouts and the number of unhealthy machines is within the `unhealthyRange` (if specified), or has not reached the `maxUnhealthy` threshold, the machine will be marked as failing the healthcheck.
 
 Timeouts:
 - For the node conditions the time outs are defined by the admin.
@@ -299,7 +299,8 @@ type target struct {
 ```
 
 - Calculate the number of unhealthy targets.
-- Compare current number against `maxUnhealthy` threshold and temporary short circuits remediation if the threshold is met.
+- Compare the current number against `unhealthyRange`, if specified, and temporarily short-circuit remediation if it is not within the range.
+- If `unhealthyRange` is not specified, compare against the `maxUnhealthy` threshold and temporarily short-circuit remediation if the threshold is met.
 - Either marks unhealthy target machines with conditions or create an external remediation CR as described above.
 
 Out of band: