Skip to content

Commit

Permalink
support range of values for unhealthy machines in machine health chec…
Browse files Browse the repository at this point in the history
…k spec
  • Loading branch information
shysank committed Jan 27, 2021
1 parent a597f9e commit 565c402
Show file tree
Hide file tree
Showing 8 changed files with 295 additions and 33 deletions.
31 changes: 29 additions & 2 deletions api/v1alpha3/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,36 @@ func (dst *MachineDeploymentList) ConvertFrom(srcRaw conversion.Hub) error {
func (src *MachineHealthCheck) ConvertTo(dstRaw conversion.Hub) error {
dst := dstRaw.(*v1alpha4.MachineHealthCheck)

return Convert_v1alpha3_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(src, dst, nil)
if err := Convert_v1alpha3_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(src, dst, nil); err != nil {
return err
}

// Manually restore data.
restored := &v1alpha4.MachineHealthCheck{}
if ok, err := utilconversion.UnmarshalData(src, restored); err != nil || !ok {
return err
}

if restored.Spec.UnhealthyRange != nil {
dst.Spec.UnhealthyRange = restored.Spec.UnhealthyRange
}

return nil
}

func (dst *MachineHealthCheck) ConvertFrom(srcRaw conversion.Hub) error {
src := srcRaw.(*v1alpha4.MachineHealthCheck)

return Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(src, dst, nil)
if err := Convert_v1alpha4_MachineHealthCheck_To_v1alpha3_MachineHealthCheck(src, dst, nil); err != nil {
return err
}

// Preserve Hub data on down-conversion except for metadata
if err := utilconversion.MarshalData(src, dst); err != nil {
return err
}

return nil
}

func (src *MachineHealthCheckList) ConvertTo(dstRaw conversion.Hub) error {
Expand All @@ -181,3 +204,7 @@ func Convert_v1alpha3_Bootstrap_To_v1alpha4_Bootstrap(in *Bootstrap, out *v1alph
func Convert_v1alpha4_MachineRollingUpdateDeployment_To_v1alpha3_MachineRollingUpdateDeployment(in *v1alpha4.MachineRollingUpdateDeployment, out *MachineRollingUpdateDeployment, s apiconversion.Scope) error {
return autoConvert_v1alpha4_MachineRollingUpdateDeployment_To_v1alpha3_MachineRollingUpdateDeployment(in, out, s)
}

func Convert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in *v1alpha4.MachineHealthCheckSpec, out *MachineHealthCheckSpec, s apiconversion.Scope) error {
return autoConvert_v1alpha4_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(in, out, s)
}
1 change: 1 addition & 0 deletions api/v1alpha3/conversion_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ func TestFuzzyConversion(t *testing.T) {
t.Run("for Machine", utilconversion.FuzzTestFunc(scheme, &v1alpha4.Machine{}, &Machine{}))
t.Run("for MachineSet", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineSet{}, &MachineSet{}))
t.Run("for MachineDeployment", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineDeployment{}, &MachineDeployment{}))
t.Run("for MachineHealthCheckSpec", utilconversion.FuzzTestFunc(scheme, &v1alpha4.MachineHealthCheck{}, &MachineHealthCheck{}))
}
30 changes: 23 additions & 7 deletions api/v1alpha3/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions api/v1alpha4/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ type MachineHealthCheckSpec struct {
// +optional
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`

// Any further remediation is only allowed if the number of machines selected by "selector" as not healthy
// is within the range of "UnhealthyRange"
// +optional
// +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$
UnhealthyRange *string `json:"unhealthyRange,omitempty"`

// Machines older than this duration without a node will be considered to have
// failed and will be remediated.
// +optional
Expand Down
5 changes: 5 additions & 0 deletions api/v1alpha4/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,10 @@ spec:
type: object
minItems: 1
type: array
unhealthyRange:
description: Any further remediation is only allowed if the number of machines selected by "selector" as not healthy is within the range of "UnhealthyRange"
pattern: ^\[[0-9]+-[0-9]+\]$
type: string
required:
- clusterName
- selector
Expand Down
91 changes: 69 additions & 22 deletions controllers/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"sort"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -211,19 +212,39 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, m.Spec.NodeStartupTimeout.Duration)
m.Status.CurrentHealthy = int32(len(healthy))

var unhealthyLimitKey, unhealthyLimitValue interface{}

// check MHC current health against MaxUnhealthy
if !isAllowedRemediation(m) {
remediationAllowed, remediationCount, err := isAllowedRemediation(m)
if err != nil {
return ctrl.Result{}, errors.Wrapf(err, "error checking if remediation is allowed")
}

if !remediationAllowed {
var message string

if m.Spec.UnhealthyRange == nil {
unhealthyLimitKey = "max unhealthy"
unhealthyLimitValue = m.Spec.MaxUnhealthy
message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)",
totalTargets,
len(unhealthy),
m.Spec.MaxUnhealthy)
} else {
unhealthyLimitKey = "unhealthy range"
unhealthyLimitValue = *m.Spec.UnhealthyRange
message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: %v, unhealthy: %v, unhealthyRange: %v)",
totalTargets,
len(unhealthy),
*m.Spec.UnhealthyRange)
}

logger.V(3).Info(
"Short-circuiting remediation",
"total target", totalTargets,
"max unhealthy", m.Spec.MaxUnhealthy,
unhealthyLimitKey, unhealthyLimitValue,
"unhealthy targets", len(unhealthy),
)
message := fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)",
totalTargets,
len(unhealthy),
m.Spec.MaxUnhealthy,
)

// Remediation not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy
m.Status.RemediationsAllowed = 0
Expand Down Expand Up @@ -257,17 +278,12 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
logger.V(3).Info(
"Remediations are allowed",
"total target", totalTargets,
"max unhealthy", m.Spec.MaxUnhealthy,
unhealthyLimitKey, unhealthyLimitValue,
"unhealthy targets", len(unhealthy),
)

maxUnhealthy, err := getMaxUnhealthy(m)
if err != nil {
return ctrl.Result{}, errors.Wrapf(err, "Failed to get value for maxUnhealthy")
}

// Remediation is allowed so maxUnhealthy - unhealthyMachineCount >= 0
m.Status.RemediationsAllowed = int32(maxUnhealthy - unhealthyMachineCount(m))
m.Status.RemediationsAllowed = remediationCount
conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)

errList := r.PatchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
Expand Down Expand Up @@ -517,21 +533,52 @@ func (r *MachineHealthCheckReconciler) watchClusterNodes(ctx context.Context, cl
}

// isAllowedRemediation checks the value of the MaxUnhealthy field to determine
// whether remediation should be allowed or not
func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) bool {
// TODO(JoelSpeed): return an error from isAllowedRemediation when maxUnhealthy
// is nil, we expect it to be defaulted always.
if mhc.Spec.MaxUnhealthy == nil {
return true
// returns whether remediation should be allowed or not, the remediation count, and error if any
func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) (bool, int32, error) {
var remediationAllowed bool
var remediationCount int32
if mhc.Spec.UnhealthyRange != nil {
min, max, err := getUnhealthyRange(mhc)
if err != nil {
return false, 0, err
}
unhealthyMachineCount := unhealthyMachineCount(mhc)
remediationAllowed = unhealthyMachineCount >= min && unhealthyMachineCount <= max
remediationCount = int32(max - unhealthyMachineCount)
return remediationAllowed, remediationCount, nil
}

maxUnhealthy, err := getMaxUnhealthy(mhc)
if err != nil {
return false
return false, 0, err
}

// Remediation is not allowed if unhealthy is above maxUnhealthy
return unhealthyMachineCount(mhc) <= maxUnhealthy
unhealthyMachineCount := unhealthyMachineCount(mhc)
remediationAllowed = unhealthyMachineCount <= maxUnhealthy
remediationCount = int32(maxUnhealthy - unhealthyMachineCount)
return remediationAllowed, remediationCount, nil
}

// getUnhealthyRange parses an integer range and returns the min and max values
// Eg. [2-5] will return (2,5,nil)
func getUnhealthyRange(mhc *clusterv1.MachineHealthCheck) (int, int, error) {
// remove '[' and ']'
unhealthyRange := (*(mhc.Spec.UnhealthyRange))[1 : len(*mhc.Spec.UnhealthyRange)-1]

parts := strings.Split(unhealthyRange, "-")

min, err := strconv.ParseUint(parts[0], 10, 32)
if err != nil {
return 0, 0, err
}

max, err := strconv.ParseUint(parts[1], 10, 32)
if err != nil {
return 0, 0, err
}

return int(min), int(max), nil
}

func getMaxUnhealthy(mhc *clusterv1.MachineHealthCheck) (int, error) {
Expand Down
Loading

0 comments on commit 565c402

Please sign in to comment.