Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Allow users to disable MHC NodeStartupTimeout #4471

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/v1alpha4/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package v1alpha4

import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
Expand Down Expand Up @@ -93,6 +94,11 @@ const (
ManagedByAnnotation = "cluster.x-k8s.io/managed-by"
)

var (
// ZeroDuration is a zero value of the metav1.Duration type.
ZeroDuration = metav1.Duration{}
)

// MachineAddressType describes a valid MachineAddress type.
type MachineAddressType string

Expand Down
2 changes: 2 additions & 0 deletions api/v1alpha4/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ type MachineHealthCheckSpec struct {

// Machines older than this duration without a node will be considered to have
// failed and will be remediated.
// If not set, this value is defaulted to 10 minutes.
JoelSpeed marked this conversation as resolved.
Show resolved Hide resolved
// If you wish to disable this feature, set the value explicitly to 0.
// +optional
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
vincepri marked this conversation as resolved.
Show resolved Hide resolved

Expand Down
14 changes: 9 additions & 5 deletions api/v1alpha4/machinehealthcheck_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@ import (
)

var (
// Default time allowed for a node to start up. Can be made longer as part of
// spec if required for particular provider.
// DefaultNodeStartupTimeout is the time allowed for a node to start up.
// Can be made longer as part of spec if required for particular provider.
// 10 minutes should allow the instance to start and the node to join the
// cluster on most providers.
defaultNodeStartupTimeout = metav1.Duration{Duration: 10 * time.Minute}
DefaultNodeStartupTimeout = metav1.Duration{Duration: 10 * time.Minute}
// Minimum time allowed for a node to start up.
minNodeStartupTimeout = metav1.Duration{Duration: 30 * time.Second}
// We allow users to disable the nodeStartupTimeout by setting the duration to 0.
disabledNodeStartupTimeout = ZeroDuration
)

// SetMinNodeStartupTimeout allows users to optionally set a custom timeout
Expand Down Expand Up @@ -73,7 +75,7 @@ func (m *MachineHealthCheck) Default() {
}

if m.Spec.NodeStartupTimeout == nil {
m.Spec.NodeStartupTimeout = &defaultNodeStartupTimeout
m.Spec.NodeStartupTimeout = &DefaultNodeStartupTimeout
}
}

Expand Down Expand Up @@ -129,7 +131,9 @@ func (m *MachineHealthCheck) validate(old *MachineHealthCheck) error {
)
}

if m.Spec.NodeStartupTimeout != nil && m.Spec.NodeStartupTimeout.Seconds() < minNodeStartupTimeout.Seconds() {
if m.Spec.NodeStartupTimeout != nil &&
m.Spec.NodeStartupTimeout.Seconds() != disabledNodeStartupTimeout.Seconds() &&
m.Spec.NodeStartupTimeout.Seconds() < minNodeStartupTimeout.Seconds() {
JoelSpeed marked this conversation as resolved.
Show resolved Hide resolved
allErrs = append(
allErrs,
field.Invalid(field.NewPath("spec", "nodeStartupTimeout"), m.Spec.NodeStartupTimeout.Seconds(), "must be at least 30s"),
Expand Down
4 changes: 2 additions & 2 deletions api/v1alpha4/machinehealthcheck_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,9 @@ func TestMachineHealthCheckNodeStartupTimeout(t *testing.T) {
expectErr: true,
},
{
name: "when the nodeStartupTimeout is 0",
name: "when the nodeStartupTimeout is 0 (disabled)",
timeout: &zero,
expectErr: true,
expectErr: false,
},
}

Expand Down
4 changes: 3 additions & 1 deletion config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,9 @@ spec:
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: Machines older than this duration without a node will
be considered to have failed and will be remediated.
be considered to have failed and will be remediated. If not set,
this value is defaulted to 10 minutes. If you wish to disable this
feature, set the value explicitly to 0.
type: string
remediationTemplate:
description: "RemediationTemplate is a reference to a remediation
Expand Down
7 changes: 6 additions & 1 deletion controllers/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,13 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
// do sort to avoid keep changing m.Status as the returned machines are not in order
sort.Strings(m.Status.Targets)

nodeStartupTimeout := m.Spec.NodeStartupTimeout
if nodeStartupTimeout == nil {
nodeStartupTimeout = &clusterv1.DefaultNodeStartupTimeout
}

// health check all targets and reconcile mhc status
healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, m.Spec.NodeStartupTimeout.Duration)
healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, *nodeStartupTimeout)
m.Status.CurrentHealthy = int32(len(healthy))

var unhealthyLimitKey, unhealthyLimitValue interface{}
Expand Down
21 changes: 16 additions & 5 deletions controllers/machinehealthcheck_targets.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ const (
EventDetectedUnhealthy string = "DetectedUnhealthy"
)

var (
// We allow users to disable the nodeStartupTimeout by setting the duration to 0.
disabledNodeStartupTimeout = clusterv1.ZeroDuration
)

// healthCheckTarget contains the information required to perform a health check
// on the node to determine if any remediation is required.
type healthCheckTarget struct {
Expand Down Expand Up @@ -81,7 +86,7 @@ func (t *healthCheckTarget) nodeName() string {
// If the target doesn't currently need rememdiation, provide a duration after
// which the target should next be checked.
// The target should be requeued after this duration.
func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode time.Duration) (bool, time.Duration) {
func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
var nextCheckTimes []time.Duration
now := time.Now()

Expand Down Expand Up @@ -120,6 +125,12 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi

// the node has not been set yet
if t.Node == nil {
if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
// Startup timeout is disabled so no need to go any further.
// No node yet to check conditions, can return early here.
return false, 0
}

controlPlaneInitializedTime := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition).Time
clusterInfraReadyTime := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition).Time
machineCreationTime := t.Machine.CreationTimestamp.Time
Expand All @@ -135,14 +146,14 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
}
logger.V(3).Info("Using comparison time", "time", comparisonTime)

if comparisonTime.Add(timeoutForMachineToHaveNode).Before(now) {
if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSuccededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutForMachineToHaveNode.String())
logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutForMachineToHaveNode.String())
return true, time.Duration(0)
}

durationUnhealthy := now.Sub(comparisonTime)
nextCheck := timeoutForMachineToHaveNode - durationUnhealthy + time.Second
nextCheck := timeoutForMachineToHaveNode.Duration - durationUnhealthy + time.Second

return false, nextCheck
}
Expand Down Expand Up @@ -261,7 +272,7 @@ func (r *MachineHealthCheckReconciler) getNodeFromMachine(ctx context.Context, c

// healthCheckTargets health checks a slice of targets
// and gives a data to measure the average health.
func (r *MachineHealthCheckReconciler) healthCheckTargets(targets []healthCheckTarget, logger logr.Logger, timeoutForMachineToHaveNode time.Duration) ([]healthCheckTarget, []healthCheckTarget, []time.Duration) {
func (r *MachineHealthCheckReconciler) healthCheckTargets(targets []healthCheckTarget, logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) ([]healthCheckTarget, []healthCheckTarget, []time.Duration) {
var nextCheckTimes []time.Duration
var unhealthy []healthCheckTarget
var healthy []healthCheckTarget
Expand Down Expand Up @@ -290,7 +301,7 @@ func (r *MachineHealthCheckReconciler) healthCheckTargets(targets []healthCheckT
continue
}

if t.Machine.DeletionTimestamp.IsZero() {
if t.Machine.DeletionTimestamp.IsZero() && t.Node != nil {
conditions.MarkTrue(t.Machine, clusterv1.MachineHealthCheckSuccededCondition)
healthy = append(healthy, t)
}
Expand Down
71 changes: 57 additions & 14 deletions controllers/machinehealthcheck_targets_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,19 @@ func TestHealthCheckTargets(t *testing.T) {
conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition)
conditions.MarkTrue(cluster, clusterv1.ControlPlaneInitializedCondition)

// Ensure the control plane was initialized earlier to prevent it interfering with
// NodeStartupTimeout testing.
conds := clusterv1.Conditions{}
for _, condition := range cluster.GetConditions() {
condition.LastTransitionTime = metav1.NewTime(condition.LastTransitionTime.Add(-1 * time.Hour))
conds = append(conds, condition)
}
cluster.SetConditions(conds)

mhcSelector := map[string]string{"cluster": clusterName, "machine-group": "foo"}

timeoutForMachineToHaveNode := 10 * time.Minute
disabledTimeoutForMachineToHaveNode := time.Duration(0)

// Create a test MHC
testMHC := &clusterv1.MachineHealthCheck{
Expand Down Expand Up @@ -229,15 +239,26 @@ func TestHealthCheckTargets(t *testing.T) {

testMachine := newTestMachine("machine1", namespace, clusterName, "node1", mhcSelector)

// Target for when the node has not yet been seen by the Machine controller
testMachineLastUpdated400s := testMachine.DeepCopy()
// Targets for when the node has not yet been seen by the Machine controller
testMachineCreated1200s := testMachine.DeepCopy()
nowMinus1200s := metav1.NewTime(time.Now().Add(-1200 * time.Second))
testMachineCreated1200s.ObjectMeta.CreationTimestamp = nowMinus1200s

nodeNotYetStartedTarget1200s := healthCheckTarget{
Cluster: cluster,
MHC: testMHC,
Machine: testMachineCreated1200s,
Node: nil,
}

testMachineCreated400s := testMachine.DeepCopy()
nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second))
testMachineLastUpdated400s.Status.LastUpdated = &nowMinus400s
testMachineCreated400s.ObjectMeta.CreationTimestamp = nowMinus400s

nodeNotYetStartedTarget := healthCheckTarget{
nodeNotYetStartedTarget400s := healthCheckTarget{
Cluster: cluster,
MHC: testMHC,
Machine: testMachineLastUpdated400s,
Machine: testMachineCreated400s,
Node: nil,
}

Expand Down Expand Up @@ -292,18 +313,26 @@ func TestHealthCheckTargets(t *testing.T) {
}

testCases := []struct {
desc string
targets []healthCheckTarget
expectedHealthy []healthCheckTarget
expectedNeedsRemediation []healthCheckTarget
expectedNextCheckTimes []time.Duration
desc string
targets []healthCheckTarget
timeoutForMachineToHaveNode *time.Duration
expectedHealthy []healthCheckTarget
expectedNeedsRemediation []healthCheckTarget
expectedNextCheckTimes []time.Duration
}{
{
desc: "when the node has not yet started",
targets: []healthCheckTarget{nodeNotYetStartedTarget},
desc: "when the node has not yet started for shorter than the timeout",
targets: []healthCheckTarget{nodeNotYetStartedTarget400s},
expectedHealthy: []healthCheckTarget{},
expectedNeedsRemediation: []healthCheckTarget{},
expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode},
expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
},
{
desc: "when the node has not yet started for longer than the timeout",
targets: []healthCheckTarget{nodeNotYetStartedTarget1200s},
expectedHealthy: []healthCheckTarget{},
expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
expectedNextCheckTimes: []time.Duration{},
},
{
desc: "when the node has gone away",
Expand Down Expand Up @@ -340,6 +369,14 @@ func TestHealthCheckTargets(t *testing.T) {
expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
expectedNextCheckTimes: []time.Duration{200 * time.Second, 100 * time.Second},
},
{
desc: "when the node has not started for a long time but the startup timeout is disabled",
targets: []healthCheckTarget{nodeNotYetStartedTarget400s},
timeoutForMachineToHaveNode: &disabledTimeoutForMachineToHaveNode,
expectedHealthy: []healthCheckTarget{}, // The node is not healthy as it does not have a machine
expectedNeedsRemediation: []healthCheckTarget{},
expectedNextCheckTimes: []time.Duration{}, // We don't have a timeout so no way to know when to re-check
},
}

for _, tc := range testCases {
Expand All @@ -351,7 +388,13 @@ func TestHealthCheckTargets(t *testing.T) {
recorder: record.NewFakeRecorder(5),
}

healthy, unhealthy, nextCheckTimes := reconciler.healthCheckTargets(tc.targets, ctrl.LoggerFrom(ctx), timeoutForMachineToHaveNode)
// Allow individual test cases to override the timeoutForMachineToHaveNode.
timeout := metav1.Duration{Duration: timeoutForMachineToHaveNode}
if tc.timeoutForMachineToHaveNode != nil {
timeout.Duration = *tc.timeoutForMachineToHaveNode
}

healthy, unhealthy, nextCheckTimes := reconciler.healthCheckTargets(tc.targets, ctrl.LoggerFrom(ctx), timeout)

// Round durations down to nearest second account for minute differences
// in timing when running tests
Expand Down
8 changes: 7 additions & 1 deletion docs/book/src/tasks/healthcheck.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,13 @@ spec:
# (Optional) maxUnhealthy prevents further remediation if the cluster is already partially unhealthy
maxUnhealthy: 40%
# (Optional) nodeStartupTimeout determines how long a MachineHealthCheck should wait for
# a Node to join the cluster, before considering a Machine unhealthy
# a Node to join the cluster, before considering a Machine unhealthy.
# Defaults to 10 minutes if not specified.
# Set to 0 to disable the node startup timeout.
# Disabling this timeout will prevent a Machine from being considered unhealthy when
# the Node it created has not yet registered with the cluster. This can be useful when
# Nodes take a long time to start up or when you only want condition based checks for
# Machine health.
nodeStartupTimeout: 10m
# selector is used to determine which Machines should be health checked
selector:
Expand Down