From 2be6be2ade895dce4d48fb4f2cca48b686149293 Mon Sep 17 00:00:00 2001
From: Joel Speed
Date: Tue, 13 Apr 2021 14:54:52 +0100
Subject: [PATCH] Allow users to disable MHC NodeStartupTimeout

This adds a new explicit 0 second option for the MHC NodeStartupTimeout
which allows users to disable the functionality provided by
NodeStartupTimeout.

This enables use cases where customers explicitly want an MHC with only
condition-based health checks.
---
 api/v1alpha4/common_types.go                  |  6 ++
 api/v1alpha4/machinehealthcheck_types.go      |  2 +
 api/v1alpha4/machinehealthcheck_webhook.go    | 14 ++--
 .../machinehealthcheck_webhook_test.go        |  4 +-
 .../cluster.x-k8s.io_machinehealthchecks.yaml |  4 +-
 controllers/machinehealthcheck_controller.go  |  7 +-
 controllers/machinehealthcheck_targets.go     | 21 ++++--
 .../machinehealthcheck_targets_test.go        | 71 +++++++++++++++----
 docs/book/src/tasks/healthcheck.md            |  8 ++-
 9 files changed, 108 insertions(+), 29 deletions(-)

diff --git a/api/v1alpha4/common_types.go b/api/v1alpha4/common_types.go
index d699bcf164f6..2b57dca76ce6 100644
--- a/api/v1alpha4/common_types.go
+++ b/api/v1alpha4/common_types.go
@@ -18,6 +18,7 @@ package v1alpha4
 
 import (
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 const (
@@ -93,6 +94,11 @@ const (
 	ManagedByAnnotation = "cluster.x-k8s.io/managed-by"
 )
 
+var (
+	// ZeroDuration is a zero value of the metav1.Duration type.
+	ZeroDuration = metav1.Duration{}
+)
+
 // MachineAddressType describes a valid MachineAddress type.
 type MachineAddressType string
 
diff --git a/api/v1alpha4/machinehealthcheck_types.go b/api/v1alpha4/machinehealthcheck_types.go
index b36550ca1259..0f73d9c99390 100644
--- a/api/v1alpha4/machinehealthcheck_types.go
+++ b/api/v1alpha4/machinehealthcheck_types.go
@@ -56,6 +56,8 @@ type MachineHealthCheckSpec struct {
 
 	// Machines older than this duration without a node will be considered to have
 	// failed and will be remediated.
+	// If not set, this value is defaulted to 10 minutes.
+	// If you wish to disable this feature, set the value explicitly to 0.
 	// +optional
 	NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
 
diff --git a/api/v1alpha4/machinehealthcheck_webhook.go b/api/v1alpha4/machinehealthcheck_webhook.go
index cf123bd92adb..5b5c65dd7a69 100644
--- a/api/v1alpha4/machinehealthcheck_webhook.go
+++ b/api/v1alpha4/machinehealthcheck_webhook.go
@@ -30,13 +30,15 @@ import (
 )
 
 var (
-	// Default time allowed for a node to start up. Can be made longer as part of
-	// spec if required for particular provider.
+	// DefaultNodeStartupTimeout is the time allowed for a node to start up.
+	// Can be made longer as part of spec if required for particular provider.
 	// 10 minutes should allow the instance to start and the node to join the
 	// cluster on most providers.
-	defaultNodeStartupTimeout = metav1.Duration{Duration: 10 * time.Minute}
+	DefaultNodeStartupTimeout = metav1.Duration{Duration: 10 * time.Minute}
 	// Minimum time allowed for a node to start up.
 	minNodeStartupTimeout = metav1.Duration{Duration: 30 * time.Second}
+	// We allow users to disable the nodeStartupTimeout by setting the duration to 0.
+	disabledNodeStartupTimeout = ZeroDuration
 )
 
 // SetMinNodeStartupTimeout allows users to optionally set a custom timeout
@@ -73,7 +75,7 @@ func (m *MachineHealthCheck) Default() {
 	}
 
 	if m.Spec.NodeStartupTimeout == nil {
-		m.Spec.NodeStartupTimeout = &defaultNodeStartupTimeout
+		m.Spec.NodeStartupTimeout = &DefaultNodeStartupTimeout
 	}
 }
 
@@ -129,7 +131,9 @@ func (m *MachineHealthCheck) validate(old *MachineHealthCheck) error {
 		)
 	}
 
-	if m.Spec.NodeStartupTimeout != nil && m.Spec.NodeStartupTimeout.Seconds() < minNodeStartupTimeout.Seconds() {
+	if m.Spec.NodeStartupTimeout != nil &&
+		m.Spec.NodeStartupTimeout.Seconds() != disabledNodeStartupTimeout.Seconds() &&
+		m.Spec.NodeStartupTimeout.Seconds() < minNodeStartupTimeout.Seconds() {
 		allErrs = append(
 			allErrs,
 			field.Invalid(field.NewPath("spec", "nodeStartupTimeout"), m.Spec.NodeStartupTimeout.Seconds(), "must be at least 30s"),
diff --git a/api/v1alpha4/machinehealthcheck_webhook_test.go b/api/v1alpha4/machinehealthcheck_webhook_test.go
index 8d2da7611242..752bf43b9dec 100644
--- a/api/v1alpha4/machinehealthcheck_webhook_test.go
+++ b/api/v1alpha4/machinehealthcheck_webhook_test.go
@@ -177,9 +177,9 @@ func TestMachineHealthCheckNodeStartupTimeout(t *testing.T) {
 			expectErr: true,
 		},
 		{
-			name:      "when the nodeStartupTimeout is 0",
+			name:      "when the nodeStartupTimeout is 0 (disabled)",
 			timeout:   &zero,
-			expectErr: true,
+			expectErr: false,
 		},
 	}
 
diff --git a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
index 59aab4875e81..9f49e01b6f81 100644
--- a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
+++ b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
@@ -319,7 +319,9 @@ spec:
                 x-kubernetes-int-or-string: true
               nodeStartupTimeout:
                 description: Machines older than this duration without a node will
-                  be considered to have failed and will be remediated.
+                  be considered to have failed and will be remediated. If not set,
+                  this value is defaulted to 10 minutes. If you wish to disable this
+                  feature, set the value explicitly to 0.
                type: string
              remediationTemplate:
                description: "RemediationTemplate is a reference to a remediation
diff --git a/controllers/machinehealthcheck_controller.go b/controllers/machinehealthcheck_controller.go
index 67a0581cdb35..d6d9f554cb23 100644
--- a/controllers/machinehealthcheck_controller.go
+++ b/controllers/machinehealthcheck_controller.go
@@ -209,8 +209,13 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
 	// do sort to avoid constantly changing m.Status as the returned machines are not in order
 	sort.Strings(m.Status.Targets)
 
+	nodeStartupTimeout := m.Spec.NodeStartupTimeout
+	if nodeStartupTimeout == nil {
+		nodeStartupTimeout = &clusterv1.DefaultNodeStartupTimeout
+	}
+
 	// health check all targets and reconcile mhc status
-	healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, m.Spec.NodeStartupTimeout.Duration)
+	healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, *nodeStartupTimeout)
 	m.Status.CurrentHealthy = int32(len(healthy))
 
 	var unhealthyLimitKey, unhealthyLimitValue interface{}
diff --git a/controllers/machinehealthcheck_targets.go b/controllers/machinehealthcheck_targets.go
index c81408c0e2d4..a45000b380f6 100644
--- a/controllers/machinehealthcheck_targets.go
+++ b/controllers/machinehealthcheck_targets.go
@@ -44,6 +44,11 @@ const (
 	EventDetectedUnhealthy string = "DetectedUnhealthy"
 )
 
+var (
+	// We allow users to disable the nodeStartupTimeout by setting the duration to 0.
+	disabledNodeStartupTimeout = clusterv1.ZeroDuration
+)
+
 // healthCheckTarget contains the information required to perform a health check
 // on the node to determine if any remediation is required.
 type healthCheckTarget struct {
@@ -81,7 +86,7 @@ func (t *healthCheckTarget) nodeName() string {
 // If the target doesn't currently need remediation, provide a duration after
 // which the target should next be checked.
 // The target should be requeued after this duration.
-func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode time.Duration) (bool, time.Duration) {
+func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
 	var nextCheckTimes []time.Duration
 	now := time.Now()
 
@@ -120,6 +125,12 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 	// the node has not been set yet
 	if t.Node == nil {
+		if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
+			// Startup timeout is disabled so no need to go any further.
+			// No node yet to check conditions, can return early here.
+			return false, 0
+		}
+
 		controlPlaneInitializedTime := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition).Time
 		clusterInfraReadyTime := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition).Time
 		machineCreationTime := t.Machine.CreationTimestamp.Time
@@ -135,14 +146,14 @@
 		}
 		logger.V(3).Info("Using comparison time", "time", comparisonTime)
 
-		if comparisonTime.Add(timeoutForMachineToHaveNode).Before(now) {
+		if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
 			conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSuccededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutForMachineToHaveNode.String())
 			logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutForMachineToHaveNode.String())
 			return true, time.Duration(0)
 		}
 
 		durationUnhealthy := now.Sub(comparisonTime)
-		nextCheck := timeoutForMachineToHaveNode - durationUnhealthy + time.Second
+		nextCheck := timeoutForMachineToHaveNode.Duration - durationUnhealthy + time.Second
 		return false, nextCheck
 	}
 
@@ -261,7 +272,7 @@ func (r *MachineHealthCheckReconciler) getNodeFromMachine(ctx context.Context, c
 
 // healthCheckTargets health checks a slice of targets
 // and gives the data to measure the average health.
-func (r *MachineHealthCheckReconciler) healthCheckTargets(targets []healthCheckTarget, logger logr.Logger, timeoutForMachineToHaveNode time.Duration) ([]healthCheckTarget, []healthCheckTarget, []time.Duration) {
+func (r *MachineHealthCheckReconciler) healthCheckTargets(targets []healthCheckTarget, logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) ([]healthCheckTarget, []healthCheckTarget, []time.Duration) {
 	var nextCheckTimes []time.Duration
 	var unhealthy []healthCheckTarget
 	var healthy []healthCheckTarget
@@ -290,7 +301,7 @@
 			continue
 		}
 
-		if t.Machine.DeletionTimestamp.IsZero() {
+		if t.Machine.DeletionTimestamp.IsZero() && t.Node != nil {
 			conditions.MarkTrue(t.Machine, clusterv1.MachineHealthCheckSuccededCondition)
 			healthy = append(healthy, t)
 		}
diff --git a/controllers/machinehealthcheck_targets_test.go b/controllers/machinehealthcheck_targets_test.go
index 9860689fc0e0..46c44ba4a48c 100644
--- a/controllers/machinehealthcheck_targets_test.go
+++ b/controllers/machinehealthcheck_targets_test.go
@@ -197,9 +197,19 @@ func TestHealthCheckTargets(t *testing.T) {
 	conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition)
 	conditions.MarkTrue(cluster, clusterv1.ControlPlaneInitializedCondition)
 
+	// Ensure the control plane was initialized earlier to prevent it from interfering
+	// with NodeStartupTimeout testing.
+	conds := clusterv1.Conditions{}
+	for _, condition := range cluster.GetConditions() {
+		condition.LastTransitionTime = metav1.NewTime(condition.LastTransitionTime.Add(-1 * time.Hour))
+		conds = append(conds, condition)
+	}
+	cluster.SetConditions(conds)
+
 	mhcSelector := map[string]string{"cluster": clusterName, "machine-group": "foo"}
 
 	timeoutForMachineToHaveNode := 10 * time.Minute
+	disabledTimeoutForMachineToHaveNode := time.Duration(0)
 
 	// Create a test MHC
 	testMHC := &clusterv1.MachineHealthCheck{
@@ -229,15 +239,26 @@
 	testMachine := newTestMachine("machine1", namespace, clusterName, "node1", mhcSelector)
 
-	// Target for when the node has not yet been seen by the Machine controller
-	testMachineLastUpdated400s := testMachine.DeepCopy()
+	// Targets for when the node has not yet been seen by the Machine controller
+	testMachineCreated1200s := testMachine.DeepCopy()
+	nowMinus1200s := metav1.NewTime(time.Now().Add(-1200 * time.Second))
+	testMachineCreated1200s.ObjectMeta.CreationTimestamp = nowMinus1200s
+
+	nodeNotYetStartedTarget1200s := healthCheckTarget{
+		Cluster: cluster,
+		MHC:     testMHC,
+		Machine: testMachineCreated1200s,
+		Node:    nil,
+	}
+
+	testMachineCreated400s := testMachine.DeepCopy()
 	nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second))
-	testMachineLastUpdated400s.Status.LastUpdated = &nowMinus400s
+	testMachineCreated400s.ObjectMeta.CreationTimestamp = nowMinus400s
 
-	nodeNotYetStartedTarget := healthCheckTarget{
+	nodeNotYetStartedTarget400s := healthCheckTarget{
 		Cluster: cluster,
 		MHC:     testMHC,
-		Machine: testMachineLastUpdated400s,
+		Machine: testMachineCreated400s,
 		Node:    nil,
 	}
@@ -292,18 +313,26 @@
 	testCases := []struct {
-		desc                     string
-		targets                  []healthCheckTarget
-		expectedHealthy          []healthCheckTarget
-		expectedNeedsRemediation []healthCheckTarget
-		expectedNextCheckTimes   []time.Duration
+		desc                        string
+		targets                     []healthCheckTarget
+		timeoutForMachineToHaveNode *time.Duration
+		expectedHealthy             []healthCheckTarget
+		expectedNeedsRemediation    []healthCheckTarget
+		expectedNextCheckTimes      []time.Duration
 	}{
 		{
-			desc:                     "when the node has not yet started",
-			targets:                  []healthCheckTarget{nodeNotYetStartedTarget},
+			desc:                     "when the node has not yet started for shorter than the timeout",
+			targets:                  []healthCheckTarget{nodeNotYetStartedTarget400s},
 			expectedHealthy:          []healthCheckTarget{},
 			expectedNeedsRemediation: []healthCheckTarget{},
-			expectedNextCheckTimes:   []time.Duration{timeoutForMachineToHaveNode},
+			expectedNextCheckTimes:   []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
+		},
+		{
+			desc:                     "when the node has not yet started for longer than the timeout",
+			targets:                  []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedHealthy:          []healthCheckTarget{},
+			expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedNextCheckTimes:   []time.Duration{},
 		},
 		{
 			desc: "when the node has gone away",
@@ -340,6 +369,14 @@
 			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
 			expectedNextCheckTimes:   []time.Duration{200 * time.Second, 100 * time.Second},
 		},
+		{
+			desc:                        "when the node has not started for a long time but the startup timeout is disabled",
+			targets:                     []healthCheckTarget{nodeNotYetStartedTarget400s},
+			timeoutForMachineToHaveNode: &disabledTimeoutForMachineToHaveNode,
+			expectedHealthy:             []healthCheckTarget{}, // The machine is not healthy as it does not have a node
+			expectedNeedsRemediation:    []healthCheckTarget{},
+			expectedNextCheckTimes:      []time.Duration{}, // We don't have a timeout so no way to know when to re-check
+		},
 	}
 
 	for _, tc := range testCases {
@@ -351,7 +388,13 @@
 				recorder: record.NewFakeRecorder(5),
 			}
 
-			healthy, unhealthy, nextCheckTimes := reconciler.healthCheckTargets(tc.targets, ctrl.LoggerFrom(ctx), timeoutForMachineToHaveNode)
+			// Allow individual test cases to override the timeoutForMachineToHaveNode.
+			timeout := metav1.Duration{Duration: timeoutForMachineToHaveNode}
+			if tc.timeoutForMachineToHaveNode != nil {
+				timeout.Duration = *tc.timeoutForMachineToHaveNode
+			}
+
+			healthy, unhealthy, nextCheckTimes := reconciler.healthCheckTargets(tc.targets, ctrl.LoggerFrom(ctx), timeout)
 
 			// Round durations down to the nearest second to account for minute differences
 			// in timing when running tests
diff --git a/docs/book/src/tasks/healthcheck.md b/docs/book/src/tasks/healthcheck.md
index 059484f6904c..b6f20ed4f489 100644
--- a/docs/book/src/tasks/healthcheck.md
+++ b/docs/book/src/tasks/healthcheck.md
@@ -38,7 +38,13 @@ spec:
   # (Optional) maxUnhealthy prevents further remediation if the cluster is already partially unhealthy
   maxUnhealthy: 40%
   # (Optional) nodeStartupTimeout determines how long a MachineHealthCheck should wait for
-  # a Node to join the cluster, before considering a Machine unhealthy
+  # a Node to join the cluster, before considering a Machine unhealthy.
+  # Defaults to 10 minutes if not specified.
+  # Set to 0 to disable the node startup timeout.
+  # Disabling this timeout will prevent a Machine from being considered unhealthy when
+  # its Node has not yet registered with the cluster. This can be useful when Nodes
+  # take a long time to start up or when you only want condition-based checks for
+  # Machine health.
   nodeStartupTimeout: 10m
   # selector is used to determine which Machines should be health checked
   selector:
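
Put together, a MachineHealthCheck that relies purely on condition-based checks sets the new zero value explicitly. The following is a minimal sketch adapted from the healthcheck.md example above; the name, namespace, cluster name, selector label, and condition timeouts are illustrative, and only `nodeStartupTimeout: 0s` exercises the behaviour added by this patch:

```yaml
apiVersion: cluster.x-k8s.io/v1alpha4
kind: MachineHealthCheck
metadata:
  name: conditions-only-mhc   # illustrative name
  namespace: default
spec:
  clusterName: my-cluster     # illustrative cluster name
  selector:
    matchLabels:
      nodepool: nodepool-0    # illustrative label
  # An explicit zero disables the node startup timeout, so Machines whose
  # Nodes are slow to register are never remediated for startup lateness.
  nodeStartupTimeout: 0s
  # With the startup timeout disabled, remediation is driven solely by
  # these condition-based checks.
  unhealthyConditions:
    - type: Ready
      status: Unknown
      timeout: 300s
    - type: Ready
      status: "False"
      timeout: 300s
```

Because the field is still parsed as a metav1.Duration, the explicit zero is written as a duration string such as `0s` or `0m`; any non-zero value below the 30 second minimum is still rejected by the validating webhook.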