From a43e870ed01122e71129ea74bc23fdbdc0511039 Mon Sep 17 00:00:00 2001
From: Florian Gutmann
Date: Wed, 17 Aug 2022 02:45:22 +0000
Subject: [PATCH] Fix messages of MachineHealthCheckSucceededCondition

Messages for the MachineHasFailureReason contained memory addresses
instead of the failure reason text. This led to hot reconciliation
loops, because the message changed on every reconciliation.

Old: "FailureReason: 0xc000d80d10"
New: "FailureReason: UpdateError"

Messages for the NodeStartupTimeoutReason contained the timeout value
as a nested struct.

Old: "Node failed to report startup in &Duration{Duration:10m0s,}"
New: "Node failed to report startup in 10m0s"

cr: https://code.amazon.com/reviews/CR-74430064
---
 .../machinehealthcheck_targets.go             |  11 +-
 .../machinehealthcheck_targets_test.go        | 135 +++++++++++++-----
 2 files changed, 108 insertions(+), 38 deletions(-)

diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
index bbe5a54d760e..f22c79fe3503 100644
--- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
+++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
@@ -92,13 +92,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 	now := time.Now()
 
 	if t.Machine.Status.FailureReason != nil {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", t.Machine.Status.FailureReason)
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
 		logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
 		return true, time.Duration(0)
 	}
 
 	if t.Machine.Status.FailureMessage != nil {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", t.Machine.Status.FailureMessage)
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", *t.Machine.Status.FailureMessage)
 		logger.V(3).Info("Target is unhealthy", "failureMessage", t.Machine.Status.FailureMessage)
 		return true, time.Duration(0)
 	}
@@ -147,14 +147,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 	}
 	logger.V(3).Info("Using comparison time", "time", comparisonTime)
 
+	timeoutDuration := timeoutForMachineToHaveNode.Duration
 	if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutForMachineToHaveNode.String())
-		logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutForMachineToHaveNode.String())
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
+		logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
 		return true, time.Duration(0)
 	}
 
 	durationUnhealthy := now.Sub(comparisonTime)
-	nextCheck := timeoutForMachineToHaveNode.Duration - durationUnhealthy + time.Second
+	nextCheck := timeoutDuration - durationUnhealthy + time.Second
 
 	return false, nextCheck
 }
diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go
index 8b0e96780ad4..17e6de8062fe 100644
--- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go
+++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go
@@ -29,6 +29,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/errors"
 	"sigs.k8s.io/cluster-api/util/conditions"
 	"sigs.k8s.io/cluster-api/util/patch"
 )
@@ -209,6 +210,7 @@ func TestHealthCheckTargets(t *testing.T) {
 
 	timeoutForMachineToHaveNode := 10 * time.Minute
 	disabledTimeoutForMachineToHaveNode := time.Duration(0)
+	timeoutForUnhealthyConditions := 5 * time.Minute
 
 	// Create a test MHC
 	testMHC := &clusterv1.MachineHealthCheck{
@@ -225,12 +227,12 @@ func TestHealthCheckTargets(t *testing.T) {
 				{
 					Type:    corev1.NodeReady,
 					Status:  corev1.ConditionUnknown,
-					Timeout: metav1.Duration{Duration: 5 * time.Minute},
+					Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
 				},
 				{
 					Type:    corev1.NodeReady,
 					Status:  corev1.ConditionFalse,
-					Timeout: metav1.Duration{Duration: 5 * time.Minute},
+					Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
 				},
 			},
 		},
@@ -249,6 +251,7 @@ func TestHealthCheckTargets(t *testing.T) {
 		Machine: testMachineCreated1200s,
 		Node:    nil,
 	}
+	nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition(clusterv1.NodeStartupTimeoutReason, "Node failed to report startup in %s", timeoutForMachineToHaveNode)
 
 	testMachineCreated400s := testMachine.DeepCopy()
 	nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second))
@@ -265,17 +268,18 @@ func TestHealthCheckTargets(t *testing.T) {
 	nodeGoneAway := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        &corev1.Node{},
 		nodeMissing: true,
 	}
+	nodeGoneAwayCondition := newFailedHealthCheckCondition(clusterv1.NodeNotFoundReason, "")
 
 	// Target for when the node has been in an unknown state for shorter than the timeout
 	testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second)
 	nodeUnknown200 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown200,
 		nodeMissing: false,
 	}
@@ -285,7 +289,7 @@ func TestHealthCheckTargets(t *testing.T) {
 	nodeUnknown100 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown100,
 		nodeMissing: false,
 	}
@@ -295,10 +299,11 @@ func TestHealthCheckTargets(t *testing.T) {
 	nodeUnknown400 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown400,
 		nodeMissing: false,
 	}
+	nodeUnknown400Condition := newFailedHealthCheckCondition(clusterv1.UnhealthyNodeConditionReason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions)
 
 	// Target for when a node is healthy
 	testNodeHealthy := newTestNode("node1")
@@ -306,18 +311,43 @@ func TestHealthCheckTargets(t *testing.T) {
 	nodeHealthy := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeHealthy,
 		nodeMissing: false,
 	}
 
+	// Target for when the machine has a failure reason
+	failureReason := errors.UpdateMachineError
+	testMachineFailureReason := testMachine.DeepCopy()
+	testMachineFailureReason.Status.FailureReason = &failureReason
+	machineFailureReason := healthCheckTarget{
+		Cluster: cluster,
+		MHC:     testMHC,
+		Machine: testMachineFailureReason,
+		Node:    nil,
+	}
+	machineFailureReasonCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureReason: %s", failureReason)
+
+	// Target for when the machine has a failure message
+	failureMsg := "some failure message"
+	testMachineFailureMsg := testMachine.DeepCopy()
+	testMachineFailureMsg.Status.FailureMessage = &failureMsg
+	machineFailureMsg := healthCheckTarget{
+		Cluster: cluster,
+		MHC:     testMHC,
+		Machine: testMachineFailureMsg,
+		Node:    nil,
+	}
+	machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)
+
 	testCases := []struct {
-		desc                        string
-		targets                     []healthCheckTarget
-		timeoutForMachineToHaveNode *time.Duration
-		expectedHealthy             []healthCheckTarget
-		expectedNeedsRemediation    []healthCheckTarget
-		expectedNextCheckTimes      []time.Duration
+		desc                              string
+		targets                           []healthCheckTarget
+		timeoutForMachineToHaveNode       *time.Duration
+		expectedHealthy                   []healthCheckTarget
+		expectedNeedsRemediation          []healthCheckTarget
+		expectedNeedsRemediationCondition []clusterv1.Condition
+		expectedNextCheckTimes            []time.Duration
 	}{
 		{
 			desc:                     "when the node has not yet started for shorter than the timeout",
 			targets:                  []healthCheckTarget{nodeNotYetStartedTarget400s},
 			expectedHealthy:          []healthCheckTarget{},
 			expectedNeedsRemediation: []healthCheckTarget{},
 			expectedNextCheckTimes:   []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
 		},
 		{
-			desc:                     "when the node has not yet started for longer than the timeout",
-			targets:                  []healthCheckTarget{nodeNotYetStartedTarget1200s},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has not yet started for longer than the timeout",
+			targets:                           []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
-			desc:                     "when the node has gone away",
-			targets:                  []healthCheckTarget{nodeGoneAway},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has gone away",
+			targets:                           []healthCheckTarget{nodeGoneAway},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeGoneAway},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
 			desc:                     "when the node has been in an unknown state for shorter than the timeout",
 			targets:                  []healthCheckTarget{nodeUnknown200},
 			expectedHealthy:          []healthCheckTarget{},
 			expectedNeedsRemediation: []healthCheckTarget{},
 			expectedNextCheckTimes:   []time.Duration{100 * time.Second},
 		},
 		{
-			desc:                     "when the node has been in an unknown state for longer than the timeout",
-			targets:                  []healthCheckTarget{nodeUnknown400},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has been in an unknown state for longer than the timeout",
+			targets:                           []healthCheckTarget{nodeUnknown400},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeUnknown400},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
 			desc:                     "when the node is healthy",
 			targets:                  []healthCheckTarget{nodeHealthy},
 			expectedHealthy:          []healthCheckTarget{nodeHealthy},
 			expectedNeedsRemediation: []healthCheckTarget{},
 			expectedNextCheckTimes:   []time.Duration{},
 		},
 		{
-			desc:                     "with a mix of healthy and unhealthy nodes",
-			targets:                  []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
-			expectedHealthy:          []healthCheckTarget{nodeHealthy},
-			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-			expectedNextCheckTimes:   []time.Duration{200 * time.Second, 100 * time.Second},
+			desc:                              "with a mix of healthy and unhealthy nodes",
+			targets:                           []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
+			expectedHealthy:                   []healthCheckTarget{nodeHealthy},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeUnknown400},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+			expectedNextCheckTimes:            []time.Duration{200 * time.Second, 100 * time.Second},
 		},
 		{
 			desc:                        "when the node has not started for a long time but the startup timeout is disabled",
 			targets:                     []healthCheckTarget{nodeNotYetStartedTarget400s},
 			timeoutForMachineToHaveNode: &disabledTimeoutForMachineToHaveNode,
 			expectedHealthy:             []healthCheckTarget{nodeNotYetStartedTarget400s},
 			expectedNeedsRemediation:    []healthCheckTarget{},
 			expectedNextCheckTimes:      []time.Duration{}, // We don't have a timeout so no way to know when to re-check
 		},
+		{
+			desc:                              "when the machine has a failure reason",
+			targets:                           []healthCheckTarget{machineFailureReason},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{machineFailureReason},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureReasonCondition},
+			expectedNextCheckTimes:            []time.Duration{},
+		},
+		{
+			desc:                              "when the machine has a failure message",
+			targets:                           []healthCheckTarget{machineFailureMsg},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{machineFailureMsg},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
+			expectedNextCheckTimes:            []time.Duration{},
+		},
 	}
 
 	for _, tc := range testCases {
@@ -405,9 +455,24 @@ func TestHealthCheckTargets(t *testing.T) {
 				return out
 			}
 
+			// Remove the last transition time of the given conditions. Used for comparison with expected conditions.
+			removeLastTransitionTimes := func(in clusterv1.Conditions) clusterv1.Conditions {
+				out := clusterv1.Conditions{}
+				for _, c := range in {
+					withoutTime := c.DeepCopy()
+					withoutTime.LastTransitionTime = metav1.Time{}
+					out = append(out, *withoutTime)
+				}
+				return out
+			}
+
 			gs.Expect(healthy).To(ConsistOf(tc.expectedHealthy))
 			gs.Expect(unhealthy).To(ConsistOf(tc.expectedNeedsRemediation))
 			gs.Expect(nextCheckTimes).To(WithTransform(roundDurations, ConsistOf(tc.expectedNextCheckTimes)))
+			for i, expectedMachineConditions := range tc.expectedNeedsRemediationCondition {
+				actualConditions := unhealthy[i].Machine.GetConditions()
+				gs.Expect(actualConditions).To(WithTransform(removeLastTransitionTimes, ContainElements(expectedMachineConditions)))
+			}
 		})
 	}
 }
@@ -477,3 +542,7 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, statu
 		},
 	}
 }
+
+func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition {
+	return *conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededCondition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...)
+}
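
Note on the root cause, with a minimal standalone sketch that is not part of
the patch: Machine.Status.FailureReason and FailureMessage are pointers, and
formatting a pointer with %v prints its address (for example "0xc000d80d10"),
which changes whenever the object is rebuilt, so the condition message never
settles; dereferencing first prints the stable value. Likewise, passing the
inner time.Duration renders plainly as "10m0s". The capierrors alias below is
an assumption for the "sigs.k8s.io/cluster-api/errors" package imported in the
test diff above.

    package main

    import (
        "fmt"
        "time"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        capierrors "sigs.k8s.io/cluster-api/errors"
    )

    func main() {
        reason := capierrors.UpdateMachineError

        // Formatting the pointer prints a memory address that differs
        // between reconciliations, e.g. "FailureReason: 0xc000d80d10".
        fmt.Printf("FailureReason: %v\n", &reason)

        // Dereferencing prints the stable value, "FailureReason: UpdateError".
        fmt.Printf("FailureReason: %v\n", reason)

        // Passing the unwrapped time.Duration renders as "10m0s" rather than
        // the nested metav1.Duration struct representation.
        timeout := metav1.Duration{Duration: 10 * time.Minute}
        fmt.Printf("Node failed to report startup in %s\n", timeout.Duration)
    }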