Fix messages of MachineHealthCheckSucceededCondition
Messages for the MachineHasFailureReason condition reason
contained memory addresses instead of the failure reason text.
This led to hot reconciliation loops, because the message
changed on every reconciliation.
Old: "FailureReason: 0xc000d80d10"
New: "FailureReason: UpdateError"

Messages for the UnhealthyNodeConditionReason condition reason
rendered the timeout value as a nested struct instead of a
plain duration.
Old: "Node failed to report startup in &Duration{Duration:10m0s,}"
New: "Node failed to report startup in 10m0s"

cr: https://code.amazon.com/reviews/CR-74430064
Florian Gutmann committed Aug 18, 2022
1 parent 8b86290 commit 0c45ee8
Showing 2 changed files with 108 additions and 38 deletions.
@@ -93,13 +93,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 	now := time.Now()

 	if t.Machine.Status.FailureReason != nil {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", t.Machine.Status.FailureReason)
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
 		logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
 		return true, time.Duration(0)
 	}

 	if t.Machine.Status.FailureMessage != nil {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", t.Machine.Status.FailureMessage)
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", *t.Machine.Status.FailureMessage)
 		logger.V(3).Info("Target is unhealthy", "failureMessage", t.Machine.Status.FailureMessage)
 		return true, time.Duration(0)
 	}
@@ -148,14 +148,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 	}
 	logger.V(3).Info("Using comparison time", "time", comparisonTime)

+	timeoutDuration := timeoutForMachineToHaveNode.Duration
 	if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutForMachineToHaveNode.String())
-		logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutForMachineToHaveNode.String())
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
+		logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
 		return true, time.Duration(0)
 	}

 	durationUnhealthy := now.Sub(comparisonTime)
-	nextCheck := timeoutForMachineToHaveNode.Duration - durationUnhealthy + time.Second
+	nextCheck := timeoutDuration - durationUnhealthy + time.Second

 	return false, nextCheck
 }
@@ -29,6 +29,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"

 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/errors"
 	"sigs.k8s.io/cluster-api/util/conditions"
 	"sigs.k8s.io/cluster-api/util/patch"
 )
@@ -209,6 +210,7 @@ func TestHealthCheckTargets(t *testing.T) {

 	timeoutForMachineToHaveNode := 10 * time.Minute
 	disabledTimeoutForMachineToHaveNode := time.Duration(0)
+	timeoutForUnhealthyConditions := 5 * time.Minute

 	// Create a test MHC
 	testMHC := &clusterv1.MachineHealthCheck{
@@ -225,12 +227,12 @@
 				{
 					Type: corev1.NodeReady,
 					Status: corev1.ConditionUnknown,
-					Timeout: metav1.Duration{Duration: 5 * time.Minute},
+					Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
 				},
 				{
 					Type: corev1.NodeReady,
 					Status: corev1.ConditionFalse,
-					Timeout: metav1.Duration{Duration: 5 * time.Minute},
+					Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
 				},
 			},
 		},
@@ -249,6 +251,7 @@
 		Machine: testMachineCreated1200s,
 		Node: nil,
 	}
+	nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition(clusterv1.NodeStartupTimeoutReason, "Node failed to report startup in %s", timeoutForMachineToHaveNode)

 	testMachineCreated400s := testMachine.DeepCopy()
 	nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second))
@@ -265,17 +268,18 @@
 	nodeGoneAway := healthCheckTarget{
 		Cluster: cluster,
 		MHC: testMHC,
-		Machine: testMachine,
+		Machine: testMachine.DeepCopy(),
 		Node: &corev1.Node{},
 		nodeMissing: true,
 	}
+	nodeGoneAwayCondition := newFailedHealthCheckCondition(clusterv1.NodeNotFoundReason, "")

 	// Target for when the node has been in an unknown state for shorter than the timeout
 	testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second)
 	nodeUnknown200 := healthCheckTarget{
 		Cluster: cluster,
 		MHC: testMHC,
-		Machine: testMachine,
+		Machine: testMachine.DeepCopy(),
 		Node: testNodeUnknown200,
 		nodeMissing: false,
 	}
@@ -285,7 +289,7 @@
 	nodeUnknown100 := healthCheckTarget{
 		Cluster: cluster,
 		MHC: testMHC,
-		Machine: testMachine,
+		Machine: testMachine.DeepCopy(),
 		Node: testNodeUnknown100,
 		nodeMissing: false,
 	}
@@ -295,29 +299,55 @@
 	nodeUnknown400 := healthCheckTarget{
 		Cluster: cluster,
 		MHC: testMHC,
-		Machine: testMachine,
+		Machine: testMachine.DeepCopy(),
 		Node: testNodeUnknown400,
 		nodeMissing: false,
 	}
+	nodeUnknown400Condition := newFailedHealthCheckCondition(clusterv1.UnhealthyNodeConditionReason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions)

 	// Target for when a node is healthy
 	testNodeHealthy := newTestNode("node1")
 	testNodeHealthy.UID = "12345"
 	nodeHealthy := healthCheckTarget{
 		Cluster: cluster,
 		MHC: testMHC,
-		Machine: testMachine,
+		Machine: testMachine.DeepCopy(),
 		Node: testNodeHealthy,
 		nodeMissing: false,
 	}

+	// Target for when the machine has a failure reason
+	failureReason := errors.UpdateMachineError
+	testMachineFailureReason := testMachine.DeepCopy()
+	testMachineFailureReason.Status.FailureReason = &failureReason
+	machineFailureReason := healthCheckTarget{
+		Cluster: cluster,
+		MHC: testMHC,
+		Machine: testMachineFailureReason,
+		Node: nil,
+	}
+	machineFailureReasonCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureReason: %s", failureReason)
+
+	// Target for when the machine has a failure message
+	failureMsg := "some failure message"
+	testMachineFailureMsg := testMachine.DeepCopy()
+	testMachineFailureMsg.Status.FailureMessage = &failureMsg
+	machineFailureMsg := healthCheckTarget{
+		Cluster: cluster,
+		MHC: testMHC,
+		Machine: testMachineFailureMsg,
+		Node: nil,
+	}
+	machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)
+
 	testCases := []struct {
-		desc string
-		targets []healthCheckTarget
-		timeoutForMachineToHaveNode *time.Duration
-		expectedHealthy []healthCheckTarget
-		expectedNeedsRemediation []healthCheckTarget
-		expectedNextCheckTimes []time.Duration
+		desc string
+		targets []healthCheckTarget
+		timeoutForMachineToHaveNode *time.Duration
+		expectedHealthy []healthCheckTarget
+		expectedNeedsRemediation []healthCheckTarget
+		expectedNeedsRemediationCondition []clusterv1.Condition
+		expectedNextCheckTimes []time.Duration
 	}{
 		{
 			desc: "when the node has not yet started for shorter than the timeout",
@@ -327,18 +357,20 @@ func TestHealthCheckTargets(t *testing.T) {
 			expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
 		},
 		{
-			desc: "when the node has not yet started for longer than the timeout",
-			targets: []healthCheckTarget{nodeNotYetStartedTarget1200s},
-			expectedHealthy: []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
-			expectedNextCheckTimes: []time.Duration{},
+			desc: "when the node has not yet started for longer than the timeout",
+			targets: []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedHealthy: []healthCheckTarget{},
+			expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition},
+			expectedNextCheckTimes: []time.Duration{},
 		},
 		{
-			desc: "when the node has gone away",
-			targets: []healthCheckTarget{nodeGoneAway},
-			expectedHealthy: []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway},
-			expectedNextCheckTimes: []time.Duration{},
+			desc: "when the node has gone away",
+			targets: []healthCheckTarget{nodeGoneAway},
+			expectedHealthy: []healthCheckTarget{},
+			expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition},
+			expectedNextCheckTimes: []time.Duration{},
 		},
 		{
 			desc: "when the node has been in an unknown state for shorter than the timeout",
@@ -348,11 +380,12 @@ func TestHealthCheckTargets(t *testing.T) {
 			expectedNextCheckTimes: []time.Duration{100 * time.Second},
 		},
 		{
-			desc: "when the node has been in an unknown state for longer than the timeout",
-			targets: []healthCheckTarget{nodeUnknown400},
-			expectedHealthy: []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-			expectedNextCheckTimes: []time.Duration{},
+			desc: "when the node has been in an unknown state for longer than the timeout",
+			targets: []healthCheckTarget{nodeUnknown400},
+			expectedHealthy: []healthCheckTarget{},
+			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+			expectedNextCheckTimes: []time.Duration{},
 		},
 		{
 			desc: "when the node is healthy",
@@ -362,11 +395,12 @@ func TestHealthCheckTargets(t *testing.T) {
 			expectedNextCheckTimes: []time.Duration{},
 		},
 		{
-			desc: "with a mix of healthy and unhealthy nodes",
-			targets: []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
-			expectedHealthy: []healthCheckTarget{nodeHealthy},
-			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-			expectedNextCheckTimes: []time.Duration{200 * time.Second, 100 * time.Second},
+			desc: "with a mix of healthy and unhealthy nodes",
+			targets: []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
+			expectedHealthy: []healthCheckTarget{nodeHealthy},
+			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+			expectedNextCheckTimes: []time.Duration{200 * time.Second, 100 * time.Second},
 		},
 		{
 			desc: "when the node has not started for a long time but the startup timeout is disabled",
@@ -376,6 +410,22 @@
 			expectedNeedsRemediation: []healthCheckTarget{},
 			expectedNextCheckTimes: []time.Duration{}, // We don't have a timeout so no way to know when to re-check
 		},
+		{
+			desc: "when the machine has a failure reason",
+			targets: []healthCheckTarget{machineFailureReason},
+			expectedHealthy: []healthCheckTarget{},
+			expectedNeedsRemediation: []healthCheckTarget{machineFailureReason},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureReasonCondition},
+			expectedNextCheckTimes: []time.Duration{},
+		},
+		{
+			desc: "when the machine has a failure message",
+			targets: []healthCheckTarget{machineFailureMsg},
+			expectedHealthy: []healthCheckTarget{},
+			expectedNeedsRemediation: []healthCheckTarget{machineFailureMsg},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
+			expectedNextCheckTimes: []time.Duration{},
+		},
 	}

 	for _, tc := range testCases {
@@ -405,9 +455,24 @@ func TestHealthCheckTargets(t *testing.T) {
 				return out
 			}

+			// Remove the last transition time of the given conditions. Used for comparison with expected conditions.
+			removeLastTransitionTimes := func(in clusterv1.Conditions) clusterv1.Conditions {
+				out := clusterv1.Conditions{}
+				for _, c := range in {
+					withoutTime := c.DeepCopy()
+					withoutTime.LastTransitionTime = metav1.Time{}
+					out = append(out, *withoutTime)
+				}
+				return out
+			}
+
 			gs.Expect(healthy).To(ConsistOf(tc.expectedHealthy))
 			gs.Expect(unhealthy).To(ConsistOf(tc.expectedNeedsRemediation))
 			gs.Expect(nextCheckTimes).To(WithTransform(roundDurations, ConsistOf(tc.expectedNextCheckTimes)))
+			for i, expectedMachineConditions := range tc.expectedNeedsRemediationCondition {
+				actualConditions := unhealthy[i].Machine.GetConditions()
+				gs.Expect(actualConditions).To(WithTransform(removeLastTransitionTimes, ContainElements(expectedMachineConditions)))
+			}
 		})
 	}
 }
@@ -477,3 +542,7 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, statu
 		},
 	}
 }
+
+func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition {
+	return *conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededCondition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...)
+}