From 8a85616922cc47f5c741bb913a6ad7561932a982 Mon Sep 17 00:00:00 2001 From: Andy Goldstein Date: Fri, 25 Sep 2020 09:50:40 -0400 Subject: [PATCH] MHC: control plane init / cluster infra ready Change the MachineHealthCheck logic to require the Cluster's "control plane initialized" and "infrastructure ready" conditions to be true before proceeding to determine if a target is unhealhty. We can't look just at a Machine's last updated time when determining if the Machine has exceeded the node startup timeout. A node can't bootstrap until after the cluster's infrastructure is ready and the control plane is initialized, and we need to base node startup timeout on the latter of machine creation time, control plane initialized time, and cluster infrastructure ready time. Signed-off-by: Andy Goldstein --- controllers/cluster_controller.go | 7 +- controllers/machine_controller_test.go | 11 +- .../machinehealthcheck_controller_test.go | 432 +++++++++++++++--- controllers/machinehealthcheck_targets.go | 46 +- .../machinehealthcheck_targets_test.go | 41 +- 5 files changed, 450 insertions(+), 87 deletions(-) diff --git a/controllers/cluster_controller.go b/controllers/cluster_controller.go index be0b880bb465..805c86609dc3 100644 --- a/controllers/cluster_controller.go +++ b/controllers/cluster_controller.go @@ -462,19 +462,24 @@ func splitMachineList(list *clusterv1.MachineList) (*clusterv1.MachineList, *clu } func (r *ClusterReconciler) reconcileControlPlaneInitialized(ctx context.Context, cluster *clusterv1.Cluster) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + // Skip checking if the control plane is initialized when using a Control Plane Provider (this is reconciled in // reconcileControlPlane instead). if cluster.Spec.ControlPlaneRef != nil { + log.V(4).Info("Skipping reconcileControlPlaneInitialized because cluster has a controlPlaneRef") return ctrl.Result{}, nil } if conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) { + log.V(4).Info("Skipping reconcileControlPlaneInitialized because control plane already initialized") return ctrl.Result{}, nil } + log.V(4).Info("Checking for control plane initialization") + machines, err := getActiveMachinesInCluster(ctx, r.Client, cluster.Namespace, cluster.Name) if err != nil { - log := ctrl.LoggerFrom(ctx) log.Error(err, "unable to determine ControlPlaneInitialized") return ctrl.Result{}, err } diff --git a/controllers/machine_controller_test.go b/controllers/machine_controller_test.go index 3bc8b4aca355..4c060aa3ad88 100644 --- a/controllers/machine_controller_test.go +++ b/controllers/machine_controller_test.go @@ -109,14 +109,8 @@ func TestWatches(t *testing.T) { g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed()) }(ns, testCluster, defaultBootstrap) - // Patch cluster control plane initialized (this is required to start node watch) - patchHelper, err := patch.NewHelper(testCluster, testEnv) - g.Expect(err).ShouldNot(HaveOccurred()) - conditions.MarkTrue(testCluster, clusterv1.ControlPlaneInitializedCondition) - g.Expect(patchHelper.Patch(ctx, testCluster, patch.WithStatusObservedGeneration{})).To(Succeed()) - // Patch infra machine ready - patchHelper, err = patch.NewHelper(infraMachine, testEnv) + patchHelper, err := patch.NewHelper(infraMachine, testEnv) g.Expect(err).ShouldNot(HaveOccurred()) g.Expect(unstructured.SetNestedField(infraMachine.Object, true, "status", "ready")).To(Succeed()) g.Expect(patchHelper.Patch(ctx, infraMachine, patch.WithStatusObservedGeneration{})).To(Succeed()) @@ -132,6 +126,9 @@ func TestWatches(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ GenerateName: "machine-created-", Namespace: ns.Name, + Labels: map[string]string{ + clusterv1.MachineControlPlaneLabelName: "", + }, }, Spec: clusterv1.MachineSpec{ ClusterName: testCluster.Name, diff --git a/controllers/machinehealthcheck_controller_test.go b/controllers/machinehealthcheck_controller_test.go index ffa3c0b995e3..66ef56ede43c 100644 --- a/controllers/machinehealthcheck_controller_test.go +++ b/controllers/machinehealthcheck_controller_test.go @@ -38,6 +38,7 @@ import ( "k8s.io/utils/pointer" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4" "sigs.k8s.io/cluster-api/controllers/remote" + capierrors "sigs.k8s.io/cluster-api/errors" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" @@ -174,6 +175,116 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { )) }) + t.Run("it ignores Machines not matching the label selector", func(t *testing.T) { + g := NewWithT(t) + ctx := context.TODO() + cluster := createNamespaceAndCluster(g) + + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + + g.Expect(testEnv.Create(ctx, mhc)).To(Succeed()) + defer func(do ...client.Object) { + g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed()) + }(cluster, mhc) + + // Healthy nodes and machines matching the MHC's label selector. + _, machines, cleanup := createMachinesWithNodes(g, cluster, + count(2), + firstMachineAsControlPlane(), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup() + targetMachines := make([]string, len(machines)) + for i, m := range machines { + targetMachines[i] = m.Name + } + sort.Strings(targetMachines) + + // Healthy nodes and machines NOT matching the MHC's label selector. + _, _, cleanup2 := createMachinesWithNodes(g, cluster, + count(2), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), + ) + defer cleanup2() + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := testEnv.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }, 5*time.Second, 100*time.Millisecond).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: 2, + CurrentHealthy: 2, + RemediationsAllowed: 2, + ObservedGeneration: 1, + Targets: targetMachines, + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedCondition, + Status: corev1.ConditionTrue, + }, + }, + })) + }) + + t.Run("it doesn't mark anything unhealthy when cluster infrastructure is not ready", func(t *testing.T) { + g := NewWithT(t) + cluster := createNamespaceAndCluster(g) + + patchHelper, err := patch.NewHelper(cluster, testEnv.Client) + g.Expect(err).To(BeNil()) + + conditions.MarkFalse(cluster, clusterv1.InfrastructureReadyCondition, "SomeReason", clusterv1.ConditionSeverityError, "") + g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed()) + + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + + g.Expect(testEnv.Create(ctx, mhc)).To(Succeed()) + defer func(do ...client.Object) { + g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed()) + }(cluster, mhc) + + // Healthy nodes and machines. + _, machines, cleanup := createMachinesWithNodes(g, cluster, + count(2), + firstMachineAsControlPlane(), + createNodeRefForMachine(true), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup() + targetMachines := make([]string, len(machines)) + for i, m := range machines { + targetMachines[i] = m.Name + } + sort.Strings(targetMachines) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := testEnv.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: 2, + CurrentHealthy: 2, + RemediationsAllowed: 2, + ObservedGeneration: 1, + Targets: targetMachines, + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedCondition, + Status: corev1.ConditionTrue, + }, + }, + })) + }) + t.Run("it doesn't mark anything unhealthy when all Machines are healthy", func(t *testing.T) { g := NewWithT(t) cluster := createNamespaceAndCluster(g) @@ -188,8 +299,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. _, machines, cleanup := createMachinesWithNodes(g, cluster, count(2), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup() @@ -235,8 +347,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. _, machines, cleanup1 := createMachinesWithNodes(g, cluster, count(2), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup1() @@ -244,8 +357,126 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { _, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster, count(1), createNodeRefForMachine(true), - markNodeAsHealthy(false), + nodeStatus(corev1.ConditionUnknown), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup2() + machines = append(machines, unhealthyMachines...) + targetMachines := make([]string, len(machines)) + for i, m := range machines { + targetMachines[i] = m.Name + } + sort.Strings(targetMachines) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := testEnv.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: 3, + CurrentHealthy: 2, + RemediationsAllowed: 2, + ObservedGeneration: 1, + Targets: targetMachines, + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedCondition, + Status: corev1.ConditionTrue, + }, + }, + })) + }) + + t.Run("it marks unhealthy machines for remediation when there a Machine has a failure reason", func(t *testing.T) { + g := NewWithT(t) + ctx := context.TODO() + cluster := createNamespaceAndCluster(g) + + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + + g.Expect(testEnv.Create(ctx, mhc)).To(Succeed()) + defer func(do ...client.Object) { + g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed()) + }(cluster, mhc) + + // Healthy nodes and machines. + _, machines, cleanup1 := createMachinesWithNodes(g, cluster, + count(2), + firstMachineAsControlPlane(), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup1() + // Machine with failure reason. + _, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster, + count(1), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), + machineFailureReason("some failure"), + ) + defer cleanup2() + machines = append(machines, unhealthyMachines...) + targetMachines := make([]string, len(machines)) + for i, m := range machines { + targetMachines[i] = m.Name + } + sort.Strings(targetMachines) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := testEnv.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: 3, + CurrentHealthy: 2, + RemediationsAllowed: 2, + ObservedGeneration: 1, + Targets: targetMachines, + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedCondition, + Status: corev1.ConditionTrue, + }, + }, + })) + }) + + t.Run("it marks unhealthy machines for remediation when there a Machine has a failure message", func(t *testing.T) { + g := NewWithT(t) + ctx := context.TODO() + cluster := createNamespaceAndCluster(g) + + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + + g.Expect(testEnv.Create(ctx, mhc)).To(Succeed()) + defer func(do ...client.Object) { + g.Expect(testEnv.Cleanup(ctx, do...)).To(Succeed()) + }(cluster, mhc) + + // Healthy nodes and machines. + _, machines, cleanup1 := createMachinesWithNodes(g, cluster, + count(2), + firstMachineAsControlPlane(), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup1() + // Machine with failure message. + _, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster, + count(1), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), + machineLabels(mhc.Spec.Selector.MatchLabels), + machineFailureMessage("some failure"), ) defer cleanup2() machines = append(machines, unhealthyMachines...) @@ -293,8 +524,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. _, machines, cleanup1 := createMachinesWithNodes(g, cluster, count(1), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup1() @@ -302,7 +534,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { _, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster, count(2), createNodeRefForMachine(true), - markNodeAsHealthy(false), + nodeStatus(corev1.ConditionUnknown), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup2() @@ -366,7 +598,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { } for i := range machines.Items { - if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { + if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { remediated++ } } @@ -378,6 +610,14 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { g := NewWithT(t) cluster := createNamespaceAndCluster(g) + // After the cluster exists, we have to set the infrastructure ready condition; otherwise, MachineHealthChecks + // will never fail when nodeStartupTimeout is exceeded. + patchHelper, err := patch.NewHelper(cluster, testEnv.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + + conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition) + g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed()) + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) mhc.Spec.NodeStartupTimeout = &metav1.Duration{Duration: 5 * time.Hour} @@ -389,8 +629,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. _, machines, cleanup1 := createMachinesWithNodes(g, cluster, count(2), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup1() @@ -398,7 +639,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { _, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster, count(1), createNodeRefForMachine(false), - markNodeAsHealthy(false), + nodeStatus(corev1.ConditionUnknown), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup2() @@ -459,7 +700,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { } for i := range machines.Items { - if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { + if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { remediated++ } } @@ -484,8 +725,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. _, machines, cleanup1 := createMachinesWithNodes(g, cluster, count(2), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup1() @@ -493,7 +735,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { _, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster, count(1), createNodeRefForMachine(false), - markNodeAsHealthy(false), + nodeStatus(corev1.ConditionUnknown), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup2() @@ -582,8 +824,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. nodes, machines, cleanup := createMachinesWithNodes(g, cluster, count(3), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup() @@ -674,8 +917,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. nodes, machines, cleanup := createMachinesWithNodes(g, cluster, count(1), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup() @@ -756,7 +1000,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { return }).Should(Equal(1)) - // Calculate how many Machines have been remediated. + // Calculate how many Machines have been marked for remediation g.Eventually(func() (remediated int) { machines := &clusterv1.MachineList{} err := testEnv.List(ctx, machines, client.MatchingLabels{ @@ -767,7 +1011,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { } for i := range machines.Items { - if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { + if conditions.IsFalse(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { remediated++ } } @@ -779,6 +1023,15 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { g := NewWithT(t) cluster := createNamespaceAndCluster(g) + // Create 1 control plane machine so MHC can proceed + _, _, cleanup := createMachinesWithNodes(g, cluster, + count(1), + firstMachineAsControlPlane(), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), + ) + defer cleanup() + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) // Create infrastructure template resource. infraResource := map[string]interface{}{ @@ -930,8 +1183,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. nodes, machines, cleanup := createMachinesWithNodes(g, cluster, count(1), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup() @@ -1080,8 +1334,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. nodes, machines, cleanup := createMachinesWithNodes(g, cluster, count(1), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup() @@ -1227,8 +1482,9 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { // Healthy nodes and machines. nodes, machines, cleanup := createMachinesWithNodes(g, cluster, count(1), + firstMachineAsControlPlane(), createNodeRefForMachine(true), - markNodeAsHealthy(true), + nodeStatus(corev1.ConditionTrue), machineLabels(mhc.Spec.Selector.MatchLabels), ) defer cleanup() @@ -1840,18 +2096,38 @@ func ownerReferenceForCluster(ctx context.Context, g *WithT, c *clusterv1.Cluste func createNamespaceAndCluster(g *WithT) *clusterv1.Cluster { ns, err := testEnv.CreateNamespace(ctx, "test-mhc") g.Expect(err).ToNot(HaveOccurred()) + cluster := &clusterv1.Cluster{ ObjectMeta: metav1.ObjectMeta{ GenerateName: "test-cluster-", Namespace: ns.Name, }, } + g.Expect(testEnv.Create(ctx, cluster)).To(Succeed()) + + // Make sure the cluster is in the cache before proceeding g.Eventually(func() error { var cl clusterv1.Cluster return testEnv.Get(ctx, util.ObjectKey(cluster), &cl) }, timeout, 100*time.Millisecond).Should(Succeed()) + // This is required for MHC to perform checks + patchHelper, err := patch.NewHelper(cluster, testEnv.Client) + g.Expect(err).To(BeNil()) + conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition) + g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed()) + + // Wait for cluster in cache to be updated post-patch + g.Eventually(func() bool { + err := testEnv.Get(ctx, util.ObjectKey(cluster), cluster) + if err != nil { + return false + } + + return conditions.IsTrue(cluster, clusterv1.InfrastructureReadyCondition) + }, timeout, 100*time.Millisecond).Should(BeTrue()) + g.Expect(testEnv.CreateKubeconfigSecret(ctx, cluster)).To(Succeed()) return cluster @@ -1884,28 +2160,6 @@ func newRunningMachine(c *clusterv1.Cluster, labels map[string]string) *clusterv } } -// newNode creaetes a Node object with node condition Ready == True -func newNode() *corev1.Node { - return &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - GenerateName: "test-mhc-node-", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, - }, - }, - } -} - -func setNodeUnhealthy(node *corev1.Node) { - node.Status.Conditions[0] = corev1.NodeCondition{ - Type: corev1.NodeReady, - Status: corev1.ConditionUnknown, - LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)), - } -} - func newInfraMachine(machine *clusterv1.Machine) (*unstructured.Unstructured, string) { providerID := fmt.Sprintf("test:////%v", uuid.NewUUID()) return &unstructured.Unstructured{ @@ -1924,10 +2178,13 @@ func newInfraMachine(machine *clusterv1.Machine) (*unstructured.Unstructured, st } type machinesWithNodes struct { - count int - markNodeAsHealthy bool - createNodeRefForMachine bool - labels map[string]string + count int + nodeStatus corev1.ConditionStatus + createNodeRefForMachine bool + firstMachineAsControlPlane bool + labels map[string]string + failureReason string + failureMessage string } type machineWithNodesOption func(m *machinesWithNodes) @@ -1938,9 +2195,15 @@ func count(n int) machineWithNodesOption { } } -func markNodeAsHealthy(b bool) machineWithNodesOption { +func firstMachineAsControlPlane() machineWithNodesOption { + return func(m *machinesWithNodes) { + m.firstMachineAsControlPlane = true + } +} + +func nodeStatus(s corev1.ConditionStatus) machineWithNodesOption { return func(m *machinesWithNodes) { - m.markNodeAsHealthy = b + m.nodeStatus = s } } @@ -1956,6 +2219,18 @@ func machineLabels(l map[string]string) machineWithNodesOption { } } +func machineFailureReason(s string) machineWithNodesOption { + return func(m *machinesWithNodes) { + m.failureReason = s + } +} + +func machineFailureMessage(s string) machineWithNodesOption { + return func(m *machinesWithNodes) { + m.failureMessage = s + } +} + func createMachinesWithNodes( g *WithT, c *clusterv1.Cluster, @@ -1975,6 +2250,12 @@ func createMachinesWithNodes( for i := 0; i < o.count; i++ { machine := newRunningMachine(c, o.labels) + if i == 0 && o.firstMachineAsControlPlane { + if machine.Labels == nil { + machine.Labels = make(map[string]string) + } + machine.Labels[clusterv1.MachineControlPlaneLabelName] = "" + } infraMachine, providerID := newInfraMachine(machine) g.Expect(testEnv.Create(ctx, infraMachine)).To(Succeed()) infraMachines = append(infraMachines, infraMachine) @@ -2008,35 +2289,60 @@ func createMachinesWithNodes( return machine.Status.LastUpdated }, timeout, 100*time.Millisecond).ShouldNot(BeNil()) + machinePatchHelper, err := patch.NewHelper(machine, testEnv.Client) + g.Expect(err).To(BeNil()) + if o.createNodeRefForMachine { - node := newNode() - if !o.markNodeAsHealthy { - setNodeUnhealthy(node) + // Create node + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-mhc-node-", + }, + Spec: corev1.NodeSpec{ + ProviderID: providerID, + }, } - machineStatus := machine.Status - node.Spec.ProviderID = providerID - nodeStatus := node.Status + g.Expect(testEnv.Create(ctx, node)).To(Succeed()) fmt.Printf("node created: %s\n", node.GetName()) - nodePatch := client.MergeFrom(node.DeepCopy()) - node.Status = nodeStatus - g.Expect(testEnv.Status().Patch(ctx, node, nodePatch)).To(Succeed()) + // Patch node status + nodePatchHelper, err := patch.NewHelper(node, testEnv.Client) + g.Expect(err).To(BeNil()) + + node.Status.Conditions = []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: o.nodeStatus, + LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)), + }, + } + + g.Expect(nodePatchHelper.Patch(ctx, node)).To(Succeed()) + nodes = append(nodes, node) - machinePatch := client.MergeFrom(machine.DeepCopy()) - machine.Status = machineStatus machine.Status.NodeRef = &corev1.ObjectReference{ Name: node.Name, } + } - // Adding one second to ensure there is a difference from the - // original time so that the patch works. That is, ensure the - // precision isn't lost during conversions. - lastUp := metav1.NewTime(machine.Status.LastUpdated.Add(time.Second)) - machine.Status.LastUpdated = &lastUp - g.Expect(testEnv.Status().Patch(ctx, machine, machinePatch)).To(Succeed()) + if o.failureReason != "" { + failureReason := capierrors.MachineStatusError(o.failureReason) + machine.Status.FailureReason = &failureReason } + if o.failureMessage != "" { + machine.Status.FailureMessage = pointer.StringPtr(o.failureMessage) + } + + // Adding one second to ensure there is a difference from the + // original time so that the patch works. That is, ensure the + // precision isn't lost during conversions. + lastUp := metav1.NewTime(machine.Status.LastUpdated.Add(time.Second)) + machine.Status.LastUpdated = &lastUp + + // Patch the machine to record the status changes + g.Expect(machinePatchHelper.Patch(ctx, machine)).To(Succeed()) machines = append(machines, machine) } diff --git a/controllers/machinehealthcheck_targets.go b/controllers/machinehealthcheck_targets.go index 3de56dbed53b..5c8f68d3b905 100644 --- a/controllers/machinehealthcheck_targets.go +++ b/controllers/machinehealthcheck_targets.go @@ -58,6 +58,7 @@ const ( // healthCheckTarget contains the information required to perform a health check // on the node to determine if any remediation is required. type healthCheckTarget struct { + Cluster *clusterv1.Cluster Machine *clusterv1.Machine Node *corev1.Node MHC *clusterv1.MachineHealthCheck @@ -114,19 +115,46 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return true, time.Duration(0) } + // Don't penalize any Machine/Node if the control plane has not been initialized. + if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) { + logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized") + // Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated. + return false, 0 + } + + // Don't penalize any Machine/Node if the cluster infrastructure is not ready. + if !conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) { + logger.V(3).Info("Not evaluating target health because the cluster infrastructure is not ready") + // Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated. + return false, 0 + } + // the node has not been set yet if t.Node == nil { - // status not updated yet - if t.Machine.Status.LastUpdated == nil { - return false, timeoutForMachineToHaveNode + controlPlaneInitializedTime := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition).Time + clusterInfraReadyTime := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition).Time + machineCreationTime := t.Machine.CreationTimestamp.Time + + // Use the latest of the 3 times + comparisonTime := machineCreationTime + logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReadyTime, "controlPlaneInitializedTime", controlPlaneInitializedTime) + if controlPlaneInitializedTime.After(comparisonTime) { + comparisonTime = controlPlaneInitializedTime + } + if clusterInfraReadyTime.After(comparisonTime) { + comparisonTime = clusterInfraReadyTime } - if t.Machine.Status.LastUpdated.Add(timeoutForMachineToHaveNode).Before(now) { + logger.V(3).Info("Using comparison time", "time", comparisonTime) + + if comparisonTime.Add(timeoutForMachineToHaveNode).Before(now) { conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSuccededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutForMachineToHaveNode.String()) logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutForMachineToHaveNode.String()) return true, time.Duration(0) } - durationUnhealthy := now.Sub(t.Machine.Status.LastUpdated.Time) + + durationUnhealthy := now.Sub(comparisonTime) nextCheck := timeoutForMachineToHaveNode - durationUnhealthy + time.Second + return false, nextCheck } @@ -168,6 +196,11 @@ func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, cl return nil, nil } + var cluster clusterv1.Cluster + if err := clusterClient.Get(ctx, client.ObjectKey{Namespace: mhc.Namespace, Name: mhc.Spec.ClusterName}, &cluster); err != nil { + return nil, errors.Wrapf(err, "error getting Cluster %s/%s for MachineHealthCheck %s", mhc.Namespace, mhc.Spec.ClusterName, mhc.Name) + } + targets := []healthCheckTarget{} for k := range machines { patchHelper, err := patch.NewHelper(&machines[k], r.Client) @@ -175,6 +208,7 @@ func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, cl return nil, errors.Wrap(err, "unable to initialize patch helper") } target := healthCheckTarget{ + Cluster: &cluster, MHC: mhc, Machine: &machines[k], patchHelper: patchHelper, @@ -194,7 +228,7 @@ func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, cl return targets, nil } -//getMachinesFromMHC fetches Machines matched by the MachineHealthCheck's +// getMachinesFromMHC fetches Machines matched by the MachineHealthCheck's // label selector func (r *MachineHealthCheckReconciler) getMachinesFromMHC(ctx context.Context, mhc *clusterv1.MachineHealthCheck) ([]clusterv1.Machine, error) { selector, err := metav1.LabelSelectorAsSelector(metav1.CloneSelectorAndAddLabel( diff --git a/controllers/machinehealthcheck_targets_test.go b/controllers/machinehealthcheck_targets_test.go index caea281d0da6..b36ba6d585fd 100644 --- a/controllers/machinehealthcheck_targets_test.go +++ b/controllers/machinehealthcheck_targets_test.go @@ -21,12 +21,12 @@ import ( "time" . "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -36,6 +36,14 @@ import ( func TestGetTargetsFromMHC(t *testing.T) { namespace := "test-mhc" clusterName := "test-cluster" + + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: clusterName, + }, + } + mhcSelector := map[string]string{"cluster": clusterName, "machine-group": "foo"} // Create a namespace for the tests @@ -62,7 +70,7 @@ func TestGetTargetsFromMHC(t *testing.T) { }, } - baseObjects := []client.Object{testNS, testMHC} + baseObjects := []client.Object{testNS, cluster, testMHC} // Initialise some test machines and nodes for use in the test cases @@ -160,8 +168,20 @@ func TestGetTargetsFromMHC(t *testing.T) { func TestHealthCheckTargets(t *testing.T) { namespace := "test-mhc" clusterName := "test-cluster" + + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: clusterName, + }, + } + conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition) + conditions.MarkTrue(cluster, clusterv1.ControlPlaneInitializedCondition) + mhcSelector := map[string]string{"cluster": clusterName, "machine-group": "foo"} + timeoutForMachineToHaveNode := 10 * time.Minute + // Create a test MHC testMHC := &clusterv1.MachineHealthCheck{ ObjectMeta: metav1.ObjectMeta{ @@ -196,6 +216,7 @@ func TestHealthCheckTargets(t *testing.T) { testMachineLastUpdated400s.Status.LastUpdated = &nowMinus400s nodeNotYetStartedTarget := healthCheckTarget{ + Cluster: cluster, MHC: testMHC, Machine: testMachineLastUpdated400s, Node: nil, @@ -203,6 +224,7 @@ func TestHealthCheckTargets(t *testing.T) { // Target for when the Node has been seen, but has now gone nodeGoneAway := healthCheckTarget{ + Cluster: cluster, MHC: testMHC, Machine: testMachine, Node: &corev1.Node{}, @@ -212,6 +234,7 @@ func TestHealthCheckTargets(t *testing.T) { // Target for when the node has been in an unknown state for shorter than the timeout testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second) nodeUnknown200 := healthCheckTarget{ + Cluster: cluster, MHC: testMHC, Machine: testMachine, Node: testNodeUnknown200, @@ -221,6 +244,7 @@ func TestHealthCheckTargets(t *testing.T) { // Second Target for when the node has been in an unknown state for shorter than the timeout testNodeUnknown100 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 100*time.Second) nodeUnknown100 := healthCheckTarget{ + Cluster: cluster, MHC: testMHC, Machine: testMachine, Node: testNodeUnknown100, @@ -230,6 +254,7 @@ func TestHealthCheckTargets(t *testing.T) { // Target for when the node has been in an unknown state for longer than the timeout testNodeUnknown400 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 400*time.Second) nodeUnknown400 := healthCheckTarget{ + Cluster: cluster, MHC: testMHC, Machine: testMachine, Node: testNodeUnknown400, @@ -240,6 +265,7 @@ func TestHealthCheckTargets(t *testing.T) { testNodeHealthy := newTestNode("node1") testNodeHealthy.UID = "12345" nodeHealthy := healthCheckTarget{ + Cluster: cluster, MHC: testMHC, Machine: testMachine, Node: testNodeHealthy, @@ -258,7 +284,7 @@ func TestHealthCheckTargets(t *testing.T) { targets: []healthCheckTarget{nodeNotYetStartedTarget}, expectedHealthy: []healthCheckTarget{}, expectedNeedsRemediation: []healthCheckTarget{}, - expectedNextCheckTimes: []time.Duration{200 * time.Second}, + expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode}, }, { desc: "when the node has gone away", @@ -299,18 +325,13 @@ func TestHealthCheckTargets(t *testing.T) { for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { - gs := NewGomegaWithT(t) - - gs.Expect(clusterv1.AddToScheme(scheme.Scheme)).To(Succeed()) - k8sClient := fake.NewClientBuilder().WithScheme(scheme.Scheme).Build() + gs := NewWithT(t) - // Create a test reconciler + // Create a test reconciler. reconciler := &MachineHealthCheckReconciler{ - Client: k8sClient, recorder: record.NewFakeRecorder(5), } - timeoutForMachineToHaveNode := 10 * time.Minute healthy, unhealthy, nextCheckTimes := reconciler.healthCheckTargets(tc.targets, ctrl.LoggerFrom(ctx), timeoutForMachineToHaveNode) // Round durations down to nearest second account for minute differences