From ee307a7a1c6a9693e41f52acff0a91ce2e265b67 Mon Sep 17 00:00:00 2001
From: Stefan Bueringer <buringerst@vmware.com>
Date: Tue, 12 Dec 2023 19:22:15 +0100
Subject: [PATCH] Mark Machine healthy condition as unknown if we can't list
 wl nodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Stefan Büringer buringerst@vmware.com
---
 api/v1beta1/condition_consts.go               |  5 ++
 .../machine/machine_controller_noderef.go     |  2 +-
 .../machine/machine_controller_test.go        | 47 ++++++++++++++-----
 3 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/api/v1beta1/condition_consts.go b/api/v1beta1/condition_consts.go
index 5e2bc212ecdf..61e72dd58664 100644
--- a/api/v1beta1/condition_consts.go
+++ b/api/v1beta1/condition_consts.go
@@ -207,6 +207,11 @@ const (
 
 	// NodeConditionsFailedReason (Severity=Warning) documents a node is not in a healthy state due to the failed state of at least 1 Kubelet condition.
 	NodeConditionsFailedReason = "NodeConditionsFailed"
+
+	// NodeInspectionFailedReason documents a failure in inspecting the node.
+	// This reason is used when the Machine controller is unable to list Nodes to find
+	// the corresponding Node for a Machine by ProviderID.
+	NodeInspectionFailedReason = "NodeInspectionFailed"
 )
 
 // Conditions and condition Reasons for the MachineHealthCheck object.
diff --git a/internal/controllers/machine/machine_controller_noderef.go b/internal/controllers/machine/machine_controller_noderef.go
index 7f32c72c48c5..e555ee525a71 100644
--- a/internal/controllers/machine/machine_controller_noderef.go
+++ b/internal/controllers/machine/machine_controller_noderef.go
@@ -79,8 +79,8 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
 			// No need to requeue here. Nodes emit an event that triggers reconciliation.
 			return ctrl.Result{}, nil
 		}
-		log.Error(err, "Failed to retrieve Node by ProviderID")
 		r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error())
+		conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeInspectionFailedReason, "Failed to get the Node for this Machine by ProviderID")
 		return ctrl.Result{}, err
 	}
 
diff --git a/internal/controllers/machine/machine_controller_test.go b/internal/controllers/machine/machine_controller_test.go
index 62ecfcba8245..5ecb52731a22 100644
--- a/internal/controllers/machine/machine_controller_test.go
+++ b/internal/controllers/machine/machine_controller_test.go
@@ -847,7 +847,9 @@ func TestMachineConditions(t *testing.T) {
 		infraReady         bool
 		bootstrapReady     bool
 		beforeFunc         func(bootstrap, infra *unstructured.Unstructured, m *clusterv1.Machine)
+		additionalObjects  []client.Object
 		conditionsToAssert []*clusterv1.Condition
+		wantErr            bool
 	}{
 		{
 			name:           "all conditions true",
@@ -952,6 +954,26 @@ func TestMachineConditions(t *testing.T) {
 				conditions.FalseCondition(clusterv1.ReadyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, ""),
 			},
 		},
+		{
+			name:           "machine ready and MachineNodeHealthy unknown",
+			infraReady:     true,
+			bootstrapReady: true,
+			additionalObjects: []client.Object{&corev1.Node{
+				// This is a duplicate Node with the same ProviderID.
+				// This should lead to an error when trying to get the Node for a Machine.
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "test-duplicate",
+				},
+				Spec: corev1.NodeSpec{ProviderID: "test://id-1"},
+			}},
+			wantErr: true,
+			conditionsToAssert: []*clusterv1.Condition{
+				conditions.TrueCondition(clusterv1.InfrastructureReadyCondition),
+				conditions.TrueCondition(clusterv1.BootstrapReadyCondition),
+				conditions.TrueCondition(clusterv1.ReadyCondition),
+				conditions.UnknownCondition(clusterv1.MachineNodeHealthyCondition, clusterv1.NodeInspectionFailedReason, "Failed to get the Node for this Machine by ProviderID"),
+			},
+		},
 	}
 
 	for _, tt := range testcases {
@@ -966,15 +988,13 @@ func TestMachineConditions(t *testing.T) {
 				tt.beforeFunc(bootstrap, infra, m)
 			}
 
-			clientFake := fake.NewClientBuilder().WithObjects(
-				&testCluster,
-				m,
-				builder.GenericInfrastructureMachineCRD.DeepCopy(),
-				infra,
-				builder.GenericBootstrapConfigCRD.DeepCopy(),
-				bootstrap,
-				node,
-			).
+			objs := []client.Object{&testCluster, m, node,
+				builder.GenericInfrastructureMachineCRD.DeepCopy(), infra,
+				builder.GenericBootstrapConfigCRD.DeepCopy(), bootstrap,
+			}
+			objs = append(objs, tt.additionalObjects...)
+
+			clientFake := fake.NewClientBuilder().WithObjects(objs...).
 				WithIndex(&corev1.Node{}, index.NodeProviderIDField, index.NodeByProviderID).
 				WithStatusSubresource(&clusterv1.Machine{}).
 				Build()
@@ -982,12 +1002,17 @@ func TestMachineConditions(t *testing.T) {
 			r := &Reconciler{
 				Client:                    clientFake,
 				UnstructuredCachingClient: clientFake,
+				recorder:                  record.NewFakeRecorder(10),
 				Tracker:                   remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), clientFake, scheme.Scheme, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}),
 				ssaCache:                  ssa.NewCache(),
 			}
 
 			_, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: util.ObjectKey(&machine)})
-			g.Expect(err).ToNot(HaveOccurred())
+			if tt.wantErr {
+				g.Expect(err).To(HaveOccurred())
+			} else {
+				g.Expect(err).ToNot(HaveOccurred())
+			}
 
 			m = &clusterv1.Machine{}
 			g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(&machine), m)).ToNot(HaveOccurred())
@@ -2233,7 +2258,7 @@ func assertCondition(t *testing.T, from conditions.Getter, condition *clusterv1.
 	g.Expect(conditions.Has(from, condition.Type)).To(BeTrue())
 
 	if condition.Status == corev1.ConditionTrue {
-		conditions.IsTrue(from, condition.Type)
+		g.Expect(conditions.IsTrue(from, condition.Type)).To(BeTrue())
 	} else {
 		conditionToBeAsserted := conditions.Get(from, condition.Type)
 		g.Expect(conditionToBeAsserted.Status).To(Equal(condition.Status))