🌱 Add Node related condition to Machine conditions #3890

Merged
24 changes: 21 additions & 3 deletions api/v1alpha3/condition_consts.go
@@ -109,9 +109,6 @@ const (
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
MachineHasFailureReason = "MachineHasFailure"

// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
NodeNotFoundReason = "NodeNotFound"

// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
NodeStartupTimeoutReason = "NodeStartupTimeout"

@@ -127,3 +124,24 @@ const (
// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"
)

// Conditions and condition Reasons for the Machine's Node object
const (
// MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
// If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
MachineNodeHealthyCondition ConditionType = "NodeHealthy"

// WaitingForNodeRefReason (Severity=Info) documents that the machine.spec.providerID is not assigned yet.
WaitingForNodeRefReason = "WaitingForNodeRef"

// NodeProvisioningReason (Severity=Info) documents that the machine is in the process of provisioning a node.
// NB. provisioning --> NodeRef == ""
NodeProvisioningReason = "NodeProvisioning"

// NodeNotFoundReason (Severity=Error) documents that a machine's node was previously observed but is now gone.
// NB. provisioned --> NodeRef != ""
NodeNotFoundReason = "NodeNotFound"

// NodeConditionsFailedReason (Severity=Warning) documents that a node is not in a healthy state because at least one of the conditions reported by its kubelet has failed.
NodeConditionsFailedReason = "NodeConditionsFailed"
)
24 changes: 21 additions & 3 deletions api/v1alpha4/condition_consts.go
@@ -116,9 +116,6 @@ const (
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
MachineHasFailureReason = "MachineHasFailure"

// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
NodeNotFoundReason = "NodeNotFound"

// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
NodeStartupTimeoutReason = "NodeStartupTimeout"

@@ -134,3 +131,24 @@ const (
// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"
)

// Conditions and condition Reasons for the Machine's Node object
const (
// MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
// If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
MachineNodeHealthyCondition ConditionType = "NodeHealthy"

// WaitingForNodeRefReason (Severity=Info) documents that the machine.spec.providerID is not assigned yet.
WaitingForNodeRefReason = "WaitingForNodeRef"

// NodeProvisioningReason (Severity=Info) documents that the machine is in the process of provisioning a node.
// NB. provisioning --> NodeRef == ""
NodeProvisioningReason = "NodeProvisioning"

// NodeNotFoundReason (Severity=Error) documents that a machine's node was previously observed but is now gone.
// NB. provisioned --> NodeRef != ""
NodeNotFoundReason = "NodeNotFound"

// NodeConditionsFailedReason (Severity=Warning) documents that a node is not in a healthy state because at least one of the conditions reported by its kubelet has failed.
NodeConditionsFailedReason = "NodeConditionsFailed"
)
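For illustration only (not part of this diff): once these constants land, a consumer can read the summarized Node health back through the existing sigs.k8s.io/cluster-api/util/conditions helpers. A minimal sketch, assuming a reconciled v1alpha4 Machine m; the nodeGone helper and example package are hypothetical:

package example // hypothetical consumer, not part of this PR

import (
	corev1 "k8s.io/api/core/v1"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// nodeGone reports whether the Machine's node was observed at some point but
// has since disappeared, as signalled by the NodeHealthy condition carrying
// the NodeNotFound reason.
func nodeGone(m *clusterv1.Machine) bool {
	if conditions.IsTrue(m, clusterv1.MachineNodeHealthyCondition) {
		return false
	}
	c := conditions.Get(m, clusterv1.MachineNodeHealthyCondition)
	return c != nil && c.Status == corev1.ConditionFalse && c.Reason == clusterv1.NodeNotFoundReason
}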
13 changes: 12 additions & 1 deletion controllers/machine_controller.go
@@ -263,7 +263,7 @@ func (r *MachineReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cl
phases := []func(context.Context, *clusterv1.Cluster, *clusterv1.Machine) (ctrl.Result, error){
r.reconcileBootstrap,
r.reconcileInfrastructure,
r.reconcileNodeRef,
r.reconcileNode,
}

res := ctrl.Result{}
@@ -346,6 +346,16 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
// Return early and don't remove the finalizer if we got an error or
// the external reconciliation deletion isn't ready.

patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
return ctrl.Result{}, err
}
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
if err := patchMachine(ctx, patchHelper, m); err != nil {
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityInfo, "")
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
}

if ok, err := r.reconcileDeleteInfrastructure(ctx, m); !ok || err != nil {
return ctrl.Result{}, err
}
@@ -368,6 +378,7 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
})
if waitErr != nil {
log.Error(deleteNodeErr, "Timed out deleting node, moving on", "node", m.Status.NodeRef.Name)
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityWarning, "")
r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedDeleteNode", "error deleting Machine's node: %v", deleteNodeErr)
}
}
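For reference, an illustrative sketch (not part of this diff) of the condition entry that the MarkFalse call above produces on the Machine's status while deletion is in progress; the field names come from the v1alpha4 Condition type:

clusterv1.Condition{
	Type:     clusterv1.MachineNodeHealthyCondition, // "NodeHealthy"
	Status:   corev1.ConditionFalse,
	Severity: clusterv1.ConditionSeverityInfo,
	Reason:   clusterv1.DeletingReason,
	// LastTransitionTime is filled in by the conditions util; Message is empty here.
}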
115 changes: 82 additions & 33 deletions controllers/machine_controller_noderef.go
@@ -19,13 +19,13 @@ package controllers
import (
"context"
"fmt"
"time"

"github.com/pkg/errors"
apicorev1 "k8s.io/api/core/v1"
corev1 "k8s.io/api/core/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
"sigs.k8s.io/cluster-api/controllers/noderefutil"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -34,24 +34,14 @@ var (
ErrNodeNotFound = errors.New("cannot find node with matching ProviderID")
)

func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (ctrl.Result, error) {
log := ctrl.LoggerFrom(ctx, "cluster", cluster.Name)

// Check that the Machine hasn't been deleted or in the process.
if !machine.DeletionTimestamp.IsZero() {
return ctrl.Result{}, nil
}

// Check that the Machine doesn't already have a NodeRef.
if machine.Status.NodeRef != nil {
return ctrl.Result{}, nil
}

func (r *MachineReconciler) reconcileNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (ctrl.Result, error) {
log := ctrl.LoggerFrom(ctx, "machine", machine.Name, "namespace", machine.Namespace)
log = log.WithValues("cluster", cluster.Name)

// Check that the Machine has a valid ProviderID.
if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" {
log.Info("Machine doesn't have a valid ProviderID yet")
log.Info("Cannot reconcile Machine's Node, no valid ProviderID yet")
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.WaitingForNodeRefReason, clusterv1.ConditionSeverityInfo, "")
return ctrl.Result{}, nil
}

@@ -65,29 +55,93 @@ func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clust
return ctrl.Result{}, err
}

// Get the Node reference.
nodeRef, err := r.getNodeReference(ctx, remoteClient, providerID)
// Even if Status.NodeRef exists, continue to do the following checks to make sure Node is healthy
node, err := r.getNode(ctx, remoteClient, providerID)
if err != nil {
if err == ErrNodeNotFound {
log.Info(fmt.Sprintf("Cannot assign NodeRef to Machine: %s, requeuing", ErrNodeNotFound.Error()))
return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
// When a NodeRef is already set in the status, failing to get that node means the node has been deleted.
// If Status.NodeRef has not been set yet, the node may still be provisioning.
if machine.Status.NodeRef != nil {
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityError, "")
return ctrl.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace)
}
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeProvisioningReason, clusterv1.ConditionSeverityWarning, "")
return ctrl.Result{Requeue: true}, nil
}
log.Error(err, "Failed to assign NodeRef")
r.recorder.Event(machine, apicorev1.EventTypeWarning, "FailedSetNodeRef", err.Error())
log.Error(err, "Failed to retrieve Node by ProviderID")
r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error())
return ctrl.Result{}, err
}

// Set the Machine NodeRef.
machine.Status.NodeRef = nodeRef
log.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
r.recorder.Event(machine, apicorev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
if machine.Status.NodeRef == nil {
machine.Status.NodeRef = &corev1.ObjectReference{
Kind: node.Kind,
APIVersion: node.APIVersion,
Name: node.Name,
UID: node.UID,
}
log.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
}

// Do the remaining node health checks, then set the node health to true if all checks pass.
status, message := summarizeNodeConditions(node)
if status == corev1.ConditionFalse {
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, clusterv1.ConditionSeverityWarning, message)
return ctrl.Result{}, nil
}

conditions.MarkTrue(machine, clusterv1.MachineNodeHealthyCondition)
return ctrl.Result{}, nil
}

func (r *MachineReconciler) getNodeReference(ctx context.Context, c client.Reader, providerID *noderefutil.ProviderID) (*apicorev1.ObjectReference, error) {
// summarizeNodeConditions summarizes a Node's conditions and returns a summarized status along with the concatenated messages of any failed conditions:
// if there is at least 1 semantically-negative condition, the summarized status = False;
// if there is at least 1 semantically-positive condition and no semantically-negative conditions, the summarized status = True;
// if all conditions are unknown, the summarized status = Unknown.
// (Semantically-positive conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == false or Ready == true.)
func summarizeNodeConditions(node *corev1.Node) (corev1.ConditionStatus, string) {
totalNumOfConditionsChecked := 4
semanticallyFalseStatus := 0
unknownStatus := 0

message := ""
for _, condition := range node.Status.Conditions {
switch condition.Type {
case corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure:
if condition.Status != corev1.ConditionFalse {
message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
if condition.Status == corev1.ConditionUnknown {
unknownStatus++
continue
}
semanticallyFalseStatus++
}
case corev1.NodeReady:
if condition.Status != corev1.ConditionTrue {
message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
if condition.Status == corev1.ConditionUnknown {
unknownStatus++
continue
}
semanticallyFalseStatus++
}
}
}
if semanticallyFalseStatus > 0 {
return corev1.ConditionFalse, message
}
if semanticallyFalseStatus+unknownStatus < totalNumOfConditionsChecked {
return corev1.ConditionTrue, message
}
return corev1.ConditionUnknown, message
}

func (r *MachineReconciler) getNode(ctx context.Context, c client.Reader, providerID *noderefutil.ProviderID) (*corev1.Node, error) {
log := ctrl.LoggerFrom(ctx, "providerID", providerID)

nodeList := apicorev1.NodeList{}
nodeList := corev1.NodeList{}
for {
if err := c.List(ctx, &nodeList, client.Continue(nodeList.Continue)); err != nil {
return nil, err
@@ -101,12 +155,7 @@ func (r *MachineReconciler) getNodeReference(ctx context.Context, c client.Reade
}

if providerID.Equals(nodeProviderID) {
return &apicorev1.ObjectReference{
Kind: node.Kind,
APIVersion: node.APIVersion,
Name: node.Name,
UID: node.UID,
}, nil
return &node, nil
}
}

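As a worked example of the summarization rule above (illustrative only, mirroring the tests that follow): a node that is Ready but reports memory pressure has one semantically-negative condition, so the summary is False and reconcileNode marks NodeConditionsFailed with Severity=Warning on the Machine.

node := &corev1.Node{
	Status: corev1.NodeStatus{
		Conditions: []corev1.NodeCondition{
			{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
			{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionTrue},
			{Type: corev1.NodeDiskPressure, Status: corev1.ConditionFalse},
			{Type: corev1.NodePIDPressure, Status: corev1.ConditionFalse},
		},
	},
}
status, message := summarizeNodeConditions(node)
// status == corev1.ConditionFalse
// message == "Node condition MemoryPressure is True. "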
77 changes: 70 additions & 7 deletions controllers/machine_controller_noderef_test.go
@@ -25,11 +25,10 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
"sigs.k8s.io/cluster-api/controllers/noderefutil"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestGetNodeReference(t *testing.T) {
@@ -106,21 +105,85 @@
providerID, err := noderefutil.NewProviderID(test.providerID)
gt.Expect(err).NotTo(HaveOccurred(), "Expected no error parsing provider id %q, got %v", test.providerID, err)

reference, err := r.getNodeReference(ctx, client, providerID)
node, err := r.getNode(ctx, client, providerID)
if test.err == nil {
g.Expect(err).To(BeNil())
} else {
gt.Expect(err).NotTo(BeNil())
gt.Expect(err).To(Equal(test.err), "Expected error %v, got %v", test.err, err)
}

if test.expected == nil && reference == nil {
if test.expected == nil && node == nil {
return
}

gt.Expect(reference.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", reference.Name, test.expected.Name)
gt.Expect(reference.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", reference.Namespace, test.expected.Namespace)
gt.Expect(node.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", node.Name, test.expected.Name)
gt.Expect(node.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", node.Namespace, test.expected.Namespace)
})

}
}

func TestSummarizeNodeConditions(t *testing.T) {
testCases := []struct {
name string
conditions []corev1.NodeCondition
status corev1.ConditionStatus
}{
{
name: "node is healthy",
conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionFalse},
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionFalse},
{Type: corev1.NodePIDPressure, Status: corev1.ConditionFalse},
},
status: corev1.ConditionTrue,
},
{
name: "all conditions are unknown",
conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
},
status: corev1.ConditionUnknown,
},
{
name: "multiple semantically failed condition",
conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionTrue},
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionTrue},
{Type: corev1.NodePIDPressure, Status: corev1.ConditionTrue},
},
status: corev1.ConditionFalse,
},
{
name: "one positive condition when the rest is unknown",
conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
},
status: corev1.ConditionTrue,
},
}
for _, test := range testCases {
t.Run(test.name, func(t *testing.T) {
g := NewWithT(t)
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-1",
},
Status: corev1.NodeStatus{
Conditions: test.conditions,
},
}
status, _ := summarizeNodeConditions(node)
g.Expect(status).To(Equal(test.status))
})
}
}