Skip to content

Commit

Permalink
Add Node related condition to Machine conditions
Browse files Browse the repository at this point in the history
  • Loading branch information
Sedef committed Nov 2, 2020
1 parent 3aac2fc commit 463c545
Show file tree
Hide file tree
Showing 8 changed files with 479 additions and 87 deletions.
24 changes: 21 additions & 3 deletions api/v1alpha3/condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,6 @@ const (
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
MachineHasFailureReason = "MachineHasFailure"

// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
NodeNotFoundReason = "NodeNotFound"

// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
NodeStartupTimeoutReason = "NodeStartupTimeout"

Expand All @@ -127,3 +124,24 @@ const (
// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"
)

// Conditions and condition Reasons for the Machine's Node object
const (
// MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
// If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
MachineNodeHealthyCondition ConditionType = "NodeHealthy"

// WaitingForNodeRefReason (Severity=Info) documents a machine.spec.providerId is not assigned yet.
WaitingForNodeRefReason = "WaitingForNodeRef"

// NodeProvisioningReason (Severity=Info) documents machine in the process of provisioning a node.
// NB. provisioning --> NodeRef == ""
NodeProvisioningReason = "NodeProvisioning"

// NodeNotFoundReason (Severity=Error) documents a machine's node has previously been observed but is now gone.
// NB. provisioned --> NodeRef != ""
NodeNotFoundReason = "NodeNotFound"

// NodeConditionsFailedReason (Severity=Warning) documents a node is not in a healthy state due to the failed state of at least 1 Kubelet condition.
NodeConditionsFailedReason = "NodeConditionsFailed"
)
24 changes: 21 additions & 3 deletions api/v1alpha4/condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,6 @@ const (
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
MachineHasFailureReason = "MachineHasFailure"

// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
NodeNotFoundReason = "NodeNotFound"

// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
NodeStartupTimeoutReason = "NodeStartupTimeout"

Expand All @@ -134,3 +131,24 @@ const (
// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"
)

// Conditions and condition Reasons for the Machine's Node object
const (
// MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
// If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
MachineNodeHealthyCondition ConditionType = "NodeHealthy"

// WaitingForNodeRefReason (Severity=Info) documents a machine.spec.providerId is not assigned yet.
WaitingForNodeRefReason = "WaitingForNodeRef"

// NodeProvisioningReason (Severity=Info) documents machine in the process of provisioning a node.
// NB. provisioning --> NodeRef == ""
NodeProvisioningReason = "NodeProvisioning"

// NodeNotFoundReason (Severity=Error) documents a machine's node has previously been observed but is now gone.
// NB. provisioned --> NodeRef != ""
NodeNotFoundReason = "NodeNotFound"

// NodeConditionsFailedReason (Severity=Warning) documents a node is not in a healthy state due to the failed state of at least 1 Kubelet condition.
NodeConditionsFailedReason = "NodeConditionsFailed"
)
13 changes: 12 additions & 1 deletion controllers/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ func (r *MachineReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cl
phases := []func(context.Context, *clusterv1.Cluster, *clusterv1.Machine) (ctrl.Result, error){
r.reconcileBootstrap,
r.reconcileInfrastructure,
r.reconcileNodeRef,
r.reconcileNode,
}

res := ctrl.Result{}
Expand Down Expand Up @@ -346,6 +346,16 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
// Return early and don't remove the finalizer if we got an error or
// the external reconciliation deletion isn't ready.

patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
return ctrl.Result{}, err
}
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
if err := patchMachine(ctx, patchHelper, m); err != nil {
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityInfo, "")
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
}

if ok, err := r.reconcileDeleteInfrastructure(ctx, m); !ok || err != nil {
return ctrl.Result{}, err
}
Expand All @@ -368,6 +378,7 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
})
if waitErr != nil {
log.Error(deleteNodeErr, "Timed out deleting node, moving on", "node", m.Status.NodeRef.Name)
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityWarning, "")
r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedDeleteNode", "error deleting Machine's node: %v", deleteNodeErr)
}
}
Expand Down
115 changes: 82 additions & 33 deletions controllers/machine_controller_noderef.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ package controllers
import (
"context"
"fmt"
"time"

"github.com/pkg/errors"
apicorev1 "k8s.io/api/core/v1"
corev1 "k8s.io/api/core/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
"sigs.k8s.io/cluster-api/controllers/noderefutil"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)
Expand All @@ -34,24 +34,14 @@ var (
ErrNodeNotFound = errors.New("cannot find node with matching ProviderID")
)

func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (ctrl.Result, error) {
log := ctrl.LoggerFrom(ctx, "cluster", cluster.Name)

// Check that the Machine hasn't been deleted or in the process.
if !machine.DeletionTimestamp.IsZero() {
return ctrl.Result{}, nil
}

// Check that the Machine doesn't already have a NodeRef.
if machine.Status.NodeRef != nil {
return ctrl.Result{}, nil
}

func (r *MachineReconciler) reconcileNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (ctrl.Result, error) {
log := ctrl.LoggerFrom(ctx, "machine", machine.Name, "namespace", machine.Namespace)
log = log.WithValues("cluster", cluster.Name)

// Check that the Machine has a valid ProviderID.
if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" {
log.Info("Machine doesn't have a valid ProviderID yet")
log.Info("Cannot reconcile Machine's Node, no valid ProviderID yet")
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.WaitingForNodeRefReason, clusterv1.ConditionSeverityInfo, "")
return ctrl.Result{}, nil
}

Expand All @@ -65,29 +55,93 @@ func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clust
return ctrl.Result{}, err
}

// Get the Node reference.
nodeRef, err := r.getNodeReference(ctx, remoteClient, providerID)
// Even if Status.NodeRef exists, continue to do the following checks to make sure Node is healthy
node, err := r.getNode(ctx, remoteClient, providerID)
if err != nil {
if err == ErrNodeNotFound {
log.Info(fmt.Sprintf("Cannot assign NodeRef to Machine: %s, requeuing", ErrNodeNotFound.Error()))
return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
// While a NodeRef is set in the status, failing to get that node means the node is deleted.
// If Status.NodeRef is not set before, node still can be in the provisioning state.
if machine.Status.NodeRef != nil {
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityError, "")
return ctrl.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace)
}
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeProvisioningReason, clusterv1.ConditionSeverityWarning, "")
return ctrl.Result{Requeue: true}, nil
}
log.Error(err, "Failed to assign NodeRef")
r.recorder.Event(machine, apicorev1.EventTypeWarning, "FailedSetNodeRef", err.Error())
log.Error(err, "Failed to retrieve Node by ProviderID")
r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error())
return ctrl.Result{}, err
}

// Set the Machine NodeRef.
machine.Status.NodeRef = nodeRef
log.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
r.recorder.Event(machine, apicorev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
if machine.Status.NodeRef == nil {
machine.Status.NodeRef = &corev1.ObjectReference{
Kind: node.Kind,
APIVersion: node.APIVersion,
Name: node.Name,
UID: node.UID,
}
log.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
}

// Do the remaining node health checks, then set the node health to true if all checks pass.
status, message := summarizeNodeConditions(node)
if status == corev1.ConditionFalse {
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, clusterv1.ConditionSeverityWarning, message)
return ctrl.Result{}, nil
}

conditions.MarkTrue(machine, clusterv1.MachineNodeHealthyCondition)
return ctrl.Result{}, nil
}

func (r *MachineReconciler) getNodeReference(ctx context.Context, c client.Reader, providerID *noderefutil.ProviderID) (*apicorev1.ObjectReference, error) {
// summarizeNodeConditions summarizes a Node's conditions and returns the summary of condition statuses and concatenate failed condition messages:
// if there is at least 1 semantically-negative condition, summarized status = False;
// if there is at least 1 semantically-positive condition when there is 0 semantically negative condition, summarized status = True;
// if all conditions are unknown, summarized status = Unknown.
// (semantically true conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == false or Ready == true.)
func summarizeNodeConditions(node *corev1.Node) (corev1.ConditionStatus, string) {
totalNumOfConditionsChecked := 4
semanticallyFalseStatus := 0
unknownStatus := 0

message := ""
for _, condition := range node.Status.Conditions {
switch condition.Type {
case corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure:
if condition.Status != corev1.ConditionFalse {
message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
if condition.Status == corev1.ConditionUnknown {
unknownStatus++
continue
}
semanticallyFalseStatus++
}
case corev1.NodeReady:
if condition.Status != corev1.ConditionTrue {
message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
if condition.Status == corev1.ConditionUnknown {
unknownStatus++
continue
}
semanticallyFalseStatus++
}
}
}
if semanticallyFalseStatus > 0 {
return corev1.ConditionFalse, message
}
if semanticallyFalseStatus+unknownStatus < totalNumOfConditionsChecked {
return corev1.ConditionTrue, message
}
return corev1.ConditionUnknown, message
}

func (r *MachineReconciler) getNode(ctx context.Context, c client.Reader, providerID *noderefutil.ProviderID) (*corev1.Node, error) {
log := ctrl.LoggerFrom(ctx, "providerID", providerID)

nodeList := apicorev1.NodeList{}
nodeList := corev1.NodeList{}
for {
if err := c.List(ctx, &nodeList, client.Continue(nodeList.Continue)); err != nil {
return nil, err
Expand All @@ -101,12 +155,7 @@ func (r *MachineReconciler) getNodeReference(ctx context.Context, c client.Reade
}

if providerID.Equals(nodeProviderID) {
return &apicorev1.ObjectReference{
Kind: node.Kind,
APIVersion: node.APIVersion,
Name: node.Name,
UID: node.UID,
}, nil
return &node, nil
}
}

Expand Down
77 changes: 70 additions & 7 deletions controllers/machine_controller_noderef_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,10 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
"sigs.k8s.io/cluster-api/controllers/noderefutil"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestGetNodeReference(t *testing.T) {
Expand Down Expand Up @@ -106,21 +105,85 @@ func TestGetNodeReference(t *testing.T) {
providerID, err := noderefutil.NewProviderID(test.providerID)
gt.Expect(err).NotTo(HaveOccurred(), "Expected no error parsing provider id %q, got %v", test.providerID, err)

reference, err := r.getNodeReference(ctx, client, providerID)
node, err := r.getNode(ctx, client, providerID)
if test.err == nil {
g.Expect(err).To(BeNil())
} else {
gt.Expect(err).NotTo(BeNil())
gt.Expect(err).To(Equal(test.err), "Expected error %v, got %v", test.err, err)
}

if test.expected == nil && reference == nil {
if test.expected == nil && node == nil {
return
}

gt.Expect(reference.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", reference.Name, test.expected.Name)
gt.Expect(reference.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", reference.Namespace, test.expected.Namespace)
gt.Expect(node.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", node.Name, test.expected.Name)
gt.Expect(node.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", node.Namespace, test.expected.Namespace)
})

}
}

func TestSummarizeNodeConditions(t *testing.T) {
testCases := []struct {
name string
conditions []corev1.NodeCondition
status corev1.ConditionStatus
}{
{
name: "node is healthy",
conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionFalse},
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionFalse},
{Type: corev1.NodePIDPressure, Status: corev1.ConditionFalse},
},
status: corev1.ConditionTrue,
},
{
name: "all conditions are unknown",
conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
},
status: corev1.ConditionUnknown,
},
{
name: "multiple semantically failed condition",
conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionTrue},
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionTrue},
{Type: corev1.NodePIDPressure, Status: corev1.ConditionTrue},
},
status: corev1.ConditionFalse,
},
{
name: "one positive condition when the rest is unknown",
conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
},
status: corev1.ConditionTrue,
},
}
for _, test := range testCases {
t.Run(test.name, func(t *testing.T) {
g := NewWithT(t)
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-1",
},
Status: corev1.NodeStatus{
Conditions: test.conditions,
},
}
status, _ := summarizeNodeConditions(node)
g.Expect(status).To(Equal(test.status))
})
}
}
Loading

0 comments on commit 463c545

Please sign in to comment.