Skip to content

Commit

Permalink
Add Taint during rolling update (kubernetes-sigs#10223)
Browse files Browse the repository at this point in the history
Co-authored-by: Hiromu Asahina <[email protected]>
  • Loading branch information
2 people authored and Dhairya-Arora01 committed May 25, 2024
1 parent 159a9f2 commit a9d239d
Show file tree
Hide file tree
Showing 8 changed files with 609 additions and 5 deletions.
8 changes: 8 additions & 0 deletions api/v1beta1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,14 @@ const (
MachineSetPreflightCheckControlPlaneIsStable MachineSetPreflightCheck = "ControlPlaneIsStable"
)

// NodeOutdatedRevisionTaint can be added to Nodes at rolling updates in general triggered by updating MachineDeployment
// This taint is used to prevent unnecessary pod churn, i.e., as the first node is drained, pods previously running on
// that node are scheduled onto nodes who have yet to be replaced, but will be torn down soon.
var NodeOutdatedRevisionTaint = corev1.Taint{
Key: "node.cluster.x-k8s.io/outdated-revision",
Effect: corev1.TaintEffectPreferNoSchedule,
}

// NodeUninitializedTaint can be added to Nodes at creation by the bootstrap provider, e.g. the
// KubeadmBootstrap provider will add the taint.
// This taint is used to prevent workloads to be scheduled on Nodes before the node is initialized by Cluster API.
Expand Down
16 changes: 16 additions & 0 deletions internal/controllers/machine/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,14 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
if err != nil {
return err
}
msToMachines, err := util.MachineSetToObjectsMapper(mgr.GetClient(), &clusterv1.MachineList{}, mgr.GetScheme())
if err != nil {
return err
}
mdToMachines, err := util.MachineDeploymentToObjectsMapper(mgr.GetClient(), &clusterv1.MachineList{}, mgr.GetScheme())
if err != nil {
return err
}

if r.nodeDeletionRetryTimeout.Nanoseconds() == 0 {
r.nodeDeletionRetryTimeout = 10 * time.Second
Expand All @@ -122,6 +130,14 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue),
),
)).
Watches(
&clusterv1.MachineSet{},
handler.EnqueueRequestsFromMapFunc(msToMachines),
).
Watches(
&clusterv1.MachineDeployment{},
handler.EnqueueRequestsFromMapFunc(mdToMachines),
).
Build(r)
if err != nil {
return errors.Wrap(err, "failed setting up with a controller manager")
Expand Down
67 changes: 65 additions & 2 deletions internal/controllers/machine/machine_controller_noderef.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/api/v1beta1/index"
"sigs.k8s.io/cluster-api/internal/controllers/machinedeployment/mdutil"
"sigs.k8s.io/cluster-api/internal/util/taints"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/annotations"
Expand Down Expand Up @@ -133,7 +135,7 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
_, nodeHadInterruptibleLabel := node.Labels[clusterv1.InterruptibleLabel]

// Reconcile node taints
if err := r.patchNode(ctx, remoteClient, node, nodeLabels, nodeAnnotations); err != nil {
if err := r.patchNode(ctx, remoteClient, node, nodeLabels, nodeAnnotations, machine); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "failed to reconcile Node %s", klog.KObj(node))
}
if !nodeHadInterruptibleLabel && interruptible {
Expand Down Expand Up @@ -255,7 +257,7 @@ func (r *Reconciler) getNode(ctx context.Context, c client.Reader, providerID st

// PatchNode is required to workaround an issue on Node.Status.Address which is incorrectly annotated as patchStrategy=merge
// and this causes SSA patch to fail in case there are two addresses with the same key https://github.com/kubernetes-sigs/cluster-api/issues/8417
func (r *Reconciler) patchNode(ctx context.Context, remoteClient client.Client, node *corev1.Node, newLabels, newAnnotations map[string]string) error {
func (r *Reconciler) patchNode(ctx context.Context, remoteClient client.Client, node *corev1.Node, newLabels, newAnnotations map[string]string, m *clusterv1.Machine) error {
newNode := node.DeepCopy()

// Adds the annotations CAPI sets on the node.
Expand Down Expand Up @@ -292,9 +294,70 @@ func (r *Reconciler) patchNode(ctx context.Context, remoteClient client.Client,
// Drop the NodeUninitializedTaint taint on the node given that we are reconciling labels.
hasTaintChanges := taints.RemoveNodeTaint(newNode, clusterv1.NodeUninitializedTaint)

// Set Taint to a node in an old MachineSet and unset Taint from a node in a new MachineSet
isOutdated, err := shouldNodeHaveOutdatedTaint(ctx, r.Client, m)
if err != nil {
return errors.Wrapf(err, "failed to check if Node %s is outdated", klog.KRef("", node.Name))
}
if isOutdated {
hasTaintChanges = taints.EnsureNodeTaint(newNode, clusterv1.NodeOutdatedRevisionTaint) || hasTaintChanges
} else {
hasTaintChanges = taints.RemoveNodeTaint(newNode, clusterv1.NodeOutdatedRevisionTaint) || hasTaintChanges
}

if !hasAnnotationChanges && !hasLabelChanges && !hasTaintChanges {
return nil
}

return remoteClient.Patch(ctx, newNode, client.StrategicMergeFrom(node))
}

func shouldNodeHaveOutdatedTaint(ctx context.Context, c client.Client, m *clusterv1.Machine) (bool, error) {
if _, hasLabel := m.Labels[clusterv1.MachineDeploymentNameLabel]; !hasLabel {
return false, nil
}

// Resolve the MachineSet name via owner references because the label value
// could also be a hash.
objKey, err := getOwnerMachineSetObjectKey(m.ObjectMeta)
if err != nil {
return false, err
}
ms := &clusterv1.MachineSet{}
if err := c.Get(ctx, *objKey, ms); err != nil {
return false, err
}
md := &clusterv1.MachineDeployment{}
objKey = &client.ObjectKey{
Namespace: m.ObjectMeta.Namespace,
Name: m.Labels[clusterv1.MachineDeploymentNameLabel],
}
if err := c.Get(ctx, *objKey, md); err != nil {
return false, err
}
msRev, err := mdutil.Revision(ms)
if err != nil {
return false, err
}
mdRev, err := mdutil.Revision(md)
if err != nil {
return false, err
}
if msRev < mdRev {
return true, nil
}
return false, nil
}

func getOwnerMachineSetObjectKey(obj metav1.ObjectMeta) (*client.ObjectKey, error) {
for _, ref := range obj.GetOwnerReferences() {
gv, err := schema.ParseGroupVersion(ref.APIVersion)
if err != nil {
return nil, err
}
if ref.Kind == "MachineSet" && gv.Group == clusterv1.GroupVersion.Group {
return &client.ObjectKey{Namespace: obj.Namespace, Name: ref.Name}, nil
}
}
return nil, errors.Errorf("failed to find MachineSet owner reference for Machine %s", klog.KRef(obj.GetNamespace(), obj.GetName()))
}
Loading

0 comments on commit a9d239d

Please sign in to comment.