Add option to stop node draining after a timeout
Add an option `nodeDrainTimeout` to KCP and to the MachineSpec of MachineDeployment.
`nodeDrainTimeout` defines the total amount of time allowed for draining a node; once that time is over, the node is forcefully removed even if draining has not finished.
Note: leaving this option unset means there is no time limit.
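For illustration, a minimal sketch of how the new field might be set on a MachineDeployment. This manifest is not part of the commit: every name, version, and the infrastructure kind are placeholders, the field path follows the MachineDeployment CRD change further down, and the value is treated as seconds by the controller check added in machine_controller.go.

```yaml
apiVersion: cluster.x-k8s.io/v1alpha3
kind: MachineDeployment
metadata:
  name: md-0                     # placeholder name
  namespace: default
spec:
  clusterName: my-cluster        # placeholder cluster
  replicas: 3
  selector:
    matchLabels:
      cluster.x-k8s.io/cluster-name: my-cluster
  template:
    metadata:
      labels:
        cluster.x-k8s.io/cluster-name: my-cluster
    spec:
      clusterName: my-cluster
      version: v1.19.1
      # Give up on draining a node 300 seconds after draining started.
      # Leaving the field unset (0) keeps the previous behaviour: no time limit.
      nodeDrainTimeout: 300
      bootstrap:
        configRef:
          apiVersion: bootstrap.cluster.x-k8s.io/v1alpha3
          kind: KubeadmConfigTemplate
          name: md-0
      infrastructureRef:
        apiVersion: infrastructure.cluster.x-k8s.io/v1alpha3
        kind: DockerMachineTemplate
        name: md-0
```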
namnx228 committed Sep 18, 2020
1 parent b545ca1 commit dc94fbd
Showing 11 changed files with 54 additions and 4 deletions.
1 change: 1 addition & 0 deletions api/v1alpha2/zz_generated.conversion.go

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions api/v1alpha3/machine_types.go
@@ -89,6 +89,11 @@ type MachineSpec struct {
// Must match a key in the FailureDomains map stored on the cluster object.
// +optional
FailureDomain *string `json:"failureDomain,omitempty"`

// NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node; 0 means there is no time limit.
// Note that this NodeDrainTimeout is different from `kubectl drain --timeout`.
// +optional
NodeDrainTimeout int64 `json:"nodeDrainTimeout,omitempty"`
}

// ANCHOR_END: MachineSpec
4 changes: 4 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml
@@ -926,6 +926,10 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
4 changes: 4 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machines.yaml
@@ -514,6 +514,10 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine provided
by the provider. This field must match the provider ID as seen on
4 changes: 4 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinesets.yaml
@@ -825,6 +825,10 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
4 changes: 4 additions & 0 deletions config/crd/bases/exp.cluster.x-k8s.io_machinepools.yaml
@@ -346,6 +346,10 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
24 changes: 21 additions & 3 deletions controllers/machine_controller.go
@@ -299,14 +299,18 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
conditions.MarkTrue(m, clusterv1.PreDrainDeleteHookSucceededCondition)

// Drain node before deletion and issue a patch in order to make this operation visible to the users.
if _, exists := m.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; !exists {
if _, exists := m.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; !exists && !isNodeDrainTimeoutOver(m) {
patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
return ctrl.Result{}, err
}

logger.Info("Draining node", "node", m.Status.NodeRef.Name)
conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
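// Set the DrainingSucceeded condition only once, so that its LastTransitionTime keeps recording when draining first started; isNodeDrainTimeoutOver relies on that timestamp.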
if conditions.Get(m, clusterv1.DrainingSucceededCondition) == nil {
conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
}

if err := patchMachine(ctx, patchHelper, m); err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
}
@@ -363,6 +367,21 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
return ctrl.Result{}, nil
}

func isNodeDrainTimeoutOver(machine *clusterv1.Machine) bool {
// Draining has not started yet: the DrainingSucceeded condition is only set once draining begins.
if conditions.Get(machine, clusterv1.DrainingSucceededCondition) == nil {
return false
}
// NodeDrainTimeout is not set by the user (or is not positive), which means there is no time limit.
if machine.Spec.NodeDrainTimeout <= 0 {
return false
}
now := time.Now()
firstTimeDrain := conditions.GetLastTransitionTime(machine, clusterv1.DrainingSucceededCondition)
diff := now.Sub(firstTimeDrain.Time)
return diff.Seconds() >= float64(machine.Spec.NodeDrainTimeout)
}

// isDeleteNodeAllowed returns nil only if the Machine's NodeRef is not nil
// and if the Machine is not the last control plane node in the cluster.
func (r *MachineReconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) error {
@@ -419,7 +438,6 @@ func (r *MachineReconciler) drainNode(ctx context.Context, cluster *clusterv1.Cl
}
return errors.Errorf("unable to get node %q: %v", nodeName, err)
}

drainer := &kubedrain.Helper{
Client: kubeClient,
Force: true,
@@ -66,6 +66,11 @@ type KubeadmControlPlaneSpec struct {
// KubeadmControlPlane
// +optional
UpgradeAfter *metav1.Time `json:"upgradeAfter,omitempty"`

// NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a control plane node; 0 means there is no time limit.
// Note that this NodeDrainTimeout is different from `kubectl drain --timeout`.
// +optional
NodeDrainTimeout int64 `json:"nodeDrainTimeout,omitempty"`
}

// KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
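Similarly, a hedged sketch of setting the timeout on a KubeadmControlPlane; the value is copied into each control plane Machine by `generateMachine` in helpers.go further down. All names, the version, and the infrastructure template kind are placeholders, not taken from this commit.

```yaml
apiVersion: controlplane.cluster.x-k8s.io/v1alpha3
kind: KubeadmControlPlane
metadata:
  name: my-cluster-control-plane   # placeholder name
  namespace: default
spec:
  replicas: 3
  version: v1.19.1
  # Stop waiting for a control plane node to drain 120 seconds after draining started.
  nodeDrainTimeout: 120
  infrastructureTemplate:
    apiVersion: infrastructure.cluster.x-k8s.io/v1alpha3
    kind: DockerMachineTemplate
    name: my-cluster-control-plane
  kubeadmConfigSpec: {}
```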
@@ -110,6 +110,7 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error {
{spec, "replicas"},
{spec, "version"},
{spec, "upgradeAfter"},
{spec, "nodeDrainTimeout"},
}

allErrs := in.validateCommon()
@@ -1032,6 +1032,9 @@ spec:
format: int32
type: integer
type: object
nodeDrainTimeout:
format: int64
type: integer
replicas:
description: Number of desired machines. Defaults to 1. When stacked
etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members).
3 changes: 2 additions & 1 deletion controlplane/kubeadm/controllers/helpers.go
@@ -237,7 +237,8 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp
Bootstrap: clusterv1.Bootstrap{
ConfigRef: bootstrapRef,
},
FailureDomain: failureDomain,
FailureDomain: failureDomain,
NodeDrainTimeout: kcp.Spec.NodeDrainTimeout,
},
}

