Add option to stop node draining after a timeout
Add an option `nodeDrainTimeout` to KCP and to the MachineSpec of MachineDeployment.
`nodeDrainTimeout` defines the total amount of time allowed for draining a node; once that time is over, the node is forcefully removed even if draining has not finished.
Note: leaving this option unset means there is no time limit.
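For illustration, a minimal sketch of how the new field might be set on a MachineDeployment. This manifest is not part of the commit: every name, version, and the infrastructure kind are placeholders, the field path follows the MachineDeployment CRD change further down, and the value is treated as seconds by the controller check added in machine_controller.go.

```yaml
apiVersion: cluster.x-k8s.io/v1alpha3
kind: MachineDeployment
metadata:
  name: md-0                     # placeholder name
  namespace: default
spec:
  clusterName: my-cluster        # placeholder cluster
  replicas: 3
  selector:
    matchLabels:
      cluster.x-k8s.io/cluster-name: my-cluster
  template:
    metadata:
      labels:
        cluster.x-k8s.io/cluster-name: my-cluster
    spec:
      clusterName: my-cluster
      version: v1.19.1
      # Give up on draining a node 300 seconds after draining started.
      # Leaving the field unset (0) keeps the previous behaviour: no time limit.
      nodeDrainTimeout: 300
      bootstrap:
        configRef:
          apiVersion: bootstrap.cluster.x-k8s.io/v1alpha3
          kind: KubeadmConfigTemplate
          name: md-0
      infrastructureRef:
        apiVersion: infrastructure.cluster.x-k8s.io/v1alpha3
        kind: DockerMachineTemplate
        name: md-0
```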
namnx228 committed Sep 18, 2020
1 parent b545ca1 commit dc94fbd
Showing 11 changed files with 54 additions and 4 deletions.
1 change: 1 addition & 0 deletions api/v1alpha2/zz_generated.conversion.go

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions api/v1alpha3/machine_types.go
@@ -89,6 +89,11 @@ type MachineSpec struct {
// Must match a key in the FailureDomains map stored on the cluster object.
// +optional
FailureDomain *string `json:"failureDomain,omitempty"`

// NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node; 0 means there is no time limit.
// Note that this NodeDrainTimeout is different from `kubectl drain --timeout`.
// +optional
NodeDrainTimeout int64 `json:"nodeDrainTimeout,omitempty"`
}

// ANCHOR_END: MachineSpec
4 changes: 4 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml
@@ -926,6 +926,10 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
4 changes: 4 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machines.yaml
@@ -514,6 +514,10 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine provided
by the provider. This field must match the provider ID as seen on
4 changes: 4 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinesets.yaml
@@ -825,6 +825,10 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
4 changes: 4 additions & 0 deletions config/crd/bases/exp.cluster.x-k8s.io_machinepools.yaml
@@ -346,6 +346,10 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a node
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
24 changes: 21 additions & 3 deletions controllers/machine_controller.go
@@ -299,14 +299,18 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
conditions.MarkTrue(m, clusterv1.PreDrainDeleteHookSucceededCondition)

// Drain node before deletion and issue a patch in order to make this operation visible to the users.
if _, exists := m.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; !exists {
if _, exists := m.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; !exists && !isNodeDrainTimeoutOver(m) {
patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
return ctrl.Result{}, err
}

logger.Info("Draining node", "node", m.Status.NodeRef.Name)
conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
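// Set the DrainingSucceeded condition only once, so that its LastTransitionTime keeps recording when draining first started; isNodeDrainTimeoutOver relies on that timestamp.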
if conditions.Get(m, clusterv1.DrainingSucceededCondition) == nil {
conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
}

if err := patchMachine(ctx, patchHelper, m); err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
}
@@ -363,6 +367,21 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
return ctrl.Result{}, nil
}

func isNodeDrainTimeoutOver(machine *clusterv1.Machine) bool {
// Draining has not started yet: the DrainingSucceeded condition is only set once draining begins.
if conditions.Get(machine, clusterv1.DrainingSucceededCondition) == nil {
return false
}
// NodeDrainTimeout is not set by the user (or is not positive), which means there is no time limit.
if machine.Spec.NodeDrainTimeout <= 0 {
return false
}
now := time.Now()
firstTimeDrain := conditions.GetLastTransitionTime(machine, clusterv1.DrainingSucceededCondition)
diff := now.Sub(firstTimeDrain.Time)
return diff.Seconds() >= float64(machine.Spec.NodeDrainTimeout)
}

// isDeleteNodeAllowed returns nil only if the Machine's NodeRef is not nil
// and if the Machine is not the last control plane node in the cluster.
func (r *MachineReconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) error {
@@ -419,7 +438,6 @@ func (r *MachineReconciler) drainNode(ctx context.Context, cluster *clusterv1.Cl
}
return errors.Errorf("unable to get node %q: %v", nodeName, err)
}

drainer := &kubedrain.Helper{
Client: kubeClient,
Force: true,
@@ -66,6 +66,11 @@ type KubeadmControlPlaneSpec struct {
// KubeadmControlPlane
// +optional
UpgradeAfter *metav1.Time `json:"upgradeAfter,omitempty"`

// NodeDrainTimeout is the total amount of time (in seconds) that the controller will spend on draining a control plane node; 0 means there is no time limit.
// Note that this NodeDrainTimeout is different from `kubectl drain --timeout`.
// +optional
NodeDrainTimeout int64 `json:"nodeDrainTimeout,omitempty"`
}

// KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
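Similarly, a hedged sketch of setting the timeout on a KubeadmControlPlane; the value is copied into each control plane Machine by `generateMachine` in helpers.go further down. All names, the version, and the infrastructure template kind are placeholders, not taken from this commit.

```yaml
apiVersion: controlplane.cluster.x-k8s.io/v1alpha3
kind: KubeadmControlPlane
metadata:
  name: my-cluster-control-plane   # placeholder name
  namespace: default
spec:
  replicas: 3
  version: v1.19.1
  # Stop waiting for a control plane node to drain 120 seconds after draining started.
  nodeDrainTimeout: 120
  infrastructureTemplate:
    apiVersion: infrastructure.cluster.x-k8s.io/v1alpha3
    kind: DockerMachineTemplate
    name: my-cluster-control-plane
  kubeadmConfigSpec: {}
```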
@@ -110,6 +110,7 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error {
{spec, "replicas"},
{spec, "version"},
{spec, "upgradeAfter"},
{spec, "nodeDrainTimeout"},
}

allErrs := in.validateCommon()
@@ -1032,6 +1032,9 @@ spec:
format: int32
type: integer
type: object
nodeDrainTimeout:
format: int64
type: integer
replicas:
description: Number of desired machines. Defaults to 1. When stacked
etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members).
3 changes: 2 additions & 1 deletion controlplane/kubeadm/controllers/helpers.go
@@ -237,7 +237,8 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp
Bootstrap: clusterv1.Bootstrap{
ConfigRef: bootstrapRef,
},
FailureDomain: failureDomain,
FailureDomain: failureDomain,
NodeDrainTimeout: kcp.Spec.NodeDrainTimeout,
},
}

