From d4fe42f8f6751c186fb7273ff87731a223d9ff6f Mon Sep 17 00:00:00 2001
From: Nam Xuan Nguyen
Date: Wed, 30 Sep 2020 13:17:52 +0300
Subject: [PATCH] Add the ability to specify a drain timeout for machines

Add an option `nodeDrainTimeout` to KCP and to the machineSpec of
MachineDeployment. `nodeDrainTimeout` defines the total amount of time
the controller spends draining a node; once that time is exceeded, the
node is removed forcefully.
Note: leaving the option unset means there is no time limit.
---
 api/v1alpha2/conversion.go | 1 +
 api/v1alpha2/zz_generated.conversion.go | 1 +
 api/v1alpha3/machine_types.go | 6 +
 api/v1alpha3/zz_generated.deepcopy.go | 5 +
 .../cluster.x-k8s.io_machinedeployments.yaml | 7 +
 .../crd/bases/cluster.x-k8s.io_machines.yaml | 6 +
 .../bases/cluster.x-k8s.io_machinesets.yaml | 7 +
 .../exp.cluster.x-k8s.io_machinepools.yaml | 7 +
 controllers/machine_controller.go | 40 +++++-
 controllers/machine_controller_test.go | 127 ++++++++++++++++++
 .../v1alpha3/kubeadm_control_plane_types.go | 6 +
 .../v1alpha3/kubeadm_control_plane_webhook.go | 1 +
 .../api/v1alpha3/zz_generated.deepcopy.go | 6 +
 ...cluster.x-k8s.io_kubeadmcontrolplanes.yaml | 7 +
 controlplane/kubeadm/controllers/helpers.go | 3 +-
 15 files changed, 227 insertions(+), 3 deletions(-)

diff --git a/api/v1alpha2/conversion.go b/api/v1alpha2/conversion.go
index 00974460d07e..757b56df72f2 100644
--- a/api/v1alpha2/conversion.go
+++ b/api/v1alpha2/conversion.go
@@ -129,6 +129,7 @@ func restoreMachineSpec(restored *v1alpha3.MachineSpec, dst *v1alpha3.MachineSpe
 	}
 	dst.Bootstrap.DataSecretName = restored.Bootstrap.DataSecretName
 	dst.FailureDomain = restored.FailureDomain
+	dst.NodeDrainTimeout = restored.NodeDrainTimeout
 }
 
 func (dst *Machine) ConvertFrom(srcRaw conversion.Hub) error {
diff --git a/api/v1alpha2/zz_generated.conversion.go b/api/v1alpha2/zz_generated.conversion.go
index c096b057d71d..435282bf0890 100644
--- a/api/v1alpha2/zz_generated.conversion.go
+++ b/api/v1alpha2/zz_generated.conversion.go
@@ -891,6 +891,7 @@ func autoConvert_v1alpha3_MachineSpec_To_v1alpha2_MachineSpec(in *v1alpha3.Machi
 	out.Version = (*string)(unsafe.Pointer(in.Version))
 	out.ProviderID = (*string)(unsafe.Pointer(in.ProviderID))
 	// WARNING: in.FailureDomain requires manual conversion: does not exist in peer-type
+	// WARNING: in.NodeDrainTimeout requires manual conversion: does not exist in peer-type
 	return nil
 }
 
diff --git a/api/v1alpha3/machine_types.go b/api/v1alpha3/machine_types.go
index 54e3c8aadd52..43ab63c312cc 100644
--- a/api/v1alpha3/machine_types.go
+++ b/api/v1alpha3/machine_types.go
@@ -89,6 +89,12 @@ type MachineSpec struct {
 	// Must match a key in the FailureDomains map stored on the cluster object.
 	// +optional
 	FailureDomain *string `json:"failureDomain,omitempty"`
+
+	// NodeDrainTimeout is the total amount of time that the controller will spend on draining a node.
+	// The default value is 0, meaning that the node can be drained without any time limitations.
+ // NOTE: NodeDrainTimeout is different from `kubectl drain --timeout` + // +optional + NodeDrainTimeout *metav1.Duration `json:"nodeDrainTimeout,omitempty"` } // ANCHOR_END: MachineSpec diff --git a/api/v1alpha3/zz_generated.deepcopy.go b/api/v1alpha3/zz_generated.deepcopy.go index c0c5a506ef42..9d577525b330 100644 --- a/api/v1alpha3/zz_generated.deepcopy.go +++ b/api/v1alpha3/zz_generated.deepcopy.go @@ -802,6 +802,11 @@ func (in *MachineSpec) DeepCopyInto(out *MachineSpec) { *out = new(string) **out = **in } + if in.NodeDrainTimeout != nil { + in, out := &in.NodeDrainTimeout, &out.NodeDrainTimeout + *out = new(metav1.Duration) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineSpec. diff --git a/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml b/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml index cb9993dd5f71..dd02945d862a 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml @@ -926,6 +926,13 @@ spec: description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' type: string type: object + nodeDrainTimeout: + description: 'NodeDrainTimeout is the total amount of time + that the controller will spend on draining a node. The default + value is 0, meaning that the node can be drained without + any time limitations. NOTE: NodeDrainTimeout is different + from `kubectl drain --timeout`' + type: string providerID: description: ProviderID is the identification ID of the machine provided by the provider. This field must match the provider diff --git a/config/crd/bases/cluster.x-k8s.io_machines.yaml b/config/crd/bases/cluster.x-k8s.io_machines.yaml index 90d10bd477e6..98fcb91f1ebf 100644 --- a/config/crd/bases/cluster.x-k8s.io_machines.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machines.yaml @@ -514,6 +514,12 @@ spec: description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' type: string type: object + nodeDrainTimeout: + description: 'NodeDrainTimeout is the total amount of time that the + controller will spend on draining a node. The default value is 0, + meaning that the node can be drained without any time limitations. + NOTE: NodeDrainTimeout is different from `kubectl drain --timeout`' + type: string providerID: description: ProviderID is the identification ID of the machine provided by the provider. This field must match the provider ID as seen on diff --git a/config/crd/bases/cluster.x-k8s.io_machinesets.yaml b/config/crd/bases/cluster.x-k8s.io_machinesets.yaml index 6276d4f38e7a..22663b5cece9 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinesets.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinesets.yaml @@ -825,6 +825,13 @@ spec: description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' type: string type: object + nodeDrainTimeout: + description: 'NodeDrainTimeout is the total amount of time + that the controller will spend on draining a node. The default + value is 0, meaning that the node can be drained without + any time limitations. NOTE: NodeDrainTimeout is different + from `kubectl drain --timeout`' + type: string providerID: description: ProviderID is the identification ID of the machine provided by the provider. 
This field must match the provider
diff --git a/config/crd/bases/exp.cluster.x-k8s.io_machinepools.yaml b/config/crd/bases/exp.cluster.x-k8s.io_machinepools.yaml
index c31ccd5f7389..afba16712268 100644
--- a/config/crd/bases/exp.cluster.x-k8s.io_machinepools.yaml
+++ b/config/crd/bases/exp.cluster.x-k8s.io_machinepools.yaml
@@ -346,6 +346,13 @@ spec:
                           description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
                           type: string
                       type: object
+                    nodeDrainTimeout:
+                      description: 'NodeDrainTimeout is the total amount of time
+                        that the controller will spend on draining a node. The default
+                        value is 0, meaning that the node can be drained without
+                        any time limitations. NOTE: NodeDrainTimeout is different
+                        from `kubectl drain --timeout`'
+                      type: string
                     providerID:
                       description: ProviderID is the identification ID of the machine
                         provided by the provider. This field must match the provider
diff --git a/controllers/machine_controller.go b/controllers/machine_controller.go
index addb94700036..b4a0fc2f951a 100644
--- a/controllers/machine_controller.go
+++ b/controllers/machine_controller.go
@@ -299,14 +299,20 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
 	conditions.MarkTrue(m, clusterv1.PreDrainDeleteHookSucceededCondition)
 
 	// Drain node before deletion and issue a patch in order to make this operation visible to the users.
-	if _, exists := m.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; !exists {
+	if r.isNodeDrainAllowed(m) {
 		patchHelper, err := patch.NewHelper(m, r.Client)
 		if err != nil {
 			return ctrl.Result{}, err
 		}
 
 		logger.Info("Draining node", "node", m.Status.NodeRef.Name)
-		conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
+		// The DrainingSucceededCondition never exists before the node is drained for the first time,
+		// so its transition time can be used to record when draining first started.
+		// This `if` condition prevents the transition time from being changed more than once.
+		if conditions.Get(m, clusterv1.DrainingSucceededCondition) == nil {
+			conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
+		}
+
 		if err := patchMachine(ctx, patchHelper, m); err != nil {
 			return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
 		}
@@ -363,6 +369,36 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
 	return ctrl.Result{}, nil
 }
 
+func (r *MachineReconciler) isNodeDrainAllowed(m *clusterv1.Machine) bool {
+	if _, exists := m.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; exists {
+		return false
+	}
+
+	if r.nodeDrainTimeoutExceeded(m) {
+		return false
+	}
+
+	return true
+
+}
+
+func (r *MachineReconciler) nodeDrainTimeoutExceeded(machine *clusterv1.Machine) bool {
+	// if NodeDrainTimeout is not set by the user, there is no timeout
+	if machine.Spec.NodeDrainTimeout == nil || machine.Spec.NodeDrainTimeout.Seconds() <= 0 {
+		return false
+	}
+
+	// if the DrainingSucceeded condition does not exist, draining has not started yet
+	if conditions.Get(machine, clusterv1.DrainingSucceededCondition) == nil {
+		return false
+	}
+
+	now := time.Now()
+	firstTimeDrain := conditions.GetLastTransitionTime(machine, clusterv1.DrainingSucceededCondition)
+	diff := now.Sub(firstTimeDrain.Time)
+	return diff.Seconds() >= machine.Spec.NodeDrainTimeout.Seconds()
+}
+
 // isDeleteNodeAllowed returns nil only if the Machine's NodeRef is not nil
 // and if the Machine is not the last control plane node in the cluster.
 func (r *MachineReconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) error {
diff --git a/controllers/machine_controller_test.go b/controllers/machine_controller_test.go
index 9ded9a2468e8..918ad5e043cd 100644
--- a/controllers/machine_controller_test.go
+++ b/controllers/machine_controller_test.go
@@ -681,6 +681,133 @@ func Test_clusterToActiveMachines(t *testing.T) {
 	}
 }
 
+func TestIsNodeDrainedAllowed(t *testing.T) {
+	testCluster := &clusterv1.Cluster{
+		TypeMeta:   metav1.TypeMeta{Kind: "Cluster", APIVersion: clusterv1.GroupVersion.String()},
+		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "test-cluster"},
+	}
+
+	tests := []struct {
+		name     string
+		machine  *clusterv1.Machine
+		expected bool
+	}{
+		{
+			name: "Exclude node draining annotation exists",
+			machine: &clusterv1.Machine{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:        "test-machine",
+					Namespace:   "default",
+					Finalizers:  []string{clusterv1.MachineFinalizer},
+					Annotations: map[string]string{clusterv1.ExcludeNodeDrainingAnnotation: "existed!!"},
+				},
+				Spec: clusterv1.MachineSpec{
+					ClusterName:       "test-cluster",
+					InfrastructureRef: corev1.ObjectReference{},
+					Bootstrap:         clusterv1.Bootstrap{Data: pointer.StringPtr("data")},
+				},
+				Status: clusterv1.MachineStatus{},
+			},
+			expected: false,
+		},
+		{
+			name: "Node draining timeout is over",
+			machine: &clusterv1.Machine{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:       "test-machine",
+					Namespace:  "default",
+					Finalizers: []string{clusterv1.MachineFinalizer},
+				},
+				Spec: clusterv1.MachineSpec{
+					ClusterName:       "test-cluster",
+					InfrastructureRef: corev1.ObjectReference{},
+					Bootstrap:         clusterv1.Bootstrap{Data: pointer.StringPtr("data")},
+					NodeDrainTimeout:  &metav1.Duration{Duration: time.Second * 60},
+				},
+
+				Status: clusterv1.MachineStatus{
+					Conditions: clusterv1.Conditions{
+						{
+							Type:               clusterv1.DrainingSucceededCondition,
+							Status:             corev1.ConditionFalse,
+							LastTransitionTime: metav1.Time{Time: 
time.Now().Add(-(time.Second * 70)).UTC()}, + }, + }, + }, + }, + expected: false, + }, + { + name: "Node draining timeout is not yet over", + machine: &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machine", + Namespace: "default", + Finalizers: []string{clusterv1.MachineFinalizer}, + }, + Spec: clusterv1.MachineSpec{ + ClusterName: "test-cluster", + InfrastructureRef: corev1.ObjectReference{}, + Bootstrap: clusterv1.Bootstrap{Data: pointer.StringPtr("data")}, + NodeDrainTimeout: &metav1.Duration{Duration: time.Second * 60}, + }, + Status: clusterv1.MachineStatus{ + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.DrainingSucceededCondition, + Status: corev1.ConditionFalse, + LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 30)).UTC()}, + }, + }, + }, + }, + expected: true, + }, + { + name: "NodeDrainTimeout option is set to its default value 0", + machine: &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machine", + Namespace: "default", + Finalizers: []string{clusterv1.MachineFinalizer}, + }, + Spec: clusterv1.MachineSpec{ + ClusterName: "test-cluster", + InfrastructureRef: corev1.ObjectReference{}, + Bootstrap: clusterv1.Bootstrap{Data: pointer.StringPtr("data")}, + }, + Status: clusterv1.MachineStatus{ + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.DrainingSucceededCondition, + Status: corev1.ConditionFalse, + LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 1000)).UTC()}, + }, + }, + }, + }, + expected: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + var objs []runtime.Object + objs = append(objs, testCluster, tt.machine) + + r := &MachineReconciler{ + Client: helpers.NewFakeClientWithScheme(scheme.Scheme, objs...), + Log: log.Log, + scheme: scheme.Scheme, + } + + got := r.isNodeDrainAllowed(tt.machine) + g.Expect(got).To(Equal(tt.expected)) + }) + } +} + func TestIsDeleteNodeAllowed(t *testing.T) { deletionts := metav1.Now() diff --git a/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_types.go b/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_types.go index 4bbc7eec6402..48f374d20533 100644 --- a/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_types.go +++ b/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_types.go @@ -66,6 +66,12 @@ type KubeadmControlPlaneSpec struct { // KubeadmControlPlane // +optional UpgradeAfter *metav1.Time `json:"upgradeAfter,omitempty"` + + // NodeDrainTimeout is the total amount of time that the controller will spend on draining a controlplane node + // The default value is 0, meaning that the node can be drained without any time limitations. + // NOTE: NodeDrainTimeout is different from `kubectl drain --timeout` + // +optional + NodeDrainTimeout *metav1.Duration `json:"nodeDrainTimeout,omitempty"` } // KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane. 
diff --git a/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_webhook.go b/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_webhook.go index c842c0fb1ac8..6bd273019c5d 100644 --- a/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_webhook.go +++ b/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_webhook.go @@ -110,6 +110,7 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error { {spec, "replicas"}, {spec, "version"}, {spec, "upgradeAfter"}, + {spec, "nodeDrainTimeout"}, } allErrs := in.validateCommon() diff --git a/controlplane/kubeadm/api/v1alpha3/zz_generated.deepcopy.go b/controlplane/kubeadm/api/v1alpha3/zz_generated.deepcopy.go index 5b1ffc58e057..be97eb20a684 100644 --- a/controlplane/kubeadm/api/v1alpha3/zz_generated.deepcopy.go +++ b/controlplane/kubeadm/api/v1alpha3/zz_generated.deepcopy.go @@ -21,6 +21,7 @@ limitations under the License. package v1alpha3 import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" apiv1alpha3 "sigs.k8s.io/cluster-api/api/v1alpha3" ) @@ -98,6 +99,11 @@ func (in *KubeadmControlPlaneSpec) DeepCopyInto(out *KubeadmControlPlaneSpec) { in, out := &in.UpgradeAfter, &out.UpgradeAfter *out = (*in).DeepCopy() } + if in.NodeDrainTimeout != nil { + in, out := &in.NodeDrainTimeout, &out.NodeDrainTimeout + *out = new(v1.Duration) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeadmControlPlaneSpec. diff --git a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml index 4b75b3247f04..9a33944a845a 100644 --- a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml +++ b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml @@ -1032,6 +1032,13 @@ spec: format: int32 type: integer type: object + nodeDrainTimeout: + description: 'NodeDrainTimeout is the total amount of time that the + controller will spend on draining a controlplane node The default + value is 0, meaning that the node can be drained without any time + limitations. NOTE: NodeDrainTimeout is different from `kubectl drain + --timeout`' + type: string replicas: description: Number of desired machines. Defaults to 1. When stacked etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members). diff --git a/controlplane/kubeadm/controllers/helpers.go b/controlplane/kubeadm/controllers/helpers.go index fc89f2c2cee8..b5ce5fe63bb3 100644 --- a/controlplane/kubeadm/controllers/helpers.go +++ b/controlplane/kubeadm/controllers/helpers.go @@ -237,7 +237,8 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp Bootstrap: clusterv1.Bootstrap{ ConfigRef: bootstrapRef, }, - FailureDomain: failureDomain, + FailureDomain: failureDomain, + NodeDrainTimeout: kcp.Spec.NodeDrainTimeout, }, }
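
For illustration only (not part of the patch): a user sets the new field as a
metav1.Duration string, either in a MachineDeployment's machine template or on
a KubeadmControlPlane. The manifests below are a minimal sketch; the resource
names and the 10m value are hypothetical, and unrelated required fields are
omitted.

apiVersion: cluster.x-k8s.io/v1alpha3
kind: MachineDeployment
metadata:
  name: my-workers            # hypothetical name
spec:
  clusterName: my-cluster     # hypothetical name
  template:
    spec:
      clusterName: my-cluster
      # Give each worker node up to 10 minutes to drain before it is
      # removed forcefully; any metav1.Duration string (e.g. "100s") works.
      nodeDrainTimeout: 10m
---
apiVersion: controlplane.cluster.x-k8s.io/v1alpha3
kind: KubeadmControlPlane
metadata:
  name: my-control-plane      # hypothetical name
spec:
  replicas: 3
  version: v1.19.1
  # The same knob applies to the Machines that KCP creates for control plane nodes.
  nodeDrainTimeout: 10m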