Skip to content

Commit

Permalink
Add the ability to specify a drain timeout for machines
Browse files Browse the repository at this point in the history
Add an option `nodeDrainTimeout` to KCP and machiceSpec of machinedeployment. 
`nodeDrainTimeout` defines the amount of time we want a node to be drained. The node is forcefully removed if the time is over.
Note: Unset this option means there is no time limit.

Update api/v1alpha3/machine_types.go

Add GO tests

Add conversion for the API changes

Update API conversion

Fix Minor Typos
  • Loading branch information
namnx228 committed Sep 29, 2020
1 parent 7bbd5cc commit 8f1cbe5
Show file tree
Hide file tree
Showing 13 changed files with 234 additions and 4 deletions.
1 change: 1 addition & 0 deletions api/v1alpha2/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ func restoreMachineSpec(restored *v1alpha3.MachineSpec, dst *v1alpha3.MachineSpe
}
dst.Bootstrap.DataSecretName = restored.Bootstrap.DataSecretName
dst.FailureDomain = restored.FailureDomain
dst.NodeDrainTimeout = restored.NodeDrainTimeout
}

func (dst *Machine) ConvertFrom(srcRaw conversion.Hub) error {
Expand Down
1 change: 1 addition & 0 deletions api/v1alpha2/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions api/v1alpha3/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ type MachineSpec struct {
// Must match a key in the FailureDomains map stored on the cluster object.
// +optional
FailureDomain *string `json:"failureDomain,omitempty"`

// NodeDrainTimeout is the total amount of time that the controller will spend on draining a node.
// The default value is 0, meaning that the node can be drained without any time limitations.
// NOTE: NodeDrainTimeout is different from `kubectl drain --timeout`
// +optional
NodeDrainTimeout int64 `json:"nodeDrainTimeout,omitempty"`
}

// ANCHOR_END: MachineSpec
Expand Down
8 changes: 8 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,14 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: 'NodeDrainTimeout is the total amount of time
that the controller will spend on draining a node. The default
value is 0, meaning that the node can be drained without
any time limitations. NOTE: NodeDrainTimeout is different
from `kubectl drain --timeout`'
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
Expand Down
7 changes: 7 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machines.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,13 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: 'NodeDrainTimeout is the total amount of time that the
controller will spend on draining a node. The default value is 0,
meaning that the node can be drained without any time limitations.
NOTE: NodeDrainTimeout is different from `kubectl drain --timeout`'
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine provided
by the provider. This field must match the provider ID as seen on
Expand Down
8 changes: 8 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinesets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,14 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: 'NodeDrainTimeout is the total amount of time
that the controller will spend on draining a node. The default
value is 0, meaning that the node can be drained without
any time limitations. NOTE: NodeDrainTimeout is different
from `kubectl drain --timeout`'
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
Expand Down
8 changes: 8 additions & 0 deletions config/crd/bases/exp.cluster.x-k8s.io_machinepools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,14 @@ spec:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
nodeDrainTimeout:
description: 'NodeDrainTimeout is the total amount of time
that the controller will spend on draining a node. The default
value is 0, meaning that the node can be drained without
any time limitations. NOTE: NodeDrainTimeout is different
from `kubectl drain --timeout`'
format: int64
type: integer
providerID:
description: ProviderID is the identification ID of the machine
provided by the provider. This field must match the provider
Expand Down
39 changes: 36 additions & 3 deletions controllers/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,14 +299,20 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
conditions.MarkTrue(m, clusterv1.PreDrainDeleteHookSucceededCondition)

// Drain node before deletion and issue a patch in order to make this operation visible to the users.
if _, exists := m.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; !exists {
if drainAllowed := r.isNodeDrainAllowed(m); drainAllowed {
patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
return ctrl.Result{}, err
}

logger.Info("Draining node", "node", m.Status.NodeRef.Name)
conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
// The DrainingSucceededCondition never exists before the node is drained for the first time,
// so its transition time can be used to record the first time draining.
// This `if` condition prevents the transition time to be changed more than once.
if conditions.Get(m, clusterv1.DrainingSucceededCondition) == nil {
conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
}

if err := patchMachine(ctx, patchHelper, m); err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
}
Expand Down Expand Up @@ -363,6 +369,34 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
return ctrl.Result{}, nil
}

func (r *MachineReconciler) isNodeDrainAllowed(m *clusterv1.Machine) bool {
if _, exists := m.ObjectMeta.Annotations[clusterv1.ExcludeNodeDrainingAnnotation]; exists {
return false
}

if timeout := r.isNodeDrainTimeoutOver(m); timeout {
return false
}

return true

}

func (r *MachineReconciler) isNodeDrainTimeoutOver(machine *clusterv1.Machine) bool {
// if the start draining condition does not exist
if conditions.Get(machine, clusterv1.DrainingSucceededCondition) == nil {
return false
}
// if the NodeDrainTineout type is not set by user
if machine.Spec.NodeDrainTimeout <= 0 {
return false
}
now := time.Now()
firstTimeDrain := conditions.GetLastTransitionTime(machine, clusterv1.DrainingSucceededCondition)
diff := now.Sub(firstTimeDrain.Time)
return diff.Seconds() >= float64(machine.Spec.NodeDrainTimeout)
}

// isDeleteNodeAllowed returns nil only if the Machine's NodeRef is not nil
// and if the Machine is not the last control plane node in the cluster.
func (r *MachineReconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) error {
Expand Down Expand Up @@ -419,7 +453,6 @@ func (r *MachineReconciler) drainNode(ctx context.Context, cluster *clusterv1.Cl
}
return errors.Errorf("unable to get node %q: %v", nodeName, err)
}

drainer := &kubedrain.Helper{
Client: kubeClient,
Force: true,
Expand Down
142 changes: 142 additions & 0 deletions controllers/machine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,148 @@ func Test_clusterToActiveMachines(t *testing.T) {
}
}

func TestIsNodeDrainedAllowed(t *testing.T) {
testCluster := &clusterv1.Cluster{
TypeMeta: metav1.TypeMeta{Kind: "Cluster", APIVersion: clusterv1.GroupVersion.String()},
ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "test-cluster"},
}

tests := []struct {
name string
machine *clusterv1.Machine
expected bool
}{
{
name: "Exclude node draining annotation exists",
machine: &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "test-machine",
Namespace: "default",
Finalizers: []string{clusterv1.MachineFinalizer},
Annotations: map[string]string{clusterv1.ExcludeNodeDrainingAnnotation: "existed!!"},
},
Spec: clusterv1.MachineSpec{
ClusterName: "test-cluster",
InfrastructureRef: corev1.ObjectReference{},
Bootstrap: clusterv1.Bootstrap{Data: pointer.StringPtr("data")},
},
Status: clusterv1.MachineStatus{},
},
expected: false,
},
{
name: "Node draining timeout is over",
machine: &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "test-machine",
Namespace: "default",
Finalizers: []string{clusterv1.MachineFinalizer},
},
Spec: clusterv1.MachineSpec{
ClusterName: "test-cluster",
InfrastructureRef: corev1.ObjectReference{},
Bootstrap: clusterv1.Bootstrap{Data: pointer.StringPtr("data")},
NodeDrainTimeout: 60,
},
Status: clusterv1.MachineStatus{
Conditions: clusterv1.Conditions{
{
Type: clusterv1.DrainingSucceededCondition,
Status: corev1.ConditionFalse,
LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 70)).UTC()},
},
},
},
},
expected: false,
},
{
name: "Node draining timeout is not yet over",
machine: &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "test-machine",
Namespace: "default",
Finalizers: []string{clusterv1.MachineFinalizer},
},
Spec: clusterv1.MachineSpec{
ClusterName: "test-cluster",
InfrastructureRef: corev1.ObjectReference{},
Bootstrap: clusterv1.Bootstrap{Data: pointer.StringPtr("data")},
NodeDrainTimeout: 60,
},
Status: clusterv1.MachineStatus{
Conditions: clusterv1.Conditions{
{
Type: clusterv1.DrainingSucceededCondition,
Status: corev1.ConditionFalse,
LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 30)).UTC()},
},
},
},
},
expected: true,
},
{
name: "NodeDrainTimeout option is set to its default value 0",
machine: &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "test-machine",
Namespace: "default",
Finalizers: []string{clusterv1.MachineFinalizer},
},
Spec: clusterv1.MachineSpec{
ClusterName: "test-cluster",
InfrastructureRef: corev1.ObjectReference{},
Bootstrap: clusterv1.Bootstrap{Data: pointer.StringPtr("data")},
},
Status: clusterv1.MachineStatus{
Conditions: clusterv1.Conditions{
{
Type: clusterv1.DrainingSucceededCondition,
Status: corev1.ConditionFalse,
LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 1000)).UTC()},
},
},
},
},
expected: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

var objs []runtime.Object
objs = append(objs, testCluster)

m1 := &clusterv1.Machine{
TypeMeta: metav1.TypeMeta{
Kind: "Machine",
},
ObjectMeta: metav1.ObjectMeta{
Name: "m1",
Namespace: "default",
Labels: map[string]string{
clusterv1.ClusterLabelName: "test-cluster",
},
},
}
objs = append(objs, m1)

objs = append(objs, tt.machine)

r := &MachineReconciler{
Client: helpers.NewFakeClientWithScheme(scheme.Scheme, objs...),
Log: log.Log,
scheme: scheme.Scheme,
}

got := r.isNodeDrainAllowed(tt.machine)
g.Expect(got).To(Equal(tt.expected))
})
}
}

func TestIsDeleteNodeAllowed(t *testing.T) {
deletionts := metav1.Now()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ type KubeadmControlPlaneSpec struct {
// KubeadmControlPlane
// +optional
UpgradeAfter *metav1.Time `json:"upgradeAfter,omitempty"`

// NodeDrainTimeout is the total amount of time that the controller will spend on draining a controlplane node
// The default value is 0, meaning that the node can be drained without any time limitations.
// NOTE: NodeDrainTimeout is different from `kubectl drain --timeout`
// +optional
NodeDrainTimeout int64 `json:"nodeDrainTimeout,omitempty"`
}

// KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error {
{spec, "replicas"},
{spec, "version"},
{spec, "upgradeAfter"},
{spec, "nodeDrainTimeout"},
}

allErrs := in.validateCommon()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,14 @@ spec:
format: int32
type: integer
type: object
nodeDrainTimeout:
description: 'NodeDrainTimeout is the total amount of time that the
controller will spend on draining a controlplane node The default
value is 0, meaning that the node can be drained without any time
limitations. NOTE: NodeDrainTimeout is different from `kubectl drain
--timeout`'
format: int64
type: integer
replicas:
description: Number of desired machines. Defaults to 1. When stacked
etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members).
Expand Down
3 changes: 2 additions & 1 deletion controlplane/kubeadm/controllers/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,8 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp
Bootstrap: clusterv1.Bootstrap{
ConfigRef: bootstrapRef,
},
FailureDomain: failureDomain,
FailureDomain: failureDomain,
NodeDrainTimeout: kcp.Spec.NodeDrainTimeout,
},
}

Expand Down

0 comments on commit 8f1cbe5

Please sign in to comment.