Commit 39a5baa

UPSTREAM: <carry>: openshift: Machine controller: drain node before machine deletion

The node draining code itself is imported from github.com/openshift/kubernetes-drain.

It is currently impossible to use the controller-runtime client for node draining
because that client lacks a Patch operation (kubernetes-sigs/controller-runtime#235).
The machine controller therefore needs to initialize a kube client as well in order to
implement the node draining logic. Once the Patch operation is implemented,
the draining logic can be updated to replace the kube client with the controller-runtime client.
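
For illustration, a minimal sketch of this interim wiring, assuming access to the *rest.Config the manager already holds (the diff below stores it as r.config). The helper name newDrainClient is hypothetical, not part of this commit, and the import block is shown for completeness:

import (
	"fmt"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// newDrainClient (hypothetical name) builds a typed client-go clientset for
// draining, because the controller-runtime client cannot yet Patch (and
// therefore cordon) nodes.
func newDrainClient(config *rest.Config) (*kubernetes.Clientset, error) {
	kubeClient, err := kubernetes.NewForConfig(config)
	if err != nil {
		return nil, fmt.Errorf("unable to build kube client: %v", err)
	}
	return kubeClient, nil
}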

Also, initialize an event recorder to generate node draining events.
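
A sketch of that wiring, mirroring the names used in the diff below (mgr is a controller-runtime manager.Manager; machine and node are assumed to be in scope):

// Obtain a named event recorder from the manager and emit the drain event,
// the same way the controller below does.
eventRecorder := mgr.GetRecorder("machine-controller")
eventRecorder.Eventf(machine, corev1.EventTypeNormal, "Deleted", "Node %q drained", node.Name)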
ingvagabund authored and openshift-merge-robot committed Mar 12, 2019
1 parent 0ca0637 commit 39a5baa
Showing 2 changed files with 74 additions and 5 deletions.
5 changes: 5 additions & 0 deletions pkg/controller/machine/BUILD.bazel
@@ -13,10 +13,15 @@ go_library(
         "//pkg/apis/machine/v1beta1:go_default_library",
         "//pkg/controller/error:go_default_library",
         "//pkg/util:go_default_library",
+        "//vendor/github.com/go-log/log/info:go_default_library",
+        "//vendor/github.com/openshift/kubernetes-drain:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
+        "//vendor/k8s.io/client-go/kubernetes:go_default_library",
+        "//vendor/k8s.io/client-go/rest:go_default_library",
+        "//vendor/k8s.io/client-go/tools/record:go_default_library",
         "//vendor/k8s.io/klog:go_default_library",
         "//vendor/sigs.k8s.io/controller-runtime/pkg/client:go_default_library",
         "//vendor/sigs.k8s.io/controller-runtime/pkg/controller:go_default_library",
74 changes: 69 additions & 5 deletions pkg/controller/machine/controller.go
@@ -19,15 +19,22 @@ package machine
 import (
 	"context"
 	"errors"
+	"fmt"
 	"os"
+	"time"
 
+	"github.com/go-log/log/info"
 	machinev1 "github.com/openshift/cluster-api/pkg/apis/machine/v1beta1"
 	controllerError "github.com/openshift/cluster-api/pkg/controller/error"
 	"github.com/openshift/cluster-api/pkg/util"
+	kubedrain "github.com/openshift/kubernetes-drain"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/record"
 	"k8s.io/klog"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller"
@@ -37,7 +44,12 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/source"
 )
 
-const NodeNameEnvVar = "NODE_NAME"
+const (
+	NodeNameEnvVar = "NODE_NAME"
+
+	// ExcludeNodeDrainingAnnotation, when set on a machine, explicitly skips node draining
+	ExcludeNodeDrainingAnnotation = "machine.openshift.io/exclude-node-draining"
+)
 
 var DefaultActuator Actuator

@@ -48,10 +60,12 @@ func AddWithActuator(mgr manager.Manager, actuator Actuator) error {
 // newReconciler returns a new reconcile.Reconciler
 func newReconciler(mgr manager.Manager, actuator Actuator) reconcile.Reconciler {
 	r := &ReconcileMachine{
-		Client:   mgr.GetClient(),
-		scheme:   mgr.GetScheme(),
-		nodeName: os.Getenv(NodeNameEnvVar),
-		actuator: actuator,
+		Client:        mgr.GetClient(),
+		eventRecorder: mgr.GetRecorder("machine-controller"),
+		config:        mgr.GetConfig(),
+		scheme:        mgr.GetScheme(),
+		nodeName:      os.Getenv(NodeNameEnvVar),
+		actuator:      actuator,
 	}
 
 	if r.nodeName == "" {
@@ -83,8 +97,11 @@ var _ reconcile.Reconciler = &ReconcileMachine{}
 // ReconcileMachine reconciles a Machine object
 type ReconcileMachine struct {
 	client.Client
+	config *rest.Config
 	scheme *runtime.Scheme
 
+	eventRecorder record.EventRecorder
+
 	actuator Actuator
 
 	// nodeName is the name of the node on which the machine controller is running, if not present, it is loaded from NODE_NAME.
@@ -145,6 +162,18 @@ func (r *ReconcileMachine) Reconcile(request reconcile.Request) (reconcile.Result, error) {
 			return reconcile.Result{}, nil
 		}
 		klog.Infof("reconciling machine object %v triggers delete.", name)
+
+		// Drain the node before deletion. If a machine is not linked to a node,
+		// just delete the machine: a node can be unlinked from a machine when
+		// the node goes NotReady and is removed by the cloud controller
+		// manager, and in that case some machines would never get deleted
+		// without manual intervention.
+		if _, exists := m.ObjectMeta.Annotations[ExcludeNodeDrainingAnnotation]; !exists && m.Status.NodeRef != nil {
+			if err := r.drainNode(m); err != nil {
+				return reconcile.Result{}, err
+			}
+		}
+
 		if err := r.actuator.Delete(ctx, cluster, m); err != nil {
 			klog.Errorf("Error deleting machine object %v; %v", name, err)
 			if requeueErr, ok := err.(*controllerError.RequeueAfterError); ok {
@@ -201,6 +230,41 @@ func (r *ReconcileMachine) Reconcile(request reconcile.Request) (reconcile.Result, error) {
 	return reconcile.Result{}, nil
 }
 
+func (r *ReconcileMachine) drainNode(machine *machinev1.Machine) error {
+	kubeClient, err := kubernetes.NewForConfig(r.config)
+	if err != nil {
+		return fmt.Errorf("unable to build kube client: %v", err)
+	}
+	node, err := kubeClient.CoreV1().Nodes().Get(machine.Status.NodeRef.Name, metav1.GetOptions{})
+	if err != nil {
+		return fmt.Errorf("unable to get node %q: %v", machine.Status.NodeRef.Name, err)
+	}
+
+	if err := kubedrain.Drain(
+		kubeClient,
+		[]*corev1.Node{node},
+		&kubedrain.DrainOptions{
+			Force:              true,
+			IgnoreDaemonsets:   true,
+			DeleteLocalData:    true,
+			GracePeriodSeconds: -1,
+			Logger:             info.New(klog.V(0)),
+			// If a pod is not evicted within 20 seconds, retry the eviction the
+			// next time the machine is reconciled (to allow other machines to be reconciled).
+			Timeout: 20 * time.Second,
+		},
+	); err != nil {
+		// The machine still proceeds to termination after a drain failure.
+		klog.Warningf("drain failed for machine %q: %v", machine.Name, err)
+		return &controllerError.RequeueAfterError{RequeueAfter: 20 * time.Second}
+	}
+
+	klog.Infof("drain successful for machine %q", machine.Name)
+	r.eventRecorder.Eventf(machine, corev1.EventTypeNormal, "Deleted", "Node %q drained", node.Name)
+
+	return nil
+}
+
 func (r *ReconcileMachine) getCluster(ctx context.Context, machine *machinev1.Machine) (*machinev1.Cluster, error) {
 	clusterList := machinev1.ClusterList{}
 	listOptions := &client.ListOptions{
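
Usage note: the new ExcludeNodeDrainingAnnotation lets a machine opt out of draining entirely. The controller checks only that the key is present, so any value works. A minimal sketch, assuming a controller-runtime client c and an already-fetched *machinev1.Machine m (hypothetical names; imports elided):

// Sketch only: annotate a machine so the controller skips node draining
// when the machine is deleted.
if m.ObjectMeta.Annotations == nil {
	m.ObjectMeta.Annotations = map[string]string{}
}
m.ObjectMeta.Annotations[ExcludeNodeDrainingAnnotation] = "true"
if err := c.Update(context.TODO(), m); err != nil {
	// handle the update error
}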
