Skip to content

Commit

Permalink
Merge pull request #2 from hardikdr/bugfix/drain-issue
Browse files Browse the repository at this point in the history
Bugfix for drain issue
  • Loading branch information
hardikdr authored Feb 9, 2018
2 parents ae91d69 + 2d8f23e commit 4a72108
Show file tree
Hide file tree
Showing 10 changed files with 150 additions and 54 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ LINT_FOLDERS := $(shell echo $(PACKAGES) | sed "s|$(REPOSITORY)|.|g")
BINARY_PATH := $(REPOSITORY)/cmd/$(PROJECT)

IMAGE_REPOSITORY := kvmprashanth/node-controller-manager
IMAGE_TAG := v0.1.0
IMAGE_TAG := v0.1.3

TYPES_FILES := $(shell find pkg/apis -name types.go)

Expand Down
2 changes: 1 addition & 1 deletion cmd/node-controller-manager/app/controllermanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func Run(s *options.NCMServer) error {
controlkubeconfig, err = clientcmd.BuildConfigFromFlags("", "")
} else {
//kubeconfig for the seedcluster where MachineCRDs are supposed to be registered.
controlkubeconfig, err = clientcmd.BuildConfigFromFlags("", s.ControlKubeconfig)
controlkubeconfig, err = clientcmd.BuildConfigFromFlags("", s.ControlKubeconfig)
}
if err != nil {
return err
Expand Down
40 changes: 0 additions & 40 deletions kubernetes/deployment/clusterrole.yaml

This file was deleted.

41 changes: 41 additions & 0 deletions kubernetes/deployment/control-cluster-role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: node-controller-manager
rules:
- apiGroups:
- machine.sapcloud.io
resources:
- awsmachineclasses
- azuremachineclasses
- gcpmachineclasses
- openstackmachineclasses
- machinedeployments
- machines
- machinesets
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
- nodes
- configmaps
- secrets
- endpoints
- events
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
4 changes: 2 additions & 2 deletions kubernetes/deployment/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ spec:
spec:
containers:
- name: node-controller-manager
image: kvmprashanth/node-controller-manager:v0.1.0
imagePullPolicy: Always
image: kvmprashanth/node-controller-manager:v0.1.3
imagePullPolicy: Always
command:
- ./node-controller-manager
- --v=2
Expand Down
12 changes: 12 additions & 0 deletions kubernetes/deployment/target-cluster-role-binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: node-controller-manager
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: node-controller-manager
subjects:
- kind: ServiceAccount
name: default
namespace: default
44 changes: 44 additions & 0 deletions kubernetes/deployment/target-cluster-role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: node-controller-manager
rules:
- apiGroups:
- ""
resources:
- nodes
- endpoints
- replicationcontrollers
- pods
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
- pods/eviction
verbs:
- create
- apiGroups:
- extensions
- apps
resources:
- replicasets
- statefulsets
- daemonsets
- deployments
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
51 changes: 42 additions & 9 deletions pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ func (c *controller) updateMachineState(machine *v1alpha1.Machine, node *v1.Node
*/

func (c *controller) createMachine(machine *v1alpha1.Machine, driver driver.Driver) error {
glog.V(3).Infof("Creating machine %s, please wait!", machine.Name)
glog.V(2).Infof("Creating machine %s, please wait!", machine.Name)

actualProviderID, nodeName, err := driver.Create()
if err != nil {
Expand Down Expand Up @@ -377,19 +377,40 @@ func (c *controller) deleteMachine(machine *v1alpha1.Machine, driver driver.Driv
// If machine was created on the cloud provider
machineID, _ := driver.GetExisting()

if machine.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating {
lastOperation := v1alpha1.LastOperation{
Description: "Deleting machine from cloud provider",
State: "Processing",
Type: "Delete",
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}
machine = c.updateMachineStatus(machine, lastOperation, currentStatus)
}

var err error
if machineID == "" {
err = errors.New("No provider-ID found on machine")
} else {
// force-deletion: "True" label should be present for deleting machine without draining it
if machine.Labels["force-deletion"] != "True" {
timeOutDuration := 5 * time.Minute
// timeOut is the time elapsed since the machine's last status update minus the timeout period; it stays negative while the deletion is less than timeOutDuration old
timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)

// To perform drain 2 conditions must be satisfied
// 1. force-deletion: "True" label must not be present
// 2. Deletion operation must be less than 5 minutes old
if machine.Labels["force-deletion"] != "True" && timeOut < 0 {
buf := bytes.NewBuffer([]byte{})
errBuf := bytes.NewBuffer([]byte{})

nodeName := machine.Labels["node"]
drainOptions := NewDrainOptions(
c.targetCoreClient,
10*time.Minute, // TODO: Will need to configure timeout
timeOutDuration, // TODO: Will need to configure timeout
nodeName,
-1,
true,
Expand All @@ -400,9 +421,19 @@ func (c *controller) deleteMachine(machine *v1alpha1.Machine, driver driver.Driv
)
err = drainOptions.RunDrain()
if err != nil {
lastOperation := v1alpha1.LastOperation{
Description: "Drain failed - " + err.Error(),
State: "Failed",
Type: "Delete",
LastUpdateTime: metav1.Now(),
}
c.updateMachineStatus(machine, lastOperation, machine.Status.CurrentStatus)

// Machine still tries to terminate after drain failure
glog.V(2).Infof("Drain failed for machine %s - \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err)
return err
}
glog.V(3).Infof("Drain successful - %v %v", buf, errBuf)
glog.V(2).Infof("Drain successful for machine %s - %v %v", machine.Name, buf, errBuf)
}
err = driver.Delete()
}
Expand All @@ -426,9 +457,10 @@ func (c *controller) deleteMachine(machine *v1alpha1.Machine, driver driver.Driv

return err
}

c.deleteMachineFinalizers(machine)
c.controlMachineClient.Machines(machine.Namespace).Delete(machine.Name, &metav1.DeleteOptions{})
glog.V(3).Infof("Machine %s deleted succesfullly", machine.Name)
glog.V(2).Infof("Machine %s deleted succesfullly", machine.Name)
}
return nil
}
Expand All @@ -442,23 +474,24 @@ func (c *controller) updateMachineStatus(
machine *v1alpha1.Machine,
lastOperation v1alpha1.LastOperation,
currentStatus v1alpha1.CurrentStatus,
) {
) *v1alpha1.Machine {
// Get the latest version of the machine so that we can avoid conflicts
machine, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
if err != nil {
return
return machine
}

clone := machine.DeepCopy()
clone.Status.LastOperation = lastOperation
clone.Status.CurrentStatus = currentStatus

_, err = c.controlMachineClient.Machines(clone.Namespace).Update(clone)
machine, err = c.controlMachineClient.Machines(clone.Namespace).Update(clone)
if err != nil {
// Keep retrying until update goes through
glog.V(4).Info("Warning: Updated failed, retrying, error: %q", err)
c.updateMachineStatus(machine, lastOperation, currentStatus)
}
return machine
}

func (c *controller) updateMachineConditions(machine *v1alpha1.Machine, conditions []v1.NodeCondition) *v1alpha1.Machine {
Expand Down
8 changes: 7 additions & 1 deletion pkg/controller/machineset.go
Original file line number Diff line number Diff line change
Expand Up @@ -623,12 +623,18 @@ func (c *controller) prepareMachineForDeletion(targetMachine *v1alpha1.Machine,
}

// Force trigger deletion to reflect in machine status
lastOperation := v1alpha1.LastOperation{
Description: "Deleting machine from cloud provider",
State: "Processing",
Type: "Delete",
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}
c.updateMachineStatus(targetMachine, targetMachine.Status.LastOperation, currentStatus)
c.updateMachineStatus(targetMachine, lastOperation, currentStatus)
glog.V(2).Info("Delete machine from machineset:", targetMachine.Name)

if err := c.machineControl.DeleteMachine(targetMachine.Namespace, targetMachine.Name, machineSet); err != nil {
Expand Down

0 comments on commit 4a72108

Please sign in to comment.