From c73c4ee11427cdf47bfc5c805ff8e77df311504d Mon Sep 17 00:00:00 2001 From: prashanth26 Date: Tue, 11 Dec 2018 14:07:02 +0530 Subject: [PATCH] Optimise pod handling in machine updates --- pkg/controller/deployment_rolling.go | 104 ++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/pkg/controller/deployment_rolling.go b/pkg/controller/deployment_rolling.go index 9ae912360..47c83ea92 100644 --- a/pkg/controller/deployment_rolling.go +++ b/pkg/controller/deployment_rolling.go @@ -25,11 +25,21 @@ package controller import ( "fmt" "sort" + "time" - "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" - "github.com/golang/glog" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/integer" + + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + "github.com/golang/glog" +) + +var ( + maxRetryDeadline = 5 * time.Minute + conflictRetryInterval = 10 * time.Second ) // rolloutRolling implements the logic for rolling a new machine set. @@ -40,6 +50,11 @@ func (dc *controller) rolloutRolling(d *v1alpha1.MachineDeployment, isList []*v1 } allISs := append(oldISs, newIS) + err = dc.taintNodesBackingMachineSet(oldISs) + if err != nil { + return nil + } + // Scale up, if we can. scaledUp, err := dc.reconcileNewMachineSet(allISs, newIS, d) if err != nil { @@ -236,3 +251,88 @@ func (dc *controller) scaleDownOldMachineSetsForRollingUpdate(allISs []*v1alpha1 return totalScaledDown, nil } + +// taintNodesBackingMachineSet taints all nodes backing the machineSet +func (dc *controller) taintNodesBackingMachineSet(MachineSets []*v1alpha1.MachineSet) error { + + for _, machineSet := range MachineSets { + + if machineSet.Annotations["PreferNoSchedule"] != "" { + continue + } + + selector, err := metav1.LabelSelectorAsSelector(machineSet.Spec.Selector) + if err != nil { + return err + } + + // list all machines to include the machines that don't match the ms`s selector + // anymore but has the stale controller ref. + // TODO: Do the List and Filter in a single pass, or use an index. + filteredMachines, err := dc.machineLister.List(labels.Everything()) + if err != nil { + return err + } + // NOTE: filteredMachines are pointing to objects from cache - if you need to + // modify them, you need to copy it first. + filteredMachines, err = dc.claimMachines(machineSet, selector, filteredMachines) + if err != nil { + return err + } + + taints := v1.Taint{ + Key: "old-machine-set", + Value: "True", + Effect: "PreferNoSchedule", + } + + // Iterate through all machines and place the PreferNoSchedule taint + // to avoid scheduling on older machines + for _, machine := range filteredMachines { + if machine.Labels["node"] != "" { + err = AddOrUpdateTaintOnNode( + dc.targetCoreClient, + machine.Labels["node"], + &taints, + ) + if err != nil { + glog.Errorf("Node tainting failed for node: %s, %s", machine.Labels["node"], err) + return err + } + } + } + + retryDeadline := time.Now().Add(maxRetryDeadline) + for { + machineSet, err = dc.controlMachineClient.MachineSets(machineSet.Namespace).Get(machineSet.Name, metav1.GetOptions{}) + if err != nil && time.Now().Before(retryDeadline) { + glog.Warningf("Unable to fetch MachineSet object %s, Error: %+v", machineSet.Name, err) + time.Sleep(conflictRetryInterval) + continue + } else if err != nil { + // Timeout occurred + glog.Errorf("Unable to fetch MachineSet object %s, Error: %+v", machineSet.Name, err) + return err + } + + msCopy := machineSet.DeepCopy() + msCopy.Annotations["PreferNoSchedule"] = "True" + + _, err = dc.controlMachineClient.MachineSets(msCopy.Namespace).Update(msCopy) + if err != nil && time.Now().Before(retryDeadline) { + glog.Warningf("Unable to update MachineSet object %s, Error: %+v", machineSet.Name, err) + time.Sleep(conflictRetryInterval) + continue + } else if err != nil { + // Timeout occurred + glog.Errorf("Unable to update MachineSet object %s, Error: %+v", machineSet.Name, err) + return err + } + + // Break out of loop when update succeeds + break + } + } + + return nil +}