diff --git a/cmd/machine-controller-manager/app/controllermanager.go b/cmd/machine-controller-manager/app/controllermanager.go
index 2f9aec1f3..3128acd74 100644
--- a/cmd/machine-controller-manager/app/controllermanager.go
+++ b/cmd/machine-controller-manager/app/controllermanager.go
@@ -208,7 +208,7 @@ func StartControllers(s *options.MCMServer,
recorder record.EventRecorder,
stop <-chan struct{}) error {
- klog.V(5).Info("Getting available resources")
+ klog.V(4).Info("Getting available resources")
availableResources, err := getAvailableResources(controlCoreClientBuilder)
if err != nil {
return err
@@ -229,7 +229,7 @@ func StartControllers(s *options.MCMServer,
}
if availableResources[machineGVR] || availableResources[machineSetGVR] || availableResources[machineDeploymentGVR] {
- klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)
+ klog.V(4).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)
controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory(
controlMachineClientBuilder.ClientOrDie("control-machine-shared-informers"),
@@ -253,7 +253,7 @@ func StartControllers(s *options.MCMServer,
// All shared informers are v1alpha1 API level
machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1()
- klog.V(5).Infof("Creating controllers...")
+ klog.V(4).Infof("Creating controllers...")
mcmController, err := mcmcontroller.NewController(
s.Namespace,
controlMachineClient,
@@ -276,7 +276,7 @@ func StartControllers(s *options.MCMServer,
controlCoreInformerFactory.Start(stop)
targetCoreInformerFactory.Start(stop)
- klog.V(5).Info("Running controller")
+ klog.V(4).Info("Running controller")
go mcmController.Run(int(s.ConcurrentNodeSyncs), stop)
} else {
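Note: the V(5)→V(4) demotions above move these startup logs one verbosity level up. A minimal sketch of how klog gates them (the --v value is set programmatically here purely for illustration):

    package main

    import (
        "flag"

        "k8s.io/klog/v2"
    )

    func main() {
        klog.InitFlags(nil)    // registers -v on the default flag set
        _ = flag.Set("v", "4") // the threshold the demoted logs now target
        flag.Parse()

        klog.V(4).Info("Getting available resources") // emitted at --v=4
        klog.V(5).Info("hidden unless --v>=5")        // the old behaviour
        klog.Flush()
    }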
diff --git a/docs/documents/apis.md b/docs/documents/apis.md
index 6ea44ddc0..401062f35 100644
--- a/docs/documents/apis.md
+++ b/docs/documents/apis.md
@@ -1555,7 +1555,7 @@ to be.
CurrentStatus)
-
MachinePhase is a label for the condition of a machines at the current time.
+MachinePhase is a label for the condition of a machine at the current time.
@@ -1992,7 +1992,7 @@ MachineConfiguration
LastOperation)
-
MachineState is a current state of the machine.
+MachineState is the current state of the operation.
diff --git a/kubernetes/crds/machine.sapcloud.io_machines.yaml b/kubernetes/crds/machine.sapcloud.io_machines.yaml
index 1b800af25..744354ac5 100644
--- a/kubernetes/crds/machine.sapcloud.io_machines.yaml
+++ b/kubernetes/crds/machine.sapcloud.io_machines.yaml
@@ -245,7 +245,7 @@ spec:
format: date-time
type: string
phase:
- description: MachinePhase is a label for the condition of a machines
+ description: MachinePhase is a label for the condition of a machine
at the current time.
type: string
timeoutActive:
diff --git a/pkg/apis/machine/v1alpha1/machine_types.go b/pkg/apis/machine/v1alpha1/machine_types.go
index b5920e570..d3541743c 100644
--- a/pkg/apis/machine/v1alpha1/machine_types.go
+++ b/pkg/apis/machine/v1alpha1/machine_types.go
@@ -136,10 +136,10 @@ type LastOperation struct {
Type MachineOperationType `json:"type,omitempty"`
}
-// MachinePhase is a label for the condition of a machines at the current time.
+// MachinePhase is a label for the condition of a machine at the current time.
type MachinePhase string
-// These are the valid statuses of machines.
+// These are the valid phases of machines.
const (
// MachinePending means that the machine is being created
MachinePending MachinePhase = "Pending"
@@ -156,25 +156,25 @@ const (
// MachineUnknown indicates that the node is not ready at the movement
MachineUnknown MachinePhase = "Unknown"
- // MachineFailed means operation failed leading to machine status failure
+ // MachineFailed means operation timed out
MachineFailed MachinePhase = "Failed"
- // MachineCrashLoopBackOff means creation or deletion of the machine is failing. It means that machine object is present but there is no corresponding VM.
+ // MachineCrashLoopBackOff means machine creation is failing. The machine object is present but there is no corresponding VM.
MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff"
)
-// MachineState is a current state of the machine.
+// MachineState is the current state of the operation.
type MachineState string
-// These are the valid statuses of machines.
+// These are the valid states of operations performed on the machine.
const (
- // MachineStatePending means there are operations pending on this machine state
+ // MachineStateProcessing means operation is not yet complete
MachineStateProcessing MachineState = "Processing"
- // MachineStateFailed means operation failed leading to machine status failure
+ // MachineStateFailed means operation failed
MachineStateFailed MachineState = "Failed"
- // MachineStateSuccessful indicates that the node is not ready at the moment
+ // MachineStateSuccessful means operation completed successfully
MachineStateSuccessful MachineState = "Successful"
)
@@ -189,7 +189,7 @@ const (
// MachineOperationUpdate indicates that the operation was an update
MachineOperationUpdate MachineOperationType = "Update"
- // MachineOperationHealthCheck indicates that the operation was a create
+ // MachineOperationHealthCheck indicates that the operation was a health check of the node object
MachineOperationHealthCheck MachineOperationType = "HealthCheck"
// MachineOperationDelete indicates that the operation was a delete
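Note: with the renames above, MachinePhase describes the machine itself while MachineState describes the last operation on it. A compile-only sketch showing both side by side (field names as used elsewhere in this diff; MachineOperationCreate is assumed to exist alongside the other operation types):

    package example

    import (
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

        "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
    )

    func exampleStatus() v1alpha1.MachineStatus {
        return v1alpha1.MachineStatus{
            CurrentStatus: v1alpha1.CurrentStatus{
                Phase:          v1alpha1.MachineRunning, // MachinePhase: condition of the machine
                LastUpdateTime: metav1.Now(),
            },
            LastOperation: v1alpha1.LastOperation{
                Description:    "Machine creation succeeded",
                State:          v1alpha1.MachineStateSuccessful, // MachineState: state of the operation
                Type:           v1alpha1.MachineOperationCreate,
                LastUpdateTime: metav1.Now(),
            },
        }
    }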
diff --git a/pkg/util/provider/app/app.go b/pkg/util/provider/app/app.go
index 3b095fce8..c1129b147 100644
--- a/pkg/util/provider/app/app.go
+++ b/pkg/util/provider/app/app.go
@@ -209,7 +209,7 @@ func StartControllers(s *options.MCServer,
recorder record.EventRecorder,
stop <-chan struct{}) error {
- klog.V(5).Info("Getting available resources")
+ klog.V(4).Info("Getting available resources")
availableResources, err := getAvailableResources(controlCoreClientBuilder)
if err != nil {
return err
@@ -239,7 +239,7 @@ func StartControllers(s *options.MCServer,
}
if availableResources[machineGVR] {
- klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)
+ klog.V(4).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)
controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory(
controlMachineClientBuilder.ClientOrDie("control-machine-shared-informers"),
@@ -274,7 +274,7 @@ func StartControllers(s *options.MCServer,
// All shared informers are v1alpha1 API level
machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1()
- klog.V(5).Infof("Creating controllers...")
+ klog.V(4).Infof("Creating controllers...")
machineController, err := machinecontroller.NewController(
s.Namespace,
controlMachineClient,
@@ -305,7 +305,7 @@ func StartControllers(s *options.MCServer,
controlCoreInformerFactory.Start(stop)
targetCoreInformerFactory.Start(stop)
- klog.V(5).Info("Running controller")
+ klog.V(4).Info("Running controller")
go machineController.Run(int(s.ConcurrentNodeSyncs), stop)
} else {
diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go
index 0b5a9f6e4..8b607056e 100644
--- a/pkg/util/provider/drain/drain.go
+++ b/pkg/util/provider/drain/drain.go
@@ -934,7 +934,7 @@ func (o *Options) waitForReattach(ctx context.Context, podVolumeInfo PodVolumeIn
case incomingEvent := <-volumeAttachmentEventCh:
persistentVolumeName := *incomingEvent.Spec.Source.PersistentVolumeName
- klog.V(5).Infof("VolumeAttachment event received for PV: %s", persistentVolumeName)
+ klog.V(4).Infof("VolumeAttachment event received for PV: %s", persistentVolumeName)
// Checking if event for an PV that is being waited on
if _, present := pvsWaitingForReattachments[persistentVolumeName]; present {
diff --git a/pkg/util/provider/drain/volume_attachment.go b/pkg/util/provider/drain/volume_attachment.go
index f3dcc24ec..5c35afa3a 100644
--- a/pkg/util/provider/drain/volume_attachment.go
+++ b/pkg/util/provider/drain/volume_attachment.go
@@ -70,13 +70,13 @@ func (v *VolumeAttachmentHandler) dispatch(obj interface{}) {
// AddVolumeAttachment is the event handler for VolumeAttachment add
func (v *VolumeAttachmentHandler) AddVolumeAttachment(obj interface{}) {
- klog.V(5).Infof("Adding volume attachment object")
+ klog.V(4).Infof("Adding volume attachment object")
v.dispatch(obj)
}
// UpdateVolumeAttachment is the event handler for VolumeAttachment update
func (v *VolumeAttachmentHandler) UpdateVolumeAttachment(oldObj, newObj interface{}) {
- klog.V(5).Info("Updating volume attachment object")
+ klog.V(4).Info("Updating volume attachment object")
v.dispatch(newObj)
}
diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go
index a9ff3c941..e4b458371 100644
--- a/pkg/util/provider/machinecontroller/controller.go
+++ b/pkg/util/provider/machinecontroller/controller.go
@@ -135,7 +135,6 @@ func NewController(
controller.pvcLister = pvcInformer.Lister()
controller.pvLister = pvInformer.Lister()
controller.secretLister = secretInformer.Lister()
- // TODO: Need to handle K8s versions below 1.13 differently
controller.volumeAttachementLister = volumeAttachmentInformer.Lister()
controller.machineClassLister = machineClassInformer.Lister()
controller.nodeLister = nodeInformer.Lister()
@@ -158,7 +157,7 @@ func NewController(
controller.pdbV1beta1Synced = pdbV1beta1Informer.Informer().HasSynced
}
- // Secret Controller Informers
+ // Secret Controller's Informers
secretInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: controller.secretAdd,
DeleteFunc: controller.secretDelete,
@@ -170,10 +169,9 @@ func NewController(
DeleteFunc: controller.machineClassToSecretDelete,
})
- // Machine Class Controller Informers
+ // Machine Class Controller's Informers
machineInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: controller.machineToMachineClassAdd,
- UpdateFunc: controller.machineToMachineClassUpdate,
DeleteFunc: controller.machineToMachineClassDelete,
})
@@ -183,7 +181,7 @@ func NewController(
DeleteFunc: controller.machineClassDelete,
})
- // Machine Controller Informers
+ // Machine Controller's Informers
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: controller.addNodeToMachine,
UpdateFunc: controller.updateNodeToMachine,
@@ -196,7 +194,7 @@ func NewController(
DeleteFunc: controller.deleteMachine,
})
- // MachineSafety Controller Informers
+ // MachineSafety Controller's Informers
// We follow the kubernetes way of reconciling the safety controller
// done by adding empty key objects. We initialize it, to trigger
// running of different safety loop on MCM startup.
@@ -209,7 +207,7 @@ func NewController(
DeleteFunc: controller.deleteMachineToSafety,
})
- // Drain Controller Informers
+ // Drain Controller's Informers
if k8sutils.IsResourceSupported(
targetCoreClient,
schema.GroupResource{
diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go
index dbd95b480..e28c9e9b4 100644
--- a/pkg/util/provider/machinecontroller/machine.go
+++ b/pkg/util/provider/machinecontroller/machine.go
@@ -47,41 +47,50 @@ SECTION
Machine controller - Machine add, update, delete watches
*/
func (c *controller) addMachine(obj interface{}) {
- klog.V(5).Infof("Adding machine object")
- c.enqueueMachine(obj)
+ c.enqueueMachine(obj, "handling machine obj ADD event")
}
func (c *controller) updateMachine(oldObj, newObj interface{}) {
- klog.V(5).Info("Updating machine object")
- c.enqueueMachine(newObj)
+ oldMachine, ok := oldObj.(*v1alpha1.Machine)
+ if !ok || oldMachine == nil {
+ klog.Errorf("couldn't convert to machine resource from object %+v", oldObj)
+ return
+ }
+ newMachine, ok := newObj.(*v1alpha1.Machine)
+ if !ok || newMachine == nil {
+ klog.Errorf("couldn't convert to machine resource from object %+v", newObj)
+ return
+ }
+
+ if oldMachine.Generation == newMachine.Generation {
+ klog.V(3).Infof("Skipping non-spec updates for machine %s", oldMachine.Name)
+ return
+ }
+
+ c.enqueueMachine(newObj, "handling machine object UPDATE event")
}
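Note: the generation check above relies on the API server bumping metadata.generation only for spec changes (assuming the Machine CRD enables the status subresource), so status-only writes no longer re-enqueue the machine. A small illustration:

    package main

    import (
        "fmt"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

        "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
    )

    func main() {
        oldM := &v1alpha1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m-0", Generation: 2}}
        newM := oldM.DeepCopy()
        newM.Status.CurrentStatus.Phase = v1alpha1.MachineRunning // status-only change

        fmt.Println(oldM.Generation == newM.Generation) // true → UPDATE event skipped
    }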
func (c *controller) deleteMachine(obj interface{}) {
- klog.V(5).Info("Deleting machine object")
- c.enqueueMachine(obj)
+ c.enqueueMachine(obj, "handling machine object DELETE event")
}
-// isToBeEnqueued returns true if the key is to be managed by this controller
-func (c *controller) isToBeEnqueued(obj interface{}) (bool, string) {
+// getKeyForObj returns the key for the given object, or false if the key could not be derived
+func (c *controller) getKeyForObj(obj interface{}) (string, bool) {
key, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
- return false, ""
+ return "", false
}
-
- return true, key
+ return key, true
}
-func (c *controller) enqueueMachine(obj interface{}) {
- if toBeEnqueued, key := c.isToBeEnqueued(obj); toBeEnqueued {
- klog.V(5).Infof("Adding machine object to the queue %q", key)
+func (c *controller) enqueueMachine(obj interface{}, reason string) {
+ if key, ok := c.getKeyForObj(obj); ok {
+ klog.V(3).Infof("Adding machine object to queue %q, reason: %s", key, reason)
c.machineQueue.Add(key)
}
}
-func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration) {
- if toBeEnqueued, key := c.isToBeEnqueued(obj); toBeEnqueued {
- klog.V(5).Infof("Adding machine object to the queue %q after %s", key, after)
+func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration, reason string) {
+ if key, ok := c.getKeyForObj(obj); ok {
+ klog.V(3).Infof("Adding machine object to queue %q after %s, reason: %s", key, after, reason)
c.machineQueue.AddAfter(key, after)
}
}
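Note: getKeyForObj delegates to cache.MetaNamespaceKeyFunc, which produces the namespace/name workqueue key logged by the enqueue helpers above. A runnable illustration (object names are hypothetical):

    package main

    import (
        "fmt"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/client-go/tools/cache"

        "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
    )

    func main() {
        machine := &v1alpha1.Machine{
            ObjectMeta: metav1.ObjectMeta{Namespace: "shoot--demo", Name: "machine-0"},
        }
        key, err := cache.MetaNamespaceKeyFunc(machine)
        if err != nil {
            panic(err)
        }
        fmt.Println(key) // "shoot--demo/machine-0" — the key added to machineQueue
    }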
@@ -105,16 +114,19 @@ func (c *controller) reconcileClusterMachineKey(key string) error {
}
retryPeriod, err := c.reconcileClusterMachine(ctx, machine)
- klog.V(5).Info(err, retryPeriod)
- c.enqueueMachineAfter(machine, time.Duration(retryPeriod))
+ var reEnqueueReason = "periodic reconcile"
+ if err != nil {
+ reEnqueueReason = err.Error()
+ }
+ c.enqueueMachineAfter(machine, time.Duration(retryPeriod), reEnqueueReason)
return nil
}
func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alpha1.Machine) (machineutils.RetryPeriod, error) {
- klog.V(5).Infof("Start Reconciling machine: %q , nodeName: %q ,providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine))
- defer klog.V(5).Infof("Stop Reconciling machine %q, nodeName: %q ,providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine))
+ klog.V(2).Infof("reconcileClusterMachine: Start for %q with phase:%q, description:%q", machine.Name, machine.Status.CurrentStatus.Phase, machine.Status.LastOperation.Description)
+ defer klog.V(2).Infof("reconcileClusterMachine: Stop for %q", machine.Name)
if c.safetyOptions.MachineControllerFrozen && machine.DeletionTimestamp == nil {
// If Machine controller is frozen and
@@ -132,7 +144,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp
validationerr := validation.ValidateMachine(internalMachine)
if validationerr.ToAggregate() != nil && len(validationerr.ToAggregate().Errors()) > 0 {
- err := fmt.Errorf("Validation of Machine failed %s", validationerr.ToAggregate().Error())
+ err := fmt.Errorf("validation of Machine failed %s", validationerr.ToAggregate().Error())
klog.Error(err)
return machineutils.LongRetry, err
}
@@ -140,7 +152,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp
// Validate MachineClass
machineClass, secretData, retry, err := c.ValidateMachineClass(ctx, &machine.Spec.Class)
if err != nil {
- klog.Error(err)
+ klog.Errorf("cannot reconcile machine %s: %s", machine.Name, err)
return retry, err
}
@@ -193,8 +205,8 @@ SECTION
Machine controller - nodeToMachine
*/
var (
- errMultipleMachineMatch = errors.New("Multiple machines matching node")
- errNoMachineMatch = errors.New("No machines matching node found")
+ errMultipleMachineMatch = errors.New("multiple machines matching node")
+ errNoMachineMatch = errors.New("no machines matching node found")
)
func (c *controller) addNodeToMachine(obj interface{}) {
@@ -226,31 +238,34 @@ func (c *controller) addNodeToMachine(obj interface{}) {
return
}
- if machine.Status.CurrentStatus.Phase != v1alpha1.MachineCrashLoopBackOff && nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) {
- klog.V(5).Infof("Enqueue machine object %q as conditions of backing node %q have changed", machine.Name, getNodeName(machine))
- c.enqueueMachine(machine)
+ isMachineCrashLooping := machine.Status.CurrentStatus.Phase == v1alpha1.MachineCrashLoopBackOff
+ isMachineTerminating := machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating
+ _, _, conditionsChanged := nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions)
+
+ if !isMachineCrashLooping && !isMachineTerminating && conditionsChanged {
+ c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. conditions of backing node %q have changed", getNodeName(machine)))
}
}
func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) {
+ oldNode := oldObj.(*corev1.Node)
node := newObj.(*corev1.Node)
if node == nil {
klog.Errorf("Couldn't convert to node from object")
return
}
+ machine, err := c.getMachineFromNode(node.Name)
+ if err != nil {
+ klog.Errorf("Unable to handle update event for node %s, couldn't fetch machine %s, Error: %s", machine.Name, err)
+ return
+ }
+
// check for the TriggerDeletionByMCM annotation on the node object
// if it is present then mark the machine object for deletion
if value, ok := node.Annotations[machineutils.TriggerDeletionByMCM]; ok && value == "true" {
-
- machine, err := c.getMachineFromNode(node.Name)
- if err != nil {
- klog.Errorf("Couldn't fetch machine %s, Error: %s", machine.Name, err)
- return
- }
-
if machine.DeletionTimestamp == nil {
- klog.Infof("Node %s is annotated to trigger deletion by MCM.", node.Name)
+ klog.Infof("Node %s for machine %s is annotated to trigger deletion by MCM.", node.Name, machine.Name)
if err := c.controlMachineClient.Machines(c.namespace).Delete(context.Background(), machine.Name, metav1.DeleteOptions{}); err != nil {
klog.Errorf("Machine object %s backing the node %s could not be marked for deletion.", machine.Name, node.Name)
return
@@ -259,6 +274,12 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) {
}
}
+ // reconcile on addition/removal of essential taints in the machine lifecycle, e.g. the critical-components-not-ready taint
+ if addedOrRemovedEssentialTaints(oldNode, node, machineutils.EssentialTaints) {
+ c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. at least one essential taint on backing node %q has changed", getNodeName(machine)))
+ return
+ }
+
c.addNodeToMachine(newObj)
}
@@ -275,7 +296,10 @@ func (c *controller) deleteNodeToMachine(obj interface{}) {
return
}
- c.enqueueMachine(machine)
+ // do not respond if the machine obj is already in the termination flow
+ if machine.DeletionTimestamp == nil {
+ c.enqueueMachine(machine, fmt.Sprintf("backing node obj %q got deleted", key))
+ }
}
/*
@@ -302,9 +326,32 @@ func (c *controller) getMachineFromNode(nodeName string) (*v1alpha1.Machine, err
return machines[0], nil
}
+func addedOrRemovedEssentialTaints(oldNode, node *corev1.Node, taintKeys []string) bool {
+ mapOldNodeTaintKeys := make(map[string]bool)
+ mapNodeTaintKeys := make(map[string]bool)
+
+ for _, t := range oldNode.Spec.Taints {
+ mapOldNodeTaintKeys[t.Key] = true
+ }
+
+ for _, t := range node.Spec.Taints {
+ mapNodeTaintKeys[t.Key] = true
+ }
+
+ for _, tk := range taintKeys {
+ _, oldNodeHasTaint := mapOldNodeTaintKeys[tk]
+ _, newNodeHasTaint := mapNodeTaintKeys[tk]
+ if oldNodeHasTaint != newNodeHasTaint {
+ klog.V(2).Infof("Taint with key %q has been added/removed from the node %q", tk, node.Name)
+ return true
+ }
+ }
+ return false
+}
+
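Note: a test-style sketch of the new helper (assuming package controller, like the rest of this file): a taint key present on exactly one side of the update is reported as a change.

    package controller

    import (
        "testing"

        corev1 "k8s.io/api/core/v1"

        "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
    )

    func TestEssentialTaintToggle(t *testing.T) {
        oldNode := &corev1.Node{Spec: corev1.NodeSpec{Taints: []corev1.Taint{
            {Key: machineutils.TaintNodeCriticalComponentsNotReady, Effect: corev1.TaintEffectNoSchedule},
        }}}
        newNode := &corev1.Node{Spec: corev1.NodeSpec{}} // taint removed

        if !addedOrRemovedEssentialTaints(oldNode, newNode, machineutils.EssentialTaints) {
            t.Fatal("expected essential-taint removal to be detected")
        }
    }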
/*
SECTION
- Machine operations - Create, Update, Delete
+ Machine operations - Create, Delete
*/
func (c *controller) triggerCreationFlow(ctx context.Context, createMachineRequest *driver.CreateMachineRequest) (machineutils.RetryPeriod, error) {
@@ -397,7 +444,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
}
// machine obj marked Failed for double security
- c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
machine,
v1alpha1.LastOperation{
@@ -413,6 +460,10 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
machine.Status.LastKnownState,
)
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
+
klog.V(2).Infof("Machine %q marked Failed as VM was referring to a stale node object", machine.Name)
return machineutils.ShortRetry, err
}
@@ -424,7 +475,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable:
// GetMachineStatus() returned with one of the above error codes.
// Retry operation.
- c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
machine,
v1alpha1.LastOperation{
@@ -439,11 +490,14 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
},
machine.Status.LastKnownState,
)
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
return machineutils.ShortRetry, err
default:
- c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
machine,
v1alpha1.LastOperation{
@@ -458,6 +512,9 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
},
machine.Status.LastKnownState,
)
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
return machineutils.MediumRetry, err
}
@@ -528,44 +585,6 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
return machineutils.LongRetry, nil
}
-func (c *controller) triggerUpdationFlow(ctx context.Context, machine *v1alpha1.Machine, actualProviderID string) (machineutils.RetryPeriod, error) {
- klog.V(2).Infof("Setting ProviderID of machine %s with backing node %s to %s", machine.Name, getNodeName(machine), actualProviderID)
-
- for {
- machine, err := c.controlMachineClient.Machines(machine.Namespace).Get(ctx, machine.Name, metav1.GetOptions{})
- if err != nil {
- klog.Warningf("Machine GET failed. Retrying, error: %s", err)
- continue
- }
-
- clone := machine.DeepCopy()
- clone.Spec.ProviderID = actualProviderID
- machine, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{})
- if err != nil {
- klog.Warningf("Machine UPDATE failed. Retrying, error: %s", err)
- continue
- }
-
- clone = machine.DeepCopy()
- lastOperation := v1alpha1.LastOperation{
- Description: "Updated provider ID",
- State: v1alpha1.MachineStateSuccessful,
- Type: v1alpha1.MachineOperationUpdate,
- LastUpdateTime: metav1.Now(),
- }
- clone.Status.LastOperation = lastOperation
- _, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{})
- if err != nil {
- klog.Warningf("Machine/status UPDATE failed. Retrying, error: %s", err)
- continue
- }
- // Update went through, exit out of infinite loop
- break
- }
-
- return machineutils.LongRetry, nil
-}
-
func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) {
var (
machine = deleteMachineRequest.Machine
diff --git a/pkg/util/provider/machinecontroller/machine_safety.go b/pkg/util/provider/machinecontroller/machine_safety.go
index 23ce2e132..44b99d286 100644
--- a/pkg/util/provider/machinecontroller/machine_safety.go
+++ b/pkg/util/provider/machinecontroller/machine_safety.go
@@ -107,8 +107,9 @@ func (c *controller) reconcileClusterMachineSafetyAPIServer(key string) error {
klog.V(2).Infof("SafetyController: Reinitializing machine health check for machine: %q with backing node: %q and providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine))
}
- // En-queue after 30 seconds, to ensure all machine states are reconciled
- c.enqueueMachineAfter(machine, 30*time.Second)
+ // Enqueue after 30 seconds to ensure all machine phases are reconciled to their actual value,
+ // as they are currently reset to `Running`
+ c.enqueueMachineAfter(machine, 30*time.Second, "kube-apiservers are up again, so machine phase reconciliation is needed")
}
c.safetyOptions.MachineControllerFrozen = false
diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go
index 98707bc5d..9bfd4bf81 100644
--- a/pkg/util/provider/machinecontroller/machine_test.go
+++ b/pkg/util/provider/machinecontroller/machine_test.go
@@ -781,7 +781,7 @@ var _ = Describe("machine", func() {
ErrorCode: codes.Internal.String(),
},
}, nil, nil, nil, true, metav1.Now()),
- err: nil,
+ err: status.Error(codes.Internal, "Provider is returning error on create call"),
retry: machineutils.MediumRetry,
},
}),
@@ -833,7 +833,7 @@ var _ = Describe("machine", func() {
ErrorCode: codes.ResourceExhausted.String(),
},
}, nil, nil, nil, true, metav1.Now()),
- err: nil,
+ err: status.Error(codes.ResourceExhausted, "Provider does not have capacity to create VM"),
retry: machineutils.MediumRetry,
},
}),
@@ -885,7 +885,7 @@ var _ = Describe("machine", func() {
ErrorCode: codes.Internal.String(),
},
}, nil, nil, nil, true, metav1.NewTime(metav1.Now().Add(-time.Hour))),
- err: nil,
+ err: status.Error(codes.Internal, "Provider is returning error on create call"),
retry: machineutils.MediumRetry,
},
}),
@@ -2794,7 +2794,7 @@ var _ = Describe("machine", func() {
},
},
expect: expect{
- err: fmt.Errorf("Machine deletion in process. Deletion of node object was succesful"),
+ err: fmt.Errorf("Machine deletion in process. Deletion of node object was successful"),
retry: machineutils.ShortRetry,
nodeDeleted: true,
machine: newMachine(
diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go
index a9b75e750..67919c5c8 100644
--- a/pkg/util/provider/machinecontroller/machine_util.go
+++ b/pkg/util/provider/machinecontroller/machine_util.go
@@ -60,6 +60,10 @@ import (
// emptyMap is a dummy emptyMap to compare with
var emptyMap = make(map[string]string)
+var (
+ errSuccessfulALTsync = errors.New("machine ALTs have been reconciled")
+ errSuccessfulPhaseUpdate = errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED")
+)
const (
maxReplacements = 1
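Note: the new sentinel errors let callers and tests compare by identity instead of matching message strings, which is what the machine_util_test.go changes below rely on. A minimal illustration:

    package main

    import (
        "errors"
        "fmt"
    )

    var errSuccessfulALTsync = errors.New("machine ALTs have been reconciled")

    func main() {
        err := fmt.Errorf("reconcile result: %w", errSuccessfulALTsync)
        fmt.Println(errors.Is(err, errSuccessfulALTsync)) // true — identity survives wrapping
        fmt.Println(err == errSuccessfulALTsync)          // false — plain equality does not
    }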
@@ -68,43 +72,6 @@ const (
cacheUpdateTimeout = 1 * time.Second
)
-// TODO: use client library instead when it starts to support update retries
-//
-// see https://github.com/kubernetes/kubernetes/issues/21479
-// type updateMachineFunc func(machine *v1alpha1.Machine) error
-
-/*
-// UpdateMachineWithRetries updates a machine with given applyUpdate function. Note that machine not found error is ignored.
-// The returned bool value can be used to tell if the machine is actually updated.
-func UpdateMachineWithRetries(machineClient v1alpha1client.MachineInterface, machineLister v1alpha1listers.MachineLister, namespace, name string, applyUpdate updateMachineFunc) (*v1alpha1.Machine, error) {
- var machine *v1alpha1.Machine
-
- retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
- var err error
- machine, err = machineLister.Machines(namespace).Get(name)
- if err != nil {
- return err
- }
- machine = machine.DeepCopy()
- // Apply the update, then attempt to push it to the apiserver.
- if applyErr := applyUpdate(machine); applyErr != nil {
- return applyErr
- }
- machine, err = machineClient.Update(machine)
- return err
- })
-
- // Ignore the precondition violated error, this machine is already updated
- // with the desired label.
- if retryErr == errorsutil.ErrPreconditionViolated {
- klog.V(4).Infof("Machine %s precondition doesn't hold, skip updating it.", name)
- retryErr = nil
- }
-
- return machine, retryErr
-}
-*/
-
// ValidateMachineClass validates the machine class.
func (c *controller) ValidateMachineClass(_ context.Context, classSpec *v1alpha1.ClassSpec) (*v1alpha1.MachineClass, map[string][]byte, machineutils.RetryPeriod, error) {
var (
@@ -137,7 +104,6 @@ func (c *controller) ValidateMachineClass(_ context.Context, classSpec *v1alpha1
errMessage := fmt.Sprintf("The machine class %s has no finalizers set. So not reconciling the machine.", machineClass.Name)
err := errors.New(errMessage)
- klog.Warning(errMessage)
return nil, nil, machineutils.ShortRetry, err
}
@@ -219,19 +185,38 @@ func (c *controller) getSecret(ref *v1.SecretReference, MachineClassName string)
return secretRef, err
}
-// nodeConditionsHaveChanged compares two node statuses to see if any of the statuses have changed
-func nodeConditionsHaveChanged(machineConditions []v1.NodeCondition, nodeConditions []v1.NodeCondition) bool {
- if len(machineConditions) != len(nodeConditions) {
- return true
+// nodeConditionsHaveChanged compares old and new node status.conditions, returning the added/updated conditions, the removed conditions, and whether any change occurred
+func nodeConditionsHaveChanged(oldConditions []v1.NodeCondition, newConditions []v1.NodeCondition) ([]v1.NodeCondition, []v1.NodeCondition, bool) {
+ var (
+ oldConditionsByType = make(map[v1.NodeConditionType]v1.NodeCondition, len(oldConditions))
+ newConditionsByType = make(map[v1.NodeConditionType]v1.NodeCondition, len(newConditions))
+ addedOrUpdatedConditions = make([]v1.NodeCondition, 0, len(newConditions))
+ removedConditions = make([]v1.NodeCondition, 0, len(oldConditions))
+ )
+
+ for _, c := range oldConditions {
+ oldConditionsByType[c.Type] = c
+ }
+ for _, c := range newConditions {
+ newConditionsByType[c.Type] = c
}
- for i := range nodeConditions {
- if nodeConditions[i].Status != machineConditions[i].Status {
- return true
+ // check for any added or updated condition
+ for _, c := range newConditions {
+ oldC, exists := oldConditionsByType[c.Type]
+ if !exists || (oldC.Status != c.Status) {
+ addedOrUpdatedConditions = append(addedOrUpdatedConditions, c)
}
}
- return false
+ // checking for any deleted condition
+ for _, c := range oldConditions {
+ if _, exists := newConditionsByType[c.Type]; !exists {
+ removedConditions = append(removedConditions, c)
+ }
+ }
+
+ return addedOrUpdatedConditions, removedConditions, len(addedOrUpdatedConditions) != 0 || len(removedConditions) != 0
}
func mergeDataMaps(in map[string][]byte, maps ...map[string][]byte) map[string][]byte {
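Note: the removed implementation compared conditions index-by-index, so merely reordering an otherwise identical condition list registered as a change; the rewrite keys conditions by Type and additionally reports what was added/updated or removed. An illustration of the old false positive:

    package main

    import (
        "fmt"

        v1 "k8s.io/api/core/v1"
    )

    func main() {
        // Identical condition sets, different ordering — semantically unchanged.
        oldConds := []v1.NodeCondition{
            {Type: v1.NodeReady, Status: v1.ConditionTrue},
            {Type: v1.NodeMemoryPressure, Status: v1.ConditionFalse},
        }
        newConds := []v1.NodeCondition{
            {Type: v1.NodeMemoryPressure, Status: v1.ConditionFalse},
            {Type: v1.NodeReady, Status: v1.ConditionTrue},
        }
        // The old index-wise comparison saw a "change" here; the keyed
        // comparison by Type correctly reports none.
        fmt.Println(oldConds[0].Status != newConds[0].Status) // true — the old false positive
    }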
@@ -314,10 +299,13 @@ func (c *controller) syncMachineNodeTemplates(ctx context.Context, machine *v1al
// Keep retrying until update goes through
klog.Errorf("Updated failed for node object of machine %q. Retrying, error: %q", machine.Name, err)
} else {
- // Return error even when machine object is updated
- err = fmt.Errorf("Machine ALTs have been reconciled")
+ // Return a sentinel error so reconciliation continues in the next cycle
+ err = errSuccessfulALTsync
}
+ if apierrors.IsConflict(err) {
+ return machineutils.ConflictRetry, err
+ }
return machineutils.ShortRetry, err
}
@@ -494,7 +482,7 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a
lastKnownState = createMachineResponse.LastKnownState
}
- c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
machine,
v1alpha1.LastOperation{
@@ -511,7 +499,11 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a
lastKnownState,
)
- return retryRequired, nil
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
+
+ return retryRequired, err
}
func (c *controller) machineStatusUpdate(
@@ -520,7 +512,7 @@ func (c *controller) machineStatusUpdate(
lastOperation v1alpha1.LastOperation,
currentStatus v1alpha1.CurrentStatus,
lastKnownState string,
-) error {
+) (machineutils.RetryPeriod, error) {
clone := machine.DeepCopy()
clone.Status.LastOperation = lastOperation
clone.Status.CurrentStatus = currentStatus
@@ -528,7 +520,7 @@ func (c *controller) machineStatusUpdate(
if isMachineStatusSimilar(clone.Status, machine.Status) {
klog.V(3).Infof("Not updating the status of the machine object %q, as the content is similar", clone.Name)
- return nil
+ return machineutils.ShortRetry, nil
}
_, err := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{})
@@ -539,7 +531,11 @@ func (c *controller) machineStatusUpdate(
klog.V(2).Infof("Machine/status UPDATE for %q", machine.Name)
}
- return err
+ if apierrors.IsConflict(err) {
+ return machineutils.ConflictRetry, err
+ }
+
+ return machineutils.ShortRetry, err
}
// isMachineStatusSimilar checks if the status of 2 machines is similar or not.
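Note: machineStatusUpdate now surfaces a RetryPeriod, mapping optimistic-concurrency conflicts to the faster ConflictRetry. A sketch of the error class apierrors.IsConflict matches (group/resource/name are illustrative):

    package main

    import (
        "fmt"

        apierrors "k8s.io/apimachinery/pkg/api/errors"
        "k8s.io/apimachinery/pkg/runtime/schema"
    )

    func main() {
        gr := schema.GroupResource{Group: "machine.sapcloud.io", Resource: "machines"}
        err := apierrors.NewConflict(gr, "machine-0", fmt.Errorf("object was modified"))

        if apierrors.IsConflict(err) {
            fmt.Println("retry after ConflictRetry (200ms) instead of ShortRetry")
        }
    }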
@@ -574,7 +570,7 @@ func (c *controller) getCreateFailurePhase(machine *v1alpha1.Machine) v1alpha1.M
if timeOut > 0 {
// Machine creation timeout occured while joining of machine
// Machine set controller would replace this machine with a new one as phase is failed.
- klog.V(2).Infof("Machine %q , providerID %q and backing node %q couldn't join in creation timeout of %s. Changing phase to failed.", machine.Name, getProviderID(machine), getNodeName(machine), timeOutDuration)
+ klog.V(2).Infof("Machine %q , providerID %q and backing node %q couldn't join in creation timeout of %s. Changing phase to Failed.", machine.Name, getProviderID(machine), getNodeName(machine), timeOutDuration)
return v1alpha1.MachineFailed
}
@@ -601,14 +597,13 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
// and corresponding node object went missing
// and if machine object still reports healthy
description = fmt.Sprintf(
- "Node object went missing. Machine %s is unhealthy - changing MachineState to Unknown",
+ "Node object went missing. Machine %s is unhealthy - changing MachinePhase to Unknown",
machine.Name,
)
klog.Warning(description)
clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
- Phase: v1alpha1.MachineUnknown,
- // TimeoutActive: true,
+ Phase: v1alpha1.MachineUnknown,
LastUpdateTime: metav1.Now(),
}
clone.Status.LastOperation = v1alpha1.LastOperation{
@@ -625,9 +620,11 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
return machineutils.ShortRetry, err
}
} else {
- if nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) {
+ populatedConditions, removedConditions, isChanged := nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions)
+ if isChanged {
clone.Status.Conditions = node.Status.Conditions
- klog.V(3).Infof("Conditions of Machine %q with providerID %q and backing node %q are changing", machine.Name, getProviderID(machine), getNodeName(machine))
+
+ klog.V(3).Infof("Conditions of node %q backing machine %q with providerID %q have changed.\nAdded/Updated Conditions:\n\n%s\nRemoved Conditions:\n\n%s\n", getNodeName(machine), machine.Name, getProviderID(machine), getFormattedNodeConditions(populatedConditions), getFormattedNodeConditions(removedConditions))
cloneDirty = true
}
@@ -649,7 +646,7 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
description = fmt.Sprintf("Machine %s successfully re-joined the cluster", clone.Name)
lastOperationType = v1alpha1.MachineOperationHealthCheck
}
- klog.V(2).Info(description)
+ klog.V(2).Infof("%s with backing node %q and providerID %q", description, getNodeName(clone), getProviderID(clone))
// Machine is ready and has joined/re-joined the cluster
clone.Status.LastOperation = v1alpha1.LastOperation{
@@ -667,9 +664,9 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
}
} else {
if clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning {
- // If machine is not healthy, and current state is running,
- // change the machinePhase to unknown and activate health check timeout
- description = fmt.Sprintf("Machine %s is unhealthy - changing MachineState to Unknown. Node conditions: %+v", clone.Name, clone.Status.Conditions)
+ // If machine is not healthy, and current phase is Running,
+ // change the machinePhase to Unknown and activate health check timeout
+ description = fmt.Sprintf("Machine %s is unhealthy - changing MachinePhase to Unknown. Node conditions: %+v", clone.Name, clone.Status.Conditions)
klog.Warning(description)
clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
@@ -735,7 +732,7 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
} else {
// Timeout occurred due to machine being unhealthy for too long
description = fmt.Sprintf(
- "Machine %s is not healthy since %s minutes. Changing status to failed. Node Conditions: %+v",
+ "Machine %s health checks failing since last %s minutes. Updating machine phase to Failed. Node Conditions: %+v",
machine.Name,
timeOutDuration,
machine.Status.Conditions,
@@ -749,19 +746,23 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
} else {
// If timeout has not occurred, re-enqueue the machine
// after a specified sleep time
- c.enqueueMachineAfter(machine, sleepTime)
+ klog.V(4).Infof("Creation/Health Timeout hasn't occured yet , will re-enqueue after %s", time.Duration(sleepTime))
+ c.enqueueMachineAfter(machine, sleepTime, "re-check for creation/health timeout")
}
}
if cloneDirty {
_, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{})
if err != nil {
- // Keep retrying until update goes through
- klog.Errorf("Update failed for machine %q. Retrying, error: %q", machine.Name, err)
+ // Keep retrying across reconciles until update goes through
+ klog.Errorf("Update of Phase/Conditions failed for machine %q. Retrying, error: %q", machine.Name, err)
+ if apierrors.IsConflict(err) {
+ return machineutils.ConflictRetry, err
+ }
} else {
- klog.V(2).Infof("Machine State has been updated for %q with providerID %q and backing node %q", machine.Name, getProviderID(machine), getNodeName(machine))
- // Return error for continuing in next iteration
- err = fmt.Errorf("machine creation is successful. Machine State has been UPDATED")
+ klog.V(2).Infof("Machine Phase/Conditions have been updated for %q with providerID %q and are in sync with backing node %q", machine.Name, getProviderID(machine), getNodeName(machine))
+ // Return error to end the reconcile
+ err = errSuccessfulPhaseUpdate
}
return machineutils.ShortRetry, err
@@ -770,6 +771,18 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
return machineutils.LongRetry, nil
}
+func getFormattedNodeConditions(conditions []v1.NodeCondition) string {
+ var result string
+ if len(conditions) == 0 {
+ return "\n"
+ }
+
+ for _, c := range conditions {
+ result = fmt.Sprintf("%sType: %s | Status: %s | Reason: %s | Message: %s\n", result, c.Type, c.Status, c.Reason, c.Message)
+ }
+ return result
+}
+
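Note: sample output of the new formatting helper (local copy for illustration; the condition values are hypothetical):

    package main

    import (
        "fmt"

        v1 "k8s.io/api/core/v1"
    )

    func getFormattedNodeConditions(conditions []v1.NodeCondition) string {
        var result string
        if len(conditions) == 0 {
            return "\n"
        }
        for _, c := range conditions {
            result = fmt.Sprintf("%sType: %s | Status: %s | Reason: %s | Message: %s\n", result, c.Type, c.Status, c.Reason, c.Message)
        }
        return result
    }

    func main() {
        fmt.Print(getFormattedNodeConditions([]v1.NodeCondition{
            {Type: v1.NodeReady, Status: v1.ConditionFalse, Reason: "KubeletNotReady", Message: "PLEG is not healthy"},
        }))
        // Type: Ready | Status: False | Reason: KubeletNotReady | Message: PLEG is not healthy
    }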
/*
SECTION
Manipulate Finalizers
@@ -856,7 +869,11 @@ func criticalComponentsNotReadyTaintPresent(node *v1.Node) bool {
}
func isPendingMachineWithCriticalComponentsNotReadyTaint(clone *v1alpha1.Machine, node *v1.Node) bool {
- return clone.Status.CurrentStatus.Phase == v1alpha1.MachinePending && criticalComponentsNotReadyTaintPresent(node)
+ if clone.Status.CurrentStatus.Phase == v1alpha1.MachinePending && criticalComponentsNotReadyTaintPresent(node) {
+ klog.V(3).Infof("Critical component taint %q still present on node %q for machine %q", machineutils.TaintNodeCriticalComponentsNotReady, getNodeName(clone), clone.Name)
+ return true
+ }
+ return false
}
/*
@@ -888,6 +905,10 @@ func (c *controller) setMachineTerminationStatus(ctx context.Context, deleteMach
// Return error even when machine object is updated to ensure reconcilation is restarted
err = fmt.Errorf("Machine deletion in process. Phase set to termination")
}
+
+ if apierrors.IsConflict(err) {
+ return machineutils.ConflictRetry, err
+ }
return machineutils.ShortRetry, err
}
@@ -948,7 +969,7 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d
}
}
- c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
getMachineStatusRequest.Machine,
v1alpha1.LastOperation{
@@ -964,6 +985,10 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d
getMachineStatusRequest.Machine.Status.LastKnownState,
)
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
+
return retry, err
}
@@ -1076,7 +1101,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
)
}
- // update node with the machine's state prior to termination
+ // update node with the machine's phase prior to termination
if err = c.UpdateNodeTerminationCondition(ctx, machine); err != nil {
if forceDeleteMachine {
klog.Warningf("Failed to update node conditions: %v. However, since it's a force deletion shall continue deletion of VM.", err)
@@ -1147,7 +1172,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
}
}
- c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
machine,
v1alpha1.LastOperation{
@@ -1163,6 +1188,10 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
machine.Status.LastKnownState,
)
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
+
return machineutils.ShortRetry, err
}
@@ -1211,7 +1240,7 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine
}
now := metav1.Now()
klog.V(4).Infof("(deleteVolumeAttachmentsForNode) For node %q, machine %q, set LastOperation.Description: %q", nodeName, machine.Name, description)
- err = c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
machine,
v1alpha1.LastOperation{
@@ -1223,6 +1252,11 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine
machine.Status.CurrentStatus,
machine.Status.LastKnownState,
)
+
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
+
return retryPeriod, err
}
@@ -1274,7 +1308,7 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver.
lastKnownState = deleteMachineResponse.LastKnownState
}
- c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
machine,
v1alpha1.LastOperation{
@@ -1290,6 +1324,10 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver.
lastKnownState,
)
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
+
return retryRequired, err
}
@@ -1307,14 +1345,14 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac
// Delete node object
err = c.targetCoreClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{})
if err != nil && !apierrors.IsNotFound(err) {
- // If its an error, and anyother error than object not found
+ // If it's any error other than object not-found
description = fmt.Sprintf("Deletion of Node Object %q failed due to error: %s. %s", nodeName, err, machineutils.InitiateNodeDeletion)
state = v1alpha1.MachineStateFailed
} else if err == nil {
description = fmt.Sprintf("Deletion of Node Object %q is successful. %s", nodeName, machineutils.InitiateFinalizerRemoval)
state = v1alpha1.MachineStateProcessing
- err = fmt.Errorf("Machine deletion in process. Deletion of node object was succesful")
+ err = fmt.Errorf("Machine deletion in process. Deletion of node object was successful")
} else {
description = fmt.Sprintf("No node object found for %q, continuing deletion flow. %s", nodeName, machineutils.InitiateFinalizerRemoval)
state = v1alpha1.MachineStateProcessing
@@ -1326,7 +1364,7 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac
err = fmt.Errorf("Machine deletion in process. No node object found")
}
- c.machineStatusUpdate(
+ updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
machine,
v1alpha1.LastOperation{
@@ -1342,6 +1380,10 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac
machine.Status.LastKnownState,
)
+ if updateErr != nil {
+ return updateRetryPeriod, updateErr
+ }
+
return machineutils.ShortRetry, err
}
diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go
index cadd5b550..2e83bfed2 100644
--- a/pkg/util/provider/machinecontroller/machine_util_test.go
+++ b/pkg/util/provider/machinecontroller/machine_util_test.go
@@ -196,7 +196,7 @@ var _ = Describe("machine_util", func() {
},
},
},
- err: fmt.Errorf("Machine ALTs have been reconciled"),
+ err: errSuccessfulALTsync,
},
}),
@@ -276,7 +276,7 @@ var _ = Describe("machine_util", func() {
},
},
},
- err: fmt.Errorf("Machine ALTs have been reconciled"),
+ err: errSuccessfulALTsync,
},
}),
@@ -422,7 +422,7 @@ var _ = Describe("machine_util", func() {
},
},
},
- err: fmt.Errorf("Machine ALTs have been reconciled"),
+ err: errSuccessfulALTsync,
},
}),
)
@@ -2177,7 +2177,7 @@ var _ = Describe("machine_util", func() {
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
- err: errors.New("machine creation is successful. Machine State has been UPDATED"),
+ err: errSuccessfulPhaseUpdate,
expectedPhase: v1alpha1.MachineFailed,
},
}),
@@ -2196,7 +2196,7 @@ var _ = Describe("machine_util", func() {
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
- err: errors.New("machine creation is successful. Machine State has been UPDATED"),
+ err: errSuccessfulPhaseUpdate,
expectedPhase: v1alpha1.MachineRunning,
},
}),
@@ -2215,7 +2215,7 @@ var _ = Describe("machine_util", func() {
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
- err: errors.New("machine creation is successful. Machine State has been UPDATED"),
+ err: errSuccessfulPhaseUpdate,
expectedPhase: v1alpha1.MachineUnknown,
},
}),
@@ -2234,7 +2234,7 @@ var _ = Describe("machine_util", func() {
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
- err: errors.New("machine creation is successful. Machine State has been UPDATED"),
+ err: errSuccessfulPhaseUpdate,
expectedPhase: v1alpha1.MachineUnknown,
},
}),
@@ -2250,7 +2250,7 @@ var _ = Describe("machine_util", func() {
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
- err: errors.New("machine creation is successful. Machine State has been UPDATED"),
+ err: errSuccessfulPhaseUpdate,
expectedPhase: v1alpha1.MachineUnknown,
},
}),
@@ -2363,7 +2363,7 @@ var _ = Describe("machine_util", func() {
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
- err: errors.New("machine creation is successful. Machine State has been UPDATED"),
+ err: errSuccessfulPhaseUpdate,
expectedPhase: v1alpha1.MachineRunning,
},
}),
@@ -2390,7 +2390,7 @@ var _ = Describe("machine_util", func() {
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
- err: errors.New("machine creation is successful. Machine State has been UPDATED"),
+ err: errSuccessfulPhaseUpdate,
expectedPhase: v1alpha1.MachineRunning,
},
}),
@@ -2424,7 +2424,7 @@ var _ = Describe("machine_util", func() {
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
- err: errors.New("machine creation is successful. Machine State has been UPDATED"),
+ err: errSuccessfulPhaseUpdate,
expectedPhase: v1alpha1.MachineRunning,
},
}),
diff --git a/pkg/util/provider/machinecontroller/machineclass.go b/pkg/util/provider/machinecontroller/machineclass.go
index 6ba849975..91a1ab460 100644
--- a/pkg/util/provider/machinecontroller/machineclass.go
+++ b/pkg/util/provider/machinecontroller/machineclass.go
@@ -45,36 +45,6 @@ func (c *controller) machineToMachineClassAdd(obj interface{}) {
}
}
-func (c *controller) machineToMachineClassUpdate(oldObj, newObj interface{}) {
- oldMachine, ok := oldObj.(*v1alpha1.Machine)
- if oldMachine == nil || !ok {
- klog.Warningf("Couldn't get machine from object: %+v", oldObj)
- return
- }
- newMachine, ok := newObj.(*v1alpha1.Machine)
- if newMachine == nil || !ok {
- klog.Warningf("Couldn't get machine from object: %+v", newObj)
- return
- }
-
- if oldMachine.Spec.Class.Kind == newMachine.Spec.Class.Kind {
- if newMachine.Spec.Class.Kind == machineutils.MachineClassKind {
- // Both old and new machine refer to the same machineClass object
- // And the correct kind so enqueuing only one of them.
- c.machineClassQueue.Add(newMachine.Spec.Class.Name)
- }
- } else {
- // If both are pointing to different machineClasses
- // we might have to enqueue both.
- if oldMachine.Spec.Class.Kind == machineutils.MachineClassKind {
- c.machineClassQueue.Add(oldMachine.Spec.Class.Name)
- }
- if newMachine.Spec.Class.Kind == machineutils.MachineClassKind {
- c.machineClassQueue.Add(newMachine.Spec.Class.Name)
- }
- }
-}
-
func (c *controller) machineToMachineClassDelete(obj interface{}) {
c.machineToMachineClassAdd(obj)
}
@@ -126,12 +96,12 @@ func (c *controller) reconcileClusterMachineClassKey(key string) error {
err = c.reconcileClusterMachineClass(ctx, class)
if err != nil {
- // Re-enqueue after a 10s window
- c.enqueueMachineClassAfter(class, 10*time.Second)
+ // Re-enqueue after a ShortRetry window
+ c.enqueueMachineClassAfter(class, time.Duration(machineutils.ShortRetry))
} else {
// Re-enqueue periodically to avoid missing of events
// TODO: Get ride of this logic
- c.enqueueMachineClassAfter(class, 10*time.Minute)
+ c.enqueueMachineClassAfter(class, time.Duration(machineutils.LongRetry))
}
return nil
@@ -159,7 +129,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1
if finalizers := sets.NewString(class.Finalizers...); !finalizers.Has(MCMFinalizerName) {
// Add machineClassFinalizer as if doesn't exist
- err = c.addMachineClassFinalizers(ctx, class)
+ err = c.addMCMFinalizerToMachineClass(ctx, class)
if err != nil {
return err
}
@@ -167,7 +137,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1
// Enqueue all machines once finalizer is added to machineClass
// This is to allow processing of such machines
for _, machine := range machines {
- c.enqueueMachine(machine)
+ c.enqueueMachine(machine, "finalizer placed on machineClass")
}
}
@@ -187,7 +157,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1
if finalizers := sets.NewString(class.Finalizers...); finalizers.Has(MCMFinalizerName) {
// Delete finalizer if exists on machineClass
- return c.deleteMachineClassFinalizers(ctx, class)
+ return c.deleteMCMFinalizerFromMachineClass(ctx, class)
}
return nil
@@ -198,19 +168,19 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1
Manipulate Finalizers
*/
-func (c *controller) addMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error {
+func (c *controller) addMCMFinalizerToMachineClass(ctx context.Context, class *v1alpha1.MachineClass) error {
finalizers := sets.NewString(class.Finalizers...)
finalizers.Insert(MCMFinalizerName)
- return c.updateMachineClassFinalizers(ctx, class, finalizers.List())
+ return c.updateMachineClassFinalizers(ctx, class, finalizers.List(), true)
}
-func (c *controller) deleteMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error {
+func (c *controller) deleteMCMFinalizerFromMachineClass(ctx context.Context, class *v1alpha1.MachineClass) error {
finalizers := sets.NewString(class.Finalizers...)
finalizers.Delete(MCMFinalizerName)
- return c.updateMachineClassFinalizers(ctx, class, finalizers.List())
+ return c.updateMachineClassFinalizers(ctx, class, finalizers.List(), false)
}
-func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass, finalizers []string) error {
+func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass, finalizers []string, addFinalizers bool) error {
// Get the latest version of the class so that we can avoid conflicts
class, err := c.controlMachineClient.MachineClasses(class.Namespace).Get(ctx, class.Name, metav1.GetOptions{})
if err != nil {
@@ -224,7 +194,11 @@ func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1
klog.Warning("Updating machineClass failed, retrying. ", class.Name, err)
return err
}
- klog.V(3).Infof("Successfully added/removed finalizer on the machineclass %q", class.Name)
+ if addFinalizers {
+ klog.V(3).Infof("Successfully added finalizer on the machineclass %q", class.Name)
+ } else {
+ klog.V(3).Infof("Successfully removed finalizer on the machineclass %q", class.Name)
+ }
return err
}
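Note: the renamed finalizer helpers still funnel through updateMachineClassFinalizers, now carrying an addFinalizers flag purely for accurate logging. A sketch of the underlying set manipulation (the finalizer string is an assumption; the code uses the MCMFinalizerName constant):

    package main

    import (
        "fmt"

        "k8s.io/apimachinery/pkg/util/sets"
    )

    const mcmFinalizer = "machine.sapcloud.io/machine-controller-manager" // assumed value

    func main() {
        finalizers := sets.NewString("some.other/finalizer")

        finalizers.Insert(mcmFinalizer) // addMCMFinalizerToMachineClass path
        fmt.Println(finalizers.List())  // sorted list handed to the update call

        finalizers.Delete(mcmFinalizer) // deleteMCMFinalizerFromMachineClass path
        fmt.Println(finalizers.Has(mcmFinalizer)) // false
    }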
diff --git a/pkg/util/provider/machinecontroller/machineclass_util.go b/pkg/util/provider/machinecontroller/machineclass_util.go
index fdd15ea15..7d2250faf 100644
--- a/pkg/util/provider/machinecontroller/machineclass_util.go
+++ b/pkg/util/provider/machinecontroller/machineclass_util.go
@@ -22,37 +22,6 @@ import (
"k8s.io/apimachinery/pkg/labels"
)
-/*
-TODO: Move this code to MCM/MachineSet controller as well?
-func (c *controller) findMachineDeploymentsForClass(kind, name string) ([]*v1alpha1.MachineDeployment, error) {
- machineDeployments, err := c.machineDeploymentLister.List(labels.Everything())
- if err != nil {
- return nil, err
- }
- var filtered []*v1alpha1.MachineDeployment
- for _, machineDeployment := range machineDeployments {
- if machineDeployment.Spec.Template.Spec.Class.Kind == kind && machineDeployment.Spec.Template.Spec.Class.Name == name {
- filtered = append(filtered, machineDeployment)
- }
- }
- return filtered, nil
-}
-
-func (c *controller) findMachineSetsForClass(kind, name string) ([]*v1alpha1.MachineSet, error) {
- machineSets, err := c.machineSetLister.List(labels.Everything())
- if err != nil {
- return nil, err
- }
- var filtered []*v1alpha1.MachineSet
- for _, machineSet := range machineSets {
- if machineSet.Spec.Template.Spec.Class.Kind == kind && machineSet.Spec.Template.Spec.Class.Name == name {
- filtered = append(filtered, machineSet)
- }
- }
- return filtered, nil
-}
-*/
-
func (c *controller) findMachinesForClass(kind, name string) ([]*v1alpha1.Machine, error) {
machines, err := c.machineLister.List(labels.Everything())
if err != nil {
diff --git a/pkg/util/provider/machinecontroller/secret.go b/pkg/util/provider/machinecontroller/secret.go
index b620ae0a4..6e072fb54 100644
--- a/pkg/util/provider/machinecontroller/secret.go
+++ b/pkg/util/provider/machinecontroller/secret.go
@@ -61,10 +61,10 @@ func (c *controller) reconcileClusterSecretKey(key string) error {
func (c *controller) reconcileClusterSecret(ctx context.Context, secret *corev1.Secret) error {
startTime := time.Now()
- klog.V(5).Infof("Start syncing %q", secret.Name)
+ klog.V(4).Infof("Start syncing %q", secret.Name)
defer func() {
c.enqueueSecretAfter(secret, 10*time.Minute)
- klog.V(5).Infof("Finished syncing %q (%v)", secret.Name, time.Since(startTime))
+ klog.V(4).Infof("Finished syncing %q (%v)", secret.Name, time.Since(startTime))
}()
// Check if machineClasses are referring to this secret
@@ -80,10 +80,6 @@ func (c *controller) reconcileClusterSecret(ctx context.Context, secret *corev1.
return err
}
} else {
- if finalizers := sets.NewString(secret.Finalizers...); !finalizers.Has(MCFinalizerName) {
- // Finalizer doesn't exist, simply return nil
- return nil
- }
err = c.deleteSecretFinalizers(ctx, secret)
if err != nil {
return err
diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go
index 8bc546a0f..3d91e1a68 100644
--- a/pkg/util/provider/machineutils/utils.go
+++ b/pkg/util/provider/machineutils/utils.go
@@ -54,9 +54,6 @@ const (
// MachineClassKind is used to identify the machineClassKind for generic machineClasses
MachineClassKind = "MachineClass"
- // MigratedMachineClass annotation helps in identifying machineClasses who have been migrated by migration controller
- MigratedMachineClass = "machine.sapcloud.io/migrated"
-
// NotManagedByMCM annotation helps in identifying the nodes which are not handled by MCM
NotManagedByMCM = "node.machine.sapcloud.io/not-managed-by-mcm"
@@ -82,10 +79,16 @@ type RetryPeriod time.Duration
// These are the valid values for RetryPeriod
const (
+ // ConflictRetry tells the controller to retry quickly - 200 milliseconds
+ ConflictRetry RetryPeriod = RetryPeriod(200 * time.Millisecond)
- // ShortRetry tells the controller to retry after a short duration - 15 seconds
- ShortRetry RetryPeriod = RetryPeriod(15 * time.Second)
+ // ShortRetry tells the controller to retry after a short duration - 5 seconds
+ ShortRetry RetryPeriod = RetryPeriod(5 * time.Second)
// MediumRetry tells the controller to retry after a medium duration - 2 minutes
MediumRetry RetryPeriod = RetryPeriod(3 * time.Minute)
// LongRetry tells the controller to retry after a long duration - 10 minutes
LongRetry RetryPeriod = RetryPeriod(10 * time.Minute)
)
+
+// EssentialTaints are taints on the node object which, if added or removed, require an immediate reconcile by the machine controller
+// TODO: update this when taints for ALT updates and PostCreate operations are introduced.
+var EssentialTaints = []string{TaintNodeCriticalComponentsNotReady}
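Note: RetryPeriod stays a typed time.Duration, so call sites convert with time.Duration(...) before handing it to the workqueue, as seen throughout this diff. A runnable mirror of the new constants:

    package main

    import (
        "fmt"
        "time"
    )

    type RetryPeriod time.Duration

    const (
        ConflictRetry RetryPeriod = RetryPeriod(200 * time.Millisecond)
        ShortRetry    RetryPeriod = RetryPeriod(5 * time.Second)
    )

    func main() {
        fmt.Println(time.Duration(ConflictRetry)) // 200ms
        fmt.Println(time.Duration(ShortRetry))    // 5s
    }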