From 2e4aec5cbe0d497704375f67ccf138c974f82346 Mon Sep 17 00:00:00 2001 From: vportella Date: Thu, 7 Sep 2023 15:25:09 +1000 Subject: [PATCH 1/3] Add a finalizer terminating a node and don't error is a node has been deleted --- .../transitioner/transitions.go | 15 ++++- pkg/controller/node.go | 59 ++++++++++++++++++- pkg/k8s/node.go | 15 +++++ 3 files changed, 87 insertions(+), 2 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index 51347ae..9e2993d 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -19,10 +19,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) -var ( +const ( scaleUpWait = 1 * time.Minute scaleUpLimit = 20 * time.Minute nodeEquilibriumWaitLimit = 5 * time.Minute + + nodeFinalizerName = "cyclops.atlassian.com/finalizer" ) // transitionUndefined transitions any CycleNodeRequests in the undefined phase to the pending phase @@ -274,6 +276,12 @@ func (t *CycleNodeRequestTransitioner) transitionInitialised() (reconcile.Result var validNodes []v1.CycleNodeRequestNode for _, node := range t.cycleNodeRequest.Status.CurrentNodes { + // Add the finalizer to the node before detaching it + if err := t.rm.AddFinalizerToNode(node.Name, nodeFinalizerName); err != nil { + t.rm.LogEvent(t.cycleNodeRequest, "AddFinalizerToNodeError", err.Error()) + return t.transitionToHealing(err) + } + alreadyDetaching, err := nodeGroups.DetachInstance(node.ProviderID) if alreadyDetaching { @@ -531,6 +539,11 @@ func (t *CycleNodeRequestTransitioner) transitionHealing() (reconcile.Result, er continue } + if err := t.rm.RemoveFinalizerFromNode(node.Name, nodeFinalizerName); err != nil { + t.rm.LogEvent(t.cycleNodeRequest, "RemoveFinalizerFromNodeError", err.Error()) + return t.transitionToFailed(err) + } + // try and re-attach the nodes, if any were un-attached t.rm.LogEvent(t.cycleNodeRequest, "AttachingNodes", "Attaching instances to nodes group: %v", node.Name) // if the node is already attached, ignore the error and continue to un-cordoning, otherwise return with error diff --git a/pkg/controller/node.go b/pkg/controller/node.go index 72ee213..19c966c 100644 --- a/pkg/controller/node.go +++ b/pkg/controller/node.go @@ -6,8 +6,10 @@ import ( "github.com/atlassian-labs/cyclops/pkg/k8s" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -41,12 +43,30 @@ func (rm *ResourceManager) GetNode(name string) (*v1.Node, error) { func (rm *ResourceManager) DeleteNode(name string) error { // Get the node node, err := rm.GetNode(name) + + // If the node is not found then skip trying to delete it + if err != nil && apierrors.IsNotFound(err) { + rm.Logger.Info("node already deleted, skip deleting", "node", name) + return nil + } + + // Account for any other possible errors if err != nil { return err } + // The node exists as of the previous step, try deleting it now + err = rm.Client.Delete(context.TODO(), node) + + // Account for possible race conditions with other controllers managing + // nodes in the cluster + if apierrors.IsNotFound(err) { + rm.Logger.Info("node deletion attemp failed, node already deleted", "node", name) + return nil + } + // Delete the node - return rm.Client.Delete(context.TODO(), node) + return err } // DrainPods drains the pods off the named node. @@ -71,3 +91,40 @@ func (rm *ResourceManager) DrainPods(nodeName string, unhealthyAfter time.Durati return false, k8s.DrainPods(pods, rm.RawClient, unhealthyAfter) } + +func (rm *ResourceManager) AddFinalizerToNode(nodeName, finalizerName string) error { + return rm.manageFinalizerOnNode(nodeName, finalizerName, k8s.AddFinalizerToNode) +} + +func (rm *ResourceManager) RemoveFinalizerFromNode(nodeName, finalizerName string) error { + return rm.manageFinalizerOnNode(nodeName, finalizerName, k8s.RemoveFinalizerFromNode) +} + +func (rm *ResourceManager) manageFinalizerOnNode(nodeName, finalizerName string, fn func(*v1.Node, string, kubernetes.Interface) error) error { + // Get the node + node, err := rm.GetNode(nodeName) + + // If the node is not found then skip the finalizer operation + if err != nil && apierrors.IsNotFound(err) { + rm.Logger.Info("node deleted, skip adding finalizer", "node", nodeName) + return nil + } + + // Account for any other possible errors + if err != nil { + return err + } + + // The node exists as of the previous step, try managing the finalizer for + // it now + err = fn(node, finalizerName, rm.RawClient) + + // Account for possible race conditions with other controllers managing + // nodes in the cluster + if apierrors.IsNotFound(err) { + rm.Logger.Info("adding finalizer failed, node already deleted", "node", nodeName) + return nil + } + + return err +} diff --git a/pkg/k8s/node.go b/pkg/k8s/node.go index 98e4f5b..4fa59b8 100644 --- a/pkg/k8s/node.go +++ b/pkg/k8s/node.go @@ -12,6 +12,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) // CordonNode performs a patch operation on a node to mark it as unschedulable @@ -60,6 +61,20 @@ func AddLabelToNode(nodeName string, labelName string, labelValue string, client return PatchNode(nodeName, patches, client) } +// AddFinalizerToNode updates a node to add a finalizer to it +func AddFinalizerToNode(node *v1.Node, finalizerName string, client kubernetes.Interface) error { + controllerutil.AddFinalizer(node, finalizerName) + _, err := client.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) + return err +} + +// RemoveFinalizerFromNode updates a node to remove a finalizer from it +func RemoveFinalizerFromNode(node *v1.Node, finalizerName string, client kubernetes.Interface) error { + controllerutil.RemoveFinalizer(node, finalizerName) + _, err := client.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) + return err +} + // NodeLister defines an object that can list nodes with a label selector type NodeLister interface { List(labels.Selector) ([]*v1.Node, error) From 9b430aef71ebf82a81065a65536c5242e0f71f97 Mon Sep 17 00:00:00 2001 From: vportella Date: Thu, 7 Sep 2023 15:37:08 +1000 Subject: [PATCH 2/3] Remove leftover comment --- pkg/controller/node.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/controller/node.go b/pkg/controller/node.go index 19c966c..18bbf36 100644 --- a/pkg/controller/node.go +++ b/pkg/controller/node.go @@ -65,7 +65,6 @@ func (rm *ResourceManager) DeleteNode(name string) error { return nil } - // Delete the node return err } From 203e0b2242fedb819935276561226d9d08c6883b Mon Sep 17 00:00:00 2001 From: vportella Date: Thu, 7 Sep 2023 16:26:53 +1000 Subject: [PATCH 3/3] Remove finalizer from the CNS after deleting the node --- .../cyclenoderequest/transitioner/transitions.go | 8 ++++---- .../cyclenodestatus/transitioner/transitions.go | 8 +++++++- pkg/controller/node.go | 16 ++++++++++------ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index 9e2993d..f56a455 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -23,8 +23,6 @@ const ( scaleUpWait = 1 * time.Minute scaleUpLimit = 20 * time.Minute nodeEquilibriumWaitLimit = 5 * time.Minute - - nodeFinalizerName = "cyclops.atlassian.com/finalizer" ) // transitionUndefined transitions any CycleNodeRequests in the undefined phase to the pending phase @@ -276,8 +274,10 @@ func (t *CycleNodeRequestTransitioner) transitionInitialised() (reconcile.Result var validNodes []v1.CycleNodeRequestNode for _, node := range t.cycleNodeRequest.Status.CurrentNodes { + t.rm.Logger.Info("Adding finalizer to node", "node", node.Name) + // Add the finalizer to the node before detaching it - if err := t.rm.AddFinalizerToNode(node.Name, nodeFinalizerName); err != nil { + if err := t.rm.AddFinalizerToNode(node.Name); err != nil { t.rm.LogEvent(t.cycleNodeRequest, "AddFinalizerToNodeError", err.Error()) return t.transitionToHealing(err) } @@ -539,7 +539,7 @@ func (t *CycleNodeRequestTransitioner) transitionHealing() (reconcile.Result, er continue } - if err := t.rm.RemoveFinalizerFromNode(node.Name, nodeFinalizerName); err != nil { + if err := t.rm.RemoveFinalizerFromNode(node.Name); err != nil { t.rm.LogEvent(t.cycleNodeRequest, "RemoveFinalizerFromNodeError", err.Error()) return t.transitionToFailed(err) } diff --git a/pkg/controller/cyclenodestatus/transitioner/transitions.go b/pkg/controller/cyclenodestatus/transitioner/transitions.go index 3eb6398..2f821a7 100644 --- a/pkg/controller/cyclenodestatus/transitioner/transitions.go +++ b/pkg/controller/cyclenodestatus/transitioner/transitions.go @@ -173,7 +173,7 @@ func (t *CycleNodeStatusTransitioner) transitionDraining() (reconcile.Result, er } // transitionDeleting transitions any CycleNodeStatuses in the Deleting phase to the Terminating phase -// It will delete the node out of the Kubernetes API. +// It will delete the node out of the Kubernetes API and remove the finalizer. func (t *CycleNodeStatusTransitioner) transitionDeleting() (reconcile.Result, error) { t.rm.LogEvent(t.cycleNodeStatus, "DeletingNode", "Deleting node: %v", t.cycleNodeStatus.Status.CurrentNode.Name) err := t.rm.DeleteNode(t.cycleNodeStatus.Status.CurrentNode.Name) @@ -181,6 +181,12 @@ func (t *CycleNodeStatusTransitioner) transitionDeleting() (reconcile.Result, er return t.transitionToFailed(err) } + // Remove the finalizer to allow the node to be deleted + if err := t.rm.RemoveFinalizerFromNode(t.cycleNodeStatus.Status.CurrentNode.Name); err != nil { + t.rm.LogEvent(t.cycleNodeStatus, "RemoveFinalizerFromNodeError", err.Error()) + return t.transitionToFailed(err) + } + return t.transitionObject(v1.CycleNodeStatusTerminatingNode) } diff --git a/pkg/controller/node.go b/pkg/controller/node.go index 18bbf36..5a62b5d 100644 --- a/pkg/controller/node.go +++ b/pkg/controller/node.go @@ -13,6 +13,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +const ( + nodeFinalizerName = "cyclops.atlassian.com/finalizer" +) + // ListNodes lists nodes from Kubernetes, optionally filtered by a selector. func (rm *ResourceManager) ListNodes(selector labels.Selector) ([]v1.Node, error) { // List the nodes @@ -91,15 +95,15 @@ func (rm *ResourceManager) DrainPods(nodeName string, unhealthyAfter time.Durati return false, k8s.DrainPods(pods, rm.RawClient, unhealthyAfter) } -func (rm *ResourceManager) AddFinalizerToNode(nodeName, finalizerName string) error { - return rm.manageFinalizerOnNode(nodeName, finalizerName, k8s.AddFinalizerToNode) +func (rm *ResourceManager) AddFinalizerToNode(nodeName string) error { + return rm.manageFinalizerOnNode(nodeName, k8s.AddFinalizerToNode) } -func (rm *ResourceManager) RemoveFinalizerFromNode(nodeName, finalizerName string) error { - return rm.manageFinalizerOnNode(nodeName, finalizerName, k8s.RemoveFinalizerFromNode) +func (rm *ResourceManager) RemoveFinalizerFromNode(nodeName string) error { + return rm.manageFinalizerOnNode(nodeName, k8s.RemoveFinalizerFromNode) } -func (rm *ResourceManager) manageFinalizerOnNode(nodeName, finalizerName string, fn func(*v1.Node, string, kubernetes.Interface) error) error { +func (rm *ResourceManager) manageFinalizerOnNode(nodeName string, fn func(*v1.Node, string, kubernetes.Interface) error) error { // Get the node node, err := rm.GetNode(nodeName) @@ -116,7 +120,7 @@ func (rm *ResourceManager) manageFinalizerOnNode(nodeName, finalizerName string, // The node exists as of the previous step, try managing the finalizer for // it now - err = fn(node, finalizerName, rm.RawClient) + err = fn(node, nodeFinalizerName, rm.RawClient) // Account for possible race conditions with other controllers managing // nodes in the cluster