diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index 51347ae..f56a455 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -19,7 +19,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) -var ( +const ( scaleUpWait = 1 * time.Minute scaleUpLimit = 20 * time.Minute nodeEquilibriumWaitLimit = 5 * time.Minute @@ -274,6 +274,14 @@ func (t *CycleNodeRequestTransitioner) transitionInitialised() (reconcile.Result var validNodes []v1.CycleNodeRequestNode for _, node := range t.cycleNodeRequest.Status.CurrentNodes { + t.rm.Logger.Info("Adding finalizer to node", "node", node.Name) + + // Add the finalizer to the node before detaching it + if err := t.rm.AddFinalizerToNode(node.Name); err != nil { + t.rm.LogEvent(t.cycleNodeRequest, "AddFinalizerToNodeError", err.Error()) + return t.transitionToHealing(err) + } + alreadyDetaching, err := nodeGroups.DetachInstance(node.ProviderID) if alreadyDetaching { @@ -531,6 +539,11 @@ func (t *CycleNodeRequestTransitioner) transitionHealing() (reconcile.Result, er continue } + if err := t.rm.RemoveFinalizerFromNode(node.Name); err != nil { + t.rm.LogEvent(t.cycleNodeRequest, "RemoveFinalizerFromNodeError", err.Error()) + return t.transitionToFailed(err) + } + // try and re-attach the nodes, if any were un-attached t.rm.LogEvent(t.cycleNodeRequest, "AttachingNodes", "Attaching instances to nodes group: %v", node.Name) // if the node is already attached, ignore the error and continue to un-cordoning, otherwise return with error diff --git a/pkg/controller/cyclenodestatus/transitioner/transitions.go b/pkg/controller/cyclenodestatus/transitioner/transitions.go index 3eb6398..2f821a7 100644 --- a/pkg/controller/cyclenodestatus/transitioner/transitions.go +++ b/pkg/controller/cyclenodestatus/transitioner/transitions.go @@ -173,7 +173,7 @@ func (t *CycleNodeStatusTransitioner) transitionDraining() (reconcile.Result, er } // transitionDeleting transitions any CycleNodeStatuses in the Deleting phase to the Terminating phase -// It will delete the node out of the Kubernetes API. +// It will delete the node out of the Kubernetes API and remove the finalizer. func (t *CycleNodeStatusTransitioner) transitionDeleting() (reconcile.Result, error) { t.rm.LogEvent(t.cycleNodeStatus, "DeletingNode", "Deleting node: %v", t.cycleNodeStatus.Status.CurrentNode.Name) err := t.rm.DeleteNode(t.cycleNodeStatus.Status.CurrentNode.Name) @@ -181,6 +181,12 @@ func (t *CycleNodeStatusTransitioner) transitionDeleting() (reconcile.Result, er return t.transitionToFailed(err) } + // Remove the finalizer to allow the node to be deleted + if err := t.rm.RemoveFinalizerFromNode(t.cycleNodeStatus.Status.CurrentNode.Name); err != nil { + t.rm.LogEvent(t.cycleNodeStatus, "RemoveFinalizerFromNodeError", err.Error()) + return t.transitionToFailed(err) + } + return t.transitionObject(v1.CycleNodeStatusTerminatingNode) } diff --git a/pkg/controller/node.go b/pkg/controller/node.go index 72ee213..5a62b5d 100644 --- a/pkg/controller/node.go +++ b/pkg/controller/node.go @@ -6,11 +6,17 @@ import ( "github.com/atlassian-labs/cyclops/pkg/k8s" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" "sigs.k8s.io/controller-runtime/pkg/client" ) +const ( + nodeFinalizerName = "cyclops.atlassian.com/finalizer" +) + // ListNodes lists nodes from Kubernetes, optionally filtered by a selector. func (rm *ResourceManager) ListNodes(selector labels.Selector) ([]v1.Node, error) { // List the nodes @@ -41,12 +47,29 @@ func (rm *ResourceManager) GetNode(name string) (*v1.Node, error) { func (rm *ResourceManager) DeleteNode(name string) error { // Get the node node, err := rm.GetNode(name) + + // If the node is not found then skip trying to delete it + if err != nil && apierrors.IsNotFound(err) { + rm.Logger.Info("node already deleted, skip deleting", "node", name) + return nil + } + + // Account for any other possible errors if err != nil { return err } - // Delete the node - return rm.Client.Delete(context.TODO(), node) + // The node exists as of the previous step, try deleting it now + err = rm.Client.Delete(context.TODO(), node) + + // Account for possible race conditions with other controllers managing + // nodes in the cluster + if apierrors.IsNotFound(err) { + rm.Logger.Info("node deletion attemp failed, node already deleted", "node", name) + return nil + } + + return err } // DrainPods drains the pods off the named node. @@ -71,3 +94,40 @@ func (rm *ResourceManager) DrainPods(nodeName string, unhealthyAfter time.Durati return false, k8s.DrainPods(pods, rm.RawClient, unhealthyAfter) } + +func (rm *ResourceManager) AddFinalizerToNode(nodeName string) error { + return rm.manageFinalizerOnNode(nodeName, k8s.AddFinalizerToNode) +} + +func (rm *ResourceManager) RemoveFinalizerFromNode(nodeName string) error { + return rm.manageFinalizerOnNode(nodeName, k8s.RemoveFinalizerFromNode) +} + +func (rm *ResourceManager) manageFinalizerOnNode(nodeName string, fn func(*v1.Node, string, kubernetes.Interface) error) error { + // Get the node + node, err := rm.GetNode(nodeName) + + // If the node is not found then skip the finalizer operation + if err != nil && apierrors.IsNotFound(err) { + rm.Logger.Info("node deleted, skip adding finalizer", "node", nodeName) + return nil + } + + // Account for any other possible errors + if err != nil { + return err + } + + // The node exists as of the previous step, try managing the finalizer for + // it now + err = fn(node, nodeFinalizerName, rm.RawClient) + + // Account for possible race conditions with other controllers managing + // nodes in the cluster + if apierrors.IsNotFound(err) { + rm.Logger.Info("adding finalizer failed, node already deleted", "node", nodeName) + return nil + } + + return err +} diff --git a/pkg/k8s/node.go b/pkg/k8s/node.go index 98e4f5b..4fa59b8 100644 --- a/pkg/k8s/node.go +++ b/pkg/k8s/node.go @@ -12,6 +12,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) // CordonNode performs a patch operation on a node to mark it as unschedulable @@ -60,6 +61,20 @@ func AddLabelToNode(nodeName string, labelName string, labelValue string, client return PatchNode(nodeName, patches, client) } +// AddFinalizerToNode updates a node to add a finalizer to it +func AddFinalizerToNode(node *v1.Node, finalizerName string, client kubernetes.Interface) error { + controllerutil.AddFinalizer(node, finalizerName) + _, err := client.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) + return err +} + +// RemoveFinalizerFromNode updates a node to remove a finalizer from it +func RemoveFinalizerFromNode(node *v1.Node, finalizerName string, client kubernetes.Interface) error { + controllerutil.RemoveFinalizer(node, finalizerName) + _, err := client.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) + return err +} + // NodeLister defines an object that can list nodes with a label selector type NodeLister interface { List(labels.Selector) ([]*v1.Node, error)