From 4898f242278e73ee387d084ebbdb6c3a12601556 Mon Sep 17 00:00:00 2001 From: vportella Date: Fri, 19 Jul 2024 15:25:06 +1000 Subject: [PATCH 1/7] Skip named nodes that don't exist --- .../cyclenoderequest/transitioner/checks.go | 4 +- .../cyclenoderequest/transitioner/node.go | 87 ++++++++++--------- .../transitioner/transitions.go | 45 +++++----- .../cyclenoderequest/transitioner/util.go | 22 +++-- 4 files changed, 88 insertions(+), 70 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/checks.go b/pkg/controller/cyclenoderequest/transitioner/checks.go index e2e303a..a4df2bc 100644 --- a/pkg/controller/cyclenoderequest/transitioner/checks.go +++ b/pkg/controller/cyclenoderequest/transitioner/checks.go @@ -189,7 +189,7 @@ func (t *CycleNodeRequestTransitioner) performHealthCheck(node v1.CycleNodeReque // performInitialHealthChecks on the nodes selected to be terminated before cycling begin. If any health // check fails return an error to prevent cycling from starting -func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes []corev1.Node) error { +func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes map[string]corev1.Node) error { // Build a set of ready nodes from which to check below readyNodesSet := make(map[string]v1.CycleNodeRequestNode) @@ -241,7 +241,7 @@ func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes []co // performCyclingHealthChecks before terminating an instance selected for termination. Cycling pauses // until all health checks pass for the new instance before terminating the old one -func (t *CycleNodeRequestTransitioner) performCyclingHealthChecks(kubeNodes []corev1.Node) (bool, error) { +func (t *CycleNodeRequestTransitioner) performCyclingHealthChecks(kubeNodes map[string]corev1.Node) (bool, error) { var allHealthChecksPassed bool = true // Find new instsances attached to the nodegroup and perform health checks on them diff --git a/pkg/controller/cyclenoderequest/transitioner/node.go b/pkg/controller/cyclenoderequest/transitioner/node.go index 046148a..b22f119 100644 --- a/pkg/controller/cyclenoderequest/transitioner/node.go +++ b/pkg/controller/cyclenoderequest/transitioner/node.go @@ -11,12 +11,15 @@ import ( // listReadyNodes lists nodes that are "ready". By default lists nodes that have also not been touched by Cyclops. // A label is used to determine whether nodes have been touched by this CycleNodeRequest. -func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (nodes []corev1.Node, err error) { +func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (map[string]corev1.Node, error) { + nodes := make(map[string]corev1.Node) + // Get the nodes selector, err := t.cycleNodeRequest.NodeLabelSelector() if err != nil { return nodes, err } + nodeList, err := t.rm.ListNodes(selector) if err != nil { return nodes, err @@ -30,14 +33,16 @@ func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (n continue } } + // Only add "Ready" nodes for _, cond := range node.Status.Conditions { if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { - nodes = append(nodes, node) + nodes[node.Spec.ProviderID] = node break } } } + return nodes, nil } @@ -56,29 +61,34 @@ func (t *CycleNodeRequestTransitioner) getNodesToTerminate(numNodes int64) (node } for _, kubeNode := range kubeNodes { - // Skip nodes that are already being worked on so we don't duplicate our work if value, ok := kubeNode.Labels[cycleNodeLabel]; ok && value == t.cycleNodeRequest.Name { numNodesInProgress++ + } + } + + for _, nodeToTerminate := range t.cycleNodeRequest.Status.NodesToTerminate { + kubeNode, found := kubeNodes[nodeToTerminate.ProviderID] + + if !found { continue } - for _, nodeToTerminate := range t.cycleNodeRequest.Status.NodesToTerminate { - // Add nodes that need to be terminated but have not yet been actioned - if kubeNode.Name == nodeToTerminate.Name && kubeNode.Spec.ProviderID == nodeToTerminate.ProviderID { - nodes = append(nodes, &kubeNode) + // Skip nodes that are already being worked on so we don't duplicate our work + if value, ok := kubeNode.Labels[cycleNodeLabel]; ok && value == t.cycleNodeRequest.Name { + continue + } - for i := 0; i < len(t.cycleNodeRequest.Status.NodesAvailable); i++ { - if kubeNode.Name == t.cycleNodeRequest.Status.NodesAvailable[i].Name { - // Remove nodes from available if they are also scheduled for termination - // Slice syntax removes this node at `i` from the array - t.cycleNodeRequest.Status.NodesAvailable = append( - t.cycleNodeRequest.Status.NodesAvailable[:i], - t.cycleNodeRequest.Status.NodesAvailable[i+1:]..., - ) + // Add nodes that need to be terminated but have not yet been actioned + nodes = append(nodes, &kubeNode) - break - } - } + for i := 0; i < len(t.cycleNodeRequest.Status.NodesAvailable); i++ { + if kubeNode.Name == t.cycleNodeRequest.Status.NodesAvailable[i].Name { + // Remove nodes from available if they are also scheduled for termination + // Slice syntax removes this node at `i` from the array + t.cycleNodeRequest.Status.NodesAvailable = append( + t.cycleNodeRequest.Status.NodesAvailable[:i], + t.cycleNodeRequest.Status.NodesAvailable[i+1:]..., + ) break } @@ -94,33 +104,32 @@ func (t *CycleNodeRequestTransitioner) getNodesToTerminate(numNodes int64) (node } // addNamedNodesToTerminate adds the named nodes for this CycleNodeRequest to the list of nodes to terminate. -// Returns an error if any named node does not exist in the node group for this CycleNodeRequest. -func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes []corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) error { - for _, namedNode := range t.cycleNodeRequest.Spec.NodeNames { - foundNode := false - for _, kubeNode := range kubeNodes { - if kubeNode.Name == namedNode { - foundNode = true +// Skips any named node that does not exist in the node group for this CycleNodeRequest. +func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[string]corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) { + kubeNodesMap := make(map[string]corev1.Node) - t.cycleNodeRequest.Status.NodesAvailable = append( - t.cycleNodeRequest.Status.NodesAvailable, - newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()), - ) + for _, node := range kubeNodes { + kubeNodesMap[node.Name] = node + } - t.cycleNodeRequest.Status.NodesToTerminate = append( - t.cycleNodeRequest.Status.NodesToTerminate, - newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()), - ) + for _, namedNode := range t.cycleNodeRequest.Spec.NodeNames { + kubeNode, found := kubeNodesMap[namedNode] - break - } + if !found { + t.rm.Logger.Info("could not find node by name, skipping", "nodeName", namedNode) + continue } - if !foundNode { - return fmt.Errorf("could not find node by name: %v", namedNode) - } + t.cycleNodeRequest.Status.NodesAvailable = append( + t.cycleNodeRequest.Status.NodesAvailable, + newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()), + ) + + t.cycleNodeRequest.Status.NodesToTerminate = append( + t.cycleNodeRequest.Status.NodesToTerminate, + newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()), + ) } - return nil } // newCycleNodeRequestNode converts a corev1.Node to a v1.CycleNodeRequestNode. This is done multiple diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index f56a455..3ec6af6 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -64,13 +64,11 @@ func (t *CycleNodeRequestTransitioner) transitionUndefined() (reconcile.Result, func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, error) { // Fetch the node names for the cycleNodeRequest, using the label selector provided t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Selecting nodes with label selector") + kubeNodes, err := t.listReadyNodes(true) if err != nil { return t.transitionToHealing(err) } - if len(kubeNodes) == 0 { - return t.transitionToHealing(fmt.Errorf("no nodes matched selector")) - } // Only retain nodes which still exist inside cloud provider var nodeProviderIDs []string @@ -83,25 +81,20 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er if err != nil { return t.transitionToHealing(errors.Wrap(err, "failed to check instances that exist from cloud provider")) } - var existingKubeNodes []corev1.Node - for _, node := range kubeNodes { - for _, validProviderID := range existingProviderIDs { - if node.Spec.ProviderID == validProviderID { - existingKubeNodes = append(existingKubeNodes, node) - break - } + existingKubeNodes := make(map[string]corev1.Node) + + for _, validProviderID := range existingProviderIDs { + if node, found := kubeNodes[validProviderID]; found { + existingKubeNodes[node.Spec.ProviderID] = node } } kubeNodes = existingKubeNodes - if len(kubeNodes) == 0 { - return t.transitionToHealing(fmt.Errorf("no existing nodes in cloud provider matched selector")) - } - // Describe the node group for the request t.rm.LogEvent(t.cycleNodeRequest, "FetchingNodeGroup", "Fetching node group: %v", t.cycleNodeRequest.GetNodeGroupNames()) + nodeGroups, err := t.rm.CloudProvider.GetNodeGroups(t.cycleNodeRequest.GetNodeGroupNames()) if err != nil { return t.transitionToHealing(err) @@ -109,22 +102,28 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er // get instances inside cloud provider node groups nodeGroupInstances := nodeGroups.Instances() + // Do some sanity checking before we start filtering things // Check the instance count of the node group matches the number of nodes found in Kubernetes if len(kubeNodes) != len(nodeGroupInstances) { - nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(kubeNodes, nodeGroupInstances) var offendingNodesInfo string + + nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(kubeNodes, nodeGroupInstances) + if len(nodesNotInCPNodeGroup) > 0 { offendingNodesInfo += "nodes not in node group: " offendingNodesInfo += strings.Join(nodesNotInCPNodeGroup, ",") } + if len(nodesNotInKube) > 0 { if offendingNodesInfo != "" { offendingNodesInfo += ";" } + offendingNodesInfo += "nodes not inside cluster: " offendingNodesInfo += strings.Join(nodesNotInKube, ",") } + t.rm.LogEvent(t.cycleNodeRequest, "NodeCountMismatch", "node group: %v, kube: %v. %v", len(nodeGroupInstances), len(kubeNodes), offendingNodesInfo) @@ -134,12 +133,16 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er if err != nil { return t.transitionToHealing(err) } + if timedOut { err := fmt.Errorf( - "node count mismatch, number of kubernetes of nodes does not match number of cloud provider instances after %v", - nodeEquilibriumWaitLimit) + "node count mismatch, number of kubernetes nodes does not match number of cloud provider instances after %v", + nodeEquilibriumWaitLimit, + ) + return t.transitionToHealing(err) } + return reconcile.Result{Requeue: true, RequeueAfter: requeueDuration}, nil } @@ -147,13 +150,11 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er if len(t.cycleNodeRequest.Spec.NodeNames) > 0 { // If specific node names are provided, check they actually exist in the node group t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding named nodes to NodesToTerminate") - err := t.addNamedNodesToTerminate(kubeNodes, nodeGroupInstances) - if err != nil { - return t.transitionToHealing(err) - } + t.addNamedNodesToTerminate(kubeNodes, nodeGroupInstances) } else { // Otherwise just add all the nodes in the node group t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding all node group nodes to NodesToTerminate") + for _, kubeNode := range kubeNodes { // Check to ensure the kubeNode object maps to an existing node in the ASG // If this isn't the case, this is a phantom node. Fail the cnr to be safe. @@ -205,7 +206,9 @@ func (t *CycleNodeRequestTransitioner) transitionInitialised() (reconcile.Result // The maximum nodes we can select are bounded by our concurrency. We take into account the number // of nodes we are already working on, and only introduce up to our concurrency cap more nodes in this step. maxNodesToSelect := t.cycleNodeRequest.Spec.CycleSettings.Concurrency - t.cycleNodeRequest.Status.ActiveChildren + t.rm.Logger.Info("Selecting nodes to terminate", "numNodes", maxNodesToSelect) + nodes, numNodesInProgress, err := t.getNodesToTerminate(maxNodesToSelect) if err != nil { return t.transitionToHealing(err) diff --git a/pkg/controller/cyclenoderequest/transitioner/util.go b/pkg/controller/cyclenoderequest/transitioner/util.go index aef2815..1564f9f 100644 --- a/pkg/controller/cyclenoderequest/transitioner/util.go +++ b/pkg/controller/cyclenoderequest/transitioner/util.go @@ -242,6 +242,7 @@ func (t *CycleNodeRequestTransitioner) checkIfTransitioning(numNodesToCycle, num transition, err := t.transitionObject(v1.CycleNodeRequestWaitingTermination) return true, transition, err } + // otherwise, we have finished everything, so transition to Successful transition, err := t.transitionToSuccessful() return true, transition, err @@ -252,21 +253,26 @@ func (t *CycleNodeRequestTransitioner) checkIfTransitioning(numNodesToCycle, num // findOffendingNodes finds the offending nodes information which cause number of nodes mismatch between // cloud provider node group and nodes inside kubernetes cluster using label selector -func findOffendingNodes(kubeNodes []corev1.Node, cloudProviderNodes map[string]cloudprovider.Instance) ([]string, []string) { - kubeNodesMap := make(map[string]corev1.Node) +func findOffendingNodes(kubeNodes map[string]corev1.Node, cloudProviderNodes map[string]cloudprovider.Instance) ([]string, []string) { var nodesNotInCPNodeGroup []string var nodesNotInKube []string + for _, kubeNode := range kubeNodes { - kubeNodesMap[kubeNode.Spec.ProviderID] = kubeNode if _, ok := cloudProviderNodes[kubeNode.Spec.ProviderID]; !ok { - nodesNotInCPNodeGroup = append(nodesNotInCPNodeGroup, fmt.Sprintf("id %q", kubeNode.Spec.ProviderID)) + nodesNotInCPNodeGroup = append(nodesNotInCPNodeGroup, + fmt.Sprintf("id %q", kubeNode.Spec.ProviderID), + ) } } + for cpNode := range cloudProviderNodes { - if _, ok := kubeNodesMap[cpNode]; !ok { - nodesNotInKube = append(nodesNotInKube, fmt.Sprintf("id %q in %q", - cpNode, - cloudProviderNodes[cpNode].NodeGroupName())) + if _, ok := kubeNodes[cpNode]; !ok { + nodesNotInKube = append(nodesNotInKube, + fmt.Sprintf("id %q in %q", + cpNode, + cloudProviderNodes[cpNode].NodeGroupName(), + ), + ) } } From 11a47673fda778040e07e15e2c2db164fd6f13f8 Mon Sep 17 00:00:00 2001 From: vportella Date: Fri, 19 Jul 2024 15:52:43 +1000 Subject: [PATCH 2/7] Fix util tests to account for changes --- .../transitioner/util_test.go | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/util_test.go b/pkg/controller/cyclenoderequest/transitioner/util_test.go index 97e2671..2c9aafc 100644 --- a/pkg/controller/cyclenoderequest/transitioner/util_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/util_test.go @@ -45,17 +45,17 @@ func TestFindOffendingNodes(t *testing.T) { tests := []struct { name string - knodes []corev1.Node + knodes map[string]corev1.Node cnodes map[string]cloudprovider.Instance expectNotInCPNodeGroup []string expectNotInKube []string }{ { "kube nodes match cloud provider nodes", - []corev1.Node{ - buildNode(dummyInstanceA), - buildNode(dummyInstanceB), - buildNode(dummyInstanceC), + map[string]corev1.Node{ + dummyInstanceA.providerID: buildNode(dummyInstanceA), + dummyInstanceB.providerID: buildNode(dummyInstanceB), + dummyInstanceC.providerID: buildNode(dummyInstanceC), }, map[string]cloudprovider.Instance{ dummyInstanceA.providerID: &dummyInstanceA, @@ -67,10 +67,10 @@ func TestFindOffendingNodes(t *testing.T) { }, { "more nodes in kube than cloud provider", - []corev1.Node{ - buildNode(dummyInstanceA), - buildNode(dummyInstanceB), - buildNode(dummyInstanceC), + map[string]corev1.Node{ + dummyInstanceA.providerID: buildNode(dummyInstanceA), + dummyInstanceB.providerID: buildNode(dummyInstanceB), + dummyInstanceC.providerID: buildNode(dummyInstanceC), }, map[string]cloudprovider.Instance{ dummyInstanceA.providerID: &dummyInstanceA, @@ -81,9 +81,9 @@ func TestFindOffendingNodes(t *testing.T) { }, { "more nodes in cloud provider than kube", - []corev1.Node{ - buildNode(dummyInstanceA), - buildNode(dummyInstanceB), + map[string]corev1.Node{ + dummyInstanceA.providerID: buildNode(dummyInstanceA), + dummyInstanceB.providerID: buildNode(dummyInstanceB), }, map[string]cloudprovider.Instance{ dummyInstanceA.providerID: &dummyInstanceA, @@ -95,9 +95,9 @@ func TestFindOffendingNodes(t *testing.T) { }, { "no nodes in cloud provider", - []corev1.Node{ - buildNode(dummyInstanceA), - buildNode(dummyInstanceB), + map[string]corev1.Node{ + dummyInstanceA.providerID: buildNode(dummyInstanceA), + dummyInstanceB.providerID: buildNode(dummyInstanceB), }, map[string]cloudprovider.Instance{}, []string{"id \"aws:///us-east-1a/i-abcdefghijk\"", "id \"aws:///us-east-1b/i-bbcdefghijk\""}, @@ -105,7 +105,7 @@ func TestFindOffendingNodes(t *testing.T) { }, { "no nodes in kube", - []corev1.Node{}, + make(map[string]corev1.Node), map[string]cloudprovider.Instance{ dummyInstanceA.providerID: &dummyInstanceA, dummyInstanceB.providerID: &dummyInstanceB, @@ -115,7 +115,7 @@ func TestFindOffendingNodes(t *testing.T) { }, { "both cloud provider and kube nodes are empty", - []corev1.Node{}, + make(map[string]corev1.Node), map[string]cloudprovider.Instance{}, []string{}, []string{}, From 01b396e41aacbff67201ae1295a39e3cead8717e Mon Sep 17 00:00:00 2001 From: vportella Date: Mon, 22 Jul 2024 15:31:16 +1000 Subject: [PATCH 3/7] findOffendingNodes returns usable maps of nodes rather than lists of strings --- .../transitioner/transitions.go | 20 +++++++- .../cyclenoderequest/transitioner/util.go | 21 +++------ .../transitioner/util_test.go | 47 ++++++++++++------- 3 files changed, 54 insertions(+), 34 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index 3ec6af6..8692b86 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -111,8 +111,16 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(kubeNodes, nodeGroupInstances) if len(nodesNotInCPNodeGroup) > 0 { + providerIDs := make([]string, 0) + + for providerID := range nodesNotInCPNodeGroup { + providerIDs = append(providerIDs, + fmt.Sprintf("id %q", providerID), + ) + } + offendingNodesInfo += "nodes not in node group: " - offendingNodesInfo += strings.Join(nodesNotInCPNodeGroup, ",") + offendingNodesInfo += strings.Join(providerIDs, ",") } if len(nodesNotInKube) > 0 { @@ -120,8 +128,16 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er offendingNodesInfo += ";" } + providerIDs := make([]string, 0) + + for providerID, node := range nodesNotInKube { + providerIDs = append(providerIDs, + fmt.Sprintf("id %q in %q", providerID, node.NodeGroupName()), + ) + } + offendingNodesInfo += "nodes not inside cluster: " - offendingNodesInfo += strings.Join(nodesNotInKube, ",") + offendingNodesInfo += strings.Join(providerIDs, ",") } t.rm.LogEvent(t.cycleNodeRequest, "NodeCountMismatch", diff --git a/pkg/controller/cyclenoderequest/transitioner/util.go b/pkg/controller/cyclenoderequest/transitioner/util.go index 1564f9f..8bade32 100644 --- a/pkg/controller/cyclenoderequest/transitioner/util.go +++ b/pkg/controller/cyclenoderequest/transitioner/util.go @@ -253,26 +253,19 @@ func (t *CycleNodeRequestTransitioner) checkIfTransitioning(numNodesToCycle, num // findOffendingNodes finds the offending nodes information which cause number of nodes mismatch between // cloud provider node group and nodes inside kubernetes cluster using label selector -func findOffendingNodes(kubeNodes map[string]corev1.Node, cloudProviderNodes map[string]cloudprovider.Instance) ([]string, []string) { - var nodesNotInCPNodeGroup []string - var nodesNotInKube []string +func findOffendingNodes(kubeNodes map[string]corev1.Node, cloudProviderNodes map[string]cloudprovider.Instance) (map[string]corev1.Node, map[string]cloudprovider.Instance) { + var nodesNotInCPNodeGroup = make(map[string]corev1.Node) + var nodesNotInKube = make(map[string]cloudprovider.Instance) for _, kubeNode := range kubeNodes { if _, ok := cloudProviderNodes[kubeNode.Spec.ProviderID]; !ok { - nodesNotInCPNodeGroup = append(nodesNotInCPNodeGroup, - fmt.Sprintf("id %q", kubeNode.Spec.ProviderID), - ) + nodesNotInCPNodeGroup[kubeNode.Spec.ProviderID] = kubeNode } } - for cpNode := range cloudProviderNodes { - if _, ok := kubeNodes[cpNode]; !ok { - nodesNotInKube = append(nodesNotInKube, - fmt.Sprintf("id %q in %q", - cpNode, - cloudProviderNodes[cpNode].NodeGroupName(), - ), - ) + for providerID, cpNode := range cloudProviderNodes { + if _, ok := kubeNodes[providerID]; !ok { + nodesNotInKube[providerID] = cpNode } } diff --git a/pkg/controller/cyclenoderequest/transitioner/util_test.go b/pkg/controller/cyclenoderequest/transitioner/util_test.go index 2c9aafc..22c326e 100644 --- a/pkg/controller/cyclenoderequest/transitioner/util_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/util_test.go @@ -1,6 +1,7 @@ package transitioner import ( + "reflect" "testing" "github.com/atlassian-labs/cyclops/pkg/cloudprovider" @@ -47,8 +48,8 @@ func TestFindOffendingNodes(t *testing.T) { name string knodes map[string]corev1.Node cnodes map[string]cloudprovider.Instance - expectNotInCPNodeGroup []string - expectNotInKube []string + expectNotInCPNodeGroup map[string]corev1.Node + expectNotInKube map[string]cloudprovider.Instance }{ { "kube nodes match cloud provider nodes", @@ -62,8 +63,8 @@ func TestFindOffendingNodes(t *testing.T) { dummyInstanceB.providerID: &dummyInstanceB, dummyInstanceC.providerID: &dummyInstanceC, }, - []string{}, - []string{}, + make(map[string]corev1.Node), + make(map[string]cloudprovider.Instance), }, { "more nodes in kube than cloud provider", @@ -76,8 +77,10 @@ func TestFindOffendingNodes(t *testing.T) { dummyInstanceA.providerID: &dummyInstanceA, dummyInstanceB.providerID: &dummyInstanceB, }, - []string{"id \"aws:///us-east-1c/i-cbcdefghijk\""}, - []string{}, + map[string]corev1.Node{ + dummyInstanceC.providerID: buildNode(dummyInstanceC), + }, + make(map[string]cloudprovider.Instance), }, { "more nodes in cloud provider than kube", @@ -90,8 +93,10 @@ func TestFindOffendingNodes(t *testing.T) { dummyInstanceB.providerID: &dummyInstanceB, dummyInstanceC.providerID: &dummyInstanceC, }, - []string{}, - []string{"id \"aws:///us-east-1c/i-cbcdefghijk\" in \"GroupC\""}, + make(map[string]corev1.Node), + map[string]cloudprovider.Instance{ + dummyInstanceC.providerID: &dummyInstanceC, + }, }, { "no nodes in cloud provider", @@ -99,9 +104,12 @@ func TestFindOffendingNodes(t *testing.T) { dummyInstanceA.providerID: buildNode(dummyInstanceA), dummyInstanceB.providerID: buildNode(dummyInstanceB), }, - map[string]cloudprovider.Instance{}, - []string{"id \"aws:///us-east-1a/i-abcdefghijk\"", "id \"aws:///us-east-1b/i-bbcdefghijk\""}, - []string{}, + make(map[string]cloudprovider.Instance), + map[string]corev1.Node{ + dummyInstanceA.providerID: buildNode(dummyInstanceA), + dummyInstanceB.providerID: buildNode(dummyInstanceB), + }, + make(map[string]cloudprovider.Instance), }, { "no nodes in kube", @@ -110,23 +118,26 @@ func TestFindOffendingNodes(t *testing.T) { dummyInstanceA.providerID: &dummyInstanceA, dummyInstanceB.providerID: &dummyInstanceB, }, - []string{}, - []string{"id \"aws:///us-east-1a/i-abcdefghijk\" in \"GroupA\"", "id \"aws:///us-east-1b/i-bbcdefghijk\" in \"GroupB\""}, + make(map[string]corev1.Node), + map[string]cloudprovider.Instance{ + dummyInstanceA.providerID: &dummyInstanceA, + dummyInstanceB.providerID: &dummyInstanceB, + }, }, { "both cloud provider and kube nodes are empty", make(map[string]corev1.Node), - map[string]cloudprovider.Instance{}, - []string{}, - []string{}, + make(map[string]cloudprovider.Instance), + make(map[string]corev1.Node), + make(map[string]cloudprovider.Instance), }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(test.knodes, test.cnodes) - assert.ElementsMatch(t, test.expectNotInCPNodeGroup, nodesNotInCPNodeGroup) - assert.ElementsMatch(t, test.expectNotInKube, nodesNotInKube) + assert.Equal(t, true, reflect.DeepEqual(test.expectNotInCPNodeGroup, nodesNotInCPNodeGroup)) + assert.Equal(t, true, reflect.DeepEqual(test.expectNotInKube, nodesNotInKube)) }) } } From 5e8160cacdaeee3c54fd87e131acb50957ab6f49 Mon Sep 17 00:00:00 2001 From: vportella Date: Tue, 13 Aug 2024 20:43:19 +1000 Subject: [PATCH 4/7] Reduce lenient validation to just missing node names and add strict validation option --- .../atlassian.com_cyclenoderequests_crd.yaml | 9 +++ .../atlassian.com_cyclenodestatuses_crd.yaml | 7 ++ deploy/crds/atlassian.com_nodegroups_crd.yaml | 7 ++ pkg/apis/atlassian/v1/common_types.go | 5 ++ .../atlassian/v1/zz_generated.deepcopy.go | 1 + .../cyclenoderequest/transitioner/node.go | 9 ++- .../transitioner/transitions.go | 9 ++- .../transitioner/transitions_pending_test.go | 64 +++++++++++++++++-- 8 files changed, 105 insertions(+), 6 deletions(-) diff --git a/deploy/crds/atlassian.com_cyclenoderequests_crd.yaml b/deploy/crds/atlassian.com_cyclenoderequests_crd.yaml index 9dc164f..da37872 100644 --- a/deploy/crds/atlassian.com_cyclenoderequests_crd.yaml +++ b/deploy/crds/atlassian.com_cyclenoderequests_crd.yaml @@ -44,6 +44,8 @@ spec: of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' type: string + clusterName: + type: string kind: description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client @@ -101,6 +103,13 @@ spec: - Drain - Wait type: string + strictValidation: + description: StrictValidation is a boolean which determines whether + named nodes selected in a CNR must exist and be valid nodes + before cycling can begin. If set to true when invalid nodes + are selected the CNR will be transitioned to the "Failed" phase + before cycling can begin again. + type: boolean required: - method type: object diff --git a/deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml b/deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml index f53d3e4..358ea72 100644 --- a/deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml +++ b/deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml @@ -101,6 +101,13 @@ spec: - Drain - Wait type: string + strictValidation: + description: StrictValidation is a boolean which determines whether + named nodes selected in a CNR must exist and be valid nodes + before cycling can begin. If set to true when invalid nodes + are selected the CNR will be transitioned to the "Failed" phase + before cycling can begin again. + type: boolean required: - method type: object diff --git a/deploy/crds/atlassian.com_nodegroups_crd.yaml b/deploy/crds/atlassian.com_nodegroups_crd.yaml index d75ee73..08b822c 100644 --- a/deploy/crds/atlassian.com_nodegroups_crd.yaml +++ b/deploy/crds/atlassian.com_nodegroups_crd.yaml @@ -93,6 +93,13 @@ spec: - Drain - Wait type: string + strictValidation: + description: StrictValidation is a boolean which determines whether + named nodes selected in a CNR must exist and be valid nodes + before cycling can begin. If set to true when invalid nodes + are selected the CNR will be transitioned to the "Failed" phase + before cycling can begin again. + type: boolean required: - method type: object diff --git a/pkg/apis/atlassian/v1/common_types.go b/pkg/apis/atlassian/v1/common_types.go index 09236dc..98e71f6 100644 --- a/pkg/apis/atlassian/v1/common_types.go +++ b/pkg/apis/atlassian/v1/common_types.go @@ -46,6 +46,11 @@ type CycleSettings struct { // in-progress CNS request timeout from the time it's worked on by the controller. // If no cyclingTimeout is provided, CNS will use the default controller CNS cyclingTimeout. CyclingTimeout *metav1.Duration `json:"cyclingTimeout,omitempty"` + + // StrictValidation is a boolean which determines whether named nodes selected in a CNR must + // exist and be valid nodes before cycling can begin. If set to true when invalid nodes are + // selected the CNR will be transitioned to the "Failed" phase before cycling can begin again. + StrictValidation bool `json:"strictValidation,omitempty"` } // HealthCheck defines the health check configuration for the NodeGroup diff --git a/pkg/apis/atlassian/v1/zz_generated.deepcopy.go b/pkg/apis/atlassian/v1/zz_generated.deepcopy.go index 04671d9..5df3ee0 100644 --- a/pkg/apis/atlassian/v1/zz_generated.deepcopy.go +++ b/pkg/apis/atlassian/v1/zz_generated.deepcopy.go @@ -1,3 +1,4 @@ +//go:build !ignore_autogenerated // +build !ignore_autogenerated // Code generated by operator-sdk. DO NOT EDIT. diff --git a/pkg/controller/cyclenoderequest/transitioner/node.go b/pkg/controller/cyclenoderequest/transitioner/node.go index b22f119..603dbd6 100644 --- a/pkg/controller/cyclenoderequest/transitioner/node.go +++ b/pkg/controller/cyclenoderequest/transitioner/node.go @@ -105,7 +105,7 @@ func (t *CycleNodeRequestTransitioner) getNodesToTerminate(numNodes int64) (node // addNamedNodesToTerminate adds the named nodes for this CycleNodeRequest to the list of nodes to terminate. // Skips any named node that does not exist in the node group for this CycleNodeRequest. -func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[string]corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) { +func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[string]corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) error { kubeNodesMap := make(map[string]corev1.Node) for _, node := range kubeNodes { @@ -117,6 +117,11 @@ func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[st if !found { t.rm.Logger.Info("could not find node by name, skipping", "nodeName", namedNode) + + if t.cycleNodeRequest.Spec.CycleSettings.StrictValidation { + return fmt.Errorf("could not find node by name: %v", namedNode) + } + continue } @@ -130,6 +135,8 @@ func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[st newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()), ) } + + return nil } // newCycleNodeRequestNode converts a corev1.Node to a v1.CycleNodeRequestNode. This is done multiple diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index f16bde8..c122b89 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -70,6 +70,10 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er return t.transitionToHealing(err) } + if len(kubeNodes) == 0 { + return t.transitionToHealing(fmt.Errorf("no nodes matched selector")) + } + // Only retain nodes which still exist inside cloud provider var nodeProviderIDs []string @@ -176,7 +180,10 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er if len(t.cycleNodeRequest.Spec.NodeNames) > 0 { // If specific node names are provided, check they actually exist in the node group t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding named nodes to NodesToTerminate") - t.addNamedNodesToTerminate(kubeNodes, nodeGroupInstances) + err := t.addNamedNodesToTerminate(kubeNodes, nodeGroupInstances) + if err != nil { + return t.transitionToHealing(err) + } } else { // Otherwise just add all the nodes in the node group t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding all node group nodes to NodesToTerminate") diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go index 9c85f63..a5cccdd 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go @@ -59,7 +59,9 @@ func TestPendingSimpleCase(t *testing.T) { assert.Equal(t, cnr.Status.NumNodesCycled, 0) } -// Test to ensure the Pending phase will accept a CNR with a correct named node. +// Test to ensure the Pending phase will reject a CNR with a named node that +// does not match any of the nodes matching the node selector if strict +// validation is enabled. It should error out immediately. func TestPendingWithNamedNode(t *testing.T) { nodegroup, err := mock.NewNodegroup("ng-1", 2) if err != nil { @@ -108,9 +110,10 @@ func TestPendingWithNamedNode(t *testing.T) { assert.Equal(t, cnr.Status.NumNodesCycled, 0) } -// Test to ensure the Pending phase will reject a CNR with a named node that -// does not match any of the nodes matching the node selector. It should error -// out immediately. +// Test to ensure the Pending phase will accept a CNR with a named node that +// does not match any of the nodes matching the node selector if strict +// validation is not enabled. It will just select the select the nodes that +// exist. func TestPendingWrongNamedNode(t *testing.T) { nodegroup, err := mock.NewNodegroup("ng-1", 2) if err != nil { @@ -148,6 +151,59 @@ func TestPendingWrongNamedNode(t *testing.T) { WithCloudProviderInstances(nodegroup), ) + result, err := fakeTransitioner.Run() + assert.NoError(t, err) + assert.True(t, result.Requeue) + + // It should move to the Initialised phase and set up the status of the CNR + // in a predictable manner + assert.Equal(t, v1.CycleNodeRequestInitialised, cnr.Status.Phase) + assert.Len(t, cnr.Status.NodesToTerminate, 1) + assert.Equal(t, cnr.Status.ActiveChildren, int64(0)) + assert.Equal(t, cnr.Status.NumNodesCycled, 0) +} + +// Test to ensure the Pending phase will reject a CNR with a named node that +// does not match any of the nodes matching the node selector if strict +// validation is enabled. It should error out immediately. +func TestPendingWrongNamedNodeStrictValidation(t *testing.T) { + nodegroup, err := mock.NewNodegroup("ng-1", 2) + if err != nil { + assert.NoError(t, err) + } + + cnr := &v1.CycleNodeRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cnr-1", + Namespace: "kube-system", + }, + Spec: v1.CycleNodeRequestSpec{ + NodeGroupsList: []string{"ng-1"}, + CycleSettings: v1.CycleSettings{ + Concurrency: 1, + Method: v1.CycleNodeRequestMethodDrain, + StrictValidation: true, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "customer": "kitt", + }, + }, + NodeNames: []string{ + "ng-1-node-0", + "ng-1-node-2", + }, + }, + Status: v1.CycleNodeRequestStatus{ + Phase: v1.CycleNodeRequestPending, + }, + } + + fakeTransitioner := NewFakeTransitioner(cnr, + WithKubeNodes(nodegroup), + WithCloudProviderInstances(nodegroup), + ) + _, err = fakeTransitioner.Run() assert.Error(t, err) assert.Equal(t, v1.CycleNodeRequestHealing, cnr.Status.Phase) From 659206b284d9f904f428e55dc485f6a7021d8dae Mon Sep 17 00:00:00 2001 From: vportella Date: Tue, 13 Aug 2024 20:45:49 +1000 Subject: [PATCH 5/7] rename variable for clarity --- pkg/controller/cyclenoderequest/transitioner/node.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/node.go b/pkg/controller/cyclenoderequest/transitioner/node.go index 603dbd6..dc9ad11 100644 --- a/pkg/controller/cyclenoderequest/transitioner/node.go +++ b/pkg/controller/cyclenoderequest/transitioner/node.go @@ -106,14 +106,14 @@ func (t *CycleNodeRequestTransitioner) getNodesToTerminate(numNodes int64) (node // addNamedNodesToTerminate adds the named nodes for this CycleNodeRequest to the list of nodes to terminate. // Skips any named node that does not exist in the node group for this CycleNodeRequest. func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[string]corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) error { - kubeNodesMap := make(map[string]corev1.Node) + nodeLookupByName := make(map[string]corev1.Node) for _, node := range kubeNodes { - kubeNodesMap[node.Name] = node + nodeLookupByName[node.Name] = node } for _, namedNode := range t.cycleNodeRequest.Spec.NodeNames { - kubeNode, found := kubeNodesMap[namedNode] + kubeNode, found := nodeLookupByName[namedNode] if !found { t.rm.Logger.Info("could not find node by name, skipping", "nodeName", namedNode) From 2b6d1db3fb2f8f2e203d438a7f3af9c87e537e5e Mon Sep 17 00:00:00 2001 From: vportella Date: Fri, 16 Aug 2024 13:38:59 +1000 Subject: [PATCH 6/7] Update CNR with new ValidationOptions sub-section --- .../atlassian.com_cyclenoderequests_crd.yaml | 20 ++++++++++++------- .../atlassian.com_cyclenodestatuses_crd.yaml | 7 ------- deploy/crds/atlassian.com_nodegroups_crd.yaml | 7 ------- pkg/apis/atlassian/v1/common_types.go | 5 ----- .../atlassian/v1/cyclenoderequest_types.go | 13 ++++++++++++ .../atlassian/v1/zz_generated.deepcopy.go | 17 ++++++++++++++++ .../cyclenoderequest/transitioner/node.go | 3 ++- .../transitioner/transitions_pending_test.go | 12 ++++++----- 8 files changed, 52 insertions(+), 32 deletions(-) diff --git a/deploy/crds/atlassian.com_cyclenoderequests_crd.yaml b/deploy/crds/atlassian.com_cyclenoderequests_crd.yaml index da37872..d6225e5 100644 --- a/deploy/crds/atlassian.com_cyclenoderequests_crd.yaml +++ b/deploy/crds/atlassian.com_cyclenoderequests_crd.yaml @@ -103,13 +103,6 @@ spec: - Drain - Wait type: string - strictValidation: - description: StrictValidation is a boolean which determines whether - named nodes selected in a CNR must exist and be valid nodes - before cycling can begin. If set to true when invalid nodes - are selected the CNR will be transitioned to the "Failed" phase - before cycling can begin again. - type: boolean required: - method type: object @@ -336,6 +329,19 @@ spec: description: SkipPreTerminationChecks is an optional flag to skip pre-termination checks during cycling type: boolean + validationOptions: + description: ValidationOptions stores the settings to use for validating + state of nodegroups in kube and the cloud provider for cycling the + nodes. + properties: + skipMissingNamedNodes: + description: SkipMissingNodeNames is a boolean which determines + whether named nodes selected in a CNR must exist and be valid + nodes before cycling can begin. If set to true named nodes which + don't exist will be ignored rather than transitioning the CNR + to the failed phase. + type: boolean + type: object required: - cycleSettings - nodeGroupName diff --git a/deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml b/deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml index 358ea72..f53d3e4 100644 --- a/deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml +++ b/deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml @@ -101,13 +101,6 @@ spec: - Drain - Wait type: string - strictValidation: - description: StrictValidation is a boolean which determines whether - named nodes selected in a CNR must exist and be valid nodes - before cycling can begin. If set to true when invalid nodes - are selected the CNR will be transitioned to the "Failed" phase - before cycling can begin again. - type: boolean required: - method type: object diff --git a/deploy/crds/atlassian.com_nodegroups_crd.yaml b/deploy/crds/atlassian.com_nodegroups_crd.yaml index 08b822c..d75ee73 100644 --- a/deploy/crds/atlassian.com_nodegroups_crd.yaml +++ b/deploy/crds/atlassian.com_nodegroups_crd.yaml @@ -93,13 +93,6 @@ spec: - Drain - Wait type: string - strictValidation: - description: StrictValidation is a boolean which determines whether - named nodes selected in a CNR must exist and be valid nodes - before cycling can begin. If set to true when invalid nodes - are selected the CNR will be transitioned to the "Failed" phase - before cycling can begin again. - type: boolean required: - method type: object diff --git a/pkg/apis/atlassian/v1/common_types.go b/pkg/apis/atlassian/v1/common_types.go index 98e71f6..09236dc 100644 --- a/pkg/apis/atlassian/v1/common_types.go +++ b/pkg/apis/atlassian/v1/common_types.go @@ -46,11 +46,6 @@ type CycleSettings struct { // in-progress CNS request timeout from the time it's worked on by the controller. // If no cyclingTimeout is provided, CNS will use the default controller CNS cyclingTimeout. CyclingTimeout *metav1.Duration `json:"cyclingTimeout,omitempty"` - - // StrictValidation is a boolean which determines whether named nodes selected in a CNR must - // exist and be valid nodes before cycling can begin. If set to true when invalid nodes are - // selected the CNR will be transitioned to the "Failed" phase before cycling can begin again. - StrictValidation bool `json:"strictValidation,omitempty"` } // HealthCheck defines the health check configuration for the NodeGroup diff --git a/pkg/apis/atlassian/v1/cyclenoderequest_types.go b/pkg/apis/atlassian/v1/cyclenoderequest_types.go index c6009af..5d13c74 100644 --- a/pkg/apis/atlassian/v1/cyclenoderequest_types.go +++ b/pkg/apis/atlassian/v1/cyclenoderequest_types.go @@ -26,6 +26,10 @@ type CycleNodeRequestSpec struct { // CycleSettings stores the settings to use for cycling the nodes. CycleSettings CycleSettings `json:"cycleSettings"` + // ValidationOptions stores the settings to use for validating state of nodegroups + // in kube and the cloud provider for cycling the nodes. + ValidationOptions ValidationOptions `json:"validationOptions,omitempty"` + // HealthChecks stores the settings to configure instance custom health checks HealthChecks []HealthCheck `json:"healthChecks,omitempty"` @@ -107,6 +111,15 @@ type CycleNodeRequestNode struct { PrivateIP string `json:"privateIp,omitempty"` } +// ValidationOptions stores the settings to use for validating state of nodegroups +// in kube and the cloud provider for cycling the nodes. +type ValidationOptions struct { + // SkipMissingNodeNames is a boolean which determines whether named nodes selected in a CNR must + // exist and be valid nodes before cycling can begin. If set to true named nodes which don't exist + // will be ignored rather than transitioning the CNR to the failed phase. + SkipMissingNamedNodes bool `json:"skipMissingNamedNodes,omitempty"` +} + // HealthCheckStatus groups all health checks status information for a node type HealthCheckStatus struct { // Ready keeps track of the first timestamp at which the node status was reported as "ready" diff --git a/pkg/apis/atlassian/v1/zz_generated.deepcopy.go b/pkg/apis/atlassian/v1/zz_generated.deepcopy.go index 5df3ee0..09dfe35 100644 --- a/pkg/apis/atlassian/v1/zz_generated.deepcopy.go +++ b/pkg/apis/atlassian/v1/zz_generated.deepcopy.go @@ -102,6 +102,7 @@ func (in *CycleNodeRequestSpec) DeepCopyInto(out *CycleNodeRequestSpec) { copy(*out, *in) } in.CycleSettings.DeepCopyInto(&out.CycleSettings) + out.ValidationOptions = in.ValidationOptions if in.HealthChecks != nil { in, out := &in.HealthChecks, &out.HealthChecks *out = make([]HealthCheck, len(*in)) @@ -585,3 +586,19 @@ func (in *TLSConfig) DeepCopy() *TLSConfig { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ValidationOptions) DeepCopyInto(out *ValidationOptions) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ValidationOptions. +func (in *ValidationOptions) DeepCopy() *ValidationOptions { + if in == nil { + return nil + } + out := new(ValidationOptions) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/controller/cyclenoderequest/transitioner/node.go b/pkg/controller/cyclenoderequest/transitioner/node.go index dc9ad11..d6b7b53 100644 --- a/pkg/controller/cyclenoderequest/transitioner/node.go +++ b/pkg/controller/cyclenoderequest/transitioner/node.go @@ -118,10 +118,11 @@ func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[st if !found { t.rm.Logger.Info("could not find node by name, skipping", "nodeName", namedNode) - if t.cycleNodeRequest.Spec.CycleSettings.StrictValidation { + if !t.cycleNodeRequest.Spec.ValidationOptions.SkipMissingNamedNodes { return fmt.Errorf("could not find node by name: %v", namedNode) } + t.rm.LogEvent(t.cycleNodeRequest, "SkipMissingNamedNode", "Named node %s not found", namedNode) continue } diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go index a5cccdd..c95ccaa 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go @@ -114,7 +114,7 @@ func TestPendingWithNamedNode(t *testing.T) { // does not match any of the nodes matching the node selector if strict // validation is not enabled. It will just select the select the nodes that // exist. -func TestPendingWrongNamedNode(t *testing.T) { +func TestPendingWrongNamedNodeSkipMissingNamedNodes(t *testing.T) { nodegroup, err := mock.NewNodegroup("ng-1", 2) if err != nil { assert.NoError(t, err) @@ -131,6 +131,9 @@ func TestPendingWrongNamedNode(t *testing.T) { Concurrency: 1, Method: v1.CycleNodeRequestMethodDrain, }, + ValidationOptions: v1.ValidationOptions{ + SkipMissingNamedNodes: true, + }, Selector: metav1.LabelSelector{ MatchLabels: map[string]string{ "customer": "kitt", @@ -166,7 +169,7 @@ func TestPendingWrongNamedNode(t *testing.T) { // Test to ensure the Pending phase will reject a CNR with a named node that // does not match any of the nodes matching the node selector if strict // validation is enabled. It should error out immediately. -func TestPendingWrongNamedNodeStrictValidation(t *testing.T) { +func TestPendingWrongNamedNode(t *testing.T) { nodegroup, err := mock.NewNodegroup("ng-1", 2) if err != nil { assert.NoError(t, err) @@ -180,9 +183,8 @@ func TestPendingWrongNamedNodeStrictValidation(t *testing.T) { Spec: v1.CycleNodeRequestSpec{ NodeGroupsList: []string{"ng-1"}, CycleSettings: v1.CycleSettings{ - Concurrency: 1, - Method: v1.CycleNodeRequestMethodDrain, - StrictValidation: true, + Concurrency: 1, + Method: v1.CycleNodeRequestMethodDrain, }, Selector: metav1.LabelSelector{ MatchLabels: map[string]string{ From 370be5831d2f52501b9bc83ecb3488c875988489 Mon Sep 17 00:00:00 2001 From: vportella Date: Fri, 16 Aug 2024 15:09:20 +1000 Subject: [PATCH 7/7] Add validationOptions to nodegroup spec as well --- deploy/crds/atlassian.com_nodegroups_crd.yaml | 13 +++++++++++++ docs/automation/README.md | 2 ++ docs/cycling/README.md | 5 +++++ pkg/apis/atlassian/v1/common_types.go | 9 +++++++++ pkg/apis/atlassian/v1/cyclenoderequest_types.go | 9 --------- pkg/apis/atlassian/v1/nodegroup_types.go | 4 ++++ pkg/apis/atlassian/v1/zz_generated.deepcopy.go | 1 + pkg/generation/cnr.go | 7 +++++-- 8 files changed, 39 insertions(+), 11 deletions(-) diff --git a/deploy/crds/atlassian.com_nodegroups_crd.yaml b/deploy/crds/atlassian.com_nodegroups_crd.yaml index d75ee73..b537504 100644 --- a/deploy/crds/atlassian.com_nodegroups_crd.yaml +++ b/deploy/crds/atlassian.com_nodegroups_crd.yaml @@ -311,6 +311,19 @@ spec: description: SkipPreTerminationChecks is an optional flag to skip pre-termination checks during cycling type: boolean + validationOptions: + description: ValidationOptions stores the settings to use for validating + state of nodegroups in kube and the cloud provider for cycling the + nodes. + properties: + skipMissingNamedNodes: + description: SkipMissingNodeNames is a boolean which determines + whether named nodes selected in a CNR must exist and be valid + nodes before cycling can begin. If set to true named nodes which + don't exist will be ignored rather than transitioning the CNR + to the failed phase. + type: boolean + type: object required: - cycleSettings - nodeGroupName diff --git a/docs/automation/README.md b/docs/automation/README.md index 292ca21..b793f8b 100644 --- a/docs/automation/README.md +++ b/docs/automation/README.md @@ -34,6 +34,8 @@ spec: cycleSettings: method: Drain concurrency: 1 + validationOptions: + skipMissingNamedNodes: true healthChecks: - endpoint: http://{{ .NodeIP }}:8080/ready regexMatch: Ready diff --git a/docs/cycling/README.md b/docs/cycling/README.md index e17cfcf..4f374e9 100644 --- a/docs/cycling/README.md +++ b/docs/cycling/README.md @@ -106,6 +106,11 @@ spec: - "node-name-A" - "node-name-B" + # Optional section - collection of validation options to define stricter or more lenient validation during cycling. + validationOptions: + # Optional field - Skip node names defined in the CNR that do not match any existing nodes in the Kubernetes API. + skipMissingNamedNodes: true|false + cycleNodeSettings: # Method can be "Wait" or "Drain", defaults to "Drain" if not provided # "Wait" will wait for pods on the node to complete, while "Drain" will forcefully drain them off the node diff --git a/pkg/apis/atlassian/v1/common_types.go b/pkg/apis/atlassian/v1/common_types.go index 09236dc..181cea6 100644 --- a/pkg/apis/atlassian/v1/common_types.go +++ b/pkg/apis/atlassian/v1/common_types.go @@ -104,3 +104,12 @@ type TLSConfig struct { // sent as part of the request to the upstream host for mTLS. Key string `json:"key,omitempty"` } + +// ValidationOptions stores the settings to use for validating state of nodegroups +// in kube and the cloud provider for cycling the nodes. +type ValidationOptions struct { + // SkipMissingNodeNames is a boolean which determines whether named nodes selected in a CNR must + // exist and be valid nodes before cycling can begin. If set to true named nodes which don't exist + // will be ignored rather than transitioning the CNR to the failed phase. + SkipMissingNamedNodes bool `json:"skipMissingNamedNodes,omitempty"` +} diff --git a/pkg/apis/atlassian/v1/cyclenoderequest_types.go b/pkg/apis/atlassian/v1/cyclenoderequest_types.go index 5d13c74..867fabd 100644 --- a/pkg/apis/atlassian/v1/cyclenoderequest_types.go +++ b/pkg/apis/atlassian/v1/cyclenoderequest_types.go @@ -111,15 +111,6 @@ type CycleNodeRequestNode struct { PrivateIP string `json:"privateIp,omitempty"` } -// ValidationOptions stores the settings to use for validating state of nodegroups -// in kube and the cloud provider for cycling the nodes. -type ValidationOptions struct { - // SkipMissingNodeNames is a boolean which determines whether named nodes selected in a CNR must - // exist and be valid nodes before cycling can begin. If set to true named nodes which don't exist - // will be ignored rather than transitioning the CNR to the failed phase. - SkipMissingNamedNodes bool `json:"skipMissingNamedNodes,omitempty"` -} - // HealthCheckStatus groups all health checks status information for a node type HealthCheckStatus struct { // Ready keeps track of the first timestamp at which the node status was reported as "ready" diff --git a/pkg/apis/atlassian/v1/nodegroup_types.go b/pkg/apis/atlassian/v1/nodegroup_types.go index 52d4c69..9d0e0e0 100644 --- a/pkg/apis/atlassian/v1/nodegroup_types.go +++ b/pkg/apis/atlassian/v1/nodegroup_types.go @@ -19,6 +19,10 @@ type NodeGroupSpec struct { // CycleSettings stores the settings to use for cycling the nodes. CycleSettings CycleSettings `json:"cycleSettings"` + // ValidationOptions stores the settings to use for validating state of nodegroups + // in kube and the cloud provider for cycling the nodes. + ValidationOptions ValidationOptions `json:"validationOptions,omitempty"` + // Healthchecks stores the settings to configure instance custom health checks HealthChecks []HealthCheck `json:"healthChecks,omitempty"` diff --git a/pkg/apis/atlassian/v1/zz_generated.deepcopy.go b/pkg/apis/atlassian/v1/zz_generated.deepcopy.go index 09dfe35..dbc0e9d 100644 --- a/pkg/apis/atlassian/v1/zz_generated.deepcopy.go +++ b/pkg/apis/atlassian/v1/zz_generated.deepcopy.go @@ -462,6 +462,7 @@ func (in *NodeGroupSpec) DeepCopyInto(out *NodeGroupSpec) { } in.NodeSelector.DeepCopyInto(&out.NodeSelector) in.CycleSettings.DeepCopyInto(&out.CycleSettings) + out.ValidationOptions = in.ValidationOptions if in.HealthChecks != nil { in, out := &in.HealthChecks, &out.HealthChecks *out = make([]HealthCheck, len(*in)) diff --git a/pkg/generation/cnr.go b/pkg/generation/cnr.go index 9808b93..db5615d 100644 --- a/pkg/generation/cnr.go +++ b/pkg/generation/cnr.go @@ -3,9 +3,10 @@ package generation import ( "context" "fmt" - "github.com/atlassian-labs/cyclops/pkg/controller/cyclenoderequest" "strings" + "github.com/atlassian-labs/cyclops/pkg/controller/cyclenoderequest" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/validation" "sigs.k8s.io/controller-runtime/pkg/client" @@ -73,8 +74,9 @@ func GiveReason(cnr *atlassianv1.CycleNodeRequest, reason string) { } cnr.Annotations[cnrReasonAnnotationKey] = reason } + // SetAPIVersion adds apiVersion annotation to the cnr -func SetAPIVersion(cnr *atlassianv1.CycleNodeRequest, clientVersion string){ +func SetAPIVersion(cnr *atlassianv1.CycleNodeRequest, clientVersion string) { if cnr.Annotations == nil { cnr.Annotations = map[string]string{} } @@ -110,6 +112,7 @@ func GenerateCNR(nodeGroup atlassianv1.NodeGroup, nodes []string, name, namespac PreTerminationChecks: nodeGroup.Spec.PreTerminationChecks, SkipInitialHealthChecks: nodeGroup.Spec.SkipInitialHealthChecks, SkipPreTerminationChecks: nodeGroup.Spec.SkipPreTerminationChecks, + ValidationOptions: nodeGroup.Spec.ValidationOptions, }, } }