Skip named nodes in the CNR that don't exist #83
Changes from 2 commits
@@ -64,13 +64,11 @@ func (t *CycleNodeRequestTransitioner) transitionUndefined() (reconcile.Result,
func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, error) {
	// Fetch the node names for the cycleNodeRequest, using the label selector provided
	t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Selecting nodes with label selector")

	kubeNodes, err := t.listReadyNodes(true)
	if err != nil {
		return t.transitionToHealing(err)
	}
	if len(kubeNodes) == 0 {

Review comment: We're losing the ability to detect user error here. Is there a way we can keep user error detection while also not failing on nonexistent nodes? One strategy could be: if our selector did match some nodes, but none of them are still in the cluster, we can call it a success. If it matches no nodes, we don't call it a success. The only problem with this approach is node groups scaled to zero. Thoughts?

Reply: I don't really see how that's possible unless we have a way of knowing whether a node name in the CNR matches a node that existed prior to the CNR being created. A flag here would help to toggle strict node validation and enable both use cases, but I'm wary of adding more settings to a CNR.

(A rough sketch of the suggested check follows this hunk.)

		return t.transitionToHealing(fmt.Errorf("no nodes matched selector"))
	}

	// Only retain nodes which still exist inside cloud provider
	var nodeProviderIDs []string
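Regarding the review thread above, a minimal sketch of the reviewer's idea, assuming the transitioner could see both how many nodes the selector matched at all and how many of those still exist (neither count is surfaced this way in the PR; the function and parameter names below are made up):

```go
package sketch

import "fmt"

// classifyEmptySelection is illustrative only: treat "selector matched nothing"
// as a user error, but treat "selector matched nodes that no longer exist" as
// an acceptable no-op. As noted in the thread, a node group scaled to zero
// also matches nothing, so this heuristic cannot distinguish that case.
func classifyEmptySelection(matchedCount, existingCount int) error {
	if existingCount > 0 {
		// There are still nodes to cycle; nothing to classify.
		return nil
	}
	if matchedCount == 0 {
		// Nothing matched the selector at all: most likely a user error in the CNR.
		return fmt.Errorf("no nodes matched selector")
	}
	// The selector matched nodes, but none of them remain in the cluster:
	// treat the request as already satisfied rather than failing it.
	return nil
}
```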
@@ -83,48 +81,49 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er
	if err != nil {
		return t.transitionToHealing(errors.Wrap(err, "failed to check instances that exist from cloud provider"))
	}
	var existingKubeNodes []corev1.Node

	for _, node := range kubeNodes {
		for _, validProviderID := range existingProviderIDs {
			if node.Spec.ProviderID == validProviderID {
				existingKubeNodes = append(existingKubeNodes, node)
				break
			}
	existingKubeNodes := make(map[string]corev1.Node)

	for _, validProviderID := range existingProviderIDs {
		if node, found := kubeNodes[validProviderID]; found {
			existingKubeNodes[node.Spec.ProviderID] = node
		}
	}

	kubeNodes = existingKubeNodes

	if len(kubeNodes) == 0 {
		return t.transitionToHealing(fmt.Errorf("no existing nodes in cloud provider matched selector"))
	}

	// Describe the node group for the request
	t.rm.LogEvent(t.cycleNodeRequest, "FetchingNodeGroup", "Fetching node group: %v", t.cycleNodeRequest.GetNodeGroupNames())

	nodeGroups, err := t.rm.CloudProvider.GetNodeGroups(t.cycleNodeRequest.GetNodeGroupNames())
	if err != nil {
		return t.transitionToHealing(err)
	}

	// get instances inside cloud provider node groups
	nodeGroupInstances := nodeGroups.Instances()

	// Do some sanity checking before we start filtering things
	// Check the instance count of the node group matches the number of nodes found in Kubernetes
	if len(kubeNodes) != len(nodeGroupInstances) {
		nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(kubeNodes, nodeGroupInstances)
		var offendingNodesInfo string

		nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(kubeNodes, nodeGroupInstances)

		if len(nodesNotInCPNodeGroup) > 0 {
			offendingNodesInfo += "nodes not in node group: "
			offendingNodesInfo += strings.Join(nodesNotInCPNodeGroup, ",")
		}

		if len(nodesNotInKube) > 0 {
			if offendingNodesInfo != "" {
				offendingNodesInfo += ";"
			}

			offendingNodesInfo += "nodes not inside cluster: "
			offendingNodesInfo += strings.Join(nodesNotInKube, ",")
		}

		t.rm.LogEvent(t.cycleNodeRequest, "NodeCountMismatch",
			"node group: %v, kube: %v. %v",
			len(nodeGroupInstances), len(kubeNodes), offendingNodesInfo)
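A side note on the data-structure change in this hunk: kubeNodes is now consumed as a map keyed by provider ID, so filtering against the cloud provider's instance list becomes one lookup per provider ID instead of a nested scan. A minimal sketch of building such a lookup, assuming the keying really is Spec.ProviderID (the helper name is made up):

```go
package sketch

import corev1 "k8s.io/api/core/v1"

// buildNodeLookupByProviderID indexes nodes by provider ID so that membership
// checks against the cloud provider's instances are a map lookup. Illustrative
// only; the PR presumably builds this map inside listReadyNodes or similar.
func buildNodeLookupByProviderID(nodes []corev1.Node) map[string]corev1.Node {
	lookup := make(map[string]corev1.Node, len(nodes))
	for _, node := range nodes {
		lookup[node.Spec.ProviderID] = node
	}
	return lookup
}
```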
@@ -134,26 +133,28 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er
		if err != nil {
			return t.transitionToHealing(err)
		}

		if timedOut {
			err := fmt.Errorf(
				"node count mismatch, number of kubernetes of nodes does not match number of cloud provider instances after %v",
				nodeEquilibriumWaitLimit)
				"node count mismatch, number of kubernetes nodes does not match number of cloud provider instances after %v",
				nodeEquilibriumWaitLimit,
			)

			return t.transitionToHealing(err)
		}

		return reconcile.Result{Requeue: true, RequeueAfter: requeueDuration}, nil
	}

	// make a list of the nodes to terminate
	if len(t.cycleNodeRequest.Spec.NodeNames) > 0 {
		// If specific node names are provided, check they actually exist in the node group
		t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding named nodes to NodesToTerminate")
		err := t.addNamedNodesToTerminate(kubeNodes, nodeGroupInstances)
		if err != nil {
			return t.transitionToHealing(err)
		}
		t.addNamedNodesToTerminate(kubeNodes, nodeGroupInstances)
	} else {
		// Otherwise just add all the nodes in the node group
		t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding all node group nodes to NodesToTerminate")

		for _, kubeNode := range kubeNodes {
			// Check to ensure the kubeNode object maps to an existing node in the ASG
			// If this isn't the case, this is a phantom node. Fail the cnr to be safe.
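The call-site change just above drops the error return from addNamedNodesToTerminate, which is the behaviour the PR title describes: named nodes that no longer exist are skipped instead of failing the CNR. The helper's body is not part of this diff; the sketch below is a stand-in only, with assumed names and an assumed name-keyed map:

```go
package sketch

import (
	"github.com/go-logr/logr"
	corev1 "k8s.io/api/core/v1"
)

// selectNamedNodes is not the PR's implementation; it only illustrates the
// implied behaviour: resolve the CNR's named nodes against the nodes that
// still exist, and skip (but log) any names that no longer resolve.
func selectNamedNodes(names []string, nodesByName map[string]corev1.Node, log logr.Logger) []corev1.Node {
	selected := make([]corev1.Node, 0, len(names))
	for _, name := range names {
		node, found := nodesByName[name]
		if !found {
			log.Info("named node not found in the cluster, skipping", "nodeName", name)
			continue
		}
		selected = append(selected, node)
	}
	return selected
}
```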
@@ -205,7 +206,9 @@ func (t *CycleNodeRequestTransitioner) transitionInitialised() (reconcile.Result
	// The maximum nodes we can select are bounded by our concurrency. We take into account the number
	// of nodes we are already working on, and only introduce up to our concurrency cap more nodes in this step.
	maxNodesToSelect := t.cycleNodeRequest.Spec.CycleSettings.Concurrency - t.cycleNodeRequest.Status.ActiveChildren

	t.rm.Logger.Info("Selecting nodes to terminate", "numNodes", maxNodesToSelect)

	nodes, numNodesInProgress, err := t.getNodesToTerminate(maxNodesToSelect)
	if err != nil {
		return t.transitionToHealing(err)
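For completeness, the concurrency bound in this hunk is just a subtraction of in-flight work from the configured cap; a toy example with made-up numbers:

```go
package sketch

// maxNodesToSelect mirrors the arithmetic above: with a Concurrency of 5 and
// 2 cycle operations already active, at most 3 new nodes are selected.
// Toy illustration only; types and field handling follow the real code loosely.
func maxNodesToSelect(concurrency, activeChildren int64) int64 {
	return concurrency - activeChildren // e.g. 5 - 2 = 3
}
```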
Review comment: nit: name this after the function of the variable, something like nodeLookupByName. I kept trying to find it being used for something more than a lookup.

Reply: Good point. I initially intended for this to be a breaking change, but now that I think about it, making it configurable may be the better approach to cater to different use cases. 👍 I'll add some tests to validate the intended behaviour.
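On the "making it configurable" point in the reply above, one possible shape for such a toggle is sketched below. This is purely illustrative and not from this PR: the field name, placement, and default are assumptions.

```go
package sketch

// CycleNodeRequestSpecSketch illustrates one way the discussed flag could look:
// an opt-in return to strict validation of named nodes. Names are assumptions.
type CycleNodeRequestSpecSketch struct {
	// NodeNames lists specific nodes to cycle, as in the existing CRD field.
	NodeNames []string `json:"nodeNames,omitempty"`

	// StrictValidation, if set, fails the CycleNodeRequest when a named node
	// cannot be found, instead of skipping it.
	StrictValidation bool `json:"strictValidation,omitempty"`
}
```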