Skip to content

Commit

Permalink
test: update node termination test to model eventual removal (#1320)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmdeal authored Jun 15, 2024
1 parent b350b58 commit c51bcd7
Showing 1 changed file with 83 additions and 42 deletions.
125 changes: 83 additions & 42 deletions pkg/controllers/node/termination/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,25 +99,35 @@ var _ = Describe("Termination", func() {

Context("Reconciliation", func() {
It("should delete nodes", func() {
ExpectApplied(ctx, env.Client, node)
ExpectApplied(ctx, env.Client, node, nodeClaim)
Expect(env.Client.Delete(ctx, node)).To(Succeed())
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
It("should delete nodeclaims associated with nodes", func() {
ExpectApplied(ctx, env.Client, node, nodeClaim)
Expect(env.Client.Delete(ctx, node)).To(Succeed())
node = ExpectNodeExists(ctx, env.Client, node.Name)

// The first reconciliation should trigger the Delete, and set the terminating status condition
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectExists(ctx, env.Client, nodeClaim)
nc := ExpectExists(ctx, env.Client, nodeClaim)
Expect(nc.StatusConditions().Get(v1beta1.ConditionTypeTerminating).IsTrue()).To(BeTrue())
ExpectNodeExists(ctx, env.Client, node.Name)

// The second reconciliation should call get, see the "instance" is terminated, and remove the node.
// We should have deleted the NodeClaim from the node termination controller, so removing it's finalizer should result in it being removed.
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectFinalizersRemoved(ctx, env.Client, nodeClaim)
ExpectNotFound(ctx, env.Client, node, nodeClaim)
})
It("should not race if deleting nodes in parallel", func() {
const numNodes = 10
var nodes []*v1.Node
for i := 0; i < 10; i++ {
for i := 0; i < numNodes; i++ {
node = test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{v1beta1.TerminationFinalizer},
Expand All @@ -129,34 +139,39 @@ var _ = Describe("Termination", func() {
nodes = append(nodes, node)
}

var wg sync.WaitGroup
// this is enough to trip the race detector
for i := 0; i < 10; i++ {
wg.Add(1)
go func(node *v1.Node) {
defer GinkgoRecover()
defer wg.Done()
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
}(nodes[i])
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
for range 2 {
var wg sync.WaitGroup
// this is enough to trip the race detector
for i := 0; i < numNodes; i++ {
wg.Add(1)
go func(node *v1.Node) {
defer GinkgoRecover()
defer wg.Done()
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
}(nodes[i])
}
wg.Wait()
}
wg.Wait()
ExpectNotFound(ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...)
})
It("should exclude nodes from load balancers when terminating", func() {
// This is a kludge to prevent the node from being deleted before we can
// inspect its labels
podNoEvict := test.Pod(test.PodOptions{
NodeName: node.Name,
labels := map[string]string{"foo": "bar"}
pod := test.Pod(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{v1beta1.DoNotDisruptAnnotationKey: "true"},
OwnerReferences: defaultOwnerRefs,
Labels: labels,
},
})
// Create a fully blocking PDB to prevent the node from being deleted before we can observe its labels
pdb := test.PodDisruptionBudget(test.PDBOptions{
Labels: labels,
MaxUnavailable: lo.ToPtr(intstr.FromInt(0)),
})

ExpectApplied(ctx, env.Client, node, podNoEvict)

ExpectApplied(ctx, env.Client, node, nodeClaim, pod, pdb)
ExpectManualBinding(ctx, env.Client, pod, node)
Expect(env.Client.Delete(ctx, node)).To(Succeed())
node = ExpectNodeExists(ctx, env.Client, node.Name)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
node = ExpectNodeExists(ctx, env.Client, node.Name)
Expect(node.Labels[v1.LabelNodeExcludeBalancers]).Should(Equal("karpenter"))
Expand All @@ -168,7 +183,7 @@ var _ = Describe("Termination", func() {
Tolerations: []v1.Toleration{{Key: v1beta1.DisruptionTaintKey, Operator: v1.TolerationOpEqual, Effect: v1beta1.DisruptionNoScheduleTaint.Effect, Value: v1beta1.DisruptionNoScheduleTaint.Value}},
ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs},
})
ExpectApplied(ctx, env.Client, node, podEvict, podSkip)
ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip)

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand All @@ -186,6 +201,8 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
Expand All @@ -196,7 +213,7 @@ var _ = Describe("Termination", func() {
Tolerations: []v1.Toleration{{Key: v1beta1.DisruptionTaintKey, Operator: v1.TolerationOpExists, Effect: v1beta1.DisruptionNoScheduleTaint.Effect}},
ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs},
})
ExpectApplied(ctx, env.Client, node, podEvict, podSkip)
ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip)

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand All @@ -216,6 +233,8 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
Expand All @@ -225,7 +244,7 @@ var _ = Describe("Termination", func() {
Tolerations: []v1.Toleration{{Key: v1.TaintNodeUnschedulable, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoSchedule}},
ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs},
})
ExpectApplied(ctx, env.Client, node, podEvict)
ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict)

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand All @@ -242,6 +261,8 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
Expand All @@ -253,7 +274,7 @@ var _ = Describe("Termination", func() {
},
})

ExpectApplied(ctx, env.Client, node, pod)
ExpectApplied(ctx, env.Client, node, nodeClaim, pod)
Expect(env.Client.Delete(ctx, node)).To(Succeed())
node = ExpectNodeExists(ctx, env.Client, node.Name)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
Expand All @@ -268,11 +289,11 @@ var _ = Describe("Termination", func() {
// Delete no owner refs pod to simulate successful eviction
ExpectDeleted(ctx, env.Client, pod)

// Reconcile node to evict pod
// Reconcile node to evict pod and delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)

// Reconcile to delete node
ExpectNotFound(ctx, env.Client, node)
})
It("should delete nodes with terminal pods", func() {
Expand All @@ -285,10 +306,12 @@ var _ = Describe("Termination", func() {
Phase: v1.PodFailed,
})

ExpectApplied(ctx, env.Client, node, podEvictPhaseSucceeded, podEvictPhaseFailed)
ExpectApplied(ctx, env.Client, node, nodeClaim, podEvictPhaseSucceeded, podEvictPhaseFailed)
Expect(env.Client.Delete(ctx, node)).To(Succeed())
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Trigger Termination Controller, which should ignore these pods and delete the node
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
Expand All @@ -309,7 +332,7 @@ var _ = Describe("Termination", func() {
Phase: v1.PodRunning,
})

ExpectApplied(ctx, env.Client, node, podNoEvict, pdb)
ExpectApplied(ctx, env.Client, node, nodeClaim, podNoEvict, pdb)

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand All @@ -331,14 +354,16 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
It("should evict pods in order", func() {
daemonEvict := test.DaemonSet()
daemonNodeCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-node-critical"}})
daemonClusterCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-cluster-critical"}})
ExpectApplied(ctx, env.Client, node, daemonEvict, daemonNodeCritical, daemonClusterCritical)
ExpectApplied(ctx, env.Client, node, nodeClaim, daemonEvict, daemonNodeCritical, daemonClusterCritical)

podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
podDaemonEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{
Expand Down Expand Up @@ -368,7 +393,7 @@ var _ = Describe("Termination", func() {
BlockOwnerDeletion: lo.ToPtr(true),
}}}})

ExpectApplied(ctx, env.Client, node, podEvict, podNodeCritical, podClusterCritical, podDaemonEvict, podDaemonNodeCritical, podDaemonClusterCritical)
ExpectApplied(ctx, env.Client, node, nodeClaim, nodeClaim, podEvict, podNodeCritical, podClusterCritical, podDaemonEvict, podDaemonNodeCritical, podDaemonClusterCritical)

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand Down Expand Up @@ -417,6 +442,8 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
Expand All @@ -425,7 +452,7 @@ var _ = Describe("Termination", func() {
podNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
podClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})

ExpectApplied(ctx, env.Client, node, podEvict, podNodeCritical, podClusterCritical)
ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podNodeCritical, podClusterCritical)

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand All @@ -452,13 +479,14 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
It("should not evict static pods", func() {
ExpectApplied(ctx, env.Client, node)
podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
ExpectApplied(ctx, env.Client, node, podEvict)
ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict)

podNoEvict := test.Pod(test.PodOptions{
NodeName: node.Name,
Expand Down Expand Up @@ -496,13 +524,14 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)

})
It("should not delete nodes until all pods are deleted", func() {
pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
ExpectApplied(ctx, env.Client, node, pods[0], pods[1])
ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1])

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand Down Expand Up @@ -530,12 +559,14 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
It("should delete nodes with no underlying instance even if not fully drained", func() {
pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
ExpectApplied(ctx, env.Client, node, pods[0], pods[1])
ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1])

// Make Node NotReady since it's automatically marked as Ready on first deploy
ExpectMakeNodesNotReady(ctx, env.Client, node)
Expand Down Expand Up @@ -563,12 +594,14 @@ var _ = Describe("Termination", func() {

// Reconcile to delete node
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
It("should not delete nodes with no underlying instance if the node is still Ready", func() {
pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
ExpectApplied(ctx, env.Client, node, pods[0], pods[1])
ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1])

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand All @@ -594,13 +627,15 @@ var _ = Describe("Termination", func() {
// Reconcile to try to delete the node, but don't succeed because the readiness condition
// of the node still won't let us delete it
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNodeExists(ctx, env.Client, node.Name)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectExists(ctx, env.Client, node)
})
It("should wait for pods to terminate", func() {
pod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
fakeClock.SetTime(time.Now()) // make our fake clock match the pod creation time
ExpectApplied(ctx, env.Client, node, pod)
ExpectApplied(ctx, env.Client, node, nodeClaim, pod)

// Before grace period, node should not delete
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand All @@ -613,6 +648,8 @@ var _ = Describe("Termination", func() {
// to eliminate test-flakiness we reset the time to current time + 90 seconds instead of just advancing
// the clock by 90 seconds.
fakeClock.SetTime(time.Now().Add(90 * time.Second))
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectNotFound(ctx, env.Client, node)
})
Expand All @@ -624,7 +661,7 @@ var _ = Describe("Termination", func() {
OwnerReferences: defaultOwnerRefs,
},
})
ExpectApplied(ctx, env.Client, node, pod)
ExpectApplied(ctx, env.Client, node, nodeClaim, pod)

// Trigger Termination Controller
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expand Down Expand Up @@ -675,6 +712,8 @@ var _ = Describe("Termination", func() {
ExpectApplied(ctx, env.Client, node)
Expect(env.Client.Delete(ctx, node)).To(Succeed())
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)

m, ok := FindMetricWithLabelValues("karpenter_nodes_termination_time_seconds", map[string]string{"nodepool": node.Labels[v1beta1.NodePoolLabelKey]})
Expand All @@ -685,6 +724,8 @@ var _ = Describe("Termination", func() {
ExpectApplied(ctx, env.Client, node)
Expect(env.Client.Delete(ctx, node)).To(Succeed())
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)

m, ok := FindMetricWithLabelValues("karpenter_nodes_terminated", map[string]string{"nodepool": node.Labels[v1beta1.NodePoolLabelKey]})
Expand Down

0 comments on commit c51bcd7

Please sign in to comment.