Skip to content

Commit

Permalink
chore: Ensure that NodeClaims are held in Unknown until they reach a …
Browse files Browse the repository at this point in the history
…terminal state (#1499)
  • Loading branch information
jonathan-innis authored Aug 2, 2024
1 parent fcbb8ea commit ff4c5e5
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 42 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.22.5
require (
github.com/Pallinder/go-randomdata v1.2.0
github.com/avast/retry-go v3.0.0+incompatible
github.com/awslabs/operatorpkg v0.0.0-20240731051558-05ffed2d693d
github.com/awslabs/operatorpkg v0.0.0-20240801183540-0a6e9812953c
github.com/docker/docker v27.1.1+incompatible
github.com/go-logr/logr v1.4.2
github.com/imdario/mergo v0.3.16
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk5
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/awslabs/operatorpkg v0.0.0-20240731051558-05ffed2d693d h1:rWRK5NBFZqNtTE1Nb86MNlvdc9vr7DngvBKbBzOWZA0=
github.com/awslabs/operatorpkg v0.0.0-20240731051558-05ffed2d693d/go.mod h1:NmFIDk+owvhQnz7hsFgdShl1CXywHQaVCQwhLS+CIhY=
github.com/awslabs/operatorpkg v0.0.0-20240801183540-0a6e9812953c h1:/10pTC8+qjgaya6a2XelKgkD252yGr2FdDrVVsDYaMQ=
github.com/awslabs/operatorpkg v0.0.0-20240801183540-0a6e9812953c/go.mod h1:NmFIDk+owvhQnz7hsFgdShl1CXywHQaVCQwhLS+CIhY=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
Expand Down
16 changes: 6 additions & 10 deletions pkg/controllers/nodeclaim/lifecycle/initialization.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,34 +45,30 @@ type Initialization struct {
// c) all extended resources have been registered
// This method handles both nil nodepools and nodes without extended resources gracefully.
func (i *Initialization) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
if nodeClaim.StatusConditions().Get(v1.ConditionTypeInitialized).IsTrue() {
return reconcile.Result{}, nil
}
if !nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).IsTrue() {
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "NotLaunched", "Node not launched")
if !nodeClaim.StatusConditions().Get(v1.ConditionTypeInitialized).IsUnknown() {
return reconcile.Result{}, nil
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID))
node, err := nodeclaimutil.NodeForNodeClaim(ctx, i.kubeClient, nodeClaim)
if err != nil {
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "NodeNotFound", "Node not registered with cluster")
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotFound", "Node not registered with cluster")
return reconcile.Result{}, nil //nolint:nilerr
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", node.Name)))
if nodeutil.GetCondition(node, corev1.NodeReady).Status != corev1.ConditionTrue {
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "NodeNotReady", "Node status is NotReady")
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotReady", "Node status is NotReady")
return reconcile.Result{}, nil
}
if taint, ok := StartupTaintsRemoved(node, nodeClaim); !ok {
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "StartupTaintsExist", fmt.Sprintf("StartupTaint %q still exists", formatTaint(taint)))
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "StartupTaintsExist", fmt.Sprintf("StartupTaint %q still exists", formatTaint(taint)))
return reconcile.Result{}, nil
}
if taint, ok := KnownEphemeralTaintsRemoved(node); !ok {
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "KnownEphemeralTaintsExist", fmt.Sprintf("KnownEphemeralTaint %q still exists", formatTaint(taint)))
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "KnownEphemeralTaintsExist", fmt.Sprintf("KnownEphemeralTaint %q still exists", formatTaint(taint)))
return reconcile.Result{}, nil
}
if name, ok := RequestedResourcesRegistered(node, nodeClaim); !ok {
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "ResourceNotRegistered", fmt.Sprintf("Resource %q was requested but not registered", name))
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "ResourceNotRegistered", fmt.Sprintf("Resource %q was requested but not registered", name))
return reconcile.Result{}, nil
}
stored := node.DeepCopy()
Expand Down
16 changes: 8 additions & 8 deletions pkg/controllers/nodeclaim/lifecycle/initialization_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ var _ = Describe("Initialization", func() {

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown))

node = ExpectExists(ctx, env.Client, node)
node.Status.Capacity = corev1.ResourceList{
Expand Down Expand Up @@ -175,7 +175,7 @@ var _ = Describe("Initialization", func() {

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown))
})
It("should not consider the Node to be initialized when all requested resources aren't registered", func() {
nodeClaim := test.NodeClaim(v1.NodeClaim{
Expand Down Expand Up @@ -226,7 +226,7 @@ var _ = Describe("Initialization", func() {

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown))
})
It("should consider the node to be initialized once all the resources are registered", func() {
nodeClaim := test.NodeClaim(v1.NodeClaim{
Expand Down Expand Up @@ -278,7 +278,7 @@ var _ = Describe("Initialization", func() {

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown))

// Node now registers the resource
node = ExpectExists(ctx, env.Client, node)
Expand Down Expand Up @@ -363,7 +363,7 @@ var _ = Describe("Initialization", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown))
})
It("should consider the Node to be initialized once the startupTaints are removed", func() {
nodeClaim := test.NodeClaim(v1.NodeClaim{
Expand Down Expand Up @@ -420,7 +420,7 @@ var _ = Describe("Initialization", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown))

node = ExpectExists(ctx, env.Client, node)
node.Spec.Taints = []corev1.Taint{}
Expand Down Expand Up @@ -500,7 +500,7 @@ var _ = Describe("Initialization", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown))
})
It("should consider the Node to be initialized once the ephemeralTaints are removed", func() {
nodeClaim := test.NodeClaim(v1.NodeClaim{
Expand Down Expand Up @@ -570,7 +570,7 @@ var _ = Describe("Initialization", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse))
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown))

node = ExpectExists(ctx, env.Client, node)
node.Spec.Taints = []corev1.Taint{}
Expand Down
22 changes: 13 additions & 9 deletions pkg/controllers/nodeclaim/lifecycle/launch.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ type Launch struct {
}

func (l *Launch) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
if nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).IsTrue() {
if !nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).IsUnknown() {
return reconcile.Result{}, nil
}

Expand All @@ -60,11 +60,8 @@ func (l *Launch) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconc
} else {
created, err = l.launchNodeClaim(ctx, nodeClaim)
}
// Either the Node launch failed or the Node was deleted due to InsufficientCapacity/NotFound
// Either the Node launch failed or the Node was deleted due to InsufficientCapacity/NodeClassNotReady/NotFound
if err != nil || created == nil {
if cloudprovider.IsNodeClassNotReadyError(err) {
return reconcile.Result{Requeue: true}, nil
}
return reconcile.Result{}, err
}
l.cache.SetDefault(string(nodeClaim.UID), created)
Expand All @@ -91,11 +88,18 @@ func (l *Launch) launchNodeClaim(ctx context.Context, nodeClaim *v1.NodeClaim) (
}).Inc()
return nil, nil
case cloudprovider.IsNodeClassNotReadyError(err):
l.recorder.Publish(NodeClassNotReadyEvent(nodeClaim, err))
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeLaunched, "LaunchFailed", truncateMessage(err.Error()))
return nil, fmt.Errorf("launching nodeclaim, %w", err)
log.FromContext(ctx).Error(err, "failed launching nodeclaim")
if err = l.kubeClient.Delete(ctx, nodeClaim); err != nil {
return nil, client.IgnoreNotFound(err)
}
metrics.NodeClaimsDisruptedTotal.With(prometheus.Labels{
metrics.ReasonLabel: "nodeclass_not_ready",
metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey],
metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey],
}).Inc()
return nil, nil
default:
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeLaunched, "LaunchFailed", truncateMessage(err.Error()))
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeLaunched, "LaunchFailed", truncateMessage(err.Error()))
return nil, fmt.Errorf("launching nodeclaim, %w", err)
}
}
Expand Down
10 changes: 4 additions & 6 deletions pkg/controllers/nodeclaim/lifecycle/launch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,12 @@ var _ = Describe("Launch", func() {
ExpectFinalizersRemoved(ctx, env.Client, nodeClaim)
ExpectNotFound(ctx, env.Client, nodeClaim)
})
It("should requeue with no error if NodeClassNotReady is returned from the cloudprovider", func() {
It("should delete the nodeclaim if NodeClassNotReady is returned from the cloudprovider", func() {
cloudProvider.NextCreateErr = cloudprovider.NewNodeClassNotReadyError(fmt.Errorf("nodeClass isn't ready"))
nodeClaim := test.NodeClaim()
ExpectApplied(ctx, env.Client, nodeClaim)
res := ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
Expect(res.Requeue).To(BeTrue())

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeLaunched).Status).To(Equal(metav1.ConditionFalse))
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
ExpectFinalizersRemoved(ctx, env.Client, nodeClaim)
ExpectNotFound(ctx, env.Client, nodeClaim)
})
})
8 changes: 2 additions & 6 deletions pkg/controllers/nodeclaim/lifecycle/registration.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,14 @@ type Registration struct {
}

func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
if nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsTrue() {
return reconcile.Result{}, nil
}
if !nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).IsTrue() {
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeRegistered, "NotLaunched", "Node not launched")
if !nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsUnknown() {
return reconcile.Result{}, nil
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID))
node, err := nodeclaimutil.NodeForNodeClaim(ctx, r.kubeClient, nodeClaim)
if err != nil {
if nodeclaimutil.IsNodeNotFoundError(err) {
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeRegistered, "NodeNotFound", "Node not registered with cluster")
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeRegistered, "NodeNotFound", "Node not registered with cluster")
return reconcile.Result{}, nil
}
if nodeclaimutil.IsDuplicateNodeError(err) {
Expand Down

0 comments on commit ff4c5e5

Please sign in to comment.