diff --git a/go.mod b/go.mod index b6cb3fd1c..9baf7234a 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.22.5 require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/avast/retry-go v3.0.0+incompatible - github.com/awslabs/operatorpkg v0.0.0-20240731051558-05ffed2d693d + github.com/awslabs/operatorpkg v0.0.0-20240801183540-0a6e9812953c github.com/docker/docker v27.1.1+incompatible github.com/go-logr/logr v1.4.2 github.com/imdario/mergo v0.3.16 diff --git a/go.sum b/go.sum index f5b263e6b..ee04961e3 100644 --- a/go.sum +++ b/go.sum @@ -47,8 +47,8 @@ github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk5 github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= -github.com/awslabs/operatorpkg v0.0.0-20240731051558-05ffed2d693d h1:rWRK5NBFZqNtTE1Nb86MNlvdc9vr7DngvBKbBzOWZA0= -github.com/awslabs/operatorpkg v0.0.0-20240731051558-05ffed2d693d/go.mod h1:NmFIDk+owvhQnz7hsFgdShl1CXywHQaVCQwhLS+CIhY= +github.com/awslabs/operatorpkg v0.0.0-20240801183540-0a6e9812953c h1:/10pTC8+qjgaya6a2XelKgkD252yGr2FdDrVVsDYaMQ= +github.com/awslabs/operatorpkg v0.0.0-20240801183540-0a6e9812953c/go.mod h1:NmFIDk+owvhQnz7hsFgdShl1CXywHQaVCQwhLS+CIhY= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= diff --git a/pkg/controllers/nodeclaim/lifecycle/initialization.go b/pkg/controllers/nodeclaim/lifecycle/initialization.go index c017e6104..0c3979be8 100644 --- a/pkg/controllers/nodeclaim/lifecycle/initialization.go +++ b/pkg/controllers/nodeclaim/lifecycle/initialization.go @@ -45,34 +45,30 @@ type Initialization struct { // c) all extended resources have been registered // This method handles both nil nodepools and nodes without extended resources gracefully. func (i *Initialization) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { - if nodeClaim.StatusConditions().Get(v1.ConditionTypeInitialized).IsTrue() { - return reconcile.Result{}, nil - } - if !nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).IsTrue() { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "NotLaunched", "Node not launched") + if !nodeClaim.StatusConditions().Get(v1.ConditionTypeInitialized).IsUnknown() { return reconcile.Result{}, nil } ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID)) node, err := nodeclaimutil.NodeForNodeClaim(ctx, i.kubeClient, nodeClaim) if err != nil { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "NodeNotFound", "Node not registered with cluster") + nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotFound", "Node not registered with cluster") return reconcile.Result{}, nil //nolint:nilerr } ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", node.Name))) if nodeutil.GetCondition(node, corev1.NodeReady).Status != corev1.ConditionTrue { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "NodeNotReady", "Node status is NotReady") + nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotReady", "Node status is NotReady") return reconcile.Result{}, nil } if taint, ok := StartupTaintsRemoved(node, nodeClaim); !ok { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "StartupTaintsExist", fmt.Sprintf("StartupTaint %q still exists", formatTaint(taint))) + nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "StartupTaintsExist", fmt.Sprintf("StartupTaint %q still exists", formatTaint(taint))) return reconcile.Result{}, nil } if taint, ok := KnownEphemeralTaintsRemoved(node); !ok { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "KnownEphemeralTaintsExist", fmt.Sprintf("KnownEphemeralTaint %q still exists", formatTaint(taint))) + nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "KnownEphemeralTaintsExist", fmt.Sprintf("KnownEphemeralTaint %q still exists", formatTaint(taint))) return reconcile.Result{}, nil } if name, ok := RequestedResourcesRegistered(node, nodeClaim); !ok { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeInitialized, "ResourceNotRegistered", fmt.Sprintf("Resource %q was requested but not registered", name)) + nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "ResourceNotRegistered", fmt.Sprintf("Resource %q was requested but not registered", name)) return reconcile.Result{}, nil } stored := node.DeepCopy() diff --git a/pkg/controllers/nodeclaim/lifecycle/initialization_test.go b/pkg/controllers/nodeclaim/lifecycle/initialization_test.go index 69f225f6e..b8827c249 100644 --- a/pkg/controllers/nodeclaim/lifecycle/initialization_test.go +++ b/pkg/controllers/nodeclaim/lifecycle/initialization_test.go @@ -70,7 +70,7 @@ var _ = Describe("Initialization", func() { nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue)) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse)) + Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) node = ExpectExists(ctx, env.Client, node) node.Status.Capacity = corev1.ResourceList{ @@ -175,7 +175,7 @@ var _ = Describe("Initialization", func() { nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue)) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse)) + Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) }) It("should not consider the Node to be initialized when all requested resources aren't registered", func() { nodeClaim := test.NodeClaim(v1.NodeClaim{ @@ -226,7 +226,7 @@ var _ = Describe("Initialization", func() { nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue)) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse)) + Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) }) It("should consider the node to be initialized once all the resources are registered", func() { nodeClaim := test.NodeClaim(v1.NodeClaim{ @@ -278,7 +278,7 @@ var _ = Describe("Initialization", func() { nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue)) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse)) + Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) // Node now registers the resource node = ExpectExists(ctx, env.Client, node) @@ -363,7 +363,7 @@ var _ = Describe("Initialization", func() { ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue)) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse)) + Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) }) It("should consider the Node to be initialized once the startupTaints are removed", func() { nodeClaim := test.NodeClaim(v1.NodeClaim{ @@ -420,7 +420,7 @@ var _ = Describe("Initialization", func() { ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue)) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse)) + Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) node = ExpectExists(ctx, env.Client, node) node.Spec.Taints = []corev1.Taint{} @@ -500,7 +500,7 @@ var _ = Describe("Initialization", func() { ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue)) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse)) + Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) }) It("should consider the Node to be initialized once the ephemeralTaints are removed", func() { nodeClaim := test.NodeClaim(v1.NodeClaim{ @@ -570,7 +570,7 @@ var _ = Describe("Initialization", func() { ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue)) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionFalse)) + Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) node = ExpectExists(ctx, env.Client, node) node.Spec.Taints = []corev1.Taint{} diff --git a/pkg/controllers/nodeclaim/lifecycle/launch.go b/pkg/controllers/nodeclaim/lifecycle/launch.go index f785b5cab..55878542f 100644 --- a/pkg/controllers/nodeclaim/lifecycle/launch.go +++ b/pkg/controllers/nodeclaim/lifecycle/launch.go @@ -43,7 +43,7 @@ type Launch struct { } func (l *Launch) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { - if nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).IsTrue() { + if !nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).IsUnknown() { return reconcile.Result{}, nil } @@ -60,11 +60,8 @@ func (l *Launch) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconc } else { created, err = l.launchNodeClaim(ctx, nodeClaim) } - // Either the Node launch failed or the Node was deleted due to InsufficientCapacity/NotFound + // Either the Node launch failed or the Node was deleted due to InsufficientCapacity/NodeClassNotReady/NotFound if err != nil || created == nil { - if cloudprovider.IsNodeClassNotReadyError(err) { - return reconcile.Result{Requeue: true}, nil - } return reconcile.Result{}, err } l.cache.SetDefault(string(nodeClaim.UID), created) @@ -91,11 +88,18 @@ func (l *Launch) launchNodeClaim(ctx context.Context, nodeClaim *v1.NodeClaim) ( }).Inc() return nil, nil case cloudprovider.IsNodeClassNotReadyError(err): - l.recorder.Publish(NodeClassNotReadyEvent(nodeClaim, err)) - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeLaunched, "LaunchFailed", truncateMessage(err.Error())) - return nil, fmt.Errorf("launching nodeclaim, %w", err) + log.FromContext(ctx).Error(err, "failed launching nodeclaim") + if err = l.kubeClient.Delete(ctx, nodeClaim); err != nil { + return nil, client.IgnoreNotFound(err) + } + metrics.NodeClaimsDisruptedTotal.With(prometheus.Labels{ + metrics.ReasonLabel: "nodeclass_not_ready", + metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], + metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey], + }).Inc() + return nil, nil default: - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeLaunched, "LaunchFailed", truncateMessage(err.Error())) + nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeLaunched, "LaunchFailed", truncateMessage(err.Error())) return nil, fmt.Errorf("launching nodeclaim, %w", err) } } diff --git a/pkg/controllers/nodeclaim/lifecycle/launch_test.go b/pkg/controllers/nodeclaim/lifecycle/launch_test.go index e3514db10..85f39c1a9 100644 --- a/pkg/controllers/nodeclaim/lifecycle/launch_test.go +++ b/pkg/controllers/nodeclaim/lifecycle/launch_test.go @@ -76,14 +76,12 @@ var _ = Describe("Launch", func() { ExpectFinalizersRemoved(ctx, env.Client, nodeClaim) ExpectNotFound(ctx, env.Client, nodeClaim) }) - It("should requeue with no error if NodeClassNotReady is returned from the cloudprovider", func() { + It("should delete the nodeclaim if NodeClassNotReady is returned from the cloudprovider", func() { cloudProvider.NextCreateErr = cloudprovider.NewNodeClassNotReadyError(fmt.Errorf("nodeClass isn't ready")) nodeClaim := test.NodeClaim() ExpectApplied(ctx, env.Client, nodeClaim) - res := ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) - Expect(res.Requeue).To(BeTrue()) - - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeLaunched).Status).To(Equal(metav1.ConditionFalse)) + ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) + ExpectFinalizersRemoved(ctx, env.Client, nodeClaim) + ExpectNotFound(ctx, env.Client, nodeClaim) }) }) diff --git a/pkg/controllers/nodeclaim/lifecycle/registration.go b/pkg/controllers/nodeclaim/lifecycle/registration.go index 01de40400..da26e3337 100644 --- a/pkg/controllers/nodeclaim/lifecycle/registration.go +++ b/pkg/controllers/nodeclaim/lifecycle/registration.go @@ -42,18 +42,14 @@ type Registration struct { } func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { - if nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsTrue() { - return reconcile.Result{}, nil - } - if !nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).IsTrue() { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeRegistered, "NotLaunched", "Node not launched") + if !nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsUnknown() { return reconcile.Result{}, nil } ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID)) node, err := nodeclaimutil.NodeForNodeClaim(ctx, r.kubeClient, nodeClaim) if err != nil { if nodeclaimutil.IsNodeNotFoundError(err) { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeRegistered, "NodeNotFound", "Node not registered with cluster") + nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeRegistered, "NodeNotFound", "Node not registered with cluster") return reconcile.Result{}, nil } if nodeclaimutil.IsDuplicateNodeError(err) {