diff --git a/e2e/tests/stable/chainsaw-test.yaml b/e2e/tests/stable/chainsaw-test.yaml index 527e6d84..57381ab0 100755 --- a/e2e/tests/stable/chainsaw-test.yaml +++ b/e2e/tests/stable/chainsaw-test.yaml @@ -851,6 +851,264 @@ spec: metadata: name: avoid-localstack-c-bucket + # The following steps replicate an edge case scenario described here: + # https://github.com/crossplane/crossplane/issues/3037#issuecomment-1110142427 + # + # 1. Make all Localstack backends unreachable. + # 2. Create edge-case-bucket and wait for failure. + # 3. Update edge-case-bucket, removing its create-failed annotation. + # 4. Make all Localstack backends reachable again. + # 5. Restart Provider Ceph deployment. + # 6. Bucket edge-case-bucket should become Ready and Synced. + - name: Make localstack-a unreachable and therefore Unhealthy. + try: + - command: + args: + - patch + - service + - localstack-a + - -n + - crossplane-system + - --type=merge + - -p + - '{"spec":{"selector":{"io.kompose.service":"not-exists"}}}' + entrypoint: kubectl + - assert: + resource: + apiVersion: ceph.crossplane.io/v1alpha1 + kind: ProviderConfig + metadata: + name: localstack-a + status: + conditions: + - reason: HealthCheckFail + status: "False" + type: Ready + + - name: Make localstack-b unreachable and therefore Unhealthy. + try: + - command: + args: + - patch + - service + - localstack-b + - -n + - crossplane-system + - --type=merge + - -p + - '{"spec":{"selector":{"io.kompose.service":"not-exists"}}}' + entrypoint: kubectl + - assert: + resource: + apiVersion: ceph.crossplane.io/v1alpha1 + kind: ProviderConfig + metadata: + name: localstack-b + status: + conditions: + - reason: HealthCheckFail + status: "False" + type: Ready + + - name: Make localstack-c unreachable and therefore Unhealthy. 
+ try: + - command: + args: + - patch + - service + - localstack-c + - -n + - crossplane-system + - --type=merge + - -p + - '{"spec":{"selector":{"io.kompose.service":"not-exists"}}}' + entrypoint: kubectl + - assert: + resource: + apiVersion: ceph.crossplane.io/v1alpha1 + kind: ProviderConfig + metadata: + name: localstack-c + status: + conditions: + - reason: HealthCheckFail + status: "False" + type: Ready + + - name: Apply edge-case-bucket. + try: + - apply: + resource: + apiVersion: provider-ceph.ceph.crossplane.io/v1alpha1 + kind: Bucket + metadata: + name: edge-case-bucket + labels: + provider-ceph.crossplane.io/validation-required: "true" + spec: + autoPause: true + forProvider: {} + + # Assert edge-case-bucket has failed. + - assert: + resource: + apiVersion: provider-ceph.ceph.crossplane.io/v1alpha1 + kind: Bucket + metadata: + name: edge-case-bucket + status: + conditions: + - reason: Creating + status: "False" + type: Ready + - reason: ReconcileError + status: "False" + type: Synced + + - name: Remove creation-failed annotation from edge-case-bucket. + try: + - command: + args: + - patch + - bucket + - edge-case-bucket + - --type=json + - -p + - '[{"op": "remove", "path": "/metadata/annotations/crossplane.io~1external-create-failed"}]' + entrypoint: kubectl + + + - name: Make localstack-a reachable again and therefore Healthy. + try: + - command: + args: + - patch + - service + - localstack-a + - -n + - crossplane-system + - --type=merge + - -p + - '{"spec":{"selector":{"io.kompose.service":"localstack-a"}}}' + entrypoint: kubectl + - assert: + resource: + apiVersion: ceph.crossplane.io/v1alpha1 + kind: ProviderConfig + metadata: + name: localstack-a + status: + conditions: + - reason: HealthCheckSuccess + status: "True" + type: Ready + + - name: Make localstack-b reachable again and therefore Healthy. 
+ try: + - command: + args: + - patch + - service + - localstack-b + - -n + - crossplane-system + - --type=merge + - -p + - '{"spec":{"selector":{"io.kompose.service":"localstack-b"}}}' + entrypoint: kubectl + - assert: + resource: + apiVersion: ceph.crossplane.io/v1alpha1 + kind: ProviderConfig + metadata: + name: localstack-b + status: + conditions: + - reason: HealthCheckSuccess + status: "True" + type: Ready + + - name: Make localstack-c reachable again and therefore Healthy. + try: + - command: + args: + - patch + - service + - localstack-c + - -n + - crossplane-system + - --type=merge + - -p + - '{"spec":{"selector":{"io.kompose.service":"localstack-c"}}}' + entrypoint: kubectl + - assert: + resource: + apiVersion: ceph.crossplane.io/v1alpha1 + kind: ProviderConfig + metadata: + name: localstack-c + status: + conditions: + - reason: HealthCheckSuccess + status: "True" + type: Ready + + - name: Scale Provider Ceph deployment to zero in order to restart. + try: + - command: + args: + - scale + - -n + - crossplane-system + - deploy/provider-ceph-provider-cep + - --replicas=0 + entrypoint: kubectl + + - name: Assert edge-case-bucket is created successfully. + try: + # Assert edge-case-bucket has been created successfully. + - assert: + resource: + apiVersion: provider-ceph.ceph.crossplane.io/v1alpha1 + kind: Bucket + metadata: + name: edge-case-bucket + status: + conditions: + - reason: Available + status: "True" + type: Ready + - reason: ReconcileSuccess + status: "True" + type: Synced + + - name: Delete edge-case-bucket. + try: + - command: + # We need to "unpause" edge-case-bucket to allow deletion. 
+        args:   +              - patch   +              - --type=merge   +              - buckets   +              - edge-case-bucket   +              - -p   +              - '{"metadata":{"labels":{"crossplane.io/paused":"false"}}}'   +            entrypoint: kubectl   +        - command:   +            args:   +              - delete   +              - bucket   +              - edge-case-bucket   +            entrypoint: kubectl   +        - error:   +            resource:   +              apiVersion: provider-ceph.ceph.crossplane.io/v1alpha1   +              kind: Bucket   +              metadata:   +                name: edge-case-bucket   +  # End of edge case test scenario.   +     + # Clean up ProviderConfigs.   - name: Delete all ProviderConfigs.     try:     - command: diff --git a/internal/controller/bucket/create.go b/internal/controller/bucket/create.go index f1ef5a2a..5196a889 100644 --- a/internal/controller/bucket/create.go +++ b/internal/controller/bucket/create.go @@ -35,6 +35,48 @@ func (c *external) Create(ctx context.Context, mg resource.Managed) (managed.Ext return managed.ExternalCreation{}, err } + // Just before Crossplane's over-arching Reconcile function calls external.Create, it + // updates the MR with the create-pending annotation. After external.Create is finished, + // the create-success or create-failure annotation is then added to the MR by the Reconciler + // depending on the result of external.Create (this function). + // In the event that an "incomplete creation" occurs (ie the create-pending annotation is the + // most recent annotation to be applied out of create-pending, create-failed and create-succeeded) + // then the MR will not be re-queued. This is to protect against the possibility of "leaked + // resources" (ie resources which have been created by the provider but are no longer under + // the provider's control). This must all happen sequentially in a single Reconcile loop. + // See this Github issue and comment for more context on what is a known issue: + // https://github.com/crossplane/crossplane/issues/3037#issuecomment-1110142427 + // + // This approach leaves the possibility for the following scenario: + // 1. The create-pending annotation is applied to an MR by Crossplane and external.Create is called. 
+ // 2. The provider pod crashes before it gets a chance to perform its external.Create function. + // 3. As a consequence, the resulting create-succeeded or create-failed annotations are never added to + // the MR. + // 4. The provider pod comes back online and the MR is automatically reconciled by Crossplane. + // 5. Crossplane interprets the MR annotations and believes that an incomplete creation has + // occurred and therefore does not perform any action on the MR. + // 6. The MR is effectively paused, awaiting human intervention. + // + // To counteract this, we attempt to remove the create-pending annotation immediately after it + // has been applied by Crossplane. It is safe for us to do this because the resources we are creating + // (S3 buckets) have deterministic names. This function will only create a bucket of the given + // name and will do so idempotently, therefore we are never in danger of creating a cascading + // number of buckets due to previous failures. + // Of course, this approach does not completely remove the possibility of us finding ourselves in + // the above scenario. It only mitigates it. As long as Crossplane persists with its existing logic + // then we can only make a "best-effort" to avoid it. 
+ if err := c.updateBucketCR(ctx, bucket, func(_, bucketLatest *v1alpha1.Bucket) UpdateRequired { + meta.RemoveAnnotations(bucketLatest, meta.AnnotationKeyExternalCreatePending) + + return NeedsObjectUpdate + }); err != nil { + c.log.Info("Failed to remove pending annotation", consts.KeyBucketName, bucket.Name) + err = errors.Wrap(err, errUpdateBucketCR) + traces.SetAndRecordError(span, err) + + return managed.ExternalCreation{}, err + } + span.SetAttributes(attribute.String(consts.KeyBucketName, bucket.Name)) ctx, cancel := context.WithTimeout(ctx, c.operationTimeout) @@ -171,15 +213,9 @@ func (c *external) waitForCreationAndUpdateBucketCR(ctx context.Context, bucket select { case <-ctx.Done(): c.log.Info("Context timeout waiting for bucket creation", consts.KeyBucketName, bucket.Name) - - return managed.ExternalCreation{}, ctx.Err() + createErr = ctx.Err() case beName := <-readyChan: err := c.updateBucketCR(ctx, bucket, func(_, bucketLatest *v1alpha1.Bucket) UpdateRequired { - // Remove the annotation, because Crossplane is not always able to do it. - // This workaround doesn't eliminates the problem, if this update fails, - // Crossplane skips object forever. 
- delete(bucketLatest.ObjectMeta.Annotations, meta.AnnotationKeyExternalCreatePending) - setAllBackendLabels(bucketLatest, providerNames) return NeedsObjectUpdate diff --git a/internal/controller/bucket/create_test.go b/internal/controller/bucket/create_test.go index 4442d344..09cc971e 100644 --- a/internal/controller/bucket/create_test.go +++ b/internal/controller/bucket/create_test.go @@ -50,7 +50,11 @@ func TestCreateBasicErrors(t *testing.T) { backendStore: backendstore.NewBackendStore(), }, args: args{ - mg: &v1alpha1.Bucket{}, + mg: &v1alpha1.Bucket{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bucket", + }, + }, }, want: want{ err: errors.New(errNoS3BackendsStored), @@ -67,6 +71,9 @@ func TestCreateBasicErrors(t *testing.T) { }, args: args{ mg: &v1alpha1.Bucket{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bucket", + }, Spec: v1alpha1.BucketSpec{ Providers: []string{"s3-backend-1"}, }, @@ -81,7 +88,11 @@ func TestCreateBasicErrors(t *testing.T) { backendStore: backendstore.NewBackendStore(), }, args: args{ - mg: &v1alpha1.Bucket{}, + mg: &v1alpha1.Bucket{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bucket", + }, + }, }, want: want{ err: errors.New(errNoS3BackendsStored), @@ -93,8 +104,16 @@ func TestCreateBasicErrors(t *testing.T) { t.Run(name, func(t *testing.T) { t.Parallel() + s := runtime.NewScheme() + s.AddKnownTypes(v1alpha1.SchemeGroupVersion, &v1alpha1.Bucket{}, &v1alpha1.BucketList{}) + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(tc.args.mg). + WithStatusSubresource(tc.args.mg) e := external{ + kubeClient: cl.Build(), backendStore: tc.fields.backendStore, log: logging.NewNopLogger(), }