Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle create-pending annotation edge case #238

Merged
merged 2 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
258 changes: 258 additions & 0 deletions e2e/tests/stable/chainsaw-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,264 @@ spec:
metadata:
name: avoid-localstack-c-bucket

# The following steps replicate an edge case scenario described here:
# https://github.com/crossplane/crossplane/issues/3037#issuecomment-1110142427
#
# 1. Make all Localstack backends unreachable.
# 2. Create edge-case-bucket and wait for failure.
# 3. Update edge-case-bucket, removing its create-failed annotation.
# 4. Make all Localstack backends reachable again.
# 5. Restart Provider Ceph deployment.
# 6. Bucket edge-case-bucket should become Ready and Synced.
- name: Make localstack-a unreachable and therefore Unhealthy.
try:
- command:
args:
- patch
- service
- localstack-a
- -n
- crossplane-system
- --type=merge
- -p
- '{"spec":{"selector":{"io.kompose.service":"not-exists"}}}'
entrypoint: kubectl
- assert:
resource:
apiVersion: ceph.crossplane.io/v1alpha1
kind: ProviderConfig
metadata:
name: localstack-a
status:
conditions:
- reason: HealthCheckFail
status: "False"
type: Ready

- name: Make localstack-b unreachable and therefore Unhealthy.
try:
- command:
args:
- patch
- service
- localstack-b
- -n
- crossplane-system
- --type=merge
- -p
- '{"spec":{"selector":{"io.kompose.service":"not-exists"}}}'
entrypoint: kubectl
- assert:
resource:
apiVersion: ceph.crossplane.io/v1alpha1
kind: ProviderConfig
metadata:
name: localstack-b
status:
conditions:
- reason: HealthCheckFail
status: "False"
type: Ready

- name: Make localstack-c unreachable and therefore Unhealthy.
try:
- command:
args:
- patch
- service
- localstack-c
- -n
- crossplane-system
- --type=merge
- -p
- '{"spec":{"selector":{"io.kompose.service":"not-exists"}}}'
entrypoint: kubectl
- assert:
resource:
apiVersion: ceph.crossplane.io/v1alpha1
kind: ProviderConfig
metadata:
name: localstack-c
status:
conditions:
- reason: HealthCheckFail
status: "False"
type: Ready

- name: Apply edge-case-bucket.
try:
- apply:
resource:
apiVersion: provider-ceph.ceph.crossplane.io/v1alpha1
kind: Bucket
metadata:
name: edge-case-bucket
labels:
provider-ceph.crossplane.io/validation-required: "true"
spec:
autoPause: true
forProvider: {}

# Assert edge-case-bucket has failed.
- assert:
resource:
apiVersion: provider-ceph.ceph.crossplane.io/v1alpha1
kind: Bucket
metadata:
name: edge-case-bucket
status:
conditions:
- reason: Creating
status: "False"
type: Ready
- reason: ReconcileError
status: "False"
type: Synced

- name: Remove creation-failed annotation from edge-case-bucket.
try:
- command:
args:
- patch
- bucket
- edge-case-bucket
- --type=json
- -p
- '[{"op": "remove", "path": "/metadata/annotations/crossplane.io~1external-create-failed"}]'
entrypoint: kubectl


- name: Make localstack-a reachable again and therefore Healthy.
try:
- command:
args:
- patch
- service
- localstack-a
- -n
- crossplane-system
- --type=merge
- -p
- '{"spec":{"selector":{"io.kompose.service":"localstack-a"}}}'
entrypoint: kubectl
- assert:
resource:
apiVersion: ceph.crossplane.io/v1alpha1
kind: ProviderConfig
metadata:
name: localstack-a
status:
conditions:
- reason: HealthCheckSuccess
status: "True"
type: Ready

- name: Make localstack-b reachable again and therefore Healthy.
try:
- command:
args:
- patch
- service
- localstack-b
- -n
- crossplane-system
- --type=merge
- -p
- '{"spec":{"selector":{"io.kompose.service":"localstack-b"}}}'
entrypoint: kubectl
- assert:
resource:
apiVersion: ceph.crossplane.io/v1alpha1
kind: ProviderConfig
metadata:
name: localstack-b
status:
conditions:
- reason: HealthCheckSuccess
status: "True"
type: Ready

- name: Make localstack-c reachable again and therefore Healthy.
try:
- command:
args:
- patch
- service
- localstack-c
- -n
- crossplane-system
- --type=merge
- -p
- '{"spec":{"selector":{"io.kompose.service":"localstack-c"}}}'
entrypoint: kubectl
- assert:
resource:
apiVersion: ceph.crossplane.io/v1alpha1
kind: ProviderConfig
metadata:
name: localstack-c
status:
conditions:
- reason: HealthCheckSuccess
status: "True"
type: Ready

- name: Scale Provider Ceph deployment to zero in order to restart.
try:
- command:
args:
- scale
- -n
- crossplane-system
- deploy/provider-ceph-provider-cep
- --replicas=0
entrypoint: kubectl

- name: Assert edge-case-bucket is created successfully.
try:
# Assert edge-case-bucket has been created successfully.
- assert:
resource:
apiVersion: provider-ceph.ceph.crossplane.io/v1alpha1
kind: Bucket
metadata:
name: edge-case-bucket
status:
conditions:
- reason: Available
status: "True"
type: Ready
- reason: ReconcileSuccess
status: "True"
type: Synced

- name: Delete edge-case-bucket.
try:
- command:
# We need to "unpause" edge-case-bucket to allow deletion.
args:
- patch
- --type=merge
- buckets
- edge-case-bucket
- -p
- '{"metadata":{"labels":{"crossplane.io/paused":"false"}}}'
entrypoint: kubectl
- command:
args:
- delete
- bucket
- edge-case-bucket
entrypoint: kubectl
- error:
resource:
apiVersion: provider-ceph.ceph.crossplane.io/v1alpha1
kind: Bucket
metadata:
name: edge-case-bucket
# End of edge case test scenario.

# Clean up ProviderConfigs.
- name: Delete all ProviderConfigs.
try:
- command:
Expand Down
50 changes: 43 additions & 7 deletions internal/controller/bucket/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,48 @@ func (c *external) Create(ctx context.Context, mg resource.Managed) (managed.Ext
return managed.ExternalCreation{}, err
}

// Just before Crossplane's over-arching Reconcile function calls external.Create, it
// updates the MR with the create-pending annotation. After external.Create is finished,
// the create-success or create-failure annotaion is then added to the MR by the Reconciler
// depending on the result of external.Create (this function).
// In the event that an "incomplete creation" occurs (ie the create-pending annotaion is the
// most recent annotation to be applied out of create-pending, create-failed and create-succeeded)
// then the MR will not be re-queued. This is to protect against the possibility of "leaked
// resources" (ie resources which have been created by the provider but are no longer under
// the provider's control). This must all happen sequentially in a single Reconcile loop.
// See this Github issue and comment for more context on what is a known issue:
// https://github.com/crossplane/crossplane/issues/3037#issuecomment-1110142427
//
// This approach leaves the possibility for the following scenario:
// 1. The create-pending annotaion is applied to an MR by Crossplane and external.Create is called.
// 2. The provider pod crashes before it gets a chance to perform its external.Create function.
// 3. As a consequence, the resulting create-succeeded or create-failed annotations are never added to
// the MR.
// 4. The provider pod comes back online and the MR is automatically reconciled by Crossplane.
// 5. Crossplane interprets the MR annotaions and believes that an incomplete creation has
// occurred and therefore does not perform any action on the MR.
// 6. The MR is effectively paused, awaiting human intervention.
//
// To counteract this, we attempt to remove the create-pending annotation as immediately after it
// has been applied by Crossplane. It is safe for us to do this because the resources we are creating
// (S3 buckets) have deterministic names. This function will only create a bucket of the given
// name and will do so idempotently, therefore we are never in danger of creating a cascading
// number of buckets due to previous failures.
// Of course, this approach does not completely remove the possibility of us finding ourselves in
// the above scenario. It only mitigates it. As long as Crossplane persists with its existing logic
// then we can only make a "best-effort" to avoid it.
if err := c.updateBucketCR(ctx, bucket, func(_, bucketLatest *v1alpha1.Bucket) UpdateRequired {
meta.RemoveAnnotations(bucket, meta.AnnotationKeyExternalCreatePending)

return NeedsObjectUpdate
}); err != nil {
c.log.Info("Failed to remove pending annotation", consts.KeyBucketName, bucket.Name)
err = errors.Wrap(err, errUpdateBucketCR)
traces.SetAndRecordError(span, err)

return managed.ExternalCreation{}, err
}

span.SetAttributes(attribute.String(consts.KeyBucketName, bucket.Name))

ctx, cancel := context.WithTimeout(ctx, c.operationTimeout)
Expand Down Expand Up @@ -171,15 +213,9 @@ func (c *external) waitForCreationAndUpdateBucketCR(ctx context.Context, bucket
select {
case <-ctx.Done():
c.log.Info("Context timeout waiting for bucket creation", consts.KeyBucketName, bucket.Name)

return managed.ExternalCreation{}, ctx.Err()
createErr = ctx.Err()
case beName := <-readyChan:
err := c.updateBucketCR(ctx, bucket, func(_, bucketLatest *v1alpha1.Bucket) UpdateRequired {
// Remove the annotation, because Crossplane is not always able to do it.
// This workaround doesn't eliminates the problem, if this update fails,
// Crossplane skips object forever.
delete(bucketLatest.ObjectMeta.Annotations, meta.AnnotationKeyExternalCreatePending)

setAllBackendLabels(bucketLatest, providerNames)

return NeedsObjectUpdate
Expand Down
23 changes: 21 additions & 2 deletions internal/controller/bucket/create_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@ func TestCreateBasicErrors(t *testing.T) {
backendStore: backendstore.NewBackendStore(),
},
args: args{
mg: &v1alpha1.Bucket{},
mg: &v1alpha1.Bucket{
ObjectMeta: metav1.ObjectMeta{
Name: "bucket",
},
},
},
want: want{
err: errors.New(errNoS3BackendsStored),
Expand All @@ -67,6 +71,9 @@ func TestCreateBasicErrors(t *testing.T) {
},
args: args{
mg: &v1alpha1.Bucket{
ObjectMeta: metav1.ObjectMeta{
Name: "bucket",
},
Spec: v1alpha1.BucketSpec{
Providers: []string{"s3-backend-1"},
},
Expand All @@ -81,7 +88,11 @@ func TestCreateBasicErrors(t *testing.T) {
backendStore: backendstore.NewBackendStore(),
},
args: args{
mg: &v1alpha1.Bucket{},
mg: &v1alpha1.Bucket{
ObjectMeta: metav1.ObjectMeta{
Name: "bucket",
},
},
},
want: want{
err: errors.New(errNoS3BackendsStored),
Expand All @@ -93,8 +104,16 @@ func TestCreateBasicErrors(t *testing.T) {

t.Run(name, func(t *testing.T) {
t.Parallel()
s := runtime.NewScheme()
s.AddKnownTypes(v1alpha1.SchemeGroupVersion, &v1alpha1.Bucket{}, &v1alpha1.BucketList{})

cl := fake.NewClientBuilder().
WithScheme(s).
WithObjects(tc.args.mg).
WithStatusSubresource(tc.args.mg)

e := external{
kubeClient: cl.Build(),
backendStore: tc.fields.backendStore,
log: logging.NewNopLogger(),
}
Expand Down
Loading