From 561cf47d783c368fd8795acb82a5a39099a35984 Mon Sep 17 00:00:00 2001 From: David Ding Date: Thu, 2 Jun 2022 12:39:03 -0400 Subject: [PATCH] Fix cluster status update logic (#911) Previously, several actors would cancel their context when successful. This was a relic of the way the reconciliation loop was previously structured, but now leads to a bug where cluster status is not updated upon successful action. To address this, this patch deletes the CancelLoop function entirely. --- CHANGELOG.md | 4 ++ pkg/actor/BUILD.bazel | 1 - pkg/actor/cluster_restart.go | 1 - pkg/actor/context.go | 47 ----------------------- pkg/actor/decommission.go | 3 +- pkg/actor/deploy.go | 1 - pkg/actor/deploy_test.go | 2 +- pkg/actor/partitioned_update.go | 3 +- pkg/actor/resize_pvc.go | 4 +- pkg/actor/validate_version.go | 2 - pkg/controller/BUILD.bazel | 1 - pkg/controller/cluster_controller.go | 19 +-------- pkg/controller/cluster_controller_test.go | 15 +------- 13 files changed, 10 insertions(+), 93 deletions(-) delete mode 100644 pkg/actor/context.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 6048e7c86..dbce127a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). # [Unreleased](https://github.com/cockroachdb/cockroach-operator/compare/v2.7.0...master) +## Fixed + +* Delete the CancelLoop function, fixing a cluster status update bug + # [v2.7.0](https://github.com/cockroachdb/cockroach-operator/compare/v2.6.0...v2.7.0) ## Fixed diff --git a/pkg/actor/BUILD.bazel b/pkg/actor/BUILD.bazel index 5d4cd69c1..fe6c31f47 100644 --- a/pkg/actor/BUILD.bazel +++ b/pkg/actor/BUILD.bazel @@ -5,7 +5,6 @@ go_library( srcs = [ "actor.go", "cluster_restart.go", - "context.go", "decommission.go", "deploy.go", "director.go", diff --git a/pkg/actor/cluster_restart.go b/pkg/actor/cluster_restart.go index 63309ebbd..5227438a5 100644 --- a/pkg/actor/cluster_restart.go +++ b/pkg/actor/cluster_restart.go @@ -125,7 +125,6 @@ func (r *clusterRestart) Act(ctx context.Context, cluster *resource.Cluster, log log.Error(err, "failed reseting the restart cluster field") } log.V(DEBUGLEVEL).Info("completed cluster restart") - CancelLoop(ctx, log) return nil } diff --git a/pkg/actor/context.go b/pkg/actor/context.go deleted file mode 100644 index b4d9a67ef..000000000 --- a/pkg/actor/context.go +++ /dev/null @@ -1,47 +0,0 @@ -/* -Copyright 2022 The Cockroach Authors - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package actor - -import ( - "context" - "errors" - "github.com/go-logr/logr" -) - -type cancelFuncKey struct{} - -//ContextWithCancelFn func -func ContextWithCancelFn(ctx context.Context, fn context.CancelFunc) context.Context { - return context.WithValue(ctx, cancelFuncKey{}, fn) -} - -func getCancelFn(ctx context.Context, logger logr.Logger) context.CancelFunc { - f, ok := ctx.Value(cancelFuncKey{}).(context.CancelFunc) - - if f == nil || !ok { - return func() { - logger.Error(errors.New("missing parent cancel function in context"), "") - } - } - - return f -} - -//CancelLoop func -func CancelLoop(ctx context.Context, logger logr.Logger) { - getCancelFn(ctx, logger)() -} diff --git a/pkg/actor/decommission.go b/pkg/actor/decommission.go index b68cffcd5..6b5cfe825 100644 --- a/pkg/actor/decommission.go +++ b/pkg/actor/decommission.go @@ -19,6 +19,7 @@ package actor import ( "context" "fmt" + api "github.com/cockroachdb/cockroach-operator/apis/v1alpha1" "github.com/cockroachdb/cockroach-operator/pkg/clustersql" "github.com/cockroachdb/cockroach-operator/pkg/database" @@ -143,12 +144,10 @@ func (d decommission) Act(ctx context.Context, cluster *resource.Cluster, log lo /// now check if the decommissionStaleErr and update status log.Error(err, "decommission failed") cluster.SetFalse(api.DecommissionCondition) - CancelLoop(ctx, log) return err } // TO DO @alina we will need to save the status foreach action cluster.SetTrue(api.DecommissionCondition) log.V(DEBUGLEVEL).Info("decommission completed", "cond", ss.Status.Conditions) - CancelLoop(ctx, log) return nil } diff --git a/pkg/actor/deploy.go b/pkg/actor/deploy.go index dde7fcf36..69cc42983 100644 --- a/pkg/actor/deploy.go +++ b/pkg/actor/deploy.go @@ -81,7 +81,6 @@ func (d deploy) Act(ctx context.Context, cluster *resource.Cluster, log logr.Log if changed { log.Info("created/updated a resource, stopping request processing", "resource", b.ResourceName()) - CancelLoop(ctx, log) return nil } } diff --git a/pkg/actor/deploy_test.go b/pkg/actor/deploy_test.go index f190b6262..8ed8df167 100644 --- a/pkg/actor/deploy_test.go +++ b/pkg/actor/deploy_test.go @@ -81,7 +81,7 @@ func TestDeploysNotInitalizedClusterAfterVersionChecker(t *testing.T) { // 3 is the number of resources we expect to be created. The action should be repeated as it is // restarted on successful creation or update for i := 0; i < 3; i++ { - assert.NoError(t, deploy.Act(actor.ContextWithCancelFn(context.TODO(), func() {}), cluster, testLog)) + assert.NoError(t, deploy.Act(context.Background(), cluster, testLog)) } assert.Equal(t, expected, actual) diff --git a/pkg/actor/partitioned_update.go b/pkg/actor/partitioned_update.go index ebd818c15..b01ab75fb 100644 --- a/pkg/actor/partitioned_update.go +++ b/pkg/actor/partitioned_update.go @@ -19,7 +19,6 @@ package actor import ( "context" "fmt" - "github.com/go-logr/logr" "os" "strings" "time" @@ -31,6 +30,7 @@ import ( "github.com/cockroachdb/cockroach-operator/pkg/resource" "github.com/cockroachdb/cockroach-operator/pkg/update" "github.com/cockroachdb/errors" + "github.com/go-logr/logr" "go.uber.org/zap/zapcore" appsv1 "k8s.io/api/apps/v1" kubetypes "k8s.io/apimachinery/pkg/types" @@ -213,7 +213,6 @@ func (up *partitionedUpdate) Act(ctx context.Context, cluster *resource.Cluster, // TODO set status that we are completed. log.V(DEBUGLEVEL).Info("update completed with partitioned update", "new version", versionWantedCalFmtStr) - CancelLoop(ctx, log) return nil } diff --git a/pkg/actor/resize_pvc.go b/pkg/actor/resize_pvc.go index 308889a10..91c3bda39 100644 --- a/pkg/actor/resize_pvc.go +++ b/pkg/actor/resize_pvc.go @@ -19,7 +19,6 @@ package actor import ( "context" "fmt" - "github.com/go-logr/logr" "time" "github.com/cenkalti/backoff" @@ -27,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach-operator/pkg/kube" "github.com/cockroachdb/cockroach-operator/pkg/resource" "github.com/cockroachdb/errors" + "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -140,8 +140,6 @@ func (rp *resizePVC) Act(ctx context.Context, cluster *resource.Cluster, log log }*/ log.Info("PVC resize completed") - CancelLoop(ctx, log) - return nil } diff --git a/pkg/actor/validate_version.go b/pkg/actor/validate_version.go index ced485a40..4656958d5 100644 --- a/pkg/actor/validate_version.go +++ b/pkg/actor/validate_version.go @@ -136,7 +136,6 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log if changed { log.V(int(zapcore.DebugLevel)).Info("created/updated job, stopping request processing") - CancelLoop(ctx, log) return nil } @@ -349,7 +348,6 @@ func (v *versionChecker) completeVersionChecker( } log.V(int(zapcore.DebugLevel)).Info("completed version checker", "calVersion", version, "containerImage", imageName) - CancelLoop(ctx, log) return nil } diff --git a/pkg/controller/BUILD.bazel b/pkg/controller/BUILD.bazel index 44776718e..bd267c968 100644 --- a/pkg/controller/BUILD.bazel +++ b/pkg/controller/BUILD.bazel @@ -49,7 +49,6 @@ go_test( "@io_k8s_apimachinery//pkg/types:go_default_library", "@io_k8s_sigs_controller_runtime//:go_default_library", "@io_k8s_sigs_controller_runtime//pkg/client/fake:go_default_library", - "@io_k8s_sigs_controller_runtime//pkg/log:go_default_library", "@org_uber_go_zap//zaptest:go_default_library", ], ) diff --git a/pkg/controller/cluster_controller.go b/pkg/controller/cluster_controller.go index 08a60f9fa..029642847 100644 --- a/pkg/controller/cluster_controller.go +++ b/pkg/controller/cluster_controller.go @@ -147,8 +147,7 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request return noRequeue() } - // Save context cancellation function for actors to call if needed - ctx = actor.ContextWithCancelFn(ctx, cancel) + ctx = context.Background() log.Info(fmt.Sprintf("Running action with name: %s", actorToExecute.GetActionType())) if err := actorToExecute.Act(ctx, &cluster, log); err != nil { @@ -187,13 +186,6 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request cluster.SetActionFinished(actorToExecute.GetActionType()) } - // Stop processing and wait for Kubernetes scheduler to call us again as the actor - // modified actorToExecute resource owned by the controller - if cancelled(ctx) { - log.V(int(zapcore.InfoLevel)).Info("request was interrupted") - return noRequeue() - } - // Check if the resource has been updated while the controller worked on it fresh, err := cluster.IsFresh(fetcher) if err != nil { @@ -254,12 +246,3 @@ func InitClusterReconcilerWithLogger(l logr.Logger) func(ctrl.Manager) error { }).SetupWithManager(mgr) } } - -func cancelled(ctx context.Context) bool { - select { - case <-ctx.Done(): - return true - default: - return false - } -} diff --git a/pkg/controller/cluster_controller_test.go b/pkg/controller/cluster_controller_test.go index c5113eb24..f31164685 100644 --- a/pkg/controller/cluster_controller_test.go +++ b/pkg/controller/cluster_controller_test.go @@ -23,7 +23,6 @@ import ( "time" "github.com/go-logr/logr" - "sigs.k8s.io/controller-runtime/pkg/log" api "github.com/cockroachdb/cockroach-operator/apis/v1alpha1" "github.com/cockroachdb/cockroach-operator/pkg/actor" @@ -43,14 +42,10 @@ import ( ) type fakeActor struct { - cancelCtx bool - err error + err error } func (a *fakeActor) Act(ctx context.Context, _ *resource.Cluster, logger logr.Logger) error { - if a.cancelCtx { - actor.CancelLoop(ctx, log.NullLogger{}) - } return a.err } func (a *fakeActor) GetActionType() api.ActionType { @@ -115,14 +110,6 @@ func TestReconcile(t *testing.T) { want: ctrl.Result{Requeue: true}, wantErr: "", }, - { - name: "reconcile action cancels the context", - action: fakeActor{ - cancelCtx: true, - }, - want: ctrl.Result{Requeue: false}, - wantErr: "", - }, { name: "reconcile action fails to probe expected condition", action: fakeActor{