Skip to content

Commit

Permalink
Fix cluster status update logic (#911)
Browse files Browse the repository at this point in the history
Previously, several actors would cancel their context when successful.
This was a relic of the way the reconciliation loop was previously
structured, but now leads to a bug where cluster status is not updated
upon successful action. To address this, this patch deletes the
CancelLoop function entirely.
  • Loading branch information
davidwding authored Jun 2, 2022
1 parent 10b3b21 commit 561cf47
Show file tree
Hide file tree
Showing 13 changed files with 10 additions and 93 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

# [Unreleased](https://github.com/cockroachdb/cockroach-operator/compare/v2.7.0...master)

## Fixed

* Delete the CancelLoop function, fixing a cluster status update bug

# [v2.7.0](https://github.com/cockroachdb/cockroach-operator/compare/v2.6.0...v2.7.0)

## Fixed
Expand Down
1 change: 0 additions & 1 deletion pkg/actor/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ go_library(
srcs = [
"actor.go",
"cluster_restart.go",
"context.go",
"decommission.go",
"deploy.go",
"director.go",
Expand Down
1 change: 0 additions & 1 deletion pkg/actor/cluster_restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ func (r *clusterRestart) Act(ctx context.Context, cluster *resource.Cluster, log
log.Error(err, "failed reseting the restart cluster field")
}
log.V(DEBUGLEVEL).Info("completed cluster restart")
CancelLoop(ctx, log)
return nil
}

Expand Down
47 changes: 0 additions & 47 deletions pkg/actor/context.go

This file was deleted.

3 changes: 1 addition & 2 deletions pkg/actor/decommission.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package actor
import (
"context"
"fmt"

api "github.com/cockroachdb/cockroach-operator/apis/v1alpha1"
"github.com/cockroachdb/cockroach-operator/pkg/clustersql"
"github.com/cockroachdb/cockroach-operator/pkg/database"
Expand Down Expand Up @@ -143,12 +144,10 @@ func (d decommission) Act(ctx context.Context, cluster *resource.Cluster, log lo
/// now check if the decommissionStaleErr and update status
log.Error(err, "decommission failed")
cluster.SetFalse(api.DecommissionCondition)
CancelLoop(ctx, log)
return err
}
// TO DO @alina we will need to save the status foreach action
cluster.SetTrue(api.DecommissionCondition)
log.V(DEBUGLEVEL).Info("decommission completed", "cond", ss.Status.Conditions)
CancelLoop(ctx, log)
return nil
}
1 change: 0 additions & 1 deletion pkg/actor/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ func (d deploy) Act(ctx context.Context, cluster *resource.Cluster, log logr.Log

if changed {
log.Info("created/updated a resource, stopping request processing", "resource", b.ResourceName())
CancelLoop(ctx, log)
return nil
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/actor/deploy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func TestDeploysNotInitalizedClusterAfterVersionChecker(t *testing.T) {
// 3 is the number of resources we expect to be created. The action should be repeated as it is
// restarted on successful creation or update
for i := 0; i < 3; i++ {
assert.NoError(t, deploy.Act(actor.ContextWithCancelFn(context.TODO(), func() {}), cluster, testLog))
assert.NoError(t, deploy.Act(context.Background(), cluster, testLog))
}

assert.Equal(t, expected, actual)
Expand Down
3 changes: 1 addition & 2 deletions pkg/actor/partitioned_update.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package actor
import (
"context"
"fmt"
"github.com/go-logr/logr"
"os"
"strings"
"time"
Expand All @@ -31,6 +30,7 @@ import (
"github.com/cockroachdb/cockroach-operator/pkg/resource"
"github.com/cockroachdb/cockroach-operator/pkg/update"
"github.com/cockroachdb/errors"
"github.com/go-logr/logr"
"go.uber.org/zap/zapcore"
appsv1 "k8s.io/api/apps/v1"
kubetypes "k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -213,7 +213,6 @@ func (up *partitionedUpdate) Act(ctx context.Context, cluster *resource.Cluster,

// TODO set status that we are completed.
log.V(DEBUGLEVEL).Info("update completed with partitioned update", "new version", versionWantedCalFmtStr)
CancelLoop(ctx, log)
return nil
}

Expand Down
4 changes: 1 addition & 3 deletions pkg/actor/resize_pvc.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ package actor
import (
"context"
"fmt"
"github.com/go-logr/logr"
"time"

"github.com/cenkalti/backoff"
api "github.com/cockroachdb/cockroach-operator/apis/v1alpha1"
"github.com/cockroachdb/cockroach-operator/pkg/kube"
"github.com/cockroachdb/cockroach-operator/pkg/resource"
"github.com/cockroachdb/errors"
"github.com/go-logr/logr"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -140,8 +140,6 @@ func (rp *resizePVC) Act(ctx context.Context, cluster *resource.Cluster, log log
}*/

log.Info("PVC resize completed")
CancelLoop(ctx, log)

return nil
}

Expand Down
2 changes: 0 additions & 2 deletions pkg/actor/validate_version.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log

if changed {
log.V(int(zapcore.DebugLevel)).Info("created/updated job, stopping request processing")
CancelLoop(ctx, log)
return nil
}

Expand Down Expand Up @@ -349,7 +348,6 @@ func (v *versionChecker) completeVersionChecker(
}
log.V(int(zapcore.DebugLevel)).Info("completed version checker", "calVersion", version,
"containerImage", imageName)
CancelLoop(ctx, log)
return nil
}

Expand Down
1 change: 0 additions & 1 deletion pkg/controller/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ go_test(
"@io_k8s_apimachinery//pkg/types:go_default_library",
"@io_k8s_sigs_controller_runtime//:go_default_library",
"@io_k8s_sigs_controller_runtime//pkg/client/fake:go_default_library",
"@io_k8s_sigs_controller_runtime//pkg/log:go_default_library",
"@org_uber_go_zap//zaptest:go_default_library",
],
)
Expand Down
19 changes: 1 addition & 18 deletions pkg/controller/cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,7 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
return noRequeue()
}

// Save context cancellation function for actors to call if needed
ctx = actor.ContextWithCancelFn(ctx, cancel)
ctx = context.Background()

log.Info(fmt.Sprintf("Running action with name: %s", actorToExecute.GetActionType()))
if err := actorToExecute.Act(ctx, &cluster, log); err != nil {
Expand Down Expand Up @@ -187,13 +186,6 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
cluster.SetActionFinished(actorToExecute.GetActionType())
}

// Stop processing and wait for Kubernetes scheduler to call us again as the actor
// modified actorToExecute resource owned by the controller
if cancelled(ctx) {
log.V(int(zapcore.InfoLevel)).Info("request was interrupted")
return noRequeue()
}

// Check if the resource has been updated while the controller worked on it
fresh, err := cluster.IsFresh(fetcher)
if err != nil {
Expand Down Expand Up @@ -254,12 +246,3 @@ func InitClusterReconcilerWithLogger(l logr.Logger) func(ctrl.Manager) error {
}).SetupWithManager(mgr)
}
}

func cancelled(ctx context.Context) bool {
select {
case <-ctx.Done():
return true
default:
return false
}
}
15 changes: 1 addition & 14 deletions pkg/controller/cluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"time"

"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/log"

api "github.com/cockroachdb/cockroach-operator/apis/v1alpha1"
"github.com/cockroachdb/cockroach-operator/pkg/actor"
Expand All @@ -43,14 +42,10 @@ import (
)

type fakeActor struct {
cancelCtx bool
err error
err error
}

func (a *fakeActor) Act(ctx context.Context, _ *resource.Cluster, logger logr.Logger) error {
if a.cancelCtx {
actor.CancelLoop(ctx, log.NullLogger{})
}
return a.err
}
func (a *fakeActor) GetActionType() api.ActionType {
Expand Down Expand Up @@ -115,14 +110,6 @@ func TestReconcile(t *testing.T) {
want: ctrl.Result{Requeue: true},
wantErr: "",
},
{
name: "reconcile action cancels the context",
action: fakeActor{
cancelCtx: true,
},
want: ctrl.Result{Requeue: false},
wantErr: "",
},
{
name: "reconcile action fails to probe expected condition",
action: fakeActor{
Expand Down

0 comments on commit 561cf47

Please sign in to comment.