Skip to content

Commit

Permalink
Operator upgrade should not cause rolling restart (#631)
Browse files Browse the repository at this point in the history
* Verify in the test that the operator upgrade does not cause operator to restart the StatefulSets

* Do not upgrade the StatefulSets if the ObservedGeneration == Generation and there is no annotation with allow-upgrade set. Add new event for reconcile, any change of annotations in the CassandraDatacenter

* Fix tests, add new test. Add webhook to validate the allowed values for allow-upgrade and add RequiresUpdate Condition to the Datacenter.Status

* Changelog and config secret test update to require allow-upgrade to set be set to "always"

* Rename allow-update to cassandra.datastax.com/autoupdate-spec to indicate any automated change of StatefulSet spec

* Add new task refresh, which ensures CassandraDatacenter is up to date
  • Loading branch information
burmanm authored Apr 17, 2024
1 parent 3879704 commit e598ce3
Show file tree
Hide file tree
Showing 21 changed files with 274 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/kindIntegTest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ jobs:
strategy:
matrix:
integration_test:
- upgrade_operator # Test is not setup to run against 4.0
- additional_seeds #TODO: Fails against C* 4.0, fix in https://github.com/k8ssandra/cass-operator/issues/459
- scale_down_unbalanced_racks #TODO: Fails against C* 4.0 and DSE 6.8, fix in https://github.com/k8ssandra/cass-operator/issues/459
runs-on: ubuntu-latest
Expand Down Expand Up @@ -187,6 +186,7 @@ jobs:
- superuser-secret-provided
- test_bad_config_and_fix
- test_mtls_mgmt_api
- upgrade_operator
# More than 3 workers tests:
- add_racks
#- additional_seeds #TODO: Fails against C* 4.0, fix in https://github.com/k8ssandra/cass-operator/issues/459
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/workflow-integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ jobs:
version:
- "3.11.14"
integration_test:
- upgrade_operator
- cdc_successful
- additional_seeds
- scale_down_unbalanced_racks
Expand Down Expand Up @@ -205,6 +204,7 @@ jobs:
- cluster_wide_install
- config_change
- config_secret
- upgrade_operator # Unnecessary to run against multiple versions
#- multi_cluster_management # cluster_wide_install verifies the same thing
#- oss_test_all_the_things # This is now the smoke test, see kind_smoke_tests job
- scale_down
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti

## unreleased

* [CHANGE] [#566](https://github.com/k8ssandra/cass-operator/issues/566) BREAKING: StatefulSets will no longer be automatically updated if CassandraDatacenter is not modified, unless an annotation "cassandra.datastax.com/autoupdate-spec" is set to the CassandraDatacenter with value "always" or "once". This means users of config secret should set this variable to "always" to keep their existing behavior. For other users, this means that for example the upgrades of operator will no longer automatically apply updated settings or system-logger image. The benefit is that updating the operator no longer causes the cluster to have a rolling restart. A new condition to indicate such change could be necessary is called "RequiresUpdate" and it will be set to True until the next refresh of reconcile has happened.
* [CHANGE] [#618](https://github.com/k8ssandra/cass-operator/issues/618) Update dependencies to support controller-runtime 0.17.2, modify required parts.
* [ENHANCEMENT] [#628](https://github.com/k8ssandra/cass-operator/issues/628) Replace pod task can replace any node, including those that have crashed
* [ENHANCEMENT] [#532](https://github.com/k8ssandra/cass-operator/issues/532) Instead of rejecting updates/creates with deprecated fields, return kubectl warnings.
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,9 @@ test: manifests generate fmt vet lint envtest ## Run tests.
integ-test: kustomize cert-manager helm ## Run integration tests from directory M_INTEG_DIR or set M_INTEG_DIR=all to run all the integration tests.
ifeq ($(M_INTEG_DIR), all)
# Run all the tests (exclude kustomize & testdata directories)
cd tests && go test -v ./... -timeout 300m --ginkgo.progress --ginkgo.v
cd tests && go test -v ./... -timeout 300m --ginkgo.show-node-events --ginkgo.v
else
cd tests/${M_INTEG_DIR} && go test -v ./... -timeout 300m --ginkgo.progress --ginkgo.v
cd tests/${M_INTEG_DIR} && go test -v ./... -timeout 300m --ginkgo.show-node-events --ginkgo.v
endif

.PHONY: version
Expand Down
13 changes: 13 additions & 0 deletions apis/cassandra/v1beta1/cassandradatacenter_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ const (
// cluster has gone through scale up operation.
NoAutomatedCleanupAnnotation = "cassandra.datastax.com/no-cleanup"

// UpdateAllowedAnnotation marks the Datacenter to allow upgrades to StatefulSets Spec even if CassandraDatacenter object was not modified. Allowed values are "once" and "always"
UpdateAllowedAnnotation = "cassandra.datastax.com/autoupdate-spec"

AllowUpdateAlways AllowUpdateType = "always"
AllowUpdateOnce AllowUpdateType = "once"

CassNodeState = "cassandra.datastax.com/node-state"

ProgressUpdating ProgressState = "Updating"
Expand All @@ -74,6 +80,8 @@ const (
DefaultInternodePort = 7000
)

type AllowUpdateType string

// ProgressState - this type exists so there's no chance of pushing random strings to our progress status
type ProgressState string

Expand Down Expand Up @@ -379,6 +387,7 @@ const (
DatacenterRollingRestart DatacenterConditionType = "RollingRestart"
DatacenterValid DatacenterConditionType = "Valid"
DatacenterDecommission DatacenterConditionType = "Decommission"
DatacenterRequiresUpdate DatacenterConditionType = "RequiresUpdate"

// DatacenterHealthy indicates if QUORUM can be reached from all deployed nodes.
// If this check fails, certain operations such as scaling up will not proceed.
Expand Down Expand Up @@ -961,3 +970,7 @@ func (dc *CassandraDatacenter) DatacenterName() string {
func (dc *CassandraDatacenter) UseClientImage() bool {
return dc.Spec.ServerType == "cassandra" && semver.Compare(fmt.Sprintf("v%s", dc.Spec.ServerVersion), "v4.1.0") >= 0
}

func (dc *CassandraDatacenter) GenerationChanged() bool {
return dc.Status.ObservedGeneration < dc.Generation
}
8 changes: 8 additions & 0 deletions apis/cassandra/v1beta1/cassandradatacenter_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/k8ssandra/cass-operator/pkg/images"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
logf "sigs.k8s.io/controller-runtime/pkg/log"
Expand Down Expand Up @@ -336,6 +337,13 @@ func ValidateServiceLabelsAndAnnotations(dc CassandraDatacenter) error {
}
}

if metav1.HasAnnotation(dc.ObjectMeta, UpdateAllowedAnnotation) {
updateType := AllowUpdateType(dc.Annotations[UpdateAllowedAnnotation])
if updateType != AllowUpdateAlways && updateType != AllowUpdateOnce {
return attemptedTo("use %s annotation with value other than 'once' or 'always'", UpdateAllowedAnnotation)
}
}

return nil
}

Expand Down
32 changes: 32 additions & 0 deletions apis/cassandra/v1beta1/webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,38 @@ func Test_ValidateSingleDatacenter(t *testing.T) {
},
errString: "configure DatacenterService with reserved annotations and/or labels (prefixes cassandra.datastax.com and/or k8ssandra.io)",
},
{
name: "Allow upgrade should not accept invalid values",
dc: &CassandraDatacenter{
ObjectMeta: metav1.ObjectMeta{
Name: "exampleDC",
Annotations: map[string]string{
"cassandra.datastax.com/autoupdate-spec": "invalid",
},
},
Spec: CassandraDatacenterSpec{
ServerType: "dse",
ServerVersion: "6.8.42",
},
},
errString: "use cassandra.datastax.com/autoupdate-spec annotation with value other than 'once' or 'always'",
},
{
name: "Allow upgrade should accept once value",
dc: &CassandraDatacenter{
ObjectMeta: metav1.ObjectMeta{
Name: "exampleDC",
Annotations: map[string]string{
"cassandra.datastax.com/autoupdate-spec": "once",
},
},
Spec: CassandraDatacenterSpec{
ServerType: "dse",
ServerVersion: "6.8.42",
},
},
errString: "",
},
}

for _, tt := range tests {
Expand Down
3 changes: 3 additions & 0 deletions apis/control/v1alpha1/cassandratask_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ const (
CommandMove CassandraCommand = "move"
CommandGarbageCollect CassandraCommand = "garbagecollect"
CommandFlush CassandraCommand = "flush"
CommandRefresh CassandraCommand = "refresh"
)

type CassandraJob struct {
Expand Down Expand Up @@ -167,6 +168,8 @@ const (
JobFailed JobConditionType = "Failed"
// JobRunning means the job is currently executing
JobRunning JobConditionType = "Running"
// DatacenterUpdated
DatacenterUpdated JobConditionType = "DatacenterUpdated"
)

//+kubebuilder:object:root=true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ func (r *CassandraDatacenterReconciler) SetupWithManager(mgr ctrl.Manager) error
// Create a new managed controller builder
c := ctrl.NewControllerManagedBy(mgr).
Named("cassandradatacenter_controller").
For(&api.CassandraDatacenter{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
For(&api.CassandraDatacenter{}, builder.WithPredicates(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.AnnotationChangedPredicate{}))). // We might want to consider annotation filtering
Owns(&appsv1.StatefulSet{}, builder.WithPredicates(managedByCassandraOperatorPredicate)).
Owns(&policyv1.PodDisruptionBudget{}, builder.WithPredicates(managedByCassandraOperatorPredicate)).
Owns(&corev1.Service{}, builder.WithPredicates(managedByCassandraOperatorPredicate))
Expand Down
12 changes: 10 additions & 2 deletions internal/controllers/control/cassandratask_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,12 +309,20 @@ JobDefinition:
flush(taskConfig)
case api.CommandGarbageCollect:
gc(taskConfig)
case api.CommandRefresh:
// This targets the Datacenter only
res, err = r.refreshDatacenter(ctx, dc, &cassTask)
if err != nil {
return ctrl.Result{}, err
}
completed = taskConfig.Completed
break JobDefinition
default:
err = fmt.Errorf("unknown job command: %s", job.Command)
return ctrl.Result{}, err
}

if !r.HasCondition(cassTask, api.JobRunning, metav1.ConditionTrue) {
if !r.HasCondition(&cassTask, api.JobRunning, metav1.ConditionTrue) {
valid, errValidate := taskConfig.Validate()
if errValidate != nil && valid {
// Retry, this is a transient error
Expand Down Expand Up @@ -423,7 +431,7 @@ func (r *CassandraTaskReconciler) SetupWithManager(mgr ctrl.Manager) error {
Complete(r)
}

func (r *CassandraTaskReconciler) HasCondition(task api.CassandraTask, condition api.JobConditionType, status metav1.ConditionStatus) bool {
func (r *CassandraTaskReconciler) HasCondition(task *api.CassandraTask, condition api.JobConditionType, status metav1.ConditionStatus) bool {
for _, cond := range task.Status.Conditions {
if cond.Type == string(condition) {
return cond.Status == status
Expand Down
46 changes: 45 additions & 1 deletion internal/controllers/control/cassandratask_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/k8ssandra/cass-operator/pkg/httphelper"

cassapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
api "github.com/k8ssandra/cass-operator/apis/control/v1alpha1"
. "github.com/onsi/ginkgo/v2"
Expand Down Expand Up @@ -683,7 +684,6 @@ var _ = Describe("CassandraTask controller tests", func() {

AfterEach(func() {
deleteDatacenter(testNamespaceName)
// Expect(k8sClient.Delete(context.TODO(), testDc)).Should(Succeed())
})

Context("Restart", func() {
Expand Down Expand Up @@ -781,4 +781,48 @@ var _ = Describe("CassandraTask controller tests", func() {
})
})
})
Describe("Execute jobs against Datacenters", func() {
var testNamespaceName string
BeforeEach(func() {
testNamespaceName = fmt.Sprintf("test-task-%d", rand.Int31())
By("create datacenter", createDatacenter(testDatacenterName, testNamespaceName))
})

AfterEach(func() {
deleteDatacenter(testNamespaceName)
})

Context("Refresh", func() {
It("Adds an annotation if CassandraDatacenter does not have one and waits for completion", func() {
taskKey, task := buildTask(api.CommandRefresh, testNamespaceName)
Expect(k8sClient.Create(context.Background(), task)).Should(Succeed())

dc := &cassdcapi.CassandraDatacenter{}
Eventually(func() bool {
if err := k8sClient.Get(context.TODO(), types.NamespacedName{Name: testDatacenterName, Namespace: testNamespaceName}, dc); err != nil {
return false
}
if metav1.HasAnnotation(dc.ObjectMeta, cassapi.UpdateAllowedAnnotation) {
return dc.Annotations[cassapi.UpdateAllowedAnnotation] == "once"
}
return false
}, "5s", "50ms").Should(BeTrue())

delete(dc.Annotations, cassapi.UpdateAllowedAnnotation)
Expect(k8sClient.Update(context.Background(), dc)).Should(Succeed())

_ = waitForTaskCompletion(taskKey)
})
It("Completes if autoupdate-spec is always allowed", func() {
dc := &cassdcapi.CassandraDatacenter{}
Expect(k8sClient.Get(context.TODO(), types.NamespacedName{Name: testDatacenterName, Namespace: testNamespaceName}, dc)).To(Succeed())
metav1.SetMetaDataAnnotation(&dc.ObjectMeta, cassapi.UpdateAllowedAnnotation, string(cassapi.AllowUpdateAlways))
Expect(k8sClient.Update(context.Background(), dc)).Should(Succeed())

taskKey, task := buildTask(api.CommandRefresh, testNamespaceName)
Expect(k8sClient.Create(context.Background(), task)).Should(Succeed())
_ = waitForTaskCompletion(taskKey)
})
})
})
})
40 changes: 40 additions & 0 deletions internal/controllers/control/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

cassapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
Expand Down Expand Up @@ -404,6 +405,45 @@ func compact(taskConfig *TaskConfiguration) {
taskConfig.AsyncFunc = compactAsync
}

// Refresh CassandraDatacenter

func (r *CassandraTaskReconciler) refreshDatacenter(ctx context.Context, dc *cassapi.CassandraDatacenter, task *api.CassandraTask) (ctrl.Result, error) {
// If there's no "always" annotation, add "once" annotation and check that it's removed (that indicates finished)
if metav1.HasAnnotation(dc.ObjectMeta, cassapi.UpdateAllowedAnnotation) {
// No need to add anything, process is still going or it was always allowed
val := cassapi.AllowUpdateType(dc.Annotations[cassapi.UpdateAllowedAnnotation])
if val == cassapi.AllowUpdateAlways {
// Nothing to do here, the autoupdate is set
return ctrl.Result{}, nil
} else {
// Still waiting for the refresh to happen
return ctrl.Result{RequeueAfter: JobRunningRequeue}, nil
}
}

if r.HasCondition(task, api.DatacenterUpdated, metav1.ConditionTrue) {
// The refresh has completed, since the annotation is gone
return ctrl.Result{}, nil
}

// Lets start the process
patch := client.MergeFrom(dc.DeepCopy())

metav1.SetMetaDataAnnotation(&dc.ObjectMeta, cassapi.UpdateAllowedAnnotation, string(cassapi.AllowUpdateOnce))

if err := r.Patch(ctx, dc, patch); err != nil {
return ctrl.Result{}, err
}

taskPatch := client.MergeFrom(task.DeepCopy())
if modified := SetCondition(task, api.DatacenterUpdated, metav1.ConditionTrue, "Datacenter updated to update spec once"); modified {
if err := r.Client.Status().Patch(ctx, task, taskPatch); err != nil {
return ctrl.Result{}, err
}
}
return ctrl.Result{RequeueAfter: JobRunningRequeue}, nil
}

// Common functions

func isCassandraUp(pod *corev1.Pod) bool {
Expand Down
13 changes: 13 additions & 0 deletions pkg/reconciliation/constructor.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/k8ssandra/cass-operator/pkg/oplabels"
"github.com/k8ssandra/cass-operator/pkg/utils"

corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -62,11 +63,23 @@ func setOperatorProgressStatus(rc *ReconciliationContext, newState api.ProgressS
if rc.Datacenter.Status.DatacenterName == nil {
rc.Datacenter.Status.DatacenterName = &rc.Datacenter.Spec.DatacenterName
}
rc.setCondition(api.NewDatacenterCondition(api.DatacenterRequiresUpdate, corev1.ConditionFalse))
}
if err := rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, patch); err != nil {
rc.ReqLogger.Error(err, "error updating the Cassandra Operator Progress state")
return err
}

// The allow-upgrade=once annotation is temporary and should be removed after first successful reconcile
if metav1.HasAnnotation(rc.Datacenter.ObjectMeta, api.UpdateAllowedAnnotation) && rc.Datacenter.Annotations[api.UpdateAllowedAnnotation] == string(api.AllowUpdateOnce) {
// remove the annotation
patch = client.MergeFrom(rc.Datacenter.DeepCopy())
delete(rc.Datacenter.ObjectMeta.Annotations, api.UpdateAllowedAnnotation)
if err := rc.Client.Patch(rc.Ctx, rc.Datacenter, patch); err != nil {
rc.ReqLogger.Error(err, "error removing the allow-upgrade=once annotation")
return err
}
}

return nil
}
Loading

0 comments on commit e598ce3

Please sign in to comment.