Recreate StatefulSet if required for updates to StS (#309)
* Another try

* Update upgrade_operator to 3 nodes, 3 racks and modify some timeouts in tests to avoid flakiness

* Dump CassandraDatacenter and CassandraTask YAML to the build output as well. Fixes #312

* Fix Completed() requeue logic in the cleanup task creation; modify scale_up test logic to ensure ScalingUp is set to False. Fixes #311

* Add missing CHANGELOG

* Decrease config_change to a 2 -> 4 node setup with 2 racks instead of 3 -> 6

* Ignore get -o errors; operator_upgrade 1.8.0 has no CassandraTask

* Remove commented out lines

* Increase timeouts in the tests to avoid flakiness. Increasing the node count in certain tests let the first check pass while the cluster was not actually ready, which caused the test to fail. Also, the kubectl outputs written to the build directory were missing the namespace

* Modify config_change to use 1 rack for testing

* Extract config_change cluster and dcName into variables

* Missing dcName substitution
burmanm authored Apr 5, 2022
1 parent ade9521 commit f406df8
Showing 10 changed files with 154 additions and 57 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -13,9 +13,12 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` section

* [CHANGE] [#183](https://github.com/k8ssandra/cass-operator/issues/183) Move from PodDisruptionBudget v1beta1 to v1 (min. Kubernetes version 1.21)
* [CHANGE] [#264](https://github.com/k8ssandra/cass-operator/issues/264) Generate PodTemplateSpec in CassandraDatacenter with metadata
* [ENHANCEMENT] []() Update to Go 1.17 with updates to dependencies: Kube 1.23.4 and controller-runtime 0.11.1
* [FEATURE] [#309](https://github.com/k8ssandra/cass-operator/pull/309) If StatefulSets are modified in a way that they can't be updated directly, recreate them with new specs
* [ENHANCEMENT] [#292](https://github.com/k8ssandra/cass-operator/issues/292) Update to Go 1.17 with updates to dependencies: Kube 1.23.4 and controller-runtime 0.11.1
* [ENHANCEMENT] [#312](https://github.com/k8ssandra/cass-operator/issues/312) Integration tests now output CassandraDatacenter and CassandraTask CRD outputs to build directory
* [BUGFIX] [#298](https://github.com/k8ssandra/cass-operator/issues/298) EndpointState has incorrect json key
* [BUGFIX] [#304](https://github.com/k8ssandra/cass-operator/issues/304) Hostname lookups on Cassandra pods fail
* [BUGFIX] [#311](https://github.com/k8ssandra/cass-operator/issues/311) Fix cleanup retry reconcile bug

## v1.10.1

53 changes: 33 additions & 20 deletions pkg/reconciliation/reconcile_racks.go
@@ -164,17 +164,13 @@ func (rc *ReconciliationContext) CheckRackCreation() result.ReconcileResult {
func (rc *ReconciliationContext) desiredStatefulSetForExistingStatefulSet(sts *appsv1.StatefulSet, rackName string) (desiredSts *appsv1.StatefulSet, err error) {
dc := rc.Datacenter

// have to use zero here, because each statefulset is created with no replicas
// in GetStatefulSetForRack()
replicas := 0

// when Cass Operator was released, we accidentally used the incorrect managed-by
// label of "cassandra-operator" we have since fixed this to be "cass-operator",
// but unfortunately, we cannot modify the labels in the volumeClaimTemplates of a
// StatefulSet. Consequently, we must preserve the old labels in this case.
usesDefunct := usesDefunctPvcManagedByLabel(sts)

return newStatefulSetForCassandraDatacenter(sts, rackName, dc, replicas, usesDefunct)
return newStatefulSetForCassandraDatacenter(sts, rackName, dc, int(*sts.Spec.Replicas), usesDefunct)
}

func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
@@ -275,11 +271,13 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
statefulSet.SetResourceVersion(resVersion)
err = rc.Client.Update(rc.Ctx, statefulSet)
if err != nil {
logger.Error(
err,
"Unable to perform update on statefulset for config",
"statefulSet", statefulSet)
return result.Error(err)
if errors.IsInvalid(err) {
if err = rc.deleteStatefulSet(statefulSet); err != nil {
return result.Error(err)
}
} else {
return result.Error(err)
}
}

if err := rc.enableQuietPeriod(20); err != nil {
@@ -334,15 +332,15 @@ func (rc *ReconciliationContext) CheckRackForceUpgrade() result.ReconcileResult
return result.Continue()
}

for idx := range rc.desiredRackInformation {
for idx, nextRack := range rc.desiredRackInformation {
rackName := rc.desiredRackInformation[idx].RackName
if slice.ContainsString(forceRacks, rackName, nil) {

statefulSet := rc.statefulSets[idx]

// have to use zero here, because each statefulset is created with no replicas
// in GetStatefulSetForRack()
desiredSts, err := newStatefulSetForCassandraDatacenter(statefulSet, rackName, dc, 0, false)
desiredSts, err := newStatefulSetForCassandraDatacenter(statefulSet, rackName, dc, nextRack.NodeCount, false)
if err != nil {
logger.Error(err, "error calling newStatefulSetForCassandraDatacenter")
return result.Error(err)
@@ -386,13 +384,15 @@ func (rc *ReconciliationContext) CheckRackForceUpgrade() result.ReconcileResult
)

if err := rc.Client.Update(rc.Ctx, statefulSet); err != nil {
logger.Error(
err,
"Unable to perform update on statefulset for force update config",
"statefulSet", statefulSet)
return result.Error(err)
if errors.IsInvalid(err) {
if err = rc.deleteStatefulSet(statefulSet); err != nil {
// logger.Error(err, "Failed to delete the StatefulSet", "Invalid", errors.IsInvalid(err), "Forbidden", errors.IsForbidden(err))
return result.Error(err)
}
} else {
return result.Error(err)
}
}

}
}

@@ -408,6 +408,19 @@ func (rc *ReconciliationContext) CheckRackForceUpgrade() result.ReconcileResult
return result.Done()
}

func (rc *ReconciliationContext) deleteStatefulSet(statefulSet *appsv1.StatefulSet) error {
policy := metav1.DeletePropagationOrphan
cascadePolicy := client.DeleteOptions{
PropagationPolicy: &policy,
}

if err := rc.Client.Delete(rc.Ctx, statefulSet, &cascadePolicy); err != nil {
return err
}

return nil
}

func (rc *ReconciliationContext) CheckRackLabels() result.ReconcileResult {
rc.ReqLogger.Info("reconcile_racks::CheckRackLabels")

@@ -1388,7 +1401,7 @@ func (rc *ReconciliationContext) GetStatefulSetForRack(
currentStatefulSet,
nextRack.RackName,
rc.Datacenter,
0,
nextRack.NodeCount,
false)
if err != nil {
return nil, false, err
@@ -2222,7 +2235,7 @@ func (rc *ReconciliationContext) CheckClearActionConditions() result.ReconcileResult
// Explicitly handle scaling up here because we want to run a cleanup afterwards
if dc.GetConditionStatus(api.DatacenterScalingUp) == corev1.ConditionTrue {
// Call the first node with cleanup, wait until it has finished and then move on to the next pod..
if res := rc.cleanupAfterScaling(); !res.Completed() {
if res := rc.cleanupAfterScaling(); res.Completed() {
return res
}

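The heart of #309 is the new fallback in CheckRackPodTemplate and CheckRackForceUpgrade above: when the API server rejects a StatefulSet update as invalid (typically because an immutable field changed), the operator deletes the StatefulSet with orphan propagation and lets a later reconcile recreate it from the desired spec. A minimal standalone sketch of that pattern, assuming a controller-runtime client; the helper name recreateOnInvalidUpdate is illustrative, not the operator's API:

```go
package sketch

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// recreateOnInvalidUpdate tries a normal update first. If the API server
// rejects it as invalid (e.g. an immutable StatefulSet field changed), the
// StatefulSet is deleted with orphan propagation so its pods keep running;
// the next reconcile pass can then create it again from the desired spec.
func recreateOnInvalidUpdate(ctx context.Context, c client.Client, sts *appsv1.StatefulSet) error {
	err := c.Update(ctx, sts)
	if err == nil {
		return nil
	}
	if !errors.IsInvalid(err) {
		return err
	}
	// Orphan the pods instead of cascading the delete, mirroring
	// deleteStatefulSet in the diff above.
	policy := metav1.DeletePropagationOrphan
	return c.Delete(ctx, sts, &client.DeleteOptions{PropagationPolicy: &policy})
}
```

Orphan propagation is what keeps the Cassandra pods running while the StatefulSet object is swapped out; the recreated StatefulSet adopts them again through its label selector, so the swap does not by itself force a rolling restart.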
36 changes: 18 additions & 18 deletions tests/config_change/config_change_suite_test.go
@@ -16,13 +16,14 @@ import (
)

var (
testName = "Config change rollout"
namespace = "test-config-change-rollout"
dcName = "dc1"
dcYaml = "../testdata/default-three-rack-three-node-dc.yaml"
dcResource = fmt.Sprintf("CassandraDatacenter/%s", dcName)
dcLabel = fmt.Sprintf("cassandra.datastax.com/datacenter=%s", dcName)
ns = ginkgo_util.NewWrapper(testName, namespace)
testName = "Config change rollout"
namespace = "test-config-change-rollout"
dcName = "dc2"
clusterName = "cluster2"
dcYaml = "../testdata/default-single-rack-single-node-dc.yaml"
dcResource = fmt.Sprintf("CassandraDatacenter/%s", dcName)
dcLabel = fmt.Sprintf("cassandra.datastax.com/datacenter=%s", dcName)
ns = ginkgo_util.NewWrapper(testName, namespace)
)

func TestLifecycle(t *testing.T) {
@@ -50,40 +51,39 @@ var _ = Describe(testName, func() {

ns.WaitForOperatorReady()

step := "creating a datacenter resource with 3 racks/3 nodes"
step := "creating a datacenter resource with 1 racks/1 node"
k := kubectl.ApplyFiles(dcYaml)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterReady(dcName)

step = "scale up to 6 nodes"
json := `{"spec": {"size": 6}}`
step = "scale up to 3 nodes"
json := `{"spec": {"size": 3}}`
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60)
ns.WaitForDatacenterReady(dcName)

step = "change the config"
json = "{\"spec\": {\"config\": {\"cassandra-yaml\": {\"roles_validity_in_ms\": 256000}, \"jvm-options\": {\"garbage_collector\": \"CMS\"}}}}"
json = "{\"spec\": {\"config\": {\"cassandra-yaml\": {\"roles_validity_in_ms\": 256000}}}}"
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60)
ns.WaitForDatacenterOperatorProgress(dcName, "Ready", 1800)

step = "checking that the init container got the updated config roles_validity_in_ms=256000, garbage_collector=CMS"
step = "checking that the init container got the updated config roles_validity_in_ms=256000"
json = "jsonpath={.spec.initContainers[0].env[7].value}"
k = kubectl.Get("pod/cluster1-dc1-r1-sts-0").
k = kubectl.Get(fmt.Sprintf("pod/%s-%s-r1-sts-0", clusterName, dcName)).
FormatOutput(json)
ns.WaitForOutputContainsAndLog(step, k, "\"roles_validity_in_ms\":256000", 30)
ns.WaitForOutputContainsAndLog(step, k, "\"garbage_collector\":\"CMS\"", 30)

step = "checking that statefulsets have the right owner reference"
json = "jsonpath={.metadata.ownerReferences[0].name}"
k = kubectl.Get("sts/cluster1-dc1-r1-sts").
k = kubectl.Get(fmt.Sprintf("sts/%s-%s-r1-sts", clusterName, dcName)).
FormatOutput(json)
ns.WaitForOutputAndLog(step, k, "dc1", 30)
ns.WaitForOutputAndLog(step, k, dcName, 30)

step = "deleting the dc"
k = kubectl.DeleteFromFiles(dcYaml)
@@ -69,15 +69,15 @@ var _ = Describe(testName, func() {
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60)
ns.WaitForDatacenterReady(dcName)

step = "scale up to 5 nodes"
json = "{\"spec\": {\"size\": 5}}"
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60)
ns.WaitForDatacenterReady(dcName)

step = "stopping the dc"
5 changes: 4 additions & 1 deletion tests/scale_up/scale_up_suite_test.go
@@ -71,15 +71,16 @@ var _ = Describe(testName, func() {

// Ensure we have a single CassandraTask created which is a cleanup (and it succeeded)
ns.CheckForCompletedCassandraTasks(dcName, "cleanup", 1)
// ns.CheckForCompletedCassandraTask(dcName, "cleanup")

step = "scale up to 4 nodes"
json = "{\"spec\": {\"size\": 4}}"
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterCondition(dcName, "ScalingUp", string(corev1.ConditionTrue))
ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60)
ns.WaitForDatacenterReady(dcName)
ns.WaitForDatacenterCondition(dcName, "ScalingUp", string(corev1.ConditionFalse))

// Ensure we have two CassandraTasks created which are cleanup (and they succeeded)
ns.CheckForCompletedCassandraTasks(dcName, "cleanup", 2)
@@ -89,8 +90,10 @@ var _ = Describe(testName, func() {
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterCondition(dcName, "ScalingUp", string(corev1.ConditionTrue))
ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60)
ns.WaitForDatacenterReady(dcName)
ns.WaitForDatacenterCondition(dcName, "ScalingUp", string(corev1.ConditionFalse))

// Ensure we have three CassandraTasks created which are cleanup (and they succeeded)
ns.CheckForCompletedCassandraTasks(dcName, "cleanup", 3)
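These new ScalingUp assertions pin down the #311 fix shown earlier in reconcile_racks.go: the cleanup result should only be returned when it actually tells the reconcile loop to stop; with the old negated check, a plain "continue" result was handed back early and the ScalingUp condition was never flipped back to False. A compressed sketch of the corrected guard, with a simplified stand-in for the operator's result.ReconcileResult type (handleScalingUp is illustrative, not the operator's API):

```go
package sketch

// reconcileResult mirrors the relevant part of result.ReconcileResult:
// Completed() reports whether the reconcile loop should stop and return this result.
type reconcileResult interface {
	Completed() bool
}

// handleScalingUp runs cleanup and only short-circuits when the result says
// the loop must stop. Before #311 the condition was negated (!res.Completed()),
// so the fall-through that clears ScalingUp was skipped.
func handleScalingUp(cleanupAfterScaling func() reconcileResult, clearScalingUp func()) (reconcileResult, bool) {
	if res := cleanupAfterScaling(); res.Completed() {
		return res, true // requeue/error/done: hand the result back to the caller
	}
	clearScalingUp() // ScalingUp -> False, which the updated scale_up test now asserts
	return nil, false
}
```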
26 changes: 26 additions & 0 deletions tests/testdata/default-two-rack-two-node-dc.yaml
@@ -0,0 +1,26 @@
apiVersion: cassandra.datastax.com/v1beta1
kind: CassandraDatacenter
metadata:
name: dc1
spec:
clusterName: cluster1
serverType: cassandra
serverVersion: "4.0.3"
managementApiAuth:
insecure: {}
size: 2
storageConfig:
cassandraDataVolumeClaimSpec:
storageClassName: standard
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
racks:
- name: r1
- name: r2
config:
jvm-server-options:
initial_heap_size: "512m"
max_heap_size: "512m"
4 changes: 3 additions & 1 deletion tests/testdata/operator-1.7.1-oss-dc.yaml
@@ -8,7 +8,7 @@ spec:
serverVersion: "3.11.7"
managementApiAuth:
insecure: {}
size: 1
size: 3
storageConfig:
cassandraDataVolumeClaimSpec:
storageClassName: standard
@@ -19,6 +19,8 @@ spec:
storage: 1Gi
racks:
- name: r1
- name: r2
- name: r3
config:
jvm-options:
initial_heap_size: "512m"
23 changes: 18 additions & 5 deletions tests/upgrade_operator/upgrade_operator_suite_test.go
@@ -32,6 +32,7 @@ func TestLifecycle(t *testing.T) {
kubectl.DumpAllLogs(logPath).ExecV()
fmt.Printf("\n\tPost-run logs dumped at: %s\n\n", logPath)
ns.Terminate()
kustomize.Undeploy(namespace)
})

RegisterFailHandler(Fail)
@@ -69,12 +70,17 @@ var _ = Describe(testName, func() {

ns.WaitForOperatorReady()

step := "creating a datacenter resource with 1 racks/1 node"
step := "creating a datacenter resource with 3 racks/3 node"
k := kubectl.ApplyFiles(dcYaml)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterReady(dcName)

// Get UID of the cluster pod
// step = "get Cassandra pods UID"
// k = kubectl.Get("pod/cluster1-dc1-r1-sts-0").FormatOutput("jsonpath={.metadata.uid}")
// createdPodUID := ns.OutputAndLog(step, k)

step = "get name of 1.8.0 operator pod"
json := "jsonpath={.items[].metadata.name}"
k = kubectl.Get("pods").WithFlag("selector", "name=cass-operator").FormatOutput(json)
@@ -91,10 +97,17 @@ var _ = Describe(testName, func() {
// give the operator a minute to reconcile and update the datacenter
time.Sleep(1 * time.Minute)

ns.WaitForDatacenterReadyWithTimeouts(dcName, 800, 60)
ns.WaitForDatacenterReadyWithTimeouts(dcName, 1200, 1200)

ns.ExpectDoneReconciling(dcName)

// Verify Pod hasn't restarted
// step = "get Cassandra pods UID"
// k = kubectl.Get("pod/cluster1-dc1-r1-sts-0").FormatOutput("jsonpath={.metadata.uid}")
// postUpgradeCassPodUID := ns.OutputAndLog(step, k)

// Expect(createdPodUID).To(Equal(postUpgradeCassPodUID))

// Verify PodDisruptionBudget is available (1.11 updates from v1beta1 -> v1)
json = "jsonpath={.items[].metadata.name}"
k = kubectl.Get("poddisruptionbudgets").WithLabel("cassandra.datastax.com/datacenter").FormatOutput(json)
Expand All @@ -106,9 +119,9 @@ var _ = Describe(testName, func() {
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
ns.WaitForDatacenterReadyPodCount(dcName, 1)
ns.WaitForDatacenterReadyWithTimeouts(dcName, 800, 60)
ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60)
ns.WaitForDatacenterReadyWithTimeouts(dcName, 1200, 1200)
ns.WaitForDatacenterReadyPodCount(dcName, 3)

ns.ExpectDoneReconciling(dcName)

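The commented-out UID checks above hint at a pod-identity assertion that this commit leaves disabled. Wired into the same spec with the suite's existing kubectl and ns helpers (and gomega's Expect), it could look roughly like the fragment below; the variable names follow the comments, and the pod name assumes the default cluster1/dc1 layout:

```go
// Before upgrading the operator, remember the UID of a Cassandra pod.
step = "get Cassandra pod UID before the upgrade"
k = kubectl.Get("pod/cluster1-dc1-r1-sts-0").FormatOutput("jsonpath={.metadata.uid}")
createdPodUID := ns.OutputAndLog(step, k)

// ...upgrade the operator and wait for the datacenter to become Ready...

// After the upgrade, the same pod should still have the same UID: the
// StatefulSet recreation from #309 orphans and re-adopts the pods instead
// of deleting them.
step = "get Cassandra pod UID after the upgrade"
k = kubectl.Get("pod/cluster1-dc1-r1-sts-0").FormatOutput("jsonpath={.metadata.uid}")
postUpgradeCassPodUID := ns.OutputAndLog(step, k)

Expect(postUpgradeCassPodUID).To(Equal(createdPodUID))
```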