From f406df868afffbde5c04332b5d429fb938c31f55 Mon Sep 17 00:00:00 2001 From: Michael Burman Date: Tue, 5 Apr 2022 14:38:07 +0300 Subject: [PATCH] Recreate StatefulSet if required for updates to StS (#309) * Another try * Update upgrade_operator to 3 nodes, 3 racks and modify some timeouts in tests to avoid flakiness * Dump CassandraDatacenter and CassandraTask yaml to the build output also, Fixes #312 * Fix Completed() requeue logic in the cleanup task creation, modify scale_up test logic to ensure ScalingUp is set to False. Fixes #311 * Add missing CHANGELOG * Decrease config_change to 2 -> 4 nodes setup with 2 racks instead of 3 -> 6 * Ignore get -o errors, operator_upgrade 1.8.0 has no CassandraTask * Remove commented out lines * Increase timeouts in the tests to avoid flakiness. It seems increasing node count in certain tests caused the first check to pass while the cluster was not really ready and this caused the test to fail. Also, the kubectl outputs to the build directory missed the namespace * Modify config_change to use 1 rack for the testing * config_change cluster and dcName to variables * Missing dcName substitution --- CHANGELOG.md | 5 +- pkg/reconciliation/reconcile_racks.go | 53 ++++++++++++------- .../config_change/config_change_suite_test.go | 36 ++++++------- .../oss_test_all_the_things_suite_test.go | 4 +- tests/scale_up/scale_up_suite_test.go | 5 +- .../default-two-rack-two-node-dc.yaml | 26 +++++++++ tests/testdata/operator-1.7.1-oss-dc.yaml | 4 +- .../upgrade_operator_suite_test.go | 23 ++++++-- tests/util/ginkgo/lib.go | 18 +++---- tests/util/kubectl/kubectl.go | 37 +++++++++++++ 10 files changed, 154 insertions(+), 57 deletions(-) create mode 100644 tests/testdata/default-two-rack-two-node-dc.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index c5c66e67..90cc522d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,9 +13,12 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti * [CHANGE] [#183](https://github.com/k8ssandra/cass-operator/issues/183) Move from PodDisruptionBudget v1beta1 to v1 (min. 
Kubernetes version 1.21) * [CHANGE] [#264](https://github.com/k8ssandra/cass-operator/issues/264) Generate PodTemplateSpec in CassandraDatacenter with metadata -* [ENHANCEMENT] []() Update to Go 1.17 with updates to dependencies: Kube 1.23.4 and controller-runtime 0.11.1 +* [FEATURE] [#309](https://github.com/k8ssandra/cass-operator/pull/309) If StatefulSets are modified in a way that they can't be updated directly, recreate them with new specs +* [ENHANCEMENT] [#292](https://github.com/k8ssandra/cass-operator/issues/292) Update to Go 1.17 with updates to dependencies: Kube 1.23.4 and controller-runtime 0.11.1 +* [ENHANCEMENT] [#312](https://github.com/k8ssandra/cass-operator/issues/312) Integration tests now output CassandraDatacenter and CassandraTask CRD outputs to build directory * [BUGFIX] [#298](https://github.com/k8ssandra/cass-operator/issues/298) EndpointState has incorrect json key * [BUGFIX] [#304](https://github.com/k8ssandra/cass-operator/issues/304) Hostname lookups on Cassandra pods fail +* [BUGFIX] [#311](https://github.com/k8ssandra/cass-operator/issues/311) Fix cleanup retry reconcile bug ## v1.10.1 diff --git a/pkg/reconciliation/reconcile_racks.go b/pkg/reconciliation/reconcile_racks.go index eee9e144..5a9d5724 100644 --- a/pkg/reconciliation/reconcile_racks.go +++ b/pkg/reconciliation/reconcile_racks.go @@ -164,17 +164,13 @@ func (rc *ReconciliationContext) CheckRackCreation() result.ReconcileResult { func (rc *ReconciliationContext) desiredStatefulSetForExistingStatefulSet(sts *appsv1.StatefulSet, rackName string) (desiredSts *appsv1.StatefulSet, err error) { dc := rc.Datacenter - // have to use zero here, because each statefulset is created with no replicas - // in GetStatefulSetForRack() - replicas := 0 - // when Cass Operator was released, we accidentally used the incorrect managed-by // label of "cassandra-operator" we have since fixed this to be "cass-operator", // but unfortunately, we cannot modify the labels in the volumeClaimTemplates of a // StatefulSet. Consequently, we must preserve the old labels in this case. 
usesDefunct := usesDefunctPvcManagedByLabel(sts) - return newStatefulSetForCassandraDatacenter(sts, rackName, dc, replicas, usesDefunct) + return newStatefulSetForCassandraDatacenter(sts, rackName, dc, int(*sts.Spec.Replicas), usesDefunct) } func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult { @@ -275,11 +271,13 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult { statefulSet.SetResourceVersion(resVersion) err = rc.Client.Update(rc.Ctx, statefulSet) if err != nil { - logger.Error( - err, - "Unable to perform update on statefulset for config", - "statefulSet", statefulSet) - return result.Error(err) + if errors.IsInvalid(err) { + if err = rc.deleteStatefulSet(statefulSet); err != nil { + return result.Error(err) + } + } else { + return result.Error(err) + } } if err := rc.enableQuietPeriod(20); err != nil { @@ -334,7 +332,7 @@ func (rc *ReconciliationContext) CheckRackForceUpgrade() result.ReconcileResult return result.Continue() } - for idx := range rc.desiredRackInformation { + for idx, nextRack := range rc.desiredRackInformation { rackName := rc.desiredRackInformation[idx].RackName if slice.ContainsString(forceRacks, rackName, nil) { @@ -342,7 +340,7 @@ func (rc *ReconciliationContext) CheckRackForceUpgrade() result.ReconcileResult // have to use zero here, because each statefulset is created with no replicas // in GetStatefulSetForRack() - desiredSts, err := newStatefulSetForCassandraDatacenter(statefulSet, rackName, dc, 0, false) + desiredSts, err := newStatefulSetForCassandraDatacenter(statefulSet, rackName, dc, nextRack.NodeCount, false) if err != nil { logger.Error(err, "error calling newStatefulSetForCassandraDatacenter") return result.Error(err) @@ -386,13 +384,15 @@ func (rc *ReconciliationContext) CheckRackForceUpgrade() result.ReconcileResult ) if err := rc.Client.Update(rc.Ctx, statefulSet); err != nil { - logger.Error( - err, - "Unable to perform update on statefulset for force update config", - "statefulSet", statefulSet) - return result.Error(err) + if errors.IsInvalid(err) { + if err = rc.deleteStatefulSet(statefulSet); err != nil { + // logger.Error(err, "Failed to delete the StatefulSet", "Invalid", errors.IsInvalid(err), "Forbidden", errors.IsForbidden(err)) + return result.Error(err) + } + } else { + return result.Error(err) + } } - } } @@ -408,6 +408,19 @@ func (rc *ReconciliationContext) CheckRackForceUpgrade() result.ReconcileResult return result.Done() } +func (rc *ReconciliationContext) deleteStatefulSet(statefulSet *appsv1.StatefulSet) error { + policy := metav1.DeletePropagationOrphan + cascadePolicy := client.DeleteOptions{ + PropagationPolicy: &policy, + } + + if err := rc.Client.Delete(rc.Ctx, statefulSet, &cascadePolicy); err != nil { + return err + } + + return nil +} + func (rc *ReconciliationContext) CheckRackLabels() result.ReconcileResult { rc.ReqLogger.Info("reconcile_racks::CheckRackLabels") @@ -1388,7 +1401,7 @@ func (rc *ReconciliationContext) GetStatefulSetForRack( currentStatefulSet, nextRack.RackName, rc.Datacenter, - 0, + nextRack.NodeCount, false) if err != nil { return nil, false, err @@ -2222,7 +2235,7 @@ func (rc *ReconciliationContext) CheckClearActionConditions() result.ReconcileRe // Explicitly handle scaling up here because we want to run a cleanup afterwards if dc.GetConditionStatus(api.DatacenterScalingUp) == corev1.ConditionTrue { // Call the first node with cleanup, wait until it has finished and then move on to the next pod.. 
- if res := rc.cleanupAfterScaling(); !res.Completed() { + if res := rc.cleanupAfterScaling(); res.Completed() { return res } diff --git a/tests/config_change/config_change_suite_test.go b/tests/config_change/config_change_suite_test.go index 754197c9..22b35659 100644 --- a/tests/config_change/config_change_suite_test.go +++ b/tests/config_change/config_change_suite_test.go @@ -16,13 +16,14 @@ import ( ) var ( - testName = "Config change rollout" - namespace = "test-config-change-rollout" - dcName = "dc1" - dcYaml = "../testdata/default-three-rack-three-node-dc.yaml" - dcResource = fmt.Sprintf("CassandraDatacenter/%s", dcName) - dcLabel = fmt.Sprintf("cassandra.datastax.com/datacenter=%s", dcName) - ns = ginkgo_util.NewWrapper(testName, namespace) + testName = "Config change rollout" + namespace = "test-config-change-rollout" + dcName = "dc2" + clusterName = "cluster2" + dcYaml = "../testdata/default-single-rack-single-node-dc.yaml" + dcResource = fmt.Sprintf("CassandraDatacenter/%s", dcName) + dcLabel = fmt.Sprintf("cassandra.datastax.com/datacenter=%s", dcName) + ns = ginkgo_util.NewWrapper(testName, namespace) ) func TestLifecycle(t *testing.T) { @@ -50,40 +51,39 @@ var _ = Describe(testName, func() { ns.WaitForOperatorReady() - step := "creating a datacenter resource with 3 racks/3 nodes" + step := "creating a datacenter resource with 1 racks/1 node" k := kubectl.ApplyFiles(dcYaml) ns.ExecAndLog(step, k) ns.WaitForDatacenterReady(dcName) - step = "scale up to 6 nodes" - json := `{"spec": {"size": 6}}` + step = "scale up to 3 nodes" + json := `{"spec": {"size": 3}}` k = kubectl.PatchMerge(dcResource, json) ns.ExecAndLog(step, k) - ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30) + ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60) ns.WaitForDatacenterReady(dcName) step = "change the config" - json = "{\"spec\": {\"config\": {\"cassandra-yaml\": {\"roles_validity_in_ms\": 256000}, \"jvm-options\": {\"garbage_collector\": \"CMS\"}}}}" + json = "{\"spec\": {\"config\": {\"cassandra-yaml\": {\"roles_validity_in_ms\": 256000}}}}" k = kubectl.PatchMerge(dcResource, json) ns.ExecAndLog(step, k) - ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30) + ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60) ns.WaitForDatacenterOperatorProgress(dcName, "Ready", 1800) - step = "checking that the init container got the updated config roles_validity_in_ms=256000, garbage_collector=CMS" + step = "checking that the init container got the updated config roles_validity_in_ms=256000" json = "jsonpath={.spec.initContainers[0].env[7].value}" - k = kubectl.Get("pod/cluster1-dc1-r1-sts-0"). + k = kubectl.Get(fmt.Sprintf("pod/%s-%s-r1-sts-0", clusterName, dcName)). FormatOutput(json) ns.WaitForOutputContainsAndLog(step, k, "\"roles_validity_in_ms\":256000", 30) - ns.WaitForOutputContainsAndLog(step, k, "\"garbage_collector\":\"CMS\"", 30) step = "checking that statefulsets have the right owner reference" json = "jsonpath={.metadata.ownerReferences[0].name}" - k = kubectl.Get("sts/cluster1-dc1-r1-sts"). + k = kubectl.Get(fmt.Sprintf("sts/%s-%s-r1-sts", clusterName, dcName)). 
FormatOutput(json) - ns.WaitForOutputAndLog(step, k, "dc1", 30) + ns.WaitForOutputAndLog(step, k, dcName, 30) step = "deleting the dc" k = kubectl.DeleteFromFiles(dcYaml) diff --git a/tests/oss_test_all_the_things/oss_test_all_the_things_suite_test.go b/tests/oss_test_all_the_things/oss_test_all_the_things_suite_test.go index 3abb6057..7c808b43 100644 --- a/tests/oss_test_all_the_things/oss_test_all_the_things_suite_test.go +++ b/tests/oss_test_all_the_things/oss_test_all_the_things_suite_test.go @@ -69,7 +69,7 @@ var _ = Describe(testName, func() { k = kubectl.PatchMerge(dcResource, json) ns.ExecAndLog(step, k) - ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30) + ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60) ns.WaitForDatacenterReady(dcName) step = "scale up to 5 nodes" @@ -77,7 +77,7 @@ var _ = Describe(testName, func() { k = kubectl.PatchMerge(dcResource, json) ns.ExecAndLog(step, k) - ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30) + ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60) ns.WaitForDatacenterReady(dcName) step = "stopping the dc" diff --git a/tests/scale_up/scale_up_suite_test.go b/tests/scale_up/scale_up_suite_test.go index 9ceb35e7..0af3bd7b 100644 --- a/tests/scale_up/scale_up_suite_test.go +++ b/tests/scale_up/scale_up_suite_test.go @@ -71,15 +71,16 @@ var _ = Describe(testName, func() { // Ensure we have a single CassandraTask created which is a cleanup (and it succeeded) ns.CheckForCompletedCassandraTasks(dcName, "cleanup", 1) - // ns.CheckForCompletedCassandraTask(dcName, "cleanup") step = "scale up to 4 nodes" json = "{\"spec\": {\"size\": 4}}" k = kubectl.PatchMerge(dcResource, json) ns.ExecAndLog(step, k) + ns.WaitForDatacenterCondition(dcName, "ScalingUp", string(corev1.ConditionTrue)) ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60) ns.WaitForDatacenterReady(dcName) + ns.WaitForDatacenterCondition(dcName, "ScalingUp", string(corev1.ConditionFalse)) // Ensure we have two CassandraTasks created which are cleanup (and they succeeded) ns.CheckForCompletedCassandraTasks(dcName, "cleanup", 2) @@ -89,8 +90,10 @@ var _ = Describe(testName, func() { k = kubectl.PatchMerge(dcResource, json) ns.ExecAndLog(step, k) + ns.WaitForDatacenterCondition(dcName, "ScalingUp", string(corev1.ConditionTrue)) ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60) ns.WaitForDatacenterReady(dcName) + ns.WaitForDatacenterCondition(dcName, "ScalingUp", string(corev1.ConditionFalse)) // Ensure we have three CassandraTasks created which are cleanup (and they succeeded) ns.CheckForCompletedCassandraTasks(dcName, "cleanup", 3) diff --git a/tests/testdata/default-two-rack-two-node-dc.yaml b/tests/testdata/default-two-rack-two-node-dc.yaml new file mode 100644 index 00000000..7306d25e --- /dev/null +++ b/tests/testdata/default-two-rack-two-node-dc.yaml @@ -0,0 +1,26 @@ +apiVersion: cassandra.datastax.com/v1beta1 +kind: CassandraDatacenter +metadata: + name: dc1 +spec: + clusterName: cluster1 + serverType: cassandra + serverVersion: "4.0.3" + managementApiAuth: + insecure: {} + size: 2 + storageConfig: + cassandraDataVolumeClaimSpec: + storageClassName: standard + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + racks: + - name: r1 + - name: r2 + config: + jvm-server-options: + initial_heap_size: "512m" + max_heap_size: "512m" diff --git a/tests/testdata/operator-1.7.1-oss-dc.yaml b/tests/testdata/operator-1.7.1-oss-dc.yaml index 4fee5b8c..24c0a98d 100644 --- a/tests/testdata/operator-1.7.1-oss-dc.yaml +++ 
b/tests/testdata/operator-1.7.1-oss-dc.yaml @@ -8,7 +8,7 @@ spec: serverVersion: "3.11.7" managementApiAuth: insecure: {} - size: 1 + size: 3 storageConfig: cassandraDataVolumeClaimSpec: storageClassName: standard @@ -19,6 +19,8 @@ spec: storage: 1Gi racks: - name: r1 + - name: r2 + - name: r3 config: jvm-options: initial_heap_size: "512m" diff --git a/tests/upgrade_operator/upgrade_operator_suite_test.go b/tests/upgrade_operator/upgrade_operator_suite_test.go index 01bdf055..ae863ee6 100644 --- a/tests/upgrade_operator/upgrade_operator_suite_test.go +++ b/tests/upgrade_operator/upgrade_operator_suite_test.go @@ -32,6 +32,7 @@ func TestLifecycle(t *testing.T) { kubectl.DumpAllLogs(logPath).ExecV() fmt.Printf("\n\tPost-run logs dumped at: %s\n\n", logPath) ns.Terminate() + kustomize.Undeploy(namespace) }) RegisterFailHandler(Fail) @@ -69,12 +70,17 @@ var _ = Describe(testName, func() { ns.WaitForOperatorReady() - step := "creating a datacenter resource with 1 racks/1 node" + step := "creating a datacenter resource with 3 racks/3 node" k := kubectl.ApplyFiles(dcYaml) ns.ExecAndLog(step, k) ns.WaitForDatacenterReady(dcName) + // Get UID of the cluster pod + // step = "get Cassandra pods UID" + // k = kubectl.Get("pod/cluster1-dc1-r1-sts-0").FormatOutput("jsonpath={.metadata.uid}") + // createdPodUID := ns.OutputAndLog(step, k) + step = "get name of 1.8.0 operator pod" json := "jsonpath={.items[].metadata.name}" k = kubectl.Get("pods").WithFlag("selector", "name=cass-operator").FormatOutput(json) @@ -91,10 +97,17 @@ var _ = Describe(testName, func() { // give the operator a minute to reconcile and update the datacenter time.Sleep(1 * time.Minute) - ns.WaitForDatacenterReadyWithTimeouts(dcName, 800, 60) + ns.WaitForDatacenterReadyWithTimeouts(dcName, 1200, 1200) ns.ExpectDoneReconciling(dcName) + // Verify Pod hasn't restarted + // step = "get Cassandra pods UID" + // k = kubectl.Get("pod/cluster1-dc1-r1-sts-0").FormatOutput("jsonpath={.metadata.uid}") + // postUpgradeCassPodUID := ns.OutputAndLog(step, k) + + // Expect(createdPodUID).To(Equal(postUpgradeCassPodUID)) + // Verify PodDisruptionBudget is available (1.11 updates from v1beta1 -> v1) json = "jsonpath={.items[].metadata.name}" k = kubectl.Get("poddisruptionbudgets").WithLabel("cassandra.datastax.com/datacenter").FormatOutput(json) @@ -106,9 +119,9 @@ var _ = Describe(testName, func() { k = kubectl.PatchMerge(dcResource, json) ns.ExecAndLog(step, k) - ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30) - ns.WaitForDatacenterReadyPodCount(dcName, 1) - ns.WaitForDatacenterReadyWithTimeouts(dcName, 800, 60) + ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 60) + ns.WaitForDatacenterReadyWithTimeouts(dcName, 1200, 1200) + ns.WaitForDatacenterReadyPodCount(dcName, 3) ns.ExpectDoneReconciling(dcName) diff --git a/tests/util/ginkgo/lib.go b/tests/util/ginkgo/lib.go index 1cb653f7..92e6426e 100644 --- a/tests/util/ginkgo/lib.go +++ b/tests/util/ginkgo/lib.go @@ -153,14 +153,14 @@ func (ns *NsWrapper) genTestLogDir(description string) string { func (ns *NsWrapper) ExecAndLog(description string, kcmd kubectl.KCmd) { ginkgo.By(description) - defer kubectl.DumpLogs(ns.genTestLogDir(description), ns.Namespace).ExecVPanic() + defer kubectl.DumpClusterInfo(ns.genTestLogDir(description), ns.Namespace) execErr := ns.ExecV(kcmd) Expect(execErr).ToNot(HaveOccurred()) } func (ns *NsWrapper) ExecAndLogAndExpectErrorString(description string, kcmd kubectl.KCmd, expectedError string) { ginkgo.By(description) - defer 
kubectl.DumpLogs(ns.genTestLogDir(description), ns.Namespace).ExecVPanic() + defer kubectl.DumpClusterInfo(ns.genTestLogDir(description), ns.Namespace) _, captureErr, execErr := ns.ExecVCapture(kcmd) Expect(execErr).To(HaveOccurred()) Expect(captureErr).Should(ContainSubstring(expectedError)) @@ -168,7 +168,7 @@ func (ns *NsWrapper) ExecAndLogAndExpectErrorString(description string, kcmd kub func (ns *NsWrapper) OutputAndLog(description string, kcmd kubectl.KCmd) string { ginkgo.By(description) - defer kubectl.DumpLogs(ns.genTestLogDir(description), ns.Namespace).ExecVPanic() + defer kubectl.DumpClusterInfo(ns.genTestLogDir(description), ns.Namespace) output, execErr := ns.Output(kcmd) Expect(execErr).ToNot(HaveOccurred()) return output @@ -176,21 +176,21 @@ func (ns *NsWrapper) OutputAndLog(description string, kcmd kubectl.KCmd) string func (ns *NsWrapper) WaitForOutputAndLog(description string, kcmd kubectl.KCmd, expected string, seconds int) { ginkgo.By(description) - defer kubectl.DumpLogs(ns.genTestLogDir(description), ns.Namespace).ExecVPanic() + defer kubectl.DumpClusterInfo(ns.genTestLogDir(description), ns.Namespace) execErr := ns.WaitForOutput(kcmd, expected, seconds) Expect(execErr).ToNot(HaveOccurred()) } func (ns *NsWrapper) WaitForOutputPatternAndLog(description string, kcmd kubectl.KCmd, expected string, seconds int) { ginkgo.By(description) - defer kubectl.DumpLogs(ns.genTestLogDir(description), ns.Namespace).ExecVPanic() + defer kubectl.DumpClusterInfo(ns.genTestLogDir(description), ns.Namespace) execErr := ns.WaitForOutputPattern(kcmd, expected, seconds) Expect(execErr).ToNot(HaveOccurred()) } func (ns *NsWrapper) WaitForOutputContainsAndLog(description string, kcmd kubectl.KCmd, expected string, seconds int) { ginkgo.By(description) - defer kubectl.DumpLogs(ns.genTestLogDir(description), ns.Namespace).ExecVPanic() + defer kubectl.DumpClusterInfo(ns.genTestLogDir(description), ns.Namespace) execErr := ns.WaitForOutputContains(kcmd, expected, seconds) Expect(execErr).ToNot(HaveOccurred()) } @@ -267,7 +267,7 @@ func (ns *NsWrapper) WaitForDatacenterReadyPodCountWithTimeout(dcName string, co } func (ns *NsWrapper) WaitForDatacenterReady(dcName string) { - ns.WaitForDatacenterReadyWithTimeouts(dcName, 600, 30) + ns.WaitForDatacenterReadyWithTimeouts(dcName, 1200, 1200) } func (ns *NsWrapper) WaitForDatacenterReadyWithTimeouts(dcName string, podCountTimeout int, dcReadyTimeout int) { @@ -383,7 +383,7 @@ func (ns *NsWrapper) WaitForOperatorReady() { WithLabel("name=cass-operator"). WithFlag("field-selector", "status.phase=Running"). FormatOutput(json) - ns.WaitForOutputAndLog(step, k, "true", 240) + ns.WaitForOutputAndLog(step, k, "true", 300) } // kubectl create secret docker-registry github-docker-registry --docker-username=USER --docker-password=PASS --docker-server docker.pkg.github.com @@ -529,5 +529,5 @@ func (ns *NsWrapper) CheckForCompletedCassandraTasks(dcName, command string, cou WithLabel(fmt.Sprintf("cassandra.datastax.com/datacenter=%s", dcName)). WithLabel("control.k8ssandra.io/status=completed"). 
FormatOutput(json) - ns.WaitForOutputAndLog(step, k, duplicate(command, count), 60) + ns.WaitForOutputAndLog(step, k, duplicate(command, count), 120) } diff --git a/tests/util/kubectl/kubectl.go b/tests/util/kubectl/kubectl.go index f8b2b59b..bd3896a7 100644 --- a/tests/util/kubectl/kubectl.go +++ b/tests/util/kubectl/kubectl.go @@ -351,6 +351,43 @@ func DumpLogs(path string, namespace string) KCmd { return KCmd{Command: "cluster-info", Args: args, Flags: flags} } +// DumpClusterInfo Executes `kubectl cluster-info dump -o yaml` on each cluster. The output +// is stored under /build/test. +func DumpClusterInfo(path string, namespace string) error { + _ = os.MkdirAll(path, os.ModePerm) + + dumpCmd := DumpLogs(path, namespace) + dumpCmd.ExecVPanic() + + // Store the list of pods in an easy to read format. + podWide := Get("pods", "-o", "wide", "-n", namespace).OutputPanic() + storeOutput(path, "pods", "out", podWide) + + // Dump all objects that we need to investigate failures as a flat list and as yaml manifests + for _, objectType := range []string{"CassandraDatacenter", "CassandraTask"} { + // Get the list of objects + output, _ := Get(objectType, "-o", "wide", "-n", namespace).Output() + storeOutput(path, objectType, "out", output) + + // Get the yamls for each object + output, _ = Get(objectType, "-o", "yaml", "-n", namespace).Output() + storeOutput(path, objectType, "yaml", output) + } + + return nil +} + +func storeOutput(path, objectType, ext, output string) { + filePath := fmt.Sprintf("%s/%s.%s", path, objectType, ext) + outputFile, err := os.Create(filePath) + if err != nil { + panic("Failed to create log file") + } + defer outputFile.Close() + outputFile.WriteString(output) + outputFile.Sync() +} + func ExecOnPod(podName string, args ...string) KCmd { execArgs := []string{podName} execArgs = append(execArgs, args...)
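
For reference, a minimal sketch of the recreate-on-invalid-update pattern this patch introduces in CheckRackPodTemplate and CheckRackForceUpgrade, assuming a controller-runtime client; updateOrRecreate is an illustrative name and is not part of cass-operator:

package example

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// updateOrRecreate condenses the pattern from the patch: try a normal Update first,
// and if the API server rejects the change as invalid (for example, a change to
// immutable StatefulSet fields), delete the StatefulSet with orphan propagation so
// its pods keep running while a later reconcile recreates it from the desired spec.
func updateOrRecreate(ctx context.Context, c client.Client, sts *appsv1.StatefulSet) error {
	if err := c.Update(ctx, sts); err != nil {
		if !apierrors.IsInvalid(err) {
			return err
		}
		// Orphan the pods: only the StatefulSet object itself is deleted.
		policy := metav1.DeletePropagationOrphan
		return c.Delete(ctx, sts, &client.DeleteOptions{PropagationPolicy: &policy})
	}
	return nil
}

In the patch itself the recreate spans two reconcile passes: deleteStatefulSet orphans the pods, and the next call to GetStatefulSetForRack builds a fresh StatefulSet with the rack's node count rather than zero replicas.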