From beb3baa3571622ba2e2afbe28e8211cdcefd947b Mon Sep 17 00:00:00 2001 From: Michael Burman Date: Tue, 19 Apr 2022 09:45:09 +0300 Subject: [PATCH 1/4] Add logging to catch the flakiness of decommission_dc test --- .github/workflows/kindIntegTest.yml | 154 +++++++++--------- .../decommission_dc_suite_test.go | 2 + 2 files changed, 79 insertions(+), 77 deletions(-) diff --git a/.github/workflows/kindIntegTest.yml b/.github/workflows/kindIntegTest.yml index 33138627..4a6945a3 100644 --- a/.github/workflows/kindIntegTest.yml +++ b/.github/workflows/kindIntegTest.yml @@ -60,46 +60,46 @@ jobs: matrix: integration_test: # Single worker tests: - - additional_serviceoptions - - additional_volumes - # - delete_node_terminated_container # This does not test any operator behavior - - podspec_simple - # - smoke_test_oss # See next job - - smoke_test_dse - # - terminate # Completes too fast, the test doesn't catch it - - timeout_prestop_termination - - upgrade_operator - - webhook_validation - # Three worker tests: - - canary_upgrade - - config_change_condition - # - delete_node_lost_readiness # DSE specific behavior - - host_network - - internode-encryption-generated - #- no_infinite_reconcile # smoke_test_* should take care of this - - node_replace - - nodeport_service - - rolling_restart - - stop_resume - - superuser-secret-generated - - superuser-secret-provided - - test_bad_config_and_fix - - test_mtls_mgmt_api - # More than 3 workers tests: - - add_racks - - additional_seeds - - cluster_wide_install - - config_change - - config_secret - - multi_cluster_management - - oss_test_all_the_things - - scale_down - # - scale_down_not_enough_space # Not enough stable test - - scale_down_unbalanced_racks - - scale_up - - scale_up_stop_resume - - seed_selection - - config_fql + # - additional_serviceoptions + # - additional_volumes + # # - delete_node_terminated_container # This does not test any operator behavior + # - podspec_simple + # # - smoke_test_oss # See next job + # - 
smoke_test_dse + # # - terminate # Completes too fast, the test doesn't catch it + # - timeout_prestop_termination + # - upgrade_operator + # - webhook_validation + # # Three worker tests: + # - canary_upgrade + # - config_change_condition + # # - delete_node_lost_readiness # DSE specific behavior + # - host_network + # - internode-encryption-generated + # #- no_infinite_reconcile # smoke_test_* should take care of this + # - node_replace + # - nodeport_service + # - rolling_restart + # - stop_resume + # - superuser-secret-generated + # - superuser-secret-provided + # - test_bad_config_and_fix + # - test_mtls_mgmt_api + # # More than 3 workers tests: + # - add_racks + # - additional_seeds + # - cluster_wide_install + # - config_change + # - config_secret + # - multi_cluster_management + # - oss_test_all_the_things + # - scale_down + # # - scale_down_not_enough_space # Not enough stable test + # - scale_down_unbalanced_racks + # - scale_up + # - scale_up_stop_resume + # - seed_selection + # - config_fql - decommission_dc # - stop_resume_scale_up # Odd insufficient CPU issues in kind+GHA # let other tests continue to run @@ -125,40 +125,40 @@ jobs: with: name: k8s-logs-${{ matrix.integration_test }} path: ./build/kubectl_dump - kind_smoke_tests: - needs: build_docker_images - strategy: - matrix: - version: - - "3.11.7" - - "3.11.11" - - "4.0.0" - - "4.0.1" - include: - - version: 3.11.7 - serverImage: k8ssandra/cass-management-api:3.11.7-v0.1.24 # k8ssandra 1.1 - - version: 4.0.0 - serverImage: k8ssandra/cass-management-api:4.0.0-v0.1.28 # k8ssandra 1.3 - fail-fast: true - runs-on: ubuntu-latest - env: - CGO_ENABLED: 0 - M_INTEG_DIR: smoke_test_oss - M_SERVER_VERSION: ${{ matrix.version }} - M_SERVER_IMAGE: ${{ matrix.serverImage }} - steps: - - uses: actions/checkout@v2 - if: github.event_name == 'pull_request' - with: - ref: ${{ github.event.pull_request.head.sha }} - - uses: actions/checkout@v2 - if: github.event_name != 'pull_request' - - uses: 
./.github/actions/run-integ-test - with: - integration_test: smoke_test_oss - - name: Archive k8s logs - if: ${{ failure() }} - uses: actions/upload-artifact@v2 - with: - name: k8s-logs-smoke_test_oss-${{ matrix.version }}-${{ matrix.serverImage }} - path: ./build/kubectl_dump + # kind_smoke_tests: + # needs: build_docker_images + # strategy: + # matrix: + # version: + # - "3.11.7" + # - "3.11.11" + # - "4.0.0" + # - "4.0.1" + # include: + # - version: 3.11.7 + # serverImage: k8ssandra/cass-management-api:3.11.7-v0.1.24 # k8ssandra 1.1 + # - version: 4.0.0 + # serverImage: k8ssandra/cass-management-api:4.0.0-v0.1.28 # k8ssandra 1.3 + # fail-fast: true + # runs-on: ubuntu-latest + # env: + # CGO_ENABLED: 0 + # M_INTEG_DIR: smoke_test_oss + # M_SERVER_VERSION: ${{ matrix.version }} + # M_SERVER_IMAGE: ${{ matrix.serverImage }} + # steps: + # - uses: actions/checkout@v2 + # if: github.event_name == 'pull_request' + # with: + # ref: ${{ github.event.pull_request.head.sha }} + # - uses: actions/checkout@v2 + # if: github.event_name != 'pull_request' + # - uses: ./.github/actions/run-integ-test + # with: + # integration_test: smoke_test_oss + # - name: Archive k8s logs + # if: ${{ failure() }} + # uses: actions/upload-artifact@v2 + # with: + # name: k8s-logs-smoke_test_oss-${{ matrix.version }}-${{ matrix.serverImage }} + # path: ./build/kubectl_dump diff --git a/tests/decommission_dc/decommission_dc_suite_test.go b/tests/decommission_dc/decommission_dc_suite_test.go index 43caedd8..da2e3dbf 100644 --- a/tests/decommission_dc/decommission_dc_suite_test.go +++ b/tests/decommission_dc/decommission_dc_suite_test.go @@ -65,6 +65,8 @@ func findDatacenters(nodeName string) []string { dcs = append(dcs, strings.TrimSpace(dcParts[1])) } + fmt.Printf("Nodetool status output:\n\n%s\n", output) + return dcs } From bb656cbf18624cb27da9de7f016fc5a856e6a835 Mon Sep 17 00:00:00 2001 From: Michael Burman Date: Tue, 19 Apr 2022 12:53:50 +0300 Subject: [PATCH 2/4] Add requeue if we still 
have pods although decommission has succeeded --- pkg/reconciliation/reconcile_datacenter.go | 2 ++ pkg/reconciliation/reconcile_racks.go | 35 ++++++++++++---------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/pkg/reconciliation/reconcile_datacenter.go b/pkg/reconciliation/reconcile_datacenter.go index 2563d713..791c4994 100644 --- a/pkg/reconciliation/reconcile_datacenter.go +++ b/pkg/reconciliation/reconcile_datacenter.go @@ -70,6 +70,8 @@ func (rc *ReconciliationContext) ProcessDeletion() result.ReconcileResult { // Exiting to let other parts of the process take care of the decommission return result.Continue() } + // How could we have pods if we've decommissioned everything? + return result.RequeueSoon(5) } } diff --git a/pkg/reconciliation/reconcile_racks.go b/pkg/reconciliation/reconcile_racks.go index 5a9d5724..a0dfe3ed 100644 --- a/pkg/reconciliation/reconcile_racks.go +++ b/pkg/reconciliation/reconcile_racks.go @@ -2132,27 +2132,32 @@ func (rc *ReconciliationContext) cleanupAfterScaling() result.ReconcileResult { } if task != nil { - if task.Status.CompletionTime != nil { - // Job was completed, remove it from followed task - dc := rc.Datacenter - dcPatch := client.MergeFrom(dc.DeepCopy()) + return rc.activeTaskCompleted(task) + } - rc.Datacenter.Status.RemoveTrackedTask(task.ObjectMeta) + // Create the cleanup task + err = rc.createTask("cleanup") + if err != nil { + return result.Error(err) + } - if err := rc.Client.Status().Patch(rc.Ctx, dc, dcPatch); err != nil { - return result.Error(err) - } + return result.RequeueSoon(10) +} - return result.Continue() - } - } else { - // Create the cleanup task - err := rc.createTask("cleanup") - if err != nil { +func (rc *ReconciliationContext) activeTaskCompleted(task *taskapi.CassandraTask) result.ReconcileResult { + if task.Status.CompletionTime != nil { + // Job was completed, remove it from followed task + dc := rc.Datacenter + dcPatch := client.MergeFrom(dc.DeepCopy()) + + 
rc.Datacenter.Status.RemoveTrackedTask(task.ObjectMeta) + + if err := rc.Client.Status().Patch(rc.Ctx, dc, dcPatch); err != nil { return result.Error(err) } - } + return result.Continue() + } return result.RequeueSoon(10) } From e105c6e82263746d04dd8c7fbdf5066bd682478d Mon Sep 17 00:00:00 2001 From: Michael Burman Date: Tue, 19 Apr 2022 15:15:11 +0300 Subject: [PATCH 3/4] Revert logging changes --- .github/workflows/kindIntegTest.yml | 154 +++++++++--------- .../decommission_dc_suite_test.go | 2 - 2 files changed, 77 insertions(+), 79 deletions(-) diff --git a/.github/workflows/kindIntegTest.yml b/.github/workflows/kindIntegTest.yml index 4a6945a3..5202ac16 100644 --- a/.github/workflows/kindIntegTest.yml +++ b/.github/workflows/kindIntegTest.yml @@ -60,46 +60,46 @@ jobs: matrix: integration_test: # Single worker tests: - # - additional_serviceoptions - # - additional_volumes - # # - delete_node_terminated_container # This does not test any operator behavior - # - podspec_simple - # # - smoke_test_oss # See next job - # - smoke_test_dse - # # - terminate # Completes too fast, the test doesn't catch it - # - timeout_prestop_termination - # - upgrade_operator - # - webhook_validation - # # Three worker tests: - # - canary_upgrade - # - config_change_condition - # # - delete_node_lost_readiness # DSE specific behavior - # - host_network - # - internode-encryption-generated - # #- no_infinite_reconcile # smoke_test_* should take care of this - # - node_replace - # - nodeport_service - # - rolling_restart - # - stop_resume - # - superuser-secret-generated - # - superuser-secret-provided - # - test_bad_config_and_fix - # - test_mtls_mgmt_api - # # More than 3 workers tests: - # - add_racks - # - additional_seeds - # - cluster_wide_install - # - config_change - # - config_secret - # - multi_cluster_management - # - oss_test_all_the_things - # - scale_down - # # - scale_down_not_enough_space # Not enough stable test - # - scale_down_unbalanced_racks - # - scale_up - # - 
scale_up_stop_resume - # - seed_selection - # - config_fql + - additional_serviceoptions + - additional_volumes + # - delete_node_terminated_container # This does not test any operator behavior + - podspec_simple + # - smoke_test_oss # See next job + - smoke_test_dse + # - terminate # Completes too fast, the test doesn't catch it + - timeout_prestop_termination + - upgrade_operator + - webhook_validation + # Three worker tests: + - canary_upgrade + - config_change_condition + # - delete_node_lost_readiness # DSE specific behavior + - host_network + - internode-encryption-generated + #- no_infinite_reconcile # smoke_test_* should take care of this + - node_replace + - nodeport_service + - rolling_restart + - stop_resume + - superuser-secret-generated + - superuser-secret-provided + - test_bad_config_and_fix + - test_mtls_mgmt_api + # More than 3 workers tests: + - add_racks + - additional_seeds + - cluster_wide_install + - config_change + - config_secret + - multi_cluster_management + - oss_test_all_the_things + - scale_down + # - scale_down_not_enough_space # Not enough stable test + - scale_down_unbalanced_racks + - scale_up + - scale_up_stop_resume + - seed_selection + - config_fql - decommission_dc # - stop_resume_scale_up # Odd insufficient CPU issues in kind+GHA # let other tests continue to run @@ -125,40 +125,40 @@ jobs: with: name: k8s-logs-${{ matrix.integration_test }} path: ./build/kubectl_dump - # kind_smoke_tests: - # needs: build_docker_images - # strategy: - # matrix: - # version: - # - "3.11.7" - # - "3.11.11" - # - "4.0.0" - # - "4.0.1" - # include: - # - version: 3.11.7 - # serverImage: k8ssandra/cass-management-api:3.11.7-v0.1.24 # k8ssandra 1.1 - # - version: 4.0.0 - # serverImage: k8ssandra/cass-management-api:4.0.0-v0.1.28 # k8ssandra 1.3 - # fail-fast: true - # runs-on: ubuntu-latest - # env: - # CGO_ENABLED: 0 - # M_INTEG_DIR: smoke_test_oss - # M_SERVER_VERSION: ${{ matrix.version }} - # M_SERVER_IMAGE: ${{ matrix.serverImage }} - # steps: 
- # - uses: actions/checkout@v2 - # if: github.event_name == 'pull_request' - # with: - # ref: ${{ github.event.pull_request.head.sha }} - # - uses: actions/checkout@v2 - # if: github.event_name != 'pull_request' - # - uses: ./.github/actions/run-integ-test - # with: - # integration_test: smoke_test_oss - # - name: Archive k8s logs - # if: ${{ failure() }} - # uses: actions/upload-artifact@v2 - # with: - # name: k8s-logs-smoke_test_oss-${{ matrix.version }}-${{ matrix.serverImage }} - # path: ./build/kubectl_dump + kind_smoke_tests: + needs: build_docker_images + strategy: + matrix: + version: + - "3.11.7" + - "3.11.11" + - "4.0.0" + - "4.0.3" + include: + - version: 3.11.7 + serverImage: k8ssandra/cass-management-api:3.11.7-v0.1.24 # k8ssandra 1.1 + - version: 4.0.0 + serverImage: k8ssandra/cass-management-api:4.0.0-v0.1.28 # k8ssandra 1.3 + fail-fast: true + runs-on: ubuntu-latest + env: + CGO_ENABLED: 0 + M_INTEG_DIR: smoke_test_oss + M_SERVER_VERSION: ${{ matrix.version }} + M_SERVER_IMAGE: ${{ matrix.serverImage }} + steps: + - uses: actions/checkout@v2 + if: github.event_name == 'pull_request' + with: + ref: ${{ github.event.pull_request.head.sha }} + - uses: actions/checkout@v2 + if: github.event_name != 'pull_request' + - uses: ./.github/actions/run-integ-test + with: + integration_test: smoke_test_oss + - name: Archive k8s logs + if: ${{ failure() }} + uses: actions/upload-artifact@v2 + with: + name: k8s-logs-smoke_test_oss-${{ matrix.version }}-${{ matrix.serverImage }} + path: ./build/kubectl_dump diff --git a/tests/decommission_dc/decommission_dc_suite_test.go b/tests/decommission_dc/decommission_dc_suite_test.go index da2e3dbf..43caedd8 100644 --- a/tests/decommission_dc/decommission_dc_suite_test.go +++ b/tests/decommission_dc/decommission_dc_suite_test.go @@ -65,8 +65,6 @@ func findDatacenters(nodeName string) []string { dcs = append(dcs, strings.TrimSpace(dcParts[1])) } - fmt.Printf("Nodetool status output:\n\n%s\n", output) - return dcs } From 
ac692bc1b0f8809be716186e9b2e540d631a5a7f Mon Sep 17 00:00:00 2001 From: Michael Burman Date: Tue, 19 Apr 2022 15:31:16 +0300 Subject: [PATCH 4/4] CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6aacdba4..503c1995 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti * [CHANGE] [#264](https://github.com/k8ssandra/cass-operator/issues/264) Generate PodTemplateSpec in CassandraDatacenter with metadata * [CHANGE] [#183](https://github.com/k8ssandra/cass-operator/issues/183) Move from PodDisruptionBudget v1beta1 to v1 (min. Kubernetes version 1.21) * [ENHANCEMENT] [#292](https://github.com/k8ssandra/cass-operator/issues/292) Update to Go 1.17 with updates to dependencies: Kube 1.23.4 and controller-runtime 0.11.1 +* [BUGFIX] [#322](https://github.com/k8ssandra/cass-operator/pull/322) Add missing requeue if decommissioned pods haven't been removed yet ## v1.10.3