diff --git a/CHANGELOG.md b/CHANGELOG.md index 5591b79ad..c2f300e6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - (Bugfix) Fix ArangoBackup Create Backoff & ArangoBackupPolicy propagation - (Maintenance) Add IndexMethod Documentation - (Bugfix) Fix VersionCheck args propagation +- (Feature) EnforcedResignLeadership action ## [1.2.33](https://github.com/arangodb/kube-arangodb/tree/1.2.33) (2023-09-27) - (Maintenance) Bump golang.org/x/net to v0.13.0 diff --git a/README.md b/README.md index 9e9430565..17920a9a1 100644 --- a/README.md +++ b/README.md @@ -58,32 +58,33 @@ covers individual newer features separately. #### Operator Features -| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks | -|:-------------------------------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:--------------------------------------------------------------------------| -| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are not specified | -| [Rebalancer V2](docs/design/features/rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A | -| [Secured containers](docs/design/features/secured_containers.md) | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode | -| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A | -| [Operator Ephemeral Volumes](docs/design/features/ephemeral_volumes.md) | 1.2.31 | 1.2.2 
| >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A | -| [Force Rebuild Out Synced Shards](docs/design/features/rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. | -| [Spec Default Restore](docs/design/features/deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec | -| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A | -| [Failover Leader service](docs/design/features/failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A | -| Graceful Restart | 1.2.5 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | ---deployment.feature.graceful-shutdown | N/A | -| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A | -| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A | -| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A | -| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A | -| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A | -| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | 
--deployment.feature.jwt-rotation | N/A | -| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled | -| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A | -| Disabling of liveness probes | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | -| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | -| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required | -| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | -| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | -| Volume Resizing | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks | +|:-------------------------------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:-----------------------------------------------------------------------------------| +| Enforced ResignLeadership | 1.2.34 | 1.2.34 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.enforced-resign-leadership | Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer | +| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are not specified | +| [Rebalancer 
V2](docs/design/features/rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A | +| [Secured containers](docs/design/features/secured_containers.md) | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode | +| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A | +| [Operator Ephemeral Volumes](docs/design/features/ephemeral_volumes.md) | 1.2.31 | 1.2.2 | >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A | +| [Force Rebuild Out Synced Shards](docs/design/features/rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. 
| +| [Spec Default Restore](docs/design/features/deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec | +| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A | +| [Failover Leader service](docs/design/features/failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A | +| Graceful Restart | 1.2.5 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | ---deployment.feature.graceful-shutdown | N/A | +| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A | +| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A | +| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A | +| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A | +| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A | +| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A | +| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled | +| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A | +| Disabling of liveness probes | 0.3.11 | 0.3.10 | 
>= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required | +| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Volume Resizing | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | diff --git a/docs/generated/actions.md b/docs/generated/actions.md index acf96d7a6..a32c19615 100644 --- a/docs/generated/actions.md +++ b/docs/generated/actions.md @@ -29,6 +29,7 @@ | EncryptionKeyRefresh | no | 10m0s | no | Enterprise Only | Refresh the encryption keys on member | | EncryptionKeyRemove | no | 10m0s | no | Enterprise Only | Remove the encryption key to the pool | | EncryptionKeyStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of encryption propagation | +| EnforceResignLeadership | no | 45m0s | yes | Community & Enterprise | Run the ResignLeadership job on DBServer and checks data compatibility after | | Idle | no | 10m0s | no | Community & Enterprise | Define idle operation in case if preconditions are not meet | | JWTAdd | no | 10m0s | no | Enterprise Only | Adds new JWT to the pool | | JWTClean | no | 10m0s | no | Enterprise Only | Remove JWT key from the pool | @@ -122,6 +123,7 @@ spec: EncryptionKeyRefresh: 10m0s EncryptionKeyRemove: 10m0s EncryptionKeyStatusUpdate: 10m0s + EnforceResignLeadership: 45m0s Idle: 10m0s JWTAdd: 10m0s JWTClean: 10m0s diff --git a/internal/actions.yaml b/internal/actions.yaml index 095693e2e..3209b4c38 100644 --- a/internal/actions.yaml +++ b/internal/actions.yaml @@ -29,6 +29,10 @@ actions: description: Run the ResignLeadership job on DBServer timeout: 30m optional: 
true + EnforceResignLeadership: + description: Run the ResignLeadership job on DBServer and checks data compatibility after + timeout: 45m + optional: true KillMemberPod: description: Execute Delete on Pod (put pod in Terminating state) scopes: diff --git a/internal/features.yaml b/internal/features.yaml index cbca81a4c..0637631e2 100644 --- a/internal/features.yaml +++ b/internal/features.yaml @@ -220,3 +220,10 @@ features: releases: - operatorVersion: 1.2.33 state: Production + - name: Enforced ResignLeadership + enabled: true + remarks: Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer + flag: --deployment.feature.enforced-resign-leadership + releases: + - operatorVersion: 1.2.34 + state: Production diff --git a/pkg/apis/deployment/v1/actions.generated.go b/pkg/apis/deployment/v1/actions.generated.go index 3a44e4386..5a34c9b90 100644 --- a/pkg/apis/deployment/v1/actions.generated.go +++ b/pkg/apis/deployment/v1/actions.generated.go @@ -98,6 +98,9 @@ const ( // ActionEncryptionKeyStatusUpdateDefaultTimeout define default timeout for action ActionEncryptionKeyStatusUpdate ActionEncryptionKeyStatusUpdateDefaultTimeout time.Duration = ActionsDefaultTimeout + // ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership + ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s + // ActionIdleDefaultTimeout define default timeout for action ActionIdle ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout @@ -353,6 +356,9 @@ const ( // ActionTypeEncryptionKeyStatusUpdate in scopes Normal. Update status of encryption propagation ActionTypeEncryptionKeyStatusUpdate ActionType = "EncryptionKeyStatusUpdate" + // ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after + ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership" + // ActionTypeIdle in scopes Normal. 
Define idle operation in case if preconditions are not meet ActionTypeIdle ActionType = "Idle" @@ -587,6 +593,8 @@ func (a ActionType) DefaultTimeout() time.Duration { return ActionEncryptionKeyRemoveDefaultTimeout case ActionTypeEncryptionKeyStatusUpdate: return ActionEncryptionKeyStatusUpdateDefaultTimeout + case ActionTypeEnforceResignLeadership: + return ActionEnforceResignLeadershipDefaultTimeout case ActionTypeIdle: return ActionIdleDefaultTimeout case ActionTypeJWTAdd: @@ -761,6 +769,8 @@ func (a ActionType) Priority() ActionPriority { return ActionPriorityNormal case ActionTypeEncryptionKeyStatusUpdate: return ActionPriorityNormal + case ActionTypeEnforceResignLeadership: + return ActionPriorityNormal case ActionTypeIdle: return ActionPriorityNormal case ActionTypeJWTAdd: @@ -947,6 +957,8 @@ func (a ActionType) Optional() bool { return false case ActionTypeEncryptionKeyStatusUpdate: return false + case ActionTypeEnforceResignLeadership: + return true case ActionTypeIdle: return false case ActionTypeJWTAdd: diff --git a/pkg/apis/deployment/v2alpha1/actions.generated.go b/pkg/apis/deployment/v2alpha1/actions.generated.go index 0a6d8f402..61bc46bd6 100644 --- a/pkg/apis/deployment/v2alpha1/actions.generated.go +++ b/pkg/apis/deployment/v2alpha1/actions.generated.go @@ -98,6 +98,9 @@ const ( // ActionEncryptionKeyStatusUpdateDefaultTimeout define default timeout for action ActionEncryptionKeyStatusUpdate ActionEncryptionKeyStatusUpdateDefaultTimeout time.Duration = ActionsDefaultTimeout + // ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership + ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s + // ActionIdleDefaultTimeout define default timeout for action ActionIdle ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout @@ -353,6 +356,9 @@ const ( // ActionTypeEncryptionKeyStatusUpdate in scopes Normal. 
Update status of encryption propagation ActionTypeEncryptionKeyStatusUpdate ActionType = "EncryptionKeyStatusUpdate" + // ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after + ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership" + // ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet ActionTypeIdle ActionType = "Idle" @@ -587,6 +593,8 @@ func (a ActionType) DefaultTimeout() time.Duration { return ActionEncryptionKeyRemoveDefaultTimeout case ActionTypeEncryptionKeyStatusUpdate: return ActionEncryptionKeyStatusUpdateDefaultTimeout + case ActionTypeEnforceResignLeadership: + return ActionEnforceResignLeadershipDefaultTimeout case ActionTypeIdle: return ActionIdleDefaultTimeout case ActionTypeJWTAdd: @@ -761,6 +769,8 @@ func (a ActionType) Priority() ActionPriority { return ActionPriorityNormal case ActionTypeEncryptionKeyStatusUpdate: return ActionPriorityNormal + case ActionTypeEnforceResignLeadership: + return ActionPriorityNormal case ActionTypeIdle: return ActionPriorityNormal case ActionTypeJWTAdd: @@ -947,6 +957,8 @@ func (a ActionType) Optional() bool { return false case ActionTypeEncryptionKeyStatusUpdate: return false + case ActionTypeEnforceResignLeadership: + return true case ActionTypeIdle: return false case ActionTypeJWTAdd: diff --git a/pkg/deployment/features/resign_leadership.go b/pkg/deployment/features/resign_leadership.go new file mode 100644 index 000000000..5dae35c99 --- /dev/null +++ b/pkg/deployment/features/resign_leadership.go @@ -0,0 +1,38 @@ +// +// DISCLAIMER +// +// Copyright 2023 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+
+package features
+
+func init() {
+	registerFeature(enforcedResignLeadership)
+}
+
+var enforcedResignLeadership = &feature{
+	name:               "enforced-resign-leadership",
+	description:        "Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer",
+	version:            "3.7.0", // NOTE(review): the README feature table lists ArangoDB >= 3.8.0 for this feature - confirm the intended minimum version
+	enterpriseRequired: false,
+	enabledByDefault:   true,
+}
+
+// EnforcedResignLeadership returns the registered "enforced-resign-leadership" feature (enabled by default, no Enterprise edition required).
+func EnforcedResignLeadership() Feature {
+	return enforcedResignLeadership
+}
diff --git a/pkg/deployment/reconcile/action.register.generated.go b/pkg/deployment/reconcile/action.register.generated.go
index a2ea2c0ed..be9aa9bf0 100644
--- a/pkg/deployment/reconcile/action.register.generated.go
+++ b/pkg/deployment/reconcile/action.register.generated.go
@@ -93,6 +93,9 @@ var (
 	_ Action        = &actionEncryptionKeyStatusUpdate{}
 	_ actionFactory = newEncryptionKeyStatusUpdateAction
 
+	_ Action        = &actionEnforceResignLeadership{}
+	_ actionFactory = newEnforceResignLeadershipAction
+
 	_ Action        = &actionIdle{}
 	_ actionFactory = newIdleAction
 
@@ -599,6 +602,20 @@ func init() {
 		registerAction(action, function)
 	}
 
+	// EnforceResignLeadership
+	{
+		// Get Action type
+		action := api.ActionTypeEnforceResignLeadership
+
+		// Get Action defition
+		function := newEnforceResignLeadershipAction
+
+		// Wrap action main function
+
+		// Register action
+		registerAction(action, function)
+	}
+
 	// Idle
 	{
 		// Get Action type
diff --git a/pkg/deployment/reconcile/action.register.generated_test.go b/pkg/deployment/reconcile/action.register.generated_test.go
index 2462502f7..8330b07fe 100644
--- a/pkg/deployment/reconcile/action.register.generated_test.go
+++ b/pkg/deployment/reconcile/action.register.generated_test.go
@@ -276,6 +276,16 @@ func Test_Actions(t *testing.T) {
 		})
 	})
 
+	t.Run("EnforceResignLeadership", func(t *testing.T) {
+		ActionsExistence(t, api.ActionTypeEnforceResignLeadership)
+		t.Run("Internal", func(t *testing.T) {
+			require.False(t, api.ActionTypeEnforceResignLeadership.Internal())
+		})
+		t.Run("Optional", func(t *testing.T) {
+			require.True(t, api.ActionTypeEnforceResignLeadership.Optional())
+		})
+	})
+
 	t.Run("Idle", func(t *testing.T) {
 		ActionsExistence(t, api.ActionTypeIdle)
 		t.Run("Internal", func(t *testing.T) {
diff --git a/pkg/deployment/reconcile/action_enforce_resign_leadership.go b/pkg/deployment/reconcile/action_enforce_resign_leadership.go
new file mode 100644
index 000000000..5887c0097
--- /dev/null
+++ b/pkg/deployment/reconcile/action_enforce_resign_leadership.go
@@ -0,0 +1,169 @@
+//
+// DISCLAIMER
+//
+// Copyright 2023 ArangoDB GmbH, Cologne, Germany
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+
+package reconcile
+
+import (
+	"context"
+
+	"github.com/arangodb/go-driver"
+
+	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/agency/state"
+	"github.com/arangodb/kube-arangodb/pkg/util/errors"
+	"github.com/arangodb/kube-arangodb/pkg/util/globals"
+)
+
+const (
+	resignLeadershipJobID api.PlanLocalKey = "resignLeadershipJobID" // action-local key under which CheckProgress stores the ID of the resign job it started
+)
+
+// newEnforceResignLeadershipAction creates a new Action that implements the given
+// planned EnforceResignLeadership action.
+func newEnforceResignLeadershipAction(action api.Action, actionCtx ActionContext) Action {
+	a := &actionEnforceResignLeadership{}
+
+	a.actionImpl = newActionImplDefRef(action, actionCtx)
+
+	return a
+}
+
+// actionEnforceResignLeadership implements the EnforceResignLeadership action: it runs the ResignLeadership job and repeats it while the DBServer is still a planned shard leader.
+type actionEnforceResignLeadership struct {
+	actionImpl
+}
+
+// Start reports true (nothing to do) outside cluster mode, for non-DBServer groups and while agency maintenance is enabled; otherwise it returns false and leaves the job handling to CheckProgress.
+func (a *actionEnforceResignLeadership) Start(ctx context.Context) (bool, error) {
+	group := a.action.Group
+
+	if a.actionCtx.GetSpec().Mode.Get() != api.DeploymentModeCluster {
+		a.log.Debug("Resign only allowed in cluster mode")
+		return true, nil
+	}
+
+	switch group {
+	case api.ServerGroupDBServers:
+		if agencyState, agencyOK := a.actionCtx.GetAgencyCache(); !agencyOK {
+			a.log.Warn("AgencyCache is not ready")
+			return false, nil
+		} else if agencyState.Supervision.Maintenance.Exists() {
+			// We are done, action cannot be handled on maintenance mode
+			a.log.Warn("Maintenance is enabled, skipping action")
+			return true, nil
+		}
+
+		return false, nil
+	default:
+		return true, nil
+	}
+}
+
+// CheckProgress starts a ResignLeadership job when none is tracked in the action locals, otherwise polls the tracked job; it reports true only once the member is no longer a planned leader (or the action no longer applies) and returns false to be called again in every other case.
+func (a *actionEnforceResignLeadership) CheckProgress(ctx context.Context) (bool, bool, error) {
+	group := a.action.Group
+	m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID)
+	if !ok {
+		a.log.Error("No such member")
+		return true, false, nil
+	}
+
+	if group != api.ServerGroupDBServers {
+		// Only DBServers can use ResignLeadership job
+		return true, false, nil
+	}
+
+	agencyState, agencyOK := a.actionCtx.GetAgencyCache()
+	if !agencyOK {
+		a.log.Error("Unable to get maintenance mode")
+		return false, false, nil
+	} else if agencyState.Supervision.Maintenance.Exists() {
+		a.log.Warn("Maintenance is enabled, skipping action")
+		// We are done, action cannot be handled on maintenance mode
+		m.CleanoutJobID = ""
+		if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
+			return false, false, errors.WithStack(err)
+		}
+		return true, false, nil
+	} else if isServerRebooted(a.log, a.action, agencyState, driver.ServerID(m.ID)) {
+		return true, false, nil
+	}
+
+	// A non-empty job ID in the action locals means a resign job was already started: poll that job (not m.CleanoutJobID, which this action never sets)
+	if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" {
+		_, jobStatus := agencyState.Target.GetJob(state.JobID(j))
+		switch jobStatus {
+		case state.JobPhaseFailed: // NOTE(review): the stored job ID is kept here, so a failed job is re-polled until the action times out - confirm whether it should be cleared to retry
+			m.CleanoutJobID = ""
+			if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
+				return false, false, errors.WithStack(err)
+			}
+			a.log.Error("Resign server job failed")
+			return false, false, nil
+		case state.JobPhaseFinished:
+			m.CleanoutJobID = ""
+			if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
+				return false, false, errors.WithStack(err)
+			}
+		default:
+			return false, false, nil
+		}
+
+		// Job finished: drop the stored job ID so a new job is started on the next call if needed
+		a.actionCtx.Add(resignLeadershipJobID, "", true)
+
+		// Job is Finished, check if we are not a leader anymore
+		if agencyState.PlanLeaderServers().Contains(state.Server(m.ID)) {
+			// We are still a leader!
+			a.log.Warn("DBServers is still a leader for shards")
+			return false, false, nil
+		}
+		return true, false, nil
+	}
+
+	// Job not in progress, start it
+	client, err := a.actionCtx.GetMembersState().State().GetDatabaseClient()
+	if err != nil {
+		a.log.Err(err).Error("Unable to get client")
+		return false, false, errors.WithStack(err)
+	}
+
+	ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
+	defer cancel()
+	cluster, err := client.Cluster(ctxChild)
+	if err != nil {
+		a.log.Err(err).Error("Unable to get cluster client")
+		return false, false, errors.WithStack(err)
+	}
+
+	var jobID string
+	ctxChild, cancel = globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
+	defer cancel()
+	jobCtx := driver.WithJobIDResponse(ctxChild, &jobID)
+	a.log.Debug("Temporary shutdown, resign leadership")
+	if err := cluster.ResignServer(jobCtx, m.ID); err != nil {
+		a.log.Err(err).Debug("Failed to resign server")
+		return false, false, errors.WithStack(err)
+	}
+
+	a.actionCtx.Add(resignLeadershipJobID, jobID, true)
+
+	return false, false, nil
+}
diff --git a/pkg/deployment/reconcile/action_resign_leadership.go b/pkg/deployment/reconcile/action_resign_leadership.go
index f857885ac..0fa245300 100644
--- a/pkg/deployment/reconcile/action_resign_leadership.go
+++ b/pkg/deployment/reconcile/action_resign_leadership.go
@@ -22,7 +22,6 @@ package reconcile
 
 import (
 	"context"
-	"strconv"
 
 	"github.com/arangodb/go-driver"
 
@@ -132,7 +131,7 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
 			return false, false, errors.WithStack(err)
 		}
 		return true, false, nil
-	} else if a.isServerRebooted(agencyState, driver.ServerID(m.ID)) {
+	} else if isServerRebooted(a.log, a.action, agencyState, driver.ServerID(m.ID)) {
 		return true, false, nil
 	}
 
@@ -157,31 +156,3 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
 	}
 	return false, false, nil
 }
-
-// isServerRebooted returns true when a given server ID was rebooted during resignation of leadership.
-func (a *actionResignLeadership) isServerRebooted(agencyState state.State, serverID driver.ServerID) bool {
-	rebootID, ok := agencyState.GetRebootID(serverID)
-	if !ok {
-		return false
-	}
-
-	v, ok := a.action.Params[actionResignLeadershipRebootID.String()]
-	if !ok {
-		a.log.Warn("missing reboot ID in action's locals")
-		return false
-	}
-
-	r, err := strconv.Atoi(v)
-	if err != nil {
-		a.log.Err(err).Warn("reboot ID '%s' supposed to be a number", v)
-		return false
-	}
-
-	if rebootID <= r {
-		// Server has not been restarted.
-		return false
-	}
-
-	a.log.Warn("resign leadership aborted because rebootID has changed from %d to %d", r, rebootID)
-	return true
-}
diff --git a/pkg/deployment/reconcile/action_resign_leadership_utils.go b/pkg/deployment/reconcile/action_resign_leadership_utils.go
new file mode 100644
index 000000000..824bf0d3d
--- /dev/null
+++ b/pkg/deployment/reconcile/action_resign_leadership_utils.go
@@ -0,0 +1,66 @@
+//
+// DISCLAIMER
+//
+// Copyright 2023 ArangoDB GmbH, Cologne, Germany
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+
+package reconcile
+
+import (
+	"strconv"
+
+	"github.com/arangodb/go-driver"
+
+	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/agency/state"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/features"
+	"github.com/arangodb/kube-arangodb/pkg/logging"
+)
+
+func getResignLeadershipActionType() api.ActionType {
+	if features.EnforcedResignLeadership().Enabled() { // feature flag selects the enforcing variant of the action
+		return api.ActionTypeEnforceResignLeadership
+	}
+	return api.ActionTypeResignLeadership // flag disabled: plain ResignLeadership action
+}
+
+// isServerRebooted reports whether the agency's reboot ID for serverID is greater than the reboot ID stored in the action params, i.e. the server restarted during resignation of leadership; it returns false when either ID is missing or the stored one is not a number.
+func isServerRebooted(log logging.Logger, action api.Action, agencyState state.State, serverID driver.ServerID) bool {
+	rebootID, ok := agencyState.GetRebootID(serverID)
+	if !ok {
+		return false
+	}
+
+	v, ok := action.Params[actionResignLeadershipRebootID.String()]
+	if !ok {
+		return false // no reboot ID was registered on this action: nothing to compare against
+	}
+
+	r, err := strconv.Atoi(v)
+	if err != nil {
+		log.Err(err).Warn("reboot ID '%s' supposed to be a number", v)
+		return false
+	}
+
+	if rebootID <= r {
+		// Server has not been restarted.
+		return false
+	}
+
+	log.Warn("resign leadership aborted because rebootID has changed from %d to %d", r, rebootID)
+	return true
+}
diff --git a/pkg/deployment/reconcile/helper_wrap.go b/pkg/deployment/reconcile/helper_wrap.go
index 9336edc35..e50f6de9d 100644
--- a/pkg/deployment/reconcile/helper_wrap.go
+++ b/pkg/deployment/reconcile/helper_wrap.go
@@ -64,7 +64,7 @@ func withResignLeadership(group api.ServerGroup, member api.MemberStatus, reason
 		return plan
 	}
 
-	action := actions.NewAction(api.ActionTypeResignLeadership, group, member, reason)
+	action := actions.NewAction(getResignLeadershipActionType(), group, member, reason)
 	if rebootID != nil {
 		action = actionResignLeadershipRebootID.Register(action, "%d", *rebootID)
 	}
diff --git a/pkg/deployment/reconcile/plan_builder_storage.go b/pkg/deployment/reconcile/plan_builder_storage.go
index 27e47bb75..914ed1f61 100644
--- a/pkg/deployment/reconcile/plan_builder_storage.go
+++ b/pkg/deployment/reconcile/plan_builder_storage.go
@@ -175,7 +175,7 @@ func (r *Reconciler) pvcResizePlan(group api.ServerGroup, member api.MemberStatu
 		}
 	case api.PVCResizeModeRotate:
 		return withWaitForMember(api.Plan{
-			actions.NewAction(api.ActionTypeResignLeadership, group, member),
+			actions.NewAction(getResignLeadershipActionType(), group, member),
 			actions.NewAction(api.ActionTypeKillMemberPod, group, member),
 			actions.NewAction(api.ActionTypeRotateStartMember, group, member),
 			actions.NewAction(api.ActionTypePVCResize, group, member),