Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] EnforcedResignLeadership action #1439

Merged
merged 2 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- (Bugfix) Fix ArangoBackup Create Backoff & ArangoBackupPolicy propagation
- (Maintenance) Add IndexMethod Documentation
- (Bugfix) Fix VersionCheck args propagation
- (Feature) EnforcedResignLeadership action

## [1.2.33](https://github.com/arangodb/kube-arangodb/tree/1.2.33) (2023-09-27)
- (Maintenance) Bump golang.org/x/net to v0.13.0
Expand Down
53 changes: 27 additions & 26 deletions README.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions docs/generated/actions.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
| EncryptionKeyRefresh | no | 10m0s | no | Enterprise Only | Refresh the encryption keys on member |
| EncryptionKeyRemove | no | 10m0s | no | Enterprise Only | Remove the encryption key to the pool |
| EncryptionKeyStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of encryption propagation |
| EnforceResignLeadership | no | 45m0s | yes | Community & Enterprise | Run the ResignLeadership job on DBServer and checks data compatibility after |
| Idle | no | 10m0s | no | Community & Enterprise | Define idle operation in case preconditions are not met |
| JWTAdd | no | 10m0s | no | Enterprise Only | Adds new JWT to the pool |
| JWTClean | no | 10m0s | no | Enterprise Only | Remove JWT key from the pool |
Expand Down Expand Up @@ -122,6 +123,7 @@ spec:
EncryptionKeyRefresh: 10m0s
EncryptionKeyRemove: 10m0s
EncryptionKeyStatusUpdate: 10m0s
EnforceResignLeadership: 45m0s
Idle: 10m0s
JWTAdd: 10m0s
JWTClean: 10m0s
Expand Down
4 changes: 4 additions & 0 deletions internal/actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ actions:
description: Run the ResignLeadership job on DBServer
timeout: 30m
optional: true
EnforceResignLeadership:
description: Run the ResignLeadership job on DBServer and checks data compatibility after
timeout: 45m
optional: true
KillMemberPod:
description: Execute Delete on Pod (put pod in Terminating state)
scopes:
Expand Down
7 changes: 7 additions & 0 deletions internal/features.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,10 @@ features:
releases:
- operatorVersion: 1.2.33
state: Production
- name: Enforced ResignLeadership
enabled: true
remarks: Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer
flag: --deployment.feature.enforced-resign-leadership
releases:
- operatorVersion: 1.2.34
state: Production
12 changes: 12 additions & 0 deletions pkg/apis/deployment/v1/actions.generated.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ const (
// ActionEncryptionKeyStatusUpdateDefaultTimeout define default timeout for action ActionEncryptionKeyStatusUpdate
ActionEncryptionKeyStatusUpdateDefaultTimeout time.Duration = ActionsDefaultTimeout

// ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership
ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s

// ActionIdleDefaultTimeout define default timeout for action ActionIdle
ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout

Expand Down Expand Up @@ -353,6 +356,9 @@ const (
// ActionTypeEncryptionKeyStatusUpdate in scopes Normal. Update status of encryption propagation
ActionTypeEncryptionKeyStatusUpdate ActionType = "EncryptionKeyStatusUpdate"

// ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after
ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership"

// ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet
ActionTypeIdle ActionType = "Idle"

Expand Down Expand Up @@ -587,6 +593,8 @@ func (a ActionType) DefaultTimeout() time.Duration {
return ActionEncryptionKeyRemoveDefaultTimeout
case ActionTypeEncryptionKeyStatusUpdate:
return ActionEncryptionKeyStatusUpdateDefaultTimeout
case ActionTypeEnforceResignLeadership:
return ActionEnforceResignLeadershipDefaultTimeout
case ActionTypeIdle:
return ActionIdleDefaultTimeout
case ActionTypeJWTAdd:
Expand Down Expand Up @@ -761,6 +769,8 @@ func (a ActionType) Priority() ActionPriority {
return ActionPriorityNormal
case ActionTypeEncryptionKeyStatusUpdate:
return ActionPriorityNormal
case ActionTypeEnforceResignLeadership:
return ActionPriorityNormal
case ActionTypeIdle:
return ActionPriorityNormal
case ActionTypeJWTAdd:
Expand Down Expand Up @@ -947,6 +957,8 @@ func (a ActionType) Optional() bool {
return false
case ActionTypeEncryptionKeyStatusUpdate:
return false
case ActionTypeEnforceResignLeadership:
return true
case ActionTypeIdle:
return false
case ActionTypeJWTAdd:
Expand Down
12 changes: 12 additions & 0 deletions pkg/apis/deployment/v2alpha1/actions.generated.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ const (
// ActionEncryptionKeyStatusUpdateDefaultTimeout define default timeout for action ActionEncryptionKeyStatusUpdate
ActionEncryptionKeyStatusUpdateDefaultTimeout time.Duration = ActionsDefaultTimeout

// ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership
ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s

// ActionIdleDefaultTimeout define default timeout for action ActionIdle
ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout

Expand Down Expand Up @@ -353,6 +356,9 @@ const (
// ActionTypeEncryptionKeyStatusUpdate in scopes Normal. Update status of encryption propagation
ActionTypeEncryptionKeyStatusUpdate ActionType = "EncryptionKeyStatusUpdate"

// ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after
ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership"

// ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet
ActionTypeIdle ActionType = "Idle"

Expand Down Expand Up @@ -587,6 +593,8 @@ func (a ActionType) DefaultTimeout() time.Duration {
return ActionEncryptionKeyRemoveDefaultTimeout
case ActionTypeEncryptionKeyStatusUpdate:
return ActionEncryptionKeyStatusUpdateDefaultTimeout
case ActionTypeEnforceResignLeadership:
return ActionEnforceResignLeadershipDefaultTimeout
case ActionTypeIdle:
return ActionIdleDefaultTimeout
case ActionTypeJWTAdd:
Expand Down Expand Up @@ -761,6 +769,8 @@ func (a ActionType) Priority() ActionPriority {
return ActionPriorityNormal
case ActionTypeEncryptionKeyStatusUpdate:
return ActionPriorityNormal
case ActionTypeEnforceResignLeadership:
return ActionPriorityNormal
case ActionTypeIdle:
return ActionPriorityNormal
case ActionTypeJWTAdd:
Expand Down Expand Up @@ -947,6 +957,8 @@ func (a ActionType) Optional() bool {
return false
case ActionTypeEncryptionKeyStatusUpdate:
return false
case ActionTypeEnforceResignLeadership:
return true
case ActionTypeIdle:
return false
case ActionTypeJWTAdd:
Expand Down
38 changes: 38 additions & 0 deletions pkg/deployment/features/resign_leadership.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//
// DISCLAIMER
//
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//

package features

// init registers the enforced-resign-leadership feature by passing its
// descriptor to registerFeature when the package is initialised.
func init() {
	registerFeature(enforcedResignLeadership)
}

var enforcedResignLeadership = &feature{
name: "enforced-resign-leadership",
description: "Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer",
jwierzbo marked this conversation as resolved.
Show resolved Hide resolved
version: "3.7.0",
enterpriseRequired: false,
enabledByDefault: true,
}

// EnforcedResignLeadership returns the feature descriptor for enforced
// ResignLeadership (the package-level enforcedResignLeadership value, named
// "enforced-resign-leadership").
func EnforcedResignLeadership() Feature {
	return enforcedResignLeadership
}
17 changes: 17 additions & 0 deletions pkg/deployment/reconcile/action.register.generated.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ var (
_ Action = &actionEncryptionKeyStatusUpdate{}
_ actionFactory = newEncryptionKeyStatusUpdateAction

_ Action = &actionEnforceResignLeadership{}
_ actionFactory = newEnforceResignLeadershipAction

_ Action = &actionIdle{}
_ actionFactory = newIdleAction

Expand Down Expand Up @@ -599,6 +602,20 @@ func init() {
registerAction(action, function)
}

// EnforceResignLeadership
{
// Get Action type
action := api.ActionTypeEnforceResignLeadership

// Get Action definition
function := newEnforceResignLeadershipAction

// Wrap action main function

// Register action
registerAction(action, function)
}

// Idle
{
// Get Action type
Expand Down
10 changes: 10 additions & 0 deletions pkg/deployment/reconcile/action.register.generated_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,16 @@ func Test_Actions(t *testing.T) {
})
})

t.Run("EnforceResignLeadership", func(t *testing.T) {
ActionsExistence(t, api.ActionTypeEnforceResignLeadership)
t.Run("Internal", func(t *testing.T) {
require.False(t, api.ActionTypeEnforceResignLeadership.Internal())
})
t.Run("Optional", func(t *testing.T) {
require.True(t, api.ActionTypeEnforceResignLeadership.Optional())
})
})

t.Run("Idle", func(t *testing.T) {
ActionsExistence(t, api.ActionTypeIdle)
t.Run("Internal", func(t *testing.T) {
Expand Down
169 changes: 169 additions & 0 deletions pkg/deployment/reconcile/action_enforce_resign_leadership.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
//
// DISCLAIMER
//
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//

package reconcile

import (
"context"

"github.com/arangodb/go-driver"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency/state"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
)

const (
	// resignLeadershipJobID is the plan-local key under which CheckProgress
	// stores the ID of the ResignLeadership job it started. CheckProgress
	// treats a missing or empty value as "no job in progress".
	resignLeadershipJobID api.PlanLocalKey = "resignLeadershipJobID"
)

// newEnforceResignLeadershipAction builds the Action that executes the given
// planned EnforceResignLeadership action, wiring in the shared action
// implementation created by newActionImplDefRef.
func newEnforceResignLeadershipAction(action api.Action, actionCtx ActionContext) Action {
	return &actionEnforceResignLeadership{
		actionImpl: newActionImplDefRef(action, actionCtx),
	}
}

// actionEnforceResignLeadership implements the EnforceResignLeadership action.
// It embeds actionImpl, which the constructor fills via newActionImplDefRef.
// NOTE(review): the action, actionCtx and log members used by the methods are
// presumably promoted from actionImpl - confirm against its definition.
type actionEnforceResignLeadership struct {
	actionImpl
}

// Start begins the ResignLeadership process for a DBServer member.
//
// It returns true (nothing further to do) when the deployment is not in
// cluster mode, when the member is not in the DBServers group, or when the
// agency reports maintenance mode. It returns false when the agency cache is
// not ready yet or when the job still has to be handled by CheckProgress.
// The returned error is always nil.
func (a *actionEnforceResignLeadership) Start(ctx context.Context) (bool, error) {
	if a.actionCtx.GetSpec().Mode.Get() != api.DeploymentModeCluster {
		a.log.Debug("Resign only allowed in cluster mode")
		return true, nil
	}

	// Only DBServers are handled by this action.
	if a.action.Group != api.ServerGroupDBServers {
		return true, nil
	}

	cache, ready := a.actionCtx.GetAgencyCache()
	if !ready {
		a.log.Warn("AgencyCache is not ready")
		return false, nil
	}

	if cache.Supervision.Maintenance.Exists() {
		// We are done, action cannot be handled on maintenance mode
		a.log.Warn("Maintenance is enabled, skipping action")
		return true, nil
	}

	return false, nil
}

// CheckProgress checks if the Job is completed, if not then start it. Repeat in case of error or if still a leader
func (a *actionEnforceResignLeadership) CheckProgress(ctx context.Context) (bool, bool, error) {
group := a.action.Group
m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID)
if !ok {
a.log.Error("No such member")
return true, false, nil
}

if group != api.ServerGroupDBServers {
// Only DBServers can use ResignLeadership job
return true, false, nil
}

agencyState, agencyOK := a.actionCtx.GetAgencyCache()
if !agencyOK {
a.log.Error("Unable to get maintenance mode")
return false, false, nil
} else if agencyState.Supervision.Maintenance.Exists() {
a.log.Warn("Maintenance is enabled, skipping action")
// We are done, action cannot be handled on maintenance mode
m.CleanoutJobID = ""
if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
return false, false, errors.WithStack(err)
}
return true, false, nil
} else if isServerRebooted(a.log, a.action, agencyState, driver.ServerID(m.ID)) {
return true, false, nil
}

// Lets start resign job if required
if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" {
_, jobStatus := agencyState.Target.GetJob(state.JobID(m.CleanoutJobID))
switch jobStatus {
case state.JobPhaseFailed:
m.CleanoutJobID = ""
if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
return false, false, errors.WithStack(err)
}
a.log.Error("Resign server job failed")
return false, false, nil
case state.JobPhaseFinished:
m.CleanoutJobID = ""
if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
return false, false, errors.WithStack(err)
}
default:
return false, false, nil
}

// Remove key
a.actionCtx.Add(resignLeadershipJobID, "", true)

// Job is Finished, check if we are not a leader anymore
if agencyState.PlanLeaderServers().Contains(state.Server(m.ID)) {
// We are still a leader!
jwierzbo marked this conversation as resolved.
Show resolved Hide resolved
a.log.Warn("DBServers is still a leader for shards")
return false, false, nil
}
return true, false, nil
}

// Job not in progress, start it
client, err := a.actionCtx.GetMembersState().State().GetDatabaseClient()
if err != nil {
a.log.Err(err).Error("Unable to get client")
return false, false, errors.WithStack(err)
}

ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
defer cancel()
cluster, err := client.Cluster(ctxChild)
if err != nil {
a.log.Err(err).Error("Unable to get cluster client")
return false, false, errors.WithStack(err)
}

var jobID string
ctxChild, cancel = globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
defer cancel()
jobCtx := driver.WithJobIDResponse(ctxChild, &jobID)
a.log.Debug("Temporary shutdown, resign leadership")
if err := cluster.ResignServer(jobCtx, m.ID); err != nil {
a.log.Err(err).Debug("Failed to resign server")
return false, false, errors.WithStack(err)
}

a.actionCtx.Add(resignLeadershipJobID, jobID, true)

return false, false, nil
}
Loading