Skip to content

Commit

Permalink
Merge pull request #92 from arangodb/check-member-failure
Browse files Browse the repository at this point in the history
Check member failure
  • Loading branch information
ewoutp authored Apr 3, 2018
2 parents a07eb76 + 6bf7a56 commit 1173d8e
Show file tree
Hide file tree
Showing 32 changed files with 1,101 additions and 218 deletions.
48 changes: 48 additions & 0 deletions pkg/apis/deployment/v1alpha/member_phase.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//

package v1alpha

// MemberPhase names the lifetime phase a deployment member is in.
// It is a dedicated string type so phases are not confused with arbitrary strings.
type MemberPhase string

// Known lifetime phases of a deployment member.
const (
	// MemberPhaseNone is the empty phase: no phase has been set yet.
	MemberPhaseNone MemberPhase = ""
	// MemberPhaseCreated means every resource the member needs has been created.
	MemberPhaseCreated MemberPhase = "Created"
	// MemberPhaseFailed means the member cannot be recovered and must be replaced with a new member.
	MemberPhaseFailed MemberPhase = "Failed"
	// MemberPhaseCleanOut means a dbserver is currently being cleaned out.
	MemberPhaseCleanOut MemberPhase = "CleanOut"
	// MemberPhaseShuttingDown means the member is in the process of shutting down.
	MemberPhaseShuttingDown MemberPhase = "ShuttingDown"
	// MemberPhaseRotating means the member is currently being rotated.
	MemberPhaseRotating MemberPhase = "Rotating"
	// MemberPhaseUpgrading means the member is upgrading its database data format.
	MemberPhaseUpgrading MemberPhase = "Upgrading"
)

// IsFailed reports whether the phase is exactly MemberPhaseFailed ("Failed").
// The comparison is case-sensitive; every other phase yields false.
func (p MemberPhase) IsFailed() bool {
	switch p {
	case MemberPhaseFailed:
		return true
	default:
		return false
	}
}
41 changes: 0 additions & 41 deletions pkg/apis/deployment/v1alpha/member_state.go

This file was deleted.

21 changes: 19 additions & 2 deletions pkg/apis/deployment/v1alpha/member_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ package v1alpha
import (
"time"

"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand All @@ -33,8 +34,10 @@ type MemberStatus struct {
// ID holds the unique ID of the member.
// This id is also used within the ArangoDB cluster to identify this server.
ID string `json:"id"`
// State holds the current state of this member
State MemberState `json:"state"`
// Phase holds the current lifetime phase of this member
Phase MemberPhase `json:"phase"`
// CreatedAt holds the creation timestamp of this member.
CreatedAt metav1.Time `json:"created-at"`
// PersistentVolumeClaimName holds the name of the persistent volume claim used for this member (if any).
PersistentVolumeClaimName string `json:"persistentVolumeClaimName,omitempty"`
// PodName holds the name of the Pod that currently runs this member
Expand Down Expand Up @@ -78,3 +81,17 @@ func (s MemberStatus) RecentTerminationsSince(timestamp time.Time) int {
}
return count
}

// IsNotReadySince reports whether the member has been continuously not ready
// since (at least) the given timestamp.
//
// Two situations yield true:
//   - The member has no Ready condition at all and its CreatedAt lies before timestamp.
//   - The member has a Ready condition whose status is not true and whose
//     last transition lies before timestamp.
//
// A member whose Ready condition is true is never reported as not ready.
func (s MemberStatus) IsNotReadySince(timestamp time.Time) bool {
	readyCond, hasReady := s.Conditions.Get(ConditionTypeReady)
	if !hasReady {
		// No Ready condition recorded: fall back to the creation time.
		return s.CreatedAt.Time.Before(timestamp)
	}
	if readyCond.Status == v1.ConditionTrue {
		// Currently ready, so it cannot be "not ready since" anything.
		return false
	}
	// Not ready: it only counts when the switch to not-ready happened before timestamp.
	return readyCond.LastTransitionTime.Time.Before(timestamp)
}
4 changes: 2 additions & 2 deletions pkg/apis/deployment/v1alpha/member_status_list.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,15 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
if len(l) > 0 {
// Try to find a not ready member
for _, m := range l {
if m.State == MemberStateNone {
if m.Phase == MemberPhaseNone {
return m, nil
}
}
// Pick a random member that is in created state
perm := rand.Perm(len(l))
for _, idx := range perm {
m := l[idx]
if m.State == MemberStateCreated {
if m.Phase == MemberPhaseCreated {
return m, nil
}
}
Expand Down
21 changes: 21 additions & 0 deletions pkg/apis/deployment/v1alpha/member_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,24 @@ func TestMemberStatusRecentTerminations(t *testing.T) {
assert.Equal(t, 2, s.RemoveTerminationsBefore(time.Now()))
assert.Len(t, s.RecentTerminations, 1)
}

// TestMemberStatusIsNotReadySince verifies MemberStatus.IsNotReadySince for a member
// without a Ready condition (where CreatedAt decides) and for a member whose
// Ready condition is first set to true and then to false.
func TestMemberStatusIsNotReadySince(t *testing.T) {
// No Ready condition, created just now: the creation time is not before
// a timestamp one hour in the past.
s := MemberStatus{
CreatedAt: metav1.Now(),
}
assert.False(t, s.IsNotReadySince(time.Now().Add(-time.Hour)))

// No Ready condition, created one hour ago: only a timestamp later than
// the creation time (59 minutes ago) yields true; an earlier one (2 hours ago) does not.
s.CreatedAt.Time = time.Now().Add(-time.Hour)
assert.False(t, s.IsNotReadySince(time.Now().Add(-2*time.Hour)))
assert.True(t, s.IsNotReadySince(time.Now().Add(-(time.Hour - time.Minute))))

// Ready condition set to true: the member is expected to be reported as
// ready regardless of whether the timestamp is in the past or the future.
s.CreatedAt = metav1.Now()
s.Conditions.Update(ConditionTypeReady, true, "", "")
assert.False(t, s.IsNotReadySince(time.Now().Add(-time.Minute)))
assert.False(t, s.IsNotReadySince(time.Now().Add(time.Minute)))

// Ready condition set to false: presumably Update records the transition at
// the current time (confirm against ConditionList.Update), so only a timestamp
// after that transition yields true.
s.Conditions.Update(ConditionTypeReady, false, "", "")
assert.False(t, s.IsNotReadySince(time.Now().Add(-time.Minute)))
assert.True(t, s.IsNotReadySince(time.Now().Add(time.Minute)))
}
1 change: 1 addition & 0 deletions pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ func (in *ImageInfo) DeepCopy() *ImageInfo {
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MemberStatus) DeepCopyInto(out *MemberStatus) {
*out = *in
in.CreatedAt.DeepCopyInto(&out.CreatedAt)
if in.Conditions != nil {
in, out := &in.Conditions, &out.Conditions
*out = make(ConditionList, len(*in))
Expand Down
15 changes: 12 additions & 3 deletions pkg/deployment/context_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,14 @@ func (d *Deployment) GetServerClient(ctx context.Context, group api.ServerGroup,
}

// GetAgencyClients returns a client connection for every agency member.
func (d *Deployment) GetAgencyClients(ctx context.Context) ([]arangod.Agency, error) {
// If the given predicate is not nil, only agents for which the predicate returns true are included.
func (d *Deployment) GetAgencyClients(ctx context.Context, predicate func(id string) bool) ([]arangod.Agency, error) {
agencyMembers := d.status.Members.Agents
result := make([]arangod.Agency, 0, len(agencyMembers))
for _, m := range agencyMembers {
if predicate != nil && !predicate(m.ID) {
continue
}
client, err := d.GetServerClient(ctx, api.ServerGroupAgents, m.ID)
if err != nil {
return nil, maskAny(err)
Expand All @@ -115,9 +119,11 @@ func (d *Deployment) GetAgencyClients(ctx context.Context) ([]arangod.Agency, er
}

// CreateMember adds a new member to the given group.
func (d *Deployment) CreateMember(group api.ServerGroup) error {
// If ID is non-empty, it will be used, otherwise a new ID is created.
func (d *Deployment) CreateMember(group api.ServerGroup, id string) error {
log := d.deps.Log
if err := d.createMember(group, d.apiObject); err != nil {
id, err := d.createMember(group, id, d.apiObject)
if err != nil {
log.Debug().Err(err).Str("group", group.AsRole()).Msg("Failed to create member")
return maskAny(err)
}
Expand All @@ -126,6 +132,9 @@ func (d *Deployment) CreateMember(group api.ServerGroup) error {
log.Debug().Err(err).Msg("Updating CR status failed")
return maskAny(err)
}
// Create event about it
d.CreateEvent(k8sutil.NewMemberAddEvent(id, group.AsRole(), d.apiObject))

return nil
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/deployment/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
"github.com/arangodb/kube-arangodb/pkg/deployment/reconcile"
"github.com/arangodb/kube-arangodb/pkg/deployment/resilience"
"github.com/arangodb/kube-arangodb/pkg/deployment/resources"
"github.com/arangodb/kube-arangodb/pkg/generated/clientset/versioned"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
Expand Down Expand Up @@ -92,6 +93,7 @@ type Deployment struct {
recentInspectionErrors int
clusterScalingIntegration *clusterScalingIntegration
reconciler *reconcile.Reconciler
resilience *resilience.Resilience
resources *resources.Resources
}

Expand All @@ -111,6 +113,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
clientCache: newClientCache(deps.KubeCli, apiObject),
}
d.reconciler = reconcile.NewReconciler(deps.Log, d)
d.resilience = resilience.NewResilience(deps.Log, d)
d.resources = resources.NewResources(deps.Log, d)
if d.status.AcceptedSpec == nil {
// We've validated the spec, so let's use it from now.
Expand Down
6 changes: 6 additions & 0 deletions pkg/deployment/deployment_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject))
}

// Check members for resilience
if err := d.resilience.CheckMemberFailure(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Member failure detection failed", err, d.apiObject))
}

// Create scale/update plan
if err := d.reconciler.CreatePlan(); err != nil {
hasError = true
Expand Down
Loading

0 comments on commit 1173d8e

Please sign in to comment.