Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Check member failure #92

Merged
merged 11 commits into from
Apr 3, 2018
48 changes: 48 additions & 0 deletions pkg/apis/deployment/v1alpha/member_phase.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//

package v1alpha

// MemberPhase is a strongly typed lifetime phase of a deployment member.
// It is represented as a string; the values defined for it are the
// MemberPhase* constants declared below.
type MemberPhase string

const (
// MemberPhaseNone indicates that the phase is not set yet.
// It is the empty string, i.e. the zero value of MemberPhase.
MemberPhaseNone MemberPhase = ""
// MemberPhaseCreated indicates that all resources needed for the member have been created.
MemberPhaseCreated MemberPhase = "Created"
// MemberPhaseFailed indicates that the member is gone beyond hope of recovery.
// It must be replaced with a new member.
MemberPhaseFailed MemberPhase = "Failed"
// MemberPhaseCleanOut indicates that a dbserver is in the process of being cleaned out.
MemberPhaseCleanOut MemberPhase = "CleanOut"
// MemberPhaseShuttingDown indicates that a member is shutting down.
MemberPhaseShuttingDown MemberPhase = "ShuttingDown"
// MemberPhaseRotating indicates that a member is being rotated.
MemberPhaseRotating MemberPhase = "Rotating"
// MemberPhaseUpgrading indicates that a member is in the process of upgrading its database data format.
MemberPhaseUpgrading MemberPhase = "Upgrading"
)

// IsFailed reports whether the phase is MemberPhaseFailed ("Failed").
// Every other phase, including the unset phase, yields false.
func (p MemberPhase) IsFailed() bool {
	switch p {
	case MemberPhaseFailed:
		return true
	default:
		return false
	}
}
41 changes: 0 additions & 41 deletions pkg/apis/deployment/v1alpha/member_state.go

This file was deleted.

21 changes: 19 additions & 2 deletions pkg/apis/deployment/v1alpha/member_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ package v1alpha
import (
"time"

"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand All @@ -33,8 +34,10 @@ type MemberStatus struct {
// ID holds the unique ID of the member.
// This id is also used within the ArangoDB cluster to identify this server.
ID string `json:"id"`
// State holds the current state of this member
State MemberState `json:"state"`
// Phase holds the current lifetime phase of this member
Phase MemberPhase `json:"phase"`
// CreatedAt holds the creation timestamp of this member.
CreatedAt metav1.Time `json:"created-at"`
// PersistentVolumeClaimName holds the name of the persistent volume claim used for this member (if any).
PersistentVolumeClaimName string `json:"persistentVolumeClaimName,omitempty"`
// PodName holds the name of the Pod that currently runs this member
Expand Down Expand Up @@ -78,3 +81,17 @@ func (s MemberStatus) RecentTerminationsSince(timestamp time.Time) int {
}
return count
}

// IsNotReadySince reports whether the member has been not-ready since before
// the given timestamp. That is the case when either:
// - A) no Ready condition is recorded and the member was created before timestamp, or
// - B) a Ready condition is recorded, its status is not true, and its last
//   transition happened before timestamp.
func (s MemberStatus) IsNotReadySince(timestamp time.Time) bool {
	readyCond, hasReady := s.Conditions.Get(ConditionTypeReady)
	if !hasReady {
		// A: without a Ready condition, fall back to the creation time.
		return s.CreatedAt.Time.Before(timestamp)
	}
	// B: a member that is currently ready is never reported as not-ready.
	if readyCond.Status == v1.ConditionTrue {
		return false
	}
	return readyCond.LastTransitionTime.Time.Before(timestamp)
}
4 changes: 2 additions & 2 deletions pkg/apis/deployment/v1alpha/member_status_list.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,15 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
if len(l) > 0 {
// Try to find a not ready member
for _, m := range l {
if m.State == MemberStateNone {
if m.Phase == MemberPhaseNone {
return m, nil
}
}
// Pick a random member that is in created state
perm := rand.Perm(len(l))
for _, idx := range perm {
m := l[idx]
if m.State == MemberStateCreated {
if m.Phase == MemberPhaseCreated {
return m, nil
}
}
Expand Down
21 changes: 21 additions & 0 deletions pkg/apis/deployment/v1alpha/member_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,24 @@ func TestMemberStatusRecentTerminations(t *testing.T) {
assert.Equal(t, 2, s.RemoveTerminationsBefore(time.Now()))
assert.Len(t, s.RecentTerminations, 1)
}

// TestMemberStatusIsNotReadySince exercises MemberStatus.IsNotReadySince: first
// without any Ready condition (only CreatedAt is set), then with the Ready
// condition updated to true and finally to false.
func TestMemberStatusIsNotReadySince(t *testing.T) {
	// Timestamp helpers; each call is evaluated relative to the current time.
	ago := func(d time.Duration) time.Time { return time.Now().Add(-d) }
	ahead := func(d time.Duration) time.Time { return time.Now().Add(d) }

	status := MemberStatus{
		CreatedAt: metav1.Now(),
	}

	// Created just now, no Ready condition: not "not ready since an hour ago".
	assert.False(t, status.IsNotReadySince(ago(time.Hour)))

	// Created an hour ago, still no Ready condition: the outcome depends on
	// whether the creation time lies before the queried timestamp.
	status.CreatedAt.Time = ago(time.Hour)
	assert.False(t, status.IsNotReadySince(ago(2*time.Hour)))
	assert.True(t, status.IsNotReadySince(ago(time.Hour-time.Minute)))

	// Ready condition set to true: expected false for past and future timestamps.
	status.CreatedAt = metav1.Now()
	status.Conditions.Update(ConditionTypeReady, true, "", "")
	assert.False(t, status.IsNotReadySince(ago(time.Minute)))
	assert.False(t, status.IsNotReadySince(ahead(time.Minute)))

	// Ready condition set to false: expected true only for a timestamp after the
	// transition. NOTE(review): presumably Update stamps the transition with
	// the current time; the assertions below rely on that, so confirm it.
	status.Conditions.Update(ConditionTypeReady, false, "", "")
	assert.False(t, status.IsNotReadySince(ago(time.Minute)))
	assert.True(t, status.IsNotReadySince(ahead(time.Minute)))
}
1 change: 1 addition & 0 deletions pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ func (in *ImageInfo) DeepCopy() *ImageInfo {
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MemberStatus) DeepCopyInto(out *MemberStatus) {
*out = *in
in.CreatedAt.DeepCopyInto(&out.CreatedAt)
if in.Conditions != nil {
in, out := &in.Conditions, &out.Conditions
*out = make(ConditionList, len(*in))
Expand Down
15 changes: 12 additions & 3 deletions pkg/deployment/context_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,14 @@ func (d *Deployment) GetServerClient(ctx context.Context, group api.ServerGroup,
}

// GetAgencyClients returns a client connection for every agency member.
func (d *Deployment) GetAgencyClients(ctx context.Context) ([]arangod.Agency, error) {
// If the given predicate is not nil, only agents are included where the given predicate returns true.
func (d *Deployment) GetAgencyClients(ctx context.Context, predicate func(id string) bool) ([]arangod.Agency, error) {
agencyMembers := d.status.Members.Agents
result := make([]arangod.Agency, 0, len(agencyMembers))
for _, m := range agencyMembers {
if predicate != nil && !predicate(m.ID) {
continue
}
client, err := d.GetServerClient(ctx, api.ServerGroupAgents, m.ID)
if err != nil {
return nil, maskAny(err)
Expand All @@ -115,9 +119,11 @@ func (d *Deployment) GetAgencyClients(ctx context.Context) ([]arangod.Agency, er
}

// CreateMember adds a new member to the given group.
func (d *Deployment) CreateMember(group api.ServerGroup) error {
// If id is non-empty, it is used as the member ID; otherwise a new ID is created.
func (d *Deployment) CreateMember(group api.ServerGroup, id string) error {
log := d.deps.Log
if err := d.createMember(group, d.apiObject); err != nil {
id, err := d.createMember(group, id, d.apiObject)
if err != nil {
log.Debug().Err(err).Str("group", group.AsRole()).Msg("Failed to create member")
return maskAny(err)
}
Expand All @@ -126,6 +132,9 @@ func (d *Deployment) CreateMember(group api.ServerGroup) error {
log.Debug().Err(err).Msg("Updating CR status failed")
return maskAny(err)
}
// Create event about it
d.CreateEvent(k8sutil.NewMemberAddEvent(id, group.AsRole(), d.apiObject))

return nil
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/deployment/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
"github.com/arangodb/kube-arangodb/pkg/deployment/reconcile"
"github.com/arangodb/kube-arangodb/pkg/deployment/resilience"
"github.com/arangodb/kube-arangodb/pkg/deployment/resources"
"github.com/arangodb/kube-arangodb/pkg/generated/clientset/versioned"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
Expand Down Expand Up @@ -92,6 +93,7 @@ type Deployment struct {
recentInspectionErrors int
clusterScalingIntegration *clusterScalingIntegration
reconciler *reconcile.Reconciler
resilience *resilience.Resilience
resources *resources.Resources
}

Expand All @@ -111,6 +113,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
clientCache: newClientCache(deps.KubeCli, apiObject),
}
d.reconciler = reconcile.NewReconciler(deps.Log, d)
d.resilience = resilience.NewResilience(deps.Log, d)
d.resources = resources.NewResources(deps.Log, d)
if d.status.AcceptedSpec == nil {
// We've validated the spec, so let's use it from now.
Expand Down
6 changes: 6 additions & 0 deletions pkg/deployment/deployment_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject))
}

// Check members for resilience
if err := d.resilience.CheckMemberFailure(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Member failure detection failed", err, d.apiObject))
}

// Create scale/update plan
if err := d.reconciler.CreatePlan(); err != nil {
hasError = true
Expand Down
Loading