Skip to content

Commit

Permalink
Merge pull request #77 from arangodb/resilient-improvements
Browse files Browse the repository at this point in the history
Added terminated-pod cleanup to speed up re-creation of pods.
  • Loading branch information
ewoutp authored Mar 26, 2018
2 parents 05fca79 + ba2141f commit d85dea4
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 1 deletion.
15 changes: 15 additions & 0 deletions pkg/deployment/context_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,21 @@ func (d *Deployment) DeletePod(podName string) error {
return nil
}

// CleanupPod deletes a given pod with force and explicit UID.
// If the pod does not exist, the error is ignored.
func (d *Deployment) CleanupPod(p v1.Pod) error {
log := d.deps.Log
podName := p.GetName()
ns := p.GetNamespace()
options := metav1.NewDeleteOptions(0)
options.Preconditions = metav1.NewUIDPreconditions(string(p.GetUID()))
if err := d.deps.KubeCli.Core().Pods(ns).Delete(podName, options); err != nil && !k8sutil.IsNotFound(err) {
log.Debug().Err(err).Str("pod", podName).Msg("Failed to cleanup pod")
return maskAny(err)
}
return nil
}

// DeletePvc deletes a persistent volume claim with given name in the namespace
// of the deployment. If the pvc does not exist, the error is ignored.
func (d *Deployment) DeletePvc(pvcName string) error {
Expand Down
6 changes: 6 additions & 0 deletions pkg/deployment/deployment_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject))
}

// At the end of the inspect, we cleanup terminated pods.
if d.resources.CleanupTerminatedPods(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Pod cleanup failed", err, d.apiObject))
}

// Update next interval (on errors)
if hasError {
if d.recentInspectionErrors == 0 {
Expand Down
3 changes: 3 additions & 0 deletions pkg/deployment/resources/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,7 @@ type Context interface {
CreateEvent(evt *v1.Event)
// GetOwnedPods returns a list of all pods owned by the deployment.
GetOwnedPods() ([]v1.Pod, error)
// CleanupPod deletes a given pod with force and explicit UID.
// If the pod does not exist, the error is ignored.
CleanupPod(p v1.Pod) error
}
73 changes: 73 additions & 0 deletions pkg/deployment/resources/pod_cleanup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//

package resources

import (
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
)

// CleanupTerminatedPods removes all pods in Terminated state that belong to a member in Created state.
func (r *Resources) CleanupTerminatedPods() error {
log := r.log

pods, err := r.context.GetOwnedPods()
if err != nil {
log.Debug().Err(err).Msg("Failed to get owned pods")
return maskAny(err)
}

// Update member status from all pods found
status := r.context.GetStatus()
for _, p := range pods {
if k8sutil.IsArangoDBImageIDAndVersionPod(p) {
// Image ID pods are not relevant to inspect here
continue
}

// Check pod state
if !(k8sutil.IsPodSucceeded(&p) || k8sutil.IsPodFailed(&p)) {
continue
}

// Find member status
memberStatus, _, found := status.Members.MemberStatusByPodName(p.GetName())
if !found {
log.Debug().Str("pod", p.GetName()).Msg("no memberstatus found for pod")
continue
}

// Check member termination condition
if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
continue
}

// Ok, we can delete the pod
log.Debug().Str("pod-name", p.GetName()).Msg("Cleanup terminated pod")
if err := r.context.CleanupPod(p); err != nil {
log.Warn().Err(err).Str("pod-name", p.GetName()).Msg("Failed to cleanup pod")
}
}
return nil
}
2 changes: 2 additions & 0 deletions pkg/deployment/resources/pod_creator.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
if err := k8sutil.CreateArangodPod(kubecli, spec.IsDevelopment(), apiObject, role, m.ID, m.PodName, m.PersistentVolumeClaimName, info.ImageID, spec.GetImagePullPolicy(), args, env, livenessProbe, readinessProbe, tlsKeyfileSecretName, rocksdbEncryptionSecretName); err != nil {
return maskAny(err)
}
log.Debug().Str("pod-name", m.PodName).Msg("Created pod")
} else if group.IsArangosync() {
// Find image ID
info, found := status.Images.GetByImage(spec.Sync.GetImage())
Expand All @@ -390,6 +391,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
if err := k8sutil.CreateArangoSyncPod(kubecli, spec.IsDevelopment(), apiObject, role, m.ID, m.PodName, info.ImageID, spec.Sync.GetImagePullPolicy(), args, env, livenessProbe, affinityWithRole); err != nil {
return maskAny(err)
}
log.Debug().Str("pod-name", m.PodName).Msg("Created pod")
}
// Record new member state
m.State = newState
Expand Down
6 changes: 5 additions & 1 deletion pkg/deployment/resources/pod_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,27 +70,30 @@ func (r *Resources) InspectPods() error {
if k8sutil.IsPodSucceeded(&p) {
// Pod has terminated with exit code 0.
if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
updateMemberStatusNeeded = true
}
} else if k8sutil.IsPodFailed(&p) {
// Pod has terminated with at least 1 container with a non-zero exit code.
if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Terminated to true: Pod Failed")
updateMemberStatusNeeded = true
}
}
if k8sutil.IsPodReady(&p) {
// Pod is now ready
if memberStatus.Conditions.Update(api.ConditionTypeReady, true, "Pod Ready", "") {
log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Ready to true")
updateMemberStatusNeeded = true
}
} else {
// Pod is not ready
if memberStatus.Conditions.Update(api.ConditionTypeReady, false, "Pod Not Ready", "") {
log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Ready to false")
updateMemberStatusNeeded = true
}
}
if updateMemberStatusNeeded {
log.Debug().Str("pod-name", p.GetName()).Msg("Updated member status member for pod")
if err := status.Members.UpdateMemberStatus(memberStatus, group); err != nil {
return maskAny(err)
}
Expand Down Expand Up @@ -123,6 +126,7 @@ func (r *Resources) InspectPods() error {
}
}
default:
log.Debug().Str("pod-name", podName).Msg("Pod is gone")
m.State = api.MemberStateNone // This is trigger a recreate of the pod.
// Create event
events = append(events, k8sutil.NewPodGoneEvent(podName, group.AsRole(), apiObject))
Expand Down
4 changes: 4 additions & 0 deletions tests/resilience_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ func TestResiliencePVC(t *testing.T) {

// Delete one pvc after the other
apiObject.ForeachServerGroup(func(group api.ServerGroup, spec api.ServerGroupSpec, status *api.MemberStatusList) error {
if group == api.ServerGroupCoordinators {
// Coordinators have no PVC
return nil
}
for _, m := range *status {
// Get current pvc so we can compare UID later
originalPVC, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Get(m.PersistentVolumeClaimName, metav1.GetOptions{})
Expand Down

0 comments on commit d85dea4

Please sign in to comment.