diff --git a/pkg/deployment/context_impl.go b/pkg/deployment/context_impl.go
index 1528de10d..19f780b4d 100644
--- a/pkg/deployment/context_impl.go
+++ b/pkg/deployment/context_impl.go
@@ -141,6 +141,21 @@ func (d *Deployment) DeletePod(podName string) error {
 	return nil
 }
 
+// CleanupPod deletes a given pod with force (grace period 0) and a precondition
+// on the pod's current UID. If the pod does not exist, the error is ignored.
+func (d *Deployment) CleanupPod(p v1.Pod) error {
+	log := d.deps.Log
+	podName := p.GetName()
+	ns := p.GetNamespace()
+	options := metav1.NewDeleteOptions(0)
+	options.Preconditions = metav1.NewUIDPreconditions(string(p.GetUID()))
+	if err := d.deps.KubeCli.Core().Pods(ns).Delete(podName, options); err != nil && !k8sutil.IsNotFound(err) {
+		log.Debug().Err(err).Str("pod", podName).Msg("Failed to cleanup pod")
+		return maskAny(err)
+	}
+	return nil
+}
+
 // DeletePvc deletes a persistent volume claim with given name in the namespace
 // of the deployment. If the pvc does not exist, the error is ignored.
 func (d *Deployment) DeletePvc(pvcName string) error {
diff --git a/pkg/deployment/deployment_inspector.go b/pkg/deployment/deployment_inspector.go
index b3311facf..7229aec6f 100644
--- a/pkg/deployment/deployment_inspector.go
+++ b/pkg/deployment/deployment_inspector.go
@@ -87,6 +87,12 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
 		d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject))
 	}
 
+	// At the end of the inspection, clean up terminated pods.
+	if err := d.resources.CleanupTerminatedPods(); err != nil {
+		hasError = true
+		d.CreateEvent(k8sutil.NewErrorEvent("Pod cleanup failed", err, d.apiObject))
+	}
+
 	// Update next interval (on errors)
 	if hasError {
 		if d.recentInspectionErrors == 0 {
diff --git a/pkg/deployment/resources/context.go b/pkg/deployment/resources/context.go
index f1bb69200..e7a5d6683 100644
--- a/pkg/deployment/resources/context.go
+++ b/pkg/deployment/resources/context.go
@@ -62,4 +62,7 @@ type Context interface {
 	CreateEvent(evt *v1.Event)
 	// GetOwnedPods returns a list of all pods owned by the deployment.
 	GetOwnedPods() ([]v1.Pod, error)
+	// CleanupPod deletes a given pod with force (grace period 0) and a precondition
+	// on the pod's current UID. If the pod does not exist, the error is ignored.
+	CleanupPod(p v1.Pod) error
 }
diff --git a/pkg/deployment/resources/pod_cleanup.go b/pkg/deployment/resources/pod_cleanup.go
new file mode 100644
index 000000000..a625c0cf8
--- /dev/null
+++ b/pkg/deployment/resources/pod_cleanup.go
@@ -0,0 +1,73 @@
+//
+// DISCLAIMER
+//
+// Copyright 2018 ArangoDB GmbH, Cologne, Germany
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+// Author Ewout Prangsma
+//
+
+package resources
+
+import (
+	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+
+	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
+)
+
+// CleanupTerminatedPods removes all pods in Succeeded or Failed state that belong to a member whose Terminated condition is set.
+func (r *Resources) CleanupTerminatedPods() error {
+	log := r.log
+
+	pods, err := r.context.GetOwnedPods()
+	if err != nil {
+		log.Debug().Err(err).Msg("Failed to get owned pods")
+		return maskAny(err)
+	}
+
+	// Check all pods, cleaning up those that have terminated
+	status := r.context.GetStatus()
+	for _, p := range pods {
+		if k8sutil.IsArangoDBImageIDAndVersionPod(p) {
+			// Image ID pods are not relevant to inspect here
+			continue
+		}
+
+		// Check pod state
+		if !(k8sutil.IsPodSucceeded(&p) || k8sutil.IsPodFailed(&p)) {
+			continue
+		}
+
+		// Find member status
+		memberStatus, _, found := status.Members.MemberStatusByPodName(p.GetName())
+		if !found {
+			log.Debug().Str("pod", p.GetName()).Msg("No member status found for pod")
+			continue
+		}
+
+		// Check member termination condition
+		if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
+			continue
+		}
+
+		// Ok, we can delete the pod
+		log.Debug().Str("pod-name", p.GetName()).Msg("Cleanup terminated pod")
+		if err := r.context.CleanupPod(p); err != nil {
+			log.Warn().Err(err).Str("pod-name", p.GetName()).Msg("Failed to cleanup pod")
+		}
+	}
+	return nil
+}
diff --git a/pkg/deployment/resources/pod_creator.go b/pkg/deployment/resources/pod_creator.go
index 0289b5045..373f98757 100644
--- a/pkg/deployment/resources/pod_creator.go
+++ b/pkg/deployment/resources/pod_creator.go
@@ -369,6 +369,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
 		if err := k8sutil.CreateArangodPod(kubecli, spec.IsDevelopment(), apiObject, role, m.ID, m.PodName, m.PersistentVolumeClaimName, info.ImageID, spec.GetImagePullPolicy(), args, env, livenessProbe, readinessProbe, tlsKeyfileSecretName, rocksdbEncryptionSecretName); err != nil {
 			return maskAny(err)
 		}
+		log.Debug().Str("pod-name", m.PodName).Msg("Created pod")
 	} else if group.IsArangosync() {
 		// Find image ID
 		info, found := status.Images.GetByImage(spec.Sync.GetImage())
@@ -390,6 +391,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
 		if err := k8sutil.CreateArangoSyncPod(kubecli, spec.IsDevelopment(), apiObject, role, m.ID, m.PodName, info.ImageID, spec.Sync.GetImagePullPolicy(), args, env, livenessProbe, affinityWithRole); err != nil {
 			return maskAny(err)
 		}
+		log.Debug().Str("pod-name", m.PodName).Msg("Created pod")
 	}
 	// Record new member state
 	m.State = newState
diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go
index 2bf307714..c6ecec8d8 100644
--- a/pkg/deployment/resources/pod_inspector.go
+++ b/pkg/deployment/resources/pod_inspector.go
@@ -70,27 +70,30 @@ func (r *Resources) InspectPods() error {
 		if k8sutil.IsPodSucceeded(&p) {
 			// Pod has terminated with exit code 0.
 			if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
+				log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
 				updateMemberStatusNeeded = true
 			}
 		} else if k8sutil.IsPodFailed(&p) {
 			// Pod has terminated with at least 1 container with a non-zero exit code.
 			if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
+				log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Terminated to true: Pod Failed")
 				updateMemberStatusNeeded = true
 			}
 		}
 		if k8sutil.IsPodReady(&p) {
 			// Pod is now ready
 			if memberStatus.Conditions.Update(api.ConditionTypeReady, true, "Pod Ready", "") {
+				log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Ready to true")
 				updateMemberStatusNeeded = true
 			}
 		} else {
 			// Pod is not ready
 			if memberStatus.Conditions.Update(api.ConditionTypeReady, false, "Pod Not Ready", "") {
+				log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Ready to false")
 				updateMemberStatusNeeded = true
 			}
 		}
 		if updateMemberStatusNeeded {
-			log.Debug().Str("pod-name", p.GetName()).Msg("Updated member status member for pod")
 			if err := status.Members.UpdateMemberStatus(memberStatus, group); err != nil {
 				return maskAny(err)
 			}
@@ -123,6 +126,7 @@ func (r *Resources) InspectPods() error {
 				}
 			}
 		default:
+			log.Debug().Str("pod-name", podName).Msg("Pod is gone")
 			m.State = api.MemberStateNone // This will trigger a recreate of the pod.
 			// Create event
 			events = append(events, k8sutil.NewPodGoneEvent(podName, group.AsRole(), apiObject))
diff --git a/tests/resilience_test.go b/tests/resilience_test.go
index 5fa7e30cc..814f239d1 100644
--- a/tests/resilience_test.go
+++ b/tests/resilience_test.go
@@ -140,6 +140,10 @@ func TestResiliencePVC(t *testing.T) {
 	// Delete one pvc after the other
 	apiObject.ForeachServerGroup(func(group api.ServerGroup, spec api.ServerGroupSpec, status *api.MemberStatusList) error {
+		if group == api.ServerGroupCoordinators {
+			// Coordinators have no PVC
+			return nil
+		}
 		for _, m := range *status {
 			// Get current pvc so we can compare UID later
 			originalPVC, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Get(m.PersistentVolumeClaimName, metav1.GetOptions{})
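Reviewer note: the sketch below (not part of this change) spells out the UID-preconditioned force delete that CleanupPod performs, since the precondition is easy to miss. The package and helper name are hypothetical; the API calls (metav1.NewDeleteOptions, metav1.NewUIDPreconditions, and client-go's two-argument Pods().Delete, as used elsewhere in this diff) are the same ones the change relies on.

package sketch

import (
	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// forceDeleteExactPod (hypothetical helper) deletes pod p immediately
// (grace period 0), but only while the pod named p.Name still carries
// the UID we observed. A pod deleted and recreated under the same name
// gets a fresh UID, so the delete fails its precondition instead of
// killing the new pod.
func forceDeleteExactPod(kubecli kubernetes.Interface, p v1.Pod) error {
	options := metav1.NewDeleteOptions(0) // grace period 0: delete right away
	options.Preconditions = metav1.NewUIDPreconditions(string(p.GetUID()))
	err := kubecli.CoreV1().Pods(p.GetNamespace()).Delete(p.GetName(), options)
	if apierrors.IsNotFound(err) {
		// Already gone: that is the desired end state, so mirror CleanupPod and ignore it.
		return nil
	}
	return err
}

The grace period plus UID precondition is what makes CleanupTerminatedPods safe to run from the periodic inspection loop: the loop may race with pod recreation, but it can never force-delete a pod it did not itself observe as terminated.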