diff --git a/pkg/apis/deployment/v1alpha/server_group.go b/pkg/apis/deployment/v1alpha/server_group.go index 894ba0b1e..4ccb7f21b 100644 --- a/pkg/apis/deployment/v1alpha/server_group.go +++ b/pkg/apis/deployment/v1alpha/server_group.go @@ -101,6 +101,16 @@ func (g ServerGroup) DefaultTerminationGracePeriod() time.Duration { } } +// IsStateless returns true when the group runs servers without a persistent volume. +func (g ServerGroup) IsStateless() bool { + switch g { + case ServerGroupCoordinators, ServerGroupSyncMasters, ServerGroupSyncWorkers: + return true + default: + return false + } +} + // IsArangod returns true when the groups runs servers of type `arangod`. func (g ServerGroup) IsArangod() bool { switch g { diff --git a/pkg/deployment/resources/pod_cleanup.go b/pkg/deployment/resources/pod_cleanup.go index a625c0cf8..e004e6f02 100644 --- a/pkg/deployment/resources/pod_cleanup.go +++ b/pkg/deployment/resources/pod_cleanup.go @@ -23,11 +23,17 @@ package resources import ( + "time" + "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" ) +const ( + statelessTerminationPeriod = time.Minute // We wait this long for a stateless server to terminate on its own. Afterwards we kill it. +) + // CleanupTerminatedPods removes all pods in Terminated state that belong to a member in Created state. 
func (r *Resources) CleanupTerminatedPods() error { log := r.log @@ -47,20 +53,29 @@ func (r *Resources) CleanupTerminatedPods() error { } // Check pod state - if !(k8sutil.IsPodSucceeded(&p) || k8sutil.IsPodFailed(&p)) { + if !(k8sutil.IsPodSucceeded(&p) || k8sutil.IsPodFailed(&p) || k8sutil.IsPodTerminating(&p)) { continue } // Find member status - memberStatus, _, found := status.Members.MemberStatusByPodName(p.GetName()) + memberStatus, group, found := status.Members.MemberStatusByPodName(p.GetName()) if !found { - log.Debug().Str("pod", p.GetName()).Msg("no memberstatus found for pod") - continue - } - - // Check member termination condition - if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) { - continue + log.Debug().Str("pod", p.GetName()).Msg("no memberstatus found for pod. Performing cleanup") + } else { + // Check member termination condition + if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) { + if !group.IsStateless() { + // For stateful members, we have to wait for confirmed termination + continue + } else { + // If a stateless server does not terminate within a reasonable amount of time, we kill it. + t := p.GetDeletionTimestamp() + if t == nil || t.Add(statelessTerminationPeriod).After(time.Now()) { + // Either the deletion timestamp is not set, or we have not yet waited long enough + continue + } + } + } } // Ok, we can delete the pod diff --git a/pkg/util/k8sutil/pods.go b/pkg/util/k8sutil/pods.go index 80f6c4a78..5103da127 100644 --- a/pkg/util/k8sutil/pods.go +++ b/pkg/util/k8sutil/pods.go @@ -123,6 +123,12 @@ func IsPodMarkedForDeletion(pod *v1.Pod) bool { return pod.DeletionTimestamp != nil } +// IsPodTerminating returns true if the pod has been marked for deletion +// but is still running. 
+func IsPodTerminating(pod *v1.Pod) bool { + return IsPodMarkedForDeletion(pod) && pod.Status.Phase == v1.PodRunning +} + // IsArangoDBImageIDAndVersionPod returns true if the given pod is used for fetching image ID and ArangoDB version of an image func IsArangoDBImageIDAndVersionPod(p v1.Pod) bool { role, found := p.GetLabels()[LabelKeyRole]