Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added detection on unschedulable pods #82

Merged
merged 4 commits into from
Mar 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/apis/deployment/v1alpha/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ const (
ConditionTypeTerminated ConditionType = "Terminated"
// ConditionTypeAutoUpgrade indicates that the member has to be started with `--database.auto-upgrade` once.
ConditionTypeAutoUpgrade ConditionType = "AutoUpgrade"
// ConditionTypePodSchedulingFailure indicates that one or more pods belonging to the deployment cannot be scheduled.
ConditionTypePodSchedulingFailure ConditionType = "PodSchedulingFailure"
// ConditionTypeSecretsChanged indicates that the values of one or more secrets used by
// the deployment have changed. Once that is the case, the operator will no longer
// touch the deployment, until the original secrets have been restored.
Expand Down
32 changes: 32 additions & 0 deletions pkg/deployment/resources/pod_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
package resources

import (
"fmt"
"time"

"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
"k8s.io/api/core/v1"

Expand All @@ -34,6 +37,10 @@ var (
inspectedPodCounter = metrics.MustRegisterCounter("deployment", "inspected_pods", "Number of pod inspections")
)

// podScheduleTimeout is how long we allow the scheduler to take scheduling a pod.
const podScheduleTimeout = time.Minute

// InspectPods lists all pods that belong to the given deployment and updates
// the member status of the deployment accordingly.
func (r *Resources) InspectPods() error {
Expand All @@ -49,6 +56,8 @@ func (r *Resources) InspectPods() error {
// Update member status from all pods found
status := r.context.GetStatus()
apiObject := r.context.GetAPIObject()
var podNamesWithScheduleTimeout []string
var unscheduledPodNames []string
for _, p := range pods {
if k8sutil.IsArangoDBImageIDAndVersionPod(p) {
// Image ID pods are not relevant to inspect here
Expand Down Expand Up @@ -93,6 +102,13 @@ func (r *Resources) InspectPods() error {
updateMemberStatusNeeded = true
}
}
if k8sutil.IsPodNotScheduledFor(&p, podScheduleTimeout) {
// Pod could not be scheduled for too long
log.Debug().Str("pod-name", p.GetName()).Msg("Pod scheduling timeout")
podNamesWithScheduleTimeout = append(podNamesWithScheduleTimeout, p.GetName())
} else if !k8sutil.IsPodScheduled(&p) {
unscheduledPodNames = append(unscheduledPodNames, p.GetName())
}
if updateMemberStatusNeeded {
if err := status.Members.UpdateMemberStatus(memberStatus, group); err != nil {
return maskAny(err)
Expand Down Expand Up @@ -151,6 +167,22 @@ func (r *Resources) InspectPods() error {
// TODO handle other State values
}

// Update conditions
if len(podNamesWithScheduleTimeout) > 0 {
if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, true,
"Pods Scheduling Timeout",
fmt.Sprintf("The following pods cannot be scheduled: %v", podNamesWithScheduleTimeout)) {
r.context.CreateEvent(k8sutil.NewPodsSchedulingFailureEvent(podNamesWithScheduleTimeout, r.context.GetAPIObject()))
}
} else if status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) &&
len(unscheduledPodNames) == 0 {
if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, false,
"Pods Scheduling Resolved",
"No pod reports a scheduling timeout") {
r.context.CreateEvent(k8sutil.NewPodsSchedulingResolvedEvent(r.context.GetAPIObject()))
}
}

// Save status
if err := r.context.UpdateStatus(status); err != nil {
return maskAny(err)
Expand Down
19 changes: 19 additions & 0 deletions pkg/util/k8sutil/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,25 @@ func NewImmutableFieldEvent(fieldName string, apiObject APIObject) *v1.Event {
return event
}

// NewPodsSchedulingFailureEvent creates an event indicating that one or more
// pods cannot be scheduled. The given pod names are included in the event message.
// NOTE(review): the event is emitted with type Normal even though it reports a
// failure — confirm whether Warning would be more appropriate.
func NewPodsSchedulingFailureEvent(unscheduledPodNames []string, apiObject APIObject) *v1.Event {
	ev := newDeploymentEvent(apiObject)
	msg := fmt.Sprintf("One or more pods are not scheduled in time. Pods: %v", unscheduledPodNames)
	ev.Reason = "Pods Scheduling Failure"
	ev.Message = msg
	ev.Type = v1.EventTypeNormal
	return ev
}

// NewPodsSchedulingResolvedEvent creates an event indicating that a previously
// reported pod scheduling problem no longer exists.
func NewPodsSchedulingResolvedEvent(apiObject APIObject) *v1.Event {
	ev := newDeploymentEvent(apiObject)
	ev.Reason = "Pods Scheduling Resolved"
	ev.Message = "All pods have been scheduled"
	ev.Type = v1.EventTypeNormal
	return ev
}


// NewSecretsChangedEvent creates an event indicating that one or more secrets have changed.
func NewSecretsChangedEvent(changedSecretNames []string, apiObject APIObject) *v1.Event {
event := newDeploymentEvent(apiObject)
Expand Down
16 changes: 16 additions & 0 deletions pkg/util/k8sutil/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ package k8sutil
import (
"fmt"
"path/filepath"
"time"

"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -87,6 +88,21 @@ func IsPodFailed(pod *v1.Pod) bool {
return pod.Status.Phase == v1.PodFailed
}

// IsPodScheduled reports whether the pod carries a PodScheduled condition
// whose status is True. A pod without that condition is not considered scheduled.
func IsPodScheduled(pod *v1.Pod) bool {
	cond := getPodCondition(&pod.Status, v1.PodScheduled)
	if cond == nil {
		return false
	}
	return cond.Status == v1.ConditionTrue
}

// IsPodNotScheduledFor reports whether the pod's PodScheduled condition has
// been False for longer than the given timeout, measured from the condition's
// last transition time. It returns false when the condition is absent or its
// status is anything other than False.
func IsPodNotScheduledFor(pod *v1.Pod, timeout time.Duration) bool {
	cond := getPodCondition(&pod.Status, v1.PodScheduled)
	if cond == nil || cond.Status != v1.ConditionFalse {
		return false
	}
	// The pod is overdue once the current time has passed transition time + timeout.
	deadline := cond.LastTransitionTime.Time.Add(timeout)
	return time.Now().After(deadline)
}


// IsArangoDBImageIDAndVersionPod returns true if the given pod is used for fetching image ID and ArangoDB version of an image
func IsArangoDBImageIDAndVersionPod(p v1.Pod) bool {
role, found := p.GetLabels()[LabelKeyRole]
Expand Down