Add handling for podFailurePolicy #269

Closed
pkg/controllers/jobset_controller.go (24 additions, 1 deletion)
@@ -42,6 +42,11 @@ import (
const (
RestartsKey string = "jobset.sigs.k8s.io/restart-attempt"
parallelDeletions int = 50

// JobConditionReasonPodFailurePolicy is not exported as part of the Job API
// and thus cannot be referenced directly, so we redefine it here for now.
JobConditionReasonPodFailurePolicy = "PodFailurePolicy"
)

var (
@@ -459,12 +464,30 @@ func (r *JobSetReconciler) executeFailurePolicy(ctx context.Context, js *jobset.
}

func (r *JobSetReconciler) executeRestartPolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
if js.Spec.FailurePolicy.MaxRestarts == 0 {
if js.Spec.FailurePolicy.MaxRestarts == 0 || r.triggeredPodFailurePolicy(ctx, js, ownedJobs) {
Contributor:

So just checking, we only want to obey PodFailurePolicy if RestartPolicy is specified. Default behavior would be to fail on any failure?

Contributor Author:

If no jobset failure policy is specified, the jobset will fail immediately without restarts anyway. So the only place we need to do this podFailurePolicy check is if restarting is an option.
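For illustration only (not part of the PR): a rough sketch of the kind of configuration this check is aimed at. The helper name below is hypothetical; only the batch/v1 types are from upstream.

import batchv1 "k8s.io/api/batch/v1"

// nonRetriablePolicy builds a podFailurePolicy where any container exit
// other than SIGTERM (143) fails the Job. Combined with a JobSet whose
// failure policy allows restarts (MaxRestarts > 0), such a Job failure
// fails the whole JobSet immediately instead of triggering a restart.
func nonRetriablePolicy() *batchv1.PodFailurePolicy {
	return &batchv1.PodFailurePolicy{
		Rules: []batchv1.PodFailurePolicyRule{{
			Action: batchv1.PodFailurePolicyActionFailJob,
			OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
				Operator: batchv1.PodFailurePolicyOnExitCodesOpNotIn,
				Values:   []int32{143}, // SIGTERM
			},
		}},
	}
}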

Reviewer:

Not sure if this behavior would be immediately intuitive to users:
if one Job fails due to a PodFailurePolicy, the entire JobSet fails.

Either make sure this is properly documented, or make it part of the JobSet's FailurePolicy whether to respect the Job's.

return r.failJobSet(ctx, js)
}
return r.restartPolicyRecreateAll(ctx, js, ownedJobs)
}

// If a child job has failed due to triggering a PodFailurePolicy,
// we should fail the JobSet immediately rather than restarting.
// This allows the user to configure a PodFailurePolicy such that
// job failures under certain conditions do not cause the JobSet to
// restart, while others do.
func (r *JobSetReconciler) triggeredPodFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) bool {
log := ctrl.LoggerFrom(ctx)
for _, failedJob := range ownedJobs.failed {
for _, c := range failedJob.Status.Conditions {
Contributor (@kannon92, Aug 25, 2023):

You could use IsStatusConditionTrue from Kubernetes here.

Contributor:

Looking at this more, I don't think you would be able to use this. That allows you to see if you have a FailedJob condition, but it doesn't match on the reason. I guess you could match on the FailedJob condition and then check the reason.
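For reference, a rough sketch of that two-step approach: find the JobFailed condition first, then check its reason. The helper below is hypothetical and not part of this PR; it reuses the JobConditionReasonPodFailurePolicy constant defined above.

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// jobFailedByPodFailurePolicy reports whether a Job carries a true
// JobFailed condition whose reason indicates a matched podFailurePolicy rule.
func jobFailedByPodFailurePolicy(job *batchv1.Job) bool {
	for _, c := range job.Status.Conditions {
		// Step 1: find a JobFailed condition that is actually true.
		if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue {
			// Step 2: check the reason separately.
			return c.Reason == JobConditionReasonPodFailurePolicy
		}
	}
	return false
}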

if c.Type == batchv1.JobFailed && c.Reason == JobConditionReasonPodFailurePolicy && c.Status == corev1.ConditionTrue {
log.V(2).Info("child job failed due to triggering a PodFailurePolicy", "jobset", js.Name, "job", failedJob.Name)
return true
}
}
}
return false
}

func (r *JobSetReconciler) restartPolicyRecreateAll(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
log := ctrl.LoggerFrom(ctx)

pkg/util/testing/wrappers.go (6 additions, 0 deletions)
@@ -161,6 +161,12 @@ func MakeJobTemplate(name, ns string) *JobTemplateWrapper {
}
}

// PodFailurePolicy sets the job.spec.podFailurePolicy
func (j *JobTemplateWrapper) PodFailurePolicy(podFailurePolicy *batchv1.PodFailurePolicy) *JobTemplateWrapper {
j.Spec.PodFailurePolicy = podFailurePolicy
return j
}

// CompletionMode sets the value of job.spec.completionMode
func (j *JobTemplateWrapper) CompletionMode(mode batchv1.CompletionMode) *JobTemplateWrapper {
j.Spec.CompletionMode = &mode
test/integration/controller/jobset_controller_test.go (43 additions, 0 deletions)
@@ -396,6 +396,49 @@ var _ = ginkgo.Describe("JobSet controller", func() {
},
},
}),
ginkgo.Entry("child job fails due to triggering PodFailurePolicy", &testCase{
makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper {
jobSetName := "test-podfailurepolicy"
return testing.MakeJobSet(jobSetName, ns.Name).
SuccessPolicy(&jobset.SuccessPolicy{Operator: jobset.OperatorAll, TargetReplicatedJobs: []string{}}).
EnableDNSHostnames(true).
NetworkSubdomain(jobSetName).
ReplicatedJob(testing.MakeReplicatedJob("replicated-job-a").
Job(testing.MakeJobTemplate("test-job-A", ns.Name).
PodFailurePolicy(&batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
ContainerName: &testing.TestPodSpec.Containers[0].Name,
Operator: "NotIn",
Values: []int32{143}, // SIGTERM
},
},
},
}).
PodSpec(testing.TestPodSpec).Obj()).
Replicas(1).
Obj())
},
updates: []*update{
{
jobUpdateFn: func(jobList *batchv1.JobList) {
ginkgo.By("fail job with condition reason indicating it matched a podFailurePolicy")
job := &jobList.Items[0]
updateJobStatusConditions(job, batchv1.JobStatus{
Conditions: append(job.Status.Conditions, batchv1.JobCondition{
Type: batchv1.JobFailed,
Status: corev1.ConditionTrue,
Reason: controllers.JobConditionReasonPodFailurePolicy,
}),
})
},
// check JobSet fails immediately without restarting.
checkJobSetCondition: testutil.JobSetFailed,
},
},
}),
ginkgo.Entry("job succeeds after one failure", &testCase{
makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper {
return testJobSet(ns).