Skip to content

Commit

Permalink
add handling for podFailurePolicy
Browse files Browse the repository at this point in the history
  • Loading branch information
danielvegamyhre committed Aug 24, 2023
1 parent 040cbd4 commit 609397c
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 3 deletions.
25 changes: 22 additions & 3 deletions pkg/controllers/jobset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ const (
)

var (
jobOwnerKey = ".metadata.controller"
apiGVStr = jobset.GroupVersion.String()
jobOwnerKey = ".metadata.controller"
apiGVStr = jobset.GroupVersion.String()
JobConditionReasonPodFailurePolicy = "PodFailurePolicy"
)

// JobSetReconciler reconciles a JobSet object
Expand Down Expand Up @@ -459,12 +460,30 @@ func (r *JobSetReconciler) executeFailurePolicy(ctx context.Context, js *jobset.
}

func (r *JobSetReconciler) executeRestartPolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
if js.Spec.FailurePolicy.MaxRestarts == 0 {
if js.Spec.FailurePolicy.MaxRestarts == 0 || r.triggeredPodFailurePolicy(ctx, js, ownedJobs) {
return r.failJobSet(ctx, js)
}
return r.restartPolicyRecreateAll(ctx, js, ownedJobs)
}

// If a child job has failed due to triggering a PodFailurePolicy,
// we should fail the JobSet immediately rather than restarting.
// This allows the user to configure a PodFailurePolicy such that
// job failures under certain conditions do not cause the JobSet to
// restart, while others do.
func (r *JobSetReconciler) triggeredPodFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) bool {
log := ctrl.LoggerFrom(ctx)
for _, failedJob := range ownedJobs.failed {
for _, c := range failedJob.Status.Conditions {
if c.Reason == JobConditionReasonPodFailurePolicy {
log.V(2).Info("jobset %s child job %s failed due to triggering a PodFailurePolicy", js.Name, failedJob.Name)
return true
}
}
}
return false
}

func (r *JobSetReconciler) restartPolicyRecreateAll(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
log := ctrl.LoggerFrom(ctx)

Expand Down
26 changes: 26 additions & 0 deletions test/integration/controller/jobset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,32 @@ var _ = ginkgo.Describe("JobSet controller", func() {
},
},
}),
ginkgo.Entry("child job fails due to triggering PodFailurePolicy", &testCase{
makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper {
// Set up JobSet which allows up to 3 restarts.
return testJobSet(ns).
FailurePolicy(&jobset.FailurePolicy{
MaxRestarts: 3,
})
},
updates: []*update{
{
jobUpdateFn: func(jobList *batchv1.JobList) {
ginkgo.By("fail job with condition reason indicating it matched a podFailurePolicy")
job := &jobList.Items[0]
updateJobStatusConditions(job, batchv1.JobStatus{
Conditions: append(job.Status.Conditions, batchv1.JobCondition{
Type: batchv1.JobFailed,
Status: corev1.ConditionTrue,
Reason: controllers.JobConditionReasonPodFailurePolicy,
}),
})
},
// check JobSet fails immediately without restarting.
checkJobSetCondition: testutil.JobSetFailed,
},
},
}),
ginkgo.Entry("job succeeds after one failure", &testCase{
makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper {
return testJobSet(ns).
Expand Down

0 comments on commit 609397c

Please sign in to comment.