diff --git a/pkg/controllers/jobset_controller.go b/pkg/controllers/jobset_controller.go
index 1b355c2ff..490a1bb2a 100644
--- a/pkg/controllers/jobset_controller.go
+++ b/pkg/controllers/jobset_controller.go
@@ -45,8 +45,9 @@ const (
 )
 
 var (
-	jobOwnerKey = ".metadata.controller"
-	apiGVStr    = jobset.GroupVersion.String()
+	jobOwnerKey                        = ".metadata.controller"
+	apiGVStr                           = jobset.GroupVersion.String()
+	JobConditionReasonPodFailurePolicy = "PodFailurePolicy"
 )
 
 // JobSetReconciler reconciles a JobSet object
@@ -459,12 +460,30 @@ func (r *JobSetReconciler) executeFailurePolicy(ctx context.Context, js *jobset.
 }
 
 func (r *JobSetReconciler) executeRestartPolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
-	if js.Spec.FailurePolicy.MaxRestarts == 0 {
+	if js.Spec.FailurePolicy.MaxRestarts == 0 || r.triggeredPodFailurePolicy(ctx, js, ownedJobs) {
 		return r.failJobSet(ctx, js)
 	}
 	return r.restartPolicyRecreateAll(ctx, js, ownedJobs)
 }
 
+// If a child job has failed due to triggering a PodFailurePolicy,
+// we should fail the JobSet immediately rather than restarting.
+// This allows the user to configure a PodFailurePolicy such that
+// job failures under certain conditions do not cause the JobSet to
+// restart, while others do.
+func (r *JobSetReconciler) triggeredPodFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) bool {
+	log := ctrl.LoggerFrom(ctx)
+	for _, failedJob := range ownedJobs.failed {
+		for _, c := range failedJob.Status.Conditions {
+			if c.Type == batchv1.JobFailed && c.Reason == JobConditionReasonPodFailurePolicy {
+				log.V(2).Info("child job failed due to triggering a PodFailurePolicy", "jobset", js.Name, "job", failedJob.Name)
+				return true
+			}
+		}
+	}
+	return false
+}
+
 func (r *JobSetReconciler) restartPolicyRecreateAll(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
 	log := ctrl.LoggerFrom(ctx)
 
diff --git a/test/integration/controller/jobset_controller_test.go b/test/integration/controller/jobset_controller_test.go
index 8e8db9611..0e00e8912 100644
--- a/test/integration/controller/jobset_controller_test.go
+++ b/test/integration/controller/jobset_controller_test.go
@@ -396,6 +396,32 @@ var _ = ginkgo.Describe("JobSet controller", func() {
 					},
 				},
 			}),
+			ginkgo.Entry("child job fails due to triggering PodFailurePolicy", &testCase{
+				makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper {
+					// Set up JobSet which allows up to 3 restarts.
+					return testJobSet(ns).
+						FailurePolicy(&jobset.FailurePolicy{
+							MaxRestarts: 3,
+						})
+				},
+				updates: []*update{
+					{
+						jobUpdateFn: func(jobList *batchv1.JobList) {
+							ginkgo.By("fail job with condition reason indicating it matched a podFailurePolicy")
+							job := &jobList.Items[0]
+							updateJobStatusConditions(job, batchv1.JobStatus{
+								Conditions: append(job.Status.Conditions, batchv1.JobCondition{
+									Type:   batchv1.JobFailed,
+									Status: corev1.ConditionTrue,
+									Reason: controllers.JobConditionReasonPodFailurePolicy,
+								}),
+							})
+						},
+						// check JobSet fails immediately without restarting.
+						checkJobSetCondition: testutil.JobSetFailed,
+					},
+				},
+			}),
 			ginkgo.Entry("job succeeds after one failure", &testCase{
 				makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper {
 					return testJobSet(ns).