Add handling for podFailurePolicy #269

Closed
pkg/controllers/jobset_controller.go (24 additions, 1 deletion)
@@ -42,6 +42,11 @@ import (
const (
RestartsKey string = "jobset.sigs.k8s.io/restart-attempt"
parallelDeletions int = 50

// JobConditionReasonPodFailurePolicy is not exported as part of the Job API
// and thus cannot be referenced directly, so we redefine it here for now.
JobConditionReasonPodFailurePolicy = "PodFailurePolicy"
)

var (
@@ -459,12 +464,30 @@ func (r *JobSetReconciler) executeFailurePolicy(ctx context.Context, js *jobset.
}

func (r *JobSetReconciler) executeRestartPolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
if js.Spec.FailurePolicy.MaxRestarts == 0 {
if js.Spec.FailurePolicy.MaxRestarts == 0 || r.triggeredPodFailurePolicy(ctx, js, ownedJobs) {
Contributor:

So just checking, we only want to obey PodFailurePolicy if RestartPolicy is specified. Default behavior would be to fail on any failure?

Contributor Author:

If no jobset failure policy is specified, the jobset will fail immediately without restarts anyway. So the only place we need to do this podFailurePolicy check is if restarting is an option.
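For illustration only (not part of the PR): a rough sketch of the kind of configuration this check is aimed at. The helper name below is hypothetical; only the batch/v1 types are from upstream.

import batchv1 "k8s.io/api/batch/v1"

// nonRetriablePolicy builds a podFailurePolicy where any container exit
// other than SIGTERM (143) fails the Job. Combined with a JobSet whose
// failure policy allows restarts (MaxRestarts > 0), such a Job failure
// fails the whole JobSet immediately instead of triggering a restart.
func nonRetriablePolicy() *batchv1.PodFailurePolicy {
	return &batchv1.PodFailurePolicy{
		Rules: []batchv1.PodFailurePolicyRule{{
			Action: batchv1.PodFailurePolicyActionFailJob,
			OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
				Operator: batchv1.PodFailurePolicyOnExitCodesOpNotIn,
				Values:   []int32{143}, // SIGTERM
			},
		}},
	}
}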

Reviewer:

Not sure if this behavior would be immediately intuitive to users:
if one Job fails due to a PodFailurePolicy, the entire JobSet fails.

Either make sure this is properly documented, or make it part of the JobSet's FailurePolicy whether to respect the Job's.

return r.failJobSet(ctx, js)
}
return r.restartPolicyRecreateAll(ctx, js, ownedJobs)
}

// If a child job has failed due to triggering a PodFailurePolicy,
// we should fail the JobSet immediately rather than restarting.
// This allows the user to configure a PodFailurePolicy such that
// job failures under certain conditions do not cause the JobSet to
// restart, while others do.
func (r *JobSetReconciler) triggeredPodFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) bool {
log := ctrl.LoggerFrom(ctx)
for _, failedJob := range ownedJobs.failed {
for _, c := range failedJob.Status.Conditions {
Contributor (@kannon92, Aug 25, 2023):

You could use IsStatusConditionTrue from Kubernetes here.

Contributor:

Looking at this more, I don't think you would be able to use this. That allows you to see if you have a FailedJob condition, but it doesn't match on the reason. I guess you could match on the FailedJob condition and then check the reason.
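For reference, a rough sketch of that two-step approach: find the JobFailed condition first, then check its reason. The helper below is hypothetical and not part of this PR; it reuses the JobConditionReasonPodFailurePolicy constant defined above.

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// jobFailedByPodFailurePolicy reports whether a Job carries a true
// JobFailed condition whose reason indicates a matched podFailurePolicy rule.
func jobFailedByPodFailurePolicy(job *batchv1.Job) bool {
	for _, c := range job.Status.Conditions {
		// Step 1: find a JobFailed condition that is actually true.
		if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue {
			// Step 2: check the reason separately.
			return c.Reason == JobConditionReasonPodFailurePolicy
		}
	}
	return false
}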

if c.Type == batchv1.JobFailed && c.Reason == JobConditionReasonPodFailurePolicy && c.Status == corev1.ConditionTrue {
log.V(2).Info("child job failed due to triggering a PodFailurePolicy", "jobset", js.Name, "job", failedJob.Name)
return true
}
}
}
return false
}

func (r *JobSetReconciler) restartPolicyRecreateAll(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
log := ctrl.LoggerFrom(ctx)

pkg/util/testing/wrappers.go (6 additions, 0 deletions)
@@ -161,6 +161,12 @@ func MakeJobTemplate(name, ns string) *JobTemplateWrapper {
}
}

// PodFailurePolicy sets the job.spec.podFailurePolicy
func (j *JobTemplateWrapper) PodFailurePolicy(podFailurePolicy *batchv1.PodFailurePolicy) *JobTemplateWrapper {
j.Spec.PodFailurePolicy = podFailurePolicy
return j
}

// CompletionMode sets the value of job.spec.completionMode
func (j *JobTemplateWrapper) CompletionMode(mode batchv1.CompletionMode) *JobTemplateWrapper {
j.Spec.CompletionMode = &mode
test/integration/controller/jobset_controller_test.go (43 additions, 0 deletions)
@@ -396,6 +396,49 @@ var _ = ginkgo.Describe("JobSet controller", func() {
},
},
}),
ginkgo.Entry("child job fails due to triggering PodFailurePolicy", &testCase{
makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper {
jobSetName := "test-podfailurepolicy"
return testing.MakeJobSet(jobSetName, ns.Name).
SuccessPolicy(&jobset.SuccessPolicy{Operator: jobset.OperatorAll, TargetReplicatedJobs: []string{}}).
EnableDNSHostnames(true).
NetworkSubdomain(jobSetName).
ReplicatedJob(testing.MakeReplicatedJob("replicated-job-a").
Job(testing.MakeJobTemplate("test-job-A", ns.Name).
PodFailurePolicy(&batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
ContainerName: &testing.TestPodSpec.Containers[0].Name,
Operator: "NotIn",
Values: []int32{143}, // SIGTERM
},
},
},
}).
PodSpec(testing.TestPodSpec).Obj()).
Replicas(1).
Obj())
},
updates: []*update{
{
jobUpdateFn: func(jobList *batchv1.JobList) {
ginkgo.By("fail job with condition reason indicating it matched a podFailurePolicy")
job := &jobList.Items[0]
updateJobStatusConditions(job, batchv1.JobStatus{
Conditions: append(job.Status.Conditions, batchv1.JobCondition{
Type: batchv1.JobFailed,
Status: corev1.ConditionTrue,
Reason: controllers.JobConditionReasonPodFailurePolicy,
}),
})
},
// check JobSet fails immediately without restarting.
checkJobSetCondition: testutil.JobSetFailed,
},
},
}),
ginkgo.Entry("job succeeds after one failure", &testCase{
makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper {
return testJobSet(ns).